From d2e1911345ac035a1de371c5e30d45e2df4eab3d Mon Sep 17 00:00:00 2001
From: Basilisk-Dev <basiliskdev@protonmail.com>
Date: Mon, 23 Feb 2026 16:10:38 -0500
Subject: [PATCH] Issue #2925 - Upgrade libvpx to 1.16.0

---
 media/libvpx/README_MOZILLA                   |     2 +-
 media/libvpx/config/generic/vp8_rtcd.h        |    71 +-
 media/libvpx/config/generic/vp9_rtcd.h        |    52 +-
 media/libvpx/config/generic/vpx_config.asm    |    33 +-
 media/libvpx/config/generic/vpx_config.h      |    29 +-
 media/libvpx/config/generic/vpx_dsp_rtcd.h    |   452 +-
 media/libvpx/config/generic/vpx_scale_rtcd.h  |    16 +-
 media/libvpx/config/linux/arm/vp8_rtcd.h      |   167 +-
 media/libvpx/config/linux/arm/vp9_rtcd.h      |    93 +-
 media/libvpx/config/linux/arm/vpx_config.asm  |    33 +-
 media/libvpx/config/linux/arm/vpx_config.h    |    29 +-
 media/libvpx/config/linux/arm/vpx_dsp_rtcd.h  |  1236 +-
 .../libvpx/config/linux/arm/vpx_scale_rtcd.h  |    16 +-
 media/libvpx/config/linux/arm64/vp8_rtcd.h    |   211 +
 media/libvpx/config/linux/arm64/vp9_rtcd.h    |   125 +
 .../libvpx/config/linux/arm64/vpx_config.asm  |    97 +
 media/libvpx/config/linux/arm64/vpx_config.c  |    10 +
 media/libvpx/config/linux/arm64/vpx_config.h  |   108 +
 .../libvpx/config/linux/arm64/vpx_dsp_rtcd.h  |  1197 +
 .../config/linux/arm64/vpx_scale_rtcd.h       |    85 +
 media/libvpx/config/linux/ia32/vp8_rtcd.h     |   206 +-
 media/libvpx/config/linux/ia32/vp9_rtcd.h     |   117 +-
 media/libvpx/config/linux/ia32/vpx_config.asm |    29 +-
 media/libvpx/config/linux/ia32/vpx_config.h   |    29 +-
 media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h |  1229 +-
 .../libvpx/config/linux/ia32/vpx_scale_rtcd.h |    16 +-
 .../config/linux/loongarch64/vp8_rtcd.h       |   229 +
 .../config/linux/loongarch64/vp9_rtcd.h       |   118 +
 .../config/linux/loongarch64/vpx_config.asm   |    97 +
 .../config/linux/loongarch64/vpx_config.c     |    10 +
 .../config/linux/loongarch64/vpx_config.h     |   108 +
 .../config/linux/loongarch64/vpx_dsp_rtcd.h   |   929 +
 .../config/linux/loongarch64/vpx_scale_rtcd.h |    84 +
 media/libvpx/config/linux/mips32/vp8_rtcd.h   |   370 +
 media/libvpx/config/linux/mips32/vp9_rtcd.h   |   244 +
 .../libvpx/config/linux/mips32/vpx_config.asm |    97 +
 media/libvpx/config/linux/mips32/vpx_config.c |    10 +
 media/libvpx/config/linux/mips32/vpx_config.h |   108 +
 .../libvpx/config/linux/mips32/vpx_dsp_rtcd.h |  1524 ++
 .../config/linux/mips32/vpx_scale_rtcd.h      |   176 +
 media/libvpx/config/linux/mips64/vp8_rtcd.h   |   259 +
 media/libvpx/config/linux/mips64/vp9_rtcd.h   |   138 +
 .../libvpx/config/linux/mips64/vpx_config.asm |    97 +
 media/libvpx/config/linux/mips64/vpx_config.c |    10 +
 media/libvpx/config/linux/mips64/vpx_config.h |   108 +
 .../libvpx/config/linux/mips64/vpx_dsp_rtcd.h |  1029 +
 .../config/linux/mips64/vpx_scale_rtcd.h      |    96 +
 media/libvpx/config/linux/ppc64le/vp8_rtcd.h  |   180 +
 media/libvpx/config/linux/ppc64le/vp9_rtcd.h  |   132 +
 .../config/linux/ppc64le/vpx_config.asm       |    97 +
 .../libvpx/config/linux/ppc64le/vpx_config.c  |    10 +
 .../libvpx/config/linux/ppc64le/vpx_config.h  |   108 +
 .../config/linux/ppc64le/vpx_dsp_rtcd.h       |  1015 +
 .../config/linux/ppc64le/vpx_scale_rtcd.h     |    83 +
 media/libvpx/config/linux/x64/vp8_rtcd.h      |   168 +-
 media/libvpx/config/linux/x64/vp9_rtcd.h      |   112 +-
 media/libvpx/config/linux/x64/vpx_config.asm  |    29 +-
 media/libvpx/config/linux/x64/vpx_config.h    |    29 +-
 media/libvpx/config/linux/x64/vpx_dsp_rtcd.h  |  1113 +-
 .../libvpx/config/linux/x64/vpx_scale_rtcd.h  |    16 +-
 media/libvpx/config/mac/arm64/vp8_rtcd.h      |   211 +
 media/libvpx/config/mac/arm64/vp9_rtcd.h      |   125 +
 media/libvpx/config/mac/arm64/vpx_config.asm  |    97 +
 media/libvpx/config/mac/arm64/vpx_config.c    |    10 +
 media/libvpx/config/mac/arm64/vpx_config.h    |   108 +
 media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h  |  1197 +
 .../libvpx/config/mac/arm64/vpx_scale_rtcd.h  |    85 +
 media/libvpx/config/mac/ia32/vp8_rtcd.h       |   206 +-
 media/libvpx/config/mac/ia32/vp9_rtcd.h       |   117 +-
 media/libvpx/config/mac/ia32/vpx_config.asm   |    29 +-
 media/libvpx/config/mac/ia32/vpx_config.h     |    29 +-
 media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h   |  1229 +-
 media/libvpx/config/mac/ia32/vpx_scale_rtcd.h |    16 +-
 media/libvpx/config/mac/x64/vp8_rtcd.h        |   168 +-
 media/libvpx/config/mac/x64/vp9_rtcd.h        |   112 +-
 media/libvpx/config/mac/x64/vpx_config.asm    |    29 +-
 media/libvpx/config/mac/x64/vpx_config.h      |    29 +-
 media/libvpx/config/mac/x64/vpx_dsp_rtcd.h    |  1113 +-
 media/libvpx/config/mac/x64/vpx_scale_rtcd.h  |    16 +-
 media/libvpx/config/vpx_version.h             |    12 +-
 media/libvpx/config/win/ia32/vp8_rtcd.h       |   206 +-
 media/libvpx/config/win/ia32/vp9_rtcd.h       |   117 +-
 media/libvpx/config/win/ia32/vpx_config.asm   |    29 +-
 media/libvpx/config/win/ia32/vpx_config.h     |    29 +-
 media/libvpx/config/win/ia32/vpx_dsp_rtcd.h   |  1229 +-
 media/libvpx/config/win/ia32/vpx_scale_rtcd.h |    16 +-
 media/libvpx/config/win/x64/vp8_rtcd.h        |   168 +-
 media/libvpx/config/win/x64/vp9_rtcd.h        |   112 +-
 media/libvpx/config/win/x64/vpx_config.asm    |    29 +-
 media/libvpx/config/win/x64/vpx_config.c      |     2 +-
 media/libvpx/config/win/x64/vpx_config.h      |    31 +-
 media/libvpx/config/win/x64/vpx_dsp_rtcd.h    |  1113 +-
 media/libvpx/config/win/x64/vpx_scale_rtcd.h  |    16 +-
 media/libvpx/generate_sources_mozbuild.sh     |    48 +-
 media/libvpx/libvpx/.clang-format             |    84 +-
 media/libvpx/libvpx/.mailmap                  |    27 +-
 media/libvpx/libvpx/AUTHORS                   |   108 +-
 media/libvpx/libvpx/CHANGELOG                 |   492 +-
 media/libvpx/libvpx/CONTRIBUTING.md           |    29 +
 media/libvpx/libvpx/README                    |   133 +-
 media/libvpx/libvpx/args.c                    |    35 +-
 media/libvpx/libvpx/args.h                    |    20 +-
 media/libvpx/libvpx/build/make/Android.mk     |    56 +-
 media/libvpx/libvpx/build/make/Makefile       |    66 +-
 .../libvpx/libvpx/build/make/ads2armasm_ms.pl |     2 +-
 media/libvpx/libvpx/build/make/ads2gas.pl     |   161 +-
 .../libvpx/libvpx/build/make/ads2gas_apple.pl |   146 +-
 media/libvpx/libvpx/build/make/configure.sh   |   659 +-
 .../libvpx/libvpx/build/make/gen_asm_deps.sh  |     2 +-
 .../libvpx/libvpx/build/make/gen_msvs_sln.sh  |    35 +-
 .../libvpx/build/make/gen_msvs_vcxproj.sh     |    88 +-
 media/libvpx/libvpx/build/make/iosbuild.sh    |    19 +-
 media/libvpx/libvpx/build/make/msvs_common.sh |    12 +-
 media/libvpx/libvpx/build/make/rtcd.pl        |   162 +-
 media/libvpx/libvpx/build/make/thumb.pm       |    12 +-
 media/libvpx/libvpx/build/make/version.sh     |     4 +
 media/libvpx/libvpx/codereview.settings       |     5 +-
 media/libvpx/libvpx/configure                 |   191 +-
 media/libvpx/libvpx/examples.mk               |    73 +-
 .../libvpx/examples/decode_with_drops.c       |     2 +-
 media/libvpx/libvpx/examples/postproc.c       |     6 +-
 media/libvpx/libvpx/examples/resize_util.c    |   123 -
 media/libvpx/libvpx/examples/set_maps.c       |     2 +-
 media/libvpx/libvpx/examples/simple_decoder.c |     2 +-
 media/libvpx/libvpx/examples/simple_encoder.c |     2 +-
 .../libvpx/{vpx => examples}/svc_context.h    |    27 +-
 .../{vpx/src => examples}/svc_encodeframe.c   |   184 +-
 .../libvpx/libvpx/examples/twopass_encoder.c  |     7 +-
 .../examples/vp8_multi_resolution_encoder.c   |    54 +-
 media/libvpx/libvpx/examples/vp8cx_set_ref.c  |     2 +-
 .../libvpx/examples/vp9_lossless_encoder.c    |     2 +-
 .../libvpx/examples/vp9_spatial_svc_encoder.c |   870 +-
 media/libvpx/libvpx/examples/vp9cx_set_ref.c  |   126 +-
 .../libvpx/libvpx/examples/vpx_dec_fuzzer.cc  |   159 +
 .../libvpx/libvpx/examples/vpx_enc_fuzzer.cc  |   236 +
 .../examples/vpx_temporal_svc_encoder.c       |   338 +-
 media/libvpx/libvpx/ivfdec.c                  |     8 +-
 media/libvpx/libvpx/ivfdec.h                  |     6 +-
 media/libvpx/libvpx/ivfenc.c                  |    30 +-
 media/libvpx/libvpx/ivfenc.h                  |    13 +-
 media/libvpx/libvpx/libs.doxy_template        |    60 +-
 media/libvpx/libvpx/libs.mk                   |   270 +-
 media/libvpx/libvpx/mainpage.dox              |     2 +
 media/libvpx/libvpx/md5_utils.c               |    20 +-
 media/libvpx/libvpx/md5_utils.h               |     6 +-
 media/libvpx/libvpx/rate_hist.c               |    70 +-
 media/libvpx/libvpx/rate_hist.h               |     6 +-
 media/libvpx/libvpx/test/acm_random.h         |    49 +-
 .../libvpx/test/active_map_refresh_test.cc    |    20 +-
 media/libvpx/libvpx/test/active_map_test.cc   |    24 +-
 media/libvpx/libvpx/test/add_noise_test.cc    |    58 +-
 .../libvpx/test/alt_ref_aq_segment_test.cc    |    20 +-
 media/libvpx/libvpx/test/altref_test.cc       |    43 +-
 media/libvpx/libvpx/test/android/Android.mk   |    11 +
 media/libvpx/libvpx/test/android/README       |    11 +-
 media/libvpx/libvpx/test/android/get_files.py |    17 +-
 media/libvpx/libvpx/test/aq_segment_test.cc   |    20 +-
 media/libvpx/libvpx/test/avg_test.cc          |   529 +-
 media/libvpx/libvpx/test/bench.cc             |    39 +
 media/libvpx/libvpx/test/bench.h              |    32 +
 media/libvpx/libvpx/test/blockiness_test.cc   |    37 +-
 media/libvpx/libvpx/test/borders_test.cc      |    24 +-
 media/libvpx/libvpx/test/buffer.h             |   382 +
 .../libvpx/libvpx/test/byte_alignment_test.cc |    30 +-
 media/libvpx/libvpx/test/clear_system_state.h |    16 +-
 media/libvpx/libvpx/test/codec_factory.h      |    97 +-
 .../libvpx/libvpx/test/comp_avg_pred_test.cc  |   276 +
 media/libvpx/libvpx/test/config_test.cc       |    14 +-
 media/libvpx/libvpx/test/consistency_test.cc  |    39 +-
 media/libvpx/libvpx/test/convolve_test.cc     |  1423 +-
 media/libvpx/libvpx/test/cpu_speed_test.cc    |    25 +-
 media/libvpx/libvpx/test/cq_test.cc           |    34 +-
 media/libvpx/libvpx/test/cx_set_ref.sh        |     2 +-
 media/libvpx/libvpx/test/datarate_test.cc     |  1476 --
 media/libvpx/libvpx/test/dct16x16_test.cc     |   380 +-
 media/libvpx/libvpx/test/dct32x32_test.cc     |   348 +-
 media/libvpx/libvpx/test/dct_partial_test.cc  |   184 +
 media/libvpx/libvpx/test/dct_test.cc          |   791 +
 media/libvpx/libvpx/test/decode_api_test.cc   |   140 +-
 media/libvpx/libvpx/test/decode_corrupted.cc  |   104 +
 media/libvpx/libvpx/test/decode_perf_test.cc  |    50 +-
 media/libvpx/libvpx/test/decode_svc_test.cc   |    30 +-
 .../libvpx/libvpx/test/decode_test_driver.cc  |    27 +-
 media/libvpx/libvpx/test/decode_test_driver.h |    10 +-
 media/libvpx/libvpx/test/decode_to_md5.sh     |     2 +-
 media/libvpx/libvpx/test/decode_with_drops.sh |    10 +-
 media/libvpx/libvpx/test/encode_api_test.cc   |  2266 +-
 media/libvpx/libvpx/test/encode_perf_test.cc  |    28 +-
 .../libvpx/libvpx/test/encode_test_driver.cc  |    35 +-
 media/libvpx/libvpx/test/encode_test_driver.h |    85 +-
 .../libvpx/test/error_resilience_test.cc      |    50 +-
 .../libvpx/test/external_frame_buffer_test.cc |   123 +-
 media/libvpx/libvpx/test/fdct4x4_test.cc      |   511 -
 media/libvpx/libvpx/test/fdct8x8_test.cc      |   217 +-
 media/libvpx/libvpx/test/frame_size_tests.cc  |   165 +-
 media/libvpx/libvpx/test/hadamard_test.cc     |   401 +-
 media/libvpx/libvpx/test/i420_video_source.h  |     6 +-
 media/libvpx/libvpx/test/idct8x8_test.cc      |     3 +-
 media/libvpx/libvpx/test/idct_test.cc         |   208 +-
 media/libvpx/libvpx/test/init_vpx_test.cc     |    99 +
 media/libvpx/libvpx/test/init_vpx_test.h      |    18 +
 media/libvpx/libvpx/test/invalid_file_test.cc |    68 +-
 media/libvpx/libvpx/test/ivf_video_source.h   |    38 +-
 media/libvpx/libvpx/test/keyframe_test.cc     |   134 +-
 media/libvpx/libvpx/test/level_test.cc        |    53 +-
 media/libvpx/libvpx/test/lpf_test.cc          |   171 +-
 media/libvpx/libvpx/test/md5_helper.h         |     8 +-
 media/libvpx/libvpx/test/minmax_test.cc       |   145 +-
 .../libvpx/libvpx/test/non_greedy_mv_test.cc  |   200 +
 media/libvpx/libvpx/test/partial_idct_test.cc |   593 +-
 media/libvpx/libvpx/test/postproc.sh          |     2 +-
 media/libvpx/libvpx/test/pp_filter_test.cc    |   664 +-
 media/libvpx/libvpx/test/predict_test.cc      |   110 +-
 media/libvpx/libvpx/test/quantize_test.cc     |    71 +-
 media/libvpx/libvpx/test/realtime_test.cc     |    89 +-
 .../libvpx/libvpx/test/register_state_check.h |    56 +-
 media/libvpx/libvpx/test/resize_test.cc       |   401 +-
 media/libvpx/libvpx/test/resize_util.sh       |    69 -
 media/libvpx/libvpx/test/sad_test.cc          |  1297 +-
 media/libvpx/libvpx/test/set_maps.sh          |     2 +-
 media/libvpx/libvpx/test/set_roi.cc           |    14 +-
 media/libvpx/libvpx/test/simple_decoder.sh    |     2 +-
 media/libvpx/libvpx/test/simple_encoder.sh    |     2 +-
 media/libvpx/libvpx/test/stress.sh            |    80 +-
 media/libvpx/libvpx/test/sum_squares_test.cc  |   251 +-
 media/libvpx/libvpx/test/superframe_test.cc   |    28 +-
 media/libvpx/libvpx/test/svc_datarate_test.cc |  2208 ++
 .../libvpx/libvpx/test/svc_end_to_end_test.cc |   825 +
 media/libvpx/libvpx/test/svc_test.cc          |   874 +-
 media/libvpx/libvpx/test/svc_test.h           |    67 +
 media/libvpx/libvpx/test/test-data.mk         |    42 +-
 media/libvpx/libvpx/test/test-data.sha1       |    39 +-
 media/libvpx/libvpx/test/test.mk              |    70 +-
 .../libvpx/test/test_intra_pred_speed.cc      |   296 +-
 media/libvpx/libvpx/test/test_libvpx.cc       |    61 +-
 media/libvpx/libvpx/test/test_rc_interface.cc |    16 +
 media/libvpx/libvpx/test/test_vector_test.cc  |   105 +-
 media/libvpx/libvpx/test/test_vectors.cc      |     2 +
 media/libvpx/libvpx/test/test_vectors.h       |     6 +-
 .../libvpx/test/tile_independence_test.cc     |    18 +-
 media/libvpx/libvpx/test/timestamp_test.cc    |   109 +
 media/libvpx/libvpx/test/tools_common.sh      |    17 +-
 media/libvpx/libvpx/test/twopass_encoder.sh   |    11 +-
 media/libvpx/libvpx/test/user_priv_test.cc    |    14 +-
 media/libvpx/libvpx/test/util.h               |    12 +-
 media/libvpx/libvpx/test/variance_test.cc     |  1891 +-
 media/libvpx/libvpx/test/video_source.h       |    80 +-
 .../libvpx/libvpx/test/vp8_boolcoder_test.cc  |     5 +-
 media/libvpx/libvpx/test/vp8_datarate_test.cc |   508 +
 media/libvpx/libvpx/test/vp8_decrypt_test.cc  |     2 +-
 .../libvpx/test/vp8_denoiser_sse2_test.cc     |    20 +-
 media/libvpx/libvpx/test/vp8_fdct4x4_test.cc  |    69 +-
 .../libvpx/libvpx/test/vp8_fragments_test.cc  |    14 +-
 .../test/vp8_multi_resolution_encoder.sh      |    24 +-
 .../libvpx/test/vp8_ratectrl_rtc_test.cc      |   424 +
 media/libvpx/libvpx/test/vp9_arf_freq_test.cc |    33 +-
 ..._block_test.cc => vp9_block_error_test.cc} |   122 +-
 .../libvpx/libvpx/test/vp9_boolcoder_test.cc  |    32 +-
 .../libvpx/test/vp9_c_vs_simd_encode.sh       |   420 +
 media/libvpx/libvpx/test/vp9_datarate_test.cc |  1151 +
 media/libvpx/libvpx/test/vp9_decrypt_test.cc  |     2 +-
 ...iser_sse2_test.cc => vp9_denoiser_test.cc} |    65 +-
 .../test/vp9_encoder_parms_get_to_decoder.cc  |    34 +-
 .../libvpx/libvpx/test/vp9_end_to_end_test.cc |   263 +-
 media/libvpx/libvpx/test/vp9_ethread_test.cc  |   355 +-
 .../libvpx/test/vp9_ext_ratectrl_test.cc      |   271 +
 .../libvpx/test/vp9_frame_parallel_test.cc    |   217 -
 .../libvpx/libvpx/test/vp9_intrapred_test.cc  |   521 +-
 media/libvpx/libvpx/test/vp9_lossless_test.cc |    29 +-
 .../libvpx/test/vp9_motion_vector_test.cc     |   102 +
 media/libvpx/libvpx/test/vp9_quantize_test.cc |   930 +-
 .../libvpx/test/vp9_ratectrl_rtc_test.cc      |   675 +
 media/libvpx/libvpx/test/vp9_roi_test.cc      |   148 +
 media/libvpx/libvpx/test/vp9_scale_test.cc    |   214 +
 .../libvpx/test/vp9_skip_loopfilter_test.cc   |    37 +-
 .../libvpx/test/vp9_spatial_svc_encoder.sh    |    72 -
 media/libvpx/libvpx/test/vp9_subtract_test.cc |   333 +-
 media/libvpx/libvpx/test/vp9_thread_test.cc   |   195 +-
 media/libvpx/libvpx/test/vpx_image_test.cc    |   128 +
 media/libvpx/libvpx/test/vpx_scale_test.cc    |   197 +-
 media/libvpx/libvpx/test/vpx_scale_test.h     |   201 +
 .../libvpx/test/vpx_temporal_svc_encoder.sh   |   146 +-
 media/libvpx/libvpx/test/vpxdec.sh            |    39 +-
 media/libvpx/libvpx/test/vpxenc.sh            |   196 +-
 media/libvpx/libvpx/test/webm_video_source.h  |    34 +-
 media/libvpx/libvpx/test/y4m_test.cc          |   105 +-
 media/libvpx/libvpx/test/y4m_video_source.h   |    45 +-
 .../libvpx/test/yuv_temporal_filter_test.cc   |   727 +
 media/libvpx/libvpx/test/yuv_video_source.h   |    41 +-
 .../third_party/googletest/README.libvpx      |    28 +-
 .../third_party/googletest/src/.clang-format  |     4 +
 .../libvpx/third_party/googletest/src/CHANGES |   157 -
 .../third_party/googletest/src/CONTRIBUTORS   |    28 +
 .../libvpx/third_party/googletest/src/README  |   435 -
 .../third_party/googletest/src/README.md      |   217 +
 .../include/gtest/gtest-assertion-result.h    |   237 +
 .../src/include/gtest/gtest-death-test.h      |   345 +
 .../src/include/gtest/gtest-matchers.h        |   956 +
 .../src/include/gtest/gtest-message.h         |   218 +
 .../src/include/gtest/gtest-param-test.h      |   510 +
 .../src/include/gtest/gtest-printers.h        |  1048 +
 .../googletest/src/include/gtest/gtest-spi.h  |   248 +
 .../src/include/gtest/gtest-test-part.h       |   190 +
 .../src/include/gtest/gtest-typed-test.h      |   331 +
 .../googletest/src/include/gtest/gtest.h      | 19368 +---------------
 .../src/include/gtest/gtest_pred_impl.h       |   279 +
 .../googletest/src/include/gtest/gtest_prod.h |    60 +
 .../include/gtest/internal/custom/README.md   |    44 +
 .../gtest/internal/custom/gtest-port.h        |    68 +
 .../gtest/internal/custom/gtest-printers.h    |    42 +
 .../src/include/gtest/internal/custom/gtest.h |    37 +
 .../internal/gtest-death-test-internal.h      |   306 +
 .../include/gtest/internal/gtest-filepath.h   |   210 +
 .../include/gtest/internal/gtest-internal.h   |  1570 ++
 .../include/gtest/internal/gtest-param-util.h |   956 +
 .../include/gtest/internal/gtest-port-arch.h  |   116 +
 .../src/include/gtest/internal/gtest-port.h   |  2413 ++
 .../src/include/gtest/internal/gtest-string.h |   177 +
 .../include/gtest/internal/gtest-type-util.h  |   186 +
 .../googletest/src/src/gtest-all.cc           |  9565 +-------
 .../src/src/gtest-assertion-result.cc         |    77 +
 .../googletest/src/src/gtest-death-test.cc    |  1620 ++
 .../googletest/src/src/gtest-filepath.cc      |   367 +
 .../googletest/src/src/gtest-internal-inl.h   |  1212 +
 .../googletest/src/src/gtest-matchers.cc      |    98 +
 .../googletest/src/src/gtest-port.cc          |  1394 ++
 .../googletest/src/src/gtest-printers.cc      |   553 +
 .../googletest/src/src/gtest-test-part.cc     |   105 +
 .../googletest/src/src/gtest-typed-test.cc    |   104 +
 .../third_party/googletest/src/src/gtest.cc   |  6795 ++++++
 .../googletest/src/src/gtest_main.cc          |    19 +-
 .../libvpx/third_party/libwebm/AUTHORS.TXT    |     1 +
 .../libvpx/third_party/libwebm/Android.mk     |     8 +-
 .../libvpx/third_party/libwebm/README.libvpx  |    16 +-
 .../third_party/libwebm/common/file_util.cc   |    19 +-
 .../third_party/libwebm/common/file_util.h    |     5 +-
 .../third_party/libwebm/common/hdr_util.cc    |    11 +-
 .../third_party/libwebm/common/hdr_util.h     |     2 +-
 .../third_party/libwebm/common/webmids.h      |     1 +
 .../third_party/libwebm/mkvmuxer/mkvmuxer.cc  |   234 +-
 .../third_party/libwebm/mkvmuxer/mkvmuxer.h   |    15 +-
 .../libwebm/mkvmuxer/mkvmuxerutil.cc          |    32 +-
 .../libwebm/mkvmuxer/mkvmuxerutil.h           |     7 +-
 .../third_party/libwebm/mkvmuxer/mkvwriter.cc |    12 +-
 .../libwebm/mkvparser/mkvparser.cc            |   225 +-
 .../third_party/libwebm/mkvparser/mkvparser.h |     6 +-
 .../libwebm/mkvparser/mkvreader.cc            |    12 +-
 .../libvpx/libvpx/third_party/libyuv/LICENSE  |    29 +
 .../libvpx/third_party/libyuv/README.libvpx   |    23 +-
 .../libyuv/include/libyuv/basic_types.h       |   109 +-
 .../libyuv/include/libyuv/compare.h           |    93 +-
 .../libyuv/include/libyuv/convert.h           |   421 +-
 .../libyuv/include/libyuv/convert_argb.h      |   676 +-
 .../libyuv/include/libyuv/convert_from.h      |   377 +-
 .../libyuv/include/libyuv/convert_from_argb.h |   283 +-
 .../libyuv/include/libyuv/cpu_id.h            |    75 +-
 .../libyuv/include/libyuv/macros_msa.h        |   233 +
 .../libyuv/include/libyuv/mjpeg_decoder.h     |    33 +-
 .../libyuv/include/libyuv/planar_functions.h  |  1248 +-
 .../libyuv/include/libyuv/rotate.h            |   143 +-
 .../libyuv/include/libyuv/rotate_argb.h       |    14 +-
 .../libyuv/include/libyuv/rotate_row.h        |   203 +-
 .../third_party/libyuv/include/libyuv/row.h   |  4065 +++-
 .../third_party/libyuv/include/libyuv/scale.h |   110 +-
 .../libyuv/include/libyuv/scale_argb.h        |    60 +-
 .../libyuv/include/libyuv/scale_row.h         |  1083 +-
 .../libyuv/include/libyuv/version.h           |     6 +-
 .../libyuv/include/libyuv/video_common.h      |    52 +-
 .../third_party/libyuv/source/compare.cc      |   267 +-
 .../libyuv/source/compare_common.cc           |    70 +-
 .../third_party/libyuv/source/compare_gcc.cc  |   427 +-
 .../third_party/libyuv/source/compare_msa.cc  |    97 +
 .../third_party/libyuv/source/compare_neon.cc |    94 +-
 .../libyuv/source/compare_neon64.cc           |    88 +-
 .../third_party/libyuv/source/compare_win.cc  |   119 +-
 .../third_party/libyuv/source/convert.cc      |   963 +-
 .../third_party/libyuv/source/convert_argb.cc |  1777 +-
 .../third_party/libyuv/source/convert_from.cc |  1165 +-
 .../libyuv/source/convert_from_argb.cc        |   839 +-
 .../third_party/libyuv/source/convert_jpeg.cc |   243 +-
 .../libyuv/source/convert_to_argb.cc          |   246 +-
 .../libyuv/source/convert_to_i420.cc          |   302 +-
 .../third_party/libyuv/source/cpu_id.cc       |   208 +-
 .../libyuv/source/mjpeg_decoder.cc            |   126 +-
 .../libyuv/source/mjpeg_validate.cc           |    11 +-
 .../libyuv/source/planar_functions.cc         |  1876 +-
 .../third_party/libyuv/source/rotate.cc       |   377 +-
 .../third_party/libyuv/source/rotate_any.cc   |    57 +-
 .../third_party/libyuv/source/rotate_argb.cc  |   163 +-
 .../libyuv/source/rotate_common.cc            |    40 +-
 .../third_party/libyuv/source/rotate_gcc.cc   |   660 +-
 .../third_party/libyuv/source/rotate_mips.cc  |   484 -
 .../third_party/libyuv/source/rotate_msa.cc   |   250 +
 .../third_party/libyuv/source/rotate_neon.cc  |   567 +-
 .../libyuv/source/rotate_neon64.cc            |   685 +-
 .../third_party/libyuv/source/rotate_win.cc   |    51 +-
 .../third_party/libyuv/source/row_any.cc      |   937 +-
 .../third_party/libyuv/source/row_common.cc   |  2514 +-
 .../third_party/libyuv/source/row_gcc.cc      |  9987 ++++----
 .../third_party/libyuv/source/row_mips.cc     |   782 -
 .../third_party/libyuv/source/row_msa.cc      |  3512 +++
 .../third_party/libyuv/source/row_neon.cc     |  4374 ++--
 .../third_party/libyuv/source/row_neon64.cc   |  4147 ++--
 .../third_party/libyuv/source/row_win.cc      |  3943 ++--
 .../libvpx/third_party/libyuv/source/scale.cc |   987 +-
 .../third_party/libyuv/source/scale_any.cc    |   489 +-
 .../third_party/libyuv/source/scale_argb.cc   |   573 +-
 .../third_party/libyuv/source/scale_common.cc |   808 +-
 .../third_party/libyuv/source/scale_gcc.cc    |  2280 +-
 .../third_party/libyuv/source/scale_mips.cc   |   644 -
 .../third_party/libyuv/source/scale_msa.cc    |   949 +
 .../third_party/libyuv/source/scale_neon.cc   |  1453 +-
 .../third_party/libyuv/source/scale_neon64.cc |  1582 +-
 .../third_party/libyuv/source/scale_win.cc    |   861 +-
 .../third_party/libyuv/source/video_common.cc |    51 +-
 .../libvpx/libvpx/third_party/nalloc/LICENSE  |    21 +
 .../libvpx/third_party/nalloc/README.libvpx   |    11 +
 .../libvpx/libvpx/third_party/nalloc/nalloc.h |   342 +
 .../libvpx/third_party/x86inc/README.libvpx   |     9 +-
 .../libvpx/third_party/x86inc/x86inc.asm      |   750 +-
 media/libvpx/libvpx/tools.mk                  |    25 +-
 .../3D-Reconstruction/MotionEST/Anandan.py    |   193 +
 .../3D-Reconstruction/MotionEST/Exhaust.py    |   259 +
 .../MotionEST/GroundTruth.py                  |    48 +
 .../MotionEST/HornSchunck.py                  |   212 +
 .../3D-Reconstruction/MotionEST/MotionEST.py  |   117 +
 .../MotionEST/SearchSmooth.py                 |   221 +
 .../tools/3D-Reconstruction/MotionEST/Util.py |    46 +
 .../tools/3D-Reconstruction/genY4M/genY4M.py  |    85 +
 .../sketch_3D_reconstruction/BVH.pde          |   163 +
 .../sketch_3D_reconstruction/Camera.pde       |   138 +
 .../sketch_3D_reconstruction/MotionField.pde  |   102 +
 .../sketch_3D_reconstruction/PointCloud.pde   |   138 +
 .../sketch_3D_reconstruction/Ray_Tracing.pde  |    61 +
 .../sketch_3D_reconstruction/Scene.pde        |    59 +
 .../sketch_3D_reconstruction/Transform.pde    |    82 +
 .../sketch_3D_reconstruction/Util.pde         |    28 +
 .../sketch_3D_reconstruction.pde              |    74 +
 media/libvpx/libvpx/tools/README.pgo.md       |    24 +
 media/libvpx/libvpx/tools/all_builds.py       |    72 -
 .../libvpx/tools/author_first_release.sh      |    15 -
 media/libvpx/libvpx/tools/cpplint.py          |  3428 ++-
 media/libvpx/libvpx/tools/diff.py             |     2 +-
 media/libvpx/libvpx/tools/ftfy.sh             |   158 -
 media/libvpx/libvpx/tools/intersect-diffs.py  |     4 +-
 media/libvpx/libvpx/tools/lint-hunks.py       |    38 +-
 .../tools/non_greedy_mv/non_greedy_mv.py      |   195 +
 media/libvpx/libvpx/tools/set_analyzer_env.sh |   135 +
 media/libvpx/libvpx/tools/tiny_ssim.c         |   623 +-
 media/libvpx/libvpx/tools/wrap-commit-msg.py  |     2 +-
 media/libvpx/libvpx/tools_common.c            |   369 +-
 media/libvpx/libvpx/tools_common.h            |    81 +-
 media/libvpx/libvpx/usage_cx.dox              |     2 +
 media/libvpx/libvpx/usage_dx.dox              |     2 +
 media/libvpx/libvpx/video_common.h            |     6 +-
 media/libvpx/libvpx/video_reader.c            |    32 +-
 media/libvpx/libvpx/video_reader.h            |     6 +-
 media/libvpx/libvpx/video_writer.c            |    14 +-
 media/libvpx/libvpx/video_writer.h            |     6 +-
 media/libvpx/libvpx/vp8/common/alloccommon.h  |     8 +-
 .../libvpx/vp8/common/arm/loopfilter_arm.c    |    22 +-
 .../libvpx/vp8/common/arm/loopfilter_arm.h    |    31 +
 .../common/arm/neon/bilinearpredict_neon.c    |    36 +-
 .../libvpx/vp8/common/arm/neon/copymem_neon.c |     2 +
 .../vp8/common/arm/neon/dequantizeb_neon.c    |     1 +
 .../vp8/common/arm/neon/idct_blk_neon.c       |   251 +-
 .../common/arm/neon/idct_dequant_0_2x_neon.c  |    59 -
 .../arm/neon/idct_dequant_full_2x_neon.c      |   182 -
 .../libvpx/vp8/common/arm/neon/iwalsh_neon.c  |     2 +
 .../loopfiltersimplehorizontaledge_neon.c     |     2 +
 .../neon/loopfiltersimpleverticaledge_neon.c  |     2 +
 .../vp8/common/arm/neon/mbloopfilter_neon.c   |     2 +
 .../vp8/common/arm/neon/sixtappredict_neon.c  |    46 +-
 .../vp8/common/arm/neon/vp8_loopfilter_neon.c |     2 +
 media/libvpx/libvpx/vp8/common/blockd.c       |    12 +-
 media/libvpx/libvpx/vp8/common/blockd.h       |    32 +-
 .../libvpx/vp8/common/coefupdateprobs.h       |     6 +-
 media/libvpx/libvpx/vp8/common/common.h       |    24 +-
 .../libvpx/vp8/common/default_coef_probs.h    |     8 +-
 media/libvpx/libvpx/vp8/common/entropy.c      |    20 +-
 media/libvpx/libvpx/vp8/common/entropy.h      |     6 +-
 media/libvpx/libvpx/vp8/common/entropymode.c  |    23 +-
 media/libvpx/libvpx/vp8/common/entropymode.h  |    10 +-
 media/libvpx/libvpx/vp8/common/entropymv.h    |     6 +-
 media/libvpx/libvpx/vp8/common/extend.c       |    54 +-
 media/libvpx/libvpx/vp8/common/extend.h       |     6 +-
 media/libvpx/libvpx/vp8/common/filter.h       |     6 +-
 media/libvpx/libvpx/vp8/common/findnearmv.c   |    32 +-
 media/libvpx/libvpx/vp8/common/findnearmv.h   |     8 +-
 .../vp8/common/generic/systemdependent.c      |    57 +-
 media/libvpx/libvpx/vp8/common/header.h       |     6 +-
 media/libvpx/libvpx/vp8/common/idct_blk.c     |    26 +-
 media/libvpx/libvpx/vp8/common/invtrans.h     |     6 +-
 .../libvpx/vp8/common/loongarch/idct_lsx.c    |   322 +
 .../common/loongarch/loopfilter_filters_lsx.c |   743 +
 .../vp8/common/loongarch/sixtap_filter_lsx.c  |  1904 ++
 media/libvpx/libvpx/vp8/common/loopfilter.h   |    12 +-
 .../libvpx/vp8/common/loopfilter_filters.c    |   124 +-
 media/libvpx/libvpx/vp8/common/mfqe.c         |    19 +-
 .../vp8/common/mips/dspr2/filter_dspr2.c      |    26 +-
 .../vp8/common/mips/dspr2/idct_blk_dspr2.c    |    20 +-
 .../mips/dspr2/vp8_loopfilter_filters_dspr2.c |    12 +-
 .../libvpx/vp8/common/mips/mmi/copymem_mmi.c  |   114 +
 .../vp8/common/mips/mmi/dequantize_mmi.c      |   115 +
 .../libvpx/vp8/common/mips/mmi/idct_blk_mmi.c |    70 +
 .../libvpx/vp8/common/mips/mmi/idctllm_mmi.c  |   335 +
 .../common/mips/mmi/loopfilter_filters_mmi.c  |  1415 ++
 .../vp8/common/mips/mmi/sixtap_filter_mmi.c   |   427 +
 .../libvpx/vp8/common/mips/msa/idct_msa.c     |    58 +-
 .../vp8/common/mips/msa/sixtap_filter_msa.c   |   181 +-
 .../vp8/common/mips/msa/vp8_macros_msa.h      |   262 +-
 media/libvpx/libvpx/vp8/common/modecont.c     |    36 +-
 media/libvpx/libvpx/vp8/common/modecont.h     |     6 +-
 media/libvpx/libvpx/vp8/common/mv.h           |     6 +-
 media/libvpx/libvpx/vp8/common/onyx.h         |    65 +-
 media/libvpx/libvpx/vp8/common/onyxc_int.h    |     7 +-
 media/libvpx/libvpx/vp8/common/onyxd.h        |    19 +-
 media/libvpx/libvpx/vp8/common/postproc.c     |   140 +-
 media/libvpx/libvpx/vp8/common/postproc.h     |    15 +-
 media/libvpx/libvpx/vp8/common/ppflags.h      |     6 +-
 media/libvpx/libvpx/vp8/common/quant_common.h |     6 +-
 media/libvpx/libvpx/vp8/common/reconinter.c   |     7 +
 media/libvpx/libvpx/vp8/common/reconinter.h   |    29 +-
 media/libvpx/libvpx/vp8/common/reconintra.c   |     8 +
 media/libvpx/libvpx/vp8/common/reconintra.h   |     6 +-
 .../libvpx/libvpx/vp8/common/reconintra4x4.c  |    16 +-
 .../libvpx/libvpx/vp8/common/reconintra4x4.h  |     8 +-
 media/libvpx/libvpx/vp8/common/rtcd.c         |     2 +-
 media/libvpx/libvpx/vp8/common/rtcd_defs.pl   |   148 +-
 .../libvpx/vp8/common/setupintrarecon.h       |     6 +-
 .../libvpx/libvpx/vp8/common/swapyv12buffer.h |     6 +-
 .../libvpx/vp8/common/systemdependent.h       |     6 +-
 media/libvpx/libvpx/vp8/common/threading.h    |   217 +-
 media/libvpx/libvpx/vp8/common/treecoder.c    |     9 +-
 media/libvpx/libvpx/vp8/common/treecoder.h    |     8 +-
 .../libvpx/vp8/common/vp8_entropymodedata.h   |     8 +-
 .../libvpx/libvpx/vp8/common/vp8_loopfilter.c |    17 +-
 .../libvpx/vp8/common/vp8_skin_detection.c    |   109 +
 .../libvpx/vp8/common/vp8_skin_detection.h    |    47 +
 .../vp8/common/x86/bilinear_filter_sse2.c     |   336 +
 .../libvpx/vp8/common/x86/dequantize_mmx.asm  |     5 +-
 .../libvpx/libvpx/vp8/common/x86/filter_x86.c |    29 -
 .../libvpx/libvpx/vp8/common/x86/filter_x86.h |    33 -
 .../libvpx/vp8/common/x86/idct_blk_sse2.c     |    24 +-
 .../libvpx/vp8/common/x86/idctllm_mmx.asm     |     5 +-
 .../libvpx/vp8/common/x86/idctllm_sse2.asm    |    10 +-
 .../libvpx/vp8/common/x86/iwalsh_sse2.asm     |     6 +-
 .../x86/loopfilter_block_sse2_x86_64.asm      |     6 +-
 .../libvpx/vp8/common/x86/loopfilter_sse2.asm |    22 +-
 .../libvpx/vp8/common/x86/loopfilter_x86.c    |     6 +-
 .../libvpx/vp8/common/x86/mfqe_sse2.asm       |     8 +-
 .../libvpx/vp8/common/x86/recon_mmx.asm       |     5 +-
 .../libvpx/vp8/common/x86/recon_sse2.asm      |     4 +-
 .../libvpx/vp8/common/x86/subpixel_mmx.asm    |   281 +-
 .../libvpx/vp8/common/x86/subpixel_sse2.asm   |   431 +-
 .../libvpx/vp8/common/x86/subpixel_ssse3.asm  |    17 +-
 .../libvpx/vp8/common/x86/vp8_asm_stubs.c     |    13 +-
 media/libvpx/libvpx/vp8/decoder/dboolhuff.c   |    10 +-
 media/libvpx/libvpx/vp8/decoder/dboolhuff.h   |    12 +-
 media/libvpx/libvpx/vp8/decoder/decodeframe.c |    69 +-
 media/libvpx/libvpx/vp8/decoder/decodemv.c    |    21 +-
 media/libvpx/libvpx/vp8/decoder/decodemv.h    |     6 +-
 .../libvpx/vp8/decoder/decoderthreading.h     |     8 +-
 media/libvpx/libvpx/vp8/decoder/detokenize.c  |     6 +-
 media/libvpx/libvpx/vp8/decoder/detokenize.h  |     6 +-
 media/libvpx/libvpx/vp8/decoder/ec_types.h    |    10 +-
 .../libvpx/vp8/decoder/error_concealment.c    |    10 +-
 .../libvpx/vp8/decoder/error_concealment.h    |     6 +-
 media/libvpx/libvpx/vp8/decoder/onyxd_if.c    |    43 +-
 media/libvpx/libvpx/vp8/decoder/onyxd_int.h   |    54 +-
 media/libvpx/libvpx/vp8/decoder/threading.c   |   198 +-
 media/libvpx/libvpx/vp8/decoder/treereader.h  |     8 +-
 .../vp8/encoder/arm/neon/fastquantizeb_neon.c |    12 +-
 .../vp8/encoder/arm/neon/shortfdct_neon.c     |     2 +
 .../encoder/arm/neon/vp8_shortwalsh4x4_neon.c |     2 +
 media/libvpx/libvpx/vp8/encoder/bitstream.c   |   190 +-
 media/libvpx/libvpx/vp8/encoder/bitstream.h   |    14 +-
 media/libvpx/libvpx/vp8/encoder/block.h       |     6 +-
 media/libvpx/libvpx/vp8/encoder/boolhuff.c    |    26 +-
 media/libvpx/libvpx/vp8/encoder/boolhuff.h    |    70 +-
 .../libvpx/vp8/{common => encoder}/copy_c.c   |     0
 .../libvpx/vp8/encoder/dct_value_cost.h       |     6 +-
 .../libvpx/vp8/encoder/dct_value_tokens.h     |     6 +-
 .../libvpx/vp8/encoder/defaultcoefcounts.h    |     6 +-
 media/libvpx/libvpx/vp8/encoder/denoising.c   |    49 +-
 media/libvpx/libvpx/vp8/encoder/denoising.h   |     6 +-
 media/libvpx/libvpx/vp8/encoder/encodeframe.c |   151 +-
 media/libvpx/libvpx/vp8/encoder/encodeframe.h |    31 +-
 media/libvpx/libvpx/vp8/encoder/encodeintra.c |     3 +-
 media/libvpx/libvpx/vp8/encoder/encodeintra.h |     8 +-
 media/libvpx/libvpx/vp8/encoder/encodemb.c    |    12 +-
 media/libvpx/libvpx/vp8/encoder/encodemb.h    |     6 +-
 media/libvpx/libvpx/vp8/encoder/encodemv.c    |    36 +-
 media/libvpx/libvpx/vp8/encoder/encodemv.h    |     6 +-
 media/libvpx/libvpx/vp8/encoder/ethreading.c  |   154 +-
 media/libvpx/libvpx/vp8/encoder/ethreading.h  |    32 +
 media/libvpx/libvpx/vp8/encoder/firstpass.c   |   274 +-
 media/libvpx/libvpx/vp8/encoder/firstpass.h   |     6 +-
 media/libvpx/libvpx/vp8/encoder/lookahead.c   |     4 +-
 media/libvpx/libvpx/vp8/encoder/lookahead.h   |     8 +-
 .../libvpx/vp8/encoder/loongarch/dct_lsx.c    |   161 +
 .../vp8/encoder/loongarch/encodeopt_lsx.c     |    82 +
 .../vp8/encoder/loongarch/vp8_quantize_lsx.c  |   145 +
 media/libvpx/libvpx/vp8/encoder/mcomp.c       |   505 +-
 media/libvpx/libvpx/vp8/encoder/mcomp.h       |    42 +-
 .../libvpx/vp8/encoder/mips/mmi/dct_mmi.c     |   434 +
 .../vp8/encoder/mips/mmi/vp8_quantize_mmi.c   |   263 +
 media/libvpx/libvpx/vp8/encoder/modecosts.h   |     8 +-
 media/libvpx/libvpx/vp8/encoder/mr_dissim.c   |     2 +-
 media/libvpx/libvpx/vp8/encoder/mr_dissim.h   |     6 +-
 media/libvpx/libvpx/vp8/encoder/onyx_if.c     |   991 +-
 media/libvpx/libvpx/vp8/encoder/onyx_int.h    |    96 +-
 media/libvpx/libvpx/vp8/encoder/pickinter.c   |   172 +-
 media/libvpx/libvpx/vp8/encoder/pickinter.h   |     6 +-
 media/libvpx/libvpx/vp8/encoder/picklpf.c     |    11 +-
 media/libvpx/libvpx/vp8/encoder/picklpf.h     |    30 +
 media/libvpx/libvpx/vp8/encoder/quantize.h    |     6 +-
 media/libvpx/libvpx/vp8/encoder/ratectrl.c    |   260 +-
 media/libvpx/libvpx/vp8/encoder/ratectrl.h    |     6 +-
 media/libvpx/libvpx/vp8/encoder/rdopt.c       |   157 +-
 media/libvpx/libvpx/vp8/encoder/rdopt.h       |    27 +-
 .../libvpx/libvpx/vp8/encoder/segmentation.c  |     4 +-
 .../libvpx/libvpx/vp8/encoder/segmentation.h  |    10 +-
 .../libvpx/vp8/encoder/temporal_filter.c      |     4 +-
 .../libvpx/vp8/encoder/temporal_filter.h      |    26 +
 media/libvpx/libvpx/vp8/encoder/tokenize.c    |    70 -
 media/libvpx/libvpx/vp8/encoder/tokenize.h    |    16 +-
 media/libvpx/libvpx/vp8/encoder/treewriter.h  |    22 +-
 .../libvpx/libvpx/vp8/encoder/vp8_quantize.c  |    13 +-
 .../{encodeopt.asm => block_error_sse2.asm}   |     8 +-
 .../vp8/{common => encoder}/x86/copy_sse2.asm |     3 +-
 .../vp8/{common => encoder}/x86/copy_sse3.asm |     3 +-
 .../libvpx/vp8/encoder/x86/dct_sse2.asm       |     6 +-
 .../libvpx/vp8/encoder/x86/denoising_sse2.c   |     2 +-
 .../libvpx/vp8/encoder/x86/fwalsh_sse2.asm    |     4 +-
 .../libvpx/vp8/encoder/x86/quantize_mmx.asm   |   286 -
 .../libvpx/vp8/encoder/x86/quantize_sse4.c    |   121 +-
 .../x86/temporal_filter_apply_sse2.asm        |     6 +-
 .../vp8/encoder/x86/vp8_enc_stubs_mmx.c       |    34 -
 ...{quantize_ssse3.c => vp8_quantize_ssse3.c} |    40 +-
 media/libvpx/libvpx/vp8/vp8_common.mk         |    28 +-
 media/libvpx/libvpx/vp8/vp8_cx_iface.c        |   368 +-
 media/libvpx/libvpx/vp8/vp8_dx_iface.c        |   174 +-
 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc   |   440 +
 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h    |    66 +
 media/libvpx/libvpx/vp8/vp8cx.mk              |    23 +-
 .../arm/neon/vp9_highbd_iht16x16_add_neon.c   |   446 +
 .../arm/neon/vp9_highbd_iht4x4_add_neon.c     |   181 +
 .../arm/neon/vp9_highbd_iht8x8_add_neon.c     |   345 +
 .../common/arm/neon/vp9_iht16x16_add_neon.c   |   279 +
 .../vp9/common/arm/neon/vp9_iht4x4_add_neon.c |   238 +-
 .../vp9/common/arm/neon/vp9_iht8x8_add_neon.c |   542 +-
 .../libvpx/vp9/common/arm/neon/vp9_iht_neon.h |   272 +
 .../vp9/common/mips/msa/vp9_idct16x16_msa.c   |     1 +
 .../vp9/common/mips/msa/vp9_idct4x4_msa.c     |     1 +
 .../vp9/common/mips/msa/vp9_idct8x8_msa.c     |     1 +
 .../libvpx/vp9/common/ppc/vp9_idct_vsx.c      |   116 +
 .../libvpx/vp9/common/vp9_alloccommon.c       |    74 +-
 .../libvpx/vp9/common/vp9_alloccommon.h       |    13 +-
 media/libvpx/libvpx/vp9/common/vp9_blockd.c   |     5 +-
 media/libvpx/libvpx/vp9/common/vp9_blockd.h   |    46 +-
 media/libvpx/libvpx/vp9/common/vp9_common.h   |    39 +-
 .../libvpx/vp9/common/vp9_common_data.c       |     2 +-
 .../libvpx/vp9/common/vp9_common_data.h       |     6 +-
 .../libvpx/libvpx/vp9/common/vp9_debugmodes.c |     2 +-
 media/libvpx/libvpx/vp9/common/vp9_entropy.c  |     2 +
 media/libvpx/libvpx/vp9/common/vp9_entropy.h  |     7 +-
 .../libvpx/vp9/common/vp9_entropymode.c       |    24 +-
 .../libvpx/vp9/common/vp9_entropymode.h       |     8 +-
 .../libvpx/libvpx/vp9/common/vp9_entropymv.c  |     4 +-
 .../libvpx/libvpx/vp9/common/vp9_entropymv.h  |    10 +-
 media/libvpx/libvpx/vp9/common/vp9_enums.h    |     8 +-
 media/libvpx/libvpx/vp9/common/vp9_filter.c   |    18 +-
 media/libvpx/libvpx/vp9/common/vp9_filter.h   |     9 +-
 .../libvpx/vp9/common/vp9_frame_buffers.c     |    14 +-
 .../libvpx/vp9/common/vp9_frame_buffers.h     |     6 +-
 media/libvpx/libvpx/vp9/common/vp9_idct.c     |    33 +-
 media/libvpx/libvpx/vp9/common/vp9_idct.h     |    22 +-
 .../libvpx/libvpx/vp9/common/vp9_loopfilter.c |    70 +-
 .../libvpx/libvpx/vp9/common/vp9_loopfilter.h |    14 +-
 media/libvpx/libvpx/vp9/common/vp9_mfqe.c     |     2 +-
 media/libvpx/libvpx/vp9/common/vp9_mfqe.h     |     6 +-
 media/libvpx/libvpx/vp9/common/vp9_mv.h       |     8 +-
 .../libvpx/vp9/common/vp9_mvref_common.h      |    10 +-
 .../libvpx/libvpx/vp9/common/vp9_onyxc_int.h  |   125 +-
 media/libvpx/libvpx/vp9/common/vp9_postproc.c |    58 +-
 media/libvpx/libvpx/vp9/common/vp9_postproc.h |    17 +-
 media/libvpx/libvpx/vp9/common/vp9_ppflags.h  |     6 +-
 .../libvpx/vp9/common/vp9_pred_common.c       |    31 +-
 .../libvpx/vp9/common/vp9_pred_common.h       |    16 +-
 .../libvpx/vp9/common/vp9_quant_common.h      |     6 +-
 .../libvpx/libvpx/vp9/common/vp9_reconinter.c |    47 +-
 .../libvpx/libvpx/vp9/common/vp9_reconinter.h |    33 +-
 .../libvpx/libvpx/vp9/common/vp9_reconintra.h |     6 +-
 media/libvpx/libvpx/vp9/common/vp9_rtcd.c     |     6 +-
 .../libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl |   221 +-
 media/libvpx/libvpx/vp9/common/vp9_scale.h    |    10 +-
 media/libvpx/libvpx/vp9/common/vp9_scan.c     |   297 +-
 media/libvpx/libvpx/vp9/common/vp9_scan.h     |    18 +-
 .../libvpx/libvpx/vp9/common/vp9_seg_common.h |    11 +-
 .../libvpx/vp9/common/vp9_thread_common.c     |   282 +-
 .../libvpx/vp9/common/vp9_thread_common.h     |    33 +-
 .../libvpx/vp9/common/vp9_tile_common.h       |     6 +-
 .../common/x86/vp9_highbd_iht16x16_add_sse4.c |   419 +
 .../common/x86/vp9_highbd_iht4x4_add_sse4.c   |   131 +
 .../common/x86/vp9_highbd_iht8x8_add_sse4.c   |   255 +
 .../vp9/common/x86/vp9_idct_intrin_sse2.c     |   181 +-
 .../libvpx/vp9/common/x86/vp9_mfqe_sse2.asm   |     8 +-
 .../libvpx/vp9/decoder/vp9_decodeframe.c      |  1281 +-
 .../libvpx/vp9/decoder/vp9_decodeframe.h      |     6 +-
 .../libvpx/libvpx/vp9/decoder/vp9_decodemv.c  |   150 +-
 .../libvpx/libvpx/vp9/decoder/vp9_decodemv.h  |     6 +-
 media/libvpx/libvpx/vp9/decoder/vp9_decoder.c |   258 +-
 media/libvpx/libvpx/vp9/decoder/vp9_decoder.h |    75 +-
 .../libvpx/vp9/decoder/vp9_detokenize.c       |    64 +-
 .../libvpx/vp9/decoder/vp9_detokenize.h       |    11 +-
 media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h |     6 +-
 media/libvpx/libvpx/vp9/decoder/vp9_dthread.c |   190 -
 media/libvpx/libvpx/vp9/decoder/vp9_dthread.h |    74 -
 .../libvpx/libvpx/vp9/decoder/vp9_job_queue.c |   125 +
 .../libvpx/libvpx/vp9/decoder/vp9_job_queue.h |    45 +
 .../vp9/encoder/arm/neon/vp9_dct_neon.c       |  2172 +-
 .../vp9/encoder/arm/neon/vp9_denoiser_neon.c  |   356 +
 .../arm/neon/vp9_diamond_search_sad_neon.c    |   296 +
 .../vp9/encoder/arm/neon/vp9_error_neon.c     |    99 +-
 .../vp9/encoder/arm/neon/vp9_error_sve.c      |    78 +
 .../encoder/arm/neon/vp9_frame_scale_neon.c   |   844 +
 .../encoder/arm/neon/vp9_highbd_error_neon.c  |    49 +
 .../neon/vp9_highbd_temporal_filter_neon.c    |  1076 +
 .../neon/vp9_highbd_temporal_filter_sve2.c    |   285 +
 .../vp9/encoder/arm/neon/vp9_quantize_neon.c  |   470 +-
 .../arm/neon/vp9_temporal_filter_neon.c       |  1103 +
 .../neon/vp9_temporal_filter_neon_dotprod.c   |   367 +
 .../arm/neon/vp9_temporal_filter_neon_i8mm.c  |   351 +
 .../vp9/encoder/mips/msa/vp9_error_msa.c      |     3 +
 .../vp9/encoder/mips/msa/vp9_fdct16x16_msa.c  |     1 +
 .../vp9/encoder/mips/msa/vp9_fdct4x4_msa.c    |     1 +
 .../vp9/encoder/mips/msa/vp9_fdct8x8_msa.c    |     1 +
 .../vp9/encoder/mips/msa/vp9_fdct_msa.h       |     6 +-
 .../mips/msa/vp9_temporal_filter_msa.c        |   283 -
 .../libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c |   287 +
 .../libvpx/vp9/encoder/vp9_alt_ref_aq.c       |     2 +-
 .../libvpx/vp9/encoder/vp9_alt_ref_aq.h       |     8 +-
 media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h  |     6 +-
 .../libvpx/vp9/encoder/vp9_aq_complexity.c    |     2 +-
 .../libvpx/vp9/encoder/vp9_aq_complexity.h    |     6 +-
 .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c |   384 +-
 .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h |    25 +-
 .../libvpx/vp9/encoder/vp9_aq_variance.c      |    79 +-
 .../libvpx/vp9/encoder/vp9_aq_variance.h      |    10 +-
 .../libvpx/libvpx/vp9/encoder/vp9_bitstream.c |   309 +-
 .../libvpx/libvpx/vp9/encoder/vp9_bitstream.h |    21 +-
 media/libvpx/libvpx/vp9/encoder/vp9_block.h   |    88 +-
 .../libvpx/vp9/encoder/vp9_blockiness.c       |     1 +
 .../libvpx/vp9/encoder/vp9_blockiness.h       |    26 +
 .../libvpx/vp9/encoder/vp9_context_tree.c     |    49 +-
 .../libvpx/vp9/encoder/vp9_context_tree.h     |    18 +-
 media/libvpx/libvpx/vp9/encoder/vp9_cost.h    |    11 +-
 media/libvpx/libvpx/vp9/encoder/vp9_dct.c     |   108 -
 .../libvpx/libvpx/vp9/encoder/vp9_denoiser.c  |   480 +-
 .../libvpx/libvpx/vp9/encoder/vp9_denoiser.h  |    60 +-
 .../libvpx/vp9/encoder/vp9_encodeframe.c      |  3379 ++-
 .../libvpx/vp9/encoder/vp9_encodeframe.h      |    24 +-
 .../libvpx/libvpx/vp9/encoder/vp9_encodemb.c  |   906 +-
 .../libvpx/libvpx/vp9/encoder/vp9_encodemb.h  |    21 +-
 .../libvpx/libvpx/vp9/encoder/vp9_encodemv.h  |     8 +-
 media/libvpx/libvpx/vp9/encoder/vp9_encoder.c |  4301 ++--
 media/libvpx/libvpx/vp9/encoder/vp9_encoder.h |   746 +-
 media/libvpx/libvpx/vp9/encoder/vp9_ethread.c |   661 +-
 media/libvpx/libvpx/vp9/encoder/vp9_ethread.h |    52 +-
 .../libvpx/vp9/encoder/vp9_ext_ratectrl.c     |   257 +
 .../libvpx/vp9/encoder/vp9_ext_ratectrl.h     |    63 +
 media/libvpx/libvpx/vp9/encoder/vp9_extend.c  |    61 +-
 media/libvpx/libvpx/vp9/encoder/vp9_extend.h  |     9 +-
 .../libvpx/libvpx/vp9/encoder/vp9_firstpass.c |  3824 +--
 .../libvpx/libvpx/vp9/encoder/vp9_firstpass.h |   202 +-
 .../libvpx/vp9/encoder/vp9_firstpass_stats.h  |    54 +
 .../libvpx/vp9/encoder/vp9_frame_scale.c      |   136 +
 .../libvpx/libvpx/vp9/encoder/vp9_job_queue.h |    46 +
 .../libvpx/libvpx/vp9/encoder/vp9_lookahead.c |   126 +-
 .../libvpx/libvpx/vp9/encoder/vp9_lookahead.h |    47 +-
 media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c |    53 +-
 media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h |    10 +-
 media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c   |  1554 +-
 media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h   |   110 +-
 .../libvpx/vp9/encoder/vp9_multi_thread.c     |   342 +
 .../libvpx/vp9/encoder/vp9_multi_thread.h     |    41 +
 .../libvpx/vp9/encoder/vp9_noise_estimate.c   |   211 +-
 .../libvpx/vp9/encoder/vp9_noise_estimate.h   |     9 +-
 .../libvpx/vp9/encoder/vp9_non_greedy_mv.c    |   536 +
 .../libvpx/vp9/encoder/vp9_non_greedy_mv.h    |   129 +
 .../libvpx/vp9/encoder/vp9_partition_models.h |   975 +
 media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c |    33 +-
 media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h |     6 +-
 .../libvpx/libvpx/vp9/encoder/vp9_pickmode.c  |  1592 +-
 .../libvpx/libvpx/vp9/encoder/vp9_pickmode.h  |     6 +-
 .../libvpx/libvpx/vp9/encoder/vp9_quantize.c  |   241 +-
 .../libvpx/libvpx/vp9/encoder/vp9_quantize.h  |    11 +-
 .../libvpx/libvpx/vp9/encoder/vp9_ratectrl.c  |  1962 +-
 .../libvpx/libvpx/vp9/encoder/vp9_ratectrl.h  |    90 +-
 media/libvpx/libvpx/vp9/encoder/vp9_rd.c      |   373 +-
 media/libvpx/libvpx/vp9/encoder/vp9_rd.h      |    64 +-
 media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c   |  1457 +-
 media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h   |    10 +-
 media/libvpx/libvpx/vp9/encoder/vp9_resize.c  |   102 +-
 media/libvpx/libvpx/vp9/encoder/vp9_resize.h  |    39 +-
 .../libvpx/vp9/encoder/vp9_segmentation.c     |    56 +-
 .../libvpx/vp9/encoder/vp9_segmentation.h     |    11 +-
 .../libvpx/vp9/encoder/vp9_skin_detection.c   |   233 +-
 .../libvpx/vp9/encoder/vp9_skin_detection.h   |    19 +-
 .../libvpx/vp9/encoder/vp9_speed_features.c   |   742 +-
 .../libvpx/vp9/encoder/vp9_speed_features.h   |   266 +-
 media/libvpx/libvpx/vp9/encoder/vp9_subexp.c  |    43 +-
 media/libvpx/libvpx/vp9/encoder/vp9_subexp.h  |    19 +-
 .../libvpx/vp9/encoder/vp9_svc_layercontext.c |   978 +-
 .../libvpx/vp9/encoder/vp9_svc_layercontext.h |   172 +-
 .../libvpx/vp9/encoder/vp9_temporal_filter.c  |  1513 +-
 .../libvpx/vp9/encoder/vp9_temporal_filter.h  |    54 +-
 .../encoder/vp9_temporal_filter_constants.h   |   410 +
 .../libvpx/libvpx/vp9/encoder/vp9_tokenize.c  |     8 +-
 .../libvpx/libvpx/vp9/encoder/vp9_tokenize.h  |    25 +-
 .../libvpx/libvpx/vp9/encoder/vp9_tpl_model.c |  1791 ++
 .../libvpx/libvpx/vp9/encoder/vp9_tpl_model.h |    47 +
 .../libvpx/vp9/encoder/vp9_treewriter.h       |     6 +-
 .../encoder/x86/highbd_temporal_filter_avx2.c |   263 +
 .../encoder/x86/highbd_temporal_filter_sse4.c |   893 +
 .../x86/highbd_temporal_filter_ssse3.c        |   233 +
 .../vp9/encoder/x86/temporal_filter_avx2.c    |   441 +
 .../vp9/encoder/x86/temporal_filter_sse4.c    |   875 +
 .../vp9/encoder/x86/temporal_filter_ssse3.c   |   279 +
 .../vp9/encoder/x86/vp9_dct_intrin_sse2.c     |   550 +-
 .../libvpx/vp9/encoder/x86/vp9_dct_sse2.asm   |    23 +-
 .../libvpx/vp9/encoder/x86/vp9_dct_ssse3.c    |   467 -
 .../vp9/encoder/x86/vp9_denoiser_sse2.c       |     1 -
 .../encoder/x86/vp9_diamond_search_sad_avx.c  |   310 -
 .../libvpx/vp9/encoder/x86/vp9_error_avx2.c   |   161 +
 .../vp9/encoder/x86/vp9_error_intrin_avx2.c   |    71 -
 .../libvpx/vp9/encoder/x86/vp9_error_sse2.asm |    63 +-
 .../vp9/encoder/x86/vp9_frame_scale_ssse3.c   |  1037 +-
 .../x86/vp9_highbd_block_error_intrin_sse2.c  |    19 +-
 .../vp9/encoder/x86/vp9_highbd_error_avx.asm  |   261 -
 .../vp9/encoder/x86/vp9_highbd_error_sse2.asm |    98 -
 .../vp9/encoder/x86/vp9_quantize_avx2.c       |   439 +
 .../vp9/encoder/x86/vp9_quantize_sse2.c       |   272 +-
 .../vp9/encoder/x86/vp9_quantize_ssse3.c      |   252 +
 .../encoder/x86/vp9_quantize_ssse3_x86_64.asm |   201 -
 .../x86/vp9_temporal_filter_apply_sse2.asm    |   212 -
 media/libvpx/libvpx/vp9/ratectrl_rtc.cc       |   354 +
 media/libvpx/libvpx/vp9/ratectrl_rtc.h        |   106 +
 media/libvpx/libvpx/vp9/vp9_common.mk         |    45 +-
 media/libvpx/libvpx/vp9/vp9_cx_iface.c        |  1514 +-
 media/libvpx/libvpx/vp9/vp9_cx_iface.h        |    49 +
 media/libvpx/libvpx/vp9/vp9_dx_iface.c        |   679 +-
 media/libvpx/libvpx/vp9/vp9_dx_iface.h        |    33 +-
 media/libvpx/libvpx/vp9/vp9_iface_common.c    |   136 +
 media/libvpx/libvpx/vp9/vp9_iface_common.h    |   143 +-
 media/libvpx/libvpx/vp9/vp9cx.mk              |    88 +-
 media/libvpx/libvpx/vp9/vp9dx.mk              |     4 +-
 media/libvpx/libvpx/vpx/exports_spatial_svc   |     6 -
 .../libvpx/vpx/internal/vpx_codec_internal.h  |    77 +-
 .../libvpx/vpx/internal/vpx_ratectrl_rtc.h    |    80 +
 media/libvpx/libvpx/vpx/src/vpx_codec.c       |     9 +-
 media/libvpx/libvpx/vpx/src/vpx_decoder.c     |    22 +-
 media/libvpx/libvpx/vpx/src/vpx_encoder.c     |    80 +-
 media/libvpx/libvpx/vpx/src/vpx_image.c       |   131 +-
 media/libvpx/libvpx/vpx/vp8.h                 |    27 +-
 media/libvpx/libvpx/vpx/vp8cx.h               |   498 +-
 media/libvpx/libvpx/vpx/vp8dx.h               |    68 +-
 media/libvpx/libvpx/vpx/vpx_codec.h           |    96 +-
 media/libvpx/libvpx/vpx/vpx_codec.mk          |     9 +-
 media/libvpx/libvpx/vpx/vpx_decoder.h         |    72 +-
 media/libvpx/libvpx/vpx/vpx_encoder.h         |   342 +-
 media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h    |   605 +
 media/libvpx/libvpx/vpx/vpx_frame_buffer.h    |    14 +-
 media/libvpx/libvpx/vpx/vpx_image.h           |   104 +-
 media/libvpx/libvpx/vpx/vpx_integer.h         |    43 +-
 media/libvpx/libvpx/vpx/vpx_tpl.h             |    69 +
 media/libvpx/libvpx/vpx_dsp/add_noise.c       |     3 +
 media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c    |   191 +-
 .../libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c |    65 +
 .../libvpx/libvpx/vpx_dsp/arm/deblock_neon.c  |   104 +-
 .../libvpx/vpx_dsp/arm/fdct16x16_neon.c       |   439 +
 .../libvpx/vpx_dsp/arm/fdct16x16_neon.h       |   318 +
 .../libvpx/vpx_dsp/arm/fdct32x32_neon.c       |   419 +
 .../libvpx/vpx_dsp/arm/fdct32x32_neon.h       |  2919 +++
 .../libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c  |    85 +
 .../libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h  |   105 +
 .../libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c  |   143 +
 .../libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h  |   307 +
 media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h   |   542 +
 .../libvpx/vpx_dsp/arm/fdct_partial_neon.c    |   183 +
 .../libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c |   220 -
 .../libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c |    85 +-
 .../libvpx/vpx_dsp/arm/highbd_avg_neon.c      |   140 +
 .../libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c |    64 +
 .../vpx_dsp/arm/highbd_convolve8_neon.h       |    46 +
 .../libvpx/vpx_dsp/arm/highbd_convolve8_sve.h |    99 +
 .../libvpx/vpx_dsp/arm/highbd_hadamard_neon.c |   215 +
 .../vpx_dsp/arm/highbd_idct16x16_add_neon.c   |  1361 ++
 .../arm/highbd_idct32x32_1024_add_neon.c      |   640 +
 .../arm/highbd_idct32x32_135_add_neon.c       |   757 +
 .../arm/highbd_idct32x32_34_add_neon.c        |   625 +
 .../vpx_dsp/arm/highbd_idct32x32_add_neon.c   |    88 +
 .../vpx_dsp/arm/highbd_idct4x4_add_neon.c     |   142 +-
 .../vpx_dsp/arm/highbd_idct8x8_add_neon.c     |   625 +-
 .../libvpx/vpx_dsp/arm/highbd_idct_neon.h     |   474 +
 .../vpx_dsp/arm/highbd_intrapred_neon.c       |  1886 +-
 .../vpx_dsp/arm/highbd_loopfilter_neon.c      |    15 +
 .../libvpx/vpx_dsp/arm/highbd_quantize_neon.c |   300 +
 .../libvpx/vpx_dsp/arm/highbd_sad4d_neon.c    |   273 +
 .../libvpx/vpx_dsp/arm/highbd_sad_neon.c      |   452 +
 .../libvpx/vpx_dsp/arm/highbd_sse_neon.c      |   238 +
 .../vpx_dsp/arm/highbd_subpel_variance_neon.c |   586 +
 .../libvpx/vpx_dsp/arm/highbd_variance_neon.c |   436 +
 .../arm/highbd_variance_neon_dotprod.c        |    96 +
 .../libvpx/vpx_dsp/arm/highbd_variance_sve.c  |   344 +
 .../vpx_dsp/arm/highbd_vpx_convolve8_neon.c   |  1940 +-
 .../vpx_dsp/arm/highbd_vpx_convolve8_sve.c    |   271 +
 .../vpx_dsp/arm/highbd_vpx_convolve8_sve2.c   |   660 +
 .../arm/highbd_vpx_convolve_avg_neon.c        |    20 +-
 .../arm/highbd_vpx_convolve_copy_neon.c       |   124 +-
 .../vpx_dsp/arm/highbd_vpx_convolve_neon.c    |    65 -
 .../vpx_dsp/arm/idct16x16_1_add_neon.asm      |   196 -
 .../libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c |     3 +-
 .../libvpx/vpx_dsp/arm/idct16x16_add_neon.asm |  1176 -
 .../libvpx/vpx_dsp/arm/idct16x16_add_neon.c   |  1932 +-
 .../libvpx/vpx_dsp/arm/idct16x16_neon.c       |   160 -
 .../vpx_dsp/arm/idct32x32_135_add_neon.c      |   908 +-
 .../libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c |     3 +-
 .../vpx_dsp/arm/idct32x32_34_add_neon.c       |   683 +-
 .../libvpx/vpx_dsp/arm/idct32x32_add_neon.c   |   967 +-
 .../libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c   |     4 +-
 .../libvpx/vpx_dsp/arm/idct4x4_add_neon.asm   |     2 +-
 .../libvpx/vpx_dsp/arm/idct4x4_add_neon.c     |    57 +-
 .../libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm |    86 -
 .../libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c   |     3 +-
 .../libvpx/vpx_dsp/arm/idct8x8_add_neon.asm   |   507 -
 .../libvpx/vpx_dsp/arm/idct8x8_add_neon.c     |   113 +-
 media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h   |  1008 +-
 .../libvpx/vpx_dsp/arm/intrapred_neon.c       |  1345 +-
 .../libvpx/vpx_dsp/arm/loopfilter_8_neon.asm  |     2 +-
 .../libvpx/vpx_dsp/arm/loopfilter_neon.c      |    17 +-
 media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h    |   757 +
 .../libvpx/libvpx/vpx_dsp/arm/quantize_neon.c |   286 +
 media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c  |   372 +-
 .../libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c   |   176 +
 media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c    |   516 +-
 .../libvpx/vpx_dsp/arm/sad_neon_dotprod.c     |   247 +
 media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c    |   188 +
 .../libvpx/vpx_dsp/arm/sse_neon_dotprod.c     |   197 +
 .../libvpx/vpx_dsp/arm/subpel_variance_neon.c |   556 +-
 .../libvpx/libvpx/vpx_dsp/arm/subtract_neon.c |   140 +-
 media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h    |   275 +
 .../libvpx/vpx_dsp/arm/sum_squares_neon.c     |   100 +
 .../libvpx/vpx_dsp/arm/sum_squares_sve.c      |    73 +
 .../libvpx/vpx_dsp/arm/transpose_neon.h       |   685 +-
 .../libvpx/libvpx/vpx_dsp/arm/variance_neon.c |   655 +-
 .../vpx_dsp/arm/variance_neon_dotprod.c       |   298 +
 ..._convolve8_avg_horiz_filter_type1_neon.asm |   438 +
 ..._convolve8_avg_horiz_filter_type2_neon.asm |   439 +
 .../arm/vpx_convolve8_avg_neon_asm.asm        |   292 -
 ...x_convolve8_avg_vert_filter_type1_neon.asm |   486 +
 ...x_convolve8_avg_vert_filter_type2_neon.asm |   487 +
 .../vpx_convolve8_horiz_filter_type1_neon.asm |   415 +
 .../vpx_convolve8_horiz_filter_type2_neon.asm |   415 +
 .../libvpx/vpx_dsp/arm/vpx_convolve8_neon.c   |  1356 +-
 .../libvpx/vpx_dsp/arm/vpx_convolve8_neon.h   |   172 +
 .../vpx_dsp/arm/vpx_convolve8_neon_asm.asm    |   270 -
 .../vpx_dsp/arm/vpx_convolve8_neon_asm.c      |    41 +
 .../vpx_dsp/arm/vpx_convolve8_neon_asm.h      |    29 +
 .../vpx_dsp/arm/vpx_convolve8_neon_dotprod.c  |  1024 +
 .../vpx_dsp/arm/vpx_convolve8_neon_i8mm.c     |   943 +
 .../vpx_convolve8_vert_filter_type1_neon.asm  |   457 +
 .../vpx_convolve8_vert_filter_type2_neon.asm  |   455 +
 .../vpx_dsp/arm/vpx_convolve_avg_neon.c       |    22 +-
 .../vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm |     2 +-
 .../vpx_dsp/arm/vpx_convolve_copy_neon.c      |    26 +-
 .../arm/vpx_convolve_copy_neon_asm.asm        |     2 +-
 .../libvpx/vpx_dsp/arm/vpx_convolve_neon.c    |    68 +-
 .../libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h |    46 +
 .../libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h  |    56 +
 .../vpx_dsp/arm/vpx_scaled_convolve8_neon.c   |   316 +
 media/libvpx/libvpx/vpx_dsp/avg.c             |   248 +-
 media/libvpx/libvpx/vpx_dsp/bitreader.h       |    37 +-
 .../libvpx/libvpx/vpx_dsp/bitreader_buffer.c  |     8 +-
 .../libvpx/libvpx/vpx_dsp/bitreader_buffer.h  |     6 +-
 media/libvpx/libvpx/vpx_dsp/bitwriter.c       |    32 +-
 media/libvpx/libvpx/vpx_dsp/bitwriter.h       |    67 +-
 .../libvpx/libvpx/vpx_dsp/bitwriter_buffer.c  |    25 +-
 .../libvpx/libvpx/vpx_dsp/bitwriter_buffer.h  |    22 +-
 media/libvpx/libvpx/vpx_dsp/deblock.c         |    48 +-
 media/libvpx/libvpx/vpx_dsp/fastssim.c        |    56 +-
 media/libvpx/libvpx/vpx_dsp/fwd_txfm.c        |    69 +-
 media/libvpx/libvpx/vpx_dsp/fwd_txfm.h        |     6 +-
 media/libvpx/libvpx/vpx_dsp/intrapred.c       |   300 +-
 media/libvpx/libvpx/vpx_dsp/inv_txfm.c        |  1866 +-
 media/libvpx/libvpx/vpx_dsp/inv_txfm.h        |     7 +-
 .../libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c |    90 +
 .../libvpx/vpx_dsp/loongarch/avg_pred_lsx.c   |    83 +
 .../loongarch/bitdepth_conversion_lsx.h       |    41 +
 .../vpx_dsp/loongarch/fwd_dct32x32_lsx.c      |  1176 +
 .../libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c   |   350 +
 .../libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h   |   381 +
 .../libvpx/vpx_dsp/loongarch/idct32x32_lsx.c  |   834 +
 .../libvpx/vpx_dsp/loongarch/intrapred_lsx.c  |    98 +
 .../vpx_dsp/loongarch/loopfilter_16_lsx.c     |  1320 ++
 .../vpx_dsp/loongarch/loopfilter_4_lsx.c      |   214 +
 .../vpx_dsp/loongarch/loopfilter_8_lsx.c      |   458 +
 .../libvpx/vpx_dsp/loongarch/loopfilter_lsx.h |   167 +
 .../libvpx/vpx_dsp/loongarch/quantize_lsx.c   |   244 +
 .../libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c |   717 +
 .../loongarch/sub_pixel_variance_lsx.c        |   874 +
 .../libvpx/vpx_dsp/loongarch/subtract_lsx.c   |   371 +
 .../vpx_dsp/loongarch/txfm_macros_lsx.h       |    48 +
 .../libvpx/vpx_dsp/loongarch/variance_lsx.c   |   263 +
 .../libvpx/vpx_dsp/loongarch/variance_lsx.h   |    62 +
 .../loongarch/vpx_convolve8_avg_horiz_lsx.c   |   972 +
 .../vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c |   737 +
 .../loongarch/vpx_convolve8_avg_vert_lsx.c    |   918 +
 .../loongarch/vpx_convolve8_horiz_lsx.c       |   814 +
 .../vpx_dsp/loongarch/vpx_convolve8_lsx.c     |   697 +
 .../loongarch/vpx_convolve8_vert_lsx.c        |   825 +
 .../vpx_dsp/loongarch/vpx_convolve_avg_lsx.c  |   321 +
 .../vpx_dsp/loongarch/vpx_convolve_copy_lsx.c |   437 +
 .../vpx_dsp/loongarch/vpx_convolve_lsx.h      |   138 +
 media/libvpx/libvpx/vpx_dsp/loopfilter.c      |   220 +-
 .../libvpx/vpx_dsp/mips/add_noise_msa.c       |     4 +-
 media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c    |   675 +
 .../libvpx/libvpx/vpx_dsp/mips/common_dspr2.h |     6 +-
 .../libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c |    11 +-
 .../vpx_dsp/mips/convolve2_avg_horiz_dspr2.c  |     9 +-
 .../vpx_dsp/mips/convolve2_horiz_dspr2.c      |    11 +-
 .../vpx_dsp/mips/convolve2_vert_dspr2.c       |    11 +-
 .../libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c |    44 +-
 .../vpx_dsp/mips/convolve8_avg_horiz_dspr2.c  |    17 +-
 .../libvpx/vpx_dsp/mips/convolve8_dspr2.c     |    25 +-
 .../vpx_dsp/mips/convolve8_horiz_dspr2.c      |    15 +-
 .../vpx_dsp/mips/convolve8_vert_dspr2.c       |    15 +-
 .../vpx_dsp/mips/convolve_common_dspr2.h      |    28 +-
 .../libvpx/libvpx/vpx_dsp/mips/deblock_msa.c  |   151 +-
 .../libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c    |    33 +-
 .../libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c |    45 +-
 .../libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h |    22 +-
 .../libvpx/vpx_dsp/mips/idct16x16_msa.c       |     1 +
 .../libvpx/vpx_dsp/mips/idct32x32_msa.c       |     1 +
 .../libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c  |     1 +
 .../libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c  |     1 +
 .../libvpx/vpx_dsp/mips/inv_txfm_dspr2.h      |     7 +-
 .../libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h |     6 +-
 .../libvpx/vpx_dsp/mips/itrans4_dspr2.c       |     1 +
 .../libvpx/vpx_dsp/mips/loopfilter_16_msa.c   |    64 +-
 .../libvpx/vpx_dsp/mips/loopfilter_4_msa.c    |     1 +
 .../libvpx/vpx_dsp/mips/loopfilter_8_msa.c    |     1 +
 .../vpx_dsp/mips/loopfilter_filters_dspr2.h   |     6 +-
 .../vpx_dsp/mips/loopfilter_macros_dspr2.h    |     6 +-
 .../vpx_dsp/mips/loopfilter_masks_dspr2.h     |     6 +-
 .../libvpx/vpx_dsp/mips/loopfilter_msa.h      |     6 +-
 media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h |   596 +-
 media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c    |   807 +
 media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c    |   750 +-
 .../vpx_dsp/mips/sub_pixel_variance_msa.c     |    61 +-
 .../libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c |   306 +
 .../libvpx/vpx_dsp/mips/sum_squares_msa.c     |   129 +
 .../libvpx/vpx_dsp/mips/txfm_macros_msa.h     |     6 +-
 .../libvpx/libvpx/vpx_dsp/mips/variance_mmi.c |  1357 ++
 .../libvpx/libvpx/vpx_dsp/mips/variance_msa.c |     5 +-
 .../mips/vpx_convolve8_avg_horiz_msa.c        |   123 +-
 .../vpx_dsp/mips/vpx_convolve8_avg_msa.c      |   105 +-
 .../vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c |    82 +-
 .../vpx_dsp/mips/vpx_convolve8_horiz_msa.c    |    15 +-
 .../libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c   |   716 +
 .../libvpx/vpx_dsp/mips/vpx_convolve8_msa.c   |   622 +-
 .../vpx_dsp/mips/vpx_convolve8_vert_msa.c     |    15 +-
 .../vpx_dsp/mips/vpx_convolve_avg_msa.c       |    14 +-
 .../vpx_dsp/mips/vpx_convolve_copy_msa.c      |    14 +-
 .../libvpx/vpx_dsp/mips/vpx_convolve_msa.h    |    23 +-
 media/libvpx/libvpx/vpx_dsp/postproc.h        |     6 +-
 .../vpx_dsp/ppc/bitdepth_conversion_vsx.h     |    47 +
 media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c |   374 +
 .../libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c |   553 +
 .../libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c  |   119 +
 .../libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c |   767 +
 .../libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c  |  1828 ++
 .../libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h  |    48 +
 .../libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c  |   301 +
 media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c     |   261 +
 .../libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c  |   117 +
 .../libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h |   133 +
 .../libvpx/vpx_dsp/ppc/txfm_common_vsx.h      |    90 +
 media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h   |   108 +
 .../libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c  |   271 +
 .../libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c     |   408 +
 media/libvpx/libvpx/vpx_dsp/prob.h            |    12 +-
 media/libvpx/libvpx/vpx_dsp/psnr.c            |   102 +-
 media/libvpx/libvpx/vpx_dsp/psnr.h            |    48 +-
 media/libvpx/libvpx/vpx_dsp/psnrhvs.c         |    18 +-
 media/libvpx/libvpx/vpx_dsp/quantize.c        |   336 +-
 media/libvpx/libvpx/vpx_dsp/quantize.h        |    32 +-
 media/libvpx/libvpx/vpx_dsp/sad.c             |   200 +-
 media/libvpx/libvpx/vpx_dsp/skin_detection.c  |    79 +
 media/libvpx/libvpx/vpx_dsp/skin_detection.h  |    24 +
 media/libvpx/libvpx/vpx_dsp/sse.c             |    59 +
 media/libvpx/libvpx/vpx_dsp/ssim.c            |    31 +-
 media/libvpx/libvpx/vpx_dsp/ssim.h            |     6 +-
 media/libvpx/libvpx/vpx_dsp/subtract.c        |    28 +-
 media/libvpx/libvpx/vpx_dsp/sum_squares.c     |     5 +-
 media/libvpx/libvpx/vpx_dsp/txfm_common.h     |    76 +-
 media/libvpx/libvpx/vpx_dsp/variance.c        |   567 +-
 media/libvpx/libvpx/vpx_dsp/variance.h        |    55 +-
 media/libvpx/libvpx/vpx_dsp/vpx_convolve.c    |   406 +-
 media/libvpx/libvpx/vpx_dsp/vpx_convolve.h    |    18 +-
 media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk        |   227 +-
 media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h  |    37 +-
 media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c    |     2 +-
 .../libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl       |  2103 +-
 media/libvpx/libvpx/vpx_dsp/vpx_filter.h      |    27 +-
 .../libvpx/vpx_dsp/x86/add_noise_sse2.asm     |     8 +-
 .../libvpx/vpx_dsp/x86/avg_intrin_avx2.c      |   519 +
 .../libvpx/vpx_dsp/x86/avg_intrin_sse2.c      |   272 +-
 .../libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c |   111 +
 .../libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c |    69 +
 .../libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm   |    83 +-
 .../vpx_dsp/x86/bitdepth_conversion_avx2.h    |    44 +
 .../vpx_dsp/x86/bitdepth_conversion_sse2.asm  |    90 +
 .../{fdct.h => bitdepth_conversion_sse2.h}    |    17 +-
 media/libvpx/libvpx/vpx_dsp/x86/convolve.h    |   353 +-
 .../libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h |   161 +
 .../libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h |    88 +
 .../libvpx/vpx_dsp/x86/convolve_ssse3.h       |   112 +
 .../libvpx/vpx_dsp/x86/deblock_sse2.asm       |   241 +-
 .../vpx_dsp/x86/fwd_dct32x32_impl_avx2.h      |   256 +-
 .../vpx_dsp/x86/fwd_dct32x32_impl_sse2.h      |   262 +-
 .../libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c |   377 +
 .../libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h   |    10 +-
 .../libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h |    10 +-
 .../vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm     |   416 +-
 .../libvpx/vpx_dsp/x86/highbd_convolve_avx2.c |  1495 ++
 .../vpx_dsp/x86/highbd_idct16x16_add_sse2.c   |   355 +
 .../vpx_dsp/x86/highbd_idct16x16_add_sse4.c   |   349 +
 .../vpx_dsp/x86/highbd_idct32x32_add_sse2.c   |   782 +
 .../vpx_dsp/x86/highbd_idct32x32_add_sse4.c   |   765 +
 .../vpx_dsp/x86/highbd_idct4x4_add_sse2.c     |   160 +
 .../vpx_dsp/x86/highbd_idct4x4_add_sse4.c     |    47 +
 .../vpx_dsp/x86/highbd_idct8x8_add_sse2.c     |   213 +
 .../vpx_dsp/x86/highbd_idct8x8_add_sse4.c     |   210 +
 .../x86/highbd_intrapred_intrin_sse2.c        |   534 +
 .../x86/highbd_intrapred_intrin_ssse3.c       |   930 +
 .../vpx_dsp/x86/highbd_intrapred_sse2.asm     |    16 +-
 .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h |   404 +
 .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h |   112 +
 .../vpx_dsp/x86/highbd_loopfilter_sse2.c      |   375 +-
 .../vpx_dsp/x86/highbd_quantize_intrin_avx2.c |   254 +
 .../vpx_dsp/x86/highbd_quantize_intrin_sse2.c |   187 +-
 .../libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c    |   462 +
 .../libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm  |    43 +-
 .../libvpx/vpx_dsp/x86/highbd_sad_avx2.c      |   522 +
 .../libvpx/vpx_dsp/x86/highbd_sad_sse2.asm    |    63 +-
 .../x86/highbd_subpel_variance_impl_sse2.asm  |   398 +-
 .../vpx_dsp/x86/highbd_variance_impl_sse2.asm |    22 +-
 .../libvpx/vpx_dsp/x86/highbd_variance_sse2.c |   242 +-
 .../libvpx/vpx_dsp/x86/intrapred_sse2.asm     |     2 +-
 .../libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c |   626 +
 .../libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c |  4318 +---
 .../libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h |   820 +-
 .../libvpx/vpx_dsp/x86/inv_txfm_ssse3.c       |   364 +
 .../libvpx/vpx_dsp/x86/inv_txfm_ssse3.h       |   110 +
 .../vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm     |  1793 --
 .../libvpx/vpx_dsp/x86/inv_wht_sse2.asm       |    12 +-
 .../libvpx/vpx_dsp/x86/loopfilter_avx2.c      |   202 +-
 ...filter_intrin_sse2.c => loopfilter_sse2.c} |   597 +-
 media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h    |   154 +
 .../libvpx/vpx_dsp/x86/post_proc_sse2.c       |   141 +
 .../libvpx/libvpx/vpx_dsp/x86/quantize_avx.c  |   254 +
 .../libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c |   290 +
 .../vpx_dsp/x86/quantize_avx_x86_64.asm       |   544 -
 .../libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c |   254 +-
 .../libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h |   126 +
 .../libvpx/vpx_dsp/x86/quantize_ssse3.c       |   228 +
 .../libvpx/vpx_dsp/x86/quantize_ssse3.h       |    51 +
 .../vpx_dsp/x86/quantize_ssse3_x86_64.asm     |   346 -
 media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c  |   306 +-
 .../libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c  |   105 +
 .../libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm  |    43 +-
 media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c    |   174 +-
 media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c  |    88 +
 media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm  |    74 +-
 media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm  |   374 -
 media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm  |   359 -
 media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm |   370 -
 media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c    |   368 +
 media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c    |   312 +
 .../libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm    |    97 +-
 .../vpx_dsp/x86/subpel_variance_sse2.asm      |   359 +-
 .../libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c |   203 +
 .../libvpx/vpx_dsp/x86/subtract_sse2.asm      |     1 +
 .../libvpx/vpx_dsp/x86/sum_squares_sse2.c     |   191 +-
 .../libvpx/vpx_dsp/x86/transpose_sse2.h       |   367 +
 .../libvpx/vpx_dsp/x86/txfm_common_sse2.h     |     9 +-
 .../libvpx/libvpx/vpx_dsp/x86/variance_avx2.c |   899 +-
 .../libvpx/vpx_dsp/x86/variance_impl_avx2.c   |   708 -
 .../libvpx/libvpx/vpx_dsp/x86/variance_sse2.c |   708 +-
 .../libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c |   162 -
 .../vpx_dsp/x86/vpx_convolve_copy_sse2.asm    |    10 +-
 .../vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm |    42 +-
 .../x86/vpx_high_subpixel_bilinear_sse2.asm   |    40 +-
 .../vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c |  1161 +
 .../vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c |  1674 +-
 .../x86/vpx_subpixel_8t_intrin_ssse3.c        |  1281 +-
 .../vpx_dsp/x86/vpx_subpixel_8t_sse2.asm      |    26 +-
 .../vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm     |    50 +-
 .../x86/vpx_subpixel_bilinear_sse2.asm        |    26 +-
 .../x86/vpx_subpixel_bilinear_ssse3.asm       |    26 +-
 .../libvpx/vpx_mem/include/vpx_mem_intrnl.h   |     8 +-
 media/libvpx/libvpx/vpx_mem/vpx_mem.c         |    11 +-
 media/libvpx/libvpx/vpx_mem/vpx_mem.h         |    15 +-
 .../libvpx/vpx_ports/aarch32_cpudetect.c      |    90 +
 .../libvpx/vpx_ports/aarch64_cpudetect.c      |   241 +
 media/libvpx/libvpx/vpx_ports/arm.h           |    22 +-
 media/libvpx/libvpx/vpx_ports/arm_cpudetect.c |   154 -
 media/libvpx/libvpx/vpx_ports/arm_cpudetect.h |    52 +
 media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h   |    81 +
 media/libvpx/libvpx/vpx_ports/bitops.h        |    31 +-
 .../libvpx/vpx_ports/compiler_attributes.h    |    69 +
 .../libvpx/vpx_ports/emmintrin_compat.h       |     6 +-
 media/libvpx/libvpx/vpx_ports/emms_mmx.asm    |    18 +
 .../libvpx/vpx_ports/{config.h => emms_mmx.c} |     9 +-
 .../{emms.asm => float_control_word.asm}      |     9 +-
 media/libvpx/libvpx/vpx_ports/loongarch.h     |    29 +
 .../libvpx/vpx_ports/loongarch_cpudetect.c    |    40 +
 media/libvpx/libvpx/vpx_ports/mem.h           |    36 +-
 media/libvpx/libvpx/vpx_ports/mem_ops.h       |     7 +-
 .../libvpx/libvpx/vpx_ports/mem_ops_aligned.h |     6 +-
 media/libvpx/libvpx/vpx_ports/mips.h          |    27 +
 .../libvpx/libvpx/vpx_ports/mips_cpudetect.c  |    57 +
 media/libvpx/libvpx/vpx_ports/msvc.h          |    32 -
 media/libvpx/libvpx/vpx_ports/ppc.h           |    29 +
 media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c |    80 +
 media/libvpx/libvpx/vpx_ports/static_assert.h |    30 +
 media/libvpx/libvpx/vpx_ports/system_state.h  |    22 +-
 media/libvpx/libvpx/vpx_ports/vpx_once.h      |    31 +-
 media/libvpx/libvpx/vpx_ports/vpx_ports.mk    |    40 +-
 media/libvpx/libvpx/vpx_ports/vpx_timer.h     |    38 +-
 media/libvpx/libvpx/vpx_ports/x86.h           |   196 +-
 .../libvpx/vpx_ports/x86_abi_support.asm      |    75 +-
 .../libvpx/vpx_scale/generic/gen_scalers.c    |     4 +-
 .../libvpx/vpx_scale/generic/vpx_scale.c      |     7 +-
 .../libvpx/vpx_scale/generic/yv12config.c     |    73 +-
 .../libvpx/vpx_scale/generic/yv12extend.c     |    67 +-
 media/libvpx/libvpx/vpx_scale/vpx_scale.h     |     6 +-
 .../libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c  |     2 +-
 .../libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl |    12 +
 media/libvpx/libvpx/vpx_scale/yv12config.h    |    10 +-
 media/libvpx/libvpx/vpx_util/endian_inl.h     |     6 +-
 .../libvpx/vpx_util/loongson_intrinsics.h     |  2090 ++
 media/libvpx/libvpx/vpx_util/vpx_atomics.h    |   110 +
 media/libvpx/libvpx/vpx_util/vpx_debug_util.c |   282 +
 media/libvpx/libvpx/vpx_util/vpx_debug_util.h |    70 +
 media/libvpx/libvpx/vpx_util/vpx_pthread.h    |   150 +
 media/libvpx/libvpx/vpx_util/vpx_thread.c     |    93 +-
 media/libvpx/libvpx/vpx_util/vpx_thread.h     |   349 +-
 media/libvpx/libvpx/vpx_util/vpx_timestamp.h  |    49 +
 media/libvpx/libvpx/vpx_util/vpx_util.mk      |     7 +
 .../libvpx/vpx_util/vpx_write_yuv_frame.c     |    46 +
 .../libvpx/vpx_util/vpx_write_yuv_frame.h     |    27 +
 media/libvpx/libvpx/vpx_version.h             |    11 +
 media/libvpx/libvpx/vpxdec.c                  |   191 +-
 media/libvpx/libvpx/vpxenc.c                  |   695 +-
 media/libvpx/libvpx/vpxenc.h                  |     7 +-
 media/libvpx/libvpx/vpxstats.c                |     2 +-
 media/libvpx/libvpx/vpxstats.h                |     6 +-
 media/libvpx/libvpx/warnings.c                |     2 +-
 media/libvpx/libvpx/warnings.h                |     6 +-
 media/libvpx/libvpx/webmdec.cc                |    49 +-
 media/libvpx/libvpx/webmdec.h                 |     8 +-
 media/libvpx/libvpx/webmenc.cc                |     4 +-
 media/libvpx/libvpx/webmenc.h                 |     6 +-
 media/libvpx/libvpx/y4menc.c                  |    44 +-
 media/libvpx/libvpx/y4menc.h                  |     6 +-
 media/libvpx/libvpx/y4minput.c                |   717 +-
 media/libvpx/libvpx/y4minput.h                |    18 +-
 media/libvpx/moz.build                        |    55 +-
 media/libvpx/sources.mozbuild                 |  1297 +-
 1282 files changed, 260258 insertions(+), 118999 deletions(-)
 create mode 100644 media/libvpx/config/linux/arm64/vp8_rtcd.h
 create mode 100644 media/libvpx/config/linux/arm64/vp9_rtcd.h
 create mode 100644 media/libvpx/config/linux/arm64/vpx_config.asm
 create mode 100644 media/libvpx/config/linux/arm64/vpx_config.c
 create mode 100644 media/libvpx/config/linux/arm64/vpx_config.h
 create mode 100644 media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
 create mode 100644 media/libvpx/config/linux/arm64/vpx_scale_rtcd.h
 create mode 100644 media/libvpx/config/linux/loongarch64/vp8_rtcd.h
 create mode 100644 media/libvpx/config/linux/loongarch64/vp9_rtcd.h
 create mode 100644 media/libvpx/config/linux/loongarch64/vpx_config.asm
 create mode 100644 media/libvpx/config/linux/loongarch64/vpx_config.c
 create mode 100644 media/libvpx/config/linux/loongarch64/vpx_config.h
 create mode 100644 media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h
 create mode 100644 media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips32/vp8_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips32/vp9_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips32/vpx_config.asm
 create mode 100644 media/libvpx/config/linux/mips32/vpx_config.c
 create mode 100644 media/libvpx/config/linux/mips32/vpx_config.h
 create mode 100644 media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips32/vpx_scale_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips64/vp8_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips64/vp9_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips64/vpx_config.asm
 create mode 100644 media/libvpx/config/linux/mips64/vpx_config.c
 create mode 100644 media/libvpx/config/linux/mips64/vpx_config.h
 create mode 100644 media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h
 create mode 100644 media/libvpx/config/linux/mips64/vpx_scale_rtcd.h
 create mode 100644 media/libvpx/config/linux/ppc64le/vp8_rtcd.h
 create mode 100644 media/libvpx/config/linux/ppc64le/vp9_rtcd.h
 create mode 100644 media/libvpx/config/linux/ppc64le/vpx_config.asm
 create mode 100644 media/libvpx/config/linux/ppc64le/vpx_config.c
 create mode 100644 media/libvpx/config/linux/ppc64le/vpx_config.h
 create mode 100644 media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h
 create mode 100644 media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h
 create mode 100644 media/libvpx/config/mac/arm64/vp8_rtcd.h
 create mode 100644 media/libvpx/config/mac/arm64/vp9_rtcd.h
 create mode 100644 media/libvpx/config/mac/arm64/vpx_config.asm
 create mode 100644 media/libvpx/config/mac/arm64/vpx_config.c
 create mode 100644 media/libvpx/config/mac/arm64/vpx_config.h
 create mode 100644 media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h
 create mode 100644 media/libvpx/config/mac/arm64/vpx_scale_rtcd.h
 create mode 100644 media/libvpx/libvpx/CONTRIBUTING.md
 delete mode 100644 media/libvpx/libvpx/examples/resize_util.c
 rename media/libvpx/libvpx/{vpx => examples}/svc_context.h (83%)
 rename media/libvpx/libvpx/{vpx/src => examples}/svc_encodeframe.c (80%)
 create mode 100644 media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc
 create mode 100644 media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc
 create mode 100644 media/libvpx/libvpx/test/bench.cc
 create mode 100644 media/libvpx/libvpx/test/bench.h
 create mode 100644 media/libvpx/libvpx/test/buffer.h
 create mode 100644 media/libvpx/libvpx/test/comp_avg_pred_test.cc
 delete mode 100644 media/libvpx/libvpx/test/datarate_test.cc
 create mode 100644 media/libvpx/libvpx/test/dct_partial_test.cc
 create mode 100644 media/libvpx/libvpx/test/dct_test.cc
 create mode 100644 media/libvpx/libvpx/test/decode_corrupted.cc
 delete mode 100644 media/libvpx/libvpx/test/fdct4x4_test.cc
 create mode 100644 media/libvpx/libvpx/test/init_vpx_test.cc
 create mode 100644 media/libvpx/libvpx/test/init_vpx_test.h
 create mode 100644 media/libvpx/libvpx/test/non_greedy_mv_test.cc
 delete mode 100644 media/libvpx/libvpx/test/resize_util.sh
 create mode 100644 media/libvpx/libvpx/test/svc_datarate_test.cc
 create mode 100644 media/libvpx/libvpx/test/svc_end_to_end_test.cc
 create mode 100644 media/libvpx/libvpx/test/svc_test.h
 create mode 100644 media/libvpx/libvpx/test/test_rc_interface.cc
 create mode 100644 media/libvpx/libvpx/test/timestamp_test.cc
 create mode 100644 media/libvpx/libvpx/test/vp8_datarate_test.cc
 create mode 100644 media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
 rename media/libvpx/libvpx/test/{vp9_error_block_test.cc => vp9_block_error_test.cc} (58%)
 create mode 100755 media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh
 create mode 100644 media/libvpx/libvpx/test/vp9_datarate_test.cc
 rename media/libvpx/libvpx/test/{vp9_denoiser_sse2_test.cc => vp9_denoiser_test.cc} (50%)
 create mode 100644 media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
 delete mode 100644 media/libvpx/libvpx/test/vp9_frame_parallel_test.cc
 create mode 100644 media/libvpx/libvpx/test/vp9_motion_vector_test.cc
 create mode 100644 media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
 create mode 100644 media/libvpx/libvpx/test/vp9_roi_test.cc
 create mode 100644 media/libvpx/libvpx/test/vp9_scale_test.cc
 delete mode 100644 media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh
 create mode 100644 media/libvpx/libvpx/test/vpx_image_test.cc
 create mode 100644 media/libvpx/libvpx/test/vpx_scale_test.h
 create mode 100644 media/libvpx/libvpx/test/yuv_temporal_filter_test.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/.clang-format
 delete mode 100644 media/libvpx/libvpx/third_party/googletest/src/CHANGES
 delete mode 100644 media/libvpx/libvpx/third_party/googletest/src/README
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/README.md
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
 create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc
 create mode 100644 media/libvpx/libvpx/third_party/libyuv/LICENSE
 create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h
 create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc
 delete mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc
 create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc
 delete mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc
 create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc
 delete mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc
 create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc
 create mode 100644 media/libvpx/libvpx/third_party/nalloc/LICENSE
 create mode 100644 media/libvpx/libvpx/third_party/nalloc/README.libvpx
 create mode 100644 media/libvpx/libvpx/third_party/nalloc/nalloc.h
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
 create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
 create mode 100644 media/libvpx/libvpx/tools/README.pgo.md
 delete mode 100644 media/libvpx/libvpx/tools/all_builds.py
 delete mode 100644 media/libvpx/libvpx/tools/author_first_release.sh
 delete mode 100644 media/libvpx/libvpx/tools/ftfy.sh
 create mode 100644 media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py
 create mode 100644 media/libvpx/libvpx/tools/set_analyzer_env.sh
 create mode 100644 media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h
 delete mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
 delete mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
 create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c
 create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
 create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
 create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/common/vp8_skin_detection.c
 create mode 100644 media/libvpx/libvpx/vp8/common/vp8_skin_detection.h
 create mode 100644 media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c
 delete mode 100644 media/libvpx/libvpx/vp8/common/x86/filter_x86.c
 delete mode 100644 media/libvpx/libvpx/vp8/common/x86/filter_x86.h
 rename media/libvpx/libvpx/vp8/{common => encoder}/copy_c.c (100%)
 create mode 100644 media/libvpx/libvpx/vp8/encoder/ethreading.h
 create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c
 create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c
 create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c
 create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
 create mode 100644 media/libvpx/libvpx/vp8/encoder/picklpf.h
 create mode 100644 media/libvpx/libvpx/vp8/encoder/temporal_filter.h
 rename media/libvpx/libvpx/vp8/encoder/x86/{encodeopt.asm => block_error_sse2.asm} (97%)
 rename media/libvpx/libvpx/vp8/{common => encoder}/x86/copy_sse2.asm (98%)
 rename media/libvpx/libvpx/vp8/{common => encoder}/x86/copy_sse3.asm (99%)
 delete mode 100644 media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm
 delete mode 100644 media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
 rename media/libvpx/libvpx/vp8/encoder/x86/{quantize_ssse3.c => vp8_quantize_ssse3.c} (76%)
 create mode 100644 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
 create mode 100644 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
 create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h
 create mode 100644 media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c
 create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
 create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
 create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
 delete mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_dthread.c
 delete mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_dthread.h
 create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
 create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
 create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
 delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
 create mode 100644 media/libvpx/libvpx/vp9/ratectrl_rtc.cc
 create mode 100644 media/libvpx/libvpx/vp9/ratectrl_rtc.h
 create mode 100644 media/libvpx/libvpx/vp9/vp9_cx_iface.h
 create mode 100644 media/libvpx/libvpx/vp9/vp9_iface_common.c
 delete mode 100644 media/libvpx/libvpx/vpx/exports_spatial_svc
 create mode 100644 media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
 create mode 100644 media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
 create mode 100644 media/libvpx/libvpx/vpx/vpx_tpl.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/skin_detection.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/skin_detection.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/sse.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
 rename media/libvpx/libvpx/vpx_dsp/x86/{fdct.h => bitdepth_conversion_sse2.h} (76%)
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
 rename media/libvpx/libvpx/vpx_dsp/x86/{loopfilter_intrin_sse2.c => loopfilter_sse2.c} (80%)
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c
 delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
 create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
 create mode 100644 media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c
 create mode 100644 media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
 delete mode 100644 media/libvpx/libvpx/vpx_ports/arm_cpudetect.c
 create mode 100644 media/libvpx/libvpx/vpx_ports/arm_cpudetect.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/compiler_attributes.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/emms_mmx.asm
 rename media/libvpx/libvpx/vpx_ports/{config.h => emms_mmx.c} (66%)
 rename media/libvpx/libvpx/vpx_ports/{emms.asm => float_control_word.asm} (81%)
 create mode 100644 media/libvpx/libvpx/vpx_ports/loongarch.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c
 create mode 100644 media/libvpx/libvpx/vpx_ports/mips.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/mips_cpudetect.c
 delete mode 100644 media/libvpx/libvpx/vpx_ports/msvc.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/ppc.h
 create mode 100644 media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c
 create mode 100644 media/libvpx/libvpx/vpx_ports/static_assert.h
 create mode 100644 media/libvpx/libvpx/vpx_util/loongson_intrinsics.h
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_atomics.h
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_debug_util.c
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_debug_util.h
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_pthread.h
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_timestamp.h
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c
 create mode 100644 media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h
 create mode 100644 media/libvpx/libvpx/vpx_version.h

diff --git a/media/libvpx/README_MOZILLA b/media/libvpx/README_MOZILLA
index 855d24111d..a04dea3485 100644
--- a/media/libvpx/README_MOZILLA
+++ b/media/libvpx/README_MOZILLA
@@ -8,4 +8,4 @@ The libvpx git repository is:
 
     https://chromium.googlesource.com/webm/libvpx
 
-The git commit ID used was v1.6.1
+The git commit ID used was v1.16.0
diff --git a/media/libvpx/config/generic/vp8_rtcd.h b/media/libvpx/config/generic/vp8_rtcd.h
index d0aebb66de..c68b01f24f 100644
--- a/media/libvpx/config/generic/vp8_rtcd.h
+++ b/media/libvpx/config/generic/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,31 +37,34 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem16x16 vp8_copy_mem16x16_c
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x4 vp8_copy_mem8x4_c
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x8 vp8_copy_mem8x8_c
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -59,7 +73,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
 int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_c
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -68,7 +82,7 @@ void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u,
 void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
 #define vp8_dequantize_b vp8_dequantize_b_c
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -77,31 +91,28 @@ int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct block
 void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 #define vp8_fast_quantize_b vp8_fast_quantize_b_c
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_c
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bv vp8_loop_filter_bv_c
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
 
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
@@ -110,7 +121,7 @@ int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 int vp8_mbuverror_c(struct macroblock *mb);
 #define vp8_mbuverror vp8_mbuverror_c
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_refining_search_sad vp8_refining_search_sad_c
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
@@ -122,28 +133,28 @@ void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 #define vp8_short_fdct8x4 vp8_short_fdct8x4_c
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 #define vp8_short_walsh4x4 vp8_short_walsh4x4_c
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -163,4 +174,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/generic/vp9_rtcd.h b/media/libvpx/config/generic/vp9_rtcd.h
index acda9dadf1..c914437a70 100644
--- a/media/libvpx/config/generic/vp9_rtcd.h
+++ b/media/libvpx/config/generic/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,18 +46,18 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
-
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht16x16 vp9_fht16x16_c
 
@@ -50,13 +67,10 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -65,17 +79,23 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 #define vp9_quantize_fp vp9_quantize_fp_c
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12 vpx_convolve12_c
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_horiz vpx_convolve12_horiz_c
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_vert vpx_convolve12_vert_c
 
 void vp9_rtcd(void);
 
@@ -91,4 +111,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm
index b4366a5a8e..cabc9f0a07 100644
--- a/media/libvpx/config/generic/vpx_config.asm
+++ b/media/libvpx/config/generic/vpx_config.asm
@@ -1,12 +1,19 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.equ DO1STROUNDING, 0
-.equ ARCH_ARM ,  0
-.equ ARCH_MIPS ,  0
-.equ ARCH_X86 ,  0
-.equ ARCH_X86_64 ,  0
-.equ HAVE_NEON ,  0
+.syntax unified
+.equ VPX_ARCH_ARM ,  0
+.equ VPX_ARCH_AARCH64 ,  0
+.equ VPX_ARCH_MIPS ,  0
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
 .equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_DOTPROD ,  0
+.equ HAVE_NEON_I8MM ,  0
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
 .equ HAVE_MIPS32 ,  0
 .equ HAVE_DSPR2 ,  0
 .equ HAVE_MSA ,  0
@@ -19,6 +26,11 @@
 .equ HAVE_SSE4_1 ,  0
 .equ HAVE_AVX ,  0
 .equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
 .equ HAVE_VPX_PORTS ,  1
 .equ HAVE_PTHREAD_H ,  1
 .equ CONFIG_DEPENDENCY_TRACKING ,  1
@@ -75,8 +87,11 @@
 .equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
 .equ CONFIG_EXPERIMENTAL ,  0
 .equ CONFIG_SIZE_LIMIT ,  1
-.equ CONFIG_SPATIAL_SVC ,  0
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_EMULATE_HARDWARE ,  0
-.equ CONFIG_MISC_FIXES ,  0
-	.section	.note.GNU-stack,"",%progbits
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h
index 8cc215eb5a..09d09da6da 100644
--- a/media/libvpx/config/generic/vpx_config.h
+++ b/media/libvpx/config/generic/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 0
-#define ARCH_X86_64 0
-#define HAVE_NEON 0
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/generic/vpx_dsp_rtcd.h b/media/libvpx/config/generic/vpx_dsp_rtcd.h
index 523bf33aed..ea44ef01ca 100644
--- a/media/libvpx/config/generic/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/generic/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,187 +44,154 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_c
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_c
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_c
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_c
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_c
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_c
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
-
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
-
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
-
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
-
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
 
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
 
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
 
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
 
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
-
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
 
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
-
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
-
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
 
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
 
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
@@ -238,37 +221,40 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct8x8_1 vpx_fdct8x8_1_c
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 #define vpx_get16x16var vpx_get16x16var_c
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 #define vpx_get8x8var vpx_get8x8var_c
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 #define vpx_get_mb_ss vpx_get_mb_ss_c
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_c
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_c
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -280,6 +266,9 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
 
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
+
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
 
@@ -310,7 +299,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_c
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_c
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -358,22 +347,22 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 #define vpx_minmax_8x8 vpx_minmax_8x8_c
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse16x16 vpx_mse16x16_c
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse16x8 vpx_mse16x8_c
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x16 vpx_mse8x16_c
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_c
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 #define vpx_quantize_b vpx_quantize_b_c
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -382,22 +371,16 @@ unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_c
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_c
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_c
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_c
 
 unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_c
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_c
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -406,22 +389,16 @@ unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t
 unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_c
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_c
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_c
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x16 vpx_sad32x16_c
 
 unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x16_avg vpx_sad32x16_avg_c
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_c
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -430,22 +407,16 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
 
 unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x64_avg vpx_sad32x64_avg_c
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_c
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -454,34 +425,25 @@ unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t
 unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_c
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_c
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_c
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_c
 
 unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_c
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x32_avg vpx_sad64x32_avg_c
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_c
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -490,154 +452,214 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
 
 unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_c
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_c
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_c
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_c
 
 unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_c
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
 
 unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_c
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_c
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_c
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c
 
 int vpx_satd_c(const int16_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+#define vpx_sse vpx_sse_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 #define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 #define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
@@ -646,70 +668,70 @@ void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x16 vpx_variance16x16_c
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x32 vpx_variance16x32_c
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance16x8 vpx_variance16x8_c
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance32x16 vpx_variance32x16_c
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance32x32 vpx_variance32x32_c
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance32x64 vpx_variance32x64_c
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x4 vpx_variance4x4_c
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x8 vpx_variance4x8_c
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance64x32 vpx_variance64x32_c
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance64x64 vpx_variance64x64_c
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance8x16 vpx_variance8x16_c
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance8x4 vpx_variance8x4_c
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance8x8 vpx_variance8x8_c
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -729,4 +751,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/generic/vpx_scale_rtcd.h b/media/libvpx/config/generic/vpx_scale_rtcd.h
index f419cc7a5f..58485cb00a 100644
--- a/media/libvpx/config/generic/vpx_scale_rtcd.h
+++ b/media/libvpx/config/generic/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -63,4 +77,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/arm/vp8_rtcd.h b/media/libvpx/config/linux/arm/vp8_rtcd.h
index 098795508a..688aa3ead5 100644
--- a/media/libvpx/config/linux/arm/vp8_rtcd.h
+++ b/media/libvpx/config/linux/arm/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,40 +37,43 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_c
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -69,9 +83,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride);
-RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride);
+RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride);
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -81,9 +95,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_neon(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_neon(struct blockd*, short *DQC);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC);
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_diamond_search_sad vp8_diamond_search_sad_c
@@ -92,40 +106,37 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
 RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 #define vp8_mbblock_error vp8_mbblock_error_c
@@ -133,7 +144,7 @@ int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 int vp8_mbuverror_c(struct macroblock *mb);
 #define vp8_mbuverror vp8_mbuverror_c
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_refining_search_sad vp8_refining_search_sad_c
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
@@ -147,36 +158,36 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_neon(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_rtcd(void);
 
@@ -261,4 +272,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/arm/vp9_rtcd.h b/media/libvpx/config/linux/arm/vp9_rtcd.h
index 4a41b66711..bd174d6bdc 100644
--- a/media/libvpx/config/linux/arm/vp9_rtcd.h
+++ b/media/libvpx/config/linux/arm/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -30,36 +47,35 @@ extern "C" {
 #endif
 
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 
 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -69,18 +85,17 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
 void vp9_rtcd(void);
 
@@ -94,16 +109,30 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_block_error = vp9_block_error_c;
+    if (flags & HAS_NEON) vp9_block_error = vp9_block_error_neon;
     vp9_block_error_fp = vp9_block_error_fp_c;
     if (flags & HAS_NEON) vp9_block_error_fp = vp9_block_error_fp_neon;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
-    if (flags & HAS_NEON) vp9_fdct8x8_quant = vp9_fdct8x8_quant_neon;
+    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
+    if (flags & HAS_NEON) vp9_diamond_search_sad = vp9_diamond_search_sad_neon;
+    vp9_fht16x16 = vp9_fht16x16_c;
+    if (flags & HAS_NEON) vp9_fht16x16 = vp9_fht16x16_neon;
+    vp9_fht4x4 = vp9_fht4x4_c;
+    if (flags & HAS_NEON) vp9_fht4x4 = vp9_fht4x4_neon;
+    vp9_fht8x8 = vp9_fht8x8_c;
+    if (flags & HAS_NEON) vp9_fht8x8 = vp9_fht8x8_neon;
+    vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
+    if (flags & HAS_NEON) vp9_iht16x16_256_add = vp9_iht16x16_256_add_neon;
     vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
     if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon;
     vp9_iht8x8_64_add = vp9_iht8x8_64_add_c;
     if (flags & HAS_NEON) vp9_iht8x8_64_add = vp9_iht8x8_64_add_neon;
     vp9_quantize_fp = vp9_quantize_fp_c;
     if (flags & HAS_NEON) vp9_quantize_fp = vp9_quantize_fp_neon;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_NEON) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_neon;
+    vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
+    if (flags & HAS_NEON) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_neon;
 }
 #endif
 
@@ -111,4 +140,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm
index 0aced5ddc3..9ea8e5f6e0 100644
--- a/media/libvpx/config/linux/arm/vpx_config.asm
+++ b/media/libvpx/config/linux/arm/vpx_config.asm
@@ -1,12 +1,19 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.equ DO1STROUNDING, 0
-.equ ARCH_ARM ,  1
-.equ ARCH_MIPS ,  0
-.equ ARCH_X86 ,  0
-.equ ARCH_X86_64 ,  0
-.equ HAVE_NEON ,  1
+.syntax unified
+.equ VPX_ARCH_ARM ,  1
+.equ VPX_ARCH_AARCH64 ,  0
+.equ VPX_ARCH_MIPS ,  0
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
 .equ HAVE_NEON_ASM ,  1
+.equ HAVE_NEON ,  1
+.equ HAVE_NEON_DOTPROD ,  0
+.equ HAVE_NEON_I8MM ,  0
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
 .equ HAVE_MIPS32 ,  0
 .equ HAVE_DSPR2 ,  0
 .equ HAVE_MSA ,  0
@@ -19,6 +26,11 @@
 .equ HAVE_SSE4_1 ,  0
 .equ HAVE_AVX ,  0
 .equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
 .equ HAVE_VPX_PORTS ,  1
 .equ HAVE_PTHREAD_H ,  1
 .equ CONFIG_DEPENDENCY_TRACKING ,  1
@@ -75,8 +87,11 @@
 .equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
 .equ CONFIG_EXPERIMENTAL ,  0
 .equ CONFIG_SIZE_LIMIT ,  1
-.equ CONFIG_SPATIAL_SVC ,  0
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
 .equ CONFIG_FP_MB_STATS ,  0
 .equ CONFIG_EMULATE_HARDWARE ,  0
-.equ CONFIG_MISC_FIXES ,  0
-	.section	.note.GNU-stack,"",%progbits
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h
index 6b3d8581e2..1a7d21384d 100644
--- a/media/libvpx/config/linux/arm/vpx_config.h
+++ b/media/libvpx/config/linux/arm/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 1
-#define ARCH_MIPS 0
-#define ARCH_X86 0
-#define ARCH_X86_64 0
-#define HAVE_NEON 1
+#define VPX_ARCH_ARM 1
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 1
+#define HAVE_NEON 1
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h
index 760ec9a364..0c84d180ad 100644
--- a/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,243 +44,234 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p);
 RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d117_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d117_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d117_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d117_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_c
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct16x16_1)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_c
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct32x32_1)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_c
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct4x4_1)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -274,46 +281,50 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride);
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 #define vpx_get_mb_ss vpx_get_mb_ss_c
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -328,6 +339,10 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride)
 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
@@ -368,9 +383,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
 RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width);
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
@@ -430,391 +445,514 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vpx_mse16x8 vpx_mse16x8_c
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vpx_mse8x16 vpx_mse8x16_c
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vpx_mse8x8 vpx_mse8x8_c
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_c
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_c
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
-
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_c
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
-
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_c
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
-
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_c
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_neon(const int16_t *coeff, int length);
 RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
-#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size);
+RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_c
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x16 vpx_variance32x16_c
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance4x4 vpx_variance4x4_c
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance4x8 vpx_variance4x8_c
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_c
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -837,6 +975,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_avg_4x4 = vpx_avg_4x4_neon;
     vpx_avg_8x8 = vpx_avg_8x8_c;
     if (flags & HAS_NEON) vpx_avg_8x8 = vpx_avg_8x8_neon;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_NEON) vpx_comp_avg_pred = vpx_comp_avg_pred_neon;
     vpx_convolve8 = vpx_convolve8_c;
     if (flags & HAS_NEON) vpx_convolve8 = vpx_convolve8_neon;
     vpx_convolve8_avg = vpx_convolve8_avg_c;
@@ -853,6 +993,14 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon;
     vpx_convolve_copy = vpx_convolve_copy_c;
     if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon;
+    vpx_d117_predictor_16x16 = vpx_d117_predictor_16x16_c;
+    if (flags & HAS_NEON) vpx_d117_predictor_16x16 = vpx_d117_predictor_16x16_neon;
+    vpx_d117_predictor_32x32 = vpx_d117_predictor_32x32_c;
+    if (flags & HAS_NEON) vpx_d117_predictor_32x32 = vpx_d117_predictor_32x32_neon;
+    vpx_d117_predictor_4x4 = vpx_d117_predictor_4x4_c;
+    if (flags & HAS_NEON) vpx_d117_predictor_4x4 = vpx_d117_predictor_4x4_neon;
+    vpx_d117_predictor_8x8 = vpx_d117_predictor_8x8_c;
+    if (flags & HAS_NEON) vpx_d117_predictor_8x8 = vpx_d117_predictor_8x8_neon;
     vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_c;
     if (flags & HAS_NEON) vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_neon;
     vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_c;
@@ -861,6 +1009,22 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon;
     vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_c;
     if (flags & HAS_NEON) vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_neon;
+    vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c;
+    if (flags & HAS_NEON) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_neon;
+    vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c;
+    if (flags & HAS_NEON) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_neon;
+    vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c;
+    if (flags & HAS_NEON) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_neon;
+    vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c;
+    if (flags & HAS_NEON) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_neon;
+    vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c;
+    if (flags & HAS_NEON) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_neon;
+    vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c;
+    if (flags & HAS_NEON) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_neon;
+    vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c;
+    if (flags & HAS_NEON) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_neon;
+    vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c;
+    if (flags & HAS_NEON) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_neon;
     vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c;
     if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon;
     vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c;
@@ -869,6 +1033,14 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon;
     vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c;
     if (flags & HAS_NEON) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_neon;
+    vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c;
+    if (flags & HAS_NEON) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_neon;
+    vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c;
+    if (flags & HAS_NEON) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_neon;
+    vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c;
+    if (flags & HAS_NEON) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_neon;
+    vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c;
+    if (flags & HAS_NEON) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_neon;
     vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c;
     if (flags & HAS_NEON) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_neon;
     vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c;
@@ -901,6 +1073,20 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon;
     vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c;
     if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon;
+    vpx_fdct16x16 = vpx_fdct16x16_c;
+    if (flags & HAS_NEON) vpx_fdct16x16 = vpx_fdct16x16_neon;
+    vpx_fdct16x16_1 = vpx_fdct16x16_1_c;
+    if (flags & HAS_NEON) vpx_fdct16x16_1 = vpx_fdct16x16_1_neon;
+    vpx_fdct32x32 = vpx_fdct32x32_c;
+    if (flags & HAS_NEON) vpx_fdct32x32 = vpx_fdct32x32_neon;
+    vpx_fdct32x32_1 = vpx_fdct32x32_1_c;
+    if (flags & HAS_NEON) vpx_fdct32x32_1 = vpx_fdct32x32_1_neon;
+    vpx_fdct32x32_rd = vpx_fdct32x32_rd_c;
+    if (flags & HAS_NEON) vpx_fdct32x32_rd = vpx_fdct32x32_rd_neon;
+    vpx_fdct4x4 = vpx_fdct4x4_c;
+    if (flags & HAS_NEON) vpx_fdct4x4 = vpx_fdct4x4_neon;
+    vpx_fdct4x4_1 = vpx_fdct4x4_1_c;
+    if (flags & HAS_NEON) vpx_fdct4x4_1 = vpx_fdct4x4_1_neon;
     vpx_fdct8x8 = vpx_fdct8x8_c;
     if (flags & HAS_NEON) vpx_fdct8x8 = vpx_fdct8x8_neon;
     vpx_fdct8x8_1 = vpx_fdct8x8_1_c;
@@ -921,6 +1107,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_neon;
     vpx_hadamard_16x16 = vpx_hadamard_16x16_c;
     if (flags & HAS_NEON) vpx_hadamard_16x16 = vpx_hadamard_16x16_neon;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_c;
+    if (flags & HAS_NEON) vpx_hadamard_32x32 = vpx_hadamard_32x32_neon;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_c;
     if (flags & HAS_NEON) vpx_hadamard_8x8 = vpx_hadamard_8x8_neon;
     vpx_idct16x16_10_add = vpx_idct16x16_10_add_c;
@@ -929,6 +1117,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_idct16x16_1_add = vpx_idct16x16_1_add_neon;
     vpx_idct16x16_256_add = vpx_idct16x16_256_add_c;
     if (flags & HAS_NEON) vpx_idct16x16_256_add = vpx_idct16x16_256_add_neon;
+    vpx_idct16x16_38_add = vpx_idct16x16_38_add_c;
+    if (flags & HAS_NEON) vpx_idct16x16_38_add = vpx_idct16x16_38_add_neon;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
     if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon;
     vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
@@ -979,38 +1169,208 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_minmax_8x8 = vpx_minmax_8x8_neon;
     vpx_mse16x16 = vpx_mse16x16_c;
     if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon;
+    vpx_mse16x8 = vpx_mse16x8_c;
+    if (flags & HAS_NEON) vpx_mse16x8 = vpx_mse16x8_neon;
+    vpx_mse8x16 = vpx_mse8x16_c;
+    if (flags & HAS_NEON) vpx_mse8x16 = vpx_mse8x16_neon;
+    vpx_mse8x8 = vpx_mse8x8_c;
+    if (flags & HAS_NEON) vpx_mse8x8 = vpx_mse8x8_neon;
+    vpx_quantize_b = vpx_quantize_b_c;
+    if (flags & HAS_NEON) vpx_quantize_b = vpx_quantize_b_neon;
+    vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
+    if (flags & HAS_NEON) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_neon;
     vpx_sad16x16 = vpx_sad16x16_c;
     if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon;
+    vpx_sad16x16_avg = vpx_sad16x16_avg_c;
+    if (flags & HAS_NEON) vpx_sad16x16_avg = vpx_sad16x16_avg_neon;
     vpx_sad16x16x4d = vpx_sad16x16x4d_c;
     if (flags & HAS_NEON) vpx_sad16x16x4d = vpx_sad16x16x4d_neon;
+    vpx_sad16x32 = vpx_sad16x32_c;
+    if (flags & HAS_NEON) vpx_sad16x32 = vpx_sad16x32_neon;
+    vpx_sad16x32_avg = vpx_sad16x32_avg_c;
+    if (flags & HAS_NEON) vpx_sad16x32_avg = vpx_sad16x32_avg_neon;
+    vpx_sad16x32x4d = vpx_sad16x32x4d_c;
+    if (flags & HAS_NEON) vpx_sad16x32x4d = vpx_sad16x32x4d_neon;
     vpx_sad16x8 = vpx_sad16x8_c;
     if (flags & HAS_NEON) vpx_sad16x8 = vpx_sad16x8_neon;
+    vpx_sad16x8_avg = vpx_sad16x8_avg_c;
+    if (flags & HAS_NEON) vpx_sad16x8_avg = vpx_sad16x8_avg_neon;
+    vpx_sad16x8x4d = vpx_sad16x8x4d_c;
+    if (flags & HAS_NEON) vpx_sad16x8x4d = vpx_sad16x8x4d_neon;
+    vpx_sad32x16 = vpx_sad32x16_c;
+    if (flags & HAS_NEON) vpx_sad32x16 = vpx_sad32x16_neon;
+    vpx_sad32x16_avg = vpx_sad32x16_avg_c;
+    if (flags & HAS_NEON) vpx_sad32x16_avg = vpx_sad32x16_avg_neon;
+    vpx_sad32x16x4d = vpx_sad32x16x4d_c;
+    if (flags & HAS_NEON) vpx_sad32x16x4d = vpx_sad32x16x4d_neon;
     vpx_sad32x32 = vpx_sad32x32_c;
     if (flags & HAS_NEON) vpx_sad32x32 = vpx_sad32x32_neon;
+    vpx_sad32x32_avg = vpx_sad32x32_avg_c;
+    if (flags & HAS_NEON) vpx_sad32x32_avg = vpx_sad32x32_avg_neon;
     vpx_sad32x32x4d = vpx_sad32x32x4d_c;
     if (flags & HAS_NEON) vpx_sad32x32x4d = vpx_sad32x32x4d_neon;
+    vpx_sad32x64 = vpx_sad32x64_c;
+    if (flags & HAS_NEON) vpx_sad32x64 = vpx_sad32x64_neon;
+    vpx_sad32x64_avg = vpx_sad32x64_avg_c;
+    if (flags & HAS_NEON) vpx_sad32x64_avg = vpx_sad32x64_avg_neon;
+    vpx_sad32x64x4d = vpx_sad32x64x4d_c;
+    if (flags & HAS_NEON) vpx_sad32x64x4d = vpx_sad32x64x4d_neon;
     vpx_sad4x4 = vpx_sad4x4_c;
     if (flags & HAS_NEON) vpx_sad4x4 = vpx_sad4x4_neon;
+    vpx_sad4x4_avg = vpx_sad4x4_avg_c;
+    if (flags & HAS_NEON) vpx_sad4x4_avg = vpx_sad4x4_avg_neon;
+    vpx_sad4x4x4d = vpx_sad4x4x4d_c;
+    if (flags & HAS_NEON) vpx_sad4x4x4d = vpx_sad4x4x4d_neon;
+    vpx_sad4x8 = vpx_sad4x8_c;
+    if (flags & HAS_NEON) vpx_sad4x8 = vpx_sad4x8_neon;
+    vpx_sad4x8_avg = vpx_sad4x8_avg_c;
+    if (flags & HAS_NEON) vpx_sad4x8_avg = vpx_sad4x8_avg_neon;
+    vpx_sad4x8x4d = vpx_sad4x8x4d_c;
+    if (flags & HAS_NEON) vpx_sad4x8x4d = vpx_sad4x8x4d_neon;
+    vpx_sad64x32 = vpx_sad64x32_c;
+    if (flags & HAS_NEON) vpx_sad64x32 = vpx_sad64x32_neon;
+    vpx_sad64x32_avg = vpx_sad64x32_avg_c;
+    if (flags & HAS_NEON) vpx_sad64x32_avg = vpx_sad64x32_avg_neon;
+    vpx_sad64x32x4d = vpx_sad64x32x4d_c;
+    if (flags & HAS_NEON) vpx_sad64x32x4d = vpx_sad64x32x4d_neon;
     vpx_sad64x64 = vpx_sad64x64_c;
     if (flags & HAS_NEON) vpx_sad64x64 = vpx_sad64x64_neon;
+    vpx_sad64x64_avg = vpx_sad64x64_avg_c;
+    if (flags & HAS_NEON) vpx_sad64x64_avg = vpx_sad64x64_avg_neon;
     vpx_sad64x64x4d = vpx_sad64x64x4d_c;
     if (flags & HAS_NEON) vpx_sad64x64x4d = vpx_sad64x64x4d_neon;
     vpx_sad8x16 = vpx_sad8x16_c;
     if (flags & HAS_NEON) vpx_sad8x16 = vpx_sad8x16_neon;
+    vpx_sad8x16_avg = vpx_sad8x16_avg_c;
+    if (flags & HAS_NEON) vpx_sad8x16_avg = vpx_sad8x16_avg_neon;
+    vpx_sad8x16x4d = vpx_sad8x16x4d_c;
+    if (flags & HAS_NEON) vpx_sad8x16x4d = vpx_sad8x16x4d_neon;
+    vpx_sad8x4 = vpx_sad8x4_c;
+    if (flags & HAS_NEON) vpx_sad8x4 = vpx_sad8x4_neon;
+    vpx_sad8x4_avg = vpx_sad8x4_avg_c;
+    if (flags & HAS_NEON) vpx_sad8x4_avg = vpx_sad8x4_avg_neon;
+    vpx_sad8x4x4d = vpx_sad8x4x4d_c;
+    if (flags & HAS_NEON) vpx_sad8x4x4d = vpx_sad8x4x4d_neon;
     vpx_sad8x8 = vpx_sad8x8_c;
     if (flags & HAS_NEON) vpx_sad8x8 = vpx_sad8x8_neon;
+    vpx_sad8x8_avg = vpx_sad8x8_avg_c;
+    if (flags & HAS_NEON) vpx_sad8x8_avg = vpx_sad8x8_avg_neon;
+    vpx_sad8x8x4d = vpx_sad8x8x4d_c;
+    if (flags & HAS_NEON) vpx_sad8x8x4d = vpx_sad8x8x4d_neon;
+    vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c;
+    if (flags & HAS_NEON) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon;
+    vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon;
+    vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c;
+    if (flags & HAS_NEON) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon;
+    vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon;
+    vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c;
+    if (flags & HAS_NEON) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon;
+    vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c;
+    if (flags & HAS_NEON) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c;
+    if (flags & HAS_NEON) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c;
+    if (flags & HAS_NEON) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon;
+    vpx_sad_skip_4x4 = vpx_sad_skip_4x4_c;
+    if (flags & HAS_NEON) vpx_sad_skip_4x4 = vpx_sad_skip_4x4_neon;
+    vpx_sad_skip_4x4x4d = vpx_sad_skip_4x4x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_4x4x4d = vpx_sad_skip_4x4x4d_neon;
+    vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c;
+    if (flags & HAS_NEON) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_neon;
+    vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_neon;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c;
+    if (flags & HAS_NEON) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c;
+    if (flags & HAS_NEON) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon;
+    vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c;
+    if (flags & HAS_NEON) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_neon;
+    vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_neon;
+    vpx_sad_skip_8x4 = vpx_sad_skip_8x4_c;
+    if (flags & HAS_NEON) vpx_sad_skip_8x4 = vpx_sad_skip_8x4_neon;
+    vpx_sad_skip_8x4x4d = vpx_sad_skip_8x4x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_8x4x4d = vpx_sad_skip_8x4x4d_neon;
+    vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c;
+    if (flags & HAS_NEON) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_neon;
+    vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c;
+    if (flags & HAS_NEON) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_neon;
     vpx_satd = vpx_satd_c;
     if (flags & HAS_NEON) vpx_satd = vpx_satd_neon;
+    vpx_scaled_2d = vpx_scaled_2d_c;
+    if (flags & HAS_NEON) vpx_scaled_2d = vpx_scaled_2d_neon;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_NEON) vpx_sse = vpx_sse_neon;
+    vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_neon;
+    vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_neon;
+    vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_neon;
+    vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_neon;
+    vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_neon;
+    vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_neon;
+    vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_neon;
+    vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_neon;
+    vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_neon;
+    vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_neon;
+    vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_neon;
+    vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_neon;
+    vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_neon;
     vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_neon;
+    vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_neon;
+    vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_neon;
+    vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_neon;
     vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_neon;
+    vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_neon;
+    vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_neon;
+    vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_neon;
+    vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_neon;
     vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_neon;
+    vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_neon;
+    vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c;
+    if (flags & HAS_NEON) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_neon;
     vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c;
     if (flags & HAS_NEON) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_neon;
     vpx_subtract_block = vpx_subtract_block_c;
     if (flags & HAS_NEON) vpx_subtract_block = vpx_subtract_block_neon;
+    vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c;
+    if (flags & HAS_NEON) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon;
     vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c;
     if (flags & HAS_NEON) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_neon;
     vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c;
@@ -1029,18 +1389,28 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon;
     vpx_variance16x16 = vpx_variance16x16_c;
     if (flags & HAS_NEON) vpx_variance16x16 = vpx_variance16x16_neon;
+    vpx_variance16x32 = vpx_variance16x32_c;
+    if (flags & HAS_NEON) vpx_variance16x32 = vpx_variance16x32_neon;
     vpx_variance16x8 = vpx_variance16x8_c;
     if (flags & HAS_NEON) vpx_variance16x8 = vpx_variance16x8_neon;
+    vpx_variance32x16 = vpx_variance32x16_c;
+    if (flags & HAS_NEON) vpx_variance32x16 = vpx_variance32x16_neon;
     vpx_variance32x32 = vpx_variance32x32_c;
     if (flags & HAS_NEON) vpx_variance32x32 = vpx_variance32x32_neon;
     vpx_variance32x64 = vpx_variance32x64_c;
     if (flags & HAS_NEON) vpx_variance32x64 = vpx_variance32x64_neon;
+    vpx_variance4x4 = vpx_variance4x4_c;
+    if (flags & HAS_NEON) vpx_variance4x4 = vpx_variance4x4_neon;
+    vpx_variance4x8 = vpx_variance4x8_c;
+    if (flags & HAS_NEON) vpx_variance4x8 = vpx_variance4x8_neon;
     vpx_variance64x32 = vpx_variance64x32_c;
     if (flags & HAS_NEON) vpx_variance64x32 = vpx_variance64x32_neon;
     vpx_variance64x64 = vpx_variance64x64_c;
     if (flags & HAS_NEON) vpx_variance64x64 = vpx_variance64x64_neon;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_NEON) vpx_variance8x16 = vpx_variance8x16_neon;
+    vpx_variance8x4 = vpx_variance8x4_c;
+    if (flags & HAS_NEON) vpx_variance8x4 = vpx_variance8x4_neon;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_NEON) vpx_variance8x8 = vpx_variance8x8_neon;
     vpx_vector_var = vpx_vector_var_c;
@@ -1052,4 +1422,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/arm/vpx_scale_rtcd.h b/media/libvpx/config/linux/arm/vpx_scale_rtcd.h
index a1564b7ad6..ca594ea140 100644
--- a/media/libvpx/config/linux/arm/vpx_scale_rtcd.h
+++ b/media/libvpx/config/linux/arm/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -68,4 +82,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/arm64/vp8_rtcd.h b/media/libvpx/config/linux/arm64/vp8_rtcd.h
new file mode 100644
index 0000000000..36e6855a6d
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vp8_rtcd.h
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_neon
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_neon(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_neon
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_neon
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h
new file mode 100644
index 0000000000..e1b572fe62
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h
@@ -0,0 +1,125 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_neon
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_neon
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_neon
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_neon
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_neon
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp vp9_quantize_fp_neon
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+    vp9_block_error = vp9_block_error_neon;
+    if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve;
+    vp9_block_error_fp = vp9_block_error_fp_neon;
+    if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm
new file mode 100644
index 0000000000..6758dcb4d3
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vpx_config.asm
@@ -0,0 +1,97 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.syntax unified
+.equ VPX_ARCH_ARM ,  1
+.equ VPX_ARCH_AARCH64 ,  1
+.equ VPX_ARCH_MIPS ,  0
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  1
+.equ HAVE_NEON_DOTPROD ,  1
+.equ HAVE_NEON_I8MM ,  1
+.equ HAVE_SVE ,  1
+.equ HAVE_SVE2 ,  1
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  1
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  1
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  0
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  1
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  1
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c
new file mode 100644
index 0000000000..d9ea4527ca
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --enable-realtime-only";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h
new file mode 100644
index 0000000000..b5163a213c
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vpx_config.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    
+#define INLINE      inline
+#define VPX_ARCH_ARM 1
+#define VPX_ARCH_AARCH64 1
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
+#define HAVE_NEON_ASM 0
+#define HAVE_NEON 1
+#define HAVE_NEON_DOTPROD 1
+#define HAVE_NEON_I8MM 1
+#define HAVE_SVE 1
+#define HAVE_SVE2 1
+#define HAVE_MIPS32 0
+#define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 1
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 0
+#define CONFIG_DC_RECON 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 1
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 0
+#define CONFIG_WEBM_IO 1
+#define CONFIG_LIBYUV 1
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define DECODE_WIDTH_LIMIT 8192
+#define DECODE_HEIGHT_LIMIT 4608
+#endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000000..3c34cafc59
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h
@@ -0,0 +1,1197 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+unsigned int vpx_avg_4x4_neon(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_neon
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+unsigned int vpx_avg_8x8_neon(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_neon
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_neon
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_neon
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_neon
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_neon
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_neon
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_neon
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_neon
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_neon
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_neon
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_neon
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b vpx_quantize_b_neon
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_neon
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_neon
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_neon
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon
+
+int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_neon(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_neon
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_neon
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size);
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size);
+RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_neon
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+    vpx_convolve8 = vpx_convolve8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8 = vpx_convolve8_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8 = vpx_convolve8_neon_i8mm;
+    vpx_convolve8_avg = vpx_convolve8_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg = vpx_convolve8_avg_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_avg = vpx_convolve8_avg_neon_i8mm;
+    vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_i8mm;
+    vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_i8mm;
+    vpx_convolve8_horiz = vpx_convolve8_horiz_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_i8mm;
+    vpx_convolve8_vert = vpx_convolve8_vert_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_vert = vpx_convolve8_vert_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_vert = vpx_convolve8_vert_neon_i8mm;
+    vpx_get16x16var = vpx_get16x16var_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_get16x16var = vpx_get16x16var_neon_dotprod;
+    vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon_dotprod;
+    vpx_get8x8var = vpx_get8x8var_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_get8x8var = vpx_get8x8var_neon_dotprod;
+    vpx_mse16x16 = vpx_mse16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse16x16 = vpx_mse16x16_neon_dotprod;
+    vpx_mse16x8 = vpx_mse16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse16x8 = vpx_mse16x8_neon_dotprod;
+    vpx_mse8x16 = vpx_mse8x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse8x16 = vpx_mse8x16_neon_dotprod;
+    vpx_mse8x8 = vpx_mse8x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse8x8 = vpx_mse8x8_neon_dotprod;
+    vpx_sad16x16 = vpx_sad16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x16 = vpx_sad16x16_neon_dotprod;
+    vpx_sad16x16_avg = vpx_sad16x16_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x16_avg = vpx_sad16x16_avg_neon_dotprod;
+    vpx_sad16x16x4d = vpx_sad16x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x16x4d = vpx_sad16x16x4d_neon_dotprod;
+    vpx_sad16x32 = vpx_sad16x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x32 = vpx_sad16x32_neon_dotprod;
+    vpx_sad16x32_avg = vpx_sad16x32_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x32_avg = vpx_sad16x32_avg_neon_dotprod;
+    vpx_sad16x32x4d = vpx_sad16x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x32x4d = vpx_sad16x32x4d_neon_dotprod;
+    vpx_sad16x8 = vpx_sad16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x8 = vpx_sad16x8_neon_dotprod;
+    vpx_sad16x8_avg = vpx_sad16x8_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x8_avg = vpx_sad16x8_avg_neon_dotprod;
+    vpx_sad16x8x4d = vpx_sad16x8x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x8x4d = vpx_sad16x8x4d_neon_dotprod;
+    vpx_sad32x16 = vpx_sad32x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x16 = vpx_sad32x16_neon_dotprod;
+    vpx_sad32x16_avg = vpx_sad32x16_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x16_avg = vpx_sad32x16_avg_neon_dotprod;
+    vpx_sad32x16x4d = vpx_sad32x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x16x4d = vpx_sad32x16x4d_neon_dotprod;
+    vpx_sad32x32 = vpx_sad32x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x32 = vpx_sad32x32_neon_dotprod;
+    vpx_sad32x32_avg = vpx_sad32x32_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x32_avg = vpx_sad32x32_avg_neon_dotprod;
+    vpx_sad32x32x4d = vpx_sad32x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x32x4d = vpx_sad32x32x4d_neon_dotprod;
+    vpx_sad32x64 = vpx_sad32x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x64 = vpx_sad32x64_neon_dotprod;
+    vpx_sad32x64_avg = vpx_sad32x64_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x64_avg = vpx_sad32x64_avg_neon_dotprod;
+    vpx_sad32x64x4d = vpx_sad32x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x64x4d = vpx_sad32x64x4d_neon_dotprod;
+    vpx_sad64x32 = vpx_sad64x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x32 = vpx_sad64x32_neon_dotprod;
+    vpx_sad64x32_avg = vpx_sad64x32_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x32_avg = vpx_sad64x32_avg_neon_dotprod;
+    vpx_sad64x32x4d = vpx_sad64x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x32x4d = vpx_sad64x32x4d_neon_dotprod;
+    vpx_sad64x64 = vpx_sad64x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod;
+    vpx_sad64x64_avg = vpx_sad64x64_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x64_avg = vpx_sad64x64_avg_neon_dotprod;
+    vpx_sad64x64x4d = vpx_sad64x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x64x4d = vpx_sad64x64x4d_neon_dotprod;
+    vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon_dotprod;
+    vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon_dotprod;
+    vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon_dotprod;
+    vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon_dotprod;
+    vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon_dotprod;
+    vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon_dotprod;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon_dotprod;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon_dotprod;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon_dotprod;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon_dotprod;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon_dotprod;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon_dotprod;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon_dotprod;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon_dotprod;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon_dotprod;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod;
+    vpx_sse = vpx_sse_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod;
+    vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon;
+    if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve;
+    vpx_variance16x16 = vpx_variance16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod;
+    vpx_variance16x32 = vpx_variance16x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance16x32 = vpx_variance16x32_neon_dotprod;
+    vpx_variance16x8 = vpx_variance16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance16x8 = vpx_variance16x8_neon_dotprod;
+    vpx_variance32x16 = vpx_variance32x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance32x16 = vpx_variance32x16_neon_dotprod;
+    vpx_variance32x32 = vpx_variance32x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance32x32 = vpx_variance32x32_neon_dotprod;
+    vpx_variance32x64 = vpx_variance32x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance32x64 = vpx_variance32x64_neon_dotprod;
+    vpx_variance4x4 = vpx_variance4x4_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance4x4 = vpx_variance4x4_neon_dotprod;
+    vpx_variance4x8 = vpx_variance4x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance4x8 = vpx_variance4x8_neon_dotprod;
+    vpx_variance64x32 = vpx_variance64x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance64x32 = vpx_variance64x32_neon_dotprod;
+    vpx_variance64x64 = vpx_variance64x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance64x64 = vpx_variance64x64_neon_dotprod;
+    vpx_variance8x16 = vpx_variance8x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance8x16 = vpx_variance8x16_neon_dotprod;
+    vpx_variance8x4 = vpx_variance8x4_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance8x4 = vpx_variance8x4_neon_dotprod;
+    vpx_variance8x8 = vpx_variance8x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance8x8 = vpx_variance8x8_neon_dotprod;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h b/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h
new file mode 100644
index 0000000000..ca594ea140
--- /dev/null
+++ b/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/ia32/vp8_rtcd.h b/media/libvpx/config/linux/ia32/vp8_rtcd.h
index 5db5bbad85..ca42bb8a5e 100644
--- a/media/libvpx/config/linux/ia32/vp8_rtcd.h
+++ b/media/libvpx/config/linux/ia32/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,57 +37,48 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 int vp8_block_error_sse2(short *coeff, short *dqcoeff);
 RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
 
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -86,9 +88,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
-RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride);
+RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride);
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -98,9 +100,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmx(struct blockd*, short *DQC);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC);
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -122,42 +124,37 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 int vp8_mbblock_error_sse2(struct macroblock *mb, int dc);
@@ -167,9 +164,9 @@ int vp8_mbuverror_c(struct macroblock *mb);
 int vp8_mbuverror_sse2(struct macroblock *mb);
 RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
 void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
@@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -237,9 +234,9 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
     vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
+    if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2;
     vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2;
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
     if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
@@ -277,9 +274,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
     vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
     if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
     vp8_loop_filter_bh = vp8_loop_filter_bh_c;
     if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
     vp8_loop_filter_bv = vp8_loop_filter_bv_c;
@@ -336,4 +330,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/ia32/vp9_rtcd.h b/media/libvpx/config/linux/ia32/vp9_rtcd.h
index da53895ca4..28c82a831c 100644
--- a/media/libvpx/config/linux/ia32/vp9_rtcd.h
+++ b/media/libvpx/config/linux/ia32/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,23 +46,22 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -67,18 +83,13 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride);
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -88,20 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
 void vp9_rtcd(void);
 
@@ -113,16 +139,14 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+    if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
     vp9_block_error = vp9_block_error_c;
     if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
     if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
     vp9_block_error_fp = vp9_block_error_fp_c;
     if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2;
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
-    if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
-    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
+    if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
     vp9_fht16x16 = vp9_fht16x16_c;
     if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
     vp9_fht4x4 = vp9_fht4x4_c;
@@ -133,9 +157,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2;
     vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c;
     if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2;
-    vp9_full_search_sad = vp9_full_search_sad_c;
-    if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
     vp9_fwht4x4 = vp9_fwht4x4_c;
     if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2;
     vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
@@ -146,10 +167,22 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
     vp9_quantize_fp = vp9_quantize_fp_c;
     if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2;
+    if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2;
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
-    vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
-    if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+    vpx_convolve12 = vpx_convolve12_c;
+    if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2;
+    vpx_convolve12_horiz = vpx_convolve12_horiz_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2;
+    vpx_convolve12_vert = vpx_convolve12_vert_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2;
 }
 #endif
 
@@ -157,4 +190,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm
index 2efc8b978b..bafe400119 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.asm
+++ b/media/libvpx/config/linux/ia32/vpx_config.asm
@@ -1,9 +1,16 @@
-%define ARCH_ARM 0
-%define ARCH_MIPS 0
-%define ARCH_X86 1
-%define ARCH_X86_64 0
-%define HAVE_NEON 0
+%define VPX_ARCH_ARM 0
+%define VPX_ARCH_AARCH64 0
+%define VPX_ARCH_MIPS 0
+%define VPX_ARCH_X86 1
+%define VPX_ARCH_X86_64 0
+%define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON_ASM 0
+%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
+%define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
@@ -16,6 +23,11 @@
 %define HAVE_SSE4_1 1
 %define HAVE_AVX 1
 %define HAVE_AVX2 1
+%define HAVE_AVX512 1
+%define HAVE_VSX 0
+%define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define CONFIG_DEPENDENCY_TRACKING 1
@@ -72,7 +84,10 @@
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
-%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_ALWAYS_ADJUST_BPM 0
+%define CONFIG_BITSTREAM_DEBUG 0
+%define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
-%define CONFIG_MISC_FIXES 0
+%define CONFIG_NON_GREEDY_MV 0
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h
index b26f9a51e8..a4cc48afe1 100644
--- a/media/libvpx/config/linux/ia32/vpx_config.h
+++ b/media/libvpx/config/linux/ia32/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 1
-#define ARCH_X86_64 0
-#define HAVE_NEON 0
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 1
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 1
 #define HAVE_AVX 1
 #define HAVE_AVX2 1
+#define HAVE_AVX512 1
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h
index 112e326a64..0a6266a322 100644
--- a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,242 +44,215 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
@@ -300,47 +289,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride);
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *);
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -353,14 +348,22 @@ RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -369,6 +372,7 @@ RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -381,6 +385,7 @@ RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, in
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -395,9 +400,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width);
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
@@ -456,9 +461,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 
-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit);
 
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -468,22 +473,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
 void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
@@ -493,12 +499,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -508,18 +520,9 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -529,9 +532,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -541,18 +544,9 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -564,9 +558,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -578,16 +572,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -599,9 +587,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -611,17 +599,9 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -631,47 +611,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -681,17 +657,9 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -701,12 +669,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -716,273 +681,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_sse2(const int16_t *coeff, int length);
+int vpx_satd_avx2(const int16_t *coeff, int length);
 RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
 RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -1003,6 +1087,9 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
     vpx_avg_8x8 = vpx_avg_8x8_c;
     if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
+    if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2;
     vpx_convolve8 = vpx_convolve8_c;
     if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -1010,12 +1097,15 @@ static void setup_rtcd_internal(void)
     vpx_convolve8_avg = vpx_convolve8_avg_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2;
     vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2;
     vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2;
     vpx_convolve8_horiz = vpx_convolve8_horiz_c;
     if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
@@ -1094,6 +1184,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2;
     vpx_fdct16x16 = vpx_fdct16x16_c;
     if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2;
+    if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2;
     vpx_fdct16x16_1 = vpx_fdct16x16_1_c;
     if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2;
     vpx_fdct32x32 = vpx_fdct32x32_c;
@@ -1129,6 +1220,10 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2;
     vpx_hadamard_16x16 = vpx_hadamard_16x16_c;
     if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_c;
+    if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_c;
     if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
     vpx_idct16x16_10_add = vpx_idct16x16_10_add_c;
@@ -1137,20 +1232,28 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2;
     vpx_idct16x16_256_add = vpx_idct16x16_256_add_c;
     if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2;
+    vpx_idct16x16_38_add = vpx_idct16x16_38_add_c;
+    if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2;
     vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
-    if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3;
+    if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2;
     vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3;
     vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
     if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2;
     vpx_idct4x4_1_add = vpx_idct4x4_1_add_c;
     if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2;
     vpx_idct8x8_12_add = vpx_idct8x8_12_add_c;
     if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3;
     vpx_idct8x8_1_add = vpx_idct8x8_1_add_c;
     if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2;
     vpx_idct8x8_64_add = vpx_idct8x8_64_add_c;
@@ -1198,6 +1301,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2;
     vpx_mse16x8 = vpx_mse16x8_c;
     if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2;
+    if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2;
     vpx_mse8x16 = vpx_mse8x16_c;
     if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2;
     vpx_mse8x8 = vpx_mse8x8_c;
@@ -1208,17 +1312,19 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2;
     vpx_quantize_b = vpx_quantize_b_c;
     if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2;
+    if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3;
+    if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2;
+    vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
+    if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3;
+    if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2;
     vpx_sad16x16 = vpx_sad16x16_c;
     if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
     vpx_sad16x16_avg = vpx_sad16x16_avg_c;
     if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2;
-    vpx_sad16x16x3 = vpx_sad16x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3;
     vpx_sad16x16x4d = vpx_sad16x16x4d_c;
     if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2;
-    vpx_sad16x16x8 = vpx_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1;
     vpx_sad16x32 = vpx_sad16x32_c;
     if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2;
     vpx_sad16x32_avg = vpx_sad16x32_avg_c;
@@ -1229,13 +1335,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2;
     vpx_sad16x8_avg = vpx_sad16x8_avg_c;
     if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2;
-    vpx_sad16x8x3 = vpx_sad16x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3;
     vpx_sad16x8x4d = vpx_sad16x8x4d_c;
     if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2;
-    vpx_sad16x8x8 = vpx_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1;
     vpx_sad32x16 = vpx_sad32x16_c;
     if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2;
     if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2;
@@ -1265,12 +1366,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2;
     vpx_sad4x4_avg = vpx_sad4x4_avg_c;
     if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2;
-    vpx_sad4x4x3 = vpx_sad4x4x3_c;
-    if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3;
     vpx_sad4x4x4d = vpx_sad4x4x4d_c;
     if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2;
-    vpx_sad4x4x8 = vpx_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1;
     vpx_sad4x8 = vpx_sad4x8_c;
     if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2;
     vpx_sad4x8_avg = vpx_sad4x8_avg_c;
@@ -1280,30 +1377,31 @@ static void setup_rtcd_internal(void)
     vpx_sad64x32 = vpx_sad64x32_c;
     if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512;
     vpx_sad64x32_avg = vpx_sad64x32_avg_c;
     if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512;
     vpx_sad64x32x4d = vpx_sad64x32x4d_c;
     if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2;
     vpx_sad64x64 = vpx_sad64x64_c;
     if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512;
     vpx_sad64x64_avg = vpx_sad64x64_avg_c;
     if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512;
     vpx_sad64x64x4d = vpx_sad64x64x4d_c;
     if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512;
     vpx_sad8x16 = vpx_sad8x16_c;
     if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2;
     vpx_sad8x16_avg = vpx_sad8x16_avg_c;
     if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2;
-    vpx_sad8x16x3 = vpx_sad8x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3;
     vpx_sad8x16x4d = vpx_sad8x16x4d_c;
     if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2;
-    vpx_sad8x16x8 = vpx_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1;
     vpx_sad8x4 = vpx_sad8x4_c;
     if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2;
     vpx_sad8x4_avg = vpx_sad8x4_avg_c;
@@ -1314,16 +1412,74 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2;
     vpx_sad8x8_avg = vpx_sad8x8_avg_c;
     if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2;
-    vpx_sad8x8x3 = vpx_sad8x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3;
     vpx_sad8x8x4d = vpx_sad8x8x4d_c;
     if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2;
-    vpx_sad8x8x8 = vpx_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1;
+    vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2;
+    vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2;
+    vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2;
+    vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2;
+    vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2;
+    vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2;
+    vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2;
+    vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512;
+    vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2;
+    vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2;
+    vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2;
+    vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2;
     vpx_satd = vpx_satd_c;
     if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2;
+    if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2;
     vpx_scaled_2d = vpx_scaled_2d_c;
     if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1;
+    if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2;
     vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c;
     if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3;
@@ -1408,6 +1564,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
     vpx_subtract_block = vpx_subtract_block_c;
     if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2;
+    if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2;
     vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c;
     if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2;
     vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c;
@@ -1431,8 +1588,10 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;
     vpx_variance16x32 = vpx_variance16x32_c;
     if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2;
     vpx_variance16x8 = vpx_variance16x8_c;
     if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2;
     vpx_variance32x16 = vpx_variance32x16_c;
     if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2;
     if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2;
@@ -1441,6 +1600,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2;
     vpx_variance32x64 = vpx_variance32x64_c;
     if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2;
+    if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2;
     vpx_variance4x4 = vpx_variance4x4_c;
     if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2;
     vpx_variance4x8 = vpx_variance4x8_c;
@@ -1453,10 +1613,13 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
     vpx_variance8x4 = vpx_variance8x4_c;
     if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
     vpx_vector_var = vpx_vector_var_c;
     if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2;
 }
@@ -1466,4 +1629,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h b/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h
index ddf7d01cca..18e5b71579 100644
--- a/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h
+++ b/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -66,4 +80,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/loongarch64/vp8_rtcd.h b/media/libvpx/config/linux/loongarch64/vp8_rtcd.h
new file mode 100644
index 0000000000..460f493156
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vp8_rtcd.h
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_lsx(short *coeff, short *dqcoeff);
+RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_lsx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_lsx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_lsx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_lsx(struct macroblock *mb, int dc);
+RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_lsx(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_lsx(short *input, short *output, int pitch);
+RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_lsx(short *input, short *output, int pitch);
+RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_lsx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_lsx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_lsx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+
+void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+    vp8_block_error = vp8_block_error_c;
+    if (flags & HAS_LSX) vp8_block_error = vp8_block_error_lsx;
+    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
+    if (flags & HAS_LSX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_lsx;
+    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
+    if (flags & HAS_LSX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_lsx;
+    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
+    if (flags & HAS_LSX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_lsx;
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_LSX) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
+    if (flags & HAS_LSX) vp8_loop_filter_bh = vp8_loop_filter_bh_lsx;
+    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
+    if (flags & HAS_LSX) vp8_loop_filter_bv = vp8_loop_filter_bv_lsx;
+    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
+    if (flags & HAS_LSX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_lsx;
+    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
+    if (flags & HAS_LSX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_lsx;
+    vp8_mbblock_error = vp8_mbblock_error_c;
+    if (flags & HAS_LSX) vp8_mbblock_error = vp8_mbblock_error_lsx;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
+    if (flags & HAS_LSX) vp8_regular_quantize_b = vp8_regular_quantize_b_lsx;
+    vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+    if (flags & HAS_LSX) vp8_short_fdct4x4 = vp8_short_fdct4x4_lsx;
+    vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+    if (flags & HAS_LSX) vp8_short_fdct8x4 = vp8_short_fdct8x4_lsx;
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
+    if (flags & HAS_LSX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_lsx;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
+    if (flags & HAS_LSX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_lsx;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
+    if (flags & HAS_LSX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_lsx;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/loongarch64/vp9_rtcd.h b/media/libvpx/config/linux/loongarch64/vp9_rtcd.h
new file mode 100644
index 0000000000..80428e6571
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vp9_rtcd.h
@@ -0,0 +1,118 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12 vpx_convolve12_c
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_horiz vpx_convolve12_horiz_c
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_vert vpx_convolve12_vert_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/loongarch64/vpx_config.asm b/media/libvpx/config/linux/loongarch64/vpx_config.asm
new file mode 100644
index 0000000000..d23406cac5
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vpx_config.asm
@@ -0,0 +1,97 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.syntax unified
+.equ VPX_ARCH_ARM ,  0
+.equ VPX_ARCH_AARCH64 ,  0
+.equ VPX_ARCH_MIPS ,  0
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  1
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_DOTPROD ,  0
+.equ HAVE_NEON_I8MM ,  0
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  1
+.equ HAVE_LASX ,  1
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  1
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  0
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  0
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  1
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  1
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/linux/loongarch64/vpx_config.c b/media/libvpx/config/linux/loongarch64/vpx_config.c
new file mode 100644
index 0000000000..8ee96400cc
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=loongarch64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/loongarch64/vpx_config.h b/media/libvpx/config/linux/loongarch64/vpx_config.h
new file mode 100644
index 0000000000..e104a8b366
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vpx_config.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    
+#define INLINE      inline
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 1
+#define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
+#define HAVE_MIPS32 0
+#define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 1
+#define HAVE_LASX 1
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 1
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 0
+#define CONFIG_DC_RECON 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 0
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 0
+#define CONFIG_WEBM_IO 1
+#define CONFIG_LIBYUV 1
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define DECODE_WIDTH_LIMIT 8192
+#define DECODE_HEIGHT_LIMIT 4608
+#endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000000..06b98b8b24
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h
@@ -0,0 +1,929 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_c
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_c
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_lsx(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_lsx(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride);
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_lsx(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride);
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_lsx(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride);
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_lsx(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride);
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_lsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_lsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_lsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_lsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_lsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_lsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_c
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_4_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_8_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_4_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_8_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_lsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c
+
+int vpx_satd_c(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+#define vpx_sse vpx_sse_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_lsx(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_LSX) vpx_comp_avg_pred = vpx_comp_avg_pred_lsx;
+    vpx_convolve8 = vpx_convolve8_c;
+    if (flags & HAS_LSX) vpx_convolve8 = vpx_convolve8_lsx;
+    vpx_convolve8_avg = vpx_convolve8_avg_c;
+    if (flags & HAS_LSX) vpx_convolve8_avg = vpx_convolve8_avg_lsx;
+    vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
+    if (flags & HAS_LSX) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_lsx;
+    vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
+    if (flags & HAS_LSX) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_lsx;
+    vpx_convolve8_horiz = vpx_convolve8_horiz_c;
+    if (flags & HAS_LSX) vpx_convolve8_horiz = vpx_convolve8_horiz_lsx;
+    vpx_convolve8_vert = vpx_convolve8_vert_c;
+    if (flags & HAS_LSX) vpx_convolve8_vert = vpx_convolve8_vert_lsx;
+    vpx_convolve_avg = vpx_convolve_avg_c;
+    if (flags & HAS_LSX) vpx_convolve_avg = vpx_convolve_avg_lsx;
+    vpx_convolve_copy = vpx_convolve_copy_c;
+    if (flags & HAS_LSX) vpx_convolve_copy = vpx_convolve_copy_lsx;
+    vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c;
+    if (flags & HAS_LSX) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_lsx;
+    vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c;
+    if (flags & HAS_LSX) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_lsx;
+    vpx_fdct16x16 = vpx_fdct16x16_c;
+    if (flags & HAS_LSX) vpx_fdct16x16 = vpx_fdct16x16_lsx;
+    vpx_fdct32x32 = vpx_fdct32x32_c;
+    if (flags & HAS_LSX) vpx_fdct32x32 = vpx_fdct32x32_lsx;
+    vpx_fdct32x32_rd = vpx_fdct32x32_rd_c;
+    if (flags & HAS_LSX) vpx_fdct32x32_rd = vpx_fdct32x32_rd_lsx;
+    vpx_fdct4x4 = vpx_fdct4x4_c;
+    if (flags & HAS_LSX) vpx_fdct4x4 = vpx_fdct4x4_lsx;
+    vpx_fdct8x8 = vpx_fdct8x8_c;
+    if (flags & HAS_LSX) vpx_fdct8x8 = vpx_fdct8x8_lsx;
+    vpx_get16x16var = vpx_get16x16var_c;
+    if (flags & HAS_LSX) vpx_get16x16var = vpx_get16x16var_lsx;
+    vpx_hadamard_16x16 = vpx_hadamard_16x16_c;
+    if (flags & HAS_LSX) vpx_hadamard_16x16 = vpx_hadamard_16x16_lsx;
+    vpx_hadamard_8x8 = vpx_hadamard_8x8_c;
+    if (flags & HAS_LSX) vpx_hadamard_8x8 = vpx_hadamard_8x8_lsx;
+    vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
+    if (flags & HAS_LSX) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_lsx;
+    vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
+    if (flags & HAS_LSX) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_lsx;
+    vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
+    if (flags & HAS_LSX) vpx_idct32x32_1_add = vpx_idct32x32_1_add_lsx;
+    vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
+    if (flags & HAS_LSX) vpx_idct32x32_34_add = vpx_idct32x32_34_add_lsx;
+    vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_c;
+    if (flags & HAS_LSX) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_lsx;
+    vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c;
+    if (flags & HAS_LSX) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_lsx;
+    vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c;
+    if (flags & HAS_LSX) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_lsx;
+    vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c;
+    if (flags & HAS_LSX) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_lsx;
+    vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c;
+    if (flags & HAS_LSX) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_lsx;
+    vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c;
+    if (flags & HAS_LSX) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_lsx;
+    vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c;
+    if (flags & HAS_LSX) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_lsx;
+    vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c;
+    if (flags & HAS_LSX) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_lsx;
+    vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c;
+    if (flags & HAS_LSX) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_lsx;
+    vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c;
+    if (flags & HAS_LSX) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_lsx;
+    vpx_mse16x16 = vpx_mse16x16_c;
+    if (flags & HAS_LSX) vpx_mse16x16 = vpx_mse16x16_lsx;
+    vpx_quantize_b = vpx_quantize_b_c;
+    if (flags & HAS_LSX) vpx_quantize_b = vpx_quantize_b_lsx;
+    vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
+    if (flags & HAS_LSX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_lsx;
+    vpx_sad16x16 = vpx_sad16x16_c;
+    if (flags & HAS_LSX) vpx_sad16x16 = vpx_sad16x16_lsx;
+    vpx_sad16x16x4d = vpx_sad16x16x4d_c;
+    if (flags & HAS_LSX) vpx_sad16x16x4d = vpx_sad16x16x4d_lsx;
+    vpx_sad32x32 = vpx_sad32x32_c;
+    if (flags & HAS_LSX) vpx_sad32x32 = vpx_sad32x32_lsx;
+    vpx_sad32x32_avg = vpx_sad32x32_avg_c;
+    if (flags & HAS_LSX) vpx_sad32x32_avg = vpx_sad32x32_avg_lsx;
+    vpx_sad32x32x4d = vpx_sad32x32x4d_c;
+    if (flags & HAS_LSX) vpx_sad32x32x4d = vpx_sad32x32x4d_lsx;
+    vpx_sad32x64x4d = vpx_sad32x64x4d_c;
+    if (flags & HAS_LSX) vpx_sad32x64x4d = vpx_sad32x64x4d_lsx;
+    vpx_sad64x32x4d = vpx_sad64x32x4d_c;
+    if (flags & HAS_LSX) vpx_sad64x32x4d = vpx_sad64x32x4d_lsx;
+    vpx_sad64x64 = vpx_sad64x64_c;
+    if (flags & HAS_LSX) vpx_sad64x64 = vpx_sad64x64_lsx;
+    vpx_sad64x64_avg = vpx_sad64x64_avg_c;
+    if (flags & HAS_LSX) vpx_sad64x64_avg = vpx_sad64x64_avg_lsx;
+    vpx_sad64x64x4d = vpx_sad64x64x4d_c;
+    if (flags & HAS_LSX) vpx_sad64x64x4d = vpx_sad64x64x4d_lsx;
+    vpx_sad8x8 = vpx_sad8x8_c;
+    if (flags & HAS_LSX) vpx_sad8x8 = vpx_sad8x8_lsx;
+    vpx_sad8x8x4d = vpx_sad8x8x4d_c;
+    if (flags & HAS_LSX) vpx_sad8x8x4d = vpx_sad8x8x4d_lsx;
+    vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c;
+    if (flags & HAS_LSX) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_lsx;
+    vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c;
+    if (flags & HAS_LSX) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_lsx;
+    vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c;
+    if (flags & HAS_LSX) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_lsx;
+    vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c;
+    if (flags & HAS_LSX) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_lsx;
+    vpx_subtract_block = vpx_subtract_block_c;
+    if (flags & HAS_LSX) vpx_subtract_block = vpx_subtract_block_lsx;
+    vpx_variance16x16 = vpx_variance16x16_c;
+    if (flags & HAS_LSX) vpx_variance16x16 = vpx_variance16x16_lsx;
+    vpx_variance32x32 = vpx_variance32x32_c;
+    if (flags & HAS_LSX) vpx_variance32x32 = vpx_variance32x32_lsx;
+    vpx_variance64x64 = vpx_variance64x64_c;
+    if (flags & HAS_LSX) vpx_variance64x64 = vpx_variance64x64_lsx;
+    vpx_variance8x8 = vpx_variance8x8_c;
+    if (flags & HAS_LSX) vpx_variance8x8 = vpx_variance8x8_lsx;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h b/media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h
new file mode 100644
index 0000000000..9fb8e25d07
--- /dev/null
+++ b/media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h
@@ -0,0 +1,84 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/mips32/vp8_rtcd.h b/media/libvpx/config/linux/mips32/vp8_rtcd.h
new file mode 100644
index 0000000000..32c48c547b
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vp8_rtcd.h
@@ -0,0 +1,370 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+
+void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+
+void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/mips32/vp9_rtcd.h b/media/libvpx/config/linux/mips32/vp9_rtcd.h
new file mode 100644
index 0000000000..876acc10ef
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vp9_rtcd.h
@@ -0,0 +1,244 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12 vpx_convolve12_c
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_horiz vpx_convolve12_horiz_c
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_vert vpx_convolve12_vert_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12 vpx_convolve12_c
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_horiz vpx_convolve12_horiz_c
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_vert vpx_convolve12_vert_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/mips32/vpx_config.asm b/media/libvpx/config/linux/mips32/vpx_config.asm
new file mode 100644
index 0000000000..799a79a073
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vpx_config.asm
@@ -0,0 +1,97 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.syntax unified
+.equ VPX_ARCH_ARM ,  0
+.equ VPX_ARCH_AARCH64 ,  0
+.equ VPX_ARCH_MIPS ,  1
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_DOTPROD ,  0
+.equ HAVE_NEON_I8MM ,  0
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
+.equ HAVE_MIPS32 ,  1
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  1
+.equ CONFIG_DC_RECON ,  1
+.equ CONFIG_RUNTIME_CPU_DETECT ,  0
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  0
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  0
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  1
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  1
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/linux/mips32/vpx_config.c b/media/libvpx/config/linux/mips32/vpx_config.c
new file mode 100644
index 0000000000..c05113f670
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=mips32-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/mips32/vpx_config.h b/media/libvpx/config/linux/mips32/vpx_config.h
new file mode 100644
index 0000000000..863749cd8c
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vpx_config.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    
+#define INLINE      inline
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 1
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
+#define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
+#define HAVE_MIPS32 1
+#define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 1
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 1
+#define CONFIG_DC_RECON 1
+#define CONFIG_RUNTIME_CPU_DETECT 0
+#define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 0
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 0
+#define CONFIG_WEBM_IO 1
+#define CONFIG_LIBYUV 1
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define DECODE_WIDTH_LIMIT 8192
+#define DECODE_HEIGHT_LIMIT 4608
+#endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h b/media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h
new file mode 100644
index 0000000000..291bf5d03b
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h
@@ -0,0 +1,1524 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_c
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_c
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_c
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_c
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_c
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c
+
+int vpx_satd_c(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+#define vpx_sse vpx_sse_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_c
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_c
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_c
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_c
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_c
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_c
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_c
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_c
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_c
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_c
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_c
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_c
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_c
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_c
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_c
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_c
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_c
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_c
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_c
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_c
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_c
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_c
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_c
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_c
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_c
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_c
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_c
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_c
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_c
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_c
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_c
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_c
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_c
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c
+
+int vpx_satd_c(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+#define vpx_sse vpx_sse_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_c
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_c
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_c
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_c
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_c
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_c
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_c
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_c
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_c
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/mips32/vpx_scale_rtcd.h b/media/libvpx/config/linux/mips32/vpx_scale_rtcd.h
new file mode 100644
index 0000000000..982dbe30d9
--- /dev/null
+++ b/media/libvpx/config/linux/mips32/vpx_scale_rtcd.h
@@ -0,0 +1,176 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/mips64/vp8_rtcd.h b/media/libvpx/config/linux/mips64/vp8_rtcd.h
new file mode 100644
index 0000000000..426c1a6d37
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vp8_rtcd.h
@@ -0,0 +1,259 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_msa
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_msa
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_msa
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_msa
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_msa(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_msa
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_msa(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_msa
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_msa(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_msa
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_msa(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_msa
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmi(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_msa(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_msa
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_msa(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_msa
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_msa(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_msa
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmi(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_msa(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_msa
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_mmi(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_msa(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_msa
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmi(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_msa(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_msa
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmi(struct blockd*, short *DQC);
+void vp8_dequantize_b_msa(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_msa
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sadx4
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_mmi(struct block *, struct blockd *);
+void vp8_fast_quantize_b_msa(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_msa
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_msa
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_msa
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_msa
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_msa
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_msa
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_msa
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_msa
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_msa
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_msa(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_msa
+
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_msa(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_msa
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sadx4
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_mmi(struct block *, struct blockd *);
+void vp8_regular_quantize_b_msa(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_msa
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_mmi(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_msa(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_msa
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_mmi(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_msa(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_msa
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmi(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_msa(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_msa
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_mmi(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_msa(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_msa
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_mmi(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_msa(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_msa
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_msa
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_msa
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_msa
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_msa
+
+void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+void vp8_temporal_filter_apply_msa(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_msa
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/mips64/vp9_rtcd.h b/media/libvpx/config/linux/mips64/vp9_rtcd.h
new file mode 100644
index 0000000000..4ab5c98198
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vp9_rtcd.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_msa
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_msa
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_msa
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_msa
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_msa
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp vp9_quantize_fp_c
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12 vpx_convolve12_c
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_horiz vpx_convolve12_horiz_c
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_vert vpx_convolve12_vert_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/mips64/vpx_config.asm b/media/libvpx/config/linux/mips64/vpx_config.asm
new file mode 100644
index 0000000000..0a8b7a6c85
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vpx_config.asm
@@ -0,0 +1,97 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.syntax unified
+.equ VPX_ARCH_ARM ,  0
+.equ VPX_ARCH_AARCH64 ,  0
+.equ VPX_ARCH_MIPS ,  1
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_DOTPROD ,  0
+.equ HAVE_NEON_I8MM ,  0
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  1
+.equ HAVE_MIPS64 ,  1
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  1
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  1
+.equ CONFIG_DC_RECON ,  1
+.equ CONFIG_RUNTIME_CPU_DETECT ,  0
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  0
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  0
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  0
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  1
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  1
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/linux/mips64/vpx_config.c b/media/libvpx/config/linux/mips64/vpx_config.c
new file mode 100644
index 0000000000..29d6e5c004
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=mips64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --cpu=loongson3";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/mips64/vpx_config.h b/media/libvpx/config/linux/mips64/vpx_config.h
new file mode 100644
index 0000000000..ceed69be84
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vpx_config.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    
+#define INLINE      inline
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 1
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
+#define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
+#define HAVE_MIPS32 0
+#define HAVE_DSPR2 0
+#define HAVE_MSA 1
+#define HAVE_MIPS64 1
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 1
+#define HAVE_LSX 0
+#define HAVE_LASX 0
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 1
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 1
+#define CONFIG_DC_RECON 1
+#define CONFIG_RUNTIME_CPU_DETECT 0
+#define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 0
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 0
+#define CONFIG_WEBM_IO 1
+#define CONFIG_LIBYUV 0
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define DECODE_WIDTH_LIMIT 8192
+#define DECODE_HEIGHT_LIMIT 4608
+#endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000000..bf43d539bd
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h
@@ -0,0 +1,1029 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+unsigned int vpx_avg_4x4_msa(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_msa
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+unsigned int vpx_avg_8x8_msa(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_msa
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_msa
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_msa
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_msa
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_msa
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_msa
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_msa
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_msa
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_msa
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_msa
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_msa
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_msa
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_msa
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_msa
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_msa
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_msa
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_msa
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_msa
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_msa
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_msa
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_msa
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_msa
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_msa
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_msa
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_msa
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_msa
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_msa
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_msa
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_msa(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_msa
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_msa
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+unsigned int vpx_get_mb_ss_msa(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_msa
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_msa
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_msa
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_msa
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_msa(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_msa(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_msa
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_msa
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_msa
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_msa
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_4_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_msa
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_msa
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_8_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_msa
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_msa
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_msa
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_msa
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_4_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_msa
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_msa
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_8_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_msa
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_msa
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_msa
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x16 vpx_mse16x16_msa
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse16x8 vpx_mse16x8_msa
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x16 vpx_mse8x16_msa
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_mse8x8 vpx_mse8x8_msa
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b vpx_quantize_b_c
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x16 vpx_sad16x16_msa
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_msa
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x16x4d vpx_sad16x16x4d_msa
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_msa
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_msa
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_msa
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x8 vpx_sad16x8_msa
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_msa
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_msa
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_msa
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_msa
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_msa
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x32 vpx_sad32x32_msa
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_msa
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x32x4d vpx_sad32x32x4d_msa
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_msa
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_msa
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_msa
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_msa
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x4_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x4_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_msa
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_msa
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_msa
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x8_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x8_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_msa
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_msa
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_msa
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_msa
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_msa
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x64 vpx_sad64x64_msa
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_msa
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad64x64x4d vpx_sad64x64x4d_msa
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_msa
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x16_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x16_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_msa
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_msa
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_msa
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x4_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x4_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_msa
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_msa
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_msa
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x8_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x8_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_msa
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_msa
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c
+
+int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_msa(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_msa
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_msa
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+#define vpx_sse vpx_sse_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_msa
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_msa
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_msa
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_msa
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_msa
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_msa
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_msa
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_msa
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_msa
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_msa
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_msa
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_msa
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_msa
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_msa
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_msa
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_msa
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_msa
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_msa
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_msa
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_msa
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_msa
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_msa
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_msa
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_msa
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_msa
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_msa
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_msa(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_msa
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int stride, int size);
+#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_msa
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_msa
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_msa
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_msa
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_msa
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_msa
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_msa
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_msa
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_msa
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_msa
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_msa
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_msa
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_msa
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_msa
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_msa
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_msa
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_msa
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_msa
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_msa
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_msa
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_msa
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_msa
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_msa
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/mips64/vpx_scale_rtcd.h b/media/libvpx/config/linux/mips64/vpx_scale_rtcd.h
new file mode 100644
index 0000000000..c0d59a108d
--- /dev/null
+++ b/media/libvpx/config/linux/mips64/vpx_scale_rtcd.h
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/mips.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
+#if HAVE_DSPR2
+void vpx_dsputil_static_init();
+#if CONFIG_VP8
+void dsputil_static_init();
+#endif
+
+vpx_dsputil_static_init();
+#if CONFIG_VP8
+dsputil_static_init();
+#endif
+#endif
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/ppc64le/vp8_rtcd.h b/media/libvpx/config/linux/ppc64le/vp8_rtcd.h
new file mode 100644
index 0000000000..1b7314e6bd
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vp8_rtcd.h
@@ -0,0 +1,180 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_c
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_c
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
+
+void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = ppc_simd_caps();
+    (void)flags;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/ppc64le/vp9_rtcd.h b/media/libvpx/config/linux/ppc64le/vp9_rtcd.h
new file mode 100644
index 0000000000..1d4de12268
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vp9_rtcd.h
@@ -0,0 +1,132 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12 vpx_convolve12_c
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_horiz vpx_convolve12_horiz_c
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve12_vert vpx_convolve12_vert_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = ppc_simd_caps();
+    (void)flags;
+    vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
+    if (flags & HAS_VSX) vp9_iht16x16_256_add = vp9_iht16x16_256_add_vsx;
+    vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
+    if (flags & HAS_VSX) vp9_iht4x4_16_add = vp9_iht4x4_16_add_vsx;
+    vp9_iht8x8_64_add = vp9_iht8x8_64_add_c;
+    if (flags & HAS_VSX) vp9_iht8x8_64_add = vp9_iht8x8_64_add_vsx;
+    vp9_quantize_fp = vp9_quantize_fp_c;
+    if (flags & HAS_VSX) vp9_quantize_fp = vp9_quantize_fp_vsx;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_VSX) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_vsx;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/ppc64le/vpx_config.asm b/media/libvpx/config/linux/ppc64le/vpx_config.asm
new file mode 100644
index 0000000000..543a627257
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vpx_config.asm
@@ -0,0 +1,97 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.syntax unified
+.equ VPX_ARCH_ARM ,  0
+.equ VPX_ARCH_AARCH64 ,  0
+.equ VPX_ARCH_MIPS ,  0
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  1
+.equ VPX_ARCH_LOONGARCH ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  0
+.equ HAVE_NEON_DOTPROD ,  0
+.equ HAVE_NEON_I8MM ,  0
+.equ HAVE_SVE ,  0
+.equ HAVE_SVE2 ,  0
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  1
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  1
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  0
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  0
+.equ CONFIG_WEBM_IO ,  1
+.equ CONFIG_LIBYUV ,  1
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  1
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  1
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/linux/ppc64le/vpx_config.c b/media/libvpx/config/linux/ppc64le/vpx_config.c
new file mode 100644
index 0000000000..edd90f65bc
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=ppc64le-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --enable-vsx";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/linux/ppc64le/vpx_config.h b/media/libvpx/config/linux/ppc64le/vpx_config.h
new file mode 100644
index 0000000000..883e4d686e
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vpx_config.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    
+#define INLINE      inline
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 1
+#define VPX_ARCH_LOONGARCH 0
+#define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
+#define HAVE_MIPS32 0
+#define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 1
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 1
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 0
+#define CONFIG_DC_RECON 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 0
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 0
+#define CONFIG_WEBM_IO 1
+#define CONFIG_LIBYUV 1
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define DECODE_WIDTH_LIMIT 8192
+#define DECODE_HEIGHT_LIMIT 4608
+#endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h b/media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h
new file mode 100644
index 0000000000..596c863f88
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h
@@ -0,0 +1,1015 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_c
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_c
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride);
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_c
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_c
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_vsx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+unsigned int vpx_get_mb_ss_vsx(const int16_t *);
+RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *);
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_c
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_c
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_c
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_c
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_c
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_c
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_c
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c
+
+int vpx_satd_c(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_c
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_c
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+#define vpx_sse vpx_sse_c
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_c
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = ppc_simd_caps();
+    (void)flags;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_VSX) vpx_comp_avg_pred = vpx_comp_avg_pred_vsx;
+    vpx_convolve8 = vpx_convolve8_c;
+    if (flags & HAS_VSX) vpx_convolve8 = vpx_convolve8_vsx;
+    vpx_convolve8_avg = vpx_convolve8_avg_c;
+    if (flags & HAS_VSX) vpx_convolve8_avg = vpx_convolve8_avg_vsx;
+    vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
+    if (flags & HAS_VSX) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_vsx;
+    vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
+    if (flags & HAS_VSX) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_vsx;
+    vpx_convolve8_horiz = vpx_convolve8_horiz_c;
+    if (flags & HAS_VSX) vpx_convolve8_horiz = vpx_convolve8_horiz_vsx;
+    vpx_convolve8_vert = vpx_convolve8_vert_c;
+    if (flags & HAS_VSX) vpx_convolve8_vert = vpx_convolve8_vert_vsx;
+    vpx_convolve_avg = vpx_convolve_avg_c;
+    if (flags & HAS_VSX) vpx_convolve_avg = vpx_convolve_avg_vsx;
+    vpx_convolve_copy = vpx_convolve_copy_c;
+    if (flags & HAS_VSX) vpx_convolve_copy = vpx_convolve_copy_vsx;
+    vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_vsx;
+    vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_vsx;
+    vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_vsx;
+    vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_vsx;
+    vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_vsx;
+    vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_vsx;
+    vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_vsx;
+    vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_vsx;
+    vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_vsx;
+    vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_vsx;
+    vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_vsx;
+    vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_vsx;
+    vpx_fdct32x32_rd = vpx_fdct32x32_rd_c;
+    if (flags & HAS_VSX) vpx_fdct32x32_rd = vpx_fdct32x32_rd_vsx;
+    vpx_get16x16var = vpx_get16x16var_c;
+    if (flags & HAS_VSX) vpx_get16x16var = vpx_get16x16var_vsx;
+    vpx_get4x4sse_cs = vpx_get4x4sse_cs_c;
+    if (flags & HAS_VSX) vpx_get4x4sse_cs = vpx_get4x4sse_cs_vsx;
+    vpx_get8x8var = vpx_get8x8var_c;
+    if (flags & HAS_VSX) vpx_get8x8var = vpx_get8x8var_vsx;
+    vpx_get_mb_ss = vpx_get_mb_ss_c;
+    if (flags & HAS_VSX) vpx_get_mb_ss = vpx_get_mb_ss_vsx;
+    vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_vsx;
+    vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_vsx;
+    vpx_hadamard_16x16 = vpx_hadamard_16x16_c;
+    if (flags & HAS_VSX) vpx_hadamard_16x16 = vpx_hadamard_16x16_vsx;
+    vpx_hadamard_8x8 = vpx_hadamard_8x8_c;
+    if (flags & HAS_VSX) vpx_hadamard_8x8 = vpx_hadamard_8x8_vsx;
+    vpx_idct16x16_256_add = vpx_idct16x16_256_add_c;
+    if (flags & HAS_VSX) vpx_idct16x16_256_add = vpx_idct16x16_256_add_vsx;
+    vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
+    if (flags & HAS_VSX) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_vsx;
+    vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
+    if (flags & HAS_VSX) vpx_idct4x4_16_add = vpx_idct4x4_16_add_vsx;
+    vpx_idct8x8_64_add = vpx_idct8x8_64_add_c;
+    if (flags & HAS_VSX) vpx_idct8x8_64_add = vpx_idct8x8_64_add_vsx;
+    vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c;
+    if (flags & HAS_VSX) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_vsx;
+    vpx_mse16x16 = vpx_mse16x16_c;
+    if (flags & HAS_VSX) vpx_mse16x16 = vpx_mse16x16_vsx;
+    vpx_mse16x8 = vpx_mse16x8_c;
+    if (flags & HAS_VSX) vpx_mse16x8 = vpx_mse16x8_vsx;
+    vpx_mse8x16 = vpx_mse8x16_c;
+    if (flags & HAS_VSX) vpx_mse8x16 = vpx_mse8x16_vsx;
+    vpx_mse8x8 = vpx_mse8x8_c;
+    if (flags & HAS_VSX) vpx_mse8x8 = vpx_mse8x8_vsx;
+    vpx_quantize_b = vpx_quantize_b_c;
+    if (flags & HAS_VSX) vpx_quantize_b = vpx_quantize_b_vsx;
+    vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
+    if (flags & HAS_VSX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_vsx;
+    vpx_sad16x16 = vpx_sad16x16_c;
+    if (flags & HAS_VSX) vpx_sad16x16 = vpx_sad16x16_vsx;
+    vpx_sad16x16_avg = vpx_sad16x16_avg_c;
+    if (flags & HAS_VSX) vpx_sad16x16_avg = vpx_sad16x16_avg_vsx;
+    vpx_sad16x16x4d = vpx_sad16x16x4d_c;
+    if (flags & HAS_VSX) vpx_sad16x16x4d = vpx_sad16x16x4d_vsx;
+    vpx_sad16x32 = vpx_sad16x32_c;
+    if (flags & HAS_VSX) vpx_sad16x32 = vpx_sad16x32_vsx;
+    vpx_sad16x32_avg = vpx_sad16x32_avg_c;
+    if (flags & HAS_VSX) vpx_sad16x32_avg = vpx_sad16x32_avg_vsx;
+    vpx_sad16x32x4d = vpx_sad16x32x4d_c;
+    if (flags & HAS_VSX) vpx_sad16x32x4d = vpx_sad16x32x4d_vsx;
+    vpx_sad16x8 = vpx_sad16x8_c;
+    if (flags & HAS_VSX) vpx_sad16x8 = vpx_sad16x8_vsx;
+    vpx_sad16x8_avg = vpx_sad16x8_avg_c;
+    if (flags & HAS_VSX) vpx_sad16x8_avg = vpx_sad16x8_avg_vsx;
+    vpx_sad16x8x4d = vpx_sad16x8x4d_c;
+    if (flags & HAS_VSX) vpx_sad16x8x4d = vpx_sad16x8x4d_vsx;
+    vpx_sad32x16 = vpx_sad32x16_c;
+    if (flags & HAS_VSX) vpx_sad32x16 = vpx_sad32x16_vsx;
+    vpx_sad32x16_avg = vpx_sad32x16_avg_c;
+    if (flags & HAS_VSX) vpx_sad32x16_avg = vpx_sad32x16_avg_vsx;
+    vpx_sad32x16x4d = vpx_sad32x16x4d_c;
+    if (flags & HAS_VSX) vpx_sad32x16x4d = vpx_sad32x16x4d_vsx;
+    vpx_sad32x32 = vpx_sad32x32_c;
+    if (flags & HAS_VSX) vpx_sad32x32 = vpx_sad32x32_vsx;
+    vpx_sad32x32_avg = vpx_sad32x32_avg_c;
+    if (flags & HAS_VSX) vpx_sad32x32_avg = vpx_sad32x32_avg_vsx;
+    vpx_sad32x32x4d = vpx_sad32x32x4d_c;
+    if (flags & HAS_VSX) vpx_sad32x32x4d = vpx_sad32x32x4d_vsx;
+    vpx_sad32x64 = vpx_sad32x64_c;
+    if (flags & HAS_VSX) vpx_sad32x64 = vpx_sad32x64_vsx;
+    vpx_sad32x64_avg = vpx_sad32x64_avg_c;
+    if (flags & HAS_VSX) vpx_sad32x64_avg = vpx_sad32x64_avg_vsx;
+    vpx_sad32x64x4d = vpx_sad32x64x4d_c;
+    if (flags & HAS_VSX) vpx_sad32x64x4d = vpx_sad32x64x4d_vsx;
+    vpx_sad64x32 = vpx_sad64x32_c;
+    if (flags & HAS_VSX) vpx_sad64x32 = vpx_sad64x32_vsx;
+    vpx_sad64x32_avg = vpx_sad64x32_avg_c;
+    if (flags & HAS_VSX) vpx_sad64x32_avg = vpx_sad64x32_avg_vsx;
+    vpx_sad64x32x4d = vpx_sad64x32x4d_c;
+    if (flags & HAS_VSX) vpx_sad64x32x4d = vpx_sad64x32x4d_vsx;
+    vpx_sad64x64 = vpx_sad64x64_c;
+    if (flags & HAS_VSX) vpx_sad64x64 = vpx_sad64x64_vsx;
+    vpx_sad64x64_avg = vpx_sad64x64_avg_c;
+    if (flags & HAS_VSX) vpx_sad64x64_avg = vpx_sad64x64_avg_vsx;
+    vpx_sad64x64x4d = vpx_sad64x64x4d_c;
+    if (flags & HAS_VSX) vpx_sad64x64x4d = vpx_sad64x64x4d_vsx;
+    vpx_sad8x16 = vpx_sad8x16_c;
+    if (flags & HAS_VSX) vpx_sad8x16 = vpx_sad8x16_vsx;
+    vpx_sad8x4 = vpx_sad8x4_c;
+    if (flags & HAS_VSX) vpx_sad8x4 = vpx_sad8x4_vsx;
+    vpx_sad8x8 = vpx_sad8x8_c;
+    if (flags & HAS_VSX) vpx_sad8x8 = vpx_sad8x8_vsx;
+    vpx_subtract_block = vpx_subtract_block_c;
+    if (flags & HAS_VSX) vpx_subtract_block = vpx_subtract_block_vsx;
+    vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_vsx;
+    vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_vsx;
+    vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c;
+    if (flags & HAS_VSX) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_vsx;
+    vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c;
+    if (flags & HAS_VSX) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_vsx;
+    vpx_variance16x16 = vpx_variance16x16_c;
+    if (flags & HAS_VSX) vpx_variance16x16 = vpx_variance16x16_vsx;
+    vpx_variance16x32 = vpx_variance16x32_c;
+    if (flags & HAS_VSX) vpx_variance16x32 = vpx_variance16x32_vsx;
+    vpx_variance16x8 = vpx_variance16x8_c;
+    if (flags & HAS_VSX) vpx_variance16x8 = vpx_variance16x8_vsx;
+    vpx_variance32x16 = vpx_variance32x16_c;
+    if (flags & HAS_VSX) vpx_variance32x16 = vpx_variance32x16_vsx;
+    vpx_variance32x32 = vpx_variance32x32_c;
+    if (flags & HAS_VSX) vpx_variance32x32 = vpx_variance32x32_vsx;
+    vpx_variance32x64 = vpx_variance32x64_c;
+    if (flags & HAS_VSX) vpx_variance32x64 = vpx_variance32x64_vsx;
+    vpx_variance4x4 = vpx_variance4x4_c;
+    if (flags & HAS_VSX) vpx_variance4x4 = vpx_variance4x4_vsx;
+    vpx_variance4x8 = vpx_variance4x8_c;
+    if (flags & HAS_VSX) vpx_variance4x8 = vpx_variance4x8_vsx;
+    vpx_variance64x32 = vpx_variance64x32_c;
+    if (flags & HAS_VSX) vpx_variance64x32 = vpx_variance64x32_vsx;
+    vpx_variance64x64 = vpx_variance64x64_c;
+    if (flags & HAS_VSX) vpx_variance64x64 = vpx_variance64x64_vsx;
+    vpx_variance8x16 = vpx_variance8x16_c;
+    if (flags & HAS_VSX) vpx_variance8x16 = vpx_variance8x16_vsx;
+    vpx_variance8x4 = vpx_variance8x4_c;
+    if (flags & HAS_VSX) vpx_variance8x4 = vpx_variance8x4_vsx;
+    vpx_variance8x8 = vpx_variance8x8_c;
+    if (flags & HAS_VSX) vpx_variance8x8 = vpx_variance8x8_vsx;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h b/media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h
new file mode 100644
index 0000000000..11c5b6d933
--- /dev/null
+++ b/media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = ppc_simd_caps();
+    (void)flags;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/linux/x64/vp8_rtcd.h b/media/libvpx/config/linux/x64/vp8_rtcd.h
index 4728639704..9e7746b813 100644
--- a/media/libvpx/config/linux/x64/vp8_rtcd.h
+++ b/media/libvpx/config/linux/x64/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,56 +37,47 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 int vp8_block_error_sse2(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_sse2
 
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -86,8 +88,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -98,8 +100,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmx(struct blockd*, short *DQC);
 #define vp8_dequantize_b vp8_dequantize_b_mmx
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -122,41 +124,36 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
 
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
@@ -167,8 +164,8 @@ int vp8_mbuverror_c(struct macroblock *mb);
 int vp8_mbuverror_sse2(struct macroblock *mb);
 #define vp8_mbuverror vp8_mbuverror_sse2
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_refining_search_sad vp8_refining_search_sadx4
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
@@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -241,9 +238,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
     vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
     vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
     if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
     vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
@@ -261,4 +255,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/linux/x64/vp9_rtcd.h b/media/libvpx/config/linux/x64/vp9_rtcd.h
index 47a708444d..52d6f0657d 100644
--- a/media/libvpx/config/linux/x64/vp9_rtcd.h
+++ b/media/libvpx/config/linux/x64/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,23 +46,22 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_sse2
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -67,17 +83,12 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 #define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_sse2
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -88,22 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
 void vp9_rtcd(void);
 
@@ -115,21 +139,29 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+    if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
     vp9_block_error = vp9_block_error_sse2;
     if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
-    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
-    vp9_full_search_sad = vp9_full_search_sad_c;
-    if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
+    vp9_block_error_fp = vp9_block_error_fp_sse2;
+    if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
     vp9_quantize_fp = vp9_quantize_fp_sse2;
     if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
     vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
     if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2;
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+    vpx_convolve12 = vpx_convolve12_c;
+    if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2;
+    vpx_convolve12_horiz = vpx_convolve12_horiz_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2;
+    vpx_convolve12_vert = vpx_convolve12_vert_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2;
 }
 #endif
 
@@ -137,4 +169,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm
index e474409958..83a97b3c7c 100644
--- a/media/libvpx/config/linux/x64/vpx_config.asm
+++ b/media/libvpx/config/linux/x64/vpx_config.asm
@@ -1,9 +1,16 @@
-%define ARCH_ARM 0
-%define ARCH_MIPS 0
-%define ARCH_X86 0
-%define ARCH_X86_64 1
-%define HAVE_NEON 0
+%define VPX_ARCH_ARM 0
+%define VPX_ARCH_AARCH64 0
+%define VPX_ARCH_MIPS 0
+%define VPX_ARCH_X86 0
+%define VPX_ARCH_X86_64 1
+%define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON_ASM 0
+%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
+%define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
@@ -16,6 +23,11 @@
 %define HAVE_SSE4_1 1
 %define HAVE_AVX 1
 %define HAVE_AVX2 1
+%define HAVE_AVX512 1
+%define HAVE_VSX 0
+%define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define CONFIG_DEPENDENCY_TRACKING 1
@@ -72,7 +84,10 @@
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
-%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_ALWAYS_ADJUST_BPM 0
+%define CONFIG_BITSTREAM_DEBUG 0
+%define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
-%define CONFIG_MISC_FIXES 0
+%define CONFIG_NON_GREEDY_MV 0
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h
index 23f32d1487..b0bfa58fe5 100644
--- a/media/libvpx/config/linux/x64/vpx_config.h
+++ b/media/libvpx/config/linux/x64/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 0
-#define ARCH_X86_64 1
-#define HAVE_NEON 0
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 1
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 1
 #define HAVE_AVX 1
 #define HAVE_AVX2 1
+#define HAVE_AVX512 1
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h
index 6f181c0a0e..8e17c8ab8a 100644
--- a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,243 +44,216 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 #define vpx_avg_8x8 vpx_avg_8x8_sse2
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_sse2
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_sse2
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
-
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
-
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2
 
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2
 
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
-
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
-
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2
 
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2
 
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_sse2
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride);
@@ -301,48 +290,54 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 #define vpx_get8x8var vpx_get8x8var_sse2
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 #define vpx_get_mb_ss vpx_get_mb_ss_sse2
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -355,16 +350,22 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -395,15 +396,14 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2
 
 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_sse2
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_sse2
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -463,8 +463,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
 
-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
 #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2
 
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -475,21 +475,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 #define vpx_minmax_8x8 vpx_minmax_8x8_sse2
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vpx_mse16x8 vpx_mse16x8_sse2
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x16 vpx_mse8x16_sse2
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_sse2
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
@@ -500,16 +501,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -519,19 +522,10 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_sse2
@@ -540,8 +534,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -552,19 +546,10 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -575,8 +560,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -589,16 +574,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -610,8 +589,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -622,18 +601,10 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_sse2
@@ -642,47 +613,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -692,18 +659,10 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_sse2
@@ -712,13 +671,10 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_sse2
@@ -727,273 +683,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_sse2(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_sse2
+int vpx_satd_avx2(const int16_t *coeff, int length);
+RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vpx_subtract_block vpx_subtract_block_sse2
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
 #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_sse2
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x8 vpx_variance16x8_sse2
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x64 vpx_variance32x64_sse2
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x4 vpx_variance4x4_sse2
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x8 vpx_variance4x8_sse2
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x16 vpx_variance8x16_sse2
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_sse2
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x8 vpx_variance8x8_sse2
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -1010,15 +1085,20 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
+    if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2;
     vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
     if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2;
     vpx_convolve8_avg = vpx_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2;
     vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2;
     vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2;
     vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2;
@@ -1051,6 +1131,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3;
     vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c;
     if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3;
+    vpx_fdct16x16 = vpx_fdct16x16_sse2;
+    if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2;
     vpx_fdct32x32 = vpx_fdct32x32_sse2;
     if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2;
     vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2;
@@ -1059,40 +1141,39 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3;
     vpx_get16x16var = vpx_get16x16var_sse2;
     if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
+    vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
     if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+    vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
-    if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
-    vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2;
+    vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3;
+    if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3;
     vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3;
-    vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2;
-    if (flags & HAS_SSSE3) vpx_idct8x8_64_add = vpx_idct8x8_64_add_ssse3;
     vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2;
     if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2;
     vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2;
     if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2;
     vpx_mse16x16 = vpx_mse16x16_sse2;
     if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2;
+    vpx_mse16x8 = vpx_mse16x8_sse2;
+    if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2;
     vpx_quantize_b = vpx_quantize_b_sse2;
     if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3;
     if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2;
     vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
     if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3;
     if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx;
-    vpx_sad16x16x3 = vpx_sad16x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3;
-    vpx_sad16x16x8 = vpx_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1;
-    vpx_sad16x8x3 = vpx_sad16x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3;
-    vpx_sad16x8x8 = vpx_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1;
+    if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2;
     vpx_sad32x16 = vpx_sad32x16_sse2;
     if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2;
     vpx_sad32x16_avg = vpx_sad32x16_avg_sse2;
@@ -1107,30 +1188,52 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2;
     vpx_sad32x64_avg = vpx_sad32x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2;
-    vpx_sad4x4x3 = vpx_sad4x4x3_c;
-    if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3;
-    vpx_sad4x4x8 = vpx_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1;
     vpx_sad64x32 = vpx_sad64x32_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512;
     vpx_sad64x32_avg = vpx_sad64x32_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512;
     vpx_sad64x64 = vpx_sad64x64_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512;
     vpx_sad64x64_avg = vpx_sad64x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512;
     vpx_sad64x64x4d = vpx_sad64x64x4d_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2;
-    vpx_sad8x16x3 = vpx_sad8x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3;
-    vpx_sad8x16x8 = vpx_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1;
-    vpx_sad8x8x3 = vpx_sad8x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3;
-    vpx_sad8x8x8 = vpx_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1;
+    if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512;
+    vpx_satd = vpx_satd_sse2;
+    if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2;
     vpx_scaled_2d = vpx_scaled_2d_c;
     if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1;
+    if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2;
     vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3;
     vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2;
@@ -1187,16 +1290,30 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3;
     vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
+    vpx_subtract_block = vpx_subtract_block_sse2;
+    if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2;
     vpx_variance16x16 = vpx_variance16x16_sse2;
     if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;
+    vpx_variance16x32 = vpx_variance16x32_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2;
+    vpx_variance16x8 = vpx_variance16x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2;
     vpx_variance32x16 = vpx_variance32x16_sse2;
     if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2;
     vpx_variance32x32 = vpx_variance32x32_sse2;
     if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2;
+    vpx_variance32x64 = vpx_variance32x64_sse2;
+    if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2;
     vpx_variance64x32 = vpx_variance64x32_sse2;
     if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2;
     vpx_variance64x64 = vpx_variance64x64_sse2;
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
+    vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
+    vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
+    vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
 }
 #endif
 
@@ -1204,4 +1321,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/linux/x64/vpx_scale_rtcd.h b/media/libvpx/config/linux/x64/vpx_scale_rtcd.h
index ddf7d01cca..18e5b71579 100644
--- a/media/libvpx/config/linux/x64/vpx_scale_rtcd.h
+++ b/media/libvpx/config/linux/x64/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -66,4 +80,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/mac/arm64/vp8_rtcd.h b/media/libvpx/config/mac/arm64/vp8_rtcd.h
new file mode 100644
index 0000000000..36e6855a6d
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vp8_rtcd.h
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon
+
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon
+
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon
+
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+#define vp8_copy32xn vp8_copy32xn_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_neon
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
+
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_neon(struct blockd*, short *DQC);
+#define vp8_dequantize_b vp8_dequantize_b_neon
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_neon
+
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
+
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
+
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon
+
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon
+
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon
+
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon
+
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon
+
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/mac/arm64/vp9_rtcd.h b/media/libvpx/config/mac/arm64/vp9_rtcd.h
new file mode 100644
index 0000000000..e1b572fe62
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vp9_rtcd.h
@@ -0,0 +1,125 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_neon
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_neon
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_neon
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_neon
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_neon
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp vp9_quantize_fp_neon
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+    vp9_block_error = vp9_block_error_neon;
+    if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve;
+    vp9_block_error_fp = vp9_block_error_fp_neon;
+    if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/mac/arm64/vpx_config.asm b/media/libvpx/config/mac/arm64/vpx_config.asm
new file mode 100644
index 0000000000..cd4868978f
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vpx_config.asm
@@ -0,0 +1,97 @@
+@ This file was created from a .asm file
+@  using the ads2gas.pl script.
+.syntax unified
+.equ VPX_ARCH_ARM ,  1
+.equ VPX_ARCH_AARCH64 ,  1
+.equ VPX_ARCH_MIPS ,  0
+.equ VPX_ARCH_X86 ,  0
+.equ VPX_ARCH_X86_64 ,  0
+.equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
+.equ HAVE_NEON_ASM ,  0
+.equ HAVE_NEON ,  1
+.equ HAVE_NEON_DOTPROD ,  1
+.equ HAVE_NEON_I8MM ,  1
+.equ HAVE_SVE ,  1
+.equ HAVE_SVE2 ,  1
+.equ HAVE_MIPS32 ,  0
+.equ HAVE_DSPR2 ,  0
+.equ HAVE_MSA ,  0
+.equ HAVE_MIPS64 ,  0
+.equ HAVE_MMX ,  0
+.equ HAVE_SSE ,  0
+.equ HAVE_SSE2 ,  0
+.equ HAVE_SSE3 ,  0
+.equ HAVE_SSSE3 ,  0
+.equ HAVE_SSE4_1 ,  0
+.equ HAVE_AVX ,  0
+.equ HAVE_AVX2 ,  0
+.equ HAVE_AVX512 ,  0
+.equ HAVE_VSX ,  0
+.equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
+.equ HAVE_VPX_PORTS ,  1
+.equ HAVE_PTHREAD_H ,  1
+.equ CONFIG_DEPENDENCY_TRACKING ,  1
+.equ CONFIG_EXTERNAL_BUILD ,  1
+.equ CONFIG_INSTALL_DOCS ,  0
+.equ CONFIG_INSTALL_BINS ,  1
+.equ CONFIG_INSTALL_LIBS ,  1
+.equ CONFIG_INSTALL_SRCS ,  0
+.equ CONFIG_DEBUG ,  0
+.equ CONFIG_GPROF ,  0
+.equ CONFIG_GCOV ,  0
+.equ CONFIG_RVCT ,  0
+.equ CONFIG_GCC ,  1
+.equ CONFIG_MSVS ,  0
+.equ CONFIG_PIC ,  1
+.equ CONFIG_BIG_ENDIAN ,  0
+.equ CONFIG_CODEC_SRCS ,  0
+.equ CONFIG_DEBUG_LIBS ,  0
+.equ CONFIG_DEQUANT_TOKENS ,  0
+.equ CONFIG_DC_RECON ,  0
+.equ CONFIG_RUNTIME_CPU_DETECT ,  1
+.equ CONFIG_POSTPROC ,  0
+.equ CONFIG_VP9_POSTPROC ,  0
+.equ CONFIG_MULTITHREAD ,  1
+.equ CONFIG_INTERNAL_STATS ,  0
+.equ CONFIG_VP8_ENCODER ,  1
+.equ CONFIG_VP8_DECODER ,  1
+.equ CONFIG_VP9_ENCODER ,  1
+.equ CONFIG_VP9_DECODER ,  1
+.equ CONFIG_VP8 ,  1
+.equ CONFIG_VP9 ,  1
+.equ CONFIG_ENCODERS ,  1
+.equ CONFIG_DECODERS ,  1
+.equ CONFIG_STATIC_MSVCRT ,  0
+.equ CONFIG_SPATIAL_RESAMPLING ,  1
+.equ CONFIG_REALTIME_ONLY ,  1
+.equ CONFIG_ONTHEFLY_BITPACKING ,  0
+.equ CONFIG_ERROR_CONCEALMENT ,  0
+.equ CONFIG_SHARED ,  0
+.equ CONFIG_STATIC ,  1
+.equ CONFIG_SMALL ,  0
+.equ CONFIG_POSTPROC_VISUALIZER ,  0
+.equ CONFIG_OS_SUPPORT ,  1
+.equ CONFIG_UNIT_TESTS ,  0
+.equ CONFIG_WEBM_IO ,  0
+.equ CONFIG_LIBYUV ,  0
+.equ CONFIG_DECODE_PERF_TESTS ,  0
+.equ CONFIG_ENCODE_PERF_TESTS ,  0
+.equ CONFIG_MULTI_RES_ENCODING ,  1
+.equ CONFIG_TEMPORAL_DENOISING ,  1
+.equ CONFIG_VP9_TEMPORAL_DENOISING ,  0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING ,  0
+.equ CONFIG_VP9_HIGHBITDEPTH ,  0
+.equ CONFIG_BETTER_HW_COMPATIBILITY ,  0
+.equ CONFIG_EXPERIMENTAL ,  0
+.equ CONFIG_SIZE_LIMIT ,  1
+.equ CONFIG_ALWAYS_ADJUST_BPM ,  0
+.equ CONFIG_BITSTREAM_DEBUG ,  0
+.equ CONFIG_MISMATCH_DEBUG ,  0
+.equ CONFIG_FP_MB_STATS ,  0
+.equ CONFIG_EMULATE_HARDWARE ,  0
+.equ CONFIG_NON_GREEDY_MV ,  0
+.equ CONFIG_COLLECT_COMPONENT_TIMING ,  0
+    .section .note.GNU-stack,"",%progbits
diff --git a/media/libvpx/config/mac/arm64/vpx_config.c b/media/libvpx/config/mac/arm64/vpx_config.c
new file mode 100644
index 0000000000..9689084601
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=arm64-darwin-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --enable-realtime-only";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/mac/arm64/vpx_config.h b/media/libvpx/config/mac/arm64/vpx_config.h
new file mode 100644
index 0000000000..a9808406c8
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vpx_config.h
@@ -0,0 +1,108 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/*  */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS.  All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT    
+#define INLINE      inline
+#define VPX_ARCH_ARM 1
+#define VPX_ARCH_AARCH64 1
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
+#define HAVE_NEON_ASM 0
+#define HAVE_NEON 1
+#define HAVE_NEON_DOTPROD 1
+#define HAVE_NEON_I8MM 1
+#define HAVE_SVE 1
+#define HAVE_SVE2 1
+#define HAVE_MIPS32 0
+#define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_AVX512 0
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 1
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 0
+#define CONFIG_DC_RECON 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 1
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 0
+#define CONFIG_WEBM_IO 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define DECODE_WIDTH_LIMIT 8192
+#define DECODE_HEIGHT_LIMIT 4608
+#endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h
new file mode 100644
index 0000000000..3c34cafc59
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h
@@ -0,0 +1,1197 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+unsigned int vpx_avg_4x4_neon(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_neon
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+unsigned int vpx_avg_8x8_neon(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_neon
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_neon
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_neon
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_neon
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_neon
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_neon
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_neon
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_neon
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
+
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_neon
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_neon
+
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_neon
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon
+
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon
+
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon
+
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon
+
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon
+
+void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon
+
+void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon
+
+void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon
+
+void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_minmax_8x8 vpx_minmax_8x8_neon
+
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b vpx_quantize_b_neon
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon
+
+unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad16x8_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad32x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x4 vpx_sad4x4_neon
+
+unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon
+
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon
+
+unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon
+
+unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon
+
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon
+
+unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x16 vpx_sad8x16_neon
+
+unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon
+
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon
+
+unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon
+
+unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon
+
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon
+
+unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x8 vpx_sad8x8_neon
+
+unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon
+
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon
+
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon
+
+int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_neon(const int16_t *coeff, int length);
+#define vpx_satd vpx_satd_neon
+
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon
+
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
+
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
+
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
+
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_horiz vpx_scaled_horiz_c
+
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_vert vpx_scaled_vert_c
+
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon
+
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon
+
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon
+
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon
+
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon
+
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon
+
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon
+
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon
+
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon
+
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon
+
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon
+
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon
+
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon
+
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon
+
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon
+
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon
+
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon
+
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon
+
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon
+
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon
+
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon
+
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon
+
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon
+
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon
+
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_neon
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size);
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size);
+RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_neon
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+    vpx_convolve8 = vpx_convolve8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8 = vpx_convolve8_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8 = vpx_convolve8_neon_i8mm;
+    vpx_convolve8_avg = vpx_convolve8_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg = vpx_convolve8_avg_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_avg = vpx_convolve8_avg_neon_i8mm;
+    vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_i8mm;
+    vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_i8mm;
+    vpx_convolve8_horiz = vpx_convolve8_horiz_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_i8mm;
+    vpx_convolve8_vert = vpx_convolve8_vert_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_convolve8_vert = vpx_convolve8_vert_neon_dotprod;
+    if (flags & HAS_NEON_I8MM) vpx_convolve8_vert = vpx_convolve8_vert_neon_i8mm;
+    vpx_get16x16var = vpx_get16x16var_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_get16x16var = vpx_get16x16var_neon_dotprod;
+    vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon_dotprod;
+    vpx_get8x8var = vpx_get8x8var_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_get8x8var = vpx_get8x8var_neon_dotprod;
+    vpx_mse16x16 = vpx_mse16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse16x16 = vpx_mse16x16_neon_dotprod;
+    vpx_mse16x8 = vpx_mse16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse16x8 = vpx_mse16x8_neon_dotprod;
+    vpx_mse8x16 = vpx_mse8x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse8x16 = vpx_mse8x16_neon_dotprod;
+    vpx_mse8x8 = vpx_mse8x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_mse8x8 = vpx_mse8x8_neon_dotprod;
+    vpx_sad16x16 = vpx_sad16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x16 = vpx_sad16x16_neon_dotprod;
+    vpx_sad16x16_avg = vpx_sad16x16_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x16_avg = vpx_sad16x16_avg_neon_dotprod;
+    vpx_sad16x16x4d = vpx_sad16x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x16x4d = vpx_sad16x16x4d_neon_dotprod;
+    vpx_sad16x32 = vpx_sad16x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x32 = vpx_sad16x32_neon_dotprod;
+    vpx_sad16x32_avg = vpx_sad16x32_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x32_avg = vpx_sad16x32_avg_neon_dotprod;
+    vpx_sad16x32x4d = vpx_sad16x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x32x4d = vpx_sad16x32x4d_neon_dotprod;
+    vpx_sad16x8 = vpx_sad16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x8 = vpx_sad16x8_neon_dotprod;
+    vpx_sad16x8_avg = vpx_sad16x8_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x8_avg = vpx_sad16x8_avg_neon_dotprod;
+    vpx_sad16x8x4d = vpx_sad16x8x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad16x8x4d = vpx_sad16x8x4d_neon_dotprod;
+    vpx_sad32x16 = vpx_sad32x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x16 = vpx_sad32x16_neon_dotprod;
+    vpx_sad32x16_avg = vpx_sad32x16_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x16_avg = vpx_sad32x16_avg_neon_dotprod;
+    vpx_sad32x16x4d = vpx_sad32x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x16x4d = vpx_sad32x16x4d_neon_dotprod;
+    vpx_sad32x32 = vpx_sad32x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x32 = vpx_sad32x32_neon_dotprod;
+    vpx_sad32x32_avg = vpx_sad32x32_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x32_avg = vpx_sad32x32_avg_neon_dotprod;
+    vpx_sad32x32x4d = vpx_sad32x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x32x4d = vpx_sad32x32x4d_neon_dotprod;
+    vpx_sad32x64 = vpx_sad32x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x64 = vpx_sad32x64_neon_dotprod;
+    vpx_sad32x64_avg = vpx_sad32x64_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x64_avg = vpx_sad32x64_avg_neon_dotprod;
+    vpx_sad32x64x4d = vpx_sad32x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad32x64x4d = vpx_sad32x64x4d_neon_dotprod;
+    vpx_sad64x32 = vpx_sad64x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x32 = vpx_sad64x32_neon_dotprod;
+    vpx_sad64x32_avg = vpx_sad64x32_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x32_avg = vpx_sad64x32_avg_neon_dotprod;
+    vpx_sad64x32x4d = vpx_sad64x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x32x4d = vpx_sad64x32x4d_neon_dotprod;
+    vpx_sad64x64 = vpx_sad64x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod;
+    vpx_sad64x64_avg = vpx_sad64x64_avg_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x64_avg = vpx_sad64x64_avg_neon_dotprod;
+    vpx_sad64x64x4d = vpx_sad64x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad64x64x4d = vpx_sad64x64x4d_neon_dotprod;
+    vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon_dotprod;
+    vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon_dotprod;
+    vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon_dotprod;
+    vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon_dotprod;
+    vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon_dotprod;
+    vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon_dotprod;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon_dotprod;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon_dotprod;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon_dotprod;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon_dotprod;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon_dotprod;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon_dotprod;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon_dotprod;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon_dotprod;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon_dotprod;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod;
+    vpx_sse = vpx_sse_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod;
+    vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon;
+    if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve;
+    vpx_variance16x16 = vpx_variance16x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod;
+    vpx_variance16x32 = vpx_variance16x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance16x32 = vpx_variance16x32_neon_dotprod;
+    vpx_variance16x8 = vpx_variance16x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance16x8 = vpx_variance16x8_neon_dotprod;
+    vpx_variance32x16 = vpx_variance32x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance32x16 = vpx_variance32x16_neon_dotprod;
+    vpx_variance32x32 = vpx_variance32x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance32x32 = vpx_variance32x32_neon_dotprod;
+    vpx_variance32x64 = vpx_variance32x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance32x64 = vpx_variance32x64_neon_dotprod;
+    vpx_variance4x4 = vpx_variance4x4_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance4x4 = vpx_variance4x4_neon_dotprod;
+    vpx_variance4x8 = vpx_variance4x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance4x8 = vpx_variance4x8_neon_dotprod;
+    vpx_variance64x32 = vpx_variance64x32_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance64x32 = vpx_variance64x32_neon_dotprod;
+    vpx_variance64x64 = vpx_variance64x64_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance64x64 = vpx_variance64x64_neon_dotprod;
+    vpx_variance8x16 = vpx_variance8x16_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance8x16 = vpx_variance8x16_neon_dotprod;
+    vpx_variance8x4 = vpx_variance8x4_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance8x4 = vpx_variance8x4_neon_dotprod;
+    vpx_variance8x8 = vpx_variance8x8_neon;
+    if (flags & HAS_NEON_DOTPROD) vpx_variance8x8 = vpx_variance8x8_neon_dotprod;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/mac/arm64/vpx_scale_rtcd.h b/media/libvpx/config/mac/arm64/vpx_scale_rtcd.h
new file mode 100644
index 0000000000..ca594ea140
--- /dev/null
+++ b/media/libvpx/config/mac/arm64/vpx_scale_rtcd.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/mac/ia32/vp8_rtcd.h b/media/libvpx/config/mac/ia32/vp8_rtcd.h
index 5db5bbad85..ca42bb8a5e 100644
--- a/media/libvpx/config/mac/ia32/vp8_rtcd.h
+++ b/media/libvpx/config/mac/ia32/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,57 +37,48 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 int vp8_block_error_sse2(short *coeff, short *dqcoeff);
 RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
 
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -86,9 +88,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
-RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride);
+RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride);
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -98,9 +100,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmx(struct blockd*, short *DQC);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC);
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -122,42 +124,37 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 int vp8_mbblock_error_sse2(struct macroblock *mb, int dc);
@@ -167,9 +164,9 @@ int vp8_mbuverror_c(struct macroblock *mb);
 int vp8_mbuverror_sse2(struct macroblock *mb);
 RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
 void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
@@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -237,9 +234,9 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
     vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
+    if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2;
     vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2;
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
     if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
@@ -277,9 +274,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
     vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
     if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
     vp8_loop_filter_bh = vp8_loop_filter_bh_c;
     if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
     vp8_loop_filter_bv = vp8_loop_filter_bv_c;
@@ -336,4 +330,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/mac/ia32/vp9_rtcd.h b/media/libvpx/config/mac/ia32/vp9_rtcd.h
index da53895ca4..28c82a831c 100644
--- a/media/libvpx/config/mac/ia32/vp9_rtcd.h
+++ b/media/libvpx/config/mac/ia32/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,23 +46,22 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -67,18 +83,13 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride);
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -88,20 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
 void vp9_rtcd(void);
 
@@ -113,16 +139,14 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+    if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
     vp9_block_error = vp9_block_error_c;
     if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
     if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
     vp9_block_error_fp = vp9_block_error_fp_c;
     if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2;
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
-    if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
-    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
+    if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
     vp9_fht16x16 = vp9_fht16x16_c;
     if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
     vp9_fht4x4 = vp9_fht4x4_c;
@@ -133,9 +157,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2;
     vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c;
     if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2;
-    vp9_full_search_sad = vp9_full_search_sad_c;
-    if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
     vp9_fwht4x4 = vp9_fwht4x4_c;
     if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2;
     vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
@@ -146,10 +167,22 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
     vp9_quantize_fp = vp9_quantize_fp_c;
     if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2;
+    if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2;
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
-    vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
-    if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+    vpx_convolve12 = vpx_convolve12_c;
+    if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2;
+    vpx_convolve12_horiz = vpx_convolve12_horiz_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2;
+    vpx_convolve12_vert = vpx_convolve12_vert_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2;
 }
 #endif
 
@@ -157,4 +190,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm
index 2efc8b978b..bafe400119 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.asm
+++ b/media/libvpx/config/mac/ia32/vpx_config.asm
@@ -1,9 +1,16 @@
-%define ARCH_ARM 0
-%define ARCH_MIPS 0
-%define ARCH_X86 1
-%define ARCH_X86_64 0
-%define HAVE_NEON 0
+%define VPX_ARCH_ARM 0
+%define VPX_ARCH_AARCH64 0
+%define VPX_ARCH_MIPS 0
+%define VPX_ARCH_X86 1
+%define VPX_ARCH_X86_64 0
+%define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON_ASM 0
+%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
+%define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
@@ -16,6 +23,11 @@
 %define HAVE_SSE4_1 1
 %define HAVE_AVX 1
 %define HAVE_AVX2 1
+%define HAVE_AVX512 1
+%define HAVE_VSX 0
+%define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define CONFIG_DEPENDENCY_TRACKING 1
@@ -72,7 +84,10 @@
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
-%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_ALWAYS_ADJUST_BPM 0
+%define CONFIG_BITSTREAM_DEBUG 0
+%define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
-%define CONFIG_MISC_FIXES 0
+%define CONFIG_NON_GREEDY_MV 0
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h
index b26f9a51e8..a4cc48afe1 100644
--- a/media/libvpx/config/mac/ia32/vpx_config.h
+++ b/media/libvpx/config/mac/ia32/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 1
-#define ARCH_X86_64 0
-#define HAVE_NEON 0
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 1
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 1
 #define HAVE_AVX 1
 #define HAVE_AVX2 1
+#define HAVE_AVX512 1
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h
index 112e326a64..0a6266a322 100644
--- a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,242 +44,215 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
@@ -300,47 +289,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride);
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *);
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -353,14 +348,22 @@ RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -369,6 +372,7 @@ RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -381,6 +385,7 @@ RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, in
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -395,9 +400,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width);
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
@@ -456,9 +461,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 
-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit);
 
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -468,22 +473,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
 void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
@@ -493,12 +499,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -508,18 +520,9 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -529,9 +532,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -541,18 +544,9 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -564,9 +558,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -578,16 +572,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -599,9 +587,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -611,17 +599,9 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -631,47 +611,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -681,17 +657,9 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -701,12 +669,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -716,273 +681,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_sse2(const int16_t *coeff, int length);
+int vpx_satd_avx2(const int16_t *coeff, int length);
 RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
 RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -1003,6 +1087,9 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
     vpx_avg_8x8 = vpx_avg_8x8_c;
     if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
+    if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2;
     vpx_convolve8 = vpx_convolve8_c;
     if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -1010,12 +1097,15 @@ static void setup_rtcd_internal(void)
     vpx_convolve8_avg = vpx_convolve8_avg_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2;
     vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2;
     vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2;
     vpx_convolve8_horiz = vpx_convolve8_horiz_c;
     if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
@@ -1094,6 +1184,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2;
     vpx_fdct16x16 = vpx_fdct16x16_c;
     if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2;
+    if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2;
     vpx_fdct16x16_1 = vpx_fdct16x16_1_c;
     if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2;
     vpx_fdct32x32 = vpx_fdct32x32_c;
@@ -1129,6 +1220,10 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2;
     vpx_hadamard_16x16 = vpx_hadamard_16x16_c;
     if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_c;
+    if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_c;
     if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
     vpx_idct16x16_10_add = vpx_idct16x16_10_add_c;
@@ -1137,20 +1232,28 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2;
     vpx_idct16x16_256_add = vpx_idct16x16_256_add_c;
     if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2;
+    vpx_idct16x16_38_add = vpx_idct16x16_38_add_c;
+    if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2;
     vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
-    if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3;
+    if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2;
     vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3;
     vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
     if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2;
     vpx_idct4x4_1_add = vpx_idct4x4_1_add_c;
     if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2;
     vpx_idct8x8_12_add = vpx_idct8x8_12_add_c;
     if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3;
     vpx_idct8x8_1_add = vpx_idct8x8_1_add_c;
     if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2;
     vpx_idct8x8_64_add = vpx_idct8x8_64_add_c;
@@ -1198,6 +1301,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2;
     vpx_mse16x8 = vpx_mse16x8_c;
     if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2;
+    if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2;
     vpx_mse8x16 = vpx_mse8x16_c;
     if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2;
     vpx_mse8x8 = vpx_mse8x8_c;
@@ -1208,17 +1312,19 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2;
     vpx_quantize_b = vpx_quantize_b_c;
     if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2;
+    if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3;
+    if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2;
+    vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
+    if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3;
+    if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2;
     vpx_sad16x16 = vpx_sad16x16_c;
     if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
     vpx_sad16x16_avg = vpx_sad16x16_avg_c;
     if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2;
-    vpx_sad16x16x3 = vpx_sad16x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3;
     vpx_sad16x16x4d = vpx_sad16x16x4d_c;
     if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2;
-    vpx_sad16x16x8 = vpx_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1;
     vpx_sad16x32 = vpx_sad16x32_c;
     if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2;
     vpx_sad16x32_avg = vpx_sad16x32_avg_c;
@@ -1229,13 +1335,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2;
     vpx_sad16x8_avg = vpx_sad16x8_avg_c;
     if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2;
-    vpx_sad16x8x3 = vpx_sad16x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3;
     vpx_sad16x8x4d = vpx_sad16x8x4d_c;
     if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2;
-    vpx_sad16x8x8 = vpx_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1;
     vpx_sad32x16 = vpx_sad32x16_c;
     if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2;
     if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2;
@@ -1265,12 +1366,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2;
     vpx_sad4x4_avg = vpx_sad4x4_avg_c;
     if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2;
-    vpx_sad4x4x3 = vpx_sad4x4x3_c;
-    if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3;
     vpx_sad4x4x4d = vpx_sad4x4x4d_c;
     if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2;
-    vpx_sad4x4x8 = vpx_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1;
     vpx_sad4x8 = vpx_sad4x8_c;
     if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2;
     vpx_sad4x8_avg = vpx_sad4x8_avg_c;
@@ -1280,30 +1377,31 @@ static void setup_rtcd_internal(void)
     vpx_sad64x32 = vpx_sad64x32_c;
     if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512;
     vpx_sad64x32_avg = vpx_sad64x32_avg_c;
     if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512;
     vpx_sad64x32x4d = vpx_sad64x32x4d_c;
     if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2;
     vpx_sad64x64 = vpx_sad64x64_c;
     if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512;
     vpx_sad64x64_avg = vpx_sad64x64_avg_c;
     if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512;
     vpx_sad64x64x4d = vpx_sad64x64x4d_c;
     if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512;
     vpx_sad8x16 = vpx_sad8x16_c;
     if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2;
     vpx_sad8x16_avg = vpx_sad8x16_avg_c;
     if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2;
-    vpx_sad8x16x3 = vpx_sad8x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3;
     vpx_sad8x16x4d = vpx_sad8x16x4d_c;
     if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2;
-    vpx_sad8x16x8 = vpx_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1;
     vpx_sad8x4 = vpx_sad8x4_c;
     if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2;
     vpx_sad8x4_avg = vpx_sad8x4_avg_c;
@@ -1314,16 +1412,74 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2;
     vpx_sad8x8_avg = vpx_sad8x8_avg_c;
     if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2;
-    vpx_sad8x8x3 = vpx_sad8x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3;
     vpx_sad8x8x4d = vpx_sad8x8x4d_c;
     if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2;
-    vpx_sad8x8x8 = vpx_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1;
+    vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2;
+    vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2;
+    vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2;
+    vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2;
+    vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2;
+    vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2;
+    vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2;
+    vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512;
+    vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2;
+    vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2;
+    vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2;
+    vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2;
     vpx_satd = vpx_satd_c;
     if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2;
+    if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2;
     vpx_scaled_2d = vpx_scaled_2d_c;
     if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1;
+    if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2;
     vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c;
     if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3;
@@ -1408,6 +1564,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
     vpx_subtract_block = vpx_subtract_block_c;
     if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2;
+    if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2;
     vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c;
     if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2;
     vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c;
@@ -1431,8 +1588,10 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;
     vpx_variance16x32 = vpx_variance16x32_c;
     if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2;
     vpx_variance16x8 = vpx_variance16x8_c;
     if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2;
     vpx_variance32x16 = vpx_variance32x16_c;
     if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2;
     if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2;
@@ -1441,6 +1600,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2;
     vpx_variance32x64 = vpx_variance32x64_c;
     if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2;
+    if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2;
     vpx_variance4x4 = vpx_variance4x4_c;
     if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2;
     vpx_variance4x8 = vpx_variance4x8_c;
@@ -1453,10 +1613,13 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
     vpx_variance8x4 = vpx_variance8x4_c;
     if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
     vpx_vector_var = vpx_vector_var_c;
     if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2;
 }
@@ -1466,4 +1629,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h b/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h
index ddf7d01cca..18e5b71579 100644
--- a/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h
+++ b/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -66,4 +80,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/mac/x64/vp8_rtcd.h b/media/libvpx/config/mac/x64/vp8_rtcd.h
index 4728639704..9e7746b813 100644
--- a/media/libvpx/config/mac/x64/vp8_rtcd.h
+++ b/media/libvpx/config/mac/x64/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,56 +37,47 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 int vp8_block_error_sse2(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_sse2
 
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -86,8 +88,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -98,8 +100,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmx(struct blockd*, short *DQC);
 #define vp8_dequantize_b vp8_dequantize_b_mmx
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -122,41 +124,36 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
 
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
@@ -167,8 +164,8 @@ int vp8_mbuverror_c(struct macroblock *mb);
 int vp8_mbuverror_sse2(struct macroblock *mb);
 #define vp8_mbuverror vp8_mbuverror_sse2
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_refining_search_sad vp8_refining_search_sadx4
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
@@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -241,9 +238,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
     vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
     vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
     if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
     vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
@@ -261,4 +255,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/mac/x64/vp9_rtcd.h b/media/libvpx/config/mac/x64/vp9_rtcd.h
index 47a708444d..52d6f0657d 100644
--- a/media/libvpx/config/mac/x64/vp9_rtcd.h
+++ b/media/libvpx/config/mac/x64/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,23 +46,22 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_sse2
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -67,17 +83,12 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 #define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_sse2
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -88,22 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
 void vp9_rtcd(void);
 
@@ -115,21 +139,29 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+    if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
     vp9_block_error = vp9_block_error_sse2;
     if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
-    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
-    vp9_full_search_sad = vp9_full_search_sad_c;
-    if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
+    vp9_block_error_fp = vp9_block_error_fp_sse2;
+    if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
     vp9_quantize_fp = vp9_quantize_fp_sse2;
     if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
     vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
     if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2;
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+    vpx_convolve12 = vpx_convolve12_c;
+    if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2;
+    vpx_convolve12_horiz = vpx_convolve12_horiz_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2;
+    vpx_convolve12_vert = vpx_convolve12_vert_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2;
 }
 #endif
 
@@ -137,4 +169,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm
index e474409958..83a97b3c7c 100644
--- a/media/libvpx/config/mac/x64/vpx_config.asm
+++ b/media/libvpx/config/mac/x64/vpx_config.asm
@@ -1,9 +1,16 @@
-%define ARCH_ARM 0
-%define ARCH_MIPS 0
-%define ARCH_X86 0
-%define ARCH_X86_64 1
-%define HAVE_NEON 0
+%define VPX_ARCH_ARM 0
+%define VPX_ARCH_AARCH64 0
+%define VPX_ARCH_MIPS 0
+%define VPX_ARCH_X86 0
+%define VPX_ARCH_X86_64 1
+%define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON_ASM 0
+%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
+%define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
@@ -16,6 +23,11 @@
 %define HAVE_SSE4_1 1
 %define HAVE_AVX 1
 %define HAVE_AVX2 1
+%define HAVE_AVX512 1
+%define HAVE_VSX 0
+%define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define CONFIG_DEPENDENCY_TRACKING 1
@@ -72,7 +84,10 @@
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
-%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_ALWAYS_ADJUST_BPM 0
+%define CONFIG_BITSTREAM_DEBUG 0
+%define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
-%define CONFIG_MISC_FIXES 0
+%define CONFIG_NON_GREEDY_MV 0
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h
index 23f32d1487..b0bfa58fe5 100644
--- a/media/libvpx/config/mac/x64/vpx_config.h
+++ b/media/libvpx/config/mac/x64/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 0
-#define ARCH_X86_64 1
-#define HAVE_NEON 0
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 1
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 1
 #define HAVE_AVX 1
 #define HAVE_AVX2 1
+#define HAVE_AVX512 1
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h
index 6f181c0a0e..8e17c8ab8a 100644
--- a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,243 +44,216 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 #define vpx_avg_8x8 vpx_avg_8x8_sse2
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_sse2
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_sse2
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
-
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
-
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2
 
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2
 
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
-
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
-
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2
 
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2
 
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_sse2
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride);
@@ -301,48 +290,54 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 #define vpx_get8x8var vpx_get8x8var_sse2
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 #define vpx_get_mb_ss vpx_get_mb_ss_sse2
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -355,16 +350,22 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -395,15 +396,14 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2
 
 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_sse2
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_sse2
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -463,8 +463,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
 
-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
 #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2
 
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -475,21 +475,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 #define vpx_minmax_8x8 vpx_minmax_8x8_sse2
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vpx_mse16x8 vpx_mse16x8_sse2
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x16 vpx_mse8x16_sse2
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_sse2
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
@@ -500,16 +501,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -519,19 +522,10 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_sse2
@@ -540,8 +534,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -552,19 +546,10 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -575,8 +560,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -589,16 +574,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -610,8 +589,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -622,18 +601,10 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_sse2
@@ -642,47 +613,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -692,18 +659,10 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_sse2
@@ -712,13 +671,10 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_sse2
@@ -727,273 +683,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_sse2(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_sse2
+int vpx_satd_avx2(const int16_t *coeff, int length);
+RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vpx_subtract_block vpx_subtract_block_sse2
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
 #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_sse2
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x8 vpx_variance16x8_sse2
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x64 vpx_variance32x64_sse2
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x4 vpx_variance4x4_sse2
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x8 vpx_variance4x8_sse2
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x16 vpx_variance8x16_sse2
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_sse2
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x8 vpx_variance8x8_sse2
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -1010,15 +1085,20 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
+    if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2;
     vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
     if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2;
     vpx_convolve8_avg = vpx_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2;
     vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2;
     vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2;
     vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2;
@@ -1051,6 +1131,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3;
     vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c;
     if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3;
+    vpx_fdct16x16 = vpx_fdct16x16_sse2;
+    if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2;
     vpx_fdct32x32 = vpx_fdct32x32_sse2;
     if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2;
     vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2;
@@ -1059,40 +1141,39 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3;
     vpx_get16x16var = vpx_get16x16var_sse2;
     if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
+    vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
     if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+    vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
-    if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
-    vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2;
+    vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3;
+    if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3;
     vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3;
-    vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2;
-    if (flags & HAS_SSSE3) vpx_idct8x8_64_add = vpx_idct8x8_64_add_ssse3;
     vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2;
     if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2;
     vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2;
     if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2;
     vpx_mse16x16 = vpx_mse16x16_sse2;
     if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2;
+    vpx_mse16x8 = vpx_mse16x8_sse2;
+    if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2;
     vpx_quantize_b = vpx_quantize_b_sse2;
     if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3;
     if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2;
     vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
     if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3;
     if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx;
-    vpx_sad16x16x3 = vpx_sad16x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3;
-    vpx_sad16x16x8 = vpx_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1;
-    vpx_sad16x8x3 = vpx_sad16x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3;
-    vpx_sad16x8x8 = vpx_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1;
+    if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2;
     vpx_sad32x16 = vpx_sad32x16_sse2;
     if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2;
     vpx_sad32x16_avg = vpx_sad32x16_avg_sse2;
@@ -1107,30 +1188,52 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2;
     vpx_sad32x64_avg = vpx_sad32x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2;
-    vpx_sad4x4x3 = vpx_sad4x4x3_c;
-    if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3;
-    vpx_sad4x4x8 = vpx_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1;
     vpx_sad64x32 = vpx_sad64x32_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512;
     vpx_sad64x32_avg = vpx_sad64x32_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512;
     vpx_sad64x64 = vpx_sad64x64_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512;
     vpx_sad64x64_avg = vpx_sad64x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512;
     vpx_sad64x64x4d = vpx_sad64x64x4d_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2;
-    vpx_sad8x16x3 = vpx_sad8x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3;
-    vpx_sad8x16x8 = vpx_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1;
-    vpx_sad8x8x3 = vpx_sad8x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3;
-    vpx_sad8x8x8 = vpx_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1;
+    if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512;
+    vpx_satd = vpx_satd_sse2;
+    if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2;
     vpx_scaled_2d = vpx_scaled_2d_c;
     if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1;
+    if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2;
     vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3;
     vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2;
@@ -1187,16 +1290,30 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3;
     vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
+    vpx_subtract_block = vpx_subtract_block_sse2;
+    if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2;
     vpx_variance16x16 = vpx_variance16x16_sse2;
     if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;
+    vpx_variance16x32 = vpx_variance16x32_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2;
+    vpx_variance16x8 = vpx_variance16x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2;
     vpx_variance32x16 = vpx_variance32x16_sse2;
     if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2;
     vpx_variance32x32 = vpx_variance32x32_sse2;
     if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2;
+    vpx_variance32x64 = vpx_variance32x64_sse2;
+    if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2;
     vpx_variance64x32 = vpx_variance64x32_sse2;
     if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2;
     vpx_variance64x64 = vpx_variance64x64_sse2;
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
+    vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
+    vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
+    vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
 }
 #endif
 
@@ -1204,4 +1321,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/mac/x64/vpx_scale_rtcd.h b/media/libvpx/config/mac/x64/vpx_scale_rtcd.h
index ddf7d01cca..18e5b71579 100644
--- a/media/libvpx/config/mac/x64/vpx_scale_rtcd.h
+++ b/media/libvpx/config/mac/x64/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -66,4 +80,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/vpx_version.h b/media/libvpx/config/vpx_version.h
index 24da169b4f..ba9b63a4a3 100644
--- a/media/libvpx/config/vpx_version.h
+++ b/media/libvpx/config/vpx_version.h
@@ -1,7 +1,11 @@
+// This file is generated. Do not edit.
+#ifndef VPX_VERSION_H_
+#define VPX_VERSION_H_
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  6
-#define VERSION_PATCH  1
+#define VERSION_MINOR  16
+#define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING      " v1.6.1"
+#define VERSION_STRING_NOSP "v1.16.0"
+#define VERSION_STRING      " v1.16.0"
+#endif  // VPX_VERSION_H_
diff --git a/media/libvpx/config/win/ia32/vp8_rtcd.h b/media/libvpx/config/win/ia32/vp8_rtcd.h
index 5db5bbad85..ca42bb8a5e 100644
--- a/media/libvpx/config/win/ia32/vp8_rtcd.h
+++ b/media/libvpx/config/win/ia32/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,57 +37,48 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 int vp8_block_error_sse2(short *coeff, short *dqcoeff);
 RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
 
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -86,9 +88,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
-RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride);
+RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride);
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -98,9 +100,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmx(struct blockd*, short *DQC);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC);
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -122,42 +124,37 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 int vp8_mbblock_error_sse2(struct macroblock *mb, int dc);
@@ -167,9 +164,9 @@ int vp8_mbuverror_c(struct macroblock *mb);
 int vp8_mbuverror_sse2(struct macroblock *mb);
 RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
 void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
@@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -237,9 +234,9 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
     vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
+    if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2;
     vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2;
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
     if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
@@ -277,9 +274,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
     vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
     if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
     vp8_loop_filter_bh = vp8_loop_filter_bh_c;
     if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
     vp8_loop_filter_bv = vp8_loop_filter_bv_c;
@@ -336,4 +330,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/win/ia32/vp9_rtcd.h b/media/libvpx/config/win/ia32/vp9_rtcd.h
index da53895ca4..28c82a831c 100644
--- a/media/libvpx/config/win/ia32/vp9_rtcd.h
+++ b/media/libvpx/config/win/ia32/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,23 +46,22 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -67,18 +83,13 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride);
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -88,20 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
 void vp9_rtcd(void);
 
@@ -113,16 +139,14 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+    if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
     vp9_block_error = vp9_block_error_c;
     if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
     if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
     vp9_block_error_fp = vp9_block_error_fp_c;
     if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2;
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
-    if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
-    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
+    if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
     vp9_fht16x16 = vp9_fht16x16_c;
     if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
     vp9_fht4x4 = vp9_fht4x4_c;
@@ -133,9 +157,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2;
     vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c;
     if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2;
-    vp9_full_search_sad = vp9_full_search_sad_c;
-    if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
     vp9_fwht4x4 = vp9_fwht4x4_c;
     if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2;
     vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
@@ -146,10 +167,22 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
     vp9_quantize_fp = vp9_quantize_fp_c;
     if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2;
+    if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2;
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
-    vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
-    if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+    vpx_convolve12 = vpx_convolve12_c;
+    if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2;
+    vpx_convolve12_horiz = vpx_convolve12_horiz_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2;
+    vpx_convolve12_vert = vpx_convolve12_vert_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2;
 }
 #endif
 
@@ -157,4 +190,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm
index 4d443be604..a157619c4b 100644
--- a/media/libvpx/config/win/ia32/vpx_config.asm
+++ b/media/libvpx/config/win/ia32/vpx_config.asm
@@ -1,9 +1,16 @@
-%define ARCH_ARM 0
-%define ARCH_MIPS 0
-%define ARCH_X86 1
-%define ARCH_X86_64 0
-%define HAVE_NEON 0
+%define VPX_ARCH_ARM 0
+%define VPX_ARCH_AARCH64 0
+%define VPX_ARCH_MIPS 0
+%define VPX_ARCH_X86 1
+%define VPX_ARCH_X86_64 0
+%define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON_ASM 0
+%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
+%define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
@@ -16,6 +23,11 @@
 %define HAVE_SSE4_1 1
 %define HAVE_AVX 1
 %define HAVE_AVX2 1
+%define HAVE_AVX512 1
+%define HAVE_VSX 0
+%define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 0
 %define CONFIG_DEPENDENCY_TRACKING 1
@@ -72,7 +84,10 @@
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
-%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_ALWAYS_ADJUST_BPM 0
+%define CONFIG_BITSTREAM_DEBUG 0
+%define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
-%define CONFIG_MISC_FIXES 0
+%define CONFIG_NON_GREEDY_MV 0
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h
index 1aef765a31..37719d5b8d 100644
--- a/media/libvpx/config/win/ia32/vpx_config.h
+++ b/media/libvpx/config/win/ia32/vpx_config.h
@@ -10,12 +10,19 @@
 #define VPX_CONFIG_H
 #define RESTRICT    
 #define INLINE      inline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 1
-#define ARCH_X86_64 0
-#define HAVE_NEON 0
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 1
+#define VPX_ARCH_X86_64 0
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 1
 #define HAVE_AVX 1
 #define HAVE_AVX2 1
+#define HAVE_AVX512 1
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 0
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h
index 112e326a64..0a6266a322 100644
--- a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,242 +44,215 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p);
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
@@ -300,47 +289,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride);
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *);
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -353,14 +348,22 @@ RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -369,6 +372,7 @@ RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -381,6 +385,7 @@ RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, in
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -395,9 +400,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width);
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
@@ -456,9 +461,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 
-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit);
 
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -468,22 +473,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
 void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
@@ -493,12 +499,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -508,18 +520,9 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -529,9 +532,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -541,18 +544,9 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -564,9 +558,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -578,16 +572,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -599,9 +587,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -611,17 +599,9 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -631,47 +611,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -681,17 +657,9 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -701,12 +669,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -716,273 +681,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_sse2(const int16_t *coeff, int length);
+int vpx_satd_avx2(const int16_t *coeff, int length);
 RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
 RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size);
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -1003,6 +1087,9 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2;
     vpx_avg_8x8 = vpx_avg_8x8_c;
     if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2;
+    vpx_comp_avg_pred = vpx_comp_avg_pred_c;
+    if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
+    if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2;
     vpx_convolve8 = vpx_convolve8_c;
     if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
@@ -1010,12 +1097,15 @@ static void setup_rtcd_internal(void)
     vpx_convolve8_avg = vpx_convolve8_avg_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2;
     vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2;
     vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c;
     if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2;
     vpx_convolve8_horiz = vpx_convolve8_horiz_c;
     if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
@@ -1094,6 +1184,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2;
     vpx_fdct16x16 = vpx_fdct16x16_c;
     if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2;
+    if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2;
     vpx_fdct16x16_1 = vpx_fdct16x16_1_c;
     if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2;
     vpx_fdct32x32 = vpx_fdct32x32_c;
@@ -1129,6 +1220,10 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2;
     vpx_hadamard_16x16 = vpx_hadamard_16x16_c;
     if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_c;
+    if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_c;
     if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
     vpx_idct16x16_10_add = vpx_idct16x16_10_add_c;
@@ -1137,20 +1232,28 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2;
     vpx_idct16x16_256_add = vpx_idct16x16_256_add_c;
     if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2;
+    vpx_idct16x16_38_add = vpx_idct16x16_38_add_c;
+    if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2;
     vpx_idct32x32_135_add = vpx_idct32x32_135_add_c;
-    if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3;
+    if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2;
     vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
     if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3;
     vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
     if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2;
     vpx_idct4x4_1_add = vpx_idct4x4_1_add_c;
     if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2;
     vpx_idct8x8_12_add = vpx_idct8x8_12_add_c;
     if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2;
+    if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3;
     vpx_idct8x8_1_add = vpx_idct8x8_1_add_c;
     if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2;
     vpx_idct8x8_64_add = vpx_idct8x8_64_add_c;
@@ -1198,6 +1301,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2;
     vpx_mse16x8 = vpx_mse16x8_c;
     if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2;
+    if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2;
     vpx_mse8x16 = vpx_mse8x16_c;
     if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2;
     vpx_mse8x8 = vpx_mse8x8_c;
@@ -1208,17 +1312,19 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2;
     vpx_quantize_b = vpx_quantize_b_c;
     if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2;
+    if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3;
+    if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2;
+    vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
+    if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3;
+    if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2;
     vpx_sad16x16 = vpx_sad16x16_c;
     if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2;
     vpx_sad16x16_avg = vpx_sad16x16_avg_c;
     if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2;
-    vpx_sad16x16x3 = vpx_sad16x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3;
     vpx_sad16x16x4d = vpx_sad16x16x4d_c;
     if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2;
-    vpx_sad16x16x8 = vpx_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1;
     vpx_sad16x32 = vpx_sad16x32_c;
     if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2;
     vpx_sad16x32_avg = vpx_sad16x32_avg_c;
@@ -1229,13 +1335,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2;
     vpx_sad16x8_avg = vpx_sad16x8_avg_c;
     if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2;
-    vpx_sad16x8x3 = vpx_sad16x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3;
     vpx_sad16x8x4d = vpx_sad16x8x4d_c;
     if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2;
-    vpx_sad16x8x8 = vpx_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1;
     vpx_sad32x16 = vpx_sad32x16_c;
     if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2;
     if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2;
@@ -1265,12 +1366,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2;
     vpx_sad4x4_avg = vpx_sad4x4_avg_c;
     if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2;
-    vpx_sad4x4x3 = vpx_sad4x4x3_c;
-    if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3;
     vpx_sad4x4x4d = vpx_sad4x4x4d_c;
     if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2;
-    vpx_sad4x4x8 = vpx_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1;
     vpx_sad4x8 = vpx_sad4x8_c;
     if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2;
     vpx_sad4x8_avg = vpx_sad4x8_avg_c;
@@ -1280,30 +1377,31 @@ static void setup_rtcd_internal(void)
     vpx_sad64x32 = vpx_sad64x32_c;
     if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512;
     vpx_sad64x32_avg = vpx_sad64x32_avg_c;
     if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512;
     vpx_sad64x32x4d = vpx_sad64x32x4d_c;
     if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2;
     vpx_sad64x64 = vpx_sad64x64_c;
     if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512;
     vpx_sad64x64_avg = vpx_sad64x64_avg_c;
     if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512;
     vpx_sad64x64x4d = vpx_sad64x64x4d_c;
     if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512;
     vpx_sad8x16 = vpx_sad8x16_c;
     if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2;
     vpx_sad8x16_avg = vpx_sad8x16_avg_c;
     if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2;
-    vpx_sad8x16x3 = vpx_sad8x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3;
     vpx_sad8x16x4d = vpx_sad8x16x4d_c;
     if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2;
-    vpx_sad8x16x8 = vpx_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1;
     vpx_sad8x4 = vpx_sad8x4_c;
     if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2;
     vpx_sad8x4_avg = vpx_sad8x4_avg_c;
@@ -1314,16 +1412,74 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2;
     vpx_sad8x8_avg = vpx_sad8x8_avg_c;
     if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2;
-    vpx_sad8x8x3 = vpx_sad8x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3;
     vpx_sad8x8x4d = vpx_sad8x8x4d_c;
     if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2;
-    vpx_sad8x8x8 = vpx_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1;
+    vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2;
+    vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2;
+    vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2;
+    vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2;
+    vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2;
+    vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2;
+    vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2;
+    vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512;
+    vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2;
+    vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2;
+    vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2;
+    vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c;
+    if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2;
     vpx_satd = vpx_satd_c;
     if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2;
+    if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2;
     vpx_scaled_2d = vpx_scaled_2d_c;
     if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1;
+    if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2;
     vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c;
     if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3;
@@ -1408,6 +1564,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
     vpx_subtract_block = vpx_subtract_block_c;
     if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2;
+    if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2;
     vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c;
     if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2;
     vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c;
@@ -1431,8 +1588,10 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;
     vpx_variance16x32 = vpx_variance16x32_c;
     if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2;
     vpx_variance16x8 = vpx_variance16x8_c;
     if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2;
     vpx_variance32x16 = vpx_variance32x16_c;
     if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2;
     if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2;
@@ -1441,6 +1600,7 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2;
     vpx_variance32x64 = vpx_variance32x64_c;
     if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2;
+    if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2;
     vpx_variance4x4 = vpx_variance4x4_c;
     if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2;
     vpx_variance4x8 = vpx_variance4x8_c;
@@ -1453,10 +1613,13 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
     vpx_variance8x4 = vpx_variance8x4_c;
     if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
     vpx_vector_var = vpx_vector_var_c;
     if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2;
 }
@@ -1466,4 +1629,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/win/ia32/vpx_scale_rtcd.h b/media/libvpx/config/win/ia32/vpx_scale_rtcd.h
index ddf7d01cca..18e5b71579 100644
--- a/media/libvpx/config/win/ia32/vpx_scale_rtcd.h
+++ b/media/libvpx/config/win/ia32/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -66,4 +80,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/config/win/x64/vp8_rtcd.h b/media/libvpx/config/win/x64/vp8_rtcd.h
index 4728639704..9e7746b813 100644
--- a/media/libvpx/config/win/x64/vp8_rtcd.h
+++ b/media/libvpx/config/win/x64/vp8_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
@@ -26,56 +37,47 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2
 
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2
 
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
-
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
-
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 int vp8_block_error_c(short *coeff, short *dqcoeff);
 int vp8_block_error_sse2(short *coeff, short *dqcoeff);
 #define vp8_block_error vp8_block_error_sse2
 
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
 
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
 
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride);
 #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
 
 int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
@@ -86,8 +88,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u
 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
-void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride);
+void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
@@ -98,8 +100,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int
 void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+void vp8_dequantize_b_c(struct blockd*, short *DQC);
+void vp8_dequantize_b_mmx(struct blockd*, short *DQC);
 #define vp8_dequantize_b vp8_dequantize_b_mmx
 
 int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
@@ -122,41 +124,36 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
 
-void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2
 
-void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
 
-void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
 
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
 
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
 
 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
@@ -167,8 +164,8 @@ int vp8_mbuverror_c(struct macroblock *mb);
 int vp8_mbuverror_sse2(struct macroblock *mb);
 #define vp8_mbuverror vp8_mbuverror_sse2
 
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_refining_search_sad vp8_refining_search_sadx4
 
 void vp8_regular_quantize_b_c(struct block *, struct blockd *);
@@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride);
 #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff);
 #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
 
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
@@ -241,9 +238,6 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
     vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
     vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
     if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
     vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
@@ -261,4 +255,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP8_RTCD_H_
diff --git a/media/libvpx/config/win/x64/vp9_rtcd.h b/media/libvpx/config/win/x64/vp9_rtcd.h
index 47a708444d..52d6f0657d 100644
--- a/media/libvpx/config/win/x64/vp9_rtcd.h
+++ b/media/libvpx/config/win/x64/vp9_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -14,12 +25,18 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,23 +46,22 @@ struct yv12_buffer_config;
 extern "C" {
 #endif
 
+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_sse2
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -67,17 +83,12 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst,
 void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
 #define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_sse2
 
-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -88,22 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
-RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
 void vp9_rtcd(void);
 
@@ -115,21 +139,29 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+    if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
     vp9_block_error = vp9_block_error_sse2;
     if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
-    vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
-    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
-    vp9_full_search_sad = vp9_full_search_sad_c;
-    if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
+    vp9_block_error_fp = vp9_block_error_fp_sse2;
+    if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
     vp9_quantize_fp = vp9_quantize_fp_sse2;
     if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
     vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
     if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2;
     vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
     if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
+    vpx_convolve12 = vpx_convolve12_c;
+    if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2;
+    vpx_convolve12_horiz = vpx_convolve12_horiz_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2;
+    vpx_convolve12_vert = vpx_convolve12_vert_c;
+    if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2;
 }
 #endif
 
@@ -137,4 +169,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VP9_RTCD_H_
diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm
index 27e5dabd9d..2044a9e9b8 100644
--- a/media/libvpx/config/win/x64/vpx_config.asm
+++ b/media/libvpx/config/win/x64/vpx_config.asm
@@ -1,9 +1,16 @@
-%define ARCH_ARM 0
-%define ARCH_MIPS 0
-%define ARCH_X86 0
-%define ARCH_X86_64 1
-%define HAVE_NEON 0
+%define VPX_ARCH_ARM 0
+%define VPX_ARCH_AARCH64 0
+%define VPX_ARCH_MIPS 0
+%define VPX_ARCH_X86 0
+%define VPX_ARCH_X86_64 1
+%define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON_ASM 0
+%define HAVE_NEON 0
+%define HAVE_NEON_DOTPROD 0
+%define HAVE_NEON_I8MM 0
+%define HAVE_SVE 0
+%define HAVE_SVE2 0
 %define HAVE_MIPS32 0
 %define HAVE_DSPR2 0
 %define HAVE_MSA 0
@@ -16,6 +23,11 @@
 %define HAVE_SSE4_1 1
 %define HAVE_AVX 1
 %define HAVE_AVX2 1
+%define HAVE_AVX512 1
+%define HAVE_VSX 0
+%define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 0
 %define CONFIG_DEPENDENCY_TRACKING 1
@@ -72,7 +84,10 @@
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
-%define CONFIG_SPATIAL_SVC 0
+%define CONFIG_ALWAYS_ADJUST_BPM 0
+%define CONFIG_BITSTREAM_DEBUG 0
+%define CONFIG_MISMATCH_DEBUG 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
-%define CONFIG_MISC_FIXES 0
+%define CONFIG_NON_GREEDY_MV 0
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c
index 8e75cba148..ace29b7b87 100644
--- a/media/libvpx/config/win/x64/vpx_config.c
+++ b/media/libvpx/config/win/x64/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS.  All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86_64-win64-vs12 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-postproc --enable-vp9-postproc --as=yasm";
+static const char* const cfg = "--target=x86_64-win64-vs17 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-postproc --enable-vp9-postproc --as=yasm";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h
index 435815ad7a..d758715c3e 100644
--- a/media/libvpx/config/win/x64/vpx_config.h
+++ b/media/libvpx/config/win/x64/vpx_config.h
@@ -9,13 +9,20 @@
 #ifndef VPX_CONFIG_H
 #define VPX_CONFIG_H
 #define RESTRICT    
-#define INLINE      __forceinline
-#define ARCH_ARM 0
-#define ARCH_MIPS 0
-#define ARCH_X86 0
-#define ARCH_X86_64 1
-#define HAVE_NEON 0
+#define INLINE      __inline
+#define VPX_ARCH_ARM 0
+#define VPX_ARCH_AARCH64 0
+#define VPX_ARCH_MIPS 0
+#define VPX_ARCH_X86 0
+#define VPX_ARCH_X86_64 1
+#define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON_ASM 0
+#define HAVE_NEON 0
+#define HAVE_NEON_DOTPROD 0
+#define HAVE_NEON_I8MM 0
+#define HAVE_SVE 0
+#define HAVE_SVE2 0
 #define HAVE_MIPS32 0
 #define HAVE_DSPR2 0
 #define HAVE_MSA 0
@@ -28,6 +35,11 @@
 #define HAVE_SSE4_1 1
 #define HAVE_AVX 1
 #define HAVE_AVX2 1
+#define HAVE_AVX512 1
+#define HAVE_VSX 0
+#define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 0
 #define CONFIG_DEPENDENCY_TRACKING 1
@@ -84,10 +96,13 @@
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_ALWAYS_ADJUST_BPM 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
-#define CONFIG_MISC_FIXES 0
+#define CONFIG_NON_GREEDY_MV 0
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
 #define DECODE_WIDTH_LIMIT 8192
 #define DECODE_HEIGHT_LIMIT 4608
 #endif /* VPX_CONFIG_H */
diff --git a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h
index 6f181c0a0e..8e17c8ab8a 100644
--- a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +24,11 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 
 #ifdef __cplusplus
@@ -28,243 +44,216 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p);
 #define vpx_avg_8x8 vpx_avg_8x8_sse2
 
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
-#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_sse2
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_sse2
 
-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
 
-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
 
-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
 
-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
 
-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
 
-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
 
-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c
 
-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
 
-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2
 
-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
-
-void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
-
-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2
 
-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2
 
-void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
-
-void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
-
-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
 
-void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 
-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
-
-void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
-
-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
 
-void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
-
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
-
-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2
 
-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2
 
-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2
 
-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2
 
-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2
 
-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2
 
-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2
 
-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2
 
-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2
 
-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2
 
-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2
 
-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2
 
-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2
 
-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2
 
-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2
 
-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_sse2
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride);
+RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride);
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride);
@@ -301,48 +290,54 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2
 
-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c
 
-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
 #define vpx_get8x8var vpx_get8x8var_sse2
 
 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 #define vpx_get_mb_ss vpx_get_mb_ss_sse2
 
-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2
 
-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2
 
-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2
 
-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff);
-RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 
-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -355,16 +350,22 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
+RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride);
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride);
 RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -395,15 +396,14 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride);
-RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2
 
 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_sse2
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_sse2
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -463,8 +463,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2
 
-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
 #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2
 
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -475,21 +475,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 #define vpx_minmax_8x8 vpx_minmax_8x8_sse2
 
-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-#define vpx_mse16x8 vpx_mse16x8_sse2
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x16 vpx_mse8x16_sse2
 
-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_sse2
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch);
@@ -500,16 +501,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
+RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order);
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -519,19 +522,10 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_sse2
@@ -540,8 +534,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -552,19 +546,10 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -575,8 +560,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -589,16 +574,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -610,8 +589,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const
 unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -622,18 +601,10 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_sse2
@@ -642,47 +613,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -692,18 +659,10 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_sse2
@@ -712,13 +671,10 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_sse2
@@ -727,273 +683,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2
+
+void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2
+
+unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2
+
+void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2
+
+unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2
+
+void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2
+
+unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c
+
+void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c
+
+unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2
+
+void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2
+
+unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+
+void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+
+unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2
+
+void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2
+
+unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c
+
+void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c
+
+unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2
+
+void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2
 
 int vpx_satd_c(const int16_t *coeff, int length);
 int vpx_satd_sse2(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_sse2
+int vpx_satd_avx2(const int16_t *coeff, int length);
+RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length);
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
-uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
+RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height);
 
-uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
 
-uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
-uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 
 void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vpx_subtract_block vpx_subtract_block_sse2
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
 
 uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
 #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2
 
-void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2
 
-void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2
 
-void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2
 
-void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2
 
-void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2
 
-void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2
 
-void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2
 
-void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2
 
-unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x32 vpx_variance16x32_sse2
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance16x8 vpx_variance16x8_sse2
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance32x64 vpx_variance32x64_sse2
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x4 vpx_variance4x4_sse2
 
-unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_variance4x8 vpx_variance4x8_sse2
 
-unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x16 vpx_variance8x16_sse2
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_sse2
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x8 vpx_variance8x8_sse2
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
-void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
 
 int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
@@ -1010,15 +1085,20 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
 
+    vpx_comp_avg_pred = vpx_comp_avg_pred_sse2;
+    if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2;
     vpx_convolve8 = vpx_convolve8_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3;
     if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2;
     vpx_convolve8_avg = vpx_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2;
     vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2;
     vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3;
+    if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2;
     vpx_convolve8_horiz = vpx_convolve8_horiz_sse2;
     if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3;
     if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2;
@@ -1051,6 +1131,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3;
     vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c;
     if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3;
+    vpx_fdct16x16 = vpx_fdct16x16_sse2;
+    if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2;
     vpx_fdct32x32 = vpx_fdct32x32_sse2;
     if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2;
     vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2;
@@ -1059,40 +1141,39 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3;
     vpx_get16x16var = vpx_get16x16var_sse2;
     if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2;
+    vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2;
+    vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2;
     vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2;
     if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3;
+    vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2;
     vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2;
-    if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3;
-    vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2;
+    if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2;
+    vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3;
+    if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3;
     vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2;
     if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3;
-    vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2;
-    if (flags & HAS_SSSE3) vpx_idct8x8_64_add = vpx_idct8x8_64_add_ssse3;
     vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2;
     if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2;
     vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2;
     if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2;
     vpx_mse16x16 = vpx_mse16x16_sse2;
     if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2;
+    vpx_mse16x8 = vpx_mse16x8_sse2;
+    if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2;
     vpx_quantize_b = vpx_quantize_b_sse2;
     if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3;
     if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx;
+    if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2;
     vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c;
     if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3;
     if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx;
-    vpx_sad16x16x3 = vpx_sad16x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3;
-    vpx_sad16x16x8 = vpx_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1;
-    vpx_sad16x8x3 = vpx_sad16x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3;
-    vpx_sad16x8x8 = vpx_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1;
+    if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2;
     vpx_sad32x16 = vpx_sad32x16_sse2;
     if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2;
     vpx_sad32x16_avg = vpx_sad32x16_avg_sse2;
@@ -1107,30 +1188,52 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2;
     vpx_sad32x64_avg = vpx_sad32x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2;
-    vpx_sad4x4x3 = vpx_sad4x4x3_c;
-    if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3;
-    vpx_sad4x4x8 = vpx_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1;
     vpx_sad64x32 = vpx_sad64x32_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512;
     vpx_sad64x32_avg = vpx_sad64x32_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512;
     vpx_sad64x64 = vpx_sad64x64_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512;
     vpx_sad64x64_avg = vpx_sad64x64_avg_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2;
+    if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512;
     vpx_sad64x64x4d = vpx_sad64x64x4d_sse2;
     if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2;
-    vpx_sad8x16x3 = vpx_sad8x16x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3;
-    vpx_sad8x16x8 = vpx_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1;
-    vpx_sad8x8x3 = vpx_sad8x8x3_c;
-    if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3;
-    vpx_sad8x8x8 = vpx_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1;
+    if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512;
+    vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2;
+    vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2;
+    vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2;
+    vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2;
+    vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2;
+    vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2;
+    vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512;
+    vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512;
+    vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512;
+    vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2;
+    if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2;
+    if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512;
+    vpx_satd = vpx_satd_sse2;
+    if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2;
     vpx_scaled_2d = vpx_scaled_2d_c;
     if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3;
+    vpx_sse = vpx_sse_c;
+    if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1;
+    if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2;
     vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3;
     vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2;
@@ -1187,16 +1290,30 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3;
     vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2;
     if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
+    vpx_subtract_block = vpx_subtract_block_sse2;
+    if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2;
     vpx_variance16x16 = vpx_variance16x16_sse2;
     if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;
+    vpx_variance16x32 = vpx_variance16x32_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2;
+    vpx_variance16x8 = vpx_variance16x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2;
     vpx_variance32x16 = vpx_variance32x16_sse2;
     if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2;
     vpx_variance32x32 = vpx_variance32x32_sse2;
     if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2;
+    vpx_variance32x64 = vpx_variance32x64_sse2;
+    if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2;
     vpx_variance64x32 = vpx_variance64x32_sse2;
     if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2;
     vpx_variance64x64 = vpx_variance64x64_sse2;
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
+    vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
+    vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
+    vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
 }
 #endif
 
@@ -1204,4 +1321,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_DSP_RTCD_H_
diff --git a/media/libvpx/config/win/x64/vpx_scale_rtcd.h b/media/libvpx/config/win/x64/vpx_scale_rtcd.h
index ddf7d01cca..18e5b71579 100644
--- a/media/libvpx/config/win/x64/vpx_scale_rtcd.h
+++ b/media/libvpx/config/win/x64/vpx_scale_rtcd.h
@@ -1,3 +1,14 @@
+/*
+ *  Copyright (c) 2026 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
@@ -66,4 +80,4 @@ static void setup_rtcd_internal(void)
 }  // extern "C"
 #endif
 
-#endif
+#endif  // VPX_SCALE_RTCD_H_
diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh
index 57d648fc60..cb0c2793ac 100644
--- a/media/libvpx/generate_sources_mozbuild.sh
+++ b/media/libvpx/generate_sources_mozbuild.sh
@@ -195,14 +195,24 @@ all_platforms="--enable-external-build --disable-examples --disable-install-docs
 all_platforms="${all_platforms} --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic"
 x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm"
 arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only"
+aarch64_platforms="--enable-runtime-cpu-detect --enable-realtime-only"
+mips64_platforms="--enable-runtime-cpu-detect --cpu=loongson3"
+ppc64le_platforms="--enable-runtime-cpu-detect --enable-vsx"
+other_arch_platforms="--enable-runtime-cpu-detect"
 gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}"
 gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}"
 gen_config_files mac/x64 "--target=x86_64-darwin9-gcc ${all_platforms} ${x86_platforms}"
 gen_config_files mac/ia32 "--target=x86-darwin9-gcc ${all_platforms} ${x86_platforms}"
-gen_config_files win/x64 "--target=x86_64-win64-vs12 ${all_platforms} ${x86_platforms}"
+gen_config_files win/x64 "--target=x86_64-win64-vs17 ${all_platforms} ${x86_platforms}"
 gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}"
 
 gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}"
+gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${aarch64_platforms}"
+gen_config_files mac/arm64 "--target=arm64-darwin-gcc ${all_platforms} ${aarch64_platforms}"
+gen_config_files linux/mips32 "--target=mips32-linux-gcc ${all_platforms} ${other_arch_platforms}"
+gen_config_files linux/mips64 "--target=mips64-linux-gcc ${all_platforms} ${mips64_platforms}"
+gen_config_files linux/ppc64le "--target=ppc64le-linux-gcc ${all_platforms} ${ppc64le_platforms}"
+gen_config_files linux/loongarch64 "--target=loongarch64-linux-gcc ${all_platforms} ${other_arch_platforms}"
 
 gen_config_files generic "--target=generic-gnu ${all_platforms}"
 
@@ -224,6 +234,12 @@ gen_rtcd_header win/x64 x86_64
 gen_rtcd_header win/ia32 x86
 
 gen_rtcd_header linux/arm armv7
+gen_rtcd_header linux/arm64 arm64
+gen_rtcd_header mac/arm64 arm64
+gen_rtcd_header linux/mips32 mips32
+gen_rtcd_header linux/mips64 mips64
+gen_rtcd_header linux/ppc64le ppc64le
+gen_rtcd_header linux/loongarch64 loongarch64
 
 gen_rtcd_header generic generic
 
@@ -257,6 +273,36 @@ make_clean
 make libvpx_srcs.txt target=libs $config > /dev/null
 convert_srcs_to_project_files libvpx_srcs.txt ARM
 
+echo "Generate AARCH64 source list."
+config=$(print_config linux/arm64)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+convert_srcs_to_project_files libvpx_srcs.txt AARCH64
+
+echo "Generate MIPS32 source list."
+config=$(print_config linux/mips32)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+convert_srcs_to_project_files libvpx_srcs.txt MIPS32
+
+echo "Generate MIPS64 source list."
+config=$(print_config linux/mips64)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+convert_srcs_to_project_files libvpx_srcs.txt MIPS64
+
+echo "Generate PPC64LE source list."
+config=$(print_config linux/ppc64le)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+convert_srcs_to_project_files libvpx_srcs.txt PPC64LE
+
+echo "Generate LOONGARCH64 source list."
+config=$(print_config linux/loongarch64)
+make_clean
+make libvpx_srcs.txt target=libs $config > /dev/null
+convert_srcs_to_project_files libvpx_srcs.txt LOONGARCH64
+
 echo "Generate generic source list."
 config=$(print_config generic)
 make_clean
diff --git a/media/libvpx/libvpx/.clang-format b/media/libvpx/libvpx/.clang-format
index d91cf89aac..a8bc4967c3 100644
--- a/media/libvpx/libvpx/.clang-format
+++ b/media/libvpx/libvpx/.clang-format
@@ -1,91 +1,9 @@
 ---
 Language:        Cpp
-# BasedOnStyle:  Google
-# Generated with clang-format 3.8.1
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlinesLeft: true
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
+BasedOnStyle:  Google
 AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  BeforeCatch:     false
-  BeforeElse:      false
-  IndentBraces:    false
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
 Cpp11BracedListStyle: false
 DerivePointerAlignment: false
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
-IncludeCategories:
-  - Regex:           '^<.*\.h>'
-    Priority:        1
-  - Regex:           '^<.*'
-    Priority:        2
-  - Regex:           '.*'
-    Priority:        3
-IndentCaseLabels: true
-IndentWidth:     2
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
-ReflowComments:  true
 SortIncludes:    false
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles:  false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Auto
-TabWidth:        8
-UseTab:          Never
-...
-
diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap
index 94cb1ecfe7..2fc6c7d39d 100644
--- a/media/libvpx/libvpx/.mailmap
+++ b/media/libvpx/libvpx/.mailmap
@@ -1,37 +1,58 @@
 Adrian Grange <agrange@google.com>
-Aℓex Converse <aconverse@google.com>
-Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Aℓex Converse <alexconv@twitch.tv>
+Aℓex Converse <alexconv@twitch.tv> <aconverse@google.com>
+Aℓex Converse <alexconv@twitch.tv> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Angie Chiang <angiebird@google.com>
+Bohan Li <bohanli@google.com>
+Chris Cunningham <chcunningham@chromium.org>
+Chi Yo Tsai <chiyotsai@google.com>
 Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
 Deb Mukherjee <debargha@google.com>
+Elliott Karpilovsky <elliottk@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Fyodor Kyslov <kyslov@google.com>
+Gregor Jasny <gjasny@gmail.com>
+Gregor Jasny <gjasny@gmail.com> <gjasny@googlemail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hui Su <huisu@google.com>
 Jacky Chen <jackychen@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
 Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann <johann@duck.com> <johann.koenig@gmail.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
 Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
+Martin Storsjö <martin@martin.st>
+Michael Horowitz <mhoro@webrtc.org> <mhoro@google.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
+Peter Boström <pbos@chromium.org> <pbos@google.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
+Sai Deng <sdeng@google.com>
 Sami Pietilä <samipietila@google.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Urvang Joshi <urvang@google.com> <urvang@chromium.org>
+Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <Yaowu Xu>
+Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
+Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org>
+Xiwei Gu <guxiwei-hf@loongson.cn>
diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS
index 87a5e845cf..7d264b36ac 100644
--- a/media/libvpx/libvpx/AUTHORS
+++ b/media/libvpx/libvpx/AUTHORS
@@ -3,60 +3,102 @@
 
 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
-Adam Xu <adam@xuyaowu.com>
+Adam B. Goode <adam.mckee84@gmail.com>
 Adrian Grange <agrange@google.com>
-Aℓex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
+Aidan Welch <aidansw@yahoo.com>
 Aleksey Vasenev <margtu-fivt@ya.ru>
 Alexander Potapenko <glider@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Alexandra Hájková <alexandra.khirnova@gmail.com>
+Aℓex Converse <alexconv@twitch.tv>
+Alex Davicenko <alex.davicenko@arm.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
 A.Mahfoodh <ab.mahfoodh@gmail.com>
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
+Andres Calderon Jaramillo <andrescj@chromium.org>
 Andres Mejia <mcitadel@gmail.com>
+Andrew Lewis <andrewlewis@google.com>
 Andrew Russell <anrussell@google.com>
+Andrew Salkeld <andrew.salkeld@arm.com>
+Angie Chen <yunqi@google.com>
 Angie Chiang <angiebird@google.com>
+Anton Venema <anton.venema@liveswitch.com>
+Anupam Pandey <anupam.pandey@ittiam.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+Birk Magnussen <birk.magnussen@googlemail.com>
+Bohan Li <bohanli@google.com>
+Brian Foley <bpfoley@google.com>
 Brion Vibber <bvibber@wikimedia.org>
+Casey Smalley <casey.smalley@arm.com>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
+Cheng Chen <chengchen@google.com>
+Chen Wang <wangchen20@iscas.ac.cn>
+Cherma Rajan A <cherma.rajan@ittiam.com>
+Chi Yo Tsai <chiyotsai@google.com>
 chm <chm@rock-chips.com>
+Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
+Chunbo Hua <chunbo.hua@intel.com>
+Chun-Min Chang <chun.m.chang@gmail.com>
+Clement Courbet <courbet@google.com>
+Daniel Cheng <dcheng@chromium.org>
 Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
+Daniel Sommermann <dcsommer@gmail.com>
+Dan Zhu <zxdan@google.com>
+David Benjamin <davidben@google.com>
 Deb Mukherjee <debargha@google.com>
 Deepa K G <deepa.kg@ittiam.com>
+Diksha Singh <diksha.singh@ittiam.com>
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
 Ed Baker <edward.baker@intel.com>
 Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Elliott Karpilovsky <elliottk@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
+Feng, Feifei <Feifei.Feng@partner.bmw.de>
+Florian Mayer <fmayer@google.com>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
+Fyodor Kyslov <kyslov@google.com>
 Gabriel Marin <gmx@chromium.org>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+George Steed <george.steed@arm.com>
+Gerda Zsejke More <gerdazsejke.more@arm.com>
 Geza Lore <gezalore@gmail.com>
 Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Gregor Jasny <gjasny@gmail.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
+Hang Nguyen <hnt@google.com>
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
+Han Shen <shenhan@google.com>
+Hao Chen <chenhao@loongson.cn>
+Hari Limaye <hari.limaye@arm.com>
+Harish Mahendrakar <harish.mahendrakar@ittiam.com>
 Henrik Lundin <hlundin@google.com>
+Hien Ho <hienho@google.com>
+Hirokazu Honda <hiroh@chromium.org>
 Hui Su <huisu@google.com>
+Ilya Kurdyukov <jpegqs@gmail.com>
 Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
 Jacky Chen <jackychen@google.com>
 James Berry <jamesberry@google.com>
+James Touton <bekenn@gmail.com>
 James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
 Jan Gerber <j@mailb.org>
@@ -66,44 +108,71 @@ Jean-Yves Avenard <jyavenard@mozilla.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jeremy Dorfman <jdorfman@google.com>
+Jeremy Leconte <jleconte@google.com>
 Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
+Jianhui Dai <jianhui.j.dai@intel.com>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
+jinbo <jinbo-hf@loongson.cn>
+Jin Bo <jinbo@loongson.cn>
 Jingning Han <jingning@google.com>
+Joel Fernandes <joelaf@google.com>
 Joey Parrish <joeyparrish@google.com>
+Johann <johann@duck.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
 John Stark <jhnstrk@gmail.com>
+Jonathan Wright <jonathan.wright@arm.com>
+Jon Kunkee <jkunkee@microsoft.com>
+Jorge E. Moreira <jemoreira@google.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
 Julia Robson <juliamrobson@gmail.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 Kaustubh Raste <kaustubh.raste@imgtec.com>
+Kexy Biscuit <kexybiscuit@aosc.io>
 KO Myung-Hun <komh@chollian.net>
+Konstantinos Margaritis <konma@vectorcamp.gr>
+Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
+L. E. Segovia <amy@amyspark.me>
 Linfeng Zhang <linfengz@google.com>
+Lin Zheng <linzhen@google.com>
+Liu Peng <pengliu.mail@gmail.com>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
+Luc Trudeau <luc@trud.ca>
+Lu Wang <wanglu@loongson.cn>
 Makoto Kato <makoto.kt@gmail.com>
 Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
 Mark Mentovai <mark@chromium.org>
 Martin Ettl <ettl.martin78@googlemail.com>
-Martin Storsjo <martin@martin.st>
+Martin Storsjö <martin@martin.st>
 Matthew Heaney <matthewjheaney@chromium.org>
+Matthias Räncker <theonetruecamper@gmx.de>
+Michael Horowitz <mhoro@webrtc.org>
 Michael Kohler <michaelkohler@live.com>
+Michał Janiszewski <janisozaur@gmail.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Mikko Koivisto <mikko.koivisto@unikie.com>
 Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
 Min Ye <yeemmi@google.com>
+Mirko Bonadei <mbonadei@google.com>
+Moriyoshi Koizumi <mozo@mozo.jp>
 Morton Jonuschat <yabawock@gmail.com>
 Nathan E. Egge <negge@mozilla.com>
+Neeraj Gadgil <neeraj.gadgil@ittiam.com>
+Neil Birkbeck <neil.birkbeck@gmail.com>
 Nico Weber <thakis@chromium.org>
+Niveditha Rau <niveditha.rau@gmail.com>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Patrik Westin <patrik.westin@gmail.com>
@@ -111,29 +180,47 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
-Peter Boström <pbos@google.com>
+Peter Boström <pbos@chromium.org>
+Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
+Peter Kasting <pkasting@chromium.org>
 Philip Jägenstedt <philipj@opera.com>
+Philippe Antoine <p.antoine@catenacyber.fr>
+Philipp Hancke <phancke@meta.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
+Rafael de Lucena Valle <rafaeldelucena@gmail.com>
+Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
 Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
+Raphael Kubo da Costa <raphael.kubo.da.costa@intel.com>
+Ravi Chaudhary <ravi.chaudhary@ittiam.com>
+Ritu Baldwa <ritu.baldwa@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
+Sai Deng <sdeng@google.com>
+Salome Thirot <salome.thirot@arm.com>
 Sami Pietilä <samipietila@google.com>
+Sam James <sam@gentoo.org>
 Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
 Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
+Sergey Silkin <ssilkin@google.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
+Shubham Tandle <shubham.tandle@ittiam.com>
 Shunyao Li <shunyaoli@google.com>
+Sreerenj Balachandran <bsreerenj@gmail.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
+Supradeep T R <supradeep.tr@ittiam.com>
+Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
@@ -145,13 +232,26 @@ Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
 Urvang Joshi <urvang@google.com>
+Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vitaly Buka <vitalybuka@chromium.org>
+Vlad Tsyrklevich <vtsyrklevich@chromium.org>
+Wan-Teh Chang <wtc@google.com>
+Wonkap Jang <wonkap@google.com>
+Xiahong Bao <xiahong.bao@nxp.com>
+Xiwei Gu <guxiwei-hf@loongson.cn>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
+Yongseok Jeon <ysjeon741@gmail.com>
 Yongzhe Wang <yongzhe@google.com>
+yuanhecai <yuanhecai@loongson.cn>
+Yue Chen <yuec@google.com>
+Yun Liu <yliuyliu@google.com>
 Yunqing Wang <yunqingwang@google.com>
 Yury Gitman <yuryg@google.com>
 Zoe Liu <zoeliu@google.com>
+Zoltan Kuscsik <zoltan@s57.io>
+zuxy <zuxy.meng@gmail.com>
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG
index 7e7aec67ac..df77c20235 100644
--- a/media/libvpx/libvpx/CHANGELOG
+++ b/media/libvpx/libvpx/CHANGELOG
@@ -1,3 +1,493 @@
+2026-01-06 v1.16.0 "Xenonetta Duck"
+  This release includes Arm SVE2 and Neon optimizations for 12-tap filters,
+  AVX512 implementations for SAD, support for per-frame and per-spatial-layer
+  PSNR calculation, and numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+
+    Unit tests require C++17 to build.
+
+    Support for 32-bit iOS targets (armv7, armv7s, and i386) has been removed.
+
+  - Enhancement:
+    Optimized Arm SVE2 and Neon implementations for 12-tap convolution filters.
+    Optimized Neon High Bitdepth (HBD) SAD and sad_avg functions.
+    Added Arm Neon DotProd and I8MM implementations for vpx_convolve12.
+    Added AVX512 implementations for SAD64 and sad_skip functions.
+    Added SSSE3 and AVX2 implementations for 12-tap temporal filter prediction.
+    Added support for per-frame and per-spatial-layer PSNR calculation.
+
+    Adjusted temporal filter strength to improve visual quality and reduce block
+    artifacts.
+
+    Added support for darwin24 (macOS 15) and darwin25 (macOS 26).
+    libwebm is upgraded to commit b4f01ea.
+
+  - Bug fixes:
+    Fix to heap buffer overflow in vp9_deblock, vp9_post_proc_frame, and
+    vp9_pack_bitstream.
+
+    Fix to integer overflow in vp9_highbd_post_proc, vp9_rc_regulate_q,
+    tiny_ssim, and vp9_calc_pframe_target_size_one_pass_cbr.
+
+    Fix to use-of-uninitialized-value in vp9_highbd_post_proc, mfqe, and
+    vp8_datarate_test.
+
+    Fix to out-of-bounds in log_tile_cols_from_picsize_level.
+    Fix to double free on initialization failure in vpx_codec_enc_init_multi.
+    Fix to division-by-zero crash in vpxenc with 0 FPS numerator input.
+
+    Fix to various build failures for Arm/SVE2, macOS cross-compilation, and
+    Xcode 16.
+
+2025-05-28 v1.15.2 "Wigeon Duck"
+  This release fixes CVE-2025-5283 (bug webm:413411335), and is ABI compatible
+  with the previous release.
+
+2025-01-09 v1.15.1 "Wigeon Duck"
+  This release bumps up the SO major version and fixes the language about ABI
+  compatibility in the previous release changelog.
+
+2024-10-22 v1.15.0 "Wigeon Duck"
+  This release includes new codec control for key frame filtering, more Neon
+  optimizations, improvements to RTC encoding and bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+
+    It is strongly recommended to skip this release and upgrade to v1.15.1 since
+    the shared object was versioned incorrectly, as shown in
+    https://issues.webmproject.org/issues/384672478.
+
+    Temporal filtering improvement that can be turned on with the new codec
+    control VP9E_SET_KEY_FRAME_FILTERING, which gives 1+% BD-rate saving with
+    minimal encoder time increase.
+
+    libwebm is upgraded to libwebm-1.0.0.31-10-g3b63004
+
+  - Enhancement:
+    Neon optimization speed up
+      1-3% speed up across speed 5 to 10 for RTC
+      3% speed up for speed 0 and 1 for VoD in standard bitdepth
+      3% and 7% speed up for speed 0 and 1 respectively for VoD in high bitdepth
+    Scene detection is allowed for all RTC speeds (>=5)
+    Support profile guided optimizations
+
+    Delta quantization parameters for UV channels for vp8 is supported in RTC
+    rate control library
+
+    Rate control parameters are reset and maximum QP is enforced on scene
+    changes in SVC when there is no inter-layer prediction
+
+  - Bug fixes:
+    Fix to Uninitialized scalar variable in `vp9_rd_pick_inter_mode_sb()`
+    Fix to Integer-overflow in `resize_multistep`
+    Fix to Heap-buffer-overflow in `vpx_sad64x64_avx2`
+    Fix to Crash in `vpx_sad8x8_sse2`
+    Fix to Assertion in `write_modes`
+    Support profile guided optimizations
+    Fix to Integer-overflow in `encode_frame_to_data_rate`
+    Fix to Integer-overflow in `vp9_svc_check_reset_layer_rc_flag`
+    Fix to core dump error from /usr/bin/tools/tiny_ssim --help
+    Fix to use-of-uninitialized-value in `vp9_setup_tpl_stats`
+    Fix to Undefined-shift in `vp9_cyclic_refresh_setup`
+    Fix to redundant `&& __GNUC__` preproc check
+    Fix to valgrind warning in EncodeAPI.OssFuzz69906
+    Fix to Index-out-of-bounds in `vp8_rd_pick_inter_mode`
+    Fix to Integer-overflow in `vp8_pick_frame_size`
+    Fix to Use-of-uninitialized-value in `vpx_codec_peek_stream_info`
+    Fix to log clutters with the message "Warning: Desired height too large"
+    Fix to Integer-overflow in `vp9_svc_adjust_avg_frame_qindex`
+
+    Fix to integer overflows caused by huge target bitrate, frame rate, or
+    g_timebase numerator or denominator
+
+    Fix to missing license headers
+    Fix to build failure for Android Armv7
+    Fix to integer overflows in image helpers
+    Fix to Integer-overflow in `vp9_calc_iframe_target_size_one_pass_cbr`
+    Fix to Heap-buffer-overflow in `vp9_pick_inter_mode`
+    Fix to Segv in `vp9_multi_thread_tile_init`
+    Fix to Use-of-uninitialized-value in `vp9_row_mt_sync_mem_dealloc`
+    Fix to Crash in `mbloop_filter_vertical_edge_c`
+    Fix to Check failed in CheckUnwind
+    Fix to Heap-buffer-overflow in `write_modes_b` and `vpx_write`
+    Fix to Possible signed integer overflow found in `vpx_codec_encode`
+    Fix to build conflicts between Abseil and libaom/libvpx in Win ARM64 builds
+    Fix to build failures on aarch64
+    Fix to Data race in libvpx ARM NEON
+    Fix to Heap-buffer-overflow in `scale_plane_1_to_2_phase_0`
+    Fix to integer overflow in `encode_mb_row`
+    Fix to Floating-point-exception in `vp8_pick_frame_size`
+    Fix to Heap-buffer-overflow in `vp9_enc_setup_mi`
+    Fix to build failure with --target=arm64-win64-vs17
+    Fix to heap-buffer-overflow write in `vpx_img_read()`
+    Fix to C vs armv8-linux-gcc encode mismatches for `y4m_360p_10bit_input`
+    Fix to Null-dereference READ in `ml_predict_var_rd_partitioning`
+    Fix to Heap-buffer-overflow in `vpx_scaled_2d_ssse3`
+    Fix to Crash in `convolve_horiz`
+    Fix to Ill in `vpx_scaled_2d_ssse3`
+    Fix to Global-buffer-overflow in `cost_coeffs`
+
+2024-05-21 v1.14.1 "Venetian Duck"
+  This release includes enhancements and bug fixes.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+
+  - Enhancement:
+    Improved the detection of compiler support for AArch64 extensions,
+    particularly SVE.
+
+    Added vpx_codec_get_global_headers() support for VP9.
+
+  - Bug fixes:
+    Added buffer bounds checks to vpx_writer and vpx_write_bit_buffer.
+    Fix to GetSegmentationData() crash in aq_mode=0 for RTC rate control.
+    Fix to alloc for row_base_thresh_freq_fac.
+    Free row mt memory before freeing cpi->tile_data.
+    Fix to buffer alloc for vp9_bitstream_worker_data.
+    Fix to VP8 race issue for multi-thread with pnsr_calc.
+    Fix to uv width/height in vp9_scale_and_extend_frame_ssse3.
+    Fix to integer division by zero and overflow in calc_pframe_target_size().
+    Fix to integer overflow in vpx_img_alloc() & vpx_img_wrap()(CVE-2024-5197).
+    Fix to UBSan error in vp9_rc_update_framerate().
+    Fix to UBSan errors in vp8_new_framerate().
+    Fix to integer overflow in vp8 encodeframe.c.
+    Handle EINTR from sem_wait().
+
+2024-01-02 v1.14.0 "Venetian Duck"
+  This release drops support for old C compilers, such as Visual Studio 2012
+  and older, that disallow mixing variable declarations and statements (a C99
+  feature). It adds support for run-time CPU feature detection for Arm
+  platforms, as well as support for darwin23 (macOS 14).
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+
+    Various new features for rate control library for real-time: SVC parallel
+    encoding, loopfilter level, support for frame dropping, and screen content.
+
+    New callback function send_tpl_gop_stats for vp9 external rate control
+    library, which can be used to transmit TPL stats for a group of pictures. A
+    public header vpx_tpl.h is added for the definition of TPL stats used in
+    this callback.
+
+    libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c.
+
+  - Enhancement:
+    Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8,
+    68%-151% speed up for high bitdepth.
+
+    Improvements on AVX2 and SSE optimizations.
+    Improvements on LSX optimizations for LoongArch.
+    42-49% speedup on speed 0 VoD encoding.
+    Android API level predicates.
+
+  - Bug fixes:
+    Fix to missing prototypes from the rtcd header.
+    Fix to segfault when total size is enlarged but width is smaller.
+    Fix to the build for arm64ec using MSVC.
+    Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic.
+    Fix to -Wshadow warnings.
+    Fix to heap overflow in vpx_get4x4sse_cs_neon.
+    Fix to buffer overrun in highbd Neon subpel variance filters.
+    Added bitexact encode test script.
+    Fix to -Wl,-z,defs with Clang's sanitizers.
+    Fix to decoder stability after error & continued decoding.
+    Fix to mismatch of VP9 encode with NEON intrinsics with C only version.
+    Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon.
+    Fix to fragments count before use.
+    Fix to a case where target bandwidth is 0 for SVC.
+    Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob.
+    Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr.
+    Fix to integer overflow in vp8,ratectrl.c.
+    Fix to integer overflow in vp9 svc.
+    Fix to avg_frame_bandwidth overflow.
+    Fix to per frame qp for temporal layers.
+    Fix to unsigned integer overflow in sse computation.
+    Fix to uninitialized mesh feature for BEST mode.
+    Fix to overflow in highbd temporal_filter.
+    Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon.
+    Skip arm64_neon.h workaround w/VS >= 2019.
+    Fix to c vs avx mismatch of diamond_search_sad().
+    Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function.
+    Fix to a bug in vpx_hadamard_32x32_neon().
+    Fix to Clang -Wunreachable-code-aggressive warnings.
+    Fix to a bug in vpx_highbd_hadamard_32x32_neon().
+    Fix to -Wunreachable-code in mfqe_partition.
+    Force mode search on 64x64 if no mode is selected.
+    Fix to ubsan failure caused by left shift of negative.
+    Fix to integer overflow in calc_pframe_target_size.
+    Fix to float-cast-overflow in vp8_change_config().
+    Fix to a null ptr before use.
+    Conditionally skip using inter frames in speed features.
+    Remove invalid reference frames.
+    Disable intra mode search speed features conditionally.
+    Set nonrd keyframe under dynamic change of deadline for rtc.
+    Fix to scaled reference offsets.
+    Set skip_recode=0 in nonrd_pick_sb_modes.
+    Fix to an edge case when downsizing to one.
+    Fix to a bug in frame scaling.
+    Fix to pred buffer stride.
+    Fix to a bug in simple motion search.
+    Update frame size in actual encoding.
+
+2023-09-29 v1.13.1 "Ugly Duckling"
+  This release contains two security related fixes. One each for VP8 and VP9.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+
+  - Bug fixes:
+    https://crbug.com/1486441 (CVE-2023-5217)
+    Fix to a crash related to VP9 encoding (#1642, CVE-2023-6349)
+
+2023-01-31 v1.13.0 "Ugly Duckling"
+  This release includes more Neon and AVX2 optimizations, adds a new codec
+  control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
+  numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+
+    New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
+
+    GoogleTest is upgraded to v1.12.1.
+
+    .clang-format is upgraded to clang-format-11.
+
+    VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
+    feature of using external rate control models for vp9.
+
+  - Enhancement:
+    Numerous improvements on Neon optimizations.
+    Numerous improvements on AVX2 optimizations.
+    Additional ARM targets added for Visual Studio.
+
+  - Bug fixes:
+    Fix to calculating internal stats when frame dropped.
+    Fix to segfault for external resize test in vp9.
+    Fix to build system with replacing egrep with grep -E.
+    Fix to a few bugs with external RTC rate control library.
+    Fix to make SVC work with VBR.
+    Fix to key frame setting in VP9 external RC.
+    Fix to -Wimplicit-int (Clang 16).
+    Fix to VP8 external RC for buffer levels.
+    Fix to VP8 external RC for dynamic update of layers.
+    Fix to VP9 auto level.
+    Fix to off-by-one error of max w/h in validate_config.
+    Fix to make SVC work for Profile 1.
+
+2022-06-17 v1.12.0 "Torrent Duck"
+  This release adds optimizations for Loongarch, adds support for vp8 in the
+  real-time rate control library, upgrades GoogleTest to v1.11.0, updates
+  libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+
+    vp8 support in the real-time rate control library.
+    New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added.
+
+    Configure support for darwin21 is added.
+
+    GoogleTest is upgraded to v1.11.0.
+
+    libwebm is updated to libwebm-1.0.0.28-20-g206d268.
+
+    Allow SimpleEncode environment to take target level as input to match
+    the level conformance in vp9.
+
+  - Enhancement:
+    Numerous improvements on checking memory allocations.
+    Optimizations for Loongarch.
+    Code clean-up.
+
+  - Bug fixes:
+    Fix to a crash related to {vp8/vp9}_set_roi_map.
+    Fix to compiling failure with -Wformat-nonliteral.
+    Fix to integer overflow with vp9 with high resolution content.
+    Fix to AddNoiseTest failure with ARMv7.
+    Fix to libvpx Null-dereference READ in vp8.
+
+2021-09-27 v1.11.0 "Smew Duck"
+  This maintenance release adds support for VBR mode in VP9 rate control
+  interface, new codec controls to get quantization parameters and loop filter
+  levels, and includes several improvements to NEON and numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+    New codec control is added to get quantization parameters and loop filter
+    levels.
+
+    VBR mode is supported in VP9 rate control library.
+
+  - Enhancement:
+    Numerous improvements for Neon optimizations.
+    Code clean-up and refactoring.
+    Calculation of rd multiplier is changed with BDRATE gains.
+
+  - Bug fixes:
+    Fix to overflow on duration.
+    Fix to several instances of -Wunused-but-set-variable.
+    Fix to avoid chroma resampling for 420mpeg2 input.
+    Fix to overflow in calc_iframe_target_size.
+    Fix to disallow skipping transform and quantization.
+    Fix some -Wsign-compare warnings in simple_encode.
+    Fix input file path in simple_encode_test.
+    Fix valid range for under/over_shoot pct.
+
+2021-03-09 v1.10.0 "Ruddy Duck"
+  This maintenance release adds support for darwin20 and new codec controls, as
+  well as numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+    New codec control is added to disable loopfilter for VP9.
+
+    New encoder control is added to disable feature to increase Q on overshoot
+    detection for CBR.
+
+    Configure support for darwin20 is added.
+
+    New codec control is added for VP9 rate control. The control ID of this
+    interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized
+    external rate control model, users will have to implement each callback
+    function in vpx_rc_funcs_t and register them using libvpx API
+    vpx_codec_control_() with the control ID.
+
+  - Enhancement:
+    Use -std=gnu++11 instead of -std=c++11 for c++ files.
+
+  - Bug fixes:
+    Override assembler with --as option of configure for MSVS.
+    Fix several compilation issues with gcc 4.8.5.
+    Fix to resetting rate control for temporal layers.
+    Fix to the rate control stats of SVC example encoder when number of spatial
+    layers is 1.
+    Fix to reusing motion vectors from the base spatial layer in SVC.
+    2 pass related flags removed from SVC example encoder.
+
+2020-07-29 v1.9.0 "Quacking Duck"
+  This release adds support for NV12, a separate library for rate control, as
+  well as incremental improvements.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+    NV12 support is added to this release.
+    A new interface is added for VP9 rate control. The new library libvp9rc.a
+    must be linked by applications.
+    Googletest is updated to v1.10.0.
+    simple_encode.cc is compiled into a new library libsimple_encode.a with
+    CONFIG_RATE_CTRL.
+
+  - Enhancement:
+    Various changes to improve VP9 SVC, rate control, quality and speed to real
+    time encoding.
+
+  - Bug fixes:
+    Fix key frame update refresh simulcast flexible svc.
+    Fix to disable_16x16part speed feature for real time encoding.
+    Fix some signed integer overflows for VP9 rate control.
+    Fix initialization of delta_q_uv.
+    Fix condition in regulate_q for cyclic refresh.
+    Various fixes to dynamic resizing for VP9 SVC.
+
+2019-12-09 v1.8.2 "Pekin Duck"
+  This release collects incremental improvements to many aspects of the library.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+    ARCH_* defines have been removed in favor of VPX_ARCH_*.
+
+2019-07-15 v1.8.1 "Orpington Duck"
+  This release collects incremental improvements to many aspects of the library.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release.
+    VP8E_SET_CPUUSED now accepts values up to 9 for vp9.
+    VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E).
+    The --sdk-path option has been removed. If you were using it to build for
+      Android please read build/make/Android.mk for alternatives.
+    All PPC optimizations have been disabled:
+      https://bugs.chromium.org/p/webm/issues/detail?id=1522.
+
+  - Enhancements:
+    Various changes to improve encoder rate control, quality and speed
+      for practically every use case.
+
+  - Bug fixes:
+    vp9-rtc: Fix color artifacts for speed >= 8.
+
+2019-01-31 v1.8.0 "Northern Shoveler Duck"
+  This release focused on encoding performance for realtime and VOD use cases.
+
+  - Upgrading:
+    This release is ABI incompatible with the previous release. This adds and
+    improves several vp9 controls. Most are related to SVC:
+      VP9E_SET_SVC_FRAME_DROP_LAYER:
+        - Frame dropping in SVC.
+      VP9E_SET_SVC_INTER_LAYER_PRED:
+        - Inter-layer prediction in SVC.
+      VP9E_SET_SVC_GF_TEMPORAL_REF:
+        - Enable long term temporal reference in SVC.
+      VP9E_SET_SVC_REF_FRAME_CONFIG/VP9E_GET_SVC_REF_FRAME_CONFIG:
+        - Extend and improve this control for better flexibility in setting SVC
+          pattern dynamically.
+      VP9E_SET_POSTENCODE_DROP:
+        - Allow for post-encode frame dropping (applies to non-SVC too).
+      VP9E_SET_SVC_SPATIAL_LAYER_SYNC:
+        - Enable spatial layer sync frames.
+      VP9E_SET_SVC_LAYER_ID:
+        - Extend api to specify temporal id for each spatial layers.
+      VP9E_SET_ROI_MAP:
+        - Extend Region of Interest functionality to VP9.
+
+  - Enhancements:
+    2 pass vp9 encoding has improved substantially. When using --auto-alt-ref=6,
+    we see approximately 8% for VBR and 10% for CQ. When using --auto-alt-ref=1,
+    the gains are approximately 4% for VBR and 5% for CQ.
+
+    For real-time encoding, speed 7 has improved by ~5-10%. Encodes targeted at
+    screen sharing have improved when the content changes significantly (slide
+    sharing) or scrolls. There is a new speed 9 setting for mobile devices which
+    is about 10-20% faster than speed 8.
+
+  - Bug fixes:
+    VP9 denoiser issue.
+    VP9 partition issue for 1080p.
+    VP9 rate control improvments.
+    Postprocessing Multi Frame Quality Enhancement (MFQE) issue.
+    VP8 multithread decoder issues.
+    A variety of fuzzing issues.
+
+2018-01-04 v1.7.0 "Mandarin Duck"
+  This release focused on high bit depth performance (10/12 bit) and vp9
+  encoding improvements.
+
+  - Upgrading:
+    This release is ABI incompatible due to new vp9 encoder features.
+
+    Frame parallel decoding for vp9 has been removed.
+
+  - Enhancements:
+    vp9 encoding supports additional threads with --row-mt. This can be greater
+    than the number of tiles.
+
+    Two new vp9 encoder options have been added:
+      --corpus-complexity
+      --tune-content=film
+
+    Additional tooling for respecting the vp9 "level" profiles has been added.
+
+  - Bug fixes:
+    A variety of fuzzing issues.
+    vp8 threading fix for ARM.
+    Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
+    Reject invalid multi resolution configurations.
+
 2017-01-09 v1.6.1 "Long Tailed Duck"
   This release improves upon the VP9 encoder and speeds up the encoding and
   decoding processes.
@@ -272,7 +762,7 @@
       of particular interest to real time streaming applications.
 
       Temporal scalability allows the encoder to produce a stream that can
-      be decimated to different frame rates, with independent rate targetting
+      be decimated to different frame rates, with independent rate targeting
       for each substream.
 
       Multiframe quality enhancement postprocessing can make visual quality
diff --git a/media/libvpx/libvpx/CONTRIBUTING.md b/media/libvpx/libvpx/CONTRIBUTING.md
new file mode 100644
index 0000000000..7a73a30317
--- /dev/null
+++ b/media/libvpx/libvpx/CONTRIBUTING.md
@@ -0,0 +1,29 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose. See the
+[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/)
+for additional details.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README
index 6d29968865..e47aa20ad4 100644
--- a/media/libvpx/libvpx/README
+++ b/media/libvpx/libvpx/README
@@ -1,5 +1,3 @@
-README - 9 January 2017
-
 Welcome to the WebM VP8/VP9 Codec SDK!
 
 COMPILING THE APPLICATIONS/LIBRARIES:
@@ -9,22 +7,31 @@ COMPILING THE APPLICATIONS/LIBRARIES:
 
   1. Prerequisites
 
-    * All x86 targets require the Yasm[1] assembler be installed.
-    * All Windows builds require that Cygwin[2] be installed.
-    * Building the documentation requires Doxygen[3]. If you do not
+    * All x86 targets require the NASM[0] or Yasm[1] assembler be installed[2].
+    * All Windows builds require that Cygwin[3] or MSYS2[4] be installed.
+    * Building the documentation requires Doxygen[5]. If you do not
       have this package, the install-docs option will be disabled.
-    * Downloading the data for the unit tests requires curl[4] and sha1sum.
+    * Downloading the data for the unit tests requires curl[6] and sha1sum.
       sha1sum is provided via the GNU coreutils, installed by default on
       many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
       available, a compatible version of sha1sum can be built from
-      source[5]. These requirements are optional if not running the unit
+      source[7]. These requirements are optional if not running the unit
       tests.
 
+    [0]: https://www.nasm.us/
     [1]: http://www.tortall.net/projects/yasm
-    [2]: http://www.cygwin.com
-    [3]: http://www.doxygen.org
-    [4]: http://curl.haxx.se
-    [5]: http://www.microbrew.org/tools/md5sha1sum/
+    [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the
+         PATH for Visual Studio. For VS2017 it is sufficient to rename
+         yasm-<version>-<arch>.exe to yasm.exe and place it in:
+         Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/
+         The MSYS2 version of the yasm binary can also be used and avoids an
+         issue caused by a missing Visual C++ Redistributable install (Visual
+         Studio 2010, MSVCR100.dll).
+    [3]: http://www.cygwin.com
+    [4]: http://www.msys2.org/
+    [5]: http://www.doxygen.org
+    [6]: http://curl.haxx.se
+    [7]: http://www.microbrew.org/tools/md5sha1sum/
 
   2. Out-of-tree builds
   Out of tree builds are a supported method of building the application. For
@@ -41,7 +48,16 @@ COMPILING THE APPLICATIONS/LIBRARIES:
   used to get a list of supported options:
     $ ../libvpx/configure --help
 
-  4. Cross development
+  4. Compiler analyzers
+  Compilers have added sanitizers which instrument binaries with information
+  about address calculation, memory usage, threading, undefined behavior, and
+  other common errors. To simplify building libvpx with some of these features
+  use tools/set_analyzer_env.sh before running configure. It will set the
+  compiler and necessary flags for building as well as environment variables
+  read by the analyzer when testing the binaries.
+    $ source ../libvpx/tools/set_analyzer_env.sh address
+
+  5. Cross development
   For cross development, the most notable option is the --target option. The
   most up-to-date list of supported targets can be found at the bottom of the
   --help output of the configure script. As of this writing, the list of
@@ -49,19 +65,36 @@ COMPILING THE APPLICATIONS/LIBRARIES:
 
     arm64-android-gcc
     arm64-darwin-gcc
+    arm64-darwin20-gcc
+    arm64-darwin21-gcc
+    arm64-darwin22-gcc
+    arm64-darwin23-gcc
+    arm64-darwin24-gcc
+    arm64-darwin25-gcc
     arm64-linux-gcc
+    arm64-win64-gcc
+    arm64-win64-vs15
+    arm64-win64-vs16
+    arm64-win64-vs16-clangcl
+    arm64-win64-vs17
+    arm64-win64-vs17-clangcl
     armv7-android-gcc
     armv7-darwin-gcc
     armv7-linux-rvct
     armv7-linux-gcc
     armv7-none-rvct
-    armv7-win32-vs11
-    armv7-win32-vs12
+    armv7-win32-gcc
     armv7-win32-vs14
+    armv7-win32-vs15
+    armv7-win32-vs16
+    armv7-win32-vs17
     armv7s-darwin-gcc
     armv8-linux-gcc
+    loongarch32-linux-gcc
+    loongarch64-linux-gcc
     mips32-linux-gcc
     mips64-linux-gcc
+    ppc64le-linux-gcc
     sparc-solaris-gcc
     x86-android-gcc
     x86-darwin8-gcc
@@ -74,16 +107,18 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86-darwin13-gcc
     x86-darwin14-gcc
     x86-darwin15-gcc
+    x86-darwin16-gcc
+    x86-darwin17-gcc
     x86-iphonesimulator-gcc
     x86-linux-gcc
     x86-linux-icc
     x86-os2-gcc
     x86-solaris-gcc
     x86-win32-gcc
-    x86-win32-vs10
-    x86-win32-vs11
-    x86-win32-vs12
     x86-win32-vs14
+    x86-win32-vs15
+    x86-win32-vs16
+    x86-win32-vs17
     x86_64-android-gcc
     x86_64-darwin9-gcc
     x86_64-darwin10-gcc
@@ -92,15 +127,25 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     x86_64-darwin13-gcc
     x86_64-darwin14-gcc
     x86_64-darwin15-gcc
+    x86_64-darwin16-gcc
+    x86_64-darwin17-gcc
+    x86_64-darwin18-gcc
+    x86_64-darwin19-gcc
+    x86_64-darwin20-gcc
+    x86_64-darwin21-gcc
+    x86_64-darwin22-gcc
+    x86_64-darwin23-gcc
+    x86_64-darwin24-gcc
+    x86_64-darwin25-gcc
     x86_64-iphonesimulator-gcc
     x86_64-linux-gcc
     x86_64-linux-icc
     x86_64-solaris-gcc
     x86_64-win64-gcc
-    x86_64-win64-vs10
-    x86_64-win64-vs11
-    x86_64-win64-vs12
     x86_64-win64-vs14
+    x86_64-win64-vs15
+    x86_64-win64-vs16
+    x86_64-win64-vs17
     generic-gnu
 
   The generic-gnu target, in conjunction with the CROSS environment variable,
@@ -113,10 +158,10 @@ COMPILING THE APPLICATIONS/LIBRARIES:
     $ CROSS=mipsel-linux-uclibc- ../libvpx/configure
 
   In addition, the executables to be invoked can be overridden by specifying the
-  environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
-  passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS.
+  environment variables: AR, AS, CC, CXX, LD, STRIP. Additional flags can be
+  passed to these executables with ASFLAGS, CFLAGS, CXXFLAGS, and LDFLAGS.
 
-  5. Configuration errors
+  6. Configuration errors
   If the configuration step fails, the first step is to look in the error log.
   This defaults to config.log. This should give a good indication of what went
   wrong. If not, contact us for support.
@@ -144,7 +189,49 @@ CODE STYLE:
 
   See also: http://clang.llvm.org/docs/ClangFormat.html
 
+PROFILE GUIDED OPTIMIZATION (PGO)
+  Profile Guided Optimization can be enabled for Clang builds using the
+  commands:
+
+  $ export CC=clang
+  $ export CXX=clang++
+  $ ../libvpx/configure  --enable-profile
+  $ make
+
+  Generate one or multiple PGO profile files by running vpxdec or vpxenc. For
+  example:
+
+  $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \
+    -o - > /dev/null
+
+  To convert and merge the raw profile files, use the llvm-profdata tool:
+
+  $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw
+
+  Then, rebuild the project with the new profile file:
+
+  $ make clean
+  $ ../libvpx/configure --use-profile=perf.profdata
+  $ make
+
+  Note: Always use the llvm-profdata from the toolchain that is used for
+  compiling the PGO-enabled binary.
+
+  To observe the improvements from a PGO-enabled build, enable and compare the
+  list of failed optimizations by using the -Rpass-missed compiler flag. For
+  example, to list the failed loop vectorizations:
+
+  $ ../libvpx/configure --use-profile=perf.profdata \
+    --extra-cflags=-Rpass-missed=loop-vectorize
+
+  For guidance on utilizing PGO files to identify potential optimization
+  opportunities, see: tools/README.pgo.md
+
 SUPPORT
   This library is an open source project supported by its community. Please
   email webm-discuss@webmproject.org for help.
 
+BUG REPORTS
+  Bug reports can be filed in the libvpx issue tracker:
+  https://issues.webmproject.org/.
+  For security reports, select 'Security report' from the Template dropdown.
diff --git a/media/libvpx/libvpx/args.c b/media/libvpx/libvpx/args.c
index a87b138b9d..f7dfacf8a4 100644
--- a/media/libvpx/libvpx/args.c
+++ b/media/libvpx/libvpx/args.c
@@ -8,21 +8,23 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include "args.h"
 
 #include "vpx/vpx_integer.h"
-#include "vpx_ports/msvc.h"
 
-#if defined(__GNUC__) && __GNUC__
-extern void die(const char *fmt, ...) __attribute__((noreturn));
+#if defined(__GNUC__)
+__attribute__((noreturn)) extern void die(const char *fmt, ...);
+#elif defined(_MSC_VER)
+__declspec(noreturn) extern void die(const char *fmt, ...);
 #else
 extern void die(const char *fmt, ...);
 #endif
 
-struct arg arg_init(char **argv) {
+static struct arg arg_init(char **argv) {
   struct arg a;
 
   a.argv = argv;
@@ -81,6 +83,7 @@ const char *arg_next(struct arg *arg) {
 
 char **argv_dup(int argc, const char **argv) {
   char **new_argv = malloc((argc + 1) * sizeof(*argv));
+  if (!new_argv) return NULL;
 
   memcpy(new_argv, argv, argc * sizeof(*argv));
   new_argv[argc] = NULL;
@@ -88,24 +91,31 @@ char **argv_dup(int argc, const char **argv) {
 }
 
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
-  char option_text[40] = { 0 };
-
   for (; *defs; defs++) {
     const struct arg_def *def = *defs;
     char *short_val = def->has_val ? " <arg>" : "";
     char *long_val = def->has_val ? "=<arg>" : "";
+    int n = 0;
 
+    // Short options are indented with two spaces. Long options are indented
+    // with 12 spaces.
     if (def->short_name && def->long_name) {
       char *comma = def->has_val ? "," : ",      ";
 
-      snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val,
-               comma, def->long_name, long_val);
+      n = fprintf(fp, "  -%s%s%s --%s%s", def->short_name, short_val, comma,
+                  def->long_name, long_val);
     } else if (def->short_name)
-      snprintf(option_text, 37, "-%s%s", def->short_name, short_val);
+      n = fprintf(fp, "  -%s%s", def->short_name, short_val);
     else if (def->long_name)
-      snprintf(option_text, 37, "          --%s%s", def->long_name, long_val);
+      n = fprintf(fp, "            --%s%s", def->long_name, long_val);
 
-    fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
+    // Descriptions are indented with 40 spaces. If an option is 40 characters
+    // or longer, its description starts on the next line.
+    if (n < 40)
+      for (int i = 0; i < 40 - n; i++) fputc(' ', fp);
+    else
+      fputs("\n                                        ", fp);
+    fprintf(fp, "%s\n", def->desc);
 
     if (def->enums) {
       const struct arg_enum_list *listptr;
@@ -132,7 +142,6 @@ unsigned int arg_parse_uint(const struct arg *arg) {
   }
 
   die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-  return 0;
 }
 
 int arg_parse_int(const struct arg *arg) {
@@ -149,7 +158,6 @@ int arg_parse_int(const struct arg *arg) {
   }
 
   die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
-  return 0;
 }
 
 struct vpx_rational {
@@ -206,7 +214,6 @@ int arg_parse_enum(const struct arg *arg) {
     if (!strcmp(arg->val, listptr->name)) return listptr->val;
 
   die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
-  return 0;
 }
 
 int arg_parse_enum_or_int(const struct arg *arg) {
diff --git a/media/libvpx/libvpx/args.h b/media/libvpx/libvpx/args.h
index 54abe04607..6b3cc1c28b 100644
--- a/media/libvpx/libvpx/args.h
+++ b/media/libvpx/libvpx/args.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef ARGS_H_
-#define ARGS_H_
+#ifndef VPX_ARGS_H_
+#define VPX_ARGS_H_
 #include <stdio.h>
 
 #ifdef __cplusplus
@@ -28,8 +28,7 @@ struct arg_enum_list {
   const char *name;
   int val;
 };
-#define ARG_ENUM_LIST_END \
-  { 0 }
+#define ARG_ENUM_LIST_END { 0 }
 
 typedef struct arg_def {
   const char *short_name;
@@ -38,19 +37,16 @@ typedef struct arg_def {
   const char *desc;
   const struct arg_enum_list *enums;
 } arg_def_t;
-#define ARG_DEF(s, l, v, d) \
-  { s, l, v, d, NULL }
-#define ARG_DEF_ENUM(s, l, v, d, e) \
-  { s, l, v, d, e }
-#define ARG_DEF_LIST_END \
-  { 0 }
+#define ARG_DEF(s, l, v, d) { s, l, v, d, NULL }
+#define ARG_DEF_ENUM(s, l, v, d, e) { s, l, v, d, e }
+#define ARG_DEF_LIST_END { 0 }
 
-struct arg arg_init(char **argv);
 int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
 const char *arg_next(struct arg *arg);
 void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
 char **argv_dup(int argc, const char **argv);
 
+// Note: arg_match() must be called before invoking these functions.
 unsigned int arg_parse_uint(const struct arg *arg);
 int arg_parse_int(const struct arg *arg);
 struct vpx_rational arg_parse_rational(const struct arg *arg);
@@ -60,4 +56,4 @@ int arg_parse_enum_or_int(const struct arg *arg);
 }  // extern "C"
 #endif
 
-#endif  // ARGS_H_
+#endif  // VPX_ARGS_H_
diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk
index a88f90056e..3663c38305 100644
--- a/media/libvpx/libvpx/build/make/Android.mk
+++ b/media/libvpx/libvpx/build/make/Android.mk
@@ -8,17 +8,15 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
 #
 # This file is to be used for compiling libvpx for Android using the NDK.
 # In an Android project place a libvpx checkout in the jni directory.
 # Run the configure script from the jni directory.  Base libvpx
 # encoder/decoder configuration will look similar to:
-# ./libvpx/configure --target=armv7-android-gcc --disable-examples \
-#                    --sdk-path=/opt/android-ndk-r6b/
-#
-# When targeting Android, realtime-only is enabled by default.  This can
-# be overridden by adding the command line flag:
-#  --disable-realtime-only
+# ./libvpx/configure --target=arm64-android-gcc --disable-examples \
+#                    --enable-external-build
 #
 # This will create .mk files that contain variables that contain the
 # source files to compile.
@@ -27,39 +25,25 @@
 # Android.mk file in the libvpx directory:
 # LOCAL_PATH := $(call my-dir)
 # include $(CLEAR_VARS)
-# include jni/libvpx/build/make/Android.mk
+# include libvpx/build/make/Android.mk
 #
-# By default libvpx will detect at runtime the existance of NEON extension.
-# For this we import the 'cpufeatures' module from the NDK sources.
-# libvpx can also be configured without this runtime detection method.
-# Configuring with --disable-runtime-cpu-detect will assume presence of NEON.
-# Configuring with --disable-runtime-cpu-detect --disable-neon \
-#     --disable-neon-asm
-# will remove any NEON dependency.
+# By default libvpx will use the 'cpufeatures' module from the NDK. This allows
+# the library to be built with all available optimizations (SSE2->AVX512 for
+# x86, NEON for arm, DSPr2 for mips). This can be disabled with
+#   --disable-runtime-cpu-detect
+# but the resulting library *must* be run on devices supporting all of the
+# enabled extensions. They can be disabled individually with
+#   --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512}
+#   --disable-neon{, -asm, -neon-dotprod, -neon-i8mm}
+#   --disable-sve
+#   --disable-{dspr2, msa}
 
 #
-# Running ndk-build will build libvpx and include it in your project.
+# Running ndk-build will build libvpx and include it in your project. Set
+# APP_ABI to match the --target passed to configure:
+# https://developer.android.com/ndk/guides/application_mk#app_abi.
 #
 
-# Alternatively, building the examples and unit tests can be accomplished in the
-# following way:
-#
-# Create a standalone toolchain from the NDK:
-# https://developer.android.com/ndk/guides/standalone_toolchain.html
-#
-# For example - to test on arm64 devices with clang:
-# $NDK/build/tools/make_standalone_toolchain.py \
-#   --arch arm64 --install-dir=/tmp/my-android-toolchain
-# export PATH=/tmp/my-android-toolchain/bin:$PATH
-# CROSS=aarch64-linux-android- CC=clang CXX=clang++ /path/to/libvpx/configure \
-#   --target=arm64-android-gcc
-#
-# Push the resulting binaries to a device and run them:
-# adb push test_libvpx /data/tmp/test_libvpx
-# adb shell /data/tmp/test_libvpx --gtest_filter=\*Sixtap\*
-#
-# Make sure to push the test data as well and set LIBVPX_TEST_DATA
-
 CONFIG_DIR := $(LOCAL_PATH)/
 LIBVPX_PATH := $(LOCAL_PATH)/libvpx
 ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas
@@ -183,6 +167,9 @@ LOCAL_CFLAGS += \
     -I$(ASM_CNV_PATH)/libvpx
 
 LOCAL_MODULE := libvpx
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
   LOCAL_STATIC_LIBRARIES := cpufeatures
@@ -226,3 +213,4 @@ endif
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
 $(call import-module,android/cpufeatures)
 endif
+endif  # NDK_ROOT
diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile
index cba605786c..39dff825b6 100644
--- a/media/libvpx/libvpx/build/make/Makefile
+++ b/media/libvpx/libvpx/build/make/Makefile
@@ -21,9 +21,9 @@ all: .DEFAULT
 clean:: .DEFAULT
 exampletest: .DEFAULT
 install:: .DEFAULT
-test:: .DEFAULT
-test-no-data-check:: .DEFAULT
-testdata:: .DEFAULT
+test: .DEFAULT
+test-no-data-check: .DEFAULT
+testdata: .DEFAULT
 utiltest: .DEFAULT
 exampletest-no-data-check utiltest-no-data-check: .DEFAULT
 test_%: .DEFAULT ;
@@ -99,6 +99,7 @@ distclean: clean
       rm -f Makefile; \
       rm -f config.log config.mk; \
       rm -f vpx_config.[hc] vpx_config.asm; \
+      rm -f arm_neon.h; \
     else \
       rm -f $(target)-$(TOOLCHAIN).mk; \
     fi
@@ -110,13 +111,13 @@ exampletest:
 .PHONY: install
 install::
 .PHONY: test
-test::
+test:
 .PHONY: testdata
-testdata::
+testdata:
 .PHONY: utiltest
 utiltest:
 .PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
-test-no-data-check::
+test-no-data-check:
 exampletest-no-data-check utiltest-no-data-check:
 
 # Force to realign stack always on OS/2
@@ -124,6 +125,7 @@ ifeq ($(TOOLCHAIN), x86-os2-gcc)
 CFLAGS += -mstackrealign
 endif
 
+# x86[_64]
 $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
 $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
 $(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
@@ -138,6 +140,32 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
 $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
 $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl
+
+# AARCH64
+$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod
+$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod
+$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm
+$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm
+$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve
+$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+i8mm+sve2
+$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+i8mm+sve2
+
+# POWER
+$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx
+$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx
+
+# MIPS
+$(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa
+$(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa
+
+# LOONGARCH
+$(BUILD_PFX)%_lsx.c.d:  CFLAGS += -mlsx
+$(BUILD_PFX)%_lsx.c.o:  CFLAGS += -mlsx
+$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx
+$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
@@ -286,6 +314,19 @@ $(1):
 	$(qexec)$$(AR) $$(ARFLAGS) $$@ $$^
 endef
 
+# Don't use -Wl,-z,defs with Clang's sanitizers.
+#
+# Clang's AddressSanitizer documentation says "When linking shared libraries,
+# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link
+# errors (don't use it with AddressSanitizer)." See
+# https://clang.llvm.org/docs/AddressSanitizer.html#usage.
+NO_UNDEFINED := -Wl,-z,defs
+ifeq ($(findstring clang,$(CC)),clang)
+    ifneq ($(filter -fsanitize=%,$(LDFLAGS)),)
+        NO_UNDEFINED :=
+    endif
+endif
+
 define so_template
 # Not using a pattern rule here because we don't want to generate empty
 # archives when they are listed as a dependency in files not responsible
@@ -295,7 +336,8 @@ define so_template
 $(1):
 	$(if $(quiet),@echo "    [LD] $$@")
 	$(qexec)$$(LD) -shared $$(LDFLAGS) \
-            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+            $(NO_UNDEFINED) \
+            -Wl,-soname,$$(SONAME) \
             -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
             $$(filter %.o,$$^) $$(extralibs)
 endef
@@ -422,10 +464,10 @@ ifneq ($(call enabled,DIST-SRCS),)
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
     DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
-    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas_apple.pl
-    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2armasm_ms.pl
-    DIST-SRCS-$(ARCH_ARM)    += build/make/thumb.pm
+    DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2gas.pl
+    DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2gas_apple.pl
+    DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2armasm_ms.pl
+    DIST-SRCS-$(VPX_ARCH_ARM) += build/make/thumb.pm
     DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
 endif
 INSTALL-SRCS := $(call cond_enabled,CONFIG_INSTALL_SRCS,INSTALL-SRCS)
@@ -447,6 +489,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins
 all: $(BUILD_TARGETS)
 install:: $(INSTALL_TARGETS)
 dist: $(INSTALL_TARGETS)
-test::
+test:
 
 .SUFFIXES:  # Delete default suffix rules
diff --git a/media/libvpx/libvpx/build/make/ads2armasm_ms.pl b/media/libvpx/libvpx/build/make/ads2armasm_ms.pl
index 2a2c470ff8..dd4e0318c4 100644
--- a/media/libvpx/libvpx/build/make/ads2armasm_ms.pl
+++ b/media/libvpx/libvpx/build/make/ads2armasm_ms.pl
@@ -28,7 +28,7 @@ while (<STDIN>)
     s/qsubaddx/qsax/i;
     s/qaddsubx/qasx/i;
 
-    thumb::FixThumbInstructions($_, 1);
+    thumb::FixThumbInstructions($_);
 
     s/ldrneb/ldrbne/i;
     s/ldrneh/ldrhne/i;
diff --git a/media/libvpx/libvpx/build/make/ads2gas.pl b/media/libvpx/libvpx/build/make/ads2gas.pl
index 029cc4a56f..c301b7f829 100644
--- a/media/libvpx/libvpx/build/make/ads2gas.pl
+++ b/media/libvpx/libvpx/build/make/ads2gas.pl
@@ -23,16 +23,17 @@ use lib $FindBin::Bin;
 use thumb;
 
 my $thumb = 0;
+my $elf = 1;
 
 foreach my $arg (@ARGV) {
     $thumb = 1 if ($arg eq "-thumb");
+    $elf = 0 if ($arg eq "-noelf");
 }
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
-print "\t.equ DO1STROUNDING, 0\n";
+print ".syntax unified\n";
 if ($thumb) {
-    print "\t.syntax unified\n";
     print "\t.thumb\n";
 }
 
@@ -41,39 +42,11 @@ if ($thumb) {
 
 while (<STDIN>)
 {
-    undef $comment;
-    undef $line;
-    $comment_char = ";";
-    $comment_sub = "@";
-
-    # Handle comments.
-    if (/$comment_char/)
-    {
-      $comment = "";
-      ($line, $comment) = /(.*?)$comment_char(.*)/;
-      $_ = $line;
-    }
-
     # Load and store alignment
     s/@/,:/g;
 
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
+    # Comment character
+    s/;/@/;
 
     # Convert ELSE to .else
     s/\bELSE\b/.else/g;
@@ -81,128 +54,83 @@ while (<STDIN>)
     # Convert ENDIF to .endif
     s/\bENDIF\b/.endif/g;
 
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert endfunc to nothing.
-    s/\bendfunc\b//ig;
-
-    # Convert FUNCTION to nothing.
-    s/\bFUNCTION\b//g;
-    s/\bfunction\b//g;
-
-    s/\bENTRY\b//g;
-    s/\bMSARMASM\b/0/g;
-    s/^\s+end\s+$//g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
     # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
+    if (s/\bIF\b/.if/g) {
         s/=+/==/g;
     }
 
     # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
+    s/INCLUDE\s?(.*)$/.include \"$1\"/;
 
     # No AREA required
     # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/;
     # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
+    s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/;
 
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
+    # Make function visible to linker.
+    if ($elf) {
+        s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/;
+    } else {
+        s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/;
+    }
 
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
-
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
-
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
-    s/IMPORT\s+\|([\$\w]*)\|/.global $1/;
-
-    s/EXPORT\s+([\$\w]*)/.global $1/;
-    s/export\s+([\$\w]*)/.global $1/;
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
+    # No vertical bars on function names
+    s/^\|(\$?\w+)\|/$1/g;
 
     # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
     s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
 
     # ALIGN directive
     s/\bALIGN\b/.balign/g;
 
     if ($thumb) {
-        # ARM code - we force everything to thumb with the declaration in the header
-        s/\sARM//g;
+        # ARM code - we force everything to thumb with the declaration in the
+        # header
+        s/\bARM\b//g;
     } else {
         # ARM code
-        s/\sARM/.arm/g;
+        s/\bARM\b/.arm/g;
     }
 
     # push/pop
     s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
     s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
 
-    # NEON code
-    s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
-    s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
-
     if ($thumb) {
-        thumb::FixThumbInstructions($_, 0);
+        thumb::FixThumbInstructions($_);
     }
 
     # eabi_attributes numerical equivalents can be found in the
     # "ARM IHI 0045C" document.
 
-    # REQUIRE8 Stack is required to be 8-byte aligned
-    s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
+    if ($elf) {
+        # REQUIRE8 Stack is required to be 8-byte aligned
+        s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
 
-    # PRESERVE8 Stack 8-byte align is preserved
-    s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+        # PRESERVE8 Stack 8-byte align is preserved
+        s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+    } else {
+        s/\bREQUIRE8\b//;
+        s/\bPRESERVE8\b//;
+    }
 
     # Use PROC and ENDP to give the symbols a .size directive.
     # This makes them show up properly in debugging tools like gdb and valgrind.
-    if (/\bPROC\b/)
-    {
+    if (/\bPROC\b/) {
         my $proc;
-        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        # Match the function name so it can be stored in $proc
+        /^([\.0-9A-Z_a-z]\w+)\b/;
         $proc = $1;
         push(@proc_stack, $proc) if ($proc);
         s/\bPROC\b/@ $&/;
     }
-    if (/\bENDP\b/)
-    {
+
+    if (/\bENDP\b/) {
         my $proc;
         s/\bENDP\b/@ $&/;
         $proc = pop(@proc_stack);
-        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+        $_ = ".size $proc, .-$proc".$_ if ($proc and $elf);
     }
 
     # EQU directive
@@ -210,19 +138,20 @@ while (<STDIN>)
 
     # Begin macro definition
     if (/\bMACRO\b/) {
+        # Process next line down, which will be the macro definition
         $_ = <STDIN>;
         s/^/.macro/;
-        s/\$//g;                # remove formal param reference
-        s/;/@/g;                # change comment characters
+        s/\$//g;             # Remove $ from the variables in the declaration
     }
 
-    # For macros, use \ to reference formal params
-    s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    s/\$/\\/g;               # Use \ to reference formal parameters
+    # End macro definition
+
+    s/\bMEND\b/.endm/;       # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
+    s/[ \t]+$//;
     print;
-    print "$comment_sub$comment\n" if defined $comment;
 }
 
 # Mark that this object doesn't need an executable stack.
-printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
+printf ("    .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf;
diff --git a/media/libvpx/libvpx/build/make/ads2gas_apple.pl b/media/libvpx/libvpx/build/make/ads2gas_apple.pl
index e1ae7b4f87..62491c1918 100644
--- a/media/libvpx/libvpx/build/make/ads2gas_apple.pl
+++ b/media/libvpx/libvpx/build/make/ads2gas_apple.pl
@@ -20,19 +20,14 @@
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas_apple.pl script.\n\n";
-print "\t.set WIDE_REFERENCE, 0\n";
-print "\t.set ARCHITECTURE, 5\n";
-print "\t.set DO1STROUNDING, 0\n";
+print ".syntax unified\n";
 
-my %register_aliases;
 my %macro_aliases;
 
 my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9");
 
 my @incoming_array;
 
-my @imported_functions;
-
 # Perl trim function to remove whitespace from the start and end of the string
 sub trim($)
 {
@@ -48,25 +43,7 @@ while (<STDIN>)
     s/@/,:/g;
 
     # Comment character
-    s/;/ @/g;
-
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
+    s/;/@/;
 
     # Convert ELSE to .else
     s/\bELSE\b/.else/g;
@@ -74,131 +51,64 @@ while (<STDIN>)
     # Convert ENDIF to .endif
     s/\bENDIF\b/.endif/g;
 
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
     # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
-        s/=/==/g;
+    if (s/\bIF\b/.if/g) {
+        s/=+/==/g;
     }
 
     # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
+    s/INCLUDE\s?(.*)$/.include \"$1\"/;
 
     # No AREA required
     # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/;
     # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
+    s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/;
 
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
+    # Make function visible to linker.
+    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/;
 
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
+    # No vertical bars on function names
+    s/^\|(\$?\w+)\|/$1/g;
 
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
+    # Labels and functions need a leading underscore and trailing colon
+    s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/;
 
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
-
-    # Prepend imported functions with _
-    if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
-    {
-        $function = trim($1);
-        push(@imported_functions, $function);
-    }
-
-    foreach $function (@imported_functions)
-    {
-        s/$function/_$function/;
-    }
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
-
-    # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
-    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
+    # Branches need to call the correct, underscored, function
+    s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/;
 
     # ALIGN directive
     s/\bALIGN\b/.balign/g;
 
     # Strip ARM
-    s/\sARM/@ ARM/g;
+    s/\s+ARM//;
 
     # Strip REQUIRE8
-    #s/\sREQUIRE8/@ REQUIRE8/g;
-    s/\sREQUIRE8/@ /g;
+    s/\s+REQUIRE8//;
 
     # Strip PRESERVE8
-    s/\sPRESERVE8/@ PRESERVE8/g;
+    s/\s+PRESERVE8//;
 
     # Strip PROC and ENDPROC
-    s/\bPROC\b/@/g;
-    s/\bENDP\b/@/g;
+    s/\bPROC\b//g;
+    s/\bENDP\b//g;
 
     # EQU directive
-    s/(.*)EQU(.*)/.set $1, $2/;
+    s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
 
     # Begin macro definition
-    if (/\bMACRO\b/)
-    {
+    if (/\bMACRO\b/) {
         # Process next line down, which will be the macro definition
         $_ = <STDIN>;
-
-        $trimmed = trim($_);
-
-        # remove commas that are separating list
-        $trimmed =~ s/,//g;
-
-        # string to array
-        @incoming_array = split(/\s+/, $trimmed);
-
-        print ".macro @incoming_array[0]\n";
-
-        # remove the first element, as that is the name of the macro
-        shift (@incoming_array);
-
-        @macro_aliases{@incoming_array} = @mapping_list;
-
-        next;
+        s/^/.macro/;
+        s/\$//g;             # Remove $ from the variables in the declaration
     }
 
-    while (($key, $value) = each(%macro_aliases))
-    {
-        $key =~ s/\$/\\\$/;
-        s/$key\b/$value/g;
-    }
+    s/\$/\\/g;               # Use \ to reference formal parameters
+    # End macro definition
 
-    # For macros, use \ to reference formal params
-#   s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    s/\bMEND\b/.endm/;       # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
-
+    s/[ \t]+$//;
     print;
 }
diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh
index 007e020002..21407f3d89 100644
--- a/media/libvpx/libvpx/build/make/configure.sh
+++ b/media/libvpx/libvpx/build/make/configure.sh
@@ -74,6 +74,8 @@ Build options:
   --cpu=CPU                   optimize for a specific cpu rather than a family
   --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
   --extra-cxxflags=ECXXFLAGS  add ECXXFLAGS to CXXFLAGS [$CXXFLAGS]
+  --use-profile=PROFILE_FILE
+                              Use PROFILE_FILE for PGO
   ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
   ${toggle_werror}            treat warnings as errors, if possible
                               (not available with all compilers)
@@ -81,6 +83,7 @@ Build options:
   ${toggle_pic}               turn on/off Position Independent Code
   ${toggle_ccache}            turn on/off compiler cache
   ${toggle_debug}             enable/disable debug mode
+  ${toggle_profile}           enable/disable profiling
   ${toggle_gprof}             enable/disable gprof profiling instrumentation
   ${toggle_gcov}              enable/disable gcov coverage instrumentation
   ${toggle_thumb}             enable/disable building arm assembly in thumb mode
@@ -262,6 +265,9 @@ if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
   source_path="`pwd`"
   disable_feature source_path_used
 fi
+# Makefiles greedily process the '#' character as a comment, even if it is
+# inside quotes. So, this character must be escaped in all paths in Makefiles.
+source_path_mk=$(echo $source_path | sed -e 's;\#;\\\#;g')
 
 if test ! -z "$TMPDIR" ; then
   TMPDIRx="${TMPDIR}"
@@ -319,6 +325,12 @@ check_ld() {
     && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
 }
 
+check_lib() {
+  log check_lib "$@"
+  check_cc $@ \
+    && check_cmd ${LD} ${LDFLAGS} -o ${TMP_X} ${TMP_O} "$@" ${extralibs}
+}
+
 check_header(){
   log check_header "$@"
   header=$1
@@ -403,6 +415,90 @@ check_gcc_machine_option() {
   fi
 }
 
+# tests for -m$2, -m$3, -m$4... toggling the feature given in $1.
+check_gcc_machine_options() {
+  feature="$1"
+  shift
+  flags="-m$1"
+  shift
+  for opt in $*; do
+    flags="$flags -m$opt"
+  done
+
+  if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then
+    RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
+  else
+    soft_enable "$feature"
+  fi
+}
+
+check_neon_sve_bridge_compiles() {
+  if enabled sve; then
+    check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF
+#ifndef __ARM_NEON_SVE_BRIDGE
+#error 1
+#endif
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+EOF
+    compile_result=$?
+    if [ ${compile_result} -eq 0 ]; then
+      # Check whether the compiler can compile SVE functions that require
+      # backup/restore of SVE registers according to AAPCS. Clang for Windows
+      # used to fail this, see
+      # https://github.com/llvm/llvm-project/issues/80009.
+      check_cc -march=armv8.2-a+dotprod+i8mm+sve <<EOF
+#include <arm_sve.h>
+void other(void);
+svfloat32_t func(svfloat32_t a) {
+  other();
+  return a;
+}
+EOF
+      compile_result=$?
+    fi
+
+    if [ ${compile_result} -ne 0 ]; then
+      log_echo "  disabling sve: arm_neon_sve_bridge.h not supported by compiler"
+      log_echo "  disabling sve2: arm_neon_sve_bridge.h not supported by compiler"
+      disable_feature sve
+      disable_feature sve2
+      RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 "
+    fi
+  fi
+}
+
+check_gcc_avx512_compiles() {
+  if disabled gcc; then
+    return
+  fi
+
+  check_cc -mavx512f <<EOF
+#include <immintrin.h>
+void f(void) {
+  __m512i x = _mm512_set1_epi16(0);
+  (void)x;
+}
+EOF
+  compile_result=$?
+  if [ ${compile_result} -ne 0 ]; then
+    log_echo "    disabling avx512: not supported by compiler"
+    disable_feature avx512
+    RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 "
+  fi
+}
+
+check_inline_asm() {
+  log check_inline_asm "$@"
+  name="$1"
+  code="$2"
+  shift 2
+  disable_feature $name
+  check_cc "$@" <<EOF && enable_feature $name
+void foo(void) { __asm__ volatile($code); }
+EOF
+}
+
 write_common_config_banner() {
   print_webm_license config.mk "##" ""
   echo '# This file automatically generated by configure. Do not edit!' >> config.mk
@@ -438,11 +534,11 @@ write_common_target_config_mk() {
 
   cat >> $1 << EOF
 # This file automatically generated by configure. Do not edit!
-SRC_PATH="$source_path"
-SRC_PATH_BARE=$source_path
+SRC_PATH="$source_path_mk"
+SRC_PATH_BARE=$source_path_mk
 BUILD_PFX=${BUILD_PFX}
 TOOLCHAIN=${toolchain}
-ASM_CONVERSION=${asm_conversion_cmd:-${source_path}/build/make/ads2gas.pl}
+ASM_CONVERSION=${asm_conversion_cmd:-${source_path_mk}/build/make/ads2gas.pl}
 GEN_VCPROJ=${gen_vcproj_cmd}
 MSVS_ARCH_DIR=${msvs_arch_dir}
 
@@ -452,7 +548,6 @@ AR=${AR}
 LD=${LD}
 AS=${AS}
 STRIP=${STRIP}
-NM=${NM}
 
 CFLAGS  = ${CFLAGS}
 CXXFLAGS  = ${CXXFLAGS}
@@ -464,6 +559,8 @@ AS_SFX    = ${AS_SFX:-.asm}
 EXE_SFX   = ${EXE_SFX}
 VCPROJ_SFX = ${VCPROJ_SFX}
 RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
+LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
 EOF
 
   if enabled rvct; then cat >> $1 << EOF
@@ -474,10 +571,10 @@ fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
 EOF
   fi
 
-  print_config_mk ARCH   "${1}" ${ARCH_LIST}
-  print_config_mk HAVE   "${1}" ${HAVE_LIST}
-  print_config_mk CONFIG "${1}" ${CONFIG_LIST}
-  print_config_mk HAVE   "${1}" gnu_strip
+  print_config_mk VPX_ARCH "${1}" ${ARCH_LIST}
+  print_config_mk HAVE     "${1}" ${HAVE_LIST}
+  print_config_mk CONFIG   "${1}" ${CONFIG_LIST}
+  print_config_mk HAVE     "${1}" gnu_strip
 
   enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
 
@@ -494,15 +591,33 @@ write_common_target_config_h() {
 #define RESTRICT    ${RESTRICT}
 #define INLINE      ${INLINE}
 EOF
-  print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
-  print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}
-  print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
-  print_config_vars_h   "${TMP_H}" ${VAR_LIST}
+  print_config_h VPX_ARCH "${TMP_H}" ${ARCH_LIST}
+  print_config_h HAVE     "${TMP_H}" ${HAVE_LIST}
+  print_config_h CONFIG   "${TMP_H}" ${CONFIG_LIST}
+  print_config_vars_h     "${TMP_H}" ${VAR_LIST}
   echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
   mkdir -p `dirname "$1"`
   cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
 }
 
+write_win_arm64_neon_h_workaround() {
+  print_webm_license ${TMP_H} "/*" " */"
+  cat >> ${TMP_H} << EOF
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_WIN_ARM_NEON_H_WORKAROUND
+#define VPX_WIN_ARM_NEON_H_WORKAROUND
+/* The Windows SDK has arm_neon.h, but unlike on other platforms it is
+ * ARM32-only. ARM64 NEON support is provided by arm64_neon.h, a proper
+ * superset of arm_neon.h. Work around this by providing a more local
+ * arm_neon.h that simply #includes arm64_neon.h.
+ */
+#include <arm64_neon.h>
+#endif /* VPX_WIN_ARM_NEON_H_WORKAROUND */
+EOF
+  mkdir -p `dirname "$1"`
+  cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
+}
+
 process_common_cmdline() {
   for opt in "$@"; do
     optval="${opt#*=}"
@@ -534,6 +649,9 @@ process_common_cmdline() {
       --extra-cxxflags=*)
         extra_cxxflags="${optval}"
         ;;
+      --use-profile=*)
+        pgo_file=${optval}
+        ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
         if is_in ${option} ${ARCH_EXT_LIST}; then
@@ -585,11 +703,7 @@ process_common_cmdline() {
       --libdir=*)
         libdir="${optval}"
         ;;
-      --sdk-path=*)
-        [ -d "${optval}" ] || die "Not a directory: ${optval}"
-        sdk_path="${optval}"
-        ;;
-      --libc|--as|--prefix|--libdir|--sdk-path)
+      --libc|--as|--prefix|--libdir)
         die "Option ${opt} requires argument"
         ;;
       --help|-h)
@@ -634,7 +748,6 @@ setup_gnu_toolchain() {
   LD=${LD:-${CROSS}${link_with_cc:-ld}}
   AS=${AS:-${CROSS}as}
   STRIP=${STRIP:-${CROSS}strip}
-  NM=${NM:-${CROSS}nm}
   AS_SFX=.S
   EXE_SFX=
 }
@@ -674,7 +787,6 @@ check_xcode_minimum_version() {
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
     # detect tgt_isa
     case "$gcctarget" in
       aarch64*)
@@ -697,37 +809,39 @@ process_common_toolchain() {
       *sparc*)
         tgt_isa=sparc
         ;;
+      power*64le*-*)
+        tgt_isa=ppc64le
+        ;;
+      *mips64el*)
+        tgt_isa=mips64
+        ;;
+      *mips32el*)
+        tgt_isa=mips32
+        ;;
+      loongarch32*)
+        tgt_isa=loongarch32
+        ;;
+      loongarch64*)
+        tgt_isa=loongarch64
+        ;;
     esac
 
     # detect tgt_os
     case "$gcctarget" in
-      *darwin10*)
+      *darwin1[0-9]*)
         tgt_isa=x86_64
-        tgt_os=darwin10
+        tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
         ;;
-      *darwin11*)
-        tgt_isa=x86_64
-        tgt_os=darwin11
-        ;;
-      *darwin12*)
-        tgt_isa=x86_64
-        tgt_os=darwin12
-        ;;
-      *darwin13*)
-        tgt_isa=x86_64
-        tgt_os=darwin13
-        ;;
-      *darwin14*)
-        tgt_isa=x86_64
-        tgt_os=darwin14
-        ;;
-      *darwin15*)
-        tgt_isa=x86_64
-        tgt_os=darwin15
+      *darwin2[0-5]*)
+        tgt_isa=`uname -m`
+        tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
         ;;
       x86_64*mingw32*)
         tgt_os=win64
         ;;
+      x86_64*cygwin*)
+        tgt_os=win64
+        ;;
       *mingw32*|*cygwin*)
         [ -z "$tgt_isa" ] && tgt_isa=x86
         tgt_os=win32
@@ -769,16 +883,34 @@ process_common_toolchain() {
 
   # Enable the architecture family
   case ${tgt_isa} in
+    arm64 | armv8)
+      enable_feature arm
+      enable_feature aarch64
+      ;;
     arm*)
       enable_feature arm
       ;;
     mips*)
       enable_feature mips
       ;;
+    ppc*)
+      enable_feature ppc
+      ;;
+    loongarch*)
+      soft_enable lsx
+      soft_enable lasx
+      enable_feature loongarch
+      ;;
   esac
 
-  # PIC is probably what we want when building shared libs
+  # Position independent code (PIC) is probably what we want when building
+  # shared libs or position independent executable (PIE) targets.
   enabled shared && soft_enable pic
+  check_cpp << EOF || soft_enable pic
+#if !(__pie__ || __PIE__)
+#error Neither __pie__ or __PIE__ are set
+#endif
+EOF
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
   # Shared library framework builds are only possible on iOS 8 and later.
@@ -787,13 +919,13 @@ process_common_toolchain() {
     IOS_VERSION_MIN="8.0"
   else
     IOS_VERSION_OPTIONS=""
-    IOS_VERSION_MIN="6.0"
+    IOS_VERSION_MIN="7.0"
   fi
 
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
   case ${toolchain} in
-    arm*-darwin*)
+    arm*-darwin-*)
       add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
       iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)"
       if [ -d "${iphoneos_sdk_dir}" ]; then
@@ -801,7 +933,7 @@ process_common_toolchain() {
         add_ldflags "-isysroot ${iphoneos_sdk_dir}"
       fi
       ;;
-    x86*-darwin*)
+    *-darwin*)
       osx_sdk_dir="$(show_darwin_sdk_path macosx)"
       if [ -d "${osx_sdk_dir}" ]; then
         add_cflags  "-isysroot ${osx_sdk_dir}"
@@ -843,6 +975,26 @@ process_common_toolchain() {
       add_cflags  "-mmacosx-version-min=10.11"
       add_ldflags "-mmacosx-version-min=10.11"
       ;;
+    *-darwin16-*)
+      add_cflags  "-mmacosx-version-min=10.12"
+      add_ldflags "-mmacosx-version-min=10.12"
+      ;;
+    *-darwin17-*)
+      add_cflags  "-mmacosx-version-min=10.13"
+      add_ldflags "-mmacosx-version-min=10.13"
+      ;;
+    *-darwin18-*)
+      add_cflags  "-mmacosx-version-min=10.14"
+      add_ldflags "-mmacosx-version-min=10.14"
+      ;;
+    *-darwin19-*)
+      add_cflags  "-mmacosx-version-min=10.15"
+      add_ldflags "-mmacosx-version-min=10.15"
+      ;;
+    *-darwin2[0-5]-*)
+      add_cflags  "-arch ${toolchain%%-*}"
+      add_ldflags "-arch ${toolchain%%-*}"
+      ;;
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
       add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
@@ -864,34 +1016,36 @@ process_common_toolchain() {
       ;;
   esac
 
-  # Process ARM architecture variants
+  # Process architecture variants
   case ${toolchain} in
     arm*)
-      # on arm, isa versions are supersets
-      case ${tgt_isa} in
-        arm64|armv8)
-          soft_enable neon
+      case ${toolchain} in
+        armv7*-darwin*)
+          # Runtime cpu detection is not defined for these targets.
+          enabled runtime_cpu_detect && disable_feature runtime_cpu_detect
           ;;
-        armv7|armv7s)
-          soft_enable neon
-          # Only enable neon_asm when neon is also enabled.
-          enabled neon && soft_enable neon_asm
-          # If someone tries to force it through, die.
-          if disabled neon && enabled neon_asm; then
-            die "Disabling neon while keeping neon-asm is not supported"
-          fi
+        *)
+          soft_enable runtime_cpu_detect
           ;;
       esac
 
-      asm_conversion_cmd="cat"
+      if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+        soft_enable neon
+        # Only enable neon_asm when neon is also enabled.
+        enabled neon && soft_enable neon_asm
+        # If someone tries to force it through, die.
+        if disabled neon && enabled neon_asm; then
+          die "Disabling neon while keeping neon-asm is not supported"
+        fi
+      fi
 
+      asm_conversion_cmd="cat"
       case ${tgt_cc} in
         gcc)
           link_with_cc=gcc
           setup_gnu_toolchain
           arch_int=${tgt_isa##armv}
           arch_int=${arch_int%%te}
-          check_add_asflags --defsym ARCHITECTURE=${arch_int}
           tune_cflags="-mtune="
           if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
             if [ -z "${float_abi}" ]; then
@@ -917,7 +1071,17 @@ EOF
           fi
 
           enabled debug && add_asflags -g
-          asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+          asm_conversion_cmd="${source_path_mk}/build/make/ads2gas.pl"
+
+          case ${tgt_os} in
+            win*)
+              asm_conversion_cmd="$asm_conversion_cmd -noelf"
+              AS="$CC -c"
+              EXE_SFX=.exe
+              enable_feature thumb
+              ;;
+          esac
+
           if enabled thumb; then
             asm_conversion_cmd="$asm_conversion_cmd -thumb"
             check_add_cflags -mthumb
@@ -925,18 +1089,44 @@ EOF
           fi
           ;;
         vs*)
-          asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
-          AS_SFX=.S
-          msvs_arch_dir=arm-msvs
-          disable_feature multithread
-          disable_feature unit_tests
-          vs_version=${tgt_cc##vs}
-          if [ $vs_version -ge 12 ]; then
-            # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
-            # only "AppContainerApplication" which requires an AppxManifest.
-            # Therefore disable the examples, just build the library.
-            disable_feature examples
-            disable_feature tools
+          # A number of ARM-based Windows platforms are constrained by their
+          # respective SDKs' limitations. Fortunately, these are all 32-bit ABIs
+          # and so can be selected as 'win32'.
+          if [ ${tgt_os} = "win32" ]; then
+            asm_conversion_cmd="${source_path_mk}/build/make/ads2armasm_ms.pl"
+            AS_SFX=.S
+            msvs_arch_dir=arm-msvs
+            disable_feature multithread
+            disable_feature unit_tests
+            if [ ${tgt_cc##vs} -ge 12 ]; then
+              # MSVC 2013 doesn't allow doing plain .exe projects for ARM32,
+              # only "AppContainerApplication" which requires an AppxManifest.
+              # Therefore disable the examples, just build the library.
+              disable_feature examples
+              disable_feature tools
+            fi
+          else
+            # Windows 10 on ARM, on the other hand, has full Windows SDK support
+            # for building Win32 ARM64 applications in addition to ARM64
+            # Windows Store apps. It is the only 64-bit ARM ABI that
+            # Windows supports, so it is the default definition of 'win64'.
+            # ARM64 build support officially shipped in Visual Studio 15.9.0.
+
+            # Because the ARM64 Windows SDK's arm_neon.h is ARM32-specific
+            # while LLVM's is not, probe its validity.
+            if enabled neon; then
+              if [ -n "${CC}" ]; then
+                check_header arm_neon.h || check_header arm64_neon.h && \
+                    enable_feature win_arm64_neon_h_workaround
+              else
+                # If a probe is not possible, assume this is the pure Windows
+                # SDK and so the workaround is necessary when using Visual
+                # Studio < 2019.
+                if [ ${tgt_cc##vs} -lt 16 ]; then
+                  enable_feature win_arm64_neon_h_workaround
+                fi
+              fi
+            fi
           fi
           ;;
         rvct)
@@ -945,7 +1135,6 @@ EOF
           AS=armasm
           LD="${source_path}/build/make/armlink_adapter.sh"
           STRIP=arm-none-linux-gnueabi-strip
-          NM=arm-none-linux-gnueabi-nm
           tune_cflags="--cpu="
           tune_asflags="--cpu="
           if [ -z "${tune_cpu}" ]; then
@@ -964,7 +1153,6 @@ EOF
           fi
           arch_int=${tgt_isa##armv}
           arch_int=${arch_int%%te}
-          check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
           enabled debug && add_asflags -g
           add_cflags --gnu
           add_cflags --enum_is_int
@@ -979,109 +1167,77 @@ EOF
           ;;
 
         android*)
-          if [ -n "${sdk_path}" ]; then
-            SDK_PATH=${sdk_path}
-            COMPILER_LOCATION=`find "${SDK_PATH}" \
-              -name "arm-linux-androideabi-gcc*" -print -quit`
-            TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
-            CC=${TOOLCHAIN_PATH}gcc
-            CXX=${TOOLCHAIN_PATH}g++
-            AR=${TOOLCHAIN_PATH}ar
-            LD=${TOOLCHAIN_PATH}gcc
-            AS=${TOOLCHAIN_PATH}as
-            STRIP=${TOOLCHAIN_PATH}strip
-            NM=${TOOLCHAIN_PATH}nm
-
-            if [ -z "${alt_libc}" ]; then
-              alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
-                awk '{n = split($0,a,"/"); \
-                split(a[n-1],b,"-"); \
-                print $0 " " b[2]}' | \
-                sort -g -k 2 | \
-                awk '{ print $1 }' | tail -1`
-            fi
-
-            if [ -d "${alt_libc}" ]; then
-              add_cflags "--sysroot=${alt_libc}"
-              add_ldflags "--sysroot=${alt_libc}"
-            fi
-
-            # linker flag that routes around a CPU bug in some
-            # Cortex-A8 implementations (NDK Dev Guide)
-            add_ldflags "-Wl,--fix-cortex-a8"
-
-            enable_feature pic
-            soft_enable realtime_only
-            if [ ${tgt_isa} = "armv7" ]; then
-              soft_enable runtime_cpu_detect
-            fi
-            if enabled runtime_cpu_detect; then
-              add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
-            fi
-          else
-            echo "Assuming standalone build with NDK toolchain."
-            echo "See build/make/Android.mk for details."
-            check_add_ldflags -static
-            soft_enable unit_tests
-          fi
-          ;;
-
-        darwin*)
-          XCRUN_FIND="xcrun --sdk iphoneos --find"
-          CXX="$(${XCRUN_FIND} clang++)"
-          CC="$(${XCRUN_FIND} clang)"
-          AR="$(${XCRUN_FIND} ar)"
-          AS="$(${XCRUN_FIND} as)"
-          STRIP="$(${XCRUN_FIND} strip)"
-          NM="$(${XCRUN_FIND} nm)"
-          RANLIB="$(${XCRUN_FIND} ranlib)"
-          AS_SFX=.S
-          LD="${CXX:-$(${XCRUN_FIND} ld)}"
-
-          # ASFLAGS is written here instead of using check_add_asflags
-          # because we need to overwrite all of ASFLAGS and purge the
-          # options that were put in above
-          ASFLAGS="-arch ${tgt_isa} -g"
-
-          add_cflags -arch ${tgt_isa}
-          add_ldflags -arch ${tgt_isa}
-
-          alt_libc="$(show_darwin_sdk_path iphoneos)"
-          if [ -d "${alt_libc}" ]; then
-            add_cflags -isysroot ${alt_libc}
-          fi
-
-          if [ "${LD}" = "${CXX}" ]; then
-            add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
-          else
-            add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
-          fi
-
-          for d in lib usr/lib usr/lib/system; do
-            try_dir="${alt_libc}/${d}"
-            [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
-          done
-
-          case ${tgt_isa} in
-            armv7|armv7s|armv8|arm64)
-              if enabled neon && ! check_xcode_minimum_version; then
-                soft_disable neon
-                log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
-                if enabled neon_asm; then
-                  soft_disable neon_asm
-                  log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
-                fi
-              fi
+          echo "Assuming standalone build with NDK toolchain."
+          echo "See build/make/Android.mk for details."
+          check_add_ldflags -static
+          soft_enable unit_tests
+          case "$AS" in
+            *clang)
+              # The GNU Assembler was removed in the r24 version of the NDK.
+              # clang's internal assembler works, but `-c` is necessary to
+              # avoid linking.
+              add_asflags -c
               ;;
           esac
+          ;;
 
-          asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+        darwin)
+          if ! enabled external_build; then
+            XCRUN_FIND="xcrun --sdk iphoneos --find"
+            CXX="$(${XCRUN_FIND} clang++)"
+            CC="$(${XCRUN_FIND} clang)"
+            AR="$(${XCRUN_FIND} ar)"
+            AS="$(${XCRUN_FIND} as)"
+            STRIP="$(${XCRUN_FIND} strip)"
+            AS_SFX=.S
+            LD="${CXX:-$(${XCRUN_FIND} ld)}"
 
-          if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
-            check_add_cflags -fembed-bitcode
-            check_add_asflags -fembed-bitcode
-            check_add_ldflags -fembed-bitcode
+            # ASFLAGS is written here instead of using check_add_asflags
+            # because we need to overwrite all of ASFLAGS and purge the
+            # options that were put in above
+            ASFLAGS="-arch ${tgt_isa} -g"
+
+            add_cflags -arch ${tgt_isa}
+            add_ldflags -arch ${tgt_isa}
+
+            alt_libc="$(show_darwin_sdk_path iphoneos)"
+            if [ -d "${alt_libc}" ]; then
+              add_cflags -isysroot ${alt_libc}
+            fi
+
+            if [ "${LD}" = "${CXX}" ]; then
+              add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
+            else
+              add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
+            fi
+
+            for d in lib usr/lib usr/lib/system; do
+              try_dir="${alt_libc}/${d}"
+              [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
+            done
+
+            case ${tgt_isa} in
+              armv7|armv7s|armv8|arm64)
+                if enabled neon && ! check_xcode_minimum_version; then
+                  soft_disable neon
+                  log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
+                  if enabled neon_asm; then
+                    soft_disable neon_asm
+                    log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
+                  fi
+                fi
+                ;;
+            esac
+
+            if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ] \
+               && [ "$(show_xcode_version | cut -d. -f1)" -lt 16 ]; then
+              check_add_cflags -fembed-bitcode
+              check_add_asflags -fembed-bitcode
+              check_add_ldflags -fembed-bitcode
+            fi
           fi
+
+          asm_conversion_cmd="${source_path_mk}/build/make/ads2gas_apple.pl"
           ;;
 
         linux*)
@@ -1108,6 +1264,38 @@ EOF
           fi
           ;;
       esac
+
+      # AArch64 ISA extensions are treated as supersets.
+      if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+        aarch64_arch_flag_neon="arch=armv8-a"
+        aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod"
+        aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm"
+        aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve"
+        aarch64_arch_flag_sve2="arch=armv9-a+sve2"
+        for ext in ${ARCH_EXT_LIST_AARCH64}; do
+          if [ "$disable_exts" = "yes" ]; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+            soft_disable $ext
+          else
+            # Check the compiler supports the -march flag for the extension.
+            # This needs to happen after toolchain/OS inspection so we handle
+            # $CROSS etc correctly when checking for flags, else these will
+            # always fail.
+            flag="$(eval echo \$"aarch64_arch_flag_${ext}")"
+            check_gcc_machine_option "${flag}" "${ext}"
+            if ! enabled $ext; then
+              # Disable higher order extensions to simplify dependencies.
+              disable_exts="yes"
+              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+              soft_disable $ext
+            fi
+          fi
+        done
+        if enabled sve; then
+          check_neon_sve_bridge_compiles
+        fi
+      fi
+
       ;;
     mips*)
       link_with_cc=gcc
@@ -1135,12 +1323,24 @@ EOF
             check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
             check_add_ldflags -mips64r6 -mabi=64 -mfp64
             ;;
+          loongson3*)
+            check_cflags -march=loongson3a && soft_enable mmi \
+              || disable_feature mmi
+            check_cflags -mmsa && soft_enable msa \
+              || disable_feature msa
+            tgt_isa=loongson3a
+            ;;
         esac
 
+        if enabled mmi || enabled msa; then
+          soft_enable runtime_cpu_detect
+        fi
+
         if enabled msa; then
-          add_cflags -mmsa
-          add_asflags -mmsa
-          add_ldflags -mmsa
+          # TODO(libyuv:793)
+          # The new mips functions in libyuv do not build
+          # with the toolchains we currently use for testing.
+          soft_disable libyuv
         fi
       fi
 
@@ -1148,8 +1348,25 @@ EOF
       check_add_asflags -march=${tgt_isa}
       check_add_asflags -KPIC
       ;;
+    ppc64le*)
+      link_with_cc=gcc
+      setup_gnu_toolchain
+      # Do not enable vsx by default.
+      # https://bugs.chromium.org/p/webm/issues/detail?id=1522
+      enabled vsx || RTCD_OPTIONS="${RTCD_OPTIONS}--disable-vsx "
+      if [ -n "${tune_cpu}" ]; then
+        case ${tune_cpu} in
+          power?)
+            tune_cflags="-mcpu="
+            ;;
+        esac
+      fi
+      ;;
     x86*)
       case  ${tgt_os} in
+        android)
+          soft_enable realtime_only
+          ;;
         win*)
           enabled gcc && add_cflags -fno-common
           ;;
@@ -1164,6 +1381,10 @@ EOF
           AS=${AS:-nasm}
           add_ldflags -Zhigh-mem
           ;;
+        darwin*)
+          enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
+          add_cflags  ${darwin_arch}
+          add_ldflags ${darwin_arch}
       esac
 
       AS="${alt_as:-${AS:-auto}}"
@@ -1196,24 +1417,12 @@ EOF
           enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
           ;;
         vs*)
-          # When building with Microsoft Visual Studio the assembler is
-          # invoked directly. Checking at configure time is unnecessary.
-          # Skip the check by setting AS arbitrarily
-          AS=msvs
           msvs_arch_dir=x86-msvs
-          vc_version=${tgt_cc##vs}
-          case $vc_version in
-            7|8|9|10)
-              echo "${tgt_cc} does not support avx/avx2, disabling....."
-              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
-              soft_disable avx
-              soft_disable avx2
-              ;;
-          esac
-          case $vc_version in
-            7|8|9)
-              echo "${tgt_cc} omits stdint.h, disabling webm-io..."
-              soft_disable webm_io
+          case ${tgt_cc##vs} in
+            14)
+              echo "${tgt_cc} does not support avx512, disabling....."
+              RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 "
+              soft_disable avx512
               ;;
           esac
           ;;
@@ -1246,8 +1455,13 @@ EOF
         elif disabled $ext; then
           disable_exts="yes"
         else
-          # use the shortened version for the flag: sse4_1 -> sse4
-          check_gcc_machine_option ${ext%_*} $ext
+          if [ "$ext" = "avx512" ]; then
+            check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl
+            check_gcc_avx512_compiles
+          else
+            # use the shortened version for the flag: sse4_1 -> sse4
+            check_gcc_machine_option ${ext%_*} $ext
+          fi
         fi
       done
 
@@ -1273,7 +1487,6 @@ EOF
         esac
         log_echo "  using $AS"
       fi
-      [ "${AS##*/}" = nasm ] && add_asflags -Ox
       AS_SFX=.asm
       case  ${tgt_os} in
         win32)
@@ -1282,7 +1495,7 @@ EOF
           EXE_SFX=.exe
           ;;
         win64)
-          add_asflags -f x64
+          add_asflags -f win64
           enabled debug && add_asflags -g cv8
           EXE_SFX=.exe
           ;;
@@ -1294,9 +1507,6 @@ EOF
           ;;
         darwin*)
           add_asflags -f macho${bits}
-          enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
-          add_cflags  ${darwin_arch}
-          add_ldflags ${darwin_arch}
           # -mdynamic-no-pic is still a bit of voodoo -- it was required at
           # one time, but does not seem to be now, and it breaks some of the
           # code that still relies on inline assembly.
@@ -1309,7 +1519,8 @@ EOF
           add_cflags  ${sim_arch}
           add_ldflags ${sim_arch}
 
-          if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
+          if [ "$(disabled external_build)" ] &&
+              [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then
             # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it
             # on is pointless (unless building a C-only lib). Warn the user, but
             # do nothing here.
@@ -1326,6 +1537,15 @@ EOF
           ;;
       esac
       ;;
+    loongarch*)
+      link_with_cc=gcc
+      setup_gnu_toolchain
+
+      enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"'
+      enabled lsx && soft_enable runtime_cpu_detect
+      enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"'
+      enabled lasx && soft_enable runtime_cpu_detect
+      ;;
     *-gcc|generic-gnu)
       link_with_cc=gcc
       enable_feature gcc
@@ -1333,6 +1553,14 @@ EOF
       ;;
   esac
 
+  # Enable PGO
+  if [ -n "${pgo_file}" ]; then
+   check_add_cflags -fprofile-use=${pgo_file} || \
+     die "-fprofile-use is not supported by compiler"
+   check_add_ldflags -fprofile-use=${pgo_file} || \
+     die "-fprofile-use is not supported by linker"
+  fi
+
   # Try to enable CPU specific tuning
   if [ -n "${tune_cpu}" ]; then
     if [ -n "${tune_cflags}" ]; then
@@ -1353,6 +1581,9 @@ EOF
   else
     check_add_cflags -DNDEBUG
   fi
+  enabled profile &&
+    check_add_cflags -fprofile-generate &&
+    check_add_ldflags -fprofile-generate
 
   enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
   enabled gcov &&
@@ -1387,7 +1618,7 @@ EOF
 
     # Try to find which inline keywords are supported
     check_cc <<EOF && INLINE="inline"
-static inline function() {}
+static inline int function(void) {}
 EOF
 
   # Almost every platform uses pthreads.
@@ -1399,7 +1630,11 @@ EOF
         # bionic includes basic pthread functionality, obviating -lpthread.
         ;;
       *)
-        check_header pthread.h && add_extralibs -lpthread
+        check_header pthread.h && check_lib -lpthread <<EOF && add_extralibs -lpthread || disable_feature pthread_h
+#include <pthread.h>
+#include <stddef.h>
+int main(void) { return pthread_create(NULL, NULL, NULL, NULL); }
+EOF
         ;;
     esac
   fi
@@ -1416,6 +1651,26 @@ EOF
           echo "msa optimizations are available only for little endian platforms"
           disable_feature msa
         fi
+        if enabled mmi; then
+          echo "mmi optimizations are available only for little endian platforms"
+          disable_feature mmi
+        fi
+      fi
+      ;;
+  esac
+
+  # only for LOONGARCH platforms
+  case ${toolchain} in
+    loongarch*)
+      if enabled big_endian; then
+        if enabled lsx; then
+          echo "lsx optimizations are available only for little endian platforms"
+          disable_feature lsx
+        fi
+        if enabled lasx; then
+          echo "lasx optimizations are available only for little endian platforms"
+          disable_feature lasx
+        fi
       fi
       ;;
   esac
diff --git a/media/libvpx/libvpx/build/make/gen_asm_deps.sh b/media/libvpx/libvpx/build/make/gen_asm_deps.sh
index 6a7bff9ebc..3bd4d125f1 100644
--- a/media/libvpx/libvpx/build/make/gen_asm_deps.sh
+++ b/media/libvpx/libvpx/build/make/gen_asm_deps.sh
@@ -42,7 +42,7 @@ done
 
 [ -n "$srcfile" ] || show_help
 sfx=${sfx:-asm}
-includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
+includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
            perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
 #" restore editor state
 for inc in ${includes}; do
diff --git a/media/libvpx/libvpx/build/make/gen_msvs_sln.sh b/media/libvpx/libvpx/build/make/gen_msvs_sln.sh
index 7d5f468109..0b312850fe 100644
--- a/media/libvpx/libvpx/build/make/gen_msvs_sln.sh
+++ b/media/libvpx/libvpx/build/make/gen_msvs_sln.sh
@@ -25,7 +25,7 @@ files.
 Options:
     --help                      Print this message
     --out=outfile               Redirect output to a file
-    --ver=version               Version (7,8,9,10,11,12,14) of visual studio to generate for
+    --ver=version               Version (14-17) of visual studio to generate for
     --target=isa-os-cc          Target specifier
 EOF
     exit 1
@@ -213,13 +213,15 @@ for opt in "$@"; do
     ;;
     --dep=*) eval "${optval%%:*}_deps=\"\${${optval%%:*}_deps} ${optval##*:}\""
     ;;
-    --ver=*) vs_ver="$optval"
-             case $optval in
-             10|11|12|14)
-             ;;
-             *) die Unrecognized Visual Studio Version in $opt
-             ;;
-             esac
+    --ver=*)
+      vs_ver="$optval"
+      case $optval in
+        14) vs_year=2015 ;;
+        15) vs_year=2017 ;;
+        16) vs_year=2019 ;;
+        17) vs_year=2022 ;;
+        *) die Unrecognized Visual Studio Version in $opt ;;
+      esac
     ;;
     --target=*) target="${optval}"
     ;;
@@ -230,18 +232,11 @@ for opt in "$@"; do
 done
 outfile=${outfile:-/dev/stdout}
 mkoutfile=${mkoutfile:-/dev/stdout}
-case "${vs_ver:-10}" in
-    10) sln_vers="11.00"
-       sln_vers_str="Visual Studio 2010"
-    ;;
-    11) sln_vers="12.00"
-       sln_vers_str="Visual Studio 2012"
-    ;;
-    12) sln_vers="12.00"
-       sln_vers_str="Visual Studio 2013"
-    ;;
-    14) sln_vers="14.00"
-       sln_vers_str="Visual Studio 2015"
+case "${vs_ver}" in
+    1[4-7])
+      # VS has used Format Version 12.00 continuously since vs11.
+      sln_vers="12.00"
+      sln_vers_str="Visual Studio ${vs_year}"
     ;;
 esac
 sfx=vcxproj
diff --git a/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh b/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh
index 2cf62c117c..1e1db05bb2 100644
--- a/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -34,7 +34,7 @@ Options:
     --name=project_name         Name of the project (required)
     --proj-guid=GUID            GUID to use for the project
     --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (10,11,12,14) of visual studio to generate for
+    --ver=version               Version (14-16) of visual studio to generate for
     --src-path-bare=dir         Path to root of source tree
     -Ipath/to/include           Additional include directories
     -DFLAG[=value]              Preprocessor macros to define
@@ -82,7 +82,7 @@ generate_filter() {
                        | sed -e "s,$src_path_bare,," \
                              -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
 
-                if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then
+                if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $uses_asm; then
                     # Avoid object file name collisions, i.e. vpx_config.c and
                     # vpx_config.asm produce the same object file without
                     # this additional suffix.
@@ -141,7 +141,17 @@ for opt in "$@"; do
     case "$opt" in
         --help|-h) show_help
         ;;
-        --target=*) target="${optval}"
+        --target=*)
+            target="${optval}"
+            platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}')
+            case "$platform_toolset" in
+                clangcl) platform_toolset="ClangCl"
+                ;;
+                "")
+                ;;
+                *) die Unrecognized Visual Studio Platform Toolset in $opt
+                ;;
+            esac
         ;;
         --out=*) outfile="$optval"
         ;;
@@ -157,6 +167,8 @@ for opt in "$@"; do
         ;;
         --lib) proj_kind="lib"
         ;;
+        --as=*) as="${optval}"
+        ;;
         --src-path-bare=*)
             src_path_bare=$(fix_path "$optval")
             src_path_bare=${src_path_bare%/}
@@ -168,7 +180,7 @@ for opt in "$@"; do
         --ver=*)
             vs_ver="$optval"
             case "$optval" in
-                10|11|12|14)
+                1[4-7])
                 ;;
                 *) die Unrecognized Visual Studio Version in $opt
                 ;;
@@ -215,13 +227,7 @@ fix_file_list file_list
 
 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
-asm_use_custom_step=false
 uses_asm=${uses_asm:-false}
-case "${vs_ver:-11}" in
-    10|11|12|14)
-       asm_use_custom_step=$uses_asm
-    ;;
-esac
 
 [ -n "$name" ] || die "Project name (--name) must be specified!"
 [ -n "$target" ] || die "Target (--target) must be specified!"
@@ -253,13 +259,22 @@ libs=${libs// /;}
 case "$target" in
     x86_64*)
         platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} &quot;%(FullPath)&quot;"
     ;;
     x86*)
         platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
+    ;;
+    arm64*)
+        platforms[0]="ARM64"
+        # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC.
+        if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then
+            platforms[1]="ARM64EC"
+        fi
+        asm_Debug_cmdline="armasm64 -nologo -oldit &quot;%(FullPath)&quot;"
+        asm_Release_cmdline="armasm64 -nologo -oldit &quot;%(FullPath)&quot;"
     ;;
     arm*)
         platforms[0]="ARM"
@@ -307,6 +322,16 @@ generate_vcxproj() {
             tag_content ApplicationType "Windows Store"
             tag_content ApplicationTypeRevision 8.1
         fi
+        if [ "${platforms[0]}" = "ARM64" ]; then
+            # Require the first Visual Studio version to have ARM64 support.
+            tag_content MinimumVisualStudioVersion 15.9
+        fi
+        if [ $vs_ver -eq 15 ] && [ "${platforms[0]}" = "ARM64" ]; then
+            # Since VS 15 does not have a 'use latest SDK version' facility,
+            # specifically require the contemporaneous SDK with official ARM64
+            # support.
+            tag_content WindowsTargetPlatformVersion 10.0.17763.0
+        fi
     close_tag PropertyGroup
 
     tag Import \
@@ -324,28 +349,21 @@ generate_vcxproj() {
             else
                 tag_content ConfigurationType StaticLibrary
             fi
-            if [ "$vs_ver" = "11" ]; then
-                if [ "$plat" = "ARM" ]; then
-                    # Setting the wp80 toolchain automatically sets the
-                    # WINAPI_FAMILY define, which is required for building
-                    # code for arm with the windows headers. Alternatively,
-                    # one could add AppContainerApplication=true in the Globals
-                    # section and add PrecompiledHeader=NotUsing and
-                    # CompileAsWinRT=false in ClCompile and SubSystem=Console
-                    # in Link.
-                    tag_content PlatformToolset v110_wp80
-                else
-                    tag_content PlatformToolset v110
+            if [ -n "$platform_toolset" ]; then
+                tag_content PlatformToolset "$platform_toolset"
+            else
+                if [ "$vs_ver" = "14" ]; then
+                    tag_content PlatformToolset v140
+                fi
+                if [ "$vs_ver" = "15" ]; then
+                    tag_content PlatformToolset v141
+                fi
+                if [ "$vs_ver" = "16" ]; then
+                    tag_content PlatformToolset v142
+                fi
+                if [ "$vs_ver" = "17" ]; then
+                    tag_content PlatformToolset v143
                 fi
-            fi
-            if [ "$vs_ver" = "12" ]; then
-                # Setting a PlatformToolset indicating windows phone isn't
-                # enough to build code for arm with MSVC 2013, one strictly
-                # has to enable AppContainerApplication as well.
-                tag_content PlatformToolset v120
-            fi
-            if [ "$vs_ver" = "14" ]; then
-                tag_content PlatformToolset v140
             fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
diff --git a/media/libvpx/libvpx/build/make/iosbuild.sh b/media/libvpx/libvpx/build/make/iosbuild.sh
index c703f22b0c..d9594b9816 100644
--- a/media/libvpx/libvpx/build/make/iosbuild.sh
+++ b/media/libvpx/libvpx/build/make/iosbuild.sh
@@ -30,13 +30,9 @@ SCRIPT_DIR=$(dirname "$0")
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
-ARM_TARGETS="arm64-darwin-gcc
-             armv7-darwin-gcc
-             armv7s-darwin-gcc"
-SIM_TARGETS="x86-iphonesimulator-gcc
-             x86_64-iphonesimulator-gcc"
-OSX_TARGETS="x86-darwin15-gcc
-             x86_64-darwin15-gcc"
+ARM_TARGETS="arm64-darwin-gcc"
+SIM_TARGETS="x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86_64-darwin16-gcc"
 TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
 
 # Configures for the target specified by $1, and invokes make with the dist
@@ -132,7 +128,8 @@ create_vpx_framework_config_shim() {
   done
 
   # Consume the last line of output from the loop: We don't want it.
-  sed -i '' -e '$d' "${config_file}"
+  sed -i.bak -e '$d' "${config_file}"
+  rm "${config_file}.bak"
 
   printf "#endif\n\n" >> "${config_file}"
   printf "#endif  // ${include_guard}" >> "${config_file}"
@@ -244,7 +241,7 @@ build_framework() {
 # Trap function. Cleans up the subtree used to build all targets contained in
 # $TARGETS.
 cleanup() {
-  local readonly res=$?
+  local res=$?
   cd "${ORIG_PWD}"
 
   if [ $res -ne 0 ]; then
@@ -271,7 +268,7 @@ cat << EOF
     --help: Display this message and exit.
     --enable-shared: Build a dynamic framework for use on iOS 8 or later.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
-    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+    --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86
               and x86_64. Allows linking to framework when builds target MacOSX
               instead of iOS.
     --preserve-build-output: Do not delete the build directory.
@@ -350,7 +347,7 @@ if [ "$ENABLE_SHARED" = "yes" ]; then
   IOS_VERSION_MIN="8.0"
 else
   IOS_VERSION_OPTIONS=""
-  IOS_VERSION_MIN="6.0"
+  IOS_VERSION_MIN="7.0"
 fi
 
 if [ "${VERBOSE}" = "yes" ]; then
diff --git a/media/libvpx/libvpx/build/make/msvs_common.sh b/media/libvpx/libvpx/build/make/msvs_common.sh
index 88f1cf9b57..3989fec0d5 100644
--- a/media/libvpx/libvpx/build/make/msvs_common.sh
+++ b/media/libvpx/libvpx/build/make/msvs_common.sh
@@ -9,7 +9,8 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \
+shell_name="$(uname -o 2>/dev/null)"
+if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \
    && cygpath --help >/dev/null 2>&1; then
     FIXPATH='cygpath -m'
 else
@@ -41,6 +42,15 @@ fix_path() {
 # Corrects the paths in file_list in one pass for efficiency.
 # $1 is the name of the array to be modified.
 fix_file_list() {
+    if [ "${FIXPATH}" = "echo_path" ] ; then
+      # When used with echo_path, fix_file_list is a no-op. Avoid warning about
+      # unsupported 'declare -n' when it is not important.
+      return 0
+    elif [ "${BASH_VERSINFO}" -lt 4 ] ; then
+      echo "Cygwin path conversion has failed. Please use a version of bash"
+      echo "which supports nameref (-n), introduced in bash 4.3"
+      return 1
+    fi
     declare -n array_ref=$1
     files=$(fix_path "${array_ref[@]}")
     local IFS=$'\n'
diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl
index 9e746c46d0..156199abd7 100644
--- a/media/libvpx/libvpx/build/make/rtcd.pl
+++ b/media/libvpx/libvpx/build/make/rtcd.pl
@@ -1,4 +1,13 @@
 #!/usr/bin/env perl
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
 
 no strict 'refs';
 use warnings;
@@ -64,6 +73,10 @@ sub vpx_config($) {
 }
 
 sub specialize {
+  if (@_ <= 1) {
+    die "'specialize' must be called with a function name and at least one ",
+        "architecture ('C' is implied): \n@_\n";
+  }
   my $fn=$_[0];
   shift;
   foreach my $opt (@_) {
@@ -199,7 +212,20 @@ sub filter {
 #
 sub common_top() {
   my $include_guard = uc($opts{sym})."_H_";
+  my @time = localtime;
+  my $year = $time[5] + 1900;
   print <<EOF;
+/*
+ *  Copyright (c) ${year} The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is generated. Do not edit.
 #ifndef ${include_guard}
 #define ${include_guard}
 
@@ -228,13 +254,14 @@ EOF
 }
 
 sub common_bottom() {
+  my $include_guard = uc($opts{sym})."_H_";
   print <<EOF;
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif
+#endif  // ${include_guard}
 EOF
 }
 
@@ -305,14 +332,26 @@ EOF
 
 sub mips() {
   determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
   common_top;
 
   print <<EOF;
 #include "vpx_config.h"
 
 #ifdef RTCD_C
+#include "vpx_ports/mips.h"
 static void setup_rtcd_internal(void)
 {
+    int flags = mips_cpu_caps();
+
+    (void)flags;
+
 EOF
 
   set_function_pointers("c", @ALL_ARCHS);
@@ -335,6 +374,67 @@ EOF
   common_bottom;
 }
 
+sub ppc() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = ppc_simd_caps();
+    (void)flags;
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub loongarch() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
@@ -355,41 +455,83 @@ EOF
   common_bottom;
 }
 
+# List of architectures in low-to-high preference order.
+my @PRIORITY_ARCH = qw/
+  c
+  mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2 avx512
+  arm_crc32 neon neon_dotprod neon_i8mm sve sve2
+  rvv
+  vsx
+  dspr2 msa
+/;
+my %PRIORITY_INDEX;
+for (my $i = 0; $i < @PRIORITY_ARCH; $i++) {
+  $PRIORITY_INDEX{$PRIORITY_ARCH[$i]} = $i;
+}
+
 #
 # Main Driver
 #
 
 &require("c");
+&require(sort { $PRIORITY_INDEX{$a} <=> $PRIORITY_INDEX{$b} } keys %required);
 if ($opts{arch} eq 'x86') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/);
   x86;
 } elsif ($opts{arch} eq 'x86_64') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
-  @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
-  &require(@REQUIRES);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/);
+  if (keys %required == 0) {
+    @REQUIRES = filter(qw/mmx sse sse2/);
+    &require(@REQUIRES);
+  }
   x86;
 } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+  my $have_dspr2 = 0;
+  my $have_msa = 0;
+  my $have_mmi = 0;
   @ALL_ARCHS = filter("$opts{arch}");
   open CONFIG_FILE, $opts{config} or
     die "Error opening config file '$opts{config}': $!\n";
   while (<CONFIG_FILE>) {
     if (/HAVE_DSPR2=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
-      last;
+      $have_dspr2 = 1;
     }
     if (/HAVE_MSA=yes/) {
-      @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
-      last;
+      $have_msa = 1;
+    }
+    if (/HAVE_MMI=yes/) {
+      $have_mmi = 1;
     }
   }
   close CONFIG_FILE;
+  if ($have_dspr2 == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
+  } elsif ($have_msa == 1 && $have_mmi == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/mmi msa/);
+  } elsif ($have_msa == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
+  } elsif ($have_mmi == 1) {
+    @ALL_ARCHS = filter("$opts{arch}", qw/mmi/);
+  } else {
+    unoptimized;
+  }
   mips;
 } elsif ($opts{arch} =~ /armv7\w?/) {
   @ALL_ARCHS = filter(qw/neon_asm neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon/);
+  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/);
+  if (keys %required == 0) {
+    @REQUIRES = filter(qw/neon/);
+    &require(@REQUIRES);
+  }
   arm;
+} elsif ($opts{arch} =~ /^ppc/ ) {
+  @ALL_ARCHS = filter(qw/vsx/);
+  ppc;
+} elsif ($opts{arch} =~ /loongarch/ ) {
+  @ALL_ARCHS = filter(qw/lsx lasx/);
+  loongarch;
 } else {
   unoptimized;
 }
diff --git a/media/libvpx/libvpx/build/make/thumb.pm b/media/libvpx/libvpx/build/make/thumb.pm
index 483c2539c6..ef4b316771 100644
--- a/media/libvpx/libvpx/build/make/thumb.pm
+++ b/media/libvpx/libvpx/build/make/thumb.pm
@@ -11,11 +11,8 @@
 
 package thumb;
 
-sub FixThumbInstructions($$)
+sub FixThumbInstructions($)
 {
-    my $short_branches = $_[1];
-    my $branch_shift_offset = $short_branches ? 1 : 0;
-
     # Write additions with shifts, such as "add r10, r11, lsl #8",
     # in three operand form, "add r10, r10, r11, lsl #8".
     s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g;
@@ -54,13 +51,6 @@ sub FixThumbInstructions($$)
     # "addne r0, r0, r2".
     s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
 
-    # Convert a conditional addition to the pc register into a series of
-    # instructions. This converts "addlt pc, pc, r3, lsl #2" into
-    # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12",
-    # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12".
-    # This assumes that r12 is free at this point.
-    s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g;
-
     # Convert "mov pc, lr" into "bx lr", since the former only works
     # for switching from arm to thumb (and only in armv7), but not
     # from thumb to arm.
diff --git a/media/libvpx/libvpx/build/make/version.sh b/media/libvpx/libvpx/build/make/version.sh
index 6967527771..8f717cc96f 100644
--- a/media/libvpx/libvpx/build/make/version.sh
+++ b/media/libvpx/libvpx/build/make/version.sh
@@ -60,6 +60,9 @@ if [ ${bare} ]; then
     echo "${changelog_version}${git_version_id}" > $$.tmp
 else
     cat<<EOF>$$.tmp
+// This file is generated. Do not edit.
+#ifndef VPX_VERSION_H_
+#define VPX_VERSION_H_
 #define VERSION_MAJOR  $major_version
 #define VERSION_MINOR  $minor_version
 #define VERSION_PATCH  $patch_version
@@ -67,6 +70,7 @@ else
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
 #define ${id}_NOSP "${version_str}"
 #define ${id}      " ${version_str}"
+#endif  // VPX_VERSION_H_
 EOF
 fi
 if [ -n "$out_file" ]; then
diff --git a/media/libvpx/libvpx/codereview.settings b/media/libvpx/libvpx/codereview.settings
index 34c6f1d9de..ccba2eeed2 100644
--- a/media/libvpx/libvpx/codereview.settings
+++ b/media/libvpx/libvpx/codereview.settings
@@ -1,5 +1,4 @@
-# This file is used by gcl to get repository specific information.
-GERRIT_HOST: chromium-review.googlesource.com
-GERRIT_PORT: 29418
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
 CODE_REVIEW_SERVER: chromium-review.googlesource.com
 GERRIT_SQUASH_UPLOADS: False
diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure
index fb732acf3e..8eee4e4425 100644
--- a/media/libvpx/libvpx/configure
+++ b/media/libvpx/libvpx/configure
@@ -31,7 +31,6 @@ Advanced options:
   --libc=PATH                     path to alternate libc
   --size-limit=WxH                max size to allow in the decoder
   --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
-  --sdk-path=PATH                 path to root of sdk (android builds only)
   ${toggle_codec_srcs}            in/exclude codec library source code
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
@@ -100,19 +99,36 @@ EOF
 # alphabetically by architecture, generic-gnu last.
 all_platforms="${all_platforms} arm64-android-gcc"
 all_platforms="${all_platforms} arm64-darwin-gcc"
+all_platforms="${all_platforms} arm64-darwin20-gcc"
+all_platforms="${all_platforms} arm64-darwin21-gcc"
+all_platforms="${all_platforms} arm64-darwin22-gcc"
+all_platforms="${all_platforms} arm64-darwin23-gcc"
+all_platforms="${all_platforms} arm64-darwin24-gcc"
+all_platforms="${all_platforms} arm64-darwin25-gcc"
 all_platforms="${all_platforms} arm64-linux-gcc"
+all_platforms="${all_platforms} arm64-win64-gcc"
+all_platforms="${all_platforms} arm64-win64-vs15"
+all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs16-clangcl"
+all_platforms="${all_platforms} arm64-win64-vs17"
+all_platforms="${all_platforms} arm64-win64-vs17-clangcl"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
-all_platforms="${all_platforms} armv7-win32-vs11"
-all_platforms="${all_platforms} armv7-win32-vs12"
+all_platforms="${all_platforms} armv7-win32-gcc"
 all_platforms="${all_platforms} armv7-win32-vs14"
+all_platforms="${all_platforms} armv7-win32-vs15"
+all_platforms="${all_platforms} armv7-win32-vs16"
+all_platforms="${all_platforms} armv7-win32-vs17"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} armv8-linux-gcc"
+all_platforms="${all_platforms} loongarch32-linux-gcc"
+all_platforms="${all_platforms} loongarch64-linux-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
+all_platforms="${all_platforms} ppc64le-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
 all_platforms="${all_platforms} x86-android-gcc"
 all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -125,16 +141,18 @@ all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-darwin13-gcc"
 all_platforms="${all_platforms} x86-darwin14-gcc"
 all_platforms="${all_platforms} x86-darwin15-gcc"
+all_platforms="${all_platforms} x86-darwin16-gcc"
+all_platforms="${all_platforms} x86-darwin17-gcc"
 all_platforms="${all_platforms} x86-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
 all_platforms="${all_platforms} x86-os2-gcc"
 all_platforms="${all_platforms} x86-solaris-gcc"
 all_platforms="${all_platforms} x86-win32-gcc"
-all_platforms="${all_platforms} x86-win32-vs10"
-all_platforms="${all_platforms} x86-win32-vs11"
-all_platforms="${all_platforms} x86-win32-vs12"
 all_platforms="${all_platforms} x86-win32-vs14"
+all_platforms="${all_platforms} x86-win32-vs15"
+all_platforms="${all_platforms} x86-win32-vs16"
+all_platforms="${all_platforms} x86-win32-vs17"
 all_platforms="${all_platforms} x86_64-android-gcc"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
@@ -143,15 +161,25 @@ all_platforms="${all_platforms} x86_64-darwin12-gcc"
 all_platforms="${all_platforms} x86_64-darwin13-gcc"
 all_platforms="${all_platforms} x86_64-darwin14-gcc"
 all_platforms="${all_platforms} x86_64-darwin15-gcc"
+all_platforms="${all_platforms} x86_64-darwin16-gcc"
+all_platforms="${all_platforms} x86_64-darwin17-gcc"
+all_platforms="${all_platforms} x86_64-darwin18-gcc"
+all_platforms="${all_platforms} x86_64-darwin19-gcc"
+all_platforms="${all_platforms} x86_64-darwin20-gcc"
+all_platforms="${all_platforms} x86_64-darwin21-gcc"
+all_platforms="${all_platforms} x86_64-darwin22-gcc"
+all_platforms="${all_platforms} x86_64-darwin23-gcc"
+all_platforms="${all_platforms} x86_64-darwin24-gcc"
+all_platforms="${all_platforms} x86_64-darwin25-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
 all_platforms="${all_platforms} x86_64-win64-gcc"
-all_platforms="${all_platforms} x86_64-win64-vs10"
-all_platforms="${all_platforms} x86_64-win64-vs11"
-all_platforms="${all_platforms} x86_64-win64-vs12"
 all_platforms="${all_platforms} x86_64-win64-vs14"
+all_platforms="${all_platforms} x86_64-win64-vs15"
+all_platforms="${all_platforms} x86_64-win64-vs16"
+all_platforms="${all_platforms} x86_64-win64-vs17"
 all_platforms="${all_platforms} generic-gnu"
 
 # all_targets is a list of all targets that can be configured
@@ -163,11 +191,14 @@ for t in ${all_targets}; do
     [ -f "${source_path}/${t}.mk" ] && enable_feature ${t}
 done
 
+if ! diff --version >/dev/null; then
+  die "diff missing: Try installing diffutils via your package manager."
+fi
+
 if ! perl --version >/dev/null; then
     die "Perl is required to build"
 fi
 
-
 if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then
   # test to see if source_path already configured
   if [ -f "${source_path}/vpx_config.h" ]; then
@@ -220,10 +251,22 @@ CODEC_FAMILIES="
 
 ARCH_LIST="
     arm
+    aarch64
     mips
     x86
     x86_64
+    ppc
+    loongarch
 "
+
+ARCH_EXT_LIST_AARCH64="
+    neon
+    neon_dotprod
+    neon_i8mm
+    sve
+    sve2
+"
+
 ARCH_EXT_LIST_X86="
     mmx
     sse
@@ -233,10 +276,18 @@ ARCH_EXT_LIST_X86="
     sse4_1
     avx
     avx2
+    avx512
 "
+
+ARCH_EXT_LIST_LOONGSON="
+    mmi
+    lsx
+    lasx
+"
+
 ARCH_EXT_LIST="
-    neon
     neon_asm
+    ${ARCH_EXT_LIST_AARCH64}
 
     mips32
     dspr2
@@ -244,6 +295,10 @@ ARCH_EXT_LIST="
     mips64
 
     ${ARCH_EXT_LIST_X86}
+
+    vsx
+
+    ${ARCH_EXT_LIST_LOONGSON}
 "
 HAVE_LIST="
     ${ARCH_EXT_LIST}
@@ -252,10 +307,10 @@ HAVE_LIST="
     unistd_h
 "
 EXPERIMENT_LIST="
-    spatial_svc
     fp_mb_stats
     emulate_hardware
-    misc_fixes
+    non_greedy_mv
+    collect_component_timing
 "
 CONFIG_LIST="
     dependency_tracking
@@ -310,6 +365,9 @@ CONFIG_LIST="
     better_hw_compatibility
     experimental
     size_limit
+    always_adjust_bpm
+    bitstream_debug
+    mismatch_debug
     ${EXPERIMENT_LIST}
 "
 CMDLINE_SELECT="
@@ -322,6 +380,7 @@ CMDLINE_SELECT="
     install_libs
     install_srcs
     debug
+    profile
     gprof
     gcov
     pic
@@ -369,6 +428,9 @@ CMDLINE_SELECT="
     better_hw_compatibility
     vp9_highbitdepth
     experimental
+    always_adjust_bpm
+    bitstream_debug
+    mismatch_debug
 "
 
 process_cmdline() {
@@ -399,6 +461,12 @@ process_cmdline() {
 }
 
 post_process_cmdline() {
+    if enabled coefficient_range_checking; then
+      echo "coefficient-range-checking is for decoders only, disabling encoders:"
+      soft_disable vp8_encoder
+      soft_disable vp9_encoder
+    fi
+
     c=""
 
     # Enable all detected codecs, if they haven't been disabled
@@ -420,6 +488,7 @@ process_targets() {
     enabled child || write_common_config_banner
     write_common_target_config_h ${BUILD_PFX}vpx_config.h
     write_common_config_targets
+    enabled win_arm64_neon_h_workaround && write_win_arm64_neon_h_workaround ${BUILD_PFX}arm_neon.h
 
     # Calculate the default distribution name, based on the enabled features
     cf=""
@@ -496,7 +565,7 @@ process_detect() {
         # here rather than at option parse time because the target auto-detect
         # magic happens after the command line has been parsed.
         case "${tgt_os}" in
-        linux|os2|darwin*|iphonesimulator*)
+        linux|os2|solaris|darwin*|iphonesimulator*)
             # Supported platforms
             ;;
         *)
@@ -548,16 +617,30 @@ process_detect() {
         check_ld() {
             true
         }
+        check_lib() {
+            true
+        }
     fi
     check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}"
     check_ld <<EOF || die "Toolchain is unable to link executables"
 int main(void) {return 0;}
 EOF
     # check system headers
-    check_header pthread.h
+
+    # Use both check_header and check_lib here, since check_lib
+    # could be a stub that always returns true.
+    check_header pthread.h && check_lib -lpthread <<EOF || disable_feature pthread_h
+#include <pthread.h>
+#include <stddef.h>
+int main(void) { return pthread_create(NULL, NULL, NULL, NULL); }
+EOF
     check_header unistd.h # for sysconf(3) and friends.
 
     check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
+
+    if enabled neon && ! enabled external_build; then
+      check_header arm_neon.h || die "Unable to find arm_neon.h"
+    fi
 }
 
 process_toolchain() {
@@ -567,32 +650,66 @@ process_toolchain() {
     if enabled gcc; then
         enabled werror && check_add_cflags -Werror
         check_add_cflags -Wall
-        check_add_cflags -Wdeclaration-after-statement
         check_add_cflags -Wdisabled-optimization
+        check_add_cflags -Wextra-semi
+        check_add_cflags -Wextra-semi-stmt
         check_add_cflags -Wfloat-conversion
+        check_add_cflags -Wformat=2
+        check_add_cflags -Wparentheses-equality
         check_add_cflags -Wpointer-arith
         check_add_cflags -Wtype-limits
         check_add_cflags -Wcast-qual
         check_add_cflags -Wvla
         check_add_cflags -Wimplicit-function-declaration
+        check_add_cflags -Wmissing-declarations
+        check_add_cflags -Wmissing-prototypes
+        check_add_cflags -Wshadow
+        check_add_cflags -Wstrict-prototypes
         check_add_cflags -Wuninitialized
+        check_add_cflags -Wunreachable-code-aggressive
         check_add_cflags -Wunused
-        # -Wextra has some tricky cases. Rather than fix them all now, get the
-        # flag for as many files as possible and fix the remaining issues
-        # piecemeal.
-        # https://bugs.chromium.org/p/webm/issues/detail?id=1069
         check_add_cflags -Wextra
         # check_add_cflags also adds to cxxflags. gtest does not do well with
-        # -Wundef so add it explicitly to CFLAGS only.
+        # these flags so add them explicitly to CFLAGS only.
         check_cflags -Wundef && add_cflags_only -Wundef
+        check_cflags -Wframe-larger-than=52000 && \
+          add_cflags_only -Wframe-larger-than=52000
         if enabled mips || [ -z "${INLINE}" ]; then
           enabled extra_warnings || check_add_cflags -Wno-unused-function
         fi
-        if ! enabled vp9_highbitdepth; then
-          # Avoid this warning for third_party C++ sources. Some reorganization
-          # would be needed to apply this only to test/*.cc.
-          check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
+        # Enforce C99 for C files. Allow GNU extensions.
+        check_cflags -std=gnu99 && add_cflags_only -std=gnu99
+        # Avoid this warning for third_party C++ sources. Some reorganization
+        # would be needed to apply this only to test/*.cc.
+        check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32
+
+        # Do not allow implicit vector type conversions on Clang builds (this
+        # is already the default on GCC builds).
+        check_add_cflags -flax-vector-conversions=none
+
+        # Quiet gcc 6 vs 7 abi warnings:
+        # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+        if enabled arm; then
+          check_add_cxxflags -Wno-psabi
         fi
+
+        # Enforce C++17 compatibility.
+        check_add_cxxflags -Wc++20-extensions
+        check_add_cxxflags -Wc++23-extensions
+        check_add_cxxflags -Wnon-virtual-dtor
+
+        # disable some warnings specific to libyuv / libwebm.
+        check_cxxflags -Wno-missing-declarations \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
+        check_cxxflags -Wno-missing-prototypes \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
+        check_cxxflags -Wno-pass-failed \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
+        check_cxxflags -Wno-shadow \
+          && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow"
+        check_cxxflags -Wno-unused-parameter \
+          && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
     fi
 
     if enabled icc; then
@@ -644,7 +761,7 @@ process_toolchain() {
              gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
              enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
              all_targets="${all_targets} solution"
-             INLINE="__forceinline"
+             INLINE="__inline"
         ;;
     esac
 
@@ -663,39 +780,33 @@ process_toolchain() {
             soft_enable libyuv
         ;;
         *-android-*)
-            soft_enable webm_io
+            check_add_cxxflags -std=gnu++17 && soft_enable webm_io
             soft_enable libyuv
             # GTestLog must be modified to use Android logging utilities.
         ;;
         *-darwin-*)
+            check_add_cxxflags -std=gnu++17
             # iOS/ARM builds do not work with gtest. This does not match
             # x86 targets.
         ;;
         *-iphonesimulator-*)
-            soft_enable webm_io
+            check_add_cxxflags -std=gnu++17 && soft_enable webm_io
             soft_enable libyuv
         ;;
         *-win*)
             # Some mingw toolchains don't have pthread available by default.
             # Treat these more like visual studio where threading in gtest
             # would be disabled for the same reason.
-            check_cxx "$@" <<EOF && soft_enable unit_tests
-int z;
-EOF
-            check_cxx "$@" <<EOF && soft_enable webm_io
-int z;
-EOF
+            check_add_cxxflags -std=gnu++17 && soft_enable unit_tests \
+              && soft_enable webm_io
             check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
 EOF
         ;;
         *)
-            enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
-int z;
-EOF
-            check_cxx "$@" <<EOF && soft_enable webm_io
-int z;
-EOF
+            enabled pthread_h && check_add_cxxflags -std=gnu++17 \
+              && soft_enable unit_tests
+            check_add_cxxflags -std=gnu++17 && soft_enable webm_io
             check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
 EOF
diff --git a/media/libvpx/libvpx/examples.mk b/media/libvpx/libvpx/examples.mk
index 38c4d75c51..a58679942d 100644
--- a/media/libvpx/libvpx/examples.mk
+++ b/media/libvpx/libvpx/examples.mk
@@ -23,7 +23,7 @@ LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
                 third_party/libyuv/source/row_any.cc \
                 third_party/libyuv/source/row_common.cc \
                 third_party/libyuv/source/row_gcc.cc \
-                third_party/libyuv/source/row_mips.cc \
+                third_party/libyuv/source/row_msa.cc \
                 third_party/libyuv/source/row_neon.cc \
                 third_party/libyuv/source/row_neon64.cc \
                 third_party/libyuv/source/row_win.cc \
@@ -31,7 +31,7 @@ LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
                 third_party/libyuv/source/scale_any.cc \
                 third_party/libyuv/source/scale_common.cc \
                 third_party/libyuv/source/scale_gcc.cc \
-                third_party/libyuv/source/scale_mips.cc \
+                third_party/libyuv/source/scale_msa.cc \
                 third_party/libyuv/source/scale_neon.cc \
                 third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_win.cc \
@@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
 # Add compile flags and include path for libwebm sources.
 ifeq ($(CONFIG_WEBM_IO),yes)
   CXXFLAGS     += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
   INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
 endif
 
@@ -65,22 +66,21 @@ endif
 # while EXAMPLES demonstrate specific portions of the API.
 UTILS-$(CONFIG_DECODERS)    += vpxdec.c
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
+vpxdec.SRCS                 += vpx_ports/compiler_attributes.h
 vpxdec.SRCS                 += vpx_ports/mem_ops.h
 vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
-vpxdec.SRCS                 += vpx_ports/msvc.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
 vpxdec.SRCS                 += ivfdec.c ivfdec.h
+vpxdec.SRCS                 += y4minput.c y4minput.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += y4menc.c y4menc.h
 ifeq ($(CONFIG_LIBYUV),yes)
   vpxdec.SRCS                 += $(LIBYUV_SRCS)
-  $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += -Wno-unused-parameter
+  $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS}
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
-  vpxdec.SRCS                 += $(LIBWEBM_COMMON_SRCS)
-  vpxdec.SRCS                 += $(LIBWEBM_MUXER_SRCS)
   vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS                 += webmdec.cc webmdec.h
 endif
@@ -95,7 +95,6 @@ vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += warnings.c warnings.h
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
-vpxenc.SRCS                 += vpx_ports/msvc.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += vpxstats.c vpxstats.h
 ifeq ($(CONFIG_LIBYUV),yes)
@@ -109,110 +108,108 @@ ifeq ($(CONFIG_WEBM_IO),yes)
 endif
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
-ifeq ($(CONFIG_SPATIAL_SVC),yes)
-  EXAMPLES-$(CONFIG_VP9_ENCODER)      += vp9_spatial_svc_encoder.c
-  vp9_spatial_svc_encoder.SRCS        += args.c args.h
-  vp9_spatial_svc_encoder.SRCS        += ivfenc.c ivfenc.h
-  vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
-  vp9_spatial_svc_encoder.SRCS        += video_common.h
-  vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
-  vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
-  vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
-  vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
-  vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
-endif
 
-ifneq ($(CONFIG_SHARED),yes)
-EXAMPLES-$(CONFIG_VP9_ENCODER)    += resize_util.c
-endif
+EXAMPLES-$(CONFIG_VP9_ENCODER)      += vp9_spatial_svc_encoder.c
+vp9_spatial_svc_encoder.SRCS        += args.c args.h
+vp9_spatial_svc_encoder.SRCS        += ivfenc.c ivfenc.h
+vp9_spatial_svc_encoder.SRCS        += y4minput.c y4minput.h
+vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
+vp9_spatial_svc_encoder.SRCS        += video_common.h
+vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
+vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
+vp9_spatial_svc_encoder.SRCS        += examples/svc_encodeframe.c
+vp9_spatial_svc_encoder.SRCS        += examples/svc_context.h
+vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
+vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
 
 EXAMPLES-$(CONFIG_ENCODERS)          += vpx_temporal_svc_encoder.c
 vpx_temporal_svc_encoder.SRCS        += ivfenc.c ivfenc.h
+vpx_temporal_svc_encoder.SRCS        += y4minput.c y4minput.h
 vpx_temporal_svc_encoder.SRCS        += tools_common.c tools_common.h
 vpx_temporal_svc_encoder.SRCS        += video_common.h
 vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
-vpx_temporal_svc_encoder.SRCS        += vpx_ports/msvc.h
 vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
 EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
 simple_decoder.GUID                 = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
 simple_decoder.SRCS                += ivfdec.h ivfdec.c
+simple_decoder.SRCS                += y4minput.c y4minput.h
 simple_decoder.SRCS                += tools_common.h tools_common.c
 simple_decoder.SRCS                += video_common.h
 simple_decoder.SRCS                += video_reader.h video_reader.c
 simple_decoder.SRCS                += vpx_ports/mem_ops.h
 simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
-simple_decoder.SRCS                += vpx_ports/msvc.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
 EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
+postproc.SRCS                      += y4minput.c y4minput.h
 postproc.SRCS                      += tools_common.h tools_common.c
 postproc.SRCS                      += video_common.h
 postproc.SRCS                      += video_reader.h video_reader.c
 postproc.SRCS                      += vpx_ports/mem_ops.h
 postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
-postproc.SRCS                      += vpx_ports/msvc.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
 EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
 decode_to_md5.SRCS                 += md5_utils.h md5_utils.c
 decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
+decode_to_md5.SRCS                 += y4minput.c y4minput.h
 decode_to_md5.SRCS                 += tools_common.h tools_common.c
 decode_to_md5.SRCS                 += video_common.h
 decode_to_md5.SRCS                 += video_reader.h video_reader.c
+decode_to_md5.SRCS                 += vpx_ports/compiler_attributes.h
 decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
 decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
-decode_to_md5.SRCS                 += vpx_ports/msvc.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
 EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
 simple_encoder.SRCS             += ivfenc.h ivfenc.c
+simple_encoder.SRCS             += y4minput.c y4minput.h
 simple_encoder.SRCS             += tools_common.h tools_common.c
 simple_encoder.SRCS             += video_common.h
 simple_encoder.SRCS             += video_writer.h video_writer.c
-simple_encoder.SRCS             += vpx_ports/msvc.h
 simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
 simple_encoder.DESCRIPTION       = Simplified encoder loop
 EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
 vp9_lossless_encoder.SRCS       += ivfenc.h ivfenc.c
+vp9_lossless_encoder.SRCS       += y4minput.c y4minput.h
 vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
 vp9_lossless_encoder.SRCS       += video_common.h
 vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
-vp9_lossless_encoder.SRCS       += vpx_ports/msvc.h
 vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
 vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
 EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
 twopass_encoder.SRCS            += ivfenc.h ivfenc.c
+twopass_encoder.SRCS            += y4minput.c y4minput.h
 twopass_encoder.SRCS            += tools_common.h tools_common.c
 twopass_encoder.SRCS            += video_common.h
 twopass_encoder.SRCS            += video_writer.h video_writer.c
-twopass_encoder.SRCS            += vpx_ports/msvc.h
 twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
 twopass_encoder.DESCRIPTION      = Two-pass encoder loop
 EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
 decode_with_drops.SRCS          += ivfdec.h ivfdec.c
+decode_with_drops.SRCS          += y4minput.c y4minput.h
 decode_with_drops.SRCS          += tools_common.h tools_common.c
 decode_with_drops.SRCS          += video_common.h
 decode_with_drops.SRCS          += video_reader.h video_reader.c
 decode_with_drops.SRCS          += vpx_ports/mem_ops.h
 decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
-decode_with_drops.SRCS          += vpx_ports/msvc.h
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
 EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
 set_maps.SRCS                      += ivfenc.h ivfenc.c
+set_maps.SRCS                      += y4minput.c y4minput.h
 set_maps.SRCS                      += tools_common.h tools_common.c
 set_maps.SRCS                      += video_common.h
 set_maps.SRCS                      += video_writer.h video_writer.c
-set_maps.SRCS                      += vpx_ports/msvc.h
 set_maps.GUID                       = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 set_maps.DESCRIPTION                = Set active and ROI maps
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
 vp8cx_set_ref.SRCS                 += ivfenc.h ivfenc.c
+vp8cx_set_ref.SRCS                 += y4minput.c y4minput.h
 vp8cx_set_ref.SRCS                 += tools_common.h tools_common.c
 vp8cx_set_ref.SRCS                 += video_common.h
 vp8cx_set_ref.SRCS                 += video_writer.h video_writer.c
-vp8cx_set_ref.SRCS                 += vpx_ports/msvc.h
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
@@ -220,6 +217,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
 ifeq ($(CONFIG_DECODERS),yes)
 EXAMPLES-yes                       += vp9cx_set_ref.c
 vp9cx_set_ref.SRCS                 += ivfenc.h ivfenc.c
+vp9cx_set_ref.SRCS                 += y4minput.c y4minput.h
 vp9cx_set_ref.SRCS                 += tools_common.h tools_common.c
 vp9cx_set_ref.SRCS                 += video_common.h
 vp9cx_set_ref.SRCS                 += video_writer.h video_writer.c
@@ -232,9 +230,9 @@ ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
 ifeq ($(CONFIG_LIBYUV),yes)
 EXAMPLES-$(CONFIG_VP8_ENCODER)          += vp8_multi_resolution_encoder.c
 vp8_multi_resolution_encoder.SRCS       += ivfenc.h ivfenc.c
+vp8_multi_resolution_encoder.SRCS       += y4minput.c y4minput.h
 vp8_multi_resolution_encoder.SRCS       += tools_common.h tools_common.c
 vp8_multi_resolution_encoder.SRCS       += video_writer.h video_writer.c
-vp8_multi_resolution_encoder.SRCS       += vpx_ports/msvc.h
 vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
@@ -359,6 +357,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
@@ -370,6 +369,13 @@ INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
 $(foreach proj,$(call enabled,PROJECTS),\
     $(eval $(call vcproj_template,$(proj))))
 
+# Generate a list of all enabled sources, in particular for exporting to gyp
+# based build systems.
+vpxdec_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(vpxdec.SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
+CLEAN-OBJS += vpxdec_srcs.txt
+
 #
 # Documentation Rules
 #
@@ -403,3 +409,4 @@ CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox)
 DOCS-yes += examples.doxy samples.dox
 examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox)
 	@echo "INPUT += $^" > $@
+	@echo "ENABLED_SECTIONS += samples" >> $@
diff --git a/media/libvpx/libvpx/examples/decode_with_drops.c b/media/libvpx/libvpx/examples/decode_with_drops.c
index e69e2a9f9b..03c79a4561 100644
--- a/media/libvpx/libvpx/examples/decode_with_drops.c
+++ b/media/libvpx/libvpx/examples/decode_with_drops.c
@@ -106,7 +106,7 @@ int main(int argc, char **argv) {
   printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface()));
 
   if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+    die("Failed to initialize decoder.");
 
   while (vpx_video_reader_read_frame(reader)) {
     vpx_codec_iter_t iter = NULL;
diff --git a/media/libvpx/libvpx/examples/postproc.c b/media/libvpx/libvpx/examples/postproc.c
index 15713b946a..b53c15ea15 100644
--- a/media/libvpx/libvpx/examples/postproc.c
+++ b/media/libvpx/libvpx/examples/postproc.c
@@ -86,9 +86,9 @@ int main(int argc, char **argv) {
   res = vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL,
                            VPX_CODEC_USE_POSTPROC);
   if (res == VPX_CODEC_INCAPABLE)
-    die_codec(&codec, "Postproc not supported by this decoder.");
+    die("Postproc not supported by this decoder.");
 
-  if (res) die_codec(&codec, "Failed to initialize decoder.");
+  if (res) die("Failed to initialize decoder.");
 
   while (vpx_video_reader_read_frame(reader)) {
     vpx_codec_iter_t iter = NULL;
@@ -109,7 +109,7 @@ int main(int argc, char **argv) {
                                 0 };
       if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn on postproc.");
-    };
+    }
 
     // Decode the frame with 15ms deadline
     if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000))
diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c
deleted file mode 100644
index 7e529b2e20..0000000000
--- a/media/libvpx/libvpx/examples/resize_util.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "../tools_common.h"
-#include "../vp9/encoder/vp9_resize.h"
-
-static const char *exec_name = NULL;
-
-static void usage() {
-  printf("Usage:\n");
-  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
-         exec_name);
-  printf("<output_yuv> [<frames>]\n");
-}
-
-void usage_exit(void) {
-  usage();
-  exit(EXIT_FAILURE);
-}
-
-static int parse_dim(char *v, int *width, int *height) {
-  char *x = strchr(v, 'x');
-  if (x == NULL) x = strchr(v, 'X');
-  if (x == NULL) return 0;
-  *width = atoi(v);
-  *height = atoi(&x[1]);
-  if (*width <= 0 || *height <= 0)
-    return 0;
-  else
-    return 1;
-}
-
-int main(int argc, char *argv[]) {
-  char *fin, *fout;
-  FILE *fpin, *fpout;
-  uint8_t *inbuf, *outbuf;
-  uint8_t *inbuf_u, *outbuf_u;
-  uint8_t *inbuf_v, *outbuf_v;
-  int f, frames;
-  int width, height, target_width, target_height;
-
-  exec_name = argv[0];
-
-  if (argc < 5) {
-    printf("Incorrect parameters:\n");
-    usage();
-    return 1;
-  }
-
-  fin = argv[1];
-  fout = argv[4];
-  if (!parse_dim(argv[2], &width, &height)) {
-    printf("Incorrect parameters: %s\n", argv[2]);
-    usage();
-    return 1;
-  }
-  if (!parse_dim(argv[3], &target_width, &target_height)) {
-    printf("Incorrect parameters: %s\n", argv[3]);
-    usage();
-    return 1;
-  }
-
-  fpin = fopen(fin, "rb");
-  if (fpin == NULL) {
-    printf("Can't open file %s to read\n", fin);
-    usage();
-    return 1;
-  }
-  fpout = fopen(fout, "wb");
-  if (fpout == NULL) {
-    printf("Can't open file %s to write\n", fout);
-    usage();
-    return 1;
-  }
-  if (argc >= 6)
-    frames = atoi(argv[5]);
-  else
-    frames = INT_MAX;
-
-  printf("Input size:  %dx%d\n", width, height);
-  printf("Target size: %dx%d, Frames: ", target_width, target_height);
-  if (frames == INT_MAX)
-    printf("All\n");
-  else
-    printf("%d\n", frames);
-
-  inbuf = (uint8_t *)malloc(width * height * 3 / 2);
-  outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
-  inbuf_u = inbuf + width * height;
-  inbuf_v = inbuf_u + width * height / 4;
-  outbuf_u = outbuf + target_width * target_height;
-  outbuf_v = outbuf_u + target_width * target_height / 4;
-  f = 0;
-  while (f < frames) {
-    if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
-    vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
-                        width, outbuf, target_width, outbuf_u, outbuf_v,
-                        target_width / 2, target_height, target_width);
-    fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
-    f++;
-  }
-  printf("%d frames processed\n", f);
-  fclose(fpin);
-  fclose(fpout);
-
-  free(inbuf);
-  free(outbuf);
-  return 0;
-}
diff --git a/media/libvpx/libvpx/examples/set_maps.c b/media/libvpx/libvpx/examples/set_maps.c
index c0c7d10e72..867e473aea 100644
--- a/media/libvpx/libvpx/examples/set_maps.c
+++ b/media/libvpx/libvpx/examples/set_maps.c
@@ -209,7 +209,7 @@ int main(int argc, char **argv) {
     die("Failed to open %s for reading.", argv[4]);
 
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   // Encode frames.
   while (vpx_img_read(&raw, infile)) {
diff --git a/media/libvpx/libvpx/examples/simple_decoder.c b/media/libvpx/libvpx/examples/simple_decoder.c
index 2bb1a05245..d089e826d5 100644
--- a/media/libvpx/libvpx/examples/simple_decoder.c
+++ b/media/libvpx/libvpx/examples/simple_decoder.c
@@ -118,7 +118,7 @@ int main(int argc, char **argv) {
   printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface()));
 
   if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
-    die_codec(&codec, "Failed to initialize decoder.");
+    die("Failed to initialize decoder.");
 
   while (vpx_video_reader_read_frame(reader)) {
     vpx_codec_iter_t iter = NULL;
diff --git a/media/libvpx/libvpx/examples/simple_encoder.c b/media/libvpx/libvpx/examples/simple_encoder.c
index dde6344f8d..dffdd6d7da 100644
--- a/media/libvpx/libvpx/examples/simple_encoder.c
+++ b/media/libvpx/libvpx/examples/simple_encoder.c
@@ -218,7 +218,7 @@ int main(int argc, char **argv) {
     die("Failed to open %s for reading.", infile_arg);
 
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   // Encode frames.
   while (vpx_img_read(&raw, infile)) {
diff --git a/media/libvpx/libvpx/vpx/svc_context.h b/media/libvpx/libvpx/examples/svc_context.h
similarity index 83%
rename from media/libvpx/libvpx/vpx/svc_context.h
rename to media/libvpx/libvpx/examples/svc_context.h
index 462785075c..7ca987dc83 100644
--- a/media/libvpx/libvpx/vpx/svc_context.h
+++ b/media/libvpx/libvpx/examples/svc_context.h
@@ -13,11 +13,11 @@
  * spatial SVC frame
  */
 
-#ifndef VPX_SVC_CONTEXT_H_
-#define VPX_SVC_CONTEXT_H_
+#ifndef VPX_EXAMPLES_SVC_CONTEXT_H_
+#define VPX_EXAMPLES_SVC_CONTEXT_H_
 
-#include "./vp8cx.h"
-#include "./vpx_encoder.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,12 +35,11 @@ typedef struct {
   int temporal_layers;  // number of temporal layers
   int temporal_layering_mode;
   SVC_LOG_LEVEL log_level;  // amount of information to display
-  int log_print;       // when set, printf log messages instead of returning the
-                       // message with svc_get_message
-  int output_rc_stat;  // for outputting rc stats
-  int speed;           // speed setting for codec
+  int output_rc_stat;       // for outputting rc stats
+  int speed;                // speed setting for codec
   int threads;
   int aqmode;  // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on.
+  int use_psnr;
   // private storage for vpx_svc_encode
   void *internal;
 } SvcContext;
@@ -60,6 +59,7 @@ typedef struct SvcInternal {
   double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];  // total/Y/U/V
   uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
   uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
+  int number_of_frames[VPX_SS_MAX_LAYERS];
 
   // codec encoding values
   int width;    // width of highest layer
@@ -67,11 +67,9 @@ typedef struct SvcInternal {
   int kf_dist;  // distance between keyframes
 
   // state variables
-  int psnr_pkt_received;
   int layer;
   int use_multiple_frame_contexts;
 
-  char message_buffer[2048];
   vpx_codec_ctx_t *codec_ctx;
 } SvcInternal_t;
 
@@ -106,15 +104,10 @@ void vpx_svc_release(SvcContext *svc_ctx);
 /**
  * dump accumulated statistics and reset accumulated values
  */
-const char *vpx_svc_dump_statistics(SvcContext *svc_ctx);
-
-/**
- *  get status message from previous encode
- */
-const char *vpx_svc_get_message(const SvcContext *svc_ctx);
+void vpx_svc_dump_statistics(SvcContext *svc_ctx);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_SVC_CONTEXT_H_
+#endif  // VPX_EXAMPLES_SVC_CONTEXT_H_
diff --git a/media/libvpx/libvpx/vpx/src/svc_encodeframe.c b/media/libvpx/libvpx/examples/svc_encodeframe.c
similarity index 80%
rename from media/libvpx/libvpx/vpx/src/svc_encodeframe.c
rename to media/libvpx/libvpx/examples/svc_encodeframe.c
index c2f80d8851..54843f0f0f 100644
--- a/media/libvpx/libvpx/vpx/src/svc_encodeframe.c
+++ b/media/libvpx/libvpx/examples/svc_encodeframe.c
@@ -21,8 +21,9 @@
 #include <stdlib.h>
 #include <string.h>
 #define VPX_DISABLE_CTRL_TYPECHECKS 1
+#include "../tools_common.h"
 #include "./vpx_config.h"
-#include "vpx/svc_context.h"
+#include "./svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
@@ -95,17 +96,12 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) {
   return (const SvcInternal_t *)svc_ctx->internal;
 }
 
-static void svc_log_reset(SvcContext *svc_ctx) {
-  SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal;
-  si->message_buffer[0] = '\0';
-}
-
-static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt,
-                   ...) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx,
+                                                 SVC_LOG_LEVEL level,
+                                                 const char *fmt, ...) {
   char buf[512];
   int retval = 0;
   va_list ap;
-  SvcInternal_t *const si = get_svc_internal(svc_ctx);
 
   if (level > svc_ctx->log_level) {
     return retval;
@@ -115,25 +111,17 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt,
   retval = vsnprintf(buf, sizeof(buf), fmt, ap);
   va_end(ap);
 
-  if (svc_ctx->log_print) {
-    printf("%s", buf);
-  } else {
-    strncat(si->message_buffer, buf,
-            sizeof(si->message_buffer) - strlen(si->message_buffer) - 1);
-  }
+  printf("%s", buf);
 
-  if (level == SVC_LOG_ERROR) {
-    si->codec_ctx->err_detail = si->message_buffer;
-  }
   return retval;
 }
 
 static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input,
                                       int *value0, int *value1) {
   if (type == SCALE_FACTOR) {
-    *value0 = strtol(input, &input, 10);
+    *value0 = (int)strtol(input, &input, 10);
     if (*input++ != '/') return VPX_CODEC_INVALID_PARAM;
-    *value1 = strtol(input, &input, 10);
+    *value1 = (int)strtol(input, &input, 10);
 
     if (*value0 < option_min_values[SCALE_FACTOR] ||
         *value1 < option_min_values[SCALE_FACTOR] ||
@@ -169,6 +157,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
     return VPX_CODEC_INVALID_PARAM;
 
   input_string = strdup(input);
+  if (input_string == NULL) return VPX_CODEC_MEM_ERROR;
   token = strtok_r(input_string, delim, &save_ptr);
   for (i = 0; i < num_layers; ++i) {
     if (token != NULL) {
@@ -208,6 +197,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
 
   if (options == NULL) return VPX_CODEC_OK;
   input_string = strdup(options);
+  if (input_string == NULL) return VPX_CODEC_MEM_ERROR;
 
   // parse option name
   option_name = strtok_r(input_string, "=", &input_ptr);
@@ -276,7 +266,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
   if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
             "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
-            "enabled auto alt reference frame, but % layers are enabled\n",
+            "enabled auto alt reference frame, but %d layers are enabled\n",
             REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
     res = VPX_CODEC_INVALID_PARAM;
   }
@@ -289,13 +279,13 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
   if (svc_ctx == NULL || options == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  strncpy(si->options, options, sizeof(si->options));
+  strncpy(si->options, options, sizeof(si->options) - 1);
   si->options[sizeof(si->options) - 1] = '\0';
   return VPX_CODEC_OK;
 }
 
-vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx,
-                                      vpx_codec_enc_cfg_t *const enc_cfg) {
+static vpx_codec_err_t assign_layer_bitrates(
+    const SvcContext *svc_ctx, vpx_codec_enc_cfg_t *const enc_cfg) {
   int i;
   const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
   int sl, tl, spatial_layer_target;
@@ -391,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
-  int i, sl, tl;
+  int sl, tl;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
@@ -436,10 +426,14 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
       si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2];
       si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2];
     }
+    if (svc_ctx->spatial_layers == 1) {
+      si->svc_params.scaling_factor_num[0] = 1;
+      si->svc_params.scaling_factor_den[0] = 1;
+    }
   }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
@@ -464,11 +458,11 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
 
   if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "spatial layers * temporal layers exceeds the maximum number of "
-            "allowed layers of %d\n",
-            svc_ctx->spatial_layers * svc_ctx->temporal_layers,
-            (int)VPX_MAX_LAYERS);
+    svc_log(
+        svc_ctx, SVC_LOG_ERROR,
+        "spatial layers * temporal layers (%d) exceeds the maximum number of "
+        "allowed layers of %d\n",
+        svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS);
     return VPX_CODEC_INVALID_PARAM;
   }
   res = assign_layer_bitrates(svc_ctx, enc_cfg);
@@ -481,11 +475,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     return VPX_CODEC_INVALID_PARAM;
   }
 
-#if CONFIG_SPATIAL_SVC
-  for (i = 0; i < svc_ctx->spatial_layers; ++i)
-    enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
-#endif
-
   if (svc_ctx->temporal_layers > 1) {
     int i;
     for (i = 0; i < svc_ctx->temporal_layers; ++i) {
@@ -510,14 +499,28 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     enc_cfg->rc_buf_initial_sz = 500;
     enc_cfg->rc_buf_optimal_sz = 600;
     enc_cfg->rc_buf_sz = 1000;
-    enc_cfg->rc_dropframe_thresh = 0;
+  }
+
+  for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
+    for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+      const int i = sl * svc_ctx->temporal_layers + tl;
+      if (enc_cfg->rc_end_usage == VPX_CBR &&
+          enc_cfg->g_pass == VPX_RC_ONE_PASS) {
+        si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer;
+        si->svc_params.min_quantizers[i] = enc_cfg->rc_min_quantizer;
+      }
+    }
   }
 
   if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
     enc_cfg->g_error_resilient = 1;
 
   // Initialize codec
-  res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
+  vpx_codec_flags_t flags = 0;
+  if (svc_ctx->use_psnr) {
+    flags |= VPX_CODEC_USE_PSNR;
+  }
+  res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, flags);
   if (res != VPX_CODEC_OK) {
     svc_log(svc_ctx, SVC_LOG_ERROR, "svc_enc_init error\n");
     return res;
@@ -537,100 +540,27 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                                struct vpx_image *rawimg, vpx_codec_pts_t pts,
                                int64_t duration, int deadline) {
   vpx_codec_err_t res;
-  vpx_codec_iter_t iter;
-  const vpx_codec_cx_pkt_t *cx_pkt;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
 
-  svc_log_reset(svc_ctx);
-
   res =
       vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0, deadline);
   if (res != VPX_CODEC_OK) {
     return res;
   }
-  // save compressed data
-  iter = NULL;
-  while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
-    switch (cx_pkt->kind) {
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-#if CONFIG_SPATIAL_SVC
-      case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
-        int i;
-        for (i = 0; i < svc_ctx->spatial_layers; ++i) {
-          int j;
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                  si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].psnr[0],
-                  cx_pkt->data.layer_psnr[i].psnr[1],
-                  cx_pkt->data.layer_psnr[i].psnr[2],
-                  cx_pkt->data.layer_psnr[i].psnr[3]);
-          svc_log(svc_ctx, SVC_LOG_DEBUG,
-                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
-                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                  si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].sse[0],
-                  cx_pkt->data.layer_psnr[i].sse[1],
-                  cx_pkt->data.layer_psnr[i].sse[2],
-                  cx_pkt->data.layer_psnr[i].sse[3]);
-
-          for (j = 0; j < COMPONENTS; ++j) {
-            si->psnr_sum[i][j] += cx_pkt->data.layer_psnr[i].psnr[j];
-            si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j];
-          }
-        }
-        ++si->psnr_pkt_received;
-        break;
-      }
-      case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: {
-        int i;
-        for (i = 0; i < svc_ctx->spatial_layers; ++i)
-          si->bytes_sum[i] += cx_pkt->data.layer_sizes[i];
-        break;
-      }
-#endif
-#endif
-      case VPX_CODEC_PSNR_PKT: {
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-        int j;
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->psnr_pkt_received, 0, cx_pkt->data.layer_psnr[0].psnr[0],
-                cx_pkt->data.layer_psnr[0].psnr[1],
-                cx_pkt->data.layer_psnr[0].psnr[2],
-                cx_pkt->data.layer_psnr[0].psnr[3]);
-        for (j = 0; j < COMPONENTS; ++j) {
-          si->psnr_sum[0][j] += cx_pkt->data.layer_psnr[0].psnr[j];
-          si->sse_sum[0][j] += cx_pkt->data.layer_psnr[0].sse[j];
-        }
-#endif
-      }
-        ++si->psnr_pkt_received;
-        break;
-      default: { break; }
-    }
-  }
 
   return VPX_CODEC_OK;
 }
 
-const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
-  const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-  return si->message_buffer;
-}
-
 static double calc_psnr(double d) {
   if (d == 0) return 100;
   return -10.0 * log(d) / log(10.0);
 }
 
 // dump accumulated statistics and reset accumulated values
-const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
-  int number_of_frames;
+void vpx_svc_dump_statistics(SvcContext *svc_ctx) {
   int i, j;
   uint32_t bytes_total = 0;
   double scale[COMPONENTS];
@@ -639,23 +569,20 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
   double y_scale;
 
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-
-  svc_log_reset(svc_ctx);
-
-  number_of_frames = si->psnr_pkt_received;
-  if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx);
+  if (svc_ctx == NULL || si == NULL) return;
 
   svc_log(svc_ctx, SVC_LOG_INFO, "\n");
   for (i = 0; i < svc_ctx->spatial_layers; ++i) {
     svc_log(svc_ctx, SVC_LOG_INFO,
-            "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
-            i, (double)si->psnr_sum[i][0] / number_of_frames,
-            (double)si->psnr_sum[i][1] / number_of_frames,
-            (double)si->psnr_sum[i][2] / number_of_frames,
-            (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]);
+            "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u], "
+            "Number_of_frames %d \n",
+            i, si->psnr_sum[i][0] / si->number_of_frames[i],
+            si->psnr_sum[i][1] / si->number_of_frames[i],
+            si->psnr_sum[i][2] / si->number_of_frames[i],
+            si->psnr_sum[i][3] / si->number_of_frames[i], si->bytes_sum[i],
+            si->number_of_frames[i]);
     // the following psnr calculation is deduced from ffmpeg.c#print_report
-    y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames;
+    y_scale = si->width * si->height * 255.0 * 255.0 * si->number_of_frames[i];
     scale[1] = y_scale;
     scale[2] = scale[3] = y_scale / 4;  // U or V
     scale[0] = y_scale * 1.5;           // total
@@ -672,19 +599,18 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
             mse[1], mse[2], mse[3]);
 
     bytes_total += si->bytes_sum[i];
-    // Clear sums for next time.
+  }
+  // Clear sums for next time.
+  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
     si->bytes_sum[i] = 0;
+    si->number_of_frames[i] = 0;
     for (j = 0; j < COMPONENTS; ++j) {
       si->psnr_sum[i][j] = 0;
       si->sse_sum[i][j] = 0;
     }
   }
 
-  // only display statistics once
-  si->psnr_pkt_received = 0;
-
   svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total);
-  return vpx_svc_get_message(svc_ctx);
 }
 
 void vpx_svc_release(SvcContext *svc_ctx) {
diff --git a/media/libvpx/libvpx/examples/twopass_encoder.c b/media/libvpx/libvpx/examples/twopass_encoder.c
index 4e63a7a6c9..07a10d9cf3 100644
--- a/media/libvpx/libvpx/examples/twopass_encoder.c
+++ b/media/libvpx/libvpx/examples/twopass_encoder.c
@@ -84,6 +84,7 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
       stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      if (!stats->buf) die("Failed to reallocate stats buffer.");
       memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
       stats->sz += pkt_size;
     }
@@ -128,7 +129,7 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw, FILE *infile,
   vpx_fixed_buf_t stats = { NULL, 0 };
 
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   // Calculate frame statistics.
   while (vpx_img_read(raw, infile)) {
@@ -164,7 +165,7 @@ static void pass1(vpx_image_t *raw, FILE *infile, const char *outfile_name,
   if (!writer) die("Failed to open %s for writing", outfile_name);
 
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   // Encode frames.
   while (vpx_img_read(raw, infile)) {
@@ -221,7 +222,7 @@ int main(int argc, char **argv) {
     die("Invalid frame size: %dx%d", w, h);
 
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
-    die("Failed to allocate image", w, h);
+    die("Failed to allocate image (%dx%d)", w, h);
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
diff --git a/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c b/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c
index 0b9663c777..60161da90a 100644
--- a/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -25,7 +25,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
-#include <string.h>
 #include <math.h>
 #include <assert.h>
 #include <sys/time.h>
@@ -61,7 +60,7 @@ void usage_exit(void) { exit(EXIT_FAILURE); }
 
 int (*read_frame_p)(FILE *f, vpx_image_t *img);
 
-static int read_frame(FILE *f, vpx_image_t *img) {
+static int mulres_read_frame(FILE *f, vpx_image_t *img) {
   size_t nbytes, to_read;
   int res = 1;
 
@@ -75,7 +74,7 @@ static int read_frame(FILE *f, vpx_image_t *img) {
   return res;
 }
 
-static int read_frame_by_row(FILE *f, vpx_image_t *img) {
+static int mulres_read_frame_by_row(FILE *f, vpx_image_t *img) {
   size_t nbytes, to_read;
   int res = 1;
   int plane;
@@ -151,7 +150,7 @@ static void write_ivf_frame_header(FILE *outfile,
   if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return;
 
   pts = pkt->data.frame.pts;
-  mem_put_le32(header, pkt->data.frame.sz);
+  mem_put_le32(header, (int)pkt->data.frame.sz);
   mem_put_le32(header + 4, pts & 0xFFFFFFFF);
   mem_put_le32(header + 8, pts >> 32);
 
@@ -190,7 +189,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
       cfg->ts_layer_id[0] = 0;
       cfg->ts_layer_id[1] = 1;
       // Use 60/40 bit allocation as example.
-      cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+      cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate);
       cfg->ts_target_bitrate[1] = bitrate;
 
       /* 0=L, 1=GF */
@@ -241,8 +240,8 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
       cfg->ts_layer_id[2] = 1;
       cfg->ts_layer_id[3] = 2;
       // Use 45/20/35 bit allocation as example.
-      cfg->ts_target_bitrate[0] = 0.45f * bitrate;
-      cfg->ts_target_bitrate[1] = 0.65f * bitrate;
+      cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate);
+      cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate);
       cfg->ts_target_bitrate[2] = bitrate;
 
       /* 0=L, 1=GF, 2=ARF */
@@ -294,8 +293,8 @@ int main(int argc, char **argv) {
   vpx_codec_err_t res[NUM_ENCODERS];
 
   int i;
-  long width;
-  long height;
+  int width;
+  int height;
   int length_frame;
   int frame_avail;
   int got_data;
@@ -347,12 +346,12 @@ int main(int argc, char **argv) {
 
   printf("Using %s\n", vpx_codec_iface_name(interface));
 
-  width = strtol(argv[1], NULL, 0);
-  height = strtol(argv[2], NULL, 0);
-  framerate = strtol(argv[3], NULL, 0);
+  width = (int)strtol(argv[1], NULL, 0);
+  height = (int)strtol(argv[2], NULL, 0);
+  framerate = (int)strtol(argv[3], NULL, 0);
 
   if (width < 16 || width % 2 || height < 16 || height % 2)
-    die("Invalid resolution: %ldx%ld", width, height);
+    die("Invalid resolution: %dx%d", width, height);
 
   /* Open input video file for encoding */
   if (!(infile = fopen(argv[4], "rb")))
@@ -371,15 +370,16 @@ int main(int argc, char **argv) {
 
   // Bitrates per spatial layer: overwrite default rates above.
   for (i = 0; i < NUM_ENCODERS; i++) {
-    target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
+    target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
   }
 
   // Temporal layers per spatial layers: overwrite default settings above.
   for (i = 0; i < NUM_ENCODERS; i++) {
-    num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+    num_temporal_layers[i] =
+        (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
     if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
       die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
-          num_temporal_layers);
+          num_temporal_layers[i]);
   }
 
   /* Open file to write out each spatially downsampled input stream. */
@@ -391,9 +391,9 @@ int main(int argc, char **argv) {
     downsampled_input[i] = fopen(filename, "wb");
   }
 
-  key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+  key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
 
-  show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+  show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
 
   /* Populate default encoder configuration */
   for (i = 0; i < NUM_ENCODERS; i++) {
@@ -437,7 +437,7 @@ int main(int argc, char **argv) {
 
   /* Other-resolution encoder settings */
   for (i = 1; i < NUM_ENCODERS; i++) {
-    memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
+    cfg[i] = cfg[0];
 
     cfg[i].rc_target_bitrate = target_bitrate[i];
 
@@ -467,12 +467,12 @@ int main(int argc, char **argv) {
   /* Allocate image for each encoder */
   for (i = 0; i < NUM_ENCODERS; i++)
     if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
-      die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+      die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h);
 
-  if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
-    read_frame_p = read_frame;
+  if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w)
+    read_frame_p = mulres_read_frame;
   else
-    read_frame_p = read_frame_by_row;
+    read_frame_p = mulres_read_frame_by_row;
 
   for (i = 0; i < NUM_ENCODERS; i++)
     if (outfile[i]) write_ivf_file_header(outfile[i], &cfg[i], 0);
@@ -558,7 +558,8 @@ int main(int argc, char **argv) {
         /* Write out down-sampled input. */
         length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2;
         if (fwrite(raw[i].planes[0], 1, length_frame,
-                   downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) {
+                   downsampled_input[NUM_ENCODERS - i - 1]) !=
+            (unsigned int)length_frame) {
           return EXIT_FAILURE;
         }
       }
@@ -619,10 +620,6 @@ int main(int argc, char **argv) {
             break;
           default: break;
         }
-        printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
-                       (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)
-                   ? "K"
-                   : "");
         fflush(stdout);
       }
     }
@@ -663,7 +660,6 @@ int main(int argc, char **argv) {
       write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1);
     fclose(outfile[i]);
   }
-  printf("\n");
 
   return EXIT_SUCCESS;
 }
diff --git a/media/libvpx/libvpx/examples/vp8cx_set_ref.c b/media/libvpx/libvpx/examples/vp8cx_set_ref.c
index 846477c61e..ca528f9e90 100644
--- a/media/libvpx/libvpx/examples/vp8cx_set_ref.c
+++ b/media/libvpx/libvpx/examples/vp8cx_set_ref.c
@@ -155,7 +155,7 @@ int main(int argc, char **argv) {
     die("Failed to open %s for reading.", argv[3]);
 
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   // Encode frames.
   while (vpx_img_read(&raw, infile)) {
diff --git a/media/libvpx/libvpx/examples/vp9_lossless_encoder.c b/media/libvpx/libvpx/examples/vp9_lossless_encoder.c
index cb5ca6bfe0..c4eb3a8b17 100644
--- a/media/libvpx/libvpx/examples/vp9_lossless_encoder.c
+++ b/media/libvpx/libvpx/examples/vp9_lossless_encoder.c
@@ -110,7 +110,7 @@ int main(int argc, char **argv) {
     die("Failed to open %s for reading.", argv[3]);
 
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1))
     die_codec(&codec, "Failed to use lossless mode");
diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
index 0e409387b3..16e6aba6cd 100644
--- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -14,8 +14,11 @@
  * that benefit from a scalable bitstream.
  */
 
+#include <assert.h>
+#include <limits.h>
 #include <math.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
@@ -24,14 +27,22 @@
 #include "../tools_common.h"
 #include "../video_writer.h"
 
+#include "../vpx_ports/bitops.h"
 #include "../vpx_ports/vpx_timer.h"
-#include "vpx/svc_context.h"
+#include "./svc_context.h"
 #include "vpx/vp8cx.h"
+#include "vpx/vpx_decoder.h"
 #include "vpx/vpx_encoder.h"
 #include "../vpxstats.h"
-#include "vp9/encoder/vp9_encoder.h"
+#include "./y4minput.h"
+
+#define OUTPUT_FRAME_STATS 0
 #define OUTPUT_RC_STATS 1
 
+#define SIMULCAST_MODE 0
+
+static const arg_def_t outputfile =
+    ARG_DEF("o", "output", 1, "Output filename");
 static const arg_def_t skip_frames_arg =
     ARG_DEF("s", "skip-frames", 1, "input frames to skip");
 static const arg_def_t frames_arg =
@@ -60,12 +71,6 @@ static const arg_def_t kf_dist_arg =
     ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
 static const arg_def_t scale_factors_arg =
     ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
-static const arg_def_t passes_arg =
-    ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
-static const arg_def_t pass_arg =
-    ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
-static const arg_def_t fpf_name_arg =
-    ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
 static const arg_def_t min_q_arg =
     ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
 static const arg_def_t max_q_arg =
@@ -86,6 +91,21 @@ static const arg_def_t aqmode_arg =
     ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
 static const arg_def_t bitrates_arg =
     ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]");
+static const arg_def_t dropframe_thresh_arg =
+    ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t psnr_arg =
+    ARG_DEF(NULL, "psnr", 1, "Enable PSNR computation and statistics");
+static const struct arg_enum_list tune_content_enum[] = {
+  { "default", VP9E_CONTENT_DEFAULT },
+  { "screen", VP9E_CONTENT_SCREEN },
+  { "film", VP9E_CONTENT_FILM },
+  { NULL, 0 }
+};
+
+static const arg_def_t tune_content_arg = ARG_DEF_ENUM(
+    NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+static const arg_def_t inter_layer_pred_arg = ARG_DEF(
+    NULL, "inter-layer-pred", 1, "0 - 3: On, Off, Key-frames, Constrained");
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -97,6 +117,7 @@ static const arg_def_t bitdepth_arg = ARG_DEF_ENUM(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static const arg_def_t *svc_args[] = { &frames_arg,
+                                       &outputfile,
                                        &width_arg,
                                        &height_arg,
                                        &timebase_arg,
@@ -105,9 +126,6 @@ static const arg_def_t *svc_args[] = { &frames_arg,
                                        &spatial_layers_arg,
                                        &kf_dist_arg,
                                        &scale_factors_arg,
-                                       &passes_arg,
-                                       &pass_arg,
-                                       &fpf_name_arg,
                                        &min_q_arg,
                                        &max_q_arg,
                                        &min_bitrate_arg,
@@ -127,6 +145,10 @@ static const arg_def_t *svc_args[] = { &frames_arg,
                                        &speed_arg,
                                        &rc_end_usage_arg,
                                        &bitrates_arg,
+                                       &dropframe_thresh_arg,
+                                       &tune_content_arg,
+                                       &inter_layer_pred_arg,
+                                       &psnr_arg,
                                        NULL };
 
 static const uint32_t default_frames_to_skip = 0;
@@ -145,20 +167,19 @@ static const int32_t default_speed = -1;    // -1 means use library default.
 static const uint32_t default_threads = 0;  // zero means use library default.
 
 typedef struct {
-  const char *input_filename;
   const char *output_filename;
   uint32_t frames_to_code;
   uint32_t frames_to_skip;
   struct VpxInputContext input_ctx;
   stats_io_t rc_stats;
-  int passes;
-  int pass;
+  int tune_content;
+  int inter_layer_pred;
 } AppInput;
 
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
+  fprintf(stderr, "Usage: %s <options> input_filename -o output_filename\n",
           exec_name);
   fprintf(stderr, "Options:\n");
   arg_show_usage(stderr, svc_args);
@@ -168,14 +189,11 @@ void usage_exit(void) {
 static void parse_command_line(int argc, const char **argv_,
                                AppInput *app_input, SvcContext *svc_ctx,
                                vpx_codec_enc_cfg_t *enc_cfg) {
-  struct arg arg = { 0 };
+  struct arg arg;
   char **argv = NULL;
   char **argi = NULL;
   char **argj = NULL;
   vpx_codec_err_t res;
-  int passes = 0;
-  int pass = 0;
-  const char *fpf_file_name = NULL;
   unsigned int min_bitrate = 0;
   unsigned int max_bitrate = 0;
   char string_options[1024] = { 0 };
@@ -190,6 +208,7 @@ static void parse_command_line(int argc, const char **argv_,
 #endif
   svc_ctx->speed = default_speed;
   svc_ctx->threads = default_threads;
+  svc_ctx->use_psnr = 0;
 
   // start with default encoder configuration
   res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -212,11 +231,17 @@ static void parse_command_line(int argc, const char **argv_,
 
   // process command line options
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    exit(EXIT_FAILURE);
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
     if (arg_match(&arg, &frames_arg, argi)) {
       app_input->frames_to_code = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &outputfile, argi)) {
+      app_input->output_filename = arg.val;
     } else if (arg_match(&arg, &width_arg, argi)) {
       enc_cfg->g_w = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &height_arg, argi)) {
@@ -237,6 +262,9 @@ static void parse_command_line(int argc, const char **argv_,
 #endif
     } else if (arg_match(&arg, &speed_arg, argi)) {
       svc_ctx->speed = arg_parse_uint(&arg);
+      if (svc_ctx->speed > 9) {
+        warn("Mapping speed %d to speed 9.\n", svc_ctx->speed);
+      }
     } else if (arg_match(&arg, &aqmode_arg, argi)) {
       svc_ctx->aqmode = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &threads_arg, argi)) {
@@ -251,29 +279,25 @@ static void parse_command_line(int argc, const char **argv_,
       enc_cfg->kf_min_dist = arg_parse_uint(&arg);
       enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
-      snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
-               string_options, arg.val);
+      strncat(string_options, " scale-factors=",
+              sizeof(string_options) - strlen(string_options) - 1);
+      strncat(string_options, arg.val,
+              sizeof(string_options) - strlen(string_options) - 1);
     } else if (arg_match(&arg, &bitrates_arg, argi)) {
-      snprintf(string_options, sizeof(string_options), "%s bitrates=%s",
-               string_options, arg.val);
-    } else if (arg_match(&arg, &passes_arg, argi)) {
-      passes = arg_parse_uint(&arg);
-      if (passes < 1 || passes > 2) {
-        die("Error: Invalid number of passes (%d)\n", passes);
-      }
-    } else if (arg_match(&arg, &pass_arg, argi)) {
-      pass = arg_parse_uint(&arg);
-      if (pass < 1 || pass > 2) {
-        die("Error: Invalid pass selected (%d)\n", pass);
-      }
-    } else if (arg_match(&arg, &fpf_name_arg, argi)) {
-      fpf_file_name = arg.val;
+      strncat(string_options, " bitrates=",
+              sizeof(string_options) - strlen(string_options) - 1);
+      strncat(string_options, arg.val,
+              sizeof(string_options) - strlen(string_options) - 1);
     } else if (arg_match(&arg, &min_q_arg, argi)) {
-      snprintf(string_options, sizeof(string_options), "%s min-quantizers=%s",
-               string_options, arg.val);
+      strncat(string_options, " min-quantizers=",
+              sizeof(string_options) - strlen(string_options) - 1);
+      strncat(string_options, arg.val,
+              sizeof(string_options) - strlen(string_options) - 1);
     } else if (arg_match(&arg, &max_q_arg, argi)) {
-      snprintf(string_options, sizeof(string_options), "%s max-quantizers=%s",
-               string_options, arg.val);
+      strncat(string_options, " max-quantizers=",
+              sizeof(string_options) - strlen(string_options) - 1);
+      strncat(string_options, arg.val,
+              sizeof(string_options) - strlen(string_options) - 1);
     } else if (arg_match(&arg, &min_bitrate_arg, argi)) {
       min_bitrate = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &max_bitrate_arg, argi)) {
@@ -300,9 +324,16 @@ static void parse_command_line(int argc, const char **argv_,
           break;
         default:
           die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
-          break;
       }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+    } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
+      enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &tune_content_arg, argi)) {
+      app_input->tune_content = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &inter_layer_pred_arg, argi)) {
+      app_input->inter_layer_pred = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &psnr_arg, argi)) {
+      svc_ctx->use_psnr = arg_parse_uint(&arg);
     } else {
       ++argj;
     }
@@ -312,35 +343,7 @@ static void parse_command_line(int argc, const char **argv_,
   if (strlen(string_options) > 0)
     vpx_svc_set_options(svc_ctx, string_options + 1);
 
-  if (passes == 0 || passes == 1) {
-    if (pass) {
-      fprintf(stderr, "pass is ignored since there's only one pass\n");
-    }
-    enc_cfg->g_pass = VPX_RC_ONE_PASS;
-  } else {
-    if (pass == 0) {
-      die("pass must be specified when passes is 2\n");
-    }
-
-    if (fpf_file_name == NULL) {
-      die("fpf must be specified when passes is 2\n");
-    }
-
-    if (pass == 1) {
-      enc_cfg->g_pass = VPX_RC_FIRST_PASS;
-      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) {
-        fatal("Failed to open statistics store");
-      }
-    } else {
-      enc_cfg->g_pass = VPX_RC_LAST_PASS;
-      if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) {
-        fatal("Failed to open statistics store");
-      }
-      enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats);
-    }
-    app_input->passes = passes;
-    app_input->pass = pass;
-  }
+  enc_cfg->g_pass = VPX_RC_ONE_PASS;
 
   if (enc_cfg->rc_target_bitrate > 0) {
     if (min_bitrate > 0) {
@@ -358,13 +361,20 @@ static void parse_command_line(int argc, const char **argv_,
     if (argi[0][0] == '-' && strlen(argi[0]) > 1)
       die("Error: Unrecognized option %s\n", *argi);
 
-  if (argv[0] == NULL || argv[1] == 0) {
+  if (argv[0] == NULL) {
     usage_exit();
   }
-  app_input->input_filename = argv[0];
-  app_input->output_filename = argv[1];
+  app_input->input_ctx.filename = argv[0];
   free(argv);
 
+  open_input_file(&app_input->input_ctx);
+  if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) {
+    enc_cfg->g_w = app_input->input_ctx.width;
+    enc_cfg->g_h = app_input->input_ctx.height;
+    enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator;
+    enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator;
+  }
+
   if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
       enc_cfg->g_h % 2)
     die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h);
@@ -429,8 +439,9 @@ static void set_rate_control_stats(struct RateControlStats *rc,
         rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl];
       if (tl > 0) {
         rc->layer_pfb[layer] =
-            1000.0 * (cfg->layer_target_bitrate[layer] -
-                      cfg->layer_target_bitrate[layer - 1]) /
+            1000.0 *
+            (cfg->layer_target_bitrate[layer] -
+             cfg->layer_target_bitrate[layer - 1]) /
             (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
       } else {
         rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
@@ -502,14 +513,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
   printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
          rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate),
          perc_fluctuation);
-  if (frame_cnt != tot_num_frames)
-    die("Error: Number of input frames not equal to output encoded frames != "
-        "%d tot_num_frames = %d\n",
-        frame_cnt, tot_num_frames);
+  printf("Num of input, num of encoded (super) frames: %d %d \n", frame_cnt,
+         tot_num_frames);
 }
 
-vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
-                                       uint32_t sizes[8], int *count) {
+static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
+                                              size_t data_sz, uint64_t sizes[8],
+                                              int *count) {
   // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
   // it is a super frame index. If the last byte of real video compression
   // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
@@ -561,105 +571,392 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
 // bypass/flexible mode. The pattern corresponds to the pattern
 // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
 // non-flexible mode.
-void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers,
-                                 int is_key_frame,
-                                 vpx_svc_ref_frame_config_t *ref_frame_config) {
+static void set_frame_flags_bypass_mode_ex0(
+    int tl, int num_spatial_layers, int is_key_frame,
+    vpx_svc_ref_frame_config_t *ref_frame_config) {
+  int sl;
+  for (sl = 0; sl < num_spatial_layers; ++sl)
+    ref_frame_config->update_buffer_slot[sl] = 0;
+
   for (sl = 0; sl < num_spatial_layers; ++sl) {
+    // Set the buffer idx.
+    if (tl == 0) {
+      ref_frame_config->lst_fb_idx[sl] = sl;
+      if (sl) {
+        if (is_key_frame) {
+          ref_frame_config->lst_fb_idx[sl] = sl - 1;
+          ref_frame_config->gld_fb_idx[sl] = sl;
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = sl - 1;
+        }
+      } else {
+        ref_frame_config->gld_fb_idx[sl] = 0;
+      }
+      ref_frame_config->alt_fb_idx[sl] = 0;
+    } else if (tl == 1) {
+      ref_frame_config->lst_fb_idx[sl] = sl;
+      ref_frame_config->gld_fb_idx[sl] =
+          (sl == 0) ? 0 : num_spatial_layers + sl - 1;
+      ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+    }
+    // Set the reference and update flags.
     if (!tl) {
       if (!sl) {
-        ref_frame_config->frame_flags[sl] =
-            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF |
-            VP8_EFLAG_NO_UPD_ARF;
+        // Base spatial and base temporal (sl = 0, tl = 0)
+        ref_frame_config->reference_last[sl] = 1;
+        ref_frame_config->reference_golden[sl] = 0;
+        ref_frame_config->reference_alt_ref[sl] = 0;
+        ref_frame_config->update_buffer_slot[sl] |=
+            1 << ref_frame_config->lst_fb_idx[sl];
       } else {
         if (is_key_frame) {
-          ref_frame_config->frame_flags[sl] =
-              VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
-              VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->gld_fb_idx[sl];
         } else {
-          ref_frame_config->frame_flags[sl] =
-              VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+          // Non-zero spatiall layer.
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 1;
+          ref_frame_config->reference_alt_ref[sl] = 1;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->lst_fb_idx[sl];
         }
       }
     } else if (tl == 1) {
       if (!sl) {
-        ref_frame_config->frame_flags[sl] =
-            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
-            VP8_EFLAG_NO_UPD_GF;
+        // Base spatial and top temporal (tl = 1)
+        ref_frame_config->reference_last[sl] = 1;
+        ref_frame_config->reference_golden[sl] = 0;
+        ref_frame_config->reference_alt_ref[sl] = 0;
+        ref_frame_config->update_buffer_slot[sl] |=
+            1 << ref_frame_config->alt_fb_idx[sl];
       } else {
-        ref_frame_config->frame_flags[sl] =
-            VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+        // Non-zero spatial.
+        if (sl < num_spatial_layers - 1) {
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 1;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->alt_fb_idx[sl];
+        } else if (sl == num_spatial_layers - 1) {
+          // Top spatial and top temporal (non-reference -- doesn't update any
+          // reference buffers)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 1;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+        }
       }
     }
-    if (tl == 0) {
-      ref_frame_config->lst_fb_idx[sl] = sl;
-      if (sl)
-        ref_frame_config->gld_fb_idx[sl] = sl - 1;
-      else
-        ref_frame_config->gld_fb_idx[sl] = 0;
-      ref_frame_config->alt_fb_idx[sl] = 0;
-    } else if (tl == 1) {
-      ref_frame_config->lst_fb_idx[sl] = sl;
-      ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
-      ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
-    }
   }
 }
 
+// Example pattern for 2 spatial layers and 2 temporal layers used in the
+// bypass/flexible mode, except only 1 spatial layer when temporal_layer_id = 1.
+static void set_frame_flags_bypass_mode_ex1(
+    int tl, int num_spatial_layers, int is_key_frame,
+    vpx_svc_ref_frame_config_t *ref_frame_config) {
+  int sl;
+  for (sl = 0; sl < num_spatial_layers; ++sl)
+    ref_frame_config->update_buffer_slot[sl] = 0;
+
+  if (tl == 0) {
+    if (is_key_frame) {
+      ref_frame_config->lst_fb_idx[1] = 0;
+      ref_frame_config->gld_fb_idx[1] = 1;
+    } else {
+      ref_frame_config->lst_fb_idx[1] = 1;
+      ref_frame_config->gld_fb_idx[1] = 0;
+    }
+    ref_frame_config->alt_fb_idx[1] = 0;
+
+    ref_frame_config->lst_fb_idx[0] = 0;
+    ref_frame_config->gld_fb_idx[0] = 0;
+    ref_frame_config->alt_fb_idx[0] = 0;
+  }
+  if (tl == 1) {
+    ref_frame_config->lst_fb_idx[0] = 0;
+    ref_frame_config->gld_fb_idx[0] = 1;
+    ref_frame_config->alt_fb_idx[0] = 2;
+
+    ref_frame_config->lst_fb_idx[1] = 1;
+    ref_frame_config->gld_fb_idx[1] = 2;
+    ref_frame_config->alt_fb_idx[1] = 3;
+  }
+  // Set the reference and update flags.
+  if (tl == 0) {
+    // Base spatial and base temporal (sl = 0, tl = 0)
+    ref_frame_config->reference_last[0] = 1;
+    ref_frame_config->reference_golden[0] = 0;
+    ref_frame_config->reference_alt_ref[0] = 0;
+    ref_frame_config->update_buffer_slot[0] |=
+        1 << ref_frame_config->lst_fb_idx[0];
+
+    if (is_key_frame) {
+      ref_frame_config->reference_last[1] = 1;
+      ref_frame_config->reference_golden[1] = 0;
+      ref_frame_config->reference_alt_ref[1] = 0;
+      ref_frame_config->update_buffer_slot[1] |=
+          1 << ref_frame_config->gld_fb_idx[1];
+    } else {
+      // Non-zero spatiall layer.
+      ref_frame_config->reference_last[1] = 1;
+      ref_frame_config->reference_golden[1] = 1;
+      ref_frame_config->reference_alt_ref[1] = 1;
+      ref_frame_config->update_buffer_slot[1] |=
+          1 << ref_frame_config->lst_fb_idx[1];
+    }
+  }
+  if (tl == 1) {
+    // Top spatial and top temporal (non-reference -- doesn't update any
+    // reference buffers)
+    ref_frame_config->reference_last[1] = 1;
+    ref_frame_config->reference_golden[1] = 0;
+    ref_frame_config->reference_alt_ref[1] = 0;
+  }
+}
+
+#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
+static void test_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder,
+                        const int frames_out, int *mismatch_seen) {
+  vpx_image_t enc_img, dec_img;
+  struct vp9_ref_frame ref_enc, ref_dec;
+  if (*mismatch_seen) return;
+  /* Get the internal reference frame */
+  ref_enc.idx = 0;
+  ref_dec.idx = 0;
+  vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc);
+  enc_img = ref_enc.img;
+  vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec);
+  dec_img = ref_dec.img;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
+      (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+    if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+      vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                    enc_img.d_w, enc_img.d_h, 16);
+      vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img);
+    }
+    if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+      vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+                    dec_img.d_w, dec_img.d_h, 16);
+      vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img);
+    }
+  }
+#endif
+
+  if (!compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+      find_mismatch_high(&enc_img, &dec_img, y, u, v);
+    } else {
+      find_mismatch(&enc_img, &dec_img, y, u, v);
+    }
+#else
+    find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
+    decoder->err = 1;
+    printf(
+        "Encode/decode mismatch on frame %d at"
+        " Y[%d, %d] {%d/%d},"
+        " U[%d, %d] {%d/%d},"
+        " V[%d, %d] {%d/%d}\n",
+        frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1],
+        v[2], v[3]);
+    *mismatch_seen = frames_out;
+  }
+
+  vpx_img_free(&enc_img);
+  vpx_img_free(&dec_img);
+}
+#endif
+
+#if OUTPUT_RC_STATS
+static void svc_output_rc_stats(
+    vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *enc_cfg,
+    vpx_svc_layer_id_t *layer_id, const vpx_codec_cx_pkt_t *cx_pkt,
+    struct RateControlStats *rc, VpxVideoWriter **outfile,
+    const uint32_t frame_cnt, const double framerate) {
+  int num_layers_encoded = 0;
+  unsigned int sl, tl;
+  uint64_t sizes[8];
+  uint64_t sizes_parsed[8];
+  int count = 0;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  memset(sizes, 0, sizeof(sizes));
+  memset(sizes_parsed, 0, sizeof(sizes_parsed));
+  vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id);
+  parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz,
+                         sizes_parsed, &count);
+  if (enc_cfg->ss_number_layers == 1) {
+    sizes[0] = cx_pkt->data.frame.sz;
+  } else {
+    for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
+      sizes[sl] = 0;
+      if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
+        sizes[sl] = sizes_parsed[num_layers_encoded];
+        num_layers_encoded++;
+      }
+    }
+  }
+  for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
+    unsigned int sl2;
+    uint64_t tot_size = 0;
+#if SIMULCAST_MODE
+    for (sl2 = 0; sl2 < sl; ++sl2) {
+      if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2];
+    }
+    vpx_video_writer_write_frame(outfile[sl],
+                                 (uint8_t *)(cx_pkt->data.frame.buf) + tot_size,
+                                 (size_t)(sizes[sl]), cx_pkt->data.frame.pts);
+#else
+    for (sl2 = 0; sl2 <= sl; ++sl2) {
+      if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2];
+    }
+    if (tot_size > 0)
+      vpx_video_writer_write_frame(outfile[sl], cx_pkt->data.frame.buf,
+                                   (size_t)(tot_size), cx_pkt->data.frame.pts);
+#endif  // SIMULCAST_MODE
+  }
+  for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
+    if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
+      for (tl = layer_id->temporal_layer_id; tl < enc_cfg->ts_number_layers;
+           ++tl) {
+        const int layer = sl * enc_cfg->ts_number_layers + tl;
+        ++rc->layer_tot_enc_frames[layer];
+        rc->layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
+        // Keep count of rate control stats per layer, for non-key
+        // frames.
+        if (tl == (unsigned int)layer_id->temporal_layer_id &&
+            !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
+          rc->layer_avg_frame_size[layer] += 8.0 * sizes[sl];
+          rc->layer_avg_rate_mismatch[layer] +=
+              fabs(8.0 * sizes[sl] - rc->layer_pfb[layer]) /
+              rc->layer_pfb[layer];
+          ++rc->layer_enc_frames[layer];
+        }
+      }
+    }
+  }
+
+  // Update for short-time encoding bitrate states, for moving
+  // window of size rc->window, shifted by rc->window / 2.
+  // Ignore first window segment, due to key frame.
+  if (frame_cnt > (unsigned int)rc->window_size) {
+    for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
+      if (cx_pkt->data.frame.spatial_layer_encoded[sl])
+        sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
+    }
+    if (frame_cnt % rc->window_size == 0) {
+      rc->window_count += 1;
+      rc->avg_st_encoding_bitrate += sum_bitrate / rc->window_size;
+      rc->variance_st_encoding_bitrate +=
+          (sum_bitrate / rc->window_size) * (sum_bitrate / rc->window_size);
+    }
+  }
+
+  // Second shifted window.
+  if (frame_cnt > (unsigned int)(rc->window_size + rc->window_size / 2)) {
+    for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) {
+      sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
+    }
+
+    if (frame_cnt > (unsigned int)(2 * rc->window_size) &&
+        frame_cnt % rc->window_size == 0) {
+      rc->window_count += 1;
+      rc->avg_st_encoding_bitrate += sum_bitrate2 / rc->window_size;
+      rc->variance_st_encoding_bitrate +=
+          (sum_bitrate2 / rc->window_size) * (sum_bitrate2 / rc->window_size);
+    }
+  }
+}
+#endif
+
 int main(int argc, const char **argv) {
-  AppInput app_input = { 0 };
+  AppInput app_input;
   VpxVideoWriter *writer = NULL;
-  VpxVideoInfo info = { 0 };
-  vpx_codec_ctx_t codec;
+  VpxVideoInfo info;
+  vpx_codec_ctx_t encoder;
   vpx_codec_enc_cfg_t enc_cfg;
   SvcContext svc_ctx;
+  vpx_svc_frame_drop_t svc_drop_frame;
   uint32_t i;
   uint32_t frame_cnt = 0;
   vpx_image_t raw;
   vpx_codec_err_t res;
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
-  FILE *infile = NULL;
   int end_of_stream = 0;
+#if OUTPUT_FRAME_STATS
   int frames_received = 0;
+#endif
 #if OUTPUT_RC_STATS
-  VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL };
+  VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL };
   struct RateControlStats rc;
   vpx_svc_layer_id_t layer_id;
   vpx_svc_ref_frame_config_t ref_frame_config;
-  unsigned int sl, tl;
-  double sum_bitrate = 0.0;
-  double sum_bitrate2 = 0.0;
+  unsigned int sl;
   double framerate = 30.0;
 #endif
   struct vpx_usec_timer timer;
   int64_t cx_time = 0;
+#if CONFIG_INTERNAL_STATS
+  FILE *f = fopen("opsnr.stt", "a");
+#endif
+#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
+  int mismatch_seen = 0;
+  vpx_codec_ctx_t decoder;
+#endif
   memset(&svc_ctx, 0, sizeof(svc_ctx));
-  svc_ctx.log_print = 1;
+  memset(&app_input, 0, sizeof(AppInput));
+  memset(&info, 0, sizeof(VpxVideoInfo));
+  memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t));
+  memset(&rc, 0, sizeof(struct RateControlStats));
   exec_name = argv[0];
+
+  /* Setup default input stream settings */
+  app_input.input_ctx.framerate.numerator = 30;
+  app_input.input_ctx.framerate.denominator = 1;
+  app_input.input_ctx.only_i420 = 1;
+  app_input.input_ctx.bit_depth = 0;
+
   parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);
 
+  // Y4M reader handles its own allocation.
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
 // Allocate image buffer
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420
-                                                          : VPX_IMG_FMT_I42016,
-                     enc_cfg.g_w, enc_cfg.g_h, 32)) {
-    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
-  }
+    if (!vpx_img_alloc(&raw,
+                       enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420
+                                                      : VPX_IMG_FMT_I42016,
+                       enc_cfg.g_w, enc_cfg.g_h, 32)) {
+      die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+    }
 #else
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) {
-    die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
-  }
+    if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) {
+      die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+    }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  if (!(infile = fopen(app_input.input_filename, "rb")))
-    die("Failed to open %s for reading\n", app_input.input_filename);
+  }
 
   // Initialize codec
-  if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) !=
+  if (vpx_svc_init(&svc_ctx, &encoder, vpx_codec_vp9_cx(), &enc_cfg) !=
       VPX_CODEC_OK)
     die("Failed to initialize encoder\n");
+#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
+  if (vpx_codec_dec_init(
+          &decoder, get_vpx_decoder_by_name("vp9")->codec_interface(), NULL, 0))
+    die("Failed to initialize decoder\n");
+#endif
 
 #if OUTPUT_RC_STATS
+  rc.window_count = 1;
+  rc.window_size = 15;  // Silence a static analysis warning.
+  rc.avg_st_encoding_bitrate = 0.0;
+  rc.variance_st_encoding_bitrate = 0.0;
   if (svc_ctx.output_rc_stat) {
     set_rate_control_stats(&rc, &enc_cfg);
     framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num;
@@ -667,48 +964,79 @@ int main(int argc, const char **argv) {
 #endif
 
   info.codec_fourcc = VP9_FOURCC;
+  info.frame_width = enc_cfg.g_w;
+  info.frame_height = enc_cfg.g_h;
   info.time_base.numerator = enc_cfg.g_timebase.num;
   info.time_base.denominator = enc_cfg.g_timebase.den;
 
-  if (!(app_input.passes == 2 && app_input.pass == 1)) {
-    // We don't save the bitstream for the 1st pass on two pass rate control
-    writer =
-        vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info);
-    if (!writer)
-      die("Failed to open %s for writing\n", app_input.output_filename);
-  }
+  writer =
+      vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing\n", app_input.output_filename);
+
 #if OUTPUT_RC_STATS
-  // For now, just write temporal layer streams.
-  // TODO(marpan): do spatial by re-writing superframe.
+  // Write out spatial layer stream.
+  // TODO(marpan/jianj): allow for writing each spatial and temporal stream.
   if (svc_ctx.output_rc_stat) {
-    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
+    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
       char file_name[PATH_MAX];
 
-      snprintf(file_name, sizeof(file_name), "%s_t%d.ivf",
-               app_input.output_filename, tl);
-      outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
-      if (!outfile[tl]) die("Failed to open %s for writing", file_name);
+      snprintf(file_name, sizeof(file_name), "%s_s%d.ivf",
+               app_input.output_filename, sl);
+      outfile[sl] = vpx_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[sl]) die("Failed to open %s for writing", file_name);
     }
   }
 #endif
 
   // skip initial frames
-  for (i = 0; i < app_input.frames_to_skip; ++i) vpx_img_read(&raw, infile);
+  for (i = 0; i < app_input.frames_to_skip; ++i)
+    read_frame(&app_input.input_ctx, &raw);
 
   if (svc_ctx.speed != -1)
-    vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
-  if (svc_ctx.threads)
-    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
+    vpx_codec_control(&encoder, VP8E_SET_CPUUSED, svc_ctx.speed);
+  if (svc_ctx.threads) {
+    vpx_codec_control(&encoder, VP9E_SET_TILE_COLUMNS,
+                      get_msb(svc_ctx.threads));
+    if (svc_ctx.threads > 1)
+      vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 1);
+    else
+      vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 0);
+  }
   if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
-    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+    vpx_codec_control(&encoder, VP9E_SET_AQ_MODE, 3);
   if (svc_ctx.speed >= 5)
-    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+    vpx_codec_control(&encoder, VP8E_SET_STATIC_THRESHOLD, 1);
+  vpx_codec_control(&encoder, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900);
+
+  vpx_codec_control(&encoder, VP9E_SET_SVC_INTER_LAYER_PRED,
+                    app_input.inter_layer_pred);
+
+  vpx_codec_control(&encoder, VP9E_SET_NOISE_SENSITIVITY, 0);
+
+  vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content);
+
+  vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0);
+  vpx_codec_control(&encoder, VP9E_SET_DISABLE_LOOPFILTER, 0);
+
+  svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP;
+  for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl)
+    svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh;
+  svc_drop_frame.max_consec_drop = INT_MAX;
+  vpx_codec_control(&encoder, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame);
 
   // Encode frames
   while (!end_of_stream) {
     vpx_codec_iter_t iter = NULL;
     const vpx_codec_cx_pkt_t *cx_pkt;
-    if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
+    // Example patterns for bypass/flexible mode:
+    // example_pattern = 0: 2 temporal layers, and spatial_layers = 1,2,3. Exact
+    // to fixed SVC patterns. example_pattern = 1: 2 spatial and 2 temporal
+    // layers, with SL0 only has TL0, and SL1 has both TL0 and TL1. This example
+    // uses the extended API.
+    int example_pattern = 0;
+    if (frame_cnt >= app_input.frames_to_code ||
+        !read_frame(&app_input.input_ctx, &raw)) {
       // We need one extra vpx_svc_encode call at end of stream to flush
       // encoder and get remaining data
       end_of_stream = 1;
@@ -716,149 +1044,148 @@ int main(int argc, const char **argv) {
 
     // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates)
     // and the buffer indices for each spatial layer of the current
-    // (super)frame to be encoded. The temporal layer_id for the current frame
-    // also needs to be set.
+    // (super)frame to be encoded. The spatial and temporal layer_id for the
+    // current frame also needs to be set.
     // TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS"
     // mode to "VP9E_LAYERING_MODE_BYPASS".
     if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
       layer_id.spatial_layer_id = 0;
       // Example for 2 temporal layers.
-      if (frame_cnt % 2 == 0)
+      if (frame_cnt % 2 == 0) {
         layer_id.temporal_layer_id = 0;
-      else
+        for (i = 0; i < VPX_SS_MAX_LAYERS; i++)
+          layer_id.temporal_layer_id_per_spatial[i] = 0;
+      } else {
         layer_id.temporal_layer_id = 1;
-      // Note that we only set the temporal layer_id, since we are calling
-      // the encode for the whole superframe. The encoder will internally loop
-      // over all the spatial layers for the current superframe.
-      vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
-      set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id,
-                                  svc_ctx.spatial_layers, frame_cnt == 0,
-                                  &ref_frame_config);
-      vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG,
+        for (i = 0; i < VPX_SS_MAX_LAYERS; i++)
+          layer_id.temporal_layer_id_per_spatial[i] = 1;
+      }
+      if (example_pattern == 1) {
+        // example_pattern 1 is hard-coded for 2 spatial and 2 temporal layers.
+        assert(svc_ctx.spatial_layers == 2);
+        assert(svc_ctx.temporal_layers == 2);
+        if (frame_cnt % 2 == 0) {
+          // Spatial layer 0 and 1 are encoded.
+          layer_id.temporal_layer_id_per_spatial[0] = 0;
+          layer_id.temporal_layer_id_per_spatial[1] = 0;
+          layer_id.spatial_layer_id = 0;
+        } else {
+          // Only spatial layer 1 is encoded here.
+          layer_id.temporal_layer_id_per_spatial[1] = 1;
+          layer_id.spatial_layer_id = 1;
+        }
+      }
+      vpx_codec_control(&encoder, VP9E_SET_SVC_LAYER_ID, &layer_id);
+      // TODO(jianj): Fix the parameter passing for "is_key_frame" in
+      // set_frame_flags_bypass_model() for case of periodic key frames.
+      if (example_pattern == 0) {
+        set_frame_flags_bypass_mode_ex0(layer_id.temporal_layer_id,
+                                        svc_ctx.spatial_layers, frame_cnt == 0,
+                                        &ref_frame_config);
+      } else if (example_pattern == 1) {
+        set_frame_flags_bypass_mode_ex1(layer_id.temporal_layer_id,
+                                        svc_ctx.spatial_layers, frame_cnt == 0,
+                                        &ref_frame_config);
+      }
+      ref_frame_config.duration[0] = frame_duration * 1;
+      ref_frame_config.duration[1] = frame_duration * 1;
+
+      vpx_codec_control(&encoder, VP9E_SET_SVC_REF_FRAME_CONFIG,
                         &ref_frame_config);
       // Keep track of input frames, to account for frame drops in rate control
       // stats/metrics.
-      for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) {
+      for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
         ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
                                 layer_id.temporal_layer_id];
       }
+    } else {
+      // For the fixed pattern SVC, temporal layer is given by superframe count.
+      unsigned int tl = 0;
+      if (enc_cfg.ts_number_layers == 2)
+        tl = (frame_cnt % 2 != 0);
+      else if (enc_cfg.ts_number_layers == 3) {
+        if (frame_cnt % 2 != 0) tl = 2;
+        if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) tl = 1;
+      }
+      for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl)
+        ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + tl];
     }
 
     vpx_usec_timer_start(&timer);
     res = vpx_svc_encode(
-        &svc_ctx, &codec, (end_of_stream ? NULL : &raw), pts, frame_duration,
+        &svc_ctx, &encoder, (end_of_stream ? NULL : &raw), pts, frame_duration,
         svc_ctx.speed >= 5 ? VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY);
     vpx_usec_timer_mark(&timer);
     cx_time += vpx_usec_timer_elapsed(&timer);
 
-    printf("%s", vpx_svc_get_message(&svc_ctx));
     fflush(stdout);
     if (res != VPX_CODEC_OK) {
-      die_codec(&codec, "Failed to encode frame");
+      die_codec(&encoder, "Failed to encode frame");
     }
 
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+    while ((cx_pkt = vpx_codec_get_cx_data(&encoder, &iter)) != NULL) {
       switch (cx_pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT: {
           SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal;
           if (cx_pkt->data.frame.sz > 0) {
-#if OUTPUT_RC_STATS
-            uint32_t sizes[8];
-            int count = 0;
-#endif
             vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf,
                                          cx_pkt->data.frame.sz,
                                          cx_pkt->data.frame.pts);
 #if OUTPUT_RC_STATS
-            // TODO(marpan): Put this (to line728) in separate function.
             if (svc_ctx.output_rc_stat) {
-              vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
-              parse_superframe_index(cx_pkt->data.frame.buf,
-                                     cx_pkt->data.frame.sz, sizes, &count);
-              // Note computing input_layer_frames here won't account for frame
-              // drops in rate control stats.
-              // TODO(marpan): Fix this for non-bypass mode so we can get stats
-              // for dropped frames.
-              if (svc_ctx.temporal_layering_mode !=
-                  VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
-                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                  ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
-                                          layer_id.temporal_layer_id];
-                }
-              }
-              for (tl = layer_id.temporal_layer_id;
-                   tl < enc_cfg.ts_number_layers; ++tl) {
-                vpx_video_writer_write_frame(
-                    outfile[tl], cx_pkt->data.frame.buf, cx_pkt->data.frame.sz,
-                    cx_pkt->data.frame.pts);
-              }
-
-              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                for (tl = layer_id.temporal_layer_id;
-                     tl < enc_cfg.ts_number_layers; ++tl) {
-                  const int layer = sl * enc_cfg.ts_number_layers + tl;
-                  ++rc.layer_tot_enc_frames[layer];
-                  rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
-                  // Keep count of rate control stats per layer, for non-key
-                  // frames.
-                  if (tl == (unsigned int)layer_id.temporal_layer_id &&
-                      !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
-                    rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
-                    rc.layer_avg_rate_mismatch[layer] +=
-                        fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) /
-                        rc.layer_pfb[layer];
-                    ++rc.layer_enc_frames[layer];
-                  }
-                }
-              }
-
-              // Update for short-time encoding bitrate states, for moving
-              // window of size rc->window, shifted by rc->window / 2.
-              // Ignore first window segment, due to key frame.
-              if (frame_cnt > (unsigned int)rc.window_size) {
-                tl = layer_id.temporal_layer_id;
-                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                  sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
-                }
-                if (frame_cnt % rc.window_size == 0) {
-                  rc.window_count += 1;
-                  rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
-                  rc.variance_st_encoding_bitrate +=
-                      (sum_bitrate / rc.window_size) *
-                      (sum_bitrate / rc.window_size);
-                  sum_bitrate = 0.0;
-                }
-              }
-
-              // Second shifted window.
-              if (frame_cnt >
-                  (unsigned int)(rc.window_size + rc.window_size / 2)) {
-                tl = layer_id.temporal_layer_id;
-                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                  sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
-                }
-
-                if (frame_cnt > (unsigned int)(2 * rc.window_size) &&
-                    frame_cnt % rc.window_size == 0) {
-                  rc.window_count += 1;
-                  rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
-                  rc.variance_st_encoding_bitrate +=
-                      (sum_bitrate2 / rc.window_size) *
-                      (sum_bitrate2 / rc.window_size);
-                  sum_bitrate2 = 0.0;
-                }
-              }
+              svc_output_rc_stats(&encoder, &enc_cfg, &layer_id, cx_pkt, &rc,
+                                  outfile, frame_cnt, framerate);
             }
 #endif
           }
-          /*
+#if OUTPUT_FRAME_STATS
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-          */
-          if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
-            si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
           ++frames_received;
+#endif
+          if (enc_cfg.ss_number_layers > 1) {
+            uint64_t sizes[8] = { 0 };
+            int count = 0;
+            int num_layers_encoded = 0;
+            parse_superframe_index(cx_pkt->data.frame.buf,
+                                   cx_pkt->data.frame.sz, sizes, &count);
+            for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+              if (cx_pkt->data.frame.spatial_layer_encoded[sl]) {
+                si->bytes_sum[sl] += (int)sizes[num_layers_encoded];
+                num_layers_encoded++;
+              }
+            }
+          } else {
+            si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
+          }
+#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
+          if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf,
+                               (unsigned int)cx_pkt->data.frame.sz, NULL, 0))
+            die_codec(&decoder, "Failed to decode frame.");
+          vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id);
+          // Don't look for mismatch on top spatial and top temporal layers as
+          // they are non reference frames. Don't look at frames whose top
+          // spatial layer is dropped.
+          if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) &&
+              cx_pkt->data.frame
+                  .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] &&
+              !(layer_id.temporal_layer_id > 0 &&
+                layer_id.temporal_layer_id ==
+                    (int)enc_cfg.ts_number_layers - 1)) {
+            test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen);
+          }
+#endif
+          break;
+        }
+        case VPX_CODEC_PSNR_PKT: {
+          SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal;
+          sl = cx_pkt->data.psnr.spatial_layer_id;
+          si->number_of_frames[sl]++;
+          for (int j = 0; j < 4; ++j) {
+            si->psnr_sum[sl][j] += cx_pkt->data.psnr.psnr[j];
+            si->sse_sum[sl][j] += cx_pkt->data.psnr.sse[j];
+          }
           break;
         }
         case VPX_CODEC_STATS_PKT: {
@@ -866,7 +1193,9 @@ int main(int argc, const char **argv) {
                       cx_pkt->data.twopass_stats.sz);
           break;
         }
-        default: { break; }
+        default: {
+          break;
+        }
       }
     }
 
@@ -876,41 +1205,44 @@ int main(int argc, const char **argv) {
     }
   }
 
-  // Compensate for the extra frame count for the bypass mode.
-  if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
-    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-      const int layer =
-          sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id;
-      --rc.layer_input_frames[layer];
-    }
-  }
-
   printf("Processed %d frames\n", frame_cnt);
-  fclose(infile);
+
+  close_input_file(&app_input.input_ctx);
+
 #if OUTPUT_RC_STATS
   if (svc_ctx.output_rc_stat) {
     printout_rate_control_summary(&rc, &enc_cfg, frame_cnt);
     printf("\n");
   }
 #endif
-  if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
-  if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1);
+  if (vpx_codec_destroy(&encoder))
+    die_codec(&encoder, "Failed to destroy codec");
   if (writer) {
     vpx_video_writer_close(writer);
   }
 #if OUTPUT_RC_STATS
   if (svc_ctx.output_rc_stat) {
-    for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) {
-      vpx_video_writer_close(outfile[tl]);
+    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+      vpx_video_writer_close(outfile[sl]);
     }
   }
+#endif
+#if CONFIG_INTERNAL_STATS
+  if (mismatch_seen) {
+    fprintf(f, "First mismatch occurred in frame %d\n", mismatch_seen);
+  } else {
+    fprintf(f, "No mismatch detected in recon buffers\n");
+  }
+  fclose(f);
 #endif
   printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
          frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
          1000000 * (double)frame_cnt / (double)cx_time);
-  vpx_img_free(&raw);
+  if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) {
+    vpx_img_free(&raw);
+  }
   // display average size, psnr
-  printf("%s", vpx_svc_dump_statistics(&svc_ctx));
+  vpx_svc_dump_statistics(&svc_ctx);
   vpx_svc_release(&svc_ctx);
   return EXIT_SUCCESS;
 }
diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
index 3472689db2..6e12d668b0 100644
--- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c
+++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c
@@ -60,7 +60,7 @@
 
 static const char *exec_name;
 
-void usage_exit() {
+void usage_exit(void) {
   fprintf(stderr,
           "Usage: %s <width> <height> <infile> <outfile> "
           "<frame> <limit(optional)>\n",
@@ -68,128 +68,6 @@ void usage_exit() {
   exit(EXIT_FAILURE);
 }
 
-static int compare_img(const vpx_image_t *const img1,
-                       const vpx_image_t *const img2) {
-  uint32_t l_w = img1->d_w;
-  uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
-  const uint32_t c_h =
-      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
-  uint32_t i;
-  int match = 1;
-
-  match &= (img1->fmt == img2->fmt);
-  match &= (img1->d_w == img2->d_w);
-  match &= (img1->d_h == img2->d_h);
-
-  for (i = 0; i < img1->d_h; ++i)
-    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     l_w) == 0);
-
-  for (i = 0; i < c_h; ++i)
-    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                     c_w) == 0);
-
-  for (i = 0; i < c_h; ++i)
-    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                     c_w) == 0);
-
-  return match;
-}
-
-#define mmin(a, b) ((a) < (b) ? (a) : (b))
-static void find_mismatch(const vpx_image_t *const img1,
-                          const vpx_image_t *const img2, int yloc[4],
-                          int uloc[4], int vloc[4]) {
-  const uint32_t bsize = 64;
-  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
-  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
-  const uint32_t c_w =
-      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
-  const uint32_t c_h =
-      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
-  int match = 1;
-  uint32_t i, j;
-  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
-  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
-    for (j = 0; match && j < img1->d_w; j += bsize) {
-      int k, l;
-      const int si = mmin(i + bsize, img1->d_h) - i;
-      const int sj = mmin(j + bsize, img1->d_w) - j;
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(img1->planes[VPX_PLANE_Y] +
-                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
-              *(img2->planes[VPX_PLANE_Y] +
-                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
-            yloc[0] = i + k;
-            yloc[1] = j + l;
-            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
-                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
-            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
-                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
-  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
-    for (j = 0; match && j < c_w; j += bsizex) {
-      int k, l;
-      const int si = mmin(i + bsizey, c_h - i);
-      const int sj = mmin(j + bsizex, c_w - j);
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(img1->planes[VPX_PLANE_U] +
-                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
-              *(img2->planes[VPX_PLANE_U] +
-                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
-            uloc[0] = i + k;
-            uloc[1] = j + l;
-            uloc[2] = *(img1->planes[VPX_PLANE_U] +
-                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
-            uloc[3] = *(img2->planes[VPX_PLANE_U] +
-                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
-  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
-    for (j = 0; match && j < c_w; j += bsizex) {
-      int k, l;
-      const int si = mmin(i + bsizey, c_h - i);
-      const int sj = mmin(j + bsizex, c_w - j);
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(img1->planes[VPX_PLANE_V] +
-                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
-              *(img2->planes[VPX_PLANE_V] +
-                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
-            vloc[0] = i + k;
-            vloc[1] = j + l;
-            vloc[2] = *(img1->planes[VPX_PLANE_V] +
-                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
-            vloc[3] = *(img2->planes[VPX_PLANE_V] +
-                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-}
-
 static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder,
                            unsigned int frame_out, int *mismatch_seen) {
   vpx_image_t enc_img, dec_img;
@@ -373,7 +251,7 @@ int main(int argc, char **argv) {
     die("Failed to open %s for reading.", infile_arg);
 
   if (vpx_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0))
-    die_codec(&ecodec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
 
   // Disable alt_ref.
   if (vpx_codec_control(&ecodec, VP8E_SET_ENABLEAUTOALTREF, 0))
diff --git a/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc
new file mode 100644
index 0000000000..7af31095b9
--- /dev/null
+++ b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc
@@ -0,0 +1,159 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Fuzzer for libvpx decoders
+ * ==========================
+ * Requirements
+ * --------------
+ * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker
+ * option.
+
+ * Steps to build
+ * --------------
+ * Clone libvpx repository
+   $git clone https://chromium.googlesource.com/webm/libvpx
+
+ * Create a directory in parallel to libvpx and change directory
+   $mkdir vpx_dec_fuzzer
+   $cd vpx_dec_fuzzer/
+
+ * Enable sanitizers (Supported: address integer memory thread undefined)
+   $source ../libvpx/tools/set_analyzer_env.sh address
+
+ * Configure libvpx.
+ * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid
+ * Out of memory errors when running generated fuzzer binary
+   $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \
+   --extra-cflags="-fsanitize=fuzzer-no-link \
+   -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \
+   --disable-webm-io --enable-debug --disable-vp8-encoder \
+   --disable-vp9-encoder --disable-examples
+
+ * Build libvpx
+   $make -j32
+
+ * Build vp9 fuzzer
+   $ $CXX $CXXFLAGS -std=gnu++17 -DDECODER=vp9 \
+   -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \
+   ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \
+   ./libvpx.a -Wl,--end-group
+
+ * DECODER should be defined as vp9 or vp8 to enable vp9/vp8
+ *
+ * create a corpus directory and copy some ivf files there.
+ * Based on which codec (vp8/vp9) is being tested, it is recommended to
+ * have corresponding ivf files in corpus directory
+ * Empty corpus directoy also is acceptable, though not recommended
+   $mkdir CORPUS && cp some-files CORPUS
+
+ * Run fuzzing:
+   $./vpx_dec_fuzzer_vp9 CORPUS
+
+ * References:
+ * http://llvm.org/docs/LibFuzzer.html
+ * https://github.com/google/oss-fuzz
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <memory>
+
+#include "third_party/nalloc/nalloc.h"
+#include "vpx/vp8dx.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx_ports/mem_ops.h"
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define VPXD_INTERFACE(name) VPXD_INTERFACE_(name)
+#define VPXD_INTERFACE_(name) vpx_codec_##name##_dx()
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size <= IVF_FILE_HDR_SZ) {
+    return 0;
+  }
+  nalloc_init(nullptr);
+
+  vpx_codec_ctx_t codec;
+  // Set thread count in the range [1, 64].
+  const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1;
+  vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+  vpx_codec_flags_t flags = 0;
+  if ((data[IVF_FILE_HDR_SZ] & 0x40) != 0) {
+    flags |= VPX_CODEC_USE_POSTPROC;
+  }
+  vpx_codec_err_t err =
+      vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, flags);
+  if (err == VPX_CODEC_INCAPABLE) {
+    // vpx_codec_dec_init may fail with VPX_CODEC_USE_POSTPROC
+    // if the library is configured with --disable-postproc.
+    flags = 0;
+    if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, flags)) {
+      return 0;
+    }
+  } else if (err != 0) {
+    return 0;
+  }
+
+  nalloc_start(data, size);
+
+  if (threads > 1) {
+    const int enable = (data[IVF_FILE_HDR_SZ] & 0xa0) != 0;
+    err = vpx_codec_control(&codec, VP9D_SET_LOOP_FILTER_OPT, enable);
+  }
+
+  data += IVF_FILE_HDR_SZ;
+  size -= IVF_FILE_HDR_SZ;
+
+  int frame_cnt = 0;
+  while (size > IVF_FRAME_HDR_SZ) {
+    size_t frame_size = mem_get_le32(data);
+    size -= IVF_FRAME_HDR_SZ;
+    data += IVF_FRAME_HDR_SZ;
+    frame_size = std::min(size, frame_size);
+
+    vpx_codec_stream_info_t stream_info;
+    stream_info.sz = sizeof(stream_info);
+    err = vpx_codec_peek_stream_info(VPXD_INTERFACE(DECODER), data, size,
+                                     &stream_info);
+
+    ++frame_cnt;
+    if (flags & VPX_CODEC_USE_POSTPROC) {
+      if (frame_cnt % 16 == 4) {
+        vp8_postproc_cfg_t pp = { 0, 0, 0 };
+        if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) goto fail;
+      } else if (frame_cnt % 16 == 12) {
+        vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4,
+                                  0 };
+        if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) goto fail;
+      }
+    }
+
+    err = vpx_codec_decode(&codec, data, frame_size, nullptr, 0);
+    static_cast<void>(err);
+    vpx_codec_iter_t iter = nullptr;
+    vpx_image_t *img = nullptr;
+    while ((img = vpx_codec_get_frame(&codec, &iter)) != nullptr) {
+    }
+    data += frame_size;
+    size -= frame_size;
+  }
+fail:
+  vpx_codec_destroy(&codec);
+  nalloc_end();
+  return 0;
+}
diff --git a/media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc b/media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc
new file mode 100644
index 0000000000..4b99663a1a
--- /dev/null
+++ b/media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc
@@ -0,0 +1,236 @@
+/*
+ *  Copyright (c) 2025 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Fuzzer for libvpx encoders
+ * ==========================
+ * Requirements
+ * --------------
+ * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker
+ * option.
+
+ * Steps to build
+ * --------------
+ * Clone libvpx repository
+   $git clone https://chromium.googlesource.com/webm/libvpx
+
+ * Create a directory in parallel to libvpx and change directory
+   $mkdir vpx_enc_fuzzer
+   $cd vpx_enc_fuzzer/
+
+ * Enable sanitizers (Supported: address integer memory thread undefined)
+   $source ../libvpx/tools/set_analyzer_env.sh address
+
+ * Configure libvpx.
+ * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid
+ * Out of memory errors when running generated fuzzer binary
+   $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \
+   --extra-cflags="-fsanitize=fuzzer-no-link \
+   -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \
+   --disable-webm-io --enable-debug --enable-vp8-encoder \
+   --enable-vp9-encoder --disable-examples
+
+ * Build libvpx
+   $make -j32
+
+ * Build vp9 fuzzer
+   $ $CXX $CXXFLAGS -std=gnu++17 -DENCODER=vp9 \
+   -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \
+   ../libvpx/examples/vpx_enc_fuzzer.cc -o ./vpx_enc_fuzzer_vp9 \
+   ./libvpx.a -Wl,--end-group
+
+ * ENCODER should be defined as vp9 or vp8 to enable vp9/vp8
+ *
+ * create a corpus directory and copy some ivf files there.
+ * Based on which codec (vp8/vp9) is being tested, it is recommended to
+ * have corresponding ivf files in corpus directory
+ * Empty corpus directory also is acceptable, though not recommended
+   $mkdir CORPUS && cp some-files CORPUS
+
+ * Run fuzzing:
+   $./vpx_enc_fuzzer_vp9 CORPUS
+
+ * References:
+ * http://llvm.org/docs/LibFuzzer.html
+ * https://github.com/google/oss-fuzz
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/mem_ops.h"
+#include "third_party/nalloc/nalloc.h"
+
+// fuzz header to have config options, before raw image data
+#define FUZZ_HDR_SZ 32
+
+#define VPXC_INTERFACE(name) VPXC_INTERFACE_(name)
+#define VPXC_INTERFACE_(name) vpx_codec_##name##_cx()
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+static int vpx_img_plane_width(const vpx_image_t *img, int plane) {
+  if (plane > 0 && img->x_chroma_shift > 0)
+    return (img->d_w + 1) >> img->x_chroma_shift;
+  else
+    return img->d_w;
+}
+
+static int vpx_img_plane_height(const vpx_image_t *img, int plane) {
+  if (plane > 0 && img->y_chroma_shift > 0)
+    return (img->d_h + 1) >> img->y_chroma_shift;
+  else
+    return img->d_h;
+}
+
+static int fuzz_vpx_img_read(vpx_image_t *img, const uint8_t *data,
+                             size_t size) {
+  int plane;
+  // TODO: wtc - Need to clamp the sample values so that they are in range
+  // For example, if the bit depth is 10, the sample values must be <= 1023.
+  assert(img->bit_depth == 8);
+  const size_t bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+
+  if (size == 0) return 0;
+  size_t used = 0;
+  for (plane = 0; plane < 3; ++plane) {
+    unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    int w = vpx_img_plane_width(img, plane);
+    const int h = vpx_img_plane_height(img, plane);
+    int y;
+
+    // Assuming that for nv12 we read all chroma data at once
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+    // Fixing NV12 chroma width if it is odd
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
+    for (y = 0; y < h; ++y) {
+      size_t nb = bytespp * w;
+      if (nb > size - used) {
+        nb = size - used;
+      }
+      memcpy(buf, data, nb);
+      memset(buf + nb, 0, bytespp * w - nb);
+      buf += stride;
+      data += nb;
+      used += nb;
+    }
+  }
+
+  return used;
+}
+
+static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img,
+                        int frame_index, int flags, FILE *out,
+                        vpx_enc_deadline_t quality) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res =
+      vpx_codec_encode(codec, img, frame_index, 1, flags, quality);
+  if (res != VPX_CODEC_OK) return 0;
+
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out) !=
+          pkt->data.frame.sz)
+        return 0;
+    }
+  }
+
+  return got_pkts;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size <= FUZZ_HDR_SZ) {
+    return 0;
+  }
+  nalloc_init(nullptr);
+
+  int keyframe_interval = 0;
+  int frame_count = 0;
+  vpx_codec_ctx_t codec;
+  vpx_image_t raw;
+  vpx_codec_enc_cfg_t cfg;
+  vpx_enc_deadline_t quality = VPX_DL_GOOD_QUALITY;
+
+  if ((data[0] & 0x80) != 0) {
+    keyframe_interval = 8;
+  }
+  if ((data[0] & 0x40) != 0) {
+    quality = VPX_DL_REALTIME;
+  } else if ((data[0] & 0x20) != 0) {
+    quality = VPX_DL_BEST_QUALITY;
+  }
+
+  if (vpx_codec_enc_config_default(VPXC_INTERFACE(ENCODER), &cfg, 0)) abort();
+  FILE *out = fopen("/dev/null", "wb");
+
+  switch (data[0] & 0x1F) {
+    case 0: cfg.g_w = 64; cfg.g_h = 1;
+    case 1: cfg.g_w = 1; cfg.g_h = 48;
+    case 2: cfg.g_w = 1; cfg.g_h = 1;
+    case 3: cfg.g_w = 4; cfg.g_h = 4;
+    case 4: cfg.g_w = 16; cfg.g_h = 16;
+    default: cfg.g_w = 64; cfg.g_h = 48;
+  }
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = 30;  // fps
+  cfg.rc_target_bitrate = 200;
+  cfg.g_error_resilient = 1;
+
+  if (vpx_codec_enc_init(&codec, VPXC_INTERFACE(ENCODER), &cfg, 0)) {
+    return 0;
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1)) {
+    goto fail;
+  }
+
+  nalloc_start(data, size);
+  // We may want to add more config options (for more complex encoders as seen
+  // in the examples) in the future while still maintaining the same format (so
+  // that generated corpus is still valid). So we reserve FUZZ_HDR_SZ=32 bytes
+  // for this even if we just use one byte so far.
+  data += FUZZ_HDR_SZ;
+  size -= FUZZ_HDR_SZ;
+
+  // Encode frames.
+  while (1) {
+    int flags = 0;
+    size_t size_read = fuzz_vpx_img_read(&raw, data, size);
+    if (size_read == 0) break;
+    data += size_read;
+    size -= size_read;
+    if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
+      flags |= VPX_EFLAG_FORCE_KF;
+    encode_frame(&codec, &raw, frame_count++, flags, out, quality);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, out, quality)) {
+  }
+
+fail:
+  nalloc_end();
+  vpx_img_free(&raw);
+  vpx_codec_destroy(&codec);
+  fclose(out);
+  return 0;
+}
diff --git a/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c b/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c
index b906980835..01badbe0cf 100644
--- a/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -19,24 +19,38 @@
 #include <string.h>
 
 #include "./vpx_config.h"
+#include "./y4minput.h"
 #include "../vpx_ports/vpx_timer.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx_ports/bitops.h"
 
 #include "../tools_common.h"
 #include "../video_writer.h"
 
+#define ROI_MAP 0
+
+#define zero(Dest) memset(&(Dest), 0, sizeof(Dest))
+
 static const char *exec_name;
 
 void usage_exit(void) { exit(EXIT_FAILURE); }
 
-// Denoiser states, for temporal denoising.
-enum denoiserState {
-  kDenoiserOff,
-  kDenoiserOnYOnly,
-  kDenoiserOnYUV,
-  kDenoiserOnYUVAggressive,
-  kDenoiserOnAdaptive
+// Denoiser states for vp8, for temporal denoising.
+enum denoiserStateVp8 {
+  kVp8DenoiserOff,
+  kVp8DenoiserOnYOnly,
+  kVp8DenoiserOnYUV,
+  kVp8DenoiserOnYUVAggressive,
+  kVp8DenoiserOnAdaptive
+};
+
+// Denoiser states for vp9, for temporal denoising.
+enum denoiserStateVp9 {
+  kVp9DenoiserOff,
+  kVp9DenoiserOnYOnly,
+  // For SVC: denoise the top two spatial layers.
+  kVp9DenoiserOnYTwoSpatialLayers
 };
 
 static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 };
@@ -79,19 +93,21 @@ struct RateControlMetrics {
 // in the stream.
 static void set_rate_control_metrics(struct RateControlMetrics *rc,
                                      vpx_codec_enc_cfg_t *cfg) {
-  unsigned int i = 0;
+  int i = 0;
   // Set the layer (cumulative) framerate and the target layer (non-cumulative)
   // per-frame-bandwidth, for the rate control encoding stats below.
   const double framerate = cfg->g_timebase.den / cfg->g_timebase.num;
+  const int ts_number_layers = cfg->ts_number_layers;
   rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0];
   rc->layer_pfb[0] =
       1000.0 * rc->layer_target_bitrate[0] / rc->layer_framerate[0];
-  for (i = 0; i < cfg->ts_number_layers; ++i) {
+  for (i = 0; i < ts_number_layers; ++i) {
     if (i > 0) {
       rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i];
-      rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] -
-                                   rc->layer_target_bitrate[i - 1]) /
-                         (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
+      rc->layer_pfb[i] =
+          1000.0 *
+          (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) /
+          (rc->layer_framerate[i] - rc->layer_framerate[i - 1]);
     }
     rc->layer_input_frames[i] = 0;
     rc->layer_enc_frames[i] = 0;
@@ -104,6 +120,9 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
   rc->window_size = 15;
   rc->avg_st_encoding_bitrate = 0.0;
   rc->variance_st_encoding_bitrate = 0.0;
+  // Target bandwidth for the whole stream.
+  // Set to layer_target_bitrate for highest layer (total bitrate).
+  cfg->rc_target_bitrate = rc->layer_target_bitrate[ts_number_layers - 1];
 }
 
 static void printout_rate_control_summary(struct RateControlMetrics *rc,
@@ -154,6 +173,107 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
     die("Error: Number of input frames not equal to output! \n");
 }
 
+#if ROI_MAP
+static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg,
+                        vpx_roi_map_t *roi) {
+  unsigned int i, j;
+  int block_size = 0;
+  uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0;
+  uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0;
+  if (!is_vp8 && !is_vp9) {
+    die("unsupported codec.");
+  }
+  zero(*roi);
+
+  block_size = is_vp9 && !is_vp8 ? 8 : 16;
+
+  // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for
+  // segment is 16x16 for vp8, 8x8 for vp9.
+  roi->rows = (cfg->g_h + block_size - 1) / block_size;
+  roi->cols = (cfg->g_w + block_size - 1) / block_size;
+
+  // Applies delta QP on the segment blocks, varies from -63 to 63.
+  // Setting to negative means lower QP (better quality).
+  // Below we set delta_q to the extreme (-63) to show strong effect.
+  // VP8 uses the first 4 segments. VP9 uses all 8 segments.
+  zero(roi->delta_q);
+  roi->delta_q[1] = -63;
+
+  // Applies delta loopfilter strength on the segment blocks, varies from -63 to
+  // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4
+  // segments. VP9 uses all 8 segments.
+  zero(roi->delta_lf);
+
+  if (is_vp8) {
+    // Applies skip encoding threshold on the segment blocks, varies from 0 to
+    // UINT_MAX. Larger value means more skipping of encoding is possible.
+    // This skip threshold only applies on delta frames.
+    zero(roi->static_threshold);
+  }
+
+  if (is_vp9) {
+    // Apply skip segment. Setting to 1 means this block will be copied from
+    // previous frame.
+    zero(roi->skip);
+  }
+
+  if (is_vp9) {
+    // Apply ref frame segment.
+    // -1 : Do not apply this segment.
+    //  0 : Froce using intra.
+    //  1 : Force using last.
+    //  2 : Force using golden.
+    //  3 : Force using alfref but not used in non-rd pickmode for 0 lag.
+    memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
+    roi->ref_frame[1] = 1;
+  }
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi->roi_map =
+      (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+  for (i = 0; i < roi->rows; ++i) {
+    for (j = 0; j < roi->cols; ++j) {
+      if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) &&
+          j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) {
+        roi->roi_map[i * roi->cols + j] = 1;
+      }
+    }
+  }
+}
+
+static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi,
+                             int *skip_map, int *prev_mask_map, int frame_num) {
+  const int block_size = 8;
+  unsigned int i, j;
+  roi->rows = (cfg->g_h + block_size - 1) / block_size;
+  roi->cols = (cfg->g_w + block_size - 1) / block_size;
+  zero(roi->skip);
+  zero(roi->delta_q);
+  zero(roi->delta_lf);
+  memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
+  roi->ref_frame[1] = 1;
+  // Use segment 3 for skip.
+  roi->skip[3] = 1;
+  roi->roi_map =
+      (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+  for (i = 0; i < roi->rows; ++i) {
+    for (j = 0; j < roi->cols; ++j) {
+      const int idx = i * roi->cols + j;
+      // Use segment 3 for skip.
+      // prev_mask_map keeps track of blocks that have been stably on segment 3
+      // for the past 10 frames. Only skip when the block is on segment 3 in
+      // both current map and prev_mask_map.
+      if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3;
+      // Reset it every 10 frames so it doesn't propagate for too many frames.
+      if (frame_num % 10 == 0)
+        prev_mask_map[idx] = skip_map[idx];
+      else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0)
+        prev_mask_map[idx] = 0;
+    }
+  }
+}
+#endif
+
 // Temporal scaling parameters:
 // NOTE: The 3 prediction frames cannot be used interchangeably due to
 // differences in the way they are handled throughout the code. The
@@ -486,6 +606,28 @@ static void set_temporal_layer_pattern(int layering_mode,
   }
 }
 
+#if ROI_MAP
+static int read_mask(FILE *mask_file, int *seg_map, int allowed_mask_rows,
+                     int allowed_mask_cols) {
+  int mask_rows, mask_cols, i, j;
+  int *map_start = seg_map;
+  if (fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows) != 2) return 0;
+  if (mask_rows != allowed_mask_rows || mask_cols != allowed_mask_cols) {
+    return 0;
+  }
+  for (i = 0; i < mask_rows; i++) {
+    for (j = 0; j < mask_cols; j++) {
+      if (fscanf(mask_file, "%d ", &seg_map[j]) != 1) return 0;
+      // reverse the bit
+      seg_map[j] = 1 - seg_map[j];
+    }
+    seg_map += mask_cols;
+  }
+  seg_map = map_start;
+  return 1;
+}
+#endif
+
 int main(int argc, char **argv) {
   VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL };
   vpx_codec_ctx_t codec;
@@ -495,6 +637,7 @@ int main(int argc, char **argv) {
   vpx_codec_err_t res;
   unsigned int width;
   unsigned int height;
+  uint32_t error_resilient = 0;
   int speed;
   int frame_avail;
   int got_data;
@@ -505,16 +648,15 @@ int main(int argc, char **argv) {
   int layering_mode = 0;
   int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 };
   int flag_periodicity = 1;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
-  vpx_svc_layer_id_t layer_id = { 0, 0 };
-#else
-  vpx_svc_layer_id_t layer_id = { 0 };
+#if ROI_MAP
+  vpx_roi_map_t roi;
 #endif
+  vpx_svc_layer_id_t layer_id;
   const VpxInterface *encoder = NULL;
-  FILE *infile = NULL;
+  struct VpxInputContext input_ctx;
   struct RateControlMetrics rc;
   int64_t cx_time = 0;
-  const int min_args_base = 12;
+  const int min_args_base = 13;
 #if CONFIG_VP9_HIGHBITDEPTH
   vpx_bit_depth_t bit_depth = VPX_BITS_8;
   int input_bit_depth = 8;
@@ -525,18 +667,36 @@ int main(int argc, char **argv) {
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
+#if ROI_MAP
+  FILE *mask_file = NULL;
+  int block_size = 8;
+  int mask_rows = 0;
+  int mask_cols = 0;
+  int *mask_map;
+  int *prev_mask_map;
+#endif
+  zero(rc.layer_target_bitrate);
+  memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t));
+  memset(&input_ctx, 0, sizeof(input_ctx));
+  /* Setup default input stream settings */
+  input_ctx.framerate.numerator = 30;
+  input_ctx.framerate.denominator = 1;
+  input_ctx.only_i420 = 1;
+  input_ctx.bit_depth = 0;
 
   exec_name = argv[0];
   // Check usage and arguments.
   if (argc < min_args) {
 #if CONFIG_VP9_HIGHBITDEPTH
     die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
-        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <threads> <mode> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
         "<Rate_0> ... <Rate_nlayers-1> <bit-depth> \n",
         argv[0]);
 #else
     die("Usage: %s <infile> <outfile> <codec_type(vp8/vp9)> <width> <height> "
-        "<rate_num> <rate_den> <speed> <frame_drop_threshold> <threads> <mode> "
+        "<rate_num> <rate_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
         "<Rate_0> ... <Rate_nlayers-1> \n",
         argv[0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -553,14 +713,23 @@ int main(int argc, char **argv) {
     die("Invalid resolution: %d x %d", width, height);
   }
 
-  layering_mode = (int)strtol(argv[11], NULL, 0);
+  layering_mode = (int)strtol(argv[12], NULL, 0);
   if (layering_mode < 0 || layering_mode > 13) {
-    die("Invalid layering mode (0..12) %s", argv[11]);
+    die("Invalid layering mode (0..12) %s", argv[12]);
   }
 
+#if ROI_MAP
+  if (argc != min_args + mode_to_num_layers[layering_mode] + 1) {
+    die("Invalid number of arguments");
+  }
+#else
   if (argc != min_args + mode_to_num_layers[layering_mode]) {
     die("Invalid number of arguments");
   }
+#endif
+
+  input_ctx.filename = argv[1];
+  open_input_file(&input_ctx);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   switch (strtol(argv[argc - 1], NULL, 0)) {
@@ -578,14 +747,22 @@ int main(int argc, char **argv) {
       break;
     default: die("Invalid bit depth (8, 10, 12) %s", argv[argc - 1]);
   }
-  if (!vpx_img_alloc(
-          &raw, bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
-          width, height, 32)) {
-    die("Failed to allocate image", width, height);
+
+  // Y4M reader has its own allocation.
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    if (!vpx_img_alloc(
+            &raw,
+            bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
+            width, height, 32)) {
+      die("Failed to allocate image (%dx%d)", width, height);
+    }
   }
 #else
-  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
-    die("Failed to allocate image", width, height);
+  // Y4M reader has its own allocation.
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
+      die("Failed to allocate image (%dx%d)", width, height);
+    }
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -616,14 +793,17 @@ int main(int argc, char **argv) {
   if (speed < 0) {
     die("Invalid speed setting: must be positive");
   }
+  if (strncmp(encoder->name, "vp9", 3) == 0 && speed > 9) {
+    warn("Mapping speed %d to speed 9.\n", speed);
+  }
 
   for (i = min_args_base;
        (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
-    rc.layer_target_bitrate[i - 12] = (int)strtol(argv[i], NULL, 0);
+    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
     if (strncmp(encoder->name, "vp8", 3) == 0)
-      cfg.ts_target_bitrate[i - 12] = rc.layer_target_bitrate[i - 12];
+      cfg.ts_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
     else if (strncmp(encoder->name, "vp9", 3) == 0)
-      cfg.layer_target_bitrate[i - 12] = rc.layer_target_bitrate[i - 12];
+      cfg.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
   }
 
   // Real time parameters.
@@ -634,7 +814,7 @@ int main(int argc, char **argv) {
   if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52;
   cfg.rc_undershoot_pct = 50;
   cfg.rc_overshoot_pct = 50;
-  cfg.rc_buf_initial_sz = 500;
+  cfg.rc_buf_initial_sz = 600;
   cfg.rc_buf_optimal_sz = 600;
   cfg.rc_buf_sz = 1000;
 
@@ -642,10 +822,14 @@ int main(int argc, char **argv) {
   cfg.rc_resize_allowed = 0;
 
   // Use 1 thread as default.
-  cfg.g_threads = (unsigned int)strtoul(argv[10], NULL, 0);
+  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);
 
+  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
+  if (error_resilient != 0 && error_resilient != 1) {
+    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
+  }
   // Enable error resilient mode.
-  cfg.g_error_resilient = 1;
+  cfg.g_error_resilient = error_resilient;
   cfg.g_lag_in_frames = 0;
   cfg.kf_mode = VPX_KF_AUTO;
 
@@ -659,13 +843,15 @@ int main(int argc, char **argv) {
 
   set_rate_control_metrics(&rc, &cfg);
 
-  // Target bandwidth for the whole stream.
-  // Set to layer_target_bitrate for highest layer (total bitrate).
-  cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1];
-
-  // Open input file.
-  if (!(infile = fopen(argv[1], "rb"))) {
-    die("Failed to open %s for reading", argv[1]);
+  if (input_ctx.file_type == FILE_TYPE_Y4M) {
+    if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) {
+      die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h);
+    }
+    if (input_ctx.framerate.numerator != cfg.g_timebase.den ||
+        input_ctx.framerate.denominator != cfg.g_timebase.num) {
+      die("Incorrect framerate: numerator %d denominator %d",
+          cfg.g_timebase.num, cfg.g_timebase.den);
+    }
   }
 
   framerate = cfg.g_timebase.den / cfg.g_timebase.num;
@@ -696,25 +882,45 @@ int main(int argc, char **argv) {
 #else
   if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    die_codec(&codec, "Failed to initialize encoder");
+    die("Failed to initialize encoder");
+
+#if ROI_MAP
+  mask_rows = (cfg.g_h + block_size - 1) / block_size;
+  mask_cols = (cfg.g_w + block_size - 1) / block_size;
+  mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map));
+  prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map));
+#endif
 
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
     vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
     vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0);
+#if ROI_MAP
+    set_roi_map(encoder->name, &cfg, &roi);
+    if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
+      die_codec(&codec, "Failed to set ROI map");
+#endif
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
     vpx_svc_extra_cfg_t svc_params;
     memset(&svc_params, 0, sizeof(svc_params));
+    vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0);
+    vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0);
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
     vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0);
     vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0);
     vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff);
     vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
     vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
-    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads));
+    vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0);
+
+    if (cfg.g_threads > 1)
+      vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
+    else
+      vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0);
     if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1 : 0))
       die_codec(&codec, "Failed to set SVC");
     for (i = 0; i < cfg.ts_number_layers; ++i) {
@@ -733,7 +939,7 @@ int main(int argc, char **argv) {
   // For generating smaller key frames, use a smaller max_intra_size_pct
   // value, like 100 or 200.
   {
-    const int max_intra_size_pct = 900;
+    const int max_intra_size_pct = 1000;
     vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
                       max_intra_size_pct);
   }
@@ -743,12 +949,14 @@ int main(int argc, char **argv) {
     struct vpx_usec_timer timer;
     vpx_codec_iter_t iter = NULL;
     const vpx_codec_cx_pkt_t *pkt;
-#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+#if ROI_MAP
+    char mask_file_name[255];
+#endif
     // Update the temporal layer_id. No spatial layers in this test.
     layer_id.spatial_layer_id = 0;
-#endif
     layer_id.temporal_layer_id =
         cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
+    layer_id.temporal_layer_id_per_spatial[0] = layer_id.temporal_layer_id;
     if (strncmp(encoder->name, "vp9", 3) == 0) {
       vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
     } else if (strncmp(encoder->name, "vp8", 3) == 0) {
@@ -757,7 +965,24 @@ int main(int argc, char **argv) {
     }
     flags = layer_flags[frame_cnt % flag_periodicity];
     if (layering_mode == 0) flags = 0;
-    frame_avail = vpx_img_read(&raw, infile);
+#if ROI_MAP
+    snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt",
+             argv[argc - 1], frame_cnt);
+    mask_file = fopen(mask_file_name, "r");
+    if (mask_file != NULL) {
+      int mask_is_valid = read_mask(mask_file, mask_map, mask_rows, mask_cols);
+      fclose(mask_file);
+      if (mask_is_valid) {
+        // set_roi_map(encoder->name, &cfg, &roi);
+        set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt);
+        if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
+          die_codec(&codec, "Failed to set ROI map");
+      } else {
+        die_codec(&codec, "Mask input is invalid for ROI map");
+      }
+    }
+#endif
+    frame_avail = read_frame(&input_ctx, &raw);
     if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id];
     vpx_usec_timer_start(&timer);
     if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags,
@@ -794,6 +1019,7 @@ int main(int argc, char **argv) {
           // Update for short-time encoding bitrate states, for moving window
           // of size rc->window, shifted by rc->window / 2.
           // Ignore first window segment, due to key frame.
+          if (rc.window_size == 0) rc.window_size = 15;
           if (frame_cnt > rc.window_size) {
             sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
             if (frame_cnt % rc.window_size == 0) {
@@ -825,7 +1051,11 @@ int main(int argc, char **argv) {
     ++frame_cnt;
     pts += frame_duration;
   }
-  fclose(infile);
+#if ROI_MAP
+  free(mask_map);
+  free(prev_mask_map);
+#endif
+  close_input_file(&input_ctx);
   printout_rate_control_summary(&rc, &cfg, frame_cnt);
   printf("\n");
   printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
@@ -837,6 +1067,12 @@ int main(int argc, char **argv) {
   // Try to rewrite the output file headers with the actual frame count.
   for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]);
 
-  vpx_img_free(&raw);
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    vpx_img_free(&raw);
+  }
+
+#if ROI_MAP
+  free(roi.roi_map);
+#endif
   return EXIT_SUCCESS;
 }
diff --git a/media/libvpx/libvpx/ivfdec.c b/media/libvpx/libvpx/ivfdec.c
index f64e594ab0..3e179bc6ed 100644
--- a/media/libvpx/libvpx/ivfdec.c
+++ b/media/libvpx/libvpx/ivfdec.c
@@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
   size_t frame_size = 0;
 
   if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile)) warn("Failed to read frame size\n");
+    if (!feof(infile)) warn("Failed to read frame size");
   } else {
     frame_size = mem_get_le32(raw_header);
 
     if (frame_size > 256 * 1024 * 1024) {
-      warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+      warn("Read invalid frame size (%u)", (unsigned int)frame_size);
       frame_size = 0;
     }
 
@@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
         *buffer = new_buffer;
         *buffer_size = 2 * frame_size;
       } else {
-        warn("Failed to allocate compressed data buffer\n");
+        warn("Failed to allocate compressed data buffer");
         frame_size = 0;
       }
     }
@@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
 
   if (!feof(infile)) {
     if (fread(*buffer, 1, frame_size, infile) != frame_size) {
-      warn("Failed to read full frame\n");
+      warn("Failed to read full frame");
       return 1;
     }
 
diff --git a/media/libvpx/libvpx/ivfdec.h b/media/libvpx/libvpx/ivfdec.h
index af725572b4..847cd79f3f 100644
--- a/media/libvpx/libvpx/ivfdec.h
+++ b/media/libvpx/libvpx/ivfdec.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef IVFDEC_H_
-#define IVFDEC_H_
+#ifndef VPX_IVFDEC_H_
+#define VPX_IVFDEC_H_
 
 #include "./tools_common.h"
 
@@ -25,4 +25,4 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
 } /* extern "C" */
 #endif
 
-#endif  // IVFDEC_H_
+#endif  // VPX_IVFDEC_H_
diff --git a/media/libvpx/libvpx/ivfenc.c b/media/libvpx/libvpx/ivfenc.c
index a50d31839d..2e8e04283a 100644
--- a/media/libvpx/libvpx/ivfenc.c
+++ b/media/libvpx/libvpx/ivfenc.c
@@ -13,27 +13,35 @@
 #include "vpx/vpx_encoder.h"
 #include "vpx_ports/mem_ops.h"
 
-void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg,
-                           unsigned int fourcc, int frame_cnt) {
+void ivf_write_file_header_with_video_info(FILE *outfile, unsigned int fourcc,
+                                           int frame_cnt, int frame_width,
+                                           int frame_height,
+                                           vpx_rational_t timebase) {
   char header[32];
 
   header[0] = 'D';
   header[1] = 'K';
   header[2] = 'I';
   header[3] = 'F';
-  mem_put_le16(header + 4, 0);                     // version
-  mem_put_le16(header + 6, 32);                    // header size
-  mem_put_le32(header + 8, fourcc);                // fourcc
-  mem_put_le16(header + 12, cfg->g_w);             // width
-  mem_put_le16(header + 14, cfg->g_h);             // height
-  mem_put_le32(header + 16, cfg->g_timebase.den);  // rate
-  mem_put_le32(header + 20, cfg->g_timebase.num);  // scale
-  mem_put_le32(header + 24, frame_cnt);            // length
-  mem_put_le32(header + 28, 0);                    // unused
+  mem_put_le16(header + 4, 0);              // version
+  mem_put_le16(header + 6, 32);             // header size
+  mem_put_le32(header + 8, fourcc);         // fourcc
+  mem_put_le16(header + 12, frame_width);   // width
+  mem_put_le16(header + 14, frame_height);  // height
+  mem_put_le32(header + 16, timebase.den);  // rate
+  mem_put_le32(header + 20, timebase.num);  // scale
+  mem_put_le32(header + 24, frame_cnt);     // length
+  mem_put_le32(header + 28, 0);             // unused
 
   fwrite(header, 1, 32, outfile);
 }
 
+void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg,
+                           unsigned int fourcc, int frame_cnt) {
+  ivf_write_file_header_with_video_info(outfile, fourcc, frame_cnt, cfg->g_w,
+                                        cfg->g_h, cfg->g_timebase);
+}
+
 void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
   char header[12];
 
diff --git a/media/libvpx/libvpx/ivfenc.h b/media/libvpx/libvpx/ivfenc.h
index ebdce47be8..27b6910805 100644
--- a/media/libvpx/libvpx/ivfenc.h
+++ b/media/libvpx/libvpx/ivfenc.h
@@ -7,11 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef IVFENC_H_
-#define IVFENC_H_
+#ifndef VPX_IVFENC_H_
+#define VPX_IVFENC_H_
 
 #include "./tools_common.h"
 
+#include "vpx/vpx_encoder.h"
+
 struct vpx_codec_enc_cfg;
 struct vpx_codec_cx_pkt;
 
@@ -19,6 +21,11 @@ struct vpx_codec_cx_pkt;
 extern "C" {
 #endif
 
+void ivf_write_file_header_with_video_info(FILE *outfile, unsigned int fourcc,
+                                           int frame_cnt, int frame_width,
+                                           int frame_height,
+                                           vpx_rational_t timebase);
+
 void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg,
                            uint32_t fourcc, int frame_cnt);
 
@@ -30,4 +37,4 @@ void ivf_write_frame_size(FILE *outfile, size_t frame_size);
 } /* extern "C" */
 #endif
 
-#endif  // IVFENC_H_
+#endif  // VPX_IVFENC_H_
diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template
index 5a8f847280..6d05162d00 100644
--- a/media/libvpx/libvpx/libs.doxy_template
+++ b/media/libvpx/libvpx/libs.doxy_template
@@ -654,12 +654,6 @@ VERBATIM_HEADERS       = YES
 
 ALPHABETICAL_INDEX     = NO
 
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all
 # classes will be put under the same header in the alphabetical index.
 # The IGNORE_PREFIX tag can be used to specify one or more prefixes that
@@ -943,18 +937,6 @@ GENERATE_XML           = NO
 
 XML_OUTPUT             = xml
 
-# The XML_SCHEMA tag can be used to specify an XML schema,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify an XML DTD,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_DTD                =
-
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting
 # and cross-referencing information) to the XML output. Note that
@@ -1111,32 +1093,10 @@ ALLEXTERNALS           = NO
 
 EXTERNAL_GROUPS        = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH              = /usr/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
-# powerful graphs.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
-# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
-# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
-# be found in the default search path.
-
-MSCGEN_PATH            =
-
 # If set to YES, the inheritance and collaboration graphs will hide
 # inheritance and usage relations if the target is undocumented
 # or is not a class.
@@ -1150,10 +1110,14 @@ HIDE_UNDOC_RELATIONS   = YES
 
 HAVE_DOT               = NO
 
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
 
 CLASS_GRAPH            = YES
 
@@ -1259,14 +1223,6 @@ DOT_GRAPH_MAX_NODES    = 50
 
 MAX_DOT_GRAPH_DEPTH    = 0
 
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, which results in a white background.
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-
-DOT_TRANSPARENT        = YES
-
 # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
 # files in one run (i.e. multiple -o and -T options on the command line). This
 # makes dot run faster, but since only newer versions of dot (>1.8.10)
diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk
index 36935bd1e6..483da0fdd2 100644
--- a/media/libvpx/libvpx/libs.mk
+++ b/media/libvpx/libvpx/libs.mk
@@ -11,7 +11,7 @@
 
 # ARM assembly files are written in RVCT-style. We use some make magic to
 # filter those files to allow GCC compilation
-ifeq ($(ARCH_ARM),yes)
+ifeq ($(VPX_ARCH_ARM),yes)
   ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm)
 else
   ASM:=.asm
@@ -63,6 +63,7 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
   CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+  INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
   CODEC_DOC_SECTIONS += vp8 vp8_encoder
 endif
@@ -87,13 +88,35 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
   CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
   CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
   CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
+  CODEC_SRCS-yes += vpx/vpx_ext_ratectrl.h
   INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
-  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
+  INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
+  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h
   CODEC_DOC_SECTIONS += vp9 vp9_encoder
 endif
 
+RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h
+RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h
+RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk
+  RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc
+  RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h
+endif
+ifeq ($(CONFIG_VP8_ENCODER),yes)
+  VP8_PREFIX=vp8/
+  RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+endif
+
 ifeq ($(CONFIG_VP9_DECODER),yes)
   VP9_PREFIX=vp9/
   include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk
@@ -113,16 +136,10 @@ ifeq ($(CONFIG_DECODERS),yes)
   CODEC_DOC_SECTIONS += decoder
 endif
 
-# Suppress -Wextra warnings in third party code.
-$(BUILD_PFX)third_party/googletest/%.cc.o: CXXFLAGS += -Wno-missing-field-initializers
-# Suppress -Wextra warnings in first party code pending investigation.
-# https://bugs.chromium.org/p/webm/issues/detail?id=1069
-$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered
-$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered
-
 ifeq ($(CONFIG_MSVS),yes)
 CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
 GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
+RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd)
 # This variable uses deferred expansion intentionally, since the results of
 # $(wildcard) may change during the course of the Make.
 VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
@@ -147,14 +164,12 @@ CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h
 CODEC_SRCS-yes += vpx_ports/vpx_once.h
 CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
-ifeq ($(CONFIG_SPATIAL_SVC),yes)
-CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc
-endif
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
 
 INSTALL-LIBS-yes += include/vpx/vpx_codec.h
@@ -163,19 +178,25 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h
 INSTALL-LIBS-yes += include/vpx/vpx_integer.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 INSTALL-LIBS-yes                  += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
+ifeq ($(CONFIG_STATIC),yes)
 INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB)d.lib)
+endif
 INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.dll)
 INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.exp)
 endif
 else
 INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a
+ifeq ($(CONFIG_STATIC),yes)
 INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
 endif
+endif
 
 CODEC_SRCS=$(call enabled,CODEC_SRCS)
+
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
 
@@ -187,6 +208,18 @@ libvpx_srcs.txt:
 	@echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
 CLEAN-OBJS += libvpx_srcs.txt
 
+libvpxrc_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(RC_RTC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
+CLEAN-OBJS += libvpxrc_srcs.txt
+
+# Assembly files that are included, but don't define symbols themselves.
+# Filtered out to avoid Windows build warnings.
+ASM_INCLUDES := \
+    third_party/x86inc/x86inc.asm \
+    vpx_config.asm \
+    vpx_ports/x86_abi_support.asm \
+    vpx_dsp/x86/bitdepth_conversion_sse2.asm \
 
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
@@ -198,12 +231,7 @@ vpx.def: $(call enabled,CODEC_EXPORTS)
             --out=$@ $^
 CLEAN-OBJS += vpx.def
 
-# Assembly files that are included, but don't define symbols themselves.
-# Filtered out to avoid Visual Studio build warnings.
-ASM_INCLUDES := \
-    third_party/x86inc/x86inc.asm \
-    vpx_config.asm \
-    vpx_ports/x86_abi_support.asm \
+vpx.$(VCPROJ_SFX): VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^)
 
 vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
@@ -217,7 +245,16 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
             --out=$@ $(CFLAGS) \
-            $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
+            --as=$(AS) \
+            $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \
+            $(filter-out $(addprefix $(SRC_PATH_BARE)/, \
+                           vp8/%.c vp8/%.h vp9/%.c vp9/%.h vpx/% vpx_dsp/%), \
+              $(VCPROJ_SRCS)) \
             --src-path-bare="$(SRC_PATH_BARE)" \
 
 PROJECTS-yes += vpx.$(VCPROJ_SFX)
@@ -225,15 +262,58 @@ PROJECTS-yes += vpx.$(VCPROJ_SFX)
 vpx.$(VCPROJ_SFX): vpx_config.asm
 vpx.$(VCPROJ_SFX): $(RTCD)
 
-endif
-else
-LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
+vpxrc.$(VCPROJ_SFX): \
+    VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^)
+
+vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            $(if $(CONFIG_SHARED),--dll,--lib) \
+            --target=$(TOOLCHAIN) \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --name=vpxrc \
+            --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            --out=$@ $(CFLAGS) \
+            --as=$(AS) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \
+            $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \
+            $(filter-out $(addprefix $(SRC_PATH_BARE)/, \
+                           vp8/%.c vp8/%.h vp9/%.c vp9/%.cc vp9/%.h vpx/% \
+                           vpx_dsp/%), \
+              $(VCPROJ_SRCS)) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+
+PROJECTS-yes += vpxrc.$(VCPROJ_SFX)
+
+vpxrc.$(VCPROJ_SFX): vpx_config.asm
+vpxrc.$(VCPROJ_SFX): $(RTCD)
+
+endif # ifeq ($(CONFIG_MSVS),yes)
+else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS)))
 OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 4
-SO_VERSION_MINOR := 1
+# Updating version info.
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+# For libtool: c=<current>, a=<age>, r=<revision>
+# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is
+# passed to libtool.
+#
+# libvpx library file is generated as libvpx.so.<MAJOR>.<MINOR>.<PATCH>
+# MAJOR = c-a, MINOR = a, PATCH = r
+#
+# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
+# SO_VERSION_* then follow the rules in the link to detemine the new version
+# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
+SO_VERSION_MAJOR := 12
+SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
@@ -273,18 +353,6 @@ $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
 $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
 $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
 
-libvpx.ver: $(call enabled,CODEC_EXPORTS)
-	@echo "    [CREATE] $@"
-	$(qexec)echo "{ global:" > $@
-	$(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done
-	$(qexec)echo "local: *; };" >> $@
-CLEAN-OBJS += libvpx.ver
-
-libvpx.syms: $(call enabled,CODEC_EXPORTS)
-	@echo "    [CREATE] $@"
-	$(qexec)awk '{print "_"$$2}' $^ >$@
-CLEAN-OBJS += libvpx.syms
-
 libvpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
 	$(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@
@@ -342,22 +410,42 @@ endif
 INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
 CLEAN-OBJS += vpx.pc
+
+ifeq ($(CONFIG_ENCODERS),yes)
+  RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
+  OBJS-yes += $(RC_RTC_OBJS)
+  LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a
+  $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS)
 endif
 
+endif # ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
+
+libvpx.ver: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)echo "{ global:" > $@
+	$(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done
+	$(qexec)echo "local: *; };" >> $@
+CLEAN-OBJS += libvpx.ver
+
+libvpx.syms: $(call enabled,CODEC_EXPORTS)
+	@echo "    [CREATE] $@"
+	$(qexec)awk '{print "_"$$2}' $^ >$@
+CLEAN-OBJS += libvpx.syms
+
 #
 # Rule to make assembler configuration file from C configuration file
 #
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
+ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
 # YASM
 $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
 	@echo "    [CREATE] $@"
-	@egrep "#define [A-Z0-9_]+ [01]" $< \
+	@LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
 	    | awk '{print $$2 " equ " $$3}' > $@
 else
 ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION))
 $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
 	@echo "    [CREATE] $@"
-	@egrep "#define [A-Z0-9_]+ [01]" $< \
+	@LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
 	    | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@
 	@echo "        END" $(ADS2GAS) >> $@
 CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
@@ -387,27 +475,56 @@ ifeq ($(CONFIG_UNIT_TESTS),yes)
 LIBVPX_TEST_DATA_PATH ?= .
 
 include $(SRC_PATH_BARE)/test/test.mk
-LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
+
+# addprefix_clean behaves like addprefix if the target doesn't start with "../"
+# However, if the target starts with "../", instead of adding prefix,
+# it will remove "../".
+# Using addprefix_clean, we can avoid two different targets building the
+# same file, i.e.
+# test/../ivfenc.c.d: ivfenc.o
+# ivfenc.c.d: ivfenc.o
+# Note that the other way to solve this problem is using "realpath".
+# The "realpath" is supported by make 3.81 or later.
+addprefix_clean=$(patsubst $(1)../%,%,$(addprefix $(1), $(2)))
+LIBVPX_TEST_SRCS=$(call addprefix_clean,test/,$(call enabled,LIBVPX_TEST_SRCS))
+
 LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX)
 LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
                      $(call enabled,LIBVPX_TEST_DATA))
 libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1)
 
 TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX)
-TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
+TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\
+                           $(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
 TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
 
+ifeq ($(CONFIG_ENCODERS),yes)
+RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX)
+RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\
+                       $(call enabled,RC_INTERFACE_TEST_SRCS))
+RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS)))
+endif
+
 libvpx_test_srcs.txt:
 	@echo "    [CREATE] $@"
 	@echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
 CLEAN-OBJS += libvpx_test_srcs.txt
 
+# Attempt to download the file using curl, retrying once if it fails for a
+# partial file (18).
 $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
 	@echo "    [DOWNLOAD] $@"
-	$(qexec)trap 'rm -f $@' INT TERM &&\
-            curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))
+	$(qexec)( \
+	  trap 'rm -f $@' INT TERM; \
+	  curl="curl -S -s --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \
+	  $$curl; ret=$$?; \
+	  case "$$ret" in \
+	    18) $$curl -C - ;; \
+	    *) exit $$ret ;; \
+	  esac \
+	)
 
-testdata:: $(LIBVPX_TEST_DATA)
+testdata: $(LIBVPX_TEST_DATA)
 	$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
           [ -x "$$(which shasum)" ] && sha1sum=shasum;\
           [ -x "$$(which sha1)" ] && sha1sum=sha1;\
@@ -416,7 +533,7 @@ testdata:: $(LIBVPX_TEST_DATA)
             echo "Checking test data:";\
             for f in $(call enabled,LIBVPX_TEST_DATA); do\
                 grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-                    (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+                    (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\
             done; \
         else\
             echo "Skipping test data integrity check, sha1sum not found.";\
@@ -435,6 +552,7 @@ gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.c
             --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             -D_VARIADIC_MAX=10 \
             --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src"
@@ -451,6 +569,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
@@ -473,12 +592,35 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_
             --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$(AS) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
             -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
 endif  # TEST_INTRA_PRED_SPEED
-endif
+
+ifeq ($(CONFIG_ENCODERS),yes)
+ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
+PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX)
+test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
+	vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	@echo "    [CREATE] $@"
+	$(qexec)$(GEN_VCPROJ) \
+            --exe \
+            --target=$(TOOLCHAIN) \
+            --name=test_rc_interface \
+            -D_VARIADIC_MAX=10 \
+            --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \
+            --ver=$(CONFIG_VS_VERSION) \
+            --as=$(AS) \
+            --src-path-bare="$(SRC_PATH_BARE)" \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
+            -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
+            -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^
+endif  # RC_INTERFACE_TEST
+endif  # CONFIG_ENCODERS
+endif  # CONFIG_MSVS
 else
 
 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
@@ -519,31 +661,46 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \
               -L. -lvpx -lgtest $(extralibs) -lm))
 endif  # TEST_INTRA_PRED_SPEED
 
-endif  # CONFIG_UNIT_TESTS
+ifeq ($(CONFIG_ENCODERS),yes)
+ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
+$(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \
+  CXXFLAGS += $(GTEST_INCLUDES)
+OBJS-yes += $(RC_INTERFACE_TEST_OBJS)
+BINS-yes += $(RC_INTERFACE_TEST_BIN)
+
+$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a
+$(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \
+              $(RC_INTERFACE_TEST_OBJS) \
+              -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm))
+endif  # RC_INTERFACE_TEST
+endif  # CONFIG_ENCODERS
+
+endif  # CONFIG_EXTERNAL_BUILD
 
 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
     $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS)
 
 define test_shard_template
-test:: test_shard.$(1)
-test-no-data-check:: test_shard_ndc.$(1)
+test: test_shard.$(1)
+test-no-data-check: test_shard_ndc.$(1)
 test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
 	@set -e; \
 	 export GTEST_SHARD_INDEX=$(1); \
 	 export GTEST_TOTAL_SHARDS=$(2); \
 	 $(LIBVPX_TEST_BIN)
 test_shard.$(1): testdata
-.PHONY: test_shard.$(1)
+.PHONY: test_shard.$(1) test_shard_ndc.$(1)
 endef
 
 NUM_SHARDS := 10
 SHARDS := 0 1 2 3 4 5 6 7 8 9
 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS))))
 
-endif
+endif  # CONFIG_UNIT_TESTS
 
 ##
 ## documentation directives
@@ -566,6 +723,7 @@ endif
 
 ## Update the global src list
 SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS)
+SRCS += $(RC_INTERFACE_TEST_SRCS)
 
 ##
 ## vpxdec/vpxenc tests.
@@ -582,10 +740,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
 utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 utiltest: testdata
 else
@@ -609,7 +767,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
 exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(EXAMPLES_BIN_PATH)
 exampletest: testdata
 else
diff --git a/media/libvpx/libvpx/mainpage.dox b/media/libvpx/libvpx/mainpage.dox
index ec202fa4fb..4b0dff0871 100644
--- a/media/libvpx/libvpx/mainpage.dox
+++ b/media/libvpx/libvpx/mainpage.dox
@@ -25,8 +25,10 @@
     release.
   - The \ref readme contains instructions on recompiling the sample applications.
   - Read the \ref usage "usage" for a narrative on codec usage.
+  \if samples
   - Read the \ref samples "sample code" for examples of how to interact with the
     codec.
+  \endif
   - \ref codec reference
   \if encoder
   - \ref encoder reference
diff --git a/media/libvpx/libvpx/md5_utils.c b/media/libvpx/libvpx/md5_utils.c
index 093798b833..abd8d43c39 100644
--- a/media/libvpx/libvpx/md5_utils.c
+++ b/media/libvpx/libvpx/md5_utils.c
@@ -23,6 +23,7 @@
 #include <string.h> /* for memcpy() */
 
 #include "md5_utils.h"
+#include "vpx_ports/compiler_attributes.h"
 
 static void byteSwap(UWORD32 *buf, unsigned words) {
   md5byte *p;
@@ -145,25 +146,14 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
 #define MD5STEP(f, w, x, y, z, in, s) \
   (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
 
-#if defined(__clang__) && defined(__has_attribute)
-#if __has_attribute(no_sanitize)
-#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
-  __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
-#endif
-
-#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
-#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
-#endif
-
 /*
  * The core of the MD5 algorithm, this alters an existing MD5 hash to
  * reflect the addition of 16 longwords of new data.  MD5Update blocks
  * the data and converts bytes into longwords for this routine.
  */
-VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
-                                                 UWORD32 const in[16]) {
-  register UWORD32 a, b, c, d;
+VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform(
+    UWORD32 buf[4], UWORD32 const in[16]) {
+  UWORD32 a, b, c, d;
 
   a = buf[0];
   b = buf[1];
@@ -244,6 +234,4 @@ VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
   buf[3] += d;
 }
 
-#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
-
 #endif
diff --git a/media/libvpx/libvpx/md5_utils.h b/media/libvpx/libvpx/md5_utils.h
index bd4991b3ad..e0d5a2d1fb 100644
--- a/media/libvpx/libvpx/md5_utils.h
+++ b/media/libvpx/libvpx/md5_utils.h
@@ -20,8 +20,8 @@
  * Still in the public domain.
  */
 
-#ifndef MD5_UTILS_H_
-#define MD5_UTILS_H_
+#ifndef VPX_MD5_UTILS_H_
+#define VPX_MD5_UTILS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -46,4 +46,4 @@ void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
 }  // extern "C"
 #endif
 
-#endif  // MD5_UTILS_H_
+#endif  // VPX_MD5_UTILS_H_
diff --git a/media/libvpx/libvpx/rate_hist.c b/media/libvpx/libvpx/rate_hist.c
index 872a10bae0..6a056cac10 100644
--- a/media/libvpx/libvpx/rate_hist.c
+++ b/media/libvpx/libvpx/rate_hist.c
@@ -9,10 +9,11 @@
  */
 
 #include <assert.h>
-#include <stdlib.h>
 #include <limits.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "./rate_hist.h"
 
@@ -37,12 +38,19 @@ struct rate_hist {
 struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg,
                                       const vpx_rational_t *fps) {
   int i;
-  struct rate_hist *hist = malloc(sizeof(*hist));
+  struct rate_hist *hist = calloc(1, sizeof(*hist));
+
+  if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 ||
+      fps->den == 0) {
+    destroy_rate_histogram(hist);
+    return NULL;
+  }
 
   // Determine the number of samples in the buffer. Use the file's framerate
   // to determine the number of frames in rc_buf_sz milliseconds, with an
   // adjustment (5/4) to account for alt-refs
-  hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
+  hist->samples =
+      (int)((int64_t)cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000);
 
   // prevent division by zero
   if (hist->samples == 0) hist->samples = 1;
@@ -80,7 +88,11 @@ void update_rate_histogram(struct rate_hist *hist,
                       (uint64_t)cfg->g_timebase.num /
                       (uint64_t)cfg->g_timebase.den;
 
-  int idx = hist->frames++ % hist->samples;
+  int idx;
+
+  if (hist == NULL || cfg == NULL || pkt == NULL) return;
+
+  idx = hist->frames++ % hist->samples;
   hist->pts[idx] = now;
   hist->sz[idx] = (int)pkt->data.frame.sz;
 
@@ -116,9 +128,14 @@ void update_rate_histogram(struct rate_hist *hist,
 static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
                               int *num_buckets) {
   int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
-  int buckets = *num_buckets;
+  int buckets;
   int i;
 
+  assert(bucket != NULL);
+  assert(num_buckets != NULL);
+
+  buckets = *num_buckets;
+
   /* Find the extrema for this list of buckets */
   big_bucket = small_bucket = 0;
   for (i = 0; i < buckets; i++) {
@@ -178,38 +195,42 @@ static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
 
 static void show_histogram(const struct hist_bucket *bucket, int buckets,
                            int total, int scale) {
-  const char *pat1, *pat2;
+  int width1, width2;
   int i;
 
+  if (!buckets) return;
+  assert(bucket != NULL);
+  assert(buckets > 0);
+
   switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
     case 1:
     case 2:
-      pat1 = "%4d %2s: ";
-      pat2 = "%4d-%2d: ";
+      width1 = 4;
+      width2 = 2;
       break;
     case 3:
-      pat1 = "%5d %3s: ";
-      pat2 = "%5d-%3d: ";
+      width1 = 5;
+      width2 = 3;
       break;
     case 4:
-      pat1 = "%6d %4s: ";
-      pat2 = "%6d-%4d: ";
+      width1 = 6;
+      width2 = 4;
       break;
     case 5:
-      pat1 = "%7d %5s: ";
-      pat2 = "%7d-%5d: ";
+      width1 = 7;
+      width2 = 5;
       break;
     case 6:
-      pat1 = "%8d %6s: ";
-      pat2 = "%8d-%6d: ";
+      width1 = 8;
+      width2 = 6;
       break;
     case 7:
-      pat1 = "%9d %7s: ";
-      pat2 = "%9d-%7d: ";
+      width1 = 9;
+      width2 = 7;
       break;
     default:
-      pat1 = "%12d %10s: ";
-      pat2 = "%12d-%10d: ";
+      width1 = 12;
+      width2 = 10;
       break;
   }
 
@@ -224,9 +245,10 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets,
     assert(len <= HIST_BAR_MAX);
 
     if (bucket[i].low == bucket[i].high)
-      fprintf(stderr, pat1, bucket[i].low, "");
+      fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, "");
     else
-      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+      fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2,
+              bucket[i].high);
 
     for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " ");
     fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
@@ -259,6 +281,8 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg,
   int i, scale;
   int buckets = 0;
 
+  if (hist == NULL || cfg == NULL) return;
+
   for (i = 0; i < RATE_BINS; i++) {
     if (hist->bucket[i].low == INT_MAX) continue;
     hist->bucket[buckets++] = hist->bucket[i];
diff --git a/media/libvpx/libvpx/rate_hist.h b/media/libvpx/libvpx/rate_hist.h
index 00a1676a61..d6a4c68519 100644
--- a/media/libvpx/libvpx/rate_hist.h
+++ b/media/libvpx/libvpx/rate_hist.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef RATE_HIST_H_
-#define RATE_HIST_H_
+#ifndef VPX_RATE_HIST_H_
+#define VPX_RATE_HIST_H_
 
 #include "vpx/vpx_encoder.h"
 
@@ -37,4 +37,4 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg,
 }  // extern "C"
 #endif
 
-#endif  // RATE_HIST_H_
+#endif  // VPX_RATE_HIST_H_
diff --git a/media/libvpx/libvpx/test/acm_random.h b/media/libvpx/libvpx/test/acm_random.h
index c2f6b0e410..6ebb60028e 100644
--- a/media/libvpx/libvpx/test/acm_random.h
+++ b/media/libvpx/libvpx/test/acm_random.h
@@ -8,10 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_ACM_RANDOM_H_
-#define TEST_ACM_RANDOM_H_
+#ifndef VPX_TEST_ACM_RANDOM_H_
+#define VPX_TEST_ACM_RANDOM_H_
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <assert.h>
+
+#include <limits>
+
+#include "gtest/gtest.h"
 
 #include "vpx/vpx_integer.h"
 
@@ -24,37 +28,56 @@ class ACMRandom {
   explicit ACMRandom(int seed) : random_(seed) {}
 
   void Reset(int seed) { random_.Reseed(seed); }
-  uint16_t Rand16(void) {
+  uint16_t Rand16() {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     return (value >> 15) & 0xffff;
   }
 
-  int16_t Rand9Signed(void) {
-    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
-    const uint32_t value = random_.Generate(512);
-    return static_cast<int16_t>(value) - 256;
+  int32_t Rand20Signed() {
+    // Use 20 bits: values between 524287 and -524288.
+    const uint32_t value = random_.Generate(1048576);
+    return static_cast<int32_t>(value) - 524288;
   }
 
-  uint8_t Rand8(void) {
+  int16_t Rand16Signed() {
+    // Use 16 bits: values between 32767 and -32768.
+    return static_cast<int16_t>(random_.Generate(65536));
+  }
+
+  uint16_t Rand12() {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
+  }
+
+  uint8_t Rand8() {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
     // There's a bit more entropy in the upper bits of this implementation.
     return (value >> 23) & 0xff;
   }
 
-  uint8_t Rand8Extremes(void) {
+  uint8_t Rand8Extremes() {
     // Returns a random value near 0 or near 255, to better exercise
     // saturation behavior.
     const uint8_t r = Rand8();
-    return r < 128 ? r << 4 : r >> 4;
+    return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+  }
+
+  uint32_t RandRange(const uint32_t range) {
+    // testing::internal::Random::Generate provides values in the range
+    // testing::internal::Random::kMaxRange.
+    assert(range <= testing::internal::Random::kMaxRange);
+    return random_.Generate(range);
   }
 
   int PseudoUniform(int range) { return random_.Generate(range); }
 
   int operator()(int n) { return PseudoUniform(n); }
 
-  static int DeterministicSeed(void) { return 0xbaba; }
+  static int DeterministicSeed() { return 0xbaba; }
 
  private:
   testing::internal::Random random_;
@@ -62,4 +85,4 @@ class ACMRandom {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_ACM_RANDOM_H_
+#endif  // VPX_TEST_ACM_RANDOM_H_
diff --git a/media/libvpx/libvpx/test/active_map_refresh_test.cc b/media/libvpx/libvpx/test/active_map_refresh_test.cc
index d893635505..a0b46059a7 100644
--- a/media/libvpx/libvpx/test/active_map_refresh_test.cc
+++ b/media/libvpx/libvpx/test/active_map_refresh_test.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <algorithm>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/util.h"
@@ -62,25 +62,25 @@ class ActiveMapRefreshTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapRefreshTest() {}
+  ~ActiveMapRefreshTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     ::libvpx_test::Y4mVideoSource *y4m_video =
         static_cast<libvpx_test::Y4mVideoSource *>(video);
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh);
     } else if (video->frame() >= 2 && video->img()) {
       vpx_image_t *current = video->img();
       vpx_image_t *previous = y4m_holder_->img();
-      ASSERT_TRUE(previous != NULL);
+      ASSERT_NE(previous, nullptr);
       vpx_active_map_t map = vpx_active_map_t();
       const int width = static_cast<int>(current->d_w);
       const int height = static_cast<int>(current->d_h);
@@ -122,7 +122,7 @@ TEST_P(ActiveMapRefreshTest, Test) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 6));
+VP9_INSTANTIATE_TEST_SUITE(ActiveMapRefreshTest,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Range(5, 6));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/active_map_test.cc b/media/libvpx/libvpx/test/active_map_test.cc
index 1d24f956f5..e8976b416a 100644
--- a/media/libvpx/libvpx/test/active_map_test.cc
+++ b/media/libvpx/libvpx/test/active_map_test.cc
@@ -9,7 +9,7 @@
  */
 #include <climits>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -19,24 +19,26 @@ namespace {
 
 class ActiveMapTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
  protected:
   static const int kWidth = 208;
   static const int kHeight = 144;
 
   ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapTest() {}
+  ~ActiveMapTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
     } else if (video->frame() == 3) {
       vpx_active_map_t map = vpx_active_map_t();
       /* clang-format off */
@@ -62,7 +64,7 @@ class ActiveMapTest
       vpx_active_map_t map = vpx_active_map_t();
       map.cols = (kWidth + 15) / 16;
       map.rows = (kHeight + 15) / 16;
-      map.active_map = NULL;
+      map.active_map = nullptr;
       encoder->Control(VP8E_SET_ACTIVEMAP, &map);
     }
   }
@@ -85,7 +87,7 @@ TEST_P(ActiveMapTest, Test) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP9_INSTANTIATE_TEST_CASE(ActiveMapTest,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(0, 9));
+VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Range(5, 10), ::testing::Values(0, 3));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/add_noise_test.cc b/media/libvpx/libvpx/test/add_noise_test.cc
index eae32c33bb..89c821fe20 100644
--- a/media/libvpx/libvpx/test/add_noise_test.cc
+++ b/media/libvpx/libvpx/test/add_noise_test.cc
@@ -8,11 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <math.h>
+#include <tuple>
+
+#include "gtest/gtest.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/util.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_config.h"
 #include "vpx_dsp/postproc.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -20,15 +24,17 @@ namespace {
 
 static const int kNoiseSize = 3072;
 
-// TODO(jimbankoski): make width and height integers not unsigned.
-typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise,
-                             int blackclamp, int whiteclamp, int width,
-                             int height, int pitch);
+using AddNoiseFunc = void (*)(uint8_t *start, const int8_t *noise,
+                              int blackclamp, int whiteclamp, int width,
+                              int height, int pitch);
 
-class AddNoiseTest : public ::testing::TestWithParam<AddNoiseFunc> {
+using AddNoiseTestFPParam = std::tuple<double, AddNoiseFunc>;
+
+class AddNoiseTest : public ::testing::Test,
+                     public ::testing::WithParamInterface<AddNoiseTestFPParam> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-  virtual ~AddNoiseTest() {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+  ~AddNoiseTest() override = default;
 };
 
 double stddev6(char a, char b, char c, char d, char e, char f) {
@@ -44,14 +50,14 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
   const int height = 64;
   const int image_size = width * height;
   int8_t noise[kNoiseSize];
-  const int clamp = vpx_setup_noise(4.4, noise, kNoiseSize);
+  const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize);
   uint8_t *const s =
       reinterpret_cast<uint8_t *>(vpx_calloc(image_size, sizeof(*s)));
-  ASSERT_TRUE(s != NULL);
+  ASSERT_NE(s, nullptr);
   memset(s, 99, image_size * sizeof(*s));
 
   ASM_REGISTER_STATE_CHECK(
-      GetParam()(s, noise, clamp, clamp, width, height, width));
+      GET_PARAM(1)(s, noise, clamp, clamp, width, height, width));
 
   // Check to make sure we don't end up having either the same or no added
   // noise either vertically or horizontally.
@@ -70,7 +76,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
   memset(s, 255, image_size);
 
   ASM_REGISTER_STATE_CHECK(
-      GetParam()(s, noise, clamp, clamp, width, height, width));
+      GET_PARAM(1)(s, noise, clamp, clamp, width, height, width));
 
   // Check to make sure don't roll over.
   for (int i = 0; i < image_size; ++i) {
@@ -81,7 +87,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
   memset(s, 0, image_size);
 
   ASM_REGISTER_STATE_CHECK(
-      GetParam()(s, noise, clamp, clamp, width, height, width));
+      GET_PARAM(1)(s, noise, clamp, clamp, width, height, width));
 
   // Check to make sure don't roll under.
   for (int i = 0; i < image_size; ++i) {
@@ -100,15 +106,15 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) {
 
   uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
   uint8_t *const d = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
-  ASSERT_TRUE(s != NULL);
-  ASSERT_TRUE(d != NULL);
+  ASSERT_NE(s, nullptr);
+  ASSERT_NE(d, nullptr);
 
   memset(s, 99, image_size);
   memset(d, 99, image_size);
 
   srand(0);
   ASM_REGISTER_STATE_CHECK(
-      GetParam()(s, noise, clamp, clamp, width, height, width));
+      GET_PARAM(1)(s, noise, clamp, clamp, width, height, width));
   srand(0);
   ASM_REGISTER_STATE_CHECK(
       vpx_plane_add_noise_c(d, noise, clamp, clamp, width, height, width));
@@ -121,16 +127,24 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) {
   vpx_free(s);
 }
 
-INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
-                        ::testing::Values(vpx_plane_add_noise_c));
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AddNoiseTest,
+    ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_c),
+                      make_tuple(4.4, vpx_plane_add_noise_c)));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
-                        ::testing::Values(vpx_plane_add_noise_sse2));
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AddNoiseTest,
+    ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_sse2),
+                      make_tuple(4.4, vpx_plane_add_noise_sse2)));
 #endif
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
-                        ::testing::Values(vpx_plane_add_noise_msa));
+INSTANTIATE_TEST_SUITE_P(
+    MSA, AddNoiseTest,
+    ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_msa),
+                      make_tuple(4.4, vpx_plane_add_noise_msa)));
 #endif
 }  // namespace
diff --git a/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc b/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc
index 64a3011eb9..ade82d7ed2 100644
--- a/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc
+++ b/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc
@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -20,9 +20,9 @@ class AltRefAqSegmentTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AltRefAqSegmentTest() {}
+  ~AltRefAqSegmentTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
@@ -30,9 +30,9 @@ class AltRefAqSegmentTest
     alt_ref_aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
@@ -150,8 +150,8 @@ TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ4) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP9_INSTANTIATE_TEST_CASE(AltRefAqSegmentTest,
-                          ::testing::Values(::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kTwoPassGood),
-                          ::testing::Range(2, 5));
+VP9_INSTANTIATE_TEST_SUITE(AltRefAqSegmentTest,
+                           ::testing::Values(::libvpx_test::kOnePassGood,
+                                             ::libvpx_test::kTwoPassGood),
+                           ::testing::Range(2, 5));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/altref_test.cc b/media/libvpx/libvpx/test/altref_test.cc
index f9308c2717..e98edcba4a 100644
--- a/media/libvpx/libvpx/test/altref_test.cc
+++ b/media/libvpx/libvpx/test/altref_test.cc
@@ -7,11 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "vpx_config.h"
 namespace {
 
 #if CONFIG_VP8_ENCODER
@@ -24,24 +25,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
                    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
   AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
-  virtual ~AltRefTest() {}
+  ~AltRefTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; }
+  void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_CPUUSED, 3);
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
   }
 
@@ -63,8 +64,8 @@ TEST_P(AltRefTest, MonotonicTimestamps) {
   EXPECT_GE(altref_count(), 1);
 }
 
-VP8_INSTANTIATE_TEST_CASE(AltRefTest,
-                          ::testing::Range(kLookAheadMin, kLookAheadMax));
+VP8_INSTANTIATE_TEST_SUITE(AltRefTest,
+                           ::testing::Range(kLookAheadMin, kLookAheadMax));
 
 #endif  // CONFIG_VP8_ENCODER
 
@@ -75,17 +76,17 @@ class AltRefForcedKeyTestLarge
   AltRefForcedKeyTestLarge()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  ~AltRefForcedKeyTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.g_threads = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -100,7 +101,7 @@ class AltRefForcedKeyTestLarge
         (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame_num_ == forced_kf_frame_num_) {
       ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
           << "Frame #" << frame_num_ << " isn't a keyframe!";
@@ -142,11 +143,11 @@ TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
   }
 }
 
-VP8_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
-                          ::testing::Values(::libvpx_test::kOnePassGood),
-                          ::testing::Range(0, 9));
+VP8_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge,
+                           ::testing::Values(::libvpx_test::kOnePassGood),
+                           ::testing::Range(0, 9));
 
-VP9_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
-                          ::testing::Values(::libvpx_test::kOnePassGood),
-                          ::testing::Range(0, 9));
+VP9_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge,
+                           ::testing::Values(::libvpx_test::kOnePassGood),
+                           ::testing::Range(0, 9));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/android/Android.mk b/media/libvpx/libvpx/test/android/Android.mk
index 48872a2b65..9a7533ebba 100644
--- a/media/libvpx/libvpx/test/android/Android.mk
+++ b/media/libvpx/libvpx/test/android/Android.mk
@@ -10,6 +10,9 @@
 # The test app itself runs on the command line through adb shell
 # The paths are really messed up as the libvpx make file
 # expects to be made from a parent directory.
+
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
 CUR_WD := $(call my-dir)
 BINDINGS_DIR := $(CUR_WD)/../../..
 LOCAL_PATH := $(CUR_WD)/../../..
@@ -32,7 +35,11 @@ LOCAL_CPP_EXTENSION := .cc
 LOCAL_MODULE := gtest
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/
 LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(BUILD_STATIC_LIBRARY)
 
 #libvpx_test
@@ -47,6 +54,9 @@ else
   LOCAL_STATIC_LIBRARIES += vpx
 endif
 
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS
 include $(LOCAL_PATH)/test/test.mk
 LOCAL_C_INCLUDES := $(BINDINGS_DIR)
 FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
@@ -54,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
 # some test files depend on *_rtcd.h, ensure they're generated first.
 $(eval $(call rtcd_dep_template))
 include $(BUILD_EXECUTABLE)
+endif  # NDK_ROOT
diff --git a/media/libvpx/libvpx/test/android/README b/media/libvpx/libvpx/test/android/README
index 4a1adcf7f4..0cd30779d4 100644
--- a/media/libvpx/libvpx/test/android/README
+++ b/media/libvpx/libvpx/test/android/README
@@ -3,19 +3,20 @@ Android.mk will build vpx unittests on android.
 ./libvpx/configure --target=armv7-android-gcc --enable-external-build \
   --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
   --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
-  --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK
+  --disable-examples --disable-runtime-cpu-detect
 
 2) From the parent directory, invoke ndk-build:
 NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \
   APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \
-  APP_STL=gnustl_static
+  APP_STL=c++_static
 
-Note: Both adb and ndk-build are available prebuilt at:
-  https://chromium.googlesource.com/android_tools
+Note: Both adb and ndk-build are available at:
+  https://developer.android.com/studio#downloads
+  https://developer.android.com/ndk/downloads
 
 3) Run get_files.py to download the test files:
 python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
-  -u http://downloads.webmproject.org/test_data/libvpx
+  -u https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx
 
 4) Transfer files to device using adb. Ensure you have proper permissions for
 the target
diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py
index 1c69740d2b..98ce7b1947 100644
--- a/media/libvpx/libvpx/test/android/get_files.py
+++ b/media/libvpx/libvpx/test/android/get_files.py
@@ -38,7 +38,7 @@ def get_file_sha(filename):
         buf = file.read(HASH_CHUNK)
       return sha_hash.hexdigest()
   except IOError:
-    print "Error reading " + filename
+    print("Error reading " + filename)
 
 # Downloads a file from a url, and then checks the sha against the passed
 # in sha
@@ -67,7 +67,7 @@ try:
       getopt.getopt(sys.argv[1:], \
                     "u:i:o:", ["url=", "input_csv=", "output_dir="])
 except:
-  print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+  print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
   sys.exit(2)
 
 for opt, arg in opts:
@@ -79,7 +79,7 @@ for opt, arg in opts:
     local_resource_path = os.path.join(arg)
 
 if len(sys.argv) != 7:
-  print "Expects two paths and a url!"
+  print("Expects two paths and a url!")
   exit(1)
 
 if not os.path.isdir(local_resource_path):
@@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb")
 
 # Our 'csv' file uses multiple spaces as a delimiter, python's
 # csv class only uses single character delimiters, so we convert them below
-file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \
     for line in file_list_csv), delimiter = ' ')
 
 file_shas = []
@@ -104,15 +104,16 @@ for row in file_list_reader:
 file_list_csv.close()
 
 # Download files, only if they don't already exist and have correct shas
-for filename, sha in itertools.izip(file_names, file_shas):
+for filename, sha in zip(file_names, file_shas):
+  filename = filename.lstrip('*')
   path = os.path.join(local_resource_path, filename)
   if os.path.isfile(path) \
       and get_file_sha(path) == sha:
-    print path + ' exists, skipping'
+    print(path + ' exists, skipping')
     continue
   for retry in range(0, ftp_retries):
-    print "Downloading " + path
+    print("Downloading " + path)
     if not download_and_check_sha(url, filename, sha):
-      print "Sha does not match, retrying..."
+      print("Sha does not match, retrying...")
     else:
       break
diff --git a/media/libvpx/libvpx/test/aq_segment_test.cc b/media/libvpx/libvpx/test/aq_segment_test.cc
index 1c2147fbb2..f7e9a118fc 100644
--- a/media/libvpx/libvpx/test/aq_segment_test.cc
+++ b/media/libvpx/libvpx/test/aq_segment_test.cc
@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -20,18 +20,18 @@ class AqSegmentTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AqSegmentTest() {}
+  ~AqSegmentTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
       encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100);
@@ -102,8 +102,8 @@ TEST_P(AqSegmentTest, TestNoMisMatchAQ3) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP9_INSTANTIATE_TEST_CASE(AqSegmentTest,
-                          ::testing::Values(::libvpx_test::kRealTime,
-                                            ::libvpx_test::kOnePassGood),
-                          ::testing::Range(3, 9));
+VP9_INSTANTIATE_TEST_SUITE(AqSegmentTest,
+                           ::testing::Values(::libvpx_test::kRealTime,
+                                             ::libvpx_test::kOnePassGood),
+                           ::testing::Range(3, 9));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc
index 272b99695e..42614c9bd3 100644
--- a/media/libvpx/libvpx/test/avg_test.cc
+++ b/media/libvpx/libvpx/test/avg_test.cc
@@ -11,9 +11,11 @@
 #include <limits.h>
 #include <stdio.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
@@ -21,39 +23,43 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
+#include "vpx/vpx_codec.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
+
+template <typename Pixel>
 class AverageTestBase : public ::testing::Test {
  public:
-  AverageTestBase(int width, int height) : width_(width), height_(height) {}
+  AverageTestBase(int width, int height)
+      : width_(width), height_(height), source_data_(nullptr),
+        source_stride_(0), bit_depth_(8) {}
 
-  static void SetUpTestCase() {
-    source_data_ = reinterpret_cast<uint8_t *>(
-        vpx_memalign(kDataAlignment, kDataBlockSize));
-  }
-
-  static void TearDownTestCase() {
+  void TearDown() override {
     vpx_free(source_data_);
-    source_data_ = NULL;
+    source_data_ = nullptr;
+    libvpx_test::ClearSystemState();
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
   static const int kDataAlignment = 16;
   static const int kDataBlockSize = 64 * 128;
 
-  virtual void SetUp() {
+  void SetUp() override {
+    source_data_ = reinterpret_cast<Pixel *>(
+        vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
     source_stride_ = (width_ + 31) & ~31;
+    bit_depth_ = 8;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
   // Sum Pixels
-  static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) {
+  static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 8; ++h) {
       for (int w = 0; w < 8; ++w) average += source[h * pitch + w];
@@ -61,7 +67,7 @@ class AverageTestBase : public ::testing::Test {
     return ((average + 32) >> 6);
   }
 
-  static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) {
+  static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 4; ++h) {
       for (int w = 0; w < 4; ++w) average += source[h * pitch + w];
@@ -69,7 +75,7 @@ class AverageTestBase : public ::testing::Test {
     return ((average + 8) >> 4);
   }
 
-  void FillConstant(uint8_t fill_constant) {
+  void FillConstant(Pixel fill_constant) {
     for (int i = 0; i < width_ * height_; ++i) {
       source_data_[i] = fill_constant;
     }
@@ -77,21 +83,22 @@ class AverageTestBase : public ::testing::Test {
 
   void FillRandom() {
     for (int i = 0; i < width_ * height_; ++i) {
-      source_data_[i] = rnd_.Rand8();
+      source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1);
     }
   }
 
   int width_, height_;
-  static uint8_t *source_data_;
+  Pixel *source_data_;
   int source_stride_;
+  int bit_depth_;
 
   ACMRandom rnd_;
 };
-typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch);
+using AverageFunction = unsigned int (*)(const uint8_t *s, int pitch);
 
-typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
+using AvgFunc = std::tuple<int, int, int, int, AverageFunction>;
 
-class AverageTest : public AverageTestBase,
+class AverageTest : public AverageTestBase<uint8_t>,
                     public ::testing::WithParamInterface<AvgFunc> {
  public:
   AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
@@ -117,38 +124,75 @@ class AverageTest : public AverageTestBase,
   }
 };
 
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
-                              const int ref_stride, const int height);
+#if CONFIG_VP9_HIGHBITDEPTH
+class AverageTestHBD : public AverageTestBase<uint16_t>,
+                       public ::testing::WithParamInterface<AvgFunc> {
+ public:
+  AverageTestHBD() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
 
-typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+ protected:
+  void CheckAverages() {
+    const int block_size = GET_PARAM(3);
+    unsigned int expected = 0;
+    if (block_size == 8) {
+      expected =
+          ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_);
+    } else if (block_size == 4) {
+      expected =
+          ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_);
+    }
 
-class IntProRowTest : public AverageTestBase,
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(4)(
+        CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_));
+    unsigned int actual = GET_PARAM(4)(
+        CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_);
+
+    EXPECT_EQ(expected, actual);
+  }
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA
+using IntProRowFunc = void (*)(int16_t hbuf[16], uint8_t const *ref,
+                               const int ref_stride, const int height);
+
+using IntProRowParam = std::tuple<int, IntProRowFunc, IntProRowFunc>;
+
+class IntProRowTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProRowParam> {
  public:
   IntProRowTest()
-      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr),
+        hbuf_c_(nullptr) {
     asm_func_ = GET_PARAM(1);
     c_func_ = GET_PARAM(2);
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
+    source_data_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_NE(source_data_, nullptr);
+
     hbuf_asm_ = reinterpret_cast<int16_t *>(
         vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
     hbuf_c_ = reinterpret_cast<int16_t *>(
         vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
+    vpx_free(source_data_);
+    source_data_ = nullptr;
     vpx_free(hbuf_c_);
-    hbuf_c_ = NULL;
+    hbuf_c_ = nullptr;
     vpx_free(hbuf_asm_);
-    hbuf_asm_ = NULL;
+    hbuf_asm_ = nullptr;
   }
 
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_));
+    ASM_REGISTER_STATE_CHECK(
+        asm_func_(hbuf_asm_, source_data_, width_, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
         << "Output mismatch";
   }
@@ -159,12 +203,13 @@ class IntProRowTest : public AverageTestBase,
   int16_t *hbuf_asm_;
   int16_t *hbuf_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest);
 
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+using IntProColFunc = int16_t (*)(uint8_t const *ref, const int width);
 
-typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+using IntProColParam = std::tuple<int, IntProColFunc, IntProColFunc>;
 
-class IntProColTest : public AverageTestBase,
+class IntProColTest : public AverageTestBase<uint8_t>,
                       public ::testing::WithParamInterface<IntProColParam> {
  public:
   IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
@@ -185,34 +230,34 @@ class IntProColTest : public AverageTestBase,
   int16_t sum_asm_;
   int16_t sum_c_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest);
+#endif  // HAVE_NEON || HAVE_SSE2 || HAVE_MSA
 
-typedef int (*SatdFunc)(const int16_t *coeffs, int length);
-typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+using SatdFunc = int (*)(const tran_low_t *coeffs, int length);
+using SatdTestParam = std::tuple<int, SatdFunc>;
 
 class SatdTest : public ::testing::Test,
                  public ::testing::WithParamInterface<SatdTestParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     satd_size_ = GET_PARAM(0);
     satd_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<int16_t *>(
+    src_ = reinterpret_cast<tran_low_t *>(
         vpx_memalign(16, sizeof(*src_) * satd_size_));
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(src_);
   }
 
-  void FillConstant(const int16_t val) {
+  void FillConstant(const tran_low_t val) {
     for (int i = 0; i < satd_size_; ++i) src_[i] = val;
   }
 
-  void FillRandom() {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
-  }
+  virtual void FillRandom() = 0;
 
   void Check(const int expected) {
     int total;
@@ -220,15 +265,84 @@ class SatdTest : public ::testing::Test,
     EXPECT_EQ(expected, total);
   }
 
+  tran_low_t *GetCoeff() const { return src_; }
+
   int satd_size_;
+  ACMRandom rnd_;
+  tran_low_t *src_;
 
  private:
-  int16_t *src_;
   SatdFunc satd_func_;
-  ACMRandom rnd_;
 };
 
-uint8_t *AverageTestBase::source_data_ = NULL;
+class SatdLowbdTest : public SatdTest {
+ protected:
+  void FillRandom() override {
+    for (int i = 0; i < satd_size_; ++i) {
+      const int16_t tmp = rnd_.Rand16Signed();
+      src_[i] = (tran_low_t)tmp;
+    }
+  }
+};
+
+using BlockErrorFunc = int64_t (*)(const tran_low_t *coeff,
+                                   const tran_low_t *dqcoeff, int block_size);
+using BlockErrorTestFPParam = std::tuple<int, BlockErrorFunc>;
+
+class BlockErrorTestFP
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<BlockErrorTestFPParam> {
+ protected:
+  void SetUp() override {
+    txfm_size_ = GET_PARAM(0);
+    block_error_func_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    coeff_ = reinterpret_cast<tran_low_t *>(
+        vpx_memalign(16, sizeof(*coeff_) * txfm_size_));
+    dqcoeff_ = reinterpret_cast<tran_low_t *>(
+        vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_));
+    ASSERT_NE(coeff_, nullptr);
+    ASSERT_NE(dqcoeff_, nullptr);
+  }
+
+  void TearDown() override {
+    libvpx_test::ClearSystemState();
+    vpx_free(coeff_);
+    vpx_free(dqcoeff_);
+  }
+
+  void FillConstant(const tran_low_t coeff_val, const tran_low_t dqcoeff_val) {
+    for (int i = 0; i < txfm_size_; ++i) coeff_[i] = coeff_val;
+    for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = dqcoeff_val;
+  }
+
+  void FillRandom() {
+    // Just two fixed seeds
+    rnd_.Reset(0xb0b9);
+    for (int i = 0; i < txfm_size_; ++i) coeff_[i] = rnd_.Rand16() >> 1;
+    rnd_.Reset(0xb0c8);
+    for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = rnd_.Rand16() >> 1;
+  }
+
+  void Check(const int64_t expected) {
+    int64_t total;
+    ASM_REGISTER_STATE_CHECK(
+        total = block_error_func_(coeff_, dqcoeff_, txfm_size_));
+    EXPECT_EQ(expected, total);
+  }
+
+  tran_low_t *GetCoeff() const { return coeff_; }
+
+  tran_low_t *GetDQCoeff() const { return dqcoeff_; }
+
+  int txfm_size_;
+
+ private:
+  tran_low_t *coeff_;
+  tran_low_t *dqcoeff_;
+  BlockErrorFunc block_error_func_;
+  ACMRandom rnd_;
+};
 
 TEST_P(AverageTest, MinValue) {
   FillConstant(0);
@@ -248,7 +362,29 @@ TEST_P(AverageTest, Random) {
     CheckAverages();
   }
 }
+#if CONFIG_VP9_HIGHBITDEPTH
+TEST_P(AverageTestHBD, MinValue) {
+  FillConstant(0);
+  CheckAverages();
+}
 
+TEST_P(AverageTestHBD, MaxValue) {
+  FillConstant((1 << VPX_BITS_12) - 1);
+  CheckAverages();
+}
+
+TEST_P(AverageTestHBD, Random) {
+  bit_depth_ = VPX_BITS_12;
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  for (int i = 0; i < 1000; i++) {
+    FillRandom();
+    CheckAverages();
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA
 TEST_P(IntProRowTest, MinValue) {
   FillConstant(0);
   RunComparison();
@@ -278,28 +414,29 @@ TEST_P(IntProColTest, Random) {
   FillRandom();
   RunComparison();
 }
+#endif
 
-TEST_P(SatdTest, MinValue) {
+TEST_P(SatdLowbdTest, MinValue) {
   const int kMin = -32640;
   const int expected = -kMin * satd_size_;
   FillConstant(kMin);
   Check(expected);
 }
 
-TEST_P(SatdTest, MaxValue) {
+TEST_P(SatdLowbdTest, MaxValue) {
   const int kMax = 32640;
   const int expected = kMax * satd_size_;
   FillConstant(kMax);
   Check(expected);
 }
 
-TEST_P(SatdTest, Random) {
+TEST_P(SatdLowbdTest, Random) {
   int expected;
   switch (satd_size_) {
-    case 16: expected = 205298; break;
-    case 64: expected = 1113950; break;
-    case 256: expected = 4268415; break;
-    case 1024: expected = 16954082; break;
+    case 16: expected = 261036; break;
+    case 64: expected = 991732; break;
+    case 256: expected = 4136358; break;
+    case 1024: expected = 16677592; break;
     default:
       FAIL() << "Invalid satd size (" << satd_size_
              << ") valid: 16/64/256/1024";
@@ -308,21 +445,173 @@ TEST_P(SatdTest, Random) {
   Check(expected);
 }
 
-using std::tr1::make_tuple;
+TEST_P(SatdLowbdTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  const int blocksize = GET_PARAM(0);
+  FillRandom();
+  tran_low_t *coeff = GetCoeff();
 
-INSTANTIATE_TEST_CASE_P(
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class SatdHighbdTest : public SatdTest {
+ protected:
+  void FillRandom() override {
+    for (int i = 0; i < satd_size_; ++i) {
+      src_[i] = rnd_.Rand20Signed();
+    }
+  }
+};
+
+TEST_P(SatdHighbdTest, MinValue) {
+  const int kMin = -524280;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+
+TEST_P(SatdHighbdTest, MaxValue) {
+  const int kMax = 524280;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+
+TEST_P(SatdHighbdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 5249712; break;
+    case 64: expected = 18362120; break;
+    case 256: expected = 66100520; break;
+    case 1024: expected = 266094734; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
+TEST_P(SatdHighbdTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  const int blocksize = GET_PARAM(0);
+  FillRandom();
+  tran_low_t *coeff = GetCoeff();
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+TEST_P(BlockErrorTestFP, MinValue) {
+  const int64_t kMin = -32640;
+  const int64_t expected = kMin * kMin * txfm_size_;
+  FillConstant(kMin, 0);
+  Check(expected);
+}
+
+TEST_P(BlockErrorTestFP, MaxValue) {
+  const int64_t kMax = 32640;
+  const int64_t expected = kMax * kMax * txfm_size_;
+  FillConstant(kMax, 0);
+  Check(expected);
+}
+
+TEST_P(BlockErrorTestFP, Random) {
+  int64_t expected;
+  switch (txfm_size_) {
+    case 16: expected = 2051681432; break;
+    case 64: expected = 11075114379; break;
+    case 256: expected = 44386271116; break;
+    case 1024: expected = 184774996089; break;
+    default:
+      FAIL() << "Invalid satd size (" << txfm_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
+TEST_P(BlockErrorTestFP, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  const int blocksize = GET_PARAM(0);
+  FillRandom();
+  tran_low_t *coeff = GetCoeff();
+  tran_low_t *dqcoeff = GetDQCoeff();
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, dqcoeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
     C, AverageTest,
     ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
                       make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
 
-INSTANTIATE_TEST_CASE_P(C, SatdTest,
-                        ::testing::Values(make_tuple(16, &vpx_satd_c),
-                                          make_tuple(64, &vpx_satd_c),
-                                          make_tuple(256, &vpx_satd_c),
-                                          make_tuple(1024, &vpx_satd_c)));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    C, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c)));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon)));
+#endif  // HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest,
+                         ::testing::Values(make_tuple(16, &vpx_satd_c),
+                                           make_tuple(64, &vpx_satd_c),
+                                           make_tuple(256, &vpx_satd_c),
+                                           make_tuple(1024, &vpx_satd_c)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(C, SatdLowbdTest,
+                         ::testing::Values(make_tuple(16, &vpx_satd_c),
+                                           make_tuple(64, &vpx_satd_c),
+                                           make_tuple(256, &vpx_satd_c),
+                                           make_tuple(1024, &vpx_satd_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_c),
+                      make_tuple(64, &vp9_block_error_fp_c),
+                      make_tuple(256, &vp9_block_error_fp_c),
+                      make_tuple(1024, &vp9_block_error_fp_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
     SSE2, AverageTest,
     ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
                       make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
@@ -331,29 +620,60 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
                       make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProRowTest,
     ::testing::Values(make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
                       make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
                       make_tuple(64, &vpx_int_pro_row_sse2,
                                  &vpx_int_pro_row_c)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, IntProColTest,
     ::testing::Values(make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
                       make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
                       make_tuple(64, &vpx_int_pro_col_sse2,
                                  &vpx_int_pro_col_c)));
 
-INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
-                        ::testing::Values(make_tuple(16, &vpx_satd_sse2),
-                                          make_tuple(64, &vpx_satd_sse2),
-                                          make_tuple(256, &vpx_satd_sse2),
-                                          make_tuple(1024, &vpx_satd_sse2)));
+INSTANTIATE_TEST_SUITE_P(SSE2, SatdLowbdTest,
+                         ::testing::Values(make_tuple(16, &vpx_satd_sse2),
+                                           make_tuple(64, &vpx_satd_sse2),
+                                           make_tuple(256, &vpx_satd_sse2),
+                                           make_tuple(1024, &vpx_satd_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
+                      make_tuple(64, &vp9_block_error_fp_sse2),
+                      make_tuple(256, &vp9_block_error_fp_sse2),
+                      make_tuple(1024, &vp9_block_error_fp_sse2)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, SatdLowbdTest,
+                         ::testing::Values(make_tuple(16, &vpx_satd_avx2),
+                                           make_tuple(64, &vpx_satd_avx2),
+                                           make_tuple(256, &vpx_satd_avx2),
+                                           make_tuple(1024, &vpx_satd_avx2)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2),
+                      make_tuple(64, &vpx_highbd_satd_avx2),
+                      make_tuple(256, &vpx_highbd_satd_avx2),
+                      make_tuple(1024, &vpx_highbd_satd_avx2)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
+                      make_tuple(64, &vp9_block_error_fp_avx2),
+                      make_tuple(256, &vp9_block_error_fp_avx2),
+                      make_tuple(1024, &vp9_block_error_fp_avx2)));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, AverageTest,
     ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
                       make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
@@ -362,29 +682,54 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
                       make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, IntProRowTest,
     ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
                       make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
                       make_tuple(64, &vpx_int_pro_row_neon,
                                  &vpx_int_pro_row_c)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, IntProColTest,
     ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
                       make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
                       make_tuple(64, &vpx_int_pro_col_neon,
                                  &vpx_int_pro_col_c)));
 
-INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
-                        ::testing::Values(make_tuple(16, &vpx_satd_neon),
-                                          make_tuple(64, &vpx_satd_neon),
-                                          make_tuple(256, &vpx_satd_neon),
-                                          make_tuple(1024, &vpx_satd_neon)));
-#endif
+INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
+                         ::testing::Values(make_tuple(16, &vpx_satd_neon),
+                                           make_tuple(64, &vpx_satd_neon),
+                                           make_tuple(256, &vpx_satd_neon),
+                                           make_tuple(1024, &vpx_satd_neon)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon),
+                      make_tuple(64, &vpx_highbd_satd_neon),
+                      make_tuple(256, &vpx_highbd_satd_neon),
+                      make_tuple(1024, &vpx_highbd_satd_neon)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
+                      make_tuple(64, &vp9_block_error_fp_neon),
+                      make_tuple(256, &vp9_block_error_fp_neon),
+                      make_tuple(1024, &vp9_block_error_fp_neon)));
+#endif  // HAVE_NEON
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, BlockErrorTestFP,
+    ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve),
+                      make_tuple(64, &vp9_block_error_fp_sve),
+                      make_tuple(256, &vp9_block_error_fp_sve),
+                      make_tuple(1024, &vp9_block_error_fp_sve)));
+#endif  // HAVE_SVE
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, AverageTest,
     ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
                       make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
@@ -392,6 +737,30 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
                       make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
                       make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
-#endif
+
+INSTANTIATE_TEST_SUITE_P(
+    MSA, IntProRowTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+                      make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c),
+                      make_tuple(64, &vpx_int_pro_row_msa,
+                                 &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    MSA, IntProColTest,
+    ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+                      make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c),
+                      make_tuple(64, &vpx_int_pro_col_msa,
+                                 &vpx_int_pro_col_c)));
+
+// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are
+// in place.
+#if !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(MSA, SatdLowbdTest,
+                         ::testing::Values(make_tuple(16, &vpx_satd_msa),
+                                           make_tuple(64, &vpx_satd_msa),
+                                           make_tuple(256, &vpx_satd_msa),
+                                           make_tuple(1024, &vpx_satd_msa)));
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_MSA
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/bench.cc b/media/libvpx/libvpx/test/bench.cc
new file mode 100644
index 0000000000..0783f2a734
--- /dev/null
+++ b/media/libvpx/libvpx/test/bench.cc
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <algorithm>
+#include <cstdlib>
+
+#include "test/bench.h"
+#include "vpx_ports/vpx_timer.h"
+
+void AbstractBench::RunNTimes(int n) {
+  for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) {
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int j = 0; j < n; ++j) {
+      Run();
+    }
+    vpx_usec_timer_mark(&timer);
+    times_[r] = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  }
+}
+
+void AbstractBench::PrintMedian(const char *title) {
+  std::sort(times_, times_ + VPX_BENCH_ROBUST_ITER);
+  const int med = times_[VPX_BENCH_ROBUST_ITER >> 1];
+  int sad = 0;
+  for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) {
+    sad += abs(times_[t] - med);
+  }
+  printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0,
+         sad / (VPX_BENCH_ROBUST_ITER * 1000.0));
+}
diff --git a/media/libvpx/libvpx/test/bench.h b/media/libvpx/libvpx/test/bench.h
new file mode 100644
index 0000000000..203e4d247e
--- /dev/null
+++ b/media/libvpx/libvpx/test/bench.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_TEST_BENCH_H_
+#define VPX_TEST_BENCH_H_
+
+// Number of iterations used to compute median run time.
+#define VPX_BENCH_ROBUST_ITER 15
+
+class AbstractBench {
+ public:
+  virtual ~AbstractBench() = default;
+
+  void RunNTimes(int n);
+  void PrintMedian(const char *title);
+
+ protected:
+  // Implement this method and put the code to benchmark in it.
+  virtual void Run() = 0;
+
+ private:
+  int times_[VPX_BENCH_ROBUST_ITER];
+};
+
+#endif  // VPX_TEST_BENCH_H_
diff --git a/media/libvpx/libvpx/test/blockiness_test.cc b/media/libvpx/libvpx/test/blockiness_test.cc
index 2fa10192f1..6e8301bb52 100644
--- a/media/libvpx/libvpx/test/blockiness_test.cc
+++ b/media/libvpx/libvpx/test/blockiness_test.cc
@@ -11,8 +11,9 @@
 #include <limits.h>
 #include <stdio.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #if CONFIG_VP9_ENCODER
@@ -25,10 +26,7 @@
 #include "test/util.h"
 
 #include "vpx_mem/vpx_mem.h"
-
-extern "C" double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
-                                     const unsigned char *img2, int img2_pitch,
-                                     int width, int height);
+#include "vp9/encoder/vp9_blockiness.h"
 
 using libvpx_test::ACMRandom;
 
@@ -37,28 +35,28 @@ class BlockinessTestBase : public ::testing::Test {
  public:
   BlockinessTestBase(int width, int height) : width_(width), height_(height) {}
 
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     source_data_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
     reference_data_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
   }
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     vpx_free(source_data_);
-    source_data_ = NULL;
+    source_data_ = nullptr;
     vpx_free(reference_data_);
-    reference_data_ = NULL;
+    reference_data_ = nullptr;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -141,7 +139,7 @@ class BlockinessTestBase : public ::testing::Test {
 };
 
 #if CONFIG_VP9_ENCODER
-typedef std::tr1::tuple<int, int> BlockinessParam;
+using BlockinessParam = std::tuple<int, int>;
 class BlockinessVP9Test
     : public BlockinessTestBase,
       public ::testing::WithParamInterface<BlockinessParam> {
@@ -156,8 +154,8 @@ class BlockinessVP9Test
 };
 #endif  // CONFIG_VP9_ENCODER
 
-uint8_t *BlockinessTestBase::source_data_ = NULL;
-uint8_t *BlockinessTestBase::reference_data_ = NULL;
+uint8_t *BlockinessTestBase::source_data_ = nullptr;
+uint8_t *BlockinessTestBase::reference_data_ = nullptr;
 
 #if CONFIG_VP9_ENCODER
 TEST_P(BlockinessVP9Test, SourceBlockierThanReference) {
@@ -208,16 +206,17 @@ TEST_P(BlockinessVP9Test, WorstCaseBlockiness) {
 }
 #endif  // CONFIG_VP9_ENCODER
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 //------------------------------------------------------------------------------
 // C functions
 
 #if CONFIG_VP9_ENCODER
-const BlockinessParam c_vp9_tests[] = {
-  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
-};
-INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests));
+const BlockinessParam c_vp9_tests[] = { make_tuple(320, 240),
+                                        make_tuple(318, 242),
+                                        make_tuple(318, 238) };
+INSTANTIATE_TEST_SUITE_P(C, BlockinessVP9Test,
+                         ::testing::ValuesIn(c_vp9_tests));
 #endif
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/borders_test.cc b/media/libvpx/libvpx/test/borders_test.cc
index e66ff02e25..57ee179aaf 100644
--- a/media/libvpx/libvpx/test/borders_test.cc
+++ b/media/libvpx/libvpx/test/borders_test.cc
@@ -9,11 +9,12 @@
  */
 #include <climits>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "vpx_config.h"
 
 namespace {
 
@@ -22,16 +23,16 @@ class BordersTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   BordersTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTest() {}
+  ~BordersTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 1);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -40,7 +41,7 @@ class BordersTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
     }
   }
@@ -79,6 +80,11 @@ TEST_P(BordersTest, TestLowBitrate) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP9_INSTANTIATE_TEST_CASE(BordersTest,
-                          ::testing::Values(::libvpx_test::kTwoPassGood));
+#if CONFIG_REALTIME_ONLY
+VP9_INSTANTIATE_TEST_SUITE(BordersTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
+#else
+VP9_INSTANTIATE_TEST_SUITE(BordersTest,
+                           ::testing::Values(::libvpx_test::kTwoPassGood));
+#endif
 }  // namespace
diff --git a/media/libvpx/libvpx/test/buffer.h b/media/libvpx/libvpx/test/buffer.h
new file mode 100644
index 0000000000..f2846323b3
--- /dev/null
+++ b/media/libvpx/libvpx/test/buffer.h
@@ -0,0 +1,382 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_TEST_BUFFER_H_
+#define VPX_TEST_BUFFER_H_
+
+#include <stdio.h>
+
+#include <limits>
+
+#include "gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace libvpx_test {
+
+template <typename T>
+class Buffer {
+ public:
+  Buffer(int width, int height, int top_padding, int left_padding,
+         int right_padding, int bottom_padding)
+      : width_(width), height_(height), top_padding_(top_padding),
+        left_padding_(left_padding), right_padding_(right_padding),
+        bottom_padding_(bottom_padding), alignment_(0), padding_value_(0),
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
+
+  Buffer(int width, int height, int top_padding, int left_padding,
+         int right_padding, int bottom_padding, unsigned int alignment)
+      : width_(width), height_(height), top_padding_(top_padding),
+        left_padding_(left_padding), right_padding_(right_padding),
+        bottom_padding_(bottom_padding), alignment_(alignment),
+        padding_value_(0), stride_(0), raw_size_(0), num_elements_(0),
+        raw_buffer_(nullptr) {}
+
+  Buffer(int width, int height, int padding)
+      : width_(width), height_(height), top_padding_(padding),
+        left_padding_(padding), right_padding_(padding),
+        bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0),
+        raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
+
+  Buffer(int width, int height, int padding, unsigned int alignment)
+      : width_(width), height_(height), top_padding_(padding),
+        left_padding_(padding), right_padding_(padding),
+        bottom_padding_(padding), alignment_(alignment), padding_value_(0),
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
+
+  ~Buffer() {
+    if (alignment_) {
+      vpx_free(raw_buffer_);
+    } else {
+      delete[] raw_buffer_;
+    }
+  }
+
+  T *TopLeftPixel() const;
+
+  int stride() const { return stride_; }
+
+  // Set the buffer (excluding padding) to 'value'.
+  void Set(const T value);
+
+  // Set the buffer (excluding padding) to the output of ACMRandom function
+  // 'rand_func'.
+  void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)());
+
+  // Set the buffer (excluding padding) to the output of ACMRandom function
+  // 'RandRange' with range 'low' to 'high' which typically must be within
+  // testing::internal::Random::kMaxRange (1u << 31). However, because we want
+  // to allow negative low (and high) values, it is restricted to INT32_MAX
+  // here.
+  void Set(ACMRandom *rand_class, const T low, const T high);
+
+  // Copy the contents of Buffer 'a' (excluding padding).
+  void CopyFrom(const Buffer<T> &a);
+
+  void DumpBuffer() const;
+
+  // Highlight the differences between two buffers if they are the same size.
+  void PrintDifference(const Buffer<T> &a) const;
+
+  bool HasPadding() const;
+
+  // Sets all the values in the buffer to 'padding_value'.
+  void SetPadding(const T padding_value);
+
+  // Checks if all the values (excluding padding) are equal to 'value' if the
+  // Buffers are the same size.
+  bool CheckValues(const T value) const;
+
+  // Check that padding matches the expected value or there is no padding.
+  bool CheckPadding() const;
+
+  // Compare the non-padding portion of two buffers if they are the same size.
+  bool CheckValues(const Buffer<T> &a) const;
+
+  bool Init() {
+    if (raw_buffer_ != nullptr) return false;
+    EXPECT_GT(width_, 0);
+    EXPECT_GT(height_, 0);
+    EXPECT_GE(top_padding_, 0);
+    EXPECT_GE(left_padding_, 0);
+    EXPECT_GE(right_padding_, 0);
+    EXPECT_GE(bottom_padding_, 0);
+    stride_ = left_padding_ + width_ + right_padding_;
+    num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_);
+    raw_size_ = num_elements_ * sizeof(T);
+    if (alignment_) {
+      EXPECT_GE(alignment_, sizeof(T));
+      // Ensure alignment of the first value will be preserved.
+      EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u);
+      // Ensure alignment of the subsequent rows will be preserved when there is
+      // a stride.
+      if (stride_ != width_) {
+        EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u);
+      }
+      raw_buffer_ = reinterpret_cast<T *>(vpx_memalign(alignment_, raw_size_));
+    } else {
+      raw_buffer_ = new (std::nothrow) T[num_elements_];
+    }
+    EXPECT_NE(raw_buffer_, nullptr);
+    SetPadding(std::numeric_limits<T>::max());
+    return !::testing::Test::HasFailure();
+  }
+
+ private:
+  bool BufferSizesMatch(const Buffer<T> &a) const;
+
+  const int width_;
+  const int height_;
+  const int top_padding_;
+  const int left_padding_;
+  const int right_padding_;
+  const int bottom_padding_;
+  const unsigned int alignment_;
+  T padding_value_;
+  int stride_;
+  int raw_size_;
+  int num_elements_;
+  T *raw_buffer_;
+};
+
+template <typename T>
+T *Buffer<T>::TopLeftPixel() const {
+  if (!raw_buffer_) return nullptr;
+  return raw_buffer_ + (top_padding_ * stride_) + left_padding_;
+}
+
+template <typename T>
+void Buffer<T>::Set(const T value) {
+  if (!raw_buffer_) return;
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      src[width] = value;
+    }
+    src += stride_;
+  }
+}
+
+template <typename T>
+void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) {
+  if (!raw_buffer_) return;
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      src[width] = (*rand_class.*rand_func)();
+    }
+    src += stride_;
+  }
+}
+
+template <typename T>
+void Buffer<T>::Set(ACMRandom *rand_class, const T low, const T high) {
+  if (!raw_buffer_) return;
+
+  EXPECT_LE(low, high);
+  EXPECT_LE(static_cast<int64_t>(high) - low,
+            std::numeric_limits<int32_t>::max());
+
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      // 'low' will be promoted to unsigned given the return type of RandRange.
+      // Store the value as an int to avoid unsigned overflow warnings when
+      // 'low' is negative.
+      const int32_t value =
+          static_cast<int32_t>((*rand_class).RandRange(high - low));
+      src[width] = static_cast<T>(value + low);
+    }
+    src += stride_;
+  }
+}
+
+template <typename T>
+void Buffer<T>::CopyFrom(const Buffer<T> &a) {
+  if (!raw_buffer_) return;
+  if (!BufferSizesMatch(a)) return;
+
+  T *a_src = a.TopLeftPixel();
+  T *b_src = this->TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      b_src[width] = a_src[width];
+    }
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+}
+
+template <typename T>
+void Buffer<T>::DumpBuffer() const {
+  if (!raw_buffer_) return;
+  for (int height = 0; height < height_ + top_padding_ + bottom_padding_;
+       ++height) {
+    for (int width = 0; width < stride_; ++width) {
+      printf("%4d", raw_buffer_[height + width * stride_]);
+    }
+    printf("\n");
+  }
+}
+
+template <typename T>
+bool Buffer<T>::HasPadding() const {
+  if (!raw_buffer_) return false;
+  return top_padding_ || left_padding_ || right_padding_ || bottom_padding_;
+}
+
+template <typename T>
+void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
+  if (!raw_buffer_) return;
+  if (!BufferSizesMatch(a)) return;
+
+  T *a_src = a.TopLeftPixel();
+  T *b_src = TopLeftPixel();
+
+  printf("This buffer:\n");
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (a_src[width] != b_src[width]) {
+        printf("*%3d", b_src[width]);
+      } else {
+        printf("%4d", b_src[width]);
+      }
+    }
+    printf("\n");
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+
+  a_src = a.TopLeftPixel();
+  b_src = TopLeftPixel();
+
+  printf("Reference buffer:\n");
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (a_src[width] != b_src[width]) {
+        printf("*%3d", a_src[width]);
+      } else {
+        printf("%4d", a_src[width]);
+      }
+    }
+    printf("\n");
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+}
+
+template <typename T>
+void Buffer<T>::SetPadding(const T padding_value) {
+  if (!raw_buffer_) return;
+  padding_value_ = padding_value;
+
+  T *src = raw_buffer_;
+  for (int i = 0; i < num_elements_; ++i) {
+    src[i] = padding_value;
+  }
+}
+
+template <typename T>
+bool Buffer<T>::CheckValues(const T value) const {
+  if (!raw_buffer_) return false;
+  T *src = TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (value != src[width]) {
+        return false;
+      }
+    }
+    src += stride_;
+  }
+  return true;
+}
+
+template <typename T>
+bool Buffer<T>::CheckPadding() const {
+  if (!raw_buffer_) return false;
+  if (!HasPadding()) return true;
+
+  // Top padding.
+  T const *top = raw_buffer_;
+  for (int i = 0; i < stride_ * top_padding_; ++i) {
+    if (padding_value_ != top[i]) {
+      return false;
+    }
+  }
+
+  // Left padding.
+  T const *left = TopLeftPixel() - left_padding_;
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < left_padding_; ++width) {
+      if (padding_value_ != left[width]) {
+        return false;
+      }
+    }
+    left += stride_;
+  }
+
+  // Right padding.
+  T const *right = TopLeftPixel() + width_;
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < right_padding_; ++width) {
+      if (padding_value_ != right[width]) {
+        return false;
+      }
+    }
+    right += stride_;
+  }
+
+  // Bottom padding
+  T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_;
+  for (int i = 0; i < stride_ * bottom_padding_; ++i) {
+    if (padding_value_ != bottom[i]) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template <typename T>
+bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
+  if (!raw_buffer_) return false;
+  if (!BufferSizesMatch(a)) return false;
+
+  T *a_src = a.TopLeftPixel();
+  T *b_src = this->TopLeftPixel();
+  for (int height = 0; height < height_; ++height) {
+    for (int width = 0; width < width_; ++width) {
+      if (a_src[width] != b_src[width]) {
+        return false;
+      }
+    }
+    a_src += a.stride();
+    b_src += this->stride();
+  }
+  return true;
+}
+
+template <typename T>
+bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const {
+  if (!raw_buffer_) return false;
+  if (a.width_ != this->width_ || a.height_ != this->height_) {
+    printf(
+        "Reference buffer of size %dx%d does not match this buffer which is "
+        "size %dx%d\n",
+        a.width_, a.height_, this->width_, this->height_);
+    return false;
+  }
+
+  return true;
+}
+}  // namespace libvpx_test
+#endif  // VPX_TEST_BUFFER_H_
diff --git a/media/libvpx/libvpx/test/byte_alignment_test.cc b/media/libvpx/libvpx/test/byte_alignment_test.cc
index d78294d10f..ba6fffc524 100644
--- a/media/libvpx/libvpx/test/byte_alignment_test.cc
+++ b/media/libvpx/libvpx/test/byte_alignment_test.cc
@@ -55,23 +55,24 @@ const ByteAlignmentTestParam kBaTestParams[] = {
 class ByteAlignmentTest
     : public ::testing::TestWithParam<ByteAlignmentTestParam> {
  protected:
-  ByteAlignmentTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {}
+  ByteAlignmentTest()
+      : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
 
     OpenMd5File(kVP9Md5File);
   }
 
-  virtual void TearDown() {
-    if (md5_file_ != NULL) fclose(md5_file_);
+  void TearDown() override {
+    if (md5_file_ != nullptr) fclose(md5_file_);
 
     delete decoder_;
     delete video_;
@@ -90,7 +91,7 @@ class ByteAlignmentTest
   }
 
   vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) {
-    for (; video_->cxdata() != NULL; video_->Next()) {
+    for (; video_->cxdata() != nullptr; video_->Next()) {
       const vpx_codec_err_t res =
           decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
       if (res != VPX_CODEC_OK) return res;
@@ -113,7 +114,7 @@ class ByteAlignmentTest
     const vpx_image_t *img;
 
     // Get decompressed data
-    while ((img = dec_iter.Next()) != NULL) {
+    while ((img = dec_iter.Next()) != nullptr) {
       if (byte_alignment_to_check == kLegacyByteAlignment) {
         CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment);
       } else {
@@ -128,12 +129,12 @@ class ByteAlignmentTest
   // TODO(fgalligan): Move the MD5 testing code into another class.
   void OpenMd5File(const std::string &md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
-                                   << md5_file_name_;
+    ASSERT_NE(md5_file_, nullptr)
+        << "MD5 file open failed. Filename: " << md5_file_name_;
   }
 
   void CheckMd5(const vpx_image_t &img) {
-    ASSERT_TRUE(md5_file_ != NULL);
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -171,12 +172,13 @@ TEST_F(ByteAlignmentTest, SwitchByteAlignment) {
 TEST_P(ByteAlignmentTest, TestAlignment) {
   const ByteAlignmentTestParam t = GetParam();
   SetByteAlignment(t.byte_alignment, t.expected_value);
-  if (t.decode_remaining)
+  if (t.decode_remaining) {
     ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment));
+  }
 }
 
-INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest,
-                        ::testing::ValuesIn(kBaTestParams));
+INSTANTIATE_TEST_SUITE_P(Alignments, ByteAlignmentTest,
+                         ::testing::ValuesIn(kBaTestParams));
 
 #endif  // CONFIG_WEBM_IO
 
diff --git a/media/libvpx/libvpx/test/clear_system_state.h b/media/libvpx/libvpx/test/clear_system_state.h
index 044a5c7583..ba3c0b386a 100644
--- a/media/libvpx/libvpx/test/clear_system_state.h
+++ b/media/libvpx/libvpx/test/clear_system_state.h
@@ -7,23 +7,17 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_CLEAR_SYSTEM_STATE_H_
-#define TEST_CLEAR_SYSTEM_STATE_H_
+#ifndef VPX_TEST_CLEAR_SYSTEM_STATE_H_
+#define VPX_TEST_CLEAR_SYSTEM_STATE_H_
 
 #include "./vpx_config.h"
-#if ARCH_X86 || ARCH_X86_64
-#include "vpx_ports/x86.h"
-#endif
+#include "vpx_ports/system_state.h"
 
 namespace libvpx_test {
 
 // Reset system to a known state. This function should be used for all non-API
 // test cases.
-inline void ClearSystemState() {
-#if ARCH_X86 || ARCH_X86_64
-  vpx_reset_mmx_state();
-#endif
-}
+inline void ClearSystemState() { vpx_clear_system_state(); }
 
 }  // namespace libvpx_test
-#endif  // TEST_CLEAR_SYSTEM_STATE_H_
+#endif  // VPX_TEST_CLEAR_SYSTEM_STATE_H_
diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h
index d5882ed9c8..179ccdf011 100644
--- a/media/libvpx/libvpx/test/codec_factory.h
+++ b/media/libvpx/libvpx/test/codec_factory.h
@@ -7,8 +7,10 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_CODEC_FACTORY_H_
-#define TEST_CODEC_FACTORY_H_
+#ifndef VPX_TEST_CODEC_FACTORY_H_
+#define VPX_TEST_CODEC_FACTORY_H_
+
+#include <tuple>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
@@ -38,7 +40,7 @@ class CodecFactory {
                                  const vpx_codec_flags_t flags) const = 0;
 
   virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
+                                 vpx_enc_deadline_t deadline,
                                  const unsigned long init_flags,
                                  TwopassStatsStore *stats) const = 0;
 
@@ -53,23 +55,22 @@ class CodecFactory {
 template <class T1>
 class CodecTestWithParam
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1> > {};
+          std::tuple<const libvpx_test::CodecFactory *, T1> > {};
 
 template <class T1, class T2>
 class CodecTestWith2Params
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2> > {};
+          std::tuple<const libvpx_test::CodecFactory *, T1, T2> > {};
 
 template <class T1, class T2, class T3>
 class CodecTestWith3Params
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
+          std::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
 
 template <class T1, class T2, class T3, class T4>
 class CodecTestWith4Params
     : public ::testing::TestWithParam<
-          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {
-};
+          std::tuple<const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {};
 
 /*
  * VP8 Codec Definitions
@@ -83,27 +84,27 @@ class VP8Decoder : public Decoder {
       : Decoder(cfg, flag) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_DECODER
     return &vpx_codec_vp8_dx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
 
 class VP8Encoder : public Encoder {
  public:
-  VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  VP8Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
              const unsigned long init_flags, TwopassStatsStore *stats)
       : Encoder(cfg, deadline, init_flags, stats) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_ENCODER
     return &vpx_codec_vp8_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -112,25 +113,24 @@ class VP8CodecFactory : public CodecFactory {
  public:
   VP8CodecFactory() : CodecFactory() {}
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP8_DECODER
     return new VP8Decoder(cfg, flags);
 #else
     (void)cfg;
     (void)flags;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP8_ENCODER
     return new VP8Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -138,12 +138,12 @@ class VP8CodecFactory : public CodecFactory {
     (void)deadline;
     (void)init_flags;
     (void)stats;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP8_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);
 #else
@@ -156,15 +156,17 @@ class VP8CodecFactory : public CodecFactory {
 
 const libvpx_test::VP8CodecFactory kVP8;
 
-#define VP8_INSTANTIATE_TEST_CASE(test, ...)                                \
-  INSTANTIATE_TEST_CASE_P(                                                  \
+#define VP8_INSTANTIATE_TEST_SUITE(test, ...)                               \
+  INSTANTIATE_TEST_SUITE_P(                                                 \
       VP8, test,                                                            \
       ::testing::Combine(                                                   \
           ::testing::Values(static_cast<const libvpx_test::CodecFactory *>( \
               &libvpx_test::kVP8)),                                         \
           __VA_ARGS__))
 #else
-#define VP8_INSTANTIATE_TEST_CASE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "")
 #endif  // CONFIG_VP8
 
 /*
@@ -179,27 +181,27 @@ class VP9Decoder : public Decoder {
       : Decoder(cfg, flag) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_DECODER
     return &vpx_codec_vp9_dx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
 
 class VP9Encoder : public Encoder {
  public:
-  VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  VP9Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
              const unsigned long init_flags, TwopassStatsStore *stats)
       : Encoder(cfg, deadline, init_flags, stats) {}
 
  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -208,25 +210,24 @@ class VP9CodecFactory : public CodecFactory {
  public:
   VP9CodecFactory() : CodecFactory() {}
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }
 
-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP9_DECODER
     return new VP9Decoder(cfg, flags);
 #else
     (void)cfg;
     (void)flags;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP9_ENCODER
     return new VP9Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -234,12 +235,12 @@ class VP9CodecFactory : public CodecFactory {
     (void)deadline;
     (void)init_flags;
     (void)stats;
-    return NULL;
+    return nullptr;
 #endif
   }
 
-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP9_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
 #else
@@ -252,16 +253,18 @@ class VP9CodecFactory : public CodecFactory {
 
 const libvpx_test::VP9CodecFactory kVP9;
 
-#define VP9_INSTANTIATE_TEST_CASE(test, ...)                                \
-  INSTANTIATE_TEST_CASE_P(                                                  \
+#define VP9_INSTANTIATE_TEST_SUITE(test, ...)                               \
+  INSTANTIATE_TEST_SUITE_P(                                                 \
       VP9, test,                                                            \
       ::testing::Combine(                                                   \
           ::testing::Values(static_cast<const libvpx_test::CodecFactory *>( \
               &libvpx_test::kVP9)),                                         \
           __VA_ARGS__))
 #else
-#define VP9_INSTANTIATE_TEST_CASE(test, ...)
+// static_assert() is used to avoid warnings about an extra ';' outside of a
+// function.
+#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "")
 #endif  // CONFIG_VP9
 
 }  // namespace libvpx_test
-#endif  // TEST_CODEC_FACTORY_H_
+#endif  // VPX_TEST_CODEC_FACTORY_H_
diff --git a/media/libvpx/libvpx/test/comp_avg_pred_test.cc b/media/libvpx/libvpx/test/comp_avg_pred_test.cc
new file mode 100644
index 0000000000..de9842a88f
--- /dev/null
+++ b/media/libvpx/libvpx/test/comp_avg_pred_test.cc
@@ -0,0 +1,276 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_config.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+template <typename Pixel>
+Pixel avg_with_rounding(Pixel a, Pixel b) {
+  return (a + b + 1) >> 1;
+}
+
+template <typename Pixel>
+void reference_pred(const Buffer<Pixel> &pred, const Buffer<Pixel> &ref,
+                    int width, int height, Buffer<Pixel> *avg) {
+  ASSERT_NE(avg->TopLeftPixel(), nullptr);
+  ASSERT_NE(pred.TopLeftPixel(), nullptr);
+  ASSERT_NE(ref.TopLeftPixel(), nullptr);
+
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      avg->TopLeftPixel()[y * avg->stride() + x] =
+          avg_with_rounding<Pixel>(pred.TopLeftPixel()[y * pred.stride() + x],
+                                   ref.TopLeftPixel()[y * ref.stride() + x]);
+    }
+  }
+}
+
+using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
+                             const uint8_t *c, int c_stride);
+
+template <int bitdepth, typename Pixel>
+class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
+ public:
+  void SetUp() override {
+    avg_pred_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  void TestSizeCombinations();
+  void TestCompareReferenceRandom();
+  void TestSpeed();
+
+ protected:
+  AvgPredFunc avg_pred_func_;
+  ACMRandom rnd_;
+};
+
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
+  // This is called as part of the sub pixel variance. As such it must be one of
+  // the variance block sizes.
+  for (int width_pow = 2; width_pow <= 6; ++width_pow) {
+    for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
+         ++height_pow) {
+      // Don't test 4x2 or 64x128
+      if (height_pow == 1 || height_pow == 7) continue;
+
+      // The sse2 special-cases when ref width == stride, so make sure to test
+      // it.
+      for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
+        const int width = 1 << width_pow;
+        const int height = 1 << height_pow;
+        // Only the reference buffer may have a stride not equal to width.
+        Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
+        ASSERT_TRUE(ref.Init());
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
+        ASSERT_TRUE(pred.Init());
+        Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
+        ASSERT_TRUE(avg_ref.Init());
+        Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
+        ASSERT_TRUE(avg_chk.Init());
+        const int bitdepth_mask = (1 << bitdepth) - 1;
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+
+        reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+        ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+            (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+            width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
+
+        EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
+        if (HasFailure()) {
+          printf("Width: %d Height: %d\n", width, height);
+          avg_chk.PrintDifference(avg_ref);
+          return;
+        }
+      }
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
+  const int width = 64;
+  const int height = 32;
+  Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
+  ASSERT_TRUE(ref.Init());
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
+  ASSERT_TRUE(pred.Init());
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
+  ASSERT_TRUE(avg_ref.Init());
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
+  ASSERT_TRUE(avg_chk.Init());
+
+  for (int i = 0; i < 500; ++i) {
+    const int bitdepth_mask = (1 << bitdepth) - 1;
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+      }
+    }
+    for (int h = 0; h < height; ++h) {
+      for (int w = 0; w < width; ++w) {
+        pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+      }
+    }
+
+    reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+    ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+        (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+        width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
+    EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
+    if (HasFailure()) {
+      printf("Width: %d Height: %d\n", width, height);
+      avg_chk.PrintDifference(avg_ref);
+      return;
+    }
+  }
+}
+
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
+  for (int width_pow = 2; width_pow <= 6; ++width_pow) {
+    for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
+         ++height_pow) {
+      // Don't test 4x2 or 64x128
+      if (height_pow == 1 || height_pow == 7) continue;
+
+      for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
+        const int width = 1 << width_pow;
+        const int height = 1 << height_pow;
+        Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
+        ASSERT_TRUE(ref.Init());
+        Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
+        ASSERT_TRUE(pred.Init());
+        Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
+        ASSERT_TRUE(avg.Init());
+        const int bitdepth_mask = (1 << bitdepth) - 1;
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+        for (int h = 0; h < height; ++h) {
+          for (int w = 0; w < width; ++w) {
+            pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+          }
+        }
+
+        vpx_usec_timer timer;
+        vpx_usec_timer_start(&timer);
+        for (int i = 0; i < 100000000 / (width * height); ++i) {
+          avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
+                         (uint8_t *)pred.TopLeftPixel(), width, height,
+                         (uint8_t *)ref.TopLeftPixel(), ref.stride());
+        }
+        vpx_usec_timer_mark(&timer);
+
+        const int elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&timer));
+        printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n",
+               ref_padding, width, height, elapsed_time);
+      }
+    }
+  }
+}
+
+using AvgPredTestLBD = AvgPredTest<8, uint8_t>;
+
+TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_neon));
+#endif  // HAVE_NEON
+
+#if HAVE_VSX
+INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_vsx));
+#endif  // HAVE_VSX
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_lsx));
+#endif  // HAVE_LSX
+
+#if CONFIG_VP9_HIGHBITDEPTH
+using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h,
+                                   const uint16_t *c, int c_stride);
+
+template <HighbdAvgPredFunc fn>
+void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h,
+                    const uint8_t *c, int c_stride) {
+  fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride);
+}
+
+using AvgPredTestHBD = AvgPredTest<12, uint16_t>;
+
+TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
+#endif  // HAVE_NEON
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/media/libvpx/libvpx/test/config_test.cc b/media/libvpx/libvpx/test/config_test.cc
index b2f8ea5ed3..d3aca4cfb5 100644
--- a/media/libvpx/libvpx/test/config_test.cc
+++ b/media/libvpx/libvpx/test/config_test.cc
@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/util.h"
@@ -22,24 +22,24 @@ class ConfigTest
   ConfigTest()
       : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0),
         frame_count_max_(0) {}
-  virtual ~ConfigTest() {}
+  ~ConfigTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     frame_count_in_ = 0;
     frame_count_out_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override {
     ++frame_count_in_;
     abort_ |= (frame_count_in_ >= frame_count_max_);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+  void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override {
     ++frame_count_out_;
   }
 
@@ -58,5 +58,5 @@ TEST_P(ConfigTest, LagIsDisabled) {
   EXPECT_EQ(frame_count_in_, frame_count_out_);
 }
 
-VP8_INSTANTIATE_TEST_CASE(ConfigTest, ONE_PASS_TEST_MODES);
+VP8_INSTANTIATE_TEST_SUITE(ConfigTest, ONE_PASS_TEST_MODES);
 }  // namespace
diff --git a/media/libvpx/libvpx/test/consistency_test.cc b/media/libvpx/libvpx/test/consistency_test.cc
index 37b4a45e54..3e73bec492 100644
--- a/media/libvpx/libvpx/test/consistency_test.cc
+++ b/media/libvpx/libvpx/test/consistency_test.cc
@@ -11,8 +11,9 @@
 #include <limits.h>
 #include <stdio.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #if CONFIG_VP9_ENCODER
@@ -38,7 +39,7 @@ class ConsistencyTestBase : public ::testing::Test {
  public:
   ConsistencyTestBase(int width, int height) : width_(width), height_(height) {}
 
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     source_data_[0] = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
     reference_data_[0] = reinterpret_cast<uint8_t *>(
@@ -51,27 +52,27 @@ class ConsistencyTestBase : public ::testing::Test {
   }
 
   static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); }
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     vpx_free(source_data_[0]);
-    source_data_[0] = NULL;
+    source_data_[0] = nullptr;
     vpx_free(reference_data_[0]);
-    reference_data_[0] = NULL;
+    reference_data_[0] = nullptr;
     vpx_free(source_data_[1]);
-    source_data_[1] = NULL;
+    source_data_[1] = nullptr;
     vpx_free(reference_data_[1]);
-    reference_data_[1] = NULL;
+    reference_data_[1] = nullptr;
 
     delete[] ssim_array_;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -127,7 +128,7 @@ class ConsistencyTestBase : public ::testing::Test {
 };
 
 #if CONFIG_VP9_ENCODER
-typedef std::tr1::tuple<int, int> ConsistencyParam;
+using ConsistencyParam = std::tuple<int, int>;
 class ConsistencyVP9Test
     : public ConsistencyTestBase,
       public ::testing::WithParamInterface<ConsistencyParam> {
@@ -144,9 +145,9 @@ class ConsistencyVP9Test
 };
 #endif  // CONFIG_VP9_ENCODER
 
-uint8_t *ConsistencyTestBase::source_data_[2] = { NULL, NULL };
-uint8_t *ConsistencyTestBase::reference_data_[2] = { NULL, NULL };
-Ssimv *ConsistencyTestBase::ssim_array_ = NULL;
+uint8_t *ConsistencyTestBase::source_data_[2] = { nullptr, nullptr };
+uint8_t *ConsistencyTestBase::reference_data_[2] = { nullptr, nullptr };
+Ssimv *ConsistencyTestBase::ssim_array_ = nullptr;
 
 #if CONFIG_VP9_ENCODER
 TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
@@ -198,17 +199,17 @@ TEST_P(ConsistencyVP9Test, ConsistencyIsZero) {
 }
 #endif  // CONFIG_VP9_ENCODER
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 //------------------------------------------------------------------------------
 // C functions
 
 #if CONFIG_VP9_ENCODER
-const ConsistencyParam c_vp9_tests[] = {
-  make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238),
-};
-INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test,
-                        ::testing::ValuesIn(c_vp9_tests));
+const ConsistencyParam c_vp9_tests[] = { make_tuple(320, 240),
+                                         make_tuple(318, 242),
+                                         make_tuple(318, 238) };
+INSTANTIATE_TEST_SUITE_P(C, ConsistencyVP9Test,
+                         ::testing::ValuesIn(c_vp9_tests));
 #endif
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc
index 0056c602b6..6eecbd3e5d 100644
--- a/media/libvpx/libvpx/test/convolve_test.cc
+++ b/media/libvpx/libvpx/test/convolve_test.cc
@@ -9,8 +9,9 @@
  */
 
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
@@ -25,22 +26,30 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 namespace {
 
 static const unsigned int kMaxDimension = 64;
 
-typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h);
+using ConvolveFunc = void (*)(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h);
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+using ConvolveFunc12Tap = void (*)(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel12 *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h);
+#endif
 
-typedef void (*WrapperFilterBlock2d8Func)(
-    const uint8_t *src_ptr, const unsigned int src_stride,
-    const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr,
-    unsigned int dst_stride, unsigned int output_width,
-    unsigned int output_height, int use_highbd);
+using WrapperFilterBlock2d8Func =
+    void (*)(const uint8_t *src_ptr, const unsigned int src_stride,
+             const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr,
+             unsigned int dst_stride, unsigned int output_width,
+             unsigned int output_height, int use_highbd);
 
 struct ConvolveFunctions {
   ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8,
@@ -76,7 +85,26 @@ struct ConvolveFunctions {
   int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };
 
-typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+using ConvolveParam = std::tuple<int, int, const ConvolveFunctions *>;
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+struct ConvolveFunctions12Tap {
+  ConvolveFunctions12Tap(ConvolveFunc12Tap h12, ConvolveFunc12Tap v12,
+                         ConvolveFunc12Tap hv12, int bd)
+      : use_highbd_(bd) {
+    h12_ = h12;
+    v12_ = v12;
+    hv12_ = hv12;
+  }
+
+  ConvolveFunc12Tap h12_;
+  ConvolveFunc12Tap v12_;
+  ConvolveFunc12Tap hv12_;
+  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
+};
+
+using Convolve12TapParam = std::tuple<int, int, const ConvolveFunctions12Tap *>;
+#endif
 
 #define ALL_SIZES(convolve_fn)                                            \
   make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
@@ -86,7 +114,13 @@ typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
       make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
       make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
       make_tuple(64, 64, &convolve_fn)
-
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#define ALL_SIZES_12TAP(convolve_fn)                                      \
+  make_tuple(8, 8, &convolve_fn), make_tuple(16, 8, &convolve_fn),        \
+      make_tuple(8, 16, &convolve_fn), make_tuple(16, 16, &convolve_fn),  \
+      make_tuple(32, 16, &convolve_fn), make_tuple(16, 32, &convolve_fn), \
+      make_tuple(32, 32, &convolve_fn)
+#endif
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
@@ -113,6 +147,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride,
   // and filter_max_width          = 16
   //
   uint8_t intermediate_buffer[71 * kMaxDimension];
+  vp9_zero(intermediate_buffer);
   const int intermediate_next_stride =
       1 - static_cast<int>(intermediate_height * output_width);
 
@@ -212,6 +247,8 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
   const int intermediate_next_stride =
       1 - static_cast<int>(intermediate_height * output_width);
 
+  vp9_zero(intermediate_buffer);
+
   // Horizontal pass (src -> transposed intermediate).
   {
     uint16_t *output_ptr = intermediate_buffer;
@@ -239,7 +276,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
 
   // Vertical pass (transposed intermediate -> dst).
   {
-    uint16_t *src_ptr = intermediate_buffer;
+    src_ptr = intermediate_buffer;
     const int dst_next_row_stride = dst_stride - output_width;
     unsigned int i, j;
     for (i = 0; i < output_height; ++i) {
@@ -300,9 +337,9 @@ void wrapper_filter_average_block2d_8_c(
     filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,
                                dst_stride, output_width, output_height);
   } else {
-    highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+    highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride,
                                       hfilter, vfilter,
-                                      CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+                                      CAST_TO_SHORTPTR(dst_ptr), dst_stride,
                                       output_width, output_height, use_highbd);
   }
 #else
@@ -323,8 +360,8 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr,
     filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr,
                        dst_stride, output_width, output_height);
   } else {
-    highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, hfilter,
-                              vfilter, CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+    highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter,
+                              vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride,
                               output_width, output_height, use_highbd);
   }
 #else
@@ -336,7 +373,7 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr,
 
 class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
  public:
-  static void SetUpTestCase() {
+  static void SetUpTestSuite() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t *>(
                  vpx_memalign(kDataAlignment, kInputBufferSize + 1)) +
@@ -356,22 +393,22 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     vpx_free(input_ - 1);
-    input_ = NULL;
+    input_ = nullptr;
     vpx_free(output_);
-    output_ = NULL;
+    output_ = nullptr;
     vpx_free(output_ref_);
-    output_ref_ = NULL;
+    output_ref_ = nullptr;
 #if CONFIG_VP9_HIGHBITDEPTH
     vpx_free(input16_ - 1);
-    input16_ = NULL;
+    input16_ = nullptr;
     vpx_free(output16_);
-    output16_ = NULL;
+    output16_ = nullptr;
     vpx_free(output16_ref_);
-    output16_ref_ = NULL;
+    output16_ref_ = nullptr;
 #endif
   }
 
@@ -398,7 +435,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
             i % kOuterBlockSize >= (BorderLeft() + Width()));
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     UUT_ = GET_PARAM(2);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ != 0) {
@@ -411,8 +448,14 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     for (int i = 0; i < kOutputBufferSize; ++i) {
       if (IsIndexInBorder(i)) {
         output_[i] = 255;
+#if CONFIG_VP9_HIGHBITDEPTH
+        output16_[i] = mask_;
+#endif
       } else {
         output_[i] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        output16_[i] = 0;
+#endif
       }
     }
 
@@ -449,7 +492,9 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 
   void CheckGuardBlocks() {
     for (int i = 0; i < kOutputBufferSize; ++i) {
-      if (IsIndexInBorder(i)) EXPECT_EQ(255, output_[i]);
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+      }
     }
   }
 
@@ -459,7 +504,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     if (UUT_->use_highbd_ == 0) {
       return input_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_) + offset;
+      return CAST_TO_BYTEPTR(input16_ + offset);
     }
 #else
     return input_ + offset;
@@ -472,7 +517,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     if (UUT_->use_highbd_ == 0) {
       return output_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_) + offset;
+      return CAST_TO_BYTEPTR(output16_ + offset);
     }
 #else
     return output_ + offset;
@@ -485,7 +530,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     if (UUT_->use_highbd_ == 0) {
       return output_ref_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+      return CAST_TO_BYTEPTR(output16_ref_ + offset);
     }
 #else
     return output_ref_ + offset;
@@ -497,7 +542,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     if (UUT_->use_highbd_ == 0) {
       return list[index];
     } else {
-      return CONVERT_TO_SHORTPTR(list)[index];
+      return CAST_TO_SHORTPTR(list)[index];
     }
 #else
     return list[index];
@@ -509,7 +554,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     if (UUT_->use_highbd_ == 0) {
       list[index] = (uint8_t)val;
     } else {
-      CONVERT_TO_SHORTPTR(list)[index] = val;
+      CAST_TO_SHORTPTR(list)[index] = val;
     }
 #else
     list[index] = (uint8_t)val;
@@ -528,23 +573,679 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
 };
 
-uint8_t *ConvolveTest::input_ = NULL;
-uint8_t *ConvolveTest::output_ = NULL;
-uint8_t *ConvolveTest::output_ref_ = NULL;
+uint8_t *ConvolveTest::input_ = nullptr;
+uint8_t *ConvolveTest::output_ = nullptr;
+uint8_t *ConvolveTest::output_ref_ = nullptr;
 #if CONFIG_VP9_HIGHBITDEPTH
-uint16_t *ConvolveTest::input16_ = NULL;
-uint16_t *ConvolveTest::output16_ = NULL;
-uint16_t *ConvolveTest::output16_ref_ = NULL;
+uint16_t *ConvolveTest::input16_ = nullptr;
+uint16_t *ConvolveTest::output16_ = nullptr;
+uint16_t *ConvolveTest::output16_ref_ = nullptr;
+#endif
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+class ConvolveTest12Tap : public ::testing::TestWithParam<Convolve12TapParam> {
+ public:
+  static void SetUpTestSuite() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    input_ = reinterpret_cast<uint8_t *>(
+                 vpx_memalign(kDataAlignment, kInputBufferSize + 1)) +
+             1;
+    output_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
+#if CONFIG_VP9_HIGHBITDEPTH
+    input16_ = reinterpret_cast<uint16_t *>(vpx_memalign(
+                   kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) +
+               1;
+    output16_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+#endif
+  }
+
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+  static void TearDownTestSuite() {
+    vpx_free(input_ - 1);
+    input_ = nullptr;
+    vpx_free(output_);
+    output_ = nullptr;
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(input16_ - 1);
+    input16_ = nullptr;
+    vpx_free(output16_);
+    output16_ = nullptr;
+#endif
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 256;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
+
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  void SetUp() override {
+    UUT_ = GET_PARAM(2);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ != 0) {
+      mask_ = (1 << UUT_->use_highbd_) - 1;
+    } else {
+      mask_ = 255;
+    }
+#endif
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        output_[i] = 255;
+#if CONFIG_VP9_HIGHBITDEPTH
+        output16_[i] = mask_;
+#endif
+      } else {
+        output_[i] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        output16_[i] = 0;
+#endif
+      }
+    }
+
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i) {
+      if (i & 1) {
+        input_[i] = 255;
+#if CONFIG_VP9_HIGHBITDEPTH
+        input16_[i] = mask_;
+#endif
+      } else {
+        input_[i] = prng.Rand8Extremes();
+#if CONFIG_VP9_HIGHBITDEPTH
+        input16_[i] = prng.Rand16() & mask_;
+#endif
+      }
+    }
+  }
+
+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_memset16(input16_, value, kInputBufferSize);
+#endif
+  }
+
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+      }
+    }
+  }
+
+  uint8_t *input() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return input_ + offset;
+    } else {
+      return CAST_TO_BYTEPTR(input16_ + offset);
+    }
+#else
+    return input_ + offset;
+#endif
+  }
+
+  uint8_t *output() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return output_ + offset;
+    } else {
+      return CAST_TO_BYTEPTR(output16_ + offset);
+    }
+#else
+    return output_ + offset;
+#endif
+  }
+
+  uint16_t lookup(uint8_t *list, int index) const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      return list[index];
+    } else {
+      return CAST_TO_SHORTPTR(list)[index];
+    }
+#else
+    return list[index];
+#endif
+  }
+
+  void assign_val(uint8_t *list, int index, uint16_t val) const {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (UUT_->use_highbd_ == 0) {
+      list[index] = (uint8_t)val;
+    } else {
+      CAST_TO_SHORTPTR(list)[index] = val;
+    }
+#else
+    list[index] = (uint8_t)val;
+#endif
+  }
+  const ConvolveFunctions12Tap *UUT_;
+  static uint8_t *input_;
+  static uint8_t *output_;
+#if CONFIG_VP9_HIGHBITDEPTH
+  static uint16_t *input16_;
+  static uint16_t *output16_;
+  int mask_;
+#endif
+};
+
+uint8_t *ConvolveTest12Tap::input_ = nullptr;
+uint8_t *ConvolveTest12Tap::output_ = nullptr;
+#if CONFIG_VP9_HIGHBITDEPTH
+uint16_t *ConvolveTest12Tap::input16_ = nullptr;
+uint16_t *ConvolveTest12Tap::output16_ = nullptr;
+#endif
+
+TEST_P(ConvolveTest12Tap, MatchesReferenceSubpixelFilter) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CAST_TO_BYTEPTR(ref16);
+  }
+#else
+  uint8_t ref[kOutputStride * kMaxDimension];
+#endif
+
+  // Populate ref and out with some random data
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+#else
+      r = prng.Rand8Extremes();
+#endif
+
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  const InterpKernel12 *filters = sub_pel_filters_12;
+  for (int filter_x = 0; filter_x < 16; ++filter_x) {
+    for (int filter_y = 0; filter_y < 16; ++filter_y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0) {
+        vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters,
+                         filter_x, 16, filter_y, 16, Width(), Height());
+      } else {
+        vpx_highbd_convolve12_c(CAST_TO_SHORTPTR(in), kInputStride,
+                                CAST_TO_SHORTPTR(ref), kOutputStride, filters,
+                                filter_x, 16, filter_y, 16, Width(), Height(),
+                                UUT_->use_highbd_);
+      }
+#else
+      vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters, filter_x,
+                       16, filter_y, 16, Width(), Height());
+#endif
+      if (filter_x && filter_y)
+        ASM_REGISTER_STATE_CHECK(
+            UUT_->hv12_(in, kInputStride, out, kOutputStride, filters, filter_x,
+                        16, filter_y, 16, Width(), Height()));
+      else if (filter_y)
+        ASM_REGISTER_STATE_CHECK(UUT_->v12_(in, kInputStride, out,
+                                            kOutputStride, filters, 0, 16,
+                                            filter_y, 16, Width(), Height()));
+      else if (filter_x)
+        ASM_REGISTER_STATE_CHECK(UUT_->h12_(in, kInputStride, out,
+                                            kOutputStride, filters, filter_x,
+                                            16, 0, 16, Width(), Height()));
+      else
+        continue;
+
+      CheckGuardBlocks();
+
+      for (int y = 0; y < Height(); ++y) {
+        for (int x = 0; x < Width(); ++x)
+          ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                    lookup(out, y * kOutputStride + x))
+              << "mismatch at (" << x << "," << y << "), "
+              << "filters ("
+              << "," << filter_x << "," << filter_y << ")";
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest12Tap, FilterExtremes) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t ref8[kOutputStride * kMaxDimension];
+  uint16_t ref16[kOutputStride * kMaxDimension];
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8;
+  } else {
+    ref = CAST_TO_BYTEPTR(ref16);
+  }
+#else
+  uint8_t ref[kOutputStride * kMaxDimension];
+#endif
+
+  // Populate ref and out with some random data
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+#else
+      r = prng.Rand8Extremes();
+#endif
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  for (int axis = 0; axis < 2; axis++) {
+    int seed_val = 0;
+    while (seed_val < 256) {
+      for (int y = 0; y < 8; ++y) {
+        for (int x = 0; x < 8; ++x) {
+#if CONFIG_VP9_HIGHBITDEPTH
+          assign_val(in, y * kOutputStride + x - MAX_FILTER_TAP / 2 + 1,
+                     ((seed_val >> (axis ? y : x)) & 1) * mask_);
+#else
+          assign_val(in, y * kOutputStride + x - MAX_FILTER_TAP / 2 + 1,
+                     ((seed_val >> (axis ? y : x)) & 1) * 255);
+#endif
+          if (axis) seed_val++;
+        }
+        if (axis) {
+          seed_val -= 8;
+        } else {
+          seed_val++;
+        }
+      }
+      if (axis) seed_val += 8;
+
+      const InterpKernel12 *filters = sub_pel_filters_12;
+      for (int filter_x = 0; filter_x < 16; ++filter_x) {
+        for (int filter_y = 0; filter_y < 16; ++filter_y) {
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (UUT_->use_highbd_ == 0) {
+            vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters,
+                             filter_x, 16, filter_y, 16, Width(), Height());
+          } else {
+            vpx_highbd_convolve12_c(CAST_TO_SHORTPTR(in), kInputStride,
+                                    CAST_TO_SHORTPTR(ref), kOutputStride,
+                                    filters, filter_x, 16, filter_y, 16,
+                                    Width(), Height(), UUT_->use_highbd_);
+          }
+#else
+          vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters,
+                           filter_x, 16, filter_y, 16, Width(), Height());
+#endif
+          if (filter_x && filter_y)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->hv12_(in, kInputStride, out, kOutputStride, filters,
+                            filter_x, 16, filter_y, 16, Width(), Height()));
+          else if (filter_y)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v12_(in, kInputStride, out, kOutputStride, filters, 0, 16,
+                           filter_y, 16, Width(), Height()));
+          else if (filter_x)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->h12_(in, kInputStride, out, kOutputStride, filters,
+                           filter_x, 16, 0, 16, Width(), Height()));
+          else
+            continue;
+
+          for (int y = 0; y < Height(); ++y) {
+            for (int x = 0; x < Width(); ++x)
+              ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                        lookup(out, y * kOutputStride + x))
+                  << "mismatch at (" << x << "," << y << "), "
+                  << "filters ("
+                  << "," << filter_x << "," << filter_y << ")";
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest12Tap, DISABLED_12Tap_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel12 *const twelvetap = sub_pel_filters_12;
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv12_(in, kInputStride, out, kOutputStride, twelvetap, 8, 16, 8, 16,
+                width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve12_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest12Tap, DISABLED_12Tap_Horiz_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel12 *const twelvetap = sub_pel_filters_12;
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->h12_(in, kInputStride, out, kOutputStride, twelvetap, 8, 16, 8, 16,
+               width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve12_horiz_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest12Tap, DISABLED_12Tap_Vert_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel12 *const twelvetap = sub_pel_filters_12;
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->v12_(in, kInputStride, out, kOutputStride, twelvetap, 8, 16, 8, 16,
+               width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve12_vert_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
 #endif
 
 TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
 
+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                   width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve_scale_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_vert_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_4Tap_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve4_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_4Tap_Horiz_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->h8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve4_horiz_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_4Tap_Vert_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->v8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16,
+                 width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve4_vert_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  vpx_usec_timer timer;
+
+  SetConstantInput(127);
+
+  vpx_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+                  width, height);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("convolve8_avg_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
 TEST_P(ConvolveTest, Copy) {
   uint8_t *const in = input();
   uint8_t *const out = output();
 
   ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, NULL, 0, Width(), Height()));
+                                          nullptr, 0, 0, 0, 0, Width(),
+                                          Height()));
 
   CheckGuardBlocks();
 
@@ -563,7 +1264,8 @@ TEST_P(ConvolveTest, Avg) {
   CopyOutputToRef();
 
   ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, NULL, 0, Width(), Height()));
+                                          nullptr, 0, 0, 0, 0, Width(),
+                                          Height()));
 
   CheckGuardBlocks();
 
@@ -580,12 +1282,10 @@ TEST_P(ConvolveTest, Avg) {
 TEST_P(ConvolveTest, CopyHoriz) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
 
   ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride,
-                                         filter8, 16, filter8, 16, Width(),
-                                         Height()));
+                                         vp9_filter_kernels[0], 0, 16, 0, 16,
+                                         Width(), Height()));
 
   CheckGuardBlocks();
 
@@ -600,12 +1300,10 @@ TEST_P(ConvolveTest, CopyHoriz) {
 TEST_P(ConvolveTest, CopyVert) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
 
   ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride,
-                                         filter8, 16, filter8, 16, Width(),
-                                         Height()));
+                                         vp9_filter_kernels[0], 0, 16, 0, 16,
+                                         Width(), Height()));
 
   CheckGuardBlocks();
 
@@ -620,12 +1318,10 @@ TEST_P(ConvolveTest, CopyVert) {
 TEST_P(ConvolveTest, Copy2D) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
 
   ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride,
-                                          filter8, 16, filter8, 16, Width(),
-                                          Height()));
+                                          vp9_filter_kernels[0], 0, 16, 0, 16,
+                                          Width(), Height()));
 
   CheckGuardBlocks();
 
@@ -637,7 +1333,7 @@ TEST_P(ConvolveTest, Copy2D) {
   }
 }
 
-const int kNumFilterBanks = 4;
+const int kNumFilterBanks = 5;
 const int kNumFilters = 16;
 
 TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
@@ -661,7 +1357,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
   }
 }
 
-const int16_t kInvalidFilter[8] = { 0 };
 const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = {
   wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c
 };
@@ -677,7 +1372,7 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
     if (UUT_->use_highbd_ == 0) {
       ref = ref8;
     } else {
-      ref = CONVERT_TO_BYTEPTR(ref16);
+      ref = CAST_TO_BYTEPTR(ref16);
     }
 #else
     uint8_t ref[kOutputStride * kMaxDimension];
@@ -714,21 +1409,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
                                       Width(), Height(), UUT_->use_highbd_);
 
           if (filter_x && filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i](
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                filters[filter_y], 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters,
+                              filter_x, 16, filter_y, 16, Width(), Height()));
           else if (filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->v8_[i](
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-                filters[filter_y], 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0,
+                             16, filter_y, 16, Width(), Height()));
           else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_[i](
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                kInvalidFilter, 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,
+                             filter_x, 16, 0, 16, Width(), Height()));
           else
-            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                kInvalidFilter, 0, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->copy_[i](in, kInputStride, out, kOutputStride, nullptr, 0,
+                               0, 0, 0, Width(), Height()));
 
           CheckGuardBlocks();
 
@@ -756,7 +1451,7 @@ TEST_P(ConvolveTest, FilterExtremes) {
   if (UUT_->use_highbd_ == 0) {
     ref = ref8;
   } else {
-    ref = CONVERT_TO_BYTEPTR(ref16);
+    ref = CAST_TO_BYTEPTR(ref16);
   }
 #else
   uint8_t ref[kOutputStride * kMaxDimension];
@@ -812,21 +1507,21 @@ TEST_P(ConvolveTest, FilterExtremes) {
                                        filters[filter_y], ref, kOutputStride,
                                        Width(), Height(), UUT_->use_highbd_);
             if (filter_x && filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0](
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  filters[filter_y], 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters,
+                                filter_x, 16, filter_y, 16, Width(), Height()));
             else if (filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->v8_[0](
-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-                  filters[filter_y], 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0,
+                               16, filter_y, 16, Width(), Height()));
             else if (filter_x)
-              ASM_REGISTER_STATE_CHECK(UUT_->h8_[0](
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  kInvalidFilter, 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,
+                               filter_x, 16, 0, 16, Width(), Height()));
             else
-              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](
-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                  kInvalidFilter, 0, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr,
+                                 0, 0, 0, 0, Width(), Height()));
 
             for (int y = 0; y < Height(); ++y) {
               for (int x = 0; x < Width(); ++x)
@@ -845,47 +1540,66 @@ TEST_P(ConvolveTest, FilterExtremes) {
 
 /* This test exercises that enough rows and columns are filtered with every
    possible initial fractional positions and scaling steps. */
+#if !CONFIG_VP9_HIGHBITDEPTH
+static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c,
+                                                   vpx_scaled_avg_2d_c };
+
 TEST_P(ConvolveTest, CheckScalingFiltering) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+  uint8_t ref[kOutputStride * kMaxDimension];
 
-  SetConstantInput(127);
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      const uint16_t r = prng.Rand8Extremes();
+      assign_val(in, y * kInputStride + x, r);
+    }
+  }
 
-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      ASM_REGISTER_STATE_CHECK(
-          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac],
-                         step, eighttap[frac], step, Width(), Height()));
+  for (int i = 0; i < 2; ++i) {
+    for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
+      const InterpKernel *const eighttap = vp9_filter_kernels[filter_type];
+      for (int frac = 0; frac < 16; ++frac) {
+        for (int step = 1; step <= 32; ++step) {
+          /* Test the horizontal and vertical filters in combination. */
+          scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap,
+                               frac, step, frac, step, Width(), Height());
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap,
+                             frac, step, frac, step, Width(), Height()));
 
-      CheckGuardBlocks();
+          CheckGuardBlocks();
 
-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(lookup(in, y * kInputStride + x),
-                    lookup(out, y * kOutputStride + x))
-              << "x == " << x << ", y == " << y << ", frac == " << frac
-              << ", step == " << step;
+          for (int y = 0; y < Height(); ++y) {
+            for (int x = 0; x < Width(); ++x) {
+              ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                        lookup(out, y * kOutputStride + x))
+                  << "x == " << x << ", y == " << y << ", frac == " << frac
+                  << ", step == " << step;
+            }
+          }
         }
       }
     }
   }
 }
+#endif
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
 #define WRAP(func, bd)                                                       \
   void wrap_##func##_##bd(                                                   \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,    \
-      const int16_t *filter_y, int filter_y_stride, int w, int h) {          \
-    vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x,            \
-                      filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,   \
+                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \
+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);         \
   }
 
-#if HAVE_SSE2 && ARCH_X86_64
+#if HAVE_SSE2 && VPX_ARCH_X86_64
 WRAP(convolve_copy_sse2, 8)
 WRAP(convolve_avg_sse2, 8)
 WRAP(convolve_copy_sse2, 10)
@@ -910,7 +1624,36 @@ WRAP(convolve8_vert_sse2, 12)
 WRAP(convolve8_avg_vert_sse2, 12)
 WRAP(convolve8_sse2, 12)
 WRAP(convolve8_avg_sse2, 12)
-#endif  // HAVE_SSE2 && ARCH_X86_64
+#endif  // HAVE_SSE2 && VPX_ARCH_X86_64
+
+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve_avg_avx2, 8)
+WRAP(convolve8_horiz_avx2, 8)
+WRAP(convolve8_avg_horiz_avx2, 8)
+WRAP(convolve8_vert_avx2, 8)
+WRAP(convolve8_avg_vert_avx2, 8)
+WRAP(convolve8_avx2, 8)
+WRAP(convolve8_avg_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve_avg_avx2, 10)
+WRAP(convolve8_avx2, 10)
+WRAP(convolve8_horiz_avx2, 10)
+WRAP(convolve8_vert_avx2, 10)
+WRAP(convolve8_avg_avx2, 10)
+WRAP(convolve8_avg_horiz_avx2, 10)
+WRAP(convolve8_avg_vert_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve_avg_avx2, 12)
+WRAP(convolve8_avx2, 12)
+WRAP(convolve8_horiz_avx2, 12)
+WRAP(convolve8_vert_avx2, 12)
+WRAP(convolve8_avg_avx2, 12)
+WRAP(convolve8_avg_horiz_avx2, 12)
+WRAP(convolve8_avg_vert_avx2, 12)
+#endif  // HAVE_AVX2
 
 #if HAVE_NEON
 WRAP(convolve_copy_neon, 8)
@@ -939,6 +1682,30 @@ WRAP(convolve8_neon, 12)
 WRAP(convolve8_avg_neon, 12)
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+WRAP(convolve8_horiz_sve, 8)
+WRAP(convolve8_avg_horiz_sve, 8)
+WRAP(convolve8_horiz_sve, 10)
+WRAP(convolve8_avg_horiz_sve, 10)
+WRAP(convolve8_horiz_sve, 12)
+WRAP(convolve8_avg_horiz_sve, 12)
+#endif  // HAVE_SVE
+
+#if HAVE_SVE2
+WRAP(convolve8_sve2, 8)
+WRAP(convolve8_avg_sve2, 8)
+WRAP(convolve8_vert_sve2, 8)
+WRAP(convolve8_avg_vert_sve2, 8)
+WRAP(convolve8_sve2, 10)
+WRAP(convolve8_avg_sve2, 10)
+WRAP(convolve8_vert_sve2, 10)
+WRAP(convolve8_avg_vert_sve2, 10)
+WRAP(convolve8_sve2, 12)
+WRAP(convolve8_avg_sve2, 12)
+WRAP(convolve8_vert_sve2, 12)
+WRAP(convolve8_avg_vert_sve2, 12)
+#endif  // HAVE_SVE2
+
 WRAP(convolve_copy_c, 8)
 WRAP(convolve_avg_c, 8)
 WRAP(convolve8_horiz_c, 8)
@@ -986,9 +1753,9 @@ const ConvolveFunctions convolve12_c(
     wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
     wrap_convolve8_avg_c_12, 12);
-const ConvolveParam kArrayConvolve_c[] = {
-  ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
-};
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c),
+                                           ALL_SIZES(convolve10_c),
+                                           ALL_SIZES(convolve12_c) };
 
 #else
 const ConvolveFunctions convolve8_c(
@@ -999,9 +1766,107 @@ const ConvolveFunctions convolve8_c(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
 #endif
-INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c));
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_c));
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#if CONFIG_VP9_HIGHBITDEPTH
+#define WRAP12TAP(func, bd)                                                  \
+  void wrap_##func##_##bd(                                                   \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
+      ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4,         \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,   \
+                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \
+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);         \
+  }
 
-#if HAVE_SSE2 && ARCH_X86_64
+#if HAVE_AVX2
+WRAP12TAP(convolve12_horiz_avx2, 8)
+WRAP12TAP(convolve12_vert_avx2, 8)
+WRAP12TAP(convolve12_avx2, 8)
+WRAP12TAP(convolve12_horiz_avx2, 10)
+WRAP12TAP(convolve12_vert_avx2, 10)
+WRAP12TAP(convolve12_avx2, 10)
+WRAP12TAP(convolve12_horiz_avx2, 12)
+WRAP12TAP(convolve12_vert_avx2, 12)
+WRAP12TAP(convolve12_avx2, 12)
+#endif  // HAVE_AVX2
+
+#if HAVE_SSSE3
+WRAP12TAP(convolve12_horiz_ssse3, 8)
+WRAP12TAP(convolve12_vert_ssse3, 8)
+WRAP12TAP(convolve12_ssse3, 8)
+WRAP12TAP(convolve12_horiz_ssse3, 10)
+WRAP12TAP(convolve12_vert_ssse3, 10)
+WRAP12TAP(convolve12_ssse3, 10)
+WRAP12TAP(convolve12_horiz_ssse3, 12)
+WRAP12TAP(convolve12_vert_ssse3, 12)
+WRAP12TAP(convolve12_ssse3, 12)
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+WRAP12TAP(convolve12_horiz_neon, 8)
+WRAP12TAP(convolve12_vert_neon, 8)
+WRAP12TAP(convolve12_neon, 8)
+WRAP12TAP(convolve12_horiz_neon, 10)
+WRAP12TAP(convolve12_vert_neon, 10)
+WRAP12TAP(convolve12_neon, 10)
+WRAP12TAP(convolve12_horiz_neon, 12)
+WRAP12TAP(convolve12_vert_neon, 12)
+WRAP12TAP(convolve12_neon, 12)
+#endif  // HAVE_NEON
+
+#if HAVE_SVE2
+WRAP12TAP(convolve12_horiz_sve2, 8)
+WRAP12TAP(convolve12_vert_sve2, 8)
+WRAP12TAP(convolve12_sve2, 8)
+WRAP12TAP(convolve12_horiz_sve2, 10)
+WRAP12TAP(convolve12_vert_sve2, 10)
+WRAP12TAP(convolve12_sve2, 10)
+WRAP12TAP(convolve12_horiz_sve2, 12)
+WRAP12TAP(convolve12_vert_sve2, 12)
+WRAP12TAP(convolve12_sve2, 12)
+#endif  // HAVE_SVE2
+
+WRAP12TAP(convolve12_horiz_c, 8)
+WRAP12TAP(convolve12_vert_c, 8)
+WRAP12TAP(convolve12_c, 8)
+WRAP12TAP(convolve12_horiz_c, 10)
+WRAP12TAP(convolve12_vert_c, 10)
+WRAP12TAP(convolve12_c, 10)
+WRAP12TAP(convolve12_horiz_c, 12)
+WRAP12TAP(convolve12_vert_c, 12)
+WRAP12TAP(convolve12_c, 12)
+#undef WRAP12TAP
+
+const ConvolveFunctions12Tap convolve12tap_8bit_c(wrap_convolve12_horiz_c_8,
+                                                  wrap_convolve12_vert_c_8,
+                                                  wrap_convolve12_c_8, 8);
+
+const ConvolveFunctions12Tap convolve12tap_10bit_c(wrap_convolve12_horiz_c_10,
+                                                   wrap_convolve12_vert_c_10,
+                                                   wrap_convolve12_c_10, 10);
+
+const ConvolveFunctions12Tap convolve12tap_12bit_c(wrap_convolve12_horiz_c_12,
+                                                   wrap_convolve12_vert_c_12,
+                                                   wrap_convolve12_c_12, 12);
+
+const Convolve12TapParam kArrayConvolve12Tap_c[] = {
+  ALL_SIZES_12TAP(convolve12tap_8bit_c), ALL_SIZES_12TAP(convolve12tap_10bit_c),
+  ALL_SIZES_12TAP(convolve12tap_12bit_c)
+};
+#else
+const ConvolveFunctions12Tap convolve12Tap_c(vpx_convolve12_horiz_c,
+                                             vpx_convolve12_vert_c,
+                                             vpx_convolve12_c, 0);
+const Convolve12TapParam kArrayConvolve12Tap_c[] = { ALL_SIZES_12TAP(
+    convolve12Tap_c) };
+#endif
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_c));
+#endif
+
+#if HAVE_SSE2 && VPX_ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
     wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
@@ -1040,8 +1905,8 @@ const ConvolveFunctions convolve8_sse2(
 
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sse2));
 #endif
 
 #if HAVE_SSSE3
@@ -1053,22 +1918,113 @@ const ConvolveFunctions convolve8_ssse3(
     vpx_scaled_avg_vert_c, vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
 
 const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
-INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
+INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_ssse3));
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions12Tap convolve12tap_8bit_ssse3(
+    wrap_convolve12_horiz_ssse3_8, wrap_convolve12_vert_ssse3_8,
+    wrap_convolve12_ssse3_8, 8);
+
+const ConvolveFunctions12Tap convolve12tap_10bit_ssse3(
+    wrap_convolve12_horiz_ssse3_10, wrap_convolve12_vert_ssse3_10,
+    wrap_convolve12_ssse3_10, 10);
+
+const ConvolveFunctions12Tap convolve12tap_12bit_ssse3(
+    wrap_convolve12_horiz_ssse3_12, wrap_convolve12_vert_ssse3_12,
+    wrap_convolve12_ssse3_12, 12);
+
+const Convolve12TapParam kArrayConvolve12Tap_ssse3[] = {
+  ALL_SIZES_12TAP(convolve12tap_8bit_ssse3),
+  ALL_SIZES_12TAP(convolve12tap_10bit_ssse3),
+  ALL_SIZES_12TAP(convolve12tap_12bit_ssse3)
+};
+#else
+const ConvolveFunctions12Tap convolve12_ssse3(vpx_convolve12_horiz_ssse3,
+                                              vpx_convolve12_vert_ssse3,
+                                              vpx_convolve12_ssse3, 0);
+const Convolve12TapParam kArrayConvolve12Tap_ssse3[] = { ALL_SIZES_12TAP(
+    convolve12_ssse3) };
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_ssse3));
+#endif  // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
 #endif
 
-#if HAVE_AVX2 && HAVE_SSSE3
+#if HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_avx2(
+    wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
+    wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8,
+    wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8,
+    wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_avx2(
+    wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
+    wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10,
+    wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10,
+    wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10,
+    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, 10);
+const ConvolveFunctions convolve12_avx2(
+    wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
+    wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12,
+    wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12,
+    wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12,
+    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, 12);
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2),
+                                               ALL_SIZES(convolve10_avx2),
+                                               ALL_SIZES(convolve12_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_avx2));
+#else   // !CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_avx2(
     vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
-    vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
-    vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+    vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2,
+    vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
     vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
-
 const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
-INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_avx2));
-#endif  // HAVE_AVX2 && HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_avx2));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions12Tap convolve12Tap_8bit_avx2(
+    wrap_convolve12_horiz_avx2_8, wrap_convolve12_vert_avx2_8,
+    wrap_convolve12_avx2_8, 8);
+
+const ConvolveFunctions12Tap convolve12Tap_10bit_avx2(
+    wrap_convolve12_horiz_avx2_10, wrap_convolve12_vert_avx2_10,
+    wrap_convolve12_avx2_10, 10);
+
+const ConvolveFunctions12Tap convolve12Tap_12bit_avx2(
+    wrap_convolve12_horiz_avx2_12, wrap_convolve12_vert_avx2_12,
+    wrap_convolve12_avx2_12, 12);
+
+const Convolve12TapParam kArrayConvolve12Tap_avx2[] = {
+  ALL_SIZES_12TAP(convolve12Tap_8bit_avx2),
+  ALL_SIZES_12TAP(convolve12Tap_10bit_avx2),
+  ALL_SIZES_12TAP(convolve12Tap_12bit_avx2)
+};
+#else
+const ConvolveFunctions12Tap convolve12Tap_avx2(vpx_convolve12_horiz_avx2,
+                                                vpx_convolve12_vert_avx2,
+                                                vpx_convolve12_avx2, 0);
+const Convolve12TapParam kArrayConvolve12Tap_avx2[] = { ALL_SIZES_12TAP(
+    convolve12Tap_avx2) };
+#endif
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_avx2));
+#endif
+#endif  // HAVE_AVX2
 
 #if HAVE_NEON
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1105,14 +2061,188 @@ const ConvolveFunctions convolve8_neon(
     vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon,
     vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
-    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0);
 
 const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon));
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions12Tap convolve12tap_8bit_neon(
+    wrap_convolve12_horiz_neon_8, wrap_convolve12_vert_neon_8,
+    wrap_convolve12_neon_8, 8);
+
+const ConvolveFunctions12Tap convolve12tap_10bit_neon(
+    wrap_convolve12_horiz_neon_10, wrap_convolve12_vert_neon_10,
+    wrap_convolve12_neon_10, 10);
+
+const ConvolveFunctions12Tap convolve12tap_12bit_neon(
+    wrap_convolve12_horiz_neon_12, wrap_convolve12_vert_neon_12,
+    wrap_convolve12_neon_12, 12);
+
+const Convolve12TapParam kArrayConvolve12Tap_neon[] = {
+  ALL_SIZES_12TAP(convolve12tap_8bit_neon),
+  ALL_SIZES_12TAP(convolve12tap_10bit_neon),
+  ALL_SIZES_12TAP(convolve12tap_12bit_neon)
+};
+
+#else
+
+const ConvolveFunctions12Tap convolve12Tap_neon(vpx_convolve12_horiz_neon,
+                                                vpx_convolve12_vert_neon,
+                                                vpx_convolve12_neon, 0);
+const Convolve12TapParam kArrayConvolve12Tap_neon[] = { ALL_SIZES_12TAP(
+    convolve12Tap_neon) };
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_neon));
+#endif  // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
 #endif  // HAVE_NEON
 
+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod,
+    vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod,
+    vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod,
+    vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES(
+    convolve8_neon_dotprod) };
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_dotprod));
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+const ConvolveFunctions12Tap convolve12Tap_neon_dotprod(
+    vpx_convolve12_horiz_neon_dotprod, vpx_convolve12_vert_neon_dotprod,
+    vpx_convolve12_neon_dotprod, 0);
+const Convolve12TapParam kArrayConvolve12Tap_neon_dotprod[] = { ALL_SIZES_12TAP(
+    convolve12Tap_neon_dotprod) };
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_neon_dotprod));
+#endif
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve(
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8,
+    wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
+    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
+    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
+    wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+    wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10,
+    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
+    wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    10);
+const ConvolveFunctions convolve12_sve(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+    wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12,
+    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
+    wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    12);
+
+const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve),
+                                             ALL_SIZES(convolve10_sve),
+                                             ALL_SIZES(convolve12_sve) };
+INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sve));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SVE
+
+#if HAVE_SVE2
+#if CONFIG_VP9_HIGHBITDEPTH
+const ConvolveFunctions convolve8_sve2(
+    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8,
+    wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_sve2_8,
+    wrap_convolve8_avg_sve2_8, wrap_convolve8_horiz_c_8,
+    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+const ConvolveFunctions convolve10_sve2(
+    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10,
+    wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_sve2_10,
+    wrap_convolve8_avg_sve2_10, wrap_convolve8_horiz_c_10,
+    wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
+    wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
+    10);
+const ConvolveFunctions convolve12_sve2(
+    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12,
+    wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_sve2_12,
+    wrap_convolve8_avg_sve2_12, wrap_convolve8_horiz_c_12,
+    wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
+    wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
+    12);
+
+const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2),
+                                              ALL_SIZES(convolve10_sve2),
+                                              ALL_SIZES(convolve12_sve2) };
+INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sve2));
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+const ConvolveFunctions12Tap convolve12tap_8bit_sve2(
+    wrap_convolve12_horiz_sve2_8, wrap_convolve12_vert_sve2_8,
+    wrap_convolve12_sve2_8, 8);
+
+const ConvolveFunctions12Tap convolve12tap_10bit_sve2(
+    wrap_convolve12_horiz_sve2_10, wrap_convolve12_vert_sve2_10,
+    wrap_convolve12_sve2_10, 10);
+
+const ConvolveFunctions12Tap convolve12tap_12bit_sve2(
+    wrap_convolve12_horiz_sve2_12, wrap_convolve12_vert_sve2_12,
+    wrap_convolve12_sve2_12, 12);
+
+const Convolve12TapParam kArrayConvolve12Tap_sve2[] = {
+  ALL_SIZES_12TAP(convolve12tap_8bit_sve2),
+  ALL_SIZES_12TAP(convolve12tap_10bit_sve2),
+  ALL_SIZES_12TAP(convolve12tap_12bit_sve2)
+};
+
+INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_sve2));
+#endif  // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SVE2
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
+    vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm,
+    vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm,
+    vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES(
+    convolve8_neon_i8mm) };
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_i8mm));
+
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+const ConvolveFunctions12Tap convolve12Tap_neon_i8mm(
+    vpx_convolve12_horiz_neon_i8mm, vpx_convolve12_vert_neon_i8mm,
+    vpx_convolve12_neon_i8mm, 0);
+const Convolve12TapParam kArrayConvolve12Tap_neon_i8mm[] = { ALL_SIZES_12TAP(
+    convolve12Tap_neon_i8mm) };
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest12Tap,
+                         ::testing::ValuesIn(kArrayConvolve12Tap_neon_i8mm));
+#endif
+#endif  // HAVE_NEON_I8MM
+
 #if HAVE_DSPR2
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2,
@@ -1122,8 +2252,8 @@ const ConvolveFunctions convolve8_dspr2(
     vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) };
-INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_dspr2));
+INSTANTIATE_TEST_SUITE_P(DSPR2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_dspr2));
 #endif  // HAVE_DSPR2
 
 #if HAVE_MSA
@@ -1132,10 +2262,47 @@ const ConvolveFunctions convolve8_msa(
     vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa,
     vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
-    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0);
 
 const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
-INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
-                        ::testing::ValuesIn(kArrayConvolve8_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_msa));
 #endif  // HAVE_MSA
+
+#if HAVE_LSX
+const ConvolveFunctions convolve8_lsx(
+    vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx,
+    vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx,
+    vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) };
+INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_lsx));
+#endif  // HAVE_LSX
+
+#if HAVE_VSX
+const ConvolveFunctions convolve8_vsx(
+    vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx,
+    vpx_convolve8_avg_horiz_vsx, vpx_convolve8_vert_vsx,
+    vpx_convolve8_avg_vert_vsx, vpx_convolve8_vsx, vpx_convolve8_avg_vsx,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) };
+INSTANTIATE_TEST_SUITE_P(VSX, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_vsx));
+#endif  // HAVE_VSX
+
+#if HAVE_MMI
+const ConvolveFunctions convolve8_mmi(
+    vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi,
+    vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi,
+    vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) };
+INSTANTIATE_TEST_SUITE_P(MMI, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_mmi));
+#endif  // HAVE_MMI
 }  // namespace
diff --git a/media/libvpx/libvpx/test/cpu_speed_test.cc b/media/libvpx/libvpx/test/cpu_speed_test.cc
index 404b5b44f4..6e0a046633 100644
--- a/media/libvpx/libvpx/test/cpu_speed_test.cc
+++ b/media/libvpx/libvpx/test/cpu_speed_test.cc
@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -26,9 +26,9 @@ class CpuSpeedTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
         tune_content_(VP9E_CONTENT_DEFAULT) {}
-  virtual ~CpuSpeedTest() {}
+  ~CpuSpeedTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -40,11 +40,11 @@ class CpuSpeedTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+  void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -56,7 +56,7 @@ class CpuSpeedTest
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
   }
 
@@ -105,7 +105,7 @@ TEST_P(CpuSpeedTest, TestTuneScreen) {
   ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
   cfg_.g_timebase = video.timebase();
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 2000;
   cfg_.rc_max_quantizer = 63;
   cfg_.rc_min_quantizer = 0;
@@ -148,9 +148,6 @@ TEST_P(CpuSpeedTest, TestLowBitrate) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest,
-                          ::testing::Values(::libvpx_test::kTwoPassGood,
-                                            ::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kRealTime),
-                          ::testing::Range(0, 9));
+VP9_INSTANTIATE_TEST_SUITE(CpuSpeedTest, ONE_PASS_TEST_MODES,
+                           ::testing::Range(0, 10));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/cq_test.cc b/media/libvpx/libvpx/test/cq_test.cc
index 20e1f0f3de..103d1b8dca 100644
--- a/media/libvpx/libvpx/test/cq_test.cc
+++ b/media/libvpx/libvpx/test/cq_test.cc
@@ -9,11 +9,12 @@
  */
 #include <cmath>
 #include <map>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "vpx_config.h"
 
 namespace {
 
@@ -27,11 +28,11 @@ class CQTest : public ::libvpx_test::EncoderTest,
                public ::libvpx_test::CodecTestWithParam<int> {
  public:
   // maps the cqlevel to the bitrate produced.
-  typedef std::map<int, uint32_t> BitrateMap;
+  using BitrateMap = std::map<int, uint32_t>;
 
-  static void SetUpTestCase() { bitrates_.clear(); }
+  static void SetUpTestSuite() { bitrates_.clear(); }
 
-  static void TearDownTestCase() {
+  static void TearDownTestSuite() {
     ASSERT_TRUE(!HasFailure())
         << "skipping bitrate validation due to earlier failure.";
     uint32_t prev_actual_bitrate = kCQTargetBitrate;
@@ -50,22 +51,22 @@ class CQTest : public ::libvpx_test::EncoderTest,
     init_flags_ = VPX_CODEC_USE_PSNR;
   }
 
-  virtual ~CQTest() {}
+  ~CQTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     file_size_ = 0;
     psnr_ = 0.0;
     n_frames_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       if (cfg_.rc_end_usage == VPX_CQ) {
         encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
       }
@@ -73,12 +74,12 @@ class CQTest : public ::libvpx_test::EncoderTest,
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);
     n_frames_++;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     file_size_ += pkt->data.frame.sz;
   }
 
@@ -104,6 +105,10 @@ CQTest::BitrateMap CQTest::bitrates_;
 
 TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
   const vpx_rational timebase = { 33333333, 1000000000 };
+#if CONFIG_REALTIME_ONlY
+  GTEST_SKIP()
+      << "Non-zero g_lag_in_frames is unsupported with CONFIG_REALTIME_ONLY";
+#else
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = kCQTargetBitrate;
   cfg_.g_lag_in_frames = 25;
@@ -124,8 +129,9 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double vbr_psnr_lin = GetLinearPSNROverBitrate();
   EXPECT_GE(cq_psnr_lin, vbr_psnr_lin);
+#endif  // CONFIG_REALTIME_ONLY
 }
 
-VP8_INSTANTIATE_TEST_CASE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax,
-                                                   kCQLevelStep));
+VP8_INSTANTIATE_TEST_SUITE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax,
+                                                    kCQLevelStep));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/cx_set_ref.sh b/media/libvpx/libvpx/test/cx_set_ref.sh
index 0a58dc187b..0a3d50ce1f 100644
--- a/media/libvpx/libvpx/test/cx_set_ref.sh
+++ b/media/libvpx/libvpx/test/cx_set_ref.sh
@@ -38,7 +38,7 @@ vpx_set_ref() {
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-      "${ref_frame_num}" ${devnull}
+      "${ref_frame_num}" ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libvpx/libvpx/test/datarate_test.cc b/media/libvpx/libvpx/test/datarate_test.cc
deleted file mode 100644
index 98d77285ab..0000000000
--- a/media/libvpx/libvpx/test/datarate_test.cc
+++ /dev/null
@@ -1,1476 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/encode_test_driver.h"
-#include "test/i420_video_source.h"
-#include "test/util.h"
-#include "test/y4m_video_source.h"
-#include "vpx/vpx_codec.h"
-
-namespace {
-
-class DatarateTestLarge
-    : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
- public:
-  DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
-
-  virtual ~DatarateTestLarge() {}
-
- protected:
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
-    ResetModel();
-  }
-
-  virtual void ResetModel() {
-    last_pts_ = 0;
-    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
-    frame_number_ = 0;
-    first_drop_ = 0;
-    bits_total_ = 0;
-    duration_ = 0.0;
-    denoiser_offon_test_ = 0;
-    denoiser_offon_period_ = -1;
-    gf_boost_ = 0;
-  }
-
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
-      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-      encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_);
-    }
-
-    if (denoiser_offon_test_) {
-      ASSERT_GT(denoiser_offon_period_, 0)
-          << "denoiser_offon_period_ is not positive.";
-      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
-        // Flip denoiser_on_ periodically
-        denoiser_on_ ^= 1;
-      }
-      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
-    }
-
-    const vpx_rational_t tb = video->timebase();
-    timebase_ = static_cast<double>(tb.num) / tb.den;
-    duration_ = 0;
-  }
-
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    // Time since last timestamp = duration.
-    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-
-    // TODO(jimbankoski): Remove these lines when the issue:
-    // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
-    // For now the codec assumes buffer starts at starting buffer rate
-    // plus one frame's time.
-    if (last_pts_ == 0) duration = 1;
-
-    // Add to the buffer the bits we'd expect from a constant bitrate server.
-    bits_in_buffer_model_ += static_cast<int64_t>(
-        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
-
-    /* Test the buffer model here before subtracting the frame. Do so because
-     * the way the leaky bucket model works in libvpx is to allow the buffer to
-     * empty - and then stop showing frames until we've got enough bits to
-     * show one. As noted in comment below (issue 495), this does not currently
-     * apply to key frames. For now exclude key frames in condition below. */
-    const bool key_frame =
-        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
-    if (!key_frame) {
-      ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-                                          << pkt->data.frame.pts;
-    }
-
-    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
-
-    // Subtract from the buffer the bits associated with a played back frame.
-    bits_in_buffer_model_ -= frame_size_in_bits;
-
-    // Update the running total of bits for end of test datarate checks.
-    bits_total_ += frame_size_in_bits;
-
-    // If first drop not set and we have a drop set it to this time.
-    if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
-
-    // Update the most recent pts.
-    last_pts_ = pkt->data.frame.pts;
-
-    // We update this so that we can calculate the datarate minus the last
-    // frame encoded in the file.
-    bits_in_last_frame_ = frame_size_in_bits;
-
-    ++frame_number_;
-  }
-
-  virtual void EndPassHook(void) {
-    if (bits_total_) {
-      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
-
-      duration_ = (last_pts_ + 1) * timebase_;
-
-      // Effective file datarate includes the time spent prebuffering.
-      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 /
-                            (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
-
-      file_datarate_ = file_size_in_kb / duration_;
-    }
-  }
-
-  vpx_codec_pts_t last_pts_;
-  int64_t bits_in_buffer_model_;
-  double timebase_;
-  int frame_number_;
-  vpx_codec_pts_t first_drop_;
-  int64_t bits_total_;
-  double duration_;
-  double file_datarate_;
-  double effective_datarate_;
-  int64_t bits_in_last_frame_;
-  int denoiser_on_;
-  int denoiser_offon_test_;
-  int denoiser_offon_period_;
-  int set_cpu_used_;
-  int gf_boost_;
-};
-
-#if CONFIG_TEMPORAL_DENOISING
-// Check basic datarate targeting, for a single bitrate, but loop over the
-// various denoiser settings.
-TEST_P(DatarateTestLarge, DenoiserLevels) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int j = 1; j < 5; ++j) {
-    // Run over the denoiser levels.
-    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
-    // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
-    // denoiserOnAggressive, and denoiserOnAdaptive.
-    denoiser_on_ = j;
-    cfg_.rc_target_bitrate = 300;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-        << " The datarate for the file exceeds the target!";
-
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-        << " The datarate for the file missed the target!";
-  }
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is off
-// and on.
-TEST_P(DatarateTestLarge, DenoiserOffOn) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
-  cfg_.rc_target_bitrate = 300;
-  ResetModel();
-  // The denoiser is off by default.
-  denoiser_on_ = 0;
-  // Set the offon test flag.
-  denoiser_offon_test_ = 1;
-  denoiser_offon_period_ = 100;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-      << " The datarate for the file exceeds the target!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-      << " The datarate for the file missed the target!";
-}
-#endif  // CONFIG_TEMPORAL_DENOISING
-
-TEST_P(DatarateTestLarge, BasicBufferModel) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  // 2 pass cbr datarate control has a bug hidden by the small # of
-  // frames selected in this encode. The problem is that even if the buffer is
-  // negative we produce a keyframe on a cutscene. Ignoring datarate
-  // constraints
-  // TODO(jimbankoski): ( Fix when issue
-  // http://code.google.com/p/webm/issues/detail?id=495 is addressed. )
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-
-  // There is an issue for low bitrates in real-time mode, where the
-  // effective_datarate slightly overshoots the target bitrate.
-  // This is same the issue as noted about (#495).
-  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
-  // when the issue is resolved.
-  for (int i = 100; i < 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-        << " The datarate for the file exceeds the target!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-        << " The datarate for the file missed the target!";
-  }
-}
-
-TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_max_quantizer = 36;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.rc_target_bitrate = 200;
-  cfg_.kf_mode = VPX_KF_DISABLED;
-
-  const int frame_count = 40;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, frame_count);
-
-  // Here we check that the first dropped frame gets earlier and earlier
-  // as the drop frame threshold is increased.
-
-  const int kDropFrameThreshTestStep = 30;
-  vpx_codec_pts_t last_drop = frame_count;
-  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
-    cfg_.rc_dropframe_thresh = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_LE(first_drop_, last_drop)
-        << " The first dropped frame for drop_thresh " << i
-        << " > first dropped frame for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    last_drop = first_drop_;
-  }
-}
-
-// Disabled for tsan, see:
-// https://bugs.chromium.org/p/webm/issues/detail?id=1049
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define BUILDING_WITH_TSAN
-#endif
-#endif
-#ifndef BUILDING_WITH_TSAN
-TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 30;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_threads = 2;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  cfg_.rc_target_bitrate = 200;
-  ResetModel();
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-      << " The datarate for the file exceeds the target!";
-
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-      << " The datarate for the file missed the target!";
-}
-#endif  // !BUILDING_WITH_TSAN
-
-class DatarateTestRealTime : public DatarateTestLarge {
- public:
-  virtual ~DatarateTestRealTime() {}
-};
-
-#if CONFIG_TEMPORAL_DENOISING
-// Check basic datarate targeting, for a single bitrate, but loop over the
-// various denoiser settings.
-TEST_P(DatarateTestRealTime, DenoiserLevels) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int j = 1; j < 5; ++j) {
-    // Run over the denoiser levels.
-    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
-    // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
-    // denoiserOnAggressive, and denoiserOnAdaptive.
-    denoiser_on_ = j;
-    cfg_.rc_target_bitrate = 300;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-        << " The datarate for the file exceeds the target!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-        << " The datarate for the file missed the target!";
-  }
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is off
-// and on.
-TEST_P(DatarateTestRealTime, DenoiserOffOn) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
-  cfg_.rc_target_bitrate = 300;
-  ResetModel();
-  // The denoiser is off by default.
-  denoiser_on_ = 0;
-  // Set the offon test flag.
-  denoiser_offon_test_ = 1;
-  denoiser_offon_period_ = 100;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-      << " The datarate for the file exceeds the target!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-      << " The datarate for the file missed the target!";
-}
-#endif  // CONFIG_TEMPORAL_DENOISING
-
-TEST_P(DatarateTestRealTime, BasicBufferModel) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  // 2 pass cbr datarate control has a bug hidden by the small # of
-  // frames selected in this encode. The problem is that even if the buffer is
-  // negative we produce a keyframe on a cutscene, ignoring datarate
-  // constraints
-  // TODO(jimbankoski): Fix when issue
-  // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed.
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-
-  // There is an issue for low bitrates in real-time mode, where the
-  // effective_datarate slightly overshoots the target bitrate.
-  // This is same the issue as noted above (#495).
-  // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
-  // when the issue is resolved.
-  for (int i = 100; i <= 700; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-        << " The datarate for the file exceeds the target!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-        << " The datarate for the file missed the target!";
-  }
-}
-
-TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_max_quantizer = 36;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.rc_target_bitrate = 200;
-  cfg_.kf_mode = VPX_KF_DISABLED;
-
-  const int frame_count = 40;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, frame_count);
-
-  // Check that the first dropped frame gets earlier and earlier
-  // as the drop frame threshold is increased.
-
-  const int kDropFrameThreshTestStep = 30;
-  vpx_codec_pts_t last_drop = frame_count;
-  for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
-    cfg_.rc_dropframe_thresh = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_LE(first_drop_, last_drop)
-        << " The first dropped frame for drop_thresh " << i
-        << " > first dropped frame for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    last_drop = first_drop_;
-  }
-}
-
-// Disabled for tsan, see:
-// https://bugs.chromium.org/p/webm/issues/detail?id=1049
-
-#ifndef BUILDING_WITH_TSAN
-TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 30;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  // Encode using multiple threads.
-  cfg_.g_threads = 2;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  cfg_.rc_target_bitrate = 200;
-  ResetModel();
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-      << " The datarate for the file exceeds the target!";
-
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-      << " The datarate for the file missed the target!";
-}
-#endif
-
-TEST_P(DatarateTestRealTime, GFBoost) {
-  denoiser_on_ = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_error_resilient = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 300);
-  cfg_.rc_target_bitrate = 300;
-  ResetModel();
-  // Apply a gf boost.
-  gf_boost_ = 50;
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
-      << " The datarate for the file exceeds the target!";
-
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
-      << " The datarate for the file missed the target!";
-}
-
-class DatarateTestVP9Large
-    : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
- public:
-  DatarateTestVP9Large() : EncoderTest(GET_PARAM(0)) {}
-
- protected:
-  virtual ~DatarateTestVP9Large() {}
-
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    set_cpu_used_ = GET_PARAM(2);
-    ResetModel();
-  }
-
-  virtual void ResetModel() {
-    last_pts_ = 0;
-    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
-    frame_number_ = 0;
-    tot_frame_number_ = 0;
-    first_drop_ = 0;
-    num_drops_ = 0;
-    // Denoiser is off by default.
-    denoiser_on_ = 0;
-    // For testing up to 3 layers.
-    for (int i = 0; i < 3; ++i) {
-      bits_total_[i] = 0;
-    }
-    denoiser_offon_test_ = 0;
-    denoiser_offon_period_ = -1;
-  }
-
-  //
-  // Frame flags and layer id for temporal layers.
-  //
-
-  // For two layers, test pattern is:
-  //   1     3
-  // 0    2     .....
-  // For three layers, test pattern is:
-  //   1      3    5      7
-  //      2           6
-  // 0          4            ....
-  // LAST is always update on base/layer 0, GOLDEN is updated on layer 1.
-  // For this 3 layer example, the 2nd enhancement layer (layer 2) does not
-  // update any reference frames.
-  int SetFrameFlags(int frame_num, int num_temp_layers) {
-    int frame_flags = 0;
-    if (num_temp_layers == 2) {
-      if (frame_num % 2 == 0) {
-        // Layer 0: predict from L and ARF, update L.
-        frame_flags =
-            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
-      } else {
-        // Layer 1: predict from L, G and ARF, and update G.
-        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
-                      VP8_EFLAG_NO_UPD_ENTROPY;
-      }
-    } else if (num_temp_layers == 3) {
-      if (frame_num % 4 == 0) {
-        // Layer 0: predict from L and ARF; update L.
-        frame_flags =
-            VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
-      } else if ((frame_num - 2) % 4 == 0) {
-        // Layer 1: predict from L, G, ARF; update G.
-        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
-      } else if ((frame_num - 1) % 2 == 0) {
-        // Layer 2: predict from L, G, ARF; update none.
-        frame_flags =
-            VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
-      }
-    }
-    return frame_flags;
-  }
-
-  int SetLayerId(int frame_num, int num_temp_layers) {
-    int layer_id = 0;
-    if (num_temp_layers == 2) {
-      if (frame_num % 2 == 0) {
-        layer_id = 0;
-      } else {
-        layer_id = 1;
-      }
-    } else if (num_temp_layers == 3) {
-      if (frame_num % 4 == 0) {
-        layer_id = 0;
-      } else if ((frame_num - 2) % 4 == 0) {
-        layer_id = 1;
-      } else if ((frame_num - 1) % 2 == 0) {
-        layer_id = 2;
-      }
-    }
-    return layer_id;
-  }
-
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-
-    if (denoiser_offon_test_) {
-      ASSERT_GT(denoiser_offon_period_, 0)
-          << "denoiser_offon_period_ is not positive.";
-      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
-        // Flip denoiser_on_ periodically
-        denoiser_on_ ^= 1;
-      }
-    }
-
-    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
-
-    if (cfg_.ts_number_layers > 1) {
-      if (video->frame() == 0) {
-        encoder->Control(VP9E_SET_SVC, 1);
-      }
-      vpx_svc_layer_id_t layer_id;
-      layer_id.spatial_layer_id = 0;
-      frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
-      layer_id.temporal_layer_id =
-          SetLayerId(video->frame(), cfg_.ts_number_layers);
-      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
-    }
-    const vpx_rational_t tb = video->timebase();
-    timebase_ = static_cast<double>(tb.num) / tb.den;
-    duration_ = 0;
-  }
-
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    // Time since last timestamp = duration.
-    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-
-    if (duration > 1) {
-      // If first drop not set and we have a drop set it to this time.
-      if (!first_drop_) first_drop_ = last_pts_ + 1;
-      // Update the number of frame drops.
-      num_drops_ += static_cast<int>(duration - 1);
-      // Update counter for total number of frames (#frames input to encoder).
-      // Needed for setting the proper layer_id below.
-      tot_frame_number_ += static_cast<int>(duration - 1);
-    }
-
-    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
-
-    // Add to the buffer the bits we'd expect from a constant bitrate server.
-    bits_in_buffer_model_ += static_cast<int64_t>(
-        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
-
-    // Buffer should not go negative.
-    ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-                                        << pkt->data.frame.pts;
-
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
-
-    // Update the total encoded bits. For temporal layers, update the cumulative
-    // encoded bits per layer.
-    for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
-      bits_total_[i] += frame_size_in_bits;
-    }
-
-    // Update the most recent pts.
-    last_pts_ = pkt->data.frame.pts;
-    ++frame_number_;
-    ++tot_frame_number_;
-  }
-
-  virtual void EndPassHook(void) {
-    for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
-         ++layer) {
-      duration_ = (last_pts_ + 1) * timebase_;
-      if (bits_total_[layer]) {
-        // Effective file datarate:
-        effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
-      }
-    }
-  }
-
-  vpx_codec_pts_t last_pts_;
-  double timebase_;
-  int frame_number_;      // Counter for number of non-dropped/encoded frames.
-  int tot_frame_number_;  // Counter for total number of input frames.
-  int64_t bits_total_[3];
-  double duration_;
-  double effective_datarate_[3];
-  int set_cpu_used_;
-  int64_t bits_in_buffer_model_;
-  vpx_codec_pts_t first_drop_;
-  int num_drops_;
-  int denoiser_on_;
-  int denoiser_offon_test_;
-  int denoiser_offon_period_;
-};
-
-// Check basic rate targeting for VBR mode with 0 lag.
-TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) {
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.g_error_resilient = 0;
-  cfg_.rc_end_usage = VPX_VBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 300);
-  for (int i = 400; i <= 800; i += 400) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
-        << " The datarate for the file is greater than target by too much!";
-  }
-}
-
-// Check basic rate targeting for VBR mode with non-zero lag.
-TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) {
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.g_error_resilient = 0;
-  cfg_.rc_end_usage = VPX_VBR;
-  // For non-zero lag, rate control will work (be within bounds) for
-  // real-time mode.
-  if (deadline_ == VPX_DL_REALTIME) {
-    cfg_.g_lag_in_frames = 15;
-  } else {
-    cfg_.g_lag_in_frames = 0;
-  }
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 300);
-  for (int i = 400; i <= 800; i += 400) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
-        << " The datarate for the file is greater than target by too much!";
-  }
-}
-
-// Check basic rate targeting for CBR mode.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int i = 150; i < 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-        << " The datarate for the file is greater than target by too much!";
-  }
-}
-
-// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
-TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 30;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  // Encode using multiple threads.
-  cfg_.g_threads = 2;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  cfg_.rc_target_bitrate = 200;
-  ResetModel();
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-
-// Check basic rate targeting for CBR.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
-  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
-
-  cfg_.g_profile = 1;
-  cfg_.g_timebase = video.timebase();
-
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-
-  for (int i = 250; i < 900; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
-              effective_datarate_[0] * 0.85)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
-              effective_datarate_[0] * 1.15)
-        << " The datarate for the file missed the target!"
-        << cfg_.rc_target_bitrate << " " << effective_datarate_;
-  }
-}
-
-// Check that (1) the first dropped frame gets earlier and earlier
-// as the drop frame threshold is increased, and (2) that the total number of
-// frame drops does not decrease as we increase frame drop threshold.
-// Use a lower qp-max to force some frame drops.
-TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_undershoot_pct = 20;
-  cfg_.rc_undershoot_pct = 20;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 50;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.rc_target_bitrate = 200;
-  cfg_.g_lag_in_frames = 0;
-  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
-  // interval (128).
-  cfg_.kf_max_dist = 9999;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-
-  const int kDropFrameThreshTestStep = 30;
-  vpx_codec_pts_t last_drop = 140;
-  int last_num_drops = 0;
-  for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
-    cfg_.rc_dropframe_thresh = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-        << " The datarate for the file is greater than target by too much!";
-    ASSERT_LE(first_drop_, last_drop)
-        << " The first dropped frame for drop_thresh " << i
-        << " > first dropped frame for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    ASSERT_GE(num_drops_, last_num_drops * 0.85)
-        << " The number of dropped frames for drop_thresh " << i
-        << " < number of dropped frames for drop_thresh "
-        << i - kDropFrameThreshTestStep;
-    last_drop = first_drop_;
-    last_num_drops = num_drops_;
-  }
-}
-
-// Check basic rate targeting for 2 temporal layers.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1).
-  cfg_.ss_number_layers = 1;
-  cfg_.ts_number_layers = 2;
-  cfg_.ts_rate_decimator[0] = 2;
-  cfg_.ts_rate_decimator[1] = 1;
-
-  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
-
-  if (deadline_ == VPX_DL_REALTIME) cfg_.g_error_resilient = 1;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
-  for (int i = 200; i <= 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    // 60-40 bitrate allocation for 2 temporal layers.
-    cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
-    cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-      ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
-          << " The datarate for the file is lower than target by too much, "
-             "for layer: "
-          << j;
-      ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
-          << " The datarate for the file is greater than target by too much, "
-             "for layer: "
-          << j;
-    }
-  }
-}
-
-// Check basic rate targeting for 3 temporal layers.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
-  cfg_.ss_number_layers = 1;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-
-  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
-  for (int i = 200; i <= 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    // 40-20-40 bitrate allocation for 3 temporal layers.
-    cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
-    cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
-    cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-      // TODO(yaowu): Work out more stable rc control strategy and
-      //              Adjust the thresholds to be tighter than .75.
-      ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75)
-          << " The datarate for the file is lower than target by too much, "
-             "for layer: "
-          << j;
-      // TODO(yaowu): Work out more stable rc control strategy and
-      //              Adjust the thresholds to be tighter than 1.25.
-      ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25)
-          << " The datarate for the file is greater than target by too much, "
-             "for layer: "
-          << j;
-    }
-  }
-}
-
-// Check basic rate targeting for 3 temporal layers, with frame dropping.
-// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
-// frame drop threshold, to force frame dropping.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
-  cfg_.rc_dropframe_thresh = 20;
-  cfg_.rc_max_quantizer = 45;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
-  cfg_.ss_number_layers = 1;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-
-  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
-  cfg_.rc_target_bitrate = 200;
-  ResetModel();
-  // 40-20-40 bitrate allocation for 3 temporal layers.
-  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
-  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
-  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
-    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
-        << " The datarate for the file is lower than target by too much, "
-           "for layer: "
-        << j;
-    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
-        << " The datarate for the file is greater than target by too much, "
-           "for layer: "
-        << j;
-    // Expect some frame drops in this test: for this 200 frames test,
-    // expect at least 10% and not more than 60% drops.
-    ASSERT_GE(num_drops_, 20);
-    ASSERT_LE(num_drops_, 130);
-  }
-}
-
-#if CONFIG_VP9_TEMPORAL_DENOISING
-class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large {
- public:
-  virtual ~DatarateTestVP9LargeDenoiser() {}
-};
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is on.
-TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 2;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-
-  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
-  // there is only one denoiser mode: denoiserYonly(which is 1),
-  // but may add more modes in the future.
-  cfg_.rc_target_bitrate = 300;
-  ResetModel();
-  // Turn on the denoiser.
-  denoiser_on_ = 1;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is on,
-// for clip with high noise level.
-TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 2;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200);
-
-  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
-  // there is only one denoiser mode: denoiserYonly(which is 1),
-  // but may add more modes in the future.
-  cfg_.rc_target_bitrate = 1000;
-  ResetModel();
-  // Turn on the denoiser.
-  denoiser_on_ = 1;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is off
-// and on.
-TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 2;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
-
-  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
-  // there is only one denoiser mode: denoiserYonly(which is 1),
-  // but may add more modes in the future.
-  cfg_.rc_target_bitrate = 300;
-  ResetModel();
-  // The denoiser is off by default.
-  denoiser_on_ = 0;
-  // Set the offon test flag.
-  denoiser_offon_test_ = 1;
-  denoiser_offon_period_ = 100;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-#endif  // CONFIG_VP9_TEMPORAL_DENOISING
-
-class DatarateOnePassCbrSvc
-    : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
- public:
-  DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) {
-    memset(&svc_params_, 0, sizeof(svc_params_));
-  }
-  virtual ~DatarateOnePassCbrSvc() {}
-
- protected:
-  virtual void SetUp() {
-    InitializeConfig();
-    SetMode(GET_PARAM(1));
-    speed_setting_ = GET_PARAM(2);
-    ResetModel();
-  }
-  virtual void ResetModel() {
-    last_pts_ = 0;
-    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
-    frame_number_ = 0;
-    first_drop_ = 0;
-    bits_total_ = 0;
-    duration_ = 0.0;
-    mismatch_psnr_ = 0.0;
-    mismatch_nframes_ = 0;
-  }
-  virtual void BeginPassHook(unsigned int /*pass*/) {}
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) {
-      int i;
-      for (i = 0; i < VPX_MAX_LAYERS; ++i) {
-        svc_params_.max_quantizers[i] = 63;
-        svc_params_.min_quantizers[i] = 0;
-      }
-      svc_params_.speed_per_layer[0] = 5;
-      for (i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
-        svc_params_.speed_per_layer[i] = speed_setting_;
-      }
-      encoder->Control(VP9E_SET_SVC, 1);
-      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
-      encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
-      encoder->Control(VP9E_SET_TILE_COLUMNS, 0);
-      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
-      encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
-      encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
-    }
-    const vpx_rational_t tb = video->timebase();
-    timebase_ = static_cast<double>(tb.num) / tb.den;
-    duration_ = 0;
-  }
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-    if (last_pts_ == 0) duration = 1;
-    bits_in_buffer_model_ += static_cast<int64_t>(
-        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
-    const bool key_frame =
-        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
-    if (!key_frame) {
-      // TODO(marpan): This check currently fails for some of the SVC tests,
-      // re-enable when issue (webm:1350) is resolved.
-      //  ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
-      //                                      << pkt->data.frame.pts;
-    }
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
-    bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits);
-    bits_total_ += frame_size_in_bits;
-    if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
-    last_pts_ = pkt->data.frame.pts;
-    bits_in_last_frame_ = frame_size_in_bits;
-    ++frame_number_;
-  }
-  virtual void EndPassHook(void) {
-    if (bits_total_) {
-      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
-      duration_ = (last_pts_ + 1) * timebase_;
-      file_datarate_ = file_size_in_kb / duration_;
-    }
-  }
-
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
-    double mismatch_psnr = compute_psnr(img1, img2);
-    mismatch_psnr_ += mismatch_psnr;
-    ++mismatch_nframes_;
-  }
-
-  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
-
-  vpx_codec_pts_t last_pts_;
-  int64_t bits_in_buffer_model_;
-  double timebase_;
-  int frame_number_;
-  vpx_codec_pts_t first_drop_;
-  int64_t bits_total_;
-  double duration_;
-  double file_datarate_;
-  size_t bits_in_last_frame_;
-  vpx_svc_extra_cfg_t svc_params_;
-  int speed_setting_;
-  double mismatch_psnr_;
-  int mismatch_nframes_;
-};
-static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
-                                  const vpx_svc_extra_cfg_t *svc_params,
-                                  int spatial_layers, int temporal_layers,
-                                  int temporal_layering_mode) {
-  int sl, spatial_layer_target;
-  float total = 0;
-  float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
-  for (sl = 0; sl < spatial_layers; ++sl) {
-    if (svc_params->scaling_factor_den[sl] > 0) {
-      alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 /
-                                svc_params->scaling_factor_den[sl]);
-      total += alloc_ratio[sl];
-    }
-  }
-  for (sl = 0; sl < spatial_layers; ++sl) {
-    enc_cfg->ss_target_bitrate[sl] = spatial_layer_target =
-        (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[sl] / total);
-    const int index = sl * temporal_layers;
-    if (temporal_layering_mode == 3) {
-      enc_cfg->layer_target_bitrate[index] = spatial_layer_target >> 1;
-      enc_cfg->layer_target_bitrate[index + 1] =
-          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
-      enc_cfg->layer_target_bitrate[index + 2] = spatial_layer_target;
-    } else if (temporal_layering_mode == 2) {
-      enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
-      enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target;
-    }
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
-// 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
-  // TODO(marpan): Check that effective_datarate for each layer hits the
-  // layer target_bitrate.
-  for (int i = 200; i <= 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
-// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
-  cfg_.rc_target_bitrate = 400;
-  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
-  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
-  for (int j = 64; j <= 67; j++) {
-    cfg_.kf_max_dist = j;
-    ResetModel();
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-        << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
-// 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 4;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 144;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 288;
-  svc_params_.scaling_factor_den[1] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
-  cfg_.rc_target_bitrate = 800;
-  ResetModel();
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-      << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
-      << " The datarate for the file is lower than the target by too much!";
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
-// 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 3;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 72;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 144;
-  svc_params_.scaling_factor_den[1] = 288;
-  svc_params_.scaling_factor_num[2] = 288;
-  svc_params_.scaling_factor_den[2] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
-  cfg_.rc_target_bitrate = 800;
-  ResetModel();
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-      << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
-      << " The datarate for the file is lower than the target by too much!";
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
-// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 3;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 72;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 144;
-  svc_params_.scaling_factor_den[1] = 288;
-  svc_params_.scaling_factor_num[2] = 288;
-  svc_params_.scaling_factor_den[2] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
-  cfg_.rc_target_bitrate = 800;
-  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
-  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
-  for (int j = 32; j <= 35; j++) {
-    cfg_.kf_max_dist = j;
-    ResetModel();
-    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                          cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
-        << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-  }
-}
-
-// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
-// 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 3;
-  cfg_.ts_number_layers = 3;
-  cfg_.ts_rate_decimator[0] = 4;
-  cfg_.ts_rate_decimator[1] = 2;
-  cfg_.ts_rate_decimator[2] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 4;
-  cfg_.temporal_layering_mode = 3;
-  svc_params_.scaling_factor_num[0] = 72;
-  svc_params_.scaling_factor_den[0] = 288;
-  svc_params_.scaling_factor_num[1] = 144;
-  svc_params_.scaling_factor_den[1] = 288;
-  svc_params_.scaling_factor_num[2] = 288;
-  svc_params_.scaling_factor_den[2] = 288;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.kf_max_dist = 9999;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
-  cfg_.rc_target_bitrate = 800;
-  ResetModel();
-  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-                        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
-      << " The datarate for the file exceeds the target by too much!";
-  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
-      << " The datarate for the file is lower than the target by too much!";
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-}
-
-// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial
-// downscale 5x5.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers5x5MultipleRuns) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.ss_number_layers = 2;
-  cfg_.ts_number_layers = 1;
-  cfg_.ts_rate_decimator[0] = 1;
-  cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 3;
-  cfg_.temporal_layering_mode = 0;
-  svc_params_.scaling_factor_num[0] = 256;
-  svc_params_.scaling_factor_den[0] = 1280;
-  svc_params_.scaling_factor_num[1] = 1280;
-  svc_params_.scaling_factor_den[1] = 1280;
-  cfg_.rc_dropframe_thresh = 0;
-  cfg_.kf_max_dist = 999999;
-  cfg_.kf_min_dist = 0;
-  cfg_.ss_target_bitrate[0] = 300;
-  cfg_.ss_target_bitrate[1] = 1400;
-  cfg_.layer_target_bitrate[0] = 300;
-  cfg_.layer_target_bitrate[1] = 1400;
-  cfg_.rc_target_bitrate = 1700;
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
-  ResetModel();
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
-}
-
-VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES,
-                          ::testing::Values(0));
-VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Values(-6, -12));
-VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
-                          ::testing::Values(::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kRealTime),
-                          ::testing::Range(2, 9));
-#if CONFIG_VP9_TEMPORAL_DENOISING
-VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 9));
-#endif
-VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 9));
-}  // namespace
diff --git a/media/libvpx/libvpx/test/dct16x16_test.cc b/media/libvpx/libvpx/test/dct16x16_test.cc
index f9745ed81f..a3382ab64e 100644
--- a/media/libvpx/libvpx/test/dct16x16_test.cc
+++ b/media/libvpx/libvpx/test/dct16x16_test.cc
@@ -11,8 +11,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
@@ -24,8 +25,9 @@
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_config.h"
 #include "vpx_ports/mem.h"
-#include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -222,17 +224,16 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
   }
 }
 
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        int tx_type);
+using FdctFunc = void (*)(const int16_t *in, tran_low_t *out, int stride);
+using IdctFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride);
+using FhtFunc = void (*)(const int16_t *in, tran_low_t *out, int stride,
+                         int tx_type);
+using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
+                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
-    Idct16x16Param;
+using Dct16x16Param = std::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t>;
+using Ht16x16Param = std::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t>;
+using Idct16x16Param = std::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>;
 
 void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                    int /*tx_type*/) {
@@ -255,11 +256,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
+  vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 
 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
@@ -273,43 +274,43 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
 }
 
 void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
 }
 
 void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
 }
 
 #if HAVE_SSE2
 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
+  vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 
 void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 
 void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 class Trans16x16TestBase {
  public:
-  virtual ~Trans16x16TestBase() {}
+  virtual ~Trans16x16TestBase() = default;
 
  protected:
   virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
@@ -353,7 +354,7 @@ class Trans16x16TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -475,10 +476,10 @@ class Trans16x16TestBase {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+        inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_,
                      tx_type_);
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
       if (bit_depth_ == VPX_BITS_8) {
@@ -530,8 +531,7 @@ class Trans16x16TestBase {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16));
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
 
@@ -543,18 +543,56 @@ class Trans16x16TestBase {
         const uint32_t diff = dst[j] - src[j];
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         const uint32_t error = diff * diff;
-        EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error
-                             << " at index " << j;
+        EXPECT_GE(1u, error)
+            << "Error: 16x16 IDCT has error " << error << " at index " << j;
       }
     }
   }
 
+  void RunSpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]);
+
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    vpx_usec_timer timer_c;
+    vpx_usec_timer_start(&timer_c);
+    for (int i = 0; i < count_test_block; ++i) {
+      vpx_fdct16x16_c(input_block, output_ref_block, pitch_);
+    }
+    vpx_usec_timer_mark(&timer_c);
+    c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c));
+
+    vpx_usec_timer timer_mod;
+    vpx_usec_timer_start(&timer_mod);
+    for (int i = 0; i < count_test_block; ++i) {
+      RunFwdTxfm(input_block, output_block, pitch_);
+    }
+
+    vpx_usec_timer_mark(&timer_mod);
+    simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   void CompareInvReference(IdctFunc ref_txfm, int thresh) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 10000;
     const int eob = 10;
     const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -585,9 +623,9 @@ class Trans16x16TestBase {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
       } else {
 #if CONFIG_VP9_HIGHBITDEPTH
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
 
@@ -605,6 +643,80 @@ class Trans16x16TestBase {
     }
   }
 
+  void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   int pitch_;
   int tx_type_;
   vpx_bit_depth_t bit_depth_;
@@ -616,9 +728,9 @@ class Trans16x16TestBase {
 class Trans16x16DCT : public Trans16x16TestBase,
                       public ::testing::TestWithParam<Dct16x16Param> {
  public:
-  virtual ~Trans16x16DCT() {}
+  ~Trans16x16DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -637,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase,
     inv_txfm_ref = idct16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -665,12 +777,14 @@ TEST_P(Trans16x16DCT, QuantCheck) {
 
 TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
 
+TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); }
+
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
-  virtual ~Trans16x16HT() {}
+  ~Trans16x16HT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -689,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase,
     inv_txfm_ref = iht16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -718,9 +832,9 @@ TEST_P(Trans16x16HT, QuantCheck) {
 class InvTrans16x16DCT : public Trans16x16TestBase,
                          public ::testing::TestWithParam<Idct16x16Param> {
  public:
-  virtual ~InvTrans16x16DCT() {}
+  ~InvTrans16x16DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -728,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
     pitch_ = 16;
     mask_ = (1 << bit_depth_) - 1;
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/,
+                  int /*stride*/) override {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -740,89 +855,34 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
   IdctFunc inv_txfm_;
   int thresh_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT);
 
 TEST_P(InvTrans16x16DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
 
-class PartialTrans16x16Test : public ::testing::TestWithParam<
-                                  std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > {
- public:
-  virtual ~PartialTrans16x16Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    bit_depth_ = GET_PARAM(1);
-  }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  FdctFunc fwd_txfm_;
-};
-
-TEST_P(PartialTrans16x16Test, Extremes) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  const int minval = -maxval;
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
-
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
+TEST_P(InvTrans16x16DCT, DISABLED_Speed) {
+  RunInvTrans16x16SpeedTest(ref_txfm_, thresh_);
 }
 
-TEST_P(PartialTrans16x16Test, Random) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int sum = 0;
-  for (int i = 0; i < kNumCoeffs; ++i) {
-    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
-    input[i] = val;
-    sum += val;
-  }
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
-  EXPECT_EQ(sum >> 1, output[0]);
-}
-
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, Trans16x16DCT,
     ::testing::Values(
         make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
         make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
         make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8)));
 #else
-INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_c,
-                                                     &vpx_idct16x16_256_add_c,
-                                                     0, VPX_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(C, Trans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct16x16_c,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      0, VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, Trans16x16HT,
     ::testing::Values(
         make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
@@ -837,37 +897,45 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    C, PartialTrans16x16Test,
-    ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12)));
 #else
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, Trans16x16HT,
     ::testing::Values(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_c,
-                                                     VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      6225, VPX_BITS_8)));
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, Trans16x16DCT,
-    ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon,
-                                 0, VPX_BITS_8)));
-#endif
+    ::testing::Values(make_tuple(&vpx_fdct16x16_neon,
+                                 &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0,
+                   VPX_BITS_8)));
+#endif  // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
     ::testing::Values(make_tuple(&vpx_fdct16x16_sse2,
                                  &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
                                  0, VPX_BITS_8),
@@ -877,13 +945,27 @@ INSTANTIATE_TEST_CASE_P(
                                  2, VPX_BITS_8),
                       make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
                                  3, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
-                                                     VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_avx2,
+                                 &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
     ::testing::Values(
         make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 0, VPX_BITS_10),
@@ -894,7 +976,7 @@ INSTANTIATE_TEST_CASE_P(
                    VPX_BITS_12),
         make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_c, 0,
                    VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
@@ -904,7 +986,7 @@ INSTANTIATE_TEST_CASE_P(
                    VPX_BITS_8)));
 // Optimizations take effect at a threshold of 3155, so we use a value close to
 // that to test both branches.
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, InvTrans16x16DCT,
     ::testing::Values(make_tuple(&idct16x16_10_add_10_c,
                                  &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
@@ -914,17 +996,14 @@ INSTANTIATE_TEST_CASE_P(
                                  &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
                       make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2,
                                  3167, VPX_BITS_12)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(MSA, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_msa,
-                                                     &vpx_idct16x16_256_add_msa,
-                                                     0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    MSA, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa,
+                                 0, VPX_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(
     MSA, Trans16x16HT,
     ::testing::Values(
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
@@ -932,8 +1011,19 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
                    VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
-                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    VSX, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx,
+                                 0, VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct16x16_lsx,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      0, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/media/libvpx/libvpx/test/dct32x32_test.cc b/media/libvpx/libvpx/test/dct32x32_test.cc
index a168e690ee..77447481cb 100644
--- a/media/libvpx/libvpx/test/dct32x32_test.cc
+++ b/media/libvpx/libvpx/test/dct32x32_test.cc
@@ -11,21 +11,24 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
-#include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
@@ -63,26 +66,30 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
   }
 }
 
-typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
+using FwdTxfmFunc = void (*)(const int16_t *in, tran_low_t *out, int stride);
+using InvTxfmFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride);
 
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
-    Trans32x32Param;
+using Trans32x32Param =
+    std::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>;
+
+using InvTrans32x32Param =
+    std::tuple<InvTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t, int, int>;
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);
+  vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
+class Trans32x32Test : public AbstractBench,
+                       public ::testing::TestWithParam<Trans32x32Param> {
  public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
+  ~Trans32x32Test() override = default;
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     version_ = GET_PARAM(2);  // 0: high precision forward transform
@@ -91,7 +98,7 @@ class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int version_;
@@ -99,8 +106,14 @@ class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
   int mask_;
   FwdTxfmFunc fwd_txfm_;
   InvTxfmFunc inv_txfm_;
+
+  int16_t *bench_in_;
+  tran_low_t *bench_out_;
+  void Run() override;
 };
 
+void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); }
+
 TEST_P(Trans32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   uint32_t max_error = 0;
@@ -137,7 +150,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       ASM_REGISTER_STATE_CHECK(
-          inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32));
+          inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32));
 #endif
     }
 
@@ -237,6 +250,19 @@ TEST_P(Trans32x32Test, MemCheck) {
   }
 }
 
+TEST_P(Trans32x32Test, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
+
+  bench_in_ = input_extreme_block;
+  bench_out_ = output_block;
+
+  RunNTimes(INT16_MAX);
+  PrintMedian("32x32");
+}
+
 TEST_P(Trans32x32Test, InverseAccuracy) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
@@ -275,7 +301,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
       ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32));
 #endif
     }
     for (int j = 0; j < kNumCoeffs; ++j) {
@@ -292,71 +318,178 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
   }
 }
 
-class PartialTrans32x32Test
-    : public ::testing::TestWithParam<
-          std::tr1::tuple<FwdTxfmFunc, vpx_bit_depth_t> > {
+class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> {
  public:
-  virtual ~PartialTrans32x32Test() {}
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    bit_depth_ = GET_PARAM(1);
+  ~InvTrans32x32Test() override = default;
+  void SetUp() override {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_ = GET_PARAM(2);  // 0: high precision forward transform
+                              // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    eob_ = GET_PARAM(4);
+    thresh_ = GET_PARAM(4);
+    mask_ = (1 << bit_depth_) - 1;
+    pitch_ = 32;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
+  void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    ref_txfm_(out, dst, stride);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  int version_;
   vpx_bit_depth_t bit_depth_;
-  FwdTxfmFunc fwd_txfm_;
+  int mask_;
+  int eob_;
+  int thresh_;
+
+  InvTxfmFunc ref_txfm_;
+  InvTxfmFunc inv_txfm_;
+  int pitch_;
+
+  void RunInvTrans32x32SpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob_) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh_);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
+  void CompareInvReference32x32() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 31;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          coeff[scan[j]] = rnd.Rand8Extremes();
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        RunRefTxfm(coeff, ref, pitch_);
+        RunInvTxfm(coeff, dst, pitch_);
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error "
+                             << error << " at index " << j;
+      }
+    }
+  }
 };
 
-TEST_P(PartialTrans32x32Test, Extremes) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  const int minval = -maxval;
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test);
 
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]);
+TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); }
+TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); }
 
-  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
-}
-
-TEST_P(PartialTrans32x32Test, Random) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t maxval =
-      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
-#else
-  const int16_t maxval = 255;
-#endif
-  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int sum = 0;
-  for (int i = 0; i < kNumCoeffs; ++i) {
-    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
-    input[i] = val;
-    sum += val;
-  }
-  output[0] = 0;
-  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
-  EXPECT_EQ(sum >> 3, output[0]);
-}
-
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, Trans32x32Test,
     ::testing::Values(
         make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 0, VPX_BITS_10),
@@ -366,46 +499,51 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1,
                    VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    C, PartialTrans32x32Test,
-    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12)));
 #else
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, Trans32x32Test,
     ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0,
                                  VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c,
                                  1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_c,
-                                                     VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0,
+                   VPX_BITS_8, 16, 6255)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
     NEON, Trans32x32Test,
-    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon,
-                                 0, VPX_BITS_8),
-                      make_tuple(&vpx_fdct32x32_rd_c,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+                                 &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_neon,
                                  &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(make_tuple(&vpx_fdct32x32_sse2,
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_sse2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
-                                                     VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans32x32Test,
     ::testing::Values(
         make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
@@ -418,29 +556,49 @@ INSTANTIATE_TEST_CASE_P(
                    VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1,
                    VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, Trans32x32Test,
     ::testing::Values(make_tuple(&vpx_fdct32x32_avx2,
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_avx2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, Trans32x32Test,
     ::testing::Values(make_tuple(&vpx_fdct32x32_msa,
                                  &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_msa,
                                  &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test,
-                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa,
-                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    VSX, Trans32x32Test,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx,
+                                 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_vsx,
+                                 &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Trans32x32Test,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_lsx,
+                                 &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_lsx,
+                                 &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/media/libvpx/libvpx/test/dct_partial_test.cc b/media/libvpx/libvpx/test/dct_partial_test.cc
new file mode 100644
index 0000000000..dc4921ab94
--- /dev/null
+++ b/media/libvpx/libvpx/test/dct_partial_test.cc
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits>
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+using std::make_tuple;
+using std::tuple;
+
+namespace {
+using PartialFdctFunc = void (*)(const int16_t *in, tran_low_t *out,
+                                 int stride);
+
+using PartialFdctParam = tuple<PartialFdctFunc, int /*size*/, vpx_bit_depth_t>;
+
+tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) {
+  int64_t sum = 0;
+  if (in.TopLeftPixel() != nullptr) {
+    for (int y = 0; y < size; ++y) {
+      for (int x = 0; x < size; ++x) {
+        sum += in.TopLeftPixel()[y * in.stride() + x];
+      }
+    }
+  } else {
+    assert(0);
+  }
+
+  switch (size) {
+    case 4: sum *= 2; break;
+    case 8: /*sum = sum;*/ break;
+    case 16: sum >>= 1; break;
+    case 32: sum >>= 3; break;
+  }
+
+  return static_cast<tran_low_t>(sum);
+}
+
+class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
+ public:
+  PartialFdctTest() {
+    fwd_txfm_ = GET_PARAM(0);
+    size_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int16_t maxvalue =
+        clip_pixel_highbd(std::numeric_limits<int16_t>::max(), bit_depth_);
+    const int16_t minvalue = -maxvalue;
+    Buffer<int16_t> input_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(input_block.Init());
+    Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(output_block.Init());
+
+    if (output_block.TopLeftPixel() != nullptr) {
+      for (int i = 0; i < 100; ++i) {
+        if (i == 0) {
+          input_block.Set(maxvalue);
+        } else if (i == 1) {
+          input_block.Set(minvalue);
+        } else {
+          input_block.Set(&rnd, minvalue, maxvalue);
+        }
+
+        ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(),
+                                           output_block.TopLeftPixel(),
+                                           input_block.stride()));
+
+        EXPECT_EQ(partial_fdct_ref(input_block, size_),
+                  output_block.TopLeftPixel()[0]);
+      }
+    } else {
+      assert(0);
+    }
+  }
+
+  PartialFdctFunc fwd_txfm_;
+  vpx_bit_depth_t bit_depth_;
+  int size_;
+};
+
+TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    C, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10),
+                      make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10),
+                      make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10),
+                      make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_SUITE_P(
+    C, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12),
+                      make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_SUITE_P(
+    NEON, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+                      make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
+
+#if HAVE_MSA
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(MSA, PartialFdctTest,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8,
+                                                      VPX_BITS_8)));
+#else   // !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    MSA, PartialFdctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8),
+                      make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8),
+                      make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_MSA
+}  // namespace
diff --git a/media/libvpx/libvpx/test/dct_test.cc b/media/libvpx/libvpx/test/dct_test.cc
new file mode 100644
index 0000000000..ffc6b93ed9
--- /dev/null
+++ b/media/libvpx/libvpx/test/dct_test.cc
@@ -0,0 +1,791 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
+using std::make_tuple;
+using std::tuple;
+
+namespace {
+using FdctFunc = void (*)(const int16_t *in, tran_low_t *out, int stride);
+using IdctFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride);
+using FhtFunc = void (*)(const int16_t *in, tran_low_t *out, int stride,
+                         int tx_type);
+using FhtFuncRef = void (*)(const Buffer<int16_t> &in, Buffer<tran_low_t> *out,
+                            int size, int tx_type);
+using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
+                         int tx_type);
+using IhtWithBdFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
+                               int tx_type, int bd);
+
+template <FdctFunc fn>
+void fdct_wrapper(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  (void)tx_type;
+  fn(in, out, stride);
+}
+
+template <IdctFunc fn>
+void idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type,
+                  int bd) {
+  (void)tx_type;
+  (void)bd;
+  fn(in, out, stride);
+}
+
+template <IhtFunc fn>
+void iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type,
+                 int bd) {
+  (void)bd;
+  fn(in, out, stride, tx_type);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+using HighbdIdctFunc = void (*)(const tran_low_t *in, uint16_t *out, int stride,
+                                int bd);
+
+using HighbdIhtFunc = void (*)(const tran_low_t *in, uint16_t *out, int stride,
+                               int tx_type, int bd);
+
+template <HighbdIdctFunc fn>
+void highbd_idct_wrapper(const tran_low_t *in, uint8_t *out, int stride,
+                         int tx_type, int bd) {
+  (void)tx_type;
+  fn(in, CAST_TO_SHORTPTR(out), stride, bd);
+}
+
+template <HighbdIhtFunc fn>
+void highbd_iht_wrapper(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type, int bd) {
+  fn(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+struct FuncInfo {
+  FhtFunc ft_func;
+  IhtWithBdFunc it_func;
+  int size;
+  int pixel_size;
+};
+
+/* forward transform, inverse transform, size, transform type, bit depth */
+using DctParam = tuple<int, const FuncInfo *, int, vpx_bit_depth_t>;
+
+void fdct_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+              int /*tx_type*/) {
+  const int16_t *i = in.TopLeftPixel();
+  const int i_stride = in.stride();
+  tran_low_t *o = out->TopLeftPixel();
+  if (size == 4) {
+    vpx_fdct4x4_c(i, o, i_stride);
+  } else if (size == 8) {
+    vpx_fdct8x8_c(i, o, i_stride);
+  } else if (size == 16) {
+    vpx_fdct16x16_c(i, o, i_stride);
+  } else if (size == 32) {
+    vpx_fdct32x32_c(i, o, i_stride);
+  }
+}
+
+void fht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+             int tx_type) {
+  const int16_t *i = in.TopLeftPixel();
+  const int i_stride = in.stride();
+  tran_low_t *o = out->TopLeftPixel();
+  if (size == 4) {
+    vp9_fht4x4_c(i, o, i_stride, tx_type);
+  } else if (size == 8) {
+    vp9_fht8x8_c(i, o, i_stride, tx_type);
+  } else if (size == 16) {
+    vp9_fht16x16_c(i, o, i_stride, tx_type);
+  }
+}
+
+void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size,
+              int /*tx_type*/) {
+  ASSERT_EQ(size, 4);
+  vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride());
+}
+
+class TransTestBase : public ::testing::TestWithParam<DctParam> {
+ public:
+  void SetUp() override {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    const int idx = GET_PARAM(0);
+    const FuncInfo *func_info = &(GET_PARAM(1)[idx]);
+    tx_type_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    fwd_txfm_ = func_info->ft_func;
+    inv_txfm_ = func_info->it_func;
+    size_ = func_info->size;
+    pixel_size_ = func_info->pixel_size;
+    max_pixel_value_ = (1 << bit_depth_) - 1;
+
+    // Randomize stride_ to a value less than or equal to 1024
+    stride_ = rnd_(1024) + 1;
+    if (stride_ < size_) {
+      stride_ = size_;
+    }
+    // Align stride_ to 16 if it's bigger than 16.
+    if (stride_ > 16) {
+      stride_ &= ~15;
+    }
+
+    block_size_ = size_ * stride_;
+
+    src_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, pixel_size_ * block_size_));
+    ASSERT_NE(src_, nullptr);
+    dst_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, pixel_size_ * block_size_));
+    ASSERT_NE(dst_, nullptr);
+  }
+
+  void TearDown() override {
+    vpx_free(src_);
+    src_ = nullptr;
+    vpx_free(dst_);
+    dst_ = nullptr;
+    libvpx_test::ClearSystemState();
+  }
+
+  void InitMem() {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    if (pixel_size_ == 1) {
+      for (int j = 0; j < block_size_; ++j) {
+        src_[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+      for (int j = 0; j < block_size_; ++j) {
+        dst_[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+    } else {
+      ASSERT_EQ(pixel_size_, 2);
+      uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+      uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+      for (int j = 0; j < block_size_; ++j) {
+        src[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+      for (int j = 0; j < block_size_; ++j) {
+        dst[j] = rnd_.Rand16() & max_pixel_value_;
+      }
+    }
+  }
+
+  void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) {
+    fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_);
+  }
+
+  void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) {
+    inv_txfm_(in.TopLeftPixel(), out, stride_, tx_type_, bit_depth_);
+  }
+
+ protected:
+  void RunAccuracyCheck(int limit) {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Buffer<int16_t> test_input_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(test_input_block.Init());
+    ASSERT_NE(test_input_block.TopLeftPixel(), nullptr);
+    Buffer<tran_low_t> test_temp_block =
+        Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(test_temp_block.Init());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      InitMem();
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          if (pixel_size_ == 1) {
+            test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] =
+                src_[h * stride_ + w] - dst_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] =
+                src[h * stride_ + w] - dst[h * stride_ + w];
+          }
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block));
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst_));
+
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          int diff;
+          if (pixel_size_ == 1) {
+            diff = dst_[h * stride_ + w] - src_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            diff = dst[h * stride_ + w] - src[h * stride_ + w];
+          }
+          const uint32_t error = diff * diff;
+          if (max_error < error) max_error = error;
+          total_error += error;
+        }
+      }
+    }
+
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: " << size_ << "x" << size_
+        << " transform/inverse transform has an individual round trip error > "
+        << limit;
+
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: " << size_ << "x" << size_
+        << " transform/inverse transform has average round trip error > "
+        << limit << " per block";
+  }
+
+  void RunCoeffCheck() {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    Buffer<int16_t> input_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(input_block.Init());
+    Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0);
+    ASSERT_TRUE(output_ref_block.Init());
+    Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(output_block.Init());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-max_pixel_value_,
+      // max_pixel_value_].
+      input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_);
+
+      fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block));
+
+      // The minimum quant value is 4.
+      EXPECT_TRUE(output_block.CheckValues(output_ref_block));
+      if (::testing::Test::HasFailure()) {
+        printf("Size: %d Transform type: %d\n", size_, tx_type_);
+        output_block.PrintDifference(output_ref_block);
+        return;
+      }
+    }
+  }
+
+  void RunMemCheck() {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    Buffer<int16_t> input_extreme_block =
+        Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16);
+    ASSERT_TRUE(input_extreme_block.Init());
+    Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0);
+    ASSERT_TRUE(output_ref_block.Init());
+    Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(output_block.Init());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with -max_pixel_value_ or max_pixel_value_.
+      if (i == 0) {
+        input_extreme_block.Set(max_pixel_value_);
+      } else if (i == 1) {
+        input_extreme_block.Set(-max_pixel_value_);
+      } else {
+        ASSERT_NE(input_extreme_block.TopLeftPixel(), nullptr);
+        for (int h = 0; h < size_; ++h) {
+          for (int w = 0; w < size_; ++w) {
+            input_extreme_block
+                .TopLeftPixel()[h * input_extreme_block.stride() + w] =
+                rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_;
+          }
+        }
+      }
+
+      fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block));
+
+      // The minimum quant value is 4.
+      EXPECT_TRUE(output_block.CheckValues(output_ref_block));
+      ASSERT_NE(output_block.TopLeftPixel(), nullptr);
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          EXPECT_GE(
+              4 * DCT_MAX_VALUE << (bit_depth_ - 8),
+              abs(output_block.TopLeftPixel()[h * output_block.stride() + w]))
+              << "Error: " << size_ << "x" << size_
+              << " transform has coefficient larger than 4*DCT_MAX_VALUE"
+              << " at " << w << "," << h;
+          if (::testing::Test::HasFailure()) {
+            printf("Size: %d Transform type: %d\n", size_, tx_type_);
+            output_block.DumpBuffer();
+            return;
+          }
+        }
+      }
+    }
+  }
+
+  void RunInvAccuracyCheck(int limit) {
+    if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    Buffer<int16_t> in = Buffer<int16_t>(size_, size_, 4);
+    ASSERT_TRUE(in.Init());
+    Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16);
+    ASSERT_TRUE(coeff.Init());
+
+    for (int i = 0; i < count_test_block; ++i) {
+      InitMem();
+      ASSERT_NE(in.TopLeftPixel(), nullptr);
+      // Initialize a test block with input range [-max_pixel_value_,
+      // max_pixel_value_].
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          if (pixel_size_ == 1) {
+            in.TopLeftPixel()[h * in.stride() + w] =
+                src_[h * stride_ + w] - dst_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            in.TopLeftPixel()[h * in.stride() + w] =
+                src[h * stride_ + w] - dst[h * stride_ + w];
+          }
+        }
+      }
+
+      fwd_txfm_ref(in, &coeff, size_, tx_type_);
+
+      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst_));
+
+      for (int h = 0; h < size_; ++h) {
+        for (int w = 0; w < size_; ++w) {
+          int diff;
+          if (pixel_size_ == 1) {
+            diff = dst_[h * stride_ + w] - src_[h * stride_ + w];
+          } else {
+            ASSERT_EQ(pixel_size_, 2);
+            const uint16_t *const src = reinterpret_cast<uint16_t *>(src_);
+            const uint16_t *const dst = reinterpret_cast<uint16_t *>(dst_);
+            diff = dst[h * stride_ + w] - src[h * stride_ + w];
+          }
+          const uint32_t error = diff * diff;
+          EXPECT_GE(static_cast<uint32_t>(limit), error)
+              << "Error: " << size_ << "x" << size_
+              << " inverse transform has error " << error << " at " << w << ","
+              << h;
+          if (::testing::Test::HasFailure()) {
+            printf("Size: %d Transform type: %d\n", size_, tx_type_);
+            return;
+          }
+        }
+      }
+    }
+  }
+
+  FhtFunc fwd_txfm_;
+  FhtFuncRef fwd_txfm_ref;
+  IhtWithBdFunc inv_txfm_;
+  ACMRandom rnd_;
+  uint8_t *src_;
+  uint8_t *dst_;
+  vpx_bit_depth_t bit_depth_;
+  int tx_type_;
+  int max_pixel_value_;
+  int size_;
+  int stride_;
+  int pixel_size_;
+  int block_size_;
+};
+
+/* -------------------------------------------------------------------------- */
+
+class TransDCT : public TransTestBase {
+ public:
+  TransDCT() { fwd_txfm_ref = fdct_ref; }
+};
+
+TEST_P(TransDCT, AccuracyCheck) {
+  int t = 1;
+  if (size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2) {
+    t = 2;
+  } else if (size_ == 32 && bit_depth_ > 10 && pixel_size_ == 2) {
+    t = 7;
+  }
+  RunAccuracyCheck(t);
+}
+
+TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransDCT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+
+static const FuncInfo dct_c_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &fdct_wrapper<vpx_highbd_fdct4x4_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_c>, 4, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct8x8_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_c>, 8, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct16x16_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_c>, 16, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct32x32_c>,
+    &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_c>, 32, 2 },
+#endif
+  { &fdct_wrapper<vpx_fdct4x4_c>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_c>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
+  { &fdct_wrapper<vpx_fdct16x16_c>, &idct_wrapper<vpx_idct16x16_256_add_c>, 16,
+    1 },
+  { &fdct_wrapper<vpx_fdct32x32_c>, &idct_wrapper<vpx_idct32x32_1024_add_c>, 32,
+    1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    C, TransDCT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(dct_c_func_info) /
+                                             sizeof(dct_c_func_info[0]))),
+        ::testing::Values(dct_c_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_SSE2
+static const FuncInfo dct_sse2_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &fdct_wrapper<vpx_highbd_fdct4x4_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_sse2>, 4, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct8x8_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_sse2>, 8, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct16x16_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_sse2>, 16, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct32x32_sse2>,
+    &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, 32, 2 },
+#endif
+  { &fdct_wrapper<vpx_fdct4x4_sse2>, &idct_wrapper<vpx_idct4x4_16_add_sse2>, 4,
+    1 },
+  { &fdct_wrapper<vpx_fdct8x8_sse2>, &idct_wrapper<vpx_idct8x8_64_add_sse2>, 8,
+    1 },
+  { &fdct_wrapper<vpx_fdct16x16_sse2>,
+    &idct_wrapper<vpx_idct16x16_256_add_sse2>, 16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_sse2>,
+    &idct_wrapper<vpx_idct32x32_1024_add_sse2>, 32, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, TransDCT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(dct_sse2_func_info) /
+                                             sizeof(dct_sse2_func_info[0]))),
+        ::testing::Values(dct_sse2_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+// vpx_fdct8x8_ssse3 is only available in 64 bit builds.
+static const FuncInfo dct_ssse3_func_info = {
+  &fdct_wrapper<vpx_fdct8x8_ssse3>, &idct_wrapper<vpx_idct8x8_64_add_sse2>, 8, 1
+};
+
+// TODO(johannkoenig): high bit depth fdct8x8.
+INSTANTIATE_TEST_SUITE_P(SSSE3, TransDCT,
+                         ::testing::Values(make_tuple(0, &dct_ssse3_func_info,
+                                                      0, VPX_BITS_8)));
+#endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_avx2_func_info = {
+  &fdct_wrapper<vpx_fdct32x32_avx2>, &idct_wrapper<vpx_idct32x32_1024_add_sse2>,
+  32, 1
+};
+
+// TODO(johannkoenig): high bit depth fdct32x32.
+INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT,
+                         ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_neon_func_info[] = {
+  { &fdct_wrapper<vpx_highbd_fdct4x4_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_neon>, 4, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct8x8_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 },
+  { &fdct_wrapper<vpx_highbd_fdct16x16_neon>,
+    &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 },
+  /* { &fdct_wrapper<vpx_highbd_fdct32x32_neon>,
+       &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/
+};
+#else
+static const FuncInfo dct_neon_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_neon>, &idct_wrapper<vpx_idct4x4_16_add_neon>, 4,
+    1 },
+  { &fdct_wrapper<vpx_fdct8x8_neon>, &idct_wrapper<vpx_idct8x8_64_add_neon>, 8,
+    1 },
+  { &fdct_wrapper<vpx_fdct16x16_neon>,
+    &idct_wrapper<vpx_idct16x16_256_add_neon>, 16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_neon>,
+    &idct_wrapper<vpx_idct32x32_1024_add_neon>, 32, 1 }
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, TransDCT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(dct_neon_func_info) /
+                                             sizeof(dct_neon_func_info[0]))),
+        ::testing::Values(dct_neon_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+#endif  // HAVE_NEON
+
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_msa_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_msa>, &idct_wrapper<vpx_idct4x4_16_add_msa>, 4,
+    1 },
+  { &fdct_wrapper<vpx_fdct8x8_msa>, &idct_wrapper<vpx_idct8x8_64_add_msa>, 8,
+    1 },
+  { &fdct_wrapper<vpx_fdct16x16_msa>, &idct_wrapper<vpx_idct16x16_256_add_msa>,
+    16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_msa>, &idct_wrapper<vpx_idct32x32_1024_add_msa>,
+    32, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    MSA, TransDCT,
+    ::testing::Combine(::testing::Range(0, 4),
+                       ::testing::Values(dct_msa_func_info),
+                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_vsx_func_info = {
+  &fdct_wrapper<vpx_fdct4x4_c>, &idct_wrapper<vpx_idct4x4_16_add_vsx>, 4, 1
+};
+
+INSTANTIATE_TEST_SUITE_P(VSX, TransDCT,
+                         ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_lsx_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
+  { &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
+    16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
+    32, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, TransDCT,
+    ::testing::Combine(::testing::Range(0, 4),
+                       ::testing::Values(dct_lsx_func_info),
+                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // !CONFIG_EMULATE_HARDWARE
+
+/* -------------------------------------------------------------------------- */
+
+class TransHT : public TransTestBase {
+ public:
+  TransHT() { fwd_txfm_ref = fht_ref; }
+};
+
+TEST_P(TransHT, AccuracyCheck) {
+  RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2 ? 2 : 1);
+}
+
+TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+
+static const FuncInfo ht_c_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_c>, 4,
+    2 },
+  { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8,
+    2 },
+  { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>,
+    16, 2 },
+#endif
+  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_c>, 4, 1 },
+  { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_c>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_c>, 16, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    C, TransHT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(ht_c_func_info) /
+                                             sizeof(ht_c_func_info[0]))),
+        ::testing::Values(ht_c_func_info), ::testing::Range(0, 4),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
+
+static const FuncInfo ht_neon_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
+    2 },
+  { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>,
+    4, 2 },
+  { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
+    2 },
+  { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>,
+    8, 2 },
+  { &vp9_highbd_fht16x16_c,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
+  { &vp9_highbd_fht16x16_neon,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
+#endif
+  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
+  { &vp9_fht4x4_neon, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
+  { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
+  { &vp9_fht8x8_neon, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 },
+  { &vp9_fht16x16_neon, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, TransHT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(ht_neon_func_info) /
+                                             sizeof(ht_neon_func_info[0]))),
+        ::testing::Values(ht_neon_func_info), ::testing::Range(0, 4),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+#endif  // HAVE_NEON
+
+#if HAVE_SSE2
+
+static const FuncInfo ht_sse2_func_info[3] = {
+  { &vp9_fht4x4_sse2, &iht_wrapper<vp9_iht4x4_16_add_sse2>, 4, 1 },
+  { &vp9_fht8x8_sse2, &iht_wrapper<vp9_iht8x8_64_add_sse2>, 8, 1 },
+  { &vp9_fht16x16_sse2, &iht_wrapper<vp9_iht16x16_256_add_sse2>, 16, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, TransHT,
+    ::testing::Combine(::testing::Range(0, 3),
+                       ::testing::Values(ht_sse2_func_info),
+                       ::testing::Range(0, 4), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo ht_sse4_1_func_info[3] = {
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>,
+    4, 2 },
+  { vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>,
+    8, 2 },
+  { &vp9_highbd_fht16x16_c,
+    &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, 2 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, TransHT,
+    ::testing::Combine(::testing::Range(0, 3),
+                       ::testing::Values(ht_sse4_1_func_info),
+                       ::testing::Range(0, 4),
+                       ::testing::Values(VPX_BITS_8, VPX_BITS_10,
+                                         VPX_BITS_12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo ht_vsx_func_info[3] = {
+  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_vsx>, 4, 1 },
+  { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_vsx>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_vsx>, 16, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(VSX, TransHT,
+                         ::testing::Combine(::testing::Range(0, 3),
+                                            ::testing::Values(ht_vsx_func_info),
+                                            ::testing::Range(0, 4),
+                                            ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_VSX
+#endif  // !CONFIG_EMULATE_HARDWARE
+
+/* -------------------------------------------------------------------------- */
+
+class TransWHT : public TransTestBase {
+ public:
+  TransWHT() { fwd_txfm_ref = fwht_ref; }
+};
+
+TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); }
+
+TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(TransWHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+static const FuncInfo wht_c_func_info[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  { &fdct_wrapper<vp9_highbd_fwht4x4_c>,
+    &highbd_idct_wrapper<vpx_highbd_iwht4x4_16_add_c>, 4, 2 },
+#endif
+  { &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_c>, 4, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    C, TransWHT,
+    ::testing::Combine(
+        ::testing::Range(0, static_cast<int>(sizeof(wht_c_func_info) /
+                                             sizeof(wht_c_func_info[0]))),
+        ::testing::Values(wht_c_func_info), ::testing::Values(0),
+        ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
+
+#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+static const FuncInfo wht_sse2_func_info = {
+  &fdct_wrapper<vp9_fwht4x4_sse2>, &idct_wrapper<vpx_iwht4x4_16_add_sse2>, 4, 1
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, TransWHT,
+                         ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo wht_vsx_func_info = {
+  &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_vsx>, 4, 1
+};
+
+INSTANTIATE_TEST_SUITE_P(VSX, TransWHT,
+                         ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_EMULATE_HARDWARE
+
+}  // namespace
diff --git a/media/libvpx/libvpx/test/decode_api_test.cc b/media/libvpx/libvpx/test/decode_api_test.cc
index 593637780e..d7436ca3c2 100644
--- a/media/libvpx/libvpx/test/decode_api_test.cc
+++ b/media/libvpx/libvpx/test/decode_api_test.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "test/ivf_video_source.h"
@@ -20,7 +20,7 @@ namespace {
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 
 TEST(DecodeAPI, InvalidParams) {
-  static const vpx_codec_iface_t *kCodecs[] = {
+  static vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_DECODER
     &vpx_codec_vp8_dx_algo,
 #endif
@@ -31,27 +31,33 @@ TEST(DecodeAPI, InvalidParams) {
   uint8_t buf[1] = { 0 };
   vpx_codec_ctx_t dec;
 
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(NULL, NULL, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(&dec, NULL, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, NULL, 0, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, buf, 0, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_decode(NULL, buf, NELEMENTS(buf), NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_decode(NULL, NULL, NELEMENTS(buf), NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
-  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
+  EXPECT_EQ(vpx_codec_dec_init(nullptr, nullptr, nullptr, 0),
+            VPX_CODEC_INVALID_PARAM);
+  EXPECT_EQ(vpx_codec_dec_init(&dec, nullptr, nullptr, 0),
+            VPX_CODEC_INVALID_PARAM);
+  EXPECT_EQ(vpx_codec_decode(nullptr, nullptr, 0, nullptr, 0),
+            VPX_CODEC_INVALID_PARAM);
+  EXPECT_EQ(vpx_codec_decode(nullptr, buf, 0, nullptr, 0),
+            VPX_CODEC_INVALID_PARAM);
+  EXPECT_EQ(vpx_codec_decode(nullptr, buf, NELEMENTS(buf), nullptr, 0),
+            VPX_CODEC_INVALID_PARAM);
+  EXPECT_EQ(vpx_codec_decode(nullptr, nullptr, NELEMENTS(buf), nullptr, 0),
+            VPX_CODEC_INVALID_PARAM);
+  EXPECT_EQ(vpx_codec_destroy(nullptr), VPX_CODEC_INVALID_PARAM);
+  EXPECT_NE(vpx_codec_error(nullptr), nullptr);
+  EXPECT_EQ(vpx_codec_error_detail(nullptr), nullptr);
 
   for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_dec_init(NULL, kCodecs[i], NULL, 0));
+              vpx_codec_dec_init(nullptr, kCodecs[i], nullptr, 0));
 
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], NULL, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], nullptr, 0));
     EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM,
-              vpx_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0));
+              vpx_codec_decode(&dec, buf, NELEMENTS(buf), nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0));
-    EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(&dec, buf, 0, NULL, 0));
+              vpx_codec_decode(&dec, nullptr, NELEMENTS(buf), nullptr, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_decode(&dec, buf, 0, nullptr, 0));
 
     EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
   }
@@ -62,11 +68,12 @@ TEST(DecodeAPI, OptionalParams) {
   vpx_codec_ctx_t dec;
 
 #if CONFIG_ERROR_CONCEALMENT
-  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, NULL,
-                                             VPX_CODEC_USE_ERROR_CONCEALMENT));
+  EXPECT_EQ(VPX_CODEC_OK,
+            vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, nullptr,
+                               VPX_CODEC_USE_ERROR_CONCEALMENT));
 #else
   EXPECT_EQ(VPX_CODEC_INCAPABLE,
-            vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, NULL,
+            vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, nullptr,
                                VPX_CODEC_USE_ERROR_CONCEALMENT));
 #endif  // CONFIG_ERROR_CONCEALMENT
 }
@@ -90,30 +97,30 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) {
       default: EXPECT_EQ(VPX_CODEC_OK, res) << kControls[i]; break;
     }
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_control_(dec, kControls[i], NULL));
+              vpx_codec_control_(dec, kControls[i], nullptr));
   }
 
   vp9_ref_frame_t ref;
   ref.idx = 0;
   EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP9_GET_REFERENCE, &ref));
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_control(dec, VP9_GET_REFERENCE, NULL));
+            vpx_codec_control(dec, VP9_GET_REFERENCE, nullptr));
 
   vpx_ref_frame_t ref_copy;
   const int width = 352;
   const int height = 288;
-  ASSERT_TRUE(
-      vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1) != NULL);
+  EXPECT_NE(vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1),
+            nullptr);
   ref_copy.frame_type = VP8_LAST_FRAME;
   EXPECT_EQ(VPX_CODEC_ERROR,
             vpx_codec_control(dec, VP8_COPY_REFERENCE, &ref_copy));
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_control(dec, VP8_COPY_REFERENCE, NULL));
+            vpx_codec_control(dec, VP8_COPY_REFERENCE, nullptr));
   vpx_img_free(&ref_copy.img);
 }
 
 TEST(DecodeAPI, Vp9InvalidDecode) {
-  const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
   const char filename[] =
       "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf";
   libvpx_test::IVFVideoSource video(filename);
@@ -122,24 +129,46 @@ TEST(DecodeAPI, Vp9InvalidDecode) {
   ASSERT_TRUE(!HasFailure());
 
   vpx_codec_ctx_t dec;
-  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, nullptr, 0));
   const uint32_t frame_size = static_cast<uint32_t>(video.frame_size());
 #if CONFIG_VP9_HIGHBITDEPTH
   EXPECT_EQ(VPX_CODEC_MEM_ERROR,
-            vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+            vpx_codec_decode(&dec, video.cxdata(), frame_size, nullptr, 0));
 #else
   EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM,
-            vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0));
+            vpx_codec_decode(&dec, video.cxdata(), frame_size, nullptr, 0));
 #endif
-  vpx_codec_iter_t iter = NULL;
-  EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
+  vpx_codec_iter_t iter = nullptr;
+  EXPECT_EQ(nullptr, vpx_codec_get_frame(&dec, &iter));
 
   TestVp9Controls(&dec);
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
 }
 
-TEST(DecodeAPI, Vp9PeekSI) {
-  const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+void TestPeekInfo(const uint8_t *const data, uint32_t data_sz,
+                  uint32_t peek_size) {
+  vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
+  // to decoder_peek_si_internal on frames of size < 8.
+  if (data_sz >= 8) {
+    vpx_codec_ctx_t dec;
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, nullptr, 0));
+    EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM
+                                    : VPX_CODEC_CORRUPT_FRAME,
+              vpx_codec_decode(&dec, data, data_sz, nullptr, 0));
+    vpx_codec_iter_t iter = nullptr;
+    EXPECT_EQ(nullptr, vpx_codec_get_frame(&dec, &iter));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
+  }
+
+  // Verify behavior of vpx_codec_peek_stream_info.
+  vpx_codec_stream_info_t si;
+  si.sz = sizeof(si);
+  EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
+            vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+}
+
+TEST(DecodeAPI, Vp9PeekStreamInfo) {
   // The first 9 bytes are valid and the rest of the bytes are made up. Until
   // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it
   // should return VPX_CODEC_CORRUPT_FRAME.
@@ -150,26 +179,37 @@ TEST(DecodeAPI, Vp9PeekSI) {
   };
 
   for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) {
-    // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
-    // to decoder_peek_si_internal on frames of size < 8.
-    if (data_sz >= 8) {
-      vpx_codec_ctx_t dec;
-      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
-      EXPECT_EQ(
-          (data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_CORRUPT_FRAME,
-          vpx_codec_decode(&dec, data, data_sz, NULL, 0));
-      vpx_codec_iter_t iter = NULL;
-      EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
-      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
-    }
+    TestPeekInfo(data, data_sz, 10);
+  }
+}
 
-    // Verify behavior of vpx_codec_peek_stream_info.
-    vpx_codec_stream_info_t si;
-    si.sz = sizeof(si);
-    EXPECT_EQ((data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
-              vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+TEST(DecodeAPI, Vp9PeekStreamInfoTruncated) {
+  // This profile 1 header requires 10.25 bytes, ensure
+  // vpx_codec_peek_stream_info doesn't over read.
+  const uint8_t profile1_data[10] = { 0xa4, 0xe9, 0x30, 0x68, 0x53,
+                                      0xe9, 0x30, 0x68, 0x53, 0x04 };
+
+  for (uint32_t data_sz = 1; data_sz <= 10; ++data_sz) {
+    TestPeekInfo(profile1_data, data_sz, 11);
   }
 }
 #endif  // CONFIG_VP9_DECODER
 
+TEST(DecodeAPI, HighBitDepthCapability) {
+// VP8 should not claim VP9 HBD as a capability.
+#if CONFIG_VP8_DECODER
+  const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_dx_algo);
+  EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+
+#if CONFIG_VP9_DECODER
+  const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_dx_algo);
+#if CONFIG_VP9_HIGHBITDEPTH
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH);
+#else
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+#endif
+}
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/decode_corrupted.cc b/media/libvpx/libvpx/test/decode_corrupted.cc
new file mode 100644
index 0000000000..55919d0f49
--- /dev/null
+++ b/media/libvpx/libvpx/test/decode_corrupted.cc
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tuple>
+
+#include "gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/i420_video_source.h"
+#include "vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+class DecodeCorruptedFrameTest
+    : public ::libvpx_test::EncoderTest,
+      public ::testing::TestWithParam<
+          std::tuple<const libvpx_test::CodecFactory *> > {
+ public:
+  DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {}
+
+ protected:
+  ~DecodeCorruptedFrameTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+
+    // Set small key frame distance such that we insert more key frames.
+    cfg_.kf_max_dist = 3;
+    dec_cfg_.threads = 1;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7);
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {}
+
+  const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) override {
+    // Don't edit frame packet on key frame.
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt;
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
+
+    modified_pkt_ = *pkt;
+
+    // Halve the size so it's corrupted to decoder.
+    modified_pkt_.data.frame.sz = modified_pkt_.data.frame.sz / 2;
+
+    return &modified_pkt_;
+  }
+
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
+    EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError();
+    return VPX_CODEC_MEM_ERROR != res_dec;
+  }
+
+  vpx_codec_cx_pkt_t modified_pkt_;
+};
+
+TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) {
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9
+INSTANTIATE_TEST_SUITE_P(
+    VP9, DecodeCorruptedFrameTest,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif  // CONFIG_VP9
+
+#if CONFIG_VP8
+INSTANTIATE_TEST_SUITE_P(
+    VP8, DecodeCorruptedFrameTest,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)));
+#endif  // CONFIG_VP8
+
+}  // namespace
diff --git a/media/libvpx/libvpx/test/decode_perf_test.cc b/media/libvpx/libvpx/test/decode_perf_test.cc
index ee26c3c046..69c30a9e80 100644
--- a/media/libvpx/libvpx/test/decode_perf_test.cc
+++ b/media/libvpx/libvpx/test/decode_perf_test.cc
@@ -9,6 +9,8 @@
  */
 
 #include <string>
+#include <tuple>
+
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/encode_test_driver.h"
@@ -17,11 +19,11 @@
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/webm_video_source.h"
+#include "vpx/vpx_codec.h"
 #include "vpx_ports/vpx_timer.h"
 #include "./ivfenc.h"
-#include "./vpx_version.h"
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 namespace {
 
@@ -34,7 +36,7 @@ const char kNewEncodeOutputFile[] = "new_encode.ivf";
 /*
  DecodePerfTest takes a tuple of filename + number of threads to decode with
  */
-typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam;
+using DecodePerfParam = std::tuple<const char *, unsigned int>;
 
 const DecodePerfParam kVP9DecodePerfVectors[] = {
   make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1),
@@ -85,7 +87,7 @@ TEST_P(DecodePerfTest, PerfTest) {
   vpx_usec_timer t;
   vpx_usec_timer_start(&t);
 
-  for (video.Begin(); video.cxdata() != NULL; video.Next()) {
+  for (video.Begin(); video.cxdata() != nullptr; video.Next()) {
     decoder.DecodeFrame(video.cxdata(), video.frame_size());
   }
 
@@ -96,7 +98,7 @@ TEST_P(DecodePerfTest, PerfTest) {
 
   printf("{\n");
   printf("\t\"type\" : \"decode_perf_test\",\n");
-  printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+  printf("\t\"version\" : \"%s\",\n", vpx_codec_version_str());
   printf("\t\"videoName\" : \"%s\",\n", video_name);
   printf("\t\"threadCount\" : %u,\n", threads);
   printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
@@ -105,8 +107,8 @@ TEST_P(DecodePerfTest, PerfTest) {
   printf("}\n");
 }
 
-INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest,
-                        ::testing::ValuesIn(kVP9DecodePerfVectors));
+INSTANTIATE_TEST_SUITE_P(VP9, DecodePerfTest,
+                         ::testing::ValuesIn(kVP9DecodePerfVectors));
 
 class VP9NewEncodeDecodePerfTest
     : public ::libvpx_test::EncoderTest,
@@ -114,11 +116,11 @@ class VP9NewEncodeDecodePerfTest
  protected:
   VP9NewEncodeDecodePerfTest()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
-        outfile_(0), out_frames_(0) {}
+        outfile_(nullptr), out_frames_(0) {}
 
-  virtual ~VP9NewEncodeDecodePerfTest() {}
+  ~VP9NewEncodeDecodePerfTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -135,33 +137,33 @@ class VP9NewEncodeDecodePerfTest
     cfg_.rc_end_usage = VPX_VBR;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, speed_);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH");
     const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
     outfile_ = fopen(path_to_source.c_str(), "wb");
-    ASSERT_TRUE(outfile_ != NULL);
+    ASSERT_NE(outfile_, nullptr);
   }
 
-  virtual void EndPassHook() {
-    if (outfile_ != NULL) {
+  void EndPassHook() override {
+    if (outfile_ != nullptr) {
       if (!fseek(outfile_, 0, SEEK_SET)) {
         ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_);
       }
       fclose(outfile_);
-      outfile_ = NULL;
+      outfile_ = nullptr;
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -175,7 +177,7 @@ class VP9NewEncodeDecodePerfTest
               pkt->data.frame.sz);
   }
 
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   void set_speed(unsigned int speed) { speed_ = speed; }
 
@@ -234,7 +236,7 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
   vpx_usec_timer t;
   vpx_usec_timer_start(&t);
 
-  for (decode_video.Begin(); decode_video.cxdata() != NULL;
+  for (decode_video.Begin(); decode_video.cxdata() != nullptr;
        decode_video.Next()) {
     decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
   }
@@ -247,7 +249,7 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
 
   printf("{\n");
   printf("\t\"type\" : \"decode_perf_test\",\n");
-  printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+  printf("\t\"version\" : \"%s\",\n", vpx_codec_version_str());
   printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile);
   printf("\t\"threadCount\" : %u,\n", threads);
   printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
@@ -256,6 +258,6 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) {
   printf("}\n");
 }
 
-VP9_INSTANTIATE_TEST_CASE(VP9NewEncodeDecodePerfTest,
-                          ::testing::Values(::libvpx_test::kTwoPassGood));
+VP9_INSTANTIATE_TEST_SUITE(VP9NewEncodeDecodePerfTest,
+                           ::testing::Values(::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/decode_svc_test.cc b/media/libvpx/libvpx/test/decode_svc_test.cc
index 69f62f13bd..7098e7b270 100644
--- a/media/libvpx/libvpx/test/decode_svc_test.cc
+++ b/media/libvpx/libvpx/test/decode_svc_test.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
 #include "test/codec_factory.h"
@@ -24,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest,
                       public ::libvpx_test::CodecTestWithParam<const char *> {
  protected:
   DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {}
-  virtual ~DecodeSvcTest() {}
+  ~DecodeSvcTest() override = default;
 
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (video.frame_number() == 0)
       decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_);
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
     ASSERT_EQ(img.d_w, width_);
     ASSERT_EQ(img.d_h, height_);
     total_frames_ = frame_number;
@@ -53,9 +53,9 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest,
 // number of frames decoded. This results in 1/4x1/4 resolution (320x180).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 0;
@@ -70,9 +70,9 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
 // number of frames decoded. This results in 1/2x1/2 resolution (640x360).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 1;
@@ -87,9 +87,9 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
 // number of frames decoded. This results in the full resolution (1280x720).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 2;
@@ -105,9 +105,9 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
 // the decoding should result in the full resolution (1280x720).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
   total_frames_ = 0;
   spatial_layer_ = 10;
@@ -117,7 +117,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
   ASSERT_EQ(total_frames_, kNumFrames);
 }
 
-VP9_INSTANTIATE_TEST_CASE(
+VP9_INSTANTIATE_TEST_SUITE(
     DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc,
                                        libvpx_test::kVP9TestVectorsSvc +
                                            libvpx_test::kNumVP9TestVectorsSvc));
diff --git a/media/libvpx/libvpx/test/decode_test_driver.cc b/media/libvpx/libvpx/test/decode_test_driver.cc
index b738e0db1c..af87bc25fb 100644
--- a/media/libvpx/libvpx/test/decode_test_driver.cc
+++ b/media/libvpx/libvpx/test/decode_test_driver.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -26,7 +26,7 @@ vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
 }
 
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
-  return DecodeFrame(cxdata, size, NULL);
+  return DecodeFrame(cxdata, size, nullptr);
 }
 
 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
@@ -52,21 +52,22 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
     /* Vp8's implementation of PeekStream returns an error if the frame you
      * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
      * frame, which must be a keyframe. */
-    if (video->frame_number() == 0)
-      ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
-                                        << vpx_codec_err_to_string(res_peek);
+    if (video->frame_number() == 0) {
+      ASSERT_EQ(VPX_CODEC_OK, res_peek)
+          << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
+    }
   } else {
     /* The Vp9 implementation of PeekStream returns an error only if the
      * data passed to it isn't a valid Vp9 chunk. */
-    ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: "
-                                      << vpx_codec_err_to_string(res_peek);
+    ASSERT_EQ(VPX_CODEC_OK, res_peek)
+        << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
   }
 }
 
 void DecoderTest::RunLoop(CompressedVideoSource *video,
                           const vpx_codec_dec_cfg_t &dec_cfg) {
   Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_);
-  ASSERT_TRUE(decoder != NULL);
+  ASSERT_NE(decoder, nullptr);
   bool end_of_file = false;
 
   // Decode frames.
@@ -77,7 +78,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
     vpx_codec_stream_info_t stream_info;
     stream_info.sz = sizeof(stream_info);
 
-    if (video->cxdata() != NULL) {
+    if (video->cxdata() != nullptr) {
       const vpx_codec_err_t res_peek = decoder->PeekStream(
           video->cxdata(), video->frame_size(), &stream_info);
       HandlePeekResult(decoder, video, res_peek);
@@ -88,16 +89,16 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
       if (!HandleDecodeResult(res_dec, *video, decoder)) break;
     } else {
       // Signal end of the file to the decoder.
-      const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+      const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0);
       ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
       end_of_file = true;
     }
 
     DxDataIterator dec_iter = decoder->GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data
-    while ((img = dec_iter.Next())) {
+    while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) {
       DecompressedFrameHook(*img, video->frame_number());
     }
   }
@@ -110,7 +111,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
 }
 
 void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
-  memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
+  cfg_ = dec_cfg;
 }
 
 void DecoderTest::set_flags(const vpx_codec_flags_t flags) { flags_ = flags; }
diff --git a/media/libvpx/libvpx/test/decode_test_driver.h b/media/libvpx/libvpx/test/decode_test_driver.h
index 644fc9e90d..81f5001ebc 100644
--- a/media/libvpx/libvpx/test/decode_test_driver.h
+++ b/media/libvpx/libvpx/test/decode_test_driver.h
@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_DECODE_TEST_DRIVER_H_
-#define TEST_DECODE_TEST_DRIVER_H_
+#ifndef VPX_TEST_DECODE_TEST_DRIVER_H_
+#define VPX_TEST_DECODE_TEST_DRIVER_H_
 #include <cstring>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 
@@ -24,7 +24,7 @@ class CompressedVideoSource;
 class DxDataIterator {
  public:
   explicit DxDataIterator(vpx_codec_ctx_t *decoder)
-      : decoder_(decoder), iter_(NULL) {}
+      : decoder_(decoder), iter_(nullptr) {}
 
   const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); }
 
@@ -159,4 +159,4 @@ class DecoderTest {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_DECODE_TEST_DRIVER_H_
+#endif  // VPX_TEST_DECODE_TEST_DRIVER_H_
diff --git a/media/libvpx/libvpx/test/decode_to_md5.sh b/media/libvpx/libvpx/test/decode_to_md5.sh
index 854b74f84f..15eee39fac 100644
--- a/media/libvpx/libvpx/test/decode_to_md5.sh
+++ b/media/libvpx/libvpx/test/decode_to_md5.sh
@@ -40,7 +40,7 @@ decode_to_md5() {
   fi
 
   eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 
diff --git a/media/libvpx/libvpx/test/decode_with_drops.sh b/media/libvpx/libvpx/test/decode_with_drops.sh
index 9b2edb6429..2c826045b3 100644
--- a/media/libvpx/libvpx/test/decode_with_drops.sh
+++ b/media/libvpx/libvpx/test/decode_with_drops.sh
@@ -40,7 +40,7 @@ decode_with_drops() {
   fi
 
   eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      "${drop_mode}" ${devnull}
+      "${drop_mode}" ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
@@ -52,10 +52,10 @@ decode_with_drops() {
 decode_with_drops_vp8() {
   if [ "$(vp8_decode_available)" = "yes" ]; then
     # Test sequence mode: Drop frames 2-28.
-    decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28"
+    decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28" || return 1
 
     # Test pattern mode: Drop 3 of every 4 frames.
-    decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4"
+    decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4" || return 1
   fi
 }
 
@@ -66,10 +66,10 @@ decode_with_drops_vp8() {
 decode_with_drops_vp9() {
   if [ "$(vp9_decode_available)" = "yes" ]; then
     # Test sequence mode: Drop frames 2-28.
-    decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19"
+    decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19" || return 1
 
     # Test pattern mode: Drop 3 of every 4 frames.
-    decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4"
+    decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4" || return 1
   fi
 }
 
diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc
index 419e38506c..554b9e7fa9 100644
--- a/media/libvpx/libvpx/test/encode_api_test.cc
+++ b/media/libvpx/libvpx/test/encode_api_test.cc
@@ -8,25 +8,113 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <initializer_list>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/video_source.h"
+#include "test/y4m_video_source.h"
 
 #include "./vpx_config.h"
 #include "vpx/vp8cx.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_encoder.h"
+#include "vpx/vpx_image.h"
 
 namespace {
 
-#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
-
-TEST(EncodeAPI, InvalidParams) {
-  static const vpx_codec_iface_t *kCodecs[] = {
+vpx_codec_iface_t *kCodecIfaces[] = {
 #if CONFIG_VP8_ENCODER
-    &vpx_codec_vp8_cx_algo,
+  &vpx_codec_vp8_cx_algo,
 #endif
 #if CONFIG_VP9_ENCODER
-    &vpx_codec_vp9_cx_algo,
+  &vpx_codec_vp9_cx_algo,
 #endif
-  };
+};
+
+bool IsVP9(vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
+void *Memset16(void *dest, int val, size_t length) {
+  uint16_t *dest16 = reinterpret_cast<uint16_t *>(dest);
+  for (size_t i = 0; i < length; i++) {
+    *dest16++ = val;
+  }
+  return dest;
+}
+
+vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt,
+                         unsigned int width, unsigned int height) {
+  assert(fmt != VPX_IMG_FMT_NV12);
+  if (bit_depth > VPX_BITS_8) {
+    fmt = static_cast<vpx_img_fmt_t>(fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+  }
+  vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1);
+  if (!image) return image;
+
+  const int val = 1 << (bit_depth - 1);
+  const unsigned int uv_h =
+      (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift;
+  const unsigned int uv_w =
+      (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift;
+  if (bit_depth > VPX_BITS_8) {
+    for (unsigned int i = 0; i < image->d_h; ++i) {
+      Memset16(image->planes[0] + i * image->stride[0], val, image->d_w);
+    }
+    for (unsigned int i = 0; i < uv_h; ++i) {
+      Memset16(image->planes[1] + i * image->stride[1], val, uv_w);
+      Memset16(image->planes[2] + i * image->stride[2], val, uv_w);
+    }
+  } else {
+    for (unsigned int i = 0; i < image->d_h; ++i) {
+      memset(image->planes[0] + i * image->stride[0], val, image->d_w);
+    }
+    for (unsigned int i = 0; i < uv_h; ++i) {
+      memset(image->planes[1] + i * image->stride[1], val, uv_w);
+      memset(image->planes[2] + i * image->stride[2], val, uv_w);
+    }
+  }
+
+  return image;
+}
+
+void InitCodec(vpx_codec_iface_t &iface, int width, int height,
+               vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) {
+  cfg->g_w = width;
+  cfg->g_h = height;
+  cfg->g_lag_in_frames = 0;
+  cfg->g_pass = VPX_RC_ONE_PASS;
+  ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK);
+}
+
+// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration
+// to |cfg|.
+void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) {
+  libvpx_test::DummyVideoSource video;
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.Begin();
+  EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK)
+      << vpx_codec_error_detail(enc);
+
+  EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(),
+                             /*flags=*/0, VPX_DL_GOOD_QUALITY),
+            VPX_CODEC_OK)
+      << vpx_codec_error_detail(enc);
+}
+
+TEST(EncodeAPI, InvalidParams) {
   uint8_t buf[1] = { 0 };
   vpx_image_t img;
   vpx_codec_ctx_t enc;
@@ -34,32 +122,2162 @@ TEST(EncodeAPI, InvalidParams) {
 
   EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf));
 
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0));
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_enc_config_default(NULL, NULL, 0));
+            vpx_codec_enc_init(nullptr, nullptr, nullptr, 0));
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-            vpx_codec_enc_config_default(NULL, &cfg, 0));
-  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
+            vpx_codec_enc_init(&enc, nullptr, nullptr, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_encode(nullptr, nullptr, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_encode(nullptr, &img, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(nullptr));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(nullptr, nullptr, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(nullptr, &cfg, 0));
+  EXPECT_NE(vpx_codec_error(nullptr), nullptr);
 
-  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
-    SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i]));
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0));
+              vpx_codec_enc_init(nullptr, iface, nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0));
+              vpx_codec_enc_init(&enc, iface, nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_config_default(kCodecs[i], &cfg, 1));
+              vpx_codec_enc_config_default(iface, &cfg, 1));
 
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0));
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0));
 
     EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
   }
 }
 
+TEST(EncodeAPI, HighBitDepthCapability) {
+// VP8 should not claim VP9 HBD as a capability.
+#if CONFIG_VP8_ENCODER
+  const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_cx_algo);
+  EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+
+#if CONFIG_VP9_ENCODER
+  const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_cx_algo);
+#if CONFIG_VP9_HIGHBITDEPTH
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH);
+#else
+  EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0);
+#endif
+#endif
+}
+
+#if CONFIG_VP8_ENCODER
+TEST(EncodeAPI, ImageSizeSetting) {
+  const int width = 711;
+  const int height = 360;
+  const int bps = 12;
+  vpx_image_t img;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+  uint8_t *img_buf = reinterpret_cast<uint8_t *>(
+      calloc(width * height * bps / 8, sizeof(*img_buf)));
+  vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
+
+  cfg.g_w = width;
+  cfg.g_h = height;
+
+  vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf);
+
+  vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0);
+
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0));
+
+  free(img_buf);
+
+  vpx_codec_destroy(&enc);
+}
+
+// Verifies the fix for a float-cast-overflow in vp8_change_config().
+//
+// Causes cpi->framerate to become the largest possible value (10,000,000) in
+// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to
+// vpx_codec_encode().
+TEST(EncodeAPI, HugeFramerateVp8) {
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  cfg.g_w = 271;
+  cfg.g_h = 1080;
+  cfg.g_timebase.num = 1;
+  // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1
+  // tick).
+  cfg.g_timebase.den = 10000000;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = VPX_CBR;
+
+  vpx_codec_ctx_t enc;
+  // Before we encode the first frame, cpi->framerate is set to a guess (the
+  // reciprocal of cfg.g_timebase). If this guess doesn't seem reasonable
+  // (> 180), cpi->framerate is set to 30.
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK);
+
+  vpx_image_t *const image =
+      vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1);
+  ASSERT_NE(image, nullptr);
+
+  for (unsigned int i = 0; i < image->d_h; ++i) {
+    memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
+  }
+  const unsigned int uv_h = (image->d_h + 1) / 2;
+  const unsigned int uv_w = (image->d_w + 1) / 2;
+  for (unsigned int i = 0; i < uv_h; ++i) {
+    memset(image->planes[1] + i * image->stride[1], 128, uv_w);
+    memset(image->planes[2] + i * image->stride[2], 128, uv_w);
+  }
+
+  // Encode a frame.
+  // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This
+  // causes cpi->framerate to become 10,000,000.
+  ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME),
+            VPX_CODEC_OK);
+
+  // Change to the same config. Since cpi->framerate is now huge, when it is
+  // used to calculate raw_target_rate (bit rate of uncompressed frames), the
+  // result is likely to overflow an unsigned int.
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+// A test that reproduces https://crbug.com/webm/1831.
+TEST(EncodeAPI, RandomPixelsVp8) {
+  // Initialize libvpx encoder
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.rc_target_bitrate = 2000;
+  cfg.g_w = 1280;
+  cfg.g_h = 720;
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Generate random frame data and encode
+  libvpx_test::RandomVideoSource video;
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.SetImageFormat(VPX_IMG_FMT_I420);
+  video.Begin();
+  ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+                             /*flags=*/0, VPX_DL_BEST_QUALITY),
+            VPX_CODEC_OK);
+
+  // Destroy libvpx encoder
+  vpx_codec_destroy(&enc);
+}
+
+TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) {
+  // Initialize libvpx encoder
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_threads = 1;
+  cfg.g_profile = 0;
+  cfg.g_w = 1;
+  cfg.g_h = 64;
+  cfg.g_bit_depth = VPX_BITS_8;
+  cfg.g_input_bit_depth = 8;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = 1000000;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_dropframe_thresh = 0;  // Don't drop frames
+  cfg.rc_resize_allowed = 0;
+  cfg.rc_end_usage = VPX_VBR;
+  cfg.rc_target_bitrate = 10;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 58;
+  cfg.kf_mode = VPX_KF_AUTO;
+  cfg.kf_min_dist = 0;
+  cfg.kf_max_dist = 10000;
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK);
+
+  // Generate random frame data and encode
+  uint8_t img[1 * 64 * 3 / 2];
+  libvpx_test::ACMRandom rng;
+  for (size_t i = 0; i < sizeof(img); ++i) {
+    img[i] = rng.Rand8();
+  }
+  vpx_image_t img_wrapper;
+  ASSERT_EQ(
+      vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img),
+      &img_wrapper);
+  vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF;
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK);
+
+  cfg.rc_target_bitrate = 4294967;
+  // Set the scalability mode to L1T3.
+  cfg.ts_number_layers = 3;
+  cfg.ts_periodicity = 4;
+  cfg.ts_layer_id[0] = 0;
+  cfg.ts_layer_id[1] = 2;
+  cfg.ts_layer_id[2] = 1;
+  cfg.ts_layer_id[3] = 2;
+  cfg.ts_rate_decimator[0] = 4;
+  cfg.ts_rate_decimator[1] = 2;
+  cfg.ts_rate_decimator[2] = 1;
+  // Bitrate allocation L0: 50% L1: 20% L2: 30%
+  cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] =
+      50 * cfg.rc_target_bitrate / 100;
+  cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] =
+      70 * cfg.rc_target_bitrate / 100;
+  cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] =
+      cfg.rc_target_bitrate;
+  cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+  cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2),
+            VPX_CODEC_OK);
+
+  constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING =
+      VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+  // Layer 2: only reference last frame, no updates
+  // It only depends on layer 0
+  flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF;
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
+  // Destroy libvpx encoder
+  vpx_codec_destroy(&enc);
+}
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP8Encoder {
+ public:
+  explicit VP8Encoder(int speed) : speed_(speed) {}
+  ~VP8Encoder();
+
+  void Configure(unsigned int threads, unsigned int width, unsigned int height,
+                 vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+  void Encode(bool key_frame);
+
+ private:
+  const int speed_;
+  bool initialized_ = false;
+  vpx_codec_enc_cfg_t cfg_;
+  vpx_codec_ctx_t enc_;
+  int frame_index_ = 0;
+  vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP8Encoder::~VP8Encoder() {
+  if (initialized_) {
+    EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+  }
+}
+
+void VP8Encoder::Configure(unsigned int threads, unsigned int width,
+                           unsigned int height, vpx_rc_mode end_usage,
+                           vpx_enc_deadline_t deadline) {
+  deadline_ = deadline;
+
+  if (!initialized_) {
+    vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+              VPX_CODEC_OK);
+    cfg_.g_threads = threads;
+    cfg_.g_w = width;
+    cfg_.g_h = height;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 1000 * 1000;  // microseconds
+    cfg_.g_pass = VPX_RC_ONE_PASS;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = end_usage;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 58;
+    ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+    initialized_ = true;
+    return;
+  }
+
+  cfg_.g_threads = threads;
+  cfg_.g_w = width;
+  cfg_.g_h = height;
+  cfg_.rc_end_usage = end_usage;
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+      << vpx_codec_error_detail(&enc_);
+}
+
+void VP8Encoder::Encode(bool key_frame) {
+  assert(initialized_);
+  const vpx_codec_cx_pkt_t *pkt;
+  vpx_image_t *image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h);
+  ASSERT_NE(image, nullptr);
+  const vpx_enc_frame_flags_t flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+  ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_),
+            VPX_CODEC_OK);
+  ++frame_index_;
+  vpx_codec_iter_t iter = nullptr;
+  while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+    ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+    if (key_frame) {
+      ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+    }
+  }
+  vpx_img_free(image);
+}
+
+// This is the reproducer testcase for crbug.com/324459561. However,
+// just running this test is not enough to reproduce the bug. We also
+// need to send signals to the test.
+TEST(EncodeAPI, Chromium324459561) {
+  VP8Encoder encoder(-12);
+
+  encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME);
+
+  encoder.Encode(true);
+  encoder.Encode(true);
+  encoder.Encode(true);
+
+  encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, VP8GlobalHeaders) {
+  constexpr int kWidth = 320;
+  constexpr int kHeight = 240;
+
+  vpx_codec_enc_cfg_t cfg = {};
+  struct Encoder {
+    ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+    vpx_codec_ctx_t ctx = {};
+  } enc;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0),
+            VPX_CODEC_OK);
+  ASSERT_NO_FATAL_FAILURE(
+      InitCodec(*vpx_codec_vp8_cx(), kWidth, kHeight, &enc.ctx, &cfg));
+  EXPECT_EQ(vpx_codec_get_global_headers(&enc.ctx), nullptr);
+  EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx));
+  EXPECT_EQ(vpx_codec_get_global_headers(&enc.ctx), nullptr);
+}
+
+// Encode a few frames for 2 temporal layers realtime mode.
+// Set duration to be very large on first frame, much smaller
+// on second frames, with the timestamp (pts) parameter very
+// inconsistent with the duration (i.e, pts != prev_pts + duration).
+// This reproduces the issue found in the bug: 431520320.
+TEST(EncodeAPI, Vp8ChromiumIssue431520320) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 320;
+  cfg.g_h = 240;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_target_bitrate = 500;
+
+  // 2-layers, 2-frame period.
+  int ids[2] = { 0, 1 };
+  cfg.ts_periodicity = 2;
+  cfg.ts_number_layers = 2;
+  cfg.ts_rate_decimator[0] = 2;
+  cfg.ts_rate_decimator[1] = 1;
+  cfg.ts_target_bitrate[0] = 300;
+  cfg.ts_target_bitrate[1] = 500;
+  memcpy(cfg.ts_layer_id, ids, sizeof(ids));
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Create input image.
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode first frame.
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, image, 0, /*duration=*/800000, 0, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
+  // Encode second frame.
+  ASSERT_EQ(vpx_codec_encode(&enc, image, 40000, /*duration=*/40000, 0,
+                             VPX_DL_REALTIME),
+            VPX_CODEC_OK);
+
+  // Encode third frame.
+  ASSERT_EQ(vpx_codec_encode(&enc, image, 80000, /*duration=*/40000, 0,
+                             VPX_DL_REALTIME),
+            VPX_CODEC_OK);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, AomediaIssue3509VbrMinSection2PercentVP8) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 1920;
+  cfg.g_h = 1080;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_target_bitrate = 1000000;
+  // Set this to more than 1 percent to cause a signed integer overflow in the
+  // multiplication cpi->av_per_frame_bandwidth *
+  // cpi->oxcf.two_pass_vbrmin_section in vp8_new_framerate() if the
+  // multiplication is done in the `int` type.
+  cfg.rc_2pass_vbr_minsection_pct = 2;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Create input image.
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frame.
+  // `duration` can go as high as 300, but the UBSan error is gone if
+  // `duration` is 301 or higher.
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, AomediaIssue3509VbrMinSection101PercentVP8) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 1920;
+  cfg.g_h = 1080;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_target_bitrate = 1000000;
+  // Set this to more than 100 percent to cause an error when vbr_min_bits is
+  // cast to `int` in vp8_new_framerate() if vbr_min_bits is not clamped to
+  // INT_MAX.
+  cfg.rc_2pass_vbr_minsection_pct = 101;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Create input image.
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frame.
+  // `duration` can go as high as 300, but the UBSan error is gone if
+  // `duration` is 301 or higher.
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, OssFuzz69100) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 64;
+  cfg.g_h = 64;
+  cfg.g_lag_in_frames = 25;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = 6240592;
+  cfg.rc_target_bitrate = 1202607620;
+  cfg.kf_max_dist = 24377;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 1), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_MAXFRAMES, 0), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_STRENGTH, 3), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control_(&enc, VP8E_SET_ARNR_TYPE, 3),
+            VPX_CODEC_OK);  // deprecated
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_NOISE_SENSITIVITY, 0),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TOKEN_PARTITIONS, 0),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_STATIC_THRESHOLD, 0),
+            VPX_CODEC_OK);
+
+  libvpx_test::RandomVideoSource video;
+  video.set_limit(30);
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.SetImageFormat(VPX_IMG_FMT_I420);
+  video.Begin();
+  do {
+    ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+                               /*flags=*/0, VPX_DL_GOOD_QUALITY),
+              VPX_CODEC_OK);
+    video.Next();
+  } while (video.img() != nullptr);
+
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+void EncodeOssFuzz69906(int cpu_used, vpx_enc_deadline_t deadline) {
+  char str[80];
+  snprintf(str, sizeof(str), "cpu_used: %d deadline: %d", cpu_used,
+           static_cast<int>(deadline));
+  SCOPED_TRACE(str);
+
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp8_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 4097;
+  cfg.g_h = 16;
+  cfg.rc_target_bitrate = 1237084865;
+  cfg.kf_max_dist = 4336;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, cpu_used), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_MAXFRAMES, 0), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_STRENGTH, 3), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control_(&enc, VP8E_SET_ARNR_TYPE, 3),
+            VPX_CODEC_OK);  // deprecated
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_NOISE_SENSITIVITY, 0),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TOKEN_PARTITIONS, 0),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_STATIC_THRESHOLD, 0),
+            VPX_CODEC_OK);
+
+  libvpx_test::Y4mVideoSource video("repro-oss-fuzz-69906.y4m", /*start=*/0,
+                                    /*limit=*/3);
+  video.Begin();
+  do {
+    ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+                               /*flags=*/0, deadline),
+              VPX_CODEC_OK);
+    video.Next();
+  } while (video.img() != nullptr);
+
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, OssFuzz69906) {
+  // Note the original bug report was for speed 1, good quality. The remainder
+  // of the settings are for added coverage.
+  for (int cpu_used = 0; cpu_used <= 5; ++cpu_used) {
+    EncodeOssFuzz69906(cpu_used, VPX_DL_GOOD_QUALITY);
+  }
+
+  for (int cpu_used = -16; cpu_used <= -5; ++cpu_used) {
+    EncodeOssFuzz69906(cpu_used, VPX_DL_REALTIME);
+  }
+}
+#endif  // CONFIG_VP8_ENCODER
+
+// Set up 2 spatial streams with 2 temporal layers per stream, and generate
+// invalid configuration by setting the temporal layer rate allocation
+// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
+// CONFIG_MULTI_RES_ENCODING.
+TEST(EncodeAPI, MultiResEncode) {
+  const int width = 1280;
+  const int height = 720;
+  const int width_down = width / 2;
+  const int height_down = height / 2;
+  const int target_bitrate = 1000;
+  const int framerate = 30;
+
+  for (const auto *iface : kCodecIfaces) {
+    vpx_codec_ctx_t enc[2];
+    vpx_codec_enc_cfg_t cfg[2];
+    vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
+
+    memset(enc, 0, sizeof(enc));
+
+    for (int i = 0; i < 2; i++) {
+      vpx_codec_enc_config_default(iface, &cfg[i], 0);
+    }
+
+    /* Highest-resolution encoder settings */
+    cfg[0].g_w = width;
+    cfg[0].g_h = height;
+    cfg[0].rc_dropframe_thresh = 0;
+    cfg[0].rc_end_usage = VPX_CBR;
+    cfg[0].rc_resize_allowed = 0;
+    cfg[0].rc_min_quantizer = 2;
+    cfg[0].rc_max_quantizer = 56;
+    cfg[0].rc_undershoot_pct = 100;
+    cfg[0].rc_overshoot_pct = 15;
+    cfg[0].rc_buf_initial_sz = 500;
+    cfg[0].rc_buf_optimal_sz = 600;
+    cfg[0].rc_buf_sz = 1000;
+    cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+    cfg[0].g_lag_in_frames = 0;
+
+    cfg[0].kf_mode = VPX_KF_AUTO;
+    cfg[0].kf_min_dist = 3000;
+    cfg[0].kf_max_dist = 3000;
+
+    cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
+    cfg[0].g_timebase.num = 1;                 /* Set fps */
+    cfg[0].g_timebase.den = framerate;
+
+    cfg[1] = cfg[0];
+    cfg[1].rc_target_bitrate = 500;
+    cfg[1].g_w = width_down;
+    cfg[1].g_h = height_down;
+
+    for (int i = 0; i < 2; i++) {
+      cfg[i].ts_number_layers = 2;
+      cfg[i].ts_periodicity = 2;
+      cfg[i].ts_rate_decimator[0] = 2;
+      cfg[i].ts_rate_decimator[1] = 1;
+      cfg[i].ts_layer_id[0] = 0;
+      cfg[i].ts_layer_id[1] = 1;
+      // Invalid parameters.
+      cfg[i].ts_target_bitrate[0] = 0;
+      cfg[i].ts_target_bitrate[1] = 0;
+    }
+
+    // VP9 should report incapable, VP8 invalid for all configurations.
+    EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0]));
+
+    for (int i = 0; i < 2; i++) {
+      vpx_codec_destroy(&enc[i]);
+    }
+  }
+}
+
+TEST(EncodeAPI, SetRoi) {
+  static struct {
+    vpx_codec_iface_t *iface;
+    int ctrl_id;
+  } kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP },
+#endif
+#if CONFIG_VP9_ENCODER
+    { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP },
+#endif
+  };
+  constexpr int kWidth = 64;
+  constexpr int kHeight = 64;
+
+  for (const auto &codec : kCodecs) {
+    SCOPED_TRACE(vpx_codec_iface_name(codec.iface));
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK);
+    cfg.g_w = kWidth;
+    cfg.g_h = kHeight;
+    EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK);
+
+    vpx_roi_map_t roi = {};
+    uint8_t roi_map[kWidth * kHeight] = {};
+    if (IsVP9(codec.iface)) {
+      roi.rows = (cfg.g_w + 7) >> 3;
+      roi.cols = (cfg.g_h + 7) >> 3;
+    } else {
+      roi.rows = (cfg.g_w + 15) >> 4;
+      roi.cols = (cfg.g_h + 15) >> 4;
+    }
+    EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+
+    roi.roi_map = roi_map;
+    // VP8 only. This value isn't range checked.
+    roi.static_threshold[1] = 1000;
+    roi.static_threshold[2] = UINT_MAX / 2 + 1;
+    roi.static_threshold[3] = UINT_MAX;
+
+    for (const auto delta : { -63, -1, 0, 1, 63 }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        roi.delta_lf[i] = delta;
+        // VP9 only.
+        roi.skip[i] ^= 1;
+        roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK);
+      }
+    }
+
+    vpx_codec_err_t expected_error;
+    for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) {
+      expected_error = VPX_CODEC_INVALID_PARAM;
+      for (int i = 0; i < 8; ++i) {
+        roi.delta_q[i] = delta;
+        // The max segment count for VP8 is 4, the remainder of the entries are
+        // ignored.
+        if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK;
+
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_q[" << i << "]: " << delta;
+        roi.delta_q[i] = 0;
+
+        roi.delta_lf[i] = delta;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "delta_lf[" << i << "]: " << delta;
+        roi.delta_lf[i] = 0;
+      }
+    }
+
+    // VP8 should ignore skip[] and ref_frame[] values.
+    expected_error =
+        IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK;
+    for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.skip[i] = skip;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "skip[" << i << "]: " << skip;
+        roi.skip[i] = 0;
+      }
+    }
+
+    // VP9 allows negative values to be used to disable segmentation.
+    for (int ref_frame = -3; ref_frame < 0; ++ref_frame) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) {
+      for (int i = 0; i < 8; ++i) {
+        roi.ref_frame[i] = ref_frame;
+        EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error)
+            << "ref_frame[" << i << "]: " << ref_frame;
+        roi.ref_frame[i] = 0;
+      }
+    }
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, ConfigChangeThreadCount) {
+  constexpr int kWidth = 1920;
+  constexpr int kHeight = 1080;
+
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
+      vpx_codec_enc_cfg_t cfg = {};
+      struct Encoder {
+        ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+        vpx_codec_ctx_t ctx = {};
+      } enc;
+
+      ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+      EXPECT_NO_FATAL_FAILURE(
+          InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg));
+      if (IsVP9(iface)) {
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6),
+                  VPX_CODEC_OK);
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i),
+                  VPX_CODEC_OK);
+      }
+
+      for (const auto threads : { 1, 4, 8, 6, 2, 1 }) {
+        cfg.g_threads = threads;
+        EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx))
+            << "iteration: " << i << " threads: " << threads;
+      }
+    }
+  }
+}
+
+TEST(EncodeAPI, ConfigResizeChangeThreadCount) {
+  constexpr int kInitWidth = 1024;
+  constexpr int kInitHeight = 1024;
+
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
+      vpx_codec_enc_cfg_t cfg = {};
+      struct Encoder {
+        ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+        vpx_codec_ctx_t ctx = {};
+      } enc;
+
+      ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+      // Start in threaded mode to ensure resolution and thread related
+      // allocations are updated correctly across changes in resolution and
+      // thread counts. See https://crbug.com/1486441.
+      cfg.g_threads = 4;
+      EXPECT_NO_FATAL_FAILURE(
+          InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg));
+      if (IsVP9(iface)) {
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6),
+                  VPX_CODEC_OK);
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i),
+                  VPX_CODEC_OK);
+      }
+
+      cfg.g_w = 1000;
+      cfg.g_h = 608;
+      EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK)
+          << vpx_codec_error_detail(&enc.ctx);
+
+      cfg.g_w = 1000;
+      cfg.g_h = 720;
+
+      for (const auto threads : { 1, 4, 8, 6, 2, 1 }) {
+        cfg.g_threads = threads;
+        EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx))
+            << "iteration: " << i << " threads: " << threads;
+      }
+    }
+  }
+}
+
+TEST(EncodeAPI, ConfigResizeBiggerAfterInit) {
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    vpx_codec_enc_cfg_t cfg;
+    vpx_codec_ctx_t enc;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+    EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+
+    cfg.g_w = 1920;
+    cfg.g_h = 1;
+    EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+              IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) {
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    vpx_codec_enc_cfg_t cfg;
+    vpx_codec_ctx_t enc;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+    EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg));
+    EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc));
+
+    cfg.g_w = 1920;
+    cfg.g_h = 1;
+    EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+              IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+    cfg.g_w = 1920;
+    cfg.g_h = 1080;
+    EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg),
+              IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM);
+
+    EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, PtsSmallerThanInitialPts) {
+  for (const auto *iface : kCodecIfaces) {
+    // Initialize libvpx encoder.
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+    ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+    // Create input image.
+    vpx_image_t *const image =
+        CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+    ASSERT_NE(image, nullptr);
+
+    // Encode frame.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_OK);
+    // pts (10) is smaller than the initial pts (12).
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+
+    // Free resources.
+    vpx_img_free(image);
+    ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, PtsOrDurationTooBig) {
+  for (const auto *iface : kCodecIfaces) {
+    // Initialize libvpx encoder.
+    vpx_codec_ctx_t enc;
+    vpx_codec_enc_cfg_t cfg;
+
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+    ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+    // Create input image.
+    vpx_image_t *const image =
+        CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+    ASSERT_NE(image, nullptr);
+
+    // Encode frame.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY),
+              VPX_CODEC_OK);
+#if ULONG_MAX > INT64_MAX
+    // duration is too big.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2),
+              VPX_CODEC_INVALID_PARAM);
+#endif
+    // pts, when converted to ticks, is too big.
+    ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0,
+                               VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+#if ULONG_MAX > INT64_MAX
+    // duration is too big.
+    ASSERT_EQ(
+        vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY),
+        VPX_CODEC_INVALID_PARAM);
+    // pts + duration is too big.
+    ASSERT_EQ(
+        vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY),
+        VPX_CODEC_INVALID_PARAM);
+#endif
+    // pts + duration, when converted to ticks, is too big.
+#if ULONG_MAX > INT64_MAX
+    ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0,
+                               VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+#endif
+    ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0,
+                               VPX_DL_BEST_QUALITY),
+              VPX_CODEC_INVALID_PARAM);
+
+    // Free resources.
+    vpx_img_free(image);
+    ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+TEST(EncodeAPI, PerFramePsnr) {
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    vpx_codec_enc_cfg_t cfg;
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+    cfg.g_lag_in_frames = 0;
+
+    vpx_codec_ctx_t enc;
+    ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+    vpx_image_t *const image =
+        CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+    ASSERT_NE(image, nullptr);
+
+    vpx_enc_frame_flags_t psnr_flags = VPX_EFLAG_CALCULATE_PSNR;
+    ASSERT_EQ(vpx_codec_encode(&enc, image, /*pts=*/0, /*duration=*/1,
+                               psnr_flags, VPX_DL_REALTIME),
+              VPX_CODEC_OK);
+
+    const vpx_codec_cx_pkt_t *pkt;
+    vpx_codec_iter_t iter = nullptr;
+    bool had_psnr = false;
+    while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) {
+        ASSERT_EQ(pkt->kind, VPX_CODEC_PSNR_PKT);
+        had_psnr = true;
+      }
+    }
+    EXPECT_TRUE(had_psnr);
+
+    vpx_enc_frame_flags_t no_psnr_flags = 0;
+    ASSERT_EQ(vpx_codec_encode(&enc, image, /*pts=*/1, /*duration=*/1,
+                               no_psnr_flags, VPX_DL_REALTIME),
+              VPX_CODEC_OK);
+
+    iter = nullptr;
+    had_psnr = false;
+    while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) {
+        ASSERT_EQ(pkt->kind, VPX_CODEC_PSNR_PKT);
+        had_psnr = true;
+      }
+    }
+#if CONFIG_INTERNAL_STATS
+    // CONFIG_INTERNAL_STATS unconditionally generates PSNR.
+    EXPECT_TRUE(had_psnr);
+#else
+    EXPECT_FALSE(had_psnr);
+#endif  // CONFIG_INTERNAL_STATS
+
+    // Free resources.
+    vpx_img_free(image);
+    ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+  }
+}
+
+#if CONFIG_VP9_ENCODER
+// Frame size needed to trigger the overflow exceeds the max buffer allowed on
+// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY
+#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
+TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) {
+#ifdef CHROMIUM
+  GTEST_SKIP() << "Under Chromium's configuration the allocator is unable"
+                  "to provide the space required for the frames below.";
+#else
+  constexpr int kWidth = 12383;
+  constexpr int kHeight = 8192;
+  constexpr auto *iface = &vpx_codec_vp9_cx_algo;
+  SCOPED_TRACE(vpx_codec_iface_name(iface));
+  vpx_codec_enc_cfg_t cfg = {};
+  struct Encoder {
+    ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+    vpx_codec_ctx_t ctx = {};
+  } enc;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  // The following setting will cause avg_frame_bandwidth in rate control to be
+  // larger than INT_MAX
+  cfg.rc_target_bitrate = INT_MAX;
+  // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed
+  // by libvpx
+  cfg.g_timebase.den = 1;
+  cfg.g_timebase.num = 10;
+  EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg))
+      << "target bitrate: " << cfg.rc_target_bitrate << " framerate: "
+      << static_cast<double>(cfg.g_timebase.den) / cfg.g_timebase.num;
+#endif  // defined(CHROMIUM)
+}
+#endif  // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64
+
+// Emulates the WebCodecs VideoEncoder interface.
+class VP9Encoder {
+ public:
+  explicit VP9Encoder(int speed)
+      : speed_(speed), row_mt_(0), bit_depth_(VPX_BITS_8),
+        fmt_(VPX_IMG_FMT_I420) {}
+  // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set.
+  // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before
+  // passing the image format to vpx_img_alloc().
+  VP9Encoder(int speed, unsigned int row_mt, vpx_bit_depth_t bit_depth,
+             vpx_img_fmt_t fmt)
+      : speed_(speed), row_mt_(row_mt), bit_depth_(bit_depth), fmt_(fmt) {}
+  ~VP9Encoder();
+
+  void Configure(unsigned int threads, unsigned int width, unsigned int height,
+                 vpx_rc_mode end_usage, vpx_enc_deadline_t deadline);
+  void Encode(bool key_frame);
+
+ private:
+  const int speed_;
+  const unsigned int row_mt_;
+  const vpx_bit_depth_t bit_depth_;
+  const vpx_img_fmt_t fmt_;
+  bool initialized_ = false;
+  vpx_codec_enc_cfg_t cfg_;
+  vpx_codec_ctx_t enc_;
+  int frame_index_ = 0;
+  vpx_enc_deadline_t deadline_ = 0;
+};
+
+VP9Encoder::~VP9Encoder() {
+  if (initialized_) {
+    EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK);
+  }
+}
+
+void VP9Encoder::Configure(unsigned int threads, unsigned int width,
+                           unsigned int height, vpx_rc_mode end_usage,
+                           vpx_enc_deadline_t deadline) {
+  deadline_ = deadline;
+
+  if (!initialized_) {
+    ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0);
+    const bool high_bit_depth = bit_depth_ > VPX_BITS_8;
+    const bool is_420 = fmt_ == VPX_IMG_FMT_I420;
+    vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+    ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0),
+              VPX_CODEC_OK);
+    cfg_.g_threads = threads;
+    // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3,
+    // all other subsampling formats are allowed. In profiles 0 and 1, only bit
+    // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are
+    // allowed.
+    cfg_.g_profile = 2 * high_bit_depth + !is_420;
+    cfg_.g_w = width;
+    cfg_.g_h = height;
+    cfg_.g_bit_depth = bit_depth_;
+    cfg_.g_input_bit_depth = bit_depth_;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 1000 * 1000;  // microseconds
+    cfg_.g_pass = VPX_RC_ONE_PASS;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = end_usage;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 58;
+    ASSERT_EQ(
+        vpx_codec_enc_init(&enc_, iface, &cfg_,
+                           high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0),
+        VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK);
+    ASSERT_EQ(vpx_codec_control(&enc_, VP9E_SET_ROW_MT, row_mt_), VPX_CODEC_OK);
+    initialized_ = true;
+    return;
+  }
+
+  cfg_.g_threads = threads;
+  cfg_.g_w = width;
+  cfg_.g_h = height;
+  cfg_.rc_end_usage = end_usage;
+  ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK)
+      << vpx_codec_error_detail(&enc_);
+}
+
+void VP9Encoder::Encode(bool key_frame) {
+  assert(initialized_);
+  const vpx_codec_cx_pkt_t *pkt;
+  vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h);
+  ASSERT_NE(image, nullptr);
+  const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0;
+  ASSERT_EQ(
+      vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_),
+      VPX_CODEC_OK);
+  ++frame_index_;
+  vpx_codec_iter_t iter = nullptr;
+  while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) {
+    ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+  }
+  vpx_img_free(image);
+}
+
+// This is a test case from clusterfuzz.
+TEST(EncodeAPI, PrevMiCheckNullptr) {
+  VP9Encoder encoder(0);
+  encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME);
+
+  // First step: encode, without forcing KF.
+  encoder.Encode(false);
+  // Second step: change config
+  encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY);
+  // Third step: encode, without forcing KF
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/310477034.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, MultipleChangeConfigResize) {
+  VP9Encoder encoder(3);
+
+  // Set initial config.
+  encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Change config again.
+  encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Encode 3rd frame with same config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/310663186.
+// Encode set of frames while varying the deadline on the fly from
+// good to realtime to best and back to realtime.
+TEST(EncodeAPI, DynamicDeadlineChange) {
+  // Use realtime speed: 5 to 9.
+  VP9Encoder encoder(5);
+
+  // Set initial config, in particular set deadline to GOOD mode.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 1st frame.
+  encoder.Encode(true);
+
+  // Encode 2nd frame, delta frame.
+  encoder.Encode(false);
+
+  // Change config: change deadline to REALTIME.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 3rd frame with new config, set key frame.
+  encoder.Encode(true);
+
+  // Encode 4th frame with same config, delta frame.
+  encoder.Encode(false);
+
+  // Encode 5th frame with same config, key frame.
+  encoder.Encode(true);
+
+  // Change config: change deadline to BEST.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY);
+
+  // Encode 6th frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Change config: change deadline to REALTIME.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 7th frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Encode 8th frame with new config, set key frame.
+  encoder.Encode(true);
+
+  // Encode 9th frame with new config, set delta frame.
+  encoder.Encode(false);
+}
+
+TEST(EncodeAPI, Buganizer310340241) {
+  VP9Encoder encoder(-6);
+
+  // Set initial config, in particular set deadline to GOOD mode.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 1st frame.
+  encoder.Encode(true);
+
+  // Encode 2nd frame, delta frame.
+  encoder.Encode(false);
+
+  // Change config: change deadline to REALTIME.
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 3rd frame with new config, set key frame.
+  encoder.Encode(true);
+}
+
+// This is a test case from clusterfuzz: based on b/312517065.
+TEST(EncodeAPI, Buganizer312517065) {
+  VP9Encoder encoder(4);
+  encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+  encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY);
+  encoder.Encode(false);
+  encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY);
+  encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME);
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/311489136.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311489136) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Change config again.
+  encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 3rd frame with new config, set delta frame.
+  encoder.Encode(false);
+
+  // Change config again.
+  encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 4th frame with same config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/312656387.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer312656387) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME);
+
+  // Change config.
+  encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config again.
+  encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config.
+  encoder.Encode(true);
+
+  // Change config again.
+  encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 3rd frame with new config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/310329177.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer310329177) {
+  VP9Encoder encoder(6);
+
+  // Set initial config.
+  encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config, set delta frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/311394513.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311394513) {
+  VP9Encoder encoder(-7);
+
+  // Set initial config.
+  encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config.
+  encoder.Encode(true);
+}
+
+TEST(EncodeAPI, Buganizer311985118) {
+  VP9Encoder encoder(0);
+
+  // Set initial config, in particular set deadline to GOOD mode.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 1st frame.
+  encoder.Encode(false);
+
+  // Change config: change threads and width.
+  encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Change config: change threads, width and height.
+  encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame.
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/314857577.
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer314857577) {
+  VP9Encoder encoder(4);
+
+  // Set initial config.
+  encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 2nd frame with new config.
+  encoder.Encode(false);
+
+  // Encode 3rd frame with new config.
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 4th frame with new config.
+  encoder.Encode(true);
+
+  // Encode 5th frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME);
+
+  // Change config.
+  encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME);
+
+  // Encode 6th frame with new config.
+  encoder.Encode(false);
+
+  // Encode 7th frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode 8th frame with new config.
+  encoder.Encode(false);
+}
+
+TEST(EncodeAPI, Buganizer312875957PredBufferStride) {
+  VP9Encoder encoder(-1);
+
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+  encoder.Encode(false);
+  encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+  encoder.Configure(0, 1678, 620, VPX_CBR, 1000000);
+  encoder.Encode(false);
+  encoder.Encode(false);
+}
+
+// This is a test case from clusterfuzz: based on b/311294795
+// Encode a few frames with multiple change config calls
+// with different frame sizes.
+TEST(EncodeAPI, Buganizer311294795) {
+  VP9Encoder encoder(1);
+
+  // Set initial config.
+  encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME);
+
+  // Encode first frame.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 2nd frame with new config
+  encoder.Encode(true);
+
+  // Change config.
+  encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY);
+
+  // Encode 3rd frame with new config.
+  encoder.Encode(false);
+
+  // Change config.
+  encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME);
+  // Encode more frames with new config.
+  encoder.Encode(false);
+  encoder.Encode(false);
+}
+
+// Test case to capture assert issue triggered in
+// vp9_bitstream.c for good_quality, speed 1, lossless;
+// See comment#22 in issue:433941753.
+TEST(EncodeAPI, AssertIssueGoodQualitySpeed1Lossless) {
+  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  cfg.g_w = 1540;
+  cfg.g_h = 838;
+  cfg.g_profile = 0;
+  cfg.g_bit_depth = VPX_BITS_8;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = 10000;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_end_usage = VPX_VBR;
+  cfg.g_threads = 1;
+  cfg.rc_target_bitrate = 10000;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_LOSSLESS, 1), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 1), VPX_CODEC_OK);
+  libvpx_test::RandomVideoSource video;
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.SetImageFormat(VPX_IMG_FMT_I420);
+  video.set_limit(20);
+  video.Begin();
+  do {
+    ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(),
+                               0, VPX_DL_GOOD_QUALITY),
+              VPX_CODEC_OK);
+    video.Next();
+  } while (video.img() != nullptr);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, Buganizer317105128) {
+  VP9Encoder encoder(-9);
+  encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY);
+  encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer319964497) {
+  VP9Encoder encoder(7);
+  encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+}
+
+TEST(EncodeAPI, Buganizer329088759RowMT0) {
+  VP9Encoder encoder(8, 0, VPX_BITS_8, VPX_IMG_FMT_I444);
+  encoder.Configure(/*threads=*/8, /*width=*/1686, /*height=*/398, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/0, /*width=*/1686, /*height=*/1, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/0, /*width=*/1482, /*height=*/113, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/0, /*width=*/881, /*height=*/59, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Configure(/*threads=*/13, /*width=*/1271, /*height=*/385, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/2, /*width=*/1, /*height=*/62, VPX_VBR,
+                    VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer329088759RowMT1) {
+  VP9Encoder encoder(8, 1, VPX_BITS_8, VPX_IMG_FMT_I444);
+  encoder.Configure(/*threads=*/8, /*width=*/1686, /*height=*/398, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Encode(/*key_frame=*/false);
+  // Needs to set threads to non-zero to repro the issue.
+  encoder.Configure(/*threads=*/2, /*width=*/1686, /*height=*/1, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/2, /*width=*/1482, /*height=*/113, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/2, /*width=*/881, /*height=*/59, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Configure(/*threads=*/13, /*width=*/1271, /*height=*/385, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/2, /*width=*/1, /*height=*/62, VPX_VBR,
+                    VPX_DL_REALTIME);
+}
+
+TEST(EncodeAPI, Buganizer331086799) {
+  VP9Encoder encoder(6, 1, VPX_BITS_8, VPX_IMG_FMT_I420);
+  encoder.Configure(0, 1385, 1, VPX_CBR, VPX_DL_REALTIME);
+  encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Encode(false);
+  encoder.Configure(16, 1385, 1, VPX_VBR, VPX_DL_GOOD_QUALITY);
+  encoder.Encode(false);
+  encoder.Encode(false);
+  encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_REALTIME);
+  encoder.Encode(true);
+}
+
+TEST(EncodeAPI, Buganizer331108729) {
+  VP9Encoder encoder(1, 1, VPX_BITS_8, VPX_IMG_FMT_I422);
+  encoder.Configure(0, 1919, 260, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Configure(9, 440, 1, VPX_CBR, VPX_DL_GOOD_QUALITY);
+  encoder.Encode(true);
+  encoder.Configure(8, 1919, 260, VPX_VBR, VPX_DL_REALTIME);
+  encoder.Encode(false);
+}
+
+TEST(EncodeAPI, Buganizer331108922BitDepth8) {
+  VP9Encoder encoder(9, 1, VPX_BITS_8, VPX_IMG_FMT_I420);
+  encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1080, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/0, /*width=*/1, /*height=*/1080, VPX_CBR,
+                    VPX_DL_GOOD_QUALITY);
+  encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/394, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/798, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+}
+
+// Encode some frames, flip from BEST_QUALITY to REALTIME after 2 frames.
+// This test is taken from the code snippet in issue:441668134.
+TEST(EncodeAPI, Buganizer441668134) {
+  // Get VP9 encoder interface.
+  vpx_codec_iface_t *iface = vpx_codec_vp9_cx();
+  // Initialize encoder configuration with default values.
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_max_quantizer = 0;
+  unsigned long init_flags = 0;
+  vpx_codec_ctx_t ctx;
+  ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, init_flags), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, 9), 0);
+  ASSERT_EQ(vpx_codec_control_(&ctx, VP9E_SET_DELTA_Q_UV, -15), 0);
+  // Image allocation.
+  vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420;
+  vpx_image_t *img = vpx_img_alloc(NULL, img_fmt, cfg.g_w, cfg.g_h, 32);
+  for (unsigned int y = 0; y < img->d_h; y++) {
+    for (unsigned int x = 0; x < img->d_w; x++) {
+      img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF;
+    }
+  }
+  const unsigned int uv_height = (img->d_h + 1) >> 1;
+  for (int i : { VPX_PLANE_U, VPX_PLANE_V }) {
+    memset(img->planes[i], 0, img->stride[i] * uv_height);
+  }
+  // Encode some frames.
+  int num_frames = 6;
+  static constexpr int kChoices[6] = { 1, 1, 0, 0, 0, 0 };
+  for (int frame = 0; frame < num_frames; frame++) {
+    vpx_enc_deadline_t deadline = VPX_DL_REALTIME;
+    uint8_t dl_choice = kChoices[frame];
+    if (dl_choice == 1) deadline = VPX_DL_BEST_QUALITY;
+    // Encode frame.
+    ASSERT_EQ(vpx_codec_encode(&ctx, img, frame, 1, 0, deadline), VPX_CODEC_OK);
+  }
+  vpx_img_free(img);
+  vpx_codec_destroy(&ctx);
+}
+
+// Encode a few frames, with realtime mode and tile_rows set to 1,
+// with row-mt enabled. This triggers an assertion in vp9_bitstream.c (in
+// function write_modes()), as in the issue:442105459. In this test it happens
+// on very first encoded frame since lag_in_frames = 0. Issue is due to enabling
+// TILE_ROWS, with number of tile_rows more than the number of superblocks.
+// This test sets 2 tile_rows with height corresponding to 1 superblock (sb).
+TEST(EncodeAPI, Buganizer442105459_2RowTiles) {
+  // Initialize VP9 encoder interface
+  vpx_codec_iface_t *iface = vpx_codec_vp9_cx();
+  // Get default encoder configuration
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  // Configure encoder
+  cfg.g_w = 946u;
+  cfg.g_h = 64u;  // 1 sb row, 2 tile_rows set below.
+  cfg.g_threads = 1;
+  cfg.g_profile = 0;
+  cfg.g_bit_depth = VPX_BITS_8;
+  // Rate control targeting deeper encoding paths
+  cfg.rc_target_bitrate = 100;
+  cfg.rc_min_quantizer = 0;
+  cfg.rc_max_quantizer = 0;
+  cfg.rc_end_usage = VPX_VBR;
+  cfg.ss_number_layers = 1;
+  cfg.g_lag_in_frames = 0;
+  // Initialize encoder context
+  vpx_codec_ctx_t ctx;
+  ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, 0), VPX_CODEC_OK);
+  // Set control parameters
+  vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, -5);
+  vpx_codec_control_(&ctx, VP9E_SET_TILE_ROWS, 1);
+  vpx_codec_control_(&ctx, VP9E_SET_TILE_COLUMNS, 1);
+  vpx_codec_control_(&ctx, VP9E_SET_ROW_MT, 1);
+  // Image format selection
+  vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420;
+  // Allocate image with varied alignment
+  vpx_image_t *img = vpx_img_alloc(nullptr, img_fmt, cfg.g_w, cfg.g_h, 1);
+  for (unsigned int y = 0; y < img->d_h; y++) {
+    for (unsigned int x = 0; x < img->d_w; x++) {
+      img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF;
+    }
+  }
+  const unsigned int uv_height = (img->d_h + 1) >> 1;
+  for (int i : { VPX_PLANE_U, VPX_PLANE_V }) {
+    memset(img->planes[i], 0, img->stride[i] * uv_height);
+  }
+  // Encode with dynamic configuration changes
+  int num_frames = 2;
+  // Per-frame constants captured from the original run (indices consumed per
+  // frame)
+  const vpx_codec_pts_t frame_pts_mul[] = { 33333UL, 33333UL };
+  const unsigned long frame_durations[] = { 33333UL, 33333UL };
+  const vpx_enc_deadline_t frame_deadlines[] = { VPX_DL_REALTIME,
+                                                 VPX_DL_REALTIME };
+  for (int frame = 0; frame < num_frames; frame++) {
+    // Encode frame
+    vpx_codec_pts_t pts = frame * frame_pts_mul[frame];
+    unsigned long duration = frame_durations[frame];
+    vpx_enc_deadline_t deadline = frame_deadlines[frame];
+    ASSERT_EQ(vpx_codec_encode(&ctx, img, pts, duration, /*flags*/ 0, deadline),
+              VPX_CODEC_OK);
+  }
+  // Flush encoder.
+  ASSERT_EQ(vpx_codec_encode(&ctx, nullptr, 0, 0, 0, VPX_DL_REALTIME), 0);
+  // Get remaining data
+  vpx_codec_iter_t iter = nullptr;
+  while (vpx_codec_get_cx_data(&ctx, &iter) != nullptr) {
+    // Process remaining packets
+  }
+  vpx_img_free(img);
+  vpx_codec_destroy(&ctx);
+}
+
+// Encode a few frames, with realtime mode and tile_rows set to 1,
+// with row-mt enabled. This triggers an assertion in vp9_bitstream.c (in
+// function write_modes()), as in the issue:442105459. In this test it happens
+// on very first encoded frame since lag_in_frames = 0. Issue is due to enabling
+// TILE_ROWS, with number of tile_rows more than the number of superblocks.
+// This test sets 4 tile_rows with height corresponding to 3 superblocks.
+TEST(EncodeAPI, Buganizer442105459_4RowTiles) {
+  // Initialize VP9 encoder interface
+  vpx_codec_iface_t *iface = vpx_codec_vp9_cx();
+  // Get default encoder configuration
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  // Configure encoder
+  cfg.g_w = 946u;
+  cfg.g_h = 192u;  // 3 sb rows, 4 tile_rows set below.
+  cfg.g_threads = 1;
+  cfg.g_profile = 0;
+  cfg.g_bit_depth = VPX_BITS_8;
+  // Rate control targeting deeper encoding paths
+  cfg.rc_target_bitrate = 100;
+  cfg.rc_min_quantizer = 0;
+  cfg.rc_max_quantizer = 0;
+  cfg.rc_end_usage = VPX_VBR;
+  cfg.ss_number_layers = 1;
+  cfg.g_lag_in_frames = 0;
+  // Initialize encoder context
+  vpx_codec_ctx_t ctx;
+  ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, 0), VPX_CODEC_OK);
+  // Set control parameters
+  vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, -5);
+  vpx_codec_control_(&ctx, VP9E_SET_TILE_ROWS, 2);
+  vpx_codec_control_(&ctx, VP9E_SET_TILE_COLUMNS, 1);
+  vpx_codec_control_(&ctx, VP9E_SET_ROW_MT, 1);
+  // Image format selection
+  vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420;
+  // Allocate image with varied alignment
+  vpx_image_t *img = vpx_img_alloc(nullptr, img_fmt, cfg.g_w, cfg.g_h, 1);
+  for (unsigned int y = 0; y < img->d_h; y++) {
+    for (unsigned int x = 0; x < img->d_w; x++) {
+      img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF;
+    }
+  }
+  const unsigned int uv_height = (img->d_h + 1) >> 1;
+  for (int i : { VPX_PLANE_U, VPX_PLANE_V }) {
+    memset(img->planes[i], 0, img->stride[i] * uv_height);
+  }
+  // Encode with dynamic configuration changes
+  int num_frames = 2;
+  // Per-frame constants captured from the original run (indices consumed per
+  // frame)
+  const vpx_codec_pts_t frame_pts_mul[] = { 33333UL, 33333UL };
+  const unsigned long frame_durations[] = { 33333UL, 33333UL };
+  const vpx_enc_deadline_t frame_deadlines[] = { VPX_DL_REALTIME,
+                                                 VPX_DL_REALTIME };
+  for (int frame = 0; frame < num_frames; frame++) {
+    // Encode frame
+    vpx_codec_pts_t pts = frame * frame_pts_mul[frame];
+    unsigned long duration = frame_durations[frame];
+    vpx_enc_deadline_t deadline = frame_deadlines[frame];
+    ASSERT_EQ(vpx_codec_encode(&ctx, img, pts, duration, /*flags*/ 0, deadline),
+              VPX_CODEC_OK);
+  }
+  // Flush encoder.
+  ASSERT_EQ(vpx_codec_encode(&ctx, nullptr, 0, 0, 0, VPX_DL_REALTIME), 0);
+  // Get remaining data
+  vpx_codec_iter_t iter = nullptr;
+  while (vpx_codec_get_cx_data(&ctx, &iter) != nullptr) {
+    // Process remaining packets
+  }
+  vpx_img_free(img);
+  vpx_codec_destroy(&ctx);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+TEST(EncodeAPI, Buganizer329674887RowMT0BitDepth12) {
+  VP9Encoder encoder(8, 0, VPX_BITS_12, VPX_IMG_FMT_I444);
+  encoder.Configure(/*threads=*/2, /*width=*/1030, /*height=*/583, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/0, /*width=*/1030, /*height=*/1, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/0, /*width=*/548, /*height=*/322, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/16, /*width=*/24, /*height=*/583, VPX_CBR,
+                    VPX_DL_GOOD_QUALITY);
+}
+
+TEST(EncodeAPI, Buganizer329179808RowMT0BitDepth10) {
+  VP9Encoder encoder(4, 0, VPX_BITS_10, VPX_IMG_FMT_I444);
+  encoder.Configure(/*threads=*/16, /*width=*/1488, /*height=*/5, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/16, /*width=*/839, /*height=*/1, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/11, /*width=*/657, /*height=*/5, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+}
+
+TEST(EncodeAPI, Buganizer329179808RowMT1BitDepth10) {
+  VP9Encoder encoder(4, 1, VPX_BITS_10, VPX_IMG_FMT_I444);
+  encoder.Configure(/*threads=*/16, /*width=*/1488, /*height=*/5, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/16, /*width=*/839, /*height=*/1, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/11, /*width=*/657, /*height=*/5, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+}
+
+TEST(EncodeAPI, Buganizer331108922BitDepth12) {
+  VP9Encoder encoder(9, 1, VPX_BITS_12, VPX_IMG_FMT_I444);
+  encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1080, VPX_VBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Configure(/*threads=*/0, /*width=*/1, /*height=*/1080, VPX_CBR,
+                    VPX_DL_GOOD_QUALITY);
+  encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/394, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+  encoder.Encode(/*key_frame=*/true);
+  encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/798, VPX_CBR,
+                    VPX_DL_REALTIME);
+  encoder.Encode(/*key_frame=*/false);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+TEST(EncodeAPI, VP9GlobalHeaders) {
+  constexpr int kWidth = 320;
+  constexpr int kHeight = 240;
+
+  libvpx_test::DummyVideoSource video;
+  video.SetSize(kWidth, kHeight);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int profiles[] = { 0, 1, 2, 3 };
+#else
+  const int profiles[] = { 0, 1 };
+#endif
+  char str[80];
+  for (const int profile : profiles) {
+    std::vector<vpx_bit_depth_t> bitdepths;
+    std::vector<vpx_img_fmt_t> formats;
+    switch (profile) {
+      case 0:
+        bitdepths = { VPX_BITS_8 };
+        formats = { VPX_IMG_FMT_I420 };
+        break;
+      case 1:
+        bitdepths = { VPX_BITS_8 };
+        formats = { VPX_IMG_FMT_I422, VPX_IMG_FMT_I444 };
+        break;
+#if CONFIG_VP9_HIGHBITDEPTH
+      case 2:
+        bitdepths = { VPX_BITS_10, VPX_BITS_12 };
+        formats = { VPX_IMG_FMT_I42016 };
+        break;
+      case 3:
+        bitdepths = { VPX_BITS_10, VPX_BITS_12 };
+        formats = { VPX_IMG_FMT_I42216, VPX_IMG_FMT_I44416 };
+        break;
+#endif
+    }
+
+    for (const auto format : formats) {
+      for (const auto bitdepth : bitdepths) {
+        snprintf(str, sizeof(str), "profile: %d bitdepth: %d format: %d",
+                 profile, bitdepth, format);
+        SCOPED_TRACE(str);
+
+        vpx_codec_enc_cfg_t cfg = {};
+        struct Encoder {
+          ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+          vpx_codec_ctx_t ctx = {};
+        } enc;
+        vpx_codec_ctx_t *const ctx = &enc.ctx;
+
+        ASSERT_EQ(vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0),
+                  VPX_CODEC_OK);
+        cfg.g_w = kWidth;
+        cfg.g_h = kHeight;
+        cfg.g_lag_in_frames = 0;
+        cfg.g_pass = VPX_RC_ONE_PASS;
+        cfg.g_profile = profile;
+        cfg.g_bit_depth = bitdepth;
+        ASSERT_EQ(
+            vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg,
+                               bitdepth == 8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH),
+            VPX_CODEC_OK);
+        ASSERT_EQ(vpx_codec_control_(ctx, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK);
+        ASSERT_EQ(vpx_codec_control_(ctx, VP9E_SET_TARGET_LEVEL, 62),
+                  VPX_CODEC_OK);
+
+        vpx_fixed_buf_t *global_headers = vpx_codec_get_global_headers(ctx);
+        EXPECT_NE(global_headers, nullptr);
+        EXPECT_EQ(global_headers->sz, size_t{ 9 });
+
+        video.SetImageFormat(format);
+        video.Begin();
+        EXPECT_EQ(
+            vpx_codec_encode(ctx, video.img(), video.pts(), video.duration(),
+                             /*flags=*/0, VPX_DL_GOOD_QUALITY),
+            VPX_CODEC_OK)
+            << vpx_codec_error_detail(ctx);
+
+        global_headers = vpx_codec_get_global_headers(ctx);
+        EXPECT_NE(global_headers, nullptr);
+        EXPECT_EQ(global_headers->sz, size_t{ 12 });
+        uint8_t chroma_subsampling;
+        if ((format & VPX_IMG_FMT_I420) == VPX_IMG_FMT_I420) {
+          chroma_subsampling = 1;
+        } else if ((format & VPX_IMG_FMT_I422) == VPX_IMG_FMT_I422) {
+          chroma_subsampling = 2;
+        } else {  // VPX_IMG_FMT_I444
+          chroma_subsampling = 3;
+        }
+        const uint8_t expected_headers[] = { 1,
+                                             1,
+                                             static_cast<uint8_t>(profile),
+                                             2,
+                                             1,
+                                             /*level,*/ 3,
+                                             1,
+                                             static_cast<uint8_t>(bitdepth),
+                                             4,
+                                             1,
+                                             chroma_subsampling };
+        const uint8_t *actual_headers =
+            reinterpret_cast<const uint8_t *>(global_headers->buf);
+        for (int i = 0; i < 5; ++i) {
+          EXPECT_EQ(expected_headers[i], actual_headers[i]) << "index: " << i;
+        }
+        EXPECT_NE(actual_headers[6], 0);  // level
+        for (int i = 5; i < 11; ++i) {
+          EXPECT_EQ(expected_headers[i], actual_headers[i + 1])
+              << "index: " << i + 1;
+        }
+      }
+    }
+  }
+}
+
+TEST(EncodeAPI, AomediaIssue3509VbrMinSection2PercentVP9) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 1920;
+  cfg.g_h = 1080;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_target_bitrate = 1000000;
+  // Set this to more than 1 percent to cause a signed integer overflow in the
+  // multiplication rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section in
+  // vp9_rc_update_framerate() if the multiplication is done in the `int` type.
+  cfg.rc_2pass_vbr_minsection_pct = 2;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Create input image.
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frame.
+  // `duration` can go as high as 300, but the UBSan error is gone if
+  // `duration` is 301 or higher.
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, AomediaIssue3509VbrMinSection101PercentVP9) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 1920;
+  cfg.g_h = 1080;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_target_bitrate = 1000000;
+  // Set this to more than 100 percent to cause an error when vbr_min_bits is
+  // cast to `int` in vp9_rc_update_framerate() if vbr_min_bits is not clamped
+  // to INT_MAX.
+  cfg.rc_2pass_vbr_minsection_pct = 101;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  // Create input image.
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frame.
+  // `duration` can go as high as 300, but the UBSan error is gone if
+  // `duration` is 301 or higher.
+  ASSERT_EQ(
+      vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME),
+      VPX_CODEC_OK);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, Chromium352414650) {
+  // Initialize libvpx encoder.
+  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+
+  cfg.g_w = 1024;
+  cfg.g_h = 1024;
+  cfg.g_profile = 0;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.rc_max_quantizer = 58;
+  cfg.rc_min_quantizer = 2;
+  cfg.g_threads = 4;
+  cfg.rc_resize_allowed = 0;
+  cfg.rc_dropframe_thresh = 0;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = 1000000;
+  cfg.kf_min_dist = 0;
+  cfg.kf_max_dist = 10000;
+  cfg.rc_end_usage = VPX_CBR;
+  cfg.rc_target_bitrate = 754974;
+  cfg.ts_number_layers = 3;
+  cfg.ts_periodicity = 4;
+  cfg.ts_layer_id[0] = 0;
+  cfg.ts_layer_id[1] = 2;
+  cfg.ts_layer_id[2] = 1;
+  cfg.ts_layer_id[3] = 2;
+  cfg.ts_rate_decimator[0] = 4;
+  cfg.ts_rate_decimator[1] = 2;
+  cfg.ts_rate_decimator[2] = 1;
+  cfg.layer_target_bitrate[0] = 2147483;
+  cfg.layer_target_bitrate[1] = 3006476;
+  cfg.layer_target_bitrate[2] = 4294967;
+  cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+  cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 7), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_TILE_COLUMNS, 2), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_ROW_MT, 1), VPX_CODEC_OK);
+
+  vpx_svc_extra_cfg_t svc_cfg = {};
+  svc_cfg.max_quantizers[0] = svc_cfg.max_quantizers[1] =
+      svc_cfg.max_quantizers[2] = 58;
+  svc_cfg.min_quantizers[0] = svc_cfg.min_quantizers[1] =
+      svc_cfg.min_quantizers[2] = 2;
+  svc_cfg.scaling_factor_num[0] = svc_cfg.scaling_factor_num[1] =
+      svc_cfg.scaling_factor_num[2] = 1;
+  svc_cfg.scaling_factor_den[0] = svc_cfg.scaling_factor_den[1] =
+      svc_cfg.scaling_factor_den[2] = 1;
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_SVC_PARAMETERS, &svc_cfg),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_SVC, 1), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_AQ_MODE, 3), VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_STATIC_THRESHOLD, 1),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_COLOR_SPACE, VPX_CS_SMPTE_170),
+            VPX_CODEC_OK);
+  ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_COLOR_RANGE, VPX_CR_STUDIO_RANGE),
+            VPX_CODEC_OK);
+
+  // Create input image.
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frame.
+  ASSERT_EQ(vpx_codec_encode(&enc, image, 0, /*duration=*/500000,
+                             VPX_EFLAG_FORCE_KF, VPX_DL_REALTIME),
+            VPX_CODEC_OK);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(EncodeAPI, PerFramePsnrNotSupportedWithLagInFrames) {
+  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK);
+  ASSERT_NE(cfg.g_lag_in_frames, 0u);
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  vpx_image_t *const image =
+      CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  vpx_enc_frame_flags_t psnr_flags = VPX_EFLAG_CALCULATE_PSNR;
+  ASSERT_EQ(vpx_codec_encode(&enc, image, /*pts=*/0, /*duration=*/1, psnr_flags,
+                             VPX_DL_REALTIME),
+            VPX_CODEC_INCAPABLE);
+
+  // Free resources.
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+#endif  // CONFIG_VP9_ENCODER
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/encode_perf_test.cc b/media/libvpx/libvpx/test/encode_perf_test.cc
index 0bb435502b..75d6838f60 100644
--- a/media/libvpx/libvpx/test/encode_perf_test.cc
+++ b/media/libvpx/libvpx/test/encode_perf_test.cc
@@ -7,15 +7,15 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <cstdio>
 #include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vpx_version.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
 #include "vpx_ports/vpx_timer.h"
 
 namespace {
@@ -48,7 +48,7 @@ const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
   EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
 };
 
-const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 };
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 };
 const int kEncodePerfTestThreads[] = { 1, 2, 4 };
 
 #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0]))
@@ -61,9 +61,9 @@ class VP9EncodePerfTest
       : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
         encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
 
-  virtual ~VP9EncodePerfTest() {}
+  ~VP9EncodePerfTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
@@ -82,8 +82,8 @@ class VP9EncodePerfTest
     cfg_.g_threads = threads_;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       const int log2_tile_columns = 3;
       encoder->Control(VP8E_SET_CPUUSED, speed_);
@@ -93,19 +93,19 @@ class VP9EncodePerfTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     min_psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) {
       min_psnr_ = pkt->data.psnr.psnr[0];
     }
   }
 
   // for performance reasons don't decode
-  virtual bool DoDecode() const { return false; }
+  bool DoDecode() const override { return false; }
 
   double min_psnr() const { return min_psnr_; }
 
@@ -169,7 +169,7 @@ TEST_P(VP9EncodePerfTest, PerfTest) {
 
         printf("{\n");
         printf("\t\"type\" : \"encode_perf_test\",\n");
-        printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+        printf("\t\"version\" : \"%s\",\n", vpx_codec_version_str());
         printf("\t\"videoName\" : \"%s\",\n", display_name.c_str());
         printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
         printf("\t\"totalFrames\" : %u,\n", frames);
@@ -183,6 +183,6 @@ TEST_P(VP9EncodePerfTest, PerfTest) {
   }
 }
 
-VP9_INSTANTIATE_TEST_CASE(VP9EncodePerfTest,
-                          ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_SUITE(VP9EncodePerfTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/encode_test_driver.cc b/media/libvpx/libvpx/test/encode_test_driver.cc
index 632c98f05a..770f410d80 100644
--- a/media/libvpx/libvpx/test/encode_test_driver.cc
+++ b/media/libvpx/libvpx/test/encode_test_driver.cc
@@ -8,9 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
@@ -51,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) {
   }
 }
 
-void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+void Encoder::EncodeFrame(VideoSource *video,
+                          const vpx_enc_frame_flags_t frame_flags) {
   if (video->img()) {
     EncodeFrameInternal(*video, frame_flags);
   } else {
@@ -69,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
 }
 
 void Encoder::EncodeFrameInternal(const VideoSource &video,
-                                  const unsigned long frame_flags) {
+                                  const vpx_enc_frame_flags_t frame_flags) {
   vpx_codec_err_t res;
   const vpx_image_t *img = video.img();
 
@@ -90,7 +92,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video,
 
 void Encoder::Flush() {
   const vpx_codec_err_t res =
-      vpx_codec_encode(&encoder_, NULL, 0, 0, 0, deadline_);
+      vpx_codec_encode(&encoder_, nullptr, 0, 0, 0, deadline_);
   if (!encoder_.priv)
     ASSERT_EQ(VPX_CODEC_ERROR, res) << EncoderError();
   else
@@ -128,6 +130,8 @@ static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) {
   bool match = (img1->fmt == img2->fmt) && (img1->cs == img2->cs) &&
                (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h);
 
+  if (!match) return false;
+
   const unsigned int width_y = img1->d_w;
   const unsigned int height_y = img1->d_h;
   unsigned int i;
@@ -166,7 +170,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
   ASSERT_TRUE(passes_ == 1 || passes_ == 2);
   for (unsigned int pass = 0; pass < passes_; pass++) {
-    last_pts_ = 0;
+    vpx_codec_pts_t last_pts = 0;
 
     if (passes_ == 1) {
       cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -177,9 +181,9 @@ void EncoderTest::RunLoop(VideoSource *video) {
     }
 
     BeginPassHook(pass);
-    testing::internal::scoped_ptr<Encoder> encoder(
+    std::unique_ptr<Encoder> encoder(
         codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_));
-    ASSERT_TRUE(encoder.get() != NULL);
+    ASSERT_NE(encoder.get(), nullptr);
 
     ASSERT_NO_FATAL_FAILURE(video->Begin());
     encoder->InitEncoder(video);
@@ -191,16 +195,18 @@ void EncoderTest::RunLoop(VideoSource *video) {
     if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) {
       dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
     }
-    testing::internal::scoped_ptr<Decoder> decoder(
+    std::unique_ptr<Decoder> decoder(
         codec_->CreateDecoder(dec_cfg, dec_init_flags));
     bool again;
     for (again = true; again; video->Next()) {
-      again = (video->img() != NULL);
+      again = (video->img() != nullptr);
 
       PreEncodeFrameHook(video);
       PreEncodeFrameHook(video, encoder.get());
       encoder->EncodeFrame(video, frame_flags_);
 
+      PostEncodeFrameHook(encoder.get());
+
       CxDataIterator iter = encoder->GetCxData();
 
       bool has_cxdata = false;
@@ -211,7 +217,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
         switch (pkt->kind) {
           case VPX_CODEC_CX_FRAME_PKT:
             has_cxdata = true;
-            if (decoder.get() != NULL && DoDecode()) {
+            if (decoder != nullptr && DoDecode()) {
+              PreDecodeFrameHook(video, decoder.get());
               vpx_codec_err_t res_dec = decoder->DecodeFrame(
                   (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
 
@@ -219,20 +226,22 @@ void EncoderTest::RunLoop(VideoSource *video) {
 
               has_dxdata = true;
             }
-            ASSERT_GE(pkt->data.frame.pts, last_pts_);
-            last_pts_ = pkt->data.frame.pts;
+            ASSERT_GE(pkt->data.frame.pts, last_pts);
+            last_pts = pkt->data.frame.pts;
             FramePktHook(pkt);
             break;
 
           case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
 
+          case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break;
+
           default: break;
         }
       }
 
       // Flush the decoder when there are no more fragments.
       if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
-        const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+        const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0);
         if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
       }
 
diff --git a/media/libvpx/libvpx/test/encode_test_driver.h b/media/libvpx/libvpx/test/encode_test_driver.h
index 09b1a78344..c9a7c154f2 100644
--- a/media/libvpx/libvpx/test/encode_test_driver.h
+++ b/media/libvpx/libvpx/test/encode_test_driver.h
@@ -7,19 +7,19 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_ENCODE_TEST_DRIVER_H_
-#define TEST_ENCODE_TEST_DRIVER_H_
+#ifndef VPX_TEST_ENCODE_TEST_DRIVER_H_
+#define VPX_TEST_ENCODE_TEST_DRIVER_H_
 
 #include <string>
 #include <vector>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
 
 namespace libvpx_test {
 
@@ -33,15 +33,24 @@ enum TestMode {
   kTwoPassGood,
   kTwoPassBest
 };
+
+#if CONFIG_REALTIME_ONLY
+#define ALL_TEST_MODES ::testing::Values(::libvpx_test::kRealTime)
+#define ONE_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime)
+#define ONE_OR_TWO_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime)
+#else
 #define ALL_TEST_MODES                                                        \
   ::testing::Values(::libvpx_test::kRealTime, ::libvpx_test::kOnePassGood,    \
                     ::libvpx_test::kOnePassBest, ::libvpx_test::kTwoPassGood, \
                     ::libvpx_test::kTwoPassBest)
-
 #define ONE_PASS_TEST_MODES                                                \
   ::testing::Values(::libvpx_test::kRealTime, ::libvpx_test::kOnePassGood, \
                     ::libvpx_test::kOnePassBest)
 
+#define ONE_OR_TWO_PASS_TEST_MODES \
+  ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kTwoPassGood)
+#endif
+
 #define TWO_PASS_TEST_MODES \
   ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kTwoPassBest)
 
@@ -49,7 +58,7 @@ enum TestMode {
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-      : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(nullptr) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
@@ -86,7 +95,7 @@ class TwopassStatsStore {
 // level of abstraction will be fleshed out as more tests are written.
 class Encoder {
  public:
-  Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+  Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
           const unsigned long init_flags, TwopassStatsStore *stats)
       : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {
     memset(&encoder_, 0, sizeof(encoder_));
@@ -103,7 +112,7 @@ class Encoder {
   }
   // This is a thin wrapper around vpx_codec_encode(), so refer to
   // vpx_encoder.h for its semantics.
-  void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+  void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags);
 
   // Convenience wrapper for EncodeFrame()
   void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
@@ -128,24 +137,56 @@ class Encoder {
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
   void Control(int ctrl_id, struct vpx_svc_parameters *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
+
+  void Control(int ctrl_id, struct vpx_svc_frame_drop *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, struct vpx_svc_spatial_layer_sync *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+#if CONFIG_VP9_ENCODER
+  void Control(int ctrl_id, vpx_rc_funcs_t *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
+  void Control(int ctrl_id, VpxTplGopStats *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#endif  // CONFIG_VP9_ENCODER
+
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
   void Control(int ctrl_id, vpx_active_map_t *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
-#endif
 
+  void Control(int ctrl_id, vpx_roi_map_t *arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+#endif
   void Config(const vpx_codec_enc_cfg_t *cfg) {
     const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
     cfg_ = *cfg;
   }
 
-  void set_deadline(unsigned long deadline) { deadline_ = deadline; }
+  void set_deadline(vpx_enc_deadline_t deadline) { deadline_ = deadline; }
 
  protected:
   virtual vpx_codec_iface_t *CodecInterface() const = 0;
@@ -157,14 +198,14 @@ class Encoder {
 
   // Encode an image
   void EncodeFrameInternal(const VideoSource &video,
-                           const unsigned long frame_flags);
+                           vpx_enc_frame_flags_t frame_flags);
 
   // Flush the encoder on EOS
   void Flush();
 
   vpx_codec_ctx_t encoder_;
   vpx_codec_enc_cfg_t cfg_;
-  unsigned long deadline_;
+  vpx_enc_deadline_t deadline_;
   unsigned long init_flags_;
   TwopassStatsStore *stats_;
 };
@@ -179,8 +220,7 @@ class Encoder {
 class EncoderTest {
  protected:
   explicit EncoderTest(const CodecFactory *codec)
-      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
-        last_pts_(0) {
+      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) {
     // Default to 1 thread.
     cfg_.g_threads = 1;
   }
@@ -212,12 +252,20 @@ class EncoderTest {
   virtual void PreEncodeFrameHook(VideoSource * /*video*/,
                                   Encoder * /*encoder*/) {}
 
+  virtual void PreDecodeFrameHook(VideoSource * /*video*/,
+                                  Decoder * /*decoder*/) {}
+
+  virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {}
+
   // Hook to be called on every compressed data packet.
   virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}
 
   // Hook to be called on every PSNR packet.
   virtual void PSNRPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}
 
+  // Hook to be called on every first pass stats packet.
+  virtual void StatsPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {}
+
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
     return !(::testing::Test::HasFatalFailure() || abort_);
@@ -225,7 +273,7 @@ class EncoderTest {
 
   const CodecFactory *codec_;
   // Hook to determine whether to decode frame after encoding
-  virtual bool DoDecode() const { return 1; }
+  virtual bool DoDecode() const { return true; }
 
   // Hook to handle encode/decode mismatch
   virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2);
@@ -252,13 +300,12 @@ class EncoderTest {
   vpx_codec_enc_cfg_t cfg_;
   vpx_codec_dec_cfg_t dec_cfg_;
   unsigned int passes_;
-  unsigned long deadline_;
+  vpx_enc_deadline_t deadline_;
   TwopassStatsStore stats_;
   unsigned long init_flags_;
-  unsigned long frame_flags_;
-  vpx_codec_pts_t last_pts_;
+  vpx_enc_frame_flags_t frame_flags_;
 };
 
 }  // namespace libvpx_test
 
-#endif  // TEST_ENCODE_TEST_DRIVER_H_
+#endif  // VPX_TEST_ENCODE_TEST_DRIVER_H_
diff --git a/media/libvpx/libvpx/test/error_resilience_test.cc b/media/libvpx/libvpx/test/error_resilience_test.cc
index 030b67c572..9bd43b72bf 100644
--- a/media/libvpx/libvpx/test/error_resilience_test.cc
+++ b/media/libvpx/libvpx/test/error_resilience_test.cc
@@ -8,11 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "vpx_config.h"
 
 namespace {
 
@@ -30,7 +31,7 @@ class ErrorResilienceTestLarge
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLarge() {}
+  ~ErrorResilienceTestLarge() override = default;
 
   void Reset() {
     error_nframes_ = 0;
@@ -38,19 +39,19 @@ class ErrorResilienceTestLarge
     pattern_switch_ = 0;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = 0.0;
     nframes_ = 0;
     mismatch_psnr_ = 0.0;
     mismatch_nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
@@ -90,7 +91,7 @@ class ErrorResilienceTestLarge
     return frame_flags;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video) override {
     frame_flags_ &=
         ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF);
     // For temporal layer case.
@@ -129,21 +130,21 @@ class ErrorResilienceTestLarge
     return 0.0;
   }
 
-  virtual bool DoDecode() const {
+  bool DoDecode() const override {
     if (error_nframes_ > 0 &&
         (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {
       for (unsigned int i = 0; i < error_nframes_; ++i) {
         if (error_frames_[i] == nframes_ - 1) {
           std::cout << "             Skipping decoding frame: "
                     << error_frames_[i] << "\n";
-          return 0;
+          return false;
         }
       }
     }
-    return 1;
+    return true;
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
@@ -194,6 +195,10 @@ class ErrorResilienceTestLarge
 };
 
 TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
+#if CONFIG_REALTIME_ONLY
+  GTEST_SKIP()
+      << "Non-zero g_lag_in_frames is unsupported with CONFIG_REALTIME_ONLY";
+#else
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
   cfg_.rc_target_bitrate = 2000;
@@ -222,6 +227,7 @@ TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
     EXPECT_GE(psnr_ratio, 0.9);
     EXPECT_LE(psnr_ratio, 1.1);
   }
+#endif  // CONFIG_REALTIME_ONLY
 }
 
 // Check for successful decoding and no encoder/decoder mismatch
@@ -381,7 +387,7 @@ class ErrorResilienceTestLargeCodecControls
     Reset();
   }
 
-  virtual ~ErrorResilienceTestLargeCodecControls() {}
+  ~ErrorResilienceTestLargeCodecControls() override = default;
 
   void Reset() {
     last_pts_ = 0;
@@ -393,7 +399,7 @@ class ErrorResilienceTestLargeCodecControls
     duration_ = 0.0;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
@@ -460,8 +466,8 @@ class ErrorResilienceTestLargeCodecControls
     return layer_id;
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (cfg_.ts_number_layers > 1) {
       int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
       int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
@@ -476,7 +482,7 @@ class ErrorResilienceTestLargeCodecControls
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     // Time since last timestamp = duration.
     vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
     if (duration > 1) {
@@ -496,7 +502,7 @@ class ErrorResilienceTestLargeCodecControls
     ++tot_frame_number_;
   }
 
-  virtual void EndPassHook(void) {
+  void EndPassHook() override {
     duration_ = (last_pts_ + 1) * timebase_;
     if (cfg_.ts_number_layers > 1) {
       for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
@@ -573,10 +579,10 @@ TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) {
   }
 }
 
-VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
-                          ::testing::Values(true));
-VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls,
-                          ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
-                          ::testing::Values(true));
+VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                           ::testing::Values(true));
+VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLargeCodecControls,
+                           ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
+                           ::testing::Values(true));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/external_frame_buffer_test.cc b/media/libvpx/libvpx/test/external_frame_buffer_test.cc
index f9686695a7..7b9a836fbc 100644
--- a/media/libvpx/libvpx/test/external_frame_buffer_test.cc
+++ b/media/libvpx/libvpx/test/external_frame_buffer_test.cc
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
 #include "./vpx_config.h"
@@ -34,7 +35,8 @@ struct ExternalFrameBuffer {
 // Class to manipulate a list of external frame buffers.
 class ExternalFrameBufferList {
  public:
-  ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {}
+  ExternalFrameBufferList()
+      : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(nullptr) {}
 
   virtual ~ExternalFrameBufferList() {
     for (int i = 0; i < num_buffers_; ++i) {
@@ -49,7 +51,7 @@ class ExternalFrameBufferList {
 
     num_buffers_ = num_buffers;
     ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
-    EXPECT_TRUE(ext_fb_list_ != NULL);
+    EXPECT_NE(ext_fb_list_, nullptr);
     memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
     return true;
   }
@@ -59,7 +61,7 @@ class ExternalFrameBufferList {
   // frame buffer is in use by libvpx. Finally sets |fb| to point to the
   // external frame buffer. Returns < 0 on an error.
   int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    EXPECT_NE(fb, nullptr);
     const int idx = FindFreeBufferIndex();
     if (idx == num_buffers_) return -1;
 
@@ -71,19 +73,21 @@ class ExternalFrameBufferList {
     }
 
     SetFrameBuffer(idx, fb);
+
+    num_used_buffers_++;
     return 0;
   }
 
   // Test function that will not allocate any data for the frame buffer.
   // Returns < 0 on an error.
   int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) {
-    EXPECT_TRUE(fb != NULL);
+    EXPECT_NE(fb, nullptr);
     const int idx = FindFreeBufferIndex();
     if (idx == num_buffers_) return -1;
 
     if (ext_fb_list_[idx].size < min_size) {
       delete[] ext_fb_list_[idx].data;
-      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].data = nullptr;
       ext_fb_list_[idx].size = min_size;
     }
 
@@ -94,25 +98,26 @@ class ExternalFrameBufferList {
   // Marks the external frame buffer that |fb| is pointing to as free.
   // Returns < 0 on an error.
   int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) {
-    if (fb == NULL) {
-      EXPECT_TRUE(fb != NULL);
+    if (fb == nullptr) {
+      EXPECT_NE(fb, nullptr);
       return -1;
     }
     ExternalFrameBuffer *const ext_fb =
         reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
-    if (ext_fb == NULL) {
-      EXPECT_TRUE(ext_fb != NULL);
+    if (ext_fb == nullptr) {
+      EXPECT_NE(ext_fb, nullptr);
       return -1;
     }
     EXPECT_EQ(1, ext_fb->in_use);
     ext_fb->in_use = 0;
+    num_used_buffers_--;
     return 0;
   }
 
-  // Checks that the ximage data is contained within the external frame buffer
-  // private data passed back in the ximage.
-  void CheckXImageFrameBuffer(const vpx_image_t *img) {
-    if (img->fb_priv != NULL) {
+  // Checks that the vpx_image_t data is contained within the external frame
+  // buffer private data passed back in the vpx_image_t.
+  void CheckImageFrameBuffer(const vpx_image_t *img) {
+    if (img->fb_priv != nullptr) {
       const struct ExternalFrameBuffer *const ext_fb =
           reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
 
@@ -121,6 +126,8 @@ class ExternalFrameBufferList {
     }
   }
 
+  int num_used_buffers() const { return num_used_buffers_; }
+
  private:
   // Returns the index of the first free frame buffer. Returns |num_buffers_|
   // if there are no free frame buffers.
@@ -136,7 +143,7 @@ class ExternalFrameBufferList {
   // Sets |fb| to an external frame buffer. idx is the index into the frame
   // buffer list.
   void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) {
-    ASSERT_TRUE(fb != NULL);
+    ASSERT_NE(fb, nullptr);
     fb->data = ext_fb_list_[idx].data;
     fb->size = ext_fb_list_[idx].size;
     ASSERT_EQ(0, ext_fb_list_[idx].in_use);
@@ -145,6 +152,7 @@ class ExternalFrameBufferList {
   }
 
   int num_buffers_;
+  int num_used_buffers_;
   ExternalFrameBuffer *ext_fb_list_;
 };
 
@@ -200,15 +208,14 @@ class ExternalFrameBufferMD5Test
  protected:
   ExternalFrameBufferMD5Test()
       : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
-        md5_file_(NULL), num_buffers_(0) {}
+        md5_file_(nullptr), num_buffers_(0) {}
 
-  virtual ~ExternalFrameBufferMD5Test() {
-    if (md5_file_ != NULL) fclose(md5_file_);
+  ~ExternalFrameBufferMD5Test() override {
+    if (md5_file_ != nullptr) fclose(md5_file_);
   }
 
-  virtual void PreDecodeFrameHook(
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
     if (num_buffers_ > 0 && video.frame_number() == 0) {
       // Have libvpx use frame buffers we create.
       ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
@@ -220,13 +227,13 @@ class ExternalFrameBufferMD5Test
 
   void OpenMD5File(const std::string &md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
-                                   << md5_file_name_;
+    ASSERT_NE(md5_file_, nullptr)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -273,26 +280,30 @@ class ExternalFrameBufferMD5Test
 
 #if CONFIG_WEBM_IO
 const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm";
 
 // Class for testing passing in external frame buffers to libvpx.
 class ExternalFrameBufferTest : public ::testing::Test {
  protected:
-  ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
+  ExternalFrameBufferTest()
+      : video_(nullptr), decoder_(nullptr), num_buffers_(0) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
-    ASSERT_TRUE(video_ != NULL);
+    ASSERT_NE(video_, nullptr);
     video_->Init();
     video_->Begin();
 
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    ASSERT_NE(decoder_, nullptr);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete decoder_;
+    decoder_ = nullptr;
     delete video_;
+    video_ = nullptr;
   }
 
   // Passes the external frame buffer information to libvpx.
@@ -316,7 +327,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
   }
 
   vpx_codec_err_t DecodeRemainingFrames() {
-    for (; video_->cxdata() != NULL; video_->Next()) {
+    for (; video_->cxdata() != nullptr; video_->Next()) {
       const vpx_codec_err_t res =
           decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
       if (res != VPX_CODEC_OK) return res;
@@ -325,14 +336,13 @@ class ExternalFrameBufferTest : public ::testing::Test {
     return VPX_CODEC_OK;
   }
 
- private:
   void CheckDecodedFrames() {
     libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data
-    while ((img = dec_iter.Next()) != NULL) {
-      fb_list_.CheckXImageFrameBuffer(img);
+    while ((img = dec_iter.Next()) != nullptr) {
+      fb_list_.CheckImageFrameBuffer(img);
     }
   }
 
@@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test {
   int num_buffers_;
   ExternalFrameBufferList fb_list_;
 };
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+  void SetUp() override {
+    video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile);
+    ASSERT_NE(video_, nullptr);
+    video_->Init();
+    video_->Begin();
+
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+    ASSERT_NE(decoder_, nullptr);
+  }
+
+  virtual void CheckFrameBufferRelease() {
+    TearDown();
+    ASSERT_EQ(0, fb_list_.num_used_buffers());
+  }
+};
 #endif  // CONFIG_WEBM_IO
 
 // This test runs through the set of test vectors, and decodes them.
@@ -364,7 +393,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
 #endif
 
   // Open compressed video file.
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   if (filename.substr(filename.length() - 3, 3) == "ivf") {
     video.reset(new libvpx_test::IVFVideoSource(filename));
   } else {
@@ -376,7 +405,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
     return;
 #endif
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
 
   // Construct md5 file name.
@@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
             SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
                                     release_vp9_frame_buffer));
   ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame());
+  // Only run this on long clips. Decoding a very short clip will return
+  // VPX_CODEC_OK even with only 2 buffers.
   ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames());
 }
 
@@ -451,13 +482,14 @@ TEST_F(ExternalFrameBufferTest, NullGetFunction) {
   const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
   ASSERT_EQ(
       VPX_CODEC_INVALID_PARAM,
-      SetFrameBufferFunctions(num_buffers, NULL, release_vp9_frame_buffer));
+      SetFrameBufferFunctions(num_buffers, nullptr, release_vp9_frame_buffer));
 }
 
 TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
   const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
-  ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
-            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL));
+  ASSERT_EQ(
+      VPX_CODEC_INVALID_PARAM,
+      SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, nullptr));
 }
 
 TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
@@ -467,9 +499,18 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
             SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
                                     release_vp9_frame_buffer));
 }
+
+TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
+  ASSERT_EQ(VPX_CODEC_OK,
+            SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer,
+                                    release_vp9_frame_buffer));
+  ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+  CheckFrameBufferRelease();
+}
 #endif  // CONFIG_WEBM_IO
 
-VP9_INSTANTIATE_TEST_CASE(
+VP9_INSTANTIATE_TEST_SUITE(
     ExternalFrameBufferMD5Test,
     ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                         libvpx_test::kVP9TestVectors +
diff --git a/media/libvpx/libvpx/test/fdct4x4_test.cc b/media/libvpx/libvpx/test/fdct4x4_test.cc
deleted file mode 100644
index 444b0209d5..0000000000
--- a/media/libvpx/libvpx/test/fdct4x4_test.cc
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vpx/vpx_codec.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        int tx_type);
-
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
-
-void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int /*tx_type*/) {
-  vpx_fdct4x4_c(in, out, stride);
-}
-
-void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
-  vp9_fht4x4_c(in, out, stride, tx_type);
-}
-
-void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int /*tx_type*/) {
-  vp9_fwht4x4_c(in, out, stride);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);
-}
-
-void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);
-}
-
-void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
-}
-
-void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
-}
-
-void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);
-}
-
-void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);
-}
-
-#if HAVE_SSE2
-void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
-}
-
-void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
-}
-#endif  // HAVE_SSE2
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-class Trans4x4TestBase {
- public:
-  virtual ~Trans4x4TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(test_input_block, test_temp_block, pitch_));
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        ASSERT_EQ(VPX_BITS_8, bit_depth_);
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        if (max_error < error) max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
-        << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit;
-
-    EXPECT_GE(count_test_block * limit, total_error)
-        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
-        << " per block";
-  }
-
-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-      }
-
-      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-    }
-  }
-
-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
-      }
-      if (i == 0) {
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
-      } else if (i == 1) {
-        for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
-      }
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(
-          RunFwdTxfm(input_extreme_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
-    }
-  }
-
-  void RunInvAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          in[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          in[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
-
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        EXPECT_GE(static_cast<uint32_t>(limit), error)
-            << "Error: 4x4 IDCT has error " << error << " at index " << j;
-      }
-    }
-  }
-
-  int pitch_;
-  int tx_type_;
-  FhtFunc fwd_txfm_ref;
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-};
-
-class Trans4x4DCT : public Trans4x4TestBase,
-                    public ::testing::TestWithParam<Dct4x4Param> {
- public:
-  virtual ~Trans4x4DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = fdct4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); }
-
-TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4HT : public Trans4x4TestBase,
-                   public ::testing::TestWithParam<Ht4x4Param> {
- public:
-  virtual ~Trans4x4HT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = fht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, tx_type_);
-  }
-
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, tx_type_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhtFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); }
-
-TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-
-class Trans4x4WHT : public Trans4x4TestBase,
-                    public ::testing::TestWithParam<Dct4x4Param> {
- public:
-  virtual ~Trans4x4WHT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = fwht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
-    mask_ = (1 << bit_depth_) - 1;
-  }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); }
-
-TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
-using std::tr1::make_tuple;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4DCT,
-    ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_c,
-                                                     &vpx_idct4x4_16_add_c, 0,
-                                                     VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    C, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
-#else
-INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
-                        ::testing::Values(make_tuple(&vp9_fwht4x4_c,
-                                                     &vpx_iwht4x4_16_add_c, 0,
-                                                     VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_c,
-                                                     &vpx_idct4x4_16_add_neon,
-                                                     0, VPX_BITS_8)));
-#if !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    NEON, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
-#endif  // !CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
-#endif
-
-#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_sse2,
-                                                     &vpx_idct4x4_16_add_sse2,
-                                                     0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
-#endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4DCT,
-    ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
-        make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
-#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct4x4_msa,
-                                                     &vpx_idct4x4_16_add_msa, 0,
-                                                     VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
-    MSA, Trans4x4HT,
-    ::testing::Values(
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
-#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-}  // namespace
diff --git a/media/libvpx/libvpx/test/fdct8x8_test.cc b/media/libvpx/libvpx/test/fdct8x8_test.cc
index e403404906..4c2cbf43fd 100644
--- a/media/libvpx/libvpx/test/fdct8x8_test.cc
+++ b/media/libvpx/libvpx/test/fdct8x8_test.cc
@@ -11,8 +11,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
@@ -22,6 +23,7 @@
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
+#include "vpx_config.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -36,16 +38,16 @@ const double kPi = 3.141592653589793238462643383279502884;
 const int kSignBiasMaxDiff255 = 1500;
 const int kSignBiasMaxDiff15 = 10000;
 
-typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
-typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
-                        int tx_type);
+using FdctFunc = void (*)(const int16_t *in, tran_low_t *out, int stride);
+using IdctFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride);
+using FhtFunc = void (*)(const int16_t *in, tran_low_t *out, int stride,
+                         int tx_type);
+using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride,
+                         int tx_type);
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
-typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
+using Dct8x8Param = std::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t>;
+using Ht8x8Param = std::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t>;
+using Idct8x8Param = std::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>;
 
 void reference_8x8_dct_1d(const double in[8], double out[8]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
@@ -88,52 +90,64 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
+  vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 
 void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
 }
 
 void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
-  vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+  vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
 }
 
 #if HAVE_SSE2
 
 void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
+  vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 
 void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 
 void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
 }
 
 void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
+  vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
 }
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+// Visual Studio 2022 (cl.exe) < 17.12.3 targeting AArch64 with optimizations
+// enabled produces invalid code in RunExtremalCheck() and
+// RunInvAccuracyCheck(). See:
+// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER < 194234435 && \
+    defined(_M_ARM64) && !defined(__clang__)
+#define AOM_WORK_AROUND_MSVC_BUG_10369786
+#endif
+
+#ifdef AOM_WORK_AROUND_MSVC_BUG_10369786
+#pragma optimize("", off)
+#endif
 class FwdTrans8x8TestBase {
  public:
-  virtual ~FwdTrans8x8TestBase() {}
+  virtual ~FwdTrans8x8TestBase() = default;
 
  protected:
   virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
@@ -169,7 +183,7 @@ class FwdTrans8x8TestBase {
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
       const int max_diff = kSignBiasMaxDiff255;
-      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
+      ASSERT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-255, 255] at index " << j
@@ -200,7 +214,7 @@ class FwdTrans8x8TestBase {
     for (int j = 0; j < 64; ++j) {
       const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
       const int max_diff = kSignBiasMaxDiff15;
-      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
+      ASSERT_LT(diff, max_diff << (bit_depth_ - 8))
           << "Error: 8x8 FDCT/FHT has a sign bias > "
           << 1. * max_diff / count_test_block * 100 << "%"
           << " for input range [-15, 15] at index " << j
@@ -257,7 +271,7 @@ class FwdTrans8x8TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -274,11 +288,11 @@ class FwdTrans8x8TestBase {
       }
     }
 
-    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
+    ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error)
         << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
         << " roundtrip error > 1";
 
-    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
+    ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
         << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
         << "error > 1/5 per block";
   }
@@ -340,7 +354,7 @@ class FwdTrans8x8TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -359,17 +373,17 @@ class FwdTrans8x8TestBase {
         total_coeff_error += abs(coeff_diff);
       }
 
-      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
+      ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
-          << "an individual roundtrip error > 1";
+          << " an individual roundtrip error > 1";
 
-      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
+      ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
           << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
           << " roundtrip error > 1/5 per block";
 
-      EXPECT_EQ(0, total_coeff_error)
+      ASSERT_EQ(0, total_coeff_error)
           << "Error: Extremal 8x8 FDCT/FHT has"
-          << "overflow issues in the intermediate steps > 1";
+          << " overflow issues in the intermediate steps > 1";
     }
   }
 
@@ -413,7 +427,7 @@ class FwdTrans8x8TestBase {
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -425,7 +439,7 @@ class FwdTrans8x8TestBase {
         const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
-        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+        ASSERT_GE(1u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 IDCT has error " << error << " at index " << j;
       }
     }
@@ -455,7 +469,7 @@ class FwdTrans8x8TestBase {
       for (int j = 0; j < kNumCoeffs; ++j) {
         const int32_t diff = coeff[j] - coeff_r[j];
         const uint32_t error = diff * diff;
-        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+        ASSERT_GE(9u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 DCT has error " << error << " at index " << j;
       }
     }
@@ -497,9 +511,9 @@ class FwdTrans8x8TestBase {
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -511,8 +525,8 @@ class FwdTrans8x8TestBase {
         const int diff = dst[j] - ref[j];
 #endif
         const uint32_t error = diff * diff;
-        EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error
-                             << " at index " << j;
+        ASSERT_EQ(0u, error)
+            << "Error: 8x8 IDCT has error " << error << " at index " << j;
       }
     }
   }
@@ -522,13 +536,16 @@ class FwdTrans8x8TestBase {
   vpx_bit_depth_t bit_depth_;
   int mask_;
 };
+#ifdef AOM_WORK_AROUND_MSVC_BUG_10369786
+#pragma optimize("", on)
+#endif
 
 class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
                        public ::testing::TestWithParam<Dct8x8Param> {
  public:
-  virtual ~FwdTrans8x8DCT() {}
+  ~FwdTrans8x8DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -538,13 +555,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
 
@@ -565,9 +582,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
 class FwdTrans8x8HT : public FwdTrans8x8TestBase,
                       public ::testing::TestWithParam<Ht8x8Param> {
  public:
-  virtual ~FwdTrans8x8HT() {}
+  ~FwdTrans8x8HT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -577,13 +594,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }
 
@@ -597,12 +614,13 @@ TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); }
 
 TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); }
 
+#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 class InvTrans8x8DCT : public FwdTrans8x8TestBase,
                        public ::testing::TestWithParam<Idct8x8Param> {
  public:
-  virtual ~InvTrans8x8DCT() {}
+  ~InvTrans8x8DCT() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -611,41 +629,44 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase,
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }
-  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {}
+  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/,
+                  int /*stride*/) override {}
 
   IdctFunc ref_txfm_;
   IdctFunc inv_txfm_;
   int thresh_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans8x8DCT);
 
 TEST_P(InvTrans8x8DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
+#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, FwdTrans8x8DCT,
     ::testing::Values(
         make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8),
         make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
         make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12)));
 #else
-INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_c,
-                                                     &vpx_idct8x8_64_add_c, 0,
-                                                     VPX_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(C, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_c,
+                                                      &vpx_idct8x8_64_add_c, 0,
+                                                      VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
@@ -661,7 +682,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
 #else
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
@@ -671,32 +692,28 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
-#if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_c,
-                                                     &vpx_idct8x8_64_add_neon,
-                                                     0, VPX_BITS_8)));
-#else   // !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
-                                                     &vpx_idct8x8_64_add_neon,
-                                                     0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(NEON, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
+                                                      &vpx_idct8x8_64_add_neon,
+                                                      0, VPX_BITS_8)));
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
     NEON, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_sse2,
-                                                     &vpx_idct8x8_64_add_sse2,
-                                                     0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(SSE2, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_sse2,
+                                                      &vpx_idct8x8_64_add_sse2,
+                                                      0, VPX_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
@@ -706,7 +723,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_c, 0,
                                  VPX_BITS_8),
@@ -719,7 +736,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_highbd_fdct8x8_sse2,
                                  &idct8x8_64_add_12_sse2, 12, VPX_BITS_12)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
@@ -729,7 +746,7 @@ INSTANTIATE_TEST_CASE_P(
 
 // Optimizations take effect at a threshold of 6201, so we use a value close to
 // that to test both branches.
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, InvTrans8x8DCT,
     ::testing::Values(
         make_tuple(&idct8x8_12_add_10_c, &idct8x8_12_add_10_sse2, 6225,
@@ -740,20 +757,20 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+#if HAVE_SSSE3 && VPX_ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
     !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3,
-                                                     &vpx_idct8x8_64_add_ssse3,
-                                                     0, VPX_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(SSSE3, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3,
+                                                      &vpx_idct8x8_64_add_sse2,
+                                                      0, VPX_BITS_8)));
 #endif
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(MSA, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&vpx_fdct8x8_msa,
-                                                     &vpx_idct8x8_64_add_msa, 0,
-                                                     VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(MSA, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_msa,
+                                                      &vpx_idct8x8_64_add_msa,
+                                                      0, VPX_BITS_8)));
+INSTANTIATE_TEST_SUITE_P(
     MSA, FwdTrans8x8HT,
     ::testing::Values(
         make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8),
@@ -761,4 +778,18 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_c,
+                                                      &vpx_idct8x8_64_add_vsx,
+                                                      0, VPX_BITS_8)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+                                                      &vpx_idct8x8_64_add_c, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc
index 5a9b166e5b..a86ca9a42a 100644
--- a/media/libvpx/libvpx/test/frame_size_tests.cc
+++ b/media/libvpx/libvpx/test/frame_size_tests.cc
@@ -7,34 +7,95 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <memory>
+
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
+#include "test/register_state_check.h"
 #include "test/video_source.h"
+#include "vpx_config.h"
 
 namespace {
 
+class EncoderWithExpectedError : public ::libvpx_test::Encoder {
+ public:
+  EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline,
+                           const unsigned long init_flags,  // NOLINT
+                           ::libvpx_test::TwopassStatsStore *stats)
+      : ::libvpx_test::Encoder(cfg, deadline, init_flags, stats) {}
+  // This overrides with expected error code.
+  void EncodeFrame(::libvpx_test::VideoSource *video,
+                   const unsigned long frame_flags,  // NOLINT
+                   const vpx_codec_err_t expected_err) {
+    if (video->img()) {
+      EncodeFrameInternal(*video, frame_flags, expected_err);
+    } else {
+      Flush();
+    }
+
+    // Handle twopass stats
+    ::libvpx_test::CxDataIterator iter = GetCxData();
+
+    while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+      if (pkt->kind != VPX_CODEC_STATS_PKT) continue;
+
+      stats_->Append(*pkt);
+    }
+  }
+
+ protected:
+  void EncodeFrameInternal(const ::libvpx_test::VideoSource &video,
+                           const unsigned long frame_flags,  // NOLINT
+                           const vpx_codec_err_t expected_err) {
+    vpx_codec_err_t res;
+    const vpx_image_t *img = video.img();
+
+    // Handle frame resizing
+    if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) {
+      cfg_.g_w = img->d_w;
+      cfg_.g_h = img->d_h;
+      res = vpx_codec_enc_config_set(&encoder_, &cfg_);
+      ASSERT_EQ(res, VPX_CODEC_OK) << EncoderError();
+    }
+
+    // Encode the frame
+    API_REGISTER_STATE_CHECK(res = vpx_codec_encode(&encoder_, img, video.pts(),
+                                                    video.duration(),
+                                                    frame_flags, deadline_));
+    ASSERT_EQ(expected_err, res) << EncoderError();
+  }
+
+  vpx_codec_iface_t *CodecInterface() const override {
+#if CONFIG_VP9_ENCODER
+    return &vpx_codec_vp9_cx_algo;
+#else
+    return nullptr;
+#endif
+  }
+};
+
 class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
                                public ::testing::Test {
  protected:
   VP9FrameSizeTestsLarge()
       : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {}
-  virtual ~VP9FrameSizeTestsLarge() {}
+  ~VP9FrameSizeTestsLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kRealTime);
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 7);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -43,28 +104,98 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
     }
   }
 
-  int expected_res_;
+  using ::libvpx_test::EncoderTest::RunLoop;
+  virtual void RunLoop(::libvpx_test::VideoSource *video,
+                       const vpx_codec_err_t expected_err) {
+    stats_.Reset();
+
+    ASSERT_TRUE(passes_ == 1 || passes_ == 2);
+    for (unsigned int pass = 0; pass < passes_; pass++) {
+      vpx_codec_pts_t last_pts = 0;
+
+      if (passes_ == 1) {
+        cfg_.g_pass = VPX_RC_ONE_PASS;
+      } else if (pass == 0) {
+        cfg_.g_pass = VPX_RC_FIRST_PASS;
+      } else {
+        cfg_.g_pass = VPX_RC_LAST_PASS;
+      }
+
+      BeginPassHook(pass);
+      std::unique_ptr<EncoderWithExpectedError> encoder(
+          new EncoderWithExpectedError(cfg_, deadline_, init_flags_, &stats_));
+      ASSERT_NE(encoder.get(), nullptr);
+
+      ASSERT_NO_FATAL_FAILURE(video->Begin());
+      encoder->InitEncoder(video);
+      ASSERT_FALSE(::testing::Test::HasFatalFailure());
+      for (bool again = true; again; video->Next()) {
+        again = (video->img() != nullptr);
+
+        PreEncodeFrameHook(video, encoder.get());
+        encoder->EncodeFrame(video, frame_flags_, expected_err);
+
+        PostEncodeFrameHook(encoder.get());
+
+        ::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+
+        while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+          pkt = MutateEncoderOutputHook(pkt);
+          again = true;
+          switch (pkt->kind) {
+            case VPX_CODEC_CX_FRAME_PKT:
+              ASSERT_GE(pkt->data.frame.pts, last_pts);
+              last_pts = pkt->data.frame.pts;
+              FramePktHook(pkt);
+              break;
+
+            case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
+            case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break;
+            default: break;
+          }
+        }
+
+        if (!Continue()) break;
+      }
+
+      EndPassHook();
+
+      if (!Continue()) break;
+    }
+  }
+
+  vpx_codec_err_t expected_res_;
 };
 
 TEST_F(VP9FrameSizeTestsLarge, TestInvalidSizes) {
+#ifdef CHROMIUM
+  GTEST_SKIP() << "16K framebuffers are not supported by Chromium's allocator.";
+#else
   ::libvpx_test::RandomVideoSource video;
 
 #if CONFIG_SIZE_LIMIT
   video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16);
   video.set_limit(2);
-  expected_res_ = VPX_CODEC_CORRUPT_FRAME;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  expected_res_ = VPX_CODEC_MEM_ERROR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video, expected_res_));
+#endif
+
 #endif
 }
 
 TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
+#ifdef CHROMIUM
+  GTEST_SKIP()
+      << "Under Chromium's configuration the allocator is unable to provide"
+         "the space required for a single frame at the maximum resolution.";
+#else
   ::libvpx_test::RandomVideoSource video;
 
 #if CONFIG_SIZE_LIMIT
   video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
   video.set_limit(2);
   expected_res_ = VPX_CODEC_OK;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video));
 #else
 // This test produces a pretty large single frame allocation,  (roughly
 // 25 megabits). The encoder allocates a good number of these frames
@@ -73,15 +204,17 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) {
 // size or almost 1 gig of memory.
 // In total the allocations will exceed 2GiB which may cause a failure with
 // mingw + wine, use a smaller size in that case.
-#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__)
+#if defined(_WIN32) && !defined(_WIN64)
   video.SetSize(4096, 3072);
 #else
   video.SetSize(4096, 4096);
 #endif
   video.set_limit(2);
   expected_res_ = VPX_CODEC_OK;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video));
 #endif
+
+#endif  // defined(CHROMIUM)
 }
 
 TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) {
@@ -90,6 +223,6 @@ TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) {
   video.SetSize(1, 1);
   video.set_limit(2);
   expected_res_ = VPX_CODEC_OK;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video));
 }
 }  // namespace
diff --git a/media/libvpx/libvpx/test/hadamard_test.cc b/media/libvpx/libvpx/test/hadamard_test.cc
index e7715958ed..e0cc8f30c2 100644
--- a/media/libvpx/libvpx/test/hadamard_test.cc
+++ b/media/libvpx/libvpx/test/hadamard_test.cc
@@ -10,26 +10,29 @@
 
 #include <algorithm>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_timer.h"
 
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
+#include "vpx_config.h"
 
 namespace {
 
 using ::libvpx_test::ACMRandom;
 
-typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
+using HadamardFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                              tran_low_t *b);
 
-void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
-  int16_t b[8];
+void hadamard_loop(const tran_low_t *a, tran_low_t *out) {
+  tran_low_t b[8];
   for (int i = 0; i < 8; i += 2) {
-    b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
-    b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
+    b[i + 0] = a[i * 8] + a[(i + 1) * 8];
+    b[i + 1] = a[i * 8] - a[(i + 1) * 8];
   }
-  int16_t c[8];
+  tran_low_t c[8];
   for (int i = 0; i < 8; i += 4) {
     c[i + 0] = b[i + 0] + b[i + 2];
     c[i + 1] = b[i + 1] + b[i + 3];
@@ -46,18 +49,19 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
   out[5] = c[3] - c[7];
 }
 
-void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
-  int16_t buf[64];
+void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
+  tran_low_t input[64];
+  tran_low_t buf[64];
   for (int i = 0; i < 8; ++i) {
-    hadamard_loop(a + i, a_stride, buf + i * 8);
-  }
-
-  for (int i = 0; i < 8; ++i) {
-    hadamard_loop(buf + i, 8, b + i * 8);
+    for (int j = 0; j < 8; ++j) {
+      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+    }
   }
+  for (int i = 0; i < 8; ++i) hadamard_loop(input + i, buf + i * 8);
+  for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, b + i * 8);
 }
 
-void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
   /* The source is a 16x16 block. The destination is rearranged to 8x32.
    * Input is 9 bit. */
   reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
@@ -68,16 +72,16 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
   /* Overlay the 8x8 blocks and combine. */
   for (int i = 0; i < 64; ++i) {
     /* 8x8 steps the range up to 15 bits. */
-    const int16_t a0 = b[0];
-    const int16_t a1 = b[64];
-    const int16_t a2 = b[128];
-    const int16_t a3 = b[192];
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[64];
+    const tran_low_t a2 = b[128];
+    const tran_low_t a3 = b[192];
 
     /* Prevent the result from escaping int16_t. */
-    const int16_t b0 = (a0 + a1) >> 1;
-    const int16_t b1 = (a0 - a1) >> 1;
-    const int16_t b2 = (a2 + a3) >> 1;
-    const int16_t b3 = (a2 - a3) >> 1;
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;
 
     /* Store a 16 bit value. */
     b[0] = b0 + b2;
@@ -89,132 +93,289 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
   }
 }
 
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+void reference_hadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+  reference_hadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
+  reference_hadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
+  reference_hadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
+  reference_hadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
+
+  for (int i = 0; i < 256; ++i) {
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[256];
+    const tran_low_t a2 = b[512];
+    const tran_low_t a3 = b[768];
+
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    b[0] = b0 + b2;
+    b[256] = b1 + b3;
+    b[512] = b0 - b2;
+    b[768] = b1 - b3;
+
+    ++b;
+  }
+}
+
+struct HadamardFuncWithSize {
+  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
+  HadamardFunc func;
+  int block_size;
+};
+
+std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+  return os << "block size: " << hfs.block_size;
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
  public:
-  virtual void SetUp() {
-    h_func_ = GetParam();
+  void SetUp() override {
+    h_func_ = GetParam().func;
+    bwh_ = GetParam().block_size;
+    block_size_ = bwh_ * bwh_;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
+  // The Rand() function generates values in the range [-((1 << BitDepth) - 1),
+  // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform
+  // is the residual pixel, which is defined as 'source pixel - predicted
+  // pixel'. Source pixel and predicted pixel take values in the range
+  // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from
+  // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1).
+  virtual int16_t Rand() = 0;
+
+  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
+                         int bwh) {
+    if (bwh == 32)
+      reference_hadamard32x32(a, a_stride, b);
+    else if (bwh == 16)
+      reference_hadamard16x16(a, a_stride, b);
+    else
+      reference_hadamard8x8(a, a_stride, b);
+  }
+
+  void CompareReferenceRandom() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+
+    ReferenceHadamard(a, bwh_, b_ref, bwh_);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + block_size_);
+    std::sort(b_ref, b_ref + block_size_);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+
+  void ExtremeValuesTest() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < 2; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      const int sign = (i == 0) ? 1 : -1;
+      for (int j = 0; j < kMaxBlockSize; ++j)
+        input_extreme_block[j] = sign * 255;
+
+      ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
+  void VaryStride() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+
+    tran_low_t b_ref[kMaxBlockSize];
+    for (int i = 8; i < 64; i += 8) {
+      memset(b, 0, sizeof(b));
+      memset(b_ref, 0, sizeof(b_ref));
+
+      ReferenceHadamard(a, i, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
+  void SpeedTest(int times) {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    memset(input, 1, sizeof(input));
+    memset(output, 0, sizeof(output));
+
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int i = 0; i < times; ++i) {
+      h_func_(input, bwh_, output);
+    }
+    vpx_usec_timer_mark(&timer);
+
+    const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
+           elapsed_time);
+  }
+
  protected:
+  int bwh_;
+  int block_size_;
   HadamardFunc h_func_;
   ACMRandom rnd_;
 };
 
-class Hadamard8x8Test : public HadamardTestBase {};
-
-TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
-  DECLARE_ALIGNED(16, int16_t, a[64]);
-  DECLARE_ALIGNED(16, int16_t, b[64]);
-  int16_t b_ref[64];
-  for (int i = 0; i < 64; ++i) {
-    a[i] = rnd_.Rand9Signed();
+class HadamardLowbdTest : public HadamardTestBase {
+ protected:
+  // Use values between -255 (0xFF01) and 255 (0x00FF)
+  int16_t Rand() override {
+    int16_t src = rnd_.Rand8();
+    int16_t pred = rnd_.Rand8();
+    return src - pred;
   }
-  memset(b, 0, sizeof(b));
-  memset(b_ref, 0, sizeof(b_ref));
+};
 
-  reference_hadamard8x8(a, 8, b_ref);
-  ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
+TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
 
-  // The order of the output is not important. Sort before checking.
-  std::sort(b, b + 64);
-  std::sort(b_ref, b_ref + 64);
-  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); }
+
+TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardLowbdTest, DISABLED_Speed) {
+  SpeedTest(10);
+  SpeedTest(10000);
+  SpeedTest(10000000);
 }
 
-TEST_P(Hadamard8x8Test, VaryStride) {
-  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
-  DECLARE_ALIGNED(16, int16_t, b[64]);
-  int16_t b_ref[64];
-  for (int i = 0; i < 64 * 8; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
-
-  for (int i = 8; i < 64; i += 8) {
-    memset(b, 0, sizeof(b));
-    memset(b_ref, 0, sizeof(b_ref));
-
-    reference_hadamard8x8(a, i, b_ref);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + 64);
-    std::sort(b_ref, b_ref + 64);
-    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_c));
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_c, 32)));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_sse2));
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_sse2, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_sse2, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_sse2, 32)));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_ssse3));
-#endif  // HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_avx2, 32)));
+#endif  // HAVE_AVX2
+
+#if HAVE_SSSE3 && VPX_ARCH_X86_64
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_ssse3, 8)));
+#endif  // HAVE_SSSE3 && VPX_ARCH_X86_64
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
-                        ::testing::Values(&vpx_hadamard_8x8_neon));
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16),
+                      HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32)));
 #endif  // HAVE_NEON
 
-class Hadamard16x16Test : public HadamardTestBase {};
+// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
+// in place and turn on the unit test.
+#if !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(
+    MSA, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_msa, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_msa, 16)));
+#endif  // HAVE_MSA
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
 
-TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
-  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
-  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
-  int16_t b_ref[16 * 16];
-  for (int i = 0; i < 16 * 16; ++i) {
-    a[i] = rnd_.Rand9Signed();
+#if HAVE_VSX
+INSTANTIATE_TEST_SUITE_P(
+    VSX, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_vsx, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16)));
+#endif  // HAVE_VSX
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16)));
+#endif  // HAVE_LSX
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class HadamardHighbdTest : public HadamardTestBase {
+ protected:
+  // Use values between -4095 (0xF001) and 4095 (0x0FFF)
+  int16_t Rand() override {
+    int16_t src = rnd_.Rand12();
+    int16_t pred = rnd_.Rand12();
+    return src - pred;
   }
-  memset(b, 0, sizeof(b));
-  memset(b_ref, 0, sizeof(b_ref));
+};
 
-  reference_hadamard16x16(a, 16, b_ref);
-  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
+TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
 
-  // The order of the output is not important. Sort before checking.
-  std::sort(b, b + 16 * 16);
-  std::sort(b_ref, b_ref + 16 * 16);
-  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+TEST_P(HadamardHighbdTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardHighbdTest, DISABLED_Speed) {
+  SpeedTest(10);
+  SpeedTest(10000);
+  SpeedTest(10000000);
 }
 
-TEST_P(Hadamard16x16Test, VaryStride) {
-  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
-  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
-  int16_t b_ref[16 * 16];
-  for (int i = 0; i < 16 * 16 * 8; ++i) {
-    a[i] = rnd_.Rand9Signed();
-  }
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_c, 32)));
 
-  for (int i = 8; i < 64; i += 8) {
-    memset(b, 0, sizeof(b));
-    memset(b_ref, 0, sizeof(b_ref));
-
-    reference_hadamard16x16(a, i, b_ref);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
-
-    // The order of the output is not important. Sort before checking.
-    std::sort(b, b + 16 * 16);
-    std::sort(b_ref, b_ref + 16 * 16);
-    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_c));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_sse2));
-#endif  // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2,
+                                           32)));
+#endif  // HAVE_AVX2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
-                        ::testing::Values(&vpx_hadamard_16x16_neon));
-#endif  // HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardHighbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16),
+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon,
+                                           32)));
+#endif
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/media/libvpx/libvpx/test/i420_video_source.h b/media/libvpx/libvpx/test/i420_video_source.h
index 49573823b4..97473b5c2f 100644
--- a/media/libvpx/libvpx/test/i420_video_source.h
+++ b/media/libvpx/libvpx/test/i420_video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_I420_VIDEO_SOURCE_H_
-#define TEST_I420_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_I420_VIDEO_SOURCE_H_
+#define VPX_TEST_I420_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
 #include <string>
@@ -30,4 +30,4 @@ class I420VideoSource : public YUVVideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_I420_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_I420_VIDEO_SOURCE_H_
diff --git a/media/libvpx/libvpx/test/idct8x8_test.cc b/media/libvpx/libvpx/test/idct8x8_test.cc
index 7951bb93c9..28ab257e6e 100644
--- a/media/libvpx/libvpx/test/idct8x8_test.cc
+++ b/media/libvpx/libvpx/test/idct8x8_test.cc
@@ -12,12 +12,11 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
diff --git a/media/libvpx/libvpx/test/idct_test.cc b/media/libvpx/libvpx/test/idct_test.cc
index 700da77e3e..ed969945f8 100644
--- a/media/libvpx/libvpx/test/idct_test.cc
+++ b/media/libvpx/libvpx/test/idct_test.cc
@@ -11,120 +11,170 @@
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "vpx/vpx_integer.h"
 
-typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr,
-                         int pred_stride, unsigned char *dst_ptr,
-                         int dst_stride);
+using IdctFunc = void (*)(int16_t *input, unsigned char *pred_ptr,
+                          int pred_stride, unsigned char *dst_ptr,
+                          int dst_stride);
 namespace {
+
+using libvpx_test::Buffer;
+
 class IDCTTest : public ::testing::TestWithParam<IdctFunc> {
  protected:
-  virtual void SetUp() {
-    int i;
-
+  void SetUp() override {
     UUT = GetParam();
-    memset(input, 0, sizeof(input));
-    /* Set up guard blocks */
-    for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
+
+    input = new Buffer<int16_t>(4, 4, 0);
+    ASSERT_NE(input, nullptr);
+    ASSERT_TRUE(input->Init());
+    predict = new Buffer<uint8_t>(4, 4, 3);
+    ASSERT_NE(predict, nullptr);
+    ASSERT_TRUE(predict->Init());
+    output = new Buffer<uint8_t>(4, 4, 3);
+    ASSERT_NE(output, nullptr);
+    ASSERT_TRUE(output->Init());
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override {
+    delete input;
+    delete predict;
+    delete output;
+    libvpx_test::ClearSystemState();
+  }
 
   IdctFunc UUT;
-  int16_t input[16];
-  unsigned char output[256];
-  unsigned char predict[256];
+  Buffer<int16_t> *input;
+  Buffer<uint8_t> *predict;
+  Buffer<uint8_t> *output;
 };
 
-TEST_P(IDCTTest, TestGuardBlocks) {
-  int i;
-
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(0, output[i]) << i;
-    else
-      EXPECT_EQ(255, output[i]);
-  }
-}
-
 TEST_P(IDCTTest, TestAllZeros) {
-  int i;
+  // When the input is '0' the output will be '0'.
+  input->Set(0);
+  predict->Set(0);
+  output->Set(0);
 
-  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));
 
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(0, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-  }
+  ASSERT_TRUE(input->CheckValues(0));
+  ASSERT_TRUE(input->CheckPadding());
+  ASSERT_TRUE(output->CheckValues(0));
+  ASSERT_TRUE(output->CheckPadding());
 }
 
 TEST_P(IDCTTest, TestAllOnes) {
-  int i;
+  input->Set(0);
+  ASSERT_NE(input->TopLeftPixel(), nullptr);
+  // When the first element is '4' it will fill the output buffer with '1'.
+  input->TopLeftPixel()[0] = 4;
+  predict->Set(0);
+  output->Set(0);
 
-  input[0] = 4;
-  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));
 
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(1, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-  }
+  ASSERT_TRUE(output->CheckValues(1));
+  ASSERT_TRUE(output->CheckPadding());
 }
 
 TEST_P(IDCTTest, TestAddOne) {
-  int i;
+  // Set the transform output to '1' and make sure it gets added to the
+  // prediction buffer.
+  input->Set(0);
+  ASSERT_NE(input->TopLeftPixel(), nullptr);
+  input->TopLeftPixel()[0] = 4;
+  output->Set(0);
 
-  for (i = 0; i < 256; i++) predict[i] = i;
-  input[0] = 4;
-  ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
-
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) < 4 && i < 64)
-      EXPECT_EQ(i + 1, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(255, output[i]) << "i==" << i;
+  uint8_t *pred = predict->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      pred[y * predict->stride() + x] = y * 4 + x;
+    }
   }
+
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));
+
+  uint8_t const *out = output->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      EXPECT_EQ(1 + y * 4 + x, out[y * output->stride() + x]);
+    }
+  }
+
+  if (HasFailure()) {
+    output->DumpBuffer();
+  }
+
+  ASSERT_TRUE(output->CheckPadding());
 }
 
 TEST_P(IDCTTest, TestWithData) {
-  int i;
+  // Test a single known input.
+  predict->Set(0);
 
-  for (i = 0; i < 16; i++) input[i] = i;
-
-  ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
-
-  for (i = 0; i < 256; i++) {
-    if ((i & 0xF) > 3 || i > 63)
-      EXPECT_EQ(255, output[i]) << "i==" << i;
-    else if (i == 0)
-      EXPECT_EQ(11, output[i]) << "i==" << i;
-    else if (i == 34)
-      EXPECT_EQ(1, output[i]) << "i==" << i;
-    else if (i == 2 || i == 17 || i == 32)
-      EXPECT_EQ(3, output[i]) << "i==" << i;
-    else
-      EXPECT_EQ(0, output[i]) << "i==" << i;
+  int16_t *in = input->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      in[y * input->stride() + x] = y * 4 + x;
+    }
   }
+
+  ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(),
+                               predict->stride(), output->TopLeftPixel(),
+                               output->stride()));
+
+  uint8_t *out = output->TopLeftPixel();
+  for (int y = 0; y < 4; ++y) {
+    for (int x = 0; x < 4; ++x) {
+      switch (y * 4 + x) {
+        case 0: EXPECT_EQ(11, out[y * output->stride() + x]); break;
+        case 2:
+        case 5:
+        case 8: EXPECT_EQ(3, out[y * output->stride() + x]); break;
+        case 10: EXPECT_EQ(1, out[y * output->stride() + x]); break;
+        default: EXPECT_EQ(0, out[y * output->stride() + x]);
+      }
+    }
+  }
+
+  if (HasFailure()) {
+    output->DumpBuffer();
+  }
+
+  ASSERT_TRUE(output->CheckPadding());
 }
 
-INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+INSTANTIATE_TEST_SUITE_P(C, IDCTTest,
+                         ::testing::Values(vp8_short_idct4x4llm_c));
+
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,
-                        ::testing::Values(vp8_short_idct4x4llm_neon));
-#endif
+INSTANTIATE_TEST_SUITE_P(NEON, IDCTTest,
+                         ::testing::Values(vp8_short_idct4x4llm_neon));
+#endif  // HAVE_NEON
+
 #if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
-                        ::testing::Values(vp8_short_idct4x4llm_mmx));
-#endif
+INSTANTIATE_TEST_SUITE_P(MMX, IDCTTest,
+                         ::testing::Values(vp8_short_idct4x4llm_mmx));
+#endif  // HAVE_MMX
+
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
-                        ::testing::Values(vp8_short_idct4x4llm_msa));
-#endif
-}
+INSTANTIATE_TEST_SUITE_P(MSA, IDCTTest,
+                         ::testing::Values(vp8_short_idct4x4llm_msa));
+#endif  // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_SUITE_P(MMI, IDCTTest,
+                         ::testing::Values(vp8_short_idct4x4llm_mmi));
+#endif  // HAVE_MMI
+}  // namespace
diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc
new file mode 100644
index 0000000000..11e3863c6d
--- /dev/null
+++ b/media/libvpx/libvpx/test/init_vpx_test.cc
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/init_vpx_test.h"
+
+#include "./vpx_config.h"
+
+#if !CONFIG_SHARED
+#include <string>
+#include "gtest/gtest.h"
+#if VPX_ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+extern "C" {
+#if CONFIG_VP8
+extern void vp8_rtcd();
+#endif  // CONFIG_VP8
+#if CONFIG_VP9
+extern void vp9_rtcd();
+#endif  // CONFIG_VP9
+extern void vpx_dsp_rtcd();
+extern void vpx_scale_rtcd();
+}
+
+#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64
+static void append_negative_gtest_filter(const char *str) {
+  std::string filter = GTEST_FLAG_GET(filter);
+  // Negative patterns begin with one '-' followed by a ':' separated list.
+  if (filter.find('-') == std::string::npos) filter += '-';
+  filter += str;
+  GTEST_FLAG_SET(filter, filter);
+}
+#endif  // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64
+#endif  // !CONFIG_SHARED
+
+namespace libvpx_test {
+void init_vpx_test() {
+#if !CONFIG_SHARED
+#if VPX_ARCH_AARCH64
+  const int caps = arm_cpu_caps();
+  if (!(caps & HAS_NEON_DOTPROD)) {
+    append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*");
+  }
+  if (!(caps & HAS_NEON_I8MM)) {
+    append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*");
+  }
+  if (!(caps & HAS_SVE)) {
+    append_negative_gtest_filter(":SVE.*:SVE/*");
+  }
+  if (!(caps & HAS_SVE2)) {
+    append_negative_gtest_filter(":SVE2.*:SVE2/*");
+  }
+#elif VPX_ARCH_ARM
+  const int caps = arm_cpu_caps();
+  if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*");
+#endif  // VPX_ARCH_ARM
+
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+  const int simd_caps = x86_simd_caps();
+  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*");
+  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*");
+  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*");
+  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*");
+  if (!(simd_caps & HAS_SSSE3)) {
+    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
+  }
+  if (!(simd_caps & HAS_SSE4_1)) {
+    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
+  }
+  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
+  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
+  if (!(simd_caps & HAS_AVX512)) {
+    append_negative_gtest_filter(":AVX512.*:AVX512/*");
+  }
+#endif  // VPX_ARCH_X86 || VPX_ARCH_X86_64
+
+  // Shared library builds don't support whitebox tests that exercise internal
+  // symbols.
+#if CONFIG_VP8
+  vp8_rtcd();
+#endif  // CONFIG_VP8
+#if CONFIG_VP9
+  vp9_rtcd();
+#endif  // CONFIG_VP9
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+#endif  // !CONFIG_SHARED
+}
+}  // namespace libvpx_test
diff --git a/media/libvpx/libvpx/test/init_vpx_test.h b/media/libvpx/libvpx/test/init_vpx_test.h
new file mode 100644
index 0000000000..5e0dbb0e7e
--- /dev/null
+++ b/media/libvpx/libvpx/test/init_vpx_test.h
@@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_INIT_VPX_TEST_H_
+#define TEST_INIT_VPX_TEST_H_
+
+namespace libvpx_test {
+void init_vpx_test();
+}
+
+#endif  // TEST_INIT_VPX_TEST_H_
diff --git a/media/libvpx/libvpx/test/invalid_file_test.cc b/media/libvpx/libvpx/test/invalid_file_test.cc
index eae81faa13..0b895ed902 100644
--- a/media/libvpx/libvpx/test/invalid_file_test.cc
+++ b/media/libvpx/libvpx/test/invalid_file_test.cc
@@ -10,9 +10,10 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <memory>
 #include <string>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -37,23 +38,22 @@ std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
 class InvalidFileTest : public ::libvpx_test::DecoderTest,
                         public ::libvpx_test::CodecTestWithParam<DecodeParam> {
  protected:
-  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {}
 
-  virtual ~InvalidFileTest() {
-    if (res_file_ != NULL) fclose(res_file_);
+  ~InvalidFileTest() override {
+    if (res_file_ != nullptr) fclose(res_file_);
   }
 
   void OpenResFile(const std::string &res_file_name_) {
     res_file_ = libvpx_test::OpenTestDataFile(res_file_name_);
-    ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: "
-                                   << res_file_name_;
+    ASSERT_NE(res_file_, nullptr)
+        << "Result file open failed. Filename: " << res_file_name_;
   }
 
-  virtual bool HandleDecodeResult(
-      const vpx_codec_err_t res_dec,
-      const libvpx_test::CompressedVideoSource &video,
-      libvpx_test::Decoder *decoder) {
-    EXPECT_TRUE(res_file_ != NULL);
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
+    EXPECT_NE(res_file_, nullptr);
     int expected_res_dec;
 
     // Read integer result.
@@ -89,7 +89,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
     const std::string filename = input.filename;
 
     // Open compressed video file.
-    testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+    std::unique_ptr<libvpx_test::CompressedVideoSource> video;
     if (filename.substr(filename.length() - 3, 3) == "ivf") {
       video.reset(new libvpx_test::IVFVideoSource(filename));
     } else if (filename.substr(filename.length() - 4, 4) == "webm") {
@@ -101,7 +101,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
       return;
 #endif
     }
-    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NE(video.get(), nullptr);
     video->Init();
 
     // Construct result file name. The file holds a list of expected integer
@@ -120,11 +120,26 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest,
 
 TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
 
+#if CONFIG_VP8_DECODER
+const DecodeParam kVP8InvalidFileTests[] = {
+  { 1, "invalid-bug-1443.ivf" },
+  { 1, "invalid-bug-148271109.ivf" },
+  { 1, "invalid-token-partition.ivf" },
+  { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" },
+};
+
+VP8_INSTANTIATE_TEST_SUITE(InvalidFileTest,
+                           ::testing::ValuesIn(kVP8InvalidFileTests));
+#endif  // CONFIG_VP8_DECODER
+
 #if CONFIG_VP9_DECODER
 const DecodeParam kVP9InvalidFileTests[] = {
   { 1, "invalid-vp90-02-v2.webm" },
 #if CONFIG_VP9_HIGHBITDEPTH
   { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" },
+  { 1,
+    "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-."
+    "ivf" },
 #endif
   { 1, "invalid-vp90-03-v3.webm" },
   { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" },
@@ -132,7 +147,7 @@ const DecodeParam kVP9InvalidFileTests[] = {
 // This file will cause a large allocation which is expected to fail in 32-bit
 // environments. Test x86 for coverage purposes as the allocation failure will
 // be in platform agnostic code.
-#if ARCH_X86
+#if VPX_ARCH_X86
   { 1, "invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf" },
 #endif
   { 1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf" },
@@ -147,8 +162,8 @@ const DecodeParam kVP9InvalidFileTests[] = {
   { 1, "invalid-crbug-667044.webm" },
 };
 
-VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
-                          ::testing::ValuesIn(kVP9InvalidFileTests));
+VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest,
+                           ::testing::ValuesIn(kVP9InvalidFileTests));
 #endif  // CONFIG_VP9_DECODER
 
 // This class will include test vectors that are expected to fail
@@ -156,20 +171,20 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
 class InvalidFileInvalidPeekTest : public InvalidFileTest {
  protected:
   InvalidFileInvalidPeekTest() : InvalidFileTest() {}
-  virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
-                                libvpx_test::CompressedVideoSource * /*video*/,
-                                const vpx_codec_err_t /*res_peek*/) {}
+  void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+                        libvpx_test::CompressedVideoSource * /*video*/,
+                        const vpx_codec_err_t /*res_peek*/) override {}
 };
 
 TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); }
 
 #if CONFIG_VP8_DECODER
-const DecodeParam kVP8InvalidFileTests[] = {
+const DecodeParam kVP8InvalidPeekTests[] = {
   { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" },
 };
 
-VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
-                          ::testing::ValuesIn(kVP8InvalidFileTests));
+VP8_INSTANTIATE_TEST_SUITE(InvalidFileInvalidPeekTest,
+                           ::testing::ValuesIn(kVP8InvalidPeekTests));
 #endif  // CONFIG_VP8_DECODER
 
 #if CONFIG_VP9_DECODER
@@ -177,8 +192,9 @@ const DecodeParam kVP9InvalidFileInvalidPeekTests[] = {
   { 1, "invalid-vp90-01-v3.webm" },
 };
 
-VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest,
-                          ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests));
+VP9_INSTANTIATE_TEST_SUITE(
+    InvalidFileInvalidPeekTest,
+    ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests));
 
 const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
   { 4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm" },
@@ -190,9 +206,11 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
   { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
   { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
   { 2, "invalid-crbug-629481.webm" },
+  { 3, "invalid-crbug-1558.ivf" },
+  { 4, "invalid-crbug-1562.ivf" },
 };
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     VP9MultiThreaded, InvalidFileTest,
     ::testing::Combine(
         ::testing::Values(
diff --git a/media/libvpx/libvpx/test/ivf_video_source.h b/media/libvpx/libvpx/test/ivf_video_source.h
index b87624a11f..3ccac62b51 100644
--- a/media/libvpx/libvpx/test/ivf_video_source.h
+++ b/media/libvpx/libvpx/test/ivf_video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_IVF_VIDEO_SOURCE_H_
-#define TEST_IVF_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_IVF_VIDEO_SOURCE_H_
+#define VPX_TEST_IVF_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <cstdlib>
 #include <new>
@@ -16,7 +16,7 @@
 #include "test/video_source.h"
 
 namespace libvpx_test {
-const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
 const unsigned int kIvfFileHdrSize = 32;
 const unsigned int kIvfFrameHdrSize = 12;
 
@@ -29,26 +29,26 @@ static unsigned int MemGetLe32(const uint8_t *mem) {
 class IVFVideoSource : public CompressedVideoSource {
  public:
   explicit IVFVideoSource(const std::string &file_name)
-      : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL),
-        frame_sz_(0), frame_(0), end_of_file_(false) {}
+      : file_name_(file_name), input_file_(nullptr),
+        compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0),
+        end_of_file_(false) {}
 
-  virtual ~IVFVideoSource() {
+  ~IVFVideoSource() override {
     delete[] compressed_frame_buf_;
 
     if (input_file_) fclose(input_file_);
   }
 
-  virtual void Init() {
+  void Init() override {
     // Allocate a buffer for read in the compressed video frame.
     compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
-    ASSERT_TRUE(compressed_frame_buf_ != NULL)
-        << "Allocate frame buffer failed";
+    ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed";
   }
 
-  virtual void Begin() {
+  void Begin() override {
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_NE(input_file_, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
 
     // Read file header
     uint8_t file_hdr[kIvfFileHdrSize];
@@ -62,13 +62,13 @@ class IVFVideoSource : public CompressedVideoSource {
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
   void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     uint8_t frame_hdr[kIvfFrameHdrSize];
     // Check frame header and read a frame from input_file.
     if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
@@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource {
     }
   }
 
-  virtual const uint8_t *cxdata() const {
-    return end_of_file_ ? NULL : compressed_frame_buf_;
+  const uint8_t *cxdata() const override {
+    return end_of_file_ ? nullptr : compressed_frame_buf_;
   }
-  virtual size_t frame_size() const { return frame_sz_; }
-  virtual unsigned int frame_number() const { return frame_; }
+  size_t frame_size() const override { return frame_sz_; }
+  unsigned int frame_number() const override { return frame_; }
 
  protected:
   std::string file_name_;
@@ -103,4 +103,4 @@ class IVFVideoSource : public CompressedVideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_IVF_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/media/libvpx/libvpx/test/keyframe_test.cc b/media/libvpx/libvpx/test/keyframe_test.cc
index 38bd923b7d..c49ea91bd2 100644
--- a/media/libvpx/libvpx/test/keyframe_test.cc
+++ b/media/libvpx/libvpx/test/keyframe_test.cc
@@ -8,12 +8,18 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <climits>
+#include <cstring>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "./vpx_config.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_image.h"
 
 namespace {
 
@@ -22,9 +28,9 @@ class KeyframeTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   KeyframeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~KeyframeTest() {}
+  ~KeyframeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     kf_count_ = 0;
@@ -33,17 +39,17 @@ class KeyframeTest
     set_cpu_used_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (kf_do_force_kf_) {
       frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
     }
-    if (set_cpu_used_ && video->frame() == 1) {
+    if (set_cpu_used_ && video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
       kf_pts_list_.push_back(pkt->data.frame.pts);
       kf_count_++;
@@ -68,7 +74,9 @@ TEST_P(KeyframeTest, TestRandomVideoSource) {
 
   // In realtime mode - auto placed keyframes are exceedingly rare,  don't
   // bother with this check   if(GetParam() > 0)
-  if (GET_PARAM(1) > 0) EXPECT_GT(kf_count_, 1);
+  if (GET_PARAM(1) > 0) {
+    EXPECT_GT(kf_count_, 1);
+  }
 }
 
 TEST_P(KeyframeTest, TestDisableKeyframes) {
@@ -128,19 +136,121 @@ TEST_P(KeyframeTest, TestAutoKeyframe) {
 
   // In realtime mode - auto placed keyframes are exceedingly rare,  don't
   // bother with this check
-  if (GET_PARAM(1) > 0)
+  if (GET_PARAM(1) > 0) {
     EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";
+  }
 
   // Verify that keyframes match the file keyframes in the file.
   for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
        iter != kf_pts_list_.end(); ++iter) {
     if (deadline_ == VPX_DL_REALTIME && *iter > 0)
-      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
-                                     << *iter;
+      EXPECT_EQ(0, (*iter - 1) % 30)
+          << "Unexpected keyframe at frame " << *iter;
     else
       EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
   }
 }
 
-VP8_INSTANTIATE_TEST_CASE(KeyframeTest, ALL_TEST_MODES);
+VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES);
+
+bool IsVP9(vpx_codec_iface_t *iface) {
+  static const char kVP9Name[] = "WebM Project VP9";
+  return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) ==
+         0;
+}
+
+vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w,
+                             unsigned int h) {
+  vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1);
+  if (!image) return image;
+
+  for (unsigned int i = 0; i < image->d_h; ++i) {
+    memset(image->planes[0] + i * image->stride[0], 128, image->d_w);
+  }
+  const unsigned int uv_h = (image->d_h + 1) / 2;
+  const unsigned int uv_w = (image->d_w + 1) / 2;
+  for (unsigned int i = 0; i < uv_h; ++i) {
+    memset(image->planes[1] + i * image->stride[1], 128, uv_w);
+    memset(image->planes[2] + i * image->stride[2], 128, uv_w);
+  }
+  return image;
+}
+
+// Tests kf_max_dist in one-pass encoding with zero lag.
+void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface,
+                                 vpx_enc_deadline_t deadline,
+                                 unsigned int kf_max_dist) {
+  vpx_codec_enc_cfg_t cfg;
+  ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0),
+            VPX_CODEC_OK);
+  cfg.g_w = 320;
+  cfg.g_h = 240;
+  cfg.g_pass = VPX_RC_ONE_PASS;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = VPX_KF_AUTO;
+  cfg.kf_min_dist = 0;
+  cfg.kf_max_dist = kf_max_dist;
+
+  vpx_codec_ctx_t enc;
+  ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK);
+
+  const int speed = IsVP9(iface) ? 9 : -12;
+  ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK);
+
+  vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h);
+  ASSERT_NE(image, nullptr);
+
+  // Encode frames.
+  const vpx_codec_cx_pkt_t *pkt;
+  const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1;
+  for (unsigned int i = 0; i < num_frames; ++i) {
+    ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK);
+    vpx_codec_iter_t iter = nullptr;
+    while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+      if (kf_max_dist == 0 || i % kf_max_dist == 0) {
+        ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY);
+      } else {
+        ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u);
+      }
+    }
+  }
+
+  // Flush the encoder.
+  bool got_data;
+  do {
+    ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK);
+    got_data = false;
+    vpx_codec_iter_t iter = nullptr;
+    while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) {
+      ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT);
+      got_data = true;
+    }
+  } while (got_data);
+
+  vpx_img_free(image);
+  ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK);
+}
+
+TEST(KeyframeIntervalTest, KeyframeMaximumInterval) {
+  std::vector<vpx_codec_iface_t *> ifaces;
+#if CONFIG_VP8_ENCODER
+  ifaces.push_back(vpx_codec_vp8_cx());
+#endif
+#if CONFIG_VP9_ENCODER
+  ifaces.push_back(vpx_codec_vp9_cx());
+#endif
+  for (vpx_codec_iface_t *iface : ifaces) {
+    for (vpx_enc_deadline_t deadline :
+         { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) {
+      // Test 0 and 1 (both mean all intra), some powers of 2, some multiples
+      // of 10, and some prime numbers.
+      for (unsigned int kf_max_dist :
+           { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) {
+        TestKeyframeMaximumInterval(iface, deadline, kf_max_dist);
+      }
+    }
+  }
+}
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/level_test.cc b/media/libvpx/libvpx/test/level_test.cc
index 67a794e6f8..03217a293e 100644
--- a/media/libvpx/libvpx/test/level_test.cc
+++ b/media/libvpx/libvpx/test/level_test.cc
@@ -7,11 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "vpx_config.h"
 
 namespace {
 class LevelTest
@@ -22,9 +23,9 @@ class LevelTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0),
         level_(0) {}
-  virtual ~LevelTest() {}
+  ~LevelTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -41,8 +42,8 @@ class LevelTest
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
@@ -66,34 +67,46 @@ class LevelTest
   int level_;
 };
 
-TEST_P(LevelTest, TestTargetLevel11) {
+TEST_P(LevelTest, TestTargetLevel11Large) {
+#if CONFIG_REALTIME_ONLY
+  GTEST_SKIP();
+#else
   ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime);
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       90);
+                                       60);
   target_level_ = 11;
   cfg_.rc_target_bitrate = 150;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(target_level_, level_);
+  ASSERT_GE(target_level_, level_);
+#endif
 }
 
-TEST_P(LevelTest, TestTargetLevel20) {
+TEST_P(LevelTest, TestTargetLevel20Large) {
+#if CONFIG_REALTIME_ONLY
+  GTEST_SKIP();
+#else
   ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime);
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 90);
+                                       30, 1, 0, 60);
   target_level_ = 20;
   cfg_.rc_target_bitrate = 1200;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(target_level_, level_);
+  ASSERT_GE(target_level_, level_);
+#endif
 }
 
-TEST_P(LevelTest, TestTargetLevel31) {
+TEST_P(LevelTest, TestTargetLevel31Large) {
+#if CONFIG_REALTIME_ONLY
+  GTEST_SKIP();
+#else
   ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime);
   ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30,
                                        1, 0, 60);
   target_level_ = 31;
   cfg_.rc_target_bitrate = 8000;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(target_level_, level_);
+  ASSERT_GE(target_level_, level_);
+#endif
 }
 
 // Test for keeping level stats only
@@ -103,11 +116,11 @@ TEST_P(LevelTest, TestTargetLevel0) {
   target_level_ = 0;
   min_gf_internal_ = 4;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(11, level_);
+  ASSERT_GE(11, level_);
 
   cfg_.rc_target_bitrate = 1600;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_EQ(20, level_);
+  ASSERT_GE(20, level_);
 }
 
 // Test for level control being turned off
@@ -120,7 +133,7 @@ TEST_P(LevelTest, TestTargetLevel255) {
 
 TEST_P(LevelTest, TestTargetLevelApi) {
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
-  static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
+  static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
   vpx_codec_ctx_t enc;
   vpx_codec_enc_cfg_t cfg;
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0));
@@ -130,7 +143,7 @@ TEST_P(LevelTest, TestTargetLevelApi) {
     if (level == 10 || level == 11 || level == 20 || level == 21 ||
         level == 30 || level == 31 || level == 40 || level == 41 ||
         level == 50 || level == 51 || level == 52 || level == 60 ||
-        level == 61 || level == 62 || level == 0 || level == 255)
+        level == 61 || level == 62 || level == 0 || level == 1 || level == 255)
       EXPECT_EQ(VPX_CODEC_OK,
                 vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
     else
@@ -140,8 +153,6 @@ TEST_P(LevelTest, TestTargetLevelApi) {
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
 }
 
-VP9_INSTANTIATE_TEST_CASE(LevelTest,
-                          ::testing::Values(::libvpx_test::kTwoPassGood,
-                                            ::libvpx_test::kOnePassGood),
-                          ::testing::Range(0, 9));
+VP9_INSTANTIATE_TEST_SUITE(LevelTest, ONE_OR_TWO_PASS_TEST_MODES,
+                           ::testing::Range(0, 9));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/lpf_test.cc b/media/libvpx/libvpx/test/lpf_test.cc
index 4fca7d49c0..b1688f7d26 100644
--- a/media/libvpx/libvpx/test/lpf_test.cc
+++ b/media/libvpx/libvpx/test/lpf_test.cc
@@ -11,8 +11,9 @@
 #include <cmath>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -35,29 +36,29 @@ const int kNumCoeffs = 1024;
 const int number_of_iterations = 10000;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef uint16_t Pixel;
+using Pixel = uint16_t;
 #define PIXEL_WIDTH 16
 
-typedef void (*loop_op_t)(Pixel *s, int p, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh, int bd);
-typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1, int bd);
+using loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh, int bd);
+using dual_loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0,
+                                const uint8_t *blimit1, const uint8_t *limit1,
+                                const uint8_t *thresh1, int bd);
 #else
-typedef uint8_t Pixel;
+using Pixel = uint8_t;
 #define PIXEL_WIDTH 8
 
-typedef void (*loop_op_t)(Pixel *s, int p, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh);
-typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1);
+using loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh);
+using dual_loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit0,
+                                const uint8_t *limit0, const uint8_t *thresh0,
+                                const uint8_t *blimit1, const uint8_t *limit1,
+                                const uint8_t *thresh1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
-typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
+using loop8_param_t = std::tuple<loop_op_t, loop_op_t, int>;
+using dualloop8_param_t = std::tuple<dual_loop_op_t, dual_loop_op_t, int>;
 
 void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
                const int mask, const int32_t p, const int i) {
@@ -74,9 +75,9 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
         if (j < 1) {
           tmp_s[j] = rnd->Rand16();
         } else if (val & 0x20) {  // Increment by a value within the limit.
-          tmp_s[j] = tmp_s[j - 1] + (limit - 1);
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
         } else {  // Decrement by a value within the limit.
-          tmp_s[j] = tmp_s[j - 1] - (limit - 1);
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
         }
         j++;
       }
@@ -93,11 +94,11 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
         if (j < 1) {
           tmp_s[j] = rnd->Rand16();
         } else if (val & 0x20) {  // Increment by a value within the limit.
-          tmp_s[(j % 32) * 32 + j / 32] =
-              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1);
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
         } else {  // Decrement by a value within the limit.
-          tmp_s[(j % 32) * 32 + j / 32] =
-              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1);
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
         }
         j++;
       }
@@ -114,17 +115,29 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit,
   }
 }
 
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4);
+}
+
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
  public:
-  virtual ~Loop8Test6Param() {}
-  virtual void SetUp() {
+  ~Loop8Test6Param() override = default;
+  void SetUp() override {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int bit_depth_;
@@ -132,18 +145,21 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
   loop_op_t loopfilter_op_;
   loop_op_t ref_loopfilter_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param);
 
+#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \
+    (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH)
 class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
  public:
-  virtual ~Loop8Test9Param() {}
-  virtual void SetUp() {
+  ~Loop8Test9Param() override = default;
+  void SetUp() override {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int bit_depth_;
@@ -151,6 +167,9 @@ class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
   dual_loop_op_t loopfilter_op_;
   dual_loop_op_t ref_loopfilter_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param);
+#endif  // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA &&
+        // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH))
 
 TEST_P(Loop8Test6Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -162,15 +181,15 @@ TEST_P(Loop8Test6Param, OperationCheck) {
   int first_failure = -1;
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -221,15 +240,15 @@ TEST_P(Loop8Test6Param, ValueCheck) {
 
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -262,6 +281,8 @@ TEST_P(Loop8Test6Param, ValueCheck) {
       << "First failed at test case " << first_failure;
 }
 
+#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \
+    (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH))
 TEST_P(Loop8Test9Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = number_of_iterations;
@@ -271,27 +292,27 @@ TEST_P(Loop8Test9Param, OperationCheck) {
   int first_failure = -1;
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    tmp = GetOuterThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -334,27 +355,27 @@ TEST_P(Loop8Test9Param, ValueCheck) {
   int first_failure = -1;
   for (int i = 0; i < count_test_block; ++i) {
     int err_count = 0;
-    uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    uint8_t tmp = GetOuterThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4));
+    tmp = GetOuterThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER));
+    tmp = GetInnerThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                     tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
-    tmp = rnd.Rand8();
+    tmp = GetHevThresh(&rnd);
     DECLARE_ALIGNED(16, const uint8_t,
                     thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
                                      tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp };
@@ -389,12 +410,15 @@ TEST_P(Loop8Test9Param, ValueCheck) {
          "loopfilter output. "
       << "First failed at test case " << first_failure;
 }
+#endif  // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA &&
+        // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX &&
+        // (!CONFIG_VP9_HIGHBITDEPTH))
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
                                  &vpx_highbd_lpf_horizontal_4_c, 8),
@@ -445,7 +469,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
                                  &vpx_highbd_lpf_vertical_16_dual_c, 12)));
 #else
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_4_sse2, &vpx_lpf_horizontal_4_c, 8),
@@ -462,7 +486,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif
 
 #if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH)
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(make_tuple(&vpx_lpf_horizontal_16_avx2,
                                  &vpx_lpf_horizontal_16_c, 8),
@@ -472,7 +496,7 @@ INSTANTIATE_TEST_CASE_P(
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2,
                                  &vpx_highbd_lpf_horizontal_4_dual_c, 8),
@@ -499,7 +523,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2,
                                  &vpx_highbd_lpf_vertical_8_dual_c, 12)));
 #else
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, Loop8Test9Param,
     ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_sse2,
                                  &vpx_lpf_horizontal_4_dual_c, 8),
@@ -514,7 +538,7 @@ INSTANTIATE_TEST_CASE_P(
 
 #if HAVE_NEON
 #if CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, Loop8Test6Param,
     ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon,
                                  &vpx_highbd_lpf_horizontal_4_c, 8),
@@ -564,7 +588,7 @@ INSTANTIATE_TEST_CASE_P(
                                  &vpx_highbd_lpf_vertical_16_dual_c, 10),
                       make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon,
                                  &vpx_highbd_lpf_vertical_16_dual_c, 12)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon,
                                  &vpx_highbd_lpf_horizontal_4_dual_c, 8),
@@ -591,7 +615,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon,
                                  &vpx_highbd_lpf_vertical_8_dual_c, 12)));
 #else
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_16_neon, &vpx_lpf_horizontal_16_c, 8),
@@ -604,7 +628,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8)));
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon,
                                  &vpx_lpf_horizontal_8_dual_c, 8),
@@ -618,7 +642,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_NEON
 
 #if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DSPR2, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8),
@@ -632,7 +656,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_vertical_16_dual_dspr2, &vpx_lpf_vertical_16_dual_c,
                    8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DSPR2, Loop8Test9Param,
     ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_dspr2,
                                  &vpx_lpf_horizontal_4_dual_c, 8),
@@ -645,7 +669,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8),
@@ -657,7 +681,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_vertical_16_msa, &vpx_lpf_vertical_16_c, 8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, Loop8Test9Param,
     ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_msa,
                                  &vpx_lpf_horizontal_4_dual_c, 8),
@@ -669,4 +693,29 @@ INSTANTIATE_TEST_CASE_P(
                                  &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
+#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_16_dual_lsx,
+                   &vpx_lpf_horizontal_16_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c,
+                   8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx,
+                                 &vpx_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_horizontal_8_dual_lsx,
+                                 &vpx_lpf_horizontal_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_4_dual_lsx,
+                                 &vpx_lpf_vertical_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_8_dual_lsx,
+                                 &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/md5_helper.h b/media/libvpx/libvpx/test/md5_helper.h
index ef310a2d90..9095d96a8a 100644
--- a/media/libvpx/libvpx/test/md5_helper.h
+++ b/media/libvpx/libvpx/test/md5_helper.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_MD5_HELPER_H_
-#define TEST_MD5_HELPER_H_
+#ifndef VPX_TEST_MD5_HELPER_H_
+#define VPX_TEST_MD5_HELPER_H_
 
 #include "./md5_utils.h"
 #include "vpx/vpx_decoder.h"
@@ -47,7 +47,7 @@ class MD5 {
     MD5Update(&md5_, data, static_cast<uint32_t>(size));
   }
 
-  const char *Get(void) {
+  const char *Get() {
     static const char hex[16] = {
       '0', '1', '2', '3', '4', '5', '6', '7',
       '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
@@ -72,4 +72,4 @@ class MD5 {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_MD5_HELPER_H_
+#endif  // VPX_TEST_MD5_HELPER_H_
diff --git a/media/libvpx/libvpx/test/minmax_test.cc b/media/libvpx/libvpx/test/minmax_test.cc
index e51c9fd48c..8c758ab45f 100644
--- a/media/libvpx/libvpx/test/minmax_test.cc
+++ b/media/libvpx/libvpx/test/minmax_test.cc
@@ -11,10 +11,12 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
+#include "vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
 
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
@@ -23,12 +25,12 @@ namespace {
 
 using ::libvpx_test::ACMRandom;
 
-typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
-                           int b_stride, int *min, int *max);
+using MinMaxFunc = void (*)(const uint8_t *a, int a_stride, const uint8_t *b,
+                            int b_stride, int *min, int *max);
 
 class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     mm_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
@@ -107,24 +109,141 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
       int min_ref, max_ref, min, max;
       reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
       ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
-      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
-                              << " and b_stride = " << b_stride;
-      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
-                              << " and b_stride = " << b_stride;
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
     }
   }
 }
 
-INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+#if CONFIG_VP9_HIGHBITDEPTH
+
+using HBDMinMaxTest = MinMaxTest;
+
+void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int *min_ret, int *max_ret) {
+  int min = 65535;
+  int max = 0;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b);
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(HBDMinMaxTest, MinValue) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int i = 0; i < 64; i++) {
+    vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64);
+    vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64);
+    CONVERT_TO_SHORTPTR(b)[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(65535, max);
+    EXPECT_EQ(i, min);
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+
+TEST_P(HBDMinMaxTest, MaxValue) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int i = 0; i < 64; i++) {
+    vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64);
+    vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64);
+    CONVERT_TO_SHORTPTR(b)[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+
+TEST_P(HBDMinMaxTest, CompareReference) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t))));
+  for (int j = 0; j < 64; j++) {
+    CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16();
+    CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16();
+  }
+
+  int min_ref, max_ref, min, max;
+  highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t *a = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t))));
+  uint8_t *b = CONVERT_TO_BYTEPTR(
+      reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t))));
+  for (int i = 0; i < 8 * 64; i++) {
+    CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16();
+    CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+      EXPECT_EQ(min_ref, min)
+          << "when a_stride = " << a_stride << " and b_stride = " << b_stride;
+    }
+  }
+  vpx_free(CONVERT_TO_SHORTPTR(a));
+  vpx_free(CONVERT_TO_SHORTPTR(b));
+}
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest,
+                         ::testing::Values(&vpx_highbd_minmax_8x8_c));
+#endif
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
-                        ::testing::Values(&vpx_minmax_8x8_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest,
+                         ::testing::Values(&vpx_minmax_8x8_sse2));
 #endif
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
-                        ::testing::Values(&vpx_minmax_8x8_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest,
+                         ::testing::Values(&vpx_minmax_8x8_neon));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest,
+                         ::testing::Values(&vpx_highbd_minmax_8x8_neon));
+#endif
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(MSA, MinMaxTest,
+                         ::testing::Values(&vpx_minmax_8x8_msa));
 #endif
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/non_greedy_mv_test.cc b/media/libvpx/libvpx/test/non_greedy_mv_test.cc
new file mode 100644
index 0000000000..6b5dcc651b
--- /dev/null
+++ b/media/libvpx/libvpx/test/non_greedy_mv_test.cc
@@ -0,0 +1,200 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "gtest/gtest.h"
+#include "vp9/encoder/vp9_non_greedy_mv.h"
+#include "./vpx_dsp_rtcd.h"
+
+namespace {
+
+static void read_in_mf(const char *filename, int *rows_ptr, int *cols_ptr,
+                       MV **buffer_ptr) {
+  FILE *input = fopen(filename, "rb");
+  int row, col;
+  int idx;
+
+  ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl;
+
+  fscanf(input, "%d,%d\n", rows_ptr, cols_ptr);
+
+  *buffer_ptr = (MV *)malloc((*rows_ptr) * (*cols_ptr) * sizeof(MV));
+
+  for (idx = 0; idx < (*rows_ptr) * (*cols_ptr); ++idx) {
+    fscanf(input, "%d,%d;", &row, &col);
+    (*buffer_ptr)[idx].row = row;
+    (*buffer_ptr)[idx].col = col;
+  }
+  fclose(input);
+}
+
+static void read_in_local_var(const char *filename, int *rows_ptr,
+                              int *cols_ptr,
+                              int (**M_ptr)[MF_LOCAL_STRUCTURE_SIZE]) {
+  FILE *input = fopen(filename, "rb");
+  int M00, M01, M10, M11;
+  int idx;
+  int int_type;
+
+  ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl;
+
+  fscanf(input, "%d,%d\n", rows_ptr, cols_ptr);
+
+  *M_ptr = (int(*)[MF_LOCAL_STRUCTURE_SIZE])malloc(
+      (*rows_ptr) * (*cols_ptr) * MF_LOCAL_STRUCTURE_SIZE * sizeof(int_type));
+
+  for (idx = 0; idx < (*rows_ptr) * (*cols_ptr); ++idx) {
+    fscanf(input, "%d,%d,%d,%d;", &M00, &M01, &M10, &M11);
+    (*M_ptr)[idx][0] = M00;
+    (*M_ptr)[idx][1] = M01;
+    (*M_ptr)[idx][2] = M10;
+    (*M_ptr)[idx][3] = M11;
+  }
+  fclose(input);
+}
+
+static void compare_mf(const MV *mf1, const MV *mf2, int rows, int cols,
+                       float *mean_ptr, float *std_ptr) {
+  float float_type;
+  float *diffs = (float *)malloc(rows * cols * sizeof(float_type));
+  int idx;
+  float accu = 0.0f;
+  for (idx = 0; idx < rows * cols; ++idx) {
+    MV mv1 = mf1[idx];
+    MV mv2 = mf2[idx];
+    float row_diff2 = (float)((mv1.row - mv2.row) * (mv1.row - mv2.row));
+    float col_diff2 = (float)((mv1.col - mv2.col) * (mv1.col - mv2.col));
+    diffs[idx] = sqrt(row_diff2 + col_diff2);
+    accu += diffs[idx];
+  }
+  *mean_ptr = accu / rows / cols;
+  *std_ptr = 0;
+  for (idx = 0; idx < rows * cols; ++idx) {
+    *std_ptr += (diffs[idx] - (*mean_ptr)) * (diffs[idx] - (*mean_ptr));
+  }
+  *std_ptr = sqrt(*std_ptr / rows / cols);
+  free(diffs);
+}
+
+static void load_frame_info(const char *filename,
+                            YV12_BUFFER_CONFIG *ref_frame_ptr) {
+  FILE *input = fopen(filename, "rb");
+  int idx;
+  uint8_t data_type;
+
+  ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl;
+
+  fscanf(input, "%d,%d\n", &(ref_frame_ptr->y_height),
+         &(ref_frame_ptr->y_width));
+
+  ref_frame_ptr->y_buffer = (uint8_t *)malloc(
+      (ref_frame_ptr->y_width) * (ref_frame_ptr->y_height) * sizeof(data_type));
+
+  for (idx = 0; idx < (ref_frame_ptr->y_width) * (ref_frame_ptr->y_height);
+       ++idx) {
+    int value;
+    fscanf(input, "%d,", &value);
+    ref_frame_ptr->y_buffer[idx] = (uint8_t)value;
+  }
+
+  ref_frame_ptr->y_stride = ref_frame_ptr->y_width;
+  fclose(input);
+}
+
+static int compare_local_var(const int (*local_var1)[MF_LOCAL_STRUCTURE_SIZE],
+                             const int (*local_var2)[MF_LOCAL_STRUCTURE_SIZE],
+                             int rows, int cols) {
+  int diff = 0;
+  int outter_idx, inner_idx;
+  for (outter_idx = 0; outter_idx < rows * cols; ++outter_idx) {
+    for (inner_idx = 0; inner_idx < MF_LOCAL_STRUCTURE_SIZE; ++inner_idx) {
+      diff += abs(local_var1[outter_idx][inner_idx] -
+                  local_var2[outter_idx][inner_idx]);
+    }
+  }
+  return diff / rows / cols;
+}
+
+TEST(non_greedy_mv, smooth_mf) {
+  const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt";
+  const char *local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt";
+  const char *estimation_file = "non_greedy_mv_test_files/estimation_16x16.txt";
+  const char *ground_truth_file =
+      "non_greedy_mv_test_files/ground_truth_16x16.txt";
+  BLOCK_SIZE bsize = BLOCK_32X32;
+  MV *search_mf = nullptr;
+  MV *smooth_mf = nullptr;
+  MV *estimation = nullptr;
+  MV *ground_truth = nullptr;
+  int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr;
+  int rows = 0, cols = 0;
+
+  int alpha = 100, max_iter = 100;
+
+  read_in_mf(search_mf_file, &rows, &cols, &search_mf);
+  read_in_local_var(local_var_file, &rows, &cols, &local_var);
+  read_in_mf(estimation_file, &rows, &cols, &estimation);
+  read_in_mf(ground_truth_file, &rows, &cols, &ground_truth);
+
+  float sm_mean, sm_std;
+  float est_mean, est_std;
+
+  smooth_mf = (MV *)malloc(rows * cols * sizeof(MV));
+  vp9_get_smooth_motion_field(search_mf, local_var, rows, cols, bsize, alpha,
+                              max_iter, smooth_mf);
+
+  compare_mf(smooth_mf, ground_truth, rows, cols, &sm_mean, &sm_std);
+  compare_mf(smooth_mf, estimation, rows, cols, &est_mean, &est_std);
+
+  EXPECT_LE(sm_mean, 3);
+  EXPECT_LE(est_mean, 2);
+
+  free(search_mf);
+  free(local_var);
+  free(estimation);
+  free(ground_truth);
+  free(smooth_mf);
+}
+
+TEST(non_greedy_mv, local_var) {
+  const char *ref_frame_file = "non_greedy_mv_test_files/ref_frame_16x16.txt";
+  const char *cur_frame_file = "non_greedy_mv_test_files/cur_frame_16x16.txt";
+  const char *gt_local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt";
+  const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt";
+  BLOCK_SIZE bsize = BLOCK_16X16;
+  int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr;
+  int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr;
+  YV12_BUFFER_CONFIG ref_frame, cur_frame;
+  int rows, cols;
+  MV *search_mf;
+  int int_type;
+  int local_var_diff;
+  vp9_variance_fn_ptr_t fn;
+
+  load_frame_info(ref_frame_file, &ref_frame);
+  load_frame_info(cur_frame_file, &cur_frame);
+  read_in_mf(search_mf_file, &rows, &cols, &search_mf);
+
+  fn.sdf = vpx_sad16x16;
+  est_local_var = (int(*)[MF_LOCAL_STRUCTURE_SIZE])malloc(
+      rows * cols * MF_LOCAL_STRUCTURE_SIZE * sizeof(int_type));
+  vp9_get_local_structure(&cur_frame, &ref_frame, search_mf, &fn, rows, cols,
+                          bsize, est_local_var);
+  read_in_local_var(gt_local_var_file, &rows, &cols, &gt_local_var);
+
+  local_var_diff = compare_local_var(est_local_var, gt_local_var, rows, cols);
+
+  EXPECT_LE(local_var_diff, 1);
+
+  free(gt_local_var);
+  free(est_local_var);
+  free(ref_frame.y_buffer);
+}
+}  // namespace
diff --git a/media/libvpx/libvpx/test/partial_idct_test.cc b/media/libvpx/libvpx/test/partial_idct_test.cc
index 7e901bd033..da391b1a7e 100644
--- a/media/libvpx/libvpx/test/partial_idct_test.cc
+++ b/media/libvpx/libvpx/test/partial_idct_test.cc
@@ -11,10 +11,10 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include <limits>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
@@ -25,16 +25,17 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_config.h"
 #include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out,
-                                  int stride, int bd);
+using FwdTxfmFunc = void (*)(const int16_t *in, tran_low_t *out, int stride);
+using InvTxfmFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride);
+using InvTxfmWithBdFunc = void (*)(const tran_low_t *in, uint8_t *out,
+                                   int stride, int bd);
 
 template <InvTxfmFunc fn>
 void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
@@ -43,53 +44,29 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-template <InvTxfmWithBdFunc fn>
+using InvTxfmHighbdFunc = void (*)(const tran_low_t *in, uint16_t *out,
+                                   int stride, int bd);
+
+template <InvTxfmHighbdFunc fn>
 void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
-  fn(in, CONVERT_TO_BYTEPTR(out), stride, bd);
+  fn(in, CAST_TO_SHORTPTR(out), stride, bd);
 }
 #endif
 
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc,
-                        TX_SIZE, int, int, int>
-    PartialInvTxfmParam;
+using PartialInvTxfmParam =
+    std::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc, TX_SIZE, int,
+               int, int>;
 const int kMaxNumCoeffs = 1024;
 const int kCountTestBlock = 1000;
 
-// https://bugs.chromium.org/p/webm/issues/detail?id=1332
-// The functions specified do not pass with INT16_MIN/MAX. They fail at the
-// value specified, but pass when 1 is added/subtracted.
-int16_t MaxSupportedCoeff(InvTxfmWithBdFunc a) {
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
-  if (a == &wrapper<vpx_idct8x8_64_add_ssse3> ||
-      a == &wrapper<vpx_idct8x8_12_add_ssse3>) {
-    return 23625 - 1;
-  }
-#else
-  (void)a;
-#endif
-  return std::numeric_limits<int16_t>::max();
-}
-
-int16_t MinSupportedCoeff(InvTxfmWithBdFunc a) {
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
-  if (a == &wrapper<vpx_idct8x8_64_add_ssse3> ||
-      a == &wrapper<vpx_idct8x8_12_add_ssse3>) {
-    return -23625 + 1;
-  }
-#else
-  (void)a;
-#endif
-  return std::numeric_limits<int16_t>::min();
-}
-
 class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
  public:
-  virtual ~PartialIDctTest() {}
-  virtual void SetUp() {
+  ~PartialIDctTest() override = default;
+  void SetUp() override {
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    ftxfm_ = GET_PARAM(0);
-    full_itxfm_ = GET_PARAM(1);
-    partial_itxfm_ = GET_PARAM(2);
+    fwd_txfm_ = GET_PARAM(0);
+    full_inv_txfm_ = GET_PARAM(1);
+    partial_inv_txfm_ = GET_PARAM(2);
     tx_size_ = GET_PARAM(3);
     last_nonzero_ = GET_PARAM(4);
     bit_depth_ = GET_PARAM(5);
@@ -101,7 +78,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
       case TX_8X8: size_ = 8; break;
       case TX_16X16: size_ = 16; break;
       case TX_32X32: size_ = 32; break;
-      default: FAIL() << "Wrong Size!"; break;
+      default: FAIL() << "Wrong Size!";
     }
 
     // Randomize stride_ to a value less than or equal to 1024
@@ -125,13 +102,13 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
         vpx_memalign(16, pixel_size_ * output_block_size_));
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(input_block_);
-    input_block_ = NULL;
+    input_block_ = nullptr;
     vpx_free(output_block_);
-    output_block_ = NULL;
+    output_block_ = nullptr;
     vpx_free(output_block_ref_);
-    output_block_ref_ = NULL;
+    output_block_ref_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -153,12 +130,12 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
   }
 
   void InitInput() {
-    const int max_coeff = 32766 / 4;
-    int max_energy_leftover = max_coeff * max_coeff;
+    const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4;
+    int64_t max_energy_leftover = max_coeff * max_coeff;
     for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                           (rnd_.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coeff * coeff;
+      tran_low_t coeff = static_cast<tran_low_t>(
+          sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536);
+      max_energy_leftover -= static_cast<int64_t>(coeff) * coeff;
       if (max_energy_leftover < 0) {
         max_energy_leftover = 0;
         coeff = 0;
@@ -167,6 +144,36 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
     }
   }
 
+  void PrintDiff() {
+    if (memcmp(output_block_ref_, output_block_,
+               pixel_size_ * output_block_size_)) {
+      uint16_t ref, opt;
+      for (int y = 0; y < size_; y++) {
+        for (int x = 0; x < size_; x++) {
+          if (pixel_size_ == 1) {
+            ref = output_block_ref_[y * stride_ + x];
+            opt = output_block_[y * stride_ + x];
+          } else {
+            ref = reinterpret_cast<uint16_t *>(
+                output_block_ref_)[y * stride_ + x];
+            opt = reinterpret_cast<uint16_t *>(output_block_)[y * stride_ + x];
+          }
+          if (ref != opt) {
+            printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt);
+          }
+        }
+      }
+
+      printf("\ninput_block_:\n");
+      for (int y = 0; y < size_; y++) {
+        for (int x = 0; x < size_; x++) {
+          printf("%6d,", input_block_[y * size_ + x]);
+        }
+        printf("\n");
+      }
+    }
+  }
+
  protected:
   int last_nonzero_;
   TX_SIZE tx_size_;
@@ -180,34 +187,43 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
   int output_block_size_;
   int bit_depth_;
   int mask_;
-  FwdTxfmFunc ftxfm_;
-  InvTxfmWithBdFunc full_itxfm_;
-  InvTxfmWithBdFunc partial_itxfm_;
+  FwdTxfmFunc fwd_txfm_;
+  InvTxfmWithBdFunc full_inv_txfm_;
+  InvTxfmWithBdFunc partial_inv_txfm_;
   ACMRandom rnd_;
 };
 
 TEST_P(PartialIDctTest, RunQuantCheck) {
+  const int count_test_block = (size_ != 4) ? kCountTestBlock : 65536;
   DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
   DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
 
   InitMem();
-  for (int i = 0; i < kCountTestBlock * kCountTestBlock; ++i) {
+
+  for (int i = 0; i < count_test_block; ++i) {
     // Initialize a test block with input range [-mask_, mask_].
-    if (i == 0) {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = mask_;
-      }
-    } else if (i == 1) {
-      for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = -mask_;
+    if (size_ != 4) {
+      if (i == 0) {
+        for (int k = 0; k < input_block_size_; ++k) {
+          input_extreme_block[k] = mask_;
+        }
+      } else if (i == 1) {
+        for (int k = 0; k < input_block_size_; ++k) {
+          input_extreme_block[k] = -mask_;
+        }
+      } else {
+        for (int k = 0; k < input_block_size_; ++k) {
+          input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+        }
       }
     } else {
+      // Try all possible combinations.
       for (int k = 0; k < input_block_size_; ++k) {
-        input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+        input_extreme_block[k] = (i & (1 << k)) ? mask_ : -mask_;
       }
     }
 
-    ftxfm_(input_extreme_block, output_ref_block, size_);
+    fwd_txfm_(input_extreme_block, output_ref_block, size_);
 
     // quantization with minimum allowed step sizes
     input_block_[0] = (output_ref_block[0] / 4) * 4;
@@ -217,9 +233,9 @@ TEST_P(PartialIDctTest, RunQuantCheck) {
     }
 
     ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+        full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
     ASM_REGISTER_STATE_CHECK(
-        partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+        partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
     ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
                         pixel_size_ * output_block_size_))
         << "Error: partial inverse transform produces different results";
@@ -232,9 +248,9 @@ TEST_P(PartialIDctTest, ResultsMatch) {
     InitInput();
 
     ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+        full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
     ASM_REGISTER_STATE_CHECK(
-        partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+        partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
     ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
                         pixel_size_ * output_block_size_))
         << "Error: partial inverse transform produces different results";
@@ -249,9 +265,9 @@ TEST_P(PartialIDctTest, AddOutputBlock) {
     }
 
     ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+        full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
     ASM_REGISTER_STATE_CHECK(
-        partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+        partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
     ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
                         pixel_size_ * output_block_size_))
         << "Error: Transform results are not correctly added to output.";
@@ -259,8 +275,8 @@ TEST_P(PartialIDctTest, AddOutputBlock) {
 }
 
 TEST_P(PartialIDctTest, SingleExtremeCoeff) {
-  const int16_t max_coeff = MaxSupportedCoeff(partial_itxfm_);
-  const int16_t min_coeff = MinSupportedCoeff(partial_itxfm_);
+  const int16_t max_coeff = std::numeric_limits<int16_t>::max();
+  const int16_t min_coeff = std::numeric_limits<int16_t>::min();
   for (int i = 0; i < last_nonzero_; ++i) {
     memset(input_block_, 0, sizeof(*input_block_) * input_block_size_);
     // Run once for min and once for max.
@@ -272,9 +288,9 @@ TEST_P(PartialIDctTest, SingleExtremeCoeff) {
       input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff;
 
       ASM_REGISTER_STATE_CHECK(
-          full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+          full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
       ASM_REGISTER_STATE_CHECK(
-          partial_itxfm_(input_block_, output_block_, stride_, bit_depth_));
+          partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_));
       ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
                           pixel_size_ * output_block_size_))
           << "Error: Fails with single coeff of " << coeff << " at " << i
@@ -291,26 +307,26 @@ TEST_P(PartialIDctTest, DISABLED_Speed) {
 
   for (int i = 0; i < kCountSpeedTestBlock; ++i) {
     ASM_REGISTER_STATE_CHECK(
-        full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_));
+        full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_));
   }
   vpx_usec_timer timer;
   vpx_usec_timer_start(&timer);
   for (int i = 0; i < kCountSpeedTestBlock; ++i) {
-    partial_itxfm_(input_block_, output_block_, stride_, bit_depth_);
+    partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_);
   }
   libvpx_test::ClearSystemState();
   vpx_usec_timer_mark(&timer);
   const int elapsed_time =
       static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
-  printf("idct%dx%d_%d (bitdepth %d) time: %5d ms ", size_, size_,
-         last_nonzero_, bit_depth_, elapsed_time);
-
+  printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_,
+         (pixel_size_ == 1) ? "bitdepth" : "high bitdepth", bit_depth_,
+         elapsed_time);
   ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
                       pixel_size_ * output_block_size_))
       << "Error: partial inverse transform produces different results";
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 const PartialInvTxfmParam c_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -323,6 +339,15 @@ const PartialInvTxfmParam c_partial_idct_tests[] = {
   make_tuple(
       &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
       &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, TX_32X32, 1024, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, TX_32X32, 135, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, TX_32X32, 135, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, TX_32X32, 135, 12, 2),
   make_tuple(
       &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
       &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, TX_32X32, 34, 8, 2),
@@ -350,6 +375,15 @@ const PartialInvTxfmParam c_partial_idct_tests[] = {
   make_tuple(
       &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
       &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, TX_16X16, 256, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, TX_16X16, 38, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, TX_16X16, 38, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, TX_16X16, 38, 12, 2),
   make_tuple(
       &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
       &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, TX_16X16, 10, 8, 2),
@@ -424,6 +458,8 @@ const PartialInvTxfmParam c_partial_idct_tests[] = {
              &wrapper<vpx_idct32x32_1_add_c>, TX_32X32, 1, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_256_add_c>, TX_16X16, 256, 8, 1),
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+             &wrapper<vpx_idct16x16_38_add_c>, TX_16X16, 38, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_10_add_c>, TX_16X16, 10, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
@@ -440,12 +476,89 @@ const PartialInvTxfmParam c_partial_idct_tests[] = {
              &wrapper<vpx_idct4x4_1_add_c>, TX_4X4, 1, 8, 1)
 };
 
-INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,
-                        ::testing::ValuesIn(c_partial_idct_tests));
+INSTANTIATE_TEST_SUITE_P(C, PartialIDctTest,
+                         ::testing::ValuesIn(c_partial_idct_tests));
 
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
 const PartialInvTxfmParam neon_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_neon>, TX_32X32,
+             1024, 8, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_neon>, TX_32X32,
+             1024, 10, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_neon>, TX_32X32,
+             1024, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_neon>, TX_32X32, 135, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_neon>, TX_32X32, 135, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_neon>, TX_32X32, 135, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_neon>, TX_32X32, 34, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_neon>, TX_32X32, 34, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_neon>, TX_32X32, 34, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_1_add_neon>, TX_32X32, 1, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_256_add_neon>, TX_16X16, 256, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_256_add_neon>, TX_16X16, 256, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_256_add_neon>, TX_16X16, 256, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_neon>, TX_16X16, 38, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_neon>, TX_16X16, 38, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_neon>, TX_16X16, 38, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_neon>, TX_16X16, 10, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_neon>, TX_16X16, 10, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_neon>, TX_16X16, 10, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_neon>, TX_16X16, 1, 12, 2),
   make_tuple(&vpx_highbd_fdct8x8_c,
              &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
              &highbd_wrapper<vpx_highbd_idct8x8_64_add_neon>, TX_8X8, 64, 8, 2),
@@ -488,46 +601,78 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
              &wrapper<vpx_idct32x32_1024_add_neon>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
              &wrapper<vpx_idct32x32_135_add_neon>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
              &wrapper<vpx_idct32x32_34_add_neon>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
              &wrapper<vpx_idct32x32_1_add_neon>, TX_32X32, 1, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_256_add_neon>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>,
+             &wrapper<vpx_idct16x16_38_add_neon>, TX_16X16, 38, 8, 1),
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
              &wrapper<vpx_idct16x16_10_add_neon>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
              &wrapper<vpx_idct16x16_1_add_neon>, TX_16X16, 1, 8, 1),
   make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
              &wrapper<vpx_idct8x8_64_add_neon>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
              &wrapper<vpx_idct8x8_12_add_neon>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
              &wrapper<vpx_idct8x8_1_add_neon>, TX_8X8, 1, 8, 1),
   make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
              &wrapper<vpx_idct4x4_16_add_neon>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
              &wrapper<vpx_idct4x4_1_add_neon>, TX_4X4, 1, 8, 1)
 };
 
-INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
-                        ::testing::ValuesIn(neon_partial_idct_tests));
-#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(NEON, PartialIDctTest,
+                         ::testing::ValuesIn(neon_partial_idct_tests));
+#endif  // HAVE_NEON
 
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSE2
 // 32x32_135_ is implemented using the 1024 version.
 const PartialInvTxfmParam sse2_partial_idct_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32,
+             1024, 8, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32,
+             1024, 10, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32,
+             1024, 12, 2),
   make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
       &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 2),
   make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
       &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 10, 2),
   make_tuple(
-      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>,
       &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 12, 2),
   make_tuple(
       &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
@@ -539,14 +684,32 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
       &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
       &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2),
   make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
       &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2),
   make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
       &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 10, 2),
   make_tuple(
-      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
       &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 12, 2),
   make_tuple(&vpx_highbd_fdct8x8_c,
              &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
              &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 2),
@@ -557,14 +720,20 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
       &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
       &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 12, 2),
   make_tuple(&vpx_highbd_fdct8x8_c,
-             &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
              &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 8, 2),
   make_tuple(
-      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
       &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 10, 2),
   make_tuple(
-      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
       &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 12, 2),
+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 2),
+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 10, 2),
+  make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 12, 2),
   make_tuple(&vpx_highbd_fdct4x4_c,
              &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
              &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 2),
@@ -574,119 +743,233 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
   make_tuple(
       &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
       &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 12, 2),
+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 2),
+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 10, 2),
+  make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+             &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 12, 2),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
              &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
+             &wrapper<vpx_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
              &wrapper<vpx_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
              &wrapper<vpx_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>,
+             &wrapper<vpx_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 1),
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
              &wrapper<vpx_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
              &wrapper<vpx_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 1),
   make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
              &wrapper<vpx_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
              &wrapper<vpx_idct8x8_12_add_sse2>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
              &wrapper<vpx_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 1),
   make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
              &wrapper<vpx_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
              &wrapper<vpx_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 1)
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
-                        ::testing::ValuesIn(sse2_partial_idct_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, PartialIDctTest,
+                         ::testing::ValuesIn(sse2_partial_idct_tests));
 
-#endif  // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3
 const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
              &wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
              &wrapper<vpx_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
-             &wrapper<vpx_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
              &wrapper<vpx_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1)
 };
 
-INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
-                        ::testing::ValuesIn(ssse3_partial_idct_tests));
-#endif  // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(SSSE3, PartialIDctTest,
+                         ::testing::ValuesIn(ssse3_partial_idct_tests));
+#endif  // HAVE_SSSE3
 
-#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32,
+             1024, 8, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32,
+             1024, 10, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32,
+             1024, 12, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32,
+             135, 8, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32,
+             135, 10, 2),
+  make_tuple(&vpx_highbd_fdct32x32_c,
+             &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>,
+             &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32,
+             135, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>,
+      &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 12, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 8, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 10, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+      &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+      &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+      &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 12, 2)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, PartialIDctTest,
+                         ::testing::ValuesIn(sse4_1_partial_idct_tests));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
 const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
              &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
              &wrapper<vpx_idct32x32_34_add_dspr2>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
              &wrapper<vpx_idct32x32_1_add_dspr2>, TX_32X32, 1, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_256_add_dspr2>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
              &wrapper<vpx_idct16x16_10_add_dspr2>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
              &wrapper<vpx_idct16x16_1_add_dspr2>, TX_16X16, 1, 8, 1),
   make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
              &wrapper<vpx_idct8x8_64_add_dspr2>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
              &wrapper<vpx_idct8x8_12_add_dspr2>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
              &wrapper<vpx_idct8x8_1_add_dspr2>, TX_8X8, 1, 8, 1),
   make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
              &wrapper<vpx_idct4x4_16_add_dspr2>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
              &wrapper<vpx_idct4x4_1_add_dspr2>, TX_4X4, 1, 8, 1)
 };
 
-INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
-                        ::testing::ValuesIn(dspr2_partial_idct_tests));
-#endif  // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(DSPR2, PartialIDctTest,
+                         ::testing::ValuesIn(dspr2_partial_idct_tests));
+#endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
 // 32x32_135_ is implemented using the 1024 version.
 const PartialInvTxfmParam msa_partial_idct_tests[] = {
   make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
              &wrapper<vpx_idct32x32_1024_add_msa>, TX_32X32, 1024, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
-             &wrapper<vpx_idct32x32_1024_add_msa>, TX_32X32, 135, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
              &wrapper<vpx_idct32x32_34_add_msa>, TX_32X32, 34, 8, 1),
-  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
              &wrapper<vpx_idct32x32_1_add_msa>, TX_32X32, 1, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_256_add_msa>, TX_16X16, 256, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
              &wrapper<vpx_idct16x16_10_add_msa>, TX_16X16, 10, 8, 1),
-  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
              &wrapper<vpx_idct16x16_1_add_msa>, TX_16X16, 1, 8, 1),
   make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
              &wrapper<vpx_idct8x8_64_add_msa>, TX_8X8, 64, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>,
              &wrapper<vpx_idct8x8_12_add_msa>, TX_8X8, 12, 8, 1),
-  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>,
+  make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_1_add_c>,
              &wrapper<vpx_idct8x8_1_add_msa>, TX_8X8, 1, 8, 1),
   make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
              &wrapper<vpx_idct4x4_16_add_msa>, TX_4X4, 16, 8, 1),
-  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_16_add_c>,
+  make_tuple(&vpx_fdct4x4_c, &wrapper<vpx_idct4x4_1_add_c>,
              &wrapper<vpx_idct4x4_1_add_msa>, TX_4X4, 1, 8, 1)
 };
 
-INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest,
-                        ::testing::ValuesIn(msa_partial_idct_tests));
-#endif  // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest,
+                         ::testing::ValuesIn(msa_partial_idct_tests));
+#endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam lsx_partial_idct_tests[] = {
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+             &wrapper<vpx_idct32x32_1024_add_lsx>, TX_32X32, 1024, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
+             &wrapper<vpx_idct32x32_34_add_lsx>, TX_32X32, 34, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
+             &wrapper<vpx_idct32x32_1_add_lsx>, TX_32X32, 1, 8, 1),
+};
+
+INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest,
+                         ::testing::ValuesIn(lsx_partial_idct_tests));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // !CONFIG_EMULATE_HARDWARE
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/postproc.sh b/media/libvpx/libvpx/test/postproc.sh
index 939a3e7620..91ca9b26fe 100644
--- a/media/libvpx/libvpx/test/postproc.sh
+++ b/media/libvpx/libvpx/test/postproc.sh
@@ -38,7 +38,7 @@ postproc() {
   fi
 
   eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libvpx/libvpx/test/pp_filter_test.cc b/media/libvpx/libvpx/test/pp_filter_test.cc
index 4b4795accf..bc0566a4f2 100644
--- a/media/libvpx/libvpx/test/pp_filter_test.cc
+++ b/media/libvpx/libvpx/test/pp_filter_test.cc
@@ -7,30 +7,36 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include <limits.h>
+
+#include <memory>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "gtest/gtest.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
 
-typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
+using VpxPostProcDownAndAcrossMbRowFunc = void (*)(
     unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
     int dst_pixels_per_line, int cols, unsigned char *flimit, int size);
 
-typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch,
-                                          int rows, int cols, int flimit);
+using VpxMbPostProcAcrossIpFunc = void (*)(unsigned char *src, int pitch,
+                                           int rows, int cols, int flimit);
 
-typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows,
-                                      int cols, int flimit);
+using VpxMbPostProcDownFunc = void (*)(unsigned char *dst, int pitch, int rows,
+                                       int cols, int flimit);
 
 namespace {
-
 // Compute the filter level used in post proc from the loop filter strength
 int q2mbl(int x) {
   if (x < 20) x = 20;
@@ -40,184 +46,192 @@ int q2mbl(int x) {
 }
 
 class VpxPostProcDownAndAcrossMbRowTest
-    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  VpxPostProcDownAndAcrossMbRowTest()
+      : mb_post_proc_down_and_across_(GetParam()) {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void Run() override;
+
+  const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_;
+  // Size of the underlying data block that will be filtered.
+  int block_width_;
+  int block_height_;
+  Buffer<uint8_t> *src_image_;
+  Buffer<uint8_t> *dst_image_;
+  uint8_t *flimits_;
 };
 
+void VpxPostProcDownAndAcrossMbRowTest::Run() {
+  mb_post_proc_down_and_across_(
+      src_image_->TopLeftPixel(), dst_image_->TopLeftPixel(),
+      src_image_->stride(), dst_image_->stride(), block_width_, flimits_, 16);
+}
+
 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
-  const int block_width = 16;
-  const int block_height = 16;
+  block_width_ = 16;
+  block_height_ = 16;
 
   // 5-tap filter needs 2 padding rows above and below the block in the input.
-  const int input_width = block_width;
-  const int input_height = block_height + 4;
-  const int input_stride = input_width;
-  const int input_size = input_width * input_height;
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width_, block_height_, 2);
+  ASSERT_TRUE(src_image.Init());
 
   // Filter extends output block by 8 samples at left and right edges.
-  const int output_width = block_width + 16;
-  const int output_height = block_height;
-  const int output_stride = output_width;
-  const int output_size = output_width * output_height;
-
-  uint8_t *const src_image = new uint8_t[input_size];
-  ASSERT_TRUE(src_image != NULL);
-
   // Though the left padding is only 8 bytes, the assembly code tries to
   // read 16 bytes before the pointer.
-  uint8_t *const dst_image = new uint8_t[output_size + 8];
-  ASSERT_TRUE(dst_image != NULL);
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width_, block_height_, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());
 
-  // Pointers to top-left pixel of block in the input and output images.
-  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
-
-  // The assembly works in increments of 16. The first read may be offset by
-  // this amount.
-  uint8_t *const dst_image_ptr = dst_image + 16;
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
-  (void)memset(flimits, 255, block_width);
+  flimits_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width_));
+  (void)memset(flimits_, 255, block_width_);
 
   // Initialize pixels in the input:
   //   block pixels to value 1,
   //   border pixels to value 10.
-  (void)memset(src_image, 10, input_size);
-  uint8_t *pixel_ptr = src_image_ptr;
-  for (int i = 0; i < block_height; ++i) {
-    for (int j = 0; j < block_width; ++j) {
-      pixel_ptr[j] = 1;
-    }
-    pixel_ptr += input_stride;
-  }
+  src_image.SetPadding(10);
+  src_image.Set(1);
 
   // Initialize pixels in the output to 99.
-  (void)memset(dst_image, 99, output_size);
+  dst_image.Set(99);
 
-  ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr,
-                                      input_stride, output_stride, block_width,
-                                      flimits, 16));
+  ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_(
+      src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(),
+      dst_image.stride(), block_width_, flimits_, 16));
 
-  static const uint8_t kExpectedOutput[block_height] = {
-    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
-  };
+  static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 3, 4 };
 
-  pixel_ptr = dst_image_ptr;
-  for (int i = 0; i < block_height; ++i) {
-    for (int j = 0; j < block_width; ++j) {
-      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j
-                                                  << ")";
+  uint8_t *pixel_ptr = dst_image.TopLeftPixel();
+  for (int i = 0; i < block_height_; ++i) {
+    for (int j = 0; j < block_width_; ++j) {
+      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j])
+          << "at (" << i << ", " << j << ")";
     }
-    pixel_ptr += output_stride;
+    pixel_ptr += dst_image.stride();
   }
 
-  delete[] src_image;
-  delete[] dst_image;
-  vpx_free(flimits);
-};
+  vpx_free(flimits_);
+}
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // Size of the underlying data block that will be filtered.
   // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V
   // blocks are always a multiple of 8 wide and exactly 8 high.
-  const int block_width = 136;
-  const int block_height = 16;
+  block_width_ = 136;
+  block_height_ = 16;
 
   // 5-tap filter needs 2 padding rows above and below the block in the input.
   // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
-  const int input_width = block_width;
-  const int input_height = block_height + 4 + 8;
-  const int input_stride = input_width;
-  const int input_size = input_stride * input_height;
+  Buffer<uint8_t> src_image =
+      Buffer<uint8_t>(block_width_, block_height_, 2, 2, 10, 2);
+  ASSERT_TRUE(src_image.Init());
 
   // Filter extends output block by 8 samples at left and right edges.
+  // Though the left padding is only 8 bytes, there is 'above' padding as well
+  // so when the assembly code tries to read 16 bytes before the pointer it is
+  // not a problem.
   // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
-  const int output_width = block_width + 24;
-  const int output_height = block_height;
-  const int output_stride = output_width;
-  const int output_size = output_stride * output_height;
-
-  uint8_t *const src_image = new uint8_t[input_size];
-  ASSERT_TRUE(src_image != NULL);
-
-  // Though the left padding is only 8 bytes, the assembly code tries to
-  // read 16 bytes before the pointer.
-  uint8_t *const dst_image = new uint8_t[output_size + 8];
-  ASSERT_TRUE(dst_image != NULL);
-  uint8_t *const dst_image_ref = new uint8_t[output_size + 8];
-  ASSERT_TRUE(dst_image_ref != NULL);
-
-  // Pointers to top-left pixel of block in the input and output images.
-  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
-
-  // The assembly works in increments of 16. The first read may be offset by
-  // this amount.
-  uint8_t *const dst_image_ptr = dst_image + 16;
-  uint8_t *const dst_image_ref_ptr = dst_image + 16;
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width_, block_height_, 8, 8, 16, 8);
+  ASSERT_TRUE(dst_image.Init());
+  Buffer<uint8_t> dst_image_ref =
+      Buffer<uint8_t>(block_width_, block_height_, 8);
+  ASSERT_TRUE(dst_image_ref.Init());
 
   // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
   // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
   // it must be padded out.
-  const int flimits_width = block_width % 16 ? block_width + 8 : block_width;
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
+  const int flimits_width = block_width_ % 16 ? block_width_ + 8 : block_width_;
+  flimits_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
 
   ACMRandom rnd;
   rnd.Reset(ACMRandom::DeterministicSeed());
   // Initialize pixels in the input:
   //   block pixels to random values.
   //   border pixels to value 10.
-  (void)memset(src_image, 10, input_size);
-  uint8_t *pixel_ptr = src_image_ptr;
-  for (int i = 0; i < block_height; ++i) {
-    for (int j = 0; j < block_width; ++j) {
-      pixel_ptr[j] = rnd.Rand8();
-    }
-    pixel_ptr += input_stride;
-  }
+  src_image.SetPadding(10);
+  src_image.Set(&rnd, &ACMRandom::Rand8);
 
-  for (int blocks = 0; blocks < block_width; blocks += 8) {
-    (void)memset(flimits, 0, sizeof(*flimits) * flimits_width);
+  for (int blocks = 0; blocks < block_width_; blocks += 8) {
+    (void)memset(flimits_, 0, sizeof(*flimits_) * flimits_width);
 
     for (int f = 0; f < 255; f++) {
-      (void)memset(flimits + blocks, f, sizeof(*flimits) * 8);
-
-      (void)memset(dst_image, 0, output_size);
-      (void)memset(dst_image_ref, 0, output_size);
+      (void)memset(flimits_ + blocks, f, sizeof(*flimits_) * 8);
+      dst_image.Set(0);
+      dst_image_ref.Set(0);
 
       vpx_post_proc_down_and_across_mb_row_c(
-          src_image_ptr, dst_image_ref_ptr, input_stride, output_stride,
-          block_width, flimits, block_height);
-      ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr,
-                                          input_stride, output_stride,
-                                          block_width, flimits, 16));
+          src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(),
+          src_image.stride(), dst_image_ref.stride(), block_width_, flimits_,
+          block_height_);
+      ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_(
+          src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
+          src_image.stride(), dst_image.stride(), block_width_, flimits_,
+          block_height_));
 
-      for (int i = 0; i < block_height; ++i) {
-        for (int j = 0; j < block_width; ++j) {
-          ASSERT_EQ(dst_image_ref_ptr[j + i * output_stride],
-                    dst_image_ptr[j + i * output_stride])
-              << "at (" << i << ", " << j << ")";
-        }
-      }
+      ASSERT_TRUE(dst_image.CheckValues(dst_image_ref));
     }
   }
 
-  delete[] src_image;
-  delete[] dst_image;
-  delete[] dst_image_ref;
-  vpx_free(flimits);
+  vpx_free(flimits_);
+}
+
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) {
+  // Size of the underlying data block that will be filtered.
+  block_width_ = 16;
+  block_height_ = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width_, block_height_, 2);
+  ASSERT_TRUE(src_image.Init());
+  this->src_image_ = &src_image;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width_, block_height_, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());
+  this->dst_image_ = &dst_image;
+
+  flimits_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width_));
+  (void)memset(flimits_, 255, block_width_);
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  src_image.SetPadding(10);
+  src_image.Set(1);
+
+  // Initialize pixels in the output to 99.
+  dst_image.Set(99);
+
+  RunNTimes(INT16_MAX);
+  PrintMedian("16x16");
+
+  vpx_free(flimits_);
 }
 
 class VpxMbPostProcAcrossIpTest
-    : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  VpxMbPostProcAcrossIpTest()
+      : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()),
+        src_(Buffer<uint8_t>(rows_, cols_, 8, 8, 17, 8)) {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
+  void Run() override;
+
   void SetCols(unsigned char *s, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
       for (int c = 0; c < cols; c++) {
@@ -231,8 +245,8 @@ class VpxMbPostProcAcrossIpTest
                      int rows, int cols, int src_pitch) {
     for (int r = 0; r < rows; r++) {
       for (int c = 0; c < cols; c++) {
-        ASSERT_EQ(expected_output[c], src_c[c]) << "at (" << r << ", " << c
-                                                << ")";
+        ASSERT_EQ(expected_output[c], src_c[c])
+            << "at (" << r << ", " << c << ")";
       }
       src_c += src_pitch;
     }
@@ -244,159 +258,113 @@ class VpxMbPostProcAcrossIpTest
         GetParam()(s, src_width, rows, cols, filter_level));
     RunComparison(expected_output, s, rows, cols, src_width);
   }
+
+  const int rows_;
+  const int cols_;
+  const VpxMbPostProcAcrossIpFunc mb_post_proc_across_ip_;
+  Buffer<uint8_t> src_;
 };
 
+void VpxMbPostProcAcrossIpTest::Run() {
+  mb_post_proc_across_ip_(src_.TopLeftPixel(), src_.stride(), rows_, cols_,
+                          q2mbl(0));
+}
+
 TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_left_padding = 8;
-  const int src_right_padding = 17;
-  const int src_width = cols + src_left_padding + src_right_padding;
-  const int src_size = rows * src_width;
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  unsigned char *const src = new unsigned char[src_size];
-  ASSERT_TRUE(src != NULL);
-  memset(src, 10, src_size);
-  unsigned char *const s = src + src_left_padding;
-  SetCols(s, rows, cols, src_width);
+  Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols_, rows_, 0);
+  ASSERT_TRUE(expected_output.Init());
+  SetCols(expected_output.TopLeftPixel(), rows_, cols_,
+          expected_output.stride());
 
-  unsigned char *expected_output = new unsigned char[rows * cols];
-  ASSERT_TRUE(expected_output != NULL);
-  SetCols(expected_output, rows, cols, cols);
-
-  RunFilterLevel(s, rows, cols, src_width, q2mbl(0), expected_output);
-  delete[] src;
-  delete[] expected_output;
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(0),
+                 expected_output.TopLeftPixel());
 }
 
 TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_left_padding = 8;
-  const int src_right_padding = 17;
-  const int src_width = cols + src_left_padding + src_right_padding;
-  const int src_size = rows * src_width;
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  unsigned char *const src = new unsigned char[src_size];
-  ASSERT_TRUE(src != NULL);
-  memset(src, 10, src_size);
-  unsigned char *const s = src + src_left_padding;
-
-  SetCols(s, rows, cols, src_width);
-  static const unsigned char kExpectedOutput[cols] = {
+  static const unsigned char kExpectedOutput[] = {
     2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13
   };
 
-  RunFilterLevel(s, rows, cols, src_width, q2mbl(70), kExpectedOutput);
-
-  delete[] src;
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(70),
+                 kExpectedOutput);
 }
 
 TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_left_padding = 8;
-  const int src_right_padding = 17;
-  const int src_width = cols + src_left_padding + src_right_padding;
-  const int src_size = rows * src_width;
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  unsigned char *const src = new unsigned char[src_size];
-  ASSERT_TRUE(src != NULL);
-  unsigned char *const s = src + src_left_padding;
-
-  memset(src, 10, src_size);
-  SetCols(s, rows, cols, src_width);
-  static const unsigned char kExpectedOutput[cols] = {
+  static const unsigned char kExpectedOutput[] = {
     2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13
   };
 
-  RunFilterLevel(s, rows, cols, src_width, INT_MAX, kExpectedOutput);
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), INT_MAX,
+                 kExpectedOutput);
 
-  memset(src, 10, src_size);
-  SetCols(s, rows, cols, src_width);
-  RunFilterLevel(s, rows, cols, src_width, q2mbl(100), kExpectedOutput);
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
 
-  delete[] src;
+  RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(100),
+                 kExpectedOutput);
 }
 
 TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_left_padding = 8;
-  const int src_right_padding = 17;
-  const int src_width = cols + src_left_padding + src_right_padding;
-  const int src_size = rows * src_width;
-
-  unsigned char *const c_mem = new unsigned char[src_size];
-  unsigned char *const asm_mem = new unsigned char[src_size];
-  ASSERT_TRUE(c_mem != NULL);
-  ASSERT_TRUE(asm_mem != NULL);
-  unsigned char *const src_c = c_mem + src_left_padding;
-  unsigned char *const src_asm = asm_mem + src_left_padding;
+  Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols_, rows_, 8, 8, 17, 8);
+  ASSERT_TRUE(c_mem.Init());
+  Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols_, rows_, 8, 8, 17, 8);
+  ASSERT_TRUE(asm_mem.Init());
 
   // When level >= 100, the filter behaves the same as the level = INT_MAX
   // When level < 20, it behaves the same as the level = 0
   for (int level = 0; level < 100; level++) {
-    memset(c_mem, 10, src_size);
-    memset(asm_mem, 10, src_size);
-    SetCols(src_c, rows, cols, src_width);
-    SetCols(src_asm, rows, cols, src_width);
+    c_mem.SetPadding(10);
+    asm_mem.SetPadding(10);
+    SetCols(c_mem.TopLeftPixel(), rows_, cols_, c_mem.stride());
+    SetCols(asm_mem.TopLeftPixel(), rows_, cols_, asm_mem.stride());
 
-    vpx_mbpost_proc_across_ip_c(src_c, src_width, rows, cols, q2mbl(level));
-    ASM_REGISTER_STATE_CHECK(
-        GetParam()(src_asm, src_width, rows, cols, q2mbl(level)));
+    vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows_,
+                                cols_, q2mbl(level));
+    ASM_REGISTER_STATE_CHECK(GetParam()(
+        asm_mem.TopLeftPixel(), asm_mem.stride(), rows_, cols_, q2mbl(level)));
 
-    RunComparison(src_c, src_asm, rows, cols, src_width);
+    ASSERT_TRUE(asm_mem.CheckValues(c_mem));
   }
+}
 
-  delete[] c_mem;
-  delete[] asm_mem;
+TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) {
+  ASSERT_TRUE(src_.Init());
+  src_.SetPadding(10);
+
+  SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride());
+
+  RunNTimes(100000);
+  PrintMedian("16x16");
 }
 
 class VpxMbPostProcDownTest
-    : public ::testing::TestWithParam<VpxMbPostProcDownFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxMbPostProcDownFunc> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  VpxMbPostProcDownTest()
+      : rows_(16), cols_(16), mb_post_proc_down_(GetParam()),
+        src_c_(Buffer<uint8_t>(rows_, cols_, 8, 8, 8, 17)) {}
+
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
-  void SetRows(unsigned char *src_c, int rows, int cols) {
+  void Run() override;
+
+  void SetRows(unsigned char *src_c, int rows, int cols, int src_width) {
     for (int r = 0; r < rows; r++) {
       memset(src_c, r, cols);
-      src_c += cols;
-    }
-  }
-
-  void SetRandom(unsigned char *src_c, unsigned char *src_asm, int rows,
-                 int cols, int src_pitch) {
-    ACMRandom rnd;
-    rnd.Reset(ACMRandom::DeterministicSeed());
-
-    // Add some random noise to the input
-    for (int r = 0; r < rows; r++) {
-      for (int c = 0; c < cols; c++) {
-        const int noise = rnd(4);
-        src_c[c] = r + noise;
-        src_asm[c] = r + noise;
-      }
-      src_c += src_pitch;
-      src_asm += src_pitch;
-    }
-  }
-
-  void SetRandomSaturation(unsigned char *src_c, unsigned char *src_asm,
-                           int rows, int cols, int src_pitch) {
-    ACMRandom rnd;
-    rnd.Reset(ACMRandom::DeterministicSeed());
-
-    // Add some random noise to the input
-    for (int r = 0; r < rows; r++) {
-      for (int c = 0; c < cols; c++) {
-        const int noise = 3 * rnd(2);
-        src_c[c] = r + noise;
-        src_asm[c] = r + noise;
-      }
-      src_c += src_pitch;
-      src_asm += src_pitch;
+      src_c += src_width;
     }
   }
 
@@ -404,48 +372,38 @@ class VpxMbPostProcDownTest
                      int rows, int cols, int src_pitch) {
     for (int r = 0; r < rows; r++) {
       for (int c = 0; c < cols; c++) {
-        ASSERT_EQ(expected_output[r * rows + c], src_c[c]) << "at (" << r
-                                                           << ", " << c << ")";
+        ASSERT_EQ(expected_output[r * rows + c], src_c[c])
+            << "at (" << r << ", " << c << ")";
       }
       src_c += src_pitch;
     }
   }
 
-  void RunComparison(unsigned char *src_c, unsigned char *src_asm, int rows,
-                     int cols, int src_pitch) {
-    for (int r = 0; r < rows; r++) {
-      for (int c = 0; c < cols; c++) {
-        ASSERT_EQ(src_c[c], src_asm[c]) << "at (" << r << ", " << c << ")";
-      }
-      src_c += src_pitch;
-      src_asm += src_pitch;
-    }
-  }
-
   void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width,
                       int filter_level, const unsigned char *expected_output) {
     ASM_REGISTER_STATE_CHECK(
-        GetParam()(s, src_width, rows, cols, filter_level));
+        mb_post_proc_down_(s, src_width, rows, cols, filter_level));
     RunComparison(expected_output, s, rows, cols, src_width);
   }
+
+  const int rows_;
+  const int cols_;
+  const VpxMbPostProcDownFunc mb_post_proc_down_;
+  Buffer<uint8_t> src_c_;
 };
 
+void VpxMbPostProcDownTest::Run() {
+  mb_post_proc_down_(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_,
+                     q2mbl(0));
+}
+
 TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_pitch = cols;
-  const int src_top_padding = 8;
-  const int src_bottom_padding = 17;
+  ASSERT_TRUE(src_c_.Init());
+  src_c_.SetPadding(10);
 
-  const int src_size = cols * (rows + src_top_padding + src_bottom_padding);
-  unsigned char *const c_mem = new unsigned char[src_size];
-  ASSERT_TRUE(c_mem != NULL);
-  memset(c_mem, 10, src_size);
-  unsigned char *const src_c = c_mem + src_top_padding * src_pitch;
+  SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
 
-  SetRows(src_c, rows, cols);
-
-  static const unsigned char kExpectedOutput[rows * cols] = {
+  static const unsigned char kExpectedOutput[] = {
     2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  2,
     2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  3,  3,  3,  3,  3,  3,
     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  3,  4,  4,  3,  3,  3,
@@ -462,31 +420,22 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) {
     13, 13, 13, 13, 14, 13, 13, 13, 13
   };
 
-  RunFilterLevel(src_c, rows, cols, src_pitch, INT_MAX, kExpectedOutput);
+  RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), INT_MAX,
+                 kExpectedOutput);
 
-  memset(c_mem, 10, src_size);
-  SetRows(src_c, rows, cols);
-  RunFilterLevel(src_c, rows, cols, src_pitch, q2mbl(100), kExpectedOutput);
-
-  delete[] c_mem;
+  src_c_.SetPadding(10);
+  SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
+  RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(),
+                 q2mbl(100), kExpectedOutput);
 }
 
 TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_pitch = cols;
-  const int src_top_padding = 8;
-  const int src_bottom_padding = 17;
+  ASSERT_TRUE(src_c_.Init());
+  src_c_.SetPadding(10);
 
-  const int src_size = cols * (rows + src_top_padding + src_bottom_padding);
-  unsigned char *const c_mem = new unsigned char[src_size];
-  ASSERT_TRUE(c_mem != NULL);
-  memset(c_mem, 10, src_size);
-  unsigned char *const src_c = c_mem + src_top_padding * src_pitch;
+  SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
 
-  SetRows(src_c, rows, cols);
-
-  static const unsigned char kExpectedOutput[rows * cols] = {
+  static const unsigned char kExpectedOutput[] = {
     2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  2,
     2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,
     2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,
@@ -503,113 +452,124 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) {
     13, 13, 13, 13, 14, 13, 13, 13, 13
   };
 
-  RunFilterLevel(src_c, rows, cols, src_pitch, q2mbl(70), kExpectedOutput);
-
-  delete[] c_mem;
+  RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(),
+                 q2mbl(70), kExpectedOutput);
 }
 
 TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_pitch = cols;
-  const int src_top_padding = 8;
-  const int src_bottom_padding = 17;
+  ASSERT_TRUE(src_c_.Init());
+  src_c_.SetPadding(10);
 
-  const int src_size = cols * (rows + src_top_padding + src_bottom_padding);
-  unsigned char *const c_mem = new unsigned char[src_size];
-  ASSERT_TRUE(c_mem != NULL);
-  memset(c_mem, 10, src_size);
-  unsigned char *const src_c = c_mem + src_top_padding * src_pitch;
+  SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
 
-  SetRows(src_c, rows, cols);
+  std::unique_ptr<unsigned char[]> expected_output(
+      new unsigned char[rows_ * cols_]);
+  ASSERT_NE(expected_output, nullptr);
+  SetRows(expected_output.get(), rows_, cols_, cols_);
 
-  unsigned char *expected_output = new unsigned char[rows * cols];
-  ASSERT_TRUE(expected_output != NULL);
-  SetRows(expected_output, rows, cols);
-
-  RunFilterLevel(src_c, rows, cols, src_pitch, q2mbl(0), expected_output);
-
-  delete[] c_mem;
-  delete[] expected_output;
+  RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0),
+                 expected_output.get());
 }
 
 TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
-  const int rows = 16;
-  const int cols = 16;
-  const int src_pitch = cols;
-  const int src_top_padding = 8;
-  const int src_bottom_padding = 17;
-  const int src_size = cols * (rows + src_top_padding + src_bottom_padding);
-  unsigned char *const c_mem = new unsigned char[src_size];
-  unsigned char *const asm_mem = new unsigned char[src_size];
-  ASSERT_TRUE(c_mem != NULL);
-  ASSERT_TRUE(asm_mem != NULL);
-  unsigned char *const src_c = c_mem + src_top_padding * src_pitch;
-  unsigned char *const src_asm = asm_mem + src_top_padding * src_pitch;
+  ACMRandom rnd;
+  rnd.Reset(ACMRandom::DeterministicSeed());
+
+  ASSERT_TRUE(src_c_.Init());
+  Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols_, rows_, 8, 8, 8, 17);
+  ASSERT_TRUE(src_asm.Init());
 
   for (int level = 0; level < 100; level++) {
-    memset(c_mem, 10, src_size);
-    memset(asm_mem, 10, src_size);
-    SetRandom(src_c, src_asm, rows, cols, src_pitch);
-    vpx_mbpost_proc_down_c(src_c, src_pitch, rows, cols, q2mbl(level));
-    ASM_REGISTER_STATE_CHECK(
-        GetParam()(src_asm, src_pitch, rows, cols, q2mbl(level)));
-    RunComparison(src_c, src_asm, rows, cols, src_pitch);
+    src_c_.SetPadding(10);
+    src_asm.SetPadding(10);
+    src_c_.Set(&rnd, &ACMRandom::Rand8);
+    src_asm.CopyFrom(src_c_);
 
-    memset(c_mem, 10, src_size);
-    memset(asm_mem, 10, src_size);
-    SetRandomSaturation(src_c, src_asm, rows, cols, src_pitch);
-    vpx_mbpost_proc_down_c(src_c, src_pitch, rows, cols, q2mbl(level));
-    ASM_REGISTER_STATE_CHECK(
-        GetParam()(src_asm, src_pitch, rows, cols, q2mbl(level)));
-    RunComparison(src_c, src_asm, rows, cols, src_pitch);
+    vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_,
+                           q2mbl(level));
+    ASM_REGISTER_STATE_CHECK(mb_post_proc_down_(
+        src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level)));
+    ASSERT_TRUE(src_asm.CheckValues(src_c_));
+
+    src_c_.SetPadding(10);
+    src_asm.SetPadding(10);
+    src_c_.Set(&rnd, &ACMRandom::Rand8Extremes);
+    src_asm.CopyFrom(src_c_);
+
+    vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_,
+                           q2mbl(level));
+    ASM_REGISTER_STATE_CHECK(mb_post_proc_down_(
+        src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level)));
+    ASSERT_TRUE(src_asm.CheckValues(src_c_));
   }
-
-  delete[] c_mem;
-  delete[] asm_mem;
 }
 
-INSTANTIATE_TEST_CASE_P(
+TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) {
+  ASSERT_TRUE(src_c_.Init());
+  src_c_.SetPadding(10);
+
+  SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
+
+  RunNTimes(100000);
+  PrintMedian("16x16");
+}
+
+INSTANTIATE_TEST_SUITE_P(
     C, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));
 
-INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest,
-                        ::testing::Values(vpx_mbpost_proc_across_ip_c));
+INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcAcrossIpTest,
+                         ::testing::Values(vpx_mbpost_proc_across_ip_c));
 
-INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest,
-                        ::testing::Values(vpx_mbpost_proc_down_c));
+INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcDownTest,
+                         ::testing::Values(vpx_mbpost_proc_down_c));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
 
-INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest,
-                        ::testing::Values(vpx_mbpost_proc_across_ip_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcAcrossIpTest,
+                         ::testing::Values(vpx_mbpost_proc_across_ip_sse2));
 
-INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest,
-                        ::testing::Values(vpx_mbpost_proc_down_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcDownTest,
+                         ::testing::Values(vpx_mbpost_proc_down_sse2));
 #endif  // HAVE_SSE2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon));
 
-INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest,
-                        ::testing::Values(vpx_mbpost_proc_across_ip_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcAcrossIpTest,
+                         ::testing::Values(vpx_mbpost_proc_across_ip_neon));
+
+INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcDownTest,
+                         ::testing::Values(vpx_mbpost_proc_down_neon));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
 
-INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest,
-                        ::testing::Values(vpx_mbpost_proc_across_ip_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcAcrossIpTest,
+                         ::testing::Values(vpx_mbpost_proc_across_ip_msa));
 
-INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest,
-                        ::testing::Values(vpx_mbpost_proc_down_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcDownTest,
+                         ::testing::Values(vpx_mbpost_proc_down_msa));
 #endif  // HAVE_MSA
 
+#if HAVE_VSX
+INSTANTIATE_TEST_SUITE_P(
+    VSX, VpxPostProcDownAndAcrossMbRowTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx));
+
+INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcAcrossIpTest,
+                         ::testing::Values(vpx_mbpost_proc_across_ip_vsx));
+
+INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcDownTest,
+                         ::testing::Values(vpx_mbpost_proc_down_vsx));
+#endif  // HAVE_VSX
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/predict_test.cc b/media/libvpx/libvpx/test/predict_test.cc
index a6e2b3cf32..cc290bf50c 100644
--- a/media/libvpx/libvpx/test/predict_test.cc
+++ b/media/libvpx/libvpx/test/predict_test.cc
@@ -8,14 +8,17 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp8_rtcd.h"
 #include "./vpx_config.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -25,23 +28,24 @@
 namespace {
 
 using libvpx_test::ACMRandom;
-using std::tr1::make_tuple;
+using std::make_tuple;
 
-typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
-                            int xoffset, int yoffset, uint8_t *dst_ptr,
-                            int dst_pitch);
+using PredictFunc = void (*)(uint8_t *src_ptr, int src_pixels_per_line,
+                             int xoffset, int yoffset, uint8_t *dst_ptr,
+                             int dst_pitch);
 
-typedef std::tr1::tuple<int, int, PredictFunc> PredictParam;
+using PredictParam = std::tuple<int, int, PredictFunc>;
 
-class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
+class PredictTestBase : public AbstractBench,
+                        public ::testing::TestWithParam<PredictParam> {
  public:
   PredictTestBase()
       : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
-        src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {}
+        src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     src_ = new uint8_t[kSrcSize];
-    ASSERT_TRUE(src_ != NULL);
+    ASSERT_NE(src_, nullptr);
 
     // padded_dst_ provides a buffer of kBorderSize around the destination
     // memory to facilitate detecting out of bounds writes.
@@ -49,25 +53,25 @@ class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
     padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize);
     padded_dst_ =
         reinterpret_cast<uint8_t *>(vpx_memalign(16, padded_dst_size_));
-    ASSERT_TRUE(padded_dst_ != NULL);
+    ASSERT_NE(padded_dst_, nullptr);
     dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize;
 
     dst_c_ = new uint8_t[16 * 16];
-    ASSERT_TRUE(dst_c_ != NULL);
+    ASSERT_NE(dst_c_, nullptr);
 
     memset(src_, 0, kSrcSize);
     memset(padded_dst_, 128, padded_dst_size_);
     memset(dst_c_, 0, 16 * 16);
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     delete[] src_;
-    src_ = NULL;
+    src_ = nullptr;
     vpx_free(padded_dst_);
-    padded_dst_ = NULL;
-    dst_ = NULL;
+    padded_dst_ = nullptr;
+    dst_ = nullptr;
     delete[] dst_c_;
-    dst_c_ = NULL;
+    dst_c_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -204,7 +208,20 @@ class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
       }
     }
   }
-};
+
+  void Run() override {
+    for (int xoffset = 0; xoffset < 8; ++xoffset) {
+      for (int yoffset = 0; yoffset < 8; ++yoffset) {
+        if (xoffset == 0 && yoffset == 0) {
+          continue;
+        }
+
+        predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_,
+                 dst_stride_);
+      }
+    }
+  }
+};  // namespace
 
 class SixtapPredictTest : public PredictTestBase {};
 
@@ -281,14 +298,14 @@ TEST_P(SixtapPredictTest, TestWithPresetData) {
       CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, SixtapPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
                       make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
                       make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
                       make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, SixtapPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
                       make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
@@ -296,19 +313,19 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(4, 4, &vp8_sixtap_predict4x4_neon)));
 #endif
 #if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MMX, SixtapPredictTest,
     ::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
 #endif
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, SixtapPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
                       make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
                       make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
 #endif
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, SixtapPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
                       make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
@@ -316,7 +333,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
 #endif
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, SixtapPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
                       make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
@@ -324,6 +341,23 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
 #endif
 
+#if HAVE_MMI
+INSTANTIATE_TEST_SUITE_P(
+    MMI, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi),
+                      make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi)));
+#endif
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, SixtapPredictTest,
+    ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx),
+                      make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx),
+                      make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx)));
+#endif
+
 class BilinearPredictTest : public PredictTestBase {};
 
 TEST_P(BilinearPredictTest, TestWithRandomData) {
@@ -332,41 +366,45 @@ TEST_P(BilinearPredictTest, TestWithRandomData) {
 TEST_P(BilinearPredictTest, TestWithUnalignedDst) {
   TestWithUnalignedDst(vp8_bilinear_predict16x16_c);
 }
+TEST_P(BilinearPredictTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 5000000 / (width_ * height_);
+  RunNTimes(kCountSpeedTestBlock);
 
-INSTANTIATE_TEST_CASE_P(
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", width_, height_);
+  PrintMedian(title);
+}
+
+INSTANTIATE_TEST_SUITE_P(
     C, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c),
                       make_tuple(8, 8, &vp8_bilinear_predict8x8_c),
                       make_tuple(8, 4, &vp8_bilinear_predict8x4_c),
                       make_tuple(4, 4, &vp8_bilinear_predict4x4_c)));
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon),
                       make_tuple(8, 8, &vp8_bilinear_predict8x8_neon),
                       make_tuple(8, 4, &vp8_bilinear_predict8x4_neon),
                       make_tuple(4, 4, &vp8_bilinear_predict4x4_neon)));
 #endif
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
-    MMX, BilinearPredictTest,
-    ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
-                      make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
-#endif
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
-                      make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2)));
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2),
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2)));
 #endif
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3),
                       make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3)));
 #endif
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa),
                       make_tuple(8, 8, &vp8_bilinear_predict8x8_msa),
diff --git a/media/libvpx/libvpx/test/quantize_test.cc b/media/libvpx/libvpx/test/quantize_test.cc
index 69da8994ca..8185dd6356 100644
--- a/media/libvpx/libvpx/test/quantize_test.cc
+++ b/media/libvpx/libvpx/test/quantize_test.cc
@@ -9,12 +9,14 @@
  */
 
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
-#include "./vpx_config.h"
 #include "./vp8_rtcd.h"
+#include "./vpx_config.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -31,12 +33,12 @@ namespace {
 const int kNumBlocks = 25;
 const int kNumBlockEntries = 16;
 
-typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d);
+using VP8Quantize = void (*)(BLOCK *b, BLOCKD *d);
 
-typedef std::tr1::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam;
+using VP8QuantizeParam = std::tuple<VP8Quantize, VP8Quantize>;
 
 using libvpx_test::ACMRandom;
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 // Create and populate a VP8_COMP instance which has a complete set of
 // quantization inputs as well as a second MACROBLOCKD for output.
@@ -44,9 +46,9 @@ class QuantizeTestBase {
  public:
   virtual ~QuantizeTestBase() {
     vp8_remove_compressor(&vp8_comp_);
-    vp8_comp_ = NULL;
+    vp8_comp_ = nullptr;
     vpx_free(macroblockd_dst_);
-    macroblockd_dst_ = NULL;
+    macroblockd_dst_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -69,7 +71,7 @@ class QuantizeTestBase {
     // Copy macroblockd from the reference to get pre-set-up dequant values.
     macroblockd_dst_ = reinterpret_cast<MACROBLOCKD *>(
         vpx_memalign(32, sizeof(*macroblockd_dst_)));
-    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+    *macroblockd_dst_ = vp8_comp_->mb.e_mbd;
     // Fix block pointers - currently they point to the blocks in the reference
     // structure.
     vp8_setup_block_dptrs(macroblockd_dst_);
@@ -78,7 +80,7 @@ class QuantizeTestBase {
   void UpdateQuantizer(int q) {
     vp8_set_quantizer(vp8_comp_, q);
 
-    memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+    *macroblockd_dst_ = vp8_comp_->mb.e_mbd;
     vp8_setup_block_dptrs(macroblockd_dst_);
   }
 
@@ -116,14 +118,19 @@ class QuantizeTestBase {
 };
 
 class QuantizeTest : public QuantizeTestBase,
-                     public ::testing::TestWithParam<VP8QuantizeParam> {
+                     public ::testing::TestWithParam<VP8QuantizeParam>,
+                     public AbstractBench {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     SetupCompressor();
     asm_quant_ = GET_PARAM(0);
     c_quant_ = GET_PARAM(1);
   }
 
+  void Run() override {
+    asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);
+  }
+
   void RunComparison() {
     for (int i = 0; i < kNumBlocks; ++i) {
       ASM_REGISTER_STATE_CHECK(
@@ -139,6 +146,7 @@ class QuantizeTest : public QuantizeTestBase,
   VP8Quantize asm_quant_;
   VP8Quantize c_quant_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(QuantizeTest);
 
 TEST_P(QuantizeTest, TestZeroInput) {
   FillCoeffConstant(0);
@@ -166,8 +174,15 @@ TEST_P(QuantizeTest, TestMultipleQ) {
   }
 }
 
+TEST_P(QuantizeTest, DISABLED_Speed) {
+  FillCoeffRandom();
+
+  RunNTimes(10000000);
+  PrintMedian("vp8 quantize");
+}
+
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, QuantizeTest,
     ::testing::Values(
         make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c),
@@ -175,29 +190,45 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest,
-                        ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3,
-                                                     &vp8_fast_quantize_b_c)));
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3,
+                                 &vp8_fast_quantize_b_c)));
 #endif  // HAVE_SSSE3
 
 #if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE4_1, QuantizeTest,
     ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1,
                                  &vp8_regular_quantize_b_c)));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest,
-                        ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon,
-                                                     &vp8_fast_quantize_b_c)));
+INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest,
+                         ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon,
+                                                      &vp8_fast_quantize_b_c)));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, QuantizeTest,
     ::testing::Values(
         make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
         make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MSA
+
+#if HAVE_MMI
+INSTANTIATE_TEST_SUITE_P(
+    MMI, QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
+        make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
+#endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx,
+                                 &vp8_regular_quantize_b_c)));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/media/libvpx/libvpx/test/realtime_test.cc b/media/libvpx/libvpx/test/realtime_test.cc
index 63f1ac3c29..b0cc929dda 100644
--- a/media/libvpx/libvpx/test/realtime_test.cc
+++ b/media/libvpx/libvpx/test/realtime_test.cc
@@ -7,11 +7,14 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <limits.h>
+
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/util.h"
 #include "test/video_source.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_config.h"
 
 namespace {
 
@@ -24,40 +27,96 @@ class RealtimeTest
       public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
-  virtual ~RealtimeTest() {}
+  ~RealtimeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     cfg_.g_lag_in_frames = 0;
     SetMode(::libvpx_test::kRealTime);
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
+#if !CONFIG_REALTIME_ONLY
     // TODO(tomfinegan): We're changing the pass value here to make sure
     // we get frames when real time mode is combined with |g_pass| set to
     // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
     // the pass value based on the mode passed into EncoderTest::SetMode(),
     // which overrides the one specified in SetUp() above.
     cfg_.g_pass = VPX_RC_FIRST_PASS;
+#endif
   }
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0 && set_cpu_used_) {
+      encoder->Control(VP8E_SET_CPUUSED, 8);
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override {
     frame_packets_++;
   }
 
+  bool IsVP9() const {
+#if CONFIG_VP9_ENCODER
+    return codec_ == &libvpx_test::kVP9;
+#else
+    return false;
+#endif
+  }
+
+  void TestIntegerOverflow(unsigned int width, unsigned int height) {
+    ::libvpx_test::RandomVideoSource video;
+    video.SetSize(width, height);
+    video.set_limit(20);
+    cfg_.rc_target_bitrate = UINT_MAX;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void TestEncode() {
+    ::libvpx_test::RandomVideoSource video;
+    video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    video.set_limit(kFramesToEncode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    EXPECT_EQ(kFramesToEncode, frame_packets_);
+  }
+
   int frame_packets_;
+  bool set_cpu_used_ = true;
 };
 
-TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
-  ::libvpx_test::RandomVideoSource video;
-  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
-  video.set_limit(kFramesToEncode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  EXPECT_EQ(kFramesToEncode, frame_packets_);
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); }
+
+TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) {
+  set_cpu_used_ = false;
+  TestEncode();
 }
 
-VP8_INSTANTIATE_TEST_CASE(RealtimeTest,
-                          ::testing::Values(::libvpx_test::kRealTime));
-VP9_INSTANTIATE_TEST_CASE(RealtimeTest,
-                          ::testing::Values(::libvpx_test::kRealTime));
+TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); }
+
+TEST_P(RealtimeTest, IntegerOverflowLarge) {
+#ifdef CHROMIUM
+  GTEST_SKIP() << "16K framebuffers are not supported by Chromium's allocator.";
+#else
+  if (IsVP9()) {
+#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64
+    TestIntegerOverflow(16384, 16384);
+#else
+    TestIntegerOverflow(4096, 4096);
+#endif
+  } else {
+    GTEST_SKIP()
+        << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):"
+        << " Enable this test after bitstream errors & undefined sanitizer "
+           "warnings are fixed.";
+    // TestIntegerOverflow(16383, 16383);
+  }
+#endif  // defined(CHROMIUM)
+}
+
+VP8_INSTANTIATE_TEST_SUITE(RealtimeTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_SUITE(RealtimeTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/register_state_check.h b/media/libvpx/libvpx/test/register_state_check.h
index 84641c8e99..96795f65be 100644
--- a/media/libvpx/libvpx/test/register_state_check.h
+++ b/media/libvpx/libvpx/test/register_state_check.h
@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_REGISTER_STATE_CHECK_H_
-#define TEST_REGISTER_STATE_CHECK_H_
+#ifndef VPX_TEST_REGISTER_STATE_CHECK_H_
+#define VPX_TEST_REGISTER_STATE_CHECK_H_
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
@@ -28,13 +28,14 @@
 //   See platform implementations of RegisterStateCheckXXX for details.
 //
 
-#if defined(_WIN64)
+#if defined(_WIN64) && VPX_ARCH_X86_64
 
 #undef NOMINMAX
 #define NOMINMAX
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #endif
+#include <intrin.h>
 #include <windows.h>
 #include <winnt.h>
 
@@ -55,7 +56,7 @@ class RegisterStateCheck {
  private:
   static bool StoreRegisters(CONTEXT *const context) {
     const HANDLE this_thread = GetCurrentThread();
-    EXPECT_TRUE(this_thread != NULL);
+    EXPECT_NE(this_thread, nullptr);
     context->ContextFlags = CONTEXT_FLOATING_POINT;
     const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
     EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
@@ -81,10 +82,13 @@ class RegisterStateCheck {
   CONTEXT pre_context_;
 };
 
-#define ASM_REGISTER_STATE_CHECK(statement)    \
-  do {                                         \
-    libvpx_test::RegisterStateCheck reg_check; \
-    statement;                                 \
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    _ReadWriteBarrier();                         \
   } while (false)
 
 }  // namespace libvpx_test
@@ -113,19 +117,30 @@ class RegisterStateCheck {
     int64_t post_store[8];
     vpx_push_neon(post_store);
     for (int i = 0; i < 8; ++i) {
-      EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8
-                                              << " has been modified";
+      EXPECT_EQ(pre_store_[i], post_store[i])
+          << "d" << i + 8 << " has been modified";
     }
   }
 
   int64_t pre_store_[8];
 };
 
+#if defined(__GNUC__)
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    __asm__ volatile("" ::: "memory");           \
+  } while (false)
+#else
 #define ASM_REGISTER_STATE_CHECK(statement)    \
   do {                                         \
     libvpx_test::RegisterStateCheck reg_check; \
     statement;                                 \
   } while (false)
+#endif
 
 }  // namespace libvpx_test
 
@@ -138,9 +153,9 @@ class RegisterStateCheck {};
 
 }  // namespace libvpx_test
 
-#endif  // _WIN64
+#endif  // _WIN64 && VPX_ARCH_X86_64
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
 #if defined(__GNUC__)
 
 namespace libvpx_test {
@@ -169,19 +184,22 @@ class RegisterStateCheckMMX {
   uint16_t pre_fpu_env_[14];
 };
 
-#define API_REGISTER_STATE_CHECK(statement)       \
-  do {                                            \
-    libvpx_test::RegisterStateCheckMMX reg_check; \
-    ASM_REGISTER_STATE_CHECK(statement);          \
+#define API_REGISTER_STATE_CHECK(statement)             \
+  do {                                                  \
+    {                                                   \
+      libvpx_test::RegisterStateCheckMMX reg_check_mmx; \
+      ASM_REGISTER_STATE_CHECK(statement);              \
+    }                                                   \
+    __asm__ volatile("" ::: "memory");                  \
   } while (false)
 
 }  // namespace libvpx_test
 
 #endif  // __GNUC__
-#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // VPX_ARCH_X86 || VPX_ARCH_X86_64
 
 #ifndef API_REGISTER_STATE_CHECK
 #define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
 #endif
 
-#endif  // TEST_REGISTER_STATE_CHECK_H_
+#endif  // VPX_TEST_REGISTER_STATE_CHECK_H_
diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc
index c9950dd433..1a8319481d 100644
--- a/media/libvpx/libvpx/test/resize_test.cc
+++ b/media/libvpx/libvpx/test/resize_test.cc
@@ -7,16 +7,15 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <stdio.h>
-
 #include <climits>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
-#include "test/video_source.h"
 #include "test/util.h"
+#include "test/video_source.h"
+#include "vpx_config.h"
 
 // Enable(1) or Disable(0) writing of the compressed bitstream.
 #define WRITE_COMPRESSED_STREAM 0
@@ -93,10 +92,29 @@ struct FrameInfo {
 
 void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
                          unsigned int initial_h, unsigned int *w,
-                         unsigned int *h, int flag_codec) {
+                         unsigned int *h, bool flag_codec,
+                         bool smaller_width_larger_size,
+                         bool random_input_one_half_only) {
+  *w = initial_w;
+  *h = initial_h;
+
+  if (random_input_one_half_only == 1) {
+    if (frame < 100) {
+      return;
+    }
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (smaller_width_larger_size) {
+    if (frame < 30) {
+      return;
+    }
+    *w = initial_w * 7 / 10;
+    *h = initial_h * 16 / 10;
+    return;
+  }
   if (frame < 10) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 20) {
@@ -110,8 +128,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 40) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 50) {
@@ -125,8 +141,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 70) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 80) {
@@ -145,8 +159,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 110) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 120) {
@@ -165,8 +177,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 150) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 160) {
@@ -185,8 +195,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 190) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 200) {
@@ -205,8 +213,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 230) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   if (frame < 240) {
@@ -220,8 +226,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     return;
   }
   if (frame < 260) {
-    *w = initial_w;
-    *h = initial_h;
     return;
   }
   // Go down very low.
@@ -234,34 +238,52 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
     // Cases that only works for VP9.
     // For VP9: Swap width and height of original.
     if (frame < 320) {
-      *w = initial_h;
-      *h = initial_w;
       return;
     }
   }
-  *w = initial_w;
-  *h = initial_h;
 }
 
 class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
  public:
-  ResizingVideoSource() {
-    SetSize(kInitialWidth, kInitialHeight);
+  ResizingVideoSource(int width, int height)
+      : smaller_width_larger_size_(false), random_input_one_half_only_(false),
+        configured_width_(width), configured_height_(height) {
     limit_ = 350;
+    SetSize(configured_width_, configured_height_);
   }
-  int flag_codec_;
-  virtual ~ResizingVideoSource() {}
+  bool flag_codec_;
+  bool smaller_width_larger_size_;
+  bool random_input_one_half_only_;
+  // configured_width_/height_ is the configured resolution when codec is
+  // created.
+  int configured_width_;
+  int configured_height_;
+  ~ResizingVideoSource() override = default;
 
  protected:
-  virtual void Next() {
+  void Next() override {
     ++frame_;
-    unsigned int width;
-    unsigned int height;
-    ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
-                        flag_codec_);
+    unsigned int width = 0;
+    unsigned int height = 0;
+    ScaleForFrameNumber(frame_, configured_width_, configured_height_, &width,
+                        &height, flag_codec_, smaller_width_larger_size_,
+                        random_input_one_half_only_);
     SetSize(width, height);
     FillFrame();
   }
+
+  void FillFrame() override {
+    if (img_) {
+      memset(img_->img_data, 0, raw_sz_);
+      if (random_input_one_half_only_) {
+        libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+        unsigned char *image = img_->planes[0];
+        for (size_t i = 0; i < raw_sz_; ++i) {
+          image[i] = rnd.Rand8();
+        }
+      }
+    }
+  }
 };
 
 class ResizeTest
@@ -270,38 +292,60 @@ class ResizeTest
  protected:
   ResizeTest() : EncoderTest(GET_PARAM(0)) {}
 
-  virtual ~ResizeTest() {}
+  ~ResizeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t pts) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
+    ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
+    encode_frame_width_.push_back(pkt->data.frame.width[0]);
+    encode_frame_height_.push_back(pkt->data.frame.height[0]);
+  }
+
+  unsigned int GetFrameWidth(size_t idx) const {
+    return encode_frame_width_[idx];
+  }
+
+  unsigned int GetFrameHeight(size_t idx) const {
+    return encode_frame_height_[idx];
+  }
+
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
   std::vector<FrameInfo> frame_info_list_;
+  std::vector<unsigned int> encode_frame_width_;
+  std::vector<unsigned int> encode_frame_height_;
 };
 
 TEST_P(ResizeTest, TestExternalResizeWorks) {
-  ResizingVideoSource video;
-  video.flag_codec_ = 0;
+  ResizingVideoSource video(kInitialWidth, kInitialHeight);
+  video.flag_codec_ = false;
+  video.smaller_width_larger_size_ = false;
   cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
-  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
-       info != frame_info_list_.end(); ++info) {
-    const unsigned int frame = static_cast<unsigned>(info->pts);
+  for (const auto &info : frame_info_list_) {
+    const unsigned int frame = static_cast<unsigned>(info.pts);
     unsigned int expected_w;
     unsigned int expected_h;
+    const size_t idx = &info - &frame_info_list_[0];
+    ASSERT_EQ(info.w, GetFrameWidth(idx));
+    ASSERT_EQ(info.h, GetFrameHeight(idx));
     ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
-                        &expected_h, 0);
-    EXPECT_EQ(expected_w, info->w) << "Frame " << frame
-                                   << " had unexpected width";
-    EXPECT_EQ(expected_h, info->h) << "Frame " << frame
-                                   << " had unexpected height";
+                        &expected_h, video.flag_codec_,
+                        video.smaller_width_larger_size_,
+                        /*random_input_one_half_only=*/false);
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
   }
 }
 
@@ -312,32 +356,32 @@ class ResizeInternalTest : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
   ResizeInternalTest()
-      : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+      : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {}
 #else
   ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeInternalTest() {}
+  ~ResizeInternalTest() override = default;
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("vp90-2-05-resize.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
         write_ivf_file_header(&cfg_, out_frames_, outfile_);
       fclose(outfile_);
-      outfile_ = NULL;
+      outfile_ = nullptr;
     }
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (change_config_) {
       int new_q = 60;
       if (video->frame() == 0) {
@@ -362,13 +406,13 @@ class ResizeInternalTest : public ResizeTest {
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
 #if WRITE_COMPRESSED_STREAM
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -404,15 +448,14 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) {
   cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
-  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
-       info != frame_info_list_.end(); ++info) {
-    const vpx_codec_pts_t pts = info->pts;
+  for (const auto &info : frame_info_list_) {
+    const vpx_codec_pts_t pts = info.pts;
     if (pts >= kStepDownFrame && pts < kStepUpFrame) {
-      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
-      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
+      ASSERT_EQ(282U, info.w) << "Frame " << pts << " had unexpected width";
+      ASSERT_EQ(173U, info.h) << "Frame " << pts << " had unexpected height";
     } else {
-      EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width";
-      EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height";
+      EXPECT_EQ(352U, info.w) << "Frame " << pts << " had unexpected width";
+      EXPECT_EQ(288U, info.h) << "Frame " << pts << " had unexpected height";
     }
   }
 }
@@ -431,13 +474,17 @@ class ResizeRealtimeTest
       public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ResizeRealtimeTest() {}
+  ~ResizeRealtimeTest() override = default;
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_AQ_MODE, 3);
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      if (cfg_.g_threads > 0) {
+        encoder->Control(VP9E_SET_ROW_MT, 1);
+        encoder->Control(VP9E_SET_TILE_COLUMNS, cfg_.g_threads >> 1);
+      }
     }
 
     if (change_bitrate_ && video->frame() == 120) {
@@ -447,25 +494,40 @@ class ResizeRealtimeTest
     }
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t pts) {
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t pts) override {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
-  virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) {
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
     double mismatch_psnr = compute_psnr(img1, img2);
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
   }
 
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0);
+    ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0);
+    encode_frame_width_.push_back(pkt->data.frame.width[0]);
+    encode_frame_height_.push_back(pkt->data.frame.height[0]);
+  }
+
   unsigned int GetMismatchFrames() { return mismatch_nframes_; }
 
+  unsigned int GetFrameWidth(size_t idx) const {
+    return encode_frame_width_[idx];
+  }
+
+  unsigned int GetFrameHeight(size_t idx) const {
+    return encode_frame_height_[idx];
+  }
+
   void DefaultConfig() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 600;
@@ -493,11 +555,14 @@ class ResizeRealtimeTest
   bool change_bitrate_;
   double mismatch_psnr_;
   int mismatch_nframes_;
+  std::vector<unsigned int> encode_frame_width_;
+  std::vector<unsigned int> encode_frame_height_;
 };
 
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
-  ResizingVideoSource video;
-  video.flag_codec_ = 1;
+  ResizingVideoSource video(kInitialWidth, kInitialHeight);
+  video.flag_codec_ = true;
+  video.smaller_width_larger_size_ = false;
   DefaultConfig();
   // Disable internal resize for this test.
   cfg_.rc_resize_allowed = 0;
@@ -506,18 +571,80 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
-  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
-       info != frame_info_list_.end(); ++info) {
-    const unsigned int frame = static_cast<unsigned>(info->pts);
+  for (const auto &info : frame_info_list_) {
+    const unsigned int frame = static_cast<unsigned>(info.pts);
     unsigned int expected_w;
     unsigned int expected_h;
     ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
-                        &expected_h, 1);
-    EXPECT_EQ(expected_w, info->w) << "Frame " << frame
-                                   << " had unexpected width";
-    EXPECT_EQ(expected_h, info->h) << "Frame " << frame
-                                   << " had unexpected height";
-    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+                        &expected_h, video.flag_codec_,
+                        video.smaller_width_larger_size_,
+                        /*random_input_one_half_only=*/false);
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(GetMismatchFrames(), static_cast<unsigned int>(0));
+  }
+}
+
+// This test uses 4 threads with small keyframe spacing, random input,
+// and uses 640x480 as initial resolution.
+TEST_P(ResizeRealtimeTest, TestExternalResizeWorks4Threads) {
+  ResizingVideoSource video(640, 480);
+  video.flag_codec_ = true;
+  video.smaller_width_larger_size_ = false;
+  video.random_input_one_half_only_ = true;
+  DefaultConfig();
+  // Disable internal resize for this test.
+  cfg_.rc_resize_allowed = 0;
+  cfg_.g_threads = 4;
+  cfg_.kf_max_dist = 50;
+  cfg_.kf_min_dist = 50;
+  change_bitrate_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (const auto &info : frame_info_list_) {
+    const unsigned int frame = static_cast<unsigned>(info.pts);
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, 640, 480, &expected_w, &expected_h,
+                        video.flag_codec_, video.smaller_width_larger_size_,
+                        video.random_input_one_half_only_);
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(GetMismatchFrames(), static_cast<unsigned int>(0));
+  }
+}
+
+TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) {
+  ResizingVideoSource video(kInitialWidth, kInitialHeight);
+  video.flag_codec_ = true;
+  video.smaller_width_larger_size_ = true;
+  DefaultConfig();
+  // Disable internal resize for this test.
+  cfg_.rc_resize_allowed = 0;
+  change_bitrate_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (const auto &info : frame_info_list_) {
+    const unsigned int frame = static_cast<unsigned>(info.pts);
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
+                        &expected_h, video.flag_codec_,
+                        video.smaller_width_larger_size_,
+                        /*random_input_one_half_only=*/false);
+    EXPECT_EQ(expected_w, info.w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info.h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(GetMismatchFrames(), static_cast<unsigned int>(0));
   }
 }
 
@@ -525,37 +652,37 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
 // Run at low bitrate, with resize_allowed = 1, and verify that we get
 // one resize down event.
 TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 299);
   DefaultConfig();
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
   change_bitrate_ = false;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
+#if CONFIG_VP9_DECODER
   unsigned int last_w = cfg_.g_w;
   unsigned int last_h = cfg_.g_h;
   int resize_count = 0;
-  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
-       info != frame_info_list_.end(); ++info) {
-    if (info->w != last_w || info->h != last_h) {
+  for (const auto &info : frame_info_list_) {
+    if (info.w != last_w || info.h != last_h) {
       // Verify that resize down occurs.
-      ASSERT_LT(info->w, last_w);
-      ASSERT_LT(info->h, last_h);
-      last_w = info->w;
-      last_h = info->h;
+      ASSERT_LT(info.w, last_w);
+      ASSERT_LT(info.h, last_h);
+      last_w = info.w;
+      last_h = info.h;
       resize_count++;
     }
   }
 
-#if CONFIG_VP9_DECODER
   // Verify that we get 1 resize down event in this test.
   ASSERT_EQ(1, resize_count) << "Resizing should occur.";
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  EXPECT_EQ(GetMismatchFrames(), static_cast<unsigned int>(0));
 #else
-  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+  GTEST_SKIP()
+      << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
 #endif
 }
 
@@ -563,11 +690,11 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
 // Start at low target bitrate, raise the bitrate in the middle of the clip,
 // scaling-up should occur after bitrate changed.
 TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 359);
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
   DefaultConfig();
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
   change_bitrate_ = true;
   mismatch_psnr_ = 0.0;
   mismatch_nframes_ = 0;
@@ -580,30 +707,33 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   unsigned int last_w = cfg_.g_w;
   unsigned int last_h = cfg_.g_h;
   int resize_count = 0;
-  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
-       info != frame_info_list_.end(); ++info) {
-    if (info->w != last_w || info->h != last_h) {
+  for (const auto &info : frame_info_list_) {
+    const size_t idx = &info - &frame_info_list_[0];
+    ASSERT_EQ(info.w, GetFrameWidth(idx));
+    ASSERT_EQ(info.h, GetFrameHeight(idx));
+    if (info.w != last_w || info.h != last_h) {
       resize_count++;
-      if (resize_count == 1) {
+      if (resize_count <= 2) {
         // Verify that resize down occurs.
-        ASSERT_LT(info->w, last_w);
-        ASSERT_LT(info->h, last_h);
-      } else if (resize_count == 2) {
+        ASSERT_LT(info.w, last_w);
+        ASSERT_LT(info.h, last_h);
+      } else if (resize_count > 2) {
         // Verify that resize up occurs.
-        ASSERT_GT(info->w, last_w);
-        ASSERT_GT(info->h, last_h);
+        ASSERT_GT(info.w, last_w);
+        ASSERT_GT(info.h, last_h);
       }
-      last_w = info->w;
-      last_h = info->h;
+      last_w = info.w;
+      last_h = info.h;
     }
   }
 
 #if CONFIG_VP9_DECODER
-  // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
-  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  // Verify that we get 4 resize events in this test.
+  ASSERT_EQ(resize_count, 4) << "Resizing should occur twice.";
+  EXPECT_EQ(GetMismatchFrames(), static_cast<unsigned int>(0));
 #else
-  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+  GTEST_SKIP()
+      << "Warning: VP9 decoder unavailable, unable to check resize count!\n";
 #endif
 }
 
@@ -617,32 +747,32 @@ class ResizeCspTest : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
   ResizeCspTest()
-      : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+      : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {}
 #else
   ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
 #endif
 
-  virtual ~ResizeCspTest() {}
+  ~ResizeCspTest() override = default;
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
 #if WRITE_COMPRESSED_STREAM
     outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb");
 #endif
   }
 
-  virtual void EndPassHook() {
+  void EndPassHook() override {
 #if WRITE_COMPRESSED_STREAM
     if (outfile_) {
       if (!fseek(outfile_, 0, SEEK_SET))
         write_ivf_file_header(&cfg_, out_frames_, outfile_);
       fclose(outfile_);
-      outfile_ = NULL;
+      outfile_ = nullptr;
     }
 #endif
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 &&
         cfg_.g_profile != 1) {
       cfg_.g_profile = 1;
@@ -655,13 +785,13 @@ class ResizeCspTest : public ResizeTest {
     }
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
     EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
 #if WRITE_COMPRESSED_STREAM
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     ++out_frames_;
 
     // Write initial file header if first frame.
@@ -687,10 +817,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource {
     limit_ = 30;
   }
 
-  virtual ~ResizingCspVideoSource() {}
+  ~ResizingCspVideoSource() override = default;
 
  protected:
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     SetImageFormat(CspForFrameNumber(frame_));
     FillFrame();
@@ -705,14 +835,13 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_CASE(ResizeTest,
-                          ::testing::Values(::libvpx_test::kRealTime));
-VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
-                          ::testing::Values(::libvpx_test::kOnePassBest));
-VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
-                          ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 9));
-VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
-                          ::testing::Values(::libvpx_test::kRealTime));
+VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
+VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest,
+                           ::testing::Values(::libvpx_test::kOnePassBest));
+VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Range(5, 9));
+VP9_INSTANTIATE_TEST_SUITE(ResizeCspTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/resize_util.sh b/media/libvpx/libvpx/test/resize_util.sh
deleted file mode 100644
index 5e472716da..0000000000
--- a/media/libvpx/libvpx/test/resize_util.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/sh
-##
-##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-##  This file tests the libvpx resize_util example code. To add new tests to
-##  this file, do the following:
-##    1. Write a shell function (this is your test).
-##    2. Add the function to resize_util_tests (on a new line).
-##
-. $(dirname $0)/tools_common.sh
-
-# Environment check: $YUV_RAW_INPUT is required.
-resize_util_verify_environment() {
-  if [ ! -e "${YUV_RAW_INPUT}" ]; then
-    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
-    return 1
-  fi
-}
-
-# Resizes $YUV_RAW_INPUT using the resize_util example. $1 is the output
-# dimensions that will be passed to resize_util.
-resize_util() {
-  local resizer="${LIBVPX_BIN_PATH}/resize_util${VPX_TEST_EXE_SUFFIX}"
-  local output_file="${VPX_TEST_OUTPUT_DIR}/resize_util.raw"
-  local frames_to_resize="10"
-  local target_dimensions="$1"
-
-  # resize_util is available only when CONFIG_SHARED is disabled.
-  if [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ]; then
-    if [ ! -x "${resizer}" ]; then
-      elog "${resizer} does not exist or is not executable."
-      return 1
-    fi
-
-    eval "${VPX_TEST_PREFIX}" "${resizer}" "${YUV_RAW_INPUT}" \
-        "${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \
-        "${target_dimensions}" "${output_file}" ${frames_to_resize} \
-        ${devnull}
-
-    [ -e "${output_file}" ] || return 1
-  fi
-}
-
-# Halves each dimension of $YUV_RAW_INPUT using resize_util().
-resize_down() {
-  local target_width=$((${YUV_RAW_INPUT_WIDTH} / 2))
-  local target_height=$((${YUV_RAW_INPUT_HEIGHT} / 2))
-
-  resize_util "${target_width}x${target_height}"
-}
-
-# Doubles each dimension of $YUV_RAW_INPUT using resize_util().
-resize_up() {
-  local target_width=$((${YUV_RAW_INPUT_WIDTH} * 2))
-  local target_height=$((${YUV_RAW_INPUT_HEIGHT} * 2))
-
-  resize_util "${target_width}x${target_height}"
-}
-
-resize_util_tests="resize_down
-                   resize_up"
-
-run_tests resize_util_verify_environment "${resize_util_tests}"
diff --git a/media/libvpx/libvpx/test/sad_test.cc b/media/libvpx/libvpx/test/sad_test.cc
index 837b08fbdf..d8d304f3c6 100644
--- a/media/libvpx/libvpx/test/sad_test.cc
+++ b/media/libvpx/libvpx/test/sad_test.cc
@@ -8,21 +8,27 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdio.h>
 #include <string.h>
 #include <limits.h>
-#include <stdio.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vpx/vpx_codec.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+
+// const[expr] should be sufficient for DECLARE_ALIGNED but early
+// implementations of c++11 appear to have some issues with it.
+#define kDataAlignment 32
 
 template <typename Function>
 struct TestParams {
@@ -32,19 +38,32 @@ struct TestParams {
   Function func;
 };
 
-typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
-                                   const uint8_t *ref_ptr, int ref_stride);
-typedef TestParams<SadMxNFunc> SadMxNParam;
+using SadMxNFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride);
+using SadMxNParam = TestParams<SadMxNFunc>;
 
-typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
-                                      const uint8_t *ref_ptr, int ref_stride,
-                                      const uint8_t *second_pred);
-typedef TestParams<SadMxNAvgFunc> SadMxNAvgParam;
+using SadSkipMxNFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride);
+using SadSkipMxNParam = TestParams<SadSkipMxNFunc>;
 
-typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *const ref_ptr[], int ref_stride,
-                             unsigned int *sad_array);
-typedef TestParams<SadMxNx4Func> SadMxNx4Param;
+using SadMxNAvgFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       const uint8_t *second_pred);
+using SadMxNAvgParam = TestParams<SadMxNAvgFunc>;
+
+using SadMxNx4Func = void (*)(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *const ref_ptr[], int ref_stride,
+                              unsigned int *sad_array);
+using SadMxNx4Param = TestParams<SadMxNx4Func>;
+
+using SadSkipMxNx4Func = void (*)(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *const ref_ptr[],
+                                  int ref_stride, unsigned int *sad_array);
+using SadSkipMxNx4Param = TestParams<SadSkipMxNx4Func>;
+
+using SadMxNx8Func = void (*)(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride,
+                              unsigned int *sad_array);
 
 using libvpx_test::ACMRandom;
 
@@ -54,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
  public:
   explicit SADTestBase(const ParamType &params) : params_(params) {}
 
-  virtual void SetUp() {
+  void SetUp() override {
     source_data8_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize));
     reference_data8_ = reinterpret_cast<uint8_t *>(
@@ -84,53 +103,64 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
     mask_ = (1 << bit_depth_) - 1;
-    source_stride_ = (params_.width + 31) & ~31;
+    source_stride_ = (params_.width + 63) & ~63;
     reference_stride_ = params_.width * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data8_);
-    source_data8_ = NULL;
+    source_data8_ = nullptr;
     vpx_free(reference_data8_);
-    reference_data8_ = NULL;
+    reference_data8_ = nullptr;
     vpx_free(second_pred8_);
-    second_pred8_ = NULL;
+    second_pred8_ = nullptr;
     vpx_free(source_data16_);
-    source_data16_ = NULL;
+    source_data16_ = nullptr;
     vpx_free(reference_data16_);
-    reference_data16_ = NULL;
+    reference_data16_ = nullptr;
     vpx_free(second_pred16_);
-    second_pred16_ = NULL;
+    second_pred16_ = nullptr;
 
     libvpx_test::ClearSystemState();
   }
 
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
-  static const int kDataAlignment = 16;
+  // crbug.com/webm/1660
   static const int kDataBlockSize = 64 * 128;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
-  uint8_t *GetReference(int block_idx) const {
+  int GetBlockRefOffset(int block_idx) const {
+    return block_idx * kDataBlockSize;
+  }
+
+  uint8_t *GetReferenceFromOffset(int ref_offset) const {
+    assert((params_.height - 1) * reference_stride_ + params_.width - 1 +
+               ref_offset <
+           kDataBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_high_bit_depth_) {
       return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
-                                block_idx * kDataBlockSize);
+                                ref_offset);
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    return reference_data_ + block_idx * kDataBlockSize;
+    return reference_data_ + ref_offset;
+  }
+
+  uint8_t *GetReference(int block_idx) const {
+    return GetReferenceFromOffset(GetBlockRefOffset(block_idx));
   }
 
   // Sum of Absolute Differences. Given two blocks, calculate the absolute
   // difference between two pixels in the same relative location; accumulate.
-  uint32_t ReferenceSAD(int block_idx) const {
+  uint32_t ReferenceSAD(int ref_offset) const {
     uint32_t sad = 0;
-    const uint8_t *const reference8 = GetReference(block_idx);
+    const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset);
     const uint8_t *const source8 = source_data_;
 #if CONFIG_VP9_HIGHBITDEPTH
     const uint16_t *const reference16 =
-        CONVERT_TO_SHORTPTR(GetReference(block_idx));
+        CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset));
     const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int h = 0; h < params_.height; ++h) {
@@ -149,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
     return sad;
   }
 
+  // Sum of Absolute Differences Skip rows. Given two blocks, calculate the
+  // absolute difference between two pixels in the same relative location every
+  // other row; accumulate and double the result at the end.
+  uint32_t ReferenceSADSkip(int ref_offset) const {
+    uint32_t sad = 0;
+    const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset);
+    const uint8_t *const source8 = source_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const uint16_t *const reference16 =
+        CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset));
+    const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    for (int h = 0; h < params_.height; h += 2) {
+      for (int w = 0; w < params_.width; ++w) {
+        if (!use_high_bit_depth_) {
+          sad += abs(source8[h * source_stride_ + w] -
+                     reference8[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          sad += abs(source16[h * source_stride_ + w] -
+                     reference16[h * reference_stride_ + w]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+    }
+    return sad * 2;
+  }
+
   // Sum of Absolute Differences Average. Given two blocks, and a prediction
   // calculate the absolute difference between one pixel and average of the
   // corresponding and predicted pixels; accumulate.
@@ -201,24 +259,28 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> {
     }
   }
 
-  void FillRandom(uint8_t *data, int stride) {
+  void FillRandomWH(uint8_t *data, int stride, int w, int h) {
     uint8_t *data8 = data;
 #if CONFIG_VP9_HIGHBITDEPTH
     uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    for (int h = 0; h < params_.height; ++h) {
-      for (int w = 0; w < params_.width; ++w) {
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
         if (!use_high_bit_depth_) {
-          data8[h * stride + w] = rnd_.Rand8();
+          data8[r * stride + c] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
         } else {
-          data16[h * stride + w] = rnd_.Rand16() & mask_;
+          data16[r * stride + c] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
     }
   }
 
+  void FillRandom(uint8_t *data, int stride) {
+    FillRandomWH(data, stride, params_.width, params_.height);
+  }
+
   uint32_t mask_;
   vpx_bit_depth_t bit_depth_;
   int source_stride_;
@@ -253,18 +315,45 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> {
   }
 
   void CheckSADs() const {
-    uint32_t reference_sad, exp_sad[4];
+    uint32_t reference_sad;
+    DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
 
     SADs(exp_sad);
     for (int block = 0; block < 4; ++block) {
-      reference_sad = ReferenceSAD(block);
+      reference_sad = ReferenceSAD(GetBlockRefOffset(block));
 
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
 };
 
-class SADTest : public SADTestBase<SadMxNParam> {
+class SADSkipx4Test : public SADTestBase<SadMxNx4Param> {
+ public:
+  SADSkipx4Test() : SADTestBase(GetParam()) {}
+
+ protected:
+  void SADs(unsigned int *results) const {
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+
+    ASM_REGISTER_STATE_CHECK(params_.func(
+        source_data_, source_stride_, references, reference_stride_, results));
+  }
+
+  void CheckSADs() const {
+    uint32_t reference_sad;
+    DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
+
+    SADs(exp_sad);
+    for (int block = 0; block < 4; ++block) {
+      reference_sad = ReferenceSADSkip(GetBlockRefOffset(block));
+
+      EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+    }
+  }
+};
+
+class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> {
  public:
   SADTest() : SADTestBase(GetParam()) {}
 
@@ -279,14 +368,46 @@ class SADTest : public SADTestBase<SadMxNParam> {
   }
 
   void CheckSAD() const {
-    const unsigned int reference_sad = ReferenceSAD(0);
+    const unsigned int reference_sad = ReferenceSAD(GetBlockRefOffset(0));
     const unsigned int exp_sad = SAD(0);
 
     ASSERT_EQ(reference_sad, exp_sad);
   }
+
+  void Run() override {
+    params_.func(source_data_, source_stride_, reference_data_,
+                 reference_stride_);
+  }
 };
 
-class SADavgTest : public SADTestBase<SadMxNAvgParam> {
+class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> {
+ public:
+  SADSkipTest() : SADTestBase(GetParam()) {}
+
+ protected:
+  unsigned int SAD(int block_idx) const {
+    unsigned int ret;
+    const uint8_t *const reference = GetReference(block_idx);
+
+    ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_,
+                                                reference, reference_stride_));
+    return ret;
+  }
+
+  void CheckSAD() const {
+    const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0));
+    const unsigned int exp_sad = SAD(0);
+
+    ASSERT_EQ(reference_sad, exp_sad);
+  }
+
+  void Run() override {
+    params_.func(source_data_, source_stride_, reference_data_,
+                 reference_stride_);
+  }
+};
+
+class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> {
  public:
   SADavgTest() : SADTestBase(GetParam()) {}
 
@@ -307,6 +428,11 @@ class SADavgTest : public SADTestBase<SadMxNAvgParam> {
 
     ASSERT_EQ(reference_sad, exp_sad);
   }
+
+  void Run() override {
+    params_.func(source_data_, source_stride_, reference_data_,
+                 reference_stride_, second_pred_);
+  }
 };
 
 TEST_P(SADTest, MaxRef) {
@@ -350,6 +476,69 @@ TEST_P(SADTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+  FillRandom(source_data_, source_stride_);
+
+  RunNTimes(kCountSpeedTestBlock);
+
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+  PrintMedian(title);
+}
+
+TEST_P(SADSkipTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, mask_);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSAD();
+}
+
+TEST_P(SADSkipTest, ShortRef) {
+  const int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  const int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, ShortSrc) {
+  const int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSAD();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+  FillRandom(source_data_, source_stride_);
+
+  RunNTimes(kCountSpeedTestBlock);
+
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+  PrintMedian(title);
+}
+
 TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(reference_data_, reference_stride_, mask_);
@@ -395,6 +584,19 @@ TEST_P(SADavgTest, ShortSrc) {
   source_stride_ = tmp_stride;
 }
 
+TEST_P(SADavgTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  FillRandom(second_pred_, params_.width);
+
+  RunNTimes(kCountSpeedTestBlock);
+
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+  PrintMedian(title);
+}
+
 TEST_P(SADx4Test, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
   FillConstant(GetReference(0), reference_stride_, mask_);
@@ -463,6 +665,136 @@ TEST_P(SADx4Test, SrcAlignedByWidth) {
   source_data_ = tmp_source_data;
 }
 
+TEST_P(SADx4Test, DISABLED_Speed) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height);
+  uint32_t reference_sad[4];
+  DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
+  vpx_usec_timer timer;
+  for (int block = 0; block < 4; ++block) {
+    reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
+  }
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    SADs(exp_sad);
+  }
+  vpx_usec_timer_mark(&timer);
+  for (int block = 0; block < 4; ++block) {
+    EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block;
+  }
+  const int elapsed_time =
+      static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+  printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height,
+         bit_depth_, elapsed_time);
+
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(GetReference(0), reference_stride_, mask_);
+  FillConstant(GetReference(1), reference_stride_, mask_);
+  FillConstant(GetReference(2), reference_stride_, mask_);
+  FillConstant(GetReference(3), reference_stride_, mask_);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, MaxSrc) {
+  FillConstant(source_data_, source_stride_, mask_);
+  FillConstant(GetReference(0), reference_stride_, 0);
+  FillConstant(GetReference(1), reference_stride_, 0);
+  FillConstant(GetReference(2), reference_stride_, 0);
+  FillConstant(GetReference(3), reference_stride_, 0);
+  CheckSADs();
+}
+
+TEST_P(SADSkipx4Test, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADSkipx4Test, SrcAlignedByWidth) {
+  uint8_t *tmp_source_data = source_data_;
+  source_data_ += params_.width;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  CheckSADs();
+  source_data_ = tmp_source_data;
+}
+
+TEST_P(SADSkipx4Test, DISABLED_Speed) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(GetReference(0), reference_stride_);
+  FillRandom(GetReference(1), reference_stride_);
+  FillRandom(GetReference(2), reference_stride_);
+  FillRandom(GetReference(3), reference_stride_);
+  const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height);
+  uint32_t reference_sad[4];
+  DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
+  vpx_usec_timer timer;
+  for (int block = 0; block < 4; ++block) {
+    reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block));
+  }
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    SADs(exp_sad);
+  }
+  vpx_usec_timer_mark(&timer);
+  for (int block = 0; block < 4; ++block) {
+    EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block;
+  }
+  const int elapsed_time =
+      static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+  printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height,
+         bit_depth_, elapsed_time);
+
+  reference_stride_ = tmp_stride;
+}
+
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
@@ -521,7 +853,57 @@ const SadMxNParam c_tests[] = {
   SadMxNParam(4, 4, &vpx_highbd_sad4x4_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+
+const SadSkipMxNParam skip_c_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c),
+  SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c),
+  SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c),
+  SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c),
@@ -579,7 +961,7 @@ const SadMxNAvgParam avg_c_tests[] = {
   SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
 const SadMxNx4Param x4d_c_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_c),
@@ -637,28 +1019,418 @@ const SadMxNx4Param x4d_c_tests[] = {
   SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+
+const SadSkipMxNx4Param skip_x4d_c_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c),
+  SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c),
+  SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c),
+  SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_c_tests));
 
 //------------------------------------------------------------------------------
 // ARM functions
 #if HAVE_NEON
 const SadMxNParam neon_tests[] = {
   SadMxNParam(64, 64, &vpx_sad64x64_neon),
+  SadMxNParam(64, 32, &vpx_sad64x32_neon),
   SadMxNParam(32, 32, &vpx_sad32x32_neon),
+  SadMxNParam(16, 32, &vpx_sad16x32_neon),
   SadMxNParam(16, 16, &vpx_sad16x16_neon),
   SadMxNParam(16, 8, &vpx_sad16x8_neon),
   SadMxNParam(8, 16, &vpx_sad8x16_neon),
   SadMxNParam(8, 8, &vpx_sad8x8_neon),
+  SadMxNParam(8, 4, &vpx_sad8x4_neon),
+  SadMxNParam(4, 8, &vpx_sad4x8_neon),
   SadMxNParam(4, 4, &vpx_sad4x4_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8),
+  SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8),
+  SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8),
+  SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8),
+  SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8),
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8),
+  SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10),
+  SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10),
+  SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10),
+  SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10),
+  SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10),
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10),
+  SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12),
+  SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12),
+  SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12),
+  SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12),
+  SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12),
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 };
-INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadMxNParam neon_dotprod_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod),
+  SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod),
+  SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod),
+  SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod),
+  SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod),
+  SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod),
+  SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod),
+  SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest,
+                         ::testing::ValuesIn(neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
+const SadSkipMxNParam skip_neon_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon),
+  SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon),
+  SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon),
+  SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon),
+  SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon),
+  SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8),
+  SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8),
+  SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10),
+  SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10),
+  SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12),
+  SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12),
+  SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadSkipMxNParam skip_neon_dotprod_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest,
+                         ::testing::ValuesIn(skip_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
+const SadMxNAvgParam avg_neon_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon),
+  SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon),
+  SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon),
+  SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon),
+  SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon),
+  SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8),
+  SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8),
+  SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8),
+  SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8),
+  SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8),
+  SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10),
+  SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10),
+  SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10),
+  SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10),
+  SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10),
+  SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12),
+  SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12),
+  SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12),
+  SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12),
+  SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadMxNAvgParam avg_neon_dotprod_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest,
+                         ::testing::ValuesIn(avg_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
 
 const SadMxNx4Param x4d_neon_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon),
   SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon),
   SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon),
+  SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon),
+  SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
+  SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon),
+  SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8),
+  SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8),
+  SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8),
+  SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8),
+  SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8),
+  SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10),
+  SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10),
+  SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10),
+  SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10),
+  SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10),
+  SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12),
+  SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12),
+  SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12),
+  SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12),
+  SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+
+#if HAVE_NEON_DOTPROD
+const SadMxNx4Param x4d_neon_dotprod_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test,
+                         ::testing::ValuesIn(x4d_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
+
+const SadSkipMxNx4Param skip_x4d_neon_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon),
+  SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon),
+  SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon),
+  SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon),
+  SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon),
+  SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8),
+  SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8),
+  SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10),
+  SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10),
+  SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 12),
+  SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_tests));
+
+#if HAVE_NEONE_DOTPROD
+const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_neon_dotprod_tests));
+#endif  // HAVE_NEON_DOTPROD
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
@@ -714,7 +1486,55 @@ const SadMxNParam sse2_tests[] = {
   SadMxNParam(8, 4, &vpx_highbd_sad8x4_sse2, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+
+const SadSkipMxNParam skip_sse2_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2),
+  SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2),
+  SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2),
+  SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2),
+  SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2),
+  SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2),
+  SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10),
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12),
+  SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12),
+  SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest,
+                         ::testing::ValuesIn(skip_sse2_tests));
 
 const SadMxNAvgParam avg_sse2_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2),
@@ -766,7 +1586,7 @@ const SadMxNAvgParam avg_sse2_tests[] = {
   SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_sse2, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
 const SadMxNx4Param x4d_sse2_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_sse2),
@@ -824,7 +1644,58 @@ const SadMxNx4Param x4d_sse2_tests[] = {
   SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_sse2, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+
+const SadSkipMxNx4Param skip_x4d_sse2_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2),
+  SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2),
+  SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2),
+  SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2),
+  SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2),
+  SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2),
+  SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12),
+  SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12),
+  SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12),
+  SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_sse2_tests));
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE3
@@ -835,10 +1706,6 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 // Only functions are x3, which do not have tests.
 #endif  // HAVE_SSSE3
 
-#if HAVE_SSE4_1
-// Only functions are x8, which do not have tests.
-#endif  // HAVE_SSE4_1
-
 #if HAVE_AVX2
 const SadMxNParam avx2_tests[] = {
   SadMxNParam(64, 64, &vpx_sad64x64_avx2),
@@ -846,8 +1713,74 @@ const SadMxNParam avx2_tests[] = {
   SadMxNParam(32, 64, &vpx_sad32x64_avx2),
   SadMxNParam(32, 32, &vpx_sad32x32_avx2),
   SadMxNParam(32, 16, &vpx_sad32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8),
+
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10),
+
+  SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12),
+  SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12),
+  SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12),
+  SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12),
+  SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12),
+  SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12),
+  SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12),
+  SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+
+const SadSkipMxNParam skip_avx2_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2),
+  SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2),
+  SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2),
+  SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8),
+
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10),
+
+  SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12),
+  SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12),
+  SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12),
+  SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12),
+  SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12),
+  SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12),
+  SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12),
+  SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest,
+                         ::testing::ValuesIn(skip_avx2_tests));
 
 const SadMxNAvgParam avg_avx2_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2),
@@ -855,16 +1788,140 @@ const SadMxNAvgParam avg_avx2_tests[] = {
   SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2),
   SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2),
   SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10),
+  SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12),
+  SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12),
+  SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12),
+  SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12),
+  SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12),
+  SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12),
+  SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12),
+  SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
 const SadMxNx4Param x4d_avx2_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2),
   SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8),
+  SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10),
+  SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10),
+  SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12),
+  SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12),
+  SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12),
+  SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12),
+  SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12),
+  SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12),
+  SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12),
+  SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 };
-INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+
+const SadSkipMxNx4Param skip_x4d_avx2_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2),
+  SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2),
+  SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2),
+  SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10),
+  SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12),
+  SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12),
+  SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12),
+  SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12),
+  SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12),
+  SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12),
+  SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12),
+  SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_avx2_tests));
+
 #endif  // HAVE_AVX2
 
+#if HAVE_AVX512
+const SadMxNParam avx512_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_avx512),
+  SadMxNParam(64, 32, &vpx_sad64x32_avx512),
+};
+INSTANTIATE_TEST_SUITE_P(AVX512, SADTest, ::testing::ValuesIn(avx512_tests));
+
+const SadSkipMxNParam skip_avx512_tests[] = {
+  SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx512),
+  SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx512),
+};
+INSTANTIATE_TEST_SUITE_P(AVX512, SADSkipTest,
+                         ::testing::ValuesIn(skip_avx512_tests));
+
+const SadMxNAvgParam avg_avx512_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx512),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx512),
+};
+INSTANTIATE_TEST_SUITE_P(AVX512, SADavgTest,
+                         ::testing::ValuesIn(avg_avx512_tests));
+
+const SadMxNx4Param x4d_avx512_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512),
+};
+INSTANTIATE_TEST_SUITE_P(AVX512, SADx4Test,
+                         ::testing::ValuesIn(x4d_avx512_tests));
+
+const SadSkipMxNx4Param skip_x4d_avx512_tests[] = {
+  SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx512),
+  SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx512),
+};
+INSTANTIATE_TEST_SUITE_P(AVX512, SADSkipx4Test,
+                         ::testing::ValuesIn(skip_x4d_avx512_tests));
+#endif  // HAVE_AVX512
+
 //------------------------------------------------------------------------------
 // MIPS functions
 #if HAVE_MSA
@@ -883,7 +1940,7 @@ const SadMxNParam msa_tests[] = {
   SadMxNParam(4, 8, &vpx_sad4x8_msa),
   SadMxNParam(4, 4, &vpx_sad4x4_msa),
 };
-INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
+INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
 
 const SadMxNAvgParam avg_msa_tests[] = {
   SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_msa),
@@ -900,7 +1957,7 @@ const SadMxNAvgParam avg_msa_tests[] = {
   SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_msa),
   SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_msa),
 };
-INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
 
 const SadMxNx4Param x4d_msa_tests[] = {
   SadMxNx4Param(64, 64, &vpx_sad64x64x4d_msa),
@@ -917,7 +1974,133 @@ const SadMxNx4Param x4d_msa_tests[] = {
   SadMxNx4Param(4, 8, &vpx_sad4x8x4d_msa),
   SadMxNx4Param(4, 4, &vpx_sad4x4x4d_msa),
 };
-INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
 #endif  // HAVE_MSA
 
+//------------------------------------------------------------------------------
+// VSX functions
+#if HAVE_VSX
+const SadMxNParam vsx_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_vsx),
+  SadMxNParam(64, 32, &vpx_sad64x32_vsx),
+  SadMxNParam(32, 64, &vpx_sad32x64_vsx),
+  SadMxNParam(32, 32, &vpx_sad32x32_vsx),
+  SadMxNParam(32, 16, &vpx_sad32x16_vsx),
+  SadMxNParam(16, 32, &vpx_sad16x32_vsx),
+  SadMxNParam(16, 16, &vpx_sad16x16_vsx),
+  SadMxNParam(16, 8, &vpx_sad16x8_vsx),
+  SadMxNParam(8, 16, &vpx_sad8x16_vsx),
+  SadMxNParam(8, 8, &vpx_sad8x8_vsx),
+  SadMxNParam(8, 4, &vpx_sad8x4_vsx),
+};
+INSTANTIATE_TEST_SUITE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests));
+
+const SadMxNAvgParam avg_vsx_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx),
+};
+INSTANTIATE_TEST_SUITE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests));
+
+const SadMxNx4Param x4d_vsx_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx),
+};
+INSTANTIATE_TEST_SUITE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests));
+#endif  // HAVE_VSX
+
+//------------------------------------------------------------------------------
+// Loongson functions
+#if HAVE_MMI
+const SadMxNParam mmi_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_mmi),
+  SadMxNParam(64, 32, &vpx_sad64x32_mmi),
+  SadMxNParam(32, 64, &vpx_sad32x64_mmi),
+  SadMxNParam(32, 32, &vpx_sad32x32_mmi),
+  SadMxNParam(32, 16, &vpx_sad32x16_mmi),
+  SadMxNParam(16, 32, &vpx_sad16x32_mmi),
+  SadMxNParam(16, 16, &vpx_sad16x16_mmi),
+  SadMxNParam(16, 8, &vpx_sad16x8_mmi),
+  SadMxNParam(8, 16, &vpx_sad8x16_mmi),
+  SadMxNParam(8, 8, &vpx_sad8x8_mmi),
+  SadMxNParam(8, 4, &vpx_sad8x4_mmi),
+  SadMxNParam(4, 8, &vpx_sad4x8_mmi),
+  SadMxNParam(4, 4, &vpx_sad4x4_mmi),
+};
+INSTANTIATE_TEST_SUITE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests));
+
+const SadMxNAvgParam avg_mmi_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi),
+  SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi),
+  SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi),
+  SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi),
+  SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi),
+  SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi),
+  SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi),
+  SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi),
+  SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi),
+  SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi),
+  SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi),
+  SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi),
+};
+INSTANTIATE_TEST_SUITE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests));
+
+const SadMxNx4Param x4d_mmi_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi),
+  SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi),
+  SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi),
+  SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi),
+  SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi),
+  SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi),
+  SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi),
+  SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi),
+};
+INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests));
+#endif  // HAVE_MMI
+
+//------------------------------------------------------------------------------
+// loongarch functions
+#if HAVE_LSX
+const SadMxNParam lsx_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_lsx),
+  SadMxNParam(32, 32, &vpx_sad32x32_lsx),
+  SadMxNParam(16, 16, &vpx_sad16x16_lsx),
+  SadMxNParam(8, 8, &vpx_sad8x8_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests));
+
+const SadMxNAvgParam avg_lsx_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests));
+
+const SadMxNx4Param x4d_lsx_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests));
+#endif  // HAVE_LSX
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/set_maps.sh b/media/libvpx/libvpx/test/set_maps.sh
index e7c8d43fa8..f45dc51f49 100644
--- a/media/libvpx/libvpx/test/set_maps.sh
+++ b/media/libvpx/libvpx/test/set_maps.sh
@@ -36,7 +36,7 @@ set_maps() {
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libvpx/libvpx/test/set_roi.cc b/media/libvpx/libvpx/test/set_roi.cc
index 38711a806d..ac07ca161f 100644
--- a/media/libvpx/libvpx/test/set_roi.cc
+++ b/media/libvpx/libvpx/test/set_roi.cc
@@ -15,7 +15,7 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/acm_random.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vpx_integer.h"
@@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) {
 
   // Initialize elements of cpi with valid defaults.
   VP8_COMP cpi;
-  cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA;
+  cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA;
   cpi.cyclic_refresh_mode_enabled = 0;
   cpi.mb.e_mbd.segmentation_enabled = 0;
   cpi.mb.e_mbd.update_mb_segmentation_map = 0;
@@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) {
       if (deltas_valid != roi_retval) break;
     }
 
-    // Test that we report and error if cyclic refresh is enabled.
-    cpi.cyclic_refresh_mode_enabled = 1;
-    roi_retval =
-        vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols,
-                       delta_q, delta_lf, threshold);
-    EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
-    cpi.cyclic_refresh_mode_enabled = 0;
-
     // Test invalid number of rows or colums.
     roi_retval =
         vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
@@ -169,6 +161,6 @@ TEST(VP8RoiMapTest, ParameterCheck) {
   // Free allocated memory
   if (cpi.segmentation_map) vpx_free(cpi.segmentation_map);
   if (roi_map) vpx_free(roi_map);
-};
+}
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/simple_decoder.sh b/media/libvpx/libvpx/test/simple_decoder.sh
index 7eeaf71b1c..65fc4828ed 100644
--- a/media/libvpx/libvpx/test/simple_decoder.sh
+++ b/media/libvpx/libvpx/test/simple_decoder.sh
@@ -38,7 +38,7 @@ simple_decoder() {
   fi
 
   eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libvpx/libvpx/test/simple_encoder.sh b/media/libvpx/libvpx/test/simple_encoder.sh
index ee633ae99e..dc7f46ff38 100644
--- a/media/libvpx/libvpx/test/simple_encoder.sh
+++ b/media/libvpx/libvpx/test/simple_encoder.sh
@@ -36,7 +36,7 @@ simple_encoder() {
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
diff --git a/media/libvpx/libvpx/test/stress.sh b/media/libvpx/libvpx/test/stress.sh
index 9523824764..ba79a52ac3 100644
--- a/media/libvpx/libvpx/test/stress.sh
+++ b/media/libvpx/libvpx/test/stress.sh
@@ -8,15 +8,17 @@
 ##  in the file PATENTS.  All contributing project authors may
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
-##  This file performs a stress test. It runs 5 encodes and 30 decodes in
-##  parallel.
+##  This file performs a stress test. It runs (STRESS_ONEPASS_MAX_JOBS,
+##  default=5) one, (STRESS_TWOPASS_MAX_JOBS, default=5) two pass &
+##  (STRESS_RT_MAX_JOBS, default=5) encodes and (STRESS_<codec>_DECODE_MAX_JOBS,
+##  default=30) decodes in parallel.
 
 . $(dirname $0)/tools_common.sh
 
 YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv"
 VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm"
 VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm"
-DATA_URL="http://downloads.webmproject.org/test_data/libvpx/"
+DATA_URL="https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/"
 SHA1_FILE="$(dirname $0)/test-data.sha1"
 
 # Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is
@@ -28,7 +30,7 @@ SHA1_FILE="$(dirname $0)/test-data.sha1"
 # Download a file from the url and check its sha1sum.
 download_and_check_file() {
   # Get the file from the file path.
-  local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}"
+  local root="${1#${LIBVPX_TEST_DATA_PATH}/}"
 
   # Download the file using curl. Trap to insure non partial file.
   (trap "rm -f $1" INT TERM \
@@ -50,7 +52,7 @@ stress_verify_environment() {
   fi
   for file in "${YUV}" "${VP8}" "${VP9}"; do
     if [ ! -e "${file}" ] ; then
-      download_and_check_file "${file}"
+      download_and_check_file "${file}" || return 1
     fi
   done
   if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then
@@ -70,25 +72,38 @@ stress_verify_environment() {
 # This function runs tests on libvpx that run multiple encodes and decodes
 # in parallel in hopes of catching synchronization and/or threading issues.
 stress() {
-  local readonly decoder="$(vpx_tool_path vpxdec)"
-  local readonly encoder="$(vpx_tool_path vpxenc)"
-  local readonly codec="$1"
-  local readonly webm="$2"
-  local readonly decode_count="$3"
+  local decoder="$(vpx_tool_path vpxdec)"
+  local encoder="$(vpx_tool_path vpxenc)"
+  local codec="$1"
+  local webm="$2"
+  local decode_count="$3"
+  local threads="$4"
+  local enc_args="$5"
   local pids=""
   local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5}
+  local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5}
   local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5}
 
   # Enable job control, so we can run multiple processes.
   set -m
 
+  # Start $onepass_max_jobs encode jobs in parallel.
+  for i in $(seq ${onepass_max_jobs}); do
+    bitrate=$(($i * 20 + 300))
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
+      "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=1" \
+      "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.1pass.webm" \
+      "${enc_args}" ${devnull} &
+    pids="${pids} $!"
+  done
+
   # Start $twopass_max_jobs encode jobs in parallel.
   for i in $(seq ${twopass_max_jobs}); do
     bitrate=$(($i * 20 + 300))
     eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
-      "${YUV}" "-t 4 --limit=150 --test-decode=fatal " \
-      "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.webm" \
-      ${devnull} &
+      "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=2" \
+      "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.2pass.webm" \
+      "${enc_args}" ${devnull} &
     pids="${pids} $!"
   done
 
@@ -96,7 +111,7 @@ stress() {
   for i in $(seq ${rt_max_jobs}); do
     bitrate=$(($i * 20 + 300))
     eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
-      "${YUV}" "-t 4 --limit=150 --test-decode=fatal " \
+      "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal " \
       "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \
       "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \
       "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \
@@ -109,7 +124,7 @@ stress() {
 
   # Start $decode_count decode jobs in parallel.
   for i in $(seq "${decode_count}"); do
-    eval "${decoder}" "-t 4" "${webm}" "--noblit" ${devnull} &
+    eval "${decoder}" "-t ${threads}" "${webm}" "--noblit" ${devnull} &
     pids="${pids} $!"
   done
 
@@ -125,17 +140,44 @@ vp8_stress_test() {
   local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40}
   if [ "$(vp8_decode_available)" = "yes" -a \
        "$(vp8_encode_available)" = "yes" ]; then
-    stress vp8 "${VP8}" "${vp8_max_jobs}"
+    stress vp8 "${VP8}" "${vp8_max_jobs}" 4
   fi
 }
 
-vp9_stress_test() {
+vp8_stress_test_token_parititions() {
+  local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40}
+  if [ "$(vp8_decode_available)" = "yes" -a \
+       "$(vp8_encode_available)" = "yes" ]; then
+    for threads in 2 4 8; do
+      for token_partitions in 1 2 3; do
+        stress vp8 "${VP8}" "${vp8_max_jobs}" ${threads} \
+          "--token-parts=$token_partitions"
+      done
+    done
+  fi
+}
+
+vp9_stress() {
   local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25}
 
   if [ "$(vp9_decode_available)" = "yes" -a \
        "$(vp9_encode_available)" = "yes" ]; then
-    stress vp9 "${VP9}" "${vp9_max_jobs}"
+    stress vp9 "${VP9}" "${vp9_max_jobs}" "$@"
   fi
 }
 
-run_tests stress_verify_environment "vp8_stress_test vp9_stress_test"
+vp9_stress_test() {
+  for threads in 4 8 64; do
+    vp9_stress "$threads" "--row-mt=0"
+  done
+}
+
+vp9_stress_test_row_mt() {
+  for threads in 4 8 64; do
+    vp9_stress "$threads" "--row-mt=1"
+  done
+}
+
+run_tests stress_verify_environment \
+  "vp8_stress_test vp8_stress_test_token_parititions
+   vp9_stress_test vp9_stress_test_row_mt"
diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc
index 3aa43d2ce0..8b57e772d1 100644
--- a/media/libvpx/libvpx/test/sum_squares_test.cc
+++ b/media/libvpx/libvpx/test/sum_squares_test.cc
@@ -9,10 +9,12 @@
  */
 
 #include <cmath>
+#include <cstdint>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -20,30 +22,36 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::ValuesIn;
 
 namespace {
 const int kNumIterations = 10000;
 
-typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
-typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;
+using SSI16Func = uint64_t (*)(const int16_t *src, int stride, int size);
+using SumSquaresParam = std::tuple<SSI16Func, SSI16Func>;
 
 class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
  public:
-  virtual ~SumSquaresTest() {}
-  virtual void SetUp() {
+  ~SumSquaresTest() override = default;
+  void SetUp() override {
     ref_func_ = GET_PARAM(0);
     tst_func_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   SSI16Func ref_func_;
   SSI16Func tst_func_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest);
 
 TEST_P(SumSquaresTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -102,12 +110,239 @@ TEST_P(SumSquaresTest, ExtremeValues) {
   }
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SumSquaresTest,
+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+                                 &vpx_sum_squares_2d_i16_neon)));
+#endif  // HAVE_NEON
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, SumSquaresTest,
+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+                                 &vpx_sum_squares_2d_i16_sve)));
+#endif  // HAVE_SVE
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, SumSquaresTest,
     ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
                                  &vpx_sum_squares_2d_i16_sse2)));
 #endif  // HAVE_SSE2
+
+#if HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(
+    MSA, SumSquaresTest,
+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+                                 &vpx_sum_squares_2d_i16_msa)));
+#endif  // HAVE_MSA
+
+using SSEFunc = int64_t (*)(const uint8_t *a, int a_stride, const uint8_t *b,
+                            int b_stride, int width, int height);
+
+struct TestSSEFuncs {
+  TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0)
+      : ref_func(ref), tst_func(tst), bit_depth(depth) {}
+  SSEFunc ref_func;  // Pointer to reference function
+  SSEFunc tst_func;  // Pointer to tested function
+  int bit_depth;
+};
+
+using SSETestParam = std::tuple<TestSSEFuncs, int>;
+
+class SSETest : public ::testing::TestWithParam<SSETestParam> {
+ public:
+  ~SSETest() override = default;
+  void SetUp() override {
+    params_ = GET_PARAM(0);
+    width_ = GET_PARAM(1);
+    is_hbd_ =
+#if CONFIG_VP9_HIGHBITDEPTH
+        params_.ref_func == vpx_highbd_sse_c;
+#else
+        false;
+#endif
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2));
+    ref_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2));
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
+  }
+
+  void TearDown() override {
+    vpx_free(src_);
+    vpx_free(ref_);
+  }
+  void RunTest(bool is_random, int width, int height, int run_times);
+
+  void GenRandomData(int width, int height, int stride) {
+    uint16_t *src16 = reinterpret_cast<uint16_t *>(src_);
+    uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_);
+    const int msb = 11;  // Up to 12 bit input
+    const int limit = 1 << (msb + 1);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        if (!is_hbd_) {
+          src_[ii * stride + jj] = rnd_.Rand8();
+          ref_[ii * stride + jj] = rnd_.Rand8();
+        } else {
+          src16[ii * stride + jj] = rnd_(limit);
+          ref16[ii * stride + jj] = rnd_(limit);
+        }
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, uint8_t *data,
+                      int16_t val) {
+    uint16_t *data16 = reinterpret_cast<uint16_t *>(data);
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        if (!is_hbd_) {
+          data[ii * stride + jj] = static_cast<uint8_t>(val);
+        } else {
+          data16[ii * stride + jj] = val;
+        }
+      }
+    }
+  }
+
+ protected:
+  bool is_hbd_;
+  int width_;
+  TestSSEFuncs params_;
+  uint8_t *src_;
+  uint8_t *ref_;
+  ACMRandom rnd_;
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest);
+
+void SSETest::RunTest(bool is_random, int width, int height, int run_times) {
+  int failed = 0;
+  vpx_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    int stride = 4 << rnd_(7);  // Up to 256 stride
+    while (stride < width) {    // Make sure it's valid
+      stride = 4 << rnd_(7);
+    }
+    if (is_random) {
+      GenRandomData(width, height, stride);
+    } else {
+      const int msb = is_hbd_ ? 12 : 8;  // Up to 12 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src_, 0);
+        GenExtremeData(width, height, stride, ref_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src_, limit);
+        GenExtremeData(width, height, stride, ref_, 0);
+      }
+    }
+    int64_t res_ref, res_tst;
+    uint8_t *src = src_;
+    uint8_t *ref = ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (is_hbd_) {
+      src = CONVERT_TO_BYTEPTR(src_);
+      ref = CONVERT_TO_BYTEPTR(ref_);
+    }
+#endif
+    res_ref = params_.ref_func(src, stride, ref, stride, width, height);
+    res_tst = params_.tst_func(src, stride, ref, stride, width, height);
+    if (run_times > 1) {
+      vpx_usec_timer_start(&ref_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.ref_func(src, stride, ref, stride, width, height);
+      }
+      vpx_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(vpx_usec_timer_elapsed(&ref_timer));
+
+      vpx_usec_timer_start(&test_timer);
+      for (int j = 0; j < run_times; j++) {
+        params_.tst_func(src, stride, ref, stride, width, height);
+      }
+      vpx_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(vpx_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%d\n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      if (!failed) {
+        failed = res_ref != res_tst;
+        EXPECT_EQ(res_ref, res_tst)
+            << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test ["
+            << width << "x" << height
+            << "] C output does not match optimized output.";
+      }
+    }
+  }
+}
+
+TEST_P(SSETest, OperationCheck) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(true, width_, height, 1);  // GenRandomData
+  }
+}
+
+TEST_P(SSETest, ExtremeValues) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(false, width_, height, 1);
+  }
+}
+
+TEST_P(SSETest, DISABLED_Speed) {
+  for (int height = 4; height <= 128; height += 4) {
+    RunTest(true, width_, height, 100);
+  }
+}
+
+#if HAVE_NEON
+TestSSEFuncs sse_neon[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+  TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(NEON, SSETest,
+                         Combine(ValuesIn(sse_neon), Range(4, 129, 4)));
+#endif  // HAVE_NEON
+
+#if HAVE_NEON_DOTPROD
+TestSSEFuncs sse_neon_dotprod[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod),
+};
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest,
+                         Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4)));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_SSE4_1
+TestSSEFuncs sse_sse4[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1),
+#if CONFIG_VP9_HIGHBITDEPTH
+  TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest,
+                         Combine(ValuesIn(sse_sse4), Range(4, 129, 4)));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+TestSSEFuncs sse_avx2[] = {
+  TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+  TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2)
+#endif
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, SSETest,
+                         Combine(ValuesIn(sse_avx2), Range(4, 129, 4)));
+#endif  // HAVE_AVX2
 }  // namespace
diff --git a/media/libvpx/libvpx/test/superframe_test.cc b/media/libvpx/libvpx/test/superframe_test.cc
index 421dfccd60..9319fab2ae 100644
--- a/media/libvpx/libvpx/test/superframe_test.cc
+++ b/media/libvpx/libvpx/test/superframe_test.cc
@@ -8,7 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include <climits>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <tuple>
+
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
@@ -18,36 +20,36 @@ namespace {
 
 const int kTestMode = 0;
 
-typedef std::tr1::tuple<libvpx_test::TestMode, int> SuperframeTestParam;
+using SuperframeTestParam = std::tuple<libvpx_test::TestMode, int>;
 
 class SuperframeTest
     : public ::libvpx_test::EncoderTest,
       public ::libvpx_test::CodecTestWithParam<SuperframeTestParam> {
  protected:
   SuperframeTest()
-      : EncoderTest(GET_PARAM(0)), modified_buf_(NULL), last_sf_pts_(0) {}
-  virtual ~SuperframeTest() {}
+      : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {}
+  ~SuperframeTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     const SuperframeTestParam input = GET_PARAM(1);
-    const libvpx_test::TestMode mode = std::tr1::get<kTestMode>(input);
+    const libvpx_test::TestMode mode = std::get<kTestMode>(input);
     SetMode(mode);
     sf_count_ = 0;
     sf_count_max_ = INT_MAX;
   }
 
-  virtual void TearDown() { delete[] modified_buf_; }
+  void TearDown() override { delete[] modified_buf_; }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
     }
   }
 
-  virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
-      const vpx_codec_cx_pkt_t *pkt) {
+  const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt;
 
     const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
@@ -93,7 +95,7 @@ TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
   EXPECT_EQ(sf_count_, 1);
 }
 
-VP9_INSTANTIATE_TEST_CASE(
+VP9_INSTANTIATE_TEST_SUITE(
     SuperframeTest,
     ::testing::Combine(::testing::Values(::libvpx_test::kTwoPassGood),
                        ::testing::Values(0)));
diff --git a/media/libvpx/libvpx/test/svc_datarate_test.cc b/media/libvpx/libvpx/test/svc_datarate_test.cc
new file mode 100644
index 0000000000..7e0f95277e
--- /dev/null
+++ b/media/libvpx/libvpx/test/svc_datarate_test.cc
@@ -0,0 +1,2208 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/svc_test.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace svc_test {
+namespace {
+
+enum INTER_LAYER_PRED {
+  // Inter-layer prediction is on on all frames.
+  INTER_LAYER_PRED_ON,
+  // Inter-layer prediction is off on all frames.
+  INTER_LAYER_PRED_OFF,
+  // Inter-layer prediction is off on non-key frames and non-sync frames.
+  INTER_LAYER_PRED_OFF_NONKEY,
+  // Inter-layer prediction is on on all frames, but constrained such
+  // that any layer S (> 0) can only predict from previous spatial
+  // layer S-1, from the same superframe.
+  INTER_LAYER_PRED_ON_CONSTRAINED
+};
+
+class DatarateOnePassCbrSvc : public OnePassCbrSvc {
+ public:
+  explicit DatarateOnePassCbrSvc(const ::libvpx_test::CodecFactory *codec)
+      : OnePassCbrSvc(codec) {
+    inter_layer_pred_mode_ = 0;
+  }
+
+ protected:
+  ~DatarateOnePassCbrSvc() override = default;
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    duration_ = 0.0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+    denoiser_on_ = 0;
+    tune_content_ = 0;
+    base_speed_setting_ = 5;
+    spatial_layer_id_ = 0;
+    temporal_layer_id_ = 0;
+    update_pattern_ = 0;
+    memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_));
+    memset(bits_total_, 0, sizeof(bits_total_));
+    memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_));
+    dynamic_drop_layer_ = false;
+    single_layer_resize_ = false;
+    change_bitrate_ = false;
+    last_pts_ref_ = 0;
+    middle_bitrate_ = 0;
+    top_bitrate_ = 0;
+    superframe_count_ = -1;
+    key_frame_spacing_ = 9999;
+    num_nonref_frames_ = 0;
+    layer_framedrop_ = 0;
+    force_key_ = 0;
+    force_key_test_ = 0;
+    insert_layer_sync_ = 0;
+    layer_sync_on_base_ = 0;
+    force_intra_only_frame_ = 0;
+    superframe_has_intra_only_ = 0;
+    use_post_encode_drop_ = 0;
+    denoiser_off_on_ = false;
+    denoiser_enable_layers_ = false;
+    num_resize_down_ = 0;
+    num_resize_up_ = 0;
+    for (int i = 0; i < VPX_MAX_LAYERS; i++) {
+      prev_frame_width_[i] = 320;
+      prev_frame_height_[i] = 240;
+    }
+    ksvc_flex_noupd_tlenh_ = false;
+    external_resize_dynamic_drop_layer_ = false;
+    external_resize_pattern_ = 0;
+    superframe_cnt_ = 0;
+  }
+  void BeginPassHook(unsigned int /*pass*/) override {}
+
+  // Example pattern for spatial layers and 2 temporal layers used in the
+  // bypass/flexible mode. The pattern corresponds to the pattern
+  // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+  // non-flexible mode, except that we disable inter-layer prediction.
+  void set_frame_flags_bypass_mode(int tl, int num_spatial_layers,
+                                   int is_key_frame,
+                                   vpx_svc_ref_frame_config_t *ref_frame_config,
+                                   int noupdate_tlenh) {
+    for (int sl = 0; sl < num_spatial_layers; ++sl)
+      ref_frame_config->update_buffer_slot[sl] = 0;
+
+    for (int sl = 0; sl < num_spatial_layers; ++sl) {
+      if (tl == 0) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        if (sl) {
+          if (is_key_frame) {
+            ref_frame_config->lst_fb_idx[sl] = sl - 1;
+            ref_frame_config->gld_fb_idx[sl] = sl;
+          } else {
+            ref_frame_config->gld_fb_idx[sl] = sl - 1;
+          }
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = 0;
+        }
+        ref_frame_config->alt_fb_idx[sl] = 0;
+      } else if (tl == 1) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] =
+            VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl - 1);
+        ref_frame_config->alt_fb_idx[sl] =
+            VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl);
+      }
+      if (!tl) {
+        if (!sl) {
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->lst_fb_idx[sl];
+        } else {
+          if (is_key_frame) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 0;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->gld_fb_idx[sl];
+          } else {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 0;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->lst_fb_idx[sl];
+          }
+        }
+      } else if (tl == 1) {
+        if (!sl) {
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->alt_fb_idx[sl];
+        } else {
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          // Non reference frame on top temporal top spatial.
+          ref_frame_config->update_buffer_slot[sl] = 0;
+        }
+        // Force no update on all spatial layers for temporal enhancement layer
+        // frames.
+        if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0;
+      }
+    }
+  }
+
+  void CheckLayerRateTargeting(int num_spatial_layers, int num_temporal_layers,
+                               double thresh_overshoot,
+                               double thresh_undershoot) const {
+    for (int sl = 0; sl < num_spatial_layers; ++sl)
+      for (int tl = 0; tl < num_temporal_layers; ++tl) {
+        const int layer = sl * num_temporal_layers + tl;
+        ASSERT_GE(cfg_.layer_target_bitrate[layer],
+                  file_datarate_[layer] * thresh_overshoot)
+            << " The datarate for the file exceeds the target by too much!";
+        ASSERT_LE(cfg_.layer_target_bitrate[layer],
+                  file_datarate_[layer] * thresh_undershoot)
+            << " The datarate for the file is lower than the target by too "
+               "much!";
+      }
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    PreEncodeFrameHookSetup(video, encoder);
+
+    if (video->frame() == 0) {
+      if (force_intra_only_frame_) {
+        // Decoder sets the color_space for Intra-only frames
+        // to BT_601 (see line 1810 in vp9_decodeframe.c).
+        // So set it here in these tess to avoid encoder-decoder
+        // mismatch check on color space setting.
+        encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601);
+      }
+      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
+      encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_);
+
+      if (layer_framedrop_) {
+        vpx_svc_frame_drop_t svc_drop_frame;
+        svc_drop_frame.framedrop_mode = LAYER_DROP;
+        for (int i = 0; i < number_spatial_layers_; i++)
+          svc_drop_frame.framedrop_thresh[i] = 30;
+        svc_drop_frame.max_consec_drop = 30;
+        encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame);
+      }
+
+      if (use_post_encode_drop_) {
+        encoder->Control(VP9E_SET_POSTENCODE_DROP, use_post_encode_drop_);
+      }
+      // We want to force external resize on the very first frame.
+      if (external_resize_dynamic_drop_layer_) video->Next();
+    }
+
+    if (denoiser_off_on_) {
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
+      // Set inter_layer_pred to INTER_LAYER_PRED_OFF_NONKEY (K-SVC).
+      encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, 2);
+      if (!denoiser_enable_layers_) {
+        if (video->frame() == 0)
+          encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0);
+        else if (video->frame() == 100)
+          encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1);
+      } else {
+        // Cumulative bitrates for top spatial layers, for
+        // 3 temporal layers.
+        if (video->frame() == 0) {
+          encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0);
+          // Change layer bitrates to set top spatial layer to 0.
+          // This is for 3 spatial 3 temporal layers.
+          // This will trigger skip encoding/dropping of top spatial layer.
+          cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+          for (int i = 0; i < 3; i++)
+            bitrate_sl3_[i] = cfg_.layer_target_bitrate[i + 6];
+          cfg_.layer_target_bitrate[6] = 0;
+          cfg_.layer_target_bitrate[7] = 0;
+          cfg_.layer_target_bitrate[8] = 0;
+          encoder->Config(&cfg_);
+        } else if (video->frame() == 100) {
+          // Change layer bitrates to non-zero on top spatial layer.
+          // This will trigger skip encoding of top spatial layer
+          // on key frame (period = 100).
+          for (int i = 0; i < 3; i++)
+            cfg_.layer_target_bitrate[i + 6] = bitrate_sl3_[i];
+          cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[8];
+          encoder->Config(&cfg_);
+        } else if (video->frame() == 120) {
+          // Enable denoiser and top spatial layer after key frame (period is
+          // 100).
+          encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1);
+        }
+      }
+    }
+
+    if (ksvc_flex_noupd_tlenh_) {
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0, &ref_frame_config_,
+                                  1);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    }
+
+    if (update_pattern_ && video->frame() >= 100) {
+      vpx_svc_layer_id_t layer_id;
+      if (video->frame() == 100) {
+        cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+        encoder->Config(&cfg_);
+      }
+      // Set layer id since the pattern changed.
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0, &ref_frame_config_,
+                                  0);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    }
+
+    if (change_bitrate_ && video->frame() == 200) {
+      duration_ = (last_pts_ + 1) * timebase_;
+      for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+        for (int tl = 0; tl < number_temporal_layers_; ++tl) {
+          const int layer = sl * number_temporal_layers_ + tl;
+          const double file_size_in_kb = bits_total_[layer] / 1000.;
+          file_datarate_[layer] = file_size_in_kb / duration_;
+        }
+      }
+
+      CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_,
+                              0.78, 1.15);
+
+      memset(file_datarate_, 0, sizeof(file_datarate_));
+      memset(bits_total_, 0, sizeof(bits_total_));
+      int64_t bits_in_buffer_model_tmp[VPX_MAX_LAYERS];
+      last_pts_ref_ = last_pts_;
+      // Set new target bitarate.
+      cfg_.rc_target_bitrate = cfg_.rc_target_bitrate >> 1;
+      // Buffer level should not reset on dynamic bitrate change.
+      memcpy(bits_in_buffer_model_tmp, bits_in_buffer_model_,
+             sizeof(bits_in_buffer_model_));
+      AssignLayerBitrates();
+      memcpy(bits_in_buffer_model_, bits_in_buffer_model_tmp,
+             sizeof(bits_in_buffer_model_));
+
+      // Change config to update encoder with new bitrate configuration.
+      encoder->Config(&cfg_);
+    }
+
+    if (external_resize_dynamic_drop_layer_) {
+      frame_flags_ = 0;
+      for (int i = 0; i < 9; ++i) {
+        svc_params_.min_quantizers[i] = 20;
+        svc_params_.max_quantizers[i] = 56;
+      }
+      if (video->frame() == 1 || video->frame() == 150) {
+        // Set the new top width/height for external resize.
+        top_sl_width_ = video->img()->d_w;
+        top_sl_height_ = video->img()->d_h;
+        for (int i = 0; i < 9; ++i) {
+          bitrate_layer_[i] = cfg_.layer_target_bitrate[i];
+        }
+        if (external_resize_pattern_ == 1) {
+          // Input size is 1/4. 2 top spatial layers are dropped.
+          // This will trigger skip encoding/dropping of two top spatial layers.
+          cfg_.rc_target_bitrate -=
+              cfg_.layer_target_bitrate[5] + cfg_.layer_target_bitrate[8];
+          for (int i = 3; i < 9; ++i) {
+            cfg_.layer_target_bitrate[i] = 0;
+          }
+          for (int sl = 0; sl < 3; sl++) {
+            svc_params_.scaling_factor_num[sl] = 1;
+            svc_params_.scaling_factor_den[sl] = 1;
+          }
+        } else if (external_resize_pattern_ == 2) {
+          // Input size is 1/2. Top spatial layer is dropped.
+          // This will trigger skip encoding/dropping of top spatial layer.
+          cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+          for (int i = 6; i < 9; ++i) {
+            cfg_.layer_target_bitrate[i] = 0;
+          }
+          svc_params_.scaling_factor_num[0] = 1;
+          svc_params_.scaling_factor_den[0] = 2;
+          svc_params_.scaling_factor_num[1] = 1;
+          svc_params_.scaling_factor_den[1] = 1;
+          svc_params_.scaling_factor_num[2] = 1;
+          svc_params_.scaling_factor_den[2] = 1;
+        }
+        encoder->Config(&cfg_);
+        encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      } else if (video->frame() == 50 || video->frame() == 200) {
+        top_sl_width_ = video->img()->d_w;
+        top_sl_height_ = video->img()->d_h;
+        if (external_resize_pattern_ == 1) {
+          // Input size is 1/2. Change layer bitrates to set top layer to 0.
+          // This will trigger skip encoding/dropping of top spatial layer.
+          cfg_.rc_target_bitrate += bitrate_layer_[5];
+          for (int i = 3; i < 6; ++i) {
+            cfg_.layer_target_bitrate[i] = bitrate_layer_[i];
+          }
+          svc_params_.scaling_factor_num[0] = 1;
+          svc_params_.scaling_factor_den[0] = 2;
+          svc_params_.scaling_factor_num[1] = 1;
+          svc_params_.scaling_factor_den[1] = 1;
+          svc_params_.scaling_factor_num[2] = 1;
+          svc_params_.scaling_factor_den[2] = 1;
+        } else if (external_resize_pattern_ == 2) {
+          // Input size is 1/4. Change layer bitrates to set two top layers to
+          // 0. This will trigger skip encoding/dropping of two top spatial
+          // layers.
+          cfg_.rc_target_bitrate -= bitrate_layer_[5];
+          for (int i = 3; i < 6; ++i) {
+            cfg_.layer_target_bitrate[i] = 0;
+          }
+          for (int sl = 0; sl < 3; sl++) {
+            svc_params_.scaling_factor_num[sl] = 1;
+            svc_params_.scaling_factor_den[sl] = 1;
+          }
+        }
+        encoder->Config(&cfg_);
+        encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      } else if (video->frame() == 100 || video->frame() == 250) {
+        top_sl_width_ = video->img()->d_w;
+        top_sl_height_ = video->img()->d_h;
+        // Input is original size. Change layer bitrates to nonzero for all
+        // layers.
+        cfg_.rc_target_bitrate =
+            bitrate_layer_[2] + bitrate_layer_[5] + bitrate_layer_[8];
+        for (int i = 0; i < 9; ++i) {
+          cfg_.layer_target_bitrate[i] = bitrate_layer_[i];
+        }
+        svc_params_.scaling_factor_num[0] = 1;
+        svc_params_.scaling_factor_den[0] = 4;
+        svc_params_.scaling_factor_num[1] = 1;
+        svc_params_.scaling_factor_den[1] = 2;
+        svc_params_.scaling_factor_num[2] = 1;
+        svc_params_.scaling_factor_den[2] = 1;
+        encoder->Config(&cfg_);
+        encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      }
+    } else if (dynamic_drop_layer_ && !single_layer_resize_) {
+      if (video->frame() == 0) {
+        // Change layer bitrates to set top layers to 0. This will trigger skip
+        // encoding/dropping of top two spatial layers.
+        cfg_.rc_target_bitrate -=
+            (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]);
+        middle_bitrate_ = cfg_.layer_target_bitrate[1];
+        top_bitrate_ = cfg_.layer_target_bitrate[2];
+        cfg_.layer_target_bitrate[1] = 0;
+        cfg_.layer_target_bitrate[2] = 0;
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 50) {
+        // Change layer bitrates to non-zero on two top spatial layers.
+        // This will trigger skip encoding of top two spatial layers.
+        cfg_.layer_target_bitrate[1] = middle_bitrate_;
+        cfg_.layer_target_bitrate[2] = top_bitrate_;
+        cfg_.rc_target_bitrate +=
+            cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1];
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 100) {
+        // Change layer bitrates to set top layers to 0. This will trigger skip
+        // encoding/dropping of top two spatial layers.
+        cfg_.rc_target_bitrate -=
+            (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]);
+        middle_bitrate_ = cfg_.layer_target_bitrate[1];
+        top_bitrate_ = cfg_.layer_target_bitrate[2];
+        cfg_.layer_target_bitrate[1] = 0;
+        cfg_.layer_target_bitrate[2] = 0;
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 150) {
+        // Change layer bitrate on second layer to non-zero to start
+        // encoding it again.
+        cfg_.layer_target_bitrate[1] = middle_bitrate_;
+        cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[1];
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 200) {
+        // Change layer bitrate on top layer to non-zero to start
+        // encoding it again.
+        cfg_.layer_target_bitrate[2] = top_bitrate_;
+        cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2];
+        encoder->Config(&cfg_);
+      }
+    } else if (dynamic_drop_layer_ && single_layer_resize_) {
+      // Change layer bitrates to set top layers to 0. This will trigger skip
+      // encoding/dropping of top spatial layers.
+      if (video->frame() == 2) {
+        cfg_.rc_target_bitrate -=
+            (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]);
+        middle_bitrate_ = cfg_.layer_target_bitrate[1];
+        top_bitrate_ = cfg_.layer_target_bitrate[2];
+        cfg_.layer_target_bitrate[1] = 0;
+        cfg_.layer_target_bitrate[2] = 0;
+        // Set spatial layer 0 to a very low bitrate to trigger resize.
+        cfg_.layer_target_bitrate[0] = 30;
+        cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0];
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 100) {
+        // Set base spatial layer to very high to go back up to original size.
+        cfg_.layer_target_bitrate[0] = 400;
+        cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0];
+        encoder->Config(&cfg_);
+      }
+    } else if (!dynamic_drop_layer_ && single_layer_resize_) {
+      if (video->frame() == 2) {
+        cfg_.layer_target_bitrate[0] = 30;
+        cfg_.layer_target_bitrate[1] = 50;
+        cfg_.rc_target_bitrate =
+            (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]);
+        encoder->Config(&cfg_);
+      } else if (video->frame() == 160) {
+        cfg_.layer_target_bitrate[0] = 1500;
+        cfg_.layer_target_bitrate[1] = 2000;
+        cfg_.rc_target_bitrate =
+            (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]);
+        encoder->Config(&cfg_);
+      }
+    }
+    if (force_key_test_ && force_key_) frame_flags_ = VPX_EFLAG_FORCE_KF;
+
+    if (insert_layer_sync_) {
+      vpx_svc_spatial_layer_sync_t svc_layer_sync;
+      svc_layer_sync.base_layer_intra_only = 0;
+      for (int i = 0; i < number_spatial_layers_; i++)
+        svc_layer_sync.spatial_layer_sync[i] = 0;
+      if (force_intra_only_frame_) {
+        superframe_has_intra_only_ = 0;
+        if (video->frame() == 0) {
+          svc_layer_sync.base_layer_intra_only = 1;
+          svc_layer_sync.spatial_layer_sync[0] = 1;
+          encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync);
+          superframe_has_intra_only_ = 1;
+        } else if (video->frame() == 100) {
+          svc_layer_sync.base_layer_intra_only = 1;
+          svc_layer_sync.spatial_layer_sync[0] = 1;
+          encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync);
+          superframe_has_intra_only_ = 1;
+        }
+      } else {
+        layer_sync_on_base_ = 0;
+        if (video->frame() == 150) {
+          svc_layer_sync.spatial_layer_sync[1] = 1;
+          encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync);
+        } else if (video->frame() == 240) {
+          svc_layer_sync.spatial_layer_sync[2] = 1;
+          encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync);
+        } else if (video->frame() == 320) {
+          svc_layer_sync.spatial_layer_sync[0] = 1;
+          layer_sync_on_base_ = 1;
+          encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync);
+        }
+      }
+    }
+
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+    superframe_cnt_++;
+  }
+
+  vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz,
+                                         uint32_t sizes[8], int *count) {
+    uint8_t marker;
+    marker = *(data + data_sz - 1);
+    *count = 0;
+    if ((marker & 0xe0) == 0xc0) {
+      const uint32_t frames = (marker & 0x7) + 1;
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const size_t index_sz = 2 + mag * frames;
+      // This chunk is marked as having a superframe index but doesn't have
+      // enough data for it, thus it's an invalid superframe index.
+      if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
+      {
+        const uint8_t marker2 = *(data + data_sz - index_sz);
+        // This chunk is marked as having a superframe index but doesn't have
+        // the matching marker byte at the front of the index therefore it's an
+        // invalid chunk.
+        if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
+      }
+      {
+        uint32_t i, j;
+        const uint8_t *x = &data[data_sz - index_sz + 1];
+        for (i = 0; i < frames; ++i) {
+          uint32_t this_sz = 0;
+
+          for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+          sizes[i] = this_sz;
+        }
+        *count = frames;
+      }
+    }
+    return VPX_CODEC_OK;
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    uint32_t sizes[8] = { 0 };
+    uint32_t sizes_parsed[8] = { 0 };
+    int count = 0;
+    int num_layers_encoded = 0;
+    last_pts_ = pkt->data.frame.pts;
+    const bool key_frame =
+        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
+    if (external_resize_dynamic_drop_layer_) {
+      // No key frames expected in stream, except for first.
+      if (cfg_.kf_max_dist > 1000) {
+        ASSERT_FALSE(key_frame && superframe_cnt_ > 1);
+      }
+    }
+    if (key_frame) {
+      // For test that inserts layer sync frames: requesting a layer_sync on
+      // the base layer must force key frame. So if any key frame occurs after
+      // first superframe it must due to layer sync on base spatial layer.
+      if (superframe_count_ > 0 && insert_layer_sync_ &&
+          !force_intra_only_frame_) {
+        ASSERT_EQ(layer_sync_on_base_, 1);
+      }
+      temporal_layer_id_ = 0;
+      superframe_count_ = 0;
+    }
+    parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                           pkt->data.frame.sz, sizes_parsed, &count);
+    // Count may be less than number of spatial layers because of frame drops.
+    if (number_spatial_layers_ > 1) {
+      for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+        if (pkt->data.frame.spatial_layer_encoded[sl]) {
+          sizes[sl] = sizes_parsed[num_layers_encoded];
+          num_layers_encoded++;
+        }
+      }
+    }
+    // For superframe with Intra-only count will be +1 larger
+    // because of no-show frame.
+    if (force_intra_only_frame_ && superframe_has_intra_only_)
+      ASSERT_EQ(count, num_layers_encoded + 1);
+    else
+      ASSERT_EQ(count, num_layers_encoded);
+
+    // In the constrained frame drop mode, if a given spatial is dropped all
+    // upper layers must be dropped too.
+    if (!layer_framedrop_) {
+      int num_layers_dropped = 0;
+      for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+        if (!pkt->data.frame.spatial_layer_encoded[sl]) {
+          // Check that all upper layers are dropped.
+          num_layers_dropped++;
+          for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2)
+            ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0);
+        }
+      }
+      if (num_layers_dropped == number_spatial_layers_ - 1)
+        force_key_ = 1;
+      else
+        force_key_ = 0;
+    }
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1])
+      num_nonref_frames_++;
+    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+      sizes[sl] = sizes[sl] << 3;
+      // Update the total encoded bits per layer.
+      // For temporal layers, update the cumulative encoded bits per layer.
+      for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+        const int layer = sl * number_temporal_layers_ + tl;
+        bits_total_[layer] += static_cast<int64_t>(sizes[sl]);
+        // Update the per-layer buffer level with the encoded frame size.
+        bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]);
+        // There should be no buffer underrun, except on the base
+        // temporal layer, since there may be key frames there.
+        // Fo short key frame spacing, buffer can underrun on individual frames.
+        if (!key_frame && tl > 0 && key_frame_spacing_ < 100) {
+          ASSERT_GE(bits_in_buffer_model_[layer], 0)
+              << "Buffer Underrun at frame " << pkt->data.frame.pts;
+        }
+      }
+
+      if (!single_layer_resize_ && sl < number_spatial_layers_ - 1) {
+        unsigned int scaled_width = top_sl_width_ *
+                                    svc_params_.scaling_factor_num[sl] /
+                                    svc_params_.scaling_factor_den[sl];
+        if (scaled_width % 2 != 0) scaled_width += 1;
+        ASSERT_EQ(pkt->data.frame.width[sl], scaled_width);
+        unsigned int scaled_height = top_sl_height_ *
+                                     svc_params_.scaling_factor_num[sl] /
+                                     svc_params_.scaling_factor_den[sl];
+        if (scaled_height % 2 != 0) scaled_height += 1;
+        ASSERT_EQ(pkt->data.frame.height[sl], scaled_height);
+      } else if (superframe_count_ > 0) {
+        if (pkt->data.frame.width[sl] < prev_frame_width_[sl] &&
+            pkt->data.frame.height[sl] < prev_frame_height_[sl])
+          num_resize_down_ += 1;
+        if (pkt->data.frame.width[sl] > prev_frame_width_[sl] &&
+            pkt->data.frame.height[sl] > prev_frame_height_[sl])
+          num_resize_up_ += 1;
+      }
+      prev_frame_width_[sl] = pkt->data.frame.width[sl];
+      prev_frame_height_[sl] = pkt->data.frame.height[sl];
+    }
+  }
+
+  void EndPassHook() override {
+    if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_;
+    duration_ = (last_pts_ + 1) * timebase_;
+    for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+      for (int tl = 0; tl < number_temporal_layers_; ++tl) {
+        const int layer = sl * number_temporal_layers_ + tl;
+        const double file_size_in_kb = bits_total_[layer] / 1000.;
+        file_datarate_[layer] = file_size_in_kb / duration_;
+      }
+    }
+  }
+
+  void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override {
+    // TODO(marpan): Look into why an assert is triggered in compute_psnr
+    // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh.
+    // Has to do with dropped frames in bypass/flexible svc mode.
+    if (!ksvc_flex_noupd_tlenh_) {
+      double mismatch_psnr = compute_psnr(img1, img2);
+      mismatch_psnr_ += mismatch_psnr;
+      ++mismatch_nframes_;
+    }
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetNonRefFrames() { return num_nonref_frames_; }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int64_t bits_total_[VPX_MAX_LAYERS];
+  double duration_;
+  double file_datarate_[VPX_MAX_LAYERS];
+  size_t bits_in_last_frame_;
+  double mismatch_psnr_;
+  int denoiser_on_;
+  int tune_content_;
+  int spatial_layer_id_;
+  bool dynamic_drop_layer_;
+  bool single_layer_resize_;
+  unsigned int top_sl_width_;
+  unsigned int top_sl_height_;
+  vpx_svc_ref_frame_config_t ref_frame_config_;
+  int update_pattern_;
+  bool change_bitrate_;
+  vpx_codec_pts_t last_pts_ref_;
+  int middle_bitrate_;
+  int top_bitrate_;
+  int key_frame_spacing_;
+  int layer_framedrop_;
+  int force_key_;
+  int force_key_test_;
+  int inter_layer_pred_mode_;
+  int insert_layer_sync_;
+  int layer_sync_on_base_;
+  int force_intra_only_frame_;
+  int superframe_has_intra_only_;
+  int use_post_encode_drop_;
+  int bitrate_sl3_[3];
+  // Denoiser switched on the fly.
+  bool denoiser_off_on_;
+  // Top layer enabled on the fly.
+  bool denoiser_enable_layers_;
+  int num_resize_up_;
+  int num_resize_down_;
+  unsigned int prev_frame_width_[VPX_MAX_LAYERS];
+  unsigned int prev_frame_height_[VPX_MAX_LAYERS];
+  bool ksvc_flex_noupd_tlenh_;
+  bool external_resize_dynamic_drop_layer_;
+  int bitrate_layer_[9];
+  int external_resize_pattern_;
+  int superframe_cnt_;
+
+ private:
+  void SetConfig(const int num_temporal_layer) override {
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    if (num_temporal_layer == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.temporal_layering_mode = 3;
+    } else if (num_temporal_layer == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.temporal_layering_mode = 2;
+    } else if (num_temporal_layer == 1) {
+      cfg_.ts_rate_decimator[0] = 1;
+      cfg_.temporal_layering_mode = 0;
+    }
+  }
+
+  unsigned int num_nonref_frames_;
+  unsigned int mismatch_nframes_;
+};
+
+void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
+                         unsigned int initial_h, unsigned int *w,
+                         unsigned int *h, int resize_pattern) {
+  *w = initial_w;
+  *h = initial_h;
+  if (resize_pattern == 1) {
+    if (frame < 50) {
+      *w = initial_w / 4;
+      *h = initial_h / 4;
+    } else if (frame < 100) {
+      *w = initial_w / 2;
+      *h = initial_h / 2;
+    } else if (frame < 150) {
+      *w = initial_w;
+      *h = initial_h;
+    } else if (frame < 200) {
+      *w = initial_w / 4;
+      *h = initial_h / 4;
+    } else if (frame < 250) {
+      *w = initial_w / 2;
+      *h = initial_h / 2;
+    }
+  } else if (resize_pattern == 2) {
+    if (frame < 50) {
+      *w = initial_w / 2;
+      *h = initial_h / 2;
+    } else if (frame < 100) {
+      *w = initial_w / 4;
+      *h = initial_h / 4;
+    } else if (frame < 150) {
+      *w = initial_w;
+      *h = initial_h;
+    } else if (frame < 200) {
+      *w = initial_w / 2;
+      *h = initial_h / 2;
+    } else if (frame < 250) {
+      *w = initial_w / 4;
+      *h = initial_h / 4;
+    }
+  }
+}
+
+class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  ResizingVideoSource(int width, int height) {
+    top_width_ = width;
+    top_height_ = height;
+    SetSize(top_width_, top_height_);
+    limit_ = 300;
+  }
+  int external_resize_pattern_ = 1;
+  int force_zero_source_ = 0;
+  int top_width_;
+  int top_height_;
+  ~ResizingVideoSource() override = default;
+
+ protected:
+  void Next() override {
+    ++frame_;
+    unsigned int width = 0;
+    unsigned int height = 0;
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    ScaleForFrameNumber(frame_, top_width_, top_height_, &width, &height,
+                        external_resize_pattern_);
+    SetSize(width, height);
+    FillFrame();
+    unsigned char *image = img_->planes[0];
+    for (size_t i = 0; i < raw_sz_; ++i) {
+      image[i] = rnd.Rand8();
+      if (force_zero_source_ && frame_ % 20 == 0) image[i] = 0;
+    }
+  }
+};
+
+// Params: speed setting.
+class DatarateOnePassCbrSvcSingleBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcSingleBR() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for 4:4:4 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) {
+  SetSvcConfig(3, 3);
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+  cfg_.g_profile = 1;
+  cfg_.g_bit_depth = VPX_BITS_8;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 352;
+  top_sl_height_ = 288;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers, for 4:2:2 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) {
+  SetSvcConfig(2, 3);
+  ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20);
+  cfg_.g_profile = 1;
+  cfg_.g_bit_depth = VPX_BITS_8;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 160;
+  top_sl_height_ = 90;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Use large under/over shoot thresholds as this is a very short clip,
+  // so not good for testing rate-targeting.
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+                          1.7);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profle 2 10bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) {
+  SetSvcConfig(3, 3);
+  ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20);
+  cfg_.g_profile = 2;
+  cfg_.g_bit_depth = VPX_BITS_10;
+  cfg_.g_input_bit_depth = VPX_BITS_10;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 160;
+  top_sl_height_ = 90;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(marpan/jianj): Comment out the rate-target checking for now
+  // as superframe parsing to get frame size needs to be fixed for
+  // high bitdepth.
+  /*
+  // Use large under/over shoot thresholds as this is a very short clip,
+  // so not good for testing rate-targeting.
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+                          1.7);
+  */
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profle 2 12bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) {
+  SetSvcConfig(3, 3);
+  ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20);
+  cfg_.g_profile = 2;
+  cfg_.g_bit_depth = VPX_BITS_12;
+  cfg_.g_input_bit_depth = VPX_BITS_12;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 9999;
+
+  top_sl_width_ = 160;
+  top_sl_height_ = 90;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(marpan/jianj): Comment out the rate-target checking for now
+  // as superframe parsing to get frame size needs to be fixed for
+  // high bitdepth.
+  /*
+  // Use large under/over shoot thresholds as this is a very short clip,
+  // so not good for testing rate-targeting.
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+                          1.7);
+  */
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
+// temporal layer, with screen content mode on and same speed setting for all
+// layers.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) {
+  SetSvcConfig(2, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  cfg_.rc_target_bitrate = 500;
+  ResetModel();
+  tune_content_ = 1;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers, with force key frame after frame drop
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 100;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.25);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one
+// generate via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables
+// inter-layer prediction.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) {
+  SetSvcConfig(3, 2);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  // Change SVC pattern on the fly.
+  update_pattern_ = 1;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal
+// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching
+// of denoiser from off to on (on at frame = 100). Key frame period is set to
+// 1000 so denoise is enabled on non-key.
+TEST_P(DatarateOnePassCbrSvcSingleBR,
+       OnePassCbrSvc3SL3TL_DenoiserOffOnFixedLayers) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 1000;
+  ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280,
+                                       720, 30, 1, 0, 300);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  denoiser_off_on_ = true;
+  denoiser_enable_layers_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Don't check rate targeting on two top spatial layer since they will be
+  // skipped for part of the sequence.
+  CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_,
+                          0.78, 1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal
+// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching
+// of denoiser from off to on, for dynamic layers. Start at 2 spatial layers
+// and enable 3rd spatial layer at frame = 100. Use periodic key frame with
+// period 100 so enabling of spatial layer occurs at key frame. Enable denoiser
+// at frame > 100, after the key frame sync.
+TEST_P(DatarateOnePassCbrSvcSingleBR,
+       OnePassCbrSvc3SL3TL_DenoiserOffOnEnableLayers) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.kf_max_dist = 100;
+  ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280,
+                                       720, 30, 1, 0, 300);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  denoiser_off_on_ = true;
+  denoiser_enable_layers_ = true;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Don't check rate targeting on two top spatial layer since they will be
+  // skipped for part of the sequence.
+  CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_,
+                          0.78, 1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on
+// the fly switching to 1 and then 2 and back to 3 spatial layers. This switch
+// is done by setting spatial layer bitrates to 0, and then back to non-zero,
+// during the sequence.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) {
+  SetSvcConfig(3, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  dynamic_drop_layer_ = true;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Don't check rate targeting on two top spatial layer since they will be
+  // skipped for part of the sequence.
+  CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_,
+                          0.78, 1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC with 2 spatial layers and on
+// the fly switching to 1 spatial layer with dynamic resize enabled.
+// The resizer will resize the single layer down and back up again, as the
+// bitrate goes back up.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL_SingleLayerResize) {
+  SetSvcConfig(2, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_resize_allowed = 1;
+  ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280,
+                                       720, 15, 1, 0, 300);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  dynamic_drop_layer_ = true;
+  single_layer_resize_ = true;
+  base_speed_setting_ = speed_setting_;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Expect at least one resize down and at least one resize back up.
+  EXPECT_GE(num_resize_down_, 1);
+  EXPECT_GE(num_resize_up_, 1);
+  // Don't check rate targeting on two top spatial layer since they will be
+  // skipped for part of the sequence.
+  CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_,
+                          0.78, 1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// For 1 pass CBR SVC with 1 spatial and 2 temporal layers with dynamic resize
+// and denoiser enabled. The resizer will resize the single layer down and back
+// up again, as the bitrate goes back up.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc1SL2TL_DenoiseResize) {
+  SetSvcConfig(1, 2);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 2;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_resize_allowed = 1;
+  ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280,
+                                       720, 12, 1, 0, 300);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  dynamic_drop_layer_ = false;
+  single_layer_resize_ = true;
+  denoiser_on_ = 1;
+  base_speed_setting_ = speed_setting_;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Expect at least one resize down and at least one resize back up.
+  EXPECT_GE(num_resize_down_, 1);
+  EXPECT_GE(num_resize_up_, 1);
+}
+
+// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial
+// downscale 5x5.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TL5x5MultipleRuns) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 3;
+  cfg_.temporal_layering_mode = 0;
+  svc_params_.scaling_factor_num[0] = 256;
+  svc_params_.scaling_factor_den[0] = 1280;
+  svc_params_.scaling_factor_num[1] = 1280;
+  svc_params_.scaling_factor_den[1] = 1280;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 999999;
+  cfg_.kf_min_dist = 0;
+  cfg_.ss_target_bitrate[0] = 300;
+  cfg_.ss_target_bitrate[1] = 1400;
+  cfg_.layer_target_bitrate[0] = 300;
+  cfg_.layer_target_bitrate[1] = 1400;
+  cfg_.rc_target_bitrate = 1700;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ResetModel();
+  layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30;
+  bits_in_buffer_model_[0] =
+      cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz;
+  layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30;
+  bits_in_buffer_model_[1] =
+      cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize
+// and denoiser enabled. The external resizer will resize down and back up,
+// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable
+// layers. Resizing starts on first frame and the pattern is:
+//  1/4 -> 1/2 -> 1 -> 1/4 -> 1/2.
+TEST_P(DatarateOnePassCbrSvcSingleBR,
+       OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern1) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 40;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.kf_max_dist = 10000;
+  cfg_.kf_min_dist = 10000;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  ResizingVideoSource video(1280, 720);
+  video.external_resize_pattern_ = 1;
+  video.force_zero_source_ = 0;
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  dynamic_drop_layer_ = false;
+  single_layer_resize_ = false;
+  denoiser_on_ = 1;
+  base_speed_setting_ = speed_setting_;
+  external_resize_dynamic_drop_layer_ = true;
+  external_resize_pattern_ = video.external_resize_pattern_;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize
+// and denoiser enabled. The external resizer will resize down and back up,
+// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable
+// layers. Resizing starts on first frame and the pattern is:
+//  1/2 -> 1/4 -> 1 -> 1/2 -> 1/4.
+TEST_P(DatarateOnePassCbrSvcSingleBR,
+       OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern2) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 40;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.kf_max_dist = 10000;
+  cfg_.kf_min_dist = 10000;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  ResizingVideoSource video(1280, 720);
+  video.external_resize_pattern_ = 2;
+  video.force_zero_source_ = 0;
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  dynamic_drop_layer_ = false;
+  single_layer_resize_ = false;
+  denoiser_on_ = 1;
+  base_speed_setting_ = speed_setting_;
+  external_resize_dynamic_drop_layer_ = true;
+  external_resize_pattern_ = video.external_resize_pattern_;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize
+// and denoiser enabled. The external resizer will resize down and back up,
+// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable
+// layers. Resizing starts on first frame and the pattern is:
+//  1/2 -> 1/4 -> 1 -> 1/2 -> 1/4. This test uses 4 threads with small keyframe
+// spacing, and top resolution is 1280x960.
+TEST_P(DatarateOnePassCbrSvcSingleBR,
+       OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern2Key4Threads) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 40;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.temporal_layering_mode = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.kf_max_dist = 40;
+  cfg_.kf_min_dist = 40;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.g_w = 1280;
+  cfg_.g_h = 960;
+  top_sl_width_ = cfg_.g_w;
+  top_sl_height_ = cfg_.g_h;
+  ResizingVideoSource video(1280, 960);
+  video.external_resize_pattern_ = 2;
+  video.force_zero_source_ = 0;
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  dynamic_drop_layer_ = false;
+  single_layer_resize_ = false;
+  denoiser_on_ = 1;
+  base_speed_setting_ = speed_setting_;
+  external_resize_dynamic_drop_layer_ = true;
+  external_resize_pattern_ = video.external_resize_pattern_;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize
+// and denoiser enabled. The external resizer will resize down and back up,
+// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable
+// layers. Resizing starts on first frame and the pattern is:
+//  1/4 -> 1/2 -> 1 -> 1/4 -> 1/2. The source will be set to 0 every x frames,
+// otherwise random values, to trigger scene detection in the encoder.
+TEST_P(DatarateOnePassCbrSvcSingleBR,
+       OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern1SceneChange) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 40;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.kf_max_dist = 10000;
+  cfg_.kf_min_dist = 10000;
+  cfg_.rc_resize_allowed = 0;
+  cfg_.g_w = 1280;
+  cfg_.g_h = 720;
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  ResizingVideoSource video(1280, 720);
+  video.external_resize_pattern_ = 1;
+  video.force_zero_source_ = 1;
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  dynamic_drop_layer_ = false;
+  single_layer_resize_ = false;
+  denoiser_on_ = 1;
+  base_speed_setting_ = speed_setting_;
+  external_resize_dynamic_drop_layer_ = true;
+  external_resize_pattern_ = video.external_resize_pattern_;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+// Params: speed setting and index for bitrate array.
+class DatarateOnePassCbrSvcMultiBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcMultiBR() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  const int bitrates[3] = { 200, 400, 600 };
+  // TODO(marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)];
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.75,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and
+// 3 temporal layers. Run VGA clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_VBR;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)];
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
+                          1.3);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: speed setting, layer framedrop control and index for bitrate array.
+class DatarateOnePassCbrSvcFrameDropMultiBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith3Params<int, int, int> {
+ public:
+  DatarateOnePassCbrSvcFrameDropMultiBR()
+      : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.64,
+                          1.45);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads, for 1284x770, which
+// likely is the issue for Bug: 366146260.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR,
+       OnePassCbrSvc3SL3TL4Threads1284x770) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1284_770_30.y4m", 0, 60);
+  top_sl_width_ = 1284;
+  top_sl_height_ = 770;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads, for 1857x167.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR,
+       OnePassCbrSvc3SL3TL4Threads1857x167) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1857_167_30.y4m", 0, 60);
+  top_sl_width_ = 1857;
+  top_sl_height_ = 167;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 2 temporal layers, for KSVC in flexible mode with no update of reference
+// frames for all spatial layers on TL > 0 superframes.
+// Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) {
+  SetSvcConfig(3, 2);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  top_sl_width_ = 1280;
+  top_sl_height_ = 720;
+  layer_framedrop_ = 0;
+  const int bitrates[3] = { 200, 400, 600 };
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  layer_framedrop_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ksvc_flex_noupd_tlenh_ = true;
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58,
+                          1.2);
+}
+
+// Params: speed setting, inter-layer prediction mode.
+class DatarateOnePassCbrSvcInterLayerPredSingleBR
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateOnePassCbrSvcInterLayerPredSingleBR()
+      : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    inter_layer_pred_mode_ = GET_PARAM(2);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting with different inter-layer prediction modes for 1
+// pass CBR SVC: 3 spatial layers and 3 temporal layers. Run CIF clip with 1
+// thread.
+TEST_P(DatarateOnePassCbrSvcInterLayerPredSingleBR, OnePassCbrSvc3SL3TL) {
+  // Disable test for inter-layer pred off for now since simulcast_mode fails.
+  if (inter_layer_pred_mode_ == INTER_LAYER_PRED_OFF) return;
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check rate targeting with different inter-layer prediction modes for 1 pass
+// CBR SVC: 3 spatial layers and 3 temporal layers, changing the target bitrate
+// at the middle of encoding.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLDynamicBitrateChange) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  change_bitrate_ = true;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Params: speed setting, noise sensitivity, index for bitrate array and inter
+// layer pred mode.
+class DatarateOnePassCbrSvcDenoiser
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith4Params<int, int, int, int> {
+ public:
+  DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcDenoiser() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    inter_layer_pred_mode_ = GET_PARAM(3);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC with denoising.
+// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads.
+TEST_P(DatarateOnePassCbrSvcDenoiser, OnePassCbrSvc2SL3TLDenoiserOn) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 2;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  const int bitrates[3] = { 600, 800, 1000 };
+  // TODO(marpan): Check that effective_datarate for each layer hits the
+  // layer target_bitrate.
+  // For SVC, noise_sen = 1 means denoising only the top spatial layer
+  // noise_sen = 2 means denoising the two top spatial layers.
+  cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)];
+  ResetModel();
+  denoiser_on_ = GET_PARAM(2);
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
+// Params: speed setting, key frame dist.
+class DatarateOnePassCbrSvcSmallKF
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcSmallKF() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 800;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+  const int kf_dist = GET_PARAM(2);
+  cfg_.kf_max_dist = kf_dist;
+  key_frame_spacing_ = kf_dist;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) {
+  SetSvcConfig(2, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+  const int kf_dist = GET_PARAM(2) + 32;
+  cfg_.kf_max_dist = kf_dist;
+  key_frame_spacing_ = kf_dist;
+  ResetModel();
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run VGA clip with 1 thread, and place layer sync frames:
+// one at middle layer first, then another one for top layer, and another
+// insert for base spatial layer (which forces key frame).
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLSyncFrames) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  ResetModel();
+  insert_layer_sync_ = 1;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+                          1.15);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Run SVC encoder for 3 spatial layers, 1 temporal layer, with
+// intra-only frame as sync frame on base spatial layer.
+// Intra_only is inserted at start and in middle of sequence.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL1TLSyncWithIntraOnly) {
+  SetSvcConfig(3, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 4;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_target_bitrate = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  ResetModel();
+  insert_layer_sync_ = 1;
+  // Use intra_only frame for sync on base layer.
+  force_intra_only_frame_ = 1;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+                          1.2);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Run SVC encoder for 2 quality layers (same resolution different,
+// bitrates), 1 temporal layer, with screen content mode.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2QL1TLScreen) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 2;
+  svc_params_.scaling_factor_num[0] = 1;
+  svc_params_.scaling_factor_den[0] = 1;
+  svc_params_.scaling_factor_num[1] = 1;
+  svc_params_.scaling_factor_den[1] = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  top_sl_width_ = 640;
+  top_sl_height_ = 480;
+  ResetModel();
+  tune_content_ = 1;
+  // Set the layer bitrates, for 2 spatial layers, 1 temporal.
+  cfg_.rc_target_bitrate = 400;
+  cfg_.ss_target_bitrate[0] = 100;
+  cfg_.ss_target_bitrate[1] = 300;
+  cfg_.layer_target_bitrate[0] = 100;
+  cfg_.layer_target_bitrate[1] = 300;
+  for (int sl = 0; sl < 2; ++sl) {
+    float layer_framerate = 30.0;
+    layer_target_avg_bandwidth_[sl] = static_cast<int>(
+        cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate);
+    bits_in_buffer_model_[sl] =
+        cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+                          1.25);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: speed setting.
+class DatarateOnePassCbrSvcPostencodeDrop
+    : public DatarateOnePassCbrSvc,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+  }
+  ~DatarateOnePassCbrSvcPostencodeDrop() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    speed_setting_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Run SVC encoder for 2 quality layers (same resolution different,
+// bitrates), 1 temporal layer, with screen content mode.
+TEST_P(DatarateOnePassCbrSvcPostencodeDrop, OnePassCbrSvc2QL1TLScreen) {
+  cfg_.rc_buf_initial_sz = 200;
+  cfg_.rc_buf_optimal_sz = 200;
+  cfg_.rc_buf_sz = 400;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 52;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.temporal_layering_mode = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 2;
+  svc_params_.scaling_factor_num[0] = 1;
+  svc_params_.scaling_factor_den[0] = 1;
+  svc_params_.scaling_factor_num[1] = 1;
+  svc_params_.scaling_factor_den[1] = 1;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.kf_max_dist = 9999;
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  top_sl_width_ = 352;
+  top_sl_height_ = 288;
+  ResetModel();
+  base_speed_setting_ = speed_setting_;
+  tune_content_ = 1;
+  use_post_encode_drop_ = 1;
+  // Set the layer bitrates, for 2 spatial layers, 1 temporal.
+  cfg_.rc_target_bitrate = 400;
+  cfg_.ss_target_bitrate[0] = 100;
+  cfg_.ss_target_bitrate[1] = 300;
+  cfg_.layer_target_bitrate[0] = 100;
+  cfg_.layer_target_bitrate[1] = 300;
+  for (int sl = 0; sl < 2; ++sl) {
+    float layer_framerate = 30.0;
+    layer_target_avg_bandwidth_[sl] = static_cast<int>(
+        cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate);
+    bits_in_buffer_model_[sl] =
+        cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73,
+                          1.25);
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSingleBR,
+                           ::testing::Range(5, 10));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcPostencodeDrop,
+                           ::testing::Range(5, 6));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcInterLayerPredSingleBR,
+                           ::testing::Range(5, 10), ::testing::Range(0, 3));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcMultiBR,
+                           ::testing::Range(5, 10), ::testing::Range(0, 3));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcFrameDropMultiBR,
+                           ::testing::Range(5, 10), ::testing::Range(0, 2),
+                           ::testing::Range(0, 3));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcDenoiser,
+                           ::testing::Range(5, 10), ::testing::Range(1, 3),
+                           ::testing::Range(0, 3), ::testing::Range(0, 4));
+#endif
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSmallKF,
+                           ::testing::Range(5, 10), ::testing::Range(32, 36));
+}  // namespace
+}  // namespace svc_test
diff --git a/media/libvpx/libvpx/test/svc_end_to_end_test.cc b/media/libvpx/libvpx/test/svc_end_to_end_test.cc
new file mode 100644
index 0000000000..3bd6b1307c
--- /dev/null
+++ b/media/libvpx/libvpx/test/svc_end_to_end_test.cc
@@ -0,0 +1,825 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/svc_test.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace svc_test {
+namespace {
+
+enum INTER_LAYER_PRED {
+  // Inter-layer prediction is on on all frames.
+  INTER_LAYER_PRED_ON,
+  // Inter-layer prediction is off on all frames.
+  INTER_LAYER_PRED_OFF,
+  // Inter-layer prediction is off on non-key frames and non-sync frames.
+  INTER_LAYER_PRED_OFF_NONKEY,
+  // Inter-layer prediction is on on all frames, but constrained such
+  // that any layer S (> 0) can only predict from previous spatial
+  // layer S-1, from the same superframe.
+  INTER_LAYER_PRED_ON_CONSTRAINED
+};
+
+class ScalePartitionOnePassCbrSvc
+    : public OnePassCbrSvc,
+      public ::testing::TestWithParam<const ::libvpx_test::CodecFactory *> {
+ public:
+  ScalePartitionOnePassCbrSvc()
+      : OnePassCbrSvc(GetParam()), mismatch_nframes_(0), num_nonref_frames_(0) {
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+ protected:
+  ~ScalePartitionOnePassCbrSvc() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    speed_setting_ = 7;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    PreEncodeFrameHookSetup(video, encoder);
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1])
+      num_nonref_frames_++;
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
+    ++mismatch_nframes_;
+  }
+
+  void SetConfig(const int /*num_temporal_layer*/) override {}
+
+  unsigned int GetMismatchFrames() const { return mismatch_nframes_; }
+  unsigned int GetNonRefFrames() const { return num_nonref_frames_; }
+
+ private:
+  unsigned int mismatch_nframes_;
+  unsigned int num_nonref_frames_;
+};
+
+TEST_P(ScalePartitionOnePassCbrSvc, OnePassCbrSvc3SL3TL1080P) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  ::libvpx_test::I420VideoSource video(
+      "slides_code_term_web_plot.1920_1080.yuv", 1920, 1080, 30, 1, 0, 100);
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: Inter layer prediction modes.
+class SyncFrameOnePassCbrSvc : public OnePassCbrSvc,
+                               public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  SyncFrameOnePassCbrSvc()
+      : OnePassCbrSvc(GET_PARAM(0)), current_video_frame_(0),
+        frame_to_start_decode_(0), frame_to_sync_(0),
+        inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1),
+        decode_to_layer_after_sync_(-1), denoiser_on_(0),
+        intra_only_test_(false), loopfilter_off_(0), mismatch_nframes_(0),
+        num_nonref_frames_(0) {
+    SetMode(::libvpx_test::kRealTime);
+    memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_));
+  }
+
+ protected:
+  ~SyncFrameOnePassCbrSvc() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    speed_setting_ = 7;
+  }
+
+  bool DoDecode() const override {
+    return current_video_frame_ >= frame_to_start_decode_;
+  }
+
+  // Example pattern for spatial layers and 2 temporal layers used in the
+  // bypass/flexible mode. The pattern corresponds to the pattern
+  // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+  // non-flexible mode.
+  void set_frame_flags_bypass_mode(
+      int tl, int num_spatial_layers, int is_key_frame,
+      vpx_svc_ref_frame_config_t *ref_frame_config) {
+    int sl;
+    for (sl = 0; sl < num_spatial_layers; ++sl)
+      ref_frame_config->update_buffer_slot[sl] = 0;
+
+    for (sl = 0; sl < num_spatial_layers; ++sl) {
+      // Set the buffer idx.
+      if (tl == 0) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        if (sl) {
+          if (is_key_frame) {
+            ref_frame_config->lst_fb_idx[sl] = sl - 1;
+            ref_frame_config->gld_fb_idx[sl] = sl;
+          } else {
+            ref_frame_config->gld_fb_idx[sl] = sl - 1;
+          }
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = 0;
+        }
+        ref_frame_config->alt_fb_idx[sl] = 0;
+      } else if (tl == 1) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] =
+            (sl == 0) ? 0 : num_spatial_layers + sl - 1;
+        ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+      }
+      // Set the reference and update flags.
+      if (!tl) {
+        if (!sl) {
+          // Base spatial and base temporal (sl = 0, tl = 0)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->lst_fb_idx[sl];
+        } else {
+          if (is_key_frame) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 0;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->gld_fb_idx[sl];
+          } else {
+            // Non-zero spatiall layer.
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 1;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->lst_fb_idx[sl];
+          }
+        }
+      } else if (tl == 1) {
+        if (!sl) {
+          // Base spatial and top temporal (tl = 1)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->alt_fb_idx[sl];
+        } else {
+          // Non-zero spatial.
+          if (sl < num_spatial_layers - 1) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->alt_fb_idx[sl];
+          } else if (sl == num_spatial_layers - 1) {
+            // Top spatial and top temporal (non-reference -- doesn't
+            // update any reference buffers).
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+          }
+        }
+      }
+    }
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    current_video_frame_ = video->frame();
+    PreEncodeFrameHookSetup(video, encoder);
+    if (video->frame() == 0) {
+      // Do not turn off inter-layer pred completely because simulcast mode
+      // fails.
+      if (inter_layer_pred_mode_ != INTER_LAYER_PRED_OFF)
+        encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_);
+      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      if (intra_only_test_)
+        // Decoder sets the color_space for Intra-only frames
+        // to BT_601 (see line 1810 in vp9_decodeframe.c).
+        // So set it here in these tess to avoid encoder-decoder
+        // mismatch check on color space setting.
+        encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601);
+
+      encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_);
+    }
+    if (flexible_mode_) {
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0,
+                                  &ref_frame_config_);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    }
+    if (video->frame() == frame_to_sync_) {
+      encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_);
+    }
+  }
+
+#if CONFIG_VP9_DECODER
+  void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Decoder *decoder) override {
+    if (video->frame() < frame_to_sync_) {
+      if (decode_to_layer_before_sync_ >= 0)
+        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
+                         decode_to_layer_before_sync_);
+    } else {
+      if (decode_to_layer_after_sync_ >= 0) {
+        int decode_to_layer = decode_to_layer_after_sync_;
+        // Overlay frame is additional layer for intra-only.
+        if (video->frame() == frame_to_sync_ && intra_only_test_ &&
+            decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1)
+          decode_to_layer += 1;
+        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer);
+      }
+    }
+  }
+#endif
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1] &&
+        current_video_frame_ >= frame_to_sync_)
+      num_nonref_frames_++;
+
+    if (intra_only_test_ && current_video_frame_ == frame_to_sync_) {
+      // Intra-only frame is only generated for spatial layers > 1 and <= 3,
+      // among other conditions (see constraint in set_intra_only_frame(). If
+      // intra-only is no allowed then encoder will insert key frame instead.
+      const bool key_frame =
+          (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
+      if (number_spatial_layers_ == 1 || number_spatial_layers_ > 3)
+        ASSERT_TRUE(key_frame);
+      else
+        ASSERT_FALSE(key_frame);
+    }
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
+    if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() const { return mismatch_nframes_; }
+  unsigned int GetNonRefFrames() const { return num_nonref_frames_; }
+
+  unsigned int current_video_frame_;
+  unsigned int frame_to_start_decode_;
+  unsigned int frame_to_sync_;
+  int inter_layer_pred_mode_;
+  int decode_to_layer_before_sync_;
+  int decode_to_layer_after_sync_;
+  int denoiser_on_;
+  bool intra_only_test_;
+  int loopfilter_off_;
+  vpx_svc_spatial_layer_sync_t svc_layer_sync_;
+  unsigned int mismatch_nframes_;
+  unsigned int num_nonref_frames_;
+  bool flexible_mode_;
+  vpx_svc_ref_frame_config_t ref_frame_config_;
+
+ private:
+  void SetConfig(const int num_temporal_layer) override {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.g_threads = 1;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.kf_max_dist = 9999;
+    if (num_temporal_layer == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.temporal_layering_mode = 3;
+    } else if (num_temporal_layer == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.temporal_layering_mode = 2;
+    } else if (num_temporal_layer == 1) {
+      cfg_.ts_rate_decimator[0] = 1;
+      cfg_.temporal_layering_mode = 0;
+    }
+  }
+};
+
+// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Only start decoding on the sync layer.
+// Full sync: insert key frame on base layer.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) {
+  SetSvcConfig(3, 3);
+  // Sync is on base layer so the frame to sync and the frame to start decoding
+  // is the same.
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = -1;
+  decode_to_layer_after_sync_ = 2;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 0;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Decoding QVGA before sync frame and decode up to
+// VGA on and after sync.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) {
+  SetSvcConfig(2, 3);
+  frame_to_start_decode_ = 0;
+  frame_to_sync_ = 100;
+  decode_to_layer_before_sync_ = 0;
+  decode_to_layer_after_sync_ = 1;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 0;
+  svc_layer_sync_.spatial_layer_sync[0] = 0;
+  svc_layer_sync_.spatial_layer_sync[1] = 1;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 400;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Decoding QVGA and VGA before sync frame and decode up to
+// HD on and after sync.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 0;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 1;
+  decode_to_layer_after_sync_ = 2;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 0;
+  svc_layer_sync_.spatial_layer_sync[0] = 0;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 1;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Decoding QVGA before sync frame and decode up to
+// HD on and after sync.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 0;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 0;
+  decode_to_layer_after_sync_ = 2;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 0;
+  svc_layer_sync_.spatial_layer_sync[0] = 0;
+  svc_layer_sync_.spatial_layer_sync[1] = 1;
+  svc_layer_sync_.spatial_layer_sync[2] = 1;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and
+// 3 temporal layers. Decoding QVGA before sync frame and decode up to
+// VGA on and after sync.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) {
+  SetSvcConfig(2, 3);
+  frame_to_start_decode_ = 0;
+  frame_to_sync_ = 100;
+  decode_to_layer_before_sync_ = 0;
+  decode_to_layer_after_sync_ = 1;
+
+  denoiser_on_ = 1;
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 0;
+  svc_layer_sync_.spatial_layer_sync[0] = 0;
+  svc_layer_sync_.spatial_layer_sync[1] = 1;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 400;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
+// Encode 3 spatial, 2 temporal layer in flexible mode but don't
+// start decoding. During the sequence insert intra-only on base/qvga
+// layer at frame 20 and start decoding only QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) {
+  SetSvcConfig(3, 2);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = true;
+  AssignLayerBitrates();
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Can't check mismatch here because only base is decoded at
+  // frame sync, whereas encoder continues encoding all layers.
+}
+
+// Encode 3 spatial, 3 temporal layer but don't start decoding.
+// During the sequence insert intra-only on base/qvga layer at frame 20
+// and start decoding only QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Can't check mismatch here because only base is decoded at
+  // frame sync, whereas encoder continues encoding all layers.
+}
+
+// Start decoding from beginning of sequence, during sequence insert intra-only
+// on base/qvga layer. Decode all layers.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 0;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  // The superframe containing intra-only layer will have +1 frames. Thus set
+  // the layer to decode after sync frame to +1 from
+  // decode_to_layer_before_sync.
+  decode_to_layer_after_sync_ = 3;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Start decoding from beginning of sequence, during sequence insert intra-only
+// on base/qvga layer and sync_layer on middle/VGA layer. Decode all layers.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 0;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  // The superframe containing intra-only layer will have +1 frames. Thus set
+  // the layer to decode after sync frame to +1 from
+  // decode_to_layer_before_sync.
+  decode_to_layer_after_sync_ = 3;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 1;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Start decoding from sync frame, insert intra-only on base/qvga layer. Decode
+// all layers. For 1 spatial layer, it inserts a key frame.
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) {
+  SetSvcConfig(1, 3);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 0;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  // The non-reference frames are expected to be mismatched frames as the
+  // encoder will avoid loopfilter on these frames.
+  EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Params: Loopfilter modes.
+class LoopfilterOnePassCbrSvc : public OnePassCbrSvc,
+                                public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  LoopfilterOnePassCbrSvc()
+      : OnePassCbrSvc(GET_PARAM(0)), loopfilter_off_(GET_PARAM(1)),
+        mismatch_nframes_(0), num_nonref_frames_(0) {
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+ protected:
+  ~LoopfilterOnePassCbrSvc() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    speed_setting_ = 7;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    PreEncodeFrameHookSetup(video, encoder);
+    if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) {
+      // Consider 3 cases:
+      if (loopfilter_off_ == 0) {
+        // loopfilter is on for all spatial layers on every superrframe.
+        for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+          svc_params_.loopfilter_ctrl[i] = 0;
+        }
+      } else if (loopfilter_off_ == 1) {
+        // loopfilter is off for non-reference frames for all spatial layers.
+        for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+          svc_params_.loopfilter_ctrl[i] = 1;
+        }
+      } else {
+        // loopfilter is off for all SL0 frames, and off only for non-reference
+        // frames for SL > 0.
+        svc_params_.loopfilter_ctrl[0] = 2;
+        for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
+          svc_params_.loopfilter_ctrl[i] = 1;
+        }
+      }
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+    } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) {
+      // For non-SVC mode use the single layer control.
+      encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_);
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // Keep track of number of non-reference frames, needed for mismatch check.
+    // Non-reference frames are top spatial and temporal layer frames,
+    // for TL > 0.
+    if (temporal_layer_id_ == number_temporal_layers_ - 1 &&
+        temporal_layer_id_ > 0 &&
+        pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1])
+      num_nonref_frames_++;
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
+    ++mismatch_nframes_;
+  }
+
+  void SetConfig(const int /*num_temporal_layer*/) override {}
+
+  int GetMismatchFrames() const { return mismatch_nframes_; }
+  int GetNonRefFrames() const { return num_nonref_frames_; }
+
+  int loopfilter_off_;
+
+ private:
+  int mismatch_nframes_;
+  int num_nonref_frames_;
+};
+
+TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL1TLLoopfilterOff) {
+  SetSvcConfig(1, 1);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 1;
+  cfg_.temporal_layering_mode = 0;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  if (loopfilter_off_ == 0)
+    EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+  else
+    EXPECT_EQ(GetMismatchFrames(), 0);
+#endif
+}
+
+TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) {
+  SetSvcConfig(1, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  if (loopfilter_off_ == 0)
+    EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+  else
+    EXPECT_EQ(GetMismatchFrames(), 0);
+#endif
+}
+
+TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc3SL3TLLoopfilterOff) {
+  SetSvcConfig(3, 3);
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_threads = 1;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_target_bitrate = 800;
+  cfg_.kf_max_dist = 9999;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 1;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.temporal_layering_mode = 3;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 600;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_VP9_DECODER
+  if (loopfilter_off_ == 0)
+    EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+  else
+    EXPECT_EQ(GetMismatchFrames(), 0);
+#endif
+}
+
+VP9_INSTANTIATE_TEST_SUITE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3));
+
+VP9_INSTANTIATE_TEST_SUITE(LoopfilterOnePassCbrSvc, ::testing::Range(0, 3));
+
+INSTANTIATE_TEST_SUITE_P(
+    VP9, ScalePartitionOnePassCbrSvc,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+
+}  // namespace
+}  // namespace svc_test
diff --git a/media/libvpx/libvpx/test/svc_test.cc b/media/libvpx/libvpx/test/svc_test.cc
index 482d9fffa1..cbc0abe032 100644
--- a/media/libvpx/libvpx/test/svc_test.cc
+++ b/media/libvpx/libvpx/test/svc_test.cc
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,782 +8,128 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/i420_video_source.h"
+#include "test/svc_test.h"
 
-#include "vp9/decoder/vp9_decoder.h"
-
-#include "vpx/svc_context.h"
-#include "vpx/vp8cx.h"
-#include "vpx/vpx_encoder.h"
-
-namespace {
-
-using libvpx_test::CodecFactory;
-using libvpx_test::Decoder;
-using libvpx_test::DxDataIterator;
-using libvpx_test::VP9CodecFactory;
-
-class SvcTest : public ::testing::Test {
- protected:
-  static const uint32_t kWidth = 352;
-  static const uint32_t kHeight = 288;
-
-  SvcTest()
-      : codec_iface_(0), test_file_name_("hantro_collage_w352h288.yuv"),
-        codec_initialized_(false), decoder_(0) {
-    memset(&svc_, 0, sizeof(svc_));
-    memset(&codec_, 0, sizeof(codec_));
-    memset(&codec_enc_, 0, sizeof(codec_enc_));
+namespace svc_test {
+void OnePassCbrSvc::SetSvcConfig(const int num_spatial_layer,
+                                 const int num_temporal_layer) {
+  SetConfig(num_temporal_layer);
+  cfg_.ss_number_layers = num_spatial_layer;
+  cfg_.ts_number_layers = num_temporal_layer;
+  if (num_spatial_layer == 1) {
+    svc_params_.scaling_factor_num[0] = 288;
+    svc_params_.scaling_factor_den[0] = 288;
+  } else if (num_spatial_layer == 2) {
+    svc_params_.scaling_factor_num[0] = 144;
+    svc_params_.scaling_factor_den[0] = 288;
+    svc_params_.scaling_factor_num[1] = 288;
+    svc_params_.scaling_factor_den[1] = 288;
+  } else if (num_spatial_layer == 3) {
+    svc_params_.scaling_factor_num[0] = 72;
+    svc_params_.scaling_factor_den[0] = 288;
+    svc_params_.scaling_factor_num[1] = 144;
+    svc_params_.scaling_factor_den[1] = 288;
+    svc_params_.scaling_factor_num[2] = 288;
+    svc_params_.scaling_factor_den[2] = 288;
   }
+  number_spatial_layers_ = cfg_.ss_number_layers;
+  number_temporal_layers_ = cfg_.ts_number_layers;
+}
 
-  virtual ~SvcTest() {}
-
-  virtual void SetUp() {
-    svc_.log_level = SVC_LOG_DEBUG;
-    svc_.log_print = 0;
-
-    codec_iface_ = vpx_codec_vp9_cx();
-    const vpx_codec_err_t res =
-        vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-
-    codec_enc_.g_w = kWidth;
-    codec_enc_.g_h = kHeight;
-    codec_enc_.g_timebase.num = 1;
-    codec_enc_.g_timebase.den = 60;
-    codec_enc_.kf_min_dist = 100;
-    codec_enc_.kf_max_dist = 100;
-
-    vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
-    VP9CodecFactory codec_factory;
-    decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
-
-    tile_columns_ = 0;
-    tile_rows_ = 0;
-  }
-
-  virtual void TearDown() {
-    ReleaseEncoder();
-    delete (decoder_);
-  }
-
-  void InitializeEncoder() {
-    const vpx_codec_err_t res =
-        vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
-    vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
-    vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
-    codec_initialized_ = true;
-  }
-
-  void ReleaseEncoder() {
-    vpx_svc_release(&svc_);
-    if (codec_initialized_) vpx_codec_destroy(&codec_);
-    codec_initialized_ = false;
-  }
-
-  void GetStatsData(std::string *const stats_buf) {
-    vpx_codec_iter_t iter = NULL;
-    const vpx_codec_cx_pkt_t *cx_pkt;
-
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
-      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
-        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
-        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
-        stats_buf->append(static_cast<char *>(cx_pkt->data.twopass_stats.buf),
-                          cx_pkt->data.twopass_stats.sz);
+void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video,
+                                            ::libvpx_test::Encoder *encoder) {
+  if (video->frame() == 0) {
+    for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
+      svc_params_.max_quantizers[i] = 63;
+      svc_params_.min_quantizers[i] = 0;
+    }
+    if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) {
+      svc_params_.speed_per_layer[0] = base_speed_setting_;
+      for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) {
+        svc_params_.speed_per_layer[i] = speed_setting_;
       }
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+    }
+    encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
+    encoder->Control(VP9E_SET_AQ_MODE, 3);
+    encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads));
+    encoder->Control(VP9E_SET_ROW_MT, 1);
+    encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1);
+  }
+
+  superframe_count_++;
+  temporal_layer_id_ = 0;
+  if (number_temporal_layers_ == 2) {
+    temporal_layer_id_ = (superframe_count_ % 2 != 0);
+  } else if (number_temporal_layers_ == 3) {
+    if (superframe_count_ % 2 != 0) temporal_layer_id_ = 2;
+    if (superframe_count_ > 1) {
+      if ((superframe_count_ - 2) % 4 == 0) temporal_layer_id_ = 1;
     }
   }
 
-  void Pass1EncodeNFrames(const int n, const int layers,
-                          std::string *const stats_buf) {
-    vpx_codec_err_t res;
+  frame_flags_ = 0;
+}
 
-    ASSERT_GT(n, 0);
-    ASSERT_GT(layers, 0);
-    svc_.spatial_layers = layers;
-    codec_enc_.g_pass = VPX_RC_FIRST_PASS;
-    InitializeEncoder();
-
-    libvpx_test::I420VideoSource video(
-        test_file_name_, codec_enc_.g_w, codec_enc_.g_h,
-        codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30);
-    video.Begin();
-
-    for (int i = 0; i < n; ++i) {
-      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                           video.duration(), VPX_DL_GOOD_QUALITY);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-      GetStatsData(stats_buf);
-      video.Next();
-    }
-
-    // Flush encoder and test EOS packet.
-    res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(), video.duration(),
-                         VPX_DL_GOOD_QUALITY);
-    ASSERT_EQ(VPX_CODEC_OK, res);
-    GetStatsData(stats_buf);
-
-    ReleaseEncoder();
-  }
-
-  void StoreFrames(const size_t max_frame_received,
-                   struct vpx_fixed_buf *const outputs,
-                   size_t *const frame_received) {
-    vpx_codec_iter_t iter = NULL;
-    const vpx_codec_cx_pkt_t *cx_pkt;
-
-    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
-      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
-        const size_t frame_size = cx_pkt->data.frame.sz;
-
-        EXPECT_GT(frame_size, 0U);
-        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
-        ASSERT_LT(*frame_received, max_frame_received);
-
-        if (*frame_received == 0)
-          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
-
-        outputs[*frame_received].buf = malloc(frame_size + 16);
-        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
-        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
-               frame_size);
-        outputs[*frame_received].sz = frame_size;
-        ++(*frame_received);
-      }
+void OnePassCbrSvc::PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+  vpx_svc_layer_id_t layer_id;
+  encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+  temporal_layer_id_ = layer_id.temporal_layer_id;
+  for (int sl = 0; sl < number_spatial_layers_; ++sl) {
+    for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) {
+      const int layer = sl * number_temporal_layers_ + tl;
+      bits_in_buffer_model_[layer] +=
+          static_cast<int64_t>(layer_target_avg_bandwidth_[layer]);
     }
   }
+}
 
-  void Pass2EncodeNFrames(std::string *const stats_buf, const int n,
-                          const int layers,
-                          struct vpx_fixed_buf *const outputs) {
-    vpx_codec_err_t res;
-    size_t frame_received = 0;
-
-    ASSERT_TRUE(outputs != NULL);
-    ASSERT_GT(n, 0);
-    ASSERT_GT(layers, 0);
-    svc_.spatial_layers = layers;
-    codec_enc_.rc_target_bitrate = 500;
-    if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
-      ASSERT_TRUE(stats_buf != NULL);
-      ASSERT_GT(stats_buf->size(), 0U);
-      codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
-      codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
-    }
-    InitializeEncoder();
-
-    libvpx_test::I420VideoSource video(
-        test_file_name_, codec_enc_.g_w, codec_enc_.g_h,
-        codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30);
-    video.Begin();
-
-    for (int i = 0; i < n; ++i) {
-      res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
-                           video.duration(), VPX_DL_GOOD_QUALITY);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-      StoreFrames(n, outputs, &frame_received);
-      video.Next();
-    }
-
-    // Flush encoder.
-    res = vpx_svc_encode(&svc_, &codec_, NULL, 0, video.duration(),
-                         VPX_DL_GOOD_QUALITY);
-    EXPECT_EQ(VPX_CODEC_OK, res);
-    StoreFrames(n, outputs, &frame_received);
-
-    EXPECT_EQ(frame_received, static_cast<size_t>(n));
-
-    ReleaseEncoder();
-  }
-
-  void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
-    int decoded_frames = 0;
-    int received_frames = 0;
-
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(n, 0);
-
-    for (int i = 0; i < n; ++i) {
-      ASSERT_TRUE(inputs[i].buf != NULL);
-      ASSERT_GT(inputs[i].sz, 0U);
-      const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
-          static_cast<const uint8_t *>(inputs[i].buf), inputs[i].sz);
-      ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-      ++decoded_frames;
-
-      DxDataIterator dec_iter = decoder_->GetDxData();
-      while (dec_iter.Next() != NULL) {
-        ++received_frames;
-      }
-    }
-    EXPECT_EQ(decoded_frames, n);
-    EXPECT_EQ(received_frames, n);
-  }
-
-  void DropEnhancementLayers(struct vpx_fixed_buf *const inputs,
-                             const int num_super_frames,
-                             const int remained_spatial_layers) {
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(num_super_frames, 0);
-    ASSERT_GT(remained_spatial_layers, 0);
-
-    for (int i = 0; i < num_super_frames; ++i) {
-      uint32_t frame_sizes[8] = { 0 };
-      int frame_count = 0;
-      int frames_found = 0;
-      int frame;
-      ASSERT_TRUE(inputs[i].buf != NULL);
-      ASSERT_GT(inputs[i].sz, 0U);
-
-      vpx_codec_err_t res = vp9_parse_superframe_index(
-          static_cast<const uint8_t *>(inputs[i].buf), inputs[i].sz,
-          frame_sizes, &frame_count, NULL, NULL);
-      ASSERT_EQ(VPX_CODEC_OK, res);
-
-      if (frame_count == 0) {
-        // There's no super frame but only a single frame.
-        ASSERT_EQ(1, remained_spatial_layers);
-      } else {
-        // Found a super frame.
-        uint8_t *frame_data = static_cast<uint8_t *>(inputs[i].buf);
-        uint8_t *frame_start = frame_data;
-        for (frame = 0; frame < frame_count; ++frame) {
-          // Looking for a visible frame.
-          if (frame_data[0] & 0x02) {
-            ++frames_found;
-            if (frames_found == remained_spatial_layers) break;
-          }
-          frame_data += frame_sizes[frame];
-        }
-        ASSERT_LT(frame, frame_count)
-            << "Couldn't find a visible frame. "
-            << "remained_spatial_layers: " << remained_spatial_layers
-            << "    super_frame: " << i;
-        if (frame == frame_count - 1) continue;
-
-        frame_data += frame_sizes[frame];
-
-        // We need to add one more frame for multiple frame contexts.
-        uint8_t marker =
-            static_cast<const uint8_t *>(inputs[i].buf)[inputs[i].sz - 1];
-        const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-        const size_t index_sz = 2 + mag * frame_count;
-        const size_t new_index_sz = 2 + mag * (frame + 1);
-        marker &= 0x0f8;
-        marker |= frame;
-
-        // Copy existing frame sizes.
-        memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1,
-                new_index_sz - 2);
-        // New marker.
-        frame_data[0] = marker;
-        frame_data += (mag * (frame + 1) + 1);
-
-        *frame_data++ = marker;
-        inputs[i].sz = frame_data - frame_start;
-      }
+void OnePassCbrSvc::AssignLayerBitrates() {
+  int sl, spatial_layer_target;
+  int spatial_layers = cfg_.ss_number_layers;
+  int temporal_layers = cfg_.ts_number_layers;
+  float total = 0;
+  float alloc_ratio[VPX_MAX_LAYERS] = { 0 };
+  float framerate = 30.0;
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    if (svc_params_.scaling_factor_den[sl] > 0) {
+      alloc_ratio[sl] =
+          static_cast<float>((svc_params_.scaling_factor_num[sl] * 1.0 /
+                              svc_params_.scaling_factor_den[sl]));
+      total += alloc_ratio[sl];
     }
   }
-
-  void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
-    ASSERT_TRUE(inputs != NULL);
-    ASSERT_GT(n, 0);
-
-    for (int i = 0; i < n; ++i) {
-      free(inputs[i].buf);
-      inputs[i].buf = NULL;
-      inputs[i].sz = 0;
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    cfg_.ss_target_bitrate[sl] = spatial_layer_target =
+        static_cast<unsigned int>(cfg_.rc_target_bitrate * alloc_ratio[sl] /
+                                  total);
+    const int index = sl * temporal_layers;
+    if (cfg_.temporal_layering_mode == 3) {
+      cfg_.layer_target_bitrate[index] = spatial_layer_target >> 1;
+      cfg_.layer_target_bitrate[index + 1] =
+          (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+      cfg_.layer_target_bitrate[index + 2] = spatial_layer_target;
+    } else if (cfg_.temporal_layering_mode == 2) {
+      cfg_.layer_target_bitrate[index] = spatial_layer_target * 2 / 3;
+      cfg_.layer_target_bitrate[index + 1] = spatial_layer_target;
+    } else if (cfg_.temporal_layering_mode <= 1) {
+      cfg_.layer_target_bitrate[index] = spatial_layer_target;
+    }
+  }
+  for (sl = 0; sl < spatial_layers; ++sl) {
+    for (int tl = 0; tl < temporal_layers; ++tl) {
+      const int layer = sl * temporal_layers + tl;
+      float layer_framerate = framerate;
+      if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2;
+      if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4;
+      if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2;
+      layer_target_avg_bandwidth_[layer] = static_cast<int>(
+          cfg_.layer_target_bitrate[layer] * 1000.0 / layer_framerate);
+      bits_in_buffer_model_[layer] =
+          cfg_.layer_target_bitrate[layer] * cfg_.rc_buf_initial_sz;
     }
   }
-
-  SvcContext svc_;
-  vpx_codec_ctx_t codec_;
-  struct vpx_codec_enc_cfg codec_enc_;
-  vpx_codec_iface_t *codec_iface_;
-  std::string test_file_name_;
-  bool codec_initialized_;
-  Decoder *decoder_;
-  int tile_columns_;
-  int tile_rows_;
-};
-
-TEST_F(SvcTest, SvcInit) {
-  // test missing parameters
-  vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-  res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-  res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 6;  // too many layers
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 0;  // use default layers
-  InitializeEncoder();
-  EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
 }
-
-TEST_F(SvcTest, InitTwoLayers) {
-  svc_.spatial_layers = 2;
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, InvalidOptions) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "not-an-option=1");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-}
-
-TEST_F(SvcTest, SetLayersOption) {
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  InitializeEncoder();
-  EXPECT_EQ(3, svc_.spatial_layers);
-}
-
-TEST_F(SvcTest, SetMultipleOptions) {
-  vpx_codec_err_t res =
-      vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  InitializeEncoder();
-  EXPECT_EQ(2, svc_.spatial_layers);
-}
-
-TEST_F(SvcTest, SetScaleFactorsOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res =
-      vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, SetQuantizersOption) {
-  svc_.spatial_layers = 2;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "min-quantizers=nothing");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "max-quantizers=40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "min-quantizers=40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30");
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, SetAutoAltRefOption) {
-  svc_.spatial_layers = 5;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  InitializeEncoder();
-}
-
-// Test that decoder can handle an SVC frame as the first frame in a sequence.
-TEST_F(SvcTest, OnePassEncodeOneFrame) {
-  codec_enc_.g_pass = VPX_RC_ONE_PASS;
-  vpx_fixed_buf output = vpx_fixed_buf();
-  Pass2EncodeNFrames(NULL, 1, 2, &output);
-  DecodeNFrames(&output, 1);
-  FreeBitstreamBuffers(&output, 1);
-}
-
-TEST_F(SvcTest, OnePassEncodeThreeFrames) {
-  codec_enc_.g_pass = VPX_RC_ONE_PASS;
-  codec_enc_.g_lag_in_frames = 0;
-  vpx_fixed_buf outputs[3];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 3);
-  FreeBitstreamBuffers(&outputs[0], 3);
-}
-
-TEST_F(SvcTest, TwoPassEncode10Frames) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(20, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  vpx_fixed_buf outputs[20];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 20);
-  FreeBitstreamBuffers(&outputs[0], 20);
-}
-
-TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 5, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
-
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 4);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 3);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 2);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
-  Pass1EncodeNFrames(20, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1");
-  vpx_fixed_buf outputs[20];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 20);
-  FreeBitstreamBuffers(&outputs[0], 20);
-}
-
-TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
-  Pass1EncodeNFrames(20, 3, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
-  vpx_fixed_buf outputs[20];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
-  DecodeNFrames(&outputs[0], 20);
-  DropEnhancementLayers(&outputs[0], 20, 2);
-  DecodeNFrames(&outputs[0], 20);
-  DropEnhancementLayers(&outputs[0], 20, 1);
-  DecodeNFrames(&outputs[0], 20);
-
-  FreeBitstreamBuffers(&outputs[0], 20);
-}
-
-TEST_F(SvcTest, SetMultipleFrameContextsOption) {
-  svc_.spatial_layers = 5;
-  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  svc_.spatial_layers = 2;
-  res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
-  InitializeEncoder();
-}
-
-TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest,
-       TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
-  // First pass encode
-  std::string stats_buf;
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
-  Pass1EncodeNFrames(10, 2, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest,
-       TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
-  Pass1EncodeNFrames(10, 3, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
-
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 2);
-  DecodeNFrames(&outputs[0], 10);
-  DropEnhancementLayers(&outputs[0], 10, 1);
-  DecodeNFrames(&outputs[0], 10);
-
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1 scale-factors=1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-
-  vpx_fixed_buf base_layer[5];
-  for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2];
-
-  DecodeNFrames(&base_layer[0], 5);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest,
-       TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  codec_enc_.g_error_resilient = 0;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1 scale-factors=1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-
-  vpx_fixed_buf base_layer[5];
-  for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2];
-
-  DecodeNFrames(&base_layer[0], 5);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
-  codec_enc_.g_w = 704;
-  codec_enc_.g_h = 144;
-  tile_columns_ = 1;
-  tile_rows_ = 1;
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) {
-  // First pass encode
-  std::string stats_buf;
-  vpx_svc_set_options(&svc_, "scale-factors=1/1");
-  svc_.temporal_layers = 2;
-  Pass1EncodeNFrames(10, 1, &stats_buf);
-
-  // Second pass encode
-  codec_enc_.g_pass = VPX_RC_LAST_PASS;
-  svc_.temporal_layers = 2;
-  codec_enc_.g_error_resilient = 0;
-  codec_enc_.g_w = 704;
-  codec_enc_.g_h = 144;
-  tile_columns_ = 1;
-  tile_rows_ = 1;
-  vpx_svc_set_options(&svc_,
-                      "auto-alt-refs=1 scale-factors=1/1 "
-                      "multi-frame-contexts=1");
-  vpx_fixed_buf outputs[10];
-  memset(&outputs[0], 0, sizeof(outputs));
-  Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
-  DecodeNFrames(&outputs[0], 10);
-  FreeBitstreamBuffers(&outputs[0], 10);
-}
-
-}  // namespace
+}  // namespace svc_test
diff --git a/media/libvpx/libvpx/test/svc_test.h b/media/libvpx/libvpx/test/svc_test.h
new file mode 100644
index 0000000000..de39412f6d
--- /dev/null
+++ b/media/libvpx/libvpx/test/svc_test.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_TEST_SVC_TEST_H_
+#define VPX_TEST_SVC_TEST_H_
+
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace svc_test {
+class OnePassCbrSvc : public ::libvpx_test::EncoderTest {
+ public:
+  explicit OnePassCbrSvc(const ::libvpx_test::CodecFactory *codec)
+      : EncoderTest(codec), base_speed_setting_(0), speed_setting_(0),
+        superframe_count_(0), temporal_layer_id_(0), number_temporal_layers_(0),
+        number_spatial_layers_(0) {
+    memset(&svc_params_, 0, sizeof(svc_params_));
+    memset(bits_in_buffer_model_, 0,
+           sizeof(bits_in_buffer_model_[0]) * VPX_MAX_LAYERS);
+    memset(layer_target_avg_bandwidth_, 0,
+           sizeof(layer_target_avg_bandwidth_[0]) * VPX_MAX_LAYERS);
+  }
+
+ protected:
+  ~OnePassCbrSvc() override {}
+
+  virtual void SetConfig(const int num_temporal_layer) = 0;
+
+  virtual void SetSvcConfig(const int num_spatial_layer,
+                            const int num_temporal_layer);
+
+  virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video,
+                                       ::libvpx_test::Encoder *encoder);
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override;
+
+  virtual void AssignLayerBitrates();
+
+  void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {}
+
+  vpx_svc_extra_cfg_t svc_params_;
+  int64_t bits_in_buffer_model_[VPX_MAX_LAYERS];
+  int layer_target_avg_bandwidth_[VPX_MAX_LAYERS];
+  int base_speed_setting_;
+  int speed_setting_;
+  int superframe_count_;
+  int temporal_layer_id_;
+  int number_temporal_layers_;
+  int number_spatial_layers_;
+};
+}  // namespace svc_test
+
+#endif  // VPX_TEST_SVC_TEST_H_
diff --git a/media/libvpx/libvpx/test/test-data.mk b/media/libvpx/libvpx/test/test-data.mk
index a6cda1e9b0..ffb9c0801e 100644
--- a/media/libvpx/libvpx/test/test-data.mk
+++ b/media/libvpx/libvpx/test/test-data.mk
@@ -2,15 +2,19 @@ LIBVPX_TEST_SRCS-yes += test-data.mk
 
 # Encoder test source
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288_nv12.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv
 
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
@@ -18,11 +22,19 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
 
+LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += repro-oss-fuzz-69906.y4m
+
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += 4x2.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1284_770_30.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1857_167_30.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += crowd_run_360p_10_150f.y4m
 
 # Test vectors
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
@@ -731,8 +743,16 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
 # Invalid files for testing libvpx error checking.
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-148271109.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-148271109.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
@@ -771,6 +791,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s367
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
@@ -778,8 +800,13 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.web
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # Encode / Decode test
@@ -814,7 +841,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
-LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv
@@ -874,3 +900,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5
diff --git a/media/libvpx/libvpx/test/test-data.sha1 b/media/libvpx/libvpx/test/test-data.sha1
index 22ca6f5643..1e8c090e0c 100644
--- a/media/libvpx/libvpx/test/test-data.sha1
+++ b/media/libvpx/libvpx/test/test-data.sha1
@@ -1,3 +1,4 @@
+3eaf216d9fc8b4b9bb8c3956311f49a85974806c *bus_352x288_420_f20_b8.yuv
 d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
 76024eb753cdac6a5e5703aaea189d35c3c30ac7 *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
@@ -6,6 +7,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
 c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res
 fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res
 d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm
@@ -15,13 +18,13 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm
 d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
 9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m
-a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
-0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
-ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
+0936b837708ae68c034719f8e07596021c2c214f *park_joy_90p_10_420_20f.y4m
+5727a853c083c1099f837d27967bc1322d50ed4f *park_joy_90p_10_422_20f.y4m
+e13489470ef8e8b2a871a5640d795a42a39be58d *park_joy_90p_10_444_20f.y4m
 c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv
-614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m
-c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m
-b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m
+79b0dc1784635a7f291e21c4e8d66a29c496ab99 *park_joy_90p_12_420_20f.y4m
+9cf22b0f809f7464c8b9058f0cfa9d905921cbd1 *park_joy_90p_12_422_20f.y4m
+22b2a4abaecc4a9ade6bb503d25fb82367947e85 *park_joy_90p_12_444_20f.y4m
 82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv
 b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m
 4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m
@@ -848,3 +851,27 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
 6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm
 e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res
+fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf
+fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res
+1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf
+90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res
+17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm
+e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5
+a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf
+a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res
+894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile
+f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf
+eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res
+c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf
+f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res
+bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv
+094be4b80fa30bd227149ea16ab6476d549ea092 *slides_code_term_web_plot.1920_1080.yuv
+518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res
+ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv
+8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv
+ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m
+f9a73e921552598a5804911e9f84fec2318e056a *repro-oss-fuzz-69906.y4m
+320874b648f54e9156339e9d7e322ec4c51cb5f7 *niklas_1284_770_30.y4m
+64e876e725f83b93e92a8240457c152dd6f5b77d *niklas_1857_167_30.y4m
+27a7f0ebda1842a27c4b973b0e9e393769f74012 *4x2.y4m
diff --git a/media/libvpx/libvpx/test/test.mk b/media/libvpx/libvpx/test/test.mk
index e25463e46a..7c5cbf0720 100644
--- a/media/libvpx/libvpx/test/test.mk
+++ b/media/libvpx/libvpx/test/test.mk
@@ -1,9 +1,14 @@
 LIBVPX_TEST_SRCS-yes += acm_random.h
+LIBVPX_TEST_SRCS-yes += bench.h
+LIBVPX_TEST_SRCS-yes += bench.cc
+LIBVPX_TEST_SRCS-yes += buffer.h
 LIBVPX_TEST_SRCS-yes += clear_system_state.h
 LIBVPX_TEST_SRCS-yes += codec_factory.h
 LIBVPX_TEST_SRCS-yes += md5_helper.h
 LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
+LIBVPX_TEST_SRCS-yes += init_vpx_test.cc
+LIBVPX_TEST_SRCS-yes += init_vpx_test.h
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
 LIBVPX_TEST_SRCS-yes += test_vectors.cc
 LIBVPX_TEST_SRCS-yes += test_vectors.h
@@ -16,12 +21,12 @@ LIBVPX_TEST_SRCS-yes += video_source.h
 ## Black box tests only use the public API.
 ##
 LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-yes                   += vpx_image_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += altref_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += alt_ref_aq_segment_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
@@ -30,24 +35,41 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
 
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc
+endif
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += decode_corrupted.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc
+endif
 
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
@@ -66,12 +88,14 @@ LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc
 LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc
 LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h
 LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h
+LIBWEBM_PARSER_SRCS += ../third_party/libwebm/common/webmids.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(LIBWEBM_PARSER_SRCS)
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../tools_common.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += decode_api_test.cc
@@ -110,11 +134,13 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif
-
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += add_noise_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_SSSE3) $(HAVE_SSE4_1) $(HAVE_NEON) \
+                       $(HAVE_MSA) $(HAVE_MMI)))
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
@@ -122,6 +148,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += predict_test.cc
 LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.cc
+LIBVPX_TEST_SRCS-yes                   += vpx_scale_test.h
 
 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes)
 LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc
@@ -141,6 +168,7 @@ LIBVPX_TEST_SRCS-yes                   += superframe_test.cc
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_roi_test.cc
 endif
 
 LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
@@ -149,25 +177,36 @@ LIBVPX_TEST_SRCS-yes                   += vp9_intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
+ifneq ($(CONFIG_REALTIME_ONLY),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON)))
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
 endif
 
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_NON_GREEDY_MV) += non_greedy_mv_test.cc
+endif
+
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
-LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
+LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
 
@@ -176,10 +215,23 @@ endif # VP9
 ## Multi-codec / unconditional whitebox tests.
 
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+ifneq (, $(filter yes, $(HAVE_NEON) $(HAVE_SSE2) $(HAVE_MSA)))
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc
+endif
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
+TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h
+
+RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes += codec_factory.h
 
 endif # CONFIG_SHARED
 
diff --git a/media/libvpx/libvpx/test/test_intra_pred_speed.cc b/media/libvpx/libvpx/test/test_intra_pred_speed.cc
index 17dde1a526..87ab732306 100644
--- a/media/libvpx/libvpx/test/test_intra_pred_speed.cc
+++ b/media/libvpx/libvpx/test/test_intra_pred_speed.cc
@@ -12,11 +12,13 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
+#include "test/init_vpx_test.h"
 #include "test/md5_helper.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -26,8 +28,8 @@
 
 namespace {
 
-typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
-                            const uint8_t *above, const uint8_t *left);
+using VpxPredFunc = void (*)(uint8_t *dst, ptrdiff_t y_stride,
+                             const uint8_t *above, const uint8_t *left);
 
 const int kBPS = 32;
 const int kTotalPixels = 32 * kBPS;
@@ -48,11 +50,9 @@ struct IntraPredTestMem {
     for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
     for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
 
-    // some code assumes the top row has been extended:
-    // d45/d63 C-code, for instance, but not the assembly.
-    // TODO(jzern): this style of extension isn't strictly necessary.
+    // d45/d63 require the top row to be extended.
     ASSERT_LE(block_size, kBPS);
-    for (int i = block_size; i < 2 * kBPS; ++i) {
+    for (int i = block_size; i < 2 * block_size; ++i) {
       above[i] = above[block_size - 1];
     }
   }
@@ -63,7 +63,7 @@ struct IntraPredTestMem {
   DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
 };
 
-typedef IntraPredTestMem<uint8_t> Vp9IntraPredTestMem;
+using Vp9IntraPredTestMem = IntraPredTestMem<uint8_t>;
 
 void CheckMd5Signature(const char name[], const char *const signatures[],
                        const void *data, size_t data_size, int elapsed_time,
@@ -85,7 +85,7 @@ void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
   intra_pred_test_mem.Init(block_size, 8);
 
   for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
-    if (pred_funcs[k] == NULL) continue;
+    if (pred_funcs[k] == nullptr) continue;
     memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
            sizeof(intra_pred_test_mem.src));
     vpx_usec_timer timer;
@@ -206,58 +206,64 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                 vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                 vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
-                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, nullptr,
+                nullptr, nullptr, vpx_d207_predictor_4x4_sse2, nullptr,
                 vpx_tm_predictor_4x4_sse2)
 
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                 vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                 vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_8x8_sse2)
 
 INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
-                vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_16x16_sse2)
+                vpx_h_predictor_16x16_sse2, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_16x16_sse2)
 
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
+                vpx_h_predictor_32x32_sse2, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
-                vpx_d63_predictor_4x4_ssse3, NULL)
-INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_8x8_ssse3,
-                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
-INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_d45_predictor_16x16_ssse3, NULL, NULL,
-                vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3,
-                vpx_d63_predictor_16x16_ssse3, NULL)
-INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_d45_predictor_32x32_ssse3, NULL, NULL,
-                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
-                vpx_d63_predictor_32x32_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_d153_predictor_4x4_ssse3, nullptr,
+                vpx_d63_predictor_4x4_ssse3, nullptr)
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
+                vpx_d63_predictor_8x8_ssse3, nullptr)
+INTRA_PRED_TEST(SSSE3, TestIntraPred16, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_d45_predictor_16x16_ssse3, nullptr,
+                nullptr, vpx_d153_predictor_16x16_ssse3,
+                vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
+                nullptr)
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_d45_predictor_32x32_ssse3, nullptr,
+                nullptr, vpx_d153_predictor_32x32_ssse3,
+                vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
+                nullptr)
 #endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_8x8_c)
-INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
-                NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, nullptr,
+                nullptr, nullptr, nullptr, vpx_h_predictor_4x4_dspr2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_tm_predictor_4x4_dspr2)
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, nullptr,
+                nullptr, nullptr, nullptr, vpx_h_predictor_8x8_dspr2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr,
+                vpx_tm_predictor_8x8_c)
+INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, nullptr,
+                nullptr, nullptr, nullptr, vpx_h_predictor_16x16_dspr2, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr)
 #endif  // HAVE_DSPR2
 
 #if HAVE_NEON
@@ -265,63 +271,104 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
                 vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
                 vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
                 vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
-                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_neon)
+                vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon,
+                vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon,
+                vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
                 vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
                 vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
                 vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon,
-                vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_8x8_neon)
+                vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon,
+                vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon,
+                vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
                 vpx_dc_top_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon,
-                vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_16x16_neon)
+                vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon,
+                vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon,
+                vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
                 vpx_dc_left_predictor_32x32_neon,
                 vpx_dc_top_predictor_32x32_neon,
                 vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
                 vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon,
-                vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_neon)
+                vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon,
+                vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon,
+                vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
 INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
                 vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
                 vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
-                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_msa)
+                vpx_h_predictor_4x4_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_4x4_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
                 vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
                 vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
-                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_8x8_msa)
+                vpx_h_predictor_8x8_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_8x8_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
                 vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa,
                 vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
-                vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_16x16_msa)
+                vpx_h_predictor_16x16_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_16x16_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
                 vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
                 vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
-                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_msa)
+                vpx_h_predictor_32x32_msa, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, vpx_tm_predictor_32x32_msa)
 #endif  // HAVE_MSA
 
+#if HAVE_VSX
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+INTRA_PRED_TEST(VSX, TestIntraPred4, nullptr, nullptr, nullptr, nullptr,
+                nullptr, vpx_h_predictor_4x4_vsx, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, vpx_tm_predictor_4x4_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, nullptr, nullptr,
+                nullptr, nullptr, vpx_h_predictor_8x8_vsx,
+                vpx_d45_predictor_8x8_vsx, nullptr, nullptr, nullptr, nullptr,
+                vpx_d63_predictor_8x8_vsx, vpx_tm_predictor_8x8_vsx)
+#endif
+
+INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx,
+                vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx,
+                vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx,
+                vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, nullptr,
+                nullptr, nullptr, nullptr, vpx_d63_predictor_16x16_vsx,
+                vpx_tm_predictor_16x16_vsx)
+
+INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx,
+                vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx,
+                vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx,
+                vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, nullptr,
+                nullptr, nullptr, nullptr, vpx_d63_predictor_32x32_vsx,
+                vpx_tm_predictor_32x32_vsx)
+#endif  // HAVE_VSX
+
+#if HAVE_LSX
+INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr)
+INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr,
+                nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+                nullptr, nullptr, nullptr, nullptr)
+#endif  // HAVE_LSX
+
 // -----------------------------------------------------------------------------
 
 #if CONFIG_VP9_HIGHBITDEPTH
 namespace {
 
-typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
-                                  const uint16_t *above, const uint16_t *left,
-                                  int bd);
+using VpxHighbdPredFunc = void (*)(uint16_t *dst, ptrdiff_t y_stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
 
-typedef IntraPredTestMem<uint16_t> Vp9HighbdIntraPredTestMem;
+using Vp9HighbdIntraPredTestMem = IntraPredTestMem<uint16_t>;
 
 void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs,
                          const char *const signatures[], int block_size) {
@@ -333,7 +380,7 @@ void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs,
   intra_pred_test_mem.Init(block_size, 12);
 
   for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
-    if (pred_funcs[k] == NULL) continue;
+    if (pred_funcs[k] == nullptr) continue;
     memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
            sizeof(intra_pred_test_mem.src));
     vpx_usec_timer timer;
@@ -455,66 +502,115 @@ HIGHBD_INTRA_PRED_TEST(
     vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c)
 
 #if HAVE_SSE2
-HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
-                       vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
+HIGHBD_INTRA_PRED_TEST(
+    SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2,
+    vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2,
+    vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2,
+    vpx_highbd_h_predictor_4x4_sse2, nullptr,
+    vpx_highbd_d135_predictor_4x4_sse2, vpx_highbd_d117_predictor_4x4_sse2,
+    vpx_highbd_d153_predictor_4x4_sse2, vpx_highbd_d207_predictor_4x4_sse2,
+    vpx_highbd_d63_predictor_4x4_sse2, vpx_highbd_tm_predictor_4x4_c)
 
-HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
-                       vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
-                       NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+HIGHBD_INTRA_PRED_TEST(
+    SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2,
+    vpx_highbd_dc_left_predictor_8x8_sse2, vpx_highbd_dc_top_predictor_8x8_sse2,
+    vpx_highbd_dc_128_predictor_8x8_sse2, vpx_highbd_v_predictor_8x8_sse2,
+    vpx_highbd_h_predictor_8x8_sse2, nullptr, nullptr, nullptr, nullptr,
+    nullptr, nullptr, vpx_highbd_tm_predictor_8x8_sse2)
 
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
-                       vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL,
+                       vpx_highbd_dc_predictor_16x16_sse2,
+                       vpx_highbd_dc_left_predictor_16x16_sse2,
+                       vpx_highbd_dc_top_predictor_16x16_sse2,
+                       vpx_highbd_dc_128_predictor_16x16_sse2,
+                       vpx_highbd_v_predictor_16x16_sse2,
+                       vpx_highbd_h_predictor_16x16_sse2, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr,
                        vpx_highbd_tm_predictor_16x16_sse2)
 
 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
-                       vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL,
-                       vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL,
-                       NULL, NULL, NULL, NULL,
+                       vpx_highbd_dc_predictor_32x32_sse2,
+                       vpx_highbd_dc_left_predictor_32x32_sse2,
+                       vpx_highbd_dc_top_predictor_32x32_sse2,
+                       vpx_highbd_dc_128_predictor_32x32_sse2,
+                       vpx_highbd_v_predictor_32x32_sse2,
+                       vpx_highbd_h_predictor_32x32_sse2, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr,
                        vpx_highbd_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2
 
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_4x4_ssse3, nullptr, nullptr,
+                       nullptr, nullptr, nullptr, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_8x8_ssse3,
+                       vpx_highbd_d135_predictor_8x8_ssse3,
+                       vpx_highbd_d117_predictor_8x8_ssse3,
+                       vpx_highbd_d153_predictor_8x8_ssse3,
+                       vpx_highbd_d207_predictor_8x8_ssse3,
+                       vpx_highbd_d63_predictor_8x8_ssse3, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_16x16_ssse3,
+                       vpx_highbd_d135_predictor_16x16_ssse3,
+                       vpx_highbd_d117_predictor_16x16_ssse3,
+                       vpx_highbd_d153_predictor_16x16_ssse3,
+                       vpx_highbd_d207_predictor_16x16_ssse3,
+                       vpx_highbd_d63_predictor_16x16_ssse3, nullptr)
+HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, nullptr, nullptr, nullptr,
+                       nullptr, nullptr, nullptr,
+                       vpx_highbd_d45_predictor_32x32_ssse3,
+                       vpx_highbd_d135_predictor_32x32_ssse3,
+                       vpx_highbd_d117_predictor_32x32_ssse3,
+                       vpx_highbd_d153_predictor_32x32_ssse3,
+                       vpx_highbd_d207_predictor_32x32_ssse3,
+                       vpx_highbd_d63_predictor_32x32_ssse3, nullptr)
+#endif  // HAVE_SSSE3
+
 #if HAVE_NEON
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon,
     vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
     vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
     vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
-    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
-    vpx_highbd_tm_predictor_4x4_neon)
+    vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon,
+    vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon,
+    vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon)
 HIGHBD_INTRA_PRED_TEST(
     NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
     vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
     vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
     vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
-    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
-    vpx_highbd_tm_predictor_8x8_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
-                       vpx_highbd_dc_predictor_16x16_neon,
-                       vpx_highbd_dc_left_predictor_16x16_neon,
-                       vpx_highbd_dc_top_predictor_16x16_neon,
-                       vpx_highbd_dc_128_predictor_16x16_neon,
-                       vpx_highbd_v_predictor_16x16_neon,
-                       vpx_highbd_h_predictor_16x16_neon,
-                       vpx_highbd_d45_predictor_16x16_neon,
-                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
-                       NULL, vpx_highbd_tm_predictor_16x16_neon)
-HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
-                       vpx_highbd_dc_predictor_32x32_neon,
-                       vpx_highbd_dc_left_predictor_32x32_neon,
-                       vpx_highbd_dc_top_predictor_32x32_neon,
-                       vpx_highbd_dc_128_predictor_32x32_neon,
-                       vpx_highbd_v_predictor_32x32_neon,
-                       vpx_highbd_h_predictor_32x32_neon,
-                       vpx_highbd_d45_predictor_32x32_neon,
-                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
-                       NULL, vpx_highbd_tm_predictor_32x32_neon)
+    vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon,
+    vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon,
+    vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon,
+    vpx_highbd_dc_left_predictor_16x16_neon,
+    vpx_highbd_dc_top_predictor_16x16_neon,
+    vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon,
+    vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon,
+    vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon,
+    vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon,
+    vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon,
+    vpx_highbd_dc_left_predictor_32x32_neon,
+    vpx_highbd_dc_top_predictor_32x32_neon,
+    vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon,
+    vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon,
+    vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon,
+    vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon,
+    vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#include "test/test_libvpx.cc"
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ::libvpx_test::init_vpx_test();
+  return RUN_ALL_TESTS();
+}
diff --git a/media/libvpx/libvpx/test/test_libvpx.cc b/media/libvpx/libvpx/test/test_libvpx.cc
index 8a70b4e28c..ba27102385 100644
--- a/media/libvpx/libvpx/test/test_libvpx.cc
+++ b/media/libvpx/libvpx/test/test_libvpx.cc
@@ -7,67 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include <string>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#if ARCH_X86 || ARCH_X86_64
-#include "vpx_ports/x86.h"
-#endif
-extern "C" {
-#if CONFIG_VP8
-extern void vp8_rtcd();
-#endif  // CONFIG_VP8
-#if CONFIG_VP9
-extern void vp9_rtcd();
-#endif  // CONFIG_VP9
-extern void vpx_dsp_rtcd();
-extern void vpx_scale_rtcd();
-}
-
-#if ARCH_X86 || ARCH_X86_64
-static void append_negative_gtest_filter(const char *str) {
-  std::string filter = ::testing::FLAGS_gtest_filter;
-  // Negative patterns begin with one '-' followed by a ':' separated list.
-  if (filter.find('-') == std::string::npos) filter += '-';
-  filter += str;
-  ::testing::FLAGS_gtest_filter = filter;
-}
-#endif  // ARCH_X86 || ARCH_X86_64
+#include "gtest/gtest.h"
+#include "test/init_vpx_test.h"
 
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
-
-#if ARCH_X86 || ARCH_X86_64
-  const int simd_caps = x86_simd_caps();
-  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*");
-  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*");
-  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*");
-  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*");
-  if (!(simd_caps & HAS_SSSE3)) {
-    append_negative_gtest_filter(":SSSE3.*:SSSE3/*");
-  }
-  if (!(simd_caps & HAS_SSE4_1)) {
-    append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*");
-  }
-  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*");
-  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*");
-#endif  // ARCH_X86 || ARCH_X86_64
-
-#if !CONFIG_SHARED
-// Shared library builds don't support whitebox tests
-// that exercise internal symbols.
-
-#if CONFIG_VP8
-  vp8_rtcd();
-#endif  // CONFIG_VP8
-#if CONFIG_VP9
-  vp9_rtcd();
-#endif  // CONFIG_VP9
-  vpx_dsp_rtcd();
-  vpx_scale_rtcd();
-#endif  // !CONFIG_SHARED
-
+  ::libvpx_test::init_vpx_test();
   return RUN_ALL_TESTS();
 }
diff --git a/media/libvpx/libvpx/test/test_rc_interface.cc b/media/libvpx/libvpx/test/test_rc_interface.cc
new file mode 100644
index 0000000000..840a299f82
--- /dev/null
+++ b/media/libvpx/libvpx/test/test_rc_interface.cc
@@ -0,0 +1,16 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/media/libvpx/libvpx/test/test_vector_test.cc b/media/libvpx/libvpx/test/test_vector_test.cc
index 2dd33f73bc..2c0de86cba 100644
--- a/media/libvpx/libvpx/test/test_vector_test.cc
+++ b/media/libvpx/libvpx/test/test_vector_test.cc
@@ -10,9 +10,12 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <memory>
 #include <set>
 #include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <tuple>
+
+#include "gtest/gtest.h"
 #include "../tools_common.h"
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
@@ -28,18 +31,16 @@
 
 namespace {
 
-enum DecodeMode { kSerialMode, kFrameParallelMode };
-
-const int kDecodeMode = 0;
-const int kThreads = 1;
+const int kThreads = 0;
+const int kMtMode = 1;
 const int kFileName = 2;
 
-typedef std::tr1::tuple<int, int, const char *> DecodeParam;
+using DecodeParam = std::tuple<int, int, const char *>;
 
 class TestVectorTest : public ::libvpx_test::DecoderTest,
                        public ::libvpx_test::CodecTestWithParam<DecodeParam> {
  protected:
-  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {
+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) {
 #if CONFIG_VP9_DECODER
     resize_clips_.insert(::libvpx_test::kVP9TestVectorsResize,
                          ::libvpx_test::kVP9TestVectorsResize +
@@ -47,19 +48,37 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 #endif
   }
 
-  virtual ~TestVectorTest() {
+  ~TestVectorTest() override {
     if (md5_file_) fclose(md5_file_);
   }
 
   void OpenMD5File(const std::string &md5_file_name_) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
-    ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
-                                   << md5_file_name_;
+    ASSERT_NE(md5_file_, nullptr)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     const unsigned int frame_number) {
-    ASSERT_TRUE(md5_file_ != NULL);
+#if CONFIG_VP9_DECODER
+  void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video,
+                          libvpx_test::Decoder *decoder) override {
+    if (video.frame_number() == 0 && mt_mode_ >= 0) {
+      if (mt_mode_ == 1) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      } else if (mt_mode_ == 2) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 1);
+      } else {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      }
+    }
+  }
+#endif
+
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             const unsigned int frame_number) override {
+    ASSERT_NE(md5_file_, nullptr);
     char expected_md5[33];
     char junk[128];
 
@@ -80,6 +99,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 #if CONFIG_VP9_DECODER
   std::set<std::string> resize_clips_;
 #endif
+  int mt_mode_;
 
  private:
   FILE *md5_file_;
@@ -91,34 +111,20 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
   const DecodeParam input = GET_PARAM(1);
-  const std::string filename = std::tr1::get<kFileName>(input);
-  const int threads = std::tr1::get<kThreads>(input);
-  const int mode = std::tr1::get<kDecodeMode>(input);
+  const std::string filename = std::get<kFileName>(input);
   vpx_codec_flags_t flags = 0;
   vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   char str[256];
 
-  if (mode == kFrameParallelMode) {
-    flags |= VPX_CODEC_USE_FRAME_THREADING;
-#if CONFIG_VP9_DECODER
-    // TODO(hkuang): Fix frame parallel decode bug. See issue 1086.
-    if (resize_clips_.find(filename) != resize_clips_.end()) {
-      printf("Skipping the test file: %s, due to frame parallel decode bug.\n",
-             filename.c_str());
-      return;
-    }
-#endif
-  }
-
-  cfg.threads = threads;
-
+  cfg.threads = std::get<kThreads>(input);
+  mt_mode_ = std::get<kMtMode>(input);
   snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
-           "file: %s  mode: %s threads: %d", filename.c_str(),
-           mode == 0 ? "Serial" : "Parallel", threads);
+           "file: %s threads: %d MT mode: %d", filename.c_str(), cfg.threads,
+           mt_mode_);
   SCOPED_TRACE(str);
 
   // Open compressed video file.
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   if (filename.substr(filename.length() - 3, 3) == "ivf") {
     video.reset(new libvpx_test::IVFVideoSource(filename));
   } else if (filename.substr(filename.length() - 4, 4) == "webm") {
@@ -130,7 +136,7 @@ TEST_P(TestVectorTest, MD5Match) {
     return;
 #endif
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
   video->Init();
 
   // Construct md5 file name.
@@ -145,53 +151,52 @@ TEST_P(TestVectorTest, MD5Match) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
 }
 
-// Test VP8 decode in serial mode with single thread.
-// NOTE: VP8 only support serial mode.
 #if CONFIG_VP8_DECODER
-VP8_INSTANTIATE_TEST_CASE(
+VP8_INSTANTIATE_TEST_SUITE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(0),  // Serial Mode.
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT is not applicable
         ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                             libvpx_test::kVP8TestVectors +
                                 libvpx_test::kNumVP8TestVectors)));
 
 // Test VP8 decode in with different numbers of threads.
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     VP8MultiThreaded, TestVectorTest,
     ::testing::Combine(
         ::testing::Values(
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
         ::testing::Combine(
-            ::testing::Values(0),    // Serial Mode.
-            ::testing::Range(1, 8),  // With 1 ~ 8 threads.
+            ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::Values(-1),   // LPF opt and Row MT is not applicable
             ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                                 libvpx_test::kVP8TestVectors +
                                     libvpx_test::kNumVP8TestVectors))));
 
 #endif  // CONFIG_VP8_DECODER
 
-// Test VP9 decode in serial mode with single thread.
 #if CONFIG_VP9_DECODER
-VP9_INSTANTIATE_TEST_CASE(
+VP9_INSTANTIATE_TEST_SUITE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(0),  // Serial Mode.
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT is not applicable
         ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                             libvpx_test::kVP9TestVectors +
                                 libvpx_test::kNumVP9TestVectors)));
 
-// Test VP9 decode in frame parallel mode with different number of threads.
-INSTANTIATE_TEST_CASE_P(
-    VP9MultiThreadedFrameParallel, TestVectorTest,
+INSTANTIATE_TEST_SUITE_P(
+    VP9MultiThreaded, TestVectorTest,
     ::testing::Combine(
         ::testing::Values(
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
         ::testing::Combine(
-            ::testing::Values(1),    // Frame Parallel mode.
             ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::Range(0, 3),  // With multi threads modes 0 ~ 2
+                                     // 0: LPF opt and Row MT disabled
+                                     // 1: LPF opt enabled
+                                     // 2: Row MT enabled
             ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                                 libvpx_test::kVP9TestVectors +
                                     libvpx_test::kNumVP9TestVectors))));
diff --git a/media/libvpx/libvpx/test/test_vectors.cc b/media/libvpx/libvpx/test/test_vectors.cc
index def78da282..954ff771a9 100644
--- a/media/libvpx/libvpx/test/test_vectors.cc
+++ b/media/libvpx/libvpx/test/test_vectors.cc
@@ -9,6 +9,7 @@
  */
 
 #include "test/test_vectors.h"
+#include "vpx_config.h"
 
 namespace libvpx_test {
 
@@ -371,6 +372,7 @@ const char *const kVP9TestVectors[] = {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   "vp90-2-20-big_superframe-01.webm",
   "vp90-2-20-big_superframe-02.webm",
+  "vp90-2-22-svc_1280x720_1.webm",
   RESIZE_TEST_VECTORS
 };
 const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" };
diff --git a/media/libvpx/libvpx/test/test_vectors.h b/media/libvpx/libvpx/test/test_vectors.h
index 3df3e81133..0a4be0f1a2 100644
--- a/media/libvpx/libvpx/test/test_vectors.h
+++ b/media/libvpx/libvpx/test/test_vectors.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_TEST_VECTORS_H_
-#define TEST_TEST_VECTORS_H_
+#ifndef VPX_TEST_TEST_VECTORS_H_
+#define VPX_TEST_TEST_VECTORS_H_
 
 #include "./vpx_config.h"
 
@@ -31,4 +31,4 @@ extern const char *const kVP9TestVectorsResize[];
 
 }  // namespace libvpx_test
 
-#endif  // TEST_TEST_VECTORS_H_
+#endif  // VPX_TEST_TEST_VECTORS_H_
diff --git a/media/libvpx/libvpx/test/tile_independence_test.cc b/media/libvpx/libvpx/test/tile_independence_test.cc
index e24981c68d..6bf203571f 100644
--- a/media/libvpx/libvpx/test/tile_independence_test.cc
+++ b/media/libvpx/libvpx/test/tile_independence_test.cc
@@ -11,12 +11,12 @@
 #include <cstdio>
 #include <cstdlib>
 #include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
-#include "test/util.h"
 #include "test/md5_helper.h"
+#include "test/util.h"
 #include "vpx_mem/vpx_mem.h"
 
 namespace {
@@ -36,19 +36,19 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
     inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
   }
 
-  virtual ~TileIndependenceTest() {
+  ~TileIndependenceTest() override {
     delete fw_dec_;
     delete inv_dec_;
   }
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }
 
-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);
     }
   }
@@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest,
     md5->Add(img);
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
     UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
   }
@@ -100,5 +100,5 @@ TEST_P(TileIndependenceTest, MD5Match) {
   ASSERT_STREQ(md5_fw_str, md5_inv_str);
 }
 
-VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+VP9_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Range(0, 2, 1));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/timestamp_test.cc b/media/libvpx/libvpx/test/timestamp_test.cc
new file mode 100644
index 0000000000..3567824df3
--- /dev/null
+++ b/media/libvpx/libvpx/test/timestamp_test.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vpx_config.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 3;
+
+// A video source that exposes functions to set the timebase, framerate and
+// starting pts.
+class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource {
+ public:
+  // Parameters num and den set the timebase for the video source.
+  DummyTimebaseVideoSource(int num, int den)
+      : timebase_({ num, den }), framerate_numerator_(30),
+        framerate_denominator_(1), starting_pts_(0) {
+    SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    set_limit(kFramesToEncode);
+  }
+
+  void SetFramerate(int numerator, int denominator) {
+    framerate_numerator_ = numerator;
+    framerate_denominator_ = denominator;
+  }
+
+  // Returns one frames duration in timebase units as a double.
+  double FrameDuration() const {
+    return (static_cast<double>(timebase_.den) / timebase_.num) /
+           (static_cast<double>(framerate_numerator_) / framerate_denominator_);
+  }
+
+  vpx_codec_pts_t pts() const override {
+    return static_cast<vpx_codec_pts_t>(frame_ * FrameDuration() +
+                                        starting_pts_ + 0.5);
+  }
+
+  unsigned long duration() const override {
+    return static_cast<unsigned long>(FrameDuration() + 0.5);
+  }
+
+  vpx_rational_t timebase() const override { return timebase_; }
+
+  void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; }
+
+ private:
+  vpx_rational_t timebase_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  int64_t starting_pts_;
+};
+
+class TimestampTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  TimestampTest() : EncoderTest(GET_PARAM(0)) {}
+  ~TimestampTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+};
+
+// Tests encoding in millisecond timebase.
+TEST_P(TimestampTest, EncodeFrames) {
+  DummyTimebaseVideoSource video(1, 1000);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(TimestampTest, TestMicrosecondTimebase) {
+  // Set the timebase to microseconds.
+  DummyTimebaseVideoSource video(1, 1000000);
+  video.set_limit(1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(TimestampTest, TestVpxRollover) {
+  DummyTimebaseVideoSource video(1, 1000);
+  video.set_starting_pts(922337170351ll);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_REALTIME_ONLY
+VP8_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
+#else
+VP8_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libvpx_test::kTwoPassGood));
+VP9_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libvpx_test::kTwoPassGood));
+#endif
+}  // namespace
diff --git a/media/libvpx/libvpx/test/tools_common.sh b/media/libvpx/libvpx/test/tools_common.sh
index 0bdcc08d78..d0dd24df36 100644
--- a/media/libvpx/libvpx/test/tools_common.sh
+++ b/media/libvpx/libvpx/test/tools_common.sh
@@ -133,7 +133,7 @@ vpx_config_option_enabled() {
   vpx_config_option="${1}"
   vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h"
   config_line=$(grep "${vpx_config_option}" "${vpx_config_file}")
-  if echo "${config_line}" | egrep -q '1$'; then
+  if echo "${config_line}" | grep -E -q '1$'; then
     echo yes
   fi
 }
@@ -150,7 +150,7 @@ is_windows_target() {
 # empty string. Caller is responsible for testing the string once the function
 # returns.
 vpx_tool_path() {
-  local readonly tool_name="$1"
+  local tool_name="$1"
   local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}"
   if [ ! -x "${tool_path}" ]; then
     # Try one directory up: when running via examples.sh the tool could be in
@@ -222,7 +222,7 @@ filter_strings() {
 
   if [ -n "${filter}" ]; then
     for s in ${strings}; do
-      if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then
+      if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then
         filtered_strings="${filtered_strings} ${s}"
       fi
     done
@@ -280,7 +280,12 @@ run_tests() {
     test_end "${test}"
   done
 
-  local tested_config="$(test_configuration_target) @ $(current_hash)"
+  # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform
+  if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then
+    local tested_config="$(current_hash)"
+  else
+    local tested_config="$(test_configuration_target) @ $(current_hash)"
+  fi
   echo "${test_name}: Done, all tests pass for ${tested_config}."
 }
 
@@ -404,12 +409,16 @@ VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
 VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
 VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
 
+VP9_RAW_FILE="${LIBVPX_TEST_DATA_PATH}/crbug-1539.rawfile"
+
 YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
 YUV_RAW_INPUT_WIDTH=352
 YUV_RAW_INPUT_HEIGHT=288
 
 Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
 Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+Y4M_720P_INPUT_WIDTH=1280
+Y4M_720P_INPUT_HEIGHT=720
 
 # Setup a trap function to clean up after tests complete.
 trap cleanup EXIT
diff --git a/media/libvpx/libvpx/test/twopass_encoder.sh b/media/libvpx/libvpx/test/twopass_encoder.sh
index 7a223f2afc..69ecbacd0c 100644
--- a/media/libvpx/libvpx/test/twopass_encoder.sh
+++ b/media/libvpx/libvpx/test/twopass_encoder.sh
@@ -37,7 +37,7 @@ twopass_encoder() {
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
       "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \
-      ${devnull}
+      ${devnull} || return 1
 
   [ -e "${output_file}" ] || return 1
 }
@@ -54,7 +54,10 @@ twopass_encoder_vp9() {
   fi
 }
 
-twopass_encoder_tests="twopass_encoder_vp8
-                       twopass_encoder_vp9"
 
-run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
+if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then
+  twopass_encoder_tests="twopass_encoder_vp8
+                         twopass_encoder_vp9"
+
+  run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
+fi
diff --git a/media/libvpx/libvpx/test/user_priv_test.cc b/media/libvpx/libvpx/test/user_priv_test.cc
index 4b5de094e9..0f45b082e5 100644
--- a/media/libvpx/libvpx/test/user_priv_test.cc
+++ b/media/libvpx/libvpx/test/user_priv_test.cc
@@ -11,7 +11,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "./vpx_config.h"
 #include "test/acm_random.h"
 #include "test/codec_factory.h"
@@ -27,8 +27,8 @@
 
 namespace {
 
-using std::string;
 using libvpx_test::ACMRandom;
+using std::string;
 
 #if CONFIG_WEBM_IO
 
@@ -57,28 +57,28 @@ string DecodeFile(const string &filename) {
     void *user_priv = reinterpret_cast<void *>(&frame_num);
     const vpx_codec_err_t res =
         decoder.DecodeFrame(video.cxdata(), video.frame_size(),
-                            (frame_num == 0) ? NULL : user_priv);
+                            (frame_num == 0) ? nullptr : user_priv);
     if (res != VPX_CODEC_OK) {
       EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
       break;
     }
     libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data.
     while ((img = dec_iter.Next())) {
       if (frame_num == 0) {
-        CheckUserPrivateData(img->user_priv, NULL);
+        CheckUserPrivateData(img->user_priv, nullptr);
       } else {
         CheckUserPrivateData(img->user_priv, &frame_num);
 
         // Also test ctrl_get_reference api.
-        struct vp9_ref_frame ref;
+        struct vp9_ref_frame ref = vp9_ref_frame();
         // Randomly fetch a reference frame.
         ref.idx = rnd.Rand8() % 3;
         decoder.Control(VP9_GET_REFERENCE, &ref);
 
-        CheckUserPrivateData(ref.img.user_priv, NULL);
+        CheckUserPrivateData(ref.img.user_priv, nullptr);
       }
       md5.Add(img);
     }
diff --git a/media/libvpx/libvpx/test/util.h b/media/libvpx/libvpx/test/util.h
index 1f2540ecf2..94bab66416 100644
--- a/media/libvpx/libvpx/test/util.h
+++ b/media/libvpx/libvpx/test/util.h
@@ -8,16 +8,18 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef TEST_UTIL_H_
-#define TEST_UTIL_H_
+#ifndef VPX_TEST_UTIL_H_
+#define VPX_TEST_UTIL_H_
 
 #include <stdio.h>
 #include <math.h>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <tuple>
+
+#include "gtest/gtest.h"
 #include "vpx/vpx_image.h"
 
 // Macros
-#define GET_PARAM(k) std::tr1::get<k>(GetParam())
+#define GET_PARAM(k) std::get<k>(GetParam())
 
 inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
   assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
@@ -43,4 +45,4 @@ inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) {
   return psnr;
 }
 
-#endif  // TEST_UTIL_H_
+#endif  // VPX_TEST_UTIL_H_
diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc
index 6e31165faf..aceb4e4168 100644
--- a/media/libvpx/libvpx/test/variance_test.cc
+++ b/media/libvpx/libvpx/test/variance_test.cc
@@ -11,7 +11,7 @@
 #include <cstdlib>
 #include <new>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -20,26 +20,19 @@
 #include "test/register_state_check.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/variance.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
 
 namespace {
 
-typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
-                                        const uint8_t *b, int b_stride,
-                                        unsigned int *sse);
-typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
-                                         int xoffset, int yoffset,
-                                         const uint8_t *b, int b_stride,
-                                         unsigned int *sse);
-typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse,
-                                            const uint8_t *second_pred);
-typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
-                                      const uint8_t *b, int b_stride);
-typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+using Get4x4SseFunc = unsigned int (*)(const uint8_t *a, int a_stride,
+                                       const uint8_t *b, int b_stride);
+using GetVarianceFunc = void (*)(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *ref_ptr, int ref_stride,
+                                 uint32_t *sse, int *sum);
+using SumOfSquaresFunction = unsigned int (*)(const int16_t *src);
 
 using libvpx_test::ACMRandom;
 
@@ -73,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) {
  *  Our codebase calculates the "diff" value in the variance algorithm by
  *  (src - ref).
  */
-static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
-                             int l2h, int src_stride, int ref_stride,
-                             uint32_t *sse_ptr, bool use_high_bit_depth_,
-                             vpx_bit_depth_t bit_depth) {
-  int64_t se = 0;
-  uint64_t sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
+static void variance(const uint8_t *src, int src_stride, const uint8_t *ref,
+                     int ref_stride, int w, int h, bool use_high_bit_depth_,
+                     uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) {
+  int64_t se_long = 0;
+  uint64_t sse_long = 0;
+
   for (int y = 0; y < h; y++) {
     for (int x = 0; x < w; x++) {
-      int diff;
+      int diff = 0;
       if (!use_high_bit_depth_) {
         diff = src[y * src_stride + x] - ref[y * ref_stride + x];
-        se += diff;
-        sse += diff * diff;
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
                CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
-        se += diff;
-        sse += diff * diff;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
+      se_long += diff;
+      sse_long += diff * diff;
     }
   }
-  RoundHighBitDepth(bit_depth, &se, &sse);
-  *sse_ptr = static_cast<uint32_t>(sse);
+
+  RoundHighBitDepth(bit_depth, &se_long, &sse_long);
+
+  *sse = sse_long;
+  *se = se_long;
+}
+
+static void get_variance_ref(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride, int l2w,
+                             int l2h, bool use_high_bit_depth_, uint32_t *sse,
+                             int *se, vpx_bit_depth_t bit_depth) {
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  int64_t se_long = 0;
+  uint64_t sse_long = 0;
+
+  variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+           &sse_long, &se_long, bit_depth);
+
+  *sse = static_cast<uint32_t>(sse_long);
+  *se = static_cast<int>(se_long);
+}
+
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+                             int l2h, int src_stride, int ref_stride,
+                             uint32_t *sse_ptr, bool use_high_bit_depth_,
+                             vpx_bit_depth_t bit_depth) {
+  const int w = 1 << l2w;
+  const int h = 1 << l2h;
+  int64_t se_long = 0;
+  uint64_t sse_long = 0;
+
+  variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_,
+           &sse_long, &se_long, bit_depth);
+
+  *sse_ptr = static_cast<uint32_t>(sse_long);
   return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+      sse_long - ((static_cast<int64_t>(se_long) * se_long) >> (l2w + l2h)));
 }
 
 /* The subpel reference functions differ from the codec version in one aspect:
@@ -220,7 +243,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
 
-  virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); }
+  ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); }
 
  protected:
   void ConstTest();
@@ -263,7 +286,7 @@ void SumOfSquaresTest::RefTest() {
 
 template <typename Func>
 struct TestParams {
-  TestParams(int log2w = 0, int log2h = 0, Func function = NULL,
+  TestParams(int log2w = 0, int log2h = 0, Func function = nullptr,
              int bit_depth_value = 0)
       : log2width(log2w), log2height(log2h), func(function) {
     use_high_bit_depth = (bit_depth_value > 0);
@@ -299,7 +322,7 @@ template <typename FunctionType>
 class MainTestClass
     : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -307,8 +330,8 @@ class MainTestClass
         use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t);
     src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size() * unit));
     ref_ = new uint8_t[block_size() * unit];
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(ref_, nullptr);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
@@ -318,7 +341,7 @@ class MainTestClass
 #endif
   }
 
-  virtual void TearDown() {
+  void TearDown() override {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (use_high_bit_depth()) {
       // TODO(skal): remove!
@@ -329,8 +352,8 @@ class MainTestClass
 
     vpx_free(src_);
     delete[] ref_;
-    src_ = NULL;
-    ref_ = NULL;
+    src_ = nullptr;
+    ref_ = nullptr;
     libvpx_test::ClearSystemState();
   }
 
@@ -345,6 +368,10 @@ class MainTestClass
   void RefTest();
   void RefStrideTest();
   void OneQuarterTest();
+  void SpeedTest();
+
+  // GetVariance tests
+  void RefTestGetVar();
 
   // MSE/SSE tests
   void RefTestMse();
@@ -363,6 +390,7 @@ class MainTestClass
   int byte_shift() const { return params_.bit_depth - 8; }
   int block_size() const { return params_.block_size; }
   int width() const { return params_.width; }
+  int height() const { return params_.height; }
   uint32_t mask() const { return params_.mask; }
 };
 
@@ -471,6 +499,64 @@ void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
   EXPECT_EQ(expected, var);
 }
 
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+  const int half = block_size() / 2;
+  if (!use_high_bit_depth()) {
+    memset(src_, 255, block_size());
+    memset(ref_, 255, half);
+    memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+  unsigned int sse;
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < (1 << 30) / block_size(); ++i) {
+    const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse);
+    // Ignore return value.
+    (void)variance;
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(),
+         params_.bit_depth, elapsed_time / 1000);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to GetVariance.
+template <typename GetVarianceFunctionType>
+void MainTestClass<GetVarianceFunctionType>::RefTestGetVar() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size(); j++) {
+      if (!use_high_bit_depth()) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+    unsigned int sse1, sse2;
+    int sum1, sum2;
+    const int stride = width();
+    ASM_REGISTER_STATE_CHECK(
+        params_.func(src_, stride, ref_, stride, &sse1, &sum1));
+    get_variance_ref(src_, stride, ref_, stride, params_.log2width,
+                     params_.log2height, use_high_bit_depth(), &sse2, &sum2,
+                     params_.bit_depth);
+    EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+    EXPECT_EQ(sum1, sum2) << "Error at test index: " << i;
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Tests related to MSE / SSE.
 
@@ -478,14 +564,21 @@ template <typename FunctionType>
 void MainTestClass<FunctionType>::RefTestMse() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size(); ++j) {
-      src_[j] = rnd_.Rand8();
-      ref_[j] = rnd_.Rand8();
+      if (!use_high_bit_depth()) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
     }
     unsigned int sse1, sse2;
     const int stride = width();
     ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
     variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
-                 stride, &sse2, false, VPX_BITS_8);
+                 stride, &sse2, use_high_bit_depth(), params_.bit_depth);
     EXPECT_EQ(sse1, sse2);
   }
 }
@@ -509,8 +602,15 @@ void MainTestClass<FunctionType>::RefTestSse() {
 
 template <typename FunctionType>
 void MainTestClass<FunctionType>::MaxTestMse() {
-  memset(src_, 255, block_size());
-  memset(ref_, 0, block_size());
+  if (!use_high_bit_depth()) {
+    memset(src_, 255, block_size());
+    memset(ref_, 0, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+    vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
   unsigned int sse;
   ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
   const unsigned int expected = block_size() * 255 * 255;
@@ -529,62 +629,43 @@ void MainTestClass<FunctionType>::MaxTestSse() {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
-
-template <typename SubpelVarianceFunctionType>
+template <typename FunctionType>
 class SubpelVarianceTest
-    : public ::testing::TestWithParam<
-          tuple<int, int, SubpelVarianceFunctionType, int> > {
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
  public:
-  virtual void SetUp() {
-    const tuple<int, int, SubpelVarianceFunctionType, int> &params =
-        this->GetParam();
-    log2width_ = get<0>(params);
-    width_ = 1 << log2width_;
-    log2height_ = get<1>(params);
-    height_ = 1 << log2height_;
-    subpel_variance_ = get<2>(params);
-    if (get<3>(params)) {
-      bit_depth_ = (vpx_bit_depth_t)get<3>(params);
-      use_high_bit_depth_ = true;
-    } else {
-      bit_depth_ = VPX_BITS_8;
-      use_high_bit_depth_ = false;
-    }
-    mask_ = (1 << bit_depth_) - 1;
+  void SetUp() override {
+    params_ = this->GetParam();
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    block_size_ = width_ * height_;
-    if (!use_high_bit_depth_) {
-      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-      ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+    if (!use_high_bit_depth()) {
+      src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+      sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+      ref_ = reinterpret_cast<uint8_t *>(
+          vpx_malloc(block_size() + width() + height() + 1));
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+          vpx_memalign(16, block_size() * sizeof(uint16_t))));
       sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
-          vpx_memalign(16, block_size_ * sizeof(uint16_t))));
-      ref_ =
-          CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]);
+          vpx_memalign(16, block_size() * sizeof(uint16_t))));
+      ref_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(vpx_malloc(
+          (block_size() + width() + height() + 1) * sizeof(uint16_t))));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-    ASSERT_TRUE(src_ != NULL);
-    ASSERT_TRUE(sec_ != NULL);
-    ASSERT_TRUE(ref_ != NULL);
+    ASSERT_NE(src_, nullptr);
+    ASSERT_NE(sec_, nullptr);
+    ASSERT_NE(ref_, nullptr);
   }
 
-  virtual void TearDown() {
-    if (!use_high_bit_depth_) {
+  void TearDown() override {
+    if (!use_high_bit_depth()) {
       vpx_free(src_);
-      delete[] ref_;
       vpx_free(sec_);
+      vpx_free(ref_);
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       vpx_free(CONVERT_TO_SHORTPTR(src_));
-      delete[] CONVERT_TO_SHORTPTR(ref_);
+      vpx_free(CONVERT_TO_SHORTPTR(ref_));
       vpx_free(CONVERT_TO_SHORTPTR(sec_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
@@ -594,47 +675,51 @@ class SubpelVarianceTest
  protected:
   void RefTest();
   void ExtremeRefTest();
+  void SpeedTest();
 
   ACMRandom rnd_;
   uint8_t *src_;
   uint8_t *ref_;
   uint8_t *sec_;
-  bool use_high_bit_depth_;
-  vpx_bit_depth_t bit_depth_;
-  int width_, log2width_;
-  int height_, log2height_;
-  int block_size_, mask_;
-  SubpelVarianceFunctionType subpel_variance_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t mask() const { return params_.mask; }
 };
 
 template <typename SubpelVarianceFunctionType>
 void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
-      if (!use_high_bit_depth_) {
-        for (int j = 0; j < block_size_; j++) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
           src_[j] = rnd_.Rand8();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        for (int j = 0; j < block_size_; j++) {
-          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
-          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
-      const unsigned int var2 =
-          subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
-                              use_high_bit_depth_, bit_depth_);
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
@@ -648,108 +733,143 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
   // Ref: Set the first half of values to the maximum, the second half to 0.
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
-      const int half = block_size_ / 2;
-      if (!use_high_bit_depth_) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
         memset(src_, 0, half);
         memset(src_ + half, 255, half);
         memset(ref_, 255, half);
-        memset(ref_ + half, 0, half + width_ + height_ + 1);
+        memset(ref_ + half, 0, half + width() + height() + 1);
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
         vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
         vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
-        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
-                     half + width_ + height_ + 1);
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+                     half + width() + height() + 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
-          var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
-      const unsigned int var2 =
-          subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
-                              use_high_bit_depth_, bit_depth_);
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
       EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
       EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
     }
   }
 }
 
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+  // The only interesting points are 0, 4, and anything else. To make the loops
+  // simple we will use 0, 2 and 4.
+  for (int x = 0; x <= 4; x += 2) {
+    for (int y = 0; y <= 4; y += 2) {
+      if (!use_high_bit_depth()) {
+        memset(src_, 25, block_size());
+        memset(ref_, 50, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size());
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse;
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 1000000000 / block_size(); ++i) {
+        const uint32_t variance =
+            params_.func(ref_, width() + 1, x, y, src_, width(), &sse);
+        (void)variance;
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n",
+             width(), height(), x, y, elapsed_time / 1000);
+    }
+  }
+}
+
 template <>
-void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
-      if (!use_high_bit_depth_) {
-        for (int j = 0; j < block_size_; j++) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
           src_[j] = rnd_.Rand8();
           sec_[j] = rnd_.Rand8();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        for (int j = 0; j < block_size_; j++) {
-          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
-          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
         }
-        for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(var1 =
-                                   subpel_variance_(ref_, width_ + 1, x, y,
-                                                    src_, width_, &sse1, sec_));
-      var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_,
-                                     x, y, &sse2, use_high_bit_depth_,
-                                     static_cast<vpx_bit_depth_t>(bit_depth_));
+      ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+                                                   src_, width(), &sse1, sec_));
+      var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+                                     params_.log2height, x, y, &sse2,
+                                     use_high_bit_depth(), params_.bit_depth);
       EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
       EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
     }
   }
 }
 
-typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxMseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxVarianceTest;
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
-typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
+using VpxSseTest = MainTestClass<Get4x4SseFunc>;
+using VpxMseTest = MainTestClass<vpx_variance_fn_t>;
+using VpxVarianceTest = MainTestClass<vpx_variance_fn_t>;
+using VpxGetVarianceTest = MainTestClass<GetVarianceFunc>;
+using VpxSubpelVarianceTest = SubpelVarianceTest<vpx_subpixvariance_fn_t>;
+using VpxSubpelAvgVarianceTest = SubpelVarianceTest<vpx_subp_avg_variance_fn_t>;
 
 TEST_P(VpxSseTest, RefSse) { RefTestSse(); }
 TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); }
 TEST_P(VpxMseTest, RefMse) { RefTestMse(); }
 TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
 TEST_P(VpxVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
 
-INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_c));
+INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
+                         ::testing::Values(vpx_get_mb_ss_c));
 
-typedef TestParams<Get4x4SseFunc> SseParams;
-INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
-                        ::testing::Values(SseParams(2, 2,
-                                                    &vpx_get4x4sse_cs_c)));
+using SseParams = TestParams<Get4x4SseFunc>;
+INSTANTIATE_TEST_SUITE_P(C, VpxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &vpx_get4x4sse_cs_c)));
 
-typedef TestParams<VarianceMxNFunc> MseParams;
-INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
-                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c),
-                                          MseParams(4, 3, &vpx_mse16x8_c),
-                                          MseParams(3, 4, &vpx_mse8x16_c),
-                                          MseParams(3, 3, &vpx_mse8x8_c)));
+using MseParams = TestParams<vpx_variance_fn_t>;
+INSTANTIATE_TEST_SUITE_P(C, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c),
+                                           MseParams(4, 3, &vpx_mse16x8_c),
+                                           MseParams(3, 4, &vpx_mse8x16_c),
+                                           MseParams(3, 3, &vpx_mse8x8_c)));
 
-typedef TestParams<VarianceMxNFunc> VarianceParams;
-INSTANTIATE_TEST_CASE_P(
+using VarianceParams = TestParams<vpx_variance_fn_t>;
+INSTANTIATE_TEST_SUITE_P(
     C, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c),
                       VarianceParams(6, 5, &vpx_variance64x32_c),
@@ -765,72 +885,91 @@ INSTANTIATE_TEST_CASE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_c),
                       VarianceParams(2, 2, &vpx_variance4x4_c)));
 
-INSTANTIATE_TEST_CASE_P(
-    C, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+using GetVarianceParams = TestParams<GetVarianceFunc>;
+INSTANTIATE_TEST_SUITE_P(
+    C, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_c),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_c),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_c),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_c),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_c)));
 
-INSTANTIATE_TEST_CASE_P(
+using SubpelVarianceParams = TestParams<vpx_subpixvariance_fn_t>;
+INSTANTIATE_TEST_SUITE_P(
+    C, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+
+using SubpelAvgVarianceParams = TestParams<vpx_subp_avg_variance_fn_t>;
+INSTANTIATE_TEST_SUITE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxHBDVarianceTest;
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
-typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxHBDSubpelAvgVarianceTest;
+using VpxHBDVarianceTest = MainTestClass<vpx_variance_fn_t>;
+using VpxHBDGetVarianceTest = MainTestClass<GetVarianceFunc>;
+using VpxHBDSubpelVarianceTest = SubpelVarianceTest<vpx_subpixvariance_fn_t>;
+using VpxHBDSubpelAvgVarianceTest =
+    SubpelVarianceTest<vpx_subp_avg_variance_fn_t>;
 
-TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
-TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
 TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
 TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); }
 TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
-/* TODO(debargha): This test does not support the highbd version
-INSTANTIATE_TEST_CASE_P(
+using VpxHBDMseTest = MainTestClass<vpx_variance_fn_t>;
+TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); }
+INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDMseTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c),
-                      make_tuple(4, 4, &vpx_highbd_12_mse16x8_c),
-                      make_tuple(4, 4, &vpx_highbd_12_mse8x16_c),
-                      make_tuple(4, 4, &vpx_highbd_12_mse8x8_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse16x16_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse16x8_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse8x16_c),
-                      make_tuple(4, 4, &vpx_highbd_10_mse8x8_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse16x16_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse16x8_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse8x16_c),
-                      make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
-*/
+    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12),
+                      MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12),
+                      MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12),
+                      MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12),
+                      MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10),
+                      MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10),
+                      MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10),
+                      MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10),
+                      MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8),
+                      MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8),
+                      MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8),
+                      MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8)));
 
-INSTANTIATE_TEST_CASE_P(
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest);
+
+INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_highbd_12_variance64x64_c, 12),
                       VarianceParams(6, 5, &vpx_highbd_12_variance64x32_c, 12),
@@ -872,104 +1011,186 @@ INSTANTIATE_TEST_CASE_P(
                       VarianceParams(2, 3, &vpx_highbd_8_variance4x8_c, 8),
                       VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    C, VpxHBDGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12),
+                      GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12),
+                      GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10),
+                      GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10),
+                      GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8),
+                      GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDSubpelVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
-        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
-        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
-        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
-        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
-        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
-        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+        SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+        SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+        SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+        SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+        SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+        SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+        SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+        SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+        SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+        SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+        SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+        SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+        SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+        SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c,
+                             10),
+        SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c,
+                             10),
+        SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c,
+                             10),
+        SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c,
+                             10),
+        SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c,
+                             10),
+        SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c,
+                             10),
+        SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c,
+                             10),
+        SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+        SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+        SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+        SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+        SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+        SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+        SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c,
+                             12),
+        SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c,
+                             12),
+        SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c,
+                             12),
+        SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c,
+                             12),
+        SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c,
+                             12),
+        SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c,
+                             12),
+        SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c,
+                             12),
+        SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+        SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+        SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+        SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+        SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+        SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c,
+                             12)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     C, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
-        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
-        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
-        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
-        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
-        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
-        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+        SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c,
+                                8),
+        SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c,
+                                8),
+        SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c,
+                                8),
+        SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c,
+                                8),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x64_c,
+                                10),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x32_c,
+                                10),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x64_c,
+                                10),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x32_c,
+                                10),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x16_c,
+                                10),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x32_c,
+                                10),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x16_c,
+                                10),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x8_c,
+                                10),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x16_c,
+                                10),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x64_c,
+                                12),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x32_c,
+                                12),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x64_c,
+                                12),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x32_c,
+                                12),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x16_c,
+                                12),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x32_c,
+                                12),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x16_c,
+                                12),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x8_c,
+                                12),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x16_c,
+                                12),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x4_c,
+                                12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest,
+                         ::testing::Values(vpx_get_mb_ss_sse2));
 
-INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest,
-                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2),
-                                          MseParams(4, 3, &vpx_mse16x8_sse2),
-                                          MseParams(3, 4, &vpx_mse8x16_sse2),
-                                          MseParams(3, 3, &vpx_mse8x8_sse2)));
+INSTANTIATE_TEST_SUITE_P(SSE2, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2),
+                                           MseParams(4, 3, &vpx_mse16x8_sse2),
+                                           MseParams(3, 4, &vpx_mse8x16_sse2),
+                                           MseParams(3, 3, &vpx_mse8x8_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_sse2),
                       VarianceParams(6, 5, &vpx_variance64x32_sse2),
@@ -985,58 +1206,61 @@ INSTANTIATE_TEST_CASE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_sse2),
                       VarianceParams(2, 2, &vpx_variance4x4_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_sse2),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_sse2),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_sse2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
-        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
-        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
-        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
-        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
-        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
-        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
-        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
-        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
-        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
-/* TODO(debargha): This test does not support the highbd version
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDMseTest,
-    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2),
-                      MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2),
-                      MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2),
-                      MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2),
-                      MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2),
-                      MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2),
-                      MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2),
-                      MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2),
-                      MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2),
-                      MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2),
-                      MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2),
-                      MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2)));
-*/
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12),
+        MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12),
+        MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10),
+        MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10),
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDVarianceTest,
     ::testing::Values(
         VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sse2, 12),
@@ -1070,183 +1294,745 @@ INSTANTIATE_TEST_CASE_P(
         VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sse2, 8),
         VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, VpxHBDGetVarianceTest,
+    ::testing::Values(
+        GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12),
+        GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12),
+        GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10),
+        GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10),
+        GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8),
+        GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDSubpelVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10),
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8)));
+        SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
+                             12),
+        SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2,
+                             12),
+        SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2,
+                             12),
+        SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2,
+                             12),
+        SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2,
+                             12),
+        SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2,
+                             12),
+        SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2,
+                             12),
+        SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2,
+                             12),
+        SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2,
+                             12),
+        SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2,
+                             12),
+        SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2,
+                             12),
+        SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2,
+                             10),
+        SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2,
+                             10),
+        SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2,
+                             10),
+        SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2,
+                             10),
+        SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2,
+                             10),
+        SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2,
+                             10),
+        SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2,
+                             10),
+        SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2,
+                             10),
+        SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2,
+                             10),
+        SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2,
+                             10),
+        SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2,
+                             10),
+        SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2,
+                             8),
+        SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2,
+                             8),
+        SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2,
+                             8),
+        SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2,
+                             8),
+        SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2,
+                             8),
+        SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2,
+                             8),
+        SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2,
+                             8),
+        SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2,
+                             8),
+        SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2,
+                             8),
+        SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
+        SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2,
+                             8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10),
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2,
+                                12),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2,
+                                12),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2,
+                                12),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2,
+                                12),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2,
+                                12),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2,
+                                12),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2,
+                                12),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2,
+                                12),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2,
+                                12),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2,
+                                12),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2,
+                                12),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2,
+                                10),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2,
+                                10),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2,
+                                10),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2,
+                                10),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2,
+                                10),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2,
+                                10),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2,
+                                10),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2,
+                                10),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2,
+                                10),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2,
+                                10),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2,
+                                10),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2,
+                                8),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2,
+                                8),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2,
+                                8),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2,
+                                8),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2,
+                                8),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2,
+                                8),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2,
+                                8),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2,
+                                8),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2,
+                                8),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2,
+                                8),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2,
+                                8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0),
-        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0),
-        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0),
-        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0),
-        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0),
-        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0),
-        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
-        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
-        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
-        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
-        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
-        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3,
+                                0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3,
+                                0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3,
+                                0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3,
+                                0)));
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
-                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2)));
+INSTANTIATE_TEST_SUITE_P(AVX2, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2),
+                                           MseParams(4, 3, &vpx_mse16x8_avx2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_avx2),
                       VarianceParams(6, 5, &vpx_variance64x32_avx2),
+                      VarianceParams(5, 6, &vpx_variance32x64_avx2),
                       VarianceParams(5, 5, &vpx_variance32x32_avx2),
                       VarianceParams(5, 4, &vpx_variance32x16_avx2),
-                      VarianceParams(4, 4, &vpx_variance16x16_avx2)));
+                      VarianceParams(4, 5, &vpx_variance16x32_avx2),
+                      VarianceParams(4, 4, &vpx_variance16x16_avx2),
+                      VarianceParams(4, 3, &vpx_variance16x8_avx2),
+                      VarianceParams(3, 4, &vpx_variance8x16_avx2),
+                      VarianceParams(3, 3, &vpx_variance8x8_avx2),
+                      VarianceParams(3, 2, &vpx_variance8x4_avx2)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     AVX2, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
-        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2,
+                                0)));
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
-                        ::testing::Values(SseParams(2, 2,
-                                                    &vpx_get4x4sse_cs_neon)));
+INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &vpx_get4x4sse_cs_neon)));
 
-INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest,
-                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon)));
+INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon),
+                                           MseParams(4, 3, &vpx_mse16x8_neon),
+                                           MseParams(3, 4, &vpx_mse8x16_neon),
+                                           MseParams(3, 3, &vpx_mse8x8_neon)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon),
                       VarianceParams(6, 5, &vpx_variance64x32_neon),
                       VarianceParams(5, 6, &vpx_variance32x64_neon),
                       VarianceParams(5, 5, &vpx_variance32x32_neon),
+                      VarianceParams(5, 4, &vpx_variance32x16_neon),
+                      VarianceParams(4, 5, &vpx_variance16x32_neon),
                       VarianceParams(4, 4, &vpx_variance16x16_neon),
                       VarianceParams(4, 3, &vpx_variance16x8_neon),
                       VarianceParams(3, 4, &vpx_variance8x16_neon),
-                      VarianceParams(3, 3, &vpx_variance8x8_neon)));
+                      VarianceParams(3, 3, &vpx_variance8x8_neon),
+                      VarianceParams(3, 2, &vpx_variance8x4_neon),
+                      VarianceParams(2, 3, &vpx_variance4x8_neon),
+                      VarianceParams(2, 2, &vpx_variance4x4_neon)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon)));
+
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxSseTest,
+    ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxMseTest,
+    ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod),
+                      MseParams(4, 3, &vpx_mse16x8_neon_dotprod),
+                      MseParams(3, 4, &vpx_mse8x16_neon_dotprod),
+                      MseParams(3, 3, &vpx_mse8x8_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod),
+                      VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod),
+                      VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod),
+                      VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod),
+                      VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod),
+                      VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod),
+                      VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod),
+                      VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod),
+                      VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod),
+                      VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod),
+                      VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod),
+                      VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod),
+                      VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod)));
+#endif  // HAVE_NEON_DOTPROD
+
+INSTANTIATE_TEST_SUITE_P(
     NEON, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0)));
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDMseTest,
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12),
+        MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12),
+        MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12),
+        MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12),
+        MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10),
+        MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10),
+        MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10),
+        MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10),
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8),
+        MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8),
+        MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8)));
+
+#if HAVE_NEON_DOTPROD
+INSTANTIATE_TEST_SUITE_P(
+    NEON_DOTPROD, VpxHBDMseTest,
+    ::testing::Values(
+        MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8),
+        MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8),
+        MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8)));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, VpxHBDMseTest,
+    ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12),
+                      MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12),
+                      MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12),
+                      MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12),
+                      MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10),
+                      MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10),
+                      MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10),
+                      MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10)));
+#endif  // HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDVarianceTest,
+    ::testing::Values(
+        VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12),
+        VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12),
+        VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12),
+        VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12),
+        VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12),
+        VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12),
+        VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12),
+        VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12),
+        VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12),
+        VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12),
+        VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12),
+        VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12),
+        VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12),
+        VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10),
+        VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10),
+        VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10),
+        VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10),
+        VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10),
+        VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10),
+        VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10),
+        VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10),
+        VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10),
+        VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10),
+        VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10),
+        VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10),
+        VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10),
+        VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8),
+        VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8),
+        VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8),
+        VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8),
+        VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8),
+        VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8),
+        VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8),
+        VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8),
+        VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8),
+        VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8),
+        VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8),
+        VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8),
+        VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDGetVarianceTest,
+    ::testing::Values(
+        GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12),
+        GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12),
+        GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10),
+        GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10),
+        GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8),
+        GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8)));
+
+#if HAVE_SVE
+INSTANTIATE_TEST_SUITE_P(
+    SVE, VpxHBDGetVarianceTest,
+    ::testing::Values(
+        GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12),
+        GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12),
+        GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10),
+        GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10),
+        GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8),
+        GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8)));
+#endif  // HAVE_SVE
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
+                             12),
+        SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon,
+                             12),
+        SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon,
+                             12),
+        SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon,
+                             12),
+        SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon,
+                             12),
+        SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon,
+                             12),
+        SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon,
+                             12),
+        SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon,
+                             12),
+        SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon,
+                             12),
+        SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon,
+                             12),
+        SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
+                             12),
+        SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon,
+                             12),
+        SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon,
+                             12),
+        SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
+                             10),
+        SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
+                             10),
+        SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon,
+                             10),
+        SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon,
+                             10),
+        SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon,
+                             10),
+        SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon,
+                             10),
+        SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon,
+                             10),
+        SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon,
+                             10),
+        SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon,
+                             10),
+        SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon,
+                             10),
+        SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
+                             10),
+        SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon,
+                             10),
+        SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon,
+                             10),
+        SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
+                             8),
+        SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
+                             8),
+        SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon,
+                             8),
+        SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon,
+                             8),
+        SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon,
+                             8),
+        SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon,
+                             8),
+        SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon,
+                             8),
+        SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon,
+                             8),
+        SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
+                             8),
+        SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
+        SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8),
+        SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8),
+        SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon,
+                             8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x64_neon,
+                                12),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance64x32_neon,
+                                12),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x64_neon,
+                                12),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x32_neon,
+                                12),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance32x16_neon,
+                                12),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x32_neon,
+                                12),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x16_neon,
+                                12),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance16x8_neon,
+                                12),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x16_neon,
+                                12),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x8_neon,
+                                12),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
+                                12),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x8_neon,
+                                12),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_12_sub_pixel_avg_variance4x4_neon,
+                                12),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
+                                10),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance64x32_neon,
+                                10),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x64_neon,
+                                10),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x32_neon,
+                                10),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance32x16_neon,
+                                10),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x32_neon,
+                                10),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x16_neon,
+                                10),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance16x8_neon,
+                                10),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x16_neon,
+                                10),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x8_neon,
+                                10),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
+                                10),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x8_neon,
+                                10),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_10_sub_pixel_avg_variance4x4_neon,
+                                10),
+        SubpelAvgVarianceParams(6, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
+                                8),
+        SubpelAvgVarianceParams(6, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance64x32_neon,
+                                8),
+        SubpelAvgVarianceParams(5, 6,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x64_neon,
+                                8),
+        SubpelAvgVarianceParams(5, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x32_neon,
+                                8),
+        SubpelAvgVarianceParams(5, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance32x16_neon,
+                                8),
+        SubpelAvgVarianceParams(4, 5,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x32_neon,
+                                8),
+        SubpelAvgVarianceParams(4, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x16_neon,
+                                8),
+        SubpelAvgVarianceParams(4, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance16x8_neon,
+                                8),
+        SubpelAvgVarianceParams(3, 4,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x16_neon,
+                                8),
+        SubpelAvgVarianceParams(3, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x8_neon,
+                                8),
+        SubpelAvgVarianceParams(3, 2,
+                                &vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
+                                8),
+        SubpelAvgVarianceParams(2, 3,
+                                &vpx_highbd_8_sub_pixel_avg_variance4x8_neon,
+                                8),
+        SubpelAvgVarianceParams(2, 2,
+                                &vpx_highbd_8_sub_pixel_avg_variance4x4_neon,
+                                8)));
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
+#if HAVE_SVE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    SVE, VpxHBDVarianceTest,
+    ::testing::Values(
+        VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12),
+        VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12),
+        VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12),
+        VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12),
+        VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12),
+        VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12),
+        VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12),
+        VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12),
+        VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12),
+        VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12),
+        VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12),
+        VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12),
+        VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12),
+        VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10),
+        VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10),
+        VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10),
+        VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10),
+        VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10),
+        VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10),
+        VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10),
+        VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10),
+        VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10),
+        VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10),
+        VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10),
+        VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10),
+        VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10),
+        VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8),
+        VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8),
+        VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8),
+        VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8),
+        VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8),
+        VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8),
+        VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8),
+        VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8),
+        VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8),
+        VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8),
+        VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8),
+        VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8),
+        VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SVE
+
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest,
+                         ::testing::Values(vpx_get_mb_ss_msa));
 
-INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest,
-                        ::testing::Values(SseParams(2, 2,
-                                                    &vpx_get4x4sse_cs_msa)));
+INSTANTIATE_TEST_SUITE_P(MSA, VpxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &vpx_get4x4sse_cs_msa)));
 
-INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest,
-                        ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa),
-                                          MseParams(4, 3, &vpx_mse16x8_msa),
-                                          MseParams(3, 4, &vpx_mse8x16_msa),
-                                          MseParams(3, 3, &vpx_mse8x8_msa)));
+INSTANTIATE_TEST_SUITE_P(MSA, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa),
+                                           MseParams(4, 3, &vpx_mse16x8_msa),
+                                           MseParams(3, 4, &vpx_mse8x16_msa),
+                                           MseParams(3, 3, &vpx_mse8x8_msa)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, VpxVarianceTest,
     ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_msa),
                       VarianceParams(6, 5, &vpx_variance64x32_msa),
@@ -1262,36 +2048,167 @@ INSTANTIATE_TEST_CASE_P(
                       VarianceParams(2, 3, &vpx_variance4x8_msa),
                       VarianceParams(2, 2, &vpx_variance4x4_msa)));
 
-INSTANTIATE_TEST_CASE_P(
-    MSA, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
-                      make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+INSTANTIATE_TEST_SUITE_P(
+    MSA, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_msa),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_msa),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_msa)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
+    MSA, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+
+INSTANTIATE_TEST_SUITE_P(
     MSA, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
 #endif  // HAVE_MSA
+
+#if HAVE_VSX
+INSTANTIATE_TEST_SUITE_P(VSX, SumOfSquaresTest,
+                         ::testing::Values(vpx_get_mb_ss_vsx));
+
+INSTANTIATE_TEST_SUITE_P(VSX, VpxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &vpx_get4x4sse_cs_vsx)));
+INSTANTIATE_TEST_SUITE_P(VSX, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx),
+                                           MseParams(4, 3, &vpx_mse16x8_vsx),
+                                           MseParams(3, 4, &vpx_mse8x16_vsx),
+                                           MseParams(3, 3, &vpx_mse8x8_vsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    VSX, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx),
+                      VarianceParams(6, 5, &vpx_variance64x32_vsx),
+                      VarianceParams(5, 6, &vpx_variance32x64_vsx),
+                      VarianceParams(5, 5, &vpx_variance32x32_vsx),
+                      VarianceParams(5, 4, &vpx_variance32x16_vsx),
+                      VarianceParams(4, 5, &vpx_variance16x32_vsx),
+                      VarianceParams(4, 4, &vpx_variance16x16_vsx),
+                      VarianceParams(4, 3, &vpx_variance16x8_vsx),
+                      VarianceParams(3, 4, &vpx_variance8x16_vsx),
+                      VarianceParams(3, 3, &vpx_variance8x8_vsx),
+                      VarianceParams(3, 2, &vpx_variance8x4_vsx),
+                      VarianceParams(2, 3, &vpx_variance4x8_vsx),
+                      VarianceParams(2, 2, &vpx_variance4x4_vsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    VSX, VpxGetVarianceTest,
+    ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_vsx),
+                      GetVarianceParams(4, 4, &vpx_get16x16var_vsx),
+                      GetVarianceParams(3, 3, &vpx_get8x8var_vsx)));
+#endif  // HAVE_VSX
+
+#if HAVE_MMI
+INSTANTIATE_TEST_SUITE_P(MMI, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi),
+                                           MseParams(4, 3, &vpx_mse16x8_mmi),
+                                           MseParams(3, 4, &vpx_mse8x16_mmi),
+                                           MseParams(3, 3, &vpx_mse8x8_mmi)));
+
+INSTANTIATE_TEST_SUITE_P(
+    MMI, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi),
+                      VarianceParams(6, 5, &vpx_variance64x32_mmi),
+                      VarianceParams(5, 6, &vpx_variance32x64_mmi),
+                      VarianceParams(5, 5, &vpx_variance32x32_mmi),
+                      VarianceParams(5, 4, &vpx_variance32x16_mmi),
+                      VarianceParams(4, 5, &vpx_variance16x32_mmi),
+                      VarianceParams(4, 4, &vpx_variance16x16_mmi),
+                      VarianceParams(4, 3, &vpx_variance16x8_mmi),
+                      VarianceParams(3, 4, &vpx_variance8x16_mmi),
+                      VarianceParams(3, 3, &vpx_variance8x8_mmi),
+                      VarianceParams(3, 2, &vpx_variance8x4_mmi),
+                      VarianceParams(2, 3, &vpx_variance4x8_mmi),
+                      VarianceParams(2, 2, &vpx_variance4x4_mmi)));
+
+INSTANTIATE_TEST_SUITE_P(
+    MMI, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0),
+        SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0),
+        SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0),
+        SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0),
+        SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0),
+        SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0),
+        SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0),
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0),
+        SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0),
+        SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0),
+        SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0)));
+
+INSTANTIATE_TEST_SUITE_P(
+    MMI, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0),
+        SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0),
+        SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0),
+        SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0),
+        SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0),
+        SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0),
+        SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0),
+        SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0),
+        SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0),
+        SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0),
+        SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0),
+        SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0),
+        SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0)));
+#endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx),
+                      VarianceParams(5, 5, &vpx_variance32x32_lsx),
+                      VarianceParams(4, 4, &vpx_variance16x16_lsx),
+                      VarianceParams(3, 3, &vpx_variance8x8_lsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0)));
+
+INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest,
+                         ::testing::Values(SubpelAvgVarianceParams(
+                             6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0)));
+#endif
 }  // namespace
diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h
index 54f692865b..419d162596 100644
--- a/media/libvpx/libvpx/test/video_source.h
+++ b/media/libvpx/libvpx/test/video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_VIDEO_SOURCE_H_
-#define TEST_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_VIDEO_SOURCE_H_
+#define VPX_TEST_VIDEO_SOURCE_H_
 
 #if defined(_WIN32)
 #undef NOMINMAX
@@ -20,7 +20,13 @@
 #endif
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <memory>
 #include <string>
+
+#if !defined(_WIN32)
+#include "gtest/gtest.h"
+#endif
 #include "test/acm_random.h"
 #include "vpx/vpx_encoder.h"
 
@@ -36,7 +42,7 @@ namespace libvpx_test {
 // A simple function to encapsulate cross platform retrieval of test data path
 static std::string GetDataPath() {
   const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH");
-  if (data_path == NULL) {
+  if (data_path == nullptr) {
 #ifdef LIBVPX_TEST_DATA_PATH
     // In some environments, we cannot set environment variables
     // Instead, we set the data path by using a preprocessor symbol
@@ -58,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) {
   return fopen(path_to_source.c_str(), "rb");
 }
 
-static FILE *GetTempOutFile(std::string *file_name) {
+static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) {
   file_name->clear();
 #if defined(_WIN32)
   char fname[MAX_PATH];
@@ -67,18 +73,37 @@ static FILE *GetTempOutFile(std::string *file_name) {
     // Assume for now that the filename generated is unique per process
     if (GetTempFileNameA(tmppath, "lvx", 0, fname)) {
       file_name->assign(fname);
-      return fopen(fname, "wb+");
+      return fopen(fname, io_mode);
     }
   }
-  return NULL;
+  return nullptr;
 #else
-  return tmpfile();
+  std::string temp_dir = testing::TempDir();
+  if (temp_dir.empty()) return nullptr;
+  // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may
+  // use the value of an environment variable without checking for a trailing
+  // path delimiter.
+  if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/';
+  const char name_template[] = "libvpxtest.XXXXXX";
+  std::unique_ptr<char[]> temp_file_name(
+      new char[temp_dir.size() + sizeof(name_template)]);
+  if (temp_file_name == nullptr) return nullptr;
+  memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size());
+  memcpy(temp_file_name.get() + temp_dir.size(), name_template,
+         sizeof(name_template));
+  const int fd = mkstemp(temp_file_name.get());
+  if (fd == -1) return nullptr;
+  *file_name = temp_file_name.get();
+  return fdopen(fd, io_mode);
 #endif
 }
 
 class TempOutFile {
  public:
-  TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+  TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); }
+  TempOutFile(const char *io_mode) {
+    file_ = GetTempOutFile(&file_name_, io_mode);
+  }
   ~TempOutFile() {
     CloseFile();
     if (!file_name_.empty()) {
@@ -92,7 +117,7 @@ class TempOutFile {
   void CloseFile() {
     if (file_) {
       fclose(file_);
-      file_ = NULL;
+      file_ = nullptr;
     }
   }
   FILE *file_;
@@ -111,7 +136,7 @@ class VideoSource {
   // Advance the cursor to the next frame
   virtual void Next() = 0;
 
-  // Get the current video frame, or NULL on End-Of-Stream.
+  // Get the current video frame, or nullptr on End-Of-Stream.
   virtual vpx_image_t *img() const = 0;
 
   // Get the presentation timestamp of the current frame.
@@ -133,38 +158,40 @@ class VideoSource {
 class DummyVideoSource : public VideoSource {
  public:
   DummyVideoSource()
-      : img_(NULL), limit_(100), width_(80), height_(64),
+      : img_(nullptr), limit_(100), width_(80), height_(64),
         format_(VPX_IMG_FMT_I420) {
     ReallocImage();
   }
 
-  virtual ~DummyVideoSource() { vpx_img_free(img_); }
+  ~DummyVideoSource() override { vpx_img_free(img_); }
 
-  virtual void Begin() {
+  void Begin() override {
     frame_ = 0;
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  vpx_image_t *img() const override {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
+  vpx_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual vpx_rational_t timebase() const {
+  vpx_rational_t timebase() const override {
     const vpx_rational_t t = { 1, 30 };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   void set_limit(unsigned int limit) { limit_ = limit; }
 
@@ -190,8 +217,9 @@ class DummyVideoSource : public VideoSource {
 
   void ReallocImage() {
     vpx_img_free(img_);
-    img_ = vpx_img_alloc(NULL, format_, width_, height_, 32);
-    raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
+    img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32);
+    ASSERT_NE(img_, nullptr);
+    raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8;
   }
 
   vpx_image_t *img_;
@@ -208,17 +236,17 @@ class RandomVideoSource : public DummyVideoSource {
   RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
       : rnd_(seed), seed_(seed) {}
 
- protected:
   // Reset the RNG to get a matching stream for the second pass
-  virtual void Begin() {
+  void Begin() override {
     frame_ = 0;
     rnd_.Reset(seed_);
     FillFrame();
   }
 
+ protected:
   // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
   // than holding previous frames to encourage keyframes to be thrown.
-  virtual void FillFrame() {
+  void FillFrame() override {
     if (img_) {
       if (frame_ % 30 < 15) {
         for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8();
@@ -255,4 +283,4 @@ class CompressedVideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_VIDEO_SOURCE_H_
diff --git a/media/libvpx/libvpx/test/vp8_boolcoder_test.cc b/media/libvpx/libvpx/test/vp8_boolcoder_test.cc
index 9d81f9382a..0e2c7b77e1 100644
--- a/media/libvpx/libvpx/test/vp8_boolcoder_test.cc
+++ b/media/libvpx/libvpx/test/vp8_boolcoder_test.cc
@@ -15,7 +15,7 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "test/acm_random.h"
 #include "vp8/decoder/dboolhuff.h"
@@ -93,6 +93,9 @@ TEST(VP8, TestBitIO) {
         }
 
         vp8_stop_encode(&bw);
+        // vp8dx_bool_decoder_fill() may read into uninitialized data that
+        // isn't used meaningfully, but may trigger an MSan warning.
+        memset(bw_buffer + bw.pos, 0, sizeof(VP8_BD_VALUE) - 1);
 
         BOOL_DECODER br;
         encrypt_buffer(bw_buffer, kBufferSize);
diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc
new file mode 100644
index 0000000000..63fc724b7d
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc
@@ -0,0 +1,508 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_encoder.h"
+
+namespace {
+
+class DatarateTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+  DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+
+  ~DatarateTestLarge() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    set_cpu_used_ = GET_PARAM(2);
+    ResetModel();
+  }
+
+  void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    first_drop_ = 0;
+    bits_total_ = 0;
+    duration_ = 0.0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+    gf_boost_ = 0;
+    use_roi_ = false;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_);
+    }
+
+    if (use_roi_) {
+      encoder->Control(VP8E_SET_ROI_MAP, &roi_);
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+      encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    }
+
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    // TODO(jimbankoski): Remove these lines when the issue:
+    // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
+    // For now the codec assumes buffer starts at starting buffer rate
+    // plus one frame's time.
+    if (last_pts_ == 0) duration = 1;
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    /* Test the buffer model here before subtracting the frame. Do so because
+     * the way the leaky bucket model works in libvpx is to allow the buffer to
+     * empty - and then stop showing frames until we've got enough bits to
+     * show one. As noted in comment below (issue 495), this does not currently
+     * apply to key frames. For now exclude key frames in condition below. */
+    const bool key_frame =
+        (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false;
+    if (!key_frame) {
+      ASSERT_GE(bits_in_buffer_model_, 0)
+          << "Buffer Underrun at frame " << pkt->data.frame.pts;
+    }
+
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Subtract from the buffer the bits associated with a played back frame.
+    bits_in_buffer_model_ -= frame_size_in_bits;
+
+    // Update the running total of bits for end of test datarate checks.
+    bits_total_ += frame_size_in_bits;
+
+    // If first drop not set and we have a drop set it to this time.
+    if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+
+    // We update this so that we can calculate the datarate minus the last
+    // frame encoded in the file.
+    bits_in_last_frame_ = frame_size_in_bits;
+
+    ++frame_number_;
+  }
+
+  void EndPassHook() override {
+    if (bits_total_) {
+      const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
+
+      duration_ = (last_pts_ + 1) * timebase_;
+
+      // Effective file datarate includes the time spent prebuffering.
+      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 /
+                            (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
+
+      file_datarate_ = file_size_in_kb / duration_;
+    }
+  }
+
+  virtual void DenoiserLevelsTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    for (int j = 1; j < 5; ++j) {
+      // Run over the denoiser levels.
+      // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
+      // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
+      // denoiserOnAggressive, and denoiserOnAdaptive.
+      cfg_.rc_target_bitrate = 300;
+      ResetModel();
+      denoiser_on_ = j;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+          << " The datarate for the file exceeds the target!";
+
+      ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+          << " The datarate for the file missed the target!";
+    }
+  }
+
+  virtual void DenoiserOffOnTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 299);
+    cfg_.rc_target_bitrate = 300;
+    ResetModel();
+    // Set the offon test flag.
+    denoiser_offon_test_ = 1;
+    denoiser_offon_period_ = 100;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+
+  virtual void BasicBufferModelTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    // 2 pass cbr datarate control has a bug hidden by the small # of
+    // frames selected in this encode. The problem is that even if the buffer is
+    // negative we produce a keyframe on a cutscene. Ignoring datarate
+    // constraints
+    // TODO(jimbankoski): ( Fix when issue
+    // http://code.google.com/p/webm/issues/detail?id=495 is addressed. )
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+
+    // There is an issue for low bitrates in real-time mode, where the
+    // effective_datarate slightly overshoots the target bitrate.
+    // This is same the issue as noted about (#495).
+    // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
+    // when the issue is resolved.
+    for (int i = 100; i < 800; i += 200) {
+      cfg_.rc_target_bitrate = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+          << " The datarate for the file exceeds the target!";
+      ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+          << " The datarate for the file missed the target!";
+    }
+  }
+
+  virtual void ChangingDropFrameThreshTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_max_quantizer = 36;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.kf_mode = VPX_KF_DISABLED;
+
+    const int frame_count = 40;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, frame_count);
+
+    // Here we check that the first dropped frame gets earlier and earlier
+    // as the drop frame threshold is increased.
+
+    const int kDropFrameThreshTestStep = 30;
+    vpx_codec_pts_t last_drop = frame_count;
+    for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_LE(first_drop_, last_drop)
+          << " The first dropped frame for drop_thresh " << i
+          << " > first dropped frame for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+    }
+  }
+
+  virtual void DropFramesMultiThreadsTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 30;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_threads = 2;
+
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    cfg_.rc_target_bitrate = 200;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+        << " The datarate for the file missed the target!";
+  }
+
+  virtual void MultiThreadsPSNRTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_threads = 4;
+    init_flags_ = VPX_CODEC_USE_PSNR;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, 30);
+    cfg_.rc_target_bitrate = 1000;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5)
+        << " The datarate for the file exceeds the target!";
+
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0)
+        << " The datarate for the file missed the target!";
+  }
+
+  vpx_codec_pts_t last_pts_;
+  int64_t bits_in_buffer_model_;
+  double timebase_;
+  int frame_number_;
+  vpx_codec_pts_t first_drop_;
+  int64_t bits_total_;
+  double duration_;
+  double file_datarate_;
+  double effective_datarate_;
+  int64_t bits_in_last_frame_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  int set_cpu_used_;
+  int gf_boost_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) { DenoiserLevelsTest(); }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestLarge, DenoiserOffOn) { DenoiserOffOnTest(); }
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestLarge, BasicBufferModel) { BasicBufferModelTest(); }
+
+TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestLarge, DropFramesMultiThreads) {
+  DropFramesMultiThreadsTest();
+}
+
+class DatarateTestRealTime : public DatarateTestLarge {
+ public:
+  ~DatarateTestRealTime() override = default;
+};
+
+#if CONFIG_TEMPORAL_DENOISING
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestRealTime, DenoiserLevels) { DenoiserLevelsTest(); }
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestRealTime, DenoiserOffOn) {}
+#endif  // CONFIG_TEMPORAL_DENOISING
+
+TEST_P(DatarateTestRealTime, BasicBufferModel) { BasicBufferModelTest(); }
+
+TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestRealTime, DropFramesMultiThreads) {
+  DropFramesMultiThreadsTest();
+}
+
+TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); }
+
+TEST_P(DatarateTestRealTime, RegionOfInterest) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 15) / 16;
+  roi_.cols = (cfg_.g_w + 15) / 16;
+
+  roi_.delta_q[0] = 0;
+  roi_.delta_q[1] = -20;
+  roi_.delta_q[2] = 0;
+  roi_.delta_q[3] = 0;
+
+  roi_.delta_lf[0] = 0;
+  roi_.delta_lf[1] = -20;
+  roi_.delta_lf[2] = 0;
+  roi_.delta_lf[3] = 0;
+
+  roi_.static_threshold[0] = 0;
+  roi_.static_threshold[1] = 1000;
+  roi_.static_threshold[2] = 0;
+  roi_.static_threshold[3] = 0;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map =
+      (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
+TEST_P(DatarateTestRealTime, GFBoost) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  // Apply a gf boost.
+  gf_boost_ = 50;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+TEST_P(DatarateTestRealTime, NV12) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_error_resilient = 0;
+  ::libvpx_test::YUVVideoSource video("hantro_collage_w352h288_nv12.yuv",
+                                      VPX_IMG_FMT_NV12, 352, 288, 30, 1, 0,
+                                      100);
+
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+class DatarateTestPsnr : public DatarateTestLarge {
+ public:
+  DatarateTestPsnr() : DatarateTestLarge() {}
+  ~DatarateTestPsnr() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(libvpx_test::kRealTime);
+    set_cpu_used_ = 10;
+    ResetModel();
+    frame_flags_ = VPX_EFLAG_CALCULATE_PSNR;
+  }
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    DatarateTestLarge::PreEncodeFrameHook(video, encoder);
+    frame_flags_ ^= VPX_EFLAG_CALCULATE_PSNR;
+#if CONFIG_INTERNAL_STATS
+    // CONFIG_INTERNAL_STATS unconditionally generates PSNR.
+    expect_psnr_ = true;
+#else
+    expect_psnr_ = (frame_flags_ & VPX_EFLAG_CALCULATE_PSNR) != 0;
+#endif  // CONFIG_INTERNAL_STATS
+    if (video->img() == nullptr) {
+      expect_psnr_ = false;
+    }
+  }
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    libvpx_test::CxDataIterator iter = encoder->GetCxData();
+
+    bool had_psnr = false;
+    while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+      if (pkt->kind == VPX_CODEC_PSNR_PKT) had_psnr = true;
+    }
+
+    EXPECT_EQ(had_psnr, expect_psnr_);
+  }
+
+ private:
+  bool expect_psnr_;
+};
+
+TEST_P(DatarateTestPsnr, PerFramePsnr) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 100);
+
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP8_INSTANTIATE_TEST_SUITE(DatarateTestLarge, ALL_TEST_MODES,
+                           ::testing::Values(0));
+VP8_INSTANTIATE_TEST_SUITE(DatarateTestRealTime,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Values(-6, -12));
+VP8_INSTANTIATE_TEST_SUITE(DatarateTestPsnr,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Values(0));
+
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp8_decrypt_test.cc b/media/libvpx/libvpx/test/vp8_decrypt_test.cc
index bcac9d1a82..e00620b192 100644
--- a/media/libvpx/libvpx/test/vp8_decrypt_test.cc
+++ b/media/libvpx/libvpx/test/vp8_decrypt_test.cc
@@ -12,7 +12,7 @@
 #include <cstdlib>
 #include <string>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/ivf_video_source.h"
 
diff --git a/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc b/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc
index 2cbcf04149..6c68355d8e 100644
--- a/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc
+++ b/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc
@@ -12,7 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -21,6 +21,7 @@
 #include "vp8/encoder/denoising.h"
 #include "vp8/common/reconinter.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 
 using libvpx_test::ACMRandom;
@@ -30,17 +31,22 @@ namespace {
 const int kNumPixels = 16 * 16;
 class VP8DenoiserTest : public ::testing::TestWithParam<int> {
  public:
-  virtual ~VP8DenoiserTest() {}
+  ~VP8DenoiserTest() override = default;
 
-  virtual void SetUp() { increase_denoising_ = GetParam(); }
+  void SetUp() override { increase_denoising_ = GetParam(); }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   int increase_denoising_;
 };
 
+// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10.
+#if defined(__GNUC__) && __GNUC__ >= 8
+TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) {
+#else
 TEST_P(VP8DenoiserTest, BitexactCheck) {
+#endif
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 4000;
   const int stride = 16;
@@ -87,7 +93,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
     // Check bitexactness.
     for (int h = 0; h < 16; ++h) {
       for (int w = 0; w < 16; ++w) {
-        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+        ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
       }
     }
 
@@ -103,12 +109,12 @@ TEST_P(VP8DenoiserTest, BitexactCheck) {
     // Check bitexactness.
     for (int h = 0; h < 16; ++h) {
       for (int w = 0; w < 16; ++w) {
-        EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
+        ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]);
       }
     }
   }
 }
 
 // Test for all block size.
-INSTANTIATE_TEST_CASE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1));
+INSTANTIATE_TEST_SUITE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc b/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc
index da4f0caa1e..3a6514a3e9 100644
--- a/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc
+++ b/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc
@@ -15,14 +15,18 @@
 #include <string.h>
 #include <sys/types.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
+#include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 namespace {
 
+using FdctFunc = void (*)(int16_t *a, int16_t *b, int a_stride);
+
 const int cospi8sqrt2minus1 = 20091;
 const int sinpi8sqrt2 = 35468;
 
@@ -68,10 +72,21 @@ void reference_idct4x4(const int16_t *input, int16_t *output) {
 
 using libvpx_test::ACMRandom;
 
-TEST(VP8FdctTest, SignBiasCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
+class FdctTest : public ::testing::TestWithParam<FdctFunc> {
+ public:
+  void SetUp() override {
+    fdct_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  FdctFunc fdct_func_;
+  ACMRandom rnd_;
+};
+
+TEST_P(FdctTest, SignBiasCheck) {
   int16_t test_input_block[16];
-  int16_t test_output_block[16];
+  DECLARE_ALIGNED(16, int16_t, test_output_block[16]);
   const int pitch = 8;
   int count_sign_block[16][2];
   const int count_test_block = 1000000;
@@ -81,10 +96,10 @@ TEST(VP8FdctTest, SignBiasCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j) {
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
     }
 
-    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+    fdct_func_(test_input_block, test_output_block, pitch);
 
     for (int j = 0; j < 16; ++j) {
       if (test_output_block[j] < 0) {
@@ -110,10 +125,10 @@ TEST(VP8FdctTest, SignBiasCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     // Initialize a test block with input range [-15, 15].
     for (int j = 0; j < 16; ++j) {
-      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+      test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4);
     }
 
-    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+    fdct_func_(test_input_block, test_output_block, pitch);
 
     for (int j = 0; j < 16; ++j) {
       if (test_output_block[j] < 0) {
@@ -133,25 +148,24 @@ TEST(VP8FdctTest, SignBiasCheck) {
 
   EXPECT_EQ(true, bias_acceptable)
       << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
-};
+}
 
-TEST(VP8FdctTest, RoundTripErrorCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
+TEST_P(FdctTest, RoundTripErrorCheck) {
   int max_error = 0;
   double total_error = 0;
   const int count_test_block = 1000000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
-    int16_t test_temp_block[16];
     int16_t test_output_block[16];
+    DECLARE_ALIGNED(16, int16_t, test_temp_block[16]);
 
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j) {
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = rnd_.Rand8() - rnd_.Rand8();
     }
 
     const int pitch = 8;
-    vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch);
+    fdct_func_(test_input_block, test_temp_block, pitch);
     reference_idct4x4(test_temp_block, test_output_block);
 
     for (int j = 0; j < 16; ++j) {
@@ -167,6 +181,31 @@ TEST(VP8FdctTest, RoundTripErrorCheck) {
 
   EXPECT_GE(count_test_block, total_error)
       << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
-};
+}
 
+INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_neon));
+#endif  // HAVE_NEON
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_MSA
+INSTANTIATE_TEST_SUITE_P(MSA, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_msa));
+#endif  // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_SUITE_P(MMI, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_lsx));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp8_fragments_test.cc b/media/libvpx/libvpx/test/vp8_fragments_test.cc
index ac967d1b7e..1f13cde812 100644
--- a/media/libvpx/libvpx/test/vp8_fragments_test.cc
+++ b/media/libvpx/libvpx/test/vp8_fragments_test.cc
@@ -7,19 +7,19 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/video_source.h"
 
 namespace {
 
-class VP8FramgmentsTest : public ::libvpx_test::EncoderTest,
-                          public ::testing::Test {
+class VP8FragmentsTest : public ::libvpx_test::EncoderTest,
+                         public ::testing::Test {
  protected:
-  VP8FramgmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
-  virtual ~VP8FramgmentsTest() {}
+  VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
+  ~VP8FragmentsTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     const unsigned long init_flags =  // NOLINT(runtime/int)
         VPX_CODEC_USE_OUTPUT_PARTITION;
     InitializeConfig();
@@ -28,7 +28,7 @@ class VP8FramgmentsTest : public ::libvpx_test::EncoderTest,
   }
 };
 
-TEST_F(VP8FramgmentsTest, TestFragmentsEncodeDecode) {
+TEST_F(VP8FragmentsTest, TestFragmentsEncodeDecode) {
   ::libvpx_test::RandomVideoSource video;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
diff --git a/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh b/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh
index a8b7fe78ee..1e96f94cc7 100644
--- a/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh
+++ b/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh
@@ -22,7 +22,7 @@ vp8_multi_resolution_encoder_verify_environment() {
       elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
       return 1
     fi
-    local readonly app="vp8_multi_resolution_encoder"
+    local app="vp8_multi_resolution_encoder"
     if [ -z "$(vpx_tool_path "${app}")" ]; then
       elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
       return 1
@@ -33,7 +33,7 @@ vp8_multi_resolution_encoder_verify_environment() {
 # Runs vp8_multi_resolution_encoder. Simply forwards all arguments to
 # vp8_multi_resolution_encoder after building path to the executable.
 vp8_mre() {
-  local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+  local encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
   if [ ! -x "${encoder}" ]; then
     elog "${encoder} does not exist or is not executable."
     return 1
@@ -43,23 +43,35 @@ vp8_mre() {
 }
 
 vp8_multi_resolution_encoder_three_formats() {
-  local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
-                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
-                               ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+  local output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+                      ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+                      ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+  local layer_bitrates="150 80 50"
+  local keyframe_insert="200"
+  local temporal_layers="3 3 3"
+  local framerate="30"
 
   if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
     if [ "$(vp8_encode_available)" = "yes" ]; then
       # Param order:
       #  Input width
       #  Input height
+      #  Framerate
       #  Input file path
       #  Output file names
+      #  Layer bitrates
+      #  Temporal layers
+      #  Keyframe insert
       #  Output PSNR
       vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
         "${YUV_RAW_INPUT_HEIGHT}" \
+        "${framerate}" \
         "${YUV_RAW_INPUT}" \
         ${output_files} \
-        0
+        ${layer_bitrates} \
+        ${temporal_layers} \
+        "${keyframe_insert}" \
+        0 || return 1
 
       for output_file in ${output_files}; do
         if [ ! -e "${output_file}" ]; then
diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
new file mode 100644
index 0000000000..74a8cde7db
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -0,0 +1,424 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+struct Vp8RCTestVideo {
+  Vp8RCTestVideo() = default;
+  Vp8RCTestVideo(const char *name_, int width_, int height_,
+                 unsigned int frames_)
+      : name(name_), width(width_), height(height_), frames(frames_) {}
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const Vp8RCTestVideo &video) {
+    os << video.name << " " << video.width << " " << video.height << " "
+       << video.frames;
+    return os;
+  }
+  const char *name;
+  int width;
+  int height;
+  unsigned int frames;
+};
+
+const Vp8RCTestVideo kVp8RCTestVectors[] = {
+  Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470),
+  Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300),
+  Vp8RCTestVideo("hantro_collage_w352h288.yuv", 352, 288, 100),
+};
+
+class Vp8RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, Vp8RCTestVideo> {
+ public:
+  Vp8RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false),
+        frame_drop_thresh_(0) {}
+  ~Vp8RcInterfaceTest() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  // From error_resilience_test.cc
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G,  update G.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARG.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (rc_cfg_.ts_number_layers > 1) {
+      const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+      const int frame_flags =
+          SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_params_.temporal_layer_id = layer_id;
+      if (video->frame() > 0) {
+        encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+      }
+    } else {
+      if (video->frame() == 0) {
+        encoder->Control(VP8E_SET_CPUUSED, -6);
+        encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
+        encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+        if (rc_cfg_.is_screen) {
+          encoder->Control(VP8E_SET_SCREEN_CONTENT_MODE, 1);
+        }
+      } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
+        // Disable golden frame update.
+        frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+        frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+      }
+    }
+    frame_params_.frame_type = video->frame() % key_interval_ == 0
+                                   ? libvpx::RcFrameType::kKeyFrame
+                                   : libvpx::RcFrameType::kInterFrame;
+    encoder_exit_ = video->frame() == test_video_.frames;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    int qp;
+    libvpx::UVDeltaQP uv_delta_qp;
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
+      ASSERT_EQ(rc_api_->GetQP(), qp);
+      uv_delta_qp = rc_api_->GetUVDeltaQP();
+      // delta_qp for UV channel is only set for screen.
+      if (!rc_cfg_.is_screen) {
+        ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0);
+        ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0);
+      }
+    } else {
+      num_drops_++;
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+  }
+
+  void RunOneLayer() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerScreen() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfig();
+    rc_cfg_.is_screen = true;
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerDropFrames() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    frame_drop_thresh_ = 30;
+    num_drops_ = 0;
+    // Use lower target_bitrate and max_quantizer to trigger drops.
+    target_bitrate_ = target_bitrate_ >> 2;
+    SetConfig();
+    rc_cfg_.max_quantizer = 56;
+    cfg_.rc_max_quantizer = 56;
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+  void RunPeriodicKey() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    key_interval_ = 100;
+    frame_drop_thresh_ = 30;
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers2TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfigTemporalLayers(2);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    SetConfigTemporalLayers(3);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TLDropFrames() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    frame_drop_thresh_ = 30;
+    num_drops_ = 0;
+    // Use lower target_bitrate and max_quantizer to trigger drops.
+    target_bitrate_ = target_bitrate_ >> 2;
+    SetConfigTemporalLayers(3);
+    rc_cfg_.max_quantizer = 56;
+    cfg_.rc_max_quantizer = 56;
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+ private:
+  void SetConfig() {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.layer_target_bitrate[0] = target_bitrate_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+  }
+
+  void SetConfigTemporalLayers(int temporal_layers) {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+    if (temporal_layers == 2) {
+      rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (temporal_layers == 3) {
+      rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[2] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    }
+
+    rc_cfg_.ts_number_layers = temporal_layers;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+    // 2 Temporal layers, no spatial layers, CBR mode.
+    cfg_.ss_number_layers = 1;
+    cfg_.ts_number_layers = temporal_layers;
+    if (temporal_layers == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.ts_periodicity = 2;
+      cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+    } else if (temporal_layers == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.ts_periodicity = 4;
+      cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    }
+  }
+
+  std::unique_ptr<libvpx::VP8RateControlRTC> rc_api_;
+  libvpx::VP8RateControlRtcConfig rc_cfg_;
+  int key_interval_;
+  int target_bitrate_;
+  Vp8RCTestVideo test_video_;
+  libvpx::VP8FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int frame_drop_thresh_;
+  int num_drops_;
+};
+
+TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) {
+  RunTemporalLayers3TLDropFrames();
+}
+
+VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest,
+                           ::testing::Values(200, 400, 1000),
+                           ::testing::ValuesIn(kVp8RCTestVectors));
+
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_arf_freq_test.cc b/media/libvpx/libvpx/test/vp9_arf_freq_test.cc
index 48a4ca7392..ea018b6e22 100644
--- a/media/libvpx/libvpx/test/vp9_arf_freq_test.cc
+++ b/media/libvpx/libvpx/test/vp9_arf_freq_test.cc
@@ -8,7 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <memory>
+
+#include "gtest/gtest.h"
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -16,6 +18,7 @@
 #include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
 #include "vp9/encoder/vp9_ratectrl.h"
+#include "vpx_config.h"
 
 namespace {
 
@@ -25,7 +28,7 @@ const int kBitrate = 500;
 #define ARF_NOT_SEEN 1000001
 #define ARF_SEEN_ONCE 1000000
 
-typedef struct {
+struct TestVideoParam {
   const char *filename;
   unsigned int width;
   unsigned int height;
@@ -35,12 +38,12 @@ typedef struct {
   vpx_img_fmt fmt;
   vpx_bit_depth_t bit_depth;
   unsigned int profile;
-} TestVideoParam;
+};
 
-typedef struct {
+struct TestEncodeParam {
   libvpx_test::TestMode mode;
   int cpu_used;
-} TestEncodeParam;
+};
 
 const TestVideoParam kTestVectors[] = {
   // artificially increase framerate to trigger default check
@@ -84,9 +87,9 @@ class ArfFreqTest
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
         test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
 
-  virtual ~ArfFreqTest() {}
+  ~ArfFreqTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(test_encode_param_.mode);
     if (test_encode_param_.mode != ::libvpx_test::kRealTime) {
@@ -102,7 +105,7 @@ class ArfFreqTest
     dec_cfg_.threads = 4;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     min_run_ = ARF_NOT_SEEN;
     run_of_visible_frames_ = 0;
   }
@@ -124,7 +127,7 @@ class ArfFreqTest
     return frames;
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return;
     const int frames = GetNumFramesInPkt(pkt);
     if (frames == 1) {
@@ -143,8 +146,8 @@ class ArfFreqTest
     }
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
@@ -190,7 +193,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) {
   init_flags_ = VPX_CODEC_USE_PSNR;
   if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  std::unique_ptr<libvpx_test::VideoSource> video;
   if (is_extension_y4m(test_video_param_.filename)) {
     video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 kFrames));
@@ -211,7 +214,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) {
   }
 }
 
-VP9_INSTANTIATE_TEST_CASE(ArfFreqTest, ::testing::ValuesIn(kTestVectors),
-                          ::testing::ValuesIn(kEncodeVectors),
-                          ::testing::ValuesIn(kMinArfVectors));
+VP9_INSTANTIATE_TEST_SUITE(ArfFreqTest, ::testing::ValuesIn(kTestVectors),
+                           ::testing::ValuesIn(kEncodeVectors),
+                           ::testing::ValuesIn(kMinArfVectors));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_error_block_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc
similarity index 58%
rename from media/libvpx/libvpx/test/vp9_error_block_test.cc
rename to media/libvpx/libvpx/test/vp9_block_error_test.cc
index 74436c09e7..5cf8280c68 100644
--- a/media/libvpx/libvpx/test/vp9_error_block_test.cc
+++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc
@@ -11,8 +11,9 @@
 #include <cmath>
 #include <cstdlib>
 #include <string>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
@@ -23,51 +24,52 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int kNumIterations = 1000;
 
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
-                                  const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+using HBDBlockErrorFunc = int64_t (*)(const tran_low_t *coeff,
+                                      const tran_low_t *dqcoeff,
+                                      intptr_t block_size, int64_t *ssz,
+                                      int bps);
 
-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
-    ErrorBlockParam;
+using BlockErrorParam =
+    std::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>;
 
-// wrapper for 8-bit block error functions without a 'bps' param.
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff,
-                                        const tran_low_t *dqcoeff,
-                                        intptr_t block_size, int64_t *ssz);
-template <HighBdBlockError8bit fn>
-int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff,
-                                    const tran_low_t *dqcoeff,
-                                    intptr_t block_size, int64_t *ssz,
-                                    int bps) {
-  EXPECT_EQ(8, bps);
+using BlockErrorFunc = int64_t (*)(const tran_low_t *coeff,
+                                   const tran_low_t *dqcoeff,
+                                   intptr_t block_size, int64_t *ssz);
+
+template <BlockErrorFunc fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
   return fn(coeff, dqcoeff, block_size, ssz);
 }
 
-class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
  public:
-  virtual ~ErrorBlockTest() {}
-  virtual void SetUp() {
+  ~BlockErrorTest() override = default;
+  void SetUp() override {
     error_block_op_ = GET_PARAM(0);
     ref_error_block_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   vpx_bit_depth_t bit_depth_;
-  ErrorBlockFunc error_block_op_;
-  ErrorBlockFunc ref_error_block_op_;
+  HBDBlockErrorFunc error_block_op_;
+  HBDBlockErrorFunc ref_error_block_op_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlockErrorTest);
 
-TEST_P(ErrorBlockTest, OperationCheck) {
+TEST_P(BlockErrorTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@@ -110,7 +112,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
       << "First failed at test case " << first_failure;
 }
 
-TEST_P(ErrorBlockTest, ExtremeValues) {
+TEST_P(BlockErrorTest, ExtremeValues) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
@@ -168,32 +170,58 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
       << "First failed at test case " << first_failure;
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, ErrorBlockTest,
-    ::testing::Values(
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_10),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_12),
-        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
-                   VPX_BITS_8),
-        make_tuple(
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_sse2>,
-            &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-            VPX_BITS_8)));
+const BlockErrorParam sse2_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_sse2>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, BlockErrorTest,
+                         ::testing::ValuesIn(sse2_block_error_tests));
 #endif  // HAVE_SSE2
 
-#if HAVE_AVX
-INSTANTIATE_TEST_CASE_P(
-    AVX, ErrorBlockTest,
-    ::testing::Values(make_tuple(
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_avx>,
-        &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
-        VPX_BITS_8)));
-#endif  // HAVE_AVX
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, BlockErrorTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_avx2>,
+                                 &BlockError8BitWrapper<vp9_block_error_c>,
+                                 VPX_BITS_8)));
+#endif  // HAVE_AVX2
 
+#if HAVE_NEON
+const BlockErrorParam neon_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_10),
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_12),
+  make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c,
+             VPX_BITS_8),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>,
+             &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest,
+                         ::testing::ValuesIn(neon_block_error_tests));
+#endif  // HAVE_NEON
+
+#if HAVE_SVE
+const BlockErrorParam sve_block_error_tests[] = { make_tuple(
+    &BlockError8BitWrapper<vp9_block_error_sve>,
+    &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) };
+
+INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest,
+                         ::testing::ValuesIn(sve_block_error_tests));
+#endif  // HAVE_SVE
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_boolcoder_test.cc b/media/libvpx/libvpx/test/vp9_boolcoder_test.cc
index 5dbfd5ca59..fbbdb19465 100644
--- a/media/libvpx/libvpx/test/vp9_boolcoder_test.cc
+++ b/media/libvpx/libvpx/test/vp9_boolcoder_test.cc
@@ -12,7 +12,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
@@ -53,7 +53,7 @@ TEST(VP9, TestBitIO) {
         ACMRandom bit_rnd(random_seed);
         vpx_writer bw;
         uint8_t bw_buffer[kBufferSize];
-        vpx_start_encode(&bw, bw_buffer);
+        vpx_start_encode(&bw, bw_buffer, sizeof(bw_buffer));
 
         int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
         for (int i = 0; i < kBitsToTest; ++i) {
@@ -65,13 +65,16 @@ TEST(VP9, TestBitIO) {
           vpx_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
-        vpx_stop_encode(&bw);
+        GTEST_ASSERT_EQ(vpx_stop_encode(&bw), 0);
+        // vpx_reader_fill() may read into uninitialized data that
+        // isn't used meaningfully, but may trigger an MSan warning.
+        memset(bw_buffer + bw.pos, 0, sizeof(BD_VALUE) - 1);
 
         // First bit should be zero
         GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
 
         vpx_reader br;
-        vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+        vpx_reader_init(&br, bw_buffer, kBufferSize, nullptr, nullptr);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < kBitsToTest; ++i) {
           if (bit_method == 2) {
@@ -87,3 +90,24 @@ TEST(VP9, TestBitIO) {
     }
   }
 }
+
+TEST(VP9, TestBitIOBufferSize0) {
+  vpx_writer bw;
+  uint8_t bw_buffer[1];
+  vpx_start_encode(&bw, bw_buffer, 0);
+  GTEST_ASSERT_EQ(vpx_stop_encode(&bw), -1);
+}
+
+TEST(VP9, TestBitIOBufferSize1) {
+  vpx_writer bw;
+  uint8_t bw_buffer[1];
+  vpx_start_encode(&bw, bw_buffer, sizeof(bw_buffer));
+  GTEST_ASSERT_EQ(vpx_stop_encode(&bw), -1);
+}
+
+TEST(VP9, TestBitIOBufferSize2) {
+  vpx_writer bw;
+  uint8_t bw_buffer[2];
+  vpx_start_encode(&bw, bw_buffer, sizeof(bw_buffer));
+  GTEST_ASSERT_EQ(vpx_stop_encode(&bw), 0);
+}
diff --git a/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh b/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh
new file mode 100755
index 0000000000..03843610dc
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh
@@ -0,0 +1,420 @@
+#!/bin/sh
+##
+##  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This script checks the bit exactness between C and SIMD
+##  implementations of VP9 encoder.
+##
+. $(dirname $0)/tools_common.sh
+
+TEST_BITRATES="1600 6400"
+PRESETS="good rt"
+TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input"
+OUT_FILE_SUFFIX=".ivf"
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd)
+
+# Clips used in test.
+YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv"
+Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m"
+Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
+# Number of frames to test.
+VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20
+
+# Create a temporary directory for output files.
+if [ -n "${TMPDIR}" ]; then
+  VPX_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+  VPX_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+  VPX_TEST_TEMP_ROOT=/tmp
+fi
+
+VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$"
+
+if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \
+   [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then
+  echo "${0##*/}: Cannot create output directory, giving up."
+  echo "${0##*/}:   VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}"
+  exit 1
+fi
+
+elog() {
+  echo "$@" 1>&2
+}
+
+# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR},
+# or an empty string. Caller is responsible for testing the string once the
+# function returns.
+vp9_enc_tool_path() {
+  local target="$1"
+  local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc"
+
+  if [ ! -x "${tool_path}" ]; then
+    tool_path=""
+  fi
+  echo "${tool_path}"
+}
+
+# Environment check: Make sure input and source directories are available.
+vp9_c_vs_simd_enc_verify_environment() {
+  if [ ! -e "${YUV_RAW_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_720P_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then
+    elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then
+    elog "LIBVPX_SOURCE_DIR does not exist."
+    return 1
+  fi
+}
+
+# This is not needed since tools_common.sh does the same cleanup.
+# Keep the code here for our reference.
+# cleanup() {
+#   rm -rf  ${VPX_TEST_OUTPUT_DIR}
+# }
+
+# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture.
+avx512f() {
+   echo "0x1FF"
+}
+
+avx2() {
+   echo "0x0FF"
+}
+
+sse4_1() {
+   echo "0x03F"
+}
+
+ssse3() {
+   echo "0x01F"
+}
+
+sse2() {
+   echo "0x007"
+}
+
+# Echo clip details to be used as input to vpxenc.
+yuv_raw_input() {
+  echo ""${YUV_RAW_INPUT}"
+       --width=352
+       --height=288
+       --bit-depth=8
+       --profile=0"
+}
+
+yuv_480p_raw_input() {
+  echo ""${YUV_480P_RAW_INPUT}"
+       --width=640
+       --height=480
+       --bit-depth=8
+       --profile=0"
+}
+
+y4m_720p_input() {
+  echo ""${Y4M_720P_INPUT}"
+       --bit-depth=8
+       --profile=0"
+}
+
+y4m_360p_10bit_input() {
+  echo ""${Y4M_360P_10BIT_INPUT}"
+       --bit-depth=10
+       --profile=2"
+}
+
+has_x86_isa_extn() {
+  instruction_set=$1
+  if ! grep -q "$instruction_set" /proc/cpuinfo; then
+    # This instruction_set is not supported.
+    return 1
+  fi
+  # This instruction_set is supported.
+  return 0
+}
+
+# Echo good encode params for use with VP9 encoder.
+vp9_encode_good_params() {
+  echo "--codec=vp9 \
+  --good \
+  --test-decode=fatal \
+  --ivf \
+  --threads=1 \
+  --static-thresh=0 \
+  --tile-columns=0 \
+  --end-usage=vbr \
+  --kf-max-dist=160 \
+  --kf-min-dist=0 \
+  --lag-in-frames=19 \
+  --max-q=63 \
+  --min-q=0 \
+  --passes=2 \
+  --undershoot-pct=100 \
+  --overshoot-pct=100 \
+  --verbose \
+  --auto-alt-ref=1 \
+  --drop-frame=0 \
+  --bias-pct=50 \
+  --minsection-pct=0 \
+  --maxsection-pct=2000 \
+  --arnr-maxframes=7 \
+  --arnr-strength=5 \
+  --sharpness=0 \
+  --frame-parallel=0"
+}
+
+# Echo realtime encode params for use with VP9 encoder.
+vp9_encode_rt_params() {
+  echo "--codec=vp9 \
+  --rt \
+  --test-decode=fatal \
+  --ivf \
+  --threads=1 \
+  --static-thresh=0 \
+  --tile-columns=0 \
+  --tile-rows=0 \
+  --end-usage=cbr \
+  --kf-max-dist=90000 \
+  --lag-in-frames=0 \
+  --max-q=58 \
+  --min-q=2 \
+  --passes=1 \
+  --undershoot-pct=50 \
+  --overshoot-pct=50 \
+  --verbose \
+  --row-mt=0 \
+  --buf-sz=1000 \
+  --buf-initial-sz=500 \
+  --buf-optimal-sz=600 \
+  --max-intra-rate=300 \
+  --resize-allowed=0 \
+  --noise-sensitivity=0 \
+  --aq-mode=3 \
+  --error-resilient=0"
+}
+
+# Configures for the given target in the
+# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory.
+vp9_enc_build() {
+  local target=$1
+  local configure="$2"
+  local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target}
+  mkdir -p "$tmp_build_dir"
+  local save_dir="$PWD"
+  cd "$tmp_build_dir"
+
+  echo "Building target: ${target}"
+  local config_args="--disable-install-docs \
+             --enable-unit-tests \
+             --enable-debug \
+             --enable-postproc \
+             --enable-vp9-postproc \
+             --enable-vp9-temporal-denoising \
+             --enable-vp9-highbitdepth"
+
+  eval "$configure" --target="${target}" "${config_args}" ${devnull}
+  eval make -j$(nproc) ${devnull}
+  echo "Done building target: ${target}"
+  cd "${save_dir}"
+}
+
+compare_enc_output() {
+  local target=$1
+  local cpu=$2
+  local clip=$3
+  local bitrate=$4
+  local preset=$5
+  if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+       ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then
+    elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset"
+    return 1
+  fi
+}
+
+vp9_enc_test() {
+  local encoder="$1"
+  local target=$2
+  if [ -z "$(vp9_enc_tool_path "${target}")" ]; then
+    elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path"
+    return 1
+  fi
+
+  local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target}
+  local save_dir="$PWD"
+  cd "$tmp_build_dir"
+  for preset in ${PRESETS}; do
+    if [ "${preset}" = "good" ]; then
+      local max_cpu_used=5
+      local test_params=vp9_encode_good_params
+    elif [ "${preset}" = "rt" ]; then
+      local max_cpu_used=9
+      local test_params=vp9_encode_rt_params
+    else
+      elog "Invalid preset"
+      cd "${save_dir}"
+      return 1
+    fi
+
+    # Enable armv8 test for real-time only
+    if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then
+      continue
+    fi
+
+    for cpu in $(seq 0 $max_cpu_used); do
+      for clip in ${TEST_CLIPS}; do
+        for bitrate in ${TEST_BITRATES}; do
+          eval "${encoder}" $($clip) $($test_params) \
+          "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \
+          "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \
+          ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \
+          ${devnull}
+
+          if [ "${target}" != "generic-gnu" ]; then
+            if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then
+              # Find the mismatch
+              cd "${save_dir}"
+              return 1
+            fi
+          fi
+        done
+      done
+    done
+  done
+  cd "${save_dir}"
+}
+
+vp9_test_generic() {
+  local configure="$LIBVPX_SOURCE_DIR/configure"
+  local target="generic-gnu"
+
+  echo "Build for: ${target}"
+  vp9_enc_build ${target} ${configure}
+  local encoder="$(vp9_enc_tool_path "${target}")"
+  vp9_enc_test $encoder "${target}"
+}
+
+# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are
+# no functions with MMX, SSE, SSE3 and AVX specialization.
+# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction
+# set extension optimizations. The value of the flag 'VPX_SIMD_CAPS' and the corresponding
+# instruction set extension optimization enabled are as follows:
+# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX
+#   1     1    1    1      1    1    1    1   1  -> 0x1FF -> Enable AVX512 and lower variants
+#   0     1    1    1      1    1    1    1   1  -> 0x0FF -> Enable AVX2 and lower variants
+#   0     0    1    1      1    1    1    1   1  -> 0x07F -> Enable AVX and lower variants
+#   0     0    0    1      1    1    1    1   1  -> 0x03F  -> Enable SSE4_1 and lower variants
+#   0     0    0    0      1    1    1    1   1  -> 0x01F  -> Enable SSSE3 and lower variants
+#   0     0    0    0      0    1    1    1   1  -> 0x00F  -> Enable SSE3 and lower variants
+#   0     0    0    0      0    0    1    1   1  -> 0x007  -> Enable SSE2 and lower variants
+#   0     0    0    0      0    0    0    1   1  -> 0x003  -> Enable SSE and lower variants
+#   0     0    0    0      0    0    0    0   1  -> 0x001  -> Enable MMX
+## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as
+#  all x86_64 platforms implement sse2.
+vp9_test_x86() {
+  local arch=$1
+
+  if ! uname -m | grep -q "x86"; then
+    elog "Machine architecture is not x86 or x86_64"
+    return 0
+  fi
+
+  if [ $arch = "x86" ]; then
+    local target="x86-linux-gcc"
+  elif [ $arch = "x86_64" ]; then
+    local target="x86_64-linux-gcc"
+  fi
+
+  local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2"
+  local configure="$LIBVPX_SOURCE_DIR/configure"
+
+  echo "Build for x86: ${target}"
+  vp9_enc_build ${target} ${configure}
+  local encoder="$(vp9_enc_tool_path "${target}")"
+  for isa in $x86_isa_variants; do
+    # Note that if has_x86_isa_extn returns 1, it is false, and vice versa.
+    if ! has_x86_isa_extn $isa; then
+      echo "${isa} is not supported in this machine"
+      continue
+    fi
+    export VPX_SIMD_CAPS_MASK=$($isa)
+    if ! vp9_enc_test $encoder ${target}; then
+      # Find the mismatch
+      return 1
+    fi
+    unset VPX_SIMD_CAPS_MASK
+  done
+}
+
+vp9_test_arm() {
+  local target="armv8-linux-gcc"
+  local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \
+          --extra-cxxflags=-march=armv8.4-a"
+  echo "Build for arm64: ${target}"
+  vp9_enc_build ${target} "${configure}"
+
+  local encoder="$(vp9_enc_tool_path "${target}")"
+  if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then
+    # Find the mismatch
+    return 1
+  fi
+}
+
+vp9_c_vs_simd_enc_test() {
+  # Test Generic
+  vp9_test_generic
+
+  # Test x86 (32 bit)
+  echo "vp9 test for x86 (32 bit): Started."
+  if ! vp9_test_x86 "x86"; then
+    echo "vp9 test for x86 (32 bit): Done, test failed."
+    return 1
+  else
+    echo "vp9 test for x86 (32 bit): Done, all tests passed."
+  fi
+
+  # Test x86_64 (64 bit)
+  if [ "$(eval uname -m)" = "x86_64" ]; then
+    echo "vp9 test for x86_64 (64 bit): Started."
+    if ! vp9_test_x86 "x86_64"; then
+      echo "vp9 test for x86_64 (64 bit): Done, test failed."
+      return 1
+    else
+      echo "vp9 test for x86_64 (64 bit): Done, all tests passed."
+    fi
+  fi
+
+  # Test ARM
+  echo "vp9_test_arm: Started."
+  if ! vp9_test_arm; then
+    echo "vp9 test for arm: Done, test failed."
+    return 1
+  else
+    echo "vp9 test for arm: Done, all tests passed."
+  fi
+}
+
+# Setup a trap function to clean up build, and output files after tests complete.
+# trap cleanup EXIT
+
+run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test
diff --git a/media/libvpx/libvpx/test/vp9_datarate_test.cc b/media/libvpx/libvpx/test/vp9_datarate_test.cc
new file mode 100644
index 0000000000..2c90d751e7
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_datarate_test.cc
@@ -0,0 +1,1151 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
+ public:
+  explicit DatarateTestVP9(const ::libvpx_test::CodecFactory *codec)
+      : EncoderTest(codec) {
+    tune_content_ = 0;
+  }
+
+ protected:
+  ~DatarateTestVP9() override = default;
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    aq_mode_ = 3;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    // For testing up to 3 layers.
+    for (int i = 0; i < 3; ++i) {
+      bits_total_[i] = 0;
+    }
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+    frame_parallel_decoding_mode_ = 1;
+    delta_q_uv_ = 0;
+    use_roi_ = false;
+  }
+
+  //
+  // Frame flags and layer id for temporal layers.
+  //
+
+  // For two layers, test pattern is:
+  //   1     3
+  // 0    2     .....
+  // For three layers, test pattern is:
+  //   1      3    5      7
+  //      2           6
+  // 0          4            ....
+  // LAST is always update on base/layer 0, GOLDEN is updated on layer 1.
+  // For this 3 layer example, the 2nd enhancement layer (layer 2) updates
+  // the altref frame.
+  static int GetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L and ARF; update L.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L, G, ARF; update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
+  static int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads));
+    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+                     frame_parallel_decoding_mode_);
+
+    if (use_roi_) {
+      encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+      encoder->Control(VP9E_SET_AQ_MODE, 0);
+    }
+
+    if (delta_q_uv_ != 0) {
+      encoder->Control(VP9E_SET_DELTA_Q_UV, delta_q_uv_);
+    }
+
+    if (cfg_.ts_number_layers > 1) {
+      if (video->frame() == 0) {
+        encoder->Control(VP9E_SET_SVC, 1);
+      }
+      if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+        vpx_svc_layer_id_t layer_id;
+        frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+        layer_id.spatial_layer_id = 0;
+        layer_id.temporal_layer_id =
+            SetLayerId(video->frame(), cfg_.ts_number_layers);
+        layer_id.temporal_layer_id_per_spatial[0] =
+            SetLayerId(video->frame(), cfg_.ts_number_layers);
+        encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      }
+    }
+    const vpx_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // Time since last timestamp = duration.
+    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    if (duration > 1) {
+      // If first drop not set and we have a drop set it to this time.
+      if (!first_drop_) first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // Buffer should not go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Update the total encoded bits. For temporal layers, update the cumulative
+    // encoded bits per layer.
+    for (int i = layer; i < static_cast<int>(cfg_.ts_number_layers); ++i) {
+      bits_total_[i] += frame_size_in_bits;
+    }
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  void EndPassHook() override {
+    for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
+         ++layer) {
+      duration_ = (last_pts_ + 1) * timebase_;
+      if (bits_total_[layer]) {
+        // Effective file datarate:
+        effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_;
+      }
+    }
+  }
+
+  vpx_codec_pts_t last_pts_;
+  double timebase_;
+  int tune_content_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+  int64_t bits_total_[3];
+  double duration_;
+  double effective_datarate_[3];
+  int set_cpu_used_;
+  int64_t bits_in_buffer_model_;
+  vpx_codec_pts_t first_drop_;
+  int num_drops_;
+  int aq_mode_;
+  int denoiser_on_;
+  int denoiser_offon_test_;
+  int denoiser_offon_period_;
+  int frame_parallel_decoding_mode_;
+  int delta_q_uv_;
+  bool use_roi_;
+  vpx_roi_map_t roi_;
+};
+
+// Params: test mode, speed setting and index for bitrate array.
+class DatarateTestVP9RealTimeMultiBR
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Params: speed setting and index for bitrate array.
+class DatarateTestVP9LargeVBR
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for VBR mode with 0 lag.
+TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagZero) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag.
+TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZero) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  // For non-zero lag, rate control will work (be within bounds) for
+  // real-time mode.
+  if (deadline_ == VPX_DL_REALTIME) {
+    cfg_.g_lag_in_frames = 15;
+  } else {
+    cfg_.g_lag_in_frames = 0;
+  }
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for VBR mode with non-zero lag, with
+// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs
+// since error_resilience is off.
+TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZeroFrameParDecOff) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  // For non-zero lag, rate control will work (be within bounds) for
+  // real-time mode.
+  if (deadline_ == VPX_DL_REALTIME) {
+    cfg_.g_lag_in_frames = 15;
+  } else {
+    cfg_.g_lag_in_frames = 0;
+  }
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  const int bitrates[2] = { 400, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  frame_parallel_decoding_mode_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 150, 350, 550, 750 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode
+// off( and error_resilience off).
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargetingFrameParDecOff) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 150, 350, 550, 750 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  frame_parallel_decoding_mode_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting444) {
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  const int bitrates[4] = { 250, 450, 650, 850 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+            effective_datarate_[0] * 0.80)
+      << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+            effective_datarate_[0] * 1.15)
+      << " The datarate for the file missed the target!"
+      << cfg_.rc_target_bitrate << " " << effective_datarate_;
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestVP9RealTimeMultiBR, ChangingDropFrameThresh) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+  // interval (128).
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  const int kDropFrameThreshTestStep = 30;
+  const int bitrates[2] = { 50, 150 };
+  const int bitrate_index = GET_PARAM(2);
+  if (bitrate_index > 1) return;
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  vpx_codec_pts_t last_drop = 140;
+  int last_num_drops = 0;
+  for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
+    cfg_.rc_dropframe_thresh = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+    ASSERT_LE(first_drop_, last_drop)
+        << " The first dropped frame for drop_thresh " << i
+        << " > first dropped frame for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    ASSERT_GE(num_drops_, last_num_drops * 0.85)
+        << " The number of dropped frames for drop_thresh " << i
+        << " < number of dropped frames for drop_thresh "
+        << i - kDropFrameThreshTestStep;
+    last_drop = first_drop_;
+    last_num_drops = num_drops_;
+  }
+}  // namespace
+
+// Check basic rate targeting for 2 temporal layers.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting2TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 2;
+  cfg_.ts_rate_decimator[0] = 2;
+  cfg_.ts_rate_decimator[1] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 200, 400, 600, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  // 60-40 bitrate allocation for 2 temporal layers.
+  cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+  }
+}
+
+// Check basic rate targeting for 3 temporal layers.
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting3TemporalLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  const int bitrates[4] = { 200, 400, 600, 800 };
+  const int bitrate_index = GET_PARAM(2);
+  cfg_.rc_target_bitrate = bitrates[bitrate_index];
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    // TODO(yaowu): Work out more stable rc control strategy and
+    //              Adjust the thresholds to be tighter than .75.
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    // TODO(yaowu): Work out more stable rc control strategy and
+    //              Adjust the thresholds to be tighter than 1.25.
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+  }
+}
+
+// Params: speed setting.
+class DatarateTestVP9RealTime : public DatarateTestVP9,
+                                public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {}
+  ~DatarateTestVP9RealTime() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
+TEST_P(DatarateTestVP9RealTime, BasicRateTargetingDropFramesMultiThreads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic rate targeting for 3 temporal layers, with frame dropping.
+// Only for one (low) bitrate with lower max_quantizer, and somewhat higher
+// frame drop threshold, to force frame dropping.
+TEST_P(DatarateTestVP9RealTime,
+       BasicRateTargeting3TemporalLayersFrameDropping) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  // Set frame drop threshold and rc_max_quantizer to force some frame drops.
+  cfg_.rc_dropframe_thresh = 20;
+  cfg_.rc_max_quantizer = 45;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+  cfg_.rc_target_bitrate = 200;
+  ResetModel();
+  // 40-20-40 bitrate allocation for 3 temporal layers.
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  aq_mode_ = 0;
+  if (deadline_ == VPX_DL_REALTIME) {
+    aq_mode_ = 3;
+    cfg_.g_error_resilient = 1;
+  }
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) {
+    ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85)
+        << " The datarate for the file is lower than target by too much, "
+           "for layer: "
+        << j;
+    ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.20)
+        << " The datarate for the file is greater than target by too much, "
+           "for layer: "
+        << j;
+    // Expect some frame drops in this test: for this 200 frames test,
+    // expect at least 10% and not more than 60% drops.
+    ASSERT_GE(num_drops_, 20);
+    ASSERT_LE(num_drops_, 280);
+  }
+}
+
+// Check VP9 region of interest feature.
+TEST_P(DatarateTestVP9RealTime, RegionOfInterest) {
+  if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
+
+  ResetModel();
+
+  // Set ROI parameters
+  use_roi_ = true;
+  memset(&roi_, 0, sizeof(roi_));
+
+  roi_.rows = (cfg_.g_h + 7) / 8;
+  roi_.cols = (cfg_.g_w + 7) / 8;
+
+  roi_.delta_q[1] = -20;
+  roi_.delta_lf[1] = -20;
+  memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+  roi_.ref_frame[1] = 1;
+
+  // Use 2 states: 1 is center square, 0 is the rest.
+  roi_.roi_map = reinterpret_cast<uint8_t *>(
+      calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)));
+  ASSERT_NE(roi_.roi_map, nullptr);
+
+  for (unsigned int i = 0; i < roi_.rows; ++i) {
+    for (unsigned int j = 0; j < roi_.cols; ++j) {
+      if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) &&
+          j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) {
+        roi_.roi_map[i * roi_.cols + j] = 1;
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
+      << " The datarate for the file missed the target!";
+
+  free(roi_.roi_map);
+}
+
+// Params: speed setting, delta q UV.
+class DatarateTestVP9RealTimeDeltaQUV
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ public:
+  DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {}
+  ~DatarateTestVP9RealTimeDeltaQUV() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+TEST_P(DatarateTestVP9RealTimeDeltaQUV, DeltaQUV) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  cfg_.rc_target_bitrate = 450;
+  cfg_.g_w = 640;
+  cfg_.g_h = 480;
+
+  ResetModel();
+
+  delta_q_uv_ = GET_PARAM(2);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90)
+      << " The datarate for the file exceeds the target!";
+
+  ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4)
+      << " The datarate for the file missed the target!";
+}
+
+// Params: test mode, speed setting and index for bitrate array.
+class DatarateTestVP9PostEncodeDrop
+    : public DatarateTestVP9,
+      public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {}
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    set_cpu_used_ = GET_PARAM(1);
+    ResetModel();
+  }
+};
+
+// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
+TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 30;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  // Encode using multiple threads.
+  cfg_.g_threads = 2;
+  cfg_.g_error_resilient = 0;
+  tune_content_ = 1;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 300;
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+using libvpx_test::ACMRandom;
+
+class DatarateTestVP9FrameQp
+    : public DatarateTestVP9,
+      public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
+ public:
+  DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {}
+  ~DatarateTestVP9FrameQp() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    ResetModel();
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    set_cpu_used_ = 7;
+    DatarateTestVP9::PreEncodeFrameHook(video, encoder);
+    frame_qp_ = static_cast<int>(rnd_.RandRange(64));
+    encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_);
+    frame_++;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    int qp = 0;
+    vpx_svc_layer_id_t layer_id;
+    if (frame_ >= total_frame_) return;
+    encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp);
+    ASSERT_EQ(frame_qp_, qp);
+    encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+    temporal_layer_id_ = layer_id.temporal_layer_id;
+  }
+
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {
+    if (frame_ >= total_frame_) return;
+    ASSERT_TRUE(cfg_.temporal_layering_mode ==
+                    VP9E_TEMPORAL_LAYERING_MODE_0212 &&
+                temporal_layer_id_ == 2);
+  }
+
+ protected:
+  int total_frame_;
+
+ private:
+  ACMRandom rnd_;
+  int frame_qp_;
+  int frame_;
+  int temporal_layer_id_;
+};
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  total_frame_ = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, total_frame_);
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  cfg_.rc_target_bitrate = 200;
+  total_frame_ = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, total_frame_);
+  ResetModel();
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+  cfg_.ss_number_layers = 1;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_error_resilient = 1;
+  total_frame_ = 400;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, total_frame_);
+  ResetModel();
+  cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+  cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// Params: speed setting.
+class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime {
+ public:
+  ~DatarateTestVP9RealTimeDenoiser() override = default;
+};
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on.
+TEST_P(DatarateTestVP9RealTimeDenoiser, LowNoise) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 400;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for clip with high noise level. Use 2 threads.
+TEST_P(DatarateTestVP9RealTimeDenoiser, HighNoise) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 2;
+
+  ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: kDenoiserOnYOnly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for 1280x720 clip with 4 threads.
+TEST_P(DatarateTestVP9RealTimeDenoiser, 4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 4;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 1000;
+  ResetModel();
+  // Turn on the denoiser.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29)
+      << " The datarate for the file is greater than target by too much!";
+}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9RealTimeDenoiser, DenoiserOffOn) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // but may add more modes in the future.
+  cfg_.rc_target_bitrate = 400;
+  ResetModel();
+  // The denoiser is off by default.
+  denoiser_on_ = 0;
+  // Set the offon test flag.
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+class DatarateTestVP9Psnr : public DatarateTestVP9,
+                            public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+  DatarateTestVP9Psnr() : DatarateTestVP9(GET_PARAM(0)) {}
+  ~DatarateTestVP9Psnr() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    cfg_.g_lag_in_frames = 0;
+    SetMode(libvpx_test::kRealTime);
+    set_cpu_used_ = 10;
+    ResetModel();
+    frame_flags_ = VPX_EFLAG_CALCULATE_PSNR;
+    expect_psnr_ = true;
+  }
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    DatarateTestVP9::PreEncodeFrameHook(video, encoder);
+    frame_flags_ ^= VPX_EFLAG_CALCULATE_PSNR;
+#if CONFIG_INTERNAL_STATS
+    // CONFIG_INTERNAL_STATS unconditionally generates PSNR.
+    expect_psnr_ = true;
+#else
+    expect_psnr_ = (frame_flags_ & VPX_EFLAG_CALCULATE_PSNR) != 0;
+#endif  // CONFIG_INTERNAL_STATS
+    if (video->img() == nullptr) {
+      expect_psnr_ = false;
+    }
+  }
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    libvpx_test::CxDataIterator iter = encoder->GetCxData();
+
+    bool had_psnr = false;
+    while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+      if (pkt->kind == VPX_CODEC_PSNR_PKT) had_psnr = true;
+    }
+
+    EXPECT_EQ(had_psnr, expect_psnr_);
+  }
+
+ private:
+  bool expect_psnr_;
+};
+
+TEST_P(DatarateTestVP9Psnr, PerFramePsnr) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 100);
+
+  ResetModel();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeMultiBR,
+                           ::testing::Range(5, 10), ::testing::Range(0, 4));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9),
+                           ::testing::Range(0, 2));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10));
+
+#if CONFIG_VP9
+INSTANTIATE_TEST_SUITE_P(
+    VP9, DatarateTestVP9FrameQp,
+    ::testing::Values(
+        static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV,
+                           ::testing::Range(5, 10),
+                           ::testing::Values(-5, -10, -15));
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9PostEncodeDrop,
+                           ::testing::Range(5, 6));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDenoiser,
+                           ::testing::Range(5, 10));
+#endif
+
+VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9Psnr,
+                           ::testing::Values(::libvpx_test::kRealTime));
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_decrypt_test.cc b/media/libvpx/libvpx/test/vp9_decrypt_test.cc
index 1874d23117..558ee70366 100644
--- a/media/libvpx/libvpx/test/vp9_decrypt_test.cc
+++ b/media/libvpx/libvpx/test/vp9_decrypt_test.cc
@@ -12,7 +12,7 @@
 #include <cstdlib>
 #include <string>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/ivf_video_source.h"
 
diff --git a/media/libvpx/libvpx/test/vp9_denoiser_sse2_test.cc b/media/libvpx/libvpx/test/vp9_denoiser_test.cc
similarity index 50%
rename from media/libvpx/libvpx/test/vp9_denoiser_sse2_test.cc
rename to media/libvpx/libvpx/test/vp9_denoiser_test.cc
index 2a50b77355..bc5612ea47 100644
--- a/media/libvpx/libvpx/test/vp9_denoiser_sse2_test.cc
+++ b/media/libvpx/libvpx/test/vp9_denoiser_test.cc
@@ -11,8 +11,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -23,23 +24,35 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_config.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
 const int kNumPixels = 64 * 64;
-class VP9DenoiserTest : public ::testing::TestWithParam<BLOCK_SIZE> {
+
+using Vp9DenoiserFilterFunc = int (*)(const uint8_t *sig, int sig_stride,
+                                      const uint8_t *mc_avg, int mc_avg_stride,
+                                      uint8_t *avg, int avg_stride,
+                                      int increase_denoising, BLOCK_SIZE bs,
+                                      int motion_magnitude);
+using VP9DenoiserTestParam = std::tuple<Vp9DenoiserFilterFunc, BLOCK_SIZE>;
+
+class VP9DenoiserTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<VP9DenoiserTestParam> {
  public:
-  virtual ~VP9DenoiserTest() {}
+  ~VP9DenoiserTest() override = default;
 
-  virtual void SetUp() { bs_ = GetParam(); }
+  void SetUp() override { bs_ = GET_PARAM(1); }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
 
  protected:
   BLOCK_SIZE bs_;
 };
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9DenoiserTest);
 
 TEST_P(VP9DenoiserTest, BitexactCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -76,9 +89,9 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
                                                    64, avg_block_c, 64, 0, bs_,
                                                    motion_magnitude_random));
 
-    ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2(
-        sig_block, 64, mc_avg_block, 64, avg_block_sse2, 64, 0, bs_,
-        motion_magnitude_random));
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 64, mc_avg_block, 64,
+                                          avg_block_sse2, 64, 0, bs_,
+                                          motion_magnitude_random));
 
     // Test bitexactness.
     for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) {
@@ -89,10 +102,36 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
   }
 }
 
+using std::make_tuple;
+
 // Test for all block size.
-INSTANTIATE_TEST_CASE_P(SSE2, VP9DenoiserTest,
-                        ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
-                                          BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
-                                          BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
-                                          BLOCK_64X64));
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, VP9DenoiserTest,
+    ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X8),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X16),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X32),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X16),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X32),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X64),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32),
+                      make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64)));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VP9DenoiserTest,
+    ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32),
+                      make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64)));
+#endif
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index 53dc8c9fe4..00860789c1 100644
--- a/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -8,7 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include <memory>
+
+#include "gtest/gtest.h"
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
@@ -60,9 +62,9 @@ class VpxEncoderParmsGetToDecoder
   VpxEncoderParmsGetToDecoder()
       : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
 
-  virtual ~VpxEncoderParmsGetToDecoder() {}
+  ~VpxEncoderParmsGetToDecoder() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(::libvpx_test::kTwoPassGood);
     cfg_.g_lag_in_frames = 25;
@@ -72,9 +74,9 @@ class VpxEncoderParmsGetToDecoder
     cfg_.rc_target_bitrate = test_video_.bitrate;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
       encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
       encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
@@ -93,15 +95,13 @@ class VpxEncoderParmsGetToDecoder
     }
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder *decoder) {
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder *decoder) override {
     vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
     vpx_codec_alg_priv_t *const priv =
         reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
-    FrameWorkerData *const worker_data =
-        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
-    VP9_COMMON *const common = &worker_data->pbi->common;
+    VP9_COMMON *const common = &priv->pbi->common;
 
     if (encode_parms.lossless) {
       EXPECT_EQ(0, common->base_qindex);
@@ -140,14 +140,14 @@ class VpxEncoderParmsGetToDecoder
 TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
   init_flags_ = VPX_CODEC_USE_PSNR;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video(
+  std::unique_ptr<libvpx_test::VideoSource> video(
       new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
 }
 
-VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder,
-                          ::testing::ValuesIn(kVP9EncodeParameterSet),
-                          ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+VP9_INSTANTIATE_TEST_SUITE(VpxEncoderParmsGetToDecoder,
+                           ::testing::ValuesIn(kVP9EncodeParameterSet),
+                           ::testing::ValuesIn(kVP9EncodePerfTestVectors));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_end_to_end_test.cc b/media/libvpx/libvpx/test/vp9_end_to_end_test.cc
index 955f567ce2..a1ad5c1e4e 100644
--- a/media/libvpx/libvpx/test/vp9_end_to_end_test.cc
+++ b/media/libvpx/libvpx/test/vp9_end_to_end_test.cc
@@ -8,36 +8,40 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "memory"
+
+#include "gtest/gtest.h"
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
+#include "vpx_config.h"
 
 namespace {
 
 const unsigned int kWidth = 160;
 const unsigned int kHeight = 90;
 const unsigned int kFramerate = 50;
-const unsigned int kFrames = 10;
+const unsigned int kFrames = 20;
 const int kBitrate = 500;
 // List of psnr thresholds for speed settings 0-7 and 5 encoding modes
 const double kPsnrThreshold[][5] = {
   { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
   { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
-  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 },
+  { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
 };
 
-typedef struct {
+struct TestVideoParam {
   const char *filename;
   unsigned int input_bit_depth;
   vpx_img_fmt fmt;
   vpx_bit_depth_t bit_depth;
   unsigned int profile;
-} TestVideoParam;
+};
 
 const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0 },
@@ -45,25 +49,35 @@ const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1 },
   { "park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1 },
 #if CONFIG_VP9_HIGHBITDEPTH
-  { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 },
-  { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 },
-  { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 },
+  { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 },
+  { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 },
+  { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 },
   { "park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3 },
-  { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 },
-  { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 },
-  { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 },
+  { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 },
+  { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 },
+  { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 },
   { "park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3 },
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 
+const TestVideoParam kTestVectorsNv12[] = {
+  { "hantro_collage_w352h288_nv12.yuv", 8, VPX_IMG_FMT_NV12, VPX_BITS_8, 0 },
+};
+
+const TestVideoParam k4x2VideoTestVectors[] = {
+  { "4x2.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0 },
+};
+
 // Encoding modes tested
 const libvpx_test::TestMode kEncodingModeVectors[] = {
+#if !CONFIG_REALTIME_ONLY
   ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+#endif
+  ::libvpx_test::kRealTime
 };
 
 // Speed settings tested
-const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6, 7 };
 
 int is_extension_y4m(const char *filename) {
   const char *dot = strrchr(filename, '.');
@@ -74,6 +88,43 @@ int is_extension_y4m(const char *filename) {
   }
 }
 
+class EndToEndTestAdaptiveRDThresh
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ protected:
+  EndToEndTestAdaptiveRDThresh()
+      : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)),
+        cpu_used_end_(GET_PARAM(2)) {}
+
+  ~EndToEndTestAdaptiveRDThresh() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    dec_cfg_.threads = 4;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_);
+      encoder->Control(VP9E_SET_ROW_MT, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+    }
+    if (video->frame() == 100)
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_end_);
+  }
+
+ private:
+  int cpu_used_start_;
+  int cpu_used_end_;
+};
+
 class EndToEndTestLarge
     : public ::libvpx_test::EncoderTest,
       public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode,
@@ -82,11 +133,14 @@ class EndToEndTestLarge
   EndToEndTestLarge()
       : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)),
         cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
-        encoding_mode_(GET_PARAM(1)) {}
+        encoding_mode_(GET_PARAM(1)) {
+    cyclic_refresh_ = 0;
+    denoiser_on_ = 0;
+  }
 
-  virtual ~EndToEndTestLarge() {}
+  ~EndToEndTestLarge() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -102,19 +156,19 @@ class EndToEndTestLarge
     dec_cfg_.threads = 4;
   }
 
-  virtual void BeginPassHook(unsigned int) {
+  void BeginPassHook(unsigned int) override {
     psnr_ = 0.0;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pkt->data.psnr.psnr[0];
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
@@ -123,6 +177,9 @@ class EndToEndTestLarge
         encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
         encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
         encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      } else {
+        encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+        encoder->Control(VP9E_SET_AQ_MODE, cyclic_refresh_);
       }
     }
   }
@@ -138,6 +195,8 @@ class EndToEndTestLarge
 
   TestVideoParam test_video_param_;
   int cpu_used_;
+  int cyclic_refresh_;
+  int denoiser_on_;
 
  private:
   double psnr_;
@@ -145,6 +204,89 @@ class EndToEndTestLarge
   libvpx_test::TestMode encoding_mode_;
 };
 
+#if CONFIG_VP9_DECODER
+// The test parameters control VP9D_SET_LOOP_FILTER_OPT and the number of
+// decoder threads.
+class EndToEndTestLoopFilterThreading
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<bool, int> {
+ protected:
+  EndToEndTestLoopFilterThreading()
+      : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {}
+
+  ~EndToEndTestLoopFilterThreading() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_threads = 2;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.kf_min_dist = 1;
+    cfg_.kf_max_dist = 1;
+    dec_cfg_.threads = GET_PARAM(2);
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 8);
+    }
+    encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5);
+  }
+
+  void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Decoder *decoder) override {
+    if (video->frame() == 0) {
+      decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 1 : 0);
+    }
+  }
+
+ private:
+  const bool use_loop_filter_opt_;
+};
+#endif  // CONFIG_VP9_DECODER
+
+class EndToEndNV12 : public EndToEndTestLarge {};
+
+TEST_P(EndToEndNV12, EndtoEndNV12Test) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+
+  video.reset(new libvpx_test::YUVVideoSource(test_video_param_.filename,
+                                              test_video_param_.fmt, 352, 288,
+                                              30, 1, 0, 100));
+  ASSERT_NE(video.get(), nullptr);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class EndToEnd4x2Video : public EndToEndTestLarge {};
+
+TEST_P(EndToEnd4x2Video, EndtoEnd4x2VideoTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+
+  video.reset(
+      new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, 200));
+  ASSERT_NE(video.get(), nullptr);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
 TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
   cfg_.rc_target_bitrate = kBitrate;
   cfg_.g_error_resilient = 0;
@@ -154,7 +296,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
   init_flags_ = VPX_CODEC_USE_PSNR;
   if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  std::unique_ptr<libvpx_test::VideoSource> video;
   if (is_extension_y4m(test_video_param_.filename)) {
     video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                 kFrames));
@@ -163,15 +305,80 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
         test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
         kFramerate, 1, 0, kFrames));
   }
-  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NE(video.get(), nullptr);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
   const double psnr = GetAveragePsnr();
   EXPECT_GT(psnr, GetPsnrThreshold());
 }
 
-VP9_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
-                          ::testing::ValuesIn(kEncodingModeVectors),
-                          ::testing::ValuesIn(kTestVectors),
-                          ::testing::ValuesIn(kCpuUsedVectors));
+TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cyclic_refresh_ = 3;
+  denoiser_on_ = 1;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+  } else {
+    video.reset(new libvpx_test::YUVVideoSource(
+        test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+        kFramerate, 1, 0, kFrames));
+  }
+  ASSERT_NE(video.get(), nullptr);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, GetPsnrThreshold());
+}
+
+TEST_P(EndToEndTestAdaptiveRDThresh, EndtoEndAdaptiveRDThreshRowMT) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_threads = 2;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9_DECODER
+TEST_P(EndToEndTestLoopFilterThreading, TileCountChange) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(4096, 2160);
+  video.set_limit(10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#endif  // CONFIG_VP9_DECODER
+
+VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge,
+                           ::testing::ValuesIn(kEncodingModeVectors),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::ValuesIn(kCpuUsedVectors));
+
+VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::ValuesIn(kTestVectorsNv12),
+                           ::testing::Values(6, 7, 8));
+
+VP9_INSTANTIATE_TEST_SUITE(EndToEnd4x2Video,
+                           ::testing::Values(::libvpx_test::kTwoPassGood),
+                           ::testing::ValuesIn(k4x2VideoTestVectors),
+                           ::testing::Values(0, 1));
+
+VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh,
+                           ::testing::Values(5, 6, 7), ::testing::Values(8, 9));
+
+#if CONFIG_VP9_DECODER
+VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLoopFilterThreading, ::testing::Bool(),
+                           ::testing::Range(2, 6));
+#endif  // CONFIG_VP9_DECODER
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ethread_test.cc b/media/libvpx/libvpx/test/vp9_ethread_test.cc
index 4df40854b8..9e366eb366 100644
--- a/media/libvpx/libvpx/test/vp9_ethread_test.cc
+++ b/media/libvpx/libvpx/test/vp9_ethread_test.cc
@@ -10,14 +10,221 @@
 
 #include <string>
 #include <vector>
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/md5_helper.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vpx_config.h"
 
 namespace {
+// FIRSTPASS_STATS struct:
+// {
+//   26 double members;
+//   1 int64_t member;
+// }
+// Whenever FIRSTPASS_STATS struct is modified, the following constants need to
+// be revisited.
+const int kDbl = 26;
+const int kInt = 1;
+const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t);
+
+class VPxFirstPassEncoderThreadTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  VPxFirstPassEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(0),
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+
+    row_mt_mode_ = 1;
+    first_pass_only_ = true;
+    firstpass_stats_.buf = nullptr;
+    firstpass_stats_.sz = 0;
+  }
+  ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); }
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.rc_end_usage = VPX_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  void BeginPassHook(unsigned int /*pass*/) override {
+    encoder_initialized_ = false;
+    abort_ = false;
+  }
+
+  void EndPassHook() override {
+    // For first pass stats test, only run first pass encoder.
+    if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS)
+      abort_ |= first_pass_only_;
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (!encoder_initialized_) {
+      // Encode in 2-pass mode.
+      encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
+
+      if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
+        encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_);
+
+      encoder_initialized_ = true;
+    }
+  }
+
+  void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    const uint8_t *const pkt_buf =
+        reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf);
+    const size_t pkt_size = pkt->data.twopass_stats.sz;
+
+    // First pass stats size equals sizeof(FIRSTPASS_STATS)
+    EXPECT_EQ(pkt_size, kFirstPassStatsSz)
+        << "Error: First pass stats size doesn't equal kFirstPassStatsSz";
+
+    firstpass_stats_.buf =
+        realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+    ASSERT_NE(firstpass_stats_.buf, nullptr);
+    memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
+           pkt_size);
+    firstpass_stats_.sz += pkt_size;
+  }
+
+  bool encoder_initialized_;
+  int tiles_;
+  ::libvpx_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  int row_mt_mode_;
+  bool first_pass_only_;
+  vpx_fixed_buf_t firstpass_stats_;
+};
+
+#if !CONFIG_REALTIME_ONLY
+static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match or at least are very close.
+  FIRSTPASS_STATS *stats1 = reinterpret_cast<FIRSTPASS_STATS *>(fp_stats->buf);
+  int nframes_ = (int)(fp_stats->sz / sizeof(FIRSTPASS_STATS));
+  FIRSTPASS_STATS *stats2 = stats1 + nframes_ / 2;
+  int i, j;
+
+  // The total stats are also output and included in the first pass stats. Here
+  // ignore that in the comparison.
+  for (i = 0; i < (nframes_ / 2 - 1); ++i) {
+    const double *frame_stats1 = reinterpret_cast<double *>(stats1);
+    const double *frame_stats2 = reinterpret_cast<double *>(stats2);
+
+    for (j = 0; j < kDbl; ++j) {
+      ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+                fabs(*frame_stats1) / factor)
+          << "First failure @ frame #" << i << " stat #" << j << " ("
+          << *frame_stats1 << " vs. " << *frame_stats2 << ")";
+      frame_stats1++;
+      frame_stats2++;
+    }
+
+    stats1++;
+    stats2++;
+  }
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+
+static void compare_fp_stats_md5(vpx_fixed_buf_t *fp_stats) {
+  // fp_stats consists of 2 set of first pass encoding stats. These 2 set of
+  // stats are compared to check if the stats match.
+  uint8_t *stats1 = reinterpret_cast<uint8_t *>(fp_stats->buf);
+  uint8_t *stats2 = stats1 + fp_stats->sz / 2;
+  ::libvpx_test::MD5 md5_row_mt_0, md5_row_mt_1;
+
+  md5_row_mt_0.Add(stats1, fp_stats->sz / 2);
+  const char *md5_row_mt_0_str = md5_row_mt_0.Get();
+
+  md5_row_mt_1.Add(stats2, fp_stats->sz / 2);
+  const char *md5_row_mt_1_str = md5_row_mt_1.Get();
+
+  // Check md5 match.
+  ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str)
+      << "MD5 checksums don't match";
+
+  // Reset firstpass_stats_ to 0.
+  memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz);
+  fp_stats->sz = 0;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
+#if CONFIG_REALTIME_ONLY
+  GTEST_SKIP();
+#else
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+
+  first_pass_only_ = true;
+  cfg_.rc_target_bitrate = 1000;
+
+  // Test row_mt_mode: 0 vs 1 at single thread case(threads = 1, tiles_ = 0)
+  tiles_ = 0;
+  cfg_.g_threads = 1;
+
+  row_mt_mode_ = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  row_mt_mode_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if using or not using row-mt generates close stats.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0));
+
+  // Test single thread vs multiple threads
+  row_mt_mode_ = 1;
+  tiles_ = 0;
+
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if single-thread and multi-thread stats are close enough.
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0));
+
+  // Bit exact test in row_mt mode.
+  // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
+  // result.
+  row_mt_mode_ = 1;
+  tiles_ = 2;
+
+  cfg_.g_threads = 2;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  cfg_.g_threads = 8;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if stats match with row-mt=0/1.
+  compare_fp_stats_md5(&firstpass_stats_);
+#endif  // CONFIG_REALTIME_ONLY
+}
+
 class VPxEncoderThreadTest
     : public ::libvpx_test::EncoderTest,
       public ::libvpx_test::CodecTestWith4Params<libvpx_test::TestMode, int,
@@ -29,15 +236,17 @@ class VPxEncoderThreadTest
         encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
     md5_.clear();
+    row_mt_mode_ = 1;
+    psnr_ = 0.0;
+    nframes_ = 0;
   }
-  virtual ~VPxEncoderThreadTest() {}
+  ~VPxEncoderThreadTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
 
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
-      cfg_.g_lag_in_frames = 3;
       cfg_.rc_end_usage = VPX_VBR;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
       cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -50,12 +259,14 @@ class VPxEncoderThreadTest
     cfg_.rc_min_quantizer = 0;
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     encoder_initialized_ = false;
+    psnr_ = 0.0;
+    nframes_ = 0;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
+                          ::libvpx_test::Encoder *encoder) override {
     if (!encoder_initialized_) {
       // Encode 4 column tiles.
       encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
@@ -70,20 +281,27 @@ class VPxEncoderThreadTest
         encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
         encoder->Control(VP9E_SET_AQ_MODE, 3);
       }
+      encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_);
+
       encoder_initialized_ = true;
     }
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t /*pts*/) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  void DecompressedFrameHook(const vpx_image_t &img,
+                             vpx_codec_pts_t /*pts*/) override {
     ::libvpx_test::MD5 md5_res;
     md5_res.Add(&img);
     md5_.push_back(md5_res.Get());
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res,
-                                  const libvpx_test::VideoSource & /*video*/,
-                                  libvpx_test::Decoder * /*decoder*/) {
+  bool HandleDecodeResult(const vpx_codec_err_t res,
+                          const libvpx_test::VideoSource & /*video*/,
+                          libvpx_test::Decoder * /*decoder*/) override {
     if (res != VPX_CODEC_OK) {
       EXPECT_EQ(VPX_CODEC_OK, res);
       return false;
@@ -92,63 +310,142 @@ class VPxEncoderThreadTest
     return true;
   }
 
+  double GetAveragePsnr() const { return nframes_ ? (psnr_ / nframes_) : 0.0; }
+
   bool encoder_initialized_;
   int tiles_;
   int threads_;
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
+  int row_mt_mode_;
+  double psnr_;
+  unsigned int nframes_;
   std::vector<std::string> md5_;
 };
 
 TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
-  std::vector<std::string> single_thr_md5, multi_thr_md5;
-
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
-
   cfg_.rc_target_bitrate = 1000;
 
+  // Part 1: Bit exact test for row_mt_mode_ = 0.
+  // This part keeps original unit tests done before row-mt code is checked in.
+  row_mt_mode_ = 0;
+
   // Encode using single thread.
   cfg_.g_threads = 1;
   init_flags_ = VPX_CODEC_USE_PSNR;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  single_thr_md5 = md5_;
+  const std::vector<std::string> single_thr_md5 = md5_;
   md5_.clear();
 
   // Encode using multiple threads.
   cfg_.g_threads = threads_;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  multi_thr_md5 = md5_;
+  const std::vector<std::string> multi_thr_md5 = md5_;
   md5_.clear();
 
   // Compare to check if two vectors are equal.
   ASSERT_EQ(single_thr_md5, multi_thr_md5);
+
+  // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test.
+  row_mt_mode_ = 1;
+
+  // Encode using single thread
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::vector<std::string> row_mt_single_thr_md5 = md5_;
+  md5_.clear();
+
+  ASSERT_EQ(single_thr_md5, row_mt_single_thr_md5);
+
+  // Part 3: Bit exact test with row-mt on
+  // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
+  // result.
+  row_mt_mode_ = 1;
+  row_mt_single_thr_md5.clear();
+
+  // Encode using 2 threads.
+  cfg_.g_threads = 2;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  row_mt_single_thr_md5 = md5_;
+  md5_.clear();
+
+  // Encode using multiple threads.
+  cfg_.g_threads = threads_;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const std::vector<std::string> row_mt_multi_thr_md5 = md5_;
+  md5_.clear();
+
+  // Compare to check if two vectors are equal.
+  ASSERT_EQ(row_mt_single_thr_md5, row_mt_multi_thr_md5);
+
+  // Part 4: PSNR test with bit_match_mode_ = 0
+  row_mt_mode_ = 1;
+
+  // Encode using single thread.
+  cfg_.g_threads = 1;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double single_thr_psnr = GetAveragePsnr();
+
+  // Encode using multiple threads.
+  cfg_.g_threads = threads_;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double multi_thr_psnr = GetAveragePsnr();
+
+  EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.2);
 }
 
-// Split this into two instantiations so that we can distinguish
-// between very slow runs ( ie cpu_speed 0 ) vs ones that can be
+INSTANTIATE_TEST_SUITE_P(
+    VP9, VPxFirstPassEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(::libvpx_test::kTwoPassGood),
+        ::testing::Range(0, 4)));  // cpu_used
+
+constexpr libvpx_test::TestMode kOnePassTestModes[] = {
+  libvpx_test::kRealTime,
+#if !CONFIG_REALTIME_ONLY
+  libvpx_test::kOnePassGood,
+#endif
+};
+
+// Split this into multiple instantiations so that we can distinguish
+// between very slow runs ( i.e., cpu_speed 0 ) vs ones that can be
 // run nightly by adding Large to the title.
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     VP9, VPxEncoderThreadTest,
     ::testing::Combine(
         ::testing::Values(
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
-        ::testing::Values(::libvpx_test::kTwoPassGood,
-                          ::libvpx_test::kOnePassGood,
-                          ::libvpx_test::kRealTime),
-        ::testing::Range(2, 9),    // cpu_used
+        ::testing::ValuesIn(kOnePassTestModes),
+        ::testing::Range(3, 10),   // cpu_used
         ::testing::Range(0, 3),    // tile_columns
         ::testing::Range(2, 5)));  // threads
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     VP9Large, VPxEncoderThreadTest,
     ::testing::Combine(
         ::testing::Values(
             static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
-        ::testing::Values(::libvpx_test::kTwoPassGood,
-                          ::libvpx_test::kOnePassGood,
-                          ::libvpx_test::kRealTime),
-        ::testing::Range(0, 2),    // cpu_used
+        ::testing::ValuesIn(kOnePassTestModes),
+        ::testing::Range(0, 3),    // cpu_used
         ::testing::Range(0, 3),    // tile_columns
         ::testing::Range(2, 5)));  // threads
 
+#if !CONFIG_REALTIME_ONLY
+INSTANTIATE_TEST_SUITE_P(
+    VP9LargeBest, VPxEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(libvpx_test::kOnePassBest),
+        ::testing::Range(0, 10),   // cpu_used
+        ::testing::Range(0, 3),    // tile_columns
+        ::testing::Range(2, 5)));  // threads
+#endif
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
new file mode 100644
index 0000000000..b85dee1d08
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdint>
+#include <new>
+#include <memory>
+
+#include "./vpx_config.h"
+
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+#if CONFIG_VP9_DECODER
+#include "vpx/vp8dx.h"
+#endif
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_image.h"
+#include "vpx/vpx_tpl.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+namespace {
+
+constexpr int kShowFrameCount = 10;
+constexpr int kKeyframeQp = 10;
+constexpr int kLeafQp = 40;
+constexpr int kArfQp = 15;
+
+// Simple external rate controller for testing.
+class RateControllerForTest {
+ public:
+  RateControllerForTest() : current_gop_(-1) {}
+  ~RateControllerForTest() {}
+
+  void StartNextGop() { ++current_gop_; }
+
+  vpx_rc_gop_decision_t GetCurrentGop() const {
+    vpx_rc_gop_decision_t gop_decision;
+    if (current_gop_ == 0) {
+      gop_decision.use_key_frame = 1;
+      gop_decision.use_alt_ref = 1;
+      gop_decision.gop_coding_frames =
+          kShowFrameCount - 1 + gop_decision.use_alt_ref;
+      // key frame
+      gop_decision.update_type[0] = VPX_RC_KF_UPDATE;
+      gop_decision.update_ref_index[0] = 0;
+      gop_decision.ref_frame_list[0] = get_kf_ref_frame();
+      // arf
+      gop_decision.update_type[1] = VPX_RC_ARF_UPDATE;
+      gop_decision.update_ref_index[1] = 1;
+      gop_decision.ref_frame_list[1] = get_arf_ref_frame();
+      // leafs
+      for (int i = 2; i < gop_decision.gop_coding_frames; ++i) {
+        gop_decision.update_type[i] = VPX_RC_LF_UPDATE;
+        gop_decision.update_ref_index[i] = 2;
+        gop_decision.ref_frame_list[i] = get_leaf_ref_frame(i);
+      }
+    } else {
+      // Pad a overlay-only GOP as the last GOP.
+      EXPECT_EQ(current_gop_, 1);
+      gop_decision.use_key_frame = 0;
+      gop_decision.use_alt_ref = 0;
+      gop_decision.gop_coding_frames = 1;
+
+      gop_decision.update_type[0] = VPX_RC_OVERLAY_UPDATE;
+      gop_decision.update_ref_index[0] = 1;
+      gop_decision.ref_frame_list[0] = get_ovl_ref_frame();
+    }
+    return gop_decision;
+  }
+
+  int CalculateFrameDecision(int frame_index) {
+    if (current_gop_ == 0 && frame_index == 0) {
+      // Key frame, first frame in the first GOP.
+      return kKeyframeQp;
+    } else if (frame_index == 1) {
+      // ARF, we always use ARF for this test.
+      return kArfQp;
+    } else {
+      return kLeafQp;
+    }
+  }
+
+ private:
+  vpx_rc_ref_frame_t get_kf_ref_frame() const {
+    vpx_rc_ref_frame_t ref_frame;
+    ref_frame.index[0] = -1;
+    ref_frame.index[1] = -1;
+    ref_frame.index[2] = -1;
+    ref_frame.name[0] = VPX_RC_INVALID_REF_FRAME;
+    ref_frame.name[1] = VPX_RC_INVALID_REF_FRAME;
+    ref_frame.name[2] = VPX_RC_INVALID_REF_FRAME;
+    return ref_frame;
+  }
+  vpx_rc_ref_frame_t get_arf_ref_frame() const {
+    vpx_rc_ref_frame_t ref_frame;
+    ref_frame.index[0] = 0;
+    ref_frame.index[1] = -1;
+    ref_frame.index[2] = -1;
+    ref_frame.name[0] = VPX_RC_GOLDEN_FRAME;
+    ref_frame.name[1] = VPX_RC_INVALID_REF_FRAME;
+    ref_frame.name[2] = VPX_RC_INVALID_REF_FRAME;
+    return ref_frame;
+  }
+  vpx_rc_ref_frame_t get_leaf_ref_frame(int count) const {
+    vpx_rc_ref_frame_t ref_frame;
+    ref_frame.index[0] = 0;
+    ref_frame.index[1] = 1;
+    ref_frame.index[2] = count > 2 ? 2 : -1;
+    ref_frame.name[0] = VPX_RC_GOLDEN_FRAME;
+    ref_frame.name[1] = VPX_RC_ALTREF_FRAME;
+    ref_frame.name[2] =
+        count > 2 ? VPX_RC_LAST_FRAME : VPX_RC_INVALID_REF_FRAME;
+    return ref_frame;
+  }
+  vpx_rc_ref_frame_t get_ovl_ref_frame() const {
+    vpx_rc_ref_frame_t ref_frame;
+    ref_frame.index[0] = 1;
+    ref_frame.index[1] = -1;
+    ref_frame.index[2] = -1;
+    ref_frame.name[0] = VPX_RC_ALTREF_FRAME;
+    ref_frame.name[1] = VPX_RC_INVALID_REF_FRAME;
+    ref_frame.name[2] = VPX_RC_INVALID_REF_FRAME;
+    return ref_frame;
+  }
+
+  int current_gop_;
+};
+
+// Callbacks used in this test.
+vpx_rc_status_t rc_test_create_model(
+    void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/,
+    vpx_rc_model_t *rate_ctrl_model_ptr) {
+  std::unique_ptr<RateControllerForTest> test_controller(
+      new RateControllerForTest());
+  *rate_ctrl_model_ptr = test_controller.release();
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_test_send_firstpass_stats(
+    vpx_rc_model_t /*rate_ctrl_model*/,
+    const vpx_rc_firstpass_stats_t *first_pass_stats) {
+  EXPECT_EQ(first_pass_stats->num_frames, kShowFrameCount);
+  for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+    EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_test_send_tpl_gop_stats(
+    vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) {
+  EXPECT_GT(tpl_gop_stats->size, 0);
+
+  for (int i = 0; i < tpl_gop_stats->size; ++i) {
+    EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0);
+  }
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_test_get_encodeframe_decision(
+    vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
+    vpx_rc_encodeframe_decision_t *frame_decision) {
+  RateControllerForTest *test_controller =
+      static_cast<RateControllerForTest *>(rate_ctrl_model);
+  frame_decision->q_index =
+      test_controller->CalculateFrameDecision(frame_gop_index);
+  frame_decision->rdmult =
+      frame_decision->q_index * frame_decision->q_index / 2;
+  frame_decision->delta_q_uv = 0;
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+                                         vpx_rc_gop_decision_t *gop_decision) {
+  RateControllerForTest *test_controller =
+      static_cast<RateControllerForTest *>(rate_ctrl_model);
+  test_controller->StartNextGop();
+  *gop_decision = test_controller->GetCurrentGop();
+  return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
+  RateControllerForTest *test_controller =
+      static_cast<RateControllerForTest *>(rate_ctrl_model);
+  delete test_controller;
+  return VPX_RC_OK;
+}
+
+class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
+                        public ::testing::Test {
+ protected:
+  ExtRateCtrlTest()
+      : EncoderTest(&::libvpx_test::kVP9), received_show_frame_count_(0),
+        current_frame_qp_(0) {}
+
+  ~ExtRateCtrlTest() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+#if CONFIG_REALTIME_ONLY
+    SetMode(::libvpx_test::kRealTime);
+#else
+    SetMode(::libvpx_test::kTwoPassGood);
+#endif
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      vpx_rc_funcs_t rc_funcs = {};
+      rc_funcs.rc_type = VPX_RC_GOP_QP;
+      rc_funcs.create_model = rc_test_create_model;
+      rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats;
+      rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats;
+      rc_funcs.get_gop_decision = rc_test_get_gop_decision;
+      rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision;
+      rc_funcs.delete_model = rc_delete_model;
+      encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+    }
+  }
+
+#if CONFIG_VP9_DECODER
+  bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+                          const ::libvpx_test::VideoSource & /*video*/,
+                          ::libvpx_test::Decoder *decoder) override {
+    EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+    decoder->Control(VPXD_GET_LAST_QUANTIZER, &current_frame_qp_);
+    return VPX_CODEC_OK == res_dec;
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    // We are not comparing current_frame_qp_ here because the encoder will
+    // pack ARF and the next show frame into one pkt. Therefore, we might
+    // receive two frames in one pkt. However, one thing we are sure is that
+    // each pkt will have just one show frame. Therefore, we can check if the
+    // received show frame count match the actual show frame count.
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      ++received_show_frame_count_;
+    }
+  }
+#endif  // CONFIG_VP9_DECODER
+
+  int received_show_frame_count_;
+  int current_frame_qp_;
+};
+
+TEST_F(ExtRateCtrlTest, EncodeTest) {
+  cfg_.rc_target_bitrate = 4000;
+  cfg_.g_lag_in_frames = 25;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+      "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
+      kShowFrameCount));
+
+  ASSERT_NE(video, nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  EXPECT_EQ(received_show_frame_count_, kShowFrameCount);
+}
+
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_frame_parallel_test.cc b/media/libvpx/libvpx/test/vp9_frame_parallel_test.cc
deleted file mode 100644
index 670cd4d721..0000000000
--- a/media/libvpx/libvpx/test/vp9_frame_parallel_test.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/ivf_video_source.h"
-#include "test/md5_helper.h"
-#include "test/util.h"
-#if CONFIG_WEBM_IO
-#include "test/webm_video_source.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-using std::string;
-
-#if CONFIG_WEBM_IO
-
-struct PauseFileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include skipped frames.
-  const char *expected_md5;
-  const int pause_frame_num;
-};
-
-// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
-// seek to next key frame and then continue decoding until the end. Return
-// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFileWithPause(const string &filename, int num_threads,
-                           int pause_num) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-  int in_frames = 0;
-  int out_frames = 0;
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  vpx_codec_flags_t flags = 0;
-  flags |= VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  do {
-    ++in_frames;
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    // Pause at specified frame number.
-    if (in_frames == pause_num) {
-      // Flush the decoder and then seek to next key frame.
-      decoder.DecodeFrame(NULL, 0);
-      video.SeekToNextKeyFrame();
-    } else {
-      video.Next();
-    }
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(in_frames, out_frames)
-      << "Input frame count does not match output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFilesWithPause(const PauseFileList files[]) {
-  for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
-  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-  // one key frame for every ten frames.
-  static const PauseFileList files[] = {
-    { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1",
-      6 },
-    { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45",
-      12 },
-    { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8",
-      26 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFilesWithPause(files);
-}
-
-struct FileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include corrupted frames.
-  const char *expected_md5;
-  // Expected number of decoded frames which does not include corrupted frames.
-  const int expected_frame_count;
-};
-
-// Decodes |filename| with |num_threads|. Return the md5 of the decoded
-// frames which does not include corrupted frames.
-string DecodeFile(const string &filename, int num_threads,
-                  int expected_frame_count) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  int out_frames = 0;
-  do {
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    // TODO(hkuang): frame parallel mode should return an error on corruption.
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    video.Next();
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(expected_frame_count, out_frames)
-      << "Input frame count does not match expected output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFile(iter->name, t, iter->expected_frame_count))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
-  static const FileList files[] = {
-    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 11th frame has corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-1.webm",
-      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
-    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 1st and 31st frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-2.webm",
-      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
-    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 5th and 13th frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-3.webm",
-      "8256544308de926b0681e04685b98677", 27 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-
-TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {
-  static const FileList files[] = {
-#if CONFIG_VP9_HIGHBITDEPTH
-    { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 },
-#endif
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-#endif  // CONFIG_WEBM_IO
-}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_intrapred_test.cc b/media/libvpx/libvpx/test/vp9_intrapred_test.cc
index 8c5fb20191..6733ef73d1 100644
--- a/media/libvpx/libvpx/test/vp9_intrapred_test.cc
+++ b/media/libvpx/libvpx/test/vp9_intrapred_test.cc
@@ -10,7 +10,7 @@
 
 #include <string>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -28,11 +28,11 @@ using libvpx_test::ACMRandom;
 
 const int count_test_block = 100000;
 
-typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left);
+using IntraPredFunc = void (*)(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left);
 
 struct IntraPredParam {
-  IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL,
+  IntraPredParam(IntraPredFunc pred = nullptr, IntraPredFunc ref = nullptr,
                  int block_size_value = 0, int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
         bit_depth(bit_depth_value) {}
@@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> {
     ref_dst_ = ref_dst;
     int error_count = 0;
     for (int i = 0; i < count_test_block; ++i) {
+      // TODO(webm:1797): Some of the optimised predictor implementations rely
+      // on the trailing half of the above_row_ being a copy of the final
+      // element, however relying on this in some cases can cause the MD5 tests
+      // to fail. We have fixed all of these cases for Neon, so fill the whole
+      // of above_row_ randomly.
+#if HAVE_NEON
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x < 2 * block_size; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+#else
       // Fill edges with random data, try first with saturated values.
       for (int x = -1; x < block_size; x++) {
         if (i == 0) {
@@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> {
       for (int x = block_size; x < 2 * block_size; x++) {
         above_row_[x] = above_row_[block_size - 1];
       }
+#endif
       for (int y = 0; y < block_size; y++) {
         if (i == 0) {
           left_col_[y] = mask_;
@@ -80,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> {
   }
 
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     params_ = this->GetParam();
     stride_ = params_.block_size * 3;
     mask_ = (1 << params_.bit_depth) - 1;
@@ -119,7 +135,7 @@ void IntraPredTest<uint8_t, IntraPredParam>::Predict() {
       params_.pred_fn(dst_, stride_, above_row_, left_col_));
 }
 
-typedef IntraPredTest<uint8_t, IntraPredParam> VP9IntraPredTest;
+using VP9IntraPredTest = IntraPredTest<uint8_t, IntraPredParam>;
 
 TEST_P(VP9IntraPredTest, IntraPredTests) {
   // max block size is 32
@@ -130,8 +146,14 @@ TEST_P(VP9IntraPredTest, IntraPredTests) {
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
+// Instantiate a token test to avoid -Wuninitialized warnings when none of the
+// other tests are enabled.
+INSTANTIATE_TEST_SUITE_P(
+    C, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c,
+                                     &vpx_d45_predictor_4x4_c, 4, 8)));
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9IntraPredTest,
     ::testing::Values(
         IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4,
@@ -195,7 +217,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSSE3, VP9IntraPredTest,
     ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3,
                                      &vpx_d45_predictor_16x16_c, 16, 8),
@@ -226,7 +248,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSSE3
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON, VP9IntraPredTest,
     ::testing::Values(
         IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4,
@@ -237,6 +259,22 @@ INSTANTIATE_TEST_CASE_P(
                        &vpx_d45_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d45_predictor_32x32_neon,
                        &vpx_d45_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4,
+                       8),
+        IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_d63_predictor_16x16_neon,
+                       &vpx_d63_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d63_predictor_32x32_neon,
+                       &vpx_d63_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d117_predictor_16x16_neon,
+                       &vpx_d117_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d117_predictor_32x32_neon,
+                       &vpx_d117_predictor_32x32_c, 32, 8),
         IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c,
                        4, 8),
         IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c,
@@ -245,6 +283,22 @@ INSTANTIATE_TEST_CASE_P(
                        &vpx_d135_predictor_16x16_c, 16, 8),
         IntraPredParam(&vpx_d135_predictor_32x32_neon,
                        &vpx_d135_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d153_predictor_16x16_neon,
+                       &vpx_d153_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d153_predictor_32x32_neon,
+                       &vpx_d153_predictor_32x32_c, 32, 8),
+        IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c,
+                       4, 8),
+        IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c,
+                       8, 8),
+        IntraPredParam(&vpx_d207_predictor_16x16_neon,
+                       &vpx_d207_predictor_16x16_c, 16, 8),
+        IntraPredParam(&vpx_d207_predictor_32x32_neon,
+                       &vpx_d207_predictor_32x32_c, 32, 8),
         IntraPredParam(&vpx_dc_128_predictor_4x4_neon,
                        &vpx_dc_128_predictor_4x4_c, 4, 8),
         IntraPredParam(&vpx_dc_128_predictor_8x8_neon,
@@ -300,7 +354,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_NEON
 
 #if HAVE_DSPR2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DSPR2, VP9IntraPredTest,
     ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2,
                                      &vpx_dc_predictor_4x4_c, 4, 8),
@@ -321,7 +375,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_DSPR2
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     MSA, VP9IntraPredTest,
     ::testing::Values(
         IntraPredParam(&vpx_dc_128_predictor_4x4_msa,
@@ -378,13 +432,81 @@ INSTANTIATE_TEST_CASE_P(
                        8)));
 #endif  // HAVE_MSA
 
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+        IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8,
+                       8),
+        IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8),
+        IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8),
+        IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4,
+                       8),
+        IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8,
+                       8),
+#endif
+
+#if HAVE_VSX
+INSTANTIATE_TEST_SUITE_P(
+    VSX, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_vsx,
+                                     &vpx_d45_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_d45_predictor_32x32_vsx,
+                                     &vpx_d45_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_d63_predictor_16x16_vsx,
+                                     &vpx_d63_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_d63_predictor_32x32_vsx,
+                                     &vpx_d63_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_128_predictor_16x16_vsx,
+                                     &vpx_dc_128_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_128_predictor_32x32_vsx,
+                                     &vpx_dc_128_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_left_predictor_16x16_vsx,
+                                     &vpx_dc_left_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_left_predictor_32x32_vsx,
+                                     &vpx_dc_left_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_predictor_16x16_vsx,
+                                     &vpx_dc_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_predictor_32x32_vsx,
+                                     &vpx_dc_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_dc_top_predictor_16x16_vsx,
+                                     &vpx_dc_top_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_dc_top_predictor_32x32_vsx,
+                                     &vpx_dc_top_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_h_predictor_16x16_vsx,
+                                     &vpx_h_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_h_predictor_32x32_vsx,
+                                     &vpx_h_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_tm_predictor_16x16_vsx,
+                                     &vpx_tm_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_tm_predictor_32x32_vsx,
+                                     &vpx_tm_predictor_32x32_c, 32, 8),
+                      IntraPredParam(&vpx_v_predictor_16x16_vsx,
+                                     &vpx_v_predictor_16x16_c, 16, 8),
+                      IntraPredParam(&vpx_v_predictor_32x32_vsx,
+                                     &vpx_v_predictor_32x32_c, 32, 8)));
+#endif  // HAVE_VSX
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VP9IntraPredTest,
+    ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx,
+                                     &vpx_dc_predictor_8x8_c, 8, 8),
+                      IntraPredParam(&vpx_dc_predictor_16x16_lsx,
+                                     &vpx_dc_predictor_16x16_c, 16, 8)));
+#endif  // HAVE_LSX
+
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
-                                const uint16_t *above, const uint16_t *left,
-                                int bps);
+using HighbdIntraPred = void (*)(uint16_t *dst, ptrdiff_t stride,
+                                 const uint16_t *above, const uint16_t *left,
+                                 int bps);
+
 struct HighbdIntraPredParam {
-  HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL,
-                       int block_size_value = 0, int bit_depth_value = 0)
+  HighbdIntraPredParam(HighbdIntraPred pred = nullptr,
+                       HighbdIntraPred ref = nullptr, int block_size_value = 0,
+                       int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
         bit_depth(bit_depth_value) {}
 
@@ -394,6 +516,7 @@ struct HighbdIntraPredParam {
   int bit_depth;
 };
 
+#if HAVE_SSSE3 || HAVE_NEON || HAVE_SSE2
 template <>
 void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() {
   const int bit_depth = params_.bit_depth;
@@ -402,7 +525,8 @@ void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() {
       params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
 }
 
-typedef IntraPredTest<uint16_t, HighbdIntraPredParam> VP9HighbdIntraPredTest;
+using VP9HighbdIntraPredTest = IntraPredTest<uint16_t, HighbdIntraPredParam>;
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9HighbdIntraPredTest);
 
 TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
   // max block size is 32
@@ -412,11 +536,166 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
   DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]);
   RunTest(left_col, above_data, dst, ref_dst);
 }
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3_TO_C_8, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3_TO_C_10, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 10)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3_TO_C_12, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 12)));
+#endif  // HAVE_SSSE3
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2_TO_C_8, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
                              &vpx_highbd_dc_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -425,6 +704,14 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_dc_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
                              &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
                              &vpx_highbd_tm_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@@ -433,6 +720,14 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_tm_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
                              &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
                              &vpx_highbd_v_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@@ -442,9 +737,35 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2,
                              &vpx_highbd_v_predictor_32x32_c, 32, 8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2_TO_C_10, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
                              &vpx_highbd_dc_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -453,6 +774,14 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_dc_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
                              &vpx_highbd_dc_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
                              &vpx_highbd_tm_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@@ -461,6 +790,14 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_tm_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
                              &vpx_highbd_tm_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
                              &vpx_highbd_v_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@@ -470,9 +807,35 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2,
                              &vpx_highbd_v_predictor_32x32_c, 32, 10)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     SSE2_TO_C_12, VP9HighbdIntraPredTest,
     ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2,
                              &vpx_highbd_dc_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2,
@@ -481,6 +844,14 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_dc_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2,
                              &vpx_highbd_dc_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2,
                              &vpx_highbd_tm_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2,
@@ -489,6 +860,14 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_tm_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2,
                              &vpx_highbd_tm_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2,
                              &vpx_highbd_v_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2,
@@ -500,7 +879,7 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSE2
 
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON_TO_C_8, VP9HighbdIntraPredTest,
     ::testing::Values(
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
@@ -511,6 +890,22 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_d45_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                              &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon,
+                             &vpx_highbd_d63_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon,
+                             &vpx_highbd_d117_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
                              &vpx_highbd_d135_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
@@ -519,6 +914,22 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_d135_predictor_16x16_c, 16, 8),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
                              &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -576,7 +987,7 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
                              &vpx_highbd_v_predictor_32x32_c, 32, 8)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON_TO_C_10, VP9HighbdIntraPredTest,
     ::testing::Values(
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
@@ -587,6 +998,22 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_d45_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                              &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon,
+                             &vpx_highbd_d63_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon,
+                             &vpx_highbd_d117_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
                              &vpx_highbd_d135_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
@@ -595,6 +1022,22 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_d135_predictor_16x16_c, 16, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
                              &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
@@ -652,7 +1095,7 @@ INSTANTIATE_TEST_CASE_P(
         HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
                              &vpx_highbd_v_predictor_32x32_c, 32, 10)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     NEON_TO_C_12, VP9HighbdIntraPredTest,
     ::testing::Values(
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
@@ -663,6 +1106,22 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_d45_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
                              &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon,
+                             &vpx_highbd_d63_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon,
+                             &vpx_highbd_d63_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon,
+                             &vpx_highbd_d117_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon,
+                             &vpx_highbd_d117_predictor_32x32_c, 32, 10),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
                              &vpx_highbd_d135_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
@@ -671,6 +1130,22 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_d135_predictor_16x16_c, 16, 12),
         HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
                              &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon,
+                             &vpx_highbd_d153_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon,
+                             &vpx_highbd_d207_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
                              &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
         HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
diff --git a/media/libvpx/libvpx/test/vp9_lossless_test.cc b/media/libvpx/libvpx/test/vp9_lossless_test.cc
index 703b55e9bd..48839a7a23 100644
--- a/media/libvpx/libvpx/test/vp9_lossless_test.cc
+++ b/media/libvpx/libvpx/test/vp9_lossless_test.cc
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
@@ -29,16 +29,16 @@ class LosslessTest
       : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
         encoding_mode_(GET_PARAM(1)) {}
 
-  virtual ~LosslessTest() {}
+  ~LosslessTest() override = default;
 
-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
       // Only call Control if quantizer > 0 to verify that using quantizer
       // alone will activate lossless
       if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
@@ -47,12 +47,12 @@ class LosslessTest
     }
   }
 
-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     psnr_ = kMaxPsnr;
     nframes_ = 0;
   }
 
-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
   }
 
@@ -118,8 +118,13 @@ TEST_P(LosslessTest, TestLossLessEncodingCtrl) {
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
-VP9_INSTANTIATE_TEST_CASE(LosslessTest,
-                          ::testing::Values(::libvpx_test::kRealTime,
-                                            ::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kTwoPassGood));
+#if CONFIG_REALTIME_ONLY
+VP9_INSTANTIATE_TEST_SUITE(LosslessTest,
+                           ::testing::Values(::libvpx_test::kRealTime));
+#else
+VP9_INSTANTIATE_TEST_SUITE(LosslessTest,
+                           ::testing::Values(::libvpx_test::kRealTime,
+                                             ::libvpx_test::kOnePassGood,
+                                             ::libvpx_test::kTwoPassGood));
+#endif
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_motion_vector_test.cc b/media/libvpx/libvpx/test/vp9_motion_vector_test.cc
new file mode 100644
index 0000000000..b47f530909
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_motion_vector_test.cc
@@ -0,0 +1,102 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <memory>
+
+#include "gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+#include "vpx_config.h"
+
+namespace {
+#define MAX_EXTREME_MV 1
+#define MIN_EXTREME_MV 2
+
+// Encoding modes
+const libvpx_test::TestMode kEncodingModeVectors[] = {
+#if !CONFIG_REALTIME_ONLY
+  ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
+#endif
+  ::libvpx_test::kRealTime
+};
+
+// Encoding speeds
+const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6 };
+
+// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV.
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  ~MotionVectorTestLarge() override = default;
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  video.reset(new libvpx_test::YUVVideoSource(
+      "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
+      30, 1, 0, 5));
+
+  ASSERT_NE(video.get(), nullptr);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+VP9_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge,
+                           ::testing::ValuesIn(kEncodingModeVectors),
+                           ::testing::ValuesIn(kCpuUsedVectors),
+                           ::testing::ValuesIn(kMVTestModes));
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_quantize_test.cc b/media/libvpx/libvpx/test/vp9_quantize_test.cc
index 4643895021..0c49598532 100644
--- a/media/libvpx/libvpx/test/vp9_quantize_test.cc
+++ b/media/libvpx/libvpx/test/vp9_quantize_test.cc
@@ -9,336 +9,716 @@
  */
 
 #include <math.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
+#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
+#include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
+using libvpx_test::Buffer;
 
 namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
 const int number_of_iterations = 100;
 
-typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
-                             int skip_block, const int16_t *zbin,
-                             const int16_t *round, const int16_t *quant,
-                             const int16_t *quant_shift, tran_low_t *qcoeff,
-                             tran_low_t *dqcoeff, const int16_t *dequant,
-                             uint16_t *eob, const int16_t *scan,
-                             const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
-    QuantizeParam;
+using QuantizeFunc = void (*)(const tran_low_t *coeff, intptr_t count,
+                              const macroblock_plane *mb_plane,
+                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                              const int16_t *dequant, uint16_t *eob,
+                              const struct ScanOrder *const scan_order);
+using QuantizeParam = std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
+                                 int /*max_size*/, bool /*is_fp*/>;
 
-class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+// Wrapper for 32x32 version which does not use count
+using Quantize32x32Func = void (*)(const tran_low_t *coeff,
+                                   const macroblock_plane *const mb_plane,
+                                   tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                   const int16_t *dequant, uint16_t *eob,
+                                   const struct ScanOrder *const scan_order);
+
+template <Quantize32x32Func fn>
+void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count,
+                       const macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                       const int16_t *dequant, uint16_t *eob,
+                       const struct ScanOrder *const scan_order) {
+  (void)count;
+  fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order);
+}
+
+// Wrapper for FP version which does not use zbin or quant_shift.
+using QuantizeFPFunc = void (*)(const tran_low_t *coeff, intptr_t count,
+                                const macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                const int16_t *dequant, uint16_t *eob,
+                                const struct ScanOrder *const scan_order);
+
+template <QuantizeFPFunc fn>
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
+                    const macroblock_plane *const mb_plane, tran_low_t *qcoeff,
+                    tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob,
+                    const struct ScanOrder *const scan_order) {
+  fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order);
+}
+
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+                          int16_t *quant, int16_t *quant_shift,
+                          int16_t *dequant, int16_t *round_fp,
+                          int16_t *quant_fp) {
+  // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
+  constexpr int kMaxQRoundingFactorFp = 64;
+
+  for (int j = 0; j < 2; j++) {
+    // The range is 4 to 1828 in the VP9 tables.
+    const int qlookup = rnd->RandRange(1825) + 4;
+    round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7;
+    quant_fp[j] = (1 << 16) / qlookup;
+
+    // Values determined by deconstructing vp9_init_quantizer().
+    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+    // values or U/V values of any bit depth. This is because y_delta is not
+    // factored into the vp9_ac_quant() call.
+    zbin[j] = rnd->RandRange(1200);
+
+    // round may be up to 685 for Y values or 914 for U/V.
+    round[j] = rnd->RandRange(914);
+    // quant ranges from 1 to -32703
+    quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+    // quant_shift goes up to 1 << 16.
+    quant_shift[j] = rnd->RandRange(16384);
+    // dequant maxes out at 1828 for all cases.
+    dequant[j] = rnd->RandRange(1828);
+  }
+  for (int j = 2; j < 8; j++) {
+    zbin[j] = zbin[1];
+    round_fp[j] = round_fp[1];
+    quant_fp[j] = quant_fp[1];
+    round[j] = round[1];
+    quant[j] = quant[1];
+    quant_shift[j] = quant_shift[1];
+    dequant[j] = dequant[1];
+  }
+}
+
+class VP9QuantizeBase : public AbstractBench {
  public:
-  virtual ~VP9QuantizeTest() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp),
+        coeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 16)),
+        qcoeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)),
+        dqcoeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)) {
+    // TODO(jianj): SSSE3 and AVX2 tests fail on extreme values.
+#if HAVE_NEON
+    max_value_ = (1 << (7 + bit_depth_)) - 1;
+#else
+    max_value_ = (1 << bit_depth_) - 1;
+#endif
+
+    mb_plane_ = reinterpret_cast<macroblock_plane *>(
+        vpx_memalign(16, sizeof(macroblock_plane)));
+
+    zbin_ptr_ = mb_plane_->zbin =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
+    round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*round_fp_ptr_)));
+    quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_)));
+    round_ptr_ = mb_plane_->round =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_)));
+    quant_ptr_ = mb_plane_->quant =
+        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_)));
+    quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
+    dequant_ptr_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
+
+    r_ptr_ = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
-};
-
-class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
- public:
-  virtual ~VP9Quantize32Test() {}
-  virtual void SetUp() {
-    quantize_op_ = GET_PARAM(0);
-    ref_quantize_op_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(2);
-    mask_ = (1 << bit_depth_) - 1;
+  ~VP9QuantizeBase() override {
+    vpx_free(mb_plane_);
+    vpx_free(zbin_ptr_);
+    vpx_free(round_fp_ptr_);
+    vpx_free(quant_fp_ptr_);
+    vpx_free(round_ptr_);
+    vpx_free(quant_ptr_);
+    vpx_free(quant_shift_ptr_);
+    vpx_free(dequant_ptr_);
+    mb_plane_ = nullptr;
+    zbin_ptr_ = nullptr;
+    round_fp_ptr_ = nullptr;
+    quant_fp_ptr_ = nullptr;
+    round_ptr_ = nullptr;
+    quant_ptr_ = nullptr;
+    quant_shift_ptr_ = nullptr;
+    dequant_ptr_ = nullptr;
+    libvpx_test::ClearSystemState();
   }
 
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+ protected:
+  macroblock_plane *mb_plane_;
+  int16_t *zbin_ptr_;
+  int16_t *quant_fp_ptr_;
+  int16_t *round_fp_ptr_;
+  int16_t *round_ptr_;
+  int16_t *quant_ptr_;
+  int16_t *quant_shift_ptr_;
+  int16_t *dequant_ptr_;
+  const vpx_bit_depth_t bit_depth_;
+  int max_value_;
+  const int max_size_;
+  const bool is_fp_;
+  Buffer<tran_low_t> coeff_;
+  Buffer<tran_low_t> qcoeff_;
+  Buffer<tran_low_t> dqcoeff_;
+  int16_t *r_ptr_;
+  int16_t *q_ptr_;
+  int count_;
+  const ScanOrder *scan_;
+  uint16_t eob_;
+};
+
+class VP9QuantizeTest : public VP9QuantizeBase,
+                        public ::testing::TestWithParam<QuantizeParam> {
+ public:
+  VP9QuantizeTest()
+      : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)),
+        quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
 
  protected:
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-  QuantizeFunc quantize_op_;
-  QuantizeFunc ref_quantize_op_;
+  void Run() override;
+  void Speed(bool is_median);
+  const QuantizeFunc quantize_op_;
+  const QuantizeFunc ref_quantize_op_;
 };
 
+void VP9QuantizeTest::Run() {
+  quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
+               dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_);
+}
+
+void VP9QuantizeTest::Speed(bool is_median) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
+  TX_SIZE starting_sz, ending_sz;
+
+  if (max_size_ == 16) {
+    starting_sz = TX_4X4;
+    ending_sz = TX_16X16;
+  } else {
+    starting_sz = TX_32X32;
+    ending_sz = TX_32X32;
+  }
+
+  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+    // zbin > coeff, zbin < coeff.
+    for (int i = 0; i < 2; ++i) {
+      // TX_TYPE defines the scan order. That is not relevant to the speed test.
+      // Pick the first one.
+      const TX_TYPE tx_type = DCT_DCT;
+      count_ = (4 << sz) * (4 << sz);
+      scan_ = &vp9_scan_orders[sz][tx_type];
+
+      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                           quant_fp_ptr_);
+
+      if (i == 0) {
+        // When |coeff values| are less than zbin the results are 0.
+        int threshold = 100;
+        if (max_size_ == 32) {
+          // For 32x32, the threshold is halved. Double it to keep the values
+          // from clearing it.
+          threshold = 200;
+        }
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+        coeff_.Set(&rnd, -99, 99);
+      } else if (i == 1) {
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+        coeff_.Set(&rnd, -500, 500);
+      }
+
+      const char *type =
+          (i == 0) ? "Bypass calculations " : "Full calculations ";
+      char block_size[16];
+      snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+      char title[100];
+      snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+
+      if (is_median) {
+        RunNTimes(10000000 / count_);
+        PrintMedian(title);
+      } else {
+        Buffer<tran_low_t> ref_qcoeff =
+            Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+        ASSERT_TRUE(ref_qcoeff.Init());
+        Buffer<tran_low_t> ref_dqcoeff =
+            Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+        ASSERT_TRUE(ref_dqcoeff.Init());
+        uint16_t ref_eob = 0;
+
+        const int kNumTests = 5000000;
+        vpx_usec_timer timer, simd_timer;
+
+        vpx_usec_timer_start(&timer);
+        for (int n = 0; n < kNumTests; ++n) {
+          ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                           ref_qcoeff.TopLeftPixel(),
+                           ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                           scan_);
+        }
+        vpx_usec_timer_mark(&timer);
+
+        vpx_usec_timer_start(&simd_timer);
+        for (int n = 0; n < kNumTests; ++n) {
+          quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                       qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+                       dequant_ptr_, &eob_, scan_);
+        }
+        vpx_usec_timer_mark(&simd_timer);
+
+        const int elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&timer));
+        const int simd_elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&simd_timer));
+        printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title,
+               elapsed_time, simd_elapsed_time,
+               ((float)elapsed_time / simd_elapsed_time));
+      }
+    }
+  }
+}
+
+// This quantizer compares the AC coefficients to the quantization step size to
+// determine if further multiplication operations are needed.
+// Based on vp9_quantize_fp_sse2().
+inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const struct macroblock_plane *const mb_plane,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const struct ScanOrder *const scan_order,
+                        int is_32x32) {
+  int i, eob = -1;
+  const int thr = dequant_ptr[1] >> (1 + is_32x32);
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < n_coeffs; i += 16) {
+    int y;
+    int nzflag_cnt = 0;
+    int abs_coeff[16];
+    int coeff_sign[16];
+
+    // count nzflag for each row (16 tran_low_t)
+    for (y = 0; y < 16; ++y) {
+      const int rc = i + y;
+      const int coeff = coeff_ptr[rc];
+      coeff_sign[y] = (coeff >> 31);
+      abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y];
+      // The first 16 are skipped in the sse2 code.  Do the same here to match.
+      if (i >= 16 && (abs_coeff[y] <= thr)) {
+        nzflag_cnt++;
+      }
+    }
+
+    for (y = 0; y < 16; ++y) {
+      const int rc = i + y;
+      // If all of the AC coeffs in a row has magnitude less than the
+      // quantization step_size/2, quantize to zero.
+      if (nzflag_cnt < 16) {
+        int tmp;
+        int _round;
+
+        if (is_32x32) {
+          _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        } else {
+          _round = round_ptr[rc != 0];
+        }
+        tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX);
+        tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32);
+        qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
+        dqcoeff_ptr[rc] =
+            static_cast<tran_low_t>(qcoeff_ptr[rc] * dequant_ptr[rc != 0]);
+
+        if (is_32x32) {
+          dqcoeff_ptr[rc] = static_cast<tran_low_t>(qcoeff_ptr[rc] *
+                                                    dequant_ptr[rc != 0] / 2);
+        } else {
+          dqcoeff_ptr[rc] =
+              static_cast<tran_low_t>(qcoeff_ptr[rc] * dequant_ptr[rc != 0]);
+        }
+      } else {
+        qcoeff_ptr[rc] = 0;
+        dqcoeff_ptr[rc] = 0;
+      }
+    }
+  }
+
+  // Scan for eob.
+  for (i = 0; i < n_coeffs; i++) {
+    // Use the scan order to find the correct eob.
+    const int rc = scan[i];
+    if (qcoeff_ptr[rc]) {
+      eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      const struct macroblock_plane *mb_plane,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                      const struct ScanOrder *const scan_order) {
+  quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr,
+              dequant_ptr, eob_ptr, scan_order, 0);
+}
+
+void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const struct macroblock_plane *mb_plane,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const struct ScanOrder *const scan_order) {
+  quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr,
+              dequant_ptr, eob_ptr, scan_order, 1);
+}
+
 TEST_P(VP9QuantizeTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < number_of_iterations; ++i) {
-    const int skip_block = i == 0;
-    const TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
-    const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    const int count = (4 << sz) * (4 << sz);  // 16, 64, 256
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = rnd.Rand16() & mask_;
-    }
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
-                     scan_order->scan, scan_order->iscan);
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-    }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
-}
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
+  Buffer<tran_low_t> ref_qcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_qcoeff.Init());
+  Buffer<tran_low_t> ref_dqcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_dqcoeff.Init());
+  uint16_t ref_eob = 0;
+  eob_ = 0;
 
-TEST_P(VP9Quantize32Test, OperationCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
   for (int i = 0; i < number_of_iterations; ++i) {
-    const int skip_block = i == 0;
-    const TX_SIZE sz = TX_32X32;
-    const TX_TYPE tx_type = (TX_TYPE)(i % 4);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    const int count = (4 << sz) * (4 << sz);  // 1024
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = rnd.Rand16() & mask_;
+    TX_SIZE sz;
+    if (max_size_ == 16) {
+      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    } else {
+      sz = TX_32X32;
     }
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
-                     scan_order->scan, scan_order->iscan);
+    const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
+    scan_ = &vp9_scan_orders[sz][tx_type];
+    count_ = (4 << sz) * (4 << sz);
+    coeff_.Set(&rnd, -max_value_, max_value_);
+    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                         quant_fp_ptr_);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_);
+
     ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+        coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
+        dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_));
+
+    EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
+    EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
+
+    EXPECT_EQ(eob_, ref_eob);
+
+    if (HasFailure()) {
+      printf("Failure on iteration %d.\n", i);
+      qcoeff_.PrintDifference(ref_qcoeff);
+      dqcoeff_.PrintDifference(ref_dqcoeff);
+      return;
     }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
   }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
 }
 
 TEST_P(VP9QuantizeTest, EOBCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
+  Buffer<tran_low_t> ref_qcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_qcoeff.Init());
+  Buffer<tran_low_t> ref_dqcoeff =
+      Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+  ASSERT_TRUE(ref_dqcoeff.Init());
+  uint16_t ref_eob = 0;
+  eob_ = 0;
+  const uint32_t max_index = max_size_ * max_size_ - 1;
+
   for (int i = 0; i < number_of_iterations; ++i) {
-    int skip_block = i == 0;
-    TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8 TX_16X16
-    TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    int count = (4 << sz) * (4 << sz);  // 16, 64, 256
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
+    TX_SIZE sz;
+    if (max_size_ == 16) {
+      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
+    } else {
+      sz = TX_32X32;
+    }
+    const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
+    scan_ = &vp9_scan_orders[sz][tx_type];
+    count_ = (4 << sz) * (4 << sz);
     // Two random entries
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = 0;
-    }
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
+    coeff_.Set(0);
+    coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] =
+        static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
+    coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] =
+        static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
+    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                         quant_fp_ptr_);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_);
 
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
-                     scan_order->scan, scan_order->iscan);
     ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
+        coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(),
+        dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_));
 
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+    EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
+    EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
+
+    EXPECT_EQ(eob_, ref_eob);
+
+    if (HasFailure()) {
+      printf("Failure on iteration %d.\n", i);
+      qcoeff_.PrintDifference(ref_qcoeff);
+      dqcoeff_.PrintDifference(ref_dqcoeff);
+      return;
     }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
   }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
 }
 
-TEST_P(VP9Quantize32Test, EOBCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
-  DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
-  DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
-  DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
-  DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
-  int err_count_total = 0;
-  int first_failure = -1;
-  for (int i = 0; i < number_of_iterations; ++i) {
-    int skip_block = i == 0;
-    TX_SIZE sz = TX_32X32;
-    TX_TYPE tx_type = (TX_TYPE)(i % 4);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    int count = (4 << sz) * (4 << sz);  // 1024
-    int err_count = 0;
-    *eob_ptr = rnd.Rand16();
-    *ref_eob_ptr = *eob_ptr;
-    for (int j = 0; j < count; j++) {
-      coeff_ptr[j] = 0;
-    }
-    // Two random entries
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
-    for (int j = 0; j < 2; j++) {
-      zbin_ptr[j] = rnd.Rand16() & mask_;
-      round_ptr[j] = rnd.Rand16();
-      quant_ptr[j] = rnd.Rand16();
-      quant_shift_ptr[j] = rnd.Rand16();
-      dequant_ptr[j] = rnd.Rand16();
-    }
+TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); }
 
-    ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
-                     quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
-                     ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
-                     scan_order->scan, scan_order->iscan);
-    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
-        quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
-        scan_order->scan, scan_order->iscan));
+TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); }
 
-    for (int j = 0; j < sz; ++j) {
-      err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
-                   (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
-    }
-    err_count += (*ref_eob_ptr != *eob_ptr);
-    if (err_count && !err_count_total) {
-      first_failure = i;
-    }
-    err_count_total += err_count;
-  }
-  EXPECT_EQ(0, err_count_total)
-      << "Error: Quantization Test, C output doesn't match SSE2 output. "
-      << "First failed at test case " << first_failure;
-}
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2,
-                                 &vpx_highbd_quantize_b_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_quantize_b_sse2,
-                                 &vpx_highbd_quantize_b_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_quantize_b_sse2,
-                                 &vpx_highbd_quantize_b_c, VPX_BITS_12)));
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9Quantize32Test,
-    ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8),
-                      make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10),
-                      make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                                 &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12)));
-#endif  // HAVE_SSE2
+    ::testing::Values(
+        make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false)));
+
+#else
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(
+    SSSE3, VP9QuantizeTest,
+    ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true)));
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX
+INSTANTIATE_TEST_SUITE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
+#endif  // HAVE_AVX
+
+#if VPX_ARCH_X86_64 && HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
+                   true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+                   32, true),
+        make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false)));
+#else
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true),
+                      make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>,
+                   &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>,
+                   VPX_BITS_12, 32, false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+                   true)));
+#else
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                                 VPX_BITS_8, 32, true)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    VSX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_vsx, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_vsx,
+                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+                                 false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_vsx>,
+                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_vsx>,
+                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                                 VPX_BITS_8, 32, true)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_lsx>,
+                                 &Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                                 VPX_BITS_8, 32, false)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
+// Only useful to compare "Speed" test results.
+INSTANTIATE_TEST_SUITE_P(
+    DISABLED_C, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false),
+        make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>,
+                   &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32,
+                   false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>,
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                   &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32,
+                   true),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+                   true)));
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
new file mode 100644
index 0000000000..e58f0a0d9d
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -0,0 +1,675 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/ratectrl_rtc.h"
+
+#include <climits>
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+const size_t kNumFrames = 300;
+
+const int kTemporalId3Layer[4] = { 0, 2, 1, 2 };
+const int kTemporalId2Layer[2] = { 0, 1 };
+const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 };
+const int kTemporalRateAllocation2Layer[2] = { 60, 100 };
+const int kSpatialLayerBitrate[3] = { 200, 400, 1000 };
+const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 };
+
+class RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, vpx_rc_mode> {
+ public:
+  RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+        encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {}
+
+  ~RcInterfaceTest() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      if (rc_cfg_.is_screen) {
+        encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN);
+      } else {
+        encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT);
+      }
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+      encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
+    }
+    frame_params_.frame_type = video->frame() % key_interval_ == 0
+                                   ? libvpx::RcFrameType::kKeyFrame
+                                   : libvpx::RcFrameType::kInterFrame;
+    if (rc_cfg_.rc_mode == VPX_CBR &&
+        frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
+      // Disable golden frame update.
+      frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+      frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+    }
+    encoder_exit_ = video->frame() == kNumFrames;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    int loopfilter_level, qp;
+    encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) {
+      ASSERT_EQ(rc_api_->GetQP(), qp);
+      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);
+    } else {
+      num_drops_++;
+    }
+  }
+
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_);
+  }
+
+  void RunOneLayer() {
+    SetConfig(GET_PARAM(2));
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerScreen() {
+    SetConfig(GET_PARAM(2));
+    rc_cfg_.is_screen = true;
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunOneLayerDropFramesCBR() {
+    if (GET_PARAM(2) != VPX_CBR) {
+      GTEST_SKIP() << "Frame dropping is only for CBR mode.";
+    }
+    frame_drop_thresh_ = 30;
+    SetConfig(GET_PARAM(2));
+    // Use lower bitrate, lower max-q, and enable frame dropper.
+    rc_cfg_.target_bandwidth = 200;
+    cfg_.rc_target_bitrate = 200;
+    rc_cfg_.max_quantizer = 50;
+    cfg_.rc_max_quantizer = 50;
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+  void RunOneLayerVBRPeriodicKey() {
+    if (GET_PARAM(2) != VPX_VBR) return;
+    key_interval_ = 100;
+    SetConfig(VPX_VBR);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    frame_params_.spatial_layer_id = 0;
+    frame_params_.temporal_layer_id = 0;
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  void SetConfig(vpx_rc_mode rc_mode) {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.max_quantizer = 52;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = 1000;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.ss_number_layers = 1;
+    rc_cfg_.ts_number_layers = 1;
+    rc_cfg_.scaling_factor_num[0] = 1;
+    rc_cfg_.scaling_factor_den[0] = 1;
+    rc_cfg_.layer_target_bitrate[0] = 1000;
+    rc_cfg_.max_quantizers[0] = 52;
+    rc_cfg_.min_quantizers[0] = 2;
+    rc_cfg_.rc_mode = rc_mode;
+    rc_cfg_.aq_mode = aq_mode_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = 1280;
+    cfg_.g_h = 720;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 52;
+    cfg_.rc_end_usage = rc_mode;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+  }
+
+  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
+  libvpx::VP9RateControlRtcConfig rc_cfg_;
+  int aq_mode_;
+  int key_interval_;
+  libvpx::VP9FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int frame_drop_thresh_;
+  int num_drops_;
+};
+
+class RcInterfaceSvcTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, bool> {
+ public:
+  RcInterfaceSvcTest()
+      : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000),
+        dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)),
+        parallel_spatial_layers_(false), frame_drop_thresh_(0),
+        max_consec_drop_(INT_MAX), num_drops_(0) {}
+  ~RcInterfaceSvcTest() override = default;
+
+ protected:
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      current_superframe_ = 0;
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, 0);
+      encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900);
+      encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
+      encoder->Control(VP9E_SET_SVC, 1);
+      encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
+      if (inter_layer_pred_off_) {
+        encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED,
+                         INTER_LAYER_PRED_OFF_NONKEY);
+      }
+      if (frame_drop_thresh_ > 0) {
+        vpx_svc_frame_drop_t svc_drop_frame;
+        svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP;
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl)
+          svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_;
+        svc_drop_frame.max_consec_drop = max_consec_drop_;
+        encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame);
+      }
+    }
+    frame_params_.frame_type = video->frame() % key_interval_ == 0
+                                   ? libvpx::RcFrameType::kKeyFrame
+                                   : libvpx::RcFrameType::kInterFrame;
+    encoder_exit_ = video->frame() == kNumFrames;
+    if (dynamic_spatial_layers_ == 1) {
+      if (video->frame() == 100) {
+        // Go down to 2 spatial layers: set top SL to 0 bitrate.
+        // Update the encoder config.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+        cfg_.layer_target_bitrate[6] = 0;
+        cfg_.layer_target_bitrate[7] = 0;
+        cfg_.layer_target_bitrate[8] = 0;
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8];
+        rc_cfg_.layer_target_bitrate[6] = 0;
+        rc_cfg_.layer_target_bitrate[7] = 0;
+        rc_cfg_.layer_target_bitrate[8] = 0;
+        ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+      } else if (video->frame() == 200) {
+        // Go down to 1 spatial layer.
+        // Update the encoder config.
+        cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5];
+        cfg_.layer_target_bitrate[3] = 0;
+        cfg_.layer_target_bitrate[4] = 0;
+        cfg_.layer_target_bitrate[5] = 0;
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5];
+        rc_cfg_.layer_target_bitrate[3] = 0;
+        rc_cfg_.layer_target_bitrate[4] = 0;
+        rc_cfg_.layer_target_bitrate[5] = 0;
+        ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+      } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) {
+        // TODO(marpan): Re-enable this going back up when issue is fixed.
+        // Go back up to 3 spatial layers.
+        // Update the encoder config: use the original bitrates.
+        SetEncoderConfigSvc(3, 3);
+        encoder->Config(&cfg_);
+        // Update the RC config.
+        SetRCConfigSvc(3, 3);
+        ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));
+      }
+    }
+  }
+
+  virtual void SetFrameParamsSvc(int sl) {
+    frame_params_.spatial_layer_id = sl;
+    if (rc_cfg_.ts_number_layers == 3)
+      frame_params_.temporal_layer_id =
+          kTemporalId3Layer[current_superframe_ % 4];
+    else if (rc_cfg_.ts_number_layers == 2)
+      frame_params_.temporal_layer_id =
+          kTemporalId2Layer[current_superframe_ % 2];
+    else
+      frame_params_.temporal_layer_id = 0;
+    frame_params_.frame_type =
+        current_superframe_ % key_interval_ == 0 && sl == 0
+            ? libvpx::RcFrameType::kKeyFrame
+            : libvpx::RcFrameType::kInterFrame;
+  }
+
+  void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
+    if (encoder_exit_) {
+      return;
+    }
+    int superframe_is_dropped = false;
+    ::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0;
+    std::vector<int> rc_qp;
+    // For FULL_SUPERFRAME_DROP: the full superframe drop decision is
+    // determined on the base spatial layer.
+    SetFrameParamsSvc(0);
+    if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) {
+      superframe_is_dropped = true;
+      num_drops_++;
+    }
+    while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
+      ASSERT_EQ(superframe_is_dropped, false);
+      ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                           pkt->data.frame.sz);
+      if (!parallel_spatial_layers_ || current_superframe_ == 0) {
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+          if (sizes_[sl] > 0) {
+            SetFrameParamsSvc(sl);
+            // For sl=0 ComputeQP() is already called above (line 310).
+            if (sl > 0) rc_api_->ComputeQP(frame_params_);
+            rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_);
+            rc_qp.push_back(rc_api_->GetQP());
+          }
+        }
+      } else {
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+          // For sl=0 ComputeQP() is already called above (line 310).
+          if (sizes_[sl] > 0 && sl > 0) {
+            SetFrameParamsSvc(sl);
+            rc_api_->ComputeQP(frame_params_);
+          }
+        }
+        for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
+          if (sizes_[sl] > 0) {
+            SetFrameParamsSvc(sl);
+            rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_);
+            rc_qp.push_back(rc_api_->GetQP());
+          }
+        }
+      }
+    }
+    if (!superframe_is_dropped) {
+      int loopfilter_level;
+      std::vector<int> encoder_qp(VPX_SS_MAX_LAYERS, 0);
+      encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level);
+      encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data());
+      encoder_qp.resize(rc_qp.size());
+      ASSERT_EQ(rc_qp, encoder_qp);
+      ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level);
+      current_superframe_++;
+    }
+  }
+  // This method needs to be overridden because non-reference frames are
+  // expected to be mismatched frames as the encoder will avoid loopfilter on
+  // these frames.
+  void MismatchHook(const vpx_image_t * /*img1*/,
+                    const vpx_image_t * /*img2*/) override {}
+
+  void RunSvc() {
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvcDropFramesCBR() {
+    max_consec_drop_ = 10;
+    frame_drop_thresh_ = 30;
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    // Check that some frames were dropped, otherwise test has no value.
+    ASSERT_GE(num_drops_, 1);
+  }
+
+  void RunSvcPeriodicKey() {
+    SetRCConfigSvc(3, 3);
+    key_interval_ = 100;
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvcDynamicSpatial() {
+    dynamic_spatial_layers_ = 1;
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunSvcParallelSpatialLayers() {
+    if (!inter_layer_pred_off_) return;
+    parallel_spatial_layers_ = true;
+    SetRCConfigSvc(3, 3);
+    rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+    SetEncoderConfigSvc(3, 3);
+
+    ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+                                         1280, 720, 30, 1, 0, kNumFrames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) {
+    uint8_t marker = *(data + data_sz - 1);
+    if ((marker & 0xe0) == 0xc0) {
+      const uint32_t frames = (marker & 0x7) + 1;
+      const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+      const size_t index_sz = 2 + mag * frames;
+      // This chunk is marked as having a superframe index but doesn't have
+      // enough data for it, thus it's an invalid superframe index.
+      if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME;
+      {
+        const uint8_t marker2 = *(data + data_sz - index_sz);
+        // This chunk is marked as having a superframe index but doesn't have
+        // the matching marker byte at the front of the index therefore it's an
+        // invalid chunk.
+        if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME;
+      }
+      const uint8_t *x = &data[data_sz - index_sz + 1];
+      for (uint32_t i = 0; i < frames; ++i) {
+        uint32_t this_sz = 0;
+
+        for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8);
+        sizes_[i] = this_sz;
+      }
+    }
+    return VPX_CODEC_OK;
+  }
+
+  void SetEncoderConfigSvc(int number_spatial_layers,
+                           int number_temporal_layers) {
+    cfg_.g_w = 1280;
+    cfg_.g_h = 720;
+    cfg_.ss_number_layers = number_spatial_layers;
+    cfg_.ts_number_layers = number_temporal_layers;
+    cfg_.g_timebase.num = 1;
+    cfg_.g_timebase.den = 30;
+    if (number_spatial_layers == 3) {
+      svc_params_.scaling_factor_num[0] = 1;
+      svc_params_.scaling_factor_den[0] = 4;
+      svc_params_.scaling_factor_num[1] = 2;
+      svc_params_.scaling_factor_den[1] = 4;
+      svc_params_.scaling_factor_num[2] = 4;
+      svc_params_.scaling_factor_den[2] = 4;
+    } else if (number_spatial_layers == 2) {
+      svc_params_.scaling_factor_num[0] = 1;
+      svc_params_.scaling_factor_den[0] = 2;
+      svc_params_.scaling_factor_num[1] = 2;
+      svc_params_.scaling_factor_den[1] = 2;
+    } else if (number_spatial_layers == 1) {
+      svc_params_.scaling_factor_num[0] = 1;
+      svc_params_.scaling_factor_den[0] = 1;
+    }
+
+    for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
+      svc_params_.max_quantizers[i] = 56;
+      svc_params_.min_quantizers[i] = 2;
+      svc_params_.speed_per_layer[i] = 7;
+      svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL;
+    }
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    if (number_temporal_layers == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.temporal_layering_mode = 3;
+    } else if (number_temporal_layers == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.temporal_layering_mode = 2;
+    } else if (number_temporal_layers == 1) {
+      cfg_.ts_rate_decimator[0] = 1;
+      cfg_.temporal_layering_mode = 0;
+    }
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.g_threads = 1;
+    cfg_.kf_max_dist = 9999;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_dropframe_thresh = frame_drop_thresh_;
+
+    cfg_.rc_target_bitrate = 0;
+    for (int sl = 0; sl < number_spatial_layers; sl++) {
+      int spatial_bitrate = 0;
+      if (number_spatial_layers <= 3)
+        spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl]
+                                                 : kSpatialLayerBitrate[sl];
+      for (int tl = 0; tl < number_temporal_layers; tl++) {
+        int layer = sl * number_temporal_layers + tl;
+        if (number_temporal_layers == 3)
+          cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 2)
+          cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 1)
+          cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+      }
+      cfg_.rc_target_bitrate += spatial_bitrate;
+    }
+
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+  }
+
+  void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) {
+    rc_cfg_.width = 1280;
+    rc_cfg_.height = 720;
+    rc_cfg_.ss_number_layers = number_spatial_layers;
+    rc_cfg_.ts_number_layers = number_temporal_layers;
+    rc_cfg_.max_quantizer = 56;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.buf_initial_sz = 500;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = 1000;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 900;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.rc_mode = VPX_CBR;
+    rc_cfg_.aq_mode = aq_mode_;
+    rc_cfg_.frame_drop_thresh = frame_drop_thresh_;
+    rc_cfg_.max_consec_drop = max_consec_drop_;
+
+    if (number_spatial_layers == 3) {
+      rc_cfg_.scaling_factor_num[0] = 1;
+      rc_cfg_.scaling_factor_den[0] = 4;
+      rc_cfg_.scaling_factor_num[1] = 2;
+      rc_cfg_.scaling_factor_den[1] = 4;
+      rc_cfg_.scaling_factor_num[2] = 4;
+      rc_cfg_.scaling_factor_den[2] = 4;
+    } else if (number_spatial_layers == 2) {
+      rc_cfg_.scaling_factor_num[0] = 1;
+      rc_cfg_.scaling_factor_den[0] = 2;
+      rc_cfg_.scaling_factor_num[1] = 2;
+      rc_cfg_.scaling_factor_den[1] = 2;
+    } else if (number_spatial_layers == 1) {
+      rc_cfg_.scaling_factor_num[0] = 1;
+      rc_cfg_.scaling_factor_den[0] = 1;
+    }
+
+    if (number_temporal_layers == 3) {
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    } else if (number_temporal_layers == 2) {
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (number_temporal_layers == 1) {
+      rc_cfg_.ts_rate_decimator[0] = 1;
+    }
+
+    rc_cfg_.target_bandwidth = 0;
+    for (int sl = 0; sl < number_spatial_layers; sl++) {
+      int spatial_bitrate = 0;
+      if (number_spatial_layers <= 3)
+        spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl]
+                                                 : kSpatialLayerBitrate[sl];
+      for (int tl = 0; tl < number_temporal_layers; tl++) {
+        int layer = sl * number_temporal_layers + tl;
+        if (number_temporal_layers == 3)
+          rc_cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 2)
+          rc_cfg_.layer_target_bitrate[layer] =
+              kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+        else if (number_temporal_layers == 1)
+          rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+      }
+      rc_cfg_.target_bandwidth += spatial_bitrate;
+    }
+
+    for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
+      for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
+        const int i = sl * rc_cfg_.ts_number_layers + tl;
+        rc_cfg_.max_quantizers[i] = 56;
+        rc_cfg_.min_quantizers[i] = 2;
+      }
+    }
+  }
+
+  int aq_mode_;
+  std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_;
+  libvpx::VP9RateControlRtcConfig rc_cfg_;
+  vpx_svc_extra_cfg_t svc_params_;
+  libvpx::VP9FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+  int current_superframe_;
+  uint32_t sizes_[8];
+  int key_interval_;
+  int dynamic_spatial_layers_;
+  bool inter_layer_pred_off_;
+  // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC.
+  bool parallel_spatial_layers_;
+  int frame_drop_thresh_;
+  int max_consec_drop_;
+  int num_drops_;
+};
+
+TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); }
+
+TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); }
+
+TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); }
+
+TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); }
+
+TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) {
+  RunSvcParallelSpatialLayers();
+}
+
+TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); }
+
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
+                           ::testing::Values(VPX_CBR, VPX_VBR));
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3),
+                           ::testing::Values(true, false));
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_roi_test.cc b/media/libvpx/libvpx/test/vp9_roi_test.cc
new file mode 100644
index 0000000000..9427796a2b
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_roi_test.cc
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#define MASK_WIDTH 40
+#define MASK_HEIGHT 30
+#define MASK_SIZE MASK_WIDTH *MASK_HEIGHT
+
+namespace {
+
+const int mask[MASK_SIZE] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0
+};
+
+class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest,
+                              public ::testing::Test {
+ protected:
+  RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {}
+  ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); }
+
+  void SetUp() override {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    SetRoi();
+  }
+
+  void SetRoi() {
+    const int block_size = 8;
+    unsigned int i, j;
+    roi_.rows = (cfg_.g_h + block_size - 1) / block_size;
+    roi_.cols = (cfg_.g_w + block_size - 1) / block_size;
+    memset(&roi_.skip, 0, sizeof(roi_.skip));
+    memset(&roi_.delta_q, 0, sizeof(roi_.delta_q));
+    memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf));
+    memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+    roi_.ref_frame[1] = 1;
+    // Use segment 3 for skip.
+    roi_.skip[3] = 1;
+    roi_.roi_map =
+        (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+    for (i = 0; i < roi_.rows; ++i) {
+      for (j = 0; j < roi_.cols; ++j) {
+        const int idx = i * roi_.cols + j;
+        if (mask[idx] == 1) roi_.roi_map[idx] = 3;
+      }
+    }
+  }
+
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
+    }
+    encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+  }
+
+ private:
+  vpx_roi_map_t roi_;
+};
+
+TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30,
+                                       1, 0, 150);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+}  // namespace
diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc
new file mode 100644
index 0000000000..53b38d22aa
--- /dev/null
+++ b/media/libvpx/libvpx/test/vp9_scale_test.cc
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/vpx_scale_test.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/yv12config.h"
+
+namespace libvpx_test {
+
+using ScaleFrameFunc = void (*)(const YV12_BUFFER_CONFIG *src,
+                                YV12_BUFFER_CONFIG *dst,
+                                INTERP_FILTER filter_type, int phase_scaler);
+
+class ScaleTest : public VpxScaleBase,
+                  public ::testing::TestWithParam<ScaleFrameFunc> {
+ public:
+  ~ScaleTest() override = default;
+
+ protected:
+  void SetUp() override { scale_fn_ = GetParam(); }
+
+  void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
+    vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler);
+  }
+
+  void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) {
+    ASM_REGISTER_STATE_CHECK(
+        scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
+  }
+
+  void RunTest(INTERP_FILTER filter_type) {
+    static const int kNumSizesToTest = 22;
+    static const int kNumScaleFactorsToTest = 4;
+    static const int kSizesToTest[] = { 1,  2,  3,  4,  6,   8,  10, 12,
+                                        14, 16, 18, 20, 22,  24, 26, 28,
+                                        30, 32, 34, 68, 128, 134 };
+    static const int kScaleFactors[] = { 1, 2, 3, 4 };
+    for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
+      for (int h = 0; h < kNumSizesToTest; ++h) {
+        const int src_height = kSizesToTest[h];
+        for (int w = 0; w < kNumSizesToTest; ++w) {
+          const int src_width = kSizesToTest[w];
+          for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
+               ++sf_up_idx) {
+            const int sf_up = kScaleFactors[sf_up_idx];
+            for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+                 ++sf_down_idx) {
+              const int sf_down = kScaleFactors[sf_down_idx];
+              const int dst_width = src_width * sf_up / sf_down;
+              const int dst_height = src_height * sf_up / sf_down;
+              if (sf_up == sf_down && sf_up != 1) {
+                continue;
+              }
+              // I420 frame width and height must be even.
+              if (!dst_width || !dst_height || dst_width & 1 ||
+                  dst_height & 1) {
+                continue;
+              }
+              // vpx_convolve8_c() has restriction on the step which cannot
+              // exceed 64 (ratio 1 to 4).
+              if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
+                continue;
+              }
+              ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height,
+                                                       dst_width, dst_height));
+              ReferenceScaleFrame(filter_type, phase_scaler);
+              ScaleFrame(filter_type, phase_scaler);
+              if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+                         ref_img_.frame_size)) {
+                printf(
+                    "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+                    "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+                    "scale factor = %d:%d\n",
+                    filter_type, phase_scaler, src_width, src_height, dst_width,
+                    dst_height, sf_down, sf_up);
+                PrintDiff();
+              }
+              CompareImages(dst_img_);
+              DeallocScaleImages();
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt,
+                          const int stride, const int width, const int height,
+                          const int plane_idx) const {
+    for (int y = 0; y < height; y++) {
+      for (int x = 0; x < width; x++) {
+        if (ref[y * stride + x] != opt[y * stride + x]) {
+          printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx,
+                 y, x, ref[y * stride + x], opt[y * stride + x]);
+          break;
+        }
+      }
+    }
+  }
+
+  void PrintDiff() const {
+    assert(ref_img_.y_stride == dst_img_.y_stride);
+    assert(ref_img_.y_width == dst_img_.y_width);
+    assert(ref_img_.y_height == dst_img_.y_height);
+    assert(ref_img_.uv_stride == dst_img_.uv_stride);
+    assert(ref_img_.uv_width == dst_img_.uv_width);
+    assert(ref_img_.uv_height == dst_img_.uv_height);
+
+    if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
+               ref_img_.frame_size)) {
+      PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer,
+                         ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height,
+                         0);
+      PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer,
+                         ref_img_.uv_stride, ref_img_.uv_width,
+                         ref_img_.uv_height, 1);
+      PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer,
+                         ref_img_.uv_stride, ref_img_.uv_width,
+                         ref_img_.uv_height, 2);
+    }
+  }
+
+  ScaleFrameFunc scale_fn_;
+};
+
+TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); }
+TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); }
+TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); }
+TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); }
+
+TEST_P(ScaleTest, DISABLED_Speed) {
+  static const int kCountSpeedTestBlock = 100;
+  static const int kNumScaleFactorsToTest = 4;
+  static const int kScaleFactors[] = { 1, 2, 3, 4 };
+  const int src_width = 1280;
+  const int src_height = 720;
+  for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) {
+    for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) {
+      for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) {
+        const int sf_up = kScaleFactors[sf_up_idx];
+        for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+             ++sf_down_idx) {
+          const int sf_down = kScaleFactors[sf_down_idx];
+          const int dst_width = src_width * sf_up / sf_down;
+          const int dst_height = src_height * sf_up / sf_down;
+          if (sf_up == sf_down && sf_up != 1) {
+            continue;
+          }
+          // I420 frame width and height must be even.
+          if (dst_width & 1 || dst_height & 1) {
+            continue;
+          }
+          ASSERT_NO_FATAL_FAILURE(
+              ResetScaleImages(src_width, src_height, dst_width, dst_height));
+          ASM_REGISTER_STATE_CHECK(
+              ReferenceScaleFrame(filter_type, phase_scaler));
+
+          vpx_usec_timer timer;
+          vpx_usec_timer_start(&timer);
+          for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+            ScaleFrame(filter_type, phase_scaler);
+          }
+          libvpx_test::ClearSystemState();
+          vpx_usec_timer_mark(&timer);
+          const int elapsed_time =
+              static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+          CompareImages(dst_img_);
+          DeallocScaleImages();
+
+          printf(
+              "filter_type = %d, phase_scaler = %d, src_width = %4d, "
+              "src_height = %4d, dst_width = %4d, dst_height = %4d, "
+              "scale factor = %d:%d, scale time: %5d ms\n",
+              filter_type, phase_scaler, src_width, src_height, dst_width,
+              dst_height, sf_down, sf_up, elapsed_time);
+        }
+      }
+    }
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(C, ScaleTest,
+                         ::testing::Values(vp9_scale_and_extend_frame_c));
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_SUITE_P(SSSE3, ScaleTest,
+                         ::testing::Values(vp9_scale_and_extend_frame_ssse3));
+#endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, ScaleTest,
+                         ::testing::Values(vp9_scale_and_extend_frame_neon));
+#endif  // HAVE_NEON
+
+}  // namespace libvpx_test
diff --git a/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc b/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc
index e847bbddf3..c080a2caae 100644
--- a/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc
+++ b/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc
@@ -24,33 +24,42 @@ const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5";
 // Class for testing shutting off the loop filter.
 class SkipLoopFilterTest {
  public:
-  SkipLoopFilterTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {}
+  SkipLoopFilterTest()
+      : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}
 
   ~SkipLoopFilterTest() {
-    if (md5_file_ != NULL) fclose(md5_file_);
+    if (md5_file_ != nullptr) fclose(md5_file_);
     delete decoder_;
     delete video_;
   }
 
   // If |threads| > 0 then set the decoder with that number of threads.
-  void Init(int num_threads) {
+  bool Init(int num_threads) {
     expected_md5_[0] = '\0';
     junk_[0] = '\0';
     video_ = new libvpx_test::WebMVideoSource(kVp9TestFile);
-    ASSERT_TRUE(video_ != NULL);
+    if (video_ == nullptr) {
+      EXPECT_NE(video_, nullptr);
+      return false;
+    }
     video_->Init();
     video_->Begin();
 
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     if (num_threads > 0) cfg.threads = num_threads;
     decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
-    ASSERT_TRUE(decoder_ != NULL);
+    if (decoder_ == nullptr) {
+      EXPECT_NE(decoder_, nullptr);
+      return false;
+    }
 
     OpenMd5File(kVp9Md5File);
+    return !::testing::Test::HasFailure();
   }
 
   // Set the VP9 skipLoopFilter control value.
   void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) {
+    ASSERT_NE(decoder_, nullptr);
     decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value);
   }
 
@@ -65,7 +74,7 @@ class SkipLoopFilterTest {
   }
 
   vpx_codec_err_t DecodeRemainingFrames() {
-    for (; video_->cxdata() != NULL; video_->Next()) {
+    for (; video_->cxdata() != nullptr; video_->Next()) {
       const vpx_codec_err_t res =
           decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
       if (res != VPX_CODEC_OK) return res;
@@ -85,13 +94,13 @@ class SkipLoopFilterTest {
   // TODO(fgalligan): Move the MD5 testing code into another class.
   void OpenMd5File(const std::string &md5_file_name) {
     md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name);
-    ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: "
-                                   << md5_file_name;
+    ASSERT_NE(md5_file_, nullptr)
+        << "MD5 file open failed. Filename: " << md5_file_name;
   }
 
   // Reads the next line of the MD5 file.
   void ReadMd5() {
-    ASSERT_TRUE(md5_file_ != NULL);
+    ASSERT_NE(md5_file_, nullptr);
     const int res = fscanf(md5_file_, "%s  %s", expected_md5_, junk_);
     ASSERT_NE(EOF, res) << "Read md5 data failed";
     expected_md5_[32] = '\0';
@@ -121,7 +130,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilter) {
   const int non_zero_value = 1;
   const int num_threads = 0;
   SkipLoopFilterTest skip_loop_filter;
-  skip_loop_filter.Init(num_threads);
+  ASSERT_TRUE(skip_loop_filter.Init(num_threads));
   skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
   ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
   skip_loop_filter.CheckMd5(false);
@@ -131,7 +140,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) {
   const int non_zero_value = 1;
   const int num_threads = 1;
   SkipLoopFilterTest skip_loop_filter;
-  skip_loop_filter.Init(num_threads);
+  ASSERT_TRUE(skip_loop_filter.Init(num_threads));
   skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
   ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
   skip_loop_filter.CheckMd5(false);
@@ -141,7 +150,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) {
   const int non_zero_value = 1;
   const int num_threads = 8;
   SkipLoopFilterTest skip_loop_filter;
-  skip_loop_filter.Init(num_threads);
+  ASSERT_TRUE(skip_loop_filter.Init(num_threads));
   skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
   ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
   skip_loop_filter.CheckMd5(false);
@@ -151,7 +160,7 @@ TEST(SkipLoopFilterTest, WithLoopFilter) {
   const int non_zero_value = 1;
   const int num_threads = 0;
   SkipLoopFilterTest skip_loop_filter;
-  skip_loop_filter.Init(num_threads);
+  ASSERT_TRUE(skip_loop_filter.Init(num_threads));
   skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK);
   skip_loop_filter.SetSkipLoopFilter(0, VPX_CODEC_OK);
   ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames());
@@ -161,7 +170,7 @@ TEST(SkipLoopFilterTest, WithLoopFilter) {
 TEST(SkipLoopFilterTest, ToggleLoopFilter) {
   const int num_threads = 0;
   SkipLoopFilterTest skip_loop_filter;
-  skip_loop_filter.Init(num_threads);
+  ASSERT_TRUE(skip_loop_filter.Init(num_threads));
 
   for (int i = 0; i < 10; ++i) {
     skip_loop_filter.SetSkipLoopFilter(i % 2, VPX_CODEC_OK);
diff --git a/media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh b/media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh
deleted file mode 100644
index 65031073f8..0000000000
--- a/media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/sh
-##
-##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-##  This file tests the libvpx vp9_spatial_svc_encoder example. To add new
-##  tests to to this file, do the following:
-##    1. Write a shell function (this is your test).
-##    2. Add the function to vp9_spatial_svc_tests (on a new line).
-##
-. $(dirname $0)/tools_common.sh
-
-# Environment check: $YUV_RAW_INPUT is required.
-vp9_spatial_svc_encoder_verify_environment() {
-  if [ ! -e "${YUV_RAW_INPUT}" ]; then
-    echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
-    return 1
-  fi
-}
-
-# Runs vp9_spatial_svc_encoder. $1 is the test name.
-vp9_spatial_svc_encoder() {
-  local readonly \
-    encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}"
-  local readonly test_name="$1"
-  local readonly \
-    output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf"
-  local readonly frames_to_encode=10
-  local readonly max_kf=9999
-
-  shift
-
-  if [ ! -x "${encoder}" ]; then
-    elog "${encoder} does not exist or is not executable."
-    return 1
-  fi
-
-  eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \
-    -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \
-    "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull}
-
-  [ -e "${output_file}" ] || return 1
-}
-
-# Each test is run with layer count 1-$vp9_ssvc_test_layers.
-vp9_ssvc_test_layers=5
-
-vp9_spatial_svc() {
-  if [ "$(vp9_encode_available)" = "yes" ]; then
-    local readonly test_name="vp9_spatial_svc"
-    for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
-      vp9_spatial_svc_encoder "${test_name}" -sl ${layers}
-    done
-  fi
-}
-
-readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i
-                                DISABLED_vp9_spatial_svc_mode_altip
-                                DISABLED_vp9_spatial_svc_mode_ip
-                                DISABLED_vp9_spatial_svc_mode_gf
-                                vp9_spatial_svc"
-
-if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then
-  run_tests \
-    vp9_spatial_svc_encoder_verify_environment \
-    "${vp9_spatial_svc_tests}"
-fi
diff --git a/media/libvpx/libvpx/test/vp9_subtract_test.cc b/media/libvpx/libvpx/test/vp9_subtract_test.cc
index 19ed304315..19234c8c6b 100644
--- a/media/libvpx/libvpx/test/vp9_subtract_test.cc
+++ b/media/libvpx/libvpx/test/vp9_subtract_test.cc
@@ -7,98 +7,315 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <cstdio>
+#include <tuple>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/util.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
 
-typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
-                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
-                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                             ptrdiff_t pred_stride);
+using SubtractFunc = void (*)(int rows, int cols, int16_t *diff_ptr,
+                              ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                              ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                              ptrdiff_t pred_stride);
 
 namespace vp9 {
 
-class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+class VP9SubtractBlockTest : public AbstractBench,
+                             public ::testing::TestWithParam<SubtractFunc> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void Run() override {
+    GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+               block_width_, pred_, block_width_);
+  }
+
+  void SetupBlocks(BLOCK_SIZE bsize) {
+    block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize];
+    block_height_ = 4 * num_4x4_blocks_high_lookup[bsize];
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2));
+    pred_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width_ * block_height_ * 2));
+    src_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width_ * block_height_ * 2));
+  }
+
+  int block_width_;
+  int block_height_;
+  int16_t *diff_;
+  uint8_t *pred_;
+  uint8_t *src_;
 };
 
 using libvpx_test::ACMRandom;
 
-TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+TEST_P(VP9SubtractBlockTest, DISABLED_Speed) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
-  // FIXME(rbultje) split in its own file
   for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
        bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
-    const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
-    const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
-    int16_t *diff = reinterpret_cast<int16_t *>(
-        vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
-    uint8_t *pred = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
-    uint8_t *src = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
+    SetupBlocks(bsize);
 
-    for (int n = 0; n < 100; n++) {
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width * 2; ++c) {
-          src[r * block_width * 2 + c] = rnd.Rand8();
-          pred[r * block_width * 2 + c] = rnd.Rand8();
-        }
-      }
+    RunNTimes(100000000 / (block_height_ * block_width_));
+    char block_size[16];
+    snprintf(block_size, sizeof(block_size), "%dx%d", block_height_,
+             block_width_);
+    char title[100];
+    snprintf(title, sizeof(title), "%8s ", block_size);
+    PrintMedian(title);
 
-      GetParam()(block_height, block_width, diff, block_width, src, block_width,
-                 pred, block_width);
-
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(diff[r * block_width + c],
-                    (src[r * block_width + c] - pred[r * block_width + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
-        }
-      }
-
-      GetParam()(block_height, block_width, diff, block_width * 2, src,
-                 block_width * 2, pred, block_width * 2);
-
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(
-              diff[r * block_width * 2 + c],
-              (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
-        }
-      }
-    }
-    vpx_free(diff);
-    vpx_free(pred);
-    vpx_free(src);
+    vpx_free(diff_);
+    vpx_free(pred_);
+    vpx_free(src_);
   }
 }
 
-INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_c));
+TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+    SetupBlocks(bsize);
+
+    for (int n = 0; n < 100; n++) {
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_ * 2; ++c) {
+          src_[r * block_width_ * 2 + c] = rnd.Rand8();
+          pred_[r * block_width_ * 2 + c] = rnd.Rand8();
+        }
+      }
+
+      GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+                 block_width_, pred_, block_width_);
+
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_; ++c) {
+          EXPECT_EQ(diff_[r * block_width_ + c],
+                    (src_[r * block_width_ + c] - pred_[r * block_width_ + c]))
+              << "r = " << r << ", c = " << c
+              << ", bs = " << static_cast<int>(bsize);
+        }
+      }
+
+      GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_,
+                 block_width_ * 2, pred_, block_width_ * 2);
+
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_; ++c) {
+          EXPECT_EQ(diff_[r * block_width_ * 2 + c],
+                    (src_[r * block_width_ * 2 + c] -
+                     pred_[r * block_width_ * 2 + c]))
+              << "r = " << r << ", c = " << c
+              << ", bs = " << static_cast<int>(bsize);
+        }
+      }
+    }
+    vpx_free(diff_);
+    vpx_free(pred_);
+    vpx_free(src_);
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_c));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_sse2));
+INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_sse2));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_avx2));
 #endif
 #if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_neon));
+INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_neon));
 #endif
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_msa));
+INSTANTIATE_TEST_SUITE_P(MSA, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_msa));
 #endif
 
+#if HAVE_MMI
+INSTANTIATE_TEST_SUITE_P(MMI, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_mmi));
+#endif
+
+#if HAVE_VSX
+INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_vsx));
+#endif
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_lsx));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+using HBDSubtractFunc = void (*)(int rows, int cols, int16_t *diff_ptr,
+                                 ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                                 ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                                 ptrdiff_t pred_stride, int bd);
+
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>;
+
+class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+  void SetUp() override {
+    block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)];
+    block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)];
+    bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1));
+    func_ = GET_PARAM(2);
+    ref_func_ = GET_PARAM(3);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    constexpr size_t kMaxWidth = 128;
+    constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth;
+    src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+    ASSERT_NE(src_, nullptr);
+    pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+    ASSERT_NE(pred_, nullptr);
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, kMaxBlockSize * sizeof(int16_t)));
+    ASSERT_NE(diff_, nullptr);
+  }
+
+  void TearDown() override {
+    vpx_free(CONVERT_TO_SHORTPTR(src_));
+    vpx_free(CONVERT_TO_SHORTPTR(pred_));
+    vpx_free(diff_);
+  }
+
+ protected:
+  void CheckResult();
+  void RunForSpeed();
+
+ private:
+  ACMRandom rnd_;
+  int block_height_;
+  int block_width_;
+  vpx_bit_depth_t bit_depth_;
+  HBDSubtractFunc func_;
+  HBDSubtractFunc ref_func_;
+  uint8_t *src_;
+  uint8_t *pred_;
+  int16_t *diff_;
+};
+
+void VPXHBDSubtractBlockTest::CheckResult() {
+  constexpr int kTestNum = 100;
+  constexpr int kMaxWidth = 128;
+  constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+  const int mask = (1 << bit_depth_) - 1;
+  for (int i = 0; i < kTestNum; ++i) {
+    for (int j = 0; j < kMaxBlockSize; ++j) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+      CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+    }
+
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+
+    for (int r = 0; r < block_height_; ++r) {
+      for (int c = 0; c < block_width_; ++c) {
+        EXPECT_EQ(diff_[r * block_width_ + c],
+                  (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+                   CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+            << "r = " << r << ", c = " << c << ", test: " << i;
+      }
+    }
+  }
+}
+
+TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+void VPXHBDSubtractBlockTest::RunForSpeed() {
+  constexpr int kTestNum = 200000;
+  constexpr int kMaxWidth = 128;
+  constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+  const int mask = (1 << bit_depth_) - 1;
+
+  if (ref_func_ == func_) GTEST_SKIP();
+
+  for (int j = 0; j < kMaxBlockSize; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  vpx_usec_timer ref_timer;
+  vpx_usec_timer_start(&ref_timer);
+  for (int i = 0; i < kTestNum; ++i) {
+    ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+              block_width_, pred_, block_width_, bit_depth_);
+  }
+  vpx_usec_timer_mark(&ref_timer);
+  const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer);
+
+  for (int j = 0; j < kMaxBlockSize; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kTestNum; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+          pred_, block_width_, bit_depth_);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer);
+
+  printf(
+      "[%dx%d]: "
+      "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+      " \t "
+      "gain=%f \n",
+      block_width_, block_height_, ref_elapsed_time, elapsed_time,
+      static_cast<double>(ref_elapsed_time) /
+          static_cast<double>(elapsed_time));
+}
+
+TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
+                                       BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
+                                       BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+                                       BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+                                       BLOCK_64X64 };
+
+INSTANTIATE_TEST_SUITE_P(
+    C, VPXHBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&vpx_highbd_subtract_block_c),
+                       ::testing::Values(&vpx_highbd_subtract_block_c)));
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, VPXHBDSubtractBlockTest,
+    ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+                       ::testing::Values(12),
+                       ::testing::Values(&vpx_highbd_subtract_block_avx2),
+                       ::testing::Values(&vpx_highbd_subtract_block_c)));
+#endif  // HAVE_AVX2
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace vp9
diff --git a/media/libvpx/libvpx/test/vp9_thread_test.cc b/media/libvpx/libvpx/test/vp9_thread_test.cc
index 3e3fd25acb..387bb4050b 100644
--- a/media/libvpx/libvpx/test/vp9_thread_test.cc
+++ b/media/libvpx/libvpx/test/vp9_thread_test.cc
@@ -10,7 +10,7 @@
 
 #include <string>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 #include "./vpx_config.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -26,10 +26,10 @@ using std::string;
 
 class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> {
  protected:
-  virtual ~VPxWorkerThreadTest() {}
-  virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); }
+  ~VPxWorkerThreadTest() override = default;
+  void SetUp() override { vpx_get_worker_interface()->init(&worker_); }
 
-  virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); }
+  void TearDown() override { vpx_get_worker_interface()->end(&worker_); }
 
   void Run(VPxWorker *worker) {
     const bool synchronous = GetParam();
@@ -128,18 +128,18 @@ TEST_P(VPxWorkerThreadTest, EndWithoutSync) {
 }
 
 TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
-  EXPECT_EQ(0, vpx_set_worker_interface(NULL));
-  EXPECT_TRUE(vpx_get_worker_interface() != NULL);
+  EXPECT_EQ(0, vpx_set_worker_interface(nullptr));
+  EXPECT_NE(vpx_get_worker_interface(), nullptr);
   for (int i = 0; i < 6; ++i) {
     VPxWorkerInterface winterface = *vpx_get_worker_interface();
     switch (i) {
       default:
-      case 0: winterface.init = NULL; break;
-      case 1: winterface.reset = NULL; break;
-      case 2: winterface.sync = NULL; break;
-      case 3: winterface.launch = NULL; break;
-      case 4: winterface.execute = NULL; break;
-      case 5: winterface.end = NULL; break;
+      case 0: winterface.init = nullptr; break;
+      case 1: winterface.reset = nullptr; break;
+      case 2: winterface.sync = nullptr; break;
+      case 3: winterface.launch = nullptr; break;
+      case 4: winterface.execute = nullptr; break;
+      case 5: winterface.end = nullptr; break;
     }
     EXPECT_EQ(0, vpx_set_worker_interface(&winterface));
   }
@@ -147,13 +147,7 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
 
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests
-
 #if CONFIG_WEBM_IO
-struct FileList {
-  const char *name;
-  const char *expected_md5;
-};
-
 // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
 string DecodeFile(const string &filename, int num_threads) {
   libvpx_test::WebMVideoSource video(filename);
@@ -173,7 +167,7 @@ string DecodeFile(const string &filename, int num_threads) {
     }
 
     libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img = NULL;
+    const vpx_image_t *img = nullptr;
 
     // Get decompressed data
     while ((img = dec_iter.Next())) {
@@ -183,20 +177,11 @@ string DecodeFile(const string &filename, int num_threads) {
   return string(md5.Get());
 }
 
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 1; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = "
-                                                               << t;
-    }
-  }
-}
-
 // Trivial serialized thread worker interface implementation.
 // Note any worker that requires synchronization between other workers will
 // hang.
 namespace impl {
+namespace {
 
 void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); }
 int Reset(VPxWorker *const /*worker*/) { return 1; }
@@ -209,16 +194,13 @@ void Execute(VPxWorker *const worker) {
 void Launch(VPxWorker *const worker) { Execute(worker); }
 void End(VPxWorker *const /*worker*/) {}
 
+}  // namespace
 }  // namespace impl
 
 TEST(VPxWorkerThreadTest, TestSerialInterface) {
   static const VPxWorkerInterface serial_interface = {
     impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
   };
-  // TODO(jzern): Avoid using a file that will use the row-based thread
-  // loopfilter, with the simple serialized implementation it will hang. This is
-  // due to its expectation that rows will be run in parallel as they wait on
-  // progress in the row above before proceeding.
   static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
   static const char filename[] = "vp90-2-03-size-226x226.webm";
   VPxWorkerInterface default_interface = *vpx_get_worker_interface();
@@ -231,90 +213,83 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) {
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 }
 
-TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) {
-  // no tiles or frame parallel; this exercises loop filter threading.
-  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
-            DecodeFile("vp90-2-03-size-226x226.webm", 2));
+struct FileParam {
+  const char *name;
+  const char *expected_md5;
+  friend std::ostream &operator<<(std::ostream &os, const FileParam &param) {
+    return os << "file name: " << param.name
+              << " digest: " << param.expected_md5;
+  }
+};
+
+class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam<FileParam> {
+};
+
+TEST_P(VP9DecodeMultiThreadedTest, Decode) {
+  for (int t = 1; t <= 8; ++t) {
+    EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t))
+        << "threads = " << t;
+  }
 }
 
-TEST(VP9DecodeMultiThreadedTest, FrameParallel) {
-  static const FileList files[] = { { "vp90-2-08-tile_1x2_frame_parallel.webm",
-                                      "68ede6abd66bae0a2edf2eb9232241b6" },
-                                    { "vp90-2-08-tile_1x4_frame_parallel.webm",
-                                      "368ebc6ebf3a5e478d85b2c3149b2848" },
-                                    { "vp90-2-08-tile_1x8_frame_parallel.webm",
-                                      "17e439da2388aff3a0f69cb22579c6c1" },
-                                    { NULL, NULL } };
+const FileParam kNoTilesNonFrameParallelFiles[] = {
+  { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" }
+};
 
-  DecodeFiles(files);
-}
+const FileParam kFrameParallelFiles[] = {
+  { "vp90-2-08-tile_1x2_frame_parallel.webm",
+    "68ede6abd66bae0a2edf2eb9232241b6" },
+  { "vp90-2-08-tile_1x4_frame_parallel.webm",
+    "368ebc6ebf3a5e478d85b2c3149b2848" },
+  { "vp90-2-08-tile_1x8_frame_parallel.webm",
+    "17e439da2388aff3a0f69cb22579c6c1" },
+};
 
-TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) {
-  static const FileList files[] = {
-    { "vp90-2-14-resize-fp-tiles-1-16.webm",
-      "0cd5e632c326297e975f38949c31ea94" },
-    { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
-      "5c78a96a42e7f4a4f6b2edcdb791e44c" },
-    { "vp90-2-14-resize-fp-tiles-1-2.webm",
-      "e030450ae85c3277be2a418769df98e2" },
-    { "vp90-2-14-resize-fp-tiles-1-4.webm",
-      "312eed4e2b64eb7a4e7f18916606a430" },
-    { "vp90-2-14-resize-fp-tiles-16-1.webm",
-      "1755c16d8af16a9cb3fe7338d90abe52" },
-    { "vp90-2-14-resize-fp-tiles-16-2.webm",
-      "500300592d3fcb6f12fab25e48aaf4df" },
-    { "vp90-2-14-resize-fp-tiles-16-4.webm",
-      "47c48379fa6331215d91c67648e1af6e" },
-    { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
-      "eecf17290739bc708506fa4827665989" },
-    { "vp90-2-14-resize-fp-tiles-16-8.webm",
-      "29b6bb54e4c26b5ca85d5de5fed94e76" },
-    { "vp90-2-14-resize-fp-tiles-1-8.webm",
-      "1b6f175e08cd82cf84bb800ac6d1caa3" },
-    { "vp90-2-14-resize-fp-tiles-2-16.webm",
-      "ca3b03e4197995d8d5444ede7a6c0804" },
-    { "vp90-2-14-resize-fp-tiles-2-1.webm",
-      "99aec065369d70bbb78ccdff65afed3f" },
-    { "vp90-2-14-resize-fp-tiles-2-4.webm",
-      "22d0ebdb49b87d2920a85aea32e1afd5" },
-    { "vp90-2-14-resize-fp-tiles-2-8.webm",
-      "c2115cf051c62e0f7db1d4a783831541" },
-    { "vp90-2-14-resize-fp-tiles-4-16.webm",
-      "c690d7e1719b31367564cac0af0939cb" },
-    { "vp90-2-14-resize-fp-tiles-4-1.webm",
-      "a926020b2cc3e15ad4cc271853a0ff26" },
-    { "vp90-2-14-resize-fp-tiles-4-2.webm",
-      "42699063d9e581f1993d0cf890c2be78" },
-    { "vp90-2-14-resize-fp-tiles-4-8.webm",
-      "7f76d96036382f45121e3d5aa6f8ec52" },
-    { "vp90-2-14-resize-fp-tiles-8-16.webm",
-      "76a43fcdd7e658542913ea43216ec55d" },
-    { "vp90-2-14-resize-fp-tiles-8-1.webm",
-      "8e3fbe89486ca60a59299dea9da91378" },
-    { "vp90-2-14-resize-fp-tiles-8-2.webm",
-      "ae96f21f21b6370cc0125621b441fc52" },
-    { "vp90-2-14-resize-fp-tiles-8-4.webm",
-      "3eb4f24f10640d42218f7fd7b9fd30d4" },
-    { NULL, NULL }
-  };
+const FileParam kFrameParallelResizeFiles[] = {
+  { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" },
+  { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
+    "5c78a96a42e7f4a4f6b2edcdb791e44c" },
+  { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" },
+  { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" },
+  { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" },
+  { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" },
+  { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" },
+  { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
+    "eecf17290739bc708506fa4827665989" },
+  { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" },
+  { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" },
+  { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" },
+  { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" },
+  { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" },
+  { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" },
+  { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" },
+  { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" },
+  { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" },
+  { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" },
+  { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" },
+  { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" },
+  { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" },
+  { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" },
+};
 
-  DecodeFiles(files);
-}
+const FileParam kNonFrameParallelFiles[] = {
+  { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
+  { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
+  { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
+  { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
+  { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
+};
 
-TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) {
-  static const FileList files[] = {
-    { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
-    { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
-    { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
-    { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
-    { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
-    { NULL, NULL }
-  };
-
-  DecodeFiles(files);
-}
+INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kNoTilesNonFrameParallelFiles));
+INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kFrameParallelFiles));
+INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kFrameParallelResizeFiles));
+INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kNonFrameParallelFiles));
 #endif  // CONFIG_WEBM_IO
 
-INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
+INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
 
 }  // namespace
diff --git a/media/libvpx/libvpx/test/vpx_image_test.cc b/media/libvpx/libvpx/test/vpx_image_test.cc
new file mode 100644
index 0000000000..7c32035ed4
--- /dev/null
+++ b/media/libvpx/libvpx/test/vpx_image_test.cc
@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <climits>
+#include <cstdint>
+
+#include "vpx/vpx_image.h"
+#include "gtest/gtest.h"
+
+TEST(VpxImageTest, VpxImgWrapInvalidAlign) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  vpx_image_t img;
+  // Set img_data and img_data_owner to junk values. vpx_img_wrap() should
+  // not read these values on failure.
+  unsigned char empty[] = "";
+  img.img_data = empty;
+  img.img_data_owner = 1;
+
+  vpx_img_fmt_t format = VPX_IMG_FMT_I444;
+  // 'align' must be a power of 2 but is not. This causes the vpx_img_wrap()
+  // call to fail. The test verifies we do not read the junk values in 'img'.
+  unsigned int align = 31;
+  EXPECT_EQ(vpx_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr);
+}
+
+TEST(VpxImageTest, VpxImgSetRectOverflow) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  vpx_image_t img;
+  vpx_img_fmt_t format = VPX_IMG_FMT_I444;
+  unsigned int align = 32;
+  EXPECT_EQ(vpx_img_wrap(&img, format, kWidth, kHeight, align, buf), &img);
+
+  EXPECT_EQ(vpx_img_set_rect(&img, 0, 0, kWidth, kHeight), 0);
+  // This would result in overflow because -1 is cast to UINT_MAX.
+  EXPECT_NE(vpx_img_set_rect(&img, static_cast<unsigned int>(-1),
+                             static_cast<unsigned int>(-1), kWidth, kHeight),
+            0);
+}
+
+TEST(VpxImageTest, VpxImgAllocNone) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+
+  vpx_image_t img;
+  vpx_img_fmt_t format = VPX_IMG_FMT_NONE;
+  unsigned int align = 32;
+  ASSERT_EQ(vpx_img_alloc(&img, format, kWidth, kHeight, align), nullptr);
+}
+
+TEST(VpxImageTest, VpxImgAllocNv12) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+
+  vpx_image_t img;
+  vpx_img_fmt_t format = VPX_IMG_FMT_NV12;
+  unsigned int align = 32;
+  EXPECT_EQ(vpx_img_alloc(&img, format, kWidth, kHeight, align), &img);
+  EXPECT_EQ(img.stride[VPX_PLANE_U], img.stride[VPX_PLANE_Y]);
+  EXPECT_EQ(img.stride[VPX_PLANE_V], img.stride[VPX_PLANE_U]);
+  EXPECT_EQ(img.planes[VPX_PLANE_V], img.planes[VPX_PLANE_U] + 1);
+  vpx_img_free(&img);
+}
+
+TEST(VpxImageTest, VpxImgAllocHugeWidth) {
+  // The stride (0x80000000 * 2) would overflow unsigned int.
+  vpx_image_t *image =
+      vpx_img_alloc(nullptr, VPX_IMG_FMT_I42016, 0x80000000, 1, 1);
+  ASSERT_EQ(image, nullptr);
+
+  // The stride (0x80000000) would overflow int.
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, 0x80000000, 1, 1);
+  ASSERT_EQ(image, nullptr);
+
+  // The aligned width (UINT_MAX + 1) would overflow unsigned int.
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, UINT_MAX, 1, 1);
+  ASSERT_EQ(image, nullptr);
+
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, 0x7ffffffe, 1, 1);
+  if (image) {
+    vpx_img_free(image);
+  }
+
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, 285245883, 64, 1);
+  if (image) {
+    vpx_img_free(image);
+  }
+
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_NV12, 285245883, 64, 1);
+  if (image) {
+    vpx_img_free(image);
+  }
+
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_YV12, 285245883, 64, 1);
+  if (image) {
+    vpx_img_free(image);
+  }
+
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I42016, 65536, 2, 1);
+  if (image) {
+    uint16_t *y_plane =
+        reinterpret_cast<uint16_t *>(image->planes[VPX_PLANE_Y]);
+    y_plane[0] = 0;
+    y_plane[image->d_w - 1] = 0;
+    vpx_img_free(image);
+  }
+
+  image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I42016, 285245883, 2, 1);
+  if (image) {
+    uint16_t *y_plane =
+        reinterpret_cast<uint16_t *>(image->planes[VPX_PLANE_Y]);
+    y_plane[0] = 0;
+    y_plane[image->d_w - 1] = 0;
+    vpx_img_free(image);
+  }
+}
diff --git a/media/libvpx/libvpx/test/vpx_scale_test.cc b/media/libvpx/libvpx/test/vpx_scale_test.cc
index 81773fe5b6..a5307ace93 100644
--- a/media/libvpx/libvpx/test/vpx_scale_test.cc
+++ b/media/libvpx/libvpx/test/vpx_scale_test.cc
@@ -8,181 +8,51 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/vpx_scale_test.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/yv12config.h"
 
+namespace libvpx_test {
 namespace {
 
-typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
-typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf,
-                              YV12_BUFFER_CONFIG *dst_ybf);
+#if VPX_ARCH_ARM || (VPX_ARCH_MIPS && !HAVE_MIPS64) || VPX_ARCH_X86
+// Avoid OOM failures on 32-bit platforms.
+const int kNumSizesToTest = 7;
+#else
+const int kNumSizesToTest = 8;
+#endif
+const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 3840, 16383 };
 
-class VpxScaleBase {
- public:
-  virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); }
-
-  void ResetImage(int width, int height) {
-    width_ = width;
-    height_ = height;
-    memset(&img_, 0, sizeof(img_));
-    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
-                                             VP8BORDERINPIXELS));
-    memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
-    FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
-              img_.y_stride);
-    FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
-              img_.uv_stride);
-    FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
-              img_.uv_stride);
-
-    memset(&ref_img_, 0, sizeof(ref_img_));
-    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
-                                             VP8BORDERINPIXELS));
-    memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
-
-    memset(&cpy_img_, 0, sizeof(cpy_img_));
-    ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
-                                             VP8BORDERINPIXELS));
-    memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
-    ReferenceCopyFrame();
-  }
-
-  void DeallocImage() {
-    vp8_yv12_de_alloc_frame_buffer(&img_);
-    vp8_yv12_de_alloc_frame_buffer(&ref_img_);
-    vp8_yv12_de_alloc_frame_buffer(&cpy_img_);
-  }
-
- protected:
-  static const int kBufFiller = 123;
-  static const int kBufMax = kBufFiller - 1;
-
-  static void FillPlane(uint8_t *buf, int width, int height, int stride) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
-      }
-    }
-  }
-
-  static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
-                          int width, int height, int stride, int padding) {
-    // Copy the outermost visible pixel to a distance of at least 'padding.'
-    // The buffers are allocated such that there may be excess space outside the
-    // padding. As long as the minimum amount of padding is achieved it is not
-    // necessary to fill this space as well.
-    uint8_t *left = buf - padding;
-    uint8_t *right = buf + crop_width;
-    const int right_extend = padding + (width - crop_width);
-    const int bottom_extend = padding + (height - crop_height);
-
-    // Fill the border pixels from the nearest image pixel.
-    for (int y = 0; y < crop_height; ++y) {
-      memset(left, left[padding], padding);
-      memset(right, right[-1], right_extend);
-      left += stride;
-      right += stride;
-    }
-
-    left = buf - padding;
-    uint8_t *top = left - (stride * padding);
-    // The buffer does not always extend as far as the stride.
-    // Equivalent to padding + width + padding.
-    const int extend_width = padding + crop_width + right_extend;
-
-    // The first row was already extended to the left and right. Copy it up.
-    for (int y = 0; y < padding; ++y) {
-      memcpy(top, left, extend_width);
-      top += stride;
-    }
-
-    uint8_t *bottom = left + (crop_height * stride);
-    for (int y = 0; y < bottom_extend; ++y) {
-      memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
-      bottom += stride;
-    }
-  }
-
-  void ReferenceExtendBorder() {
-    ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width,
-                ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height,
-                ref_img_.y_stride, ref_img_.border);
-    ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width,
-                ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
-                ref_img_.uv_stride, ref_img_.border / 2);
-    ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width,
-                ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
-                ref_img_.uv_stride, ref_img_.border / 2);
-  }
-
-  void ReferenceCopyFrame() {
-    // Copy img_ to ref_img_ and extend frame borders. This will be used for
-    // verifying extend_fn_ as well as copy_frame_fn_.
-    EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
-    for (int y = 0; y < img_.y_crop_height; ++y) {
-      for (int x = 0; x < img_.y_crop_width; ++x) {
-        ref_img_.y_buffer[x + y * ref_img_.y_stride] =
-            img_.y_buffer[x + y * img_.y_stride];
-      }
-    }
-
-    for (int y = 0; y < img_.uv_crop_height; ++y) {
-      for (int x = 0; x < img_.uv_crop_width; ++x) {
-        ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
-            img_.u_buffer[x + y * img_.uv_stride];
-        ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
-            img_.v_buffer[x + y * img_.uv_stride];
-      }
-    }
-
-    ReferenceExtendBorder();
-  }
-
-  void CompareImages(const YV12_BUFFER_CONFIG actual) {
-    EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
-    EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
-                        ref_img_.frame_size));
-  }
-
-  YV12_BUFFER_CONFIG img_;
-  YV12_BUFFER_CONFIG ref_img_;
-  YV12_BUFFER_CONFIG cpy_img_;
-  int width_;
-  int height_;
-};
+using ExtendFrameBorderFunc = void (*)(YV12_BUFFER_CONFIG *ybf);
+using CopyFrameFunc = void (*)(const YV12_BUFFER_CONFIG *src_ybf,
+                               YV12_BUFFER_CONFIG *dst_ybf);
 
 class ExtendBorderTest
     : public VpxScaleBase,
       public ::testing::TestWithParam<ExtendFrameBorderFunc> {
  public:
-  virtual ~ExtendBorderTest() {}
+  ~ExtendBorderTest() override = default;
 
  protected:
-  virtual void SetUp() { extend_fn_ = GetParam(); }
+  void SetUp() override { extend_fn_ = GetParam(); }
 
   void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); }
 
   void RunTest() {
-#if ARCH_ARM
-    // Some arm devices OOM when trying to allocate the largest buffers.
-    static const int kNumSizesToTest = 6;
-#else
-    static const int kNumSizesToTest = 7;
-#endif
-    static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 };
     for (int h = 0; h < kNumSizesToTest; ++h) {
       for (int w = 0; w < kNumSizesToTest; ++w) {
-        ResetImage(kSizesToTest[w], kSizesToTest[h]);
+        ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h]));
+        ReferenceCopyFrame();
         ExtendBorder();
-        ReferenceExtendBorder();
         CompareImages(img_);
-        DeallocImage();
+        DeallocImages();
       }
     }
   }
@@ -192,36 +62,29 @@ class ExtendBorderTest
 
 TEST_P(ExtendBorderTest, ExtendBorder) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
 
-INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest,
-                        ::testing::Values(vp8_yv12_extend_frame_borders_c));
+INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest,
+                         ::testing::Values(vp8_yv12_extend_frame_borders_c));
 
 class CopyFrameTest : public VpxScaleBase,
                       public ::testing::TestWithParam<CopyFrameFunc> {
  public:
-  virtual ~CopyFrameTest() {}
+  ~CopyFrameTest() override = default;
 
  protected:
-  virtual void SetUp() { copy_frame_fn_ = GetParam(); }
+  void SetUp() override { copy_frame_fn_ = GetParam(); }
 
   void CopyFrame() {
-    ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_));
+    ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_));
   }
 
   void RunTest() {
-#if ARCH_ARM
-    // Some arm devices OOM when trying to allocate the largest buffers.
-    static const int kNumSizesToTest = 6;
-#else
-    static const int kNumSizesToTest = 7;
-#endif
-    static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 };
     for (int h = 0; h < kNumSizesToTest; ++h) {
       for (int w = 0; w < kNumSizesToTest; ++w) {
-        ResetImage(kSizesToTest[w], kSizesToTest[h]);
+        ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h]));
         ReferenceCopyFrame();
         CopyFrame();
-        CompareImages(cpy_img_);
-        DeallocImage();
+        CompareImages(dst_img_);
+        DeallocImages();
       }
     }
   }
@@ -231,6 +94,8 @@ class CopyFrameTest : public VpxScaleBase,
 
 TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); }
 
-INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
-                        ::testing::Values(vp8_yv12_copy_frame_c));
+INSTANTIATE_TEST_SUITE_P(C, CopyFrameTest,
+                         ::testing::Values(vp8_yv12_copy_frame_c));
+
 }  // namespace
+}  // namespace libvpx_test
diff --git a/media/libvpx/libvpx/test/vpx_scale_test.h b/media/libvpx/libvpx/test/vpx_scale_test.h
new file mode 100644
index 0000000000..89d62fcd7f
--- /dev/null
+++ b/media/libvpx/libvpx/test/vpx_scale_test.h
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_TEST_VPX_SCALE_TEST_H_
+#define VPX_TEST_VPX_SCALE_TEST_H_
+
+#include "gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+using libvpx_test::ACMRandom;
+
+namespace libvpx_test {
+
+class VpxScaleBase {
+ public:
+  virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); }
+
+  void ResetImage(YV12_BUFFER_CONFIG *const img, const int width,
+                  const int height) {
+    memset(img, 0, sizeof(*img));
+    ASSERT_EQ(
+        0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS))
+        << "for width: " << width << " height: " << height;
+    memset(img->buffer_alloc, kBufFiller, img->frame_size);
+  }
+
+  void ResetImages(const int width, const int height) {
+    ResetImage(&img_, width, height);
+    ResetImage(&ref_img_, width, height);
+    ResetImage(&dst_img_, width, height);
+
+    FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+              img_.y_stride);
+    FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+              img_.uv_stride);
+    FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+              img_.uv_stride);
+  }
+
+  void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width,
+                       const int height) {
+    memset(img, 0, sizeof(*img));
+#if CONFIG_VP9_HIGHBITDEPTH
+    ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0,
+                                        VP9_ENC_BORDER_IN_PIXELS, 0));
+#else
+    ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1,
+                                        VP9_ENC_BORDER_IN_PIXELS, 0));
+#endif
+    memset(img->buffer_alloc, kBufFiller, img->frame_size);
+  }
+
+  void ResetScaleImages(const int src_width, const int src_height,
+                        const int dst_width, const int dst_height) {
+    ResetScaleImage(&img_, src_width, src_height);
+    ResetScaleImage(&ref_img_, dst_width, dst_height);
+    ResetScaleImage(&dst_img_, dst_width, dst_height);
+    FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+                     img_.y_stride);
+    FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+                     img_.uv_stride);
+    FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+                     img_.uv_stride);
+  }
+
+  void DeallocImages() {
+    vp8_yv12_de_alloc_frame_buffer(&img_);
+    vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+    vp8_yv12_de_alloc_frame_buffer(&dst_img_);
+  }
+
+  void DeallocScaleImages() {
+    vpx_free_frame_buffer(&img_);
+    vpx_free_frame_buffer(&ref_img_);
+    vpx_free_frame_buffer(&dst_img_);
+  }
+
+ protected:
+  static const int kBufFiller = 123;
+  static const int kBufMax = kBufFiller - 1;
+
+  static void FillPlane(uint8_t *const buf, const int width, const int height,
+                        const int stride) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
+      }
+    }
+  }
+
+  static void FillPlaneExtreme(uint8_t *const buf, const int width,
+                               const int height, const int stride) {
+    ACMRandom rnd;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0;
+      }
+    }
+  }
+
+  static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
+                          int width, int height, int stride, int padding) {
+    // Copy the outermost visible pixel to a distance of at least 'padding.'
+    // The buffers are allocated such that there may be excess space outside the
+    // padding. As long as the minimum amount of padding is achieved it is not
+    // necessary to fill this space as well.
+    uint8_t *left = buf - padding;
+    uint8_t *right = buf + crop_width;
+    const int right_extend = padding + (width - crop_width);
+    const int bottom_extend = padding + (height - crop_height);
+
+    // Fill the border pixels from the nearest image pixel.
+    for (int y = 0; y < crop_height; ++y) {
+      memset(left, left[padding], padding);
+      memset(right, right[-1], right_extend);
+      left += stride;
+      right += stride;
+    }
+
+    left = buf - padding;
+    uint8_t *top = left - (stride * padding);
+    // The buffer does not always extend as far as the stride.
+    // Equivalent to padding + width + padding.
+    const int extend_width = padding + crop_width + right_extend;
+
+    // The first row was already extended to the left and right. Copy it up.
+    for (int y = 0; y < padding; ++y) {
+      memcpy(top, left, extend_width);
+      top += stride;
+    }
+
+    uint8_t *bottom = left + (crop_height * stride);
+    for (int y = 0; y < bottom_extend; ++y) {
+      memcpy(bottom, left + (crop_height - 1) * stride, extend_width);
+      bottom += stride;
+    }
+  }
+
+  void ReferenceExtendBorder() {
+    ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width,
+                ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height,
+                ref_img_.y_stride, ref_img_.border);
+    ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width,
+                ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride, ref_img_.border / 2);
+    ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width,
+                ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height,
+                ref_img_.uv_stride, ref_img_.border / 2);
+  }
+
+  void ReferenceCopyFrame() {
+    // Copy img_ to ref_img_ and extend frame borders. This will be used for
+    // verifying extend_fn_ as well as copy_frame_fn_.
+    EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
+    for (int y = 0; y < img_.y_crop_height; ++y) {
+      for (int x = 0; x < img_.y_crop_width; ++x) {
+        ref_img_.y_buffer[x + y * ref_img_.y_stride] =
+            img_.y_buffer[x + y * img_.y_stride];
+      }
+    }
+
+    for (int y = 0; y < img_.uv_crop_height; ++y) {
+      for (int x = 0; x < img_.uv_crop_width; ++x) {
+        ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
+            img_.u_buffer[x + y * img_.uv_stride];
+        ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
+            img_.v_buffer[x + y * img_.uv_stride];
+      }
+    }
+
+    ReferenceExtendBorder();
+  }
+
+  void CompareImages(const YV12_BUFFER_CONFIG actual) {
+    EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
+    EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
+                        ref_img_.frame_size));
+  }
+
+  YV12_BUFFER_CONFIG img_;
+  YV12_BUFFER_CONFIG ref_img_;
+  YV12_BUFFER_CONFIG dst_img_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // VPX_TEST_VPX_SCALE_TEST_H_
diff --git a/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh b/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh
index 6b6d15e7f8..69c734daf8 100644
--- a/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh
+++ b/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh
@@ -38,9 +38,11 @@ vpx_tsvc_encoder() {
   local output_file="${VPX_TEST_OUTPUT_DIR}/${output_file_base}"
   local timebase_num="1"
   local timebase_den="1000"
+  local timebase_den_y4m="30"
   local speed="6"
   local frame_drop_thresh="30"
   local max_threads="4"
+  local error_resilient="1"
 
   shift 2
 
@@ -51,11 +53,25 @@ vpx_tsvc_encoder() {
 
   # TODO(tomfinegan): Verify file output for all thread runs.
   for threads in $(seq $max_threads); do
-    eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
-        "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
-        "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
-        "${threads}" "$@" \
-        ${devnull}
+    if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then
+      eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+        "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+        "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+        "$@" ${devnull} || return 1
+      # Test for y4m input.
+      eval "${VPX_TEST_PREFIX}" "${encoder}" "${Y4M_720P_INPUT}" \
+        "${output_file}" "${codec}" "${Y4M_720P_INPUT_WIDTH}" \
+        "${Y4M_720P_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den_y4m}" \
+        "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+        "$@" ${devnull} || return 1
+    else
+      eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+        "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+        "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+        "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+        "$@" "8" ${devnull} || return 1
+    fi
   done
 }
 
@@ -76,193 +92,217 @@ files_exist() {
 
 vpx_tsvc_encoder_vp8_mode_0() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 0 200 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_0"
+    vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1
     # Mode 0 produces 1 stream
-    files_exist "${FUNCNAME}" 1 || return 1
+    files_exist "${output_basename}" 1 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_1() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 1 200 400 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_1"
+    vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1
     # Mode 1 produces 2 streams
-    files_exist "${FUNCNAME}" 2 || return 1
+    files_exist "${output_basename}" 2 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_2() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 2 200 400 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_2"
+    vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1
     # Mode 2 produces 2 streams
-    files_exist "${FUNCNAME}" 2 || return 1
+    files_exist "${output_basename}" 2 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_3() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 3 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_3"
+    vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1
     # Mode 3 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_4() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 4 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_4"
+    vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1
     # Mode 4 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_5() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 5 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_5"
+    vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1
     # Mode 5 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_6() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 6 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_6"
+    vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1
     # Mode 6 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_7() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_7"
+    vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1
     # Mode 7 produces 5 streams
-    files_exist "${FUNCNAME}" 5 || return 1
+    files_exist "${output_basename}" 5 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_8() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 8 200 400 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_8"
+    vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1
     # Mode 8 produces 2 streams
-    files_exist "${FUNCNAME}" 2 || return 1
+    files_exist "${output_basename}" 2 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_9() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 9 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_9"
+    vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1
     # Mode 9 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_10() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 10 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_10"
+    vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1
     # Mode 10 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp8_mode_11() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp8 "${FUNCNAME}" 11 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp8_mode_11"
+    vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1
     # Mode 11 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_0() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 0 200 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_0"
+    vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1
     # Mode 0 produces 1 stream
-    files_exist "${FUNCNAME}" 1 || return 1
+    files_exist "${output_basename}" 1 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_1() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 1 200 400 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_1"
+    vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1
     # Mode 1 produces 2 streams
-    files_exist "${FUNCNAME}" 2 || return 1
+    files_exist "${output_basename}" 2 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_2() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 2 200 400 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_2"
+    vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1
     # Mode 2 produces 2 streams
-    files_exist "${FUNCNAME}" 2 || return 1
+    files_exist "${output_basename}" 2 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_3() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 3 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_3"
+    vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1
     # Mode 3 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_4() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 4 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_4"
+    vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1
     # Mode 4 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_5() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 5 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_5"
+    vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1
     # Mode 5 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_6() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 6 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_6"
+    vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1
     # Mode 6 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_7() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_7"
+    vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1
     # Mode 7 produces 5 streams
-    files_exist "${FUNCNAME}" 5 || return 1
+    files_exist "${output_basename}" 5 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_8() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 8 200 400 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_8"
+    vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1
     # Mode 8 produces 2 streams
-    files_exist "${FUNCNAME}" 2 || return 1
+    files_exist "${output_basename}" 2 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 9 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_9"
+    vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1
     # Mode 9 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_10() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 10 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_10"
+    vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1
     # Mode 10 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
 vpx_tsvc_encoder_vp9_mode_11() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
-    vpx_tsvc_encoder vp9 "${FUNCNAME}" 11 200 400 600 || return 1
+    local output_basename="vpx_tsvc_encoder_vp9_mode_11"
+    vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1
     # Mode 11 produces 3 streams
-    files_exist "${FUNCNAME}" 3 || return 1
+    files_exist "${output_basename}" 3 || return 1
   fi
 }
 
diff --git a/media/libvpx/libvpx/test/vpxdec.sh b/media/libvpx/libvpx/test/vpxdec.sh
index de51c8004e..199feae5f3 100644
--- a/media/libvpx/libvpx/test/vpxdec.sh
+++ b/media/libvpx/libvpx/test/vpxdec.sh
@@ -18,7 +18,8 @@
 vpxdec_verify_environment() {
   if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \
     [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \
-    [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then
+    [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] || \
+    [ ! -e "${VP9_RAW_FILE}" ]; then
     elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
@@ -33,8 +34,8 @@ vpxdec_verify_environment() {
 # input file path and shifted away. All remaining parameters are passed through
 # to vpxdec.
 vpxdec_pipe() {
-  local readonly decoder="$(vpx_tool_path vpxdec)"
-  local readonly input="$1"
+  local decoder="$(vpx_tool_path vpxdec)"
+  local input="$1"
   shift
   cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
 }
@@ -43,8 +44,8 @@ vpxdec_pipe() {
 # the directory containing vpxdec. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to vpxdec.
 vpxdec() {
-  local readonly decoder="$(vpx_tool_path vpxdec)"
-  local readonly input="$1"
+  local decoder="$(vpx_tool_path vpxdec)"
+  local input="$1"
   shift
   eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
 }
@@ -85,7 +86,7 @@ vpxdec_vp9_webm_frame_parallel() {
      [ "$(webm_io_available)" = "yes" ]; then
     for threads in 2 3 4 5 6 7 8; do
       vpxdec "${VP9_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \
-        --frame-parallel
+        --frame-parallel || return 1
     done
   fi
 }
@@ -95,9 +96,9 @@ vpxdec_vp9_webm_less_than_50_frames() {
   # frames in actual webm_read_frame calls.
   if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly decoder="$(vpx_tool_path vpxdec)"
-    local readonly expected=10
-    local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
+    local decoder="$(vpx_tool_path vpxdec)"
+    local expected=10
+    local num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
       "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
       | awk '/^[0-9]+ decoded frames/ { print $1 }')
     if [ "$num_frames" -ne "$expected" ]; then
@@ -107,10 +108,28 @@ vpxdec_vp9_webm_less_than_50_frames() {
   fi
 }
 
+# Ensures VP9_RAW_FILE correctly produces 1 frame instead of causing a hang.
+vpxdec_vp9_raw_file() {
+  # Ensure a raw file properly reports eof and doesn't cause a hang.
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ]; then
+    local decoder="$(vpx_tool_path vpxdec)"
+    local expected=1
+    [ -x /usr/bin/timeout ] && local TIMEOUT="/usr/bin/timeout 30s"
+    local num_frames=$(${TIMEOUT} ${VPX_TEST_PREFIX} "${decoder}" \
+      "${VP9_RAW_FILE}" --summary --noblit 2>&1 \
+      | awk '/^[0-9]+ decoded frames/ { print $1 }')
+    if [ -z "$num_frames" ] || [ "$num_frames" -ne "$expected" ]; then
+      elog "Output frames ($num_frames) != expected ($expected)"
+      return 1
+    fi
+  fi
+}
+
 vpxdec_tests="vpxdec_vp8_ivf
               vpxdec_vp8_ivf_pipe_input
               vpxdec_vp9_webm
               vpxdec_vp9_webm_frame_parallel
-              vpxdec_vp9_webm_less_than_50_frames"
+              vpxdec_vp9_webm_less_than_50_frames
+              vpxdec_vp9_raw_file"
 
 run_tests vpxdec_verify_environment "${vpxdec_tests}"
diff --git a/media/libvpx/libvpx/test/vpxenc.sh b/media/libvpx/libvpx/test/vpxenc.sh
index e8994992ae..172349a2b3 100644
--- a/media/libvpx/libvpx/test/vpxenc.sh
+++ b/media/libvpx/libvpx/test/vpxenc.sh
@@ -67,7 +67,7 @@ y4m_input_720p() {
 # Echo default vpxenc real time encoding params. $1 is the codec, which defaults
 # to vp8 if unspecified.
 vpxenc_rt_params() {
-  local readonly codec="${1:-vp8}"
+  local codec="${1:-vp8}"
   echo "--codec=${codec}
     --buf-initial-sz=500
     --buf-optimal-sz=600
@@ -90,13 +90,22 @@ vpxenc_rt_params() {
     --undershoot-pct=50"
 }
 
+# Forces --passes to 1 with CONFIG_REALTIME_ONLY.
+vpxenc_passes_param() {
+  if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then
+    echo "--passes=1"
+  else
+    echo "--passes=2"
+  fi
+}
+
 # Wrapper function for running vpxenc with pipe input. Requires that
 # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
 # input file path and shifted away. All remaining parameters are passed through
 # to vpxenc.
 vpxenc_pipe() {
-  local readonly encoder="$(vpx_tool_path vpxenc)"
-  local readonly input="$1"
+  local encoder="$(vpx_tool_path vpxenc)"
+  local input="$1"
   shift
   cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \
     --test-decode=fatal \
@@ -107,8 +116,8 @@ vpxenc_pipe() {
 # the directory containing vpxenc. $1 one is used as the input file path and
 # shifted away. All remaining parameters are passed through to vpxenc.
 vpxenc() {
-  local readonly encoder="$(vpx_tool_path vpxenc)"
-  local readonly input="$1"
+  local encoder="$(vpx_tool_path vpxenc)"
+  local input="$1"
   shift
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
     --test-decode=fatal \
@@ -117,12 +126,12 @@ vpxenc() {
 
 vpxenc_vp8_ivf() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -134,11 +143,11 @@ vpxenc_vp8_ivf() {
 vpxenc_vp8_webm() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp8 \
       --limit="${TEST_FRAMES}" \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -150,10 +159,11 @@ vpxenc_vp8_webm() {
 vpxenc_vp8_webm_rt() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
     vpxenc $(yuv_input_hantro_collage) \
       $(vpxenc_rt_params vp8) \
-      --output="${output}"
+      --output="${output}" || return 1
+
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
       return 1
@@ -164,12 +174,12 @@ vpxenc_vp8_webm_rt() {
 vpxenc_vp8_webm_2pass() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      --passes=2
+      --passes=2 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -181,16 +191,16 @@ vpxenc_vp8_webm_2pass() {
 vpxenc_vp8_webm_lag10_frames20() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly lag_total_frames=20
-    local readonly lag_frames=10
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
+    local lag_total_frames=20
+    local lag_frames=10
+    local output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp8 \
       --limit="${lag_total_frames}" \
       --lag-in-frames="${lag_frames}" \
       --output="${output}" \
       --auto-alt-ref=1 \
-      --passes=2
+      --passes=2 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -201,12 +211,12 @@ vpxenc_vp8_webm_lag10_frames20() {
 
 vpxenc_vp8_ivf_piped_input() {
   if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
     vpxenc_pipe $(yuv_input_hantro_collage) \
       --codec=vp8 \
       --limit="${TEST_FRAMES}" \
       --ivf \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -217,12 +227,14 @@ vpxenc_vp8_ivf_piped_input() {
 
 vpxenc_vp9_ivf() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
+    local passes=$(vpxenc_passes_param)
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp9 \
       --limit="${TEST_FRAMES}" \
+      "${passes}" \
       --ivf \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -234,11 +246,13 @@ vpxenc_vp9_ivf() {
 vpxenc_vp9_webm() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    local passes=$(vpxenc_passes_param)
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp9 \
       --limit="${TEST_FRAMES}" \
-      --output="${output}"
+      "${passes}" \
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -250,10 +264,10 @@ vpxenc_vp9_webm() {
 vpxenc_vp9_webm_rt() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
     vpxenc $(yuv_input_hantro_collage) \
       $(vpxenc_rt_params vp9) \
-      --output="${output}"
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -265,11 +279,11 @@ vpxenc_vp9_webm_rt() {
 vpxenc_vp9_webm_rt_multithread_tiled() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
-    local readonly tilethread_min=2
-    local readonly tilethread_max=4
-    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
-    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm"
+    local tilethread_min=2
+    local tilethread_max=4
+    local num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
 
     for threads in ${num_threads}; do
       for tile_cols in ${num_tile_cols}; do
@@ -277,27 +291,26 @@ vpxenc_vp9_webm_rt_multithread_tiled() {
           $(vpxenc_rt_params vp9) \
           --threads=${threads} \
           --tile-columns=${tile_cols} \
-          --output="${output}"
+          --output="${output}" || return 1
+
+        if [ ! -e "${output}" ]; then
+          elog "Output file does not exist."
+          return 1
+        fi
+        rm "${output}"
       done
     done
-
-    if [ ! -e "${output}" ]; then
-      elog "Output file does not exist."
-      return 1
-    fi
-
-    rm "${output}"
   fi
 }
 
 vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
-    local readonly tilethread_min=2
-    local readonly tilethread_max=4
-    local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})"
-    local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm"
+    local tilethread_min=2
+    local tilethread_max=4
+    local num_threads="$(seq ${tilethread_min} ${tilethread_max})"
+    local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})"
 
     for threads in ${num_threads}; do
       for tile_cols in ${num_tile_cols}; do
@@ -306,28 +319,27 @@ vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() {
           --threads=${threads} \
           --tile-columns=${tile_cols} \
           --frame-parallel=1 \
-          --output="${output}"
+          --output="${output}" || return 1
+
+        if [ ! -e "${output}" ]; then
+          elog "Output file does not exist."
+          return 1
+        fi
+        rm "${output}"
       done
     done
-
-    if [ ! -e "${output}" ]; then
-      elog "Output file does not exist."
-      return 1
-    fi
-
-    rm "${output}"
   fi
 }
 
 vpxenc_vp9_webm_2pass() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --output="${output}" \
-      --passes=2
+      --passes=2 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -338,13 +350,15 @@ vpxenc_vp9_webm_2pass() {
 
 vpxenc_vp9_ivf_lossless() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
+    local passes=$(vpxenc_passes_param)
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
-      --lossless=1
+      "${passes}" \
+      --lossless=1 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -355,14 +369,16 @@ vpxenc_vp9_ivf_lossless() {
 
 vpxenc_vp9_ivf_minq0_maxq0() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
+    local passes=$(vpxenc_passes_param)
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp9 \
       --limit="${TEST_FRAMES}" \
       --ivf \
       --output="${output}" \
+      "${passes}" \
       --min-q=0 \
-      --max-q=0
+      --max-q=0 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -374,16 +390,17 @@ vpxenc_vp9_ivf_minq0_maxq0() {
 vpxenc_vp9_webm_lag10_frames20() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly lag_total_frames=20
-    local readonly lag_frames=10
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+    local lag_total_frames=20
+    local lag_frames=10
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+    local passes=$(vpxenc_passes_param)
     vpxenc $(yuv_input_hantro_collage) \
       --codec=vp9 \
       --limit="${lag_total_frames}" \
       --lag-in-frames="${lag_frames}" \
       --output="${output}" \
-      --passes=2 \
-      --auto-alt-ref=1
+      "${passes}" \
+      --auto-alt-ref=1 || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -396,11 +413,13 @@ vpxenc_vp9_webm_lag10_frames20() {
 vpxenc_vp9_webm_non_square_par() {
   if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
      [ "$(webm_io_available)" = "yes" ]; then
-    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm"
+    local output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm"
+    local passes=$(vpxenc_passes_param)
     vpxenc $(y4m_input_non_square_par) \
       --codec=vp9 \
       --limit="${TEST_FRAMES}" \
-      --output="${output}"
+      "${passes}" \
+      --output="${output}" || return 1
 
     if [ ! -e "${output}" ]; then
       elog "Output file does not exist."
@@ -409,21 +428,62 @@ vpxenc_vp9_webm_non_square_par() {
   fi
 }
 
+vpxenc_vp9_webm_sharpness() {
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
+    local sharpnesses="0 1 2 3 4 5 6 7"
+    local output="${VPX_TEST_OUTPUT_DIR}/vpxenc_vp9_webm_sharpness.ivf"
+    local last_size=0
+    local this_size=0
+
+    for sharpness in ${sharpnesses}; do
+
+      vpxenc $(yuv_input_hantro_collage) \
+        --sharpness="${sharpness}" \
+        --codec=vp9 \
+        --limit=1 \
+        --cpu-used=2 \
+        --end-usage=q \
+        --cq-level=40 \
+        --output="${output}" \
+        "${passes}" || return 1
+
+      if [ ! -e "${output}" ]; then
+        elog "Output file does not exist."
+        return 1
+      fi
+
+      this_size=$(stat -c '%s' "${output}")
+      if [ "${this_size}" -lt "${last_size}" ]; then
+        elog "Higher sharpness value yielded lower file size."
+        echo "${this_size}" " < " "${last_size}"
+        return 1
+      fi
+      last_size="${this_size}"
+
+    done
+  fi
+}
+
 vpxenc_tests="vpxenc_vp8_ivf
               vpxenc_vp8_webm
               vpxenc_vp8_webm_rt
-              vpxenc_vp8_webm_2pass
-              vpxenc_vp8_webm_lag10_frames20
               vpxenc_vp8_ivf_piped_input
               vpxenc_vp9_ivf
               vpxenc_vp9_webm
               vpxenc_vp9_webm_rt
               vpxenc_vp9_webm_rt_multithread_tiled
               vpxenc_vp9_webm_rt_multithread_tiled_frameparallel
-              vpxenc_vp9_webm_2pass
               vpxenc_vp9_ivf_lossless
               vpxenc_vp9_ivf_minq0_maxq0
               vpxenc_vp9_webm_lag10_frames20
-              vpxenc_vp9_webm_non_square_par"
+              vpxenc_vp9_webm_non_square_par
+              vpxenc_vp9_webm_sharpness"
+
+if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then
+  vpxenc_tests="$vpxenc_tests
+                vpxenc_vp8_webm_2pass
+                vpxenc_vp8_webm_lag10_frames20
+                vpxenc_vp9_webm_2pass"
+fi
 
 run_tests vpxenc_verify_environment "${vpxenc_tests}"
diff --git a/media/libvpx/libvpx/test/webm_video_source.h b/media/libvpx/libvpx/test/webm_video_source.h
index 53713618ee..6ab50c849f 100644
--- a/media/libvpx/libvpx/test/webm_video_source.h
+++ b/media/libvpx/libvpx/test/webm_video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_WEBM_VIDEO_SOURCE_H_
-#define TEST_WEBM_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_WEBM_VIDEO_SOURCE_H_
+#define VPX_TEST_WEBM_VIDEO_SOURCE_H_
 #include <cstdarg>
 #include <cstdio>
 #include <cstdlib>
@@ -26,35 +26,35 @@ class WebMVideoSource : public CompressedVideoSource {
  public:
   explicit WebMVideoSource(const std::string &file_name)
       : file_name_(file_name), vpx_ctx_(new VpxInputContext()),
-        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0),
+        webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0),
         end_of_file_(false) {}
 
-  virtual ~WebMVideoSource() {
-    if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file);
+  ~WebMVideoSource() override {
+    if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file);
     webm_free(webm_ctx_);
     delete vpx_ctx_;
     delete webm_ctx_;
   }
 
-  virtual void Init() {}
+  void Init() override {}
 
-  virtual void Begin() {
+  void Begin() override {
     vpx_ctx_->file = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: "
-                                        << file_name_;
+    ASSERT_NE(vpx_ctx_->file, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
 
     ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM";
 
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
   void FillFrame() {
-    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    ASSERT_NE(vpx_ctx_->file, nullptr);
     const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
@@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource {
   }
 
   void SeekToNextKeyFrame() {
-    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    ASSERT_NE(vpx_ctx_->file, nullptr);
     do {
       const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
       ASSERT_GE(status, 0) << "webm_read_frame failed";
@@ -74,9 +74,11 @@ class WebMVideoSource : public CompressedVideoSource {
     } while (!webm_ctx_->is_key_frame && !end_of_file_);
   }
 
-  virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
-  virtual size_t frame_size() const { return buf_sz_; }
-  virtual unsigned int frame_number() const { return frame_; }
+  const uint8_t *cxdata() const override {
+    return end_of_file_ ? nullptr : buf_;
+  }
+  size_t frame_size() const override { return buf_sz_; }
+  unsigned int frame_number() const override { return frame_; }
 
  protected:
   std::string file_name_;
@@ -90,4 +92,4 @@ class WebMVideoSource : public CompressedVideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_WEBM_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/media/libvpx/libvpx/test/y4m_test.cc b/media/libvpx/libvpx/test/y4m_test.cc
index ced717a7c1..3865880b60 100644
--- a/media/libvpx/libvpx/test/y4m_test.cc
+++ b/media/libvpx/libvpx/test/y4m_test.cc
@@ -10,7 +10,7 @@
 
 #include <string>
 
-#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #include "./vpx_config.h"
 #include "./y4menc.h"
@@ -40,18 +40,18 @@ const Y4mTestParam kY4mTestVectors[] = {
     "284a47a47133b12884ec3a14e959a0b6" },
   { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444,
     "90517ff33843d85de712fd4fe60dbed0" },
-  { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016,
-    "63f21f9f717d8b8631bd2288ee87137b" },
-  { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216,
-    "48ab51fb540aed07f7ff5af130c9b605" },
-  { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416,
-    "067bfd75aa85ff9bae91fa3e0edd1e3e" },
-  { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016,
-    "9e6d8f6508c6e55625f6b697bc461cef" },
-  { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216,
-    "b239c6b301c0b835485be349ca83a7e3" },
-  { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416,
-    "5a6481a550821dab6d0192f5c63845e9" },
+  { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016,
+    "2f56ab9809269f074df7e3daf1ce0be6" },
+  { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216,
+    "1b5c73d2e8e8c4e02dc4889ecac41c83" },
+  { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416,
+    "ec4ab5be53195c5b838d1d19e1bc2674" },
+  { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016,
+    "3370856c8ddebbd1f9bb2e66f97677f4" },
+  { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216,
+    "4eab364318dd8201acbb182e43bd4966" },
+  { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416,
+    "f189dfbbd92119fc8e5f211a550166be" },
 };
 
 static void write_image_file(const vpx_image_t *img, FILE *file) {
@@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
  protected:
   Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
 
-  virtual ~Y4mVideoSourceTest() { CloseSource(); }
+  ~Y4mVideoSourceTest() override { CloseSource(); }
 
   virtual void Init(const std::string &file_name, int limit) {
     file_name_ = file_name;
@@ -90,7 +90,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
 
   // Checks y4m header information
   void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     ASSERT_EQ(y4m_.pic_w, (int)kWidth);
     ASSERT_EQ(y4m_.pic_h, (int)kHeight);
     ASSERT_EQ(img()->d_w, kWidth);
@@ -116,7 +116,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
 
   // Checks MD5 of the raw frame data
   void Md5Check(const string &expected_md5) {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     libvpx_test::MD5 md5;
     for (unsigned int i = start_; i < limit_; i++) {
       md5.Add(img());
@@ -133,16 +133,16 @@ TEST_P(Y4mVideoSourceTest, SourceTest) {
   Md5Check(t.md5raw);
 }
 
-INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
-                        ::testing::ValuesIn(kY4mTestVectors));
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
 
 class Y4mVideoWriteTest : public Y4mVideoSourceTest {
  protected:
-  Y4mVideoWriteTest() : tmpfile_(NULL) {}
+  Y4mVideoWriteTest() : tmpfile_(nullptr) {}
 
-  virtual ~Y4mVideoWriteTest() {
+  ~Y4mVideoWriteTest() override {
     delete tmpfile_;
-    input_file_ = NULL;
+    input_file_ = nullptr;
   }
 
   void ReplaceInputFile(FILE *input_file) {
@@ -155,11 +155,11 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
 
   // Writes out a y4m file and then reads it back
   void WriteY4mAndReadBack() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     char buf[Y4M_BUFFER_SIZE] = { 0 };
     const struct VpxRational framerate = { y4m_.fps_n, y4m_.fps_d };
     tmpfile_ = new libvpx_test::TempOutFile;
-    ASSERT_TRUE(tmpfile_->file() != NULL);
+    ASSERT_NE(tmpfile_->file(), nullptr);
     y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
                           y4m_.vpx_fmt, y4m_.bit_depth);
     fputs(buf, tmpfile_->file());
@@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest {
     ReplaceInputFile(tmpfile_->file());
   }
 
-  virtual void Init(const std::string &file_name, int limit) {
+  void Init(const std::string &file_name, int limit) override {
     Y4mVideoSourceTest::Init(file_name, limit);
     WriteY4mAndReadBack();
   }
@@ -186,6 +186,59 @@ TEST_P(Y4mVideoWriteTest, WriteTest) {
   Md5Check(t.md5raw);
 }
 
-INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
-                        ::testing::ValuesIn(kY4mTestVectors));
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
+
+static const char kY4MRegularHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, RegularHeader) {
+  libvpx_test::TempOutFile f;
+  ASSERT_NE(f.file(), nullptr);
+  fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(0, fseek(f.file(), 0, 0));
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr,
+                           /*num_skip=*/0, /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
+// Testing that headers over 100 characters can be parsed.
+static const char kY4MLongHeader[] =
+    "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG "
+    "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n"
+    "FRAME\n"
+    "012345678912345601230123";
+
+TEST(Y4MHeaderTest, LongHeader) {
+  libvpx_test::TempOutFile f;
+  ASSERT_NE(f.file(), nullptr);
+  fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file());
+  fflush(f.file());
+  EXPECT_EQ(fseek(f.file(), 0, 0), 0);
+
+  y4m_input y4m;
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr,
+                           /*num_skip=*/0, /*only_420=*/0),
+            0);
+  EXPECT_EQ(y4m.pic_w, 4);
+  EXPECT_EQ(y4m.pic_h, 4);
+  EXPECT_EQ(y4m.fps_n, 30);
+  EXPECT_EQ(y4m.fps_d, 1);
+  EXPECT_EQ(y4m.interlace, 'p');
+  EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0);
+  y4m_input_close(&y4m);
+}
+
 }  // namespace
diff --git a/media/libvpx/libvpx/test/y4m_video_source.h b/media/libvpx/libvpx/test/y4m_video_source.h
index 2682ddde3d..e43e37d9e4 100644
--- a/media/libvpx/libvpx/test/y4m_video_source.h
+++ b/media/libvpx/libvpx/test/y4m_video_source.h
@@ -7,9 +7,10 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_Y4M_VIDEO_SOURCE_H_
-#define TEST_Y4M_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_Y4M_VIDEO_SOURCE_H_
+#define VPX_TEST_Y4M_VIDEO_SOURCE_H_
 #include <algorithm>
+#include <memory>
 #include <string>
 
 #include "test/video_source.h"
@@ -22,11 +23,11 @@ namespace libvpx_test {
 class Y4mVideoSource : public VideoSource {
  public:
   Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()),
+      : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()),
         start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
         framerate_denominator_(0), y4m_() {}
 
-  virtual ~Y4mVideoSource() {
+  ~Y4mVideoSource() override {
     vpx_img_free(img_.get());
     CloseSource();
   }
@@ -34,13 +35,13 @@ class Y4mVideoSource : public VideoSource {
   virtual void OpenSource() {
     CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_NE(input_file_, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
   }
 
   virtual void ReadSourceToStart() {
-    ASSERT_TRUE(input_file_ != NULL);
-    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
+    ASSERT_NE(input_file_, nullptr);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0));
     framerate_numerator_ = y4m_.fps_n;
     framerate_denominator_ = y4m_.fps_d;
     frame_ = 0;
@@ -50,36 +51,36 @@ class Y4mVideoSource : public VideoSource {
     FillFrame();
   }
 
-  virtual void Begin() {
+  void Begin() override {
     OpenSource();
     ReadSourceToStart();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const {
-    return (frame_ < limit_) ? img_.get() : NULL;
+  vpx_image_t *img() const override {
+    return (frame_ < limit_) ? img_.get() : nullptr;
   }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
+  vpx_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual vpx_rational_t timebase() const {
+  vpx_rational_t timebase() const override {
     const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
   }
@@ -100,15 +101,15 @@ class Y4mVideoSource : public VideoSource {
   void CloseSource() {
     y4m_input_close(&y4m_);
     y4m_ = y4m_input();
-    if (input_file_ != NULL) {
+    if (input_file_ != nullptr) {
       fclose(input_file_);
-      input_file_ = NULL;
+      input_file_ = nullptr;
     }
   }
 
   std::string file_name_;
   FILE *input_file_;
-  testing::internal::scoped_ptr<vpx_image_t> img_;
+  std::unique_ptr<vpx_image_t> img_;
   unsigned int start_;
   unsigned int limit_;
   unsigned int frame_;
@@ -119,4 +120,4 @@ class Y4mVideoSource : public VideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_Y4M_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc b/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc
new file mode 100644
index 0000000000..e3a9ae9d27
--- /dev/null
+++ b/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc
@@ -0,0 +1,727 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_config.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+using YUVTemporalFilterFunc = void (*)(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  YUVTemporalFilterFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
+int GetFilterWeight(unsigned int row, unsigned int col,
+                    unsigned int block_height, unsigned int block_width,
+                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+template <typename PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+                int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+                          int filter_weight) {
+  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
+                             3221225472U, 2576980378U, 2147483648U, 1840700270U,
+                             1610612736U, 1431655766U, 1288490189U, 1171354718U,
+                             0U,          991146300U };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <typename PixelType>
+void ApplyReferenceFilter(
+    const Buffer<PixelType> &y_src, const Buffer<PixelType> &y_pre,
+    const Buffer<PixelType> &u_src, const Buffer<PixelType> &v_src,
+    const Buffer<PixelType> &u_pre, const Buffer<PixelType> &v_pre,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *const blk_fw, int use_32x32,
+    Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_counter,
+    Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_counter,
+    Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_counter) {
+  const PixelType *y_src_ptr = y_src.TopLeftPixel();
+  const PixelType *y_pre_ptr = y_pre.TopLeftPixel();
+  const PixelType *u_src_ptr = u_src.TopLeftPixel();
+  const PixelType *u_pre_ptr = u_pre.TopLeftPixel();
+  const PixelType *v_src_ptr = v_src.TopLeftPixel();
+  const PixelType *v_pre_ptr = v_pre.TopLeftPixel();
+
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride();
+  const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride();
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+
+  Buffer<int> y_dif = Buffer<int>(block_width, block_height, 0);
+  Buffer<int> u_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+  Buffer<int> v_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+
+  ASSERT_TRUE(y_dif.Init());
+  ASSERT_TRUE(u_dif.Init());
+  ASSERT_TRUE(v_dif.Init());
+  y_dif.Set(0);
+  u_dif.Set(0);
+  v_dif.Set(0);
+
+  int *y_diff_ptr = y_dif.TopLeftPixel();
+  int *u_diff_ptr = u_dif.TopLeftPixel();
+  int *v_diff_ptr = v_dif.TopLeftPixel();
+
+  uint32_t *y_accum = y_accumulator->TopLeftPixel();
+  uint32_t *u_accum = u_accumulator->TopLeftPixel();
+  uint32_t *v_accum = v_accumulator->TopLeftPixel();
+  uint16_t *y_count = y_counter->TopLeftPixel();
+  uint16_t *u_count = u_counter->TopLeftPixel();
+  uint16_t *v_count = v_counter->TopLeftPixel();
+
+  const int y_accum_stride = y_accumulator->stride();
+  const int u_accum_stride = u_accumulator->stride();
+  const int v_accum_stride = v_accumulator->stride();
+  const int y_count_stride = y_counter->stride();
+  const int u_count_stride = u_counter->stride();
+  const int v_count_stride = v_counter->stride();
+
+  const int rounding = (1 << strength) >> 1;
+
+  // Get the square diffs
+  for (int row = 0; row < static_cast<int>(block_height); row++) {
+    for (int col = 0; col < static_cast<int>(block_width); col++) {
+      const int diff = y_src_ptr[row * y_src_stride + col] -
+                       y_pre_ptr[row * y_pre_stride + col];
+      y_diff_ptr[row * y_diff_stride + col] = diff * diff;
+    }
+  }
+
+  for (int row = 0; row < uv_block_height; row++) {
+    for (int col = 0; col < uv_block_width; col++) {
+      const int u_diff = u_src_ptr[row * uv_src_stride + col] -
+                         u_pre_ptr[row * uv_pre_stride + col];
+      const int v_diff = v_src_ptr[row * uv_src_stride + col] -
+                         v_pre_ptr[row * uv_pre_stride + col];
+      u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff;
+    }
+  }
+
+  // Apply the filter to luma
+  for (int row = 0; row < static_cast<int>(block_height); row++) {
+    for (int col = 0; col < static_cast<int>(block_width); col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
+      const int filter_weight = GetFilterWeight(row, col, block_height,
+                                                block_width, blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre_ptr[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < static_cast<int>(block_height) &&
+              sub_col >= 0 && sub_col < static_cast<int>(block_width)) {
+            y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
+          }
+        }
+      }
+
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col];
+
+      y_num_used += 2;
+
+      // Set the modifier
+      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      y_count[row * y_count_stride + col] += y_mod;
+      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
+    }
+  }
+
+  // Apply the filter to chroma
+  for (int uv_row = 0; uv_row < uv_block_height; uv_row++) {
+    for (int uv_col = 0; uv_col < uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = GetFilterWeight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
+
+      // Sum all the luma pixels associated with the current luma pixel
+      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * u_count_stride + uv_col] += u_mod;
+      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * v_count_stride + uv_col] += v_mod;
+      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
+    }
+  }
+}
+
+class YUVTemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithBd> {
+ public:
+  void SetUp() override {
+    filter_func_ = GetParam().temporal_filter;
+    bd_ = GetParam().bd;
+    use_highbd_ = (bd_ != 8);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    saturate_test_ = 0;
+    num_repeats_ = 10;
+
+    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
+  }
+
+ protected:
+  template <typename PixelType>
+  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
+                            int filter_strength, int use_32x32,
+                            const int *filter_weight);
+  template <typename PixelType>
+  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
+                              int filter_strength, int use_32x32,
+                              const int *filter_weight);
+  YUVTemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+  int saturate_test_;
+  int num_repeats_;
+  int use_highbd_;
+  int bd_;
+};
+
+template <typename PixelType>
+void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+                                                 int ss_x, int ss_y,
+                                                 int filter_strength,
+                                                 int use_32x32,
+                                                 const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
+  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count_ref.Init());
+  ASSERT_TRUE(y_accum_ref.Init());
+  ASSERT_TRUE(y_count_tst.Init());
+  ASSERT_TRUE(y_accum_tst.Init());
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count_ref.Init());
+  ASSERT_TRUE(u_accum_ref.Init());
+  ASSERT_TRUE(u_count_tst.Init());
+  ASSERT_TRUE(u_accum_tst.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count_ref.Init());
+  ASSERT_TRUE(v_accum_ref.Init());
+  ASSERT_TRUE(v_count_tst.Init());
+  ASSERT_TRUE(v_accum_tst.Init());
+
+  y_accum_ref.Set(0);
+  y_accum_tst.Set(0);
+  y_count_ref.Set(0);
+  y_count_tst.Set(0);
+  u_accum_ref.Set(0);
+  u_accum_tst.Set(0);
+  u_count_ref.Set(0);
+  u_count_tst.Set(0);
+  v_accum_ref.Set(0);
+  v_accum_tst.Set(0);
+  v_count_ref.Set(0);
+  v_count_tst.Set(0);
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    if (saturate_test_) {
+      const int max_val = (1 << bd_) - 1;
+      y_src.Set(max_val);
+      y_pre.Set(0);
+      u_src.Set(max_val);
+      u_pre.Set(0);
+      v_src.Set(max_val);
+      v_pre.Set(0);
+    } else {
+      y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+    }
+
+    ApplyReferenceFilter<PixelType>(
+        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
+        filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref,
+        &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref);
+
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(),
+        u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(),
+        v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel()));
+
+    EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+    EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+    EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+    EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+    EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+    EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+    if (HasFailure()) {
+      if (use_32x32) {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
+               filter_strength, *filter_weight);
+      } else {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
+               ss_y, filter_strength, filter_weight[0], filter_weight[1],
+               filter_weight[2], filter_weight[3]);
+      }
+      y_accum_tst.PrintDifference(y_accum_ref);
+      y_count_tst.PrintDifference(y_count_ref);
+      u_accum_tst.PrintDifference(u_accum_ref);
+      u_count_tst.PrintDifference(u_count_ref);
+      v_accum_tst.PrintDifference(v_accum_ref);
+      v_count_tst.PrintDifference(v_count_ref);
+
+      return;
+    }
+  }
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+                                                   int ss_x, int ss_y,
+                                                   int filter_strength,
+                                                   int use_32x32,
+                                                   const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count.Init());
+  ASSERT_TRUE(y_accum.Init());
+
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count.Init());
+  ASSERT_TRUE(u_accum.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count.Init());
+  ASSERT_TRUE(v_accum.Init());
+
+  y_accum.Set(0);
+  y_count.Set(0);
+
+  u_accum.Set(0);
+  u_count.Set(0);
+
+  v_accum.Set(0);
+  v_count.Set(0);
+
+  y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(),
+        u_count.TopLeftPixel(), v_accum.TopLeftPixel(),
+        v_count.TopLeftPixel()));
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use32x32) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           &filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          &filter_weight);
+          }
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use16x16) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 0;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
+        // Set up the filter
+        int filter_weight[4];
+        int filter_idx_cp = filter_idx;
+        for (int idx = 0; idx < 4; idx++) {
+          filter_weight[idx] = filter_idx_cp % 3;
+          filter_idx_cp /= 3;
+        }
+
+        // Test each parameter
+        for (int filter_strength = 0; filter_strength <= 6;
+             filter_strength += 2) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          filter_weight);
+          }
+
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, SaturationTest) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+  const int filter_weight = 1;
+  saturate_test_ = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        if (use_highbd_) {
+          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                         adjusted_strength, use_32x32,
+                                         &filter_weight);
+        } else {
+          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                        filter_strength, use_32x32,
+                                        &filter_weight);
+        }
+
+        ASSERT_FALSE(HasFailure());
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
+  const int width = 32, height = 32;
+  num_repeats_ = 1000;
+
+  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
+    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
+    for (int ss_x = 0; ss_x <= 1; ss_x++) {
+      for (int ss_y = 0; ss_y <= 1; ss_y++) {
+        for (int filter_idx = 0; filter_idx < num_filter_weights;
+             filter_idx++) {
+          // Set up the filter
+          int filter_weight[4];
+          int filter_idx_cp = filter_idx;
+          for (int idx = 0; idx < 4; idx++) {
+            filter_weight[idx] = filter_idx_cp % 3;
+            filter_idx_cp /= 3;
+          }
+
+          // Test each parameter
+          for (int filter_strength = 0; filter_strength <= 6;
+               filter_strength += 2) {
+            vpx_usec_timer timer;
+            vpx_usec_timer_start(&timer);
+
+            if (use_highbd_) {
+              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                               filter_strength, use_32x32,
+                                               filter_weight);
+            } else {
+              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                              filter_strength, use_32x32,
+                                              filter_weight);
+            }
+
+            vpx_usec_timer_mark(&timer);
+            const int elapsed_time =
+                static_cast<int>(vpx_usec_timer_elapsed(&timer));
+
+            printf(
+                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
+                "%d, Strength: %d, Time: %5d\n",
+                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
+                elapsed_time);
+          }
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define WRAP_HIGHBD_FUNC(func, bd)                                            \
+  void wrap_##func##_##bd(                                                    \
+      const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,           \
+      int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,           \
+      int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,          \
+      int uv_pre_stride, unsigned int block_width, unsigned int block_height, \
+      int ss_x, int ss_y, int strength, const int *const blk_fw,              \
+      int use_32x32, uint32_t *y_accumulator, uint16_t *y_count,              \
+      uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator,    \
+      uint16_t *v_count) {                                                    \
+    func(reinterpret_cast<const uint16_t *>(y_src), y_src_stride,             \
+         reinterpret_cast<const uint16_t *>(y_pre), y_pre_stride,             \
+         reinterpret_cast<const uint16_t *>(u_src),                           \
+         reinterpret_cast<const uint16_t *>(v_src), uv_src_stride,            \
+         reinterpret_cast<const uint16_t *>(u_pre),                           \
+         reinterpret_cast<const uint16_t *>(v_pre), uv_pre_stride,            \
+         block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32,  \
+         y_accumulator, y_count, u_accumulator, u_count, v_accumulator,       \
+         v_count);                                                            \
+  }
+
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12)
+
+INSTANTIATE_TEST_SUITE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12)));
+#if HAVE_SSE4_1
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12)
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_10,
+                             10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12,
+                             12)));
+#endif  // HAVE_SSE4_1
+#if HAVE_NEON
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12)
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10,
+                             10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12,
+                             12)));
+#endif  // HAVE_NEON
+#else
+INSTANTIATE_TEST_SUITE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest,
+                         ::testing::Values(TemporalFilterWithBd(
+                             &vp9_apply_temporal_filter_sse4_1, 8)));
+#endif  // HAVE_SSE4_1
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest,
+                         ::testing::Values(TemporalFilterWithBd(
+                             &vp9_apply_temporal_filter_neon, 8)));
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/media/libvpx/libvpx/test/yuv_video_source.h b/media/libvpx/libvpx/test/yuv_video_source.h
index 71ad2ab9a6..bb5eec5bb8 100644
--- a/media/libvpx/libvpx/test/yuv_video_source.h
+++ b/media/libvpx/libvpx/test/yuv_video_source.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_YUV_VIDEO_SOURCE_H_
-#define TEST_YUV_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_YUV_VIDEO_SOURCE_H_
+#define VPX_TEST_YUV_VIDEO_SOURCE_H_
 
 #include <cstdio>
 #include <cstdlib>
@@ -27,24 +27,24 @@ class YUVVideoSource : public VideoSource {
   YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
                  unsigned int width, unsigned int height, int rate_numerator,
                  int rate_denominator, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
-        limit_(limit), frame_(0), width_(0), height_(0),
+      : file_name_(file_name), input_file_(nullptr), img_(nullptr),
+        start_(start), limit_(limit), frame_(0), width_(0), height_(0),
         format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
         framerate_denominator_(rate_denominator) {
     // This initializes format_, raw_size_, width_, height_ and allocates img.
     SetSize(width, height, format);
   }
 
-  virtual ~YUVVideoSource() {
+  ~YUVVideoSource() override {
     vpx_img_free(img_);
     if (input_file_) fclose(input_file_);
   }
 
-  virtual void Begin() {
+  void Begin() override {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
-                                     << file_name_;
+    ASSERT_NE(input_file_, nullptr)
+        << "Input file open failed. Filename: " << file_name_;
     if (start_) {
       fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
     }
@@ -53,37 +53,40 @@ class YUVVideoSource : public VideoSource {
     FillFrame();
   }
 
-  virtual void Next() {
+  void Next() override {
     ++frame_;
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  vpx_image_t *img() const override {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
-  virtual vpx_codec_pts_t pts() const { return frame_; }
+  vpx_codec_pts_t pts() const override { return frame_; }
 
-  virtual unsigned long duration() const { return 1; }
+  unsigned long duration() const override { return 1; }
 
-  virtual vpx_rational_t timebase() const {
+  vpx_rational_t timebase() const override {
     const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
     return t;
   }
 
-  virtual unsigned int frame() const { return frame_; }
+  unsigned int frame() const override { return frame_; }
 
-  virtual unsigned int limit() const { return limit_; }
+  unsigned int limit() const override { return limit_; }
 
   virtual void SetSize(unsigned int width, unsigned int height,
                        vpx_img_fmt format) {
     if (width != width_ || height != height_ || format != format_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, format, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
+      img_ = vpx_img_alloc(nullptr, format, width, height, 1);
+      ASSERT_NE(img_, nullptr);
       width_ = width;
       height_ = height;
       format_ = format;
       switch (format) {
+        case VPX_IMG_FMT_NV12:
         case VPX_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break;
         case VPX_IMG_FMT_I422: raw_size_ = width * height * 2; break;
         case VPX_IMG_FMT_I440: raw_size_ = width * height * 2; break;
@@ -98,7 +101,7 @@ class YUVVideoSource : public VideoSource {
   }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
       limit_ = frame_;
@@ -122,4 +125,4 @@ class YUVVideoSource : public VideoSource {
 
 }  // namespace libvpx_test
 
-#endif  // TEST_YUV_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/README.libvpx b/media/libvpx/libvpx/third_party/googletest/README.libvpx
index 0e3b8b9377..5f6b01b0ec 100644
--- a/media/libvpx/libvpx/third_party/googletest/README.libvpx
+++ b/media/libvpx/libvpx/third_party/googletest/README.libvpx
@@ -1,7 +1,7 @@
-URL: http://code.google.com/p/googletest/
-Version: 1.7.0
+URL: https://github.com/google/googletest.git
+Version: release-1.12.1
 License: BSD
-License File: COPYING
+License File: LICENSE
 
 Description:
 Google's framework for writing C++ tests on a variety of platforms
@@ -12,10 +12,18 @@ failures, various options for running the tests, and XML test report
 generation.
 
 Local Modifications:
-- Removed unused declarations of kPathSeparatorString to have warning
-  free build.
-- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P
-  and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables
-  under GCC 5.
-- Only define g_in_fast_death_test_child for non-Windows builds; quiets an
-  unused variable warning.
+- Remove everything but:
+  .clang-format
+  CONTRIBUTORS
+  googletest/
+   include
+   README.md
+   src
+  LICENSE
+- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+  GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+  the mingw32 g++ compilation errors caused by the lack of std::mutex
+  and std::condition_variable in the <mutex> and <condition_variable>
+  headers if mingw32 is configured with the win32 threads option. See
+  https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32
diff --git a/media/libvpx/libvpx/third_party/googletest/src/.clang-format b/media/libvpx/libvpx/third_party/googletest/src/.clang-format
new file mode 100644
index 0000000000..5b9bfe6d22
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/.clang-format
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language:        Cpp
+BasedOnStyle:  Google
diff --git a/media/libvpx/libvpx/third_party/googletest/src/CHANGES b/media/libvpx/libvpx/third_party/googletest/src/CHANGES
deleted file mode 100644
index 0552132421..0000000000
--- a/media/libvpx/libvpx/third_party/googletest/src/CHANGES
+++ /dev/null
@@ -1,157 +0,0 @@
-Changes for 1.7.0:
-
-* New feature: death tests are supported on OpenBSD and in iOS
-  simulator now.
-* New feature: Google Test now implements a protocol to allow
-  a test runner to detect that a test program has exited
-  prematurely and report it as a failure (before it would be
-  falsely reported as a success if the exit code is 0).
-* New feature: Test::RecordProperty() can now be used outside of the
-  lifespan of a test method, in which case it will be attributed to
-  the current test case or the test program in the XML report.
-* New feature (potentially breaking): --gtest_list_tests now prints
-  the type parameters and value parameters for each test.
-* Improvement: char pointers and char arrays are now escaped properly
-  in failure messages.
-* Improvement: failure summary in XML reports now includes file and
-  line information.
-* Improvement: the <testsuites> XML element now has a timestamp attribute.
-* Improvement: When --gtest_filter is specified, XML report now doesn't
-  contain information about tests that are filtered out.
-* Fixed the bug where long --gtest_filter flag values are truncated in
-  death tests.
-* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a
-  function instead of a macro in order to work better with Clang.
-* Compatibility fixes with C++ 11 and various platforms.
-* Bug/warning fixes.
-
-Changes for 1.6.0:
-
-* New feature: ADD_FAILURE_AT() for reporting a test failure at the
-  given source location -- useful for writing testing utilities.
-* New feature: the universal value printer is moved from Google Mock
-  to Google Test.
-* New feature: type parameters and value parameters are reported in
-  the XML report now.
-* A gtest_disable_pthreads CMake option.
-* Colored output works in GNU Screen sessions now.
-* Parameters of value-parameterized tests are now printed in the
-  textual output.
-* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are
-  now correctly reported.
-* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to
-  ostream.
-* More complete handling of exceptions.
-* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter
-  name is already used by another library.
-* --gtest_catch_exceptions is now true by default, allowing a test
-  program to continue after an exception is thrown.
-* Value-parameterized test fixtures can now derive from Test and
-  WithParamInterface<T> separately, easing conversion of legacy tests.
-* Death test messages are clearly marked to make them more
-  distinguishable from other messages.
-* Compatibility fixes for Android, Google Native Client, MinGW, HP UX,
-  PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear),
-  IBM XL C++ (Visual Age C++), and C++0x.
-* Bug fixes and implementation clean-ups.
-* Potentially incompatible changes: disables the harmful 'make install'
-  command in autotools.
-
-Changes for 1.5.0:
-
- * New feature: assertions can be safely called in multiple threads
-   where the pthreads library is available.
- * New feature: predicates used inside EXPECT_TRUE() and friends
-   can now generate custom failure messages.
- * New feature: Google Test can now be compiled as a DLL.
- * New feature: fused source files are included.
- * New feature: prints help when encountering unrecognized Google Test flags.
- * Experimental feature: CMake build script (requires CMake 2.6.4+).
- * Experimental feature: the Pump script for meta programming.
- * double values streamed to an assertion are printed with enough precision
-   to differentiate any two different values.
- * Google Test now works on Solaris and AIX.
- * Build and test script improvements.
- * Bug fixes and implementation clean-ups.
-
- Potentially breaking changes:
-
- * Stopped supporting VC++ 7.1 with exceptions disabled.
- * Dropped support for 'make install'.
-
-Changes for 1.4.0:
-
- * New feature: the event listener API
- * New feature: test shuffling
- * New feature: the XML report format is closer to junitreport and can
-   be parsed by Hudson now.
- * New feature: when a test runs under Visual Studio, its failures are
-   integrated in the IDE.
- * New feature: /MD(d) versions of VC++ projects.
- * New feature: elapsed time for the tests is printed by default.
- * New feature: comes with a TR1 tuple implementation such that Boost
-   is no longer needed for Combine().
- * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends.
- * New feature: the Xcode project can now produce static gtest
-   libraries in addition to a framework.
- * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile,
-   Symbian, gcc, and C++Builder.
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.3.0:
-
- * New feature: death tests on Windows, Cygwin, and Mac.
- * New feature: ability to use Google Test assertions in other testing
-   frameworks.
- * New feature: ability to run disabled test via
-   --gtest_also_run_disabled_tests.
- * New feature: the --help flag for printing the usage.
- * New feature: access to Google Test flag values in user code.
- * New feature: a script that packs Google Test into one .h and one
-   .cc file for easy deployment.
- * New feature: support for distributing test functions to multiple
-   machines (requires support from the test runner).
- * Bug fixes and implementation clean-ups.
-
-Changes for 1.2.1:
-
- * Compatibility fixes for Linux IA-64 and IBM z/OS.
- * Added support for using Boost and other TR1 implementations.
- * Changes to the build scripts to support upcoming release of Google C++
-   Mocking Framework.
- * Added Makefile to the distribution package.
- * Improved build instructions in README.
-
-Changes for 1.2.0:
-
- * New feature: value-parameterized tests.
- * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS)
-   macros.
- * Changed the XML report format to match JUnit/Ant's.
- * Added tests to the Xcode project.
- * Added scons/SConscript for building with SCons.
- * Added src/gtest-all.cc for building Google Test from a single file.
- * Fixed compatibility with Solaris and z/OS.
- * Enabled running Python tests on systems with python 2.3 installed,
-   e.g. Mac OS X 10.4.
- * Bug fixes.
-
-Changes for 1.1.0:
-
- * New feature: type-parameterized tests.
- * New feature: exception assertions.
- * New feature: printing elapsed time of tests.
- * Improved the robustness of death tests.
- * Added an Xcode project and samples.
- * Adjusted the output format on Windows to be understandable by Visual Studio.
- * Minor bug fixes.
-
-Changes for 1.0.1:
-
- * Added project files for Visual Studio 7.1.
- * Fixed issues with compiling on Mac OS X.
- * Fixed issues with compiling on Cygwin.
-
-Changes for 1.0.0:
-
- * Initial Open Source release of Google Test
diff --git a/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS b/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS
index feae2fc044..77397a5b53 100644
--- a/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS
+++ b/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS
@@ -5,33 +5,61 @@
 
 Ajay Joshi <jaj@google.com>
 Balázs Dán <balazs.dan@gmail.com>
+Benoit Sigoure <tsuna@google.com>
 Bharat Mediratta <bharat@menalto.com>
+Bogdan Piloca <boo@google.com>
 Chandler Carruth <chandlerc@google.com>
 Chris Prince <cprince@google.com>
 Chris Taylor <taylorc@google.com>
 Dan Egnor <egnor@google.com>
+Dave MacLachlan <dmaclach@gmail.com>
+David Anderson <danderson@google.com>
+Dean Sturtevant
 Eric Roman <eroman@chromium.org>
+Gene Volovich <gv@cite.com>
 Hady Zalek <hady.zalek@gmail.com>
+Hal Burch <gmock@hburch.com>
 Jeffrey Yasskin <jyasskin@google.com>
+Jim Keller <jimkeller@google.com>
+Joe Walnes <joe@truemesh.com>
+Jon Wray <jwray@google.com>
 Jói Sigurðsson <joi@google.com>
 Keir Mierle <mierle@gmail.com>
 Keith Ray <keith.ray@gmail.com>
 Kenton Varda <kenton@google.com>
+Kostya Serebryany <kcc@google.com>
+Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
+Lev Makhlis
 Manuel Klimek <klimek@google.com>
+Mario Tanev <radix@google.com>
+Mark Paskin
 Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
+Matthew Simmons <simmonmt@acm.org>
 Mika Raento <mikie@iki.fi>
+Mike Bland <mbland@google.com>
 Miklós Fazekas <mfazekas@szemafor.com>
+Neal Norwitz <nnorwitz@gmail.com>
+Nermin Ozkiranartli <nermin@google.com>
+Owen Carlsen <ocarlsen@google.com>
+Paneendra Ba <paneendra@google.com>
 Pasi Valminen <pasi.valminen@gmail.com>
 Patrick Hanna <phanna@google.com>
 Patrick Riley <pfr@google.com>
+Paul Menage <menage@google.com>
 Peter Kaminski <piotrk@google.com>
+Piotr Kaminski <piotrk@google.com>
 Preston Jackson <preston.a.jackson@gmail.com>
 Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
 Russ Cox <rsc@google.com>
 Russ Rufer <russ@pentad.com>
 Sean Mcafee <eefacm@gmail.com>
 Sigurður Ásgeirsson <siggi@google.com>
+Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
+Takeshi Yoshino <tyoshino@google.com>
 Tracy Bialik <tracy@pentad.com>
 Vadim Berman <vadimb@google.com>
 Vlad Losev <vladl@google.com>
+Wolfgang Klier <wklier@google.com>
 Zhanyong Wan <wan@google.com>
diff --git a/media/libvpx/libvpx/third_party/googletest/src/README b/media/libvpx/libvpx/third_party/googletest/src/README
deleted file mode 100644
index 26f35a8479..0000000000
--- a/media/libvpx/libvpx/third_party/googletest/src/README
+++ /dev/null
@@ -1,435 +0,0 @@
-Google C++ Testing Framework
-============================
-
-http://code.google.com/p/googletest/
-
-Overview
---------
-
-Google's framework for writing C++ tests on a variety of platforms
-(Linux, Mac OS X, Windows, Windows CE, Symbian, etc).  Based on the
-xUnit architecture.  Supports automatic test discovery, a rich set of
-assertions, user-defined assertions, death tests, fatal and non-fatal
-failures, various options for running the tests, and XML test report
-generation.
-
-Please see the project page above for more information as well as the
-mailing list for questions, discussions, and development.  There is
-also an IRC channel on OFTC (irc.oftc.net) #gtest available.  Please
-join us!
-
-Requirements for End Users
---------------------------
-
-Google Test is designed to have fairly minimal requirements to build
-and use with your projects, but there are some.  Currently, we support
-Linux, Windows, Mac OS X, and Cygwin.  We will also make our best
-effort to support other platforms (e.g. Solaris, AIX, and z/OS).
-However, since core members of the Google Test project have no access
-to these platforms, Google Test may have outstanding issues there.  If
-you notice any problems on your platform, please notify
-googletestframework@googlegroups.com.  Patches for fixing them are
-even more welcome!
-
-### Linux Requirements ###
-
-These are the base requirements to build and use Google Test from a source
-package (as described below):
-  * GNU-compatible Make or gmake
-  * POSIX-standard shell
-  * POSIX(-2) Regular Expressions (regex.h)
-  * A C++98-standard-compliant compiler
-
-### Windows Requirements ###
-
-  * Microsoft Visual C++ 7.1 or newer
-
-### Cygwin Requirements ###
-
-  * Cygwin 1.5.25-14 or newer
-
-### Mac OS X Requirements ###
-
-  * Mac OS X 10.4 Tiger or newer
-  * Developer Tools Installed
-
-Also, you'll need CMake 2.6.4 or higher if you want to build the
-samples using the provided CMake script, regardless of the platform.
-
-Requirements for Contributors
------------------------------
-
-We welcome patches.  If you plan to contribute a patch, you need to
-build Google Test and its own tests from an SVN checkout (described
-below), which has further requirements:
-
-  * Python version 2.3 or newer (for running some of the tests and
-    re-generating certain source files from templates)
-  * CMake 2.6.4 or newer
-
-Getting the Source
-------------------
-
-There are two primary ways of getting Google Test's source code: you
-can download a stable source release in your preferred archive format,
-or directly check out the source from our Subversion (SVN) repositary.
-The SVN checkout requires a few extra steps and some extra software
-packages on your system, but lets you track the latest development and
-make patches much more easily, so we highly encourage it.
-
-### Source Package ###
-
-Google Test is released in versioned source packages which can be
-downloaded from the download page [1].  Several different archive
-formats are provided, but the only difference is the tools used to
-manipulate them, and the size of the resulting file.  Download
-whichever you are most comfortable with.
-
-  [1] http://code.google.com/p/googletest/downloads/list
-
-Once the package is downloaded, expand it using whichever tools you
-prefer for that type.  This will result in a new directory with the
-name "gtest-X.Y.Z" which contains all of the source code.  Here are
-some examples on Linux:
-
-  tar -xvzf gtest-X.Y.Z.tar.gz
-  tar -xvjf gtest-X.Y.Z.tar.bz2
-  unzip gtest-X.Y.Z.zip
-
-### SVN Checkout ###
-
-To check out the main branch (also known as the "trunk") of Google
-Test, run the following Subversion command:
-
-  svn checkout http://googletest.googlecode.com/svn/trunk/ gtest-svn
-
-Setting up the Build
---------------------
-
-To build Google Test and your tests that use it, you need to tell your
-build system where to find its headers and source files.  The exact
-way to do it depends on which build system you use, and is usually
-straightforward.
-
-### Generic Build Instructions ###
-
-Suppose you put Google Test in directory ${GTEST_DIR}.  To build it,
-create a library build target (or a project as called by Visual Studio
-and Xcode) to compile
-
-  ${GTEST_DIR}/src/gtest-all.cc
-
-with ${GTEST_DIR}/include in the system header search path and ${GTEST_DIR}
-in the normal header search path.  Assuming a Linux-like system and gcc,
-something like the following will do:
-
-  g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
-      -pthread -c ${GTEST_DIR}/src/gtest-all.cc
-  ar -rv libgtest.a gtest-all.o
-
-(We need -pthread as Google Test uses threads.)
-
-Next, you should compile your test source file with
-${GTEST_DIR}/include in the system header search path, and link it
-with gtest and any other necessary libraries:
-
-  g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
-      -o your_test
-
-As an example, the make/ directory contains a Makefile that you can
-use to build Google Test on systems where GNU make is available
-(e.g. Linux, Mac OS X, and Cygwin).  It doesn't try to build Google
-Test's own tests.  Instead, it just builds the Google Test library and
-a sample test.  You can use it as a starting point for your own build
-script.
-
-If the default settings are correct for your environment, the
-following commands should succeed:
-
-  cd ${GTEST_DIR}/make
-  make
-  ./sample1_unittest
-
-If you see errors, try to tweak the contents of make/Makefile to make
-them go away.  There are instructions in make/Makefile on how to do
-it.
-
-### Using CMake ###
-
-Google Test comes with a CMake build script (CMakeLists.txt) that can
-be used on a wide range of platforms ("C" stands for cross-platofrm.).
-If you don't have CMake installed already, you can download it for
-free from http://www.cmake.org/.
-
-CMake works by generating native makefiles or build projects that can
-be used in the compiler environment of your choice.  The typical
-workflow starts with:
-
-  mkdir mybuild       # Create a directory to hold the build output.
-  cd mybuild
-  cmake ${GTEST_DIR}  # Generate native build scripts.
-
-If you want to build Google Test's samples, you should replace the
-last command with
-
-  cmake -Dgtest_build_samples=ON ${GTEST_DIR}
-
-If you are on a *nix system, you should now see a Makefile in the
-current directory.  Just type 'make' to build gtest.
-
-If you use Windows and have Vistual Studio installed, a gtest.sln file
-and several .vcproj files will be created.  You can then build them
-using Visual Studio.
-
-On Mac OS X with Xcode installed, a .xcodeproj file will be generated.
-
-### Legacy Build Scripts ###
-
-Before settling on CMake, we have been providing hand-maintained build
-projects/scripts for Visual Studio, Xcode, and Autotools.  While we
-continue to provide them for convenience, they are not actively
-maintained any more.  We highly recommend that you follow the
-instructions in the previous two sections to integrate Google Test
-with your existing build system.
-
-If you still need to use the legacy build scripts, here's how:
-
-The msvc\ folder contains two solutions with Visual C++ projects.
-Open the gtest.sln or gtest-md.sln file using Visual Studio, and you
-are ready to build Google Test the same way you build any Visual
-Studio project.  Files that have names ending with -md use DLL
-versions of Microsoft runtime libraries (the /MD or the /MDd compiler
-option).  Files without that suffix use static versions of the runtime
-libraries (the /MT or the /MTd option).  Please note that one must use
-the same option to compile both gtest and the test code.  If you use
-Visual Studio 2005 or above, we recommend the -md version as /MD is
-the default for new projects in these versions of Visual Studio.
-
-On Mac OS X, open the gtest.xcodeproj in the xcode/ folder using
-Xcode.  Build the "gtest" target.  The universal binary framework will
-end up in your selected build directory (selected in the Xcode
-"Preferences..." -> "Building" pane and defaults to xcode/build).
-Alternatively, at the command line, enter:
-
-  xcodebuild
-
-This will build the "Release" configuration of gtest.framework in your
-default build location.  See the "xcodebuild" man page for more
-information about building different configurations and building in
-different locations.
-
-If you wish to use the Google Test Xcode project with Xcode 4.x and
-above, you need to either:
- * update the SDK configuration options in xcode/Config/General.xconfig.
-   Comment options SDKROOT, MACOS_DEPLOYMENT_TARGET, and GCC_VERSION. If
-   you choose this route you lose the ability to target earlier versions
-   of MacOS X.
- * Install an SDK for an earlier version. This doesn't appear to be
-   supported by Apple, but has been reported to work
-   (http://stackoverflow.com/questions/5378518).
-
-Tweaking Google Test
---------------------
-
-Google Test can be used in diverse environments.  The default
-configuration may not work (or may not work well) out of the box in
-some environments.  However, you can easily tweak Google Test by
-defining control macros on the compiler command line.  Generally,
-these macros are named like GTEST_XYZ and you define them to either 1
-or 0 to enable or disable a certain feature.
-
-We list the most frequently used macros below.  For a complete list,
-see file include/gtest/internal/gtest-port.h.
-
-### Choosing a TR1 Tuple Library ###
-
-Some Google Test features require the C++ Technical Report 1 (TR1)
-tuple library, which is not yet available with all compilers.  The
-good news is that Google Test implements a subset of TR1 tuple that's
-enough for its own need, and will automatically use this when the
-compiler doesn't provide TR1 tuple.
-
-Usually you don't need to care about which tuple library Google Test
-uses.  However, if your project already uses TR1 tuple, you need to
-tell Google Test to use the same TR1 tuple library the rest of your
-project uses, or the two tuple implementations will clash.  To do
-that, add
-
-  -DGTEST_USE_OWN_TR1_TUPLE=0
-
-to the compiler flags while compiling Google Test and your tests.  If
-you want to force Google Test to use its own tuple library, just add
-
-  -DGTEST_USE_OWN_TR1_TUPLE=1
-
-to the compiler flags instead.
-
-If you don't want Google Test to use tuple at all, add
-
-  -DGTEST_HAS_TR1_TUPLE=0
-
-and all features using tuple will be disabled.
-
-### Multi-threaded Tests ###
-
-Google Test is thread-safe where the pthread library is available.
-After #include "gtest/gtest.h", you can check the GTEST_IS_THREADSAFE
-macro to see whether this is the case (yes if the macro is #defined to
-1, no if it's undefined.).
-
-If Google Test doesn't correctly detect whether pthread is available
-in your environment, you can force it with
-
-  -DGTEST_HAS_PTHREAD=1
-
-or
-
-  -DGTEST_HAS_PTHREAD=0
-
-When Google Test uses pthread, you may need to add flags to your
-compiler and/or linker to select the pthread library, or you'll get
-link errors.  If you use the CMake script or the deprecated Autotools
-script, this is taken care of for you.  If you use your own build
-script, you'll need to read your compiler and linker's manual to
-figure out what flags to add.
-
-### As a Shared Library (DLL) ###
-
-Google Test is compact, so most users can build and link it as a
-static library for the simplicity.  You can choose to use Google Test
-as a shared library (known as a DLL on Windows) if you prefer.
-
-To compile *gtest* as a shared library, add
-
-  -DGTEST_CREATE_SHARED_LIBRARY=1
-
-to the compiler flags.  You'll also need to tell the linker to produce
-a shared library instead - consult your linker's manual for how to do
-it.
-
-To compile your *tests* that use the gtest shared library, add
-
-  -DGTEST_LINKED_AS_SHARED_LIBRARY=1
-
-to the compiler flags.
-
-Note: while the above steps aren't technically necessary today when
-using some compilers (e.g. GCC), they may become necessary in the
-future, if we decide to improve the speed of loading the library (see
-http://gcc.gnu.org/wiki/Visibility for details).  Therefore you are
-recommended to always add the above flags when using Google Test as a
-shared library.  Otherwise a future release of Google Test may break
-your build script.
-
-### Avoiding Macro Name Clashes ###
-
-In C++, macros don't obey namespaces.  Therefore two libraries that
-both define a macro of the same name will clash if you #include both
-definitions.  In case a Google Test macro clashes with another
-library, you can force Google Test to rename its macro to avoid the
-conflict.
-
-Specifically, if both Google Test and some other code define macro
-FOO, you can add
-
-  -DGTEST_DONT_DEFINE_FOO=1
-
-to the compiler flags to tell Google Test to change the macro's name
-from FOO to GTEST_FOO.  Currently FOO can be FAIL, SUCCEED, or TEST.
-For example, with -DGTEST_DONT_DEFINE_TEST=1, you'll need to write
-
-  GTEST_TEST(SomeTest, DoesThis) { ... }
-
-instead of
-
-  TEST(SomeTest, DoesThis) { ... }
-
-in order to define a test.
-
-Upgrating from an Earlier Version
----------------------------------
-
-We strive to keep Google Test releases backward compatible.
-Sometimes, though, we have to make some breaking changes for the
-users' long-term benefits.  This section describes what you'll need to
-do if you are upgrading from an earlier version of Google Test.
-
-### Upgrading from 1.3.0 or Earlier ###
-
-You may need to explicitly enable or disable Google Test's own TR1
-tuple library.  See the instructions in section "Choosing a TR1 Tuple
-Library".
-
-### Upgrading from 1.4.0 or Earlier ###
-
-The Autotools build script (configure + make) is no longer officially
-supportted.  You are encouraged to migrate to your own build system or
-use CMake.  If you still need to use Autotools, you can find
-instructions in the README file from Google Test 1.4.0.
-
-On platforms where the pthread library is available, Google Test uses
-it in order to be thread-safe.  See the "Multi-threaded Tests" section
-for what this means to your build script.
-
-If you use Microsoft Visual C++ 7.1 with exceptions disabled, Google
-Test will no longer compile.  This should affect very few people, as a
-large portion of STL (including <string>) doesn't compile in this mode
-anyway.  We decided to stop supporting it in order to greatly simplify
-Google Test's implementation.
-
-Developing Google Test
-----------------------
-
-This section discusses how to make your own changes to Google Test.
-
-### Testing Google Test Itself ###
-
-To make sure your changes work as intended and don't break existing
-functionality, you'll want to compile and run Google Test's own tests.
-For that you can use CMake:
-
-  mkdir mybuild
-  cd mybuild
-  cmake -Dgtest_build_tests=ON ${GTEST_DIR}
-
-Make sure you have Python installed, as some of Google Test's tests
-are written in Python.  If the cmake command complains about not being
-able to find Python ("Could NOT find PythonInterp (missing:
-PYTHON_EXECUTABLE)"), try telling it explicitly where your Python
-executable can be found:
-
-  cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR}
-
-Next, you can build Google Test and all of its own tests.  On *nix,
-this is usually done by 'make'.  To run the tests, do
-
-  make test
-
-All tests should pass.
-
-### Regenerating Source Files ###
-
-Some of Google Test's source files are generated from templates (not
-in the C++ sense) using a script.  A template file is named FOO.pump,
-where FOO is the name of the file it will generate.  For example, the
-file include/gtest/internal/gtest-type-util.h.pump is used to generate
-gtest-type-util.h in the same directory.
-
-Normally you don't need to worry about regenerating the source files,
-unless you need to modify them.  In that case, you should modify the
-corresponding .pump files instead and run the pump.py Python script to
-regenerate them.  You can find pump.py in the scripts/ directory.
-Read the Pump manual [2] for how to use it.
-
-  [2] http://code.google.com/p/googletest/wiki/PumpManual
-
-### Contributing a Patch ###
-
-We welcome patches.  Please read the Google Test developer's guide [3]
-for how you can contribute.  In particular, make sure you have signed
-the Contributor License Agreement, or we won't be able to accept the
-patch.
-
-  [3] http://code.google.com/p/googletest/wiki/GoogleTestDevGuide
-
-Happy testing!
diff --git a/media/libvpx/libvpx/third_party/googletest/src/README.md b/media/libvpx/libvpx/third_party/googletest/src/README.md
new file mode 100644
index 0000000000..d26b309ed0
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/README.md
@@ -0,0 +1,217 @@
+### Generic Build Instructions
+
+#### Setup
+
+To build GoogleTest and your tests that use it, you need to tell your build
+system where to find its headers and source files. The exact way to do it
+depends on which build system you use, and is usually straightforward.
+
+### Build with CMake
+
+GoogleTest comes with a CMake build script
+([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+that can be used on a wide range of platforms ("C" stands for cross-platform.).
+If you don't have CMake installed already, you can download it for free from
+<http://www.cmake.org/>.
+
+CMake works by generating native makefiles or build projects that can be used in
+the compiler environment of your choice. You can either build GoogleTest as a
+standalone project or it can be incorporated into an existing CMake build for
+another project.
+
+#### Standalone CMake Project
+
+When building GoogleTest as a standalone project, the typical workflow starts
+with
+
+```
+git clone https://github.com/google/googletest.git -b release-1.11.0
+cd googletest        # Main directory of the cloned repository.
+mkdir build          # Create a directory to hold the build output.
+cd build
+cmake ..             # Generate native build scripts for GoogleTest.
+```
+
+The above command also includes GoogleMock by default. And so, if you want to
+build only GoogleTest, you should replace the last command with
+
+```
+cmake .. -DBUILD_GMOCK=OFF
+```
+
+If you are on a \*nix system, you should now see a Makefile in the current
+directory. Just type `make` to build GoogleTest. And then you can simply install
+GoogleTest if you are a system administrator.
+
+```
+make
+sudo make install    # Install in /usr/local/ by default
+```
+
+If you use Windows and have Visual Studio installed, a `gtest.sln` file and
+several `.vcproj` files will be created. You can then build them using Visual
+Studio.
+
+On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated.
+
+#### Incorporating Into An Existing CMake Project
+
+If you want to use GoogleTest in a project which already uses CMake, the easiest
+way is to get installed libraries and headers.
+
+*   Import GoogleTest by using `find_package` (or `pkg_check_modules`). For
+    example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the
+    libraries as `GTest::gtest`, `GTest::gmock`.
+
+And a more robust and flexible approach is to build GoogleTest as part of that
+project directly. This is done by making the GoogleTest source code available to
+the main build and adding it using CMake's `add_subdirectory()` command. This
+has the significant advantage that the same compiler and linker settings are
+used between GoogleTest and the rest of your project, so issues associated with
+using incompatible libraries (eg debug/release), etc. are avoided. This is
+particularly useful on Windows. Making GoogleTest's source code available to the
+main build can be done a few different ways:
+
+*   Download the GoogleTest source code manually and place it at a known
+    location. This is the least flexible approach and can make it more difficult
+    to use with continuous integration systems, etc.
+*   Embed the GoogleTest source code as a direct copy in the main project's
+    source tree. This is often the simplest approach, but is also the hardest to
+    keep up to date. Some organizations may not permit this method.
+*   Add GoogleTest as a git submodule or equivalent. This may not always be
+    possible or appropriate. Git submodules, for example, have their own set of
+    advantages and drawbacks.
+*   Use CMake to download GoogleTest as part of the build's configure step. This
+    approach doesn't have the limitations of the other methods.
+
+The last of the above methods is implemented with a small piece of CMake code
+that downloads and pulls the GoogleTest code into the main build.
+
+Just add to your `CMakeLists.txt`:
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  # Specify the commit you depend on and update it regularly.
+  URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
+# Now simply link against gtest or gtest_main as needed. Eg
+add_executable(example example.cpp)
+target_link_libraries(example gtest_main)
+add_test(NAME example_test COMMAND example)
+```
+
+Note that this approach requires CMake 3.14 or later due to its use of the
+`FetchContent_MakeAvailable()` command.
+
+##### Visual Studio Dynamic vs Static Runtimes
+
+By default, new Visual Studio projects link the C runtimes dynamically but
+GoogleTest links them statically. This will generate an error that looks
+something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch
+detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value
+'MDd_DynamicDebug' in main.obj
+
+GoogleTest already has a CMake option for this: `gtest_force_shared_crt`
+
+Enabling this option will make gtest link the runtimes dynamically too, and
+match the project in which it is included.
+
+#### C++ Standard Version
+
+An environment that supports C++11 is required in order to successfully build
+GoogleTest. One way to ensure this is to specify the standard in the top-level
+project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this
+is not feasible, for example in a C project using GoogleTest for validation,
+then it can be specified by adding it to the options for cmake via the
+`DCMAKE_CXX_FLAGS` option.
+
+### Tweaking GoogleTest
+
+GoogleTest can be used in diverse environments. The default configuration may
+not work (or may not work well) out of the box in some environments. However,
+you can easily tweak GoogleTest by defining control macros on the compiler
+command line. Generally, these macros are named like `GTEST_XYZ` and you define
+them to either 1 or 0 to enable or disable a certain feature.
+
+We list the most frequently used macros below. For a complete list, see file
+[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h).
+
+### Multi-threaded Tests
+
+GoogleTest is thread-safe where the pthread library is available. After
+`#include "gtest/gtest.h"`, you can check the
+`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is
+`#defined` to 1, no if it's undefined.).
+
+If GoogleTest doesn't correctly detect whether pthread is available in your
+environment, you can force it with
+
+    -DGTEST_HAS_PTHREAD=1
+
+or
+
+    -DGTEST_HAS_PTHREAD=0
+
+When GoogleTest uses pthread, you may need to add flags to your compiler and/or
+linker to select the pthread library, or you'll get link errors. If you use the
+CMake script, this is taken care of for you. If you use your own build script,
+you'll need to read your compiler and linker's manual to figure out what flags
+to add.
+
+### As a Shared Library (DLL)
+
+GoogleTest is compact, so most users can build and link it as a static library
+for the simplicity. You can choose to use GoogleTest as a shared library (known
+as a DLL on Windows) if you prefer.
+
+To compile *gtest* as a shared library, add
+
+    -DGTEST_CREATE_SHARED_LIBRARY=1
+
+to the compiler flags. You'll also need to tell the linker to produce a shared
+library instead - consult your linker's manual for how to do it.
+
+To compile your *tests* that use the gtest shared library, add
+
+    -DGTEST_LINKED_AS_SHARED_LIBRARY=1
+
+to the compiler flags.
+
+Note: while the above steps aren't technically necessary today when using some
+compilers (e.g. GCC), they may become necessary in the future, if we decide to
+improve the speed of loading the library (see
+<http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are recommended
+to always add the above flags when using GoogleTest as a shared library.
+Otherwise a future release of GoogleTest may break your build script.
+
+### Avoiding Macro Name Clashes
+
+In C++, macros don't obey namespaces. Therefore two libraries that both define a
+macro of the same name will clash if you `#include` both definitions. In case a
+GoogleTest macro clashes with another library, you can force GoogleTest to
+rename its macro to avoid the conflict.
+
+Specifically, if both GoogleTest and some other code define macro FOO, you can
+add
+
+    -DGTEST_DONT_DEFINE_FOO=1
+
+to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
+to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`,
+`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`,
+`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For
+example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
+
+    GTEST_TEST(SomeTest, DoesThis) { ... }
+
+instead of
+
+    TEST(SomeTest, DoesThis) { ... }
+
+in order to define a test.
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h
new file mode 100644
index 0000000000..addbb59c64
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h
@@ -0,0 +1,237 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251                                   \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful.  When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+//   1. Defining predicate functions to be used with Boolean test assertions
+//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+//   2. Defining predicate-format functions to be
+//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false (5 is odd)
+//   Expected: true
+//
+// instead of a more opaque
+//
+//   Value of: IsEven(Fib(5))
+//     Actual: false
+//   Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+//   testing::AssertionResult IsEven(int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess() << n << " is even";
+//     else
+//       return testing::AssertionFailure() << n << " is odd";
+//   }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+//   Value of: IsEven(Fib(6))
+//     Actual: true (8 is even)
+//   Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+//   // Verifies that Foo() returns an even number.
+//   EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+//   testing::AssertionResult IsEven(const char* expr, int n) {
+//     if ((n % 2) == 0)
+//       return testing::AssertionSuccess();
+//     else
+//       return testing::AssertionFailure()
+//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
+//   }
+//
+// If Foo() returns 5, you will see the following message:
+//
+//   Expected: Foo() is even
+//     Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+  // Copy constructor.
+  // Used in EXPECT_TRUE/FALSE(assertion_result).
+  AssertionResult(const AssertionResult& other);
+
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+  // Used in the EXPECT_TRUE/FALSE(bool_expression).
+  //
+  // T must be contextually convertible to bool.
+  //
+  // The second parameter prevents this overload from being considered if
+  // the argument is implicitly convertible to AssertionResult. In that case
+  // we want AssertionResult's copy constructor to be used.
+  template <typename T>
+  explicit AssertionResult(
+      const T& success,
+      typename std::enable_if<
+          !std::is_convertible<T, AssertionResult>::value>::type*
+      /*enabler*/
+      = nullptr)
+      : success_(success) {}
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+  // Assignment operator.
+  AssertionResult& operator=(AssertionResult other) {
+    swap(other);
+    return *this;
+  }
+
+  // Returns true if and only if the assertion succeeded.
+  operator bool() const { return success_; }  // NOLINT
+
+  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+  AssertionResult operator!() const;
+
+  // Returns the text streamed into this AssertionResult. Test assertions
+  // use it when they fail (i.e., the predicate's outcome doesn't match the
+  // assertion's expectation). When nothing has been streamed into the
+  // object, returns an empty string.
+  const char* message() const {
+    return message_.get() != nullptr ? message_->c_str() : "";
+  }
+  // Deprecated; please use message() instead.
+  const char* failure_message() const { return message(); }
+
+  // Streams a custom failure message into this object.
+  template <typename T>
+  AssertionResult& operator<<(const T& value) {
+    AppendMessage(Message() << value);
+    return *this;
+  }
+
+  // Allows streaming basic output manipulators such as endl or flush into
+  // this object.
+  AssertionResult& operator<<(
+      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+    AppendMessage(Message() << basic_manipulator);
+    return *this;
+  }
+
+ private:
+  // Appends the contents of message to message_.
+  void AppendMessage(const Message& a_message) {
+    if (message_.get() == nullptr) message_.reset(new ::std::string);
+    message_->append(a_message.GetString().c_str());
+  }
+
+  // Swap the contents of this AssertionResult with other.
+  void swap(AssertionResult& other);
+
+  // Stores result of the assertion predicate.
+  bool success_;
+  // Stores the message describing the condition in case the expectation
+  // construct is not satisfied with the predicate's outcome.
+  // Referenced via a pointer to avoid taking too much stack frame space
+  // with test assertions.
+  std::unique_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
new file mode 100644
index 0000000000..84e5a5bbd3
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
@@ -0,0 +1,345 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the public API for death tests.  It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+#include "gtest/internal/gtest-death-test-internal.h"
+
+// This flag controls the style of death tests.  Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+namespace testing {
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+}  // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+//   1. It generates a warning if there is more than one active
+//   thread.  This is because it's safe to fork() or clone() only
+//   when there is a single thread.
+//
+//   2. The parent process clone()s a sub-process and runs the death
+//   test in it; the sub-process exits with code 0 at the end of the
+//   death test, if it hasn't exited already.
+//
+//   3. The parent process waits for the sub-process to terminate.
+//
+//   4. The parent process checks the exit code and error message of
+//   the sub-process.
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//                  << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// The final parameter to each of these macros is a matcher applied to any data
+// the sub-process wrote to stderr.  For compatibility with existing tests, a
+// bare string is interpreted as a regular expression matcher.
+//
+// On the regular expressions used in death tests:
+//
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows or Mac), we only support a simple regex
+//   syntax implemented as part of Google Test.  This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax.  For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support.  We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from.  In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure.  In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process.  For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process.  This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+//
+
+// Asserts that a given `statement` causes the program to exit, with an
+// integer exit status that satisfies `predicate`, and emitting error output
+// that matches `matcher`.
+#define ASSERT_EXIT(statement, predicate, matcher) \
+  GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
+
+// Like `ASSERT_EXIT`, but continues on to successive tests in the
+// test suite, if any:
+#define EXPECT_EXIT(statement, predicate, matcher) \
+  GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given `statement` causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches `matcher`.
+#define ASSERT_DEATH(statement, matcher) \
+  ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+
+// Like `ASSERT_DEATH`, but continues on to successive tests in the
+// test suite, if any:
+#define EXPECT_DEATH(statement, matcher) \
+  EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+  explicit ExitedWithCode(int exit_code);
+  ExitedWithCode(const ExitedWithCode&) = default;
+  void operator=(const ExitedWithCode& other) = delete;
+  bool operator()(int exit_status) const;
+
+ private:
+  const int exit_code_;
+};
+
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+class GTEST_API_ KilledBySignal {
+ public:
+  explicit KilledBySignal(int signum);
+  bool operator()(int exit_status) const;
+
+ private:
+  const int signum_;
+};
+#endif  // !GTEST_OS_WINDOWS
+
+// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the sideeffects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+//   if (sideeffect) {
+//     *sideeffect = 12;
+//   }
+//   LOG(DFATAL) << "death";
+//   return 12;
+// }
+//
+// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) {
+//   int sideeffect = 0;
+//   // Only asserts in dbg.
+//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+//   // opt-mode has sideeffect visible.
+//   EXPECT_EQ(12, sideeffect);
+// #else
+//   // dbg-mode no visible sideeffect.
+//   EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugReturn12InOpt() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects.  A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+//   // Side-effects here will have an effect after this statement in
+//   // opt mode, but none in debug mode.
+//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+#ifdef NDEBUG
+
+#define EXPECT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#define ASSERT_DEBUG_DEATH(statement, regex) \
+  GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+#else
+
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
+
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
+
+#endif  // NDEBUG for EXPECT_DEBUG_DEATH
+#endif  // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters
+// on systems that support death tests. This allows one to write such a macro on
+// a system that does not support death tests and be sure that it will compile
+// on a death-test supporting system. It is exposed publicly so that systems
+// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST
+// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and
+// ASSERT_DEATH_IF_SUPPORTED.
+//
+// Parameters:
+//   statement -  A statement that a macro such as EXPECT_DEATH would test
+//                for program termination. This macro has to make sure this
+//                statement is compiled but not executed, to ensure that
+//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+//                parameter if and only if EXPECT_DEATH compiles with it.
+//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
+//                the output of statement.  This parameter has to be
+//                compiled but not evaluated by this macro, to ensure that
+//                this macro only accepts expressions that a macro such as
+//                EXPECT_DEATH would accept.
+//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+//                compile inside functions where ASSERT_DEATH doesn't
+//                compile.
+//
+//  The branch that has an always false condition is used to ensure that
+//  statement and regex are compiled (and thus syntactically correct) but
+//  never executed. The unreachable code macro protects the terminator
+//  statement from generating an 'unreachable code' warning in case
+//  statement unconditionally returns or throws. The Message constructor at
+//  the end allows the syntax of streaming additional messages into the
+//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator)             \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+                        << "Statement '" #statement "' cannot be verified.";   \
+  } else if (::testing::internal::AlwaysFalse()) {                             \
+    ::testing::internal::RE::PartialMatch(".*", (regex));                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
+    terminator;                                                                \
+  } else                                                                       \
+    ::testing::Message()
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning.  This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+#else
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#endif
+
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
new file mode 100644
index 0000000000..bffa00c533
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
@@ -0,0 +1,956 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+
+#include <atomic>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GTEST_MAYBE_5046_ 5046
+#else
+#define GTEST_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+                              clients of class B */
+    /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+//   1. a class FooMatcherMatcher that implements the matcher interface:
+//     using is_gtest_matcher = void;
+//     bool MatchAndExplain(const T&, std::ostream*);
+//       (MatchResultListener* can also be used instead of std::ostream*)
+//     void DescribeTo(std::ostream*);
+//     void DescribeNegationTo(std::ostream*);
+//
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherMatcher.
+
+class MatchResultListener {
+ public:
+  // Creates a listener object with the given underlying ostream.  The
+  // listener does not own the ostream, and does not dereference it
+  // in the constructor or destructor.
+  explicit MatchResultListener(::std::ostream* os) : stream_(os) {}
+  virtual ~MatchResultListener() = 0;  // Makes this class abstract.
+
+  // Streams x to the underlying ostream; does nothing if the ostream
+  // is NULL.
+  template <typename T>
+  MatchResultListener& operator<<(const T& x) {
+    if (stream_ != nullptr) *stream_ << x;
+    return *this;
+  }
+
+  // Returns the underlying ostream.
+  ::std::ostream* stream() { return stream_; }
+
+  // Returns true if and only if the listener is interested in an explanation
+  // of the match result.  A matcher's MatchAndExplain() method can use
+  // this information to avoid generating the explanation when no one
+  // intends to hear it.
+  bool IsInterested() const { return stream_ != nullptr; }
+
+ private:
+  ::std::ostream* const stream_;
+
+  MatchResultListener(const MatchResultListener&) = delete;
+  MatchResultListener& operator=(const MatchResultListener&) = delete;
+};
+
+inline MatchResultListener::~MatchResultListener() {}
+
+// An instance of a subclass of this knows how to describe itself as a
+// matcher.
+class GTEST_API_ MatcherDescriberInterface {
+ public:
+  virtual ~MatcherDescriberInterface() {}
+
+  // Describes this matcher to an ostream.  The function should print
+  // a verb phrase that describes the property a value matching this
+  // matcher should have.  The subject of the verb phrase is the value
+  // being matched.  For example, the DescribeTo() method of the Gt(7)
+  // matcher prints "is greater than 7".
+  virtual void DescribeTo(::std::ostream* os) const = 0;
+
+  // Describes the negation of this matcher to an ostream.  For
+  // example, if the description of this matcher is "is greater than
+  // 7", the negated description could be "is not greater than 7".
+  // You are not required to override this when implementing
+  // MatcherInterface, but it is highly advised so that your matcher
+  // can produce good error messages.
+  virtual void DescribeNegationTo(::std::ostream* os) const {
+    *os << "not (";
+    DescribeTo(os);
+    *os << ")";
+  }
+};
+
+// The implementation of a matcher.
+template <typename T>
+class MatcherInterface : public MatcherDescriberInterface {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener' if necessary (see the next paragraph), in
+  // the form of a non-restrictive relative clause ("which ...",
+  // "whose ...", etc) that describes x.  For example, the
+  // MatchAndExplain() method of the Pointee(...) matcher should
+  // generate an explanation like "which points to ...".
+  //
+  // Implementations of MatchAndExplain() should add an explanation of
+  // the match result *if and only if* they can provide additional
+  // information that's not already present (or not obvious) in the
+  // print-out of x and the matcher's description.  Whether the match
+  // succeeds is not a factor in deciding whether an explanation is
+  // needed, as sometimes the caller needs to print a failure message
+  // when the match succeeds (e.g. when the matcher is used inside
+  // Not()).
+  //
+  // For example, a "has at least 10 elements" matcher should explain
+  // what the actual element count is, regardless of the match result,
+  // as it is useful information to the reader; on the other hand, an
+  // "is empty" matcher probably only needs to explain what the actual
+  // size is when the match fails, as it's redundant to say that the
+  // size is 0 when the value is already known to be empty.
+  //
+  // You should override this method when defining a new matcher.
+  //
+  // It's the responsibility of the caller (Google Test) to guarantee
+  // that 'listener' is not NULL.  This helps to simplify a matcher's
+  // implementation when it doesn't care about the performance, as it
+  // can talk to 'listener' without checking its validity first.
+  // However, in order to implement dummy listeners efficiently,
+  // listener->stream() may be NULL.
+  virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0;
+
+  // Inherits these methods from MatcherDescriberInterface:
+  //   virtual void DescribeTo(::std::ostream* os) const = 0;
+  //   virtual void DescribeNegationTo(::std::ostream* os) const;
+};
+
+namespace internal {
+
+struct AnyEq {
+  template <typename A, typename B>
+  bool operator()(const A& a, const B& b) const {
+    return a == b;
+  }
+};
+struct AnyNe {
+  template <typename A, typename B>
+  bool operator()(const A& a, const B& b) const {
+    return a != b;
+  }
+};
+struct AnyLt {
+  template <typename A, typename B>
+  bool operator()(const A& a, const B& b) const {
+    return a < b;
+  }
+};
+struct AnyGt {
+  template <typename A, typename B>
+  bool operator()(const A& a, const B& b) const {
+    return a > b;
+  }
+};
+struct AnyLe {
+  template <typename A, typename B>
+  bool operator()(const A& a, const B& b) const {
+    return a <= b;
+  }
+};
+struct AnyGe {
+  template <typename A, typename B>
+  bool operator()(const A& a, const B& b) const {
+    return a >= b;
+  }
+};
+
+// A match result listener that ignores the explanation.
+class DummyMatchResultListener : public MatchResultListener {
+ public:
+  DummyMatchResultListener() : MatchResultListener(nullptr) {}
+
+ private:
+  DummyMatchResultListener(const DummyMatchResultListener&) = delete;
+  DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete;
+};
+
+// A match result listener that forwards the explanation to a given
+// ostream.  The difference between this and MatchResultListener is
+// that the former is concrete.
+class StreamMatchResultListener : public MatchResultListener {
+ public:
+  explicit StreamMatchResultListener(::std::ostream* os)
+      : MatchResultListener(os) {}
+
+ private:
+  StreamMatchResultListener(const StreamMatchResultListener&) = delete;
+  StreamMatchResultListener& operator=(const StreamMatchResultListener&) =
+      delete;
+};
+
+struct SharedPayloadBase {
+  std::atomic<int> ref{1};
+  void Ref() { ref.fetch_add(1, std::memory_order_relaxed); }
+  bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; }
+};
+
+template <typename T>
+struct SharedPayload : SharedPayloadBase {
+  explicit SharedPayload(const T& v) : value(v) {}
+  explicit SharedPayload(T&& v) : value(std::move(v)) {}
+
+  static void Destroy(SharedPayloadBase* shared) {
+    delete static_cast<SharedPayload*>(shared);
+  }
+
+  T value;
+};
+
+// An internal class for implementing Matcher<T>, which will derive
+// from it.  We put functionalities common to all Matcher<T>
+// specializations here to avoid code duplication.
+template <typename T>
+class MatcherBase : private MatcherDescriberInterface {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener'.
+  bool MatchAndExplain(const T& x, MatchResultListener* listener) const {
+    GTEST_CHECK_(vtable_ != nullptr);
+    return vtable_->match_and_explain(*this, x, listener);
+  }
+
+  // Returns true if and only if this matcher matches x.
+  bool Matches(const T& x) const {
+    DummyMatchResultListener dummy;
+    return MatchAndExplain(x, &dummy);
+  }
+
+  // Describes this matcher to an ostream.
+  void DescribeTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, false);
+  }
+
+  // Describes the negation of this matcher to an ostream.
+  void DescribeNegationTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, true);
+  }
+
+  // Explains why x matches, or doesn't match, the matcher.
+  void ExplainMatchResultTo(const T& x, ::std::ostream* os) const {
+    StreamMatchResultListener listener(os);
+    MatchAndExplain(x, &listener);
+  }
+
+  // Returns the describer for this matcher object; retains ownership
+  // of the describer, which is only guaranteed to be alive when
+  // this matcher object is alive.
+  const MatcherDescriberInterface* GetDescriber() const {
+    if (vtable_ == nullptr) return nullptr;
+    return vtable_->get_describer(*this);
+  }
+
+ protected:
+  MatcherBase() : vtable_(nullptr), buffer_() {}
+
+  // Constructs a matcher from its implementation.
+  template <typename U>
+  explicit MatcherBase(const MatcherInterface<U>* impl)
+      : vtable_(nullptr), buffer_() {
+    Init(impl);
+  }
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  MatcherBase(M&& m) : vtable_(nullptr), buffer_() {  // NOLINT
+    Init(std::forward<M>(m));
+  }
+
+  MatcherBase(const MatcherBase& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    if (IsShared()) buffer_.shared->Ref();
+  }
+
+  MatcherBase& operator=(const MatcherBase& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    if (IsShared()) buffer_.shared->Ref();
+    return *this;
+  }
+
+  MatcherBase(MatcherBase&& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    other.vtable_ = nullptr;
+  }
+
+  MatcherBase& operator=(MatcherBase&& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    other.vtable_ = nullptr;
+    return *this;
+  }
+
+  ~MatcherBase() override { Destroy(); }
+
+ private:
+  struct VTable {
+    bool (*match_and_explain)(const MatcherBase&, const T&,
+                              MatchResultListener*);
+    void (*describe)(const MatcherBase&, std::ostream*, bool negation);
+    // Returns the captured object if it implements the interface, otherwise
+    // returns the MatcherBase itself.
+    const MatcherDescriberInterface* (*get_describer)(const MatcherBase&);
+    // Called on shared instances when the reference count reaches 0.
+    void (*shared_destroy)(SharedPayloadBase*);
+  };
+
+  bool IsShared() const {
+    return vtable_ != nullptr && vtable_->shared_destroy != nullptr;
+  }
+
+  // If the implementation uses a listener, call that.
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) {
+    return P::Get(m).MatchAndExplain(value, listener->stream());
+  }
+
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener)) {
+    return P::Get(m).MatchAndExplain(value, listener);
+  }
+
+  template <typename P>
+  static void DescribeImpl(const MatcherBase& m, std::ostream* os,
+                           bool negation) {
+    if (negation) {
+      P::Get(m).DescribeNegationTo(os);
+    } else {
+      P::Get(m).DescribeTo(os);
+    }
+  }
+
+  template <typename P>
+  static const MatcherDescriberInterface* GetDescriberImpl(
+      const MatcherBase& m) {
+    // If the impl is a MatcherDescriberInterface, then return it.
+    // Otherwise use MatcherBase itself.
+    // This allows us to implement the GetDescriber() function without support
+    // from the impl, but some users really want to get their impl back when
+    // they call GetDescriber().
+    // We use std::get on a tuple as a workaround of not having `if constexpr`.
+    return std::get<(
+        std::is_convertible<decltype(&P::Get(m)),
+                            const MatcherDescriberInterface*>::value
+            ? 1
+            : 0)>(std::make_tuple(&m, &P::Get(m)));
+  }
+
+  template <typename P>
+  const VTable* GetVTable() {
+    static constexpr VTable kVTable = {&MatchAndExplainImpl<P>,
+                                       &DescribeImpl<P>, &GetDescriberImpl<P>,
+                                       P::shared_destroy};
+    return &kVTable;
+  }
+
+  union Buffer {
+    // Add some types to give Buffer some common alignment/size use cases.
+    void* ptr;
+    double d;
+    int64_t i;
+    // And add one for the out-of-line cases.
+    SharedPayloadBase* shared;
+  };
+
+  void Destroy() {
+    if (IsShared() && buffer_.shared->Unref()) {
+      vtable_->shared_destroy(buffer_.shared);
+    }
+  }
+
+  template <typename M>
+  static constexpr bool IsInlined() {
+    return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) &&
+           std::is_trivially_copy_constructible<M>::value &&
+           std::is_trivially_destructible<M>::value;
+  }
+
+  template <typename M, bool = MatcherBase::IsInlined<M>()>
+  struct ValuePolicy {
+    static const M& Get(const MatcherBase& m) {
+      // When inlined along with Init, need to be explicit to avoid violating
+      // strict aliasing rules.
+      const M* ptr =
+          static_cast<const M*>(static_cast<const void*>(&m.buffer_));
+      return *ptr;
+    }
+    static void Init(MatcherBase& m, M impl) {
+      ::new (static_cast<void*>(&m.buffer_)) M(impl);
+    }
+    static constexpr auto shared_destroy = nullptr;
+  };
+
+  template <typename M>
+  struct ValuePolicy<M, false> {
+    using Shared = SharedPayload<M>;
+    static const M& Get(const MatcherBase& m) {
+      return static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    template <typename Arg>
+    static void Init(MatcherBase& m, Arg&& arg) {
+      m.buffer_.shared = new Shared(std::forward<Arg>(arg));
+    }
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename U, bool B>
+  struct ValuePolicy<const MatcherInterface<U>*, B> {
+    using M = const MatcherInterface<U>;
+    using Shared = SharedPayload<std::unique_ptr<M>>;
+    static const M& Get(const MatcherBase& m) {
+      return *static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    static void Init(MatcherBase& m, M* impl) {
+      m.buffer_.shared = new Shared(std::unique_ptr<M>(impl));
+    }
+
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename M>
+  void Init(M&& m) {
+    using MM = typename std::decay<M>::type;
+    using Policy = ValuePolicy<MM>;
+    vtable_ = GetVTable<Policy>();
+    Policy::Init(*this, std::forward<M>(m));
+  }
+
+  const VTable* vtable_;
+  Buffer buffer_;
+};
+
+}  // namespace internal
+
+// A Matcher<T> is a copyable and IMMUTABLE (except by assignment)
+// object that can check whether a value of type T matches.  The
+// implementation of Matcher<T> is just a std::shared_ptr to const
+// MatcherInterface<T>.  Don't inherit from Matcher!
+template <typename T>
+class Matcher : public internal::MatcherBase<T> {
+ public:
+  // Constructs a null matcher.  Needed for storing Matcher objects in STL
+  // containers.  A default-constructed matcher is not yet initialized.  You
+  // cannot use it until a valid value has been assigned to it.
+  explicit Matcher() {}  // NOLINT
+
+  // Constructs a matcher from its implementation.
+  explicit Matcher(const MatcherInterface<const T&>* impl)
+      : internal::MatcherBase<T>(impl) {}
+
+  template <typename U>
+  explicit Matcher(
+      const MatcherInterface<U>* impl,
+      typename std::enable_if<!std::is_same<U, const U&>::value>::type* =
+          nullptr)
+      : internal::MatcherBase<T>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m) : internal::MatcherBase<T>(std::forward<M>(m)) {}  // NOLINT
+
+  // Implicit constructor here allows people to write
+  // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
+  Matcher(T value);  // NOLINT
+};
+
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const std::string&>
+    : public internal::MatcherBase<const std::string&> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string&>* impl)
+      : internal::MatcherBase<const std::string&>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const std::string&>(std::forward<M>(m)) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string& s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char* s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<std::string>
+    : public internal::MatcherBase<std::string> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string&>* impl)
+      : internal::MatcherBase<std::string>(impl) {}
+  explicit Matcher(const MatcherInterface<std::string>* impl)
+      : internal::MatcherBase<std::string>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<std::string>(std::forward<M>(m)) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a string object.
+  Matcher(const std::string& s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char* s);  // NOLINT
+};
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const internal::StringView&>
+    : public internal::MatcherBase<const internal::StringView&> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+      : internal::MatcherBase<const internal::StringView&>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const internal::StringView&>(std::forward<M>(m)) {
+  }
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string& s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char* s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<internal::StringView>
+    : public internal::MatcherBase<internal::StringView> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+  explicit Matcher(const MatcherInterface<internal::StringView>* impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<internal::StringView>(std::forward<M>(m)) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string& s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char* s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+// Prints a matcher in a human-readable format.
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const Matcher<T>& matcher) {
+  matcher.DescribeTo(&os);
+  return os;
+}
+
+// The PolymorphicMatcher class template makes it easy to implement a
+// polymorphic matcher (i.e. a matcher that can match values of more
+// than one type, e.g. Eq(n) and NotNull()).
+//
+// To define a polymorphic matcher, a user should provide an Impl
+// class that has a DescribeTo() method and a DescribeNegationTo()
+// method, and define a member function (or member function template)
+//
+//   bool MatchAndExplain(const Value& value,
+//                        MatchResultListener* listener) const;
+//
+// See the definition of NotNull() for a complete example.
+template <class Impl>
+class PolymorphicMatcher {
+ public:
+  explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {}
+
+  // Returns a mutable reference to the underlying matcher
+  // implementation object.
+  Impl& mutable_impl() { return impl_; }
+
+  // Returns an immutable reference to the underlying matcher
+  // implementation object.
+  const Impl& impl() const { return impl_; }
+
+  template <typename T>
+  operator Matcher<T>() const {
+    return Matcher<T>(new MonomorphicImpl<const T&>(impl_));
+  }
+
+ private:
+  template <typename T>
+  class MonomorphicImpl : public MatcherInterface<T> {
+   public:
+    explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {}
+
+    void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); }
+
+    void DescribeNegationTo(::std::ostream* os) const override {
+      impl_.DescribeNegationTo(os);
+    }
+
+    bool MatchAndExplain(T x, MatchResultListener* listener) const override {
+      return impl_.MatchAndExplain(x, listener);
+    }
+
+   private:
+    const Impl impl_;
+  };
+
+  Impl impl_;
+};
+
+// Creates a matcher from its implementation.
+// DEPRECATED: Especially in the generic code, prefer:
+//   Matcher<T>(new MyMatcherImpl<const T&>(...));
+//
+// MakeMatcher may create a Matcher that accepts its argument by value, which
+// leads to unnecessary copies & lack of support for non-copyable types.
+template <typename T>
+inline Matcher<T> MakeMatcher(const MatcherInterface<T>* impl) {
+  return Matcher<T>(impl);
+}
+
+// Creates a polymorphic matcher from its implementation.  This is
+// easier to use than the PolymorphicMatcher<Impl> constructor as it
+// doesn't require you to explicitly write the template argument, e.g.
+//
+//   MakePolymorphicMatcher(foo);
+// vs
+//   PolymorphicMatcher<TypeOfFoo>(foo);
+template <class Impl>
+inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl& impl) {
+  return PolymorphicMatcher<Impl>(impl);
+}
+
+namespace internal {
+// Implements a matcher that compares a given value with a
+// pre-supplied value using one of the ==, <=, <, etc, operators.  The
+// two values being compared don't have to have the same type.
+//
+// The matcher defined here is polymorphic (for example, Eq(5) can be
+// used to match an int, a short, a double, etc).  Therefore we use
+// a template type conversion operator in the implementation.
+//
+// The following template definition assumes that the Rhs parameter is
+// a "bare" type (i.e. neither 'const T' nor 'T&').
+template <typename D, typename Rhs, typename Op>
+class ComparisonBase {
+ public:
+  explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {}
+
+  using is_gtest_matcher = void;
+
+  template <typename Lhs>
+  bool MatchAndExplain(const Lhs& lhs, std::ostream*) const {
+    return Op()(lhs, Unwrap(rhs_));
+  }
+  void DescribeTo(std::ostream* os) const {
+    *os << D::Desc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
+  }
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << D::NegatedDesc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
+  }
+
+ private:
+  template <typename T>
+  static const T& Unwrap(const T& v) {
+    return v;
+  }
+  template <typename T>
+  static const T& Unwrap(std::reference_wrapper<T> v) {
+    return v;
+  }
+
+  Rhs rhs_;
+};
+
+template <typename Rhs>
+class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
+ public:
+  explicit EqMatcher(const Rhs& rhs)
+      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
+  static const char* Desc() { return "is equal to"; }
+  static const char* NegatedDesc() { return "isn't equal to"; }
+};
+template <typename Rhs>
+class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
+ public:
+  explicit NeMatcher(const Rhs& rhs)
+      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
+  static const char* Desc() { return "isn't equal to"; }
+  static const char* NegatedDesc() { return "is equal to"; }
+};
+template <typename Rhs>
+class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
+ public:
+  explicit LtMatcher(const Rhs& rhs)
+      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
+  static const char* Desc() { return "is <"; }
+  static const char* NegatedDesc() { return "isn't <"; }
+};
+template <typename Rhs>
+class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
+ public:
+  explicit GtMatcher(const Rhs& rhs)
+      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
+  static const char* Desc() { return "is >"; }
+  static const char* NegatedDesc() { return "isn't >"; }
+};
+template <typename Rhs>
+class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
+ public:
+  explicit LeMatcher(const Rhs& rhs)
+      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
+  static const char* Desc() { return "is <="; }
+  static const char* NegatedDesc() { return "isn't <="; }
+};
+template <typename Rhs>
+class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
+ public:
+  explicit GeMatcher(const Rhs& rhs)
+      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
+  static const char* Desc() { return "is >="; }
+  static const char* NegatedDesc() { return "isn't >="; }
+};
+
+template <typename T, typename = typename std::enable_if<
+                          std::is_constructible<std::string, T>::value>::type>
+using StringLike = T;
+
+// Implements polymorphic matchers MatchesRegex(regex) and
+// ContainsRegex(regex), which can be used as a Matcher<T> as long as
+// T can be converted to a string.
+class MatchesRegexMatcher {
+ public:
+  MatchesRegexMatcher(const RE* regex, bool full_match)
+      : regex_(regex), full_match_(full_match) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView& s,
+                       MatchResultListener* listener) const {
+    return MatchAndExplain(std::string(s), listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
+    return s != nullptr && MatchAndExplain(std::string(s), listener);
+  }
+
+  // Matches anything that can convert to std::string.
+  //
+  // This is a template, not just a plain function with const std::string&,
+  // because absl::string_view has some interfering non-explicit constructors.
+  template <class MatcheeStringType>
+  bool MatchAndExplain(const MatcheeStringType& s,
+                       MatchResultListener* /* listener */) const {
+    const std::string& s2(s);
+    return full_match_ ? RE::FullMatch(s2, *regex_)
+                       : RE::PartialMatch(s2, *regex_);
+  }
+
+  void DescribeTo(::std::ostream* os) const {
+    *os << (full_match_ ? "matches" : "contains") << " regular expression ";
+    UniversalPrinter<std::string>::Print(regex_->pattern(), os);
+  }
+
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "doesn't " << (full_match_ ? "match" : "contain")
+        << " regular expression ";
+    UniversalPrinter<std::string>::Print(regex_->pattern(), os);
+  }
+
+ private:
+  const std::shared_ptr<const RE> regex_;
+  const bool full_match_;
+};
+}  // namespace internal
+
+// Matches a string that fully matches regular expression 'regex'.
+// The matcher takes ownership of 'regex'.
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const internal::RE* regex) {
+  return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
+}
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const internal::StringLike<T>& regex) {
+  return MatchesRegex(new internal::RE(std::string(regex)));
+}
+
+// Matches a string that contains regular expression 'regex'.
+// The matcher takes ownership of 'regex'.
+inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const internal::RE* regex) {
+  return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
+}
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const internal::StringLike<T>& regex) {
+  return ContainsRegex(new internal::RE(std::string(regex)));
+}
+
+// Creates a polymorphic matcher that matches anything equal to x.
+// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
+// wouldn't compile.
+template <typename T>
+inline internal::EqMatcher<T> Eq(T x) {
+  return internal::EqMatcher<T>(x);
+}
+
+// Constructs a Matcher<T> from a 'value' of type T.  The constructed
+// matcher matches any value that's equal to 'value'.
+template <typename T>
+Matcher<T>::Matcher(T value) {
+  *this = Eq(value);
+}
+
+// Creates a monomorphic matcher that matches anything with type Lhs
+// and equal to rhs.  A user may need to use this instead of Eq(...)
+// in order to resolve an overloading ambiguity.
+//
+// TypedEq<T>(x) is just a convenient short-hand for Matcher<T>(Eq(x))
+// or Matcher<T>(x), but more readable than the latter.
+//
+// We could define similar monomorphic matchers for other comparison
+// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do
+// it yet as those are used much less than Eq() in practice.  A user
+// can always write Matcher<T>(Lt(5)) to be explicit about the type,
+// for example.
+template <typename Lhs, typename Rhs>
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) {
+  return Eq(rhs);
+}
+
+// Creates a polymorphic matcher that matches anything >= x.
+template <typename Rhs>
+inline internal::GeMatcher<Rhs> Ge(Rhs x) {
+  return internal::GeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything > x.
+template <typename Rhs>
+inline internal::GtMatcher<Rhs> Gt(Rhs x) {
+  return internal::GtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything <= x.
+template <typename Rhs>
+inline internal::LeMatcher<Rhs> Le(Rhs x) {
+  return internal::LeMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything < x.
+template <typename Rhs>
+inline internal::LtMatcher<Rhs> Lt(Rhs x) {
+  return internal::LtMatcher<Rhs>(x);
+}
+
+// Creates a polymorphic matcher that matches anything != x.
+template <typename Rhs>
+inline internal::NeMatcher<Rhs> Ne(Rhs x) {
+  return internal::NeMatcher<Rhs>(x);
+}
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
new file mode 100644
index 0000000000..6c8bf90009
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
@@ -0,0 +1,218 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+#include <memory>
+#include <sstream>
+
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+//   1. You stream a bunch of values to a Message object.
+//      It will remember the text in a stringstream.
+//   2. Then you stream the Message object to an ostream.
+//      This causes the text in the Message to be streamed
+//      to the ostream.
+//
+// For example;
+//
+//   testing::Message foo;
+//   foo << 1 << " != " << 2;
+//   std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from.  In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC.  You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do).  The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+  // The type of basic IO manipulators (endl, ends, and flush) for
+  // narrow streams.
+  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+  // Constructs an empty Message.
+  Message();
+
+  // Copy constructor.
+  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
+    *ss_ << msg.GetString();
+  }
+
+  // Constructs a Message from a C-string.
+  explicit Message(const char* str) : ss_(new ::std::stringstream) {
+    *ss_ << str;
+  }
+
+  // Streams a non-pointer value to this object.
+  template <typename T>
+  inline Message& operator<<(const T& val) {
+        // Some libraries overload << for STL containers.  These
+    // overloads are defined in the global namespace instead of ::std.
+    //
+    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+    // overloads are visible in either the std namespace or the global
+    // namespace, but not other namespaces, including the testing
+    // namespace which Google Test's Message class is in.
+    //
+    // To allow STL containers (and other types that has a << operator
+    // defined in the global namespace) to be used in Google Test
+    // assertions, testing::Message must access the custom << operator
+    // from the global namespace.  With this using declaration,
+    // overloads of << defined in the global namespace and those
+    // visible via Koenig lookup are both exposed in this function.
+    using ::operator<<;
+    *ss_ << val;
+    return *this;
+  }
+
+  // Streams a pointer value to this object.
+  //
+  // This function is an overload of the previous one.  When you
+  // stream a pointer to a Message, this definition will be used as it
+  // is more specialized.  (The C++ Standard, section
+  // [temp.func.order].)  If you stream a non-pointer, then the
+  // previous definition will be used.
+  //
+  // The reason for this overload is that streaming a NULL pointer to
+  // ostream is undefined behavior.  Depending on the compiler, you
+  // may get "0", "(nil)", "(null)", or an access violation.  To
+  // ensure consistent result across compilers, we always treat NULL
+  // as "(null)".
+  template <typename T>
+  inline Message& operator<<(T* const& pointer) {  // NOLINT
+    if (pointer == nullptr) {
+      *ss_ << "(null)";
+    } else {
+      *ss_ << pointer;
+    }
+    return *this;
+  }
+
+  // Since the basic IO manipulators are overloaded for both narrow
+  // and wide streams, we have to provide this specialized definition
+  // of operator <<, even though its body is the same as the
+  // templatized version above.  Without this definition, streaming
+  // endl or other basic IO manipulators to Message will confuse the
+  // compiler.
+  Message& operator<<(BasicNarrowIoManip val) {
+    *ss_ << val;
+    return *this;
+  }
+
+  // Instead of 1/0, we want to see true/false for bool values.
+  Message& operator<<(bool b) { return *this << (b ? "true" : "false"); }
+
+  // These two overloads allow streaming a wide C string to a Message
+  // using the UTF-8 encoding.
+  Message& operator<<(const wchar_t* wide_c_str);
+  Message& operator<<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+  // Converts the given wide string to a narrow string using the UTF-8
+  // encoding, and streams the result to this Message object.
+  Message& operator<<(const ::std::wstring& wstr);
+#endif  // GTEST_HAS_STD_WSTRING
+
+  // Gets the text streamed to this object so far as an std::string.
+  // Each '\0' character in the buffer is replaced with "\\0".
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  std::string GetString() const;
+
+ private:
+  // We'll hold the text streamed to this object here.
+  const std::unique_ptr< ::std::stringstream> ss_;
+
+  // We declare (but don't implement) this to prevent the compiler
+  // from implementing the assignment operator.
+  void operator=(const Message&);
+};
+
+// Streams a Message to an ostream.
+inline std::ostream& operator<<(std::ostream& os, const Message& sb) {
+  return os << sb.GetString();
+}
+
+namespace internal {
+
+// Converts a streamable value to an std::string.  A NULL pointer is
+// converted to "(null)".  When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+  return (Message() << streamable).GetString();
+}
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
new file mode 100644
index 0000000000..b55119ac62
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
@@ -0,0 +1,510 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing and Mocking Framework (Google Test)
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+  // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+  // Inside a test, access the test parameter with the GetParam() method
+  // of the TestWithParam<T> class:
+  EXPECT_TRUE(foo.Blah(GetParam()));
+  ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+  ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test suite
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_SUITE_P(InstantiationName,
+                         FooTest,
+                         Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern, (yes, you
+// can instantiate it more than once) the first argument to the
+// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the
+// actual test suite name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
+//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
+//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests
+// in the given test suite, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_SUITE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and on the other hand,
+// give the user a chance to inspect the generated tests with Google Test
+// reflection API before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+  // You can inherit all the usual members for a non-parameterized test
+  // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+  // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+  // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+  // GetParam works just the same here as if you inherit from TestWithParam.
+  EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif  // 0
+
+#include <iterator>
+#include <utility>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-param-util.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test suite is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test suite FooTest are instantiated
+// each three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+//   - returns a generator producing a sequence of values {start, start+1,
+//     start+2, ..., }.
+// Range(start, end, step)
+//   - returns a generator producing a sequence of values {start, start+step,
+//     start+step+step, ..., }.
+// Notes:
+//   * The generated sequences never include end. For example, Range(1, 5)
+//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+//     returns a generator producing {1, 3, 5, 7}.
+//   * start and end must have the same type. That type may be any integral or
+//     floating-point type or a user defined type satisfying these conditions:
+//     * It must be assignable (have operator=() defined).
+//     * It must have operator+() (operator+(int-compatible type) for
+//       two-operand version).
+//     * It must have operator<() defined.
+//     Elements in the resulting sequences will also have that type.
+//   * Condition start < end must be satisfied in order for resulting sequences
+//     to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+  return internal::ParamGenerator<T>(
+      new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+  return Range(start, end, 1);
+}
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+//   - returns a generator producing sequences with elements from
+//     a C-style array.
+// ValuesIn(const Container& container)
+//   - returns a generator producing sequences with elements from
+//     an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+//   - returns a generator producing sequences with elements from
+//     a range [begin, end) defined by a pair of STL-style iterators. These
+//     iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test suite StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test suite StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+//   ::std::vector< ::std::string> v;
+//   v.push_back("a");
+//   v.push_back("b");
+//   return v;
+// }
+//
+// INSTANTIATE_TEST_SUITE_P(CharSequence,
+//                          StlStringTest,
+//                          ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+//   ::std::list<char> list;
+//   list.push_back('a');
+//   list.push_back('b');
+//   return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_SUITE_P(CharSequence2,
+//                          CharTest,
+//                          ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+    typename std::iterator_traits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+  typedef typename std::iterator_traits<ForwardIterator>::value_type ParamType;
+  return internal::ParamGenerator<ParamType>(
+      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+  return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container) {
+  return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+//   - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test suite BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_SUITE_P(NumSequence,
+//                          BarTest,
+//                          Values("one", "two", "three"));
+//
+// This instantiates tests from test suite BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+//
+template <typename... T>
+internal::ValueArray<T...> Values(T... v) {
+  return internal::ValueArray<T...>(std::move(v)...);
+}
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+//   - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// Combine() function.
+//
+// In the following example all tests in the test suite FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+//   virtual void SetUp() {
+//     external_flag = GetParam();
+//   }
+// }
+// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
+
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+//   - returns a generator producing sequences with elements coming from
+//     the Cartesian product of elements from the sequences generated by
+//     gen1, gen2, ..., genN. The sequence elements will have a type of
+//     std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//     of elements from sequences produces by gen1, gen2, ..., genN.
+//
+// Example:
+//
+// This will instantiate tests in test suite AnimalTest each one with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+//     : public testing::TestWithParam<std::tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest,
+//                          Combine(Values("cat", "dog"),
+//                                  Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+//     : public testing::TestWithParam<std::tuple<bool, bool> > {
+//   virtual void SetUp() {
+//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
+//     std::tie(external_flag_1, external_flag_2) = GetParam();
+//   }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+//   // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest,
+//                          Combine(Bool(), Bool()));
+//
+template <typename... Generator>
+internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
+  return internal::CartesianProductHolder<Generator...>(g...);
+}
+
+#define TEST_P(test_suite_name, test_name)                                     \
+  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                     \
+      : public test_suite_name {                                               \
+   public:                                                                     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                    \
+    void TestBody() override;                                                  \
+                                                                               \
+   private:                                                                    \
+    static int AddToRegistry() {                                               \
+      ::testing::UnitTest::GetInstance()                                       \
+          ->parameterized_test_registry()                                      \
+          .GetTestSuitePatternHolder<test_suite_name>(                         \
+              GTEST_STRINGIFY_(test_suite_name),                               \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
+          ->AddTestPattern(                                                    \
+              GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name),  \
+              new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
+                  test_suite_name, test_name)>(),                              \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__));          \
+      return 0;                                                                \
+    }                                                                          \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                         \
+    (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete;     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=(            \
+        const GTEST_TEST_CLASS_NAME_(test_suite_name,                          \
+                                     test_name) &) = delete; /* NOLINT */      \
+  };                                                                           \
+  int GTEST_TEST_CLASS_NAME_(test_suite_name,                                  \
+                             test_name)::gtest_registering_dummy_ =            \
+      GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry();     \
+  void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
+
+// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify
+// generator and an optional function or functor that generates custom test name
+// suffixes based on the test parameters. Such a function or functor should
+// accept one argument of type testing::TestParamInfo<class ParamType>, and
+// return std::string.
+//
+// testing::PrintToStringParamName is a builtin test suffix generator that
+// returns the value of testing::PrintToString(GetParam()).
+//
+// Note: test names must be non-empty, unique, and may only contain ASCII
+// alphanumeric characters or underscore. Because PrintToString adds quotes
+// to std::string and C strings, it won't work for these types.
+
+#define GTEST_EXPAND_(arg) arg
+#define GTEST_GET_FIRST_(first, ...) first
+#define GTEST_GET_SECOND_(first, second, ...) second
+
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)               \
+  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>     \
+      gtest_##prefix##test_suite_name##_EvalGenerator_() {                   \
+    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));       \
+  }                                                                          \
+  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(  \
+      const ::testing::TestParamInfo<test_suite_name::ParamType>& info) {    \
+    if (::testing::internal::AlwaysFalse()) {                                \
+      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(     \
+          __VA_ARGS__,                                                       \
+          ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+          DUMMY_PARAM_)));                                                   \
+      auto t = std::make_tuple(__VA_ARGS__);                                 \
+      static_assert(std::tuple_size<decltype(t)>::value <= 2,                \
+                    "Too Many Args!");                                       \
+    }                                                                        \
+    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                \
+        __VA_ARGS__,                                                         \
+        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,   \
+        DUMMY_PARAM_))))(info);                                              \
+  }                                                                          \
+  static int gtest_##prefix##test_suite_name##_dummy_                        \
+      GTEST_ATTRIBUTE_UNUSED_ =                                              \
+          ::testing::UnitTest::GetInstance()                                 \
+              ->parameterized_test_registry()                                \
+              .GetTestSuitePatternHolder<test_suite_name>(                   \
+                  GTEST_STRINGIFY_(test_suite_name),                         \
+                  ::testing::internal::CodeLocation(__FILE__, __LINE__))     \
+              ->AddTestSuiteInstantiation(                                   \
+                  GTEST_STRINGIFY_(prefix),                                  \
+                  &gtest_##prefix##test_suite_name##_EvalGenerator_,         \
+                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,      \
+                  __FILE__, __LINE__)
+
+// Allow Marking a Parameterized test class as not needing to be instantiated.
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T)                  \
+  namespace gtest_do_not_use_outside_namespace_scope {}                   \
+  static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
+      GTEST_STRINGIFY_(T))
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TEST_CASE_P                                            \
+  static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \
+                "");                                                       \
+  INSTANTIATE_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
new file mode 100644
index 0000000000..a91e8b8b10
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
@@ -0,0 +1,1048 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T.  More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+//   1. foo::PrintTo(const T&, ostream*)
+//   2. operator<<(ostream&, const T&) defined in either foo or the
+//      global namespace.
+//
+// However if T is an STL-style container then it is printed element-wise
+// unless foo::PrintTo(const T&, ostream*) is defined. Note that
+// operator<<() is ignored for container types.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+//   // Prints a value to a string.  For a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   std::string ::testing::PrintToString(const T& value);
+//
+//   // Prints a value tersely: for a reference type, the referenced
+//   // value (but not the address) is printed; for a (const or not) char
+//   // pointer, the NUL-terminated string (but not the pointer) is
+//   // printed.
+//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+//   // Prints value using the type inferred by the compiler.  The difference
+//   // from UniversalTersePrint() is that this function prints both the
+//   // pointer and the NUL-terminated string for a (const or not) char pointer.
+//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+//   // Prints the fields of a tuple tersely to a string vector, one
+//   // element for each field. Tuple support must be enabled in
+//   // gtest-port.h.
+//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+//       const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container.  When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect.  In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator.  We'll fix this if there's an
+// actual need for it.  Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <functional>
+#include <memory>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// Definitions in the internal* namespaces are subject to change without notice.
+// DO NOT USE THEM IN USER CODE!
+namespace internal {
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+struct ContainerPrinter {
+  template <typename T,
+            typename = typename std::enable_if<
+                (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+                !IsRecursiveContainer<T>::value>::type>
+  static void PrintValue(const T& container, std::ostream* os) {
+    const size_t kMaxCount = 32;  // The maximum number of elements to print.
+    *os << '{';
+    size_t count = 0;
+    for (auto&& elem : container) {
+      if (count > 0) {
+        *os << ',';
+        if (count == kMaxCount) {  // Enough has been printed.
+          *os << " ...";
+          break;
+        }
+      }
+      *os << ' ';
+      // We cannot call PrintTo(elem, os) here as PrintTo() doesn't
+      // handle `elem` being a native array.
+      internal::UniversalPrint(elem, os);
+      ++count;
+    }
+
+    if (count > 0) {
+      *os << ' ';
+    }
+    *os << '}';
+  }
+};
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+struct FunctionPointerPrinter {
+  template <typename T, typename = typename std::enable_if<
+                            std::is_function<T>::value>::type>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is a function type, so '*os << p' doesn't do what we want
+      // (it just prints p as bool).  We want to print p as a const
+      // void*.
+      *os << reinterpret_cast<const void*>(p);
+    }
+  }
+};
+
+struct PointerPrinter {
+  template <typename T>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is not a function type.  We just call << to print p,
+      // relying on ADL to pick up user-defined << for their pointer
+      // types, if any.
+      *os << p;
+    }
+  }
+};
+
+namespace internal_stream_operator_without_lexical_name_lookup {
+
+// The presence of an operator<< here will terminate lexical scope lookup
+// straight away (even though it cannot be a match because of its argument
+// types). Thus, the two operator<< calls in StreamPrinter will find only ADL
+// candidates.
+struct LookupBlocker {};
+void operator<<(LookupBlocker, LookupBlocker);
+
+struct StreamPrinter {
+  template <typename T,
+            // Don't accept member pointers here. We'd print them via implicit
+            // conversion to bool, which isn't useful.
+            typename = typename std::enable_if<
+                !std::is_member_pointer<T>::value>::type,
+            // Only accept types for which we can find a streaming operator via
+            // ADL (possibly involving implicit conversions).
+            typename = decltype(std::declval<std::ostream&>()
+                                << std::declval<const T&>())>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    // Call streaming operator found by ADL, possibly with implicit conversions
+    // of the arguments.
+    *os << value;
+  }
+};
+
+}  // namespace internal_stream_operator_without_lexical_name_lookup
+
+struct ProtobufPrinter {
+  // We print a protobuf using its ShortDebugString() when the string
+  // doesn't exceed this many characters; otherwise we print it using
+  // DebugString() for better readability.
+  static const size_t kProtobufOneLinerMaxLength = 50;
+
+  template <typename T,
+            typename = typename std::enable_if<
+                internal::HasDebugStringAndShortDebugString<T>::value>::type>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    std::string pretty_str = value.ShortDebugString();
+    if (pretty_str.length() > kProtobufOneLinerMaxLength) {
+      pretty_str = "\n" + value.DebugString();
+    }
+    *os << ("<" + pretty_str + ">");
+  }
+};
+
+struct ConvertibleToIntegerPrinter {
+  // Since T has no << operator or PrintTo() but can be implicitly
+  // converted to BiggestInt, we print it as a BiggestInt.
+  //
+  // Most likely T is an enum type (either named or unnamed), in which
+  // case printing it as an integer is the desired behavior.  In case
+  // T is not an enum, printing it as an integer is the best we can do
+  // given that it has no user-defined printer.
+  static void PrintValue(internal::BiggestInt value, ::std::ostream* os) {
+    *os << value;
+  }
+};
+
+struct ConvertibleToStringViewPrinter {
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  static void PrintValue(internal::StringView value, ::std::ostream* os) {
+    internal::UniversalPrint(value, os);
+  }
+#endif
+};
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count, ::std::ostream* os);
+struct RawBytesPrinter {
+  // SFINAE on `sizeof` to make sure we have a complete type.
+  template <typename T, size_t = sizeof(T)>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(
+        static_cast<const unsigned char*>(
+            // Load bearing cast to void* to support iOS
+            reinterpret_cast<const void*>(std::addressof(value))),
+        sizeof(value), os);
+  }
+};
+
+struct FallbackPrinter {
+  template <typename T>
+  static void PrintValue(const T&, ::std::ostream* os) {
+    *os << "(incomplete type)";
+  }
+};
+
+// Try every printer in order and return the first one that works.
+template <typename T, typename E, typename Printer, typename... Printers>
+struct FindFirstPrinter : FindFirstPrinter<T, E, Printers...> {};
+
+template <typename T, typename Printer, typename... Printers>
+struct FindFirstPrinter<
+    T, decltype(Printer::PrintValue(std::declval<const T&>(), nullptr)),
+    Printer, Printers...> {
+  using type = Printer;
+};
+
+// Select the best printer in the following order:
+//  - Print containers (they have begin/end/etc).
+//  - Print function pointers.
+//  - Print object pointers.
+//  - Use the stream operator, if available.
+//  - Print protocol buffers.
+//  - Print types convertible to BiggestInt.
+//  - Print types convertible to StringView, if available.
+//  - Fallback to printing the raw bytes of the object.
+template <typename T>
+void PrintWithFallback(const T& value, ::std::ostream* os) {
+  using Printer = typename FindFirstPrinter<
+      T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter,
+      internal_stream_operator_without_lexical_name_lookup::StreamPrinter,
+      ProtobufPrinter, ConvertibleToIntegerPrinter,
+      ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type;
+  Printer::PrintValue(value, os);
+}
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value.  In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object.  If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case.
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+  static ::std::string Format(const ToPrint& value) {
+    return ::testing::PrintToString(value);
+  }
+};
+
+// Array.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+  static ::std::string Format(const ToPrint* value) {
+    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+  }
+};
+
+// By default, print C string as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
+  template <typename OtherOperand>                                      \
+  class FormatForComparison<CharType*, OtherOperand> {                  \
+   public:                                                              \
+    static ::std::string Format(CharType* value) {                      \
+      return ::testing::PrintToString(static_cast<const void*>(value)); \
+    }                                                                   \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+#ifdef __cpp_lib_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+  template <>                                                            \
+  class FormatForComparison<CharType*, OtherStringType> {                \
+   public:                                                               \
+    static ::std::string Format(CharType* value) {                       \
+      return ::testing::PrintToString(value);                            \
+    }                                                                    \
+  }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc)
+// operand to be used in a failure message.  The type (but not value)
+// of the other operand may affect the format.  This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(const T1& value,
+                                              const T2& /* other_operand */) {
+  return FormatForComparison<T1, T2>::Format(value);
+}
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream.  The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it.  This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined.  We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+  internal::PrintWithFallback(value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+  // When printing a plain char, we always treat it as unsigned.  This
+  // way, the output won't be affected by whether the compiler thinks
+  // char is signed or not.
+  PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+inline void PrintTo(bool x, ::std::ostream* os) {
+  *os << (x ? "true" : "false");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os);
+inline void PrintTo(char16_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#ifdef __cpp_char8_t
+inline void PrintTo(char8_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#endif
+
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os);
+GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os);
+#endif  // __SIZEOF_INT128__
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const void*>(s), os);
+}
+#ifdef __cpp_char8_t
+// Overloads for u8 strings.
+GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os);
+inline void PrintTo(char8_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char8_t*>(s), os);
+}
+#endif
+// Overloads for u16 strings.
+GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os);
+inline void PrintTo(char16_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char16_t*>(s), os);
+}
+// Overloads for u32 strings.
+GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os);
+inline void PrintTo(char32_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char32_t*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type.  When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays.  Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+  UniversalPrint(a[0], os);
+  for (size_t i = 1; i != count; i++) {
+    *os << ", ";
+    UniversalPrint(a[i], os);
+  }
+}
+
+// Overloads for ::std::string.
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+  PrintStringTo(s, os);
+}
+
+// Overloads for ::std::u8string
+#ifdef __cpp_char8_t
+GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) {
+  PrintU8StringTo(s, os);
+}
+#endif
+
+// Overloads for ::std::u16string
+GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) {
+  PrintU16StringTo(s, os);
+}
+
+// Overloads for ::std::u32string
+GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
+  PrintU32StringTo(s, os);
+}
+
+// Overloads for ::std::wstring.
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+  PrintWideStringTo(s, os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Overload for internal::StringView.
+inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
+  PrintTo(::std::string(sp), os);
+}
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+
+#if GTEST_HAS_RTTI
+inline void PrintTo(const std::type_info& info, std::ostream* os) {
+  *os << internal::GetTypeName(info);
+}
+#endif  // GTEST_HAS_RTTI
+
+template <typename T>
+void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
+  UniversalPrinter<T&>::Print(ref.get(), os);
+}
+
+inline const void* VoidifyPointer(const void* p) { return p; }
+inline const void* VoidifyPointer(volatile const void* p) {
+  return const_cast<const void*>(p);
+}
+
+template <typename T, typename Ptr>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    // We can't print the value. Just print the pointer..
+    *os << "(" << (VoidifyPointer)(ptr.get()) << ")";
+  }
+}
+template <typename T, typename Ptr,
+          typename = typename std::enable_if<!std::is_void<T>::value &&
+                                             !std::is_array<T>::value>::type>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = ";
+    UniversalPrinter<T>::Print(*ptr, os);
+    *os << ")";
+  }
+}
+
+template <typename T, typename D>
+void PrintTo(const std::unique_ptr<T, D>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+template <typename T>
+void PrintTo(const std::shared_ptr<T>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T&, std::integral_constant<size_t, 0>,
+                  ::std::ostream*) {}
+
+template <typename T, size_t I>
+void PrintTupleTo(const T& t, std::integral_constant<size_t, I>,
+                  ::std::ostream* os) {
+  PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (I > 1) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    *os << ", ";
+  }
+  UniversalPrinter<typename std::tuple_element<I - 1, T>::type>::Print(
+      std::get<I - 1>(t), os);
+}
+
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
+  *os << "(";
+  PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
+  *os << ")";
+}
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T& value, ::std::ostream* os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Remove any const-qualifiers before passing a type to UniversalPrinter.
+template <typename T>
+class UniversalPrinter<const T> : public UniversalPrinter<T> {};
+
+#if GTEST_INTERNAL_HAS_ANY
+
+// Printer for std::any / absl::any
+
+template <>
+class UniversalPrinter<Any> {
+ public:
+  static void Print(const Any& value, ::std::ostream* os) {
+    if (value.has_value()) {
+      *os << "value of type " << GetTypeName(value);
+    } else {
+      *os << "no value";
+    }
+  }
+
+ private:
+  static std::string GetTypeName(const Any& value) {
+#if GTEST_HAS_RTTI
+    return internal::GetTypeName(value.type());
+#else
+    static_cast<void>(value);  // possibly unused
+    return "<unknown_type>";
+#endif  // GTEST_HAS_RTTI
+  }
+};
+
+#endif  // GTEST_INTERNAL_HAS_ANY
+
+#if GTEST_INTERNAL_HAS_OPTIONAL
+
+// Printer for std::optional / absl::optional
+
+template <typename T>
+class UniversalPrinter<Optional<T>> {
+ public:
+  static void Print(const Optional<T>& value, ::std::ostream* os) {
+    *os << '(';
+    if (!value) {
+      *os << "nullopt";
+    } else {
+      UniversalPrint(*value, os);
+    }
+    *os << ')';
+  }
+};
+
+template <>
+class UniversalPrinter<decltype(Nullopt())> {
+ public:
+  static void Print(decltype(Nullopt()), ::std::ostream* os) {
+    *os << "(nullopt)";
+  }
+};
+
+#endif  // GTEST_INTERNAL_HAS_OPTIONAL
+
+#if GTEST_INTERNAL_HAS_VARIANT
+
+// Printer for std::variant / absl::variant
+
+template <typename... T>
+class UniversalPrinter<Variant<T...>> {
+ public:
+  static void Print(const Variant<T...>& value, ::std::ostream* os) {
+    *os << '(';
+#if GTEST_HAS_ABSL
+    absl::visit(Visitor{os, value.index()}, value);
+#else
+    std::visit(Visitor{os, value.index()}, value);
+#endif  // GTEST_HAS_ABSL
+    *os << ')';
+  }
+
+ private:
+  struct Visitor {
+    template <typename U>
+    void operator()(const U& u) const {
+      *os << "'" << GetTypeName<U>() << "(index = " << index
+          << ")' with value ";
+      UniversalPrint(u, os);
+    }
+    ::std::ostream* os;
+    std::size_t index;
+  };
+};
+
+#endif  // GTEST_INTERNAL_HAS_VARIANT
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(const char* begin, size_t len,
+                                    ::std::ostream* os);
+
+#ifdef __cpp_char8_t
+// This overload prints a (const) char8_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len,
+                                    ::std::ostream* os);
+#endif
+
+// This overload prints a (const) char16_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len,
+                                    ::std::ostream* os);
+
+// This overload prints a (const) char32_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
+                                    ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len,
+                                    ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream* os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  static void Print(const T& value, ::std::ostream* os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+  static void Print(const T& value, ::std::ostream* os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream* os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+  static void Print(const char* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(std::string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char*> : public UniversalTersePrinter<const char*> {
+};
+
+#ifdef __cpp_char8_t
+template <>
+class UniversalTersePrinter<const char8_t*> {
+ public:
+  static void Print(const char8_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u8string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char8_t*>
+    : public UniversalTersePrinter<const char8_t*> {};
+#endif
+
+template <>
+class UniversalTersePrinter<const char16_t*> {
+ public:
+  static void Print(const char16_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u16string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char16_t*>
+    : public UniversalTersePrinter<const char16_t*> {};
+
+template <>
+class UniversalTersePrinter<const char32_t*> {
+ public:
+  static void Print(const char32_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u32string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char32_t*>
+    : public UniversalTersePrinter<const char32_t*> {};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+  static void Print(const wchar_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+  static void Print(wchar_t* str, ::std::ostream* os) {
+    UniversalTersePrinter<const wchar_t*>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+  // A workarond for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector<::std::string> Strings;
+
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
+template <typename Tuple>
+void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
+                               Strings*) {}
+template <typename Tuple, size_t I>
+void TersePrintPrefixToStrings(const Tuple& t,
+                               std::integral_constant<size_t, I>,
+                               Strings* strings) {
+  TersePrintPrefixToStrings(t, std::integral_constant<size_t, I - 1>(),
+                            strings);
+  ::std::stringstream ss;
+  UniversalTersePrint(std::get<I - 1>(t), &ss);
+  strings->push_back(ss.str());
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+  Strings result;
+  TersePrintPrefixToStrings(
+      value, std::integral_constant<size_t, std::tuple_size<Tuple>::value>(),
+      &result);
+  return result;
+}
+
+}  // namespace internal
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gtest/internal/custom/gtest-printers.h"
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
new file mode 100644
index 0000000000..bec8c4810b
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
@@ -0,0 +1,248 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include "gtest/gtest.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument to the two arguments constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  // The two possible mocking modes of this object.
+  enum InterceptMode {
+    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
+    INTERCEPT_ALL_THREADS           // Intercepts all failures.
+  };
+
+  // The c'tor sets this object as the test part result reporter used
+  // by Google Test.  The 'result' parameter specifies where to report the
+  // results. This reporter will only catch failures generated in the current
+  // thread. DEPRECATED
+  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+  // Same as above, but you can choose the interception scope of this object.
+  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+                                   TestPartResultArray* result);
+
+  // The d'tor restores the previous test part result reporter.
+  ~ScopedFakeTestPartResultReporter() override;
+
+  // Appends the TestPartResult object to the TestPartResultArray
+  // received in the constructor.
+  //
+  // This method is from the TestPartResultReporterInterface
+  // interface.
+  void ReportTestPartResult(const TestPartResult& result) override;
+
+ private:
+  void Init();
+
+  const InterceptMode intercept_mode_;
+  TestPartResultReporterInterface* old_reporter_;
+  TestPartResultArray* const result_;
+
+  ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) =
+      delete;
+  ScopedFakeTestPartResultReporter& operator=(
+      const ScopedFakeTestPartResultReporter&) = delete;
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+  // The constructor remembers the arguments.
+  SingleFailureChecker(const TestPartResultArray* results,
+                       TestPartResult::Type type, const std::string& substr);
+  ~SingleFailureChecker();
+
+ private:
+  const TestPartResultArray* const results_;
+  const TestPartResult::Type type_;
+  const std::string substr_;
+
+  SingleFailureChecker(const SingleFailureChecker&) = delete;
+  SingleFailureChecker& operator=(const SingleFailureChecker&) = delete;
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but
+// not a non-fatal failure, as from EXPECT_EQ).  It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr)                               \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::                       \
+              INTERCEPT_ONLY_CURRENT_THREAD,                                  \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr)                \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr)                    \
+  do {                                                                \
+    ::testing::TestPartResultArray gtest_failures;                    \
+    ::testing::internal::SingleFailureChecker gtest_checker(          \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));                                                    \
+    {                                                                 \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(     \
+          ::testing::ScopedFakeTestPartResultReporter::               \
+              INTERCEPT_ONLY_CURRENT_THREAD,                          \
+          &gtest_failures);                                           \
+      if (::testing::internal::AlwaysTrue()) {                        \
+        statement;                                                    \
+      }                                                               \
+    }                                                                 \
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr)             \
+  do {                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure,         \
+        (substr));                                                            \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      if (::testing::internal::AlwaysTrue()) {                                \
+        statement;                                                            \
+      }                                                                       \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
new file mode 100644
index 0000000000..09cc8c34f0
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
@@ -0,0 +1,190 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+  // The possible outcomes of a test part (i.e. an assertion or an
+  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+  enum Type {
+    kSuccess,          // Succeeded.
+    kNonFatalFailure,  // Failed but the test can continue.
+    kFatalFailure,     // Failed and the test should be terminated.
+    kSkip              // Skipped.
+  };
+
+  // C'tor.  TestPartResult does NOT have a default constructor.
+  // Always use this constructor (with parameters) to create a
+  // TestPartResult object.
+  TestPartResult(Type a_type, const char* a_file_name, int a_line_number,
+                 const char* a_message)
+      : type_(a_type),
+        file_name_(a_file_name == nullptr ? "" : a_file_name),
+        line_number_(a_line_number),
+        summary_(ExtractSummary(a_message)),
+        message_(a_message) {}
+
+  // Gets the outcome of the test part.
+  Type type() const { return type_; }
+
+  // Gets the name of the source file where the test part took place, or
+  // NULL if it's unknown.
+  const char* file_name() const {
+    return file_name_.empty() ? nullptr : file_name_.c_str();
+  }
+
+  // Gets the line in the source file where the test part took place,
+  // or -1 if it's unknown.
+  int line_number() const { return line_number_; }
+
+  // Gets the summary of the failure message.
+  const char* summary() const { return summary_.c_str(); }
+
+  // Gets the message associated with the test part.
+  const char* message() const { return message_.c_str(); }
+
+  // Returns true if and only if the test part was skipped.
+  bool skipped() const { return type_ == kSkip; }
+
+  // Returns true if and only if the test part passed.
+  bool passed() const { return type_ == kSuccess; }
+
+  // Returns true if and only if the test part non-fatally failed.
+  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+  // Returns true if and only if the test part fatally failed.
+  bool fatally_failed() const { return type_ == kFatalFailure; }
+
+  // Returns true if and only if the test part failed.
+  bool failed() const { return fatally_failed() || nonfatally_failed(); }
+
+ private:
+  Type type_;
+
+  // Gets the summary of the failure message by omitting the stack
+  // trace in it.
+  static std::string ExtractSummary(const char* message);
+
+  // The name of the source file where the test part took place, or
+  // "" if the source file is unknown.
+  std::string file_name_;
+  // The line in the source file where the test part took place, or -1
+  // if the line number is unknown.
+  int line_number_;
+  std::string summary_;  // The test failure summary.
+  std::string message_;  // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+  TestPartResultArray() {}
+
+  // Appends the given TestPartResult to the array.
+  void Append(const TestPartResult& result);
+
+  // Returns the TestPartResult at the given index (0-based).
+  const TestPartResult& GetTestPartResult(int index) const;
+
+  // Returns the number of TestPartResult objects in the array.
+  int size() const;
+
+ private:
+  std::vector<TestPartResult> array_;
+
+  TestPartResultArray(const TestPartResultArray&) = delete;
+  TestPartResultArray& operator=(const TestPartResultArray&) = delete;
+};
+
+// This interface knows how to report a test part result.
+class GTEST_API_ TestPartResultReporterInterface {
+ public:
+  virtual ~TestPartResultReporterInterface() {}
+
+  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+    : public TestPartResultReporterInterface {
+ public:
+  HasNewFatalFailureHelper();
+  ~HasNewFatalFailureHelper() override;
+  void ReportTestPartResult(const TestPartResult& result) override;
+  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
+ private:
+  bool has_new_fatal_failure_;
+  TestPartResultReporterInterface* original_reporter_;
+
+  HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete;
+  HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete;
+};
+
+}  // namespace internal
+
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
new file mode 100644
index 0000000000..bd35a32660
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
@@ -0,0 +1,331 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list.  You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+  ...
+  typedef std::list<T> List;
+  static T shared_;
+  T value_;
+};
+
+// Next, associate a list of types with the test suite, which will be
+// repeated for each type in the list.  The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_SUITE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   TYPED_TEST_SUITE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test suite as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+  // Inside a test, refer to the special name TypeParam to get the type
+  // parameter.  Since we are inside a derived class template, C++ requires
+  // us to visit the members of FooTest via 'this'.
+  TypeParam n = this->value_;
+
+  // To visit static members of the fixture, add the TestFixture::
+  // prefix.
+  n += TestFixture::shared_;
+
+  // To refer to typedefs in the fixture, add the "typename
+  // TestFixture::" prefix.
+  typename TestFixture::List values;
+  values.push_back(n);
+  ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+// TYPED_TEST_SUITE takes an optional third argument which allows to specify a
+// class that generates custom test name suffixes based on the type. This should
+// be a class which has a static template function GetName(int index) returning
+// a string for each type. The provided integer index equals the index of the
+// type in the provided type list. In many cases the index can be ignored.
+//
+// For example:
+//   class MyTypeNames {
+//    public:
+//     template <typename T>
+//     static std::string GetName(int) {
+//       if (std::is_same<T, char>()) return "char";
+//       if (std::is_same<T, int>()) return "int";
+//       if (std::is_same<T, unsigned int>()) return "unsignedInt";
+//     }
+//   };
+//   TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames);
+
+#endif  // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type.  Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are.  The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have.  Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly.  Here's an example:
+
+#if 0
+
+// First, define a fixture class template.  It should be parameterized
+// by a type.  Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+  ...
+};
+
+// Next, declare that you will define a type-parameterized test suite
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_SUITE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test suite as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+  // Inside a test, refer to TypeParam to get the type parameter.
+  TypeParam n = 0;
+  ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them.  The first argument of the macro is the
+// test suite name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_SUITE_P(FooTest,
+                            DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want.  If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test suite name.  Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+//   INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int);
+//
+// Similar to the optional argument of TYPED_TEST_SUITE above,
+// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to
+// generate custom names.
+//   INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames);
+
+#endif  // 0
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Implements typed tests.
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef for the type parameters of the
+// given test suite.
+#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_
+
+// Expands to the name of the typedef for the NameGenerator, responsible for
+// creating the suffixes of the name.
+#define GTEST_NAME_GENERATOR_(TestSuiteName) \
+  gtest_type_params_##TestSuiteName##_NameGenerator
+
+#define TYPED_TEST_SUITE(CaseName, Types, ...)                          \
+  typedef ::testing::internal::GenerateTypeList<Types>::type            \
+      GTEST_TYPE_PARAMS_(CaseName);                                     \
+  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
+  GTEST_NAME_GENERATOR_(CaseName)
+
+#define TYPED_TEST(CaseName, TestName)                                        \
+  static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1,                       \
+                "test-name must not be empty");                               \
+  template <typename gtest_TypeParam_>                                        \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                            \
+      : public CaseName<gtest_TypeParam_> {                                   \
+   private:                                                                   \
+    typedef CaseName<gtest_TypeParam_> TestFixture;                           \
+    typedef gtest_TypeParam_ TypeParam;                                       \
+    void TestBody() override;                                                 \
+  };                                                                          \
+  static bool gtest_##CaseName##_##TestName##_registered_                     \
+      GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest<   \
+          CaseName,                                                           \
+          ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(CaseName,   \
+                                                                  TestName)>, \
+          GTEST_TYPE_PARAMS_(                                                 \
+              CaseName)>::Register("",                                        \
+                                   ::testing::internal::CodeLocation(         \
+                                       __FILE__, __LINE__),                   \
+                                   GTEST_STRINGIFY_(CaseName),                \
+                                   GTEST_STRINGIFY_(TestName), 0,             \
+                                   ::testing::internal::GenerateNames<        \
+                                       GTEST_NAME_GENERATOR_(CaseName),       \
+                                       GTEST_TYPE_PARAMS_(CaseName)>());      \
+  template <typename gtest_TypeParam_>                                        \
+  void GTEST_TEST_CLASS_NAME_(CaseName,                                       \
+                              TestName)<gtest_TypeParam_>::TestBody()
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE                                                \
+  static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \
+  TYPED_TEST_SUITE
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Implements type-parameterized tests.
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test suite are defined in.  The exact
+// name of the namespace is subject to change without notice.
+#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test suite.
+#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \
+  gtest_typed_test_suite_p_state_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test suite.
+#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \
+  gtest_registered_test_names_##TestSuiteName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+#define TYPED_TEST_SUITE_P(SuiteName)              \
+  static ::testing::internal::TypedTestSuitePState \
+  GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE_P                                                 \
+  static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \
+  TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define TYPED_TEST_P(SuiteName, TestName)                             \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                       \
+    template <typename gtest_TypeParam_>                              \
+    class TestName : public SuiteName<gtest_TypeParam_> {             \
+     private:                                                         \
+      typedef SuiteName<gtest_TypeParam_> TestFixture;                \
+      typedef gtest_TypeParam_ TypeParam;                             \
+      void TestBody() override;                                       \
+    };                                                                \
+    static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+        GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
+            __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName),          \
+            GTEST_STRINGIFY_(TestName));                              \
+  }                                                                   \
+  template <typename gtest_TypeParam_>                                \
+  void GTEST_SUITE_NAMESPACE_(                                        \
+      SuiteName)::TestName<gtest_TypeParam_>::TestBody()
+
+// Note: this won't work correctly if the trailing arguments are macros.
+#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                         \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                             \
+    typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_;    \
+  }                                                                         \
+  static const char* const GTEST_REGISTERED_TEST_NAMES_(                    \
+      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                  \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
+          GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define REGISTER_TYPED_TEST_CASE_P                                           \
+  static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \
+                "");                                                         \
+  REGISTER_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)     \
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                     \
+                "test-suit-prefix must not be empty");                    \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =      \
+      ::testing::internal::TypeParameterizedTestSuite<                    \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,  \
+          ::testing::internal::GenerateTypeList<Types>::type>::           \
+          Register(GTEST_STRINGIFY_(Prefix),                              \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),           \
+                   GTEST_STRINGIFY_(SuiteName),                           \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),               \
+                   ::testing::internal::GenerateNames<                    \
+                       ::testing::internal::NameGeneratorSelector<        \
+                           __VA_ARGS__>::type,                            \
+                       ::testing::internal::GenerateTypeList<Types>::type>())
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TYPED_TEST_CASE_P                                      \
+  static_assert(                                                           \
+      ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \
+  INSTANTIATE_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h
index 581a44e95f..d19a587a18 100644
--- a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h
@@ -26,10 +26,8 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
+
+// The Google C++ Testing and Mocking Framework (Google Test)
 //
 // This header file defines the public API for Google Test.  It should be
 // included by any test program that uses Google Test.
@@ -48,17452 +46,31 @@
 // registration from Barthelemy Dagenais' (barthelemy@prologique.com)
 // easyUnit framework.
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
 
+#include <cstddef>
 #include <limits>
+#include <memory>
 #include <ostream>
+#include <type_traits>
 #include <vector>
 
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
-//
-// This header file declares functions and macros used internally by
-// Google Test.  They are subject to change without notice.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: wan@google.com (Zhanyong Wan)
-//
-// Low-level types and utilities for porting Google Test to various
-// platforms.  They are subject to change without notice.  DO NOT USE
-// THEM IN USER CODE.
-//
-// This file is fundamental to Google Test.  All other Google Test source
-// files are expected to #include this.  Therefore, it cannot #include
-// any other Google Test header.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-
-// The user can define the following macros in the build script to
-// control Google Test's behavior.  If the user doesn't define a macro
-// in this list, Google Test will define it.
-//
-//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
-//                              is/isn't available.
-//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
-//                              are enabled.
-//   GTEST_HAS_GLOBAL_STRING  - Define it to 1/0 to indicate that ::string
-//                              is/isn't available (some systems define
-//                              ::string, which is different to std::string).
-//   GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string
-//                              is/isn't available (some systems define
-//                              ::wstring, which is different to std::wstring).
-//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
-//                              expressions are/aren't available.
-//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
-//                              is/isn't available.
-//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
-//                              enabled.
-//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
-//                              std::wstring does/doesn't work (Google Test can
-//                              be used where std::wstring is unavailable).
-//   GTEST_HAS_TR1_TUPLE      - Define it to 1/0 to indicate tr1::tuple
-//                              is/isn't available.
-//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
-//                              compiler supports Microsoft's "Structured
-//                              Exception Handling".
-//   GTEST_HAS_STREAM_REDIRECTION
-//                            - Define it to 1/0 to indicate whether the
-//                              platform supports I/O stream redirection using
-//                              dup() and dup2().
-//   GTEST_USE_OWN_TR1_TUPLE  - Define it to 1/0 to indicate whether Google
-//                              Test's own tr1 tuple implementation should be
-//                              used.  Unused when the user sets
-//                              GTEST_HAS_TR1_TUPLE to 0.
-//   GTEST_LANG_CXX11         - Define it to 1/0 to indicate that Google Test
-//                              is building in C++11/C++98 mode.
-//   GTEST_LINKED_AS_SHARED_LIBRARY
-//                            - Define to 1 when compiling tests that use
-//                              Google Test as a shared library (known as
-//                              DLL on Windows).
-//   GTEST_CREATE_SHARED_LIBRARY
-//                            - Define to 1 when compiling Google Test itself
-//                              as a shared library.
-
-// This header defines the following utilities:
-//
-// Macros indicating the current platform (defined to 1 if compiled on
-// the given platform; otherwise undefined):
-//   GTEST_OS_AIX      - IBM AIX
-//   GTEST_OS_CYGWIN   - Cygwin
-//   GTEST_OS_HPUX     - HP-UX
-//   GTEST_OS_LINUX    - Linux
-//     GTEST_OS_LINUX_ANDROID - Google Android
-//   GTEST_OS_MAC      - Mac OS X
-//     GTEST_OS_IOS    - iOS
-//       GTEST_OS_IOS_SIMULATOR - iOS simulator
-//   GTEST_OS_NACL     - Google Native Client (NaCl)
-//   GTEST_OS_OPENBSD  - OpenBSD
-//   GTEST_OS_QNX      - QNX
-//   GTEST_OS_SOLARIS  - Sun Solaris
-//   GTEST_OS_SYMBIAN  - Symbian
-//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
-//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
-//     GTEST_OS_WINDOWS_MINGW    - MinGW
-//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
-//   GTEST_OS_ZOS      - z/OS
-//
-// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the
-// most stable support.  Since core members of the Google Test project
-// don't have access to other platforms, support for them may be less
-// stable.  If you notice any problems on your platform, please notify
-// googletestframework@googlegroups.com (patches for fixing them are
-// even more welcome!).
-//
-// Note that it is possible that none of the GTEST_OS_* macros are defined.
-//
-// Macros indicating available Google Test features (defined to 1 if
-// the corresponding feature is supported; otherwise undefined):
-//   GTEST_HAS_COMBINE      - the Combine() function (for value-parameterized
-//                            tests)
-//   GTEST_HAS_DEATH_TEST   - death tests
-//   GTEST_HAS_PARAM_TEST   - value-parameterized tests
-//   GTEST_HAS_TYPED_TEST   - typed tests
-//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
-//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
-//                            GTEST_HAS_POSIX_RE (see above) which users can
-//                            define themselves.
-//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
-//                            the above two are mutually exclusive.
-//   GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
-//
-// Macros for basic C++ coding:
-//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
-//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
-//                              variable don't have to be used.
-//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
-//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
-//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
-//
-// Synchronization:
-//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
-//                  - synchronization primitives.
-//   GTEST_IS_THREADSAFE - defined to 1 to indicate that the above
-//                         synchronization primitives have real implementations
-//                         and Google Test is thread-safe; or 0 otherwise.
-//
-// Template meta programming:
-//   is_pointer     - as in TR1; needed on Symbian and IBM XL C/C++ only.
-//   IteratorTraits - partial implementation of std::iterator_traits, which
-//                    is not available in libCstd when compiled with Sun C++.
-//
-// Smart pointers:
-//   scoped_ptr     - as in TR2.
-//
-// Regular expressions:
-//   RE             - a simple regular expression class using the POSIX
-//                    Extended Regular Expression syntax on UNIX-like
-//                    platforms, or a reduced regular exception syntax on
-//                    other platforms, including Windows.
-//
-// Logging:
-//   GTEST_LOG_()   - logs messages at the specified severity level.
-//   LogToStderr()  - directs all log messages to stderr.
-//   FlushInfoLog() - flushes informational log messages.
-//
-// Stdout and stderr capturing:
-//   CaptureStdout()     - starts capturing stdout.
-//   GetCapturedStdout() - stops capturing stdout and returns the captured
-//                         string.
-//   CaptureStderr()     - starts capturing stderr.
-//   GetCapturedStderr() - stops capturing stderr and returns the captured
-//                         string.
-//
-// Integer types:
-//   TypeWithSize   - maps an integer to a int type.
-//   Int32, UInt32, Int64, UInt64, TimeInMillis
-//                  - integers of known sizes.
-//   BiggestInt     - the biggest signed integer type.
-//
-// Command-line utilities:
-//   GTEST_FLAG()       - references a flag.
-//   GTEST_DECLARE_*()  - declares a flag.
-//   GTEST_DEFINE_*()   - defines a flag.
-//   GetInjectableArgvs() - returns the command line as a vector of strings.
-//
-// Environment variable utilities:
-//   GetEnv()             - gets the value of an environment variable.
-//   BoolFromGTestEnv()   - parses a bool environment variable.
-//   Int32FromGTestEnv()  - parses an Int32 environment variable.
-//   StringFromGTestEnv() - parses a string environment variable.
-
-#include <ctype.h>   // for isspace, etc
-#include <stddef.h>  // for ptrdiff_t
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
-#endif  // !_WIN32_WCE
-
-#if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
-#endif
-
-#include <iostream>  // NOLINT
-#include <sstream>  // NOLINT
-#include <string>  // NOLINT
-
-#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-#define GTEST_FLAG_PREFIX_ "gtest_"
-#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-#define GTEST_NAME_ "Google Test"
-#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/"
-
-// Determines the version of gcc that is used to compile this.
-#ifdef __GNUC__
-// 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
-    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif  // __GNUC__
-
-// Determines the platform on which Google Test is compiled.
-#ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-#elif defined __SYMBIAN32__
-# define GTEST_OS_SYMBIAN 1
-#elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-#  define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(__MINGW__) || defined(__MINGW32__)
-#  define GTEST_OS_WINDOWS_MINGW 1
-# else
-#  define GTEST_OS_WINDOWS_DESKTOP 1
-# endif  // _WIN32_WCE
-#elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# if TARGET_OS_IPHONE
-#  define GTEST_OS_IOS 1
-#  if TARGET_IPHONE_SIMULATOR
-#   define GTEST_OS_IOS_SIMULATOR 1
-#  endif
-# endif
-#elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-#  define GTEST_OS_LINUX_ANDROID 1
-# endif
-#elif defined __MVS__
-# define GTEST_OS_ZOS 1
-#elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
-#elif defined(_AIX)
-# define GTEST_OS_AIX 1
-#elif defined(__hpux)
-# define GTEST_OS_HPUX 1
-#elif defined __native_client__
-# define GTEST_OS_NACL 1
-#elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
-#elif defined __QNX__
-# define GTEST_OS_QNX 1
-#endif  // __CYGWIN__
-
-#ifndef GTEST_LANG_CXX11
-// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
-// -std={c,gnu}++{0x,11} is passed.  The C++11 standard specifies a
-// value for __cplusplus, and recent versions of clang, gcc, and
-// probably other compilers set that too in C++11 mode.
-# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
-// Compiling in at least C++11 mode.
-#  define GTEST_LANG_CXX11 1
-# else
-#  define GTEST_LANG_CXX11 0
-# endif
-#endif
-
-// Brings in definitions for functions used in the testing::internal::posix
-// namespace (read, write, close, chdir, isatty, stat). We do not currently
-// use them on Windows Mobile.
-#if !GTEST_OS_WINDOWS
-// This assumes that non-Windows OSes provide unistd.h. For OSes where this
-// is not the case, we need to include headers that provide the functions
-// mentioned above.
-# include <unistd.h>
-# include <strings.h>
-#elif !GTEST_OS_WINDOWS_MOBILE
-# include <direct.h>
-# include <io.h>
-#endif
-
-#if GTEST_OS_LINUX_ANDROID
-// Used to define __ANDROID_API__ matching the target NDK API level.
-#  include <android/api-level.h>  // NOLINT
-#endif
-
-// Defines this to true iff Google Test can use POSIX regular expressions.
-#ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
-// On Android, <regex.h> is only available starting with Gingerbread.
-#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
-#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
-# endif
-#endif
-
-#if GTEST_HAS_POSIX_RE
-
-// On some platforms, <regex.h> needs someone to define size_t, and
-// won't compile otherwise.  We can #include it here as we already
-// included <stdlib.h>, which is guaranteed to define size_t through
-// <stddef.h>.
-# include <regex.h>  // NOLINT
-
-# define GTEST_USES_POSIX_RE 1
-
-#elif GTEST_OS_WINDOWS
-
-// <regex.h> is not available on Windows.  Use our own simple regex
-// implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#else
-
-// <regex.h> may not be available on this platform.  Use our own
-// simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#endif  // GTEST_HAS_POSIX_RE
-
-#ifndef GTEST_HAS_EXCEPTIONS
-// The user didn't tell us whether exceptions are enabled, so we need
-// to figure it out.
-# if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
-// macro to enable exceptions, so we'll do the same.
-// Assumes that exceptions are enabled by default.
-#  ifndef _HAS_EXCEPTIONS
-#   define _HAS_EXCEPTIONS 1
-#  endif  // _HAS_EXCEPTIONS
-#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__GNUC__) && __EXCEPTIONS
-// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
-// Sun Pro CC supports exceptions.  However, there is no compile-time way of
-// detecting whether they are enabled or not.  Therefore, we assume that
-// they are enabled unless the user tells us otherwise.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
-// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
-// Exception handling is in effect by default in HP aCC compiler. It has to
-// be turned of by +noeh compiler option if desired.
-#  define GTEST_HAS_EXCEPTIONS 1
-# else
-// For other compilers, we assume exceptions are disabled to be
-// conservative.
-#  define GTEST_HAS_EXCEPTIONS 0
-# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
-#endif  // GTEST_HAS_EXCEPTIONS
-
-#if !defined(GTEST_HAS_STD_STRING)
-// Even though we don't use this macro any longer, we keep it in case
-// some clients still depend on it.
-# define GTEST_HAS_STD_STRING 1
-#elif !GTEST_HAS_STD_STRING
-// The user told us that ::std::string isn't available.
-# error "Google Test cannot be used where ::std::string isn't available."
-#endif  // !defined(GTEST_HAS_STD_STRING)
-
-#ifndef GTEST_HAS_GLOBAL_STRING
-// The user didn't tell us whether ::string is available, so we need
-// to figure it out.
-
-# define GTEST_HAS_GLOBAL_STRING 0
-
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-#ifndef GTEST_HAS_STD_WSTRING
-// The user didn't tell us whether ::std::wstring is available, so we need
-// to figure it out.
-// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring
-//   is available.
-
-// Cygwin 1.7 and below doesn't support ::std::wstring.
-// Solaris' libc++ doesn't support it either.  Android has
-// no support for it at least as recent as Froyo (2.2).
-# define GTEST_HAS_STD_WSTRING \
-    (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
-
-#endif  // GTEST_HAS_STD_WSTRING
-
-#ifndef GTEST_HAS_GLOBAL_WSTRING
-// The user didn't tell us whether ::wstring is available, so we need
-// to figure it out.
-# define GTEST_HAS_GLOBAL_WSTRING \
-    (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-// Determines whether RTTI is available.
-#ifndef GTEST_HAS_RTTI
-// The user didn't tell us whether RTTI is enabled, so we need to
-// figure it out.
-
-# ifdef _MSC_VER
-
-#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
-
-// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
-# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
-
-#  ifdef __GXX_RTTI
-// When building against STLport with the Android NDK and with
-// -frtti -fno-exceptions, the build fails at link time with undefined
-// references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
-// so disable RTTI when detected.
-#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
-       !defined(__EXCEPTIONS)
-#    define GTEST_HAS_RTTI 0
-#   else
-#    define GTEST_HAS_RTTI 1
-#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif  // __GXX_RTTI
-
-// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
-// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
-// first version with C++ support.
-# elif defined(__clang__)
-
-#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
-
-// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
-// both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
-
-#  ifdef __RTTI_ALL__
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
-
-# else
-
-// For all other compilers, we assume RTTI is enabled.
-#  define GTEST_HAS_RTTI 1
-
-# endif  // _MSC_VER
-
-#endif  // GTEST_HAS_RTTI
-
-// It's this header's responsibility to #include <typeinfo> when RTTI
-// is enabled.
-#if GTEST_HAS_RTTI
-# include <typeinfo>
-#endif
-
-// Determines whether Google Test can use the pthreads library.
-#ifndef GTEST_HAS_PTHREAD
-// The user didn't tell us explicitly, so we assume pthreads support is
-// available on Linux and Mac.
-//
-// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
-// to your compiler flags.
-# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
-    || GTEST_OS_QNX)
-#endif  // GTEST_HAS_PTHREAD
-
-#if GTEST_HAS_PTHREAD
-// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
-// true.
-# include <pthread.h>  // NOLINT
-
-// For timespec and nanosleep, used below.
-# include <time.h>  // NOLINT
-#endif
-
-// Determines whether Google Test can use tr1/tuple.  You can define
-// this macro to 0 to prevent Google Test from using tuple (any
-// feature depending on tuple with be disabled in this mode).
-#ifndef GTEST_HAS_TR1_TUPLE
-# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
-// STLport, provided with the Android NDK, has neither <tr1/tuple> or <tuple>.
-#  define GTEST_HAS_TR1_TUPLE 0
-# else
-// The user didn't tell us not to do it, so we assume it's OK.
-#  define GTEST_HAS_TR1_TUPLE 1
-# endif
-#endif  // GTEST_HAS_TR1_TUPLE
-
-// Determines whether Google Test's own tr1 tuple implementation
-// should be used.
-#ifndef GTEST_USE_OWN_TR1_TUPLE
-// The user didn't tell us, so we need to figure it out.
-
-// We use our own TR1 tuple if we aren't sure the user has an
-// implementation of it already.  At this time, libstdc++ 4.0.0+ and
-// MSVC 2010 are the only mainstream standard libraries that come
-// with a TR1 tuple implementation.  NVIDIA's CUDA NVCC compiler
-// pretends to be GCC by defining __GNUC__ and friends, but cannot
-// compile GCC's tuple implementation.  MSVC 2008 (9.0) provides TR1
-// tuple in a 323 MB Feature Pack download, which we cannot assume the
-// user has.  QNX's QCC compiler is a modified GCC but it doesn't
-// support TR1 tuple.  libc++ only provides std::tuple, in C++11 mode,
-// and it can be used with some compilers that define __GNUC__.
-# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
-      && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
-#  define GTEST_ENV_HAS_TR1_TUPLE_ 1
-# endif
-
-// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
-// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
-// can build with clang but need to use gcc4.2's libstdc++).
-# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
-#  define GTEST_ENV_HAS_STD_TUPLE_ 1
-# endif
-
-# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
-#  define GTEST_USE_OWN_TR1_TUPLE 0
-# else
-#  define GTEST_USE_OWN_TR1_TUPLE 1
-# endif
-
-#endif  // GTEST_USE_OWN_TR1_TUPLE
-
-// To avoid conditional compilation everywhere, we make it
-// gtest-port.h's responsibility to #include the header implementing
-// tr1/tuple.
-#if GTEST_HAS_TR1_TUPLE
-
-# if GTEST_USE_OWN_TR1_TUPLE
-// This file was GENERATED by command:
-//     pump.py gtest-tuple.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2009 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-
-#include <utility>  // For ::std::pair.
-
-// The compiler used in Symbian has a bug that prevents us from declaring the
-// tuple template as a friend (it complains that tuple is redefined).  This
-// hack bypasses the bug by declaring the members that should otherwise be
-// private as public.
-// Sun Studio versions < 12 also have the above bug.
-#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
-# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
-#else
-# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
-    template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
-   private:
-#endif
-
-// GTEST_n_TUPLE_(T) is the type of an n-tuple.
-#define GTEST_0_TUPLE_(T) tuple<>
-#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
-    void, void, void>
-#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
-    void, void, void>
-#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
-    void, void, void>
-#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
-    void, void, void>
-#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
-    void, void, void>
-#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
-    void, void, void>
-#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    void, void, void>
-#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    T##7, void, void>
-#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    T##7, T##8, void>
-#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
-    T##7, T##8, T##9>
-
-// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
-#define GTEST_0_TYPENAMES_(T)
-#define GTEST_1_TYPENAMES_(T) typename T##0
-#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
-#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
-#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3
-#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4
-#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5
-#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6
-#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
-#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6, \
-    typename T##7, typename T##8
-#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
-    typename T##3, typename T##4, typename T##5, typename T##6, \
-    typename T##7, typename T##8, typename T##9
-
-// In theory, defining stuff in the ::std namespace is undefined
-// behavior.  We can do this as we are playing the role of a standard
-// library vendor.
-namespace std {
-namespace tr1 {
-
-template <typename T0 = void, typename T1 = void, typename T2 = void,
-    typename T3 = void, typename T4 = void, typename T5 = void,
-    typename T6 = void, typename T7 = void, typename T8 = void,
-    typename T9 = void>
-class tuple;
-
-// Anything in namespace gtest_internal is Google Test's INTERNAL
-// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
-namespace gtest_internal {
-
-// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
-template <typename T>
-struct ByRef { typedef const T& type; };  // NOLINT
-template <typename T>
-struct ByRef<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper for ByRef.
-#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
-
-// AddRef<T>::type is T if T is a reference; otherwise it's T&.  This
-// is the same as tr1::add_reference<T>::type.
-template <typename T>
-struct AddRef { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddRef<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper for AddRef.
-#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
-
-// A helper for implementing get<k>().
-template <int k> class Get;
-
-// A helper for implementing tuple_element<k, T>.  kIndexValid is true
-// iff k < the number of fields in tuple type T.
-template <bool kIndexValid, int kIndex, class Tuple>
-struct TupleElement;
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
-  typedef T0 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
-  typedef T1 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
-  typedef T2 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
-  typedef T3 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
-  typedef T4 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
-  typedef T5 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
-  typedef T6 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
-  typedef T7 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
-  typedef T8 type;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
-  typedef T9 type;
-};
-
-}  // namespace gtest_internal
-
-template <>
-class tuple<> {
- public:
-  tuple() {}
-  tuple(const tuple& /* t */)  {}
-  tuple& operator=(const tuple& /* t */) { return *this; }
-};
-
-template <GTEST_1_TYPENAMES_(T)>
-class GTEST_1_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
-
-  tuple(const tuple& t) : f0_(t.f0_) {}
-
-  template <GTEST_1_TYPENAMES_(U)>
-  tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_1_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_1_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    return *this;
-  }
-
-  T0 f0_;
-};
-
-template <GTEST_2_TYPENAMES_(T)>
-class GTEST_2_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
-      f1_(f1) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
-
-  template <GTEST_2_TYPENAMES_(U)>
-  tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
-  template <typename U0, typename U1>
-  tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_2_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-  template <typename U0, typename U1>
-  tuple& operator=(const ::std::pair<U0, U1>& p) {
-    f0_ = p.first;
-    f1_ = p.second;
-    return *this;
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_2_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-};
-
-template <GTEST_3_TYPENAMES_(T)>
-class GTEST_3_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
-
-  template <GTEST_3_TYPENAMES_(U)>
-  tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_3_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_3_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-};
-
-template <GTEST_4_TYPENAMES_(T)>
-class GTEST_4_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
-      f3_(f3) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
-
-  template <GTEST_4_TYPENAMES_(U)>
-  tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_4_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_4_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-};
-
-template <GTEST_5_TYPENAMES_(T)>
-class GTEST_5_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
-      GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_) {}
-
-  template <GTEST_5_TYPENAMES_(U)>
-  tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_5_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_5_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-};
-
-template <GTEST_6_TYPENAMES_(T)>
-class GTEST_6_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
-      f5_(f5) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_) {}
-
-  template <GTEST_6_TYPENAMES_(U)>
-  tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_6_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_6_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-};
-
-template <GTEST_7_TYPENAMES_(T)>
-class GTEST_7_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
-      f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
-
-  template <GTEST_7_TYPENAMES_(U)>
-  tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_7_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_7_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-};
-
-template <GTEST_8_TYPENAMES_(T)>
-class GTEST_8_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
-      GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
-      f5_(f5), f6_(f6), f7_(f7) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
-
-  template <GTEST_8_TYPENAMES_(U)>
-  tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_8_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_8_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    f7_ = t.f7_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-  T7 f7_;
-};
-
-template <GTEST_9_TYPENAMES_(T)>
-class GTEST_9_TUPLE_(T) {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
-      GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
-      f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
-
-  template <GTEST_9_TYPENAMES_(U)>
-  tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_9_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_9_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    f7_ = t.f7_;
-    f8_ = t.f8_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-  T7 f7_;
-  T8 f8_;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-class tuple {
- public:
-  template <int k> friend class gtest_internal::Get;
-
-  tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
-      f9_() {}
-
-  explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
-      GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
-      GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
-      GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
-      f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
-
-  tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
-      f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
-
-  template <GTEST_10_TYPENAMES_(U)>
-  tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
-      f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
-      f9_(t.f9_) {}
-
-  tuple& operator=(const tuple& t) { return CopyFrom(t); }
-
-  template <GTEST_10_TYPENAMES_(U)>
-  tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
-    return CopyFrom(t);
-  }
-
-  GTEST_DECLARE_TUPLE_AS_FRIEND_
-
-  template <GTEST_10_TYPENAMES_(U)>
-  tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
-    f0_ = t.f0_;
-    f1_ = t.f1_;
-    f2_ = t.f2_;
-    f3_ = t.f3_;
-    f4_ = t.f4_;
-    f5_ = t.f5_;
-    f6_ = t.f6_;
-    f7_ = t.f7_;
-    f8_ = t.f8_;
-    f9_ = t.f9_;
-    return *this;
-  }
-
-  T0 f0_;
-  T1 f1_;
-  T2 f2_;
-  T3 f3_;
-  T4 f4_;
-  T5 f5_;
-  T6 f6_;
-  T7 f7_;
-  T8 f8_;
-  T9 f9_;
-};
-
-// 6.1.3.2 Tuple creation functions.
-
-// Known limitations: we don't support passing an
-// std::tr1::reference_wrapper<T> to make_tuple().  And we don't
-// implement tie().
-
-inline tuple<> make_tuple() { return tuple<>(); }
-
-template <GTEST_1_TYPENAMES_(T)>
-inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
-  return GTEST_1_TUPLE_(T)(f0);
-}
-
-template <GTEST_2_TYPENAMES_(T)>
-inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
-  return GTEST_2_TUPLE_(T)(f0, f1);
-}
-
-template <GTEST_3_TYPENAMES_(T)>
-inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
-  return GTEST_3_TUPLE_(T)(f0, f1, f2);
-}
-
-template <GTEST_4_TYPENAMES_(T)>
-inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3) {
-  return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
-}
-
-template <GTEST_5_TYPENAMES_(T)>
-inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4) {
-  return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
-}
-
-template <GTEST_6_TYPENAMES_(T)>
-inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5) {
-  return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
-}
-
-template <GTEST_7_TYPENAMES_(T)>
-inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
-  return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
-}
-
-template <GTEST_8_TYPENAMES_(T)>
-inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
-  return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
-}
-
-template <GTEST_9_TYPENAMES_(T)>
-inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
-    const T8& f8) {
-  return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
-}
-
-template <GTEST_10_TYPENAMES_(T)>
-inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
-    const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
-    const T8& f8, const T9& f9) {
-  return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
-}
-
-// 6.1.3.3 Tuple helper classes.
-
-template <typename Tuple> struct tuple_size;
-
-template <GTEST_0_TYPENAMES_(T)>
-struct tuple_size<GTEST_0_TUPLE_(T) > {
-  static const int value = 0;
-};
-
-template <GTEST_1_TYPENAMES_(T)>
-struct tuple_size<GTEST_1_TUPLE_(T) > {
-  static const int value = 1;
-};
-
-template <GTEST_2_TYPENAMES_(T)>
-struct tuple_size<GTEST_2_TUPLE_(T) > {
-  static const int value = 2;
-};
-
-template <GTEST_3_TYPENAMES_(T)>
-struct tuple_size<GTEST_3_TUPLE_(T) > {
-  static const int value = 3;
-};
-
-template <GTEST_4_TYPENAMES_(T)>
-struct tuple_size<GTEST_4_TUPLE_(T) > {
-  static const int value = 4;
-};
-
-template <GTEST_5_TYPENAMES_(T)>
-struct tuple_size<GTEST_5_TUPLE_(T) > {
-  static const int value = 5;
-};
-
-template <GTEST_6_TYPENAMES_(T)>
-struct tuple_size<GTEST_6_TUPLE_(T) > {
-  static const int value = 6;
-};
-
-template <GTEST_7_TYPENAMES_(T)>
-struct tuple_size<GTEST_7_TUPLE_(T) > {
-  static const int value = 7;
-};
-
-template <GTEST_8_TYPENAMES_(T)>
-struct tuple_size<GTEST_8_TUPLE_(T) > {
-  static const int value = 8;
-};
-
-template <GTEST_9_TYPENAMES_(T)>
-struct tuple_size<GTEST_9_TUPLE_(T) > {
-  static const int value = 9;
-};
-
-template <GTEST_10_TYPENAMES_(T)>
-struct tuple_size<GTEST_10_TUPLE_(T) > {
-  static const int value = 10;
-};
-
-template <int k, class Tuple>
-struct tuple_element {
-  typedef typename gtest_internal::TupleElement<
-      k < (tuple_size<Tuple>::value), k, Tuple>::type type;
-};
-
-#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
-
-// 6.1.3.4 Element access.
-
-namespace gtest_internal {
-
-template <>
-class Get<0> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
-  Field(Tuple& t) { return t.f0_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
-  ConstField(const Tuple& t) { return t.f0_; }
-};
-
-template <>
-class Get<1> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
-  Field(Tuple& t) { return t.f1_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
-  ConstField(const Tuple& t) { return t.f1_; }
-};
-
-template <>
-class Get<2> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
-  Field(Tuple& t) { return t.f2_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
-  ConstField(const Tuple& t) { return t.f2_; }
-};
-
-template <>
-class Get<3> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
-  Field(Tuple& t) { return t.f3_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
-  ConstField(const Tuple& t) { return t.f3_; }
-};
-
-template <>
-class Get<4> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
-  Field(Tuple& t) { return t.f4_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
-  ConstField(const Tuple& t) { return t.f4_; }
-};
-
-template <>
-class Get<5> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
-  Field(Tuple& t) { return t.f5_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
-  ConstField(const Tuple& t) { return t.f5_; }
-};
-
-template <>
-class Get<6> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
-  Field(Tuple& t) { return t.f6_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
-  ConstField(const Tuple& t) { return t.f6_; }
-};
-
-template <>
-class Get<7> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
-  Field(Tuple& t) { return t.f7_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
-  ConstField(const Tuple& t) { return t.f7_; }
-};
-
-template <>
-class Get<8> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
-  Field(Tuple& t) { return t.f8_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
-  ConstField(const Tuple& t) { return t.f8_; }
-};
-
-template <>
-class Get<9> {
- public:
-  template <class Tuple>
-  static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
-  Field(Tuple& t) { return t.f9_; }  // NOLINT
-
-  template <class Tuple>
-  static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
-  ConstField(const Tuple& t) { return t.f9_; }
-};
-
-}  // namespace gtest_internal
-
-template <int k, GTEST_10_TYPENAMES_(T)>
-GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
-get(GTEST_10_TUPLE_(T)& t) {
-  return gtest_internal::Get<k>::Field(t);
-}
-
-template <int k, GTEST_10_TYPENAMES_(T)>
-GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k,  GTEST_10_TUPLE_(T)))
-get(const GTEST_10_TUPLE_(T)& t) {
-  return gtest_internal::Get<k>::ConstField(t);
-}
-
-// 6.1.3.5 Relational operators
-
-// We only implement == and !=, as we don't have a need for the rest yet.
-
-namespace gtest_internal {
-
-// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
-// first k fields of t1 equals the first k fields of t2.
-// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
-// k1 != k2.
-template <int kSize1, int kSize2>
-struct SameSizeTuplePrefixComparator;
-
-template <>
-struct SameSizeTuplePrefixComparator<0, 0> {
-  template <class Tuple1, class Tuple2>
-  static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
-    return true;
-  }
-};
-
-template <int k>
-struct SameSizeTuplePrefixComparator<k, k> {
-  template <class Tuple1, class Tuple2>
-  static bool Eq(const Tuple1& t1, const Tuple2& t2) {
-    return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
-        ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
-  }
-};
-
-}  // namespace gtest_internal
-
-template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
-inline bool operator==(const GTEST_10_TUPLE_(T)& t,
-                       const GTEST_10_TUPLE_(U)& u) {
-  return gtest_internal::SameSizeTuplePrefixComparator<
-      tuple_size<GTEST_10_TUPLE_(T) >::value,
-      tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
-}
-
-template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
-inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
-                       const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
-
-// 6.1.4 Pairs.
-// Unimplemented.
-
-}  // namespace tr1
-}  // namespace std
-
-#undef GTEST_0_TUPLE_
-#undef GTEST_1_TUPLE_
-#undef GTEST_2_TUPLE_
-#undef GTEST_3_TUPLE_
-#undef GTEST_4_TUPLE_
-#undef GTEST_5_TUPLE_
-#undef GTEST_6_TUPLE_
-#undef GTEST_7_TUPLE_
-#undef GTEST_8_TUPLE_
-#undef GTEST_9_TUPLE_
-#undef GTEST_10_TUPLE_
-
-#undef GTEST_0_TYPENAMES_
-#undef GTEST_1_TYPENAMES_
-#undef GTEST_2_TYPENAMES_
-#undef GTEST_3_TYPENAMES_
-#undef GTEST_4_TYPENAMES_
-#undef GTEST_5_TYPENAMES_
-#undef GTEST_6_TYPENAMES_
-#undef GTEST_7_TYPENAMES_
-#undef GTEST_8_TYPENAMES_
-#undef GTEST_9_TYPENAMES_
-#undef GTEST_10_TYPENAMES_
-
-#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
-#undef GTEST_BY_REF_
-#undef GTEST_ADD_REF_
-#undef GTEST_TUPLE_ELEMENT_
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
-# elif GTEST_ENV_HAS_STD_TUPLE_
-#  include <tuple>
-// C++11 puts its tuple into the ::std namespace rather than
-// ::std::tr1.  gtest expects tuple to live in ::std::tr1, so put it there.
-// This causes undefined behavior, but supported compilers react in
-// the way we intend.
-namespace std {
-namespace tr1 {
-using ::std::get;
-using ::std::make_tuple;
-using ::std::tuple;
-using ::std::tuple_element;
-using ::std::tuple_size;
-}
-}
-
-# elif GTEST_OS_SYMBIAN
-
-// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
-// use STLport's tuple implementation, which unfortunately doesn't
-// work as the copy of STLport distributed with Symbian is incomplete.
-// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
-// use its own tuple implementation.
-#  ifdef BOOST_HAS_TR1_TUPLE
-#   undef BOOST_HAS_TR1_TUPLE
-#  endif  // BOOST_HAS_TR1_TUPLE
-
-// This prevents <boost/tr1/detail/config.hpp>, which defines
-// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
-#  define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
-#  include <tuple>
-
-# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
-// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header.  This does
-// not conform to the TR1 spec, which requires the header to be <tuple>.
-
-#  if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
-// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
-// which is #included by <tr1/tuple>, to not compile when RTTI is
-// disabled.  _TR1_FUNCTIONAL is the header guard for
-// <tr1/functional>.  Hence the following #define is a hack to prevent
-// <tr1/functional> from being included.
-#   define _TR1_FUNCTIONAL 1
-#   include <tr1/tuple>
-#   undef _TR1_FUNCTIONAL  // Allows the user to #include
-                        // <tr1/functional> if he chooses to.
-#  else
-#   include <tr1/tuple>  // NOLINT
-#  endif  // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
-
-# else
-// If the compiler is not GCC 4.0+, we assume the user is using a
-// spec-conforming TR1 implementation.
-#  include <tuple>  // NOLINT
-# endif  // GTEST_USE_OWN_TR1_TUPLE
-
-#endif  // GTEST_HAS_TR1_TUPLE
-
-// Determines whether clone(2) is supported.
-// Usually it will only be available on Linux, excluding
-// Linux on the Itanium architecture.
-// Also see http://linux.die.net/man/2/clone.
-#ifndef GTEST_HAS_CLONE
-// The user didn't tell us, so we need to figure it out.
-
-# if GTEST_OS_LINUX && !defined(__ia64__)
-#  if GTEST_OS_LINUX_ANDROID
-// On Android, clone() is only available on ARM starting with Gingerbread.
-#    if defined(__arm__) && __ANDROID_API__ >= 9
-#     define GTEST_HAS_CLONE 1
-#    else
-#     define GTEST_HAS_CLONE 0
-#    endif
-#  else
-#   define GTEST_HAS_CLONE 1
-#  endif
-# else
-#  define GTEST_HAS_CLONE 0
-# endif  // GTEST_OS_LINUX && !defined(__ia64__)
-
-#endif  // GTEST_HAS_CLONE
-
-// Determines whether to support stream redirection. This is used to test
-// output correctness and to implement death tests.
-#ifndef GTEST_HAS_STREAM_REDIRECTION
-// By default, we assume that stream redirection is supported on all
-// platforms except known mobile ones.
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN
-#  define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-#  define GTEST_HAS_STREAM_REDIRECTION 1
-# endif  // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-
-// Determines whether to support death tests.
-// Google Test does not support death tests for VC 7.1 and earlier as
-// abort() in a VC 7.1 application compiled as GUI in debug config
-// pops up a dialog window that cannot be suppressed programmatically.
-#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
-     (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \
-     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
-     GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
-     GTEST_OS_OPENBSD || GTEST_OS_QNX)
-# define GTEST_HAS_DEATH_TEST 1
-# include <vector>  // NOLINT
-#endif
-
-// We don't support MSVC 7.1 with exceptions disabled now.  Therefore
-// all the compilers we care about are adequate for supporting
-// value-parameterized tests.
-#define GTEST_HAS_PARAM_TEST 1
-
-// Determines whether to support type-driven tests.
-
-// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
-// Sun Pro CC, IBM Visual Age, and HP aCC support.
-#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
-    defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
-#endif
-
-// Determines whether to support Combine(). This only makes sense when
-// value-parameterized tests are enabled.  The implementation doesn't
-// work on Sun Studio since it doesn't understand templated conversion
-// operators.
-#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
-# define GTEST_HAS_COMBINE 1
-#endif
-
-// Determines whether the system compiler uses UTF-16 for encoding wide strings.
-#define GTEST_WIDE_STRING_USES_UTF16_ \
-    (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
-
-// Determines whether test results can be streamed to a socket.
-#if GTEST_OS_LINUX
-# define GTEST_CAN_STREAM_RESULTS_ 1
-#endif
-
-// Defines some utility macros.
-
-// The GNU compiler emits a warning if nested "if" statements are followed by
-// an "else" statement and braces are not used to explicitly disambiguate the
-// "else" binding.  This leads to problems with code like:
-//
-//   if (gate)
-//     ASSERT_*(condition) << "Some message";
-//
-// The "switch (0) case 0:" idiom is used to suppress this.
-#ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
-#else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
-#endif
-
-// Use this annotation at the end of a struct/class definition to
-// prevent the compiler from optimizing away instances that are never
-// used.  This is useful when all interesting logic happens inside the
-// c'tor and / or d'tor.  Example:
-//
-//   struct Foo {
-//     Foo() { ... }
-//   } GTEST_ATTRIBUTE_UNUSED_;
-//
-// Also use it after a variable or parameter declaration to tell the
-// compiler the variable/parameter does not have to be used.
-#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-#else
-# define GTEST_ATTRIBUTE_UNUSED_
-#endif
-
-// A macro to disallow operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type)\
-  void operator=(type const &)
-
-// A macro to disallow copy constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
-  type(type const &);\
-  GTEST_DISALLOW_ASSIGN_(type)
-
-// Tell the compiler to warn about unused return values for functions declared
-// with this macro.  The macro should be used on function declarations
-// following the argument list:
-//
-//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
-#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
-#else
-# define GTEST_MUST_USE_RESULT_
-#endif  // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
-
-// Determine whether the compiler supports Microsoft's Structured Exception
-// Handling.  This is supported by several Windows compilers but generally
-// does not exist on any other system.
-#ifndef GTEST_HAS_SEH
-// The user didn't tell us, so we need to figure it out.
-
-# if defined(_MSC_VER) || defined(__BORLANDC__)
-// These two compilers are known to support SEH.
-#  define GTEST_HAS_SEH 1
-# else
-// Assume no SEH.
-#  define GTEST_HAS_SEH 0
-# endif
-
-#endif  // GTEST_HAS_SEH
-
-#ifdef _MSC_VER
-
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllexport)
-# endif
-
-#endif  // _MSC_VER
-
-#ifndef GTEST_API_
-# define GTEST_API_
-#endif
-
-#ifdef __GNUC__
-// Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
-#else
-# define GTEST_NO_INLINE_
-#endif
-
-// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
-#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
-# define GTEST_HAS_CXXABI_H_ 1
-#else
-# define GTEST_HAS_CXXABI_H_ 0
-#endif
-
-namespace testing {
-
-class Message;
-
-namespace internal {
-
-// A secret type that Google Test users don't know about.  It has no
-// definition on purpose.  Therefore it's impossible to create a
-// Secret object, which is what we want.
-class Secret;
-
-// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
-// expression is true. For example, you could use it to verify the
-// size of a static array:
-//
-//   GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
-//                         content_type_names_incorrect_size);
-//
-// or to make sure a struct is smaller than a certain size:
-//
-//   GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
-//
-// The second argument to the macro is the name of the variable. If
-// the expression is false, most compilers will issue a warning/error
-// containing the name of the variable.
-
-template <bool>
-struct CompileAssert {
-};
-
-#define GTEST_COMPILE_ASSERT_(expr, msg) \
-  typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
-      msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
-
-// Implementation details of GTEST_COMPILE_ASSERT_:
-//
-// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
-//   elements (and thus is invalid) when the expression is false.
-//
-// - The simpler definition
-//
-//    #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
-//
-//   does not work, as gcc supports variable-length arrays whose sizes
-//   are determined at run-time (this is gcc's extension and not part
-//   of the C++ standard).  As a result, gcc fails to reject the
-//   following code with the simple definition:
-//
-//     int foo;
-//     GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
-//                                      // not a compile-time constant.
-//
-// - By using the type CompileAssert<(bool(expr))>, we ensures that
-//   expr is a compile-time constant.  (Template arguments must be
-//   determined at compile-time.)
-//
-// - The outter parentheses in CompileAssert<(bool(expr))> are necessary
-//   to work around a bug in gcc 3.4.4 and 4.0.1.  If we had written
-//
-//     CompileAssert<bool(expr)>
-//
-//   instead, these compilers will refuse to compile
-//
-//     GTEST_COMPILE_ASSERT_(5 > 0, some_message);
-//
-//   (They seem to think the ">" in "5 > 0" marks the end of the
-//   template argument list.)
-//
-// - The array size is (bool(expr) ? 1 : -1), instead of simply
-//
-//     ((expr) ? 1 : -1).
-//
-//   This is to avoid running into a bug in MS VC 7.1, which
-//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
-
-// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
-//
-// This template is declared, but intentionally undefined.
-template <typename T1, typename T2>
-struct StaticAssertTypeEqHelper;
-
-template <typename T>
-struct StaticAssertTypeEqHelper<T, T> {};
-
-#if GTEST_HAS_GLOBAL_STRING
-typedef ::string string;
-#else
-typedef ::std::string string;
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-#if GTEST_HAS_GLOBAL_WSTRING
-typedef ::wstring wstring;
-#elif GTEST_HAS_STD_WSTRING
-typedef ::std::wstring wstring;
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-// A helper for suppressing warnings on constant condition.  It just
-// returns 'condition'.
-GTEST_API_ bool IsTrue(bool condition);
-
-// Defines scoped_ptr.
-
-// This implementation of scoped_ptr is PARTIAL - it only contains
-// enough stuff to satisfy Google Test's need.
-template <typename T>
-class scoped_ptr {
- public:
-  typedef T element_type;
-
-  explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
-  ~scoped_ptr() { reset(); }
-
-  T& operator*() const { return *ptr_; }
-  T* operator->() const { return ptr_; }
-  T* get() const { return ptr_; }
-
-  T* release() {
-    T* const ptr = ptr_;
-    ptr_ = NULL;
-    return ptr;
-  }
-
-  void reset(T* p = NULL) {
-    if (p != ptr_) {
-      if (IsTrue(sizeof(T) > 0)) {  // Makes sure T is a complete type.
-        delete ptr_;
-      }
-      ptr_ = p;
-    }
-  }
-
- private:
-  T* ptr_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
-};
-
-// Defines RE.
-
-// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
-// Regular Expression syntax.
-class GTEST_API_ RE {
- public:
-  // A copy constructor is required by the Standard to initialize object
-  // references from r-values.
-  RE(const RE& other) { Init(other.pattern()); }
-
-  // Constructs an RE from a string.
-  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
-
-#if GTEST_HAS_GLOBAL_STRING
-
-  RE(const ::string& regex) { Init(regex.c_str()); }  // NOLINT
-
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-  RE(const char* regex) { Init(regex); }  // NOLINT
-  ~RE();
-
-  // Returns the string representation of the regex.
-  const char* pattern() const { return pattern_; }
-
-  // FullMatch(str, re) returns true iff regular expression re matches
-  // the entire str.
-  // PartialMatch(str, re) returns true iff regular expression re
-  // matches a substring of str (including str itself).
-  //
-  // TODO(wan@google.com): make FullMatch() and PartialMatch() work
-  // when str contains NUL characters.
-  static bool FullMatch(const ::std::string& str, const RE& re) {
-    return FullMatch(str.c_str(), re);
-  }
-  static bool PartialMatch(const ::std::string& str, const RE& re) {
-    return PartialMatch(str.c_str(), re);
-  }
-
-#if GTEST_HAS_GLOBAL_STRING
-
-  static bool FullMatch(const ::string& str, const RE& re) {
-    return FullMatch(str.c_str(), re);
-  }
-  static bool PartialMatch(const ::string& str, const RE& re) {
-    return PartialMatch(str.c_str(), re);
-  }
-
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-  static bool FullMatch(const char* str, const RE& re);
-  static bool PartialMatch(const char* str, const RE& re);
-
- private:
-  void Init(const char* regex);
-
-  // We use a const char* instead of an std::string, as Google Test used to be
-  // used where std::string is not available.  TODO(wan@google.com): change to
-  // std::string.
-  const char* pattern_;
-  bool is_valid_;
-
-#if GTEST_USES_POSIX_RE
-
-  regex_t full_regex_;     // For FullMatch().
-  regex_t partial_regex_;  // For PartialMatch().
-
-#else  // GTEST_USES_SIMPLE_RE
-
-  const char* full_pattern_;  // For FullMatch();
-
-#endif
-
-  GTEST_DISALLOW_ASSIGN_(RE);
-};
-
-// Formats a source file path and a line number as they would appear
-// in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
-
-// Formats a file location for compiler-independent XML output.
-// Although this function is not platform dependent, we put it next to
-// FormatFileLocation in order to contrast the two functions.
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
-                                                               int line);
-
-// Defines logging utilities:
-//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
-//                          message itself is streamed into the macro.
-//   LogToStderr()  - directs all log messages to stderr.
-//   FlushInfoLog() - flushes informational log messages.
-
-enum GTestLogSeverity {
-  GTEST_INFO,
-  GTEST_WARNING,
-  GTEST_ERROR,
-  GTEST_FATAL
-};
-
-// Formats log entry severity, provides a stream object for streaming the
-// log message, and terminates the message with a newline when going out of
-// scope.
-class GTEST_API_ GTestLog {
- public:
-  GTestLog(GTestLogSeverity severity, const char* file, int line);
-
-  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
-  ~GTestLog();
-
-  ::std::ostream& GetStream() { return ::std::cerr; }
-
- private:
-  const GTestLogSeverity severity_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
-};
-
-#define GTEST_LOG_(severity) \
-    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
-                                  __FILE__, __LINE__).GetStream()
-
-inline void LogToStderr() {}
-inline void FlushInfoLog() { fflush(NULL); }
-
-// INTERNAL IMPLEMENTATION - DO NOT USE.
-//
-// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
-// is not satisfied.
-//  Synopsys:
-//    GTEST_CHECK_(boolean_condition);
-//     or
-//    GTEST_CHECK_(boolean_condition) << "Additional message";
-//
-//    This checks the condition and if the condition is not satisfied
-//    it prints message about the condition violation, including the
-//    condition itself, plus additional message streamed into it, if any,
-//    and then it aborts the program. It aborts the program irrespective of
-//    whether it is built in the debug mode or not.
-#define GTEST_CHECK_(condition) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::IsTrue(condition)) \
-      ; \
-    else \
-      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
-
-// An all-mode assert to verify that the given POSIX-style function
-// call returns 0 (indicating success).  Known limitation: this
-// doesn't expand to a balanced 'if' statement, so enclose the macro
-// in {} if you need to use it as the only statement in an 'if'
-// branch.
-#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
-  if (const int gtest_error = (posix_call)) \
-    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
-                      << gtest_error
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Use ImplicitCast_ as a safe version of static_cast for upcasting in
-// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
-// const Foo*).  When you use ImplicitCast_, the compiler checks that
-// the cast is safe.  Such explicit ImplicitCast_s are necessary in
-// surprisingly many situations where C++ demands an exact type match
-// instead of an argument type convertable to a target type.
-//
-// The syntax for using ImplicitCast_ is the same as for static_cast:
-//
-//   ImplicitCast_<ToType>(expr)
-//
-// ImplicitCast_ would have been part of the C++ standard library,
-// but the proposal was submitted too late.  It will probably make
-// its way into the language in the future.
-//
-// This relatively ugly name is intentional. It prevents clashes with
-// similar functions users may have (e.g., implicit_cast). The internal
-// namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
-
-// When you upcast (that is, cast a pointer from type Foo to type
-// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
-// always succeed.  When you downcast (that is, cast a pointer from
-// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
-// how do you know the pointer is really of type SubclassOfFoo?  It
-// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
-// when you downcast, you should use this macro.  In debug mode, we
-// use dynamic_cast<> to double-check the downcast is legal (we die
-// if it's not).  In normal mode, we do the efficient static_cast<>
-// instead.  Thus, it's important to test in debug mode to make sure
-// the cast is legal!
-//    This is the only place in the code we should use dynamic_cast<>.
-// In particular, you SHOULDN'T be using dynamic_cast<> in order to
-// do RTTI (eg code like this:
-//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
-//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
-// You should design the code some other way not to need this.
-//
-// This relatively ugly name is intentional. It prevents clashes with
-// similar functions users may have (e.g., down_cast). The internal
-// namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) {  // so we only accept pointers
-  // Ensures that To is a sub-type of From *.  This test is here only
-  // for compile-time type checking, and has no overhead in an
-  // optimized build at run-time, as it will be optimized away
-  // completely.
-  if (false) {
-    const To to = NULL;
-    ::testing::internal::ImplicitCast_<From*>(to);
-  }
-
-#if GTEST_HAS_RTTI
-  // RTTI: debug mode only!
-  GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
-#endif
-  return static_cast<To>(f);
-}
-
-// Downcasts the pointer of type Base to Derived.
-// Derived must be a subclass of Base. The parameter MUST
-// point to a class of type Derived, not any subclass of it.
-// When RTTI is available, the function performs a runtime
-// check to enforce this.
-template <class Derived, class Base>
-Derived* CheckedDowncastToActualType(Base* base) {
-#if GTEST_HAS_RTTI
-  GTEST_CHECK_(typeid(*base) == typeid(Derived));
-  return dynamic_cast<Derived*>(base);  // NOLINT
-#else
-  return static_cast<Derived*>(base);  // Poor man's downcast.
-#endif
-}
-
-#if GTEST_HAS_STREAM_REDIRECTION
-
-// Defines the stderr capturer:
-//   CaptureStdout     - starts capturing stdout.
-//   GetCapturedStdout - stops capturing stdout and returns the captured string.
-//   CaptureStderr     - starts capturing stderr.
-//   GetCapturedStderr - stops capturing stderr and returns the captured string.
-//
-GTEST_API_ void CaptureStdout();
-GTEST_API_ std::string GetCapturedStdout();
-GTEST_API_ void CaptureStderr();
-GTEST_API_ std::string GetCapturedStderr();
-
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-
-
-#if GTEST_HAS_DEATH_TEST
-
-const ::std::vector<testing::internal::string>& GetInjectableArgvs();
-void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
-                             new_argvs);
-
-// A copy of all command line arguments.  Set by InitGoogleTest().
-extern ::std::vector<testing::internal::string> g_argvs;
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-// Defines synchronization primitives.
-
-#if GTEST_HAS_PTHREAD
-
-// Sleeps for (roughly) n milli-seconds.  This function is only for
-// testing Google Test's own constructs.  Don't use it in user tests,
-// either directly or indirectly.
-inline void SleepMilliseconds(int n) {
-  const timespec time = {
-    0,                  // 0 seconds.
-    n * 1000L * 1000L,  // And n ms.
-  };
-  nanosleep(&time, NULL);
-}
-
-// Allows a controller thread to pause execution of newly created
-// threads until notified.  Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class Notification {
- public:
-  Notification() : notified_(false) {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
-  }
-  ~Notification() {
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  // Notifies all threads created with this notification to start. Must
-  // be called from the controller thread.
-  void Notify() {
-    pthread_mutex_lock(&mutex_);
-    notified_ = true;
-    pthread_mutex_unlock(&mutex_);
-  }
-
-  // Blocks until the controller thread notifies. Must be called from a test
-  // thread.
-  void WaitForNotification() {
-    for (;;) {
-      pthread_mutex_lock(&mutex_);
-      const bool notified = notified_;
-      pthread_mutex_unlock(&mutex_);
-      if (notified)
-        break;
-      SleepMilliseconds(10);
-    }
-  }
-
- private:
-  pthread_mutex_t mutex_;
-  bool notified_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-
-// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
-// Consequently, it cannot select a correct instantiation of ThreadWithParam
-// in order to call its Run(). Introducing ThreadWithParamBase as a
-// non-templated base class for ThreadWithParam allows us to bypass this
-// problem.
-class ThreadWithParamBase {
- public:
-  virtual ~ThreadWithParamBase() {}
-  virtual void Run() = 0;
-};
-
-// pthread_create() accepts a pointer to a function type with the C linkage.
-// According to the Standard (7.5/1), function types with different linkages
-// are different even if they are otherwise identical.  Some compilers (for
-// example, SunStudio) treat them as different types.  Since class methods
-// cannot be defined with C-linkage we need to define a free C-function to
-// pass into pthread_create().
-extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
-  static_cast<ThreadWithParamBase*>(thread)->Run();
-  return NULL;
-}
-
-// Helper class for testing Google Test's multi-threading constructs.
-// To use it, write:
-//
-//   void ThreadFunc(int param) { /* Do things with param */ }
-//   Notification thread_can_start;
-//   ...
-//   // The thread_can_start parameter is optional; you can supply NULL.
-//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
-//   thread_can_start.Notify();
-//
-// These classes are only for testing Google Test's own constructs. Do
-// not use them in user tests, either directly or indirectly.
-template <typename T>
-class ThreadWithParam : public ThreadWithParamBase {
- public:
-  typedef void (*UserThreadFunc)(T);
-
-  ThreadWithParam(
-      UserThreadFunc func, T param, Notification* thread_can_start)
-      : func_(func),
-        param_(param),
-        thread_can_start_(thread_can_start),
-        finished_(false) {
-    ThreadWithParamBase* const base = this;
-    // The thread can be created only after all fields except thread_
-    // have been initialized.
-    GTEST_CHECK_POSIX_SUCCESS_(
-        pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
-  }
-  ~ThreadWithParam() { Join(); }
-
-  void Join() {
-    if (!finished_) {
-      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
-      finished_ = true;
-    }
-  }
-
-  virtual void Run() {
-    if (thread_can_start_ != NULL)
-      thread_can_start_->WaitForNotification();
-    func_(param_);
-  }
-
- private:
-  const UserThreadFunc func_;  // User-supplied thread function.
-  const T param_;  // User-supplied parameter to the thread function.
-  // When non-NULL, used to block execution until the controller thread
-  // notifies.
-  Notification* const thread_can_start_;
-  bool finished_;  // true iff we know that the thread function has finished.
-  pthread_t thread_;  // The native thread object.
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
-};
-
-// MutexBase and Mutex implement mutex on pthreads-based platforms. They
-// are used in conjunction with class MutexLock:
-//
-//   Mutex mutex;
-//   ...
-//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the end
-//                            // of the current scope.
-//
-// MutexBase implements behavior for both statically and dynamically
-// allocated mutexes.  Do not use MutexBase directly.  Instead, write
-// the following to define a static mutex:
-//
-//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
-//
-// You can forward declare a static mutex like this:
-//
-//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
-//
-// To create a dynamic mutex, just define an object of type Mutex.
-class MutexBase {
- public:
-  // Acquires this mutex.
-  void Lock() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
-    owner_ = pthread_self();
-    has_owner_ = true;
-  }
-
-  // Releases this mutex.
-  void Unlock() {
-    // Since the lock is being released the owner_ field should no longer be
-    // considered valid. We don't protect writing to has_owner_ here, as it's
-    // the caller's responsibility to ensure that the current thread holds the
-    // mutex when this is called.
-    has_owner_ = false;
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
-  }
-
-  // Does nothing if the current thread holds the mutex. Otherwise, crashes
-  // with high probability.
-  void AssertHeld() const {
-    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
-        << "The current thread is not holding the mutex @" << this;
-  }
-
-  // A static mutex may be used before main() is entered.  It may even
-  // be used before the dynamic initialization stage.  Therefore we
-  // must be able to initialize a static mutex object at link time.
-  // This means MutexBase has to be a POD and its member variables
-  // have to be public.
- public:
-  pthread_mutex_t mutex_;  // The underlying pthread mutex.
-  // has_owner_ indicates whether the owner_ field below contains a valid thread
-  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
-  // accesses to the owner_ field should be protected by a check of this field.
-  // An alternative might be to memset() owner_ to all zeros, but there's no
-  // guarantee that a zero'd pthread_t is necessarily invalid or even different
-  // from pthread_self().
-  bool has_owner_;
-  pthread_t owner_;  // The thread holding the mutex.
-};
-
-// Forward-declares a static mutex.
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-    extern ::testing::internal::MutexBase mutex
-
-// Defines and statically (i.e. at link time) initializes a static mutex.
-// The initialization list here does not explicitly initialize each field,
-// instead relying on default initialization for the unspecified fields. In
-// particular, the owner_ field (a pthread_t) is not explicitly initialized.
-// This allows initialization to work whether pthread_t is a scalar or struct.
-// The flag -Wmissing-field-initializers must not be specified for this to work.
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-    ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false }
-
-// The Mutex class can only be used for mutexes created at runtime. It
-// shares its API with MutexBase otherwise.
-class Mutex : public MutexBase {
- public:
-  Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
-    has_owner_ = false;
-  }
-  ~Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
-  }
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
-};
-
-// We cannot name this class MutexLock as the ctor declaration would
-// conflict with a macro named MutexLock, which is defined on some
-// platforms.  Hence the typedef trick below.
-class GTestMutexLock {
- public:
-  explicit GTestMutexLock(MutexBase* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
-
-  ~GTestMutexLock() { mutex_->Unlock(); }
-
- private:
-  MutexBase* const mutex_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
-};
-
-typedef GTestMutexLock MutexLock;
-
-// Helpers for ThreadLocal.
-
-// pthread_key_create() requires DeleteThreadLocalValue() to have
-// C-linkage.  Therefore it cannot be templatized to access
-// ThreadLocal<T>.  Hence the need for class
-// ThreadLocalValueHolderBase.
-class ThreadLocalValueHolderBase {
- public:
-  virtual ~ThreadLocalValueHolderBase() {}
-};
-
-// Called by pthread to delete thread-local data stored by
-// pthread_setspecific().
-extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
-  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
-}
-
-// Implements thread-local storage on pthreads-based systems.
-//
-//   // Thread 1
-//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
-//
-//   // Thread 2
-//   tl.set(150);  // Changes the value for thread 2 only.
-//   EXPECT_EQ(150, tl.get());
-//
-//   // Thread 1
-//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
-//   tl.set(200);
-//   EXPECT_EQ(200, tl.get());
-//
-// The template type argument T must have a public copy constructor.
-// In addition, the default ThreadLocal constructor requires T to have
-// a public default constructor.
-//
-// An object managed for a thread by a ThreadLocal instance is deleted
-// when the thread exits.  Or, if the ThreadLocal instance dies in
-// that thread, when the ThreadLocal dies.  It's the user's
-// responsibility to ensure that all other threads using a ThreadLocal
-// have exited when it dies, or the per-thread objects for those
-// threads will not be deleted.
-//
-// Google Test only uses global ThreadLocal objects.  That means they
-// will die after main() has returned.  Therefore, no per-thread
-// object managed by Google Test will be leaked as long as all threads
-// using Google Test have exited when main() returns.
-template <typename T>
-class ThreadLocal {
- public:
-  ThreadLocal() : key_(CreateKey()),
-                  default_() {}
-  explicit ThreadLocal(const T& value) : key_(CreateKey()),
-                                         default_(value) {}
-
-  ~ThreadLocal() {
-    // Destroys the managed object for the current thread, if any.
-    DeleteThreadLocalValue(pthread_getspecific(key_));
-
-    // Releases resources associated with the key.  This will *not*
-    // delete managed objects for other threads.
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
-  }
-
-  T* pointer() { return GetOrCreateValue(); }
-  const T* pointer() const { return GetOrCreateValue(); }
-  const T& get() const { return *pointer(); }
-  void set(const T& value) { *pointer() = value; }
-
- private:
-  // Holds a value of type T.
-  class ValueHolder : public ThreadLocalValueHolderBase {
-   public:
-    explicit ValueHolder(const T& value) : value_(value) {}
-
-    T* pointer() { return &value_; }
-
-   private:
-    T value_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
-  };
-
-  static pthread_key_t CreateKey() {
-    pthread_key_t key;
-    // When a thread exits, DeleteThreadLocalValue() will be called on
-    // the object managed for that thread.
-    GTEST_CHECK_POSIX_SUCCESS_(
-        pthread_key_create(&key, &DeleteThreadLocalValue));
-    return key;
-  }
-
-  T* GetOrCreateValue() const {
-    ThreadLocalValueHolderBase* const holder =
-        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
-    if (holder != NULL) {
-      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
-    }
-
-    ValueHolder* const new_holder = new ValueHolder(default_);
-    ThreadLocalValueHolderBase* const holder_base = new_holder;
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
-    return new_holder->pointer();
-  }
-
-  // A key pthreads uses for looking up per-thread values.
-  const pthread_key_t key_;
-  const T default_;  // The default value for each thread.
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
-};
-
-# define GTEST_IS_THREADSAFE 1
-
-#else  // GTEST_HAS_PTHREAD
-
-// A dummy implementation of synchronization primitives (mutex, lock,
-// and thread-local variable).  Necessary for compiling Google Test where
-// mutex is not supported - using Google Test in multiple threads is not
-// supported on such platforms.
-
-class Mutex {
- public:
-  Mutex() {}
-  void Lock() {}
-  void Unlock() {}
-  void AssertHeld() const {}
-};
-
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-  extern ::testing::internal::Mutex mutex
-
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
-
-class GTestMutexLock {
- public:
-  explicit GTestMutexLock(Mutex*) {}  // NOLINT
-};
-
-typedef GTestMutexLock MutexLock;
-
-template <typename T>
-class ThreadLocal {
- public:
-  ThreadLocal() : value_() {}
-  explicit ThreadLocal(const T& value) : value_(value) {}
-  T* pointer() { return &value_; }
-  const T* pointer() const { return &value_; }
-  const T& get() const { return value_; }
-  void set(const T& value) { value_ = value; }
- private:
-  T value_;
-};
-
-// The above synchronization primitives have dummy implementations.
-// Therefore Google Test is not thread-safe.
-# define GTEST_IS_THREADSAFE 0
-
-#endif  // GTEST_HAS_PTHREAD
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-GTEST_API_ size_t GetThreadCount();
-
-// Passing non-POD classes through ellipsis (...) crashes the ARM
-// compiler and generates a warning in Sun Studio.  The Nokia Symbian
-// and the IBM XL C/C++ compiler try to instantiate a copy constructor
-// for objects passed through ellipsis (...), failing for uncopyable
-// objects.  We define this to ensure that only POD is passed through
-// ellipsis on these systems.
-#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
-// We lose support for NULL detection where the compiler doesn't like
-// passing non-POD classes through ellipsis (...).
-# define GTEST_ELLIPSIS_NEEDS_POD_ 1
-#else
-# define GTEST_CAN_COMPARE_NULL 1
-#endif
-
-// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
-// const T& and const T* in a function template.  These compilers
-// _can_ decide between class template specializations for T and T*,
-// so a tr1::type_traits-like is_pointer works.
-#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
-# define GTEST_NEEDS_IS_POINTER_ 1
-#endif
-
-template <bool bool_value>
-struct bool_constant {
-  typedef bool_constant<bool_value> type;
-  static const bool value = bool_value;
-};
-template <bool bool_value> const bool bool_constant<bool_value>::value;
-
-typedef bool_constant<false> false_type;
-typedef bool_constant<true> true_type;
-
-template <typename T>
-struct is_pointer : public false_type {};
-
-template <typename T>
-struct is_pointer<T*> : public true_type {};
-
-template <typename Iterator>
-struct IteratorTraits {
-  typedef typename Iterator::value_type value_type;
-};
-
-template <typename T>
-struct IteratorTraits<T*> {
-  typedef T value_type;
-};
-
-template <typename T>
-struct IteratorTraits<const T*> {
-  typedef T value_type;
-};
-
-#if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
-// The biggest signed integer type the compiler supports.
-typedef __int64 BiggestInt;
-#else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
-typedef long long BiggestInt;  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-
-// Utilities for char.
-
-// isspace(int ch) and friends accept an unsigned char or EOF.  char
-// may be signed, depending on the compiler (or compiler flags).
-// Therefore we need to cast a char to unsigned char before calling
-// isspace(), etc.
-
-inline bool IsAlpha(char ch) {
-  return isalpha(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsAlNum(char ch) {
-  return isalnum(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsDigit(char ch) {
-  return isdigit(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsLower(char ch) {
-  return islower(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsSpace(char ch) {
-  return isspace(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsUpper(char ch) {
-  return isupper(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsXDigit(char ch) {
-  return isxdigit(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsXDigit(wchar_t ch) {
-  const unsigned char low_byte = static_cast<unsigned char>(ch);
-  return ch == low_byte && isxdigit(low_byte) != 0;
-}
-
-inline char ToLower(char ch) {
-  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
-}
-inline char ToUpper(char ch) {
-  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
-}
-
-// The testing::internal::posix namespace holds wrappers for common
-// POSIX functions.  These wrappers hide the differences between
-// Windows/MSVC and POSIX systems.  Since some compilers define these
-// standard functions as macros, the wrapper cannot have the same name
-// as the wrapped function.
-
-namespace posix {
-
-// Functions with a different name on Windows.
-
-#if GTEST_OS_WINDOWS
-
-typedef struct _stat StatStruct;
-
-# ifdef __BORLANDC__
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int StrCaseCmp(const char* s1, const char* s2) {
-  return stricmp(s1, s2);
-}
-inline char* StrDup(const char* src) { return strdup(src); }
-# else  // !__BORLANDC__
-#  if GTEST_OS_WINDOWS_MOBILE
-inline int IsATTY(int /* fd */) { return 0; }
-#  else
-inline int IsATTY(int fd) { return _isatty(fd); }
-#  endif  // GTEST_OS_WINDOWS_MOBILE
-inline int StrCaseCmp(const char* s1, const char* s2) {
-  return _stricmp(s1, s2);
-}
-inline char* StrDup(const char* src) { return _strdup(src); }
-# endif  // __BORLANDC__
-
-# if GTEST_OS_WINDOWS_MOBILE
-inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
-// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
-// time and thus not defined there.
-# else
-inline int FileNo(FILE* file) { return _fileno(file); }
-inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
-inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
-  return (_S_IFDIR & st.st_mode) != 0;
-}
-# endif  // GTEST_OS_WINDOWS_MOBILE
-
-#else
-
-typedef struct stat StatStruct;
-
-inline int FileNo(FILE* file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
-inline int StrCaseCmp(const char* s1, const char* s2) {
-  return strcasecmp(s1, s2);
-}
-inline char* StrDup(const char* src) { return strdup(src); }
-inline int RmDir(const char* dir) { return rmdir(dir); }
-inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
-
-#endif  // GTEST_OS_WINDOWS
-
-// Functions deprecated by MSVC 8.0.
-
-#ifdef _MSC_VER
-// Temporarily disable warning 4996 (deprecated function).
-# pragma warning(push)
-# pragma warning(disable:4996)
-#endif
-
-inline const char* StrNCpy(char* dest, const char* src, size_t n) {
-  return strncpy(dest, src, n);
-}
-
-// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
-// StrError() aren't needed on Windows CE at this time and thus not
-// defined there.
-
-#if !GTEST_OS_WINDOWS_MOBILE
-inline int ChDir(const char* dir) { return chdir(dir); }
-#endif
-inline FILE* FOpen(const char* path, const char* mode) {
-  return fopen(path, mode);
-}
-#if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
-  return freopen(path, mode, stream);
-}
-inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
-#endif
-inline int FClose(FILE* fp) { return fclose(fp); }
-#if !GTEST_OS_WINDOWS_MOBILE
-inline int Read(int fd, void* buf, unsigned int count) {
-  return static_cast<int>(read(fd, buf, count));
-}
-inline int Write(int fd, const void* buf, unsigned int count) {
-  return static_cast<int>(write(fd, buf, count));
-}
-inline int Close(int fd) { return close(fd); }
-inline const char* StrError(int errnum) { return strerror(errnum); }
-#endif
-inline const char* GetEnv(const char* name) {
-#if GTEST_OS_WINDOWS_MOBILE
-  // We are on Windows CE, which has no environment variables.
-  return NULL;
-#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
-  // Environment variables which we programmatically clear will be set to the
-  // empty string rather than unset (NULL).  Handle that case.
-  const char* const env = getenv(name);
-  return (env != NULL && env[0] != '\0') ? env : NULL;
-#else
-  return getenv(name);
-#endif
-}
-
-#ifdef _MSC_VER
-# pragma warning(pop)  // Restores the warning state.
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
-// Windows CE has no C library. The abort() function is used in
-// several places in Google Test. This implementation provides a reasonable
-// imitation of standard behaviour.
-void Abort();
-#else
-inline void Abort() { abort(); }
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-}  // namespace posix
-
-// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
-// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
-// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
-// function in order to achieve that.  We use macro definition here because
-// snprintf is a variadic function.
-#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
-// MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
-     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
-#elif defined(_MSC_VER)
-// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
-// complain about _snprintf.
-# define GTEST_SNPRINTF_ _snprintf
-#else
-# define GTEST_SNPRINTF_ snprintf
-#endif
-
-// The maximum number a BiggestInt can represent.  This definition
-// works no matter BiggestInt is represented in one's complement or
-// two's complement.
-//
-// We cannot rely on numeric_limits in STL, as __int64 and long long
-// are not part of standard C++ and numeric_limits doesn't need to be
-// defined for them.
-const BiggestInt kMaxBiggestInt =
-    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
-
-// This template class serves as a compile-time function from size to
-// type.  It maps a size in bytes to a primitive type with that
-// size. e.g.
-//
-//   TypeWithSize<4>::UInt
-//
-// is typedef-ed to be unsigned int (unsigned integer made up of 4
-// bytes).
-//
-// Such functionality should belong to STL, but I cannot find it
-// there.
-//
-// Google Test uses this class in the implementation of floating-point
-// comparison.
-//
-// For now it only handles UInt (unsigned int) as that's all Google Test
-// needs.  Other types can be easily added in the future if need
-// arises.
-template <size_t size>
-class TypeWithSize {
- public:
-  // This prevents the user from using TypeWithSize<N> with incorrect
-  // values of N.
-  typedef void UInt;
-};
-
-// The specialization for size 4.
-template <>
-class TypeWithSize<4> {
- public:
-  // unsigned int has size 4 in both gcc and MSVC.
-  //
-  // As base/basictypes.h doesn't compile on Windows, we cannot use
-  // uint32, uint64, and etc here.
-  typedef int Int;
-  typedef unsigned int UInt;
-};
-
-// The specialization for size 8.
-template <>
-class TypeWithSize<8> {
- public:
-#if GTEST_OS_WINDOWS
-  typedef __int64 Int;
-  typedef unsigned __int64 UInt;
-#else
-  typedef long long Int;  // NOLINT
-  typedef unsigned long long UInt;  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-};
-
-// Integer types of known sizes.
-typedef TypeWithSize<4>::Int Int32;
-typedef TypeWithSize<4>::UInt UInt32;
-typedef TypeWithSize<8>::Int Int64;
-typedef TypeWithSize<8>::UInt UInt64;
-typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
-
-// Utilities for command line flags and environment variables.
-
-// Macro for referencing flags.
-#define GTEST_FLAG(name) FLAGS_gtest_##name
-
-// Macros for declaring flags.
-#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-#define GTEST_DECLARE_int32_(name) \
-    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
-#define GTEST_DECLARE_string_(name) \
-    GTEST_API_ extern ::std::string GTEST_FLAG(name)
-
-// Macros for defining flags.
-#define GTEST_DEFINE_bool_(name, default_val, doc) \
-    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-#define GTEST_DEFINE_int32_(name, default_val, doc) \
-    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
-#define GTEST_DEFINE_string_(name, default_val, doc) \
-    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
-
-// Thread annotations
-#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-#define GTEST_LOCK_EXCLUDED_(locks)
-
-// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-// TODO(chandlerc): Find a better way to refactor flag and environment parsing
-// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
-// function.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value);
-
-// Parses a bool/Int32/string from the environment variable
-// corresponding to the given Google Test flag.
-bool BoolFromGTestEnv(const char* flag, bool default_val);
-GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
-const char* StringFromGTestEnv(const char* flag, const char* default_val);
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-
-#if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
-#endif  // GTEST_OS_LINUX
-
-#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
-#endif
-
-#include <ctype.h>
-#include <float.h>
-#include <string.h>
-#include <iomanip>
-#include <limits>
-#include <set>
-
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
-//
-// This header file defines the Message class.
-//
-// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
-// leave some internal implementation details in this header file.
-// They are clearly marked by comments like this:
-//
-//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-//
-// Such code is NOT meant to be used by a user directly, and is subject
-// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
-// program!
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-
-#include <limits>
-
-
-// Ensures that there is at least one operator<< in the global namespace.
-// See Message& operator<<(...) below for why.
-void operator<<(const testing::internal::Secret&, int);
-
-namespace testing {
-
-// The Message class works like an ostream repeater.
-//
-// Typical usage:
-//
-//   1. You stream a bunch of values to a Message object.
-//      It will remember the text in a stringstream.
-//   2. Then you stream the Message object to an ostream.
-//      This causes the text in the Message to be streamed
-//      to the ostream.
-//
-// For example;
-//
-//   testing::Message foo;
-//   foo << 1 << " != " << 2;
-//   std::cout << foo;
-//
-// will print "1 != 2".
-//
-// Message is not intended to be inherited from.  In particular, its
-// destructor is not virtual.
-//
-// Note that stringstream behaves differently in gcc and in MSVC.  You
-// can stream a NULL char pointer to it in the former, but not in the
-// latter (it causes an access violation if you do).  The Message
-// class hides this difference by treating a NULL char pointer as
-// "(null)".
-class GTEST_API_ Message {
- private:
-  // The type of basic IO manipulators (endl, ends, and flush) for
-  // narrow streams.
-  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
-
- public:
-  // Constructs an empty Message.
-  Message();
-
-  // Copy constructor.
-  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
-    *ss_ << msg.GetString();
-  }
-
-  // Constructs a Message from a C-string.
-  explicit Message(const char* str) : ss_(new ::std::stringstream) {
-    *ss_ << str;
-  }
-
-#if GTEST_OS_SYMBIAN
-  // Streams a value (either a pointer or not) to this object.
-  template <typename T>
-  inline Message& operator <<(const T& value) {
-    StreamHelper(typename internal::is_pointer<T>::type(), value);
-    return *this;
-  }
-#else
-  // Streams a non-pointer value to this object.
-  template <typename T>
-  inline Message& operator <<(const T& val) {
-    // Some libraries overload << for STL containers.  These
-    // overloads are defined in the global namespace instead of ::std.
-    //
-    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
-    // overloads are visible in either the std namespace or the global
-    // namespace, but not other namespaces, including the testing
-    // namespace which Google Test's Message class is in.
-    //
-    // To allow STL containers (and other types that has a << operator
-    // defined in the global namespace) to be used in Google Test
-    // assertions, testing::Message must access the custom << operator
-    // from the global namespace.  With this using declaration,
-    // overloads of << defined in the global namespace and those
-    // visible via Koenig lookup are both exposed in this function.
-    using ::operator <<;
-    *ss_ << val;
-    return *this;
-  }
-
-  // Streams a pointer value to this object.
-  //
-  // This function is an overload of the previous one.  When you
-  // stream a pointer to a Message, this definition will be used as it
-  // is more specialized.  (The C++ Standard, section
-  // [temp.func.order].)  If you stream a non-pointer, then the
-  // previous definition will be used.
-  //
-  // The reason for this overload is that streaming a NULL pointer to
-  // ostream is undefined behavior.  Depending on the compiler, you
-  // may get "0", "(nil)", "(null)", or an access violation.  To
-  // ensure consistent result across compilers, we always treat NULL
-  // as "(null)".
-  template <typename T>
-  inline Message& operator <<(T* const& pointer) {  // NOLINT
-    if (pointer == NULL) {
-      *ss_ << "(null)";
-    } else {
-      *ss_ << pointer;
-    }
-    return *this;
-  }
-#endif  // GTEST_OS_SYMBIAN
-
-  // Since the basic IO manipulators are overloaded for both narrow
-  // and wide streams, we have to provide this specialized definition
-  // of operator <<, even though its body is the same as the
-  // templatized version above.  Without this definition, streaming
-  // endl or other basic IO manipulators to Message will confuse the
-  // compiler.
-  Message& operator <<(BasicNarrowIoManip val) {
-    *ss_ << val;
-    return *this;
-  }
-
-  // Instead of 1/0, we want to see true/false for bool values.
-  Message& operator <<(bool b) {
-    return *this << (b ? "true" : "false");
-  }
-
-  // These two overloads allow streaming a wide C string to a Message
-  // using the UTF-8 encoding.
-  Message& operator <<(const wchar_t* wide_c_str);
-  Message& operator <<(wchar_t* wide_c_str);
-
-#if GTEST_HAS_STD_WSTRING
-  // Converts the given wide string to a narrow string using the UTF-8
-  // encoding, and streams the result to this Message object.
-  Message& operator <<(const ::std::wstring& wstr);
-#endif  // GTEST_HAS_STD_WSTRING
-
-#if GTEST_HAS_GLOBAL_WSTRING
-  // Converts the given wide string to a narrow string using the UTF-8
-  // encoding, and streams the result to this Message object.
-  Message& operator <<(const ::wstring& wstr);
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-  // Gets the text streamed to this object so far as an std::string.
-  // Each '\0' character in the buffer is replaced with "\\0".
-  //
-  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  std::string GetString() const;
-
- private:
-
-#if GTEST_OS_SYMBIAN
-  // These are needed as the Nokia Symbian Compiler cannot decide between
-  // const T& and const T* in a function template. The Nokia compiler _can_
-  // decide between class template specializations for T and T*, so a
-  // tr1::type_traits-like is_pointer works, and we can overload on that.
-  template <typename T>
-  inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
-    if (pointer == NULL) {
-      *ss_ << "(null)";
-    } else {
-      *ss_ << pointer;
-    }
-  }
-  template <typename T>
-  inline void StreamHelper(internal::false_type /*is_pointer*/,
-                           const T& value) {
-    // See the comments in Message& operator <<(const T&) above for why
-    // we need this using statement.
-    using ::operator <<;
-    *ss_ << value;
-  }
-#endif  // GTEST_OS_SYMBIAN
-
-  // We'll hold the text streamed to this object here.
-  const internal::scoped_ptr< ::std::stringstream> ss_;
-
-  // We declare (but don't implement) this to prevent the compiler
-  // from implementing the assignment operator.
-  void operator=(const Message&);
-};
-
-// Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
-  return os << sb.GetString();
-}
-
-namespace internal {
-
-// Converts a streamable value to an std::string.  A NULL pointer is
-// converted to "(null)".  When the input value is a ::string,
-// ::std::string, ::wstring, or ::std::wstring object, each NUL
-// character in it is replaced with "\\0".
-template <typename T>
-std::string StreamableToString(const T& streamable) {
-  return (Message() << streamable).GetString();
-}
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
-//
-// This header file declares the String class and functions used internally by
-// Google Test.  They are subject to change without notice. They should not used
-// by code external to Google Test.
-//
-// This header file is #included by <gtest/internal/gtest-internal.h>.
-// It should not be #included by other files.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-
-#ifdef __BORLANDC__
-// string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
-#endif
-
-#include <string.h>
-#include <string>
-
-
-namespace testing {
-namespace internal {
-
-// String - an abstract class holding static string utilities.
-class GTEST_API_ String {
- public:
-  // Static utility methods
-
-  // Clones a 0-terminated C string, allocating memory using new.  The
-  // caller is responsible for deleting the return value using
-  // delete[].  Returns the cloned string, or NULL if the input is
-  // NULL.
-  //
-  // This is different from strdup() in string.h, which allocates
-  // memory using malloc().
-  static const char* CloneCString(const char* c_str);
-
-#if GTEST_OS_WINDOWS_MOBILE
-  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
-  // able to pass strings to Win32 APIs on CE we need to convert them
-  // to 'Unicode', UTF-16.
-
-  // Creates a UTF-16 wide string from the given ANSI string, allocating
-  // memory using new. The caller is responsible for deleting the return
-  // value using delete[]. Returns the wide string, or NULL if the
-  // input is NULL.
-  //
-  // The wide string is created using the ANSI codepage (CP_ACP) to
-  // match the behaviour of the ANSI versions of Win32 calls and the
-  // C runtime.
-  static LPCWSTR AnsiToUtf16(const char* c_str);
-
-  // Creates an ANSI string from the given wide string, allocating
-  // memory using new. The caller is responsible for deleting the return
-  // value using delete[]. Returns the ANSI string, or NULL if the
-  // input is NULL.
-  //
-  // The returned string is created using the ANSI codepage (CP_ACP) to
-  // match the behaviour of the ANSI versions of Win32 calls and the
-  // C runtime.
-  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
-#endif
-
-  // Compares two C strings.  Returns true iff they have the same content.
-  //
-  // Unlike strcmp(), this function can handle NULL argument(s).  A
-  // NULL C string is considered different to any non-NULL C string,
-  // including the empty string.
-  static bool CStringEquals(const char* lhs, const char* rhs);
-
-  // Converts a wide C string to a String using the UTF-8 encoding.
-  // NULL will be converted to "(null)".  If an error occurred during
-  // the conversion, "(failed to convert from wide string)" is
-  // returned.
-  static std::string ShowWideCString(const wchar_t* wide_c_str);
-
-  // Compares two wide C strings.  Returns true iff they have the same
-  // content.
-  //
-  // Unlike wcscmp(), this function can handle NULL argument(s).  A
-  // NULL C string is considered different to any non-NULL C string,
-  // including the empty string.
-  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
-
-  // Compares two C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike strcasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL C string,
-  // including the empty string.
-  static bool CaseInsensitiveCStringEquals(const char* lhs,
-                                           const char* rhs);
-
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike wcscasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL wide C string,
-  // including the empty string.
-  // NB: The implementations on different platforms slightly differ.
-  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
-  // environment variable. On GNU platform this method uses wcscasecmp
-  // which compares according to LC_CTYPE category of the current locale.
-  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
-  // current locale.
-  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                               const wchar_t* rhs);
-
-  // Returns true iff the given string ends with the given suffix, ignoring
-  // case. Any string is considered to end with an empty suffix.
-  static bool EndsWithCaseInsensitive(
-      const std::string& str, const std::string& suffix);
-
-  // Formats an int value as "%02d".
-  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
-
-  // Formats an int value as "%X".
-  static std::string FormatHexInt(int value);
-
-  // Formats a byte as "%02X".
-  static std::string FormatByte(unsigned char value);
-
- private:
-  String();  // Not meant to be instantiated.
-};  // class String
-
-// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
-// character in the buffer is replaced with "\\0".
-GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: keith.ray@gmail.com (Keith Ray)
-//
-// Google Test filepath utilities
-//
-// This header file declares classes and functions used internally by
-// Google Test.  They are subject to change without notice.
-//
-// This file is #included in <gtest/internal/gtest-internal.h>.
-// Do not include this header file separately!
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-
-
-namespace testing {
-namespace internal {
-
-// FilePath - a class for file and directory pathname manipulation which
-// handles platform-specific conventions (like the pathname separator).
-// Used for helper functions for naming files in a directory for xml output.
-// Except for Set methods, all methods are const or static, which provides an
-// "immutable value object" -- useful for peace of mind.
-// A FilePath with a value ending in a path separator ("like/this/") represents
-// a directory, otherwise it is assumed to represent a file. In either case,
-// it may or may not represent an actual file or directory in the file system.
-// Names are NOT checked for syntax correctness -- no checking for illegal
-// characters, malformed paths, etc.
-
-class GTEST_API_ FilePath {
- public:
-  FilePath() : pathname_("") { }
-  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
-
-  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
-    Normalize();
-  }
-
-  FilePath& operator=(const FilePath& rhs) {
-    Set(rhs);
-    return *this;
-  }
-
-  void Set(const FilePath& rhs) {
-    pathname_ = rhs.pathname_;
-  }
-
-  const std::string& string() const { return pathname_; }
-  const char* c_str() const { return pathname_.c_str(); }
-
-  // Returns the current working directory, or "" if unsuccessful.
-  static FilePath GetCurrentDir();
-
-  // Given directory = "dir", base_name = "test", number = 0,
-  // extension = "xml", returns "dir/test.xml". If number is greater
-  // than zero (e.g., 12), returns "dir/test_12.xml".
-  // On Windows platform, uses \ as the separator rather than /.
-  static FilePath MakeFileName(const FilePath& directory,
-                               const FilePath& base_name,
-                               int number,
-                               const char* extension);
-
-  // Given directory = "dir", relative_path = "test.xml",
-  // returns "dir/test.xml".
-  // On Windows, uses \ as the separator rather than /.
-  static FilePath ConcatPaths(const FilePath& directory,
-                              const FilePath& relative_path);
-
-  // Returns a pathname for a file that does not currently exist. The pathname
-  // will be directory/base_name.extension or
-  // directory/base_name_<number>.extension if directory/base_name.extension
-  // already exists. The number will be incremented until a pathname is found
-  // that does not already exist.
-  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
-  // There could be a race condition if two or more processes are calling this
-  // function at the same time -- they could both pick the same filename.
-  static FilePath GenerateUniqueFileName(const FilePath& directory,
-                                         const FilePath& base_name,
-                                         const char* extension);
-
-  // Returns true iff the path is "".
-  bool IsEmpty() const { return pathname_.empty(); }
-
-  // If input name has a trailing separator character, removes it and returns
-  // the name, otherwise return the name string unmodified.
-  // On Windows platform, uses \ as the separator, other platforms use /.
-  FilePath RemoveTrailingPathSeparator() const;
-
-  // Returns a copy of the FilePath with the directory part removed.
-  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
-  // FilePath("file"). If there is no directory part ("just_a_file"), it returns
-  // the FilePath unmodified. If there is no file part ("just_a_dir/") it
-  // returns an empty FilePath ("").
-  // On Windows platform, '\' is the path separator, otherwise it is '/'.
-  FilePath RemoveDirectoryName() const;
-
-  // RemoveFileName returns the directory path with the filename removed.
-  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
-  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
-  // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
-  // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
-  // On Windows platform, '\' is the path separator, otherwise it is '/'.
-  FilePath RemoveFileName() const;
-
-  // Returns a copy of the FilePath with the case-insensitive extension removed.
-  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
-  // FilePath("dir/file"). If a case-insensitive extension is not
-  // found, returns a copy of the original FilePath.
-  FilePath RemoveExtension(const char* extension) const;
-
-  // Creates directories so that path exists. Returns true if successful or if
-  // the directories already exist; returns false if unable to create
-  // directories for any reason. Will also return false if the FilePath does
-  // not represent a directory (that is, it doesn't end with a path separator).
-  bool CreateDirectoriesRecursively() const;
-
-  // Create the directory so that path exists. Returns true if successful or
-  // if the directory already exists; returns false if unable to create the
-  // directory for any reason, including if the parent directory does not
-  // exist. Not named "CreateDirectory" because that's a macro on Windows.
-  bool CreateFolder() const;
-
-  // Returns true if FilePath describes something in the file-system,
-  // either a file, directory, or whatever, and that something exists.
-  bool FileOrDirectoryExists() const;
-
-  // Returns true if pathname describes a directory in the file-system
-  // that exists.
-  bool DirectoryExists() const;
-
-  // Returns true if FilePath ends with a path separator, which indicates that
-  // it is intended to represent a directory. Returns false otherwise.
-  // This does NOT check that a directory (or file) actually exists.
-  bool IsDirectory() const;
-
-  // Returns true if pathname describes a root directory. (Windows has one
-  // root directory per disk drive.)
-  bool IsRootDirectory() const;
-
-  // Returns true if pathname describes an absolute path.
-  bool IsAbsolutePath() const;
-
- private:
-  // Replaces multiple consecutive separators with a single separator.
-  // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
-  // redundancies that might be in a pathname involving "." or "..".
-  //
-  // A pathname with multiple consecutive separators may occur either through
-  // user error or as a result of some scripts or APIs that generate a pathname
-  // with a trailing separator. On other platforms the same API or script
-  // may NOT generate a pathname with a trailing "/". Then elsewhere that
-  // pathname may have another "/" and pathname components added to it,
-  // without checking for the separator already being there.
-  // The script language and operating system may allow paths like "foo//bar"
-  // but some of the functions in FilePath will not handle that correctly. In
-  // particular, RemoveTrailingPathSeparator() only removes one separator, and
-  // it is called in CreateDirectoriesRecursively() assuming that it will change
-  // a pathname from directory syntax (trailing separator) to filename syntax.
-  //
-  // On Windows this method also replaces the alternate path separator '/' with
-  // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
-  // "bar\\foo".
-
-  void Normalize();
-
-  // Returns a pointer to the last occurence of a valid path separator in
-  // the FilePath. On Windows, for example, both '/' and '\' are valid path
-  // separators. Returns NULL if no path separator was found.
-  const char* FindLastPathSeparator() const;
-
-  std::string pathname_;
-};  // class FilePath
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-// This file was GENERATED by command:
-//     pump.py gtest-type-util.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most 50 types in a list, and at most 50
-// type-parameterized tests in one type-parameterized test case.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-
-
-// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
-// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
-
-namespace testing {
-namespace internal {
-
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
-  int status = 0;
-  // gcc's implementation of typeid(T).name() mangles the type name,
-  // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
-  using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
-  char* const readable_name = __cxa_demangle(name, 0, 0, &status);
-  const std::string name_str(status == 0 ? readable_name : name);
-  free(readable_name);
-  return name_str;
-#  else
-  return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
-
-  return "<type>";
-
-# endif  // GTEST_HAS_RTTI
-}
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
-// type.  This can be used as a compile-time assertion to ensure that
-// two types are equal.
-
-template <typename T1, typename T2>
-struct AssertTypeEq;
-
-template <typename T>
-struct AssertTypeEq<T, T> {
-  typedef bool type;
-};
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
-struct None {};
-
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-template <typename T1, typename T2>
-struct Types2 {
-  typedef T1 Head;
-  typedef Types1<T2> Tail;
-};
-
-template <typename T1, typename T2, typename T3>
-struct Types3 {
-  typedef T1 Head;
-  typedef Types2<T2, T3> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types4 {
-  typedef T1 Head;
-  typedef Types3<T2, T3, T4> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types5 {
-  typedef T1 Head;
-  typedef Types4<T2, T3, T4, T5> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types6 {
-  typedef T1 Head;
-  typedef Types5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types7 {
-  typedef T1 Head;
-  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types8 {
-  typedef T1 Head;
-  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types9 {
-  typedef T1 Head;
-  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types10 {
-  typedef T1 Head;
-  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types11 {
-  typedef T1 Head;
-  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types12 {
-  typedef T1 Head;
-  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types13 {
-  typedef T1 Head;
-  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types14 {
-  typedef T1 Head;
-  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types15 {
-  typedef T1 Head;
-  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types16 {
-  typedef T1 Head;
-  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types17 {
-  typedef T1 Head;
-  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types18 {
-  typedef T1 Head;
-  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types19 {
-  typedef T1 Head;
-  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types20 {
-  typedef T1 Head;
-  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types21 {
-  typedef T1 Head;
-  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types22 {
-  typedef T1 Head;
-  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types23 {
-  typedef T1 Head;
-  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types24 {
-  typedef T1 Head;
-  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types25 {
-  typedef T1 Head;
-  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types26 {
-  typedef T1 Head;
-  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types27 {
-  typedef T1 Head;
-  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types28 {
-  typedef T1 Head;
-  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types29 {
-  typedef T1 Head;
-  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types30 {
-  typedef T1 Head;
-  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types31 {
-  typedef T1 Head;
-  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types32 {
-  typedef T1 Head;
-  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types33 {
-  typedef T1 Head;
-  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types34 {
-  typedef T1 Head;
-  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types35 {
-  typedef T1 Head;
-  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types36 {
-  typedef T1 Head;
-  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types37 {
-  typedef T1 Head;
-  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types38 {
-  typedef T1 Head;
-  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types39 {
-  typedef T1 Head;
-  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types40 {
-  typedef T1 Head;
-  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types41 {
-  typedef T1 Head;
-  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types42 {
-  typedef T1 Head;
-  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types43 {
-  typedef T1 Head;
-  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types44 {
-  typedef T1 Head;
-  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types45 {
-  typedef T1 Head;
-  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types46 {
-  typedef T1 Head;
-  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types47 {
-  typedef T1 Head;
-  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types48 {
-  typedef T1 Head;
-  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types49 {
-  typedef T1 Head;
-  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct Types50 {
-  typedef T1 Head;
-  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-template <typename T1 = internal::None, typename T2 = internal::None,
-    typename T3 = internal::None, typename T4 = internal::None,
-    typename T5 = internal::None, typename T6 = internal::None,
-    typename T7 = internal::None, typename T8 = internal::None,
-    typename T9 = internal::None, typename T10 = internal::None,
-    typename T11 = internal::None, typename T12 = internal::None,
-    typename T13 = internal::None, typename T14 = internal::None,
-    typename T15 = internal::None, typename T16 = internal::None,
-    typename T17 = internal::None, typename T18 = internal::None,
-    typename T19 = internal::None, typename T20 = internal::None,
-    typename T21 = internal::None, typename T22 = internal::None,
-    typename T23 = internal::None, typename T24 = internal::None,
-    typename T25 = internal::None, typename T26 = internal::None,
-    typename T27 = internal::None, typename T28 = internal::None,
-    typename T29 = internal::None, typename T30 = internal::None,
-    typename T31 = internal::None, typename T32 = internal::None,
-    typename T33 = internal::None, typename T34 = internal::None,
-    typename T35 = internal::None, typename T36 = internal::None,
-    typename T37 = internal::None, typename T38 = internal::None,
-    typename T39 = internal::None, typename T40 = internal::None,
-    typename T41 = internal::None, typename T42 = internal::None,
-    typename T43 = internal::None, typename T44 = internal::None,
-    typename T45 = internal::None, typename T46 = internal::None,
-    typename T47 = internal::None, typename T48 = internal::None,
-    typename T49 = internal::None, typename T50 = internal::None>
-struct Types {
-  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Types<internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types0 type;
-};
-template <typename T1>
-struct Types<T1, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types1<T1> type;
-};
-template <typename T1, typename T2>
-struct Types<T1, T2, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types2<T1, T2> type;
-};
-template <typename T1, typename T2, typename T3>
-struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types3<T1, T2, T3> type;
-};
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types4<T1, T2, T3, T4> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types5<T1, T2, T3, T4, T5> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, internal::None, internal::None, internal::None> {
-  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, internal::None, internal::None> {
-  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, T49, internal::None> {
-  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
-
-// The template "selector" struct TemplateSel<Tmpl> is used to
-// represent Tmpl, which must be a class template with one type
-// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
-// as the type Tmpl<T>.  This allows us to actually instantiate the
-// template "selected" by TemplateSel<Tmpl>.
-//
-// This trick is necessary for simulating typedef for class templates,
-// which C++ doesn't support directly.
-template <GTEST_TEMPLATE_ Tmpl>
-struct TemplateSel {
-  template <typename T>
-  struct Bind {
-    typedef Tmpl<T> type;
-  };
-};
-
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
-
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates2 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates1<T2> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates3 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates2<T2, T3> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates4 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates3<T2, T3, T4> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates5 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates4<T2, T3, T4, T5> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates6 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates7 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates8 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates9 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates10 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates11 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates12 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates13 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates14 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates15 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates16 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates17 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates18 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates19 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates20 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates21 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates22 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates23 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates24 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates25 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates26 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates27 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates28 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates29 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates30 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates31 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates32 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates33 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates34 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates35 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates36 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates37 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates38 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates39 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates40 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates41 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates42 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates43 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates44 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates45 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates46 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates47 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates48 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates49 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
-struct Templates50 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
-    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
-    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
-    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
-    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
-    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
-    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
-    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
-    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
-    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
-    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
-    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
-    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
-    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
-    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
-    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
-    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
-    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
-    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
-    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
-    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
-    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
-    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
-    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
-    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
-struct Templates {
-  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates0 type;
-};
-template <GTEST_TEMPLATE_ T1>
-struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates1<T1> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates2<T1, T2> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates3<T1, T2, T3> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates4<T1, T2, T3, T4> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates5<T1, T2, T3, T4, T5> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, NoneT, NoneT, NoneT> {
-  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, NoneT, NoneT> {
-  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, T49, NoneT> {
-  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_CASE() and
-// INSTANTIATE_TYPED_TEST_CASE_P().
-
-template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49, T50> > {
-  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-
-// Due to C++ preprocessor weirdness, we need double indirection to
-// concatenate two tokens when one of them is __LINE__.  Writing
-//
-//   foo ## __LINE__
-//
-// will result in the token foo__LINE__, instead of foo followed by
-// the current line number.  For more details, see
-// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
-#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
-
-class ProtocolMessage;
-namespace proto2 { class Message; }
-
-namespace testing {
-
-// Forward declarations.
-
-class AssertionResult;                 // Result of an assertion.
-class Message;                         // Represents a failure message.
-class Test;                            // Represents a test.
-class TestInfo;                        // Information about a test.
-class TestPartResult;                  // Result of a test part.
-class UnitTest;                        // A collection of test cases.
-
-template <typename T>
-::std::string PrintToString(const T& value);
-
-namespace internal {
-
-struct TraceInfo;                      // Information about a trace point.
-class ScopedTrace;                     // Implements scoped trace.
-class TestInfoImpl;                    // Opaque implementation of TestInfo
-class UnitTestImpl;                    // Opaque implementation of UnitTest
-
-// How many times InitGoogleTest() has been called.
-GTEST_API_ extern int g_init_gtest_count;
-
-// The text used in failure messages to indicate the start of the
-// stack trace.
-GTEST_API_ extern const char kStackTraceMarker[];
-
-// Two overloaded helpers for checking at compile time whether an
-// expression is a null pointer literal (i.e. NULL or any 0-valued
-// compile-time integral constant).  Their return values have
-// different sizes, so we can use sizeof() to test which version is
-// picked by the compiler.  These helpers have no implementations, as
-// we only need their signatures.
-//
-// Given IsNullLiteralHelper(x), the compiler will pick the first
-// version if x can be implicitly converted to Secret*, and pick the
-// second version otherwise.  Since Secret is a secret and incomplete
-// type, the only expression a user can write that has type Secret* is
-// a null pointer literal.  Therefore, we know that x is a null
-// pointer literal if and only if the first version is picked by the
-// compiler.
-char IsNullLiteralHelper(Secret* p);
-char (&IsNullLiteralHelper(...))[2];  // NOLINT
-
-// A compile-time bool constant that is true if and only if x is a
-// null pointer literal (i.e. NULL or any 0-valued compile-time
-// integral constant).
-#ifdef GTEST_ELLIPSIS_NEEDS_POD_
-// We lose support for NULL detection where the compiler doesn't like
-// passing non-POD classes through ellipsis (...).
-# define GTEST_IS_NULL_LITERAL_(x) false
-#else
-# define GTEST_IS_NULL_LITERAL_(x) \
-    (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
-#endif  // GTEST_ELLIPSIS_NEEDS_POD_
-
-// Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
-    const std::string& gtest_msg, const Message& user_msg);
-
-#if GTEST_HAS_EXCEPTIONS
-
-// This exception is thrown by (and only by) a failed Google Test
-// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
-// are enabled).  We derive it from std::runtime_error, which is for
-// errors presumably detectable only at run time.  Since
-// std::runtime_error inherits from std::exception, many testing
-// frameworks know how to extract and print the message inside it.
-class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
- public:
-  explicit GoogleTestFailureException(const TestPartResult& failure);
-};
-
-#endif  // GTEST_HAS_EXCEPTIONS
-
-// A helper class for creating scoped traces in user programs.
-class GTEST_API_ ScopedTrace {
- public:
-  // The c'tor pushes the given source file location and message onto
-  // a trace stack maintained by Google Test.
-  ScopedTrace(const char* file, int line, const Message& message);
-
-  // The d'tor pops the info pushed by the c'tor.
-  //
-  // Note that the d'tor is not virtual in order to be efficient.
-  // Don't inherit from ScopedTrace!
-  ~ScopedTrace();
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
-} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
-                            // c'tor and d'tor.  Therefore it doesn't
-                            // need to be used otherwise.
-
-// Constructs and returns the message for an equality assertion
-// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
-//
-// The first four parameters are the expressions used in the assertion
-// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
-// where foo is 5 and bar is 6, we have:
-//
-//   expected_expression: "foo"
-//   actual_expression:   "bar"
-//   expected_value:      "5"
-//   actual_value:        "6"
-//
-// The ignoring_case parameter is true iff the assertion is a
-// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
-// be inserted into the message.
-GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
-                                     const char* actual_expression,
-                                     const std::string& expected_value,
-                                     const std::string& actual_value,
-                                     bool ignoring_case);
-
-// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
-GTEST_API_ std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value);
-
-// This template class represents an IEEE floating-point number
-// (either single-precision or double-precision, depending on the
-// template parameters).
-//
-// The purpose of this class is to do more sophisticated number
-// comparison.  (Due to round-off error, etc, it's very unlikely that
-// two floating-points will be equal exactly.  Hence a naive
-// comparison by the == operation often doesn't work.)
-//
-// Format of IEEE floating-point:
-//
-//   The most-significant bit being the leftmost, an IEEE
-//   floating-point looks like
-//
-//     sign_bit exponent_bits fraction_bits
-//
-//   Here, sign_bit is a single bit that designates the sign of the
-//   number.
-//
-//   For float, there are 8 exponent bits and 23 fraction bits.
-//
-//   For double, there are 11 exponent bits and 52 fraction bits.
-//
-//   More details can be found at
-//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
-//
-// Template parameter:
-//
-//   RawType: the raw floating-point type (either float or double)
-template <typename RawType>
-class FloatingPoint {
- public:
-  // Defines the unsigned integer type that has the same size as the
-  // floating point number.
-  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
-
-  // Constants.
-
-  // # of bits in a number.
-  static const size_t kBitCount = 8*sizeof(RawType);
-
-  // # of fraction bits in a number.
-  static const size_t kFractionBitCount =
-    std::numeric_limits<RawType>::digits - 1;
-
-  // # of exponent bits in a number.
-  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
-
-  // The mask for the sign bit.
-  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
-
-  // The mask for the fraction bits.
-  static const Bits kFractionBitMask =
-    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
-
-  // The mask for the exponent bits.
-  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
-
-  // How many ULP's (Units in the Last Place) we want to tolerate when
-  // comparing two numbers.  The larger the value, the more error we
-  // allow.  A 0 value means that two numbers must be exactly the same
-  // to be considered equal.
-  //
-  // The maximum error of a single floating-point operation is 0.5
-  // units in the last place.  On Intel CPU's, all floating-point
-  // calculations are done with 80-bit precision, while double has 64
-  // bits.  Therefore, 4 should be enough for ordinary use.
-  //
-  // See the following article for more details on ULP:
-  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
-  static const size_t kMaxUlps = 4;
-
-  // Constructs a FloatingPoint from a raw floating-point number.
-  //
-  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
-  // around may change its bits, although the new value is guaranteed
-  // to be also a NAN.  Therefore, don't expect this constructor to
-  // preserve the bits in x when x is a NAN.
-  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
-
-  // Static methods
-
-  // Reinterprets a bit pattern as a floating-point number.
-  //
-  // This function is needed to test the AlmostEquals() method.
-  static RawType ReinterpretBits(const Bits bits) {
-    FloatingPoint fp(0);
-    fp.u_.bits_ = bits;
-    return fp.u_.value_;
-  }
-
-  // Returns the floating-point number that represent positive infinity.
-  static RawType Infinity() {
-    return ReinterpretBits(kExponentBitMask);
-  }
-
-  // Returns the maximum representable finite floating-point number.
-  static RawType Max();
-
-  // Non-static methods
-
-  // Returns the bits that represents this number.
-  const Bits &bits() const { return u_.bits_; }
-
-  // Returns the exponent bits of this number.
-  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
-
-  // Returns the fraction bits of this number.
-  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
-
-  // Returns the sign bit of this number.
-  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
-
-  // Returns true iff this is NAN (not a number).
-  bool is_nan() const {
-    // It's a NAN if the exponent bits are all ones and the fraction
-    // bits are not entirely zeros.
-    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
-  }
-
-  // Returns true iff this number is at most kMaxUlps ULP's away from
-  // rhs.  In particular, this function:
-  //
-  //   - returns false if either number is (or both are) NAN.
-  //   - treats really large numbers as almost equal to infinity.
-  //   - thinks +0.0 and -0.0 are 0 DLP's apart.
-  bool AlmostEquals(const FloatingPoint& rhs) const {
-    // The IEEE standard says that any comparison operation involving
-    // a NAN must return false.
-    if (is_nan() || rhs.is_nan()) return false;
-
-    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
-        <= kMaxUlps;
-  }
-
- private:
-  // The data type used to store the actual floating-point number.
-  union FloatingPointUnion {
-    RawType value_;  // The raw floating-point number.
-    Bits bits_;      // The bits that represent the number.
-  };
-
-  // Converts an integer from the sign-and-magnitude representation to
-  // the biased representation.  More precisely, let N be 2 to the
-  // power of (kBitCount - 1), an integer x is represented by the
-  // unsigned number x + N.
-  //
-  // For instance,
-  //
-  //   -N + 1 (the most negative number representable using
-  //          sign-and-magnitude) is represented by 1;
-  //   0      is represented by N; and
-  //   N - 1  (the biggest number representable using
-  //          sign-and-magnitude) is represented by 2N - 1.
-  //
-  // Read http://en.wikipedia.org/wiki/Signed_number_representations
-  // for more details on signed number representations.
-  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
-    if (kSignBitMask & sam) {
-      // sam represents a negative number.
-      return ~sam + 1;
-    } else {
-      // sam represents a positive number.
-      return kSignBitMask | sam;
-    }
-  }
-
-  // Given two numbers in the sign-and-magnitude representation,
-  // returns the distance between them as an unsigned number.
-  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
-                                                     const Bits &sam2) {
-    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
-    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
-    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
-  }
-
-  FloatingPointUnion u_;
-};
-
-// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
-// macro defined by <windows.h>.
-template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
-template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
-
-// Typedefs the instances of the FloatingPoint template class that we
-// care to use.
-typedef FloatingPoint<float> Float;
-typedef FloatingPoint<double> Double;
-
-// In order to catch the mistake of putting tests that use different
-// test fixture classes in the same test case, we need to assign
-// unique IDs to fixture classes and compare them.  The TypeId type is
-// used to hold such IDs.  The user should treat TypeId as an opaque
-// type: the only operation allowed on TypeId values is to compare
-// them for equality using the == operator.
-typedef const void* TypeId;
-
-template <typename T>
-class TypeIdHelper {
- public:
-  // dummy_ must not have a const type.  Otherwise an overly eager
-  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
-  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
-  static bool dummy_;
-};
-
-template <typename T>
-bool TypeIdHelper<T>::dummy_ = false;
-
-// GetTypeId<T>() returns the ID of type T.  Different values will be
-// returned for different types.  Calling the function twice with the
-// same type argument is guaranteed to return the same ID.
-template <typename T>
-TypeId GetTypeId() {
-  // The compiler is required to allocate a different
-  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
-  // the template.  Therefore, the address of dummy_ is guaranteed to
-  // be unique.
-  return &(TypeIdHelper<T>::dummy_);
-}
-
-// Returns the type ID of ::testing::Test.  Always call this instead
-// of GetTypeId< ::testing::Test>() to get the type ID of
-// ::testing::Test, as the latter may give the wrong result due to a
-// suspected linker bug when compiling Google Test as a Mac OS X
-// framework.
-GTEST_API_ TypeId GetTestTypeId();
-
-// Defines the abstract factory interface that creates instances
-// of a Test object.
-class TestFactoryBase {
- public:
-  virtual ~TestFactoryBase() {}
-
-  // Creates a test instance to run. The instance is both created and destroyed
-  // within TestInfoImpl::Run()
-  virtual Test* CreateTest() = 0;
-
- protected:
-  TestFactoryBase() {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
-};
-
-// This class provides implementation of TeastFactoryBase interface.
-// It is used in TEST and TEST_F macros.
-template <class TestClass>
-class TestFactoryImpl : public TestFactoryBase {
- public:
-  virtual Test* CreateTest() { return new TestClass; }
-};
-
-#if GTEST_OS_WINDOWS
-
-// Predicate-formatters for implementing the HRESULT checking macros
-// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
-// We pass a long instead of HRESULT to avoid causing an
-// include dependency for the HRESULT type.
-GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
-                                            long hr);  // NOLINT
-GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
-                                            long hr);  // NOLINT
-
-#endif  // GTEST_OS_WINDOWS
-
-// Types of SetUpTestCase() and TearDownTestCase() functions.
-typedef void (*SetUpTestCaseFunc)();
-typedef void (*TearDownTestCaseFunc)();
-
-// Creates a new TestInfo object and registers it with Google Test;
-// returns the created object.
-//
-// Arguments:
-//
-//   test_case_name:   name of the test case
-//   name:             name of the test
-//   type_param        the name of the test's type parameter, or NULL if
-//                     this is not a typed or a type-parameterized test.
-//   value_param       text representation of the test's value parameter,
-//                     or NULL if this is not a type-parameterized test.
-//   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test case
-//   tear_down_tc:     pointer to the function that tears down the test case
-//   factory:          pointer to the factory that creates a test object.
-//                     The newly created TestInfo instance will assume
-//                     ownership of the factory object.
-GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
-    const char* test_case_name,
-    const char* name,
-    const char* type_param,
-    const char* value_param,
-    TypeId fixture_class_id,
-    SetUpTestCaseFunc set_up_tc,
-    TearDownTestCaseFunc tear_down_tc,
-    TestFactoryBase* factory);
-
-// If *pstr starts with the given prefix, modifies *pstr to be right
-// past the prefix and returns true; otherwise leaves *pstr unchanged
-// and returns false.  None of pstr, *pstr, and prefix can be NULL.
-GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// State of the definition of a type-parameterized test case.
-class GTEST_API_ TypedTestCasePState {
- public:
-  TypedTestCasePState() : registered_(false) {}
-
-  // Adds the given test name to defined_test_names_ and return true
-  // if the test case hasn't been registered; otherwise aborts the
-  // program.
-  bool AddTestName(const char* file, int line, const char* case_name,
-                   const char* test_name) {
-    if (registered_) {
-      fprintf(stderr, "%s Test %s must be defined before "
-              "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
-              FormatFileLocation(file, line).c_str(), test_name, case_name);
-      fflush(stderr);
-      posix::Abort();
-    }
-    defined_test_names_.insert(test_name);
-    return true;
-  }
-
-  // Verifies that registered_tests match the test names in
-  // defined_test_names_; returns registered_tests if successful, or
-  // aborts the program otherwise.
-  const char* VerifyRegisteredTestNames(
-      const char* file, int line, const char* registered_tests);
-
- private:
-  bool registered_;
-  ::std::set<const char*> defined_test_names_;
-};
-
-// Skips to the first non-space char after the first comma in 'str';
-// returns NULL if no comma is found in 'str'.
-inline const char* SkipComma(const char* str) {
-  const char* comma = strchr(str, ',');
-  if (comma == NULL) {
-    return NULL;
-  }
-  while (IsSpace(*(++comma))) {}
-  return comma;
-}
-
-// Returns the prefix of 'str' before the first comma in it; returns
-// the entire string if it contains no comma.
-inline std::string GetPrefixUntilComma(const char* str) {
-  const char* comma = strchr(str, ',');
-  return comma == NULL ? str : std::string(str, comma);
-}
-
-// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
-// registers a list of type-parameterized tests with Google Test.  The
-// return value is insignificant - we just need to return something
-// such that we can call this function in a namespace scope.
-//
-// Implementation note: The GTEST_TEMPLATE_ macro declares a template
-// template parameter.  It's defined in gtest-type-util.h.
-template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
-class TypeParameterizedTest {
- public:
-  // 'index' is the index of the test in the type list 'Types'
-  // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
-  // Types).  Valid values for 'index' are [0, N - 1] where N is the
-  // length of Types.
-  static bool Register(const char* prefix, const char* case_name,
-                       const char* test_names, int index) {
-    typedef typename Types::Head Type;
-    typedef Fixture<Type> FixtureClass;
-    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
-
-    // First, registers the first type-parameterized test in the type
-    // list.
-    MakeAndRegisterTestInfo(
-        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
-         + StreamableToString(index)).c_str(),
-        GetPrefixUntilComma(test_names).c_str(),
-        GetTypeName<Type>().c_str(),
-        NULL,  // No value parameter.
-        GetTypeId<FixtureClass>(),
-        TestClass::SetUpTestCase,
-        TestClass::TearDownTestCase,
-        new TestFactoryImpl<TestClass>);
-
-    // Next, recurses (at compile time) with the tail of the type list.
-    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
-        ::Register(prefix, case_name, test_names, index + 1);
-  }
-};
-
-// The base case for the compile time recursion.
-template <GTEST_TEMPLATE_ Fixture, class TestSel>
-class TypeParameterizedTest<Fixture, TestSel, Types0> {
- public:
-  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
-                       const char* /*test_names*/, int /*index*/) {
-    return true;
-  }
-};
-
-// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
-// registers *all combinations* of 'Tests' and 'Types' with Google
-// Test.  The return value is insignificant - we just need to return
-// something such that we can call this function in a namespace scope.
-template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
-class TypeParameterizedTestCase {
- public:
-  static bool Register(const char* prefix, const char* case_name,
-                       const char* test_names) {
-    typedef typename Tests::Head Head;
-
-    // First, register the first test in 'Test' for each type in 'Types'.
-    TypeParameterizedTest<Fixture, Head, Types>::Register(
-        prefix, case_name, test_names, 0);
-
-    // Next, recurses (at compile time) with the tail of the test list.
-    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
-        ::Register(prefix, case_name, SkipComma(test_names));
-  }
-};
-
-// The base case for the compile time recursion.
-template <GTEST_TEMPLATE_ Fixture, typename Types>
-class TypeParameterizedTestCase<Fixture, Templates0, Types> {
- public:
-  static bool Register(const char* /*prefix*/, const char* /*case_name*/,
-                       const char* /*test_names*/) {
-    return true;
-  }
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// Returns the current OS stack trace as an std::string.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
-// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
-    UnitTest* unit_test, int skip_count);
-
-// Helpers for suppressing warnings on unreachable code or constant
-// condition.
-
-// Always returns true.
-GTEST_API_ bool AlwaysTrue();
-
-// Always returns false.
-inline bool AlwaysFalse() { return !AlwaysTrue(); }
-
-// Helper for suppressing false warning from Clang on a const char*
-// variable declared in a conditional expression always being NULL in
-// the else branch.
-struct GTEST_API_ ConstCharPtr {
-  ConstCharPtr(const char* str) : value(str) {}
-  operator bool() const { return true; }
-  const char* value;
-};
-
-// A simple Linear Congruential Generator for generating random
-// numbers with a uniform distribution.  Unlike rand() and srand(), it
-// doesn't use global state (and therefore can't interfere with user
-// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
-// but it's good enough for our purposes.
-class GTEST_API_ Random {
- public:
-  static const UInt32 kMaxRange = 1u << 31;
-
-  explicit Random(UInt32 seed) : state_(seed) {}
-
-  void Reseed(UInt32 seed) { state_ = seed; }
-
-  // Generates a random number from [0, range).  Crashes if 'range' is
-  // 0 or greater than kMaxRange.
-  UInt32 Generate(UInt32 range);
-
- private:
-  UInt32 state_;
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
-};
-
-// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
-// compiler error iff T1 and T2 are different types.
-template <typename T1, typename T2>
-struct CompileAssertTypesEqual;
-
-template <typename T>
-struct CompileAssertTypesEqual<T, T> {
-};
-
-// Removes the reference from a type if it is a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::remove_reference, which is not widely available yet.
-template <typename T>
-struct RemoveReference { typedef T type; };  // NOLINT
-template <typename T>
-struct RemoveReference<T&> { typedef T type; };  // NOLINT
-
-// A handy wrapper around RemoveReference that works when the argument
-// T depends on template parameters.
-#define GTEST_REMOVE_REFERENCE_(T) \
-    typename ::testing::internal::RemoveReference<T>::type
-
-// Removes const from a type if it is a const type, otherwise leaves
-// it unchanged.  This is the same as tr1::remove_const, which is not
-// widely available yet.
-template <typename T>
-struct RemoveConst { typedef T type; };  // NOLINT
-template <typename T>
-struct RemoveConst<const T> { typedef T type; };  // NOLINT
-
-// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
-// definition to fail to remove the const in 'const int[3]' and 'const
-// char[3][4]'.  The following specialization works around the bug.
-template <typename T, size_t N>
-struct RemoveConst<const T[N]> {
-  typedef typename RemoveConst<T>::type type[N];
-};
-
-#if defined(_MSC_VER) && _MSC_VER < 1400
-// This is the only specialization that allows VC++ 7.1 to remove const in
-// 'const int[3] and 'const int[3][4]'.  However, it causes trouble with GCC
-// and thus needs to be conditionally compiled.
-template <typename T, size_t N>
-struct RemoveConst<T[N]> {
-  typedef typename RemoveConst<T>::type type[N];
-};
-#endif
-
-// A handy wrapper around RemoveConst that works when the argument
-// T depends on template parameters.
-#define GTEST_REMOVE_CONST_(T) \
-    typename ::testing::internal::RemoveConst<T>::type
-
-// Turns const U&, U&, const U, and U all into U.
-#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
-    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
-
-// Adds reference to a type if it is not a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::add_reference, which is not widely available yet.
-template <typename T>
-struct AddReference { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddReference<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper around AddReference that works when the argument T
-// depends on template parameters.
-#define GTEST_ADD_REFERENCE_(T) \
-    typename ::testing::internal::AddReference<T>::type
-
-// Adds a reference to const on top of T as necessary.  For example,
-// it transforms
-//
-//   char         ==> const char&
-//   const char   ==> const char&
-//   char&        ==> const char&
-//   const char&  ==> const char&
-//
-// The argument T must depend on some template parameters.
-#define GTEST_REFERENCE_TO_CONST_(T) \
-    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
-
-// ImplicitlyConvertible<From, To>::value is a compile-time bool
-// constant that's true iff type From can be implicitly converted to
-// type To.
-template <typename From, typename To>
-class ImplicitlyConvertible {
- private:
-  // We need the following helper functions only for their types.
-  // They have no implementations.
-
-  // MakeFrom() is an expression whose type is From.  We cannot simply
-  // use From(), as the type From may not have a public default
-  // constructor.
-  static From MakeFrom();
-
-  // These two functions are overloaded.  Given an expression
-  // Helper(x), the compiler will pick the first version if x can be
-  // implicitly converted to type To; otherwise it will pick the
-  // second version.
-  //
-  // The first version returns a value of size 1, and the second
-  // version returns a value of size 2.  Therefore, by checking the
-  // size of Helper(x), which can be done at compile time, we can tell
-  // which version of Helper() is used, and hence whether x can be
-  // implicitly converted to type To.
-  static char Helper(To);
-  static char (&Helper(...))[2];  // NOLINT
-
-  // We have to put the 'public' section after the 'private' section,
-  // or MSVC refuses to compile the code.
- public:
-  // MSVC warns about implicitly converting from double to int for
-  // possible loss of data, so we need to temporarily disable the
-  // warning.
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4244)  // Temporarily disables warning 4244.
-
-  static const bool value =
-      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
-# pragma warning(pop)           // Restores the warning state.
-#elif defined(__BORLANDC__)
-  // C++Builder cannot use member overload resolution during template
-  // instantiation.  The simplest workaround is to use its C++0x type traits
-  // functions (C++Builder 2009 and above only).
-  static const bool value = __is_convertible(From, To);
-#else
-  static const bool value =
-      sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
-#endif  // _MSV_VER
-};
-template <typename From, typename To>
-const bool ImplicitlyConvertible<From, To>::value;
-
-// IsAProtocolMessage<T>::value is a compile-time bool constant that's
-// true iff T is type ProtocolMessage, proto2::Message, or a subclass
-// of those.
-template <typename T>
-struct IsAProtocolMessage
-    : public bool_constant<
-  ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
-  ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
-};
-
-// When the compiler sees expression IsContainerTest<C>(0), if C is an
-// STL-style container class, the first overload of IsContainerTest
-// will be viable (since both C::iterator* and C::const_iterator* are
-// valid types and NULL can be implicitly converted to them).  It will
-// be picked over the second overload as 'int' is a perfect match for
-// the type of argument 0.  If C::iterator or C::const_iterator is not
-// a valid type, the first overload is not viable, and the second
-// overload will be picked.  Therefore, we can determine whether C is
-// a container class by checking the type of IsContainerTest<C>(0).
-// The value of the expression is insignificant.
-//
-// Note that we look for both C::iterator and C::const_iterator.  The
-// reason is that C++ injects the name of a class as a member of the
-// class itself (e.g. you can refer to class iterator as either
-// 'iterator' or 'iterator::iterator').  If we look for C::iterator
-// only, for example, we would mistakenly think that a class named
-// iterator is an STL container.
-//
-// Also note that the simpler approach of overloading
-// IsContainerTest(typename C::const_iterator*) and
-// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
-typedef int IsContainer;
-template <class C>
-IsContainer IsContainerTest(int /* dummy */,
-                            typename C::iterator* /* it */ = NULL,
-                            typename C::const_iterator* /* const_it */ = NULL) {
-  return 0;
-}
-
-typedef char IsNotContainer;
-template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
-
-// EnableIf<condition>::type is void when 'Cond' is true, and
-// undefined when 'Cond' is false.  To use SFINAE to make a function
-// overload only apply when a particular expression is true, add
-// "typename EnableIf<expression>::type* = 0" as the last parameter.
-template<bool> struct EnableIf;
-template<> struct EnableIf<true> { typedef void type; };  // NOLINT
-
-// Utilities for native arrays.
-
-// ArrayEq() compares two k-dimensional native arrays using the
-// elements' operator==, where k can be any integer >= 0.  When k is
-// 0, ArrayEq() degenerates into comparing a single pair of values.
-
-template <typename T, typename U>
-bool ArrayEq(const T* lhs, size_t size, const U* rhs);
-
-// This generic version is used when k is 0.
-template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
-
-// This overload is used when k >= 1.
-template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
-  return internal::ArrayEq(lhs, N, rhs);
-}
-
-// This helper reduces code bloat.  If we instead put its logic inside
-// the previous ArrayEq() function, arrays with different sizes would
-// lead to different copies of the template code.
-template <typename T, typename U>
-bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
-  for (size_t i = 0; i != size; i++) {
-    if (!internal::ArrayEq(lhs[i], rhs[i]))
-      return false;
-  }
-  return true;
-}
-
-// Finds the first element in the iterator range [begin, end) that
-// equals elem.  Element may be a native array type itself.
-template <typename Iter, typename Element>
-Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
-  for (Iter it = begin; it != end; ++it) {
-    if (internal::ArrayEq(*it, elem))
-      return it;
-  }
-  return end;
-}
-
-// CopyArray() copies a k-dimensional native array using the elements'
-// operator=, where k can be any integer >= 0.  When k is 0,
-// CopyArray() degenerates into copying a single value.
-
-template <typename T, typename U>
-void CopyArray(const T* from, size_t size, U* to);
-
-// This generic version is used when k is 0.
-template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
-
-// This overload is used when k >= 1.
-template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
-  internal::CopyArray(from, N, *to);
-}
-
-// This helper reduces code bloat.  If we instead put its logic inside
-// the previous CopyArray() function, arrays with different sizes
-// would lead to different copies of the template code.
-template <typename T, typename U>
-void CopyArray(const T* from, size_t size, U* to) {
-  for (size_t i = 0; i != size; i++) {
-    internal::CopyArray(from[i], to + i);
-  }
-}
-
-// The relation between an NativeArray object (see below) and the
-// native array it represents.
-enum RelationToSource {
-  kReference,  // The NativeArray references the native array.
-  kCopy        // The NativeArray makes a copy of the native array and
-               // owns the copy.
-};
-
-// Adapts a native array to a read-only STL-style container.  Instead
-// of the complete STL container concept, this adaptor only implements
-// members useful for Google Mock's container matchers.  New members
-// should be added as needed.  To simplify the implementation, we only
-// support Element being a raw type (i.e. having no top-level const or
-// reference modifier).  It's the client's responsibility to satisfy
-// this requirement.  Element can be an array type itself (hence
-// multi-dimensional arrays are supported).
-template <typename Element>
-class NativeArray {
- public:
-  // STL-style container typedefs.
-  typedef Element value_type;
-  typedef Element* iterator;
-  typedef const Element* const_iterator;
-
-  // Constructs from a native array.
-  NativeArray(const Element* array, size_t count, RelationToSource relation) {
-    Init(array, count, relation);
-  }
-
-  // Copy constructor.
-  NativeArray(const NativeArray& rhs) {
-    Init(rhs.array_, rhs.size_, rhs.relation_to_source_);
-  }
-
-  ~NativeArray() {
-    // Ensures that the user doesn't instantiate NativeArray with a
-    // const or reference type.
-    static_cast<void>(StaticAssertTypeEqHelper<Element,
-        GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>());
-    if (relation_to_source_ == kCopy)
-      delete[] array_;
-  }
-
-  // STL-style container methods.
-  size_t size() const { return size_; }
-  const_iterator begin() const { return array_; }
-  const_iterator end() const { return array_ + size_; }
-  bool operator==(const NativeArray& rhs) const {
-    return size() == rhs.size() &&
-        ArrayEq(begin(), size(), rhs.begin());
-  }
-
- private:
-  // Initializes this object; makes a copy of the input array if
-  // 'relation' is kCopy.
-  void Init(const Element* array, size_t a_size, RelationToSource relation) {
-    if (relation == kReference) {
-      array_ = array;
-    } else {
-      Element* const copy = new Element[a_size];
-      CopyArray(array, a_size, copy);
-      array_ = copy;
-    }
-    size_ = a_size;
-    relation_to_source_ = relation;
-  }
-
-  const Element* array_;
-  size_t size_;
-  RelationToSource relation_to_source_;
-
-  GTEST_DISALLOW_ASSIGN_(NativeArray);
-};
-
-}  // namespace internal
-}  // namespace testing
-
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
-  ::testing::internal::AssertHelper(result_type, file, line, message) \
-    = ::testing::Message()
-
-#define GTEST_MESSAGE_(message, result_type) \
-  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
-
-#define GTEST_FATAL_FAILURE_(message) \
-  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
-
-#define GTEST_NONFATAL_FAILURE_(message) \
-  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
-
-#define GTEST_SUCCESS_(message) \
-  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
-
-// Suppresses MSVC warnings 4072 (unreachable code) for the code following
-// statement if it returns or throws (or doesn't return or throw in some
-// situations).
-#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
-  if (::testing::internal::AlwaysTrue()) { statement; }
-
-#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
-    bool gtest_caught_expected = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (expected_exception const&) { \
-      gtest_caught_expected = true; \
-    } \
-    catch (...) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws a different type."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-    if (!gtest_caught_expected) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws nothing."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
-      fail(gtest_msg.value)
-
-#define GTEST_TEST_NO_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
-      fail("Expected: " #statement " doesn't throw an exception.\n" \
-           "  Actual: it throws.")
-
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    bool gtest_caught_any = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      gtest_caught_any = true; \
-    } \
-    if (!gtest_caught_any) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
-      fail("Expected: " #statement " throws an exception.\n" \
-           "  Actual: it doesn't.")
-
-
-// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
-// either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
-#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (const ::testing::AssertionResult gtest_ar_ = \
-      ::testing::AssertionResult(expression)) \
-    ; \
-  else \
-    fail(::testing::internal::GetBoolAssertionFailureMessage(\
-        gtest_ar_, text, #actual, #expected).c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
-      fail("Expected: " #statement " doesn't generate new fatal " \
-           "failures in the current thread.\n" \
-           "  Actual: it does.")
-
-// Expands to the name of the class that implements the given test.
-#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
-  test_case_name##_##test_name##_Test
-
-// Helper macro for defining tests.
-#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
-class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
- public:\
-  GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
- private:\
-  virtual void TestBody();\
-  static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
-};\
-\
-::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
-  ::test_info_ =\
-    ::testing::internal::MakeAndRegisterTestInfo(\
-        #test_case_name, #test_name, NULL, NULL, \
-        (parent_id), \
-        parent_class::SetUpTestCase, \
-        parent_class::TearDownTestCase, \
-        new ::testing::internal::TestFactoryImpl<\
-            GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
-void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
-//
-// This header file defines the public API for death tests.  It is
-// #included by gtest.h so a user doesn't need to include this
-// directly.
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
-//
-// The Google C++ Testing Framework (Google Test)
-//
-// This header file defines internal utilities needed for implementing
-// death tests.  They are subject to change without notice.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-
-
-#include <stdio.h>
-
-namespace testing {
-namespace internal {
-
-GTEST_DECLARE_string_(internal_run_death_test);
-
-// Names of the flags (needed for parsing Google Test flags).
-const char kDeathTestStyleFlag[] = "death_test_style";
-const char kDeathTestUseFork[] = "death_test_use_fork";
-const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
-
-#if GTEST_HAS_DEATH_TEST
-
-// DeathTest is a class that hides much of the complexity of the
-// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
-// returns a concrete class that depends on the prevailing death test
-// style, as defined by the --gtest_death_test_style and/or
-// --gtest_internal_run_death_test flags.
-
-// In describing the results of death tests, these terms are used with
-// the corresponding definitions:
-//
-// exit status:  The integer exit information in the format specified
-//               by wait(2)
-// exit code:    The integer code passed to exit(3), _exit(2), or
-//               returned from main()
-class GTEST_API_ DeathTest {
- public:
-  // Create returns false if there was an error determining the
-  // appropriate action to take for the current death test; for example,
-  // if the gtest_death_test_style flag is set to an invalid value.
-  // The LastMessage method will return a more detailed message in that
-  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
-  // argument is set.  If the death test should be skipped, the pointer
-  // is set to NULL; otherwise, it is set to the address of a new concrete
-  // DeathTest object that controls the execution of the current test.
-  static bool Create(const char* statement, const RE* regex,
-                     const char* file, int line, DeathTest** test);
-  DeathTest();
-  virtual ~DeathTest() { }
-
-  // A helper class that aborts a death test when it's deleted.
-  class ReturnSentinel {
-   public:
-    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
-    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
-   private:
-    DeathTest* const test_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
-  } GTEST_ATTRIBUTE_UNUSED_;
-
-  // An enumeration of possible roles that may be taken when a death
-  // test is encountered.  EXECUTE means that the death test logic should
-  // be executed immediately.  OVERSEE means that the program should prepare
-  // the appropriate environment for a child process to execute the death
-  // test, then wait for it to complete.
-  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
-
-  // An enumeration of the three reasons that a test might be aborted.
-  enum AbortReason {
-    TEST_ENCOUNTERED_RETURN_STATEMENT,
-    TEST_THREW_EXCEPTION,
-    TEST_DID_NOT_DIE
-  };
-
-  // Assumes one of the above roles.
-  virtual TestRole AssumeRole() = 0;
-
-  // Waits for the death test to finish and returns its status.
-  virtual int Wait() = 0;
-
-  // Returns true if the death test passed; that is, the test process
-  // exited during the test, its exit status matches a user-supplied
-  // predicate, and its stderr output matches a user-supplied regular
-  // expression.
-  // The user-supplied predicate may be a macro expression rather
-  // than a function pointer or functor, or else Wait and Passed could
-  // be combined.
-  virtual bool Passed(bool exit_status_ok) = 0;
-
-  // Signals that the death test did not die as expected.
-  virtual void Abort(AbortReason reason) = 0;
-
-  // Returns a human-readable outcome message regarding the outcome of
-  // the last death test.
-  static const char* LastMessage();
-
-  static void set_last_death_test_message(const std::string& message);
-
- private:
-  // A string containing a description of the outcome of the last death test.
-  static std::string last_death_test_message_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
-};
-
-// Factory interface for death tests.  May be mocked out for testing.
-class DeathTestFactory {
- public:
-  virtual ~DeathTestFactory() { }
-  virtual bool Create(const char* statement, const RE* regex,
-                      const char* file, int line, DeathTest** test) = 0;
-};
-
-// A concrete DeathTestFactory implementation for normal use.
-class DefaultDeathTestFactory : public DeathTestFactory {
- public:
-  virtual bool Create(const char* statement, const RE* regex,
-                      const char* file, int line, DeathTest** test);
-};
-
-// Returns true if exit_status describes a process that was terminated
-// by a signal, or exited normally with a nonzero exit code.
-GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
-
-// Traps C++ exceptions escaping statement and reports them as test
-// failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
-  try { \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } catch (const ::std::exception& gtest_exception) { \
-    fprintf(\
-        stderr, \
-        "\n%s: Caught std::exception-derived exception escaping the " \
-        "death test statement. Exception message: %s\n", \
-        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
-        gtest_exception.what()); \
-    fflush(stderr); \
-    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  } catch (...) { \
-    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  }
-
-# else
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
-  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-
-# endif
-
-// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
-// ASSERT_EXIT*, and EXPECT_EXIT*.
-# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    const ::testing::internal::RE& gtest_regex = (regex); \
-    ::testing::internal::DeathTest* gtest_dt; \
-    if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
-        __FILE__, __LINE__, &gtest_dt)) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
-    } \
-    if (gtest_dt != NULL) { \
-      ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
-          gtest_dt_ptr(gtest_dt); \
-      switch (gtest_dt->AssumeRole()) { \
-        case ::testing::internal::DeathTest::OVERSEE_TEST: \
-          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
-            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
-          } \
-          break; \
-        case ::testing::internal::DeathTest::EXECUTE_TEST: { \
-          ::testing::internal::DeathTest::ReturnSentinel \
-              gtest_sentinel(gtest_dt); \
-          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
-          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
-          break; \
-        } \
-        default: \
-          break; \
-      } \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
-      fail(::testing::internal::DeathTest::LastMessage())
-// The symbol "fail" here expands to something into which a message
-// can be streamed.
-
-// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
-// NDEBUG mode. In this case we need the statements to be executed, the regex is
-// ignored, and the macro must accept a streamed message even though the message
-// is never printed.
-# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-     GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } else \
-    ::testing::Message()
-
-// A class representing the parsed contents of the
-// --gtest_internal_run_death_test flag, as it existed when
-// RUN_ALL_TESTS was called.
-class InternalRunDeathTestFlag {
- public:
-  InternalRunDeathTestFlag(const std::string& a_file,
-                           int a_line,
-                           int an_index,
-                           int a_write_fd)
-      : file_(a_file), line_(a_line), index_(an_index),
-        write_fd_(a_write_fd) {}
-
-  ~InternalRunDeathTestFlag() {
-    if (write_fd_ >= 0)
-      posix::Close(write_fd_);
-  }
-
-  const std::string& file() const { return file_; }
-  int line() const { return line_; }
-  int index() const { return index_; }
-  int write_fd() const { return write_fd_; }
-
- private:
-  std::string file_;
-  int line_;
-  int index_;
-  int write_fd_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
-};
-
-// Returns a newly created InternalRunDeathTestFlag object with fields
-// initialized from the GTEST_FLAG(internal_run_death_test) flag if
-// the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
-
-#else  // GTEST_HAS_DEATH_TEST
-
-// This macro is used for implementing macros such as
-// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
-// death tests are not supported. Those macros must compile on such systems
-// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
-// systems that support death tests. This allows one to write such a macro
-// on a system that does not support death tests and be sure that it will
-// compile on a death-test supporting system.
-//
-// Parameters:
-//   statement -  A statement that a macro such as EXPECT_DEATH would test
-//                for program termination. This macro has to make sure this
-//                statement is compiled but not executed, to ensure that
-//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
-//                parameter iff EXPECT_DEATH compiles with it.
-//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
-//                the output of statement.  This parameter has to be
-//                compiled but not evaluated by this macro, to ensure that
-//                this macro only accepts expressions that a macro such as
-//                EXPECT_DEATH would accept.
-//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
-//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
-//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
-//                compile inside functions where ASSERT_DEATH doesn't
-//                compile.
-//
-//  The branch that has an always false condition is used to ensure that
-//  statement and regex are compiled (and thus syntactically correct) but
-//  never executed. The unreachable code macro protects the terminator
-//  statement from generating an 'unreachable code' warning in case
-//  statement unconditionally returns or throws. The Message constructor at
-//  the end allows the syntax of streaming additional messages into the
-//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::AlwaysTrue()) { \
-      GTEST_LOG_(WARNING) \
-          << "Death tests are not supported on this platform.\n" \
-          << "Statement '" #statement "' cannot be verified."; \
-    } else if (::testing::internal::AlwaysFalse()) { \
-      ::testing::internal::RE::PartialMatch(".*", (regex)); \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-      terminator; \
-    } else \
-      ::testing::Message()
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-
-namespace testing {
-
-// This flag controls the style of death tests.  Valid values are "threadsafe",
-// meaning that the death test child process will re-execute the test binary
-// from the start, running only a single death test, or "fast",
-// meaning that the child process will execute the test logic immediately
-// after forking.
-GTEST_DECLARE_string_(death_test_style);
-
-#if GTEST_HAS_DEATH_TEST
-
-namespace internal {
-
-// Returns a Boolean value indicating whether the caller is currently
-// executing in the context of the death test child process.  Tools such as
-// Valgrind heap checkers may need this to modify their behavior in death
-// tests.  IMPORTANT: This is an internal utility.  Using it may break the
-// implementation of death tests.  User code MUST NOT use it.
-GTEST_API_ bool InDeathTestChild();
-
-}  // namespace internal
-
-// The following macros are useful for writing death tests.
-
-// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
-// executed:
-//
-//   1. It generates a warning if there is more than one active
-//   thread.  This is because it's safe to fork() or clone() only
-//   when there is a single thread.
-//
-//   2. The parent process clone()s a sub-process and runs the death
-//   test in it; the sub-process exits with code 0 at the end of the
-//   death test, if it hasn't exited already.
-//
-//   3. The parent process waits for the sub-process to terminate.
-//
-//   4. The parent process checks the exit code and error message of
-//   the sub-process.
-//
-// Examples:
-//
-//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
-//   for (int i = 0; i < 5; i++) {
-//     EXPECT_DEATH(server.ProcessRequest(i),
-//                  "Invalid request .* in ProcessRequest()")
-//                  << "Failed to die on request " << i;
-//   }
-//
-//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
-//
-//   bool KilledBySIGHUP(int exit_code) {
-//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
-//   }
-//
-//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
-//
-// On the regular expressions used in death tests:
-//
-//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
-//   which uses the POSIX extended regex syntax.
-//
-//   On other platforms (e.g. Windows), we only support a simple regex
-//   syntax implemented as part of Google Test.  This limited
-//   implementation should be enough most of the time when writing
-//   death tests; though it lacks many features you can find in PCRE
-//   or POSIX extended regex syntax.  For example, we don't support
-//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
-//   repetition count ("x{5,7}"), among others.
-//
-//   Below is the syntax that we do support.  We chose it to be a
-//   subset of both PCRE and POSIX extended regex, so it's easy to
-//   learn wherever you come from.  In the following: 'A' denotes a
-//   literal character, period (.), or a single \\ escape sequence;
-//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
-//   natural numbers.
-//
-//     c     matches any literal character c
-//     \\d   matches any decimal digit
-//     \\D   matches any character that's not a decimal digit
-//     \\f   matches \f
-//     \\n   matches \n
-//     \\r   matches \r
-//     \\s   matches any ASCII whitespace, including \n
-//     \\S   matches any character that's not a whitespace
-//     \\t   matches \t
-//     \\v   matches \v
-//     \\w   matches any letter, _, or decimal digit
-//     \\W   matches any character that \\w doesn't match
-//     \\c   matches any literal character c, which must be a punctuation
-//     .     matches any single character except \n
-//     A?    matches 0 or 1 occurrences of A
-//     A*    matches 0 or many occurrences of A
-//     A+    matches 1 or many occurrences of A
-//     ^     matches the beginning of a string (not that of each line)
-//     $     matches the end of a string (not that of each line)
-//     xy    matches x followed by y
-//
-//   If you accidentally use PCRE or POSIX extended regex features
-//   not implemented by us, you will get a run-time failure.  In that
-//   case, please try to rewrite your regular expression within the
-//   above syntax.
-//
-//   This implementation is *not* meant to be as highly tuned or robust
-//   as a compiled regex library, but should perform well enough for a
-//   death test, which already incurs significant overhead by launching
-//   a child process.
-//
-// Known caveats:
-//
-//   A "threadsafe" style death test obtains the path to the test
-//   program from argv[0] and re-executes it in the sub-process.  For
-//   simplicity, the current implementation doesn't search the PATH
-//   when launching the sub-process.  This means that the user must
-//   invoke the test program via a path that contains at least one
-//   path separator (e.g. path/to/foo_test and
-//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
-//   is rarely a problem as people usually don't put the test binary
-//   directory in PATH.
-//
-// TODO(wan@google.com): make thread-safe death tests search the PATH.
-
-// Asserts that a given statement causes the program to exit, with an
-// integer exit status that satisfies predicate, and emitting error output
-// that matches regex.
-# define ASSERT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
-
-// Like ASSERT_EXIT, but continues on to successive tests in the
-// test case, if any:
-# define EXPECT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
-
-// Asserts that a given statement causes the program to exit, either by
-// explicitly exiting with a nonzero exit code or being killed by a
-// signal, and emitting error output that matches regex.
-# define ASSERT_DEATH(statement, regex) \
-    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
-
-// Like ASSERT_DEATH, but continues on to successive tests in the
-// test case, if any:
-# define EXPECT_DEATH(statement, regex) \
-    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
-
-// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
-
-// Tests that an exit code describes a normal exit with a given exit code.
-class GTEST_API_ ExitedWithCode {
- public:
-  explicit ExitedWithCode(int exit_code);
-  bool operator()(int exit_status) const;
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ExitedWithCode& other);
-
-  const int exit_code_;
-};
-
-# if !GTEST_OS_WINDOWS
-// Tests that an exit code describes an exit due to termination by a
-// given signal.
-class GTEST_API_ KilledBySignal {
- public:
-  explicit KilledBySignal(int signum);
-  bool operator()(int exit_status) const;
- private:
-  const int signum_;
-};
-# endif  // !GTEST_OS_WINDOWS
-
-// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
-// The death testing framework causes this to have interesting semantics,
-// since the sideeffects of the call are only visible in opt mode, and not
-// in debug mode.
-//
-// In practice, this can be used to test functions that utilize the
-// LOG(DFATAL) macro using the following style:
-//
-// int DieInDebugOr12(int* sideeffect) {
-//   if (sideeffect) {
-//     *sideeffect = 12;
-//   }
-//   LOG(DFATAL) << "death";
-//   return 12;
-// }
-//
-// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
-//   int sideeffect = 0;
-//   // Only asserts in dbg.
-//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
-//
-// #ifdef NDEBUG
-//   // opt-mode has sideeffect visible.
-//   EXPECT_EQ(12, sideeffect);
-// #else
-//   // dbg-mode no visible sideeffect.
-//   EXPECT_EQ(0, sideeffect);
-// #endif
-// }
-//
-// This will assert that DieInDebugReturn12InOpt() crashes in debug
-// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
-// appropriate fallback value (12 in this case) in opt mode. If you
-// need to test that a function has appropriate side-effects in opt
-// mode, include assertions against the side-effects.  A general
-// pattern for this is:
-//
-// EXPECT_DEBUG_DEATH({
-//   // Side-effects here will have an effect after this statement in
-//   // opt mode, but none in debug mode.
-//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
-// }, "death");
-//
-# ifdef NDEBUG
-
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
-  GTEST_EXECUTE_STATEMENT_(statement, regex)
-
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
-  GTEST_EXECUTE_STATEMENT_(statement, regex)
-
-# else
-
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
-  EXPECT_DEATH(statement, regex)
-
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
-  ASSERT_DEATH(statement, regex)
-
-# endif  // NDEBUG for EXPECT_DEBUG_DEATH
-#endif  // GTEST_HAS_DEATH_TEST
-
-// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
-// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
-// death tests are supported; otherwise they just issue a warning.  This is
-// useful when you are combining death test assertions with normal test
-// assertions in one test.
-#if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    ASSERT_DEATH(statement, regex)
-#else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
-#endif
-
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-// This file was GENERATED by command:
-//     pump.py gtest-param-test.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: vladl@google.com (Vlad Losev)
-//
-// Macros and functions for implementing parameterized tests
-// in Google C++ Testing Framework (Google Test)
-//
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-
-
-// Value-parameterized tests allow you to test your code with different
-// parameters without writing multiple copies of the same test.
-//
-// Here is how you use value-parameterized tests:
-
-#if 0
-
-// To write value-parameterized tests, first you should define a fixture
-// class. It is usually derived from testing::TestWithParam<T> (see below for
-// another inheritance scheme that's sometimes useful in more complicated
-// class hierarchies), where the type of your parameter values.
-// TestWithParam<T> is itself derived from testing::Test. T can be any
-// copyable type. If it's a raw pointer, you are responsible for managing the
-// lifespan of the pointed values.
-
-class FooTest : public ::testing::TestWithParam<const char*> {
-  // You can implement all the usual class fixture members here.
-};
-
-// Then, use the TEST_P macro to define as many parameterized tests
-// for this fixture as you want. The _P suffix is for "parameterized"
-// or "pattern", whichever you prefer to think.
-
-TEST_P(FooTest, DoesBlah) {
-  // Inside a test, access the test parameter with the GetParam() method
-  // of the TestWithParam<T> class:
-  EXPECT_TRUE(foo.Blah(GetParam()));
-  ...
-}
-
-TEST_P(FooTest, HasBlahBlah) {
-  ...
-}
-
-// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
-// case with any set of parameters you want. Google Test defines a number
-// of functions for generating test parameters. They return what we call
-// (surprise!) parameter generators. Here is a  summary of them, which
-// are all in the testing namespace:
-//
-//
-//  Range(begin, end [, step]) - Yields values {begin, begin+step,
-//                               begin+step+step, ...}. The values do not
-//                               include end. step defaults to 1.
-//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
-//  ValuesIn(container)        - Yields values from a C-style array, an STL
-//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
-//  Bool()                     - Yields sequence {false, true}.
-//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
-//                               for the math savvy) of the values generated
-//                               by the N generators.
-//
-// For more details, see comments at the definitions of these functions below
-// in this file.
-//
-// The following statement will instantiate tests from the FooTest test case
-// each with parameter values "meeny", "miny", and "moe".
-
-INSTANTIATE_TEST_CASE_P(InstantiationName,
-                        FooTest,
-                        Values("meeny", "miny", "moe"));
-
-// To distinguish different instances of the pattern, (yes, you
-// can instantiate it more then once) the first argument to the
-// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
-// actual test case name. Remember to pick unique prefixes for different
-// instantiations. The tests from the instantiation above will have
-// these names:
-//
-//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
-//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
-//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
-//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
-//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
-//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
-//
-// You can use these names in --gtest_filter.
-//
-// This statement will instantiate all tests from FooTest again, each
-// with parameter values "cat" and "dog":
-
-const char* pets[] = {"cat", "dog"};
-INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
-
-// The tests from the instantiation above will have these names:
-//
-//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
-//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
-//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
-//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
-//
-// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
-// in the given test case, whether their definitions come before or
-// AFTER the INSTANTIATE_TEST_CASE_P statement.
-//
-// Please also note that generator expressions (including parameters to the
-// generators) are evaluated in InitGoogleTest(), after main() has started.
-// This allows the user on one hand, to adjust generator parameters in order
-// to dynamically determine a set of tests to run and on the other hand,
-// give the user a chance to inspect the generated tests with Google Test
-// reflection API before RUN_ALL_TESTS() is executed.
-//
-// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
-// for more examples.
-//
-// In the future, we plan to publish the API for defining new parameter
-// generators. But for now this interface remains part of the internal
-// implementation and is subject to change.
-//
-//
-// A parameterized test fixture must be derived from testing::Test and from
-// testing::WithParamInterface<T>, where T is the type of the parameter
-// values. Inheriting from TestWithParam<T> satisfies that requirement because
-// TestWithParam<T> inherits from both Test and WithParamInterface. In more
-// complicated hierarchies, however, it is occasionally useful to inherit
-// separately from Test and WithParamInterface. For example:
-
-class BaseTest : public ::testing::Test {
-  // You can inherit all the usual members for a non-parameterized test
-  // fixture here.
-};
-
-class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
-  // The usual test fixture members go here too.
-};
-
-TEST_F(BaseTest, HasFoo) {
-  // This is an ordinary non-parameterized test.
-}
-
-TEST_P(DerivedTest, DoesBlah) {
-  // GetParam works just the same here as if you inherit from TestWithParam.
-  EXPECT_TRUE(foo.Blah(GetParam()));
-}
-
-#endif  // 0
-
-
-#if !GTEST_OS_SYMBIAN
-# include <utility>
-#endif
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vladl@google.com (Vlad Losev)
-
-// Type and function utilities for implementing parameterized tests.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-
-#include <iterator>
-#include <utility>
-#include <vector>
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
-// Copyright 2003 Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: Dan Egnor (egnor@google.com)
-//
-// A "smart" pointer type with reference tracking.  Every pointer to a
-// particular object is kept on a circular linked list.  When the last pointer
-// to an object is destroyed or reassigned, the object is deleted.
-//
-// Used properly, this deletes the object when the last reference goes away.
-// There are several caveats:
-// - Like all reference counting schemes, cycles lead to leaks.
-// - Each smart pointer is actually two pointers (8 bytes instead of 4).
-// - Every time a pointer is assigned, the entire list of pointers to that
-//   object is traversed.  This class is therefore NOT SUITABLE when there
-//   will often be more than two or three pointers to a particular object.
-// - References are only tracked as long as linked_ptr<> objects are copied.
-//   If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
-//   will happen (double deletion).
-//
-// A good use of this class is storing object references in STL containers.
-// You can safely put linked_ptr<> in a vector<>.
-// Other uses may not be as good.
-//
-// Note: If you use an incomplete type with linked_ptr<>, the class
-// *containing* linked_ptr<> must have a constructor and destructor (even
-// if they do nothing!).
-//
-// Bill Gibbons suggested we use something like this.
-//
-// Thread Safety:
-//   Unlike other linked_ptr implementations, in this implementation
-//   a linked_ptr object is thread-safe in the sense that:
-//     - it's safe to copy linked_ptr objects concurrently,
-//     - it's safe to copy *from* a linked_ptr and read its underlying
-//       raw pointer (e.g. via get()) concurrently, and
-//     - it's safe to write to two linked_ptrs that point to the same
-//       shared object concurrently.
-// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
-// confusion with normal linked_ptr.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
-
-#include <stdlib.h>
-#include <assert.h>
-
-
-namespace testing {
-namespace internal {
-
-// Protects copying of all linked_ptr objects.
-GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
-
-// This is used internally by all instances of linked_ptr<>.  It needs to be
-// a non-template class because different types of linked_ptr<> can refer to
-// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
-// So, it needs to be possible for different types of linked_ptr to participate
-// in the same circular linked list, so we need a single class type here.
-//
-// DO NOT USE THIS CLASS DIRECTLY YOURSELF.  Use linked_ptr<T>.
-class linked_ptr_internal {
- public:
-  // Create a new circle that includes only this instance.
-  void join_new() {
-    next_ = this;
-  }
-
-  // Many linked_ptr operations may change p.link_ for some linked_ptr
-  // variable p in the same circle as this object.  Therefore we need
-  // to prevent two such operations from occurring concurrently.
-  //
-  // Note that different types of linked_ptr objects can coexist in a
-  // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
-  // linked_ptr<Derived2>).  Therefore we must use a single mutex to
-  // protect all linked_ptr objects.  This can create serious
-  // contention in production code, but is acceptable in a testing
-  // framework.
-
-  // Join an existing circle.
-  void join(linked_ptr_internal const* ptr)
-      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
-    MutexLock lock(&g_linked_ptr_mutex);
-
-    linked_ptr_internal const* p = ptr;
-    while (p->next_ != ptr) p = p->next_;
-    p->next_ = this;
-    next_ = ptr;
-  }
-
-  // Leave whatever circle we're part of.  Returns true if we were the
-  // last member of the circle.  Once this is done, you can join() another.
-  bool depart()
-      GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
-    MutexLock lock(&g_linked_ptr_mutex);
-
-    if (next_ == this) return true;
-    linked_ptr_internal const* p = next_;
-    while (p->next_ != this) p = p->next_;
-    p->next_ = next_;
-    return false;
-  }
-
- private:
-  mutable linked_ptr_internal const* next_;
-};
-
-template <typename T>
-class linked_ptr {
- public:
-  typedef T element_type;
-
-  // Take over ownership of a raw pointer.  This should happen as soon as
-  // possible after the object is created.
-  explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
-  ~linked_ptr() { depart(); }
-
-  // Copy an existing linked_ptr<>, adding ourselves to the list of references.
-  template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
-  linked_ptr(linked_ptr const& ptr) {  // NOLINT
-    assert(&ptr != this);
-    copy(&ptr);
-  }
-
-  // Assignment releases the old value and acquires the new.
-  template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
-    depart();
-    copy(&ptr);
-    return *this;
-  }
-
-  linked_ptr& operator=(linked_ptr const& ptr) {
-    if (&ptr != this) {
-      depart();
-      copy(&ptr);
-    }
-    return *this;
-  }
-
-  // Smart pointer members.
-  void reset(T* ptr = NULL) {
-    depart();
-    capture(ptr);
-  }
-  T* get() const { return value_; }
-  T* operator->() const { return value_; }
-  T& operator*() const { return *value_; }
-
-  bool operator==(T* p) const { return value_ == p; }
-  bool operator!=(T* p) const { return value_ != p; }
-  template <typename U>
-  bool operator==(linked_ptr<U> const& ptr) const {
-    return value_ == ptr.get();
-  }
-  template <typename U>
-  bool operator!=(linked_ptr<U> const& ptr) const {
-    return value_ != ptr.get();
-  }
-
- private:
-  template <typename U>
-  friend class linked_ptr;
-
-  T* value_;
-  linked_ptr_internal link_;
-
-  void depart() {
-    if (link_.depart()) delete value_;
-  }
-
-  void capture(T* ptr) {
-    value_ = ptr;
-    link_.join_new();
-  }
-
-  template <typename U> void copy(linked_ptr<U> const* ptr) {
-    value_ = ptr->get();
-    if (value_)
-      link_.join(&ptr->link_);
-    else
-      link_.join_new();
-  }
-};
-
-template<typename T> inline
-bool operator==(T* ptr, const linked_ptr<T>& x) {
-  return ptr == x.get();
-}
-
-template<typename T> inline
-bool operator!=(T* ptr, const linked_ptr<T>& x) {
-  return ptr != x.get();
-}
-
-// A function to convert T* into linked_ptr<T>
-// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
-// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
-template <typename T>
-linked_ptr<T> make_linked_ptr(T* ptr) {
-  return linked_ptr<T>(ptr);
-}
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Google Test - The Google C++ Testing Framework
-//
-// This file implements a universal value printer that can print a
-// value of any type T:
-//
-//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
-//
-// A user can teach this function how to print a class type T by
-// defining either operator<<() or PrintTo() in the namespace that
-// defines T.  More specifically, the FIRST defined function in the
-// following list will be used (assuming T is defined in namespace
-// foo):
-//
-//   1. foo::PrintTo(const T&, ostream*)
-//   2. operator<<(ostream&, const T&) defined in either foo or the
-//      global namespace.
-//
-// If none of the above is defined, it will print the debug string of
-// the value if it is a protocol buffer, or print the raw bytes in the
-// value otherwise.
-//
-// To aid debugging: when T is a reference type, the address of the
-// value is also printed; when T is a (const) char pointer, both the
-// pointer value and the NUL-terminated string it points to are
-// printed.
-//
-// We also provide some convenient wrappers:
-//
-//   // Prints a value to a string.  For a (const or not) char
-//   // pointer, the NUL-terminated string (but not the pointer) is
-//   // printed.
-//   std::string ::testing::PrintToString(const T& value);
-//
-//   // Prints a value tersely: for a reference type, the referenced
-//   // value (but not the address) is printed; for a (const or not) char
-//   // pointer, the NUL-terminated string (but not the pointer) is
-//   // printed.
-//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
-//
-//   // Prints value using the type inferred by the compiler.  The difference
-//   // from UniversalTersePrint() is that this function prints both the
-//   // pointer and the NUL-terminated string for a (const or not) char pointer.
-//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
-//
-//   // Prints the fields of a tuple tersely to a string vector, one
-//   // element for each field. Tuple support must be enabled in
-//   // gtest-port.h.
-//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
-//       const Tuple& value);
-//
-// Known limitation:
-//
-// The print primitives print the elements of an STL-style container
-// using the compiler-inferred type of *iter where iter is a
-// const_iterator of the container.  When const_iterator is an input
-// iterator but not a forward iterator, this inferred type may not
-// match value_type, and the print output may be incorrect.  In
-// practice, this is rarely a problem as for most containers
-// const_iterator is a forward iterator.  We'll fix this if there's an
-// actual need for it.  Note that this fix cannot rely on value_type
-// being defined as many user-defined container types don't have
-// value_type.
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-
-#include <ostream>  // NOLINT
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace testing {
-
-// Definitions in the 'internal' and 'internal2' name spaces are
-// subject to change without notice.  DO NOT USE THEM IN USER CODE!
-namespace internal2 {
-
-// Prints the given number of bytes in the given object to the given
-// ostream.
-GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
-                                     size_t count,
-                                     ::std::ostream* os);
-
-// For selecting which printer to use when a given type has neither <<
-// nor PrintTo().
-enum TypeKind {
-  kProtobuf,              // a protobuf type
-  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
-                          // (e.g. a named or unnamed enum type)
-  kOtherType              // anything else
-};
-
-// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
-// by the universal printer to print a value of type T when neither
-// operator<< nor PrintTo() is defined for T, where kTypeKind is the
-// "kind" of T as defined by enum TypeKind.
-template <typename T, TypeKind kTypeKind>
-class TypeWithoutFormatter {
- public:
-  // This default version is called when kTypeKind is kOtherType.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    PrintBytesInObjectTo(reinterpret_cast<const unsigned char*>(&value),
-                         sizeof(value), os);
-  }
-};
-
-// We print a protobuf using its ShortDebugString() when the string
-// doesn't exceed this many characters; otherwise we print it using
-// DebugString() for better readability.
-const size_t kProtobufOneLinerMaxLength = 50;
-
-template <typename T>
-class TypeWithoutFormatter<T, kProtobuf> {
- public:
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    const ::testing::internal::string short_str = value.ShortDebugString();
-    const ::testing::internal::string pretty_str =
-        short_str.length() <= kProtobufOneLinerMaxLength ?
-        short_str : ("\n" + value.DebugString());
-    *os << ("<" + pretty_str + ">");
-  }
-};
-
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToInteger> {
- public:
-  // Since T has no << operator or PrintTo() but can be implicitly
-  // converted to BiggestInt, we print it as a BiggestInt.
-  //
-  // Most likely T is an enum type (either named or unnamed), in which
-  // case printing it as an integer is the desired behavior.  In case
-  // T is not an enum, printing it as an integer is the best we can do
-  // given that it has no user-defined printer.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    const internal::BiggestInt kBigInt = value;
-    *os << kBigInt;
-  }
-};
-
-// Prints the given value to the given ostream.  If the value is a
-// protocol message, its debug string is printed; if it's an enum or
-// of a type implicitly convertible to BiggestInt, it's printed as an
-// integer; otherwise the bytes in the value are printed.  This is
-// what UniversalPrinter<T>::Print() does when it knows nothing about
-// type T and T has neither << operator nor PrintTo().
-//
-// A user can override this behavior for a class type Foo by defining
-// a << operator in the namespace where Foo is defined.
-//
-// We put this operator in namespace 'internal2' instead of 'internal'
-// to simplify the implementation, as much code in 'internal' needs to
-// use << in STL, which would conflict with our own << were it defined
-// in 'internal'.
-//
-// Note that this operator<< takes a generic std::basic_ostream<Char,
-// CharTraits> type instead of the more restricted std::ostream.  If
-// we define it to take an std::ostream instead, we'll get an
-// "ambiguous overloads" compiler error when trying to print a type
-// Foo that supports streaming to std::basic_ostream<Char,
-// CharTraits>, as the compiler cannot tell whether
-// operator<<(std::ostream&, const T&) or
-// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
-// specific.
-template <typename Char, typename CharTraits, typename T>
-::std::basic_ostream<Char, CharTraits>& operator<<(
-    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
-  TypeWithoutFormatter<T,
-      (internal::IsAProtocolMessage<T>::value ? kProtobuf :
-       internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
-       kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
-  return os;
-}
-
-}  // namespace internal2
-}  // namespace testing
-
-// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
-// magic needed for implementing UniversalPrinter won't work.
-namespace testing_internal {
-
-// Used to print a value that is not an STL-style container when the
-// user doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
-  // With the following statement, during unqualified name lookup,
-  // testing::internal2::operator<< appears as if it was declared in
-  // the nearest enclosing namespace that contains both
-  // ::testing_internal and ::testing::internal2, i.e. the global
-  // namespace.  For more details, refer to the C++ Standard section
-  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
-  // testing::internal2::operator<< in case T doesn't come with a <<
-  // operator.
-  //
-  // We cannot write 'using ::testing::internal2::operator<<;', which
-  // gcc 3.3 fails to compile due to a compiler bug.
-  using namespace ::testing::internal2;  // NOLINT
-
-  // Assuming T is defined in namespace foo, in the next statement,
-  // the compiler will consider all of:
-  //
-  //   1. foo::operator<< (thanks to Koenig look-up),
-  //   2. ::operator<< (as the current namespace is enclosed in ::),
-  //   3. testing::internal2::operator<< (thanks to the using statement above).
-  //
-  // The operator<< whose type matches T best will be picked.
-  //
-  // We deliberately allow #2 to be a candidate, as sometimes it's
-  // impossible to define #1 (e.g. when foo is ::std, defining
-  // anything in it is undefined behavior unless you are a compiler
-  // vendor.).
-  *os << value;
-}
-
-}  // namespace testing_internal
-
-namespace testing {
-namespace internal {
-
-// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
-// value to the given ostream.  The caller must ensure that
-// 'ostream_ptr' is not NULL, or the behavior is undefined.
-//
-// We define UniversalPrinter as a class template (as opposed to a
-// function template), as we need to partially specialize it for
-// reference types, which cannot be done with function templates.
-template <typename T>
-class UniversalPrinter;
-
-template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os);
-
-// Used to print an STL-style container when the user doesn't define
-// a PrintTo() for it.
-template <typename C>
-void DefaultPrintTo(IsContainer /* dummy */,
-                    false_type /* is not a pointer */,
-                    const C& container, ::std::ostream* os) {
-  const size_t kMaxCount = 32;  // The maximum number of elements to print.
-  *os << '{';
-  size_t count = 0;
-  for (typename C::const_iterator it = container.begin();
-       it != container.end(); ++it, ++count) {
-    if (count > 0) {
-      *os << ',';
-      if (count == kMaxCount) {  // Enough has been printed.
-        *os << " ...";
-        break;
-      }
-    }
-    *os << ' ';
-    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
-    // handle *it being a native array.
-    internal::UniversalPrint(*it, os);
-  }
-
-  if (count > 0) {
-    *os << ' ';
-  }
-  *os << '}';
-}
-
-// Used to print a pointer that is neither a char pointer nor a member
-// pointer, when the user doesn't define PrintTo() for it.  (A member
-// variable pointer or member function pointer doesn't really point to
-// a location in the address space.  Their representation is
-// implementation-defined.  Therefore they will be printed as raw
-// bytes.)
-template <typename T>
-void DefaultPrintTo(IsNotContainer /* dummy */,
-                    true_type /* is a pointer */,
-                    T* p, ::std::ostream* os) {
-  if (p == NULL) {
-    *os << "NULL";
-  } else {
-    // C++ doesn't allow casting from a function pointer to any object
-    // pointer.
-    //
-    // IsTrue() silences warnings: "Condition is always true",
-    // "unreachable code".
-    if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) {
-      // T is not a function type.  We just call << to print p,
-      // relying on ADL to pick up user-defined << for their pointer
-      // types, if any.
-      *os << p;
-    } else {
-      // T is a function type, so '*os << p' doesn't do what we want
-      // (it just prints p as bool).  We want to print p as a const
-      // void*.  However, we cannot cast it to const void* directly,
-      // even using reinterpret_cast, as earlier versions of gcc
-      // (e.g. 3.4.5) cannot compile the cast when p is a function
-      // pointer.  Casting to UInt64 first solves the problem.
-      *os << reinterpret_cast<const void*>(
-          reinterpret_cast<internal::UInt64>(p));
-    }
-  }
-}
-
-// Used to print a non-container, non-pointer value when the user
-// doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintTo(IsNotContainer /* dummy */,
-                    false_type /* is not a pointer */,
-                    const T& value, ::std::ostream* os) {
-  ::testing_internal::DefaultPrintNonContainerTo(value, os);
-}
-
-// Prints the given value using the << operator if it has one;
-// otherwise prints the bytes in it.  This is what
-// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
-// or overloaded for type T.
-//
-// A user can override this behavior for a class type Foo by defining
-// an overload of PrintTo() in the namespace where Foo is defined.  We
-// give the user this option as sometimes defining a << operator for
-// Foo is not desirable (e.g. the coding style may prevent doing it,
-// or there is already a << operator but it doesn't do what the user
-// wants).
-template <typename T>
-void PrintTo(const T& value, ::std::ostream* os) {
-  // DefaultPrintTo() is overloaded.  The type of its first two
-  // arguments determine which version will be picked.  If T is an
-  // STL-style container, the version for container will be called; if
-  // T is a pointer, the pointer version will be called; otherwise the
-  // generic version will be called.
-  //
-  // Note that we check for container types here, prior to we check
-  // for protocol message types in our operator<<.  The rationale is:
-  //
-  // For protocol messages, we want to give people a chance to
-  // override Google Mock's format by defining a PrintTo() or
-  // operator<<.  For STL containers, other formats can be
-  // incompatible with Google Mock's format for the container
-  // elements; therefore we check for container types here to ensure
-  // that our format is used.
-  //
-  // The second argument of DefaultPrintTo() is needed to bypass a bug
-  // in Symbian's C++ compiler that prevents it from picking the right
-  // overload between:
-  //
-  //   PrintTo(const T& x, ...);
-  //   PrintTo(T* x, ...);
-  DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
-}
-
-// The following list of PrintTo() overloads tells
-// UniversalPrinter<T>::Print() how to print standard types (built-in
-// types, strings, plain arrays, and pointers).
-
-// Overloads for various char types.
-GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
-GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
-inline void PrintTo(char c, ::std::ostream* os) {
-  // When printing a plain char, we always treat it as unsigned.  This
-  // way, the output won't be affected by whether the compiler thinks
-  // char is signed or not.
-  PrintTo(static_cast<unsigned char>(c), os);
-}
-
-// Overloads for other simple built-in types.
-inline void PrintTo(bool x, ::std::ostream* os) {
-  *os << (x ? "true" : "false");
-}
-
-// Overload for wchar_t type.
-// Prints a wchar_t as a symbol if it is printable or as its internal
-// code otherwise and also as its decimal code (except for L'\0').
-// The L'\0' char is printed as "L'\\0'". The decimal code is printed
-// as signed integer when wchar_t is implemented by the compiler
-// as a signed type and is printed as an unsigned integer when wchar_t
-// is implemented as an unsigned type.
-GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
-
-// Overloads for C strings.
-GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
-inline void PrintTo(char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const char*>(s), os);
-}
-
-// signed/unsigned char is often used for representing binary data, so
-// we print pointers to it as void* to be safe.
-inline void PrintTo(const signed char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-inline void PrintTo(signed char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-inline void PrintTo(unsigned char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-
-// MSVC can be configured to define wchar_t as a typedef of unsigned
-// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
-// type.  When wchar_t is a typedef, defining an overload for const
-// wchar_t* would cause unsigned short* be printed as a wide string,
-// possibly causing invalid memory accesses.
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
-// Overloads for wide C strings
-GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
-inline void PrintTo(wchar_t* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
-}
-#endif
-
-// Overload for C arrays.  Multi-dimensional arrays are printed
-// properly.
-
-// Prints the given number of elements in an array, without printing
-// the curly braces.
-template <typename T>
-void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
-  UniversalPrint(a[0], os);
-  for (size_t i = 1; i != count; i++) {
-    *os << ", ";
-    UniversalPrint(a[i], os);
-  }
-}
-
-// Overloads for ::string and ::std::string.
-#if GTEST_HAS_GLOBAL_STRING
-GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
-inline void PrintTo(const ::string& s, ::std::ostream* os) {
-  PrintStringTo(s, os);
-}
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
-inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
-  PrintStringTo(s, os);
-}
-
-// Overloads for ::wstring and ::std::wstring.
-#if GTEST_HAS_GLOBAL_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
-inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
-  PrintWideStringTo(s, os);
-}
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-#if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
-inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
-  PrintWideStringTo(s, os);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-#if GTEST_HAS_TR1_TUPLE
-// Overload for ::std::tr1::tuple.  Needed for printing function arguments,
-// which are packed as tuples.
-
-// Helper function for printing a tuple.  T must be instantiated with
-// a tuple type.
-template <typename T>
-void PrintTupleTo(const T& t, ::std::ostream* os);
-
-// Overloaded PrintTo() for tuples of various arities.  We support
-// tuples of up-to 10 fields.  The following implementation works
-// regardless of whether tr1::tuple is implemented using the
-// non-standard variadic template feature or not.
-
-inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1>
-void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2>
-void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7, typename T8>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7, typename T8, typename T9>
-void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
-             ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-          typename T6, typename T7, typename T8, typename T9, typename T10>
-void PrintTo(
-    const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
-    ::std::ostream* os) {
-  PrintTupleTo(t, os);
-}
-#endif  // GTEST_HAS_TR1_TUPLE
-
-// Overload for std::pair.
-template <typename T1, typename T2>
-void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
-  *os << '(';
-  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
-  // a reference type.  The same for printing value.second.
-  UniversalPrinter<T1>::Print(value.first, os);
-  *os << ", ";
-  UniversalPrinter<T2>::Print(value.second, os);
-  *os << ')';
-}
-
-// Implements printing a non-reference type T by letting the compiler
-// pick the right overload of PrintTo() for T.
-template <typename T>
-class UniversalPrinter {
- public:
-  // MSVC warns about adding const to a function type, so we want to
-  // disable the warning.
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4180)  // Temporarily disables warning 4180.
-#endif  // _MSC_VER
-
-  // Note: we deliberately don't call this PrintTo(), as that name
-  // conflicts with ::testing::internal::PrintTo in the body of the
-  // function.
-  static void Print(const T& value, ::std::ostream* os) {
-    // By default, ::testing::internal::PrintTo() is used for printing
-    // the value.
-    //
-    // Thanks to Koenig look-up, if T is a class and has its own
-    // PrintTo() function defined in its namespace, that function will
-    // be visible here.  Since it is more specific than the generic ones
-    // in ::testing::internal, it will be picked by the compiler in the
-    // following statement - exactly what we want.
-    PrintTo(value, os);
-  }
-
-#ifdef _MSC_VER
-# pragma warning(pop)           // Restores the warning state.
-#endif  // _MSC_VER
-};
-
-// UniversalPrintArray(begin, len, os) prints an array of 'len'
-// elements, starting at address 'begin'.
-template <typename T>
-void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
-  if (len == 0) {
-    *os << "{}";
-  } else {
-    *os << "{ ";
-    const size_t kThreshold = 18;
-    const size_t kChunkSize = 8;
-    // If the array has more than kThreshold elements, we'll have to
-    // omit some details by printing only the first and the last
-    // kChunkSize elements.
-    // TODO(wan@google.com): let the user control the threshold using a flag.
-    if (len <= kThreshold) {
-      PrintRawArrayTo(begin, len, os);
-    } else {
-      PrintRawArrayTo(begin, kChunkSize, os);
-      *os << ", ..., ";
-      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
-    }
-    *os << " }";
-  }
-}
-// This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const char* begin, size_t len, ::std::ostream* os);
-
-// This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const wchar_t* begin, size_t len, ::std::ostream* os);
-
-// Implements printing an array type T[N].
-template <typename T, size_t N>
-class UniversalPrinter<T[N]> {
- public:
-  // Prints the given array, omitting some elements when there are too
-  // many.
-  static void Print(const T (&a)[N], ::std::ostream* os) {
-    UniversalPrintArray(a, N, os);
-  }
-};
-
-// Implements printing a reference type T&.
-template <typename T>
-class UniversalPrinter<T&> {
- public:
-  // MSVC warns about adding const to a function type, so we want to
-  // disable the warning.
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4180)  // Temporarily disables warning 4180.
-#endif  // _MSC_VER
-
-  static void Print(const T& value, ::std::ostream* os) {
-    // Prints the address of the value.  We use reinterpret_cast here
-    // as static_cast doesn't compile when T is a function type.
-    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
-
-    // Then prints the value itself.
-    UniversalPrint(value, os);
-  }
-
-#ifdef _MSC_VER
-# pragma warning(pop)           // Restores the warning state.
-#endif  // _MSC_VER
-};
-
-// Prints a value tersely: for a reference type, the referenced value
-// (but not the address) is printed; for a (const) char pointer, the
-// NUL-terminated string (but not the pointer) is printed.
-
-template <typename T>
-class UniversalTersePrinter {
- public:
-  static void Print(const T& value, ::std::ostream* os) {
-    UniversalPrint(value, os);
-  }
-};
-template <typename T>
-class UniversalTersePrinter<T&> {
- public:
-  static void Print(const T& value, ::std::ostream* os) {
-    UniversalPrint(value, os);
-  }
-};
-template <typename T, size_t N>
-class UniversalTersePrinter<T[N]> {
- public:
-  static void Print(const T (&value)[N], ::std::ostream* os) {
-    UniversalPrinter<T[N]>::Print(value, os);
-  }
-};
-template <>
-class UniversalTersePrinter<const char*> {
- public:
-  static void Print(const char* str, ::std::ostream* os) {
-    if (str == NULL) {
-      *os << "NULL";
-    } else {
-      UniversalPrint(string(str), os);
-    }
-  }
-};
-template <>
-class UniversalTersePrinter<char*> {
- public:
-  static void Print(char* str, ::std::ostream* os) {
-    UniversalTersePrinter<const char*>::Print(str, os);
-  }
-};
-
-#if GTEST_HAS_STD_WSTRING
-template <>
-class UniversalTersePrinter<const wchar_t*> {
- public:
-  static void Print(const wchar_t* str, ::std::ostream* os) {
-    if (str == NULL) {
-      *os << "NULL";
-    } else {
-      UniversalPrint(::std::wstring(str), os);
-    }
-  }
-};
-#endif
-
-template <>
-class UniversalTersePrinter<wchar_t*> {
- public:
-  static void Print(wchar_t* str, ::std::ostream* os) {
-    UniversalTersePrinter<const wchar_t*>::Print(str, os);
-  }
-};
-
-template <typename T>
-void UniversalTersePrint(const T& value, ::std::ostream* os) {
-  UniversalTersePrinter<T>::Print(value, os);
-}
-
-// Prints a value using the type inferred by the compiler.  The
-// difference between this and UniversalTersePrint() is that for a
-// (const) char pointer, this prints both the pointer and the
-// NUL-terminated string.
-template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os) {
-  // A workarond for the bug in VC++ 7.1 that prevents us from instantiating
-  // UniversalPrinter with T directly.
-  typedef T T1;
-  UniversalPrinter<T1>::Print(value, os);
-}
-
-#if GTEST_HAS_TR1_TUPLE
-typedef ::std::vector<string> Strings;
-
-// This helper template allows PrintTo() for tuples and
-// UniversalTersePrintTupleFieldsToStrings() to be defined by
-// induction on the number of tuple fields.  The idea is that
-// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
-// fields in tuple t, and can be defined in terms of
-// TuplePrefixPrinter<N - 1>.
-
-// The inductive case.
-template <size_t N>
-struct TuplePrefixPrinter {
-  // Prints the first N fields of a tuple.
-  template <typename Tuple>
-  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
-    TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
-    *os << ", ";
-    UniversalPrinter<typename ::std::tr1::tuple_element<N - 1, Tuple>::type>
-        ::Print(::std::tr1::get<N - 1>(t), os);
-  }
-
-  // Tersely prints the first N fields of a tuple to a string vector,
-  // one element for each field.
-  template <typename Tuple>
-  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
-    TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
-    ::std::stringstream ss;
-    UniversalTersePrint(::std::tr1::get<N - 1>(t), &ss);
-    strings->push_back(ss.str());
-  }
-};
-
-// Base cases.
-template <>
-struct TuplePrefixPrinter<0> {
-  template <typename Tuple>
-  static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
-
-  template <typename Tuple>
-  static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
-};
-// We have to specialize the entire TuplePrefixPrinter<> class
-// template here, even though the definition of
-// TersePrintPrefixToStrings() is the same as the generic version, as
-// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't
-// support specializing a method template of a class template.
-template <>
-struct TuplePrefixPrinter<1> {
-  template <typename Tuple>
-  static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
-    UniversalPrinter<typename ::std::tr1::tuple_element<0, Tuple>::type>::
-        Print(::std::tr1::get<0>(t), os);
-  }
-
-  template <typename Tuple>
-  static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
-    ::std::stringstream ss;
-    UniversalTersePrint(::std::tr1::get<0>(t), &ss);
-    strings->push_back(ss.str());
-  }
-};
-
-// Helper function for printing a tuple.  T must be instantiated with
-// a tuple type.
-template <typename T>
-void PrintTupleTo(const T& t, ::std::ostream* os) {
-  *os << "(";
-  TuplePrefixPrinter< ::std::tr1::tuple_size<T>::value>::
-      PrintPrefixTo(t, os);
-  *os << ")";
-}
-
-// Prints the fields of a tuple tersely to a string vector, one
-// element for each field.  See the comment before
-// UniversalTersePrint() for how we define "tersely".
-template <typename Tuple>
-Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
-  Strings result;
-  TuplePrefixPrinter< ::std::tr1::tuple_size<Tuple>::value>::
-      TersePrintPrefixToStrings(value, &result);
-  return result;
-}
-#endif  // GTEST_HAS_TR1_TUPLE
-
-}  // namespace internal
-
-template <typename T>
-::std::string PrintToString(const T& value) {
-  ::std::stringstream ss;
-  internal::UniversalTersePrinter<T>::Print(value, &ss);
-  return ss.str();
-}
-
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-
-#if GTEST_HAS_PARAM_TEST
-
-namespace testing {
-namespace internal {
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Outputs a message explaining invalid registration of different
-// fixture class for the same test case. This may happen when
-// TEST_P macro is used to define two tests with the same name
-// but in different namespaces.
-GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
-                                          const char* file, int line);
-
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
-
-// Interface for iterating over elements provided by an implementation
-// of ParamGeneratorInterface<T>.
-template <typename T>
-class ParamIteratorInterface {
- public:
-  virtual ~ParamIteratorInterface() {}
-  // A pointer to the base generator instance.
-  // Used only for the purposes of iterator comparison
-  // to make sure that two iterators belong to the same generator.
-  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
-  // Advances iterator to point to the next element
-  // provided by the generator. The caller is responsible
-  // for not calling Advance() on an iterator equal to
-  // BaseGenerator()->End().
-  virtual void Advance() = 0;
-  // Clones the iterator object. Used for implementing copy semantics
-  // of ParamIterator<T>.
-  virtual ParamIteratorInterface* Clone() const = 0;
-  // Dereferences the current iterator and provides (read-only) access
-  // to the pointed value. It is the caller's responsibility not to call
-  // Current() on an iterator equal to BaseGenerator()->End().
-  // Used for implementing ParamGenerator<T>::operator*().
-  virtual const T* Current() const = 0;
-  // Determines whether the given iterator and other point to the same
-  // element in the sequence generated by the generator.
-  // Used for implementing ParamGenerator<T>::operator==().
-  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
-};
-
-// Class iterating over elements provided by an implementation of
-// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
-// and implements the const forward iterator concept.
-template <typename T>
-class ParamIterator {
- public:
-  typedef T value_type;
-  typedef const T& reference;
-  typedef ptrdiff_t difference_type;
-
-  // ParamIterator assumes ownership of the impl_ pointer.
-  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
-  ParamIterator& operator=(const ParamIterator& other) {
-    if (this != &other)
-      impl_.reset(other.impl_->Clone());
-    return *this;
-  }
-
-  const T& operator*() const { return *impl_->Current(); }
-  const T* operator->() const { return impl_->Current(); }
-  // Prefix version of operator++.
-  ParamIterator& operator++() {
-    impl_->Advance();
-    return *this;
-  }
-  // Postfix version of operator++.
-  ParamIterator operator++(int /*unused*/) {
-    ParamIteratorInterface<T>* clone = impl_->Clone();
-    impl_->Advance();
-    return ParamIterator(clone);
-  }
-  bool operator==(const ParamIterator& other) const {
-    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
-  }
-  bool operator!=(const ParamIterator& other) const {
-    return !(*this == other);
-  }
-
- private:
-  friend class ParamGenerator<T>;
-  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
-  scoped_ptr<ParamIteratorInterface<T> > impl_;
-};
-
-// ParamGeneratorInterface<T> is the binary interface to access generators
-// defined in other translation units.
-template <typename T>
-class ParamGeneratorInterface {
- public:
-  typedef T ParamType;
-
-  virtual ~ParamGeneratorInterface() {}
-
-  // Generator interface definition
-  virtual ParamIteratorInterface<T>* Begin() const = 0;
-  virtual ParamIteratorInterface<T>* End() const = 0;
-};
-
-// Wraps ParamGeneratorInterface<T> and provides general generator syntax
-// compatible with the STL Container concept.
-// This class implements copy initialization semantics and the contained
-// ParamGeneratorInterface<T> instance is shared among all copies
-// of the original object. This is possible because that instance is immutable.
-template<typename T>
-class ParamGenerator {
- public:
-  typedef ParamIterator<T> iterator;
-
-  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
-  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
-
-  ParamGenerator& operator=(const ParamGenerator& other) {
-    impl_ = other.impl_;
-    return *this;
-  }
-
-  iterator begin() const { return iterator(impl_->Begin()); }
-  iterator end() const { return iterator(impl_->End()); }
-
- private:
-  linked_ptr<const ParamGeneratorInterface<T> > impl_;
-};
-
-// Generates values from a range of two comparable values. Can be used to
-// generate sequences of user-defined types that implement operator+() and
-// operator<().
-// This class is used in the Range() function.
-template <typename T, typename IncrementT>
-class RangeGenerator : public ParamGeneratorInterface<T> {
- public:
-  RangeGenerator(T begin, T end, IncrementT step)
-      : begin_(begin), end_(end),
-        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
-  virtual ~RangeGenerator() {}
-
-  virtual ParamIteratorInterface<T>* Begin() const {
-    return new Iterator(this, begin_, 0, step_);
-  }
-  virtual ParamIteratorInterface<T>* End() const {
-    return new Iterator(this, end_, end_index_, step_);
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<T> {
-   public:
-    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
-             IncrementT step)
-        : base_(base), value_(value), index_(index), step_(step) {}
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
-      return base_;
-    }
-    virtual void Advance() {
-      value_ = value_ + step_;
-      index_++;
-    }
-    virtual ParamIteratorInterface<T>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const T* Current() const { return &value_; }
-    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const int other_index =
-          CheckedDowncastToActualType<const Iterator>(&other)->index_;
-      return index_ == other_index;
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : ParamIteratorInterface<T>(),
-          base_(other.base_), value_(other.value_), index_(other.index_),
-          step_(other.step_) {}
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<T>* const base_;
-    T value_;
-    int index_;
-    const IncrementT step_;
-  };  // class RangeGenerator::Iterator
-
-  static int CalculateEndIndex(const T& begin,
-                               const T& end,
-                               const IncrementT& step) {
-    int end_index = 0;
-    for (T i = begin; i < end; i = i + step)
-      end_index++;
-    return end_index;
-  }
-
-  // No implementation - assignment is unsupported.
-  void operator=(const RangeGenerator& other);
-
-  const T begin_;
-  const T end_;
-  const IncrementT step_;
-  // The index for the end() iterator. All the elements in the generated
-  // sequence are indexed (0-based) to aid iterator comparison.
-  const int end_index_;
-};  // class RangeGenerator
-
-
-// Generates values from a pair of STL-style iterators. Used in the
-// ValuesIn() function. The elements are copied from the source range
-// since the source can be located on the stack, and the generator
-// is likely to persist beyond that stack frame.
-template <typename T>
-class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
- public:
-  template <typename ForwardIterator>
-  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
-      : container_(begin, end) {}
-  virtual ~ValuesInIteratorRangeGenerator() {}
-
-  virtual ParamIteratorInterface<T>* Begin() const {
-    return new Iterator(this, container_.begin());
-  }
-  virtual ParamIteratorInterface<T>* End() const {
-    return new Iterator(this, container_.end());
-  }
-
- private:
-  typedef typename ::std::vector<T> ContainerType;
-
-  class Iterator : public ParamIteratorInterface<T> {
-   public:
-    Iterator(const ParamGeneratorInterface<T>* base,
-             typename ContainerType::const_iterator iterator)
-        : base_(base), iterator_(iterator) {}
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
-      return base_;
-    }
-    virtual void Advance() {
-      ++iterator_;
-      value_.reset();
-    }
-    virtual ParamIteratorInterface<T>* Clone() const {
-      return new Iterator(*this);
-    }
-    // We need to use cached value referenced by iterator_ because *iterator_
-    // can return a temporary object (and of type other then T), so just
-    // having "return &*iterator_;" doesn't work.
-    // value_ is updated here and not in Advance() because Advance()
-    // can advance iterator_ beyond the end of the range, and we cannot
-    // detect that fact. The client code, on the other hand, is
-    // responsible for not calling Current() on an out-of-range iterator.
-    virtual const T* Current() const {
-      if (value_.get() == NULL)
-        value_.reset(new T(*iterator_));
-      return value_.get();
-    }
-    virtual bool Equals(const ParamIteratorInterface<T>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      return iterator_ ==
-          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
-    }
-
-   private:
-    Iterator(const Iterator& other)
-          // The explicit constructor call suppresses a false warning
-          // emitted by gcc when supplied with the -Wextra option.
-        : ParamIteratorInterface<T>(),
-          base_(other.base_),
-          iterator_(other.iterator_) {}
-
-    const ParamGeneratorInterface<T>* const base_;
-    typename ContainerType::const_iterator iterator_;
-    // A cached value of *iterator_. We keep it here to allow access by
-    // pointer in the wrapping iterator's operator->().
-    // value_ needs to be mutable to be accessed in Current().
-    // Use of scoped_ptr helps manage cached value's lifetime,
-    // which is bound by the lifespan of the iterator itself.
-    mutable scoped_ptr<const T> value_;
-  };  // class ValuesInIteratorRangeGenerator::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const ValuesInIteratorRangeGenerator& other);
-
-  const ContainerType container_;
-};  // class ValuesInIteratorRangeGenerator
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Stores a parameter value and later creates tests parameterized with that
-// value.
-template <class TestClass>
-class ParameterizedTestFactory : public TestFactoryBase {
- public:
-  typedef typename TestClass::ParamType ParamType;
-  explicit ParameterizedTestFactory(ParamType parameter) :
-      parameter_(parameter) {}
-  virtual Test* CreateTest() {
-    TestClass::SetParam(&parameter_);
-    return new TestClass();
-  }
-
- private:
-  const ParamType parameter_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// TestMetaFactoryBase is a base class for meta-factories that create
-// test factories for passing into MakeAndRegisterTestInfo function.
-template <class ParamType>
-class TestMetaFactoryBase {
- public:
-  virtual ~TestMetaFactoryBase() {}
-
-  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// TestMetaFactory creates test factories for passing into
-// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
-// ownership of test factory pointer, same factory object cannot be passed
-// into that method twice. But ParameterizedTestCaseInfo is going to call
-// it for each Test/Parameter value combination. Thus it needs meta factory
-// creator class.
-template <class TestCase>
-class TestMetaFactory
-    : public TestMetaFactoryBase<typename TestCase::ParamType> {
- public:
-  typedef typename TestCase::ParamType ParamType;
-
-  TestMetaFactory() {}
-
-  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
-    return new ParameterizedTestFactory<TestCase>(parameter);
-  }
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// ParameterizedTestCaseInfoBase is a generic interface
-// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
-// accumulates test information provided by TEST_P macro invocations
-// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
-// and uses that information to register all resulting test instances
-// in RegisterTests method. The ParameterizeTestCaseRegistry class holds
-// a collection of pointers to the ParameterizedTestCaseInfo objects
-// and calls RegisterTests() on each of them when asked.
-class ParameterizedTestCaseInfoBase {
- public:
-  virtual ~ParameterizedTestCaseInfoBase() {}
-
-  // Base part of test case name for display purposes.
-  virtual const string& GetTestCaseName() const = 0;
-  // Test case id to verify identity.
-  virtual TypeId GetTestCaseTypeId() const = 0;
-  // UnitTest class invokes this method to register tests in this
-  // test case right before running them in RUN_ALL_TESTS macro.
-  // This method should not be called more then once on any single
-  // instance of a ParameterizedTestCaseInfoBase derived class.
-  virtual void RegisterTests() = 0;
-
- protected:
-  ParameterizedTestCaseInfoBase() {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
-// macro invocations for a particular test case and generators
-// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
-// test case. It registers tests with all values generated by all
-// generators when asked.
-template <class TestCase>
-class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
- public:
-  // ParamType and GeneratorCreationFunc are private types but are required
-  // for declarations of public methods AddTestPattern() and
-  // AddTestCaseInstantiation().
-  typedef typename TestCase::ParamType ParamType;
-  // A function that returns an instance of appropriate generator type.
-  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
-
-  explicit ParameterizedTestCaseInfo(const char* name)
-      : test_case_name_(name) {}
-
-  // Test case base name for display purposes.
-  virtual const string& GetTestCaseName() const { return test_case_name_; }
-  // Test case id to verify identity.
-  virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
-  // TEST_P macro uses AddTestPattern() to record information
-  // about a single test in a LocalTestInfo structure.
-  // test_case_name is the base name of the test case (without invocation
-  // prefix). test_base_name is the name of an individual test without
-  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
-  // test case base name and DoBar is test base name.
-  void AddTestPattern(const char* test_case_name,
-                      const char* test_base_name,
-                      TestMetaFactoryBase<ParamType>* meta_factory) {
-    tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
-                                                       test_base_name,
-                                                       meta_factory)));
-  }
-  // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information
-  // about a generator.
-  int AddTestCaseInstantiation(const string& instantiation_name,
-                               GeneratorCreationFunc* func,
-                               const char* /* file */,
-                               int /* line */) {
-    instantiations_.push_back(::std::make_pair(instantiation_name, func));
-    return 0;  // Return value used only to run this method in namespace scope.
-  }
-  // UnitTest class invokes this method to register tests in this test case
-  // test cases right before running tests in RUN_ALL_TESTS macro.
-  // This method should not be called more then once on any single
-  // instance of a ParameterizedTestCaseInfoBase derived class.
-  // UnitTest has a guard to prevent from calling this method more then once.
-  virtual void RegisterTests() {
-    for (typename TestInfoContainer::iterator test_it = tests_.begin();
-         test_it != tests_.end(); ++test_it) {
-      linked_ptr<TestInfo> test_info = *test_it;
-      for (typename InstantiationContainer::iterator gen_it =
-               instantiations_.begin(); gen_it != instantiations_.end();
-               ++gen_it) {
-        const string& instantiation_name = gen_it->first;
-        ParamGenerator<ParamType> generator((*gen_it->second)());
-
-        string test_case_name;
-        if ( !instantiation_name.empty() )
-          test_case_name = instantiation_name + "/";
-        test_case_name += test_info->test_case_base_name;
-
-        int i = 0;
-        for (typename ParamGenerator<ParamType>::iterator param_it =
-                 generator.begin();
-             param_it != generator.end(); ++param_it, ++i) {
-          Message test_name_stream;
-          test_name_stream << test_info->test_base_name << "/" << i;
-          MakeAndRegisterTestInfo(
-              test_case_name.c_str(),
-              test_name_stream.GetString().c_str(),
-              NULL,  // No type parameter.
-              PrintToString(*param_it).c_str(),
-              GetTestCaseTypeId(),
-              TestCase::SetUpTestCase,
-              TestCase::TearDownTestCase,
-              test_info->test_meta_factory->CreateTestFactory(*param_it));
-        }  // for param_it
-      }  // for gen_it
-    }  // for test_it
-  }  // RegisterTests
-
- private:
-  // LocalTestInfo structure keeps information about a single test registered
-  // with TEST_P macro.
-  struct TestInfo {
-    TestInfo(const char* a_test_case_base_name,
-             const char* a_test_base_name,
-             TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
-        test_case_base_name(a_test_case_base_name),
-        test_base_name(a_test_base_name),
-        test_meta_factory(a_test_meta_factory) {}
-
-    const string test_case_base_name;
-    const string test_base_name;
-    const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
-  };
-  typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
-  // Keeps pairs of <Instantiation name, Sequence generator creation function>
-  // received from INSTANTIATE_TEST_CASE_P macros.
-  typedef ::std::vector<std::pair<string, GeneratorCreationFunc*> >
-      InstantiationContainer;
-
-  const string test_case_name_;
-  TestInfoContainer tests_;
-  InstantiationContainer instantiations_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
-};  // class ParameterizedTestCaseInfo
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
-// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
-// macros use it to locate their corresponding ParameterizedTestCaseInfo
-// descriptors.
-class ParameterizedTestCaseRegistry {
- public:
-  ParameterizedTestCaseRegistry() {}
-  ~ParameterizedTestCaseRegistry() {
-    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
-         it != test_case_infos_.end(); ++it) {
-      delete *it;
-    }
-  }
-
-  // Looks up or creates and returns a structure containing information about
-  // tests and instantiations of a particular test case.
-  template <class TestCase>
-  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
-      const char* test_case_name,
-      const char* file,
-      int line) {
-    ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
-    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
-         it != test_case_infos_.end(); ++it) {
-      if ((*it)->GetTestCaseName() == test_case_name) {
-        if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
-          // Complain about incorrect usage of Google Test facilities
-          // and terminate the program since we cannot guaranty correct
-          // test case setup and tear-down in this case.
-          ReportInvalidTestCaseType(test_case_name,  file, line);
-          posix::Abort();
-        } else {
-          // At this point we are sure that the object we found is of the same
-          // type we are looking for, so we downcast it to that type
-          // without further checks.
-          typed_test_info = CheckedDowncastToActualType<
-              ParameterizedTestCaseInfo<TestCase> >(*it);
-        }
-        break;
-      }
-    }
-    if (typed_test_info == NULL) {
-      typed_test_info = new ParameterizedTestCaseInfo<TestCase>(test_case_name);
-      test_case_infos_.push_back(typed_test_info);
-    }
-    return typed_test_info;
-  }
-  void RegisterTests() {
-    for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
-         it != test_case_infos_.end(); ++it) {
-      (*it)->RegisterTests();
-    }
-  }
-
- private:
-  typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
-
-  TestCaseInfoContainer test_case_infos_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
-};
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  //  GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-// This file was GENERATED by command:
-//     pump.py gtest-param-util-generated.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: vladl@google.com (Vlad Losev)
-
-// Type and function utilities for implementing parameterized tests.
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently Google Test supports at most 50 arguments in Values,
-// and at most 10 arguments in Combine. Please contact
-// googletestframework@googlegroups.com if you need more.
-// Please note that the number of arguments to Combine is limited
-// by the maximum arity of the implementation of tr1::tuple which is
-// currently set at 10.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-
-// scripts/fuse_gtest.py depends on gtest's own header being #included
-// *unconditionally*.  Therefore these #includes cannot be moved
-// inside #if GTEST_HAS_PARAM_TEST.
-
-#if GTEST_HAS_PARAM_TEST
-
-namespace testing {
-
-// Forward declarations of ValuesIn(), which is implemented in
-// include/gtest/gtest-param-test.h.
-template <typename ForwardIterator>
-internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
-ValuesIn(ForwardIterator begin, ForwardIterator end);
-
-template <typename T, size_t N>
-internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
-
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container);
-
-namespace internal {
-
-// Used in the Values() function to provide polymorphic capabilities.
-template <typename T1>
-class ValueArray1 {
- public:
-  explicit ValueArray1(T1 v1) : v1_(v1) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const { return ValuesIn(&v1_, &v1_ + 1); }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray1& other);
-
-  const T1 v1_;
-};
-
-template <typename T1, typename T2>
-class ValueArray2 {
- public:
-  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray2& other);
-
-  const T1 v1_;
-  const T2 v2_;
-};
-
-template <typename T1, typename T2, typename T3>
-class ValueArray3 {
- public:
-  ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray3& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-class ValueArray4 {
- public:
-  ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray4& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class ValueArray5 {
- public:
-  ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray5& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-class ValueArray6 {
- public:
-  ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray6& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-class ValueArray7 {
- public:
-  ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray7& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-class ValueArray8 {
- public:
-  ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-      T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray8& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-class ValueArray9 {
- public:
-  ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-      T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray9& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-class ValueArray10 {
- public:
-  ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray10& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-class ValueArray11 {
- public:
-  ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray11& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-class ValueArray12 {
- public:
-  ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray12& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-class ValueArray13 {
- public:
-  ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray13& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-class ValueArray14 {
- public:
-  ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray14& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-class ValueArray15 {
- public:
-  ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray15& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-class ValueArray16 {
- public:
-  ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray16& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-class ValueArray17 {
- public:
-  ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
-      T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray17& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-class ValueArray18 {
- public:
-  ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray18& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-class ValueArray19 {
- public:
-  ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray19& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-class ValueArray20 {
- public:
-  ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray20& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-class ValueArray21 {
- public:
-  ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray21& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-class ValueArray22 {
- public:
-  ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray22& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-class ValueArray23 {
- public:
-  ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray23& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-class ValueArray24 {
- public:
-  ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray24& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-class ValueArray25 {
- public:
-  ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
-      T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray25& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-class ValueArray26 {
- public:
-  ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray26& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-class ValueArray27 {
- public:
-  ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
-      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
-      v26_(v26), v27_(v27) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray27& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-class ValueArray28 {
- public:
-  ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
-      v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray28& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-class ValueArray29 {
- public:
-  ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
-      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray29& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-class ValueArray30 {
- public:
-  ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray30& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-class ValueArray31 {
- public:
-  ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray31& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-class ValueArray32 {
- public:
-  ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
-      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray32& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-class ValueArray33 {
- public:
-  ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
-      T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray33& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-class ValueArray34 {
- public:
-  ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray34& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-class ValueArray35 {
- public:
-  ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
-      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
-      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
-      v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray35& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-class ValueArray36 {
- public:
-  ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
-      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
-      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray36& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-class ValueArray37 {
- public:
-  ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
-      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
-      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
-      v36_(v36), v37_(v37) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray37& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-class ValueArray38 {
- public:
-  ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray38& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-class ValueArray39 {
- public:
-  ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray39& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-class ValueArray40 {
- public:
-  ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
-      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
-      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
-      v40_(v40) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray40& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-class ValueArray41 {
- public:
-  ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
-      T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray41& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-class ValueArray42 {
- public:
-  ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray42& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-class ValueArray43 {
- public:
-  ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
-      v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
-      v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
-      v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
-      v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
-      v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
-      v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray43& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-class ValueArray44 {
- public:
-  ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
-      v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
-      v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
-      v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
-      v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
-      v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
-      v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
-      v43_(v43), v44_(v44) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray44& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-class ValueArray45 {
- public:
-  ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
-      v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
-      v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
-      v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
-      v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
-      v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
-      v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
-      v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray45& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-class ValueArray46 {
- public:
-  ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
-      v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
-      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray46& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-class ValueArray47 {
- public:
-  ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
-      v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
-      v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
-      v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
-      v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
-      v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
-      v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
-      v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
-      v47_(v47) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray47& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-class ValueArray48 {
- public:
-  ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
-      v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
-      v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
-      v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
-      v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
-      v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
-      v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
-      v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
-      v46_(v46), v47_(v47), v48_(v48) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
-        static_cast<T>(v48_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray48& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-class ValueArray49 {
- public:
-  ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
-      T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
-      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
-        static_cast<T>(v48_), static_cast<T>(v49_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray49& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-  const T49 v49_;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-class ValueArray50 {
- public:
-  ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-      T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-      T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-      T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-      T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-      T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
-      T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
-      v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
-      v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
-      v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
-      v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
-      v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
-      v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
-      v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {
-    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
-        static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
-        static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
-        static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
-        static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
-        static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
-        static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
-        static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
-        static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
-        static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
-        static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
-        static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
-        static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
-        static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
-        static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
-        static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
-        static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
-    return ValuesIn(array);
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ValueArray50& other);
-
-  const T1 v1_;
-  const T2 v2_;
-  const T3 v3_;
-  const T4 v4_;
-  const T5 v5_;
-  const T6 v6_;
-  const T7 v7_;
-  const T8 v8_;
-  const T9 v9_;
-  const T10 v10_;
-  const T11 v11_;
-  const T12 v12_;
-  const T13 v13_;
-  const T14 v14_;
-  const T15 v15_;
-  const T16 v16_;
-  const T17 v17_;
-  const T18 v18_;
-  const T19 v19_;
-  const T20 v20_;
-  const T21 v21_;
-  const T22 v22_;
-  const T23 v23_;
-  const T24 v24_;
-  const T25 v25_;
-  const T26 v26_;
-  const T27 v27_;
-  const T28 v28_;
-  const T29 v29_;
-  const T30 v30_;
-  const T31 v31_;
-  const T32 v32_;
-  const T33 v33_;
-  const T34 v34_;
-  const T35 v35_;
-  const T36 v36_;
-  const T37 v37_;
-  const T38 v38_;
-  const T39 v39_;
-  const T40 v40_;
-  const T41 v41_;
-  const T42 v42_;
-  const T43 v43_;
-  const T44 v44_;
-  const T45 v45_;
-  const T46 v46_;
-  const T47 v47_;
-  const T48 v48_;
-  const T49 v49_;
-  const T50 v50_;
-};
-
-# if GTEST_HAS_COMBINE
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Generates values from the Cartesian product of values produced
-// by the argument generators.
-//
-template <typename T1, typename T2>
-class CartesianProductGenerator2
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2> ParamType;
-
-  CartesianProductGenerator2(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2)
-      : g1_(g1), g2_(g2) {}
-  virtual ~CartesianProductGenerator2() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current2_;
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator2::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator2& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-};  // class CartesianProductGenerator2
-
-
-template <typename T1, typename T2, typename T3>
-class CartesianProductGenerator3
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3> ParamType;
-
-  CartesianProductGenerator3(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
-      : g1_(g1), g2_(g2), g3_(g3) {}
-  virtual ~CartesianProductGenerator3() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current3_;
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator3::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator3& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-};  // class CartesianProductGenerator3
-
-
-template <typename T1, typename T2, typename T3, typename T4>
-class CartesianProductGenerator4
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4> ParamType;
-
-  CartesianProductGenerator4(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
-  virtual ~CartesianProductGenerator4() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current4_;
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator4::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator4& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-};  // class CartesianProductGenerator4
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class CartesianProductGenerator5
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5> ParamType;
-
-  CartesianProductGenerator5(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
-  virtual ~CartesianProductGenerator5() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current5_;
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator5::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator5& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-};  // class CartesianProductGenerator5
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-class CartesianProductGenerator6
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5,
-        T6> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> ParamType;
-
-  CartesianProductGenerator6(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
-  virtual ~CartesianProductGenerator6() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current6_;
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator6::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator6& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-};  // class CartesianProductGenerator6
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-class CartesianProductGenerator7
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
-        T7> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
-
-  CartesianProductGenerator7(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
-  virtual ~CartesianProductGenerator7() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current7_;
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator7::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator7& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-};  // class CartesianProductGenerator7
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-class CartesianProductGenerator8
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
-        T7, T8> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
-
-  CartesianProductGenerator8(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
-      const ParamGenerator<T8>& g8)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
-          g8_(g8) {}
-  virtual ~CartesianProductGenerator8() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin(), g8_, g8_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
-        g8_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7,
-      const ParamGenerator<T8>& g8,
-      const typename ParamGenerator<T8>::iterator& current8)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
-          begin8_(g8.begin()), end8_(g8.end()), current8_(current8)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current8_;
-      if (current8_ == end8_) {
-        current8_ = begin8_;
-        ++current7_;
-      }
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_ &&
-          current8_ == typed_other->current8_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_),
-        begin8_(other.begin8_),
-        end8_(other.end8_),
-        current8_(other.current8_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_, *current8_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_ ||
-          current8_ == end8_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    const typename ParamGenerator<T8>::iterator begin8_;
-    const typename ParamGenerator<T8>::iterator end8_;
-    typename ParamGenerator<T8>::iterator current8_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator8::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator8& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-  const ParamGenerator<T8> g8_;
-};  // class CartesianProductGenerator8
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-class CartesianProductGenerator9
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
-        T7, T8, T9> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
-
-  CartesianProductGenerator9(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
-      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9) {}
-  virtual ~CartesianProductGenerator9() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
-        g8_.end(), g9_, g9_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7,
-      const ParamGenerator<T8>& g8,
-      const typename ParamGenerator<T8>::iterator& current8,
-      const ParamGenerator<T9>& g9,
-      const typename ParamGenerator<T9>::iterator& current9)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
-          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
-          begin9_(g9.begin()), end9_(g9.end()), current9_(current9)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current9_;
-      if (current9_ == end9_) {
-        current9_ = begin9_;
-        ++current8_;
-      }
-      if (current8_ == end8_) {
-        current8_ = begin8_;
-        ++current7_;
-      }
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_ &&
-          current8_ == typed_other->current8_ &&
-          current9_ == typed_other->current9_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_),
-        begin8_(other.begin8_),
-        end8_(other.end8_),
-        current8_(other.current8_),
-        begin9_(other.begin9_),
-        end9_(other.end9_),
-        current9_(other.current9_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_, *current8_,
-            *current9_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_ ||
-          current8_ == end8_ ||
-          current9_ == end9_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    const typename ParamGenerator<T8>::iterator begin8_;
-    const typename ParamGenerator<T8>::iterator end8_;
-    typename ParamGenerator<T8>::iterator current8_;
-    const typename ParamGenerator<T9>::iterator begin9_;
-    const typename ParamGenerator<T9>::iterator end9_;
-    typename ParamGenerator<T9>::iterator current9_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator9::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator9& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-  const ParamGenerator<T8> g8_;
-  const ParamGenerator<T9> g9_;
-};  // class CartesianProductGenerator9
-
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-class CartesianProductGenerator10
-    : public ParamGeneratorInterface< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
-        T7, T8, T9, T10> > {
- public:
-  typedef ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
-
-  CartesianProductGenerator10(const ParamGenerator<T1>& g1,
-      const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
-      const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
-      const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
-      const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
-      const ParamGenerator<T10>& g10)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9), g10_(g10) {}
-  virtual ~CartesianProductGenerator10() {}
-
-  virtual ParamIteratorInterface<ParamType>* Begin() const {
-    return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
-        g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
-        g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
-  }
-  virtual ParamIteratorInterface<ParamType>* End() const {
-    return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
-        g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
-        g8_.end(), g9_, g9_.end(), g10_, g10_.end());
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<ParamType> {
-   public:
-    Iterator(const ParamGeneratorInterface<ParamType>* base,
-      const ParamGenerator<T1>& g1,
-      const typename ParamGenerator<T1>::iterator& current1,
-      const ParamGenerator<T2>& g2,
-      const typename ParamGenerator<T2>::iterator& current2,
-      const ParamGenerator<T3>& g3,
-      const typename ParamGenerator<T3>::iterator& current3,
-      const ParamGenerator<T4>& g4,
-      const typename ParamGenerator<T4>::iterator& current4,
-      const ParamGenerator<T5>& g5,
-      const typename ParamGenerator<T5>::iterator& current5,
-      const ParamGenerator<T6>& g6,
-      const typename ParamGenerator<T6>::iterator& current6,
-      const ParamGenerator<T7>& g7,
-      const typename ParamGenerator<T7>::iterator& current7,
-      const ParamGenerator<T8>& g8,
-      const typename ParamGenerator<T8>::iterator& current8,
-      const ParamGenerator<T9>& g9,
-      const typename ParamGenerator<T9>::iterator& current9,
-      const ParamGenerator<T10>& g10,
-      const typename ParamGenerator<T10>::iterator& current10)
-        : base_(base),
-          begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
-          begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
-          begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
-          begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
-          begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
-          begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
-          begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
-          begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
-          begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
-          begin10_(g10.begin()), end10_(g10.end()), current10_(current10)    {
-      ComputeCurrentValue();
-    }
-    virtual ~Iterator() {}
-
-    virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    virtual void Advance() {
-      assert(!AtEnd());
-      ++current10_;
-      if (current10_ == end10_) {
-        current10_ = begin10_;
-        ++current9_;
-      }
-      if (current9_ == end9_) {
-        current9_ = begin9_;
-        ++current8_;
-      }
-      if (current8_ == end8_) {
-        current8_ = begin8_;
-        ++current7_;
-      }
-      if (current7_ == end7_) {
-        current7_ = begin7_;
-        ++current6_;
-      }
-      if (current6_ == end6_) {
-        current6_ = begin6_;
-        ++current5_;
-      }
-      if (current5_ == end5_) {
-        current5_ = begin5_;
-        ++current4_;
-      }
-      if (current4_ == end4_) {
-        current4_ = begin4_;
-        ++current3_;
-      }
-      if (current3_ == end3_) {
-        current3_ = begin3_;
-        ++current2_;
-      }
-      if (current2_ == end2_) {
-        current2_ = begin2_;
-        ++current1_;
-      }
-      ComputeCurrentValue();
-    }
-    virtual ParamIteratorInterface<ParamType>* Clone() const {
-      return new Iterator(*this);
-    }
-    virtual const ParamType* Current() const { return &current_value_; }
-    virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const Iterator* typed_other =
-          CheckedDowncastToActualType<const Iterator>(&other);
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      return (AtEnd() && typed_other->AtEnd()) ||
-         (
-          current1_ == typed_other->current1_ &&
-          current2_ == typed_other->current2_ &&
-          current3_ == typed_other->current3_ &&
-          current4_ == typed_other->current4_ &&
-          current5_ == typed_other->current5_ &&
-          current6_ == typed_other->current6_ &&
-          current7_ == typed_other->current7_ &&
-          current8_ == typed_other->current8_ &&
-          current9_ == typed_other->current9_ &&
-          current10_ == typed_other->current10_);
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : base_(other.base_),
-        begin1_(other.begin1_),
-        end1_(other.end1_),
-        current1_(other.current1_),
-        begin2_(other.begin2_),
-        end2_(other.end2_),
-        current2_(other.current2_),
-        begin3_(other.begin3_),
-        end3_(other.end3_),
-        current3_(other.current3_),
-        begin4_(other.begin4_),
-        end4_(other.end4_),
-        current4_(other.current4_),
-        begin5_(other.begin5_),
-        end5_(other.end5_),
-        current5_(other.current5_),
-        begin6_(other.begin6_),
-        end6_(other.end6_),
-        current6_(other.current6_),
-        begin7_(other.begin7_),
-        end7_(other.end7_),
-        current7_(other.current7_),
-        begin8_(other.begin8_),
-        end8_(other.end8_),
-        current8_(other.current8_),
-        begin9_(other.begin9_),
-        end9_(other.end9_),
-        current9_(other.current9_),
-        begin10_(other.begin10_),
-        end10_(other.end10_),
-        current10_(other.current10_) {
-      ComputeCurrentValue();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = ParamType(*current1_, *current2_, *current3_,
-            *current4_, *current5_, *current6_, *current7_, *current8_,
-            *current9_, *current10_);
-    }
-    bool AtEnd() const {
-      // We must report iterator past the end of the range when either of the
-      // component iterators has reached the end of its range.
-      return
-          current1_ == end1_ ||
-          current2_ == end2_ ||
-          current3_ == end3_ ||
-          current4_ == end4_ ||
-          current5_ == end5_ ||
-          current6_ == end6_ ||
-          current7_ == end7_ ||
-          current8_ == end8_ ||
-          current9_ == end9_ ||
-          current10_ == end10_;
-    }
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
-    // current[i]_ is the actual traversing iterator.
-    const typename ParamGenerator<T1>::iterator begin1_;
-    const typename ParamGenerator<T1>::iterator end1_;
-    typename ParamGenerator<T1>::iterator current1_;
-    const typename ParamGenerator<T2>::iterator begin2_;
-    const typename ParamGenerator<T2>::iterator end2_;
-    typename ParamGenerator<T2>::iterator current2_;
-    const typename ParamGenerator<T3>::iterator begin3_;
-    const typename ParamGenerator<T3>::iterator end3_;
-    typename ParamGenerator<T3>::iterator current3_;
-    const typename ParamGenerator<T4>::iterator begin4_;
-    const typename ParamGenerator<T4>::iterator end4_;
-    typename ParamGenerator<T4>::iterator current4_;
-    const typename ParamGenerator<T5>::iterator begin5_;
-    const typename ParamGenerator<T5>::iterator end5_;
-    typename ParamGenerator<T5>::iterator current5_;
-    const typename ParamGenerator<T6>::iterator begin6_;
-    const typename ParamGenerator<T6>::iterator end6_;
-    typename ParamGenerator<T6>::iterator current6_;
-    const typename ParamGenerator<T7>::iterator begin7_;
-    const typename ParamGenerator<T7>::iterator end7_;
-    typename ParamGenerator<T7>::iterator current7_;
-    const typename ParamGenerator<T8>::iterator begin8_;
-    const typename ParamGenerator<T8>::iterator end8_;
-    typename ParamGenerator<T8>::iterator current8_;
-    const typename ParamGenerator<T9>::iterator begin9_;
-    const typename ParamGenerator<T9>::iterator end9_;
-    typename ParamGenerator<T9>::iterator current9_;
-    const typename ParamGenerator<T10>::iterator begin10_;
-    const typename ParamGenerator<T10>::iterator end10_;
-    typename ParamGenerator<T10>::iterator current10_;
-    ParamType current_value_;
-  };  // class CartesianProductGenerator10::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductGenerator10& other);
-
-  const ParamGenerator<T1> g1_;
-  const ParamGenerator<T2> g2_;
-  const ParamGenerator<T3> g3_;
-  const ParamGenerator<T4> g4_;
-  const ParamGenerator<T5> g5_;
-  const ParamGenerator<T6> g6_;
-  const ParamGenerator<T7> g7_;
-  const ParamGenerator<T8> g8_;
-  const ParamGenerator<T9> g9_;
-  const ParamGenerator<T10> g10_;
-};  // class CartesianProductGenerator10
-
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Helper classes providing Combine() with polymorphic features. They allow
-// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
-// convertible to U.
-//
-template <class Generator1, class Generator2>
-class CartesianProductHolder2 {
- public:
-CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
-      : g1_(g1), g2_(g2) {}
-  template <typename T1, typename T2>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2> >(
-        new CartesianProductGenerator2<T1, T2>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder2& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-};  // class CartesianProductHolder2
-
-template <class Generator1, class Generator2, class Generator3>
-class CartesianProductHolder3 {
- public:
-CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3)
-      : g1_(g1), g2_(g2), g3_(g3) {}
-  template <typename T1, typename T2, typename T3>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3> >(
-        new CartesianProductGenerator3<T1, T2, T3>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder3& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-};  // class CartesianProductHolder3
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4>
-class CartesianProductHolder4 {
- public:
-CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
-  template <typename T1, typename T2, typename T3, typename T4>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4> >(
-        new CartesianProductGenerator4<T1, T2, T3, T4>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder4& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-};  // class CartesianProductHolder4
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5>
-class CartesianProductHolder5 {
- public:
-CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5> >(
-        new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder5& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-};  // class CartesianProductHolder5
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6>
-class CartesianProductHolder6 {
- public:
-CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6> >(
-        new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder6& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-};  // class CartesianProductHolder6
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7>
-class CartesianProductHolder7 {
- public:
-CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6,
-      T7> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7> >(
-        new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder7& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-};  // class CartesianProductHolder7
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7,
-    class Generator8>
-class CartesianProductHolder8 {
- public:
-CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7, const Generator8& g8)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
-          g8_(g8) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7, typename T8>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7,
-      T8> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
-        new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_),
-        static_cast<ParamGenerator<T8> >(g8_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder8& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-  const Generator8 g8_;
-};  // class CartesianProductHolder8
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7,
-    class Generator8, class Generator9>
-class CartesianProductHolder9 {
- public:
-CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7, const Generator8& g8,
-    const Generator9& g9)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7, typename T8, typename T9>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
-      T9> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
-        T9> >(
-        new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_),
-        static_cast<ParamGenerator<T8> >(g8_),
-        static_cast<ParamGenerator<T9> >(g9_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder9& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-  const Generator8 g8_;
-  const Generator9 g9_;
-};  // class CartesianProductHolder9
-
-template <class Generator1, class Generator2, class Generator3,
-    class Generator4, class Generator5, class Generator6, class Generator7,
-    class Generator8, class Generator9, class Generator10>
-class CartesianProductHolder10 {
- public:
-CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
-    const Generator3& g3, const Generator4& g4, const Generator5& g5,
-    const Generator6& g6, const Generator7& g7, const Generator8& g8,
-    const Generator9& g9, const Generator10& g10)
-      : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
-          g9_(g9), g10_(g10) {}
-  template <typename T1, typename T2, typename T3, typename T4, typename T5,
-      typename T6, typename T7, typename T8, typename T9, typename T10>
-  operator ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
-      T9, T10> >() const {
-    return ParamGenerator< ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
-        T9, T10> >(
-        new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
-            T10>(
-        static_cast<ParamGenerator<T1> >(g1_),
-        static_cast<ParamGenerator<T2> >(g2_),
-        static_cast<ParamGenerator<T3> >(g3_),
-        static_cast<ParamGenerator<T4> >(g4_),
-        static_cast<ParamGenerator<T5> >(g5_),
-        static_cast<ParamGenerator<T6> >(g6_),
-        static_cast<ParamGenerator<T7> >(g7_),
-        static_cast<ParamGenerator<T8> >(g8_),
-        static_cast<ParamGenerator<T9> >(g9_),
-        static_cast<ParamGenerator<T10> >(g10_)));
-  }
-
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const CartesianProductHolder10& other);
-
-  const Generator1 g1_;
-  const Generator2 g2_;
-  const Generator3 g3_;
-  const Generator4 g4_;
-  const Generator5 g5_;
-  const Generator6 g6_;
-  const Generator7 g7_;
-  const Generator8 g8_;
-  const Generator9 g9_;
-  const Generator10 g10_;
-};  // class CartesianProductHolder10
-
-# endif  // GTEST_HAS_COMBINE
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  //  GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
-
-#if GTEST_HAS_PARAM_TEST
-
-namespace testing {
-
-// Functions producing parameter generators.
-//
-// Google Test uses these generators to produce parameters for value-
-// parameterized tests. When a parameterized test case is instantiated
-// with a particular generator, Google Test creates and runs tests
-// for each element in the sequence produced by the generator.
-//
-// In the following sample, tests from test case FooTest are instantiated
-// each three times with parameter values 3, 5, and 8:
-//
-// class FooTest : public TestWithParam<int> { ... };
-//
-// TEST_P(FooTest, TestThis) {
-// }
-// TEST_P(FooTest, TestThat) {
-// }
-// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
-//
-
-// Range() returns generators providing sequences of values in a range.
-//
-// Synopsis:
-// Range(start, end)
-//   - returns a generator producing a sequence of values {start, start+1,
-//     start+2, ..., }.
-// Range(start, end, step)
-//   - returns a generator producing a sequence of values {start, start+step,
-//     start+step+step, ..., }.
-// Notes:
-//   * The generated sequences never include end. For example, Range(1, 5)
-//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
-//     returns a generator producing {1, 3, 5, 7}.
-//   * start and end must have the same type. That type may be any integral or
-//     floating-point type or a user defined type satisfying these conditions:
-//     * It must be assignable (have operator=() defined).
-//     * It must have operator+() (operator+(int-compatible type) for
-//       two-operand version).
-//     * It must have operator<() defined.
-//     Elements in the resulting sequences will also have that type.
-//   * Condition start < end must be satisfied in order for resulting sequences
-//     to contain any elements.
-//
-template <typename T, typename IncrementT>
-internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
-  return internal::ParamGenerator<T>(
-      new internal::RangeGenerator<T, IncrementT>(start, end, step));
-}
-
-template <typename T>
-internal::ParamGenerator<T> Range(T start, T end) {
-  return Range(start, end, 1);
-}
-
-// ValuesIn() function allows generation of tests with parameters coming from
-// a container.
-//
-// Synopsis:
-// ValuesIn(const T (&array)[N])
-//   - returns a generator producing sequences with elements from
-//     a C-style array.
-// ValuesIn(const Container& container)
-//   - returns a generator producing sequences with elements from
-//     an STL-style container.
-// ValuesIn(Iterator begin, Iterator end)
-//   - returns a generator producing sequences with elements from
-//     a range [begin, end) defined by a pair of STL-style iterators. These
-//     iterators can also be plain C pointers.
-//
-// Please note that ValuesIn copies the values from the containers
-// passed in and keeps them to generate tests in RUN_ALL_TESTS().
-//
-// Examples:
-//
-// This instantiates tests from test case StringTest
-// each with C-string values of "foo", "bar", and "baz":
-//
-// const char* strings[] = {"foo", "bar", "baz"};
-// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings));
-//
-// This instantiates tests from test case StlStringTest
-// each with STL strings with values "a" and "b":
-//
-// ::std::vector< ::std::string> GetParameterStrings() {
-//   ::std::vector< ::std::string> v;
-//   v.push_back("a");
-//   v.push_back("b");
-//   return v;
-// }
-//
-// INSTANTIATE_TEST_CASE_P(CharSequence,
-//                         StlStringTest,
-//                         ValuesIn(GetParameterStrings()));
-//
-//
-// This will also instantiate tests from CharTest
-// each with parameter values 'a' and 'b':
-//
-// ::std::list<char> GetParameterChars() {
-//   ::std::list<char> list;
-//   list.push_back('a');
-//   list.push_back('b');
-//   return list;
-// }
-// ::std::list<char> l = GetParameterChars();
-// INSTANTIATE_TEST_CASE_P(CharSequence2,
-//                         CharTest,
-//                         ValuesIn(l.begin(), l.end()));
-//
-template <typename ForwardIterator>
-internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
-ValuesIn(ForwardIterator begin, ForwardIterator end) {
-  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
-      ::value_type ParamType;
-  return internal::ParamGenerator<ParamType>(
-      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
-}
-
-template <typename T, size_t N>
-internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
-  return ValuesIn(array, array + N);
-}
-
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container) {
-  return ValuesIn(container.begin(), container.end());
-}
-
-// Values() allows generating tests from explicitly specified list of
-// parameters.
-//
-// Synopsis:
-// Values(T v1, T v2, ..., T vN)
-//   - returns a generator producing sequences with elements v1, v2, ..., vN.
-//
-// For example, this instantiates tests from test case BarTest each
-// with values "one", "two", and "three":
-//
-// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
-//
-// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
-// The exact type of values will depend on the type of parameter in BazTest.
-//
-// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
-//
-// Currently, Values() supports from 1 to 50 parameters.
-//
-template <typename T1>
-internal::ValueArray1<T1> Values(T1 v1) {
-  return internal::ValueArray1<T1>(v1);
-}
-
-template <typename T1, typename T2>
-internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
-  return internal::ValueArray2<T1, T2>(v1, v2);
-}
-
-template <typename T1, typename T2, typename T3>
-internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
-  return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
-}
-
-template <typename T1, typename T2, typename T3, typename T4>
-internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
-  return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5) {
-  return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6) {
-  return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6, T7 v7) {
-  return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
-      v6, v7);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
-  return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
-      v5, v6, v7, v8);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
-  return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
-      v4, v5, v6, v7, v8, v9);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
-    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
-  return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
-      v2, v3, v4, v5, v6, v7, v8, v9, v10);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
-    T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11) {
-  return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
-      T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-    T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12) {
-  return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-    T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13) {
-  return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
-  return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-      v14);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
-  return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-      v13, v14, v15);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16) {
-  return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
-      v12, v13, v14, v15, v16);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17) {
-  return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
-      v11, v12, v13, v14, v15, v16, v17);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
-    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18) {
-  return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
-      v10, v11, v12, v13, v14, v15, v16, v17, v18);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
-    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
-    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
-  return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
-      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
-  return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
-  return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
-      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22) {
-  return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
-      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23) {
-  return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
-      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24) {
-  return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
-      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
-      v19, v20, v21, v22, v23, v24);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
-    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
-    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
-    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
-  return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
-      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
-      v18, v19, v20, v21, v22, v23, v24, v25);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-    T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26) {
-  return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
-      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-    T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27) {
-  return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
-      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-    T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28) {
-  return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
-      v28);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29) {
-  return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
-      v27, v28, v29);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
-    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
-    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
-  return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
-      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
-      v26, v27, v28, v29, v30);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
-  return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
-      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
-      v25, v26, v27, v28, v29, v30, v31);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32) {
-  return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
-      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
-    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33) {
-  return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
-      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
-    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
-    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
-    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
-    T31 v31, T32 v32, T33 v33, T34 v34) {
-  return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
-      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
-    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
-    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
-  return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
-      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
-      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
-    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
-    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
-  return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
-      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
-      v34, v35, v36);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3,
-    T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
-    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
-    T37 v37) {
-  return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3,
-      v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
-      v34, v35, v36, v37);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
-    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
-    T37 v37, T38 v38) {
-  return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2,
-      v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
-      v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32,
-      v33, v34, v35, v36, v37, v38);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2,
-    T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
-    T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
-    T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
-    T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
-    T37 v37, T38 v38, T39 v39) {
-  return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1,
-      v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
-      v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
-      v32, v33, v34, v35, v36, v37, v38, v39);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1,
-    T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
-    T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
-    T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27,
-    T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
-    T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
-  return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
-      v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
-      v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-    T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
-  return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
-      v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28,
-      v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-    T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-    T42 v42) {
-  return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-      v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
-      v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41,
-      v42);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-    T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-    T42 v42, T43 v43) {
-  return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-      v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
-      v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
-      v41, v42, v43);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
-    T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
-    T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
-    T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
-    T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
-    T42 v42, T43 v43, T44 v44) {
-  return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
-      v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
-      v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39,
-      v40, v41, v42, v43, v44);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
-    T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
-    T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
-    T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
-    T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
-    T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
-  return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
-      v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
-      v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
-      v39, v40, v41, v42, v43, v44, v45);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
-    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
-  return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
-      v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
-      v38, v39, v40, v41, v42, v43, v44, v45, v46);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
-    T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
-    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
-  return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
-      v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
-      v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
-      v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
-    T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
-    T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
-    T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
-    T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
-    T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
-    T48 v48) {
-  return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
-      v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
-      v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
-    T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
-    T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
-    T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
-    T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
-    T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
-    T47 v47, T48 v48, T49 v49) {
-  return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
-      v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
-      v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
-      v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
-    T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
-    T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
-    T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
-    T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
-    T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
-    T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
-  return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
-      v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
-      v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
-      v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
-      v48, v49, v50);
-}
-
-// Bool() allows generating tests with parameters in a set of (false, true).
-//
-// Synopsis:
-// Bool()
-//   - returns a generator producing sequences with elements {false, true}.
-//
-// It is useful when testing code that depends on Boolean flags. Combinations
-// of multiple flags can be tested when several Bool()'s are combined using
-// Combine() function.
-//
-// In the following example all tests in the test case FlagDependentTest
-// will be instantiated twice with parameters false and true.
-//
-// class FlagDependentTest : public testing::TestWithParam<bool> {
-//   virtual void SetUp() {
-//     external_flag = GetParam();
-//   }
-// }
-// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
-//
-inline internal::ParamGenerator<bool> Bool() {
-  return Values(false, true);
-}
-
-# if GTEST_HAS_COMBINE
-// Combine() allows the user to combine two or more sequences to produce
-// values of a Cartesian product of those sequences' elements.
-//
-// Synopsis:
-// Combine(gen1, gen2, ..., genN)
-//   - returns a generator producing sequences with elements coming from
-//     the Cartesian product of elements from the sequences generated by
-//     gen1, gen2, ..., genN. The sequence elements will have a type of
-//     tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
-//     of elements from sequences produces by gen1, gen2, ..., genN.
-//
-// Combine can have up to 10 arguments. This number is currently limited
-// by the maximum number of elements in the tuple implementation used by Google
-// Test.
-//
-// Example:
-//
-// This will instantiate tests in test case AnimalTest each one with
-// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
-// tuple("dog", BLACK), and tuple("dog", WHITE):
-//
-// enum Color { BLACK, GRAY, WHITE };
-// class AnimalTest
-//     : public testing::TestWithParam<tuple<const char*, Color> > {...};
-//
-// TEST_P(AnimalTest, AnimalLooksNice) {...}
-//
-// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
-//                         Combine(Values("cat", "dog"),
-//                                 Values(BLACK, WHITE)));
-//
-// This will instantiate tests in FlagDependentTest with all variations of two
-// Boolean flags:
-//
-// class FlagDependentTest
-//     : public testing::TestWithParam<tuple<bool, bool> > {
-//   virtual void SetUp() {
-//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
-//     tie(external_flag_1, external_flag_2) = GetParam();
-//   }
-// };
-//
-// TEST_P(FlagDependentTest, TestFeature1) {
-//   // Test your code using external_flag_1 and external_flag_2 here.
-// }
-// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
-//                         Combine(Bool(), Bool()));
-//
-template <typename Generator1, typename Generator2>
-internal::CartesianProductHolder2<Generator1, Generator2> Combine(
-    const Generator1& g1, const Generator2& g2) {
-  return internal::CartesianProductHolder2<Generator1, Generator2>(
-      g1, g2);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3>
-internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3) {
-  return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
-      g1, g2, g3);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4>
-internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
-    Generator4> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4) {
-  return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
-      Generator4>(
-      g1, g2, g3, g4);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5>
-internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
-    Generator4, Generator5> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5) {
-  return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
-      Generator4, Generator5>(
-      g1, g2, g3, g4, g5);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6>
-internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6) {
-  return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6>(
-      g1, g2, g3, g4, g5, g6);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7>
-internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7) {
-  return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7>(
-      g1, g2, g3, g4, g5, g6, g7);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7, typename Generator8>
-internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7, const Generator8& g8) {
-  return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7, Generator8>(
-      g1, g2, g3, g4, g5, g6, g7, g8);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7, typename Generator8, typename Generator9>
-internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7, Generator8,
-    Generator9> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7, const Generator8& g8, const Generator9& g9) {
-  return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
-      g1, g2, g3, g4, g5, g6, g7, g8, g9);
-}
-
-template <typename Generator1, typename Generator2, typename Generator3,
-    typename Generator4, typename Generator5, typename Generator6,
-    typename Generator7, typename Generator8, typename Generator9,
-    typename Generator10>
-internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
-    Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
-    Generator10> Combine(
-    const Generator1& g1, const Generator2& g2, const Generator3& g3,
-        const Generator4& g4, const Generator5& g5, const Generator6& g6,
-        const Generator7& g7, const Generator8& g8, const Generator9& g9,
-        const Generator10& g10) {
-  return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
-      Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
-      Generator10>(
-      g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
-}
-# endif  // GTEST_HAS_COMBINE
-
-
-
-# define TEST_P(test_case_name, test_name) \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
-      : public test_case_name { \
-   public: \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
-    virtual void TestBody(); \
-   private: \
-    static int AddToRegistry() { \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, __FILE__, __LINE__)->AddTestPattern(\
-                  #test_case_name, \
-                  #test_name, \
-                  new ::testing::internal::TestMetaFactory< \
-                      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
-      return 0; \
-    } \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(\
-        GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
-  }; \
-  int GTEST_TEST_CLASS_NAME_(test_case_name, \
-                             test_name)::gtest_registering_dummy_ = \
-      GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
-  void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
-
-# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
-  ::testing::internal::ParamGenerator<test_case_name::ParamType> \
-      gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
-          GetTestCasePatternHolder<test_case_name>(\
-              #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
-                  #prefix, \
-                  &gtest_##prefix##test_case_name##_EvalGenerator_, \
-                  __FILE__, __LINE__)
-
-}  // namespace testing
-
-#endif  // GTEST_HAS_PARAM_TEST
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// Google C++ Testing Framework definitions useful in production code.
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-
-// When you need to test the private or protected members of a class,
-// use the FRIEND_TEST macro to declare your tests as friends of the
-// class.  For example:
-//
-// class MyClass {
-//  private:
-//   void MyMethod();
-//   FRIEND_TEST(MyClassTest, MyMethod);
-// };
-//
-// class MyClassTest : public testing::Test {
-//   // ...
-// };
-//
-// TEST_F(MyClassTest, MyMethod) {
-//   // Can call MyClass::MyMethod() here.
-// }
-
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: mheule@google.com (Markus Heule)
-//
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-
-#include <iosfwd>
-#include <vector>
-
-namespace testing {
-
-// A copyable object representing the result of a test part (i.e. an
-// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()).
-//
-// Don't inherit from TestPartResult as its destructor is not virtual.
-class GTEST_API_ TestPartResult {
- public:
-  // The possible outcomes of a test part (i.e. an assertion or an
-  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
-  enum Type {
-    kSuccess,          // Succeeded.
-    kNonFatalFailure,  // Failed but the test can continue.
-    kFatalFailure      // Failed and the test should be terminated.
-  };
-
-  // C'tor.  TestPartResult does NOT have a default constructor.
-  // Always use this constructor (with parameters) to create a
-  // TestPartResult object.
-  TestPartResult(Type a_type,
-                 const char* a_file_name,
-                 int a_line_number,
-                 const char* a_message)
-      : type_(a_type),
-        file_name_(a_file_name == NULL ? "" : a_file_name),
-        line_number_(a_line_number),
-        summary_(ExtractSummary(a_message)),
-        message_(a_message) {
-  }
-
-  // Gets the outcome of the test part.
-  Type type() const { return type_; }
-
-  // Gets the name of the source file where the test part took place, or
-  // NULL if it's unknown.
-  const char* file_name() const {
-    return file_name_.empty() ? NULL : file_name_.c_str();
-  }
-
-  // Gets the line in the source file where the test part took place,
-  // or -1 if it's unknown.
-  int line_number() const { return line_number_; }
-
-  // Gets the summary of the failure message.
-  const char* summary() const { return summary_.c_str(); }
-
-  // Gets the message associated with the test part.
-  const char* message() const { return message_.c_str(); }
-
-  // Returns true iff the test part passed.
-  bool passed() const { return type_ == kSuccess; }
-
-  // Returns true iff the test part failed.
-  bool failed() const { return type_ != kSuccess; }
-
-  // Returns true iff the test part non-fatally failed.
-  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
-
-  // Returns true iff the test part fatally failed.
-  bool fatally_failed() const { return type_ == kFatalFailure; }
-
- private:
-  Type type_;
-
-  // Gets the summary of the failure message by omitting the stack
-  // trace in it.
-  static std::string ExtractSummary(const char* message);
-
-  // The name of the source file where the test part took place, or
-  // "" if the source file is unknown.
-  std::string file_name_;
-  // The line in the source file where the test part took place, or -1
-  // if the line number is unknown.
-  int line_number_;
-  std::string summary_;  // The test failure summary.
-  std::string message_;  // The test failure message.
-};
-
-// Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
-
-// An array of TestPartResult objects.
-//
-// Don't inherit from TestPartResultArray as its destructor is not
-// virtual.
-class GTEST_API_ TestPartResultArray {
- public:
-  TestPartResultArray() {}
-
-  // Appends the given TestPartResult to the array.
-  void Append(const TestPartResult& result);
-
-  // Returns the TestPartResult at the given index (0-based).
-  const TestPartResult& GetTestPartResult(int index) const;
-
-  // Returns the number of TestPartResult objects in the array.
-  int size() const;
-
- private:
-  std::vector<TestPartResult> array_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
-};
-
-// This interface knows how to report a test part result.
-class TestPartResultReporterInterface {
- public:
-  virtual ~TestPartResultReporterInterface() {}
-
-  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
-};
-
-namespace internal {
-
-// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
-// statement generates new fatal failures. To do so it registers itself as the
-// current test part result reporter. Besides checking if fatal failures were
-// reported, it only delegates the reporting to the former result reporter.
-// The original result reporter is restored in the destructor.
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-class GTEST_API_ HasNewFatalFailureHelper
-    : public TestPartResultReporterInterface {
- public:
-  HasNewFatalFailureHelper();
-  virtual ~HasNewFatalFailureHelper();
-  virtual void ReportTestPartResult(const TestPartResult& result);
-  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
- private:
-  bool has_new_fatal_failure_;
-  TestPartResultReporterInterface* original_reporter_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
-};
-
-}  // namespace internal
-
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-
-// This header implements typed tests and type-parameterized tests.
-
-// Typed (aka type-driven) tests repeat the same test for types in a
-// list.  You must know which types you want to test with when writing
-// typed tests. Here's how you do it:
-
-#if 0
-
-// First, define a fixture class template.  It should be parameterized
-// by a type.  Remember to derive it from testing::Test.
-template <typename T>
-class FooTest : public testing::Test {
- public:
-  ...
-  typedef std::list<T> List;
-  static T shared_;
-  T value_;
-};
-
-// Next, associate a list of types with the test case, which will be
-// repeated for each type in the list.  The typedef is necessary for
-// the macro to parse correctly.
-typedef testing::Types<char, int, unsigned int> MyTypes;
-TYPED_TEST_CASE(FooTest, MyTypes);
-
-// If the type list contains only one type, you can write that type
-// directly without Types<...>:
-//   TYPED_TEST_CASE(FooTest, int);
-
-// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
-// tests for this test case as you want.
-TYPED_TEST(FooTest, DoesBlah) {
-  // Inside a test, refer to TypeParam to get the type parameter.
-  // Since we are inside a derived class template, C++ requires use to
-  // visit the members of FooTest via 'this'.
-  TypeParam n = this->value_;
-
-  // To visit static members of the fixture, add the TestFixture::
-  // prefix.
-  n += TestFixture::shared_;
-
-  // To refer to typedefs in the fixture, add the "typename
-  // TestFixture::" prefix.
-  typename TestFixture::List values;
-  values.push_back(n);
-  ...
-}
-
-TYPED_TEST(FooTest, HasPropertyA) { ... }
-
-#endif  // 0
-
-// Type-parameterized tests are abstract test patterns parameterized
-// by a type.  Compared with typed tests, type-parameterized tests
-// allow you to define the test pattern without knowing what the type
-// parameters are.  The defined pattern can be instantiated with
-// different types any number of times, in any number of translation
-// units.
-//
-// If you are designing an interface or concept, you can define a
-// suite of type-parameterized tests to verify properties that any
-// valid implementation of the interface/concept should have.  Then,
-// each implementation can easily instantiate the test suite to verify
-// that it conforms to the requirements, without having to write
-// similar tests repeatedly.  Here's an example:
-
-#if 0
-
-// First, define a fixture class template.  It should be parameterized
-// by a type.  Remember to derive it from testing::Test.
-template <typename T>
-class FooTest : public testing::Test {
-  ...
-};
-
-// Next, declare that you will define a type-parameterized test case
-// (the _P suffix is for "parameterized" or "pattern", whichever you
-// prefer):
-TYPED_TEST_CASE_P(FooTest);
-
-// Then, use TYPED_TEST_P() to define as many type-parameterized tests
-// for this type-parameterized test case as you want.
-TYPED_TEST_P(FooTest, DoesBlah) {
-  // Inside a test, refer to TypeParam to get the type parameter.
-  TypeParam n = 0;
-  ...
-}
-
-TYPED_TEST_P(FooTest, HasPropertyA) { ... }
-
-// Now the tricky part: you need to register all test patterns before
-// you can instantiate them.  The first argument of the macro is the
-// test case name; the rest are the names of the tests in this test
-// case.
-REGISTER_TYPED_TEST_CASE_P(FooTest,
-                           DoesBlah, HasPropertyA);
-
-// Finally, you are free to instantiate the pattern with the types you
-// want.  If you put the above code in a header file, you can #include
-// it in multiple C++ source files and instantiate it multiple times.
-//
-// To distinguish different instances of the pattern, the first
-// argument to the INSTANTIATE_* macro is a prefix that will be added
-// to the actual test case name.  Remember to pick unique prefixes for
-// different instances.
-typedef testing::Types<char, int, unsigned int> MyTypes;
-INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
-
-// If the type list contains only one type, you can write that type
-// directly without Types<...>:
-//   INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
-
-#endif  // 0
-
-
-// Implements typed tests.
-
-#if GTEST_HAS_TYPED_TEST
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Expands to the name of the typedef for the type parameters of the
-// given test case.
-# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
-
-// The 'Types' template argument below must have spaces around it
-// since some compilers may choke on '>>' when passing a template
-// instance (e.g. Types<int>)
-# define TYPED_TEST_CASE(CaseName, Types) \
-  typedef ::testing::internal::TypeList< Types >::type \
-      GTEST_TYPE_PARAMS_(CaseName)
-
-# define TYPED_TEST(CaseName, TestName) \
-  template <typename gtest_TypeParam_> \
-  class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
-      : public CaseName<gtest_TypeParam_> { \
-   private: \
-    typedef CaseName<gtest_TypeParam_> TestFixture; \
-    typedef gtest_TypeParam_ TypeParam; \
-    virtual void TestBody(); \
-  }; \
-  bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::internal::TypeParameterizedTest< \
-          CaseName, \
-          ::testing::internal::TemplateSel< \
-              GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
-          GTEST_TYPE_PARAMS_(CaseName)>::Register(\
-              "", #CaseName, #TestName, 0); \
-  template <typename gtest_TypeParam_> \
-  void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
-
-#endif  // GTEST_HAS_TYPED_TEST
-
-// Implements type-parameterized tests.
-
-#if GTEST_HAS_TYPED_TEST_P
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Expands to the namespace name that the type-parameterized tests for
-// the given type-parameterized test case are defined in.  The exact
-// name of the namespace is subject to change without notice.
-# define GTEST_CASE_NAMESPACE_(TestCaseName) \
-  gtest_case_##TestCaseName##_
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Expands to the name of the variable used to remember the names of
-// the defined tests in the given test case.
-# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
-  gtest_typed_test_case_p_state_##TestCaseName##_
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
-//
-// Expands to the name of the variable used to remember the names of
-// the registered tests in the given test case.
-# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
-  gtest_registered_test_names_##TestCaseName##_
-
-// The variables defined in the type-parameterized test macros are
-// static as typically these macros are used in a .h file that can be
-// #included in multiple translation units linked together.
-# define TYPED_TEST_CASE_P(CaseName) \
-  static ::testing::internal::TypedTestCasePState \
-      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
-
-# define TYPED_TEST_P(CaseName, TestName) \
-  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
-  template <typename gtest_TypeParam_> \
-  class TestName : public CaseName<gtest_TypeParam_> { \
-   private: \
-    typedef CaseName<gtest_TypeParam_> TestFixture; \
-    typedef gtest_TypeParam_ TypeParam; \
-    virtual void TestBody(); \
-  }; \
-  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
-      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
-          __FILE__, __LINE__, #CaseName, #TestName); \
-  } \
-  template <typename gtest_TypeParam_> \
-  void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
-
-# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
-  namespace GTEST_CASE_NAMESPACE_(CaseName) { \
-  typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
-  } \
-  static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
-      GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
-          __FILE__, __LINE__, #__VA_ARGS__)
-
-// The 'Types' template argument below must have spaces around it
-// since some compilers may choke on '>>' when passing a template
-// instance (e.g. Types<int>)
-# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
-  bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
-      ::testing::internal::TypeParameterizedTestCase<CaseName, \
-          GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
-          ::testing::internal::TypeList< Types >::type>::Register(\
-              #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
-
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-
-// Depending on the platform, different string classes are available.
-// On Linux, in addition to ::std::string, Google also makes use of
-// class ::string, which has the same interface as ::std::string, but
-// has a different implementation.
-//
-// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
-// ::string is available AND is a distinct type to ::std::string, or
-// define it to 0 to indicate otherwise.
-//
-// If the user's ::std::string and ::string are the same class due to
-// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0.
-//
-// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined
-// heuristically.
-
-namespace testing {
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-death-test.h"
+#include "gtest/gtest-matchers.h"
+#include "gtest/gtest-message.h"
+#include "gtest/gtest-param-test.h"
+#include "gtest/gtest-printers.h"
+#include "gtest/gtest-test-part.h"
+#include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
 
 // Declares the flags.
 
@@ -17512,10 +89,18 @@ GTEST_DECLARE_bool_(catch_exceptions);
 // to let Google Test decide.
 GTEST_DECLARE_string_(color);
 
+// This flag controls whether the test runner should continue execution past
+// first failure.
+GTEST_DECLARE_bool_(fail_fast);
+
 // This flag sets up the filter to select by name using a glob pattern
 // the tests to run. If the filter is not given all tests are executed.
 GTEST_DECLARE_string_(filter);
 
+// This flag controls whether Google Test installs a signal handler that dumps
+// debugging information when fatal signals are raised.
+GTEST_DECLARE_bool_(install_failure_signal_handler);
+
 // This flag causes the Google Test to list tests. None of the tests listed
 // are actually run if the flag is provided.
 GTEST_DECLARE_bool_(list_tests);
@@ -17524,10 +109,16 @@ GTEST_DECLARE_bool_(list_tests);
 // in addition to its normal textual output.
 GTEST_DECLARE_string_(output);
 
+// This flags control whether Google Test prints only test failures.
+GTEST_DECLARE_bool_(brief);
+
 // This flags control whether Google Test prints the elapsed time for each
 // test.
 GTEST_DECLARE_bool_(print_time);
 
+// This flags control whether Google Test prints UTF8 characters as text.
+GTEST_DECLARE_bool_(print_utf8);
+
 // This flag specifies the random number seed.
 GTEST_DECLARE_int32_(random_seed);
 
@@ -17535,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed);
 // is 1. If the value is -1 the tests are repeating forever.
 GTEST_DECLARE_int32_(repeat);
 
+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
 // This flag controls whether Google Test includes Google Test internal
 // stack frames in failure stack traces.
 GTEST_DECLARE_bool_(show_internal_stack_frames);
@@ -17548,7 +145,7 @@ GTEST_DECLARE_int32_(stack_trace_depth);
 
 // When this flag is specified, a failed assertion will throw an
 // exception if exceptions are enabled, or exit the program with a
-// non-zero code otherwise.
+// non-zero code otherwise. For use with an external test framework.
 GTEST_DECLARE_bool_(throw_on_failure);
 
 // When this flag is set with a "host:port" string, on supported
@@ -17556,6 +153,20 @@ GTEST_DECLARE_bool_(throw_on_failure);
 // the specified host machine.
 GTEST_DECLARE_string_(stream_result_to);
 
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DECLARE_string_(flagfile);
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
 // The upper limit for valid stack trace depths.
 const int kMaxStackTraceDepth = 100;
 
@@ -17573,9 +184,11 @@ class TestEventListenersAccessor;
 class TestEventRepeater;
 class UnitTestRecordPropertyTestHelper;
 class WindowsDeathTest;
+class FuchsiaDeathTest;
 class UnitTestImpl* GetUnitTestImpl();
 void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
                                     const std::string& message);
+std::set<std::string>* GetIgnoredParameterizedTestSuites();
 
 }  // namespace internal
 
@@ -17583,173 +196,31 @@ void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
 // If we don't forward declare them the compiler might confuse the classes
 // in friendship clauses with same named classes on the scope.
 class Test;
-class TestCase;
+class TestSuite;
+
+// Old API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+using TestCase = TestSuite;
+#endif
 class TestInfo;
 class UnitTest;
 
-// A class for indicating whether an assertion was successful.  When
-// the assertion wasn't successful, the AssertionResult object
-// remembers a non-empty message that describes how it failed.
-//
-// To create an instance of this class, use one of the factory functions
-// (AssertionSuccess() and AssertionFailure()).
-//
-// This class is useful for two purposes:
-//   1. Defining predicate functions to be used with Boolean test assertions
-//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
-//   2. Defining predicate-format functions to be
-//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
-//
-// For example, if you define IsEven predicate:
-//
-//   testing::AssertionResult IsEven(int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess();
-//     else
-//       return testing::AssertionFailure() << n << " is odd";
-//   }
-//
-// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
-// will print the message
-//
-//   Value of: IsEven(Fib(5))
-//     Actual: false (5 is odd)
-//   Expected: true
-//
-// instead of a more opaque
-//
-//   Value of: IsEven(Fib(5))
-//     Actual: false
-//   Expected: true
-//
-// in case IsEven is a simple Boolean predicate.
-//
-// If you expect your predicate to be reused and want to support informative
-// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
-// about half as often as positive ones in our tests), supply messages for
-// both success and failure cases:
-//
-//   testing::AssertionResult IsEven(int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess() << n << " is even";
-//     else
-//       return testing::AssertionFailure() << n << " is odd";
-//   }
-//
-// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
-//
-//   Value of: IsEven(Fib(6))
-//     Actual: true (8 is even)
-//   Expected: false
-//
-// NB: Predicates that support negative Boolean assertions have reduced
-// performance in positive ones so be careful not to use them in tests
-// that have lots (tens of thousands) of positive Boolean assertions.
-//
-// To use this class with EXPECT_PRED_FORMAT assertions such as:
-//
-//   // Verifies that Foo() returns an even number.
-//   EXPECT_PRED_FORMAT1(IsEven, Foo());
-//
-// you need to define:
-//
-//   testing::AssertionResult IsEven(const char* expr, int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess();
-//     else
-//       return testing::AssertionFailure()
-//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
-//   }
-//
-// If Foo() returns 5, you will see the following message:
-//
-//   Expected: Foo() is even
-//     Actual: it's 5
-//
-class GTEST_API_ AssertionResult {
- public:
-  // Copy constructor.
-  // Used in EXPECT_TRUE/FALSE(assertion_result).
-  AssertionResult(const AssertionResult& other);
-  // Used in the EXPECT_TRUE/FALSE(bool_expression).
-  explicit AssertionResult(bool success) : success_(success) {}
-
-  // Returns true iff the assertion succeeded.
-  operator bool() const { return success_; }  // NOLINT
-
-  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-  AssertionResult operator!() const;
-
-  // Returns the text streamed into this AssertionResult. Test assertions
-  // use it when they fail (i.e., the predicate's outcome doesn't match the
-  // assertion's expectation). When nothing has been streamed into the
-  // object, returns an empty string.
-  const char* message() const {
-    return message_.get() != NULL ?  message_->c_str() : "";
-  }
-  // TODO(vladl@google.com): Remove this after making sure no clients use it.
-  // Deprecated; please use message() instead.
-  const char* failure_message() const { return message(); }
-
-  // Streams a custom failure message into this object.
-  template <typename T> AssertionResult& operator<<(const T& value) {
-    AppendMessage(Message() << value);
-    return *this;
-  }
-
-  // Allows streaming basic output manipulators such as endl or flush into
-  // this object.
-  AssertionResult& operator<<(
-      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
-    AppendMessage(Message() << basic_manipulator);
-    return *this;
-  }
-
- private:
-  // Appends the contents of message to message_.
-  void AppendMessage(const Message& a_message) {
-    if (message_.get() == NULL)
-      message_.reset(new ::std::string);
-    message_->append(a_message.GetString().c_str());
-  }
-
-  // Stores result of the assertion predicate.
-  bool success_;
-  // Stores the message describing the condition in case the expectation
-  // construct is not satisfied with the predicate's outcome.
-  // Referenced via a pointer to avoid taking too much stack frame space
-  // with test assertions.
-  internal::scoped_ptr< ::std::string> message_;
-
-  GTEST_DISALLOW_ASSIGN_(AssertionResult);
-};
-
-// Makes a successful assertion result.
-GTEST_API_ AssertionResult AssertionSuccess();
-
-// Makes a failed assertion result.
-GTEST_API_ AssertionResult AssertionFailure();
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
-
 // The abstract class that all tests inherit from.
 //
-// In Google Test, a unit test program contains one or many TestCases, and
-// each TestCase contains one or many Tests.
+// In Google Test, a unit test program contains one or many TestSuites, and
+// each TestSuite contains one or many Tests.
 //
 // When you define a test using the TEST macro, you don't need to
 // explicitly derive from Test - the TEST macro automatically does
 // this for you.
 //
 // The only time you derive from Test is when defining a test fixture
-// to be used a TEST_F.  For example:
+// to be used in a TEST_F.  For example:
 //
 //   class FooTest : public testing::Test {
 //    protected:
-//     virtual void SetUp() { ... }
-//     virtual void TearDown() { ... }
+//     void SetUp() override { ... }
+//     void TearDown() override { ... }
 //     ...
 //   };
 //
@@ -17761,49 +232,54 @@ class GTEST_API_ Test {
  public:
   friend class TestInfo;
 
-  // Defines types for pointers to functions that set up and tear down
-  // a test case.
-  typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
-  typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
-
   // The d'tor is virtual as we intend to inherit from Test.
   virtual ~Test();
 
-  // Sets up the stuff shared by all tests in this test case.
+  // Sets up the stuff shared by all tests in this test suite.
   //
-  // Google Test will call Foo::SetUpTestCase() before running the first
-  // test in test case Foo.  Hence a sub-class can define its own
-  // SetUpTestCase() method to shadow the one defined in the super
+  // Google Test will call Foo::SetUpTestSuite() before running the first
+  // test in test suite Foo.  Hence a sub-class can define its own
+  // SetUpTestSuite() method to shadow the one defined in the super
   // class.
-  static void SetUpTestCase() {}
+  static void SetUpTestSuite() {}
 
-  // Tears down the stuff shared by all tests in this test case.
+  // Tears down the stuff shared by all tests in this test suite.
   //
-  // Google Test will call Foo::TearDownTestCase() after running the last
-  // test in test case Foo.  Hence a sub-class can define its own
-  // TearDownTestCase() method to shadow the one defined in the super
+  // Google Test will call Foo::TearDownTestSuite() after running the last
+  // test in test suite Foo.  Hence a sub-class can define its own
+  // TearDownTestSuite() method to shadow the one defined in the super
   // class.
+  static void TearDownTestSuite() {}
+
+  // Legacy API is deprecated but still available. Use SetUpTestSuite and
+  // TearDownTestSuite instead.
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   static void TearDownTestCase() {}
+  static void SetUpTestCase() {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-  // Returns true iff the current test has a fatal failure.
+  // Returns true if and only if the current test has a fatal failure.
   static bool HasFatalFailure();
 
-  // Returns true iff the current test has a non-fatal failure.
+  // Returns true if and only if the current test has a non-fatal failure.
   static bool HasNonfatalFailure();
 
-  // Returns true iff the current test has a (either fatal or
+  // Returns true if and only if the current test was skipped.
+  static bool IsSkipped();
+
+  // Returns true if and only if the current test has a (either fatal or
   // non-fatal) failure.
   static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
 
-  // Logs a property for the current test, test case, or for the entire
+  // Logs a property for the current test, test suite, or for the entire
   // invocation of the test program when used outside of the context of a
-  // test case.  Only the last value for a given key is remembered.  These
+  // test suite.  Only the last value for a given key is remembered.  These
   // are public static so they can be called from utility functions that are
   // not members of the test fixture.  Calls to RecordProperty made during
   // lifespan of the test (from the moment its constructor starts to the
   // moment its destructor finishes) will be output in XML as attributes of
   // the <testcase> element.  Properties recorded from fixture's
-  // SetUpTestCase or TearDownTestCase are logged as attributes of the
+  // SetUpTestSuite or TearDownTestSuite are logged as attributes of the
   // corresponding <testsuite> element.  Calls to RecordProperty made in the
   // global context (before or after invocation of RUN_ALL_TESTS and from
   // SetUp/TearDown method of Environment objects registered with Google
@@ -17822,8 +298,8 @@ class GTEST_API_ Test {
   virtual void TearDown();
 
  private:
-  // Returns true iff the current test has the same fixture class as
-  // the first test in the current test case.
+  // Returns true if and only if the current test has the same fixture class
+  // as the first test in the current test suite.
   static bool HasSameFixtureClass();
 
   // Runs the test after the test fixture has been set up.
@@ -17841,30 +317,30 @@ class GTEST_API_ Test {
   // internal method to avoid clashing with names used in user TESTs.
   void DeleteSelf_() { delete this; }
 
-  // Uses a GTestFlagSaver to save and restore all Google Test flags.
-  const internal::GTestFlagSaver* const gtest_flag_saver_;
+  const std::unique_ptr<GTEST_FLAG_SAVER_> gtest_flag_saver_;
 
-  // Often a user mis-spells SetUp() as Setup() and spends a long time
+  // Often a user misspells SetUp() as Setup() and spends a long time
   // wondering why it is never called by Google Test.  The declaration of
   // the following method is solely for catching such an error at
   // compile time:
   //
   //   - The return type is deliberately chosen to be not void, so it
-  //   will be a conflict if a user declares void Setup() in his test
-  //   fixture.
+  //   will be a conflict if void Setup() is declared in the user's
+  //   test fixture.
   //
   //   - This method is private, so it will be another compiler error
-  //   if a user calls it from his test fixture.
+  //   if the method is called from the user's test fixture.
   //
   // DO NOT OVERRIDE THIS FUNCTION.
   //
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
   struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+  virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
 
   // We disallow copying Tests.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+  Test(const Test&) = delete;
+  Test& operator=(const Test&) = delete;
 };
 
 typedef internal::TimeInMillis TimeInMillis;
@@ -17878,24 +354,17 @@ class TestProperty {
   // C'tor.  TestProperty does NOT have a default constructor.
   // Always use this constructor (with parameters) to create a
   // TestProperty object.
-  TestProperty(const std::string& a_key, const std::string& a_value) :
-    key_(a_key), value_(a_value) {
-  }
+  TestProperty(const std::string& a_key, const std::string& a_value)
+      : key_(a_key), value_(a_value) {}
 
   // Gets the user supplied key.
-  const char* key() const {
-    return key_.c_str();
-  }
+  const char* key() const { return key_.c_str(); }
 
   // Gets the user supplied value.
-  const char* value() const {
-    return value_.c_str();
-  }
+  const char* value() const { return value_.c_str(); }
 
   // Sets a new value, overriding the one supplied in the constructor.
-  void SetValue(const std::string& new_value) {
-    value_ = new_value;
-  }
+  void SetValue(const std::string& new_value) { value_ = new_value; }
 
  private:
   // The key supplied by the user.
@@ -17925,24 +394,30 @@ class GTEST_API_ TestResult {
   // Returns the number of the test properties.
   int test_property_count() const;
 
-  // Returns true iff the test passed (i.e. no test part failed).
-  bool Passed() const { return !Failed(); }
+  // Returns true if and only if the test passed (i.e. no test part failed).
+  bool Passed() const { return !Skipped() && !Failed(); }
 
-  // Returns true iff the test failed.
+  // Returns true if and only if the test was skipped.
+  bool Skipped() const;
+
+  // Returns true if and only if the test failed.
   bool Failed() const;
 
-  // Returns true iff the test fatally failed.
+  // Returns true if and only if the test fatally failed.
   bool HasFatalFailure() const;
 
-  // Returns true iff the test has a non-fatal failure.
+  // Returns true if and only if the test has a non-fatal failure.
   bool HasNonfatalFailure() const;
 
   // Returns the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const { return elapsed_time_; }
 
-  // Returns the i-th test part result among all the results. i can range
-  // from 0 to test_property_count() - 1. If i is not in that range, aborts
-  // the program.
+  // Gets the time of the test case start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Returns the i-th test part result among all the results. i can range from 0
+  // to total_part_count() - 1. If i is not in that range, aborts the program.
   const TestPartResult& GetTestPartResult(int i) const;
 
   // Returns the i-th test property. i can range from 0 to
@@ -17952,13 +427,14 @@ class GTEST_API_ TestResult {
 
  private:
   friend class TestInfo;
-  friend class TestCase;
+  friend class TestSuite;
   friend class UnitTest;
   friend class internal::DefaultGlobalTestPartResultReporter;
   friend class internal::ExecDeathTest;
   friend class internal::TestResultAccessor;
   friend class internal::UnitTestImpl;
   friend class internal::WindowsDeathTest;
+  friend class internal::FuchsiaDeathTest;
 
   // Gets the vector of TestPartResults.
   const std::vector<TestPartResult>& test_part_results() const {
@@ -17970,6 +446,9 @@ class GTEST_API_ TestResult {
     return test_properties_;
   }
 
+  // Sets the start time.
+  void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; }
+
   // Sets the elapsed time.
   void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
 
@@ -17983,8 +462,8 @@ class GTEST_API_ TestResult {
                       const TestProperty& test_property);
 
   // Adds a failure if the key is a reserved attribute of Google Test
-  // testcase tags.  Returns true if the property is valid.
-  // TODO(russr): Validate attribute names are legal and human readable.
+  // testsuite tags.  Returns true if the property is valid.
+  // FIXME: Validate attribute names are legal and human readable.
   static bool ValidateTestProperty(const std::string& xml_element,
                                    const TestProperty& test_property);
 
@@ -18005,7 +484,7 @@ class GTEST_API_ TestResult {
 
   // Protects mutable state of the property vector and of owned
   // properties, whose values may be updated.
-  internal::Mutex test_properites_mutex_;
+  internal::Mutex test_properties_mutex_;
 
   // The vector of TestPartResults
   std::vector<TestPartResult> test_part_results_;
@@ -18013,16 +492,19 @@ class GTEST_API_ TestResult {
   std::vector<TestProperty> test_properties_;
   // Running count of death tests.
   int death_test_count_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
   // The elapsed time, in milliseconds.
   TimeInMillis elapsed_time_;
 
   // We disallow copying TestResult.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+  TestResult(const TestResult&) = delete;
+  TestResult& operator=(const TestResult&) = delete;
 };  // class TestResult
 
 // A TestInfo object stores the following information about a test:
 //
-//   Test case name
+//   Test suite name
 //   Test name
 //   Whether the test should be run
 //   A function pointer that creates the test object when invoked
@@ -18037,8 +519,13 @@ class GTEST_API_ TestInfo {
   // don't inherit from TestInfo.
   ~TestInfo();
 
-  // Returns the test case name.
-  const char* test_case_name() const { return test_case_name_.c_str(); }
+  // Returns the test suite name.
+  const char* test_suite_name() const { return test_suite_name_.c_str(); }
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const char* test_case_name() const { return test_suite_name(); }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Returns the test name.
   const char* name() const { return name_.c_str(); }
@@ -18046,25 +533,32 @@ class GTEST_API_ TestInfo {
   // Returns the name of the parameter type, or NULL if this is not a typed
   // or a type-parameterized test.
   const char* type_param() const {
-    if (type_param_.get() != NULL)
-      return type_param_->c_str();
-    return NULL;
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
   }
 
   // Returns the text representation of the value parameter, or NULL if this
   // is not a value-parameterized test.
   const char* value_param() const {
-    if (value_param_.get() != NULL)
-      return value_param_->c_str();
-    return NULL;
+    if (value_param_.get() != nullptr) return value_param_->c_str();
+    return nullptr;
   }
 
+  // Returns the file name where this test is defined.
+  const char* file() const { return location_.file.c_str(); }
+
+  // Returns the line where this test is defined.
+  int line() const { return location_.line; }
+
+  // Return true if this test should not be run because it's in another shard.
+  bool is_in_another_shard() const { return is_in_another_shard_; }
+
   // Returns true if this test should run, that is if the test is not
   // disabled (or it is disabled but the also_run_disabled_tests flag has
   // been specified) and its full name matches the user-specified filter.
   //
   // Google Test allows the user to filter the tests by their full names.
-  // The full name of a test Bar in test case Foo is defined as
+  // The full name of a test Bar in test suite Foo is defined as
   // "Foo.Bar".  Only the tests that match the filter will run.
   //
   // A filter is a colon-separated list of glob (not regex) patterns,
@@ -18077,12 +571,11 @@ class GTEST_API_ TestInfo {
   // contains the character 'A' or starts with "Foo.".
   bool should_run() const { return should_run_; }
 
-  // Returns true iff this test will appear in the XML report.
+  // Returns true if and only if this test will appear in the XML report.
   bool is_reportable() const {
-    // For now, the XML report includes all tests matching the filter.
-    // In the future, we may trim tests that are excluded because of
-    // sharding.
-    return matches_filter_;
+    // The XML report includes tests matching the filter, excluding those
+    // run in other shards.
+    return matches_filter_ && !is_in_another_shard_;
   }
 
   // Returns the result of the test.
@@ -18093,25 +586,22 @@ class GTEST_API_ TestInfo {
   friend class internal::DefaultDeathTestFactory;
 #endif  // GTEST_HAS_DEATH_TEST
   friend class Test;
-  friend class TestCase;
+  friend class TestSuite;
   friend class internal::UnitTestImpl;
   friend class internal::StreamingListenerTest;
   friend TestInfo* internal::MakeAndRegisterTestInfo(
-      const char* test_case_name,
-      const char* name,
-      const char* type_param,
-      const char* value_param,
-      internal::TypeId fixture_class_id,
-      Test::SetUpTestCaseFunc set_up_tc,
-      Test::TearDownTestCaseFunc tear_down_tc,
+      const char* test_suite_name, const char* name, const char* type_param,
+      const char* value_param, internal::CodeLocation code_location,
+      internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
+      internal::TearDownTestSuiteFunc tear_down_tc,
       internal::TestFactoryBase* factory);
 
   // Constructs a TestInfo object. The newly constructed instance assumes
   // ownership of the factory object.
-  TestInfo(const std::string& test_case_name,
-           const std::string& name,
+  TestInfo(const std::string& test_suite_name, const std::string& name,
            const char* a_type_param,   // NULL if not a type-parameterized test
            const char* a_value_param,  // NULL if not a value-parameterized test
+           internal::CodeLocation a_code_location,
            internal::TypeId fixture_class_id,
            internal::TestFactoryBase* factory);
 
@@ -18125,24 +615,29 @@ class GTEST_API_ TestInfo {
   // deletes it.
   void Run();
 
+  // Skip and records the test result for this object.
+  void Skip();
+
   static void ClearTestResult(TestInfo* test_info) {
     test_info->result_.Clear();
   }
 
   // These fields are immutable properties of the test.
-  const std::string test_case_name_;     // Test case name
-  const std::string name_;               // Test name
+  const std::string test_suite_name_;  // test suite name
+  const std::string name_;             // Test name
   // Name of the parameter type, or NULL if this is not a typed or a
   // type-parameterized test.
-  const internal::scoped_ptr<const ::std::string> type_param_;
+  const std::unique_ptr<const ::std::string> type_param_;
   // Text representation of the value parameter, or NULL if this is not a
   // value-parameterized test.
-  const internal::scoped_ptr<const ::std::string> value_param_;
-  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
-  bool should_run_;                 // True iff this test should run
-  bool is_disabled_;                // True iff this test is disabled
-  bool matches_filter_;             // True if this test matches the
-                                    // user-specified filter.
+  const std::unique_ptr<const ::std::string> value_param_;
+  internal::CodeLocation location_;
+  const internal::TypeId fixture_class_id_;  // ID of the test fixture class
+  bool should_run_;           // True if and only if this test should run
+  bool is_disabled_;          // True if and only if this test is disabled
+  bool matches_filter_;       // True if this test matches the
+                              // user-specified filter.
+  bool is_in_another_shard_;  // Will be run in another shard.
   internal::TestFactoryBase* const factory_;  // The factory that creates
                                               // the test object
 
@@ -18150,93 +645,102 @@ class GTEST_API_ TestInfo {
   // test for the second time.
   TestResult result_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+  TestInfo(const TestInfo&) = delete;
+  TestInfo& operator=(const TestInfo&) = delete;
 };
 
-// A test case, which consists of a vector of TestInfos.
+// A test suite, which consists of a vector of TestInfos.
 //
-// TestCase is not copyable.
-class GTEST_API_ TestCase {
+// TestSuite is not copyable.
+class GTEST_API_ TestSuite {
  public:
-  // Creates a TestCase with the given name.
+  // Creates a TestSuite with the given name.
   //
-  // TestCase does NOT have a default constructor.  Always use this
-  // constructor to create a TestCase object.
+  // TestSuite does NOT have a default constructor.  Always use this
+  // constructor to create a TestSuite object.
   //
   // Arguments:
   //
-  //   name:         name of the test case
+  //   name:         name of the test suite
   //   a_type_param: the name of the test's type parameter, or NULL if
   //                 this is not a type-parameterized test.
-  //   set_up_tc:    pointer to the function that sets up the test case
-  //   tear_down_tc: pointer to the function that tears down the test case
-  TestCase(const char* name, const char* a_type_param,
-           Test::SetUpTestCaseFunc set_up_tc,
-           Test::TearDownTestCaseFunc tear_down_tc);
+  //   set_up_tc:    pointer to the function that sets up the test suite
+  //   tear_down_tc: pointer to the function that tears down the test suite
+  TestSuite(const char* name, const char* a_type_param,
+            internal::SetUpTestSuiteFunc set_up_tc,
+            internal::TearDownTestSuiteFunc tear_down_tc);
 
-  // Destructor of TestCase.
-  virtual ~TestCase();
+  // Destructor of TestSuite.
+  virtual ~TestSuite();
 
-  // Gets the name of the TestCase.
+  // Gets the name of the TestSuite.
   const char* name() const { return name_.c_str(); }
 
   // Returns the name of the parameter type, or NULL if this is not a
-  // type-parameterized test case.
+  // type-parameterized test suite.
   const char* type_param() const {
-    if (type_param_.get() != NULL)
-      return type_param_->c_str();
-    return NULL;
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
   }
 
-  // Returns true if any test in this test case should run.
+  // Returns true if any test in this test suite should run.
   bool should_run() const { return should_run_; }
 
-  // Gets the number of successful tests in this test case.
+  // Gets the number of successful tests in this test suite.
   int successful_test_count() const;
 
-  // Gets the number of failed tests in this test case.
+  // Gets the number of skipped tests in this test suite.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests in this test suite.
   int failed_test_count() const;
 
   // Gets the number of disabled tests that will be reported in the XML report.
   int reportable_disabled_test_count() const;
 
-  // Gets the number of disabled tests in this test case.
+  // Gets the number of disabled tests in this test suite.
   int disabled_test_count() const;
 
   // Gets the number of tests to be printed in the XML report.
   int reportable_test_count() const;
 
-  // Get the number of tests in this test case that should run.
+  // Get the number of tests in this test suite that should run.
   int test_to_run_count() const;
 
-  // Gets the number of all tests in this test case.
+  // Gets the number of all tests in this test suite.
   int total_test_count() const;
 
-  // Returns true iff the test case passed.
+  // Returns true if and only if the test suite passed.
   bool Passed() const { return !Failed(); }
 
-  // Returns true iff the test case failed.
-  bool Failed() const { return failed_test_count() > 0; }
+  // Returns true if and only if the test suite failed.
+  bool Failed() const {
+    return failed_test_count() > 0 || ad_hoc_test_result().Failed();
+  }
 
   // Returns the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const { return elapsed_time_; }
 
+  // Gets the time of the test suite start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
   // Returns the i-th test among all the tests. i can range from 0 to
   // total_test_count() - 1. If i is not in that range, returns NULL.
   const TestInfo* GetTestInfo(int i) const;
 
   // Returns the TestResult that holds test properties recorded during
-  // execution of SetUpTestCase and TearDownTestCase.
+  // execution of SetUpTestSuite and TearDownTestSuite.
   const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
 
  private:
   friend class Test;
   friend class internal::UnitTestImpl;
 
-  // Gets the (mutable) vector of TestInfos in this TestCase.
+  // Gets the (mutable) vector of TestInfos in this TestSuite.
   std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
 
-  // Gets the (immutable) vector of TestInfos in this TestCase.
+  // Gets the (immutable) vector of TestInfos in this TestSuite.
   const std::vector<TestInfo*>& test_info_list() const {
     return test_info_list_;
   }
@@ -18248,51 +752,67 @@ class GTEST_API_ TestCase {
   // Sets the should_run member.
   void set_should_run(bool should) { should_run_ = should; }
 
-  // Adds a TestInfo to this test case.  Will delete the TestInfo upon
-  // destruction of the TestCase object.
-  void AddTestInfo(TestInfo * test_info);
+  // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
+  // destruction of the TestSuite object.
+  void AddTestInfo(TestInfo* test_info);
 
-  // Clears the results of all tests in this test case.
+  // Clears the results of all tests in this test suite.
   void ClearResult();
 
-  // Clears the results of all tests in the given test case.
-  static void ClearTestCaseResult(TestCase* test_case) {
-    test_case->ClearResult();
+  // Clears the results of all tests in the given test suite.
+  static void ClearTestSuiteResult(TestSuite* test_suite) {
+    test_suite->ClearResult();
   }
 
-  // Runs every test in this TestCase.
+  // Runs every test in this TestSuite.
   void Run();
 
-  // Runs SetUpTestCase() for this TestCase.  This wrapper is needed
-  // for catching exceptions thrown from SetUpTestCase().
-  void RunSetUpTestCase() { (*set_up_tc_)(); }
+  // Skips the execution of tests under this TestSuite
+  void Skip();
 
-  // Runs TearDownTestCase() for this TestCase.  This wrapper is
-  // needed for catching exceptions thrown from TearDownTestCase().
-  void RunTearDownTestCase() { (*tear_down_tc_)(); }
+  // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestSuite().
+  void RunSetUpTestSuite() {
+    if (set_up_tc_ != nullptr) {
+      (*set_up_tc_)();
+    }
+  }
 
-  // Returns true iff test passed.
+  // Runs TearDownTestSuite() for this TestSuite.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestSuite().
+  void RunTearDownTestSuite() {
+    if (tear_down_tc_ != nullptr) {
+      (*tear_down_tc_)();
+    }
+  }
+
+  // Returns true if and only if test passed.
   static bool TestPassed(const TestInfo* test_info) {
     return test_info->should_run() && test_info->result()->Passed();
   }
 
-  // Returns true iff test failed.
+  // Returns true if and only if test skipped.
+  static bool TestSkipped(const TestInfo* test_info) {
+    return test_info->should_run() && test_info->result()->Skipped();
+  }
+
+  // Returns true if and only if test failed.
   static bool TestFailed(const TestInfo* test_info) {
     return test_info->should_run() && test_info->result()->Failed();
   }
 
-  // Returns true iff the test is disabled and will be reported in the XML
-  // report.
+  // Returns true if and only if the test is disabled and will be reported in
+  // the XML report.
   static bool TestReportableDisabled(const TestInfo* test_info) {
     return test_info->is_reportable() && test_info->is_disabled_;
   }
 
-  // Returns true iff test is disabled.
+  // Returns true if and only if test is disabled.
   static bool TestDisabled(const TestInfo* test_info) {
     return test_info->is_disabled_;
   }
 
-  // Returns true iff this test will appear in the XML report.
+  // Returns true if and only if this test will appear in the XML report.
   static bool TestReportable(const TestInfo* test_info) {
     return test_info->is_reportable();
   }
@@ -18302,17 +822,17 @@ class GTEST_API_ TestCase {
     return test_info->should_run();
   }
 
-  // Shuffles the tests in this test case.
+  // Shuffles the tests in this test suite.
   void ShuffleTests(internal::Random* random);
 
   // Restores the test order to before the first shuffle.
   void UnshuffleTests();
 
-  // Name of the test case.
+  // Name of the test suite.
   std::string name_;
   // Name of the parameter type, or NULL if this is not a typed or a
   // type-parameterized test.
-  const internal::scoped_ptr<const ::std::string> type_param_;
+  const std::unique_ptr<const ::std::string> type_param_;
   // The vector of TestInfos in their original order.  It owns the
   // elements in the vector.
   std::vector<TestInfo*> test_info_list_;
@@ -18320,24 +840,27 @@ class GTEST_API_ TestCase {
   // shuffling and restoring the test order.  The i-th element in this
   // vector is the index of the i-th test in the shuffled test list.
   std::vector<int> test_indices_;
-  // Pointer to the function that sets up the test case.
-  Test::SetUpTestCaseFunc set_up_tc_;
-  // Pointer to the function that tears down the test case.
-  Test::TearDownTestCaseFunc tear_down_tc_;
-  // True iff any test in this test case should run.
+  // Pointer to the function that sets up the test suite.
+  internal::SetUpTestSuiteFunc set_up_tc_;
+  // Pointer to the function that tears down the test suite.
+  internal::TearDownTestSuiteFunc tear_down_tc_;
+  // True if and only if any test in this test suite should run.
   bool should_run_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
   // Elapsed time, in milliseconds.
   TimeInMillis elapsed_time_;
-  // Holds test properties recorded during execution of SetUpTestCase and
-  // TearDownTestCase.
+  // Holds test properties recorded during execution of SetUpTestSuite and
+  // TearDownTestSuite.
   TestResult ad_hoc_test_result_;
 
-  // We disallow copying TestCases.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+  // We disallow copying TestSuites.
+  TestSuite(const TestSuite&) = delete;
+  TestSuite& operator=(const TestSuite&) = delete;
 };
 
 // An Environment object is capable of setting up and tearing down an
-// environment.  The user should subclass this to define his own
+// environment.  You should subclass this to define your own
 // environment(s).
 //
 // An Environment object does the set-up and tear-down in virtual
@@ -18360,13 +883,26 @@ class Environment {
 
   // Override this to define how to tear down the environment.
   virtual void TearDown() {}
+
  private:
   // If you see an error about overriding the following function or
   // about it being private, you have mis-spelled SetUp() as Setup().
   struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+  virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
 };
 
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception which can be thrown from TestEventListener::OnTestPartResult.
+class GTEST_API_ AssertionException
+    : public internal::GoogleTestFailureException {
+ public:
+  explicit AssertionException(const TestPartResult& result)
+      : GoogleTestFailureException(result) {}
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
 // The interface for tracing execution of tests. The methods are organized in
 // the order the corresponding events are fired.
 class TestEventListener {
@@ -18388,20 +924,35 @@ class TestEventListener {
   // Fired after environment set-up for each iteration of tests ends.
   virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
 
-  // Fired before the test case starts.
-  virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+  // Fired before the test suite starts.
+  virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {}
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Fired before the test starts.
   virtual void OnTestStart(const TestInfo& test_info) = 0;
 
+  // Fired when a test is disabled
+  virtual void OnTestDisabled(const TestInfo& /*test_info*/) {}
+
   // Fired after a failed assertion or a SUCCEED() invocation.
+  // If you want to throw an exception from this function to skip to the next
+  // TEST, it must be AssertionException defined above, or inherited from it.
   virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
 
   // Fired after the test ends.
   virtual void OnTestEnd(const TestInfo& test_info) = 0;
 
-  // Fired after the test case ends.
-  virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+  // Fired after the test suite ends.
+  virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {}
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Fired before environment tear-down for each iteration of tests starts.
   virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
@@ -18410,8 +961,7 @@ class TestEventListener {
   virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
 
   // Fired after each iteration of tests finishes.
-  virtual void OnTestIterationEnd(const UnitTest& unit_test,
-                                  int iteration) = 0;
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0;
 
   // Fired after all test activities have ended.
   virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
@@ -18424,21 +974,31 @@ class TestEventListener {
 // above.
 class EmptyTestEventListener : public TestEventListener {
  public:
-  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
-                                    int /*iteration*/) {}
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
-  virtual void OnTestStart(const TestInfo& /*test_info*/) {}
-  virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
-  virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
-  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
-                                  int /*iteration*/) {}
-  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnTestStart(const TestInfo& /*test_info*/) override {}
+  void OnTestDisabled(const TestInfo& /*test_info*/) override {}
+  void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
+  void OnTestEnd(const TestInfo& /*test_info*/) override {}
+  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+                          int /*iteration*/) override {}
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
 };
 
 // TestEventListeners lets users add listeners to track events in Google Test.
@@ -18478,7 +1038,7 @@ class GTEST_API_ TestEventListeners {
   }
 
  private:
-  friend class TestCase;
+  friend class TestSuite;
   friend class TestInfo;
   friend class internal::DefaultGlobalTestPartResultReporter;
   friend class internal::NoExecDeathTest;
@@ -18516,10 +1076,11 @@ class GTEST_API_ TestEventListeners {
   TestEventListener* default_xml_generator_;
 
   // We disallow copying TestEventListeners.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+  TestEventListeners(const TestEventListeners&) = delete;
+  TestEventListeners& operator=(const TestEventListeners&) = delete;
 };
 
-// A UnitTest consists of a vector of TestCases.
+// A UnitTest consists of a vector of TestSuites.
 //
 // This is a singleton class.  The only instance of UnitTest is
 // created when UnitTest::GetInstance() is first called.  This
@@ -18548,44 +1109,56 @@ class GTEST_API_ UnitTest {
   // was executed.  The UnitTest object owns the string.
   const char* original_working_dir() const;
 
-  // Returns the TestCase object for the test that's currently running,
+  // Returns the TestSuite object for the test that's currently running,
   // or NULL if no test is running.
-  const TestCase* current_test_case() const
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
+#endif
 
   // Returns the TestInfo object for the test that's currently running,
   // or NULL if no test is running.
-  const TestInfo* current_test_info() const
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Returns the random seed used at the start of the current test run.
   int random_seed() const;
 
-#if GTEST_HAS_PARAM_TEST
-  // Returns the ParameterizedTestCaseRegistry object used to keep track of
+  // Returns the ParameterizedTestSuiteRegistry object used to keep track of
   // value-parameterized tests and instantiate and register them.
   //
   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+  internal::ParameterizedTestSuiteRegistry& parameterized_test_registry()
       GTEST_LOCK_EXCLUDED_(mutex_);
-#endif  // GTEST_HAS_PARAM_TEST
 
-  // Gets the number of successful test cases.
-  int successful_test_case_count() const;
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
 
-  // Gets the number of failed test cases.
-  int failed_test_case_count() const;
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
 
-  // Gets the number of all test cases.
-  int total_test_case_count() const;
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
 
-  // Gets the number of all test cases that contain at least one test
+  // Gets the number of all test suites that contain at least one test
   // that should run.
+  int test_suite_to_run_count() const;
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  int successful_test_case_count() const;
+  int failed_test_case_count() const;
+  int total_test_case_count() const;
   int test_case_to_run_count() const;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Gets the number of successful tests.
   int successful_test_count() const;
 
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
   // Gets the number of failed tests.
   int failed_test_count() const;
 
@@ -18611,19 +1184,25 @@ class GTEST_API_ UnitTest {
   // Gets the elapsed time, in milliseconds.
   TimeInMillis elapsed_time() const;
 
-  // Returns true iff the unit test passed (i.e. all test cases passed).
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
   bool Passed() const;
 
-  // Returns true iff the unit test failed (i.e. some test case failed
-  // or something outside of all tests failed).
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
   bool Failed() const;
 
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  const TestSuite* GetTestSuite(int i) const;
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   const TestCase* GetTestCase(int i) const;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   // Returns the TestResult containing information on test failures and
-  // properties logged outside of individual test cases.
+  // properties logged outside of individual test suites.
   const TestResult& ad_hoc_test_result() const;
 
   // Returns the list of event listeners that can be used to track events
@@ -18647,39 +1226,38 @@ class GTEST_API_ UnitTest {
   // eventually call this to report their results.  The user code
   // should use the assertion macros instead of calling this directly.
   void AddTestPartResult(TestPartResult::Type result_type,
-                         const char* file_name,
-                         int line_number,
+                         const char* file_name, int line_number,
                          const std::string& message,
                          const std::string& os_stack_trace)
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Adds a TestProperty to the current TestResult object when invoked from
-  // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
-  // from SetUpTestCase or TearDownTestCase, or to the global property set
+  // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+  // from SetUpTestSuite or TearDownTestSuite, or to the global property set
   // when invoked elsewhere.  If the result already contains a property with
   // the same key, the value will be updated.
   void RecordProperty(const std::string& key, const std::string& value);
 
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  TestCase* GetMutableTestCase(int i);
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  TestSuite* GetMutableTestSuite(int i);
 
   // Accessors for the implementation object.
   internal::UnitTestImpl* impl() { return impl_; }
   const internal::UnitTestImpl* impl() const { return impl_; }
 
-  // These classes and funcions are friends as they need to access private
+  // These classes and functions are friends as they need to access private
   // members of UnitTest.
+  friend class ScopedTrace;
   friend class Test;
   friend class internal::AssertHelper;
-  friend class internal::ScopedTrace;
   friend class internal::StreamingListenerTest;
   friend class internal::UnitTestRecordPropertyTestHelper;
   friend Environment* AddGlobalTestEnvironment(Environment* env);
+  friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
   friend internal::UnitTestImpl* internal::GetUnitTestImpl();
   friend void internal::ReportFailureInUnknownLocation(
-      TestPartResult::Type result_type,
-      const std::string& message);
+      TestPartResult::Type result_type, const std::string& message);
 
   // Creates an empty UnitTest.
   UnitTest();
@@ -18693,8 +1271,7 @@ class GTEST_API_ UnitTest {
       GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Pops a trace from the per-thread Google Test trace stack.
-  void PopGTestTrace()
-      GTEST_LOCK_EXCLUDED_(mutex_);
+  void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
 
   // Protects mutable state in *impl_.  This is mutable as some const
   // methods need to lock it too.
@@ -18707,7 +1284,8 @@ class GTEST_API_ UnitTest {
   internal::UnitTestImpl* impl_;
 
   // We disallow copying UnitTest.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+  UnitTest(const UnitTest&) = delete;
+  UnitTest& operator=(const UnitTest&) = delete;
 };
 
 // A convenient wrapper for adding an environment for the test
@@ -18747,155 +1325,56 @@ GTEST_API_ void InitGoogleTest(int* argc, char** argv);
 // UNICODE mode.
 GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
 
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleTest();
+
 namespace internal {
 
-// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
-// value of type ToPrint that is an operand of a comparison assertion
-// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
-// the comparison, and is used to help determine the best way to
-// format the value.  In particular, when the value is a C string
-// (char pointer) and the other operand is an STL string object, we
-// want to format the C string as a string, since we know it is
-// compared by value with the string object.  If the value is a char
-// pointer but the other operand is not an STL string object, we don't
-// know whether the pointer is supposed to point to a NUL-terminated
-// string, and thus want to print it as a pointer to be safe.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-
-// The default case.
-template <typename ToPrint, typename OtherOperand>
-class FormatForComparison {
- public:
-  static ::std::string Format(const ToPrint& value) {
-    return ::testing::PrintToString(value);
-  }
-};
-
-// Array.
-template <typename ToPrint, size_t N, typename OtherOperand>
-class FormatForComparison<ToPrint[N], OtherOperand> {
- public:
-  static ::std::string Format(const ToPrint* value) {
-    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
-  }
-};
-
-// By default, print C string as pointers to be safe, as we don't know
-// whether they actually point to a NUL-terminated string.
-
-#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
-  template <typename OtherOperand>                                      \
-  class FormatForComparison<CharType*, OtherOperand> {                  \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(static_cast<const void*>(value)); \
-    }                                                                   \
-  }
-
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
-
-#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
-
-// If a C string is compared with an STL string object, we know it's meant
-// to point to a NUL-terminated string, and thus can print it as a string.
-
-#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
-  template <>                                                           \
-  class FormatForComparison<CharType*, OtherStringType> {               \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(value);                           \
-    }                                                                   \
-  }
-
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
-
-#if GTEST_HAS_GLOBAL_STRING
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
-#endif
-
-#if GTEST_HAS_GLOBAL_WSTRING
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
-#endif
-
-#if GTEST_HAS_STD_WSTRING
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
-#endif
-
-#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
-
-// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc)
-// operand to be used in a failure message.  The type (but not value)
-// of the other operand may affect the format.  This allows us to
-// print a char* as a raw pointer when it is compared against another
-// char* or void*, and print it as a C string when it is compared
-// against an std::string object, for example.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_* in a tight loop.
 template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
-    const T1& value, const T2& /* other_operand */) {
-  return FormatForComparison<T1, T2>::Format(value);
+AssertionResult CmpHelperEQFailure(const char* lhs_expression,
+                                   const char* rhs_expression, const T1& lhs,
+                                   const T2& rhs) {
+  return EqFailure(lhs_expression, rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
 }
 
+// This block of code defines operator==/!=
+// to block lexical scope lookup.
+// It prevents using invalid operator==/!= defined at namespace scope.
+struct faketype {};
+inline bool operator==(faketype, faketype) { return true; }
+inline bool operator!=(faketype, faketype) { return false; }
+
 // The helper function for {ASSERT|EXPECT}_EQ.
 template <typename T1, typename T2>
-AssertionResult CmpHelperEQ(const char* expected_expression,
-                            const char* actual_expression,
-                            const T1& expected,
-                            const T2& actual) {
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4389)  // Temporarily disables warning on
-                                // signed/unsigned mismatch.
-#endif
-
-  if (expected == actual) {
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+                            const char* rhs_expression, const T1& lhs,
+                            const T2& rhs) {
+  if (lhs == rhs) {
     return AssertionSuccess();
   }
 
-#ifdef _MSC_VER
-# pragma warning(pop)          // Restores the warning state.
-#endif
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   FormatForComparisonFailureMessage(expected, actual),
-                   FormatForComparisonFailureMessage(actual, expected),
-                   false);
+  return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
 }
 
-// With this overloaded version, we allow anonymous enums to be used
-// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
-// can be implicitly cast to BiggestInt.
-GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression,
-                                       const char* actual_expression,
-                                       BiggestInt expected,
-                                       BiggestInt actual);
-
-// The helper class for {ASSERT|EXPECT}_EQ.  The template argument
-// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
-// is a null pointer literal.  The following default implementation is
-// for lhs_is_null_literal being false.
-template <bool lhs_is_null_literal>
 class EqHelper {
  public:
   // This templatized version is for the general case.
-  template <typename T1, typename T2>
-  static AssertionResult Compare(const char* expected_expression,
-                                 const char* actual_expression,
-                                 const T1& expected,
-                                 const T2& actual) {
-    return CmpHelperEQ(expected_expression, actual_expression, expected,
-                       actual);
+  template <
+      typename T1, typename T2,
+      // Disable this overload for cases where one argument is a pointer
+      // and the other is the null pointer constant.
+      typename std::enable_if<!std::is_integral<T1>::value ||
+                              !std::is_pointer<T2>::value>::type* = nullptr>
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression, const T1& lhs,
+                                 const T2& rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
 
   // With this overloaded version, we allow anonymous enums to be used
@@ -18904,149 +1383,109 @@ class EqHelper {
   //
   // Even though its body looks the same as the above version, we
   // cannot merge the two, as it will make anonymous enums unhappy.
-  static AssertionResult Compare(const char* expected_expression,
-                                 const char* actual_expression,
-                                 BiggestInt expected,
-                                 BiggestInt actual) {
-    return CmpHelperEQ(expected_expression, actual_expression, expected,
-                       actual);
-  }
-};
-
-// This specialization is used when the first argument to ASSERT_EQ()
-// is a null pointer literal, like NULL, false, or 0.
-template <>
-class EqHelper<true> {
- public:
-  // We define two overloaded versions of Compare().  The first
-  // version will be picked when the second argument to ASSERT_EQ() is
-  // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
-  // EXPECT_EQ(false, a_bool).
-  template <typename T1, typename T2>
-  static AssertionResult Compare(
-      const char* expected_expression,
-      const char* actual_expression,
-      const T1& expected,
-      const T2& actual,
-      // The following line prevents this overload from being considered if T2
-      // is not a pointer type.  We need this because ASSERT_EQ(NULL, my_ptr)
-      // expands to Compare("", "", NULL, my_ptr), which requires a conversion
-      // to match the Secret* in the other overload, which would otherwise make
-      // this template match better.
-      typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
-    return CmpHelperEQ(expected_expression, actual_expression, expected,
-                       actual);
+  static AssertionResult Compare(const char* lhs_expression,
+                                 const char* rhs_expression, BiggestInt lhs,
+                                 BiggestInt rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
   }
 
-  // This version will be picked when the second argument to ASSERT_EQ() is a
-  // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
   template <typename T>
   static AssertionResult Compare(
-      const char* expected_expression,
-      const char* actual_expression,
-      // We used to have a second template parameter instead of Secret*.  That
-      // template parameter would deduce to 'long', making this a better match
-      // than the first overload even without the first overload's EnableIf.
-      // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
-      // non-pointer argument" (even a deduced integral argument), so the old
-      // implementation caused warnings in user code.
-      Secret* /* expected (NULL) */,
-      T* actual) {
-    // We already know that 'expected' is a null pointer.
-    return CmpHelperEQ(expected_expression, actual_expression,
-                       static_cast<T*>(NULL), actual);
+      const char* lhs_expression, const char* rhs_expression,
+      // Handle cases where '0' is used as a null pointer literal.
+      std::nullptr_t /* lhs */, T* rhs) {
+    // We already know that 'lhs' is a null pointer.
+    return CmpHelperEQ(lhs_expression, rhs_expression, static_cast<T*>(nullptr),
+                       rhs);
   }
 };
 
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_OP in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+                                   const T1& val1, const T2& val2,
+                                   const char* op) {
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") " << op << " (" << expr2
+         << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
+         << " vs " << FormatForComparisonFailureMessage(val2, val1);
+}
+
 // A macro for implementing the helper functions needed to implement
 // ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
 // of similar code.
 //
-// For each templatized helper function, we also define an overloaded
-// version for BiggestInt in order to reduce code bloat and allow
-// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
-// with gcc 4.
-//
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   const T1& val1, const T2& val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}\
-GTEST_API_ AssertionResult CmpHelper##op_name(\
-    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                \
+  template <typename T1, typename T2>                                      \
+  AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                     const T1& val1, const T2& val2) {     \
+    if (val1 op val2) {                                                    \
+      return AssertionSuccess();                                           \
+    } else {                                                               \
+      return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);            \
+    }                                                                      \
+  }
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
 // Implements the helper function for {ASSERT|EXPECT}_NE
-GTEST_IMPL_CMP_HELPER_(NE, !=);
+GTEST_IMPL_CMP_HELPER_(NE, !=)
 // Implements the helper function for {ASSERT|EXPECT}_LE
-GTEST_IMPL_CMP_HELPER_(LE, <=);
+GTEST_IMPL_CMP_HELPER_(LE, <=)
 // Implements the helper function for {ASSERT|EXPECT}_LT
-GTEST_IMPL_CMP_HELPER_(LT, <);
+GTEST_IMPL_CMP_HELPER_(LT, <)
 // Implements the helper function for {ASSERT|EXPECT}_GE
-GTEST_IMPL_CMP_HELPER_(GE, >=);
+GTEST_IMPL_CMP_HELPER_(GE, >=)
 // Implements the helper function for {ASSERT|EXPECT}_GT
-GTEST_IMPL_CMP_HELPER_(GT, >);
+GTEST_IMPL_CMP_HELPER_(GT, >)
 
 #undef GTEST_IMPL_CMP_HELPER_
 
 // The helper function for {ASSERT|EXPECT}_STREQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
-                                          const char* actual_expression,
-                                          const char* expected,
-                                          const char* actual);
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const char* s1, const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASEEQ.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
-                                              const char* actual_expression,
-                                              const char* expected,
-                                              const char* actual);
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
+                                              const char* s2_expression,
+                                              const char* s1, const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRNE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
                                           const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
+                                          const char* s1, const char* s2);
 
 // The helper function for {ASSERT|EXPECT}_STRCASENE.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
                                               const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
-
+                                              const char* s1, const char* s2);
 
 // Helper function for *_STREQ on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression,
-                                          const char* actual_expression,
-                                          const wchar_t* expected,
-                                          const wchar_t* actual);
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+                                          const char* s2_expression,
+                                          const wchar_t* s1, const wchar_t* s2);
 
 // Helper function for *_STRNE on wide strings.
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
                                           const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
+                                          const wchar_t* s1, const wchar_t* s2);
 
 }  // namespace internal
 
@@ -19058,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
 //
 // The {needle,haystack}_expr arguments are the stringified
 // expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const char* needle,
+                                       const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const wchar_t* needle,
+                                       const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const char* needle,
+                                          const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const wchar_t* needle,
+                                          const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const ::std::string& needle,
+                                       const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const ::std::string& needle,
+                                          const ::std::string& haystack);
 
 #if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+                                       const char* haystack_expr,
+                                       const ::std::wstring& needle,
+                                       const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+                                          const char* haystack_expr,
+                                          const ::std::wstring& needle,
+                                          const ::std::wstring& haystack);
 #endif  // GTEST_HAS_STD_WSTRING
 
 namespace internal {
@@ -19096,28 +1543,25 @@ namespace internal {
 //
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 template <typename RawType>
-AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression,
-                                         const char* actual_expression,
-                                         RawType expected,
-                                         RawType actual) {
-  const FloatingPoint<RawType> lhs(expected), rhs(actual);
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value, RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
 
   if (lhs.AlmostEquals(rhs)) {
     return AssertionSuccess();
   }
 
-  ::std::stringstream expected_ss;
-  expected_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-              << expected;
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
 
-  ::std::stringstream actual_ss;
-  actual_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-            << actual;
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
 
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   StringStreamToString(&expected_ss),
-                   StringStreamToString(&actual_ss),
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
                    false);
 }
 
@@ -19127,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression,
 GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
                                                 const char* expr2,
                                                 const char* abs_error_expr,
-                                                double val1,
-                                                double val2,
+                                                double val1, double val2,
                                                 double abs_error);
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -19136,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
 class GTEST_API_ AssertHelper {
  public:
   // Constructor.
-  AssertHelper(TestPartResult::Type type,
-               const char* file,
-               int line,
+  AssertHelper(TestPartResult::Type type, const char* file, int line,
                const char* message);
   ~AssertHelper();
 
@@ -19152,11 +1593,9 @@ class GTEST_API_ AssertHelper {
   // re-using stack space even for temporary variables, so every EXPECT_EQ
   // reserves stack space for another AssertHelper.
   struct AssertHelperData {
-    AssertHelperData(TestPartResult::Type t,
-                     const char* srcfile,
-                     int line_num,
+    AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
                      const char* msg)
-        : type(t), file(srcfile), line(line_num), message(msg) { }
+        : type(t), file(srcfile), line(line_num), message(msg) {}
 
     TestPartResult::Type const type;
     const char* const file;
@@ -19164,17 +1603,18 @@ class GTEST_API_ AssertHelper {
     std::string const message;
 
    private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+    AssertHelperData(const AssertHelperData&) = delete;
+    AssertHelperData& operator=(const AssertHelperData&) = delete;
   };
 
   AssertHelperData* const data_;
 
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+  AssertHelper(const AssertHelper&) = delete;
+  AssertHelper& operator=(const AssertHelper&) = delete;
 };
 
 }  // namespace internal
 
-#if GTEST_HAS_PARAM_TEST
 // The pure interface class that all value-parameterized tests inherit from.
 // A value-parameterized class must inherit from both ::testing::Test and
 // ::testing::WithParamInterface. In most cases that just means inheriting
@@ -19192,13 +1632,13 @@ class GTEST_API_ AssertHelper {
 //   FooTest() {
 //     // Can use GetParam() here.
 //   }
-//   virtual ~FooTest() {
+//   ~FooTest() override {
 //     // Can use GetParam() here.
 //   }
-//   virtual void SetUp() {
+//   void SetUp() override {
 //     // Can use GetParam() here.
 //   }
-//   virtual void TearDown {
+//   void TearDown override {
 //     // Can use GetParam() here.
 //   }
 // };
@@ -19207,7 +1647,7 @@ class GTEST_API_ AssertHelper {
 //   Foo foo;
 //   ASSERT_TRUE(foo.DoesBar(GetParam()));
 // }
-// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
 
 template <typename T>
 class WithParamInterface {
@@ -19216,12 +1656,9 @@ class WithParamInterface {
   virtual ~WithParamInterface() {}
 
   // The current parameter value. Is also available in the test fixture's
-  // constructor. This member function is non-static, even though it only
-  // references static data, to reduce the opportunity for incorrect uses
-  // like writing 'WithParamInterface<bool>::GetParam()' for a test that
-  // uses a fixture whose parameter type is int.
-  const ParamType& GetParam() const {
-    GTEST_CHECK_(parameter_ != NULL)
+  // constructor.
+  static const ParamType& GetParam() {
+    GTEST_CHECK_(parameter_ != nullptr)
         << "GetParam() can only be called inside a value-parameterized test "
         << "-- did you intend to write TEST_P instead of TEST_F?";
     return *parameter_;
@@ -19230,31 +1667,32 @@ class WithParamInterface {
  private:
   // Sets parameter value. The caller is responsible for making sure the value
   // remains alive and unchanged throughout the current test.
-  static void SetParam(const ParamType* parameter) {
-    parameter_ = parameter;
-  }
+  static void SetParam(const ParamType* parameter) { parameter_ = parameter; }
 
   // Static value used for accessing parameter during a test lifetime.
   static const ParamType* parameter_;
 
   // TestClass must be a subclass of WithParamInterface<T> and Test.
-  template <class TestClass> friend class internal::ParameterizedTestFactory;
+  template <class TestClass>
+  friend class internal::ParameterizedTestFactory;
 };
 
 template <typename T>
-const T* WithParamInterface<T>::parameter_ = NULL;
+const T* WithParamInterface<T>::parameter_ = nullptr;
 
 // Most value-parameterized classes can ignore the existence of
 // WithParamInterface, and can just inherit from ::testing::TestWithParam.
 
 template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
-
-#endif  // GTEST_HAS_PARAM_TEST
+class TestWithParam : public Test, public WithParamInterface<T> {};
 
 // Macros for indicating success/failure in test code.
 
+// Skips test in runtime.
+// Skipping test aborts current function.
+// Skipped tests are neither successful nor failed.
+#define GTEST_SKIP() GTEST_SKIP_("")
+
 // ADD_FAILURE unconditionally adds a failure to the current test.
 // SUCCEED generates a success - it doesn't automatically make the
 // current test successful, as a test is only successful when it has
@@ -19277,17 +1715,22 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 
 // Generates a nonfatal failure at the given source file location with
 // a generic message.
-#define ADD_FAILURE_AT(file, line) \
+#define ADD_FAILURE_AT(file, line)        \
   GTEST_MESSAGE_AT_(file, line, "Failed", \
                     ::testing::TestPartResult::kNonFatalFailure)
 
 // Generates a fatal failure with a generic message.
 #define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
 
+// Like GTEST_FAIL(), but at the given source file location.
+#define GTEST_FAIL_AT(file, line)         \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kFatalFailure)
+
 // Define this macro to 1 to omit the definition of FAIL(), which is a
 // generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
+#define FAIL() GTEST_FAIL()
 #endif
 
 // Generates a success with a generic message.
@@ -19296,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // Define this macro to 1 to omit the definition of SUCCEED(), which
 // is a generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
+#define SUCCEED() GTEST_SUCCEED()
 #endif
 
 // Macros for testing exceptions.
@@ -19324,388 +1767,45 @@ class TestWithParam : public Test, public WithParamInterface<T> {
 // Boolean assertions. Condition can be either a Boolean expression or an
 // AssertionResult. For more information on how to use AssertionResult with
 // these macros see comments on that class.
-#define EXPECT_TRUE(condition) \
+#define GTEST_EXPECT_TRUE(condition)                      \
   GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
                       GTEST_NONFATAL_FAILURE_)
-#define EXPECT_FALSE(condition) \
+#define GTEST_EXPECT_FALSE(condition)                        \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_NONFATAL_FAILURE_)
-#define ASSERT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
-                      GTEST_FATAL_FAILURE_)
-#define ASSERT_FALSE(condition) \
+#define GTEST_ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition)                        \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_FATAL_FAILURE_)
 
-// Includes the auto-generated header that implements a family of
-// generic predicate assertion macros.
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// Define these macros to 1 to omit the definition of the corresponding
+// EXPECT or ASSERT, which clashes with some users' own code.
 
-// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
-// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
-//
-// Implements a family of generic predicate assertion macros.
+#if !GTEST_DONT_DEFINE_EXPECT_TRUE
+#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition)
+#endif
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#if !GTEST_DONT_DEFINE_EXPECT_FALSE
+#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition)
+#endif
 
-// Makes sure this header is not included before gtest.h.
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-# error Do not include gtest_pred_impl.h directly.  Include gtest.h instead.
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+#if !GTEST_DONT_DEFINE_ASSERT_TRUE
+#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition)
+#endif
 
-// This header implements a family of generic predicate assertion
-// macros:
-//
-//   ASSERT_PRED_FORMAT1(pred_format, v1)
-//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
-//   ...
-//
-// where pred_format is a function or functor that takes n (in the
-// case of ASSERT_PRED_FORMATn) values and their source expression
-// text, and returns a testing::AssertionResult.  See the definition
-// of ASSERT_EQ in gtest.h for an example.
-//
-// If you don't care about formatting, you can use the more
-// restrictive version:
-//
-//   ASSERT_PRED1(pred, v1)
-//   ASSERT_PRED2(pred, v1, v2)
-//   ...
-//
-// where pred is an n-ary function or functor that returns bool,
-// and the values v1, v2, ..., must support the << operator for
-// streaming to std::ostream.
-//
-// We also define the EXPECT_* variations.
-//
-// For now we only support predicates whose arity is at most 5.
-// Please email googletestframework@googlegroups.com if you need
-// support for higher arities.
-
-// GTEST_ASSERT_ is the basic statement to which all of the assertions
-// in this file reduce.  Don't use this in your code.
-
-#define GTEST_ASSERT_(expression, on_failure) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (const ::testing::AssertionResult gtest_ar = (expression)) \
-    ; \
-  else \
-    on_failure(gtest_ar.failure_message())
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
-                                  const char* e1,
-                                  Pred pred,
-                                  const T1& v1) {
-  if (pred(v1)) return AssertionSuccess();
-
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1;
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, v1), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
-// this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
-                                             #v1, \
-                                             pred, \
-                                             v1), on_failure)
-
-// Unary predicate assertion macros.
-#define EXPECT_PRED_FORMAT1(pred_format, v1) \
-  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT1(pred_format, v1) \
-  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2) {
-  if (pred(v1, v2)) return AssertionSuccess();
-
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2;
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
-// this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             pred, \
-                                             v1, \
-                                             v2), on_failure)
-
-// Binary predicate assertion macros.
-#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
-  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED2(pred, v1, v2) \
-  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
-  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED2(pred, v1, v2) \
-  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3) {
-  if (pred(v1, v2, v3)) return AssertionSuccess();
-
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ", "
-                            << e3 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2
-                            << "\n" << e3 << " evaluates to " << v3;
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
-// this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3), on_failure)
-
-// Ternary predicate assertion macros.
-#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
-  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED3(pred, v1, v2, v3) \
-  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
-  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED3(pred, v1, v2, v3) \
-  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4) {
-  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
-
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ", "
-                            << e3 << ", "
-                            << e4 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2
-                            << "\n" << e3 << " evaluates to " << v3
-                            << "\n" << e4 << " evaluates to " << v4;
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
-// this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4), on_failure)
-
-// 4-ary predicate assertion macros.
-#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
-  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
-  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
-  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
-  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  const char* e5,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4,
-                                  const T5& v5) {
-  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
-
-  return AssertionFailure() << pred_text << "("
-                            << e1 << ", "
-                            << e2 << ", "
-                            << e3 << ", "
-                            << e4 << ", "
-                            << e5 << ") evaluates to false, where"
-                            << "\n" << e1 << " evaluates to " << v1
-                            << "\n" << e2 << " evaluates to " << v2
-                            << "\n" << e3 << " evaluates to " << v3
-                            << "\n" << e4 << " evaluates to " << v4
-                            << "\n" << e5 << " evaluates to " << v5;
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
-// this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             #v5, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4, \
-                                             v5), on_failure)
-
-// 5-ary predicate assertion macros.
-#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
-  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
-  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
-  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
-  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-
-
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#if !GTEST_DONT_DEFINE_ASSERT_FALSE
+#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition)
+#endif
 
 // Macros for testing equalities and inequalities.
 //
-//    * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual
-//    * {ASSERT|EXPECT}_NE(v1, v2):           Tests that v1 != v2
-//    * {ASSERT|EXPECT}_LT(v1, v2):           Tests that v1 < v2
-//    * {ASSERT|EXPECT}_LE(v1, v2):           Tests that v1 <= v2
-//    * {ASSERT|EXPECT}_GT(v1, v2):           Tests that v1 > v2
-//    * {ASSERT|EXPECT}_GE(v1, v2):           Tests that v1 >= v2
+//    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+//    * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+//    * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+//    * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+//    * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+//    * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
 //
 // When they are not, Google Test prints both the tested expressions and
 // their actual values.  The values must be compatible built-in types,
@@ -19727,8 +1827,8 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 //   are related, not how their content is related.  To compare two C
 //   strings by content, use {ASSERT|EXPECT}_STR*().
 //
-//   3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to
-//   {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you
+//   3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+//   {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
 //   what the actual value is when it fails, and similarly for the
 //   other comparisons.
 //
@@ -19739,17 +1839,15 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 //
 // Examples:
 //
-//   EXPECT_NE(5, Foo());
-//   EXPECT_EQ(NULL, a_pointer);
+//   EXPECT_NE(Foo(), 5);
+//   EXPECT_EQ(a_pointer, NULL);
 //   ASSERT_LT(i, array_size);
 //   ASSERT_GT(records.size(), 0) << "There is no record left.";
 
-#define EXPECT_EQ(expected, actual) \
-  EXPECT_PRED_FORMAT2(::testing::internal:: \
-                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
-                      expected, actual)
-#define EXPECT_NE(expected, actual) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual)
+#define EXPECT_EQ(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define EXPECT_NE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
 #define EXPECT_LE(val1, val2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
 #define EXPECT_LT(val1, val2) \
@@ -19759,10 +1857,8 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 #define EXPECT_GT(val1, val2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
 
-#define GTEST_ASSERT_EQ(expected, actual) \
-  ASSERT_PRED_FORMAT2(::testing::internal:: \
-                      EqHelper<GTEST_IS_NULL_LITERAL_(expected)>::Compare, \
-                      expected, actual)
+#define GTEST_ASSERT_EQ(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
 #define GTEST_ASSERT_NE(val1, val2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
 #define GTEST_ASSERT_LE(val1, val2) \
@@ -19778,27 +1874,27 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 // ASSERT_XY(), which clashes with some users' own code.
 
 #if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
 #endif
 
 #if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
 #endif
 
 // C-string Comparisons.  All tests treat NULL and any non-NULL string
@@ -19817,29 +1913,29 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 //
 // These macros evaluate their arguments exactly once.
 
-#define EXPECT_STREQ(expected, actual) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define EXPECT_STREQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
 #define EXPECT_STRNE(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
-#define EXPECT_STRCASEEQ(expected, actual) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
-#define EXPECT_STRCASENE(s1, s2)\
+#define EXPECT_STRCASEEQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2) \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
-#define ASSERT_STREQ(expected, actual) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual)
+#define ASSERT_STREQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
 #define ASSERT_STRNE(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
-#define ASSERT_STRCASEEQ(expected, actual) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual)
-#define ASSERT_STRCASENE(s1, s2)\
+#define ASSERT_STRCASEEQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2) \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
 
 // Macros for comparing floating-point numbers.
 //
-//    * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual):
+//    * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
 //         Tests that two float values are almost equal.
-//    * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual):
+//    * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
 //         Tests that two double values are almost equal.
 //    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
 //         Tests that v1 and v2 are within the given distance to each other.
@@ -19849,29 +1945,29 @@ AssertionResult AssertPred5Helper(const char* pred_text,
 // FloatingPoint template class in gtest-internal.h if you are
 // interested in the implementation details.
 
-#define EXPECT_FLOAT_EQ(expected, actual)\
+#define EXPECT_FLOAT_EQ(val1, val2)                                         \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
-                      expected, actual)
+                      val1, val2)
 
-#define EXPECT_DOUBLE_EQ(expected, actual)\
+#define EXPECT_DOUBLE_EQ(val1, val2)                                         \
   EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
-                      expected, actual)
+                      val1, val2)
 
-#define ASSERT_FLOAT_EQ(expected, actual)\
+#define ASSERT_FLOAT_EQ(val1, val2)                                         \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
-                      expected, actual)
+                      val1, val2)
 
-#define ASSERT_DOUBLE_EQ(expected, actual)\
+#define ASSERT_DOUBLE_EQ(val1, val2)                                         \
   ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
-                      expected, actual)
+                      val1, val2)
 
-#define EXPECT_NEAR(val1, val2, abs_error)\
-  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error)                                   \
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
 
-#define ASSERT_NEAR(val1, val2, abs_error)\
-  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error)                                   \
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
 
 // These predicate format functions work on floating-point values, and
 // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -19885,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
 GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
                                     double val1, double val2);
 
-
 #if GTEST_OS_WINDOWS
 
 // Macros that test for HRESULT failure and success, these are only useful
@@ -19897,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 // expected result and the actual result with both a human-readable
 // string representation of the error, if available, as well as the
 // hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
 
-# define EXPECT_HRESULT_FAILED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define EXPECT_HRESULT_FAILED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
-# define ASSERT_HRESULT_FAILED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define ASSERT_HRESULT_FAILED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
 
 #endif  // GTEST_OS_WINDOWS
 
@@ -19922,9 +2017,55 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 //   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
 //
 #define ASSERT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
 #define EXPECT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the given source file path and line number,
+// and the given message) to be included in every test failure message generated
+// by code in the scope of the lifetime of an instance of this class. The effect
+// is undone with the destruction of the instance.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+
+  // Template version. Uses Message() to convert the values into strings.
+  // Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char* file, int line, const T& message) {
+    PushTrace(file, line, (Message() << message).GetString());
+  }
+
+  // Optimize for some known types.
+  ScopedTrace(const char* file, int line, const char* message) {
+    PushTrace(file, line, message ? message : "(null)");
+  }
+
+  ScopedTrace(const char* file, int line, const std::string& message) {
+    PushTrace(file, line, message);
+  }
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char* file, int line, std::string message);
+
+  ScopedTrace(const ScopedTrace&) = delete;
+  ScopedTrace& operator=(const ScopedTrace&) = delete;
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor.  Therefore it doesn't
+                            // need to be used otherwise.
 
 // Causes a trace (including the source file path, the current line
 // number, and the given message) to be included in every test failure
@@ -19937,13 +2078,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 // of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
 // to appear in the same block - as long as they are on different
 // lines.
-#define SCOPED_TRACE(message) \
-  ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
-    __FILE__, __LINE__, ::testing::Message() << (message))
+//
+// Assuming that each thread maintains its own stack of traces.
+// Therefore, a SCOPED_TRACE() would (correctly) only affect the
+// assertions in its own thread.
+#define SCOPED_TRACE(message)                                         \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+      __FILE__, __LINE__, (message))
 
 // Compile-time assertion for type equality.
-// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
-// the same type.  The value it returns is not interesting.
+// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
+// are the same type.  The value it returns is not interesting.
 //
 // Instead of making StaticAssertTypeEq a class template, we make it a
 // function template that invokes a helper class template.  This
@@ -19972,21 +2117,21 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
 //
 // to cause a compiler error.
 template <typename T1, typename T2>
-bool StaticAssertTypeEq() {
-  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+constexpr bool StaticAssertTypeEq() noexcept {
+  static_assert(std::is_same<T1, T2>::value, "T1 and T2 are not the same type");
   return true;
 }
 
 // Defines a test.
 //
-// The first parameter is the name of the test case, and the second
-// parameter is the name of the test within the test case.
+// The first parameter is the name of the test suite, and the second
+// parameter is the name of the test within the test suite.
 //
-// The convention is to end the test case name with "Test".  For
-// example, a test case for the Foo class can be named FooTest.
+// The convention is to end the test suite name with "Test".  For
+// example, a test suite for the Foo class can be named FooTest.
 //
-// The user should put his test code between braces after using this
-// macro.  Example:
+// Test code should appear between braces after an invocation of
+// this macro.  Example:
 //
 //   TEST(FooTest, InitializesCorrectly) {
 //     Foo foo;
@@ -20002,28 +2147,28 @@ bool StaticAssertTypeEq() {
 // code.  GetTestTypeId() is guaranteed to always return the same
 // value, as it always calls GetTypeId<>() from the Google Test
 // framework.
-#define GTEST_TEST(test_case_name, test_name)\
-  GTEST_TEST_(test_case_name, test_name, \
-              ::testing::Test, ::testing::internal::GetTestTypeId())
+#define GTEST_TEST(test_suite_name, test_name)             \
+  GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \
+              ::testing::internal::GetTestTypeId())
 
 // Define this macro to 1 to omit the definition of TEST(), which
 // is a generic name and clashes with some other libraries.
 #if !GTEST_DONT_DEFINE_TEST
-# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name)
 #endif
 
 // Defines a test that uses a test fixture.
 //
 // The first parameter is the name of the test fixture class, which
-// also doubles as the test case name.  The second parameter is the
-// name of the test within the test case.
+// also doubles as the test suite name.  The second parameter is the
+// name of the test within the test suite.
 //
 // A test fixture class must be declared earlier.  The user should put
-// his test code between braces after using this macro.  Example:
+// the test code between braces after using this macro.  Example:
 //
 //   class FooTest : public testing::Test {
 //    protected:
-//     virtual void SetUp() { b_.AddElement(3); }
+//     void SetUp() override { b_.AddElement(3); }
 //
 //     Foo a_;
 //     Foo b_;
@@ -20034,13 +2179,104 @@ bool StaticAssertTypeEq() {
 //   }
 //
 //   TEST_F(FooTest, ReturnsElementCountCorrectly) {
-//     EXPECT_EQ(0, a_.size());
-//     EXPECT_EQ(1, b_.size());
+//     EXPECT_EQ(a_.size(), 0);
+//     EXPECT_EQ(b_.size(), 1);
 //   }
-
-#define TEST_F(test_fixture, test_name)\
+#define GTEST_TEST_F(test_fixture, test_name)        \
   GTEST_TEST_(test_fixture, test_name, test_fixture, \
               ::testing::internal::GetTypeId<test_fixture>())
+#if !GTEST_DONT_DEFINE_TEST_F
+#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name)
+#endif
+
+// Returns a path to temporary directory.
+// Tries to determine an appropriate directory for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Dynamically registers a test with the framework.
+//
+// This is an advanced API only to be used when the `TEST` macros are
+// insufficient. The macros should be preferred when possible, as they avoid
+// most of the complexity of calling this function.
+//
+// The `factory` argument is a factory callable (move-constructible) object or
+// function pointer that creates a new instance of the Test object. It
+// handles ownership to the caller. The signature of the callable is
+// `Fixture*()`, where `Fixture` is the test fixture class for the test. All
+// tests registered with the same `test_suite_name` must return the same
+// fixture type. This is checked at runtime.
+//
+// The framework will infer the fixture class from the factory and will call
+// the `SetUpTestSuite` and `TearDownTestSuite` for it.
+//
+// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is
+// undefined.
+//
+// Use case example:
+//
+// class MyFixture : public ::testing::Test {
+//  public:
+//   // All of these optional, just like in regular macro usage.
+//   static void SetUpTestSuite() { ... }
+//   static void TearDownTestSuite() { ... }
+//   void SetUp() override { ... }
+//   void TearDown() override { ... }
+// };
+//
+// class MyTest : public MyFixture {
+//  public:
+//   explicit MyTest(int data) : data_(data) {}
+//   void TestBody() override { ... }
+//
+//  private:
+//   int data_;
+// };
+//
+// void RegisterMyTests(const std::vector<int>& values) {
+//   for (int v : values) {
+//     ::testing::RegisterTest(
+//         "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr,
+//         std::to_string(v).c_str(),
+//         __FILE__, __LINE__,
+//         // Important to use the fixture type as the return type here.
+//         [=]() -> MyFixture* { return new MyTest(v); });
+//   }
+// }
+// ...
+// int main(int argc, char** argv) {
+//   ::testing::InitGoogleTest(&argc, argv);
+//   std::vector<int> values_to_test = LoadValuesFromConfig();
+//   RegisterMyTests(values_to_test);
+//   ...
+//   return RUN_ALL_TESTS();
+// }
+//
+template <int&... ExplicitParameterBarrier, typename Factory>
+TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
+                       const char* type_param, const char* value_param,
+                       const char* file, int line, Factory factory) {
+  using TestT = typename std::remove_pointer<decltype(factory())>::type;
+
+  class FactoryImpl : public internal::TestFactoryBase {
+   public:
+    explicit FactoryImpl(Factory f) : factory_(std::move(f)) {}
+    Test* CreateTest() override { return factory_(); }
+
+   private:
+    Factory factory_;
+  };
+
+  return internal::MakeAndRegisterTestInfo(
+      test_suite_name, test_name, type_param, value_param,
+      internal::CodeLocation(file, line), internal::GetTypeId<TestT>(),
+      internal::SuiteApiResolver<TestT>::GetSetUpCaseOrSuite(file, line),
+      internal::SuiteApiResolver<TestT>::GetTearDownCaseOrSuite(file, line),
+      new FactoryImpl{std::move(factory)});
+}
 
 }  // namespace testing
 
@@ -20054,8 +2290,8 @@ bool StaticAssertTypeEq() {
 // namespace and has an all-caps name.
 int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
 
-inline int RUN_ALL_TESTS() {
-  return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
new file mode 100644
index 0000000000..47a24aa687
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
@@ -0,0 +1,279 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Implements a family of generic predicate assertion macros.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+//   ASSERT_PRED_FORMAT1(pred_format, v1)
+//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+//   ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult.  See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+//   ASSERT_PRED1(pred, v1)
+//   ASSERT_PRED2(pred, v1, v2)
+//   ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ..., must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework@googlegroups.com if you need
+// support for higher arities.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce.  Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure)                   \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                 \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ;                                                           \
+  else                                                          \
+    on_failure(gtest_ar.failure_message())
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+                                  Pred pred, const T1& v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure) \
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+                                  const char* e2, Pred pred, const T1& v1,
+                                  const T2& v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+                on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
+  GTEST_ASSERT_(                                                            \
+      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+      on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, Pred pred, const T1& v1,
+                                  const T2& v2, const T3& v3, const T4& v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+                                             v1, v2, v3, v4),                 \
+                on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+                                  const char* e2, const char* e3,
+                                  const char* e4, const char* e5, Pred pred,
+                                  const T1& v1, const T2& v2, const T3& v3,
+                                  const T4& v4, const T5& v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ", " << e5 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n"
+         << e5 << " evaluates to " << ::testing::PrintToString(v5);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)                   \
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+                                             pred, v1, v2, v3, v4, v5),      \
+                on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
new file mode 100644
index 0000000000..1f37dc31c3
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
@@ -0,0 +1,60 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code.
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void PrivateMethod();
+//   FRIEND_TEST(MyClassTest, PrivateMethodWorks);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, PrivateMethodWorks) {
+//   // Can call MyClass::PrivateMethod() here.
+// }
+//
+// Note: The test class must be in the same namespace as the class being tested.
+// For example, putting MyClassTest in an anonymous namespace will not work.
+
+#define FRIEND_TEST(test_case_name, test_name) \
+  friend class test_case_name##_##test_name##_Test
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md
new file mode 100644
index 0000000000..cb49e2c754
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md
@@ -0,0 +1,44 @@
+# Customization Points
+
+The custom directory is an injection point for custom user configurations.
+
+## Header `gtest.h`
+
+### The following macros can be defined:
+
+*   `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of
+    `OsStackTraceGetterInterface`.
+*   `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See
+    `testing::TempDir` for semantics and signature.
+
+## Header `gtest-port.h`
+
+The following macros can be defined:
+
+### Logging:
+
+*   `GTEST_LOG_(severity)`
+*   `GTEST_CHECK_(condition)`
+*   Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too.
+
+### Threading:
+
+*   `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided.
+*   `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal`
+    are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)`
+    and `GTEST_DEFINE_STATIC_MUTEX_(mutex)`
+*   `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)`
+*   `GTEST_LOCK_EXCLUDED_(locks)`
+
+### Underlying library support features
+
+*   `GTEST_HAS_CXXABI_H_`
+
+### Exporting API symbols:
+
+*   `GTEST_API_` - Specifier for exported symbols.
+
+## Header `gtest-printers.h`
+
+*   See documentation at `gtest/gtest-printers.h` for details on how to define a
+    custom printer.
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
new file mode 100644
index 0000000000..9b7fb4261a
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
@@ -0,0 +1,68 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+
+// Use a stub Notification class.
+//
+// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and
+// std::condition_variable. The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+  Notification() = delete;
+  Notification(const Notification&) = delete;
+  Notification& operator=(const Notification&) = delete;
+  // void Notify();
+  void WaitForNotification() {}
+};
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
new file mode 100644
index 0000000000..b9495d8378
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
@@ -0,0 +1,42 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// This file provides an injection point for custom printers in a local
+// installation of gTest.
+// It will be included from gtest-printers.h and the overrides in this file
+// will be visible to everyone.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h
new file mode 100644
index 0000000000..afaaf17ba2
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h
@@ -0,0 +1,37 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations. See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
new file mode 100644
index 0000000000..45580ae805
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
@@ -0,0 +1,306 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests.  They are subject to change without notice.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+#include <stdio.h>
+
+#include <memory>
+
+#include "gtest/gtest-matchers.h"
+#include "gtest/internal/gtest-internal.h"
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+namespace testing {
+namespace internal {
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set.  If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char* statement, Matcher<const std::string&> matcher,
+                     const char* file, int line, DeathTest** test);
+  DeathTest();
+  virtual ~DeathTest() {}
+
+  // A helper class that aborts a death test when it's deleted.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest* test) : test_(test) {}
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
+   private:
+    DeathTest* const test_;
+    ReturnSentinel(const ReturnSentinel&) = delete;
+    ReturnSentinel& operator=(const ReturnSentinel&) = delete;
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered.  EXECUTE means that the death test logic should
+  // be executed immediately.  OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable outcome message regarding the outcome of
+  // the last death test.
+  static const char* LastMessage();
+
+  static void set_last_death_test_message(const std::string& message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  DeathTest(const DeathTest&) = delete;
+  DeathTest& operator=(const DeathTest&) = delete;
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// Factory interface for death tests.  May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() {}
+  virtual bool Create(const char* statement,
+                      Matcher<const std::string&> matcher, const char* file,
+                      int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  bool Create(const char* statement, Matcher<const std::string&> matcher,
+              const char* file, int line, DeathTest** test) override;
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads
+// and interpreted as a regex (rather than an Eq matcher) for legacy
+// compatibility.
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+    ::testing::internal::RE regex) {
+  return ContainsRegex(regex.pattern());
+}
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(const char* regex) {
+  return ContainsRegex(regex);
+}
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+    const ::std::string& regex) {
+  return ContainsRegex(regex);
+}
+
+// If a Matcher<const ::std::string&> is passed to EXPECT_DEATH (etc.), it's
+// used directly.
+inline Matcher<const ::std::string&> MakeDeathTestMatcher(
+    Matcher<const ::std::string&> matcher) {
+  return matcher;
+}
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test)           \
+  try {                                                                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);               \
+  } catch (const ::std::exception& gtest_exception) {                        \
+    fprintf(                                                                 \
+        stderr,                                                              \
+        "\n%s: Caught std::exception-derived exception escaping the "        \
+        "death test statement. Exception message: %s\n",                     \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what());                                             \
+    fflush(stderr);                                                          \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) {                                                            \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+#endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail)        \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    ::testing::internal::DeathTest* gtest_dt;                                  \
+    if (!::testing::internal::DeathTest::Create(                               \
+            #statement,                                                        \
+            ::testing::internal::MakeDeathTestMatcher(regex_or_matcher),       \
+            __FILE__, __LINE__, &gtest_dt)) {                                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                        \
+    }                                                                          \
+    if (gtest_dt != nullptr) {                                                 \
+      std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) {                                        \
+        case ::testing::internal::DeathTest::OVERSEE_TEST:                     \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) {                \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                  \
+          }                                                                    \
+          break;                                                               \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: {                   \
+          ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel(       \
+              gtest_dt);                                                       \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt);            \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
+          break;                                                               \
+        }                                                                      \
+      }                                                                        \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)                                \
+        : fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed and the macro
+// must accept a streamed message even though the message is never printed.
+// The regex object is not evaluated, but it is used to prevent "unused"
+// warnings and to avoid an expression that doesn't compile in debug mode.
+#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher)    \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                  \
+  if (::testing::internal::AlwaysTrue()) {                       \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);   \
+  } else if (!::testing::internal::AlwaysTrue()) {               \
+    ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \
+  } else                                                         \
+    ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+  InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index,
+                           int a_write_fd)
+      : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
+
+  ~InternalRunDeathTestFlag() {
+    if (write_fd_ >= 0) posix::Close(write_fd_);
+  }
+
+  const std::string& file() const { return file_; }
+  int line() const { return line_; }
+  int index() const { return index_; }
+  int write_fd() const { return write_fd_; }
+
+ private:
+  std::string file_;
+  int line_;
+  int index_;
+  int write_fd_;
+
+  InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete;
+  InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete;
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
new file mode 100644
index 0000000000..a2a60a962b
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
@@ -0,0 +1,210 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test.  They are subject to change without notice.
+//
+// This file is #included in gtest/internal/gtest-internal.h.
+// Do not include this header file separately!
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+#include "gtest/internal/gtest-string.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+namespace internal {
+
+// FilePath - a class for file and directory pathname manipulation which
+// handles platform-specific conventions (like the pathname separator).
+// Used for helper functions for naming files in a directory for xml output.
+// Except for Set methods, all methods are const or static, which provides an
+// "immutable value object" -- useful for peace of mind.
+// A FilePath with a value ending in a path separator ("like/this/") represents
+// a directory, otherwise it is assumed to represent a file. In either case,
+// it may or may not represent an actual file or directory in the file system.
+// Names are NOT checked for syntax correctness -- no checking for illegal
+// characters, malformed paths, etc.
+
+class GTEST_API_ FilePath {
+ public:
+  FilePath() : pathname_("") {}
+  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
+
+  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+    Normalize();
+  }
+
+  FilePath& operator=(const FilePath& rhs) {
+    Set(rhs);
+    return *this;
+  }
+
+  void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }
+
+  const std::string& string() const { return pathname_; }
+  const char* c_str() const { return pathname_.c_str(); }
+
+  // Returns the current working directory, or "" if unsuccessful.
+  static FilePath GetCurrentDir();
+
+  // Given directory = "dir", base_name = "test", number = 0,
+  // extension = "xml", returns "dir/test.xml". If number is greater
+  // than zero (e.g., 12), returns "dir/test_12.xml".
+  // On Windows platform, uses \ as the separator rather than /.
+  static FilePath MakeFileName(const FilePath& directory,
+                               const FilePath& base_name, int number,
+                               const char* extension);
+
+  // Given directory = "dir", relative_path = "test.xml",
+  // returns "dir/test.xml".
+  // On Windows, uses \ as the separator rather than /.
+  static FilePath ConcatPaths(const FilePath& directory,
+                              const FilePath& relative_path);
+
+  // Returns a pathname for a file that does not currently exist. The pathname
+  // will be directory/base_name.extension or
+  // directory/base_name_<number>.extension if directory/base_name.extension
+  // already exists. The number will be incremented until a pathname is found
+  // that does not already exist.
+  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+  // There could be a race condition if two or more processes are calling this
+  // function at the same time -- they could both pick the same filename.
+  static FilePath GenerateUniqueFileName(const FilePath& directory,
+                                         const FilePath& base_name,
+                                         const char* extension);
+
+  // Returns true if and only if the path is "".
+  bool IsEmpty() const { return pathname_.empty(); }
+
+  // If input name has a trailing separator character, removes it and returns
+  // the name, otherwise return the name string unmodified.
+  // On Windows platform, uses \ as the separator, other platforms use /.
+  FilePath RemoveTrailingPathSeparator() const;
+
+  // Returns a copy of the FilePath with the directory part removed.
+  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+  // FilePath("file"). If there is no directory part ("just_a_file"), it returns
+  // the FilePath unmodified. If there is no file part ("just_a_dir/") it
+  // returns an empty FilePath ("").
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveDirectoryName() const;
+
+  // RemoveFileName returns the directory path with the filename removed.
+  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+  // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+  // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+  // On Windows platform, '\' is the path separator, otherwise it is '/'.
+  FilePath RemoveFileName() const;
+
+  // Returns a copy of the FilePath with the case-insensitive extension removed.
+  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+  // FilePath("dir/file"). If a case-insensitive extension is not
+  // found, returns a copy of the original FilePath.
+  FilePath RemoveExtension(const char* extension) const;
+
+  // Creates directories so that path exists. Returns true if successful or if
+  // the directories already exist; returns false if unable to create
+  // directories for any reason. Will also return false if the FilePath does
+  // not represent a directory (that is, it doesn't end with a path separator).
+  bool CreateDirectoriesRecursively() const;
+
+  // Create the directory so that path exists. Returns true if successful or
+  // if the directory already exists; returns false if unable to create the
+  // directory for any reason, including if the parent directory does not
+  // exist. Not named "CreateDirectory" because that's a macro on Windows.
+  bool CreateFolder() const;
+
+  // Returns true if FilePath describes something in the file-system,
+  // either a file, directory, or whatever, and that something exists.
+  bool FileOrDirectoryExists() const;
+
+  // Returns true if pathname describes a directory in the file-system
+  // that exists.
+  bool DirectoryExists() const;
+
+  // Returns true if FilePath ends with a path separator, which indicates that
+  // it is intended to represent a directory. Returns false otherwise.
+  // This does NOT check that a directory (or file) actually exists.
+  bool IsDirectory() const;
+
+  // Returns true if pathname describes a root directory. (Windows has one
+  // root directory per disk drive.)
+  bool IsRootDirectory() const;
+
+  // Returns true if pathname describes an absolute path.
+  bool IsAbsolutePath() const;
+
+ private:
+  // Replaces multiple consecutive separators with a single separator.
+  // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+  // redundancies that might be in a pathname involving "." or "..".
+  //
+  // A pathname with multiple consecutive separators may occur either through
+  // user error or as a result of some scripts or APIs that generate a pathname
+  // with a trailing separator. On other platforms the same API or script
+  // may NOT generate a pathname with a trailing "/". Then elsewhere that
+  // pathname may have another "/" and pathname components added to it,
+  // without checking for the separator already being there.
+  // The script language and operating system may allow paths like "foo//bar"
+  // but some of the functions in FilePath will not handle that correctly. In
+  // particular, RemoveTrailingPathSeparator() only removes one separator, and
+  // it is called in CreateDirectoriesRecursively() assuming that it will change
+  // a pathname from directory syntax (trailing separator) to filename syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/' with
+  // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
+  // "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+  // the FilePath. On Windows, for example, both '/' and '\' are valid path
+  // separators. Returns NULL if no path separator was found.
+  const char* FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
new file mode 100644
index 0000000000..9b04e4c85f
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
@@ -0,0 +1,1570 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test.  They are subject to change without notice.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_LINUX
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+
+#include <cstdint>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-string.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__.  Writing
+//
+//   foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number.  For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
+
+// Stringifies its argument.
+// Work around a bug in visual studio which doesn't accept code like this:
+//
+//   #define GTEST_STRINGIFY_(name) #name
+//   #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ...
+//   MACRO(, x, y)
+//
+// Complaining about the argument to GTEST_STRINGIFY_ being empty.
+// This is allowed by the spec.
+#define GTEST_STRINGIFY_HELPER_(name, ...) #name
+#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
+
+namespace proto2 {
+class MessageLite;
+}
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;  // Result of an assertion.
+class Message;          // Represents a failure message.
+class Test;             // Represents a test.
+class TestInfo;         // Information about a test.
+class TestPartResult;   // Result of a test part.
+class UnitTest;         // A collection of test suites.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo;    // Information about a trace point.
+class TestInfoImpl;  // Opaque implementation of TestInfo
+class UnitTestImpl;  // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// An IgnoredValue object can be implicitly constructed from ANY value.
+class IgnoredValue {
+  struct Sink {};
+
+ public:
+  // This constructor template allows any value to be implicitly
+  // converted to IgnoredValue.  The object has no data member and
+  // doesn't try to remember anything about the argument.  We
+  // deliberately omit the 'explicit' keyword in order to allow the
+  // conversion to be implicit.
+  // Disable the conversion if T already has a magical conversion operator.
+  // Otherwise we get ambiguity.
+  template <typename T,
+            typename std::enable_if<!std::is_convertible<T, Sink>::value,
+                                    int>::type = 0>
+  IgnoredValue(const T& /* ignored */) {}  // NOLINT(runtime/explicit)
+};
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg,
+                                         const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4275 /* an exported class was derived from a class that was not exported */)
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled).  We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time.  Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<size_t>& left, const std::vector<size_t>& right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                                         const std::vector<std::string>& right,
+                                         size_t context = 2);
+
+}  // namespace edit_distance
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+                                     const char* actual_expression,
+                                     const std::string& expected_value,
+                                     const std::string& actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result, const char* expression_text,
+    const char* actual_predicate_value, const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc, it's very unlikely that
+// two floating-points will be equal exactly.  Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+//   The most-significant bit being the leftmost, an IEEE
+//   floating-point looks like
+//
+//     sign_bit exponent_bits fraction_bits
+//
+//   Here, sign_bit is a single bit that designates the sign of the
+//   number.
+//
+//   For float, there are 8 exponent bits and 23 fraction bits.
+//
+//   For double, there are 11 exponent bits and 52 fraction bits.
+//
+//   More details can be found at
+//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+  // Defines the unsigned integer type that has the same size as the
+  // floating point number.
+  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+  // Constants.
+
+  // # of bits in a number.
+  static const size_t kBitCount = 8 * sizeof(RawType);
+
+  // # of fraction bits in a number.
+  static const size_t kFractionBitCount =
+      std::numeric_limits<RawType>::digits - 1;
+
+  // # of exponent bits in a number.
+  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+  // The mask for the sign bit.
+  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+  // The mask for the fraction bits.
+  static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+                                       (kExponentBitCount + 1);
+
+  // The mask for the exponent bits.
+  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+  // How many ULP's (Units in the Last Place) we want to tolerate when
+  // comparing two numbers.  The larger the value, the more error we
+  // allow.  A 0 value means that two numbers must be exactly the same
+  // to be considered equal.
+  //
+  // The maximum error of a single floating-point operation is 0.5
+  // units in the last place.  On Intel CPU's, all floating-point
+  // calculations are done with 80-bit precision, while double has 64
+  // bits.  Therefore, 4 should be enough for ordinary use.
+  //
+  // See the following article for more details on ULP:
+  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+  static const uint32_t kMaxUlps = 4;
+
+  // Constructs a FloatingPoint from a raw floating-point number.
+  //
+  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+  // around may change its bits, although the new value is guaranteed
+  // to be also a NAN.  Therefore, don't expect this constructor to
+  // preserve the bits in x when x is a NAN.
+  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+  // Static methods
+
+  // Reinterprets a bit pattern as a floating-point number.
+  //
+  // This function is needed to test the AlmostEquals() method.
+  static RawType ReinterpretBits(const Bits bits) {
+    FloatingPoint fp(0);
+    fp.u_.bits_ = bits;
+    return fp.u_.value_;
+  }
+
+  // Returns the floating-point number that represent positive infinity.
+  static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
+
+  // Returns the maximum representable finite floating-point number.
+  static RawType Max();
+
+  // Non-static methods
+
+  // Returns the bits that represents this number.
+  const Bits& bits() const { return u_.bits_; }
+
+  // Returns the exponent bits of this number.
+  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+  // Returns the fraction bits of this number.
+  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+  // Returns the sign bit of this number.
+  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+  // Returns true if and only if this is NAN (not a number).
+  bool is_nan() const {
+    // It's a NAN if the exponent bits are all ones and the fraction
+    // bits are not entirely zeros.
+    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+  }
+
+  // Returns true if and only if this number is at most kMaxUlps ULP's away
+  // from rhs.  In particular, this function:
+  //
+  //   - returns false if either number is (or both are) NAN.
+  //   - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 DLP's apart.
+  bool AlmostEquals(const FloatingPoint& rhs) const {
+    // The IEEE standard says that any comparison operation involving
+    // a NAN must return false.
+    if (is_nan() || rhs.is_nan()) return false;
+
+    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+           kMaxUlps;
+  }
+
+ private:
+  // The data type used to store the actual floating-point number.
+  union FloatingPointUnion {
+    RawType value_;  // The raw floating-point number.
+    Bits bits_;      // The bits that represent the number.
+  };
+
+  // Converts an integer from the sign-and-magnitude representation to
+  // the biased representation.  More precisely, let N be 2 to the
+  // power of (kBitCount - 1), an integer x is represented by the
+  // unsigned number x + N.
+  //
+  // For instance,
+  //
+  //   -N + 1 (the most negative number representable using
+  //          sign-and-magnitude) is represented by 1;
+  //   0      is represented by N; and
+  //   N - 1  (the biggest number representable using
+  //          sign-and-magnitude) is represented by 2N - 1.
+  //
+  // Read http://en.wikipedia.org/wiki/Signed_number_representations
+  // for more details on signed number representations.
+  static Bits SignAndMagnitudeToBiased(const Bits& sam) {
+    if (kSignBitMask & sam) {
+      // sam represents a negative number.
+      return ~sam + 1;
+    } else {
+      // sam represents a positive number.
+      return kSignBitMask | sam;
+    }
+  }
+
+  // Given two numbers in the sign-and-magnitude representation,
+  // returns the distance between them as an unsigned number.
+  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1,
+                                                     const Bits& sam2) {
+    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+  }
+
+  FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() {
+  return FLT_MAX;
+}
+template <>
+inline double FloatingPoint<double>::Max() {
+  return DBL_MAX;
+}
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test suite, we need to assign
+// unique IDs to fixture classes and compare them.  The TypeId type is
+// used to hold such IDs.  The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type.  Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T.  Different values will be
+// returned for different types.  Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template.  Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test.  Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test* CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  TestFactoryBase(const TestFactoryBase&) = delete;
+  TestFactoryBase& operator=(const TestFactoryBase&) = delete;
+};
+
+// This class provides implementation of TeastFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  Test* CreateTest() override { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestSuite() and TearDownTestSuite() functions.
+using SetUpTestSuiteFunc = void (*)();
+using TearDownTestSuiteFunc = void (*)();
+
+struct CodeLocation {
+  CodeLocation(const std::string& a_file, int a_line)
+      : file(a_file), line(a_line) {}
+
+  std::string file;
+  int line;
+};
+
+//  Helper to identify which setup function for TestCase / TestSuite to call.
+//  Only one function is allowed, either TestCase or TestSute but not both.
+
+// Utility functions to help SuiteApiResolver
+using SetUpTearDownSuiteFuncType = void (*)();
+
+inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
+    SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) {
+  return a == def ? nullptr : a;
+}
+
+template <typename T>
+//  Note that SuiteApiResolver inherits from T because
+//  SetUpTestSuite()/TearDownTestSuite() could be protected. This way
+//  SuiteApiResolver can access them.
+struct SuiteApiResolver : T {
+  // testing::Test is only forward declared at this point. So we make it a
+  // dependent class for the compiler to be OK with it.
+  using Test =
+      typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
+
+  static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename,
+                                                        int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+    SetUpTearDownSuiteFuncType test_case_fp =
+        GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
+    SetUpTearDownSuiteFuncType test_suite_fp =
+        GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite);
+
+    GTEST_CHECK_(!test_case_fp || !test_suite_fp)
+        << "Test can not provide both SetUpTestSuite and SetUpTestCase, please "
+           "make sure there is only one present at "
+        << filename << ":" << line_num;
+
+    return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::SetUpTestSuite;
+#endif
+  }
+
+  static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename,
+                                                           int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+    SetUpTearDownSuiteFuncType test_case_fp =
+        GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
+    SetUpTearDownSuiteFuncType test_suite_fp =
+        GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite);
+
+    GTEST_CHECK_(!test_case_fp || !test_suite_fp)
+        << "Test can not provide both TearDownTestSuite and TearDownTestCase,"
+           " please make sure there is only one present at"
+        << filename << ":" << line_num;
+
+    return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::TearDownTestSuite;
+#endif
+  }
+};
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_suite_name:  name of the test suite
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a type-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+    const char* test_suite_name, const char* name, const char* type_param,
+    const char* value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// State of the definition of a type-parameterized test suite.
+class GTEST_API_ TypedTestSuitePState {
+ public:
+  TypedTestSuitePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and return true
+  // if the test suite hasn't been registered; otherwise aborts the
+  // program.
+  bool AddTestName(const char* file, int line, const char* case_name,
+                   const char* test_name) {
+    if (registered_) {
+      fprintf(stderr,
+              "%s Test %s must be defined before "
+              "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n",
+              FormatFileLocation(file, line).c_str(), test_name, case_name);
+      fflush(stderr);
+      posix::Abort();
+    }
+    registered_tests_.insert(
+        ::std::make_pair(test_name, CodeLocation(file, line)));
+    return true;
+  }
+
+  bool TestExists(const std::string& test_name) const {
+    return registered_tests_.count(test_name) > 0;
+  }
+
+  const CodeLocation& GetCodeLocation(const std::string& test_name) const {
+    RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
+    GTEST_CHECK_(it != registered_tests_.end());
+    return it->second;
+  }
+
+  // Verifies that registered_tests match the test names in
+  // defined_test_names_; returns registered_tests if successful, or
+  // aborts the program otherwise.
+  const char* VerifyRegisteredTestNames(const char* test_suite_name,
+                                        const char* file, int line,
+                                        const char* registered_tests);
+
+ private:
+  typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
+
+  bool registered_;
+  RegisteredTestsMap registered_tests_;
+};
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+using TypedTestCasePState = TypedTestSuitePState;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  if (comma == nullptr) {
+    return nullptr;
+  }
+  while (IsSpace(*(++comma))) {
+  }
+  return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+  const char* comma = strchr(str, ',');
+  return comma == nullptr ? str : std::string(str, comma);
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector<::std::string>* dest);
+
+// The default argument to the template below for the case when the user does
+// not provide a name generator.
+struct DefaultNameGenerator {
+  template <typename T>
+  static std::string GetName(int i) {
+    return StreamableToString(i);
+  }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+  typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(internal::None, std::vector<std::string>*, int) {}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
+  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+                                          i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+  std::vector<std::string> result;
+  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+  return result;
+}
+
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test.  The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter.  It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+  // 'index' is the index of the test in the type list 'Types'
+  // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite,
+  // Types).  Valid values for 'index' are [0, N - 1] where N is the
+  // length of Types.
+  static bool Register(const char* prefix, const CodeLocation& code_location,
+                       const char* case_name, const char* test_names, int index,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    typedef typename Types::Head Type;
+    typedef Fixture<Type> FixtureClass;
+    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+    // First, registers the first type-parameterized test in the type
+    // list.
+    MakeAndRegisterTestInfo(
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+         "/" + type_names[static_cast<size_t>(index)])
+            .c_str(),
+        StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
+        GetTypeName<Type>().c_str(),
+        nullptr,  // No value parameter.
+        code_location, GetTypeId<FixtureClass>(),
+        SuiteApiResolver<TestClass>::GetSetUpCaseOrSuite(
+            code_location.file.c_str(), code_location.line),
+        SuiteApiResolver<TestClass>::GetTearDownCaseOrSuite(
+            code_location.file.c_str(), code_location.line),
+        new TestFactoryImpl<TestClass>);
+
+    // Next, recurses (at compile time) with the tail of the type list.
+    return TypeParameterizedTest<Fixture, TestSel,
+                                 typename Types::Tail>::Register(prefix,
+                                                                 code_location,
+                                                                 case_name,
+                                                                 test_names,
+                                                                 index + 1,
+                                                                 type_names);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, internal::None> {
+ public:
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       int /*index*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
+    return true;
+  }
+};
+
+GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
+                                                   CodeLocation code_location);
+GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation(
+    const char* case_name);
+
+// TypeParameterizedTestSuite<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test.  The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestSuite {
+ public:
+  static bool Register(const char* prefix, CodeLocation code_location,
+                       const TypedTestSuitePState* state, const char* case_name,
+                       const char* test_names,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
+    RegisterTypeParameterizedTestSuiteInstantiation(case_name);
+    std::string test_name =
+        StripTrailingSpaces(GetPrefixUntilComma(test_names));
+    if (!state->TestExists(test_name)) {
+      fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
+              case_name, test_name.c_str(),
+              FormatFileLocation(code_location.file.c_str(), code_location.line)
+                  .c_str());
+      fflush(stderr);
+      posix::Abort();
+    }
+    const CodeLocation& test_location = state->GetCodeLocation(test_name);
+
+    typedef typename Tests::Head Head;
+
+    // First, register the first test in 'Test' for each type in 'Types'.
+    TypeParameterizedTest<Fixture, Head, Types>::Register(
+        prefix, test_location, case_name, test_names, 0, type_names);
+
+    // Next, recurses (at compile time) with the tail of the test list.
+    return TypeParameterizedTestSuite<Fixture, typename Tests::Tail,
+                                      Types>::Register(prefix, code_location,
+                                                       state, case_name,
+                                                       SkipComma(test_names),
+                                                       type_names);
+  }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
+ public:
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
+                       const TypedTestSuitePState* /*state*/,
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
+    return true;
+  }
+};
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test,
+                                                       int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+  ConstCharPtr(const char* str) : value(str) {}
+  operator bool() const { return true; }
+  const char* value;
+};
+
+// Helper for declaring std::string within 'if' statement
+// in pre C++17 build environment.
+struct TrueWithString {
+  TrueWithString() = default;
+  explicit TrueWithString(const char* str) : value(str) {}
+  explicit TrueWithString(const std::string& str) : value(str) {}
+  explicit operator bool() const { return true; }
+  std::string value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution.  Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+  static const uint32_t kMaxRange = 1u << 31;
+
+  explicit Random(uint32_t seed) : state_(seed) {}
+
+  void Reseed(uint32_t seed) { state_ = seed; }
+
+  // Generates a random number from [0, range).  Crashes if 'range' is
+  // 0 or greater than kMaxRange.
+  uint32_t Generate(uint32_t range);
+
+ private:
+  uint32_t state_;
+  Random(const Random&) = delete;
+  Random& operator=(const Random&) = delete;
+};
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+  typename std::remove_const<typename std::remove_reference<T>::type>::type
+
+// HasDebugStringAndShortDebugString<T>::value is a compile-time bool constant
+// that's true if and only if T has methods DebugString() and ShortDebugString()
+// that return std::string.
+template <typename T>
+class HasDebugStringAndShortDebugString {
+ private:
+  template <typename C>
+  static auto CheckDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().DebugString())>::type;
+  template <typename>
+  static std::false_type CheckDebugString(...);
+
+  template <typename C>
+  static auto CheckShortDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().ShortDebugString())>::type;
+  template <typename>
+  static std::false_type CheckShortDebugString(...);
+
+  using HasDebugStringType = decltype(CheckDebugString<T>(nullptr));
+  using HasShortDebugStringType = decltype(CheckShortDebugString<T>(nullptr));
+
+ public:
+  static constexpr bool value =
+      HasDebugStringType::value && HasShortDebugStringType::value;
+};
+
+template <typename T>
+constexpr bool HasDebugStringAndShortDebugString<T>::value;
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them).  It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0.  If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked.  Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11 that we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator').  If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
+  return 0;
+}
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) {
+  return '\0';
+}
+
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+  template <typename U>
+  static char test(typename U::hasher*, typename U::reverse_iterator*);
+  template <typename U>
+  static int test(typename U::hasher*, ...);
+  template <typename U>
+  static char test(...);
+
+ public:
+  static const bool value = sizeof(test<T>(nullptr, nullptr)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template <typename C,
+          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer)>
+struct IsRecursiveContainerImpl;
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, false> : public std::false_type {};
+
+// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
+// obey the same inconsistencies as the IsContainerTest, namely check if
+// something is a container is relying on only const_iterator in C++11 and
+// is relying on both const_iterator and iterator otherwise
+template <typename C>
+struct IsRecursiveContainerImpl<C, true> {
+  using value_type = decltype(*std::declval<typename C::const_iterator>());
+  using type =
+      std::is_same<typename std::remove_const<
+                       typename std::remove_reference<value_type>::type>::type,
+                   C>;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
+
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0.  When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) {
+  return lhs == rhs;
+}
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
+  return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+  for (size_t i = 0; i != size; i++) {
+    if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
+  }
+  return true;
+}
+
+// Finds the first element in the iterator range [begin, end) that
+// equals elem.  Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+  for (Iter it = begin; it != end; ++it) {
+    if (internal::ArrayEq(*it, elem)) return it;
+  }
+  return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0.  When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) {
+  *to = from;
+}
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
+  internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat.  If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+  for (size_t i = 0; i != size; i++) {
+    internal::CopyArray(from[i], to + i);
+  }
+}
+
+// The relation between an NativeArray object (see below) and the
+// native array it represents.
+// We use 2 different structs to allow non-copyable types to be used, as long
+// as RelationToSourceReference() is passed.
+struct RelationToSourceReference {};
+struct RelationToSourceCopy {};
+
+// Adapts a native array to a read-only STL-style container.  Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers.  New members
+// should be added as needed.  To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier).  It's the client's responsibility to satisfy
+// this requirement.  Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+  // STL-style container typedefs.
+  typedef Element value_type;
+  typedef Element* iterator;
+  typedef const Element* const_iterator;
+
+  // Constructs from a native array. References the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+    InitRef(array, count);
+  }
+
+  // Constructs from a native array. Copies the source.
+  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+    InitCopy(array, count);
+  }
+
+  // Copy constructor.
+  NativeArray(const NativeArray& rhs) {
+    (this->*rhs.clone_)(rhs.array_, rhs.size_);
+  }
+
+  ~NativeArray() {
+    if (clone_ != &NativeArray::InitRef) delete[] array_;
+  }
+
+  // STL-style container methods.
+  size_t size() const { return size_; }
+  const_iterator begin() const { return array_; }
+  const_iterator end() const { return array_ + size_; }
+  bool operator==(const NativeArray& rhs) const {
+    return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
+  }
+
+ private:
+  static_assert(!std::is_const<Element>::value, "Type must not be const");
+  static_assert(!std::is_reference<Element>::value,
+                "Type must not be a reference");
+
+  // Initializes this object with a copy of the input.
+  void InitCopy(const Element* array, size_t a_size) {
+    Element* const copy = new Element[a_size];
+    CopyArray(array, a_size, copy);
+    array_ = copy;
+    size_ = a_size;
+    clone_ = &NativeArray::InitCopy;
+  }
+
+  // Initializes this object with a reference of the input.
+  void InitRef(const Element* array, size_t a_size) {
+    array_ = array;
+    size_ = a_size;
+    clone_ = &NativeArray::InitRef;
+  }
+
+  const Element* array_;
+  size_t size_;
+  void (NativeArray::*clone_)(const Element*, size_t);
+};
+
+// Backport of std::index_sequence.
+template <size_t... Is>
+struct IndexSequence {
+  using type = IndexSequence;
+};
+
+// Double the IndexSequence, and one if plus_one is true.
+template <bool plus_one, typename T, size_t sizeofT>
+struct DoubleSequence;
+template <size_t... I, size_t sizeofT>
+struct DoubleSequence<true, IndexSequence<I...>, sizeofT> {
+  using type = IndexSequence<I..., (sizeofT + I)..., 2 * sizeofT>;
+};
+template <size_t... I, size_t sizeofT>
+struct DoubleSequence<false, IndexSequence<I...>, sizeofT> {
+  using type = IndexSequence<I..., (sizeofT + I)...>;
+};
+
+// Backport of std::make_index_sequence.
+// It uses O(ln(N)) instantiation depth.
+template <size_t N>
+struct MakeIndexSequenceImpl
+    : DoubleSequence<N % 2 == 1, typename MakeIndexSequenceImpl<N / 2>::type,
+                     N / 2>::type {};
+
+template <>
+struct MakeIndexSequenceImpl<0> : IndexSequence<> {};
+
+template <size_t N>
+using MakeIndexSequence = typename MakeIndexSequenceImpl<N>::type;
+
+template <typename... T>
+using IndexSequenceFor = typename MakeIndexSequence<sizeof...(T)>::type;
+
+template <size_t>
+struct Ignore {
+  Ignore(...);  // NOLINT
+};
+
+template <typename>
+struct ElemFromListImpl;
+template <size_t... I>
+struct ElemFromListImpl<IndexSequence<I...>> {
+  // We make Ignore a template to solve a problem with MSVC.
+  // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but
+  // MSVC doesn't understand how to deal with that pack expansion.
+  // Use `0 * I` to have a single instantiation of Ignore.
+  template <typename R>
+  static R Apply(Ignore<0 * I>..., R (*)(), ...);
+};
+
+template <size_t N, typename... T>
+struct ElemFromList {
+  using type =
+      decltype(ElemFromListImpl<typename MakeIndexSequence<N>::type>::Apply(
+          static_cast<T (*)()>(nullptr)...));
+};
+
+struct FlatTupleConstructTag {};
+
+template <typename... T>
+class FlatTuple;
+
+template <typename Derived, size_t I>
+struct FlatTupleElemBase;
+
+template <typename... T, size_t I>
+struct FlatTupleElemBase<FlatTuple<T...>, I> {
+  using value_type = typename ElemFromList<I, T...>::type;
+  FlatTupleElemBase() = default;
+  template <typename Arg>
+  explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t)
+      : value(std::forward<Arg>(t)) {}
+  value_type value;
+};
+
+template <typename Derived, typename Idx>
+struct FlatTupleBase;
+
+template <size_t... Idx, typename... T>
+struct FlatTupleBase<FlatTuple<T...>, IndexSequence<Idx...>>
+    : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
+  using Indices = IndexSequence<Idx...>;
+  FlatTupleBase() = default;
+  template <typename... Args>
+  explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args)
+      : FlatTupleElemBase<FlatTuple<T...>, Idx>(FlatTupleConstructTag{},
+                                                std::forward<Args>(args))... {}
+
+  template <size_t I>
+  const typename ElemFromList<I, T...>::type& Get() const {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <size_t I>
+  typename ElemFromList<I, T...>::type& Get() {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <typename F>
+  auto Apply(F&& f) -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
+
+  template <typename F>
+  auto Apply(F&& f) const -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
+};
+
+// Analog to std::tuple but with different tradeoffs.
+// This class minimizes the template instantiation depth, thus allowing more
+// elements than std::tuple would. std::tuple has been seen to require an
+// instantiation depth of more than 10x the number of elements in some
+// implementations.
+// FlatTuple and ElemFromList are not recursive and have a fixed depth
+// regardless of T...
+// MakeIndexSequence, on the other hand, it is recursive but with an
+// instantiation depth of O(ln(N)).
+template <typename... T>
+class FlatTuple
+    : private FlatTupleBase<FlatTuple<T...>,
+                            typename MakeIndexSequence<sizeof...(T)>::type> {
+  using Indices = typename FlatTupleBase<
+      FlatTuple<T...>, typename MakeIndexSequence<sizeof...(T)>::type>::Indices;
+
+ public:
+  FlatTuple() = default;
+  template <typename... Args>
+  explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args)
+      : FlatTuple::FlatTupleBase(tag, std::forward<Args>(args)...) {}
+
+  using FlatTuple::FlatTupleBase::Apply;
+  using FlatTuple::FlatTupleBase::Get;
+};
+
+// Utility functions to be called with static_assert to induce deprecation
+// warnings.
+GTEST_INTERNAL_DEPRECATED(
+    "INSTANTIATE_TEST_CASE_P is deprecated, please use "
+    "INSTANTIATE_TEST_SUITE_P")
+constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "TYPED_TEST_CASE_P is deprecated, please use "
+    "TYPED_TEST_SUITE_P")
+constexpr bool TypedTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "TYPED_TEST_CASE is deprecated, please use "
+    "TYPED_TEST_SUITE")
+constexpr bool TypedTestCaseIsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "REGISTER_TYPED_TEST_CASE_P is deprecated, please use "
+    "REGISTER_TYPED_TEST_SUITE_P")
+constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; }
+
+GTEST_INTERNAL_DEPRECATED(
+    "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use "
+    "INSTANTIATE_TYPED_TEST_SUITE_P")
+constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
+
+}  // namespace internal
+}  // namespace testing
+
+namespace std {
+// Some standard library implementations use `struct tuple_size` and some use
+// `class tuple_size`. Clang warns about the mismatch.
+// https://reviews.llvm.org/D55466
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template <typename... Ts>
+struct tuple_size<testing::internal::FlatTuple<Ts...>>
+    : std::integral_constant<size_t, sizeof...(Ts)> {};
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}  // namespace std
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type)             \
+  ::testing::internal::AssertHelper(result_type, file, line, message) = \
+      ::testing::Message()
+
+#define GTEST_MESSAGE_(message, result_type) \
+  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+#define GTEST_FATAL_FAILURE_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+#define GTEST_SKIP_(message) \
+  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip)
+
+// Suppress MSVC warning 4072 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+// NOTE: The "else" is important to keep this expansion to prevent a top-level
+// "else" from attaching to our "if".
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+  if (::testing::internal::AlwaysTrue()) {                        \
+    statement;                                                    \
+  } else                     /* NOLINT */                         \
+    static_assert(true, "")  // User must have a semicolon after expansion.
+
+#if GTEST_HAS_EXCEPTIONS
+
+namespace testing {
+namespace internal {
+
+class NeverThrown {
+ public:
+  const char* what() const noexcept {
+    return "this exception should never be thrown";
+  }
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#if GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e))
+
+#else  // GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) \
+  std::string { "an std::exception-derived error" }
+
+#endif  // GTEST_HAS_RTTI
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)   \
+  catch (typename std::conditional<                                            \
+         std::is_same<typename std::remove_cv<typename std::remove_reference<  \
+                          expected_exception>::type>::type,                    \
+                      std::exception>::value,                                  \
+         const ::testing::internal::NeverThrown&, const std::exception&>::type \
+             e) {                                                              \
+    gtest_msg.value = "Expected: " #statement                                  \
+                      " throws an exception of type " #expected_exception      \
+                      ".\n  Actual: it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                               \
+    gtest_msg.value += " with description \"";                                 \
+    gtest_msg.value += e.what();                                               \
+    gtest_msg.value += "\".";                                                  \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);                \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail)              \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                             \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                    \
+    bool gtest_caught_expected = false;                                     \
+    try {                                                                   \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);            \
+    } catch (expected_exception const&) {                                   \
+      gtest_caught_expected = true;                                         \
+    }                                                                       \
+    GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)    \
+    catch (...) {                                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws a different type.";         \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+    if (!gtest_caught_expected) {                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws nothing.";                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+  } else /*NOLINT*/                                                         \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__)                   \
+        : fail(gtest_msg.value.c_str())
+
+#if GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                \
+  catch (std::exception const& e) {                               \
+    gtest_msg.value = "it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                  \
+    gtest_msg.value += " with description \"";                    \
+    gtest_msg.value += e.what();                                  \
+    gtest_msg.value += "\".";                                     \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_(statement, fail)                            \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                          \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                 \
+    try {                                                                \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);         \
+    }                                                                    \
+    GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                           \
+    catch (...) {                                                        \
+      gtest_msg.value = "it throws.";                                    \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__);      \
+    }                                                                    \
+  } else                                                                 \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__)              \
+        : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+                "  Actual: " +                                           \
+                gtest_msg.value)                                         \
+                   .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail)                       \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                      \
+  if (::testing::internal::AlwaysTrue()) {                           \
+    bool gtest_caught_any = false;                                   \
+    try {                                                            \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);     \
+    } catch (...) {                                                  \
+      gtest_caught_any = true;                                       \
+    }                                                                \
+    if (!gtest_caught_any) {                                         \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+    }                                                                \
+  } else                                                             \
+    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__)         \
+        : fail("Expected: " #statement                               \
+               " throws an exception.\n"                             \
+               "  Actual: it doesn't.")
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                       \
+  if (const ::testing::AssertionResult gtest_ar_ =                    \
+          ::testing::AssertionResult(expression))                     \
+    ;                                                                 \
+  else                                                                \
+    fail(::testing::internal::GetBoolAssertionFailureMessage(         \
+             gtest_ar_, text, #actual, #expected)                     \
+             .c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail)                          \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);                 \
+    if (gtest_fatal_failure_checker.has_new_fatal_failure()) {                 \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__);            \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__)                    \
+        : fail("Expected: " #statement                                         \
+               " doesn't generate new fatal "                                  \
+               "failures in the current thread.\n"                             \
+               "  Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+  test_suite_name##_##test_name##_Test
+
+// Helper macro for defining tests.
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id)       \
+  static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1,                 \
+                "test_suite_name must not be empty");                          \
+  static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1,                       \
+                "test_name must not be empty");                                \
+  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                     \
+      : public parent_class {                                                  \
+   public:                                                                     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default;            \
+    ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default;  \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                         \
+    (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete;     \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=(            \
+        const GTEST_TEST_CLASS_NAME_(test_suite_name,                          \
+                                     test_name) &) = delete; /* NOLINT */      \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                         \
+    (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=(            \
+        GTEST_TEST_CLASS_NAME_(test_suite_name,                                \
+                               test_name) &&) noexcept = delete; /* NOLINT */  \
+                                                                               \
+   private:                                                                    \
+    void TestBody() override;                                                  \
+    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;      \
+  };                                                                           \
+                                                                               \
+  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name,           \
+                                                    test_name)::test_info_ =   \
+      ::testing::internal::MakeAndRegisterTestInfo(                            \
+          #test_suite_name, #test_name, nullptr, nullptr,                      \
+          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id),  \
+          ::testing::internal::SuiteApiResolver<                               \
+              parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__),          \
+          ::testing::internal::SuiteApiResolver<                               \
+              parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__),       \
+          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(     \
+              test_suite_name, test_name)>);                                   \
+  void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
new file mode 100644
index 0000000000..e7af2f904a
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
@@ -0,0 +1,956 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Type and function utilities for implementing parameterized tests.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <ctype.h>
+
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/gtest-test-part.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+// Input to a parameterized test name generator, describing a test parameter.
+// Consists of the parameter value and the integer parameter index.
+template <class ParamType>
+struct TestParamInfo {
+  TestParamInfo(const ParamType& a_param, size_t an_index)
+      : param(a_param), index(an_index) {}
+  ParamType param;
+  size_t index;
+};
+
+// A builtin parameterized test name generator which returns the result of
+// testing::PrintToString.
+struct PrintToStringParamName {
+  template <class ParamType>
+  std::string operator()(const TestParamInfo<ParamType>& info) const {
+    return PrintToString(info.param);
+  }
+};
+
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// Utility Functions
+
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test suite. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
+                                           CodeLocation code_location);
+
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+  virtual ~ParamIteratorInterface() {}
+  // A pointer to the base generator instance.
+  // Used only for the purposes of iterator comparison
+  // to make sure that two iterators belong to the same generator.
+  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+  // Advances iterator to point to the next element
+  // provided by the generator. The caller is responsible
+  // for not calling Advance() on an iterator equal to
+  // BaseGenerator()->End().
+  virtual void Advance() = 0;
+  // Clones the iterator object. Used for implementing copy semantics
+  // of ParamIterator<T>.
+  virtual ParamIteratorInterface* Clone() const = 0;
+  // Dereferences the current iterator and provides (read-only) access
+  // to the pointed value. It is the caller's responsibility not to call
+  // Current() on an iterator equal to BaseGenerator()->End().
+  // Used for implementing ParamGenerator<T>::operator*().
+  virtual const T* Current() const = 0;
+  // Determines whether the given iterator and other point to the same
+  // element in the sequence generated by the generator.
+  // Used for implementing ParamGenerator<T>::operator==().
+  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+  typedef T value_type;
+  typedef const T& reference;
+  typedef ptrdiff_t difference_type;
+
+  // ParamIterator assumes ownership of the impl_ pointer.
+  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+  ParamIterator& operator=(const ParamIterator& other) {
+    if (this != &other) impl_.reset(other.impl_->Clone());
+    return *this;
+  }
+
+  const T& operator*() const { return *impl_->Current(); }
+  const T* operator->() const { return impl_->Current(); }
+  // Prefix version of operator++.
+  ParamIterator& operator++() {
+    impl_->Advance();
+    return *this;
+  }
+  // Postfix version of operator++.
+  ParamIterator operator++(int /*unused*/) {
+    ParamIteratorInterface<T>* clone = impl_->Clone();
+    impl_->Advance();
+    return ParamIterator(clone);
+  }
+  bool operator==(const ParamIterator& other) const {
+    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+  }
+  bool operator!=(const ParamIterator& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  friend class ParamGenerator<T>;
+  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+  std::unique_ptr<ParamIteratorInterface<T>> impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+  typedef T ParamType;
+
+  virtual ~ParamGeneratorInterface() {}
+
+  // Generator interface definition
+  virtual ParamIteratorInterface<T>* Begin() const = 0;
+  virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template <typename T>
+class ParamGenerator {
+ public:
+  typedef ParamIterator<T> iterator;
+
+  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+  ParamGenerator& operator=(const ParamGenerator& other) {
+    impl_ = other.impl_;
+    return *this;
+  }
+
+  iterator begin() const { return iterator(impl_->Begin()); }
+  iterator end() const { return iterator(impl_->End()); }
+
+ private:
+  std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
+};
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  RangeGenerator(T begin, T end, IncrementT step)
+      : begin_(begin),
+        end_(end),
+        step_(step),
+        end_index_(CalculateEndIndex(begin, end, step)) {}
+  ~RangeGenerator() override {}
+
+  ParamIteratorInterface<T>* Begin() const override {
+    return new Iterator(this, begin_, 0, step_);
+  }
+  ParamIteratorInterface<T>* End() const override {
+    return new Iterator(this, end_, end_index_, step_);
+  }
+
+ private:
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+             IncrementT step)
+        : base_(base), value_(value), index_(index), step_(step) {}
+    ~Iterator() override {}
+
+    const ParamGeneratorInterface<T>* BaseGenerator() const override {
+      return base_;
+    }
+    void Advance() override {
+      value_ = static_cast<T>(value_ + step_);
+      index_++;
+    }
+    ParamIteratorInterface<T>* Clone() const override {
+      return new Iterator(*this);
+    }
+    const T* Current() const override { return &value_; }
+    bool Equals(const ParamIteratorInterface<T>& other) const override {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const int other_index =
+          CheckedDowncastToActualType<const Iterator>(&other)->index_;
+      return index_ == other_index;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          value_(other.value_),
+          index_(other.index_),
+          step_(other.step_) {}
+
+    // No implementation - assignment is unsupported.
+    void operator=(const Iterator& other);
+
+    const ParamGeneratorInterface<T>* const base_;
+    T value_;
+    int index_;
+    const IncrementT step_;
+  };  // class RangeGenerator::Iterator
+
+  static int CalculateEndIndex(const T& begin, const T& end,
+                               const IncrementT& step) {
+    int end_index = 0;
+    for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
+    return end_index;
+  }
+
+  // No implementation - assignment is unsupported.
+  void operator=(const RangeGenerator& other);
+
+  const T begin_;
+  const T end_;
+  const IncrementT step_;
+  // The index for the end() iterator. All the elements in the generated
+  // sequence are indexed (0-based) to aid iterator comparison.
+  const int end_index_;
+};  // class RangeGenerator
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  ~ValuesInIteratorRangeGenerator() override {}
+
+  ParamIteratorInterface<T>* Begin() const override {
+    return new Iterator(this, container_.begin());
+  }
+  ParamIteratorInterface<T>* End() const override {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T>* base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    ~Iterator() override {}
+
+    const ParamGeneratorInterface<T>* BaseGenerator() const override {
+      return base_;
+    }
+    void Advance() override {
+      ++iterator_;
+      value_.reset();
+    }
+    ParamIteratorInterface<T>* Clone() const override {
+      return new Iterator(*this);
+    }
+    // We need to use cached value referenced by iterator_ because *iterator_
+    // can return a temporary object (and of type other then T), so just
+    // having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    const T* Current() const override {
+      if (value_.get() == nullptr) value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    bool Equals(const ParamIteratorInterface<T>& other) const override {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+             CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator& other)
+        // The explicit constructor call suppresses a false warning
+        // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(),
+          base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T>* const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of std::unique_ptr helps manage cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable std::unique_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator& other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Default parameterized test name generator, returns a string containing the
+// integer test parameter index.
+template <class ParamType>
+std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
+  Message name_stream;
+  name_stream << info.index;
+  return name_stream.GetString();
+}
+
+template <typename T = int>
+void TestNotEmpty() {
+  static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
+}
+template <typename T = int>
+void TestNotEmpty(const T&) {}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter)
+      : parameter_(parameter) {}
+  Test* CreateTest() override {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  ParameterizedTestFactory(const ParameterizedTestFactory&) = delete;
+  ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of test factory pointer, same factory object cannot be passed
+// into that method twice. But ParameterizedTestSuiteInfo is going to call
+// it for each Test/Parameter value combination. Thus it needs meta factory
+// creator class.
+template <class TestSuite>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestSuite::ParamType> {
+ public:
+  using ParamType = typename TestSuite::ParamType;
+
+  TestMetaFactory() {}
+
+  TestFactoryBase* CreateTestFactory(ParamType parameter) override {
+    return new ParameterizedTestFactory<TestSuite>(parameter);
+  }
+
+ private:
+  TestMetaFactory(const TestMetaFactory&) = delete;
+  TestMetaFactory& operator=(const TestMetaFactory&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfoBase is a generic interface
+// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations
+// and uses that information to register all resulting test instances
+// in RegisterTests method. The ParameterizeTestSuiteRegistry class holds
+// a collection of pointers to the ParameterizedTestSuiteInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestSuiteInfoBase {
+ public:
+  virtual ~ParameterizedTestSuiteInfoBase() {}
+
+  // Base part of test suite name for display purposes.
+  virtual const std::string& GetTestSuiteName() const = 0;
+  // Test suite id to verify identity.
+  virtual TypeId GetTestSuiteTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test suite right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestSuiteInfoBase() {}
+
+ private:
+  ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) =
+      delete;
+  ParameterizedTestSuiteInfoBase& operator=(
+      const ParameterizedTestSuiteInfoBase&) = delete;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Report a the name of a test_suit as safe to ignore
+// as the side effect of construction of this type.
+struct GTEST_API_ MarkAsIgnored {
+  explicit MarkAsIgnored(const char* test_suite);
+};
+
+GTEST_API_ void InsertSyntheticTestCase(const std::string& name,
+                                        CodeLocation location, bool has_test_p);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test suite and generators
+// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that
+// test suite. It registers tests with all values generated by all
+// generators when asked.
+template <class TestSuite>
+class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestSuiteInstantiation().
+  using ParamType = typename TestSuite::ParamType;
+  // A function that returns an instance of appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType>&);
+
+  explicit ParameterizedTestSuiteInfo(const char* name,
+                                      CodeLocation code_location)
+      : test_suite_name_(name), code_location_(code_location) {}
+
+  // Test suite base name for display purposes.
+  const std::string& GetTestSuiteName() const override {
+    return test_suite_name_;
+  }
+  // Test suite id to verify identity.
+  TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a LocalTestInfo structure.
+  // test_suite_name is the base name of the test suite (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
+  // test suite base name and DoBar is test base name.
+  void AddTestPattern(const char* test_suite_name, const char* test_base_name,
+                      TestMetaFactoryBase<ParamType>* meta_factory,
+                      CodeLocation code_location) {
+    tests_.push_back(std::shared_ptr<TestInfo>(new TestInfo(
+        test_suite_name, test_base_name, meta_factory, code_location)));
+  }
+  // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information
+  // about a generator.
+  int AddTestSuiteInstantiation(const std::string& instantiation_name,
+                                GeneratorCreationFunc* func,
+                                ParamNameGeneratorFunc* name_func,
+                                const char* file, int line) {
+    instantiations_.push_back(
+        InstantiationInfo(instantiation_name, func, name_func, file, line));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test suite
+  // right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more than once.
+  void RegisterTests() override {
+    bool generated_instantiations = false;
+
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      std::shared_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin();
+           gen_it != instantiations_.end(); ++gen_it) {
+        const std::string& instantiation_name = gen_it->name;
+        ParamGenerator<ParamType> generator((*gen_it->generator)());
+        ParamNameGeneratorFunc* name_func = gen_it->name_func;
+        const char* file = gen_it->file;
+        int line = gen_it->line;
+
+        std::string test_suite_name;
+        if (!instantiation_name.empty())
+          test_suite_name = instantiation_name + "/";
+        test_suite_name += test_info->test_suite_base_name;
+
+        size_t i = 0;
+        std::set<std::string> test_param_names;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          generated_instantiations = true;
+
+          Message test_name_stream;
+
+          std::string param_name =
+              name_func(TestParamInfo<ParamType>(*param_it, i));
+
+          GTEST_CHECK_(IsValidParamName(param_name))
+              << "Parameterized test name '" << param_name
+              << "' is invalid, in " << file << " line " << line << std::endl;
+
+          GTEST_CHECK_(test_param_names.count(param_name) == 0)
+              << "Duplicate parameterized test name '" << param_name << "', in "
+              << file << " line " << line << std::endl;
+
+          test_param_names.insert(param_name);
+
+          if (!test_info->test_base_name.empty()) {
+            test_name_stream << test_info->test_base_name << "/";
+          }
+          test_name_stream << param_name;
+          MakeAndRegisterTestInfo(
+              test_suite_name.c_str(), test_name_stream.GetString().c_str(),
+              nullptr,  // No type parameter.
+              PrintToString(*param_it).c_str(), test_info->code_location,
+              GetTestSuiteTypeId(),
+              SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
+              SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }    // for gen_it
+    }      // for test_it
+
+    if (!generated_instantiations) {
+      // There are no generaotrs, or they all generate nothing ...
+      InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
+                              !tests_.empty());
+    }
+  }  // RegisterTests
+
+ private:
+  // LocalTestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+  struct TestInfo {
+    TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name,
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory,
+             CodeLocation a_code_location)
+        : test_suite_base_name(a_test_suite_base_name),
+          test_base_name(a_test_base_name),
+          test_meta_factory(a_test_meta_factory),
+          code_location(a_code_location) {}
+
+    const std::string test_suite_base_name;
+    const std::string test_base_name;
+    const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
+    const CodeLocation code_location;
+  };
+  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
+  // Records data received from INSTANTIATE_TEST_SUITE_P macros:
+  //  <Instantiation name, Sequence generator creation function,
+  //     Name generator function, Source file, Source line>
+  struct InstantiationInfo {
+    InstantiationInfo(const std::string& name_in,
+                      GeneratorCreationFunc* generator_in,
+                      ParamNameGeneratorFunc* name_func_in, const char* file_in,
+                      int line_in)
+        : name(name_in),
+          generator(generator_in),
+          name_func(name_func_in),
+          file(file_in),
+          line(line_in) {}
+
+    std::string name;
+    GeneratorCreationFunc* generator;
+    ParamNameGeneratorFunc* name_func;
+    const char* file;
+    int line;
+  };
+  typedef ::std::vector<InstantiationInfo> InstantiationContainer;
+
+  static bool IsValidParamName(const std::string& name) {
+    // Check for empty string
+    if (name.empty()) return false;
+
+    // Check for invalid characters
+    for (std::string::size_type index = 0; index < name.size(); ++index) {
+      if (!IsAlNum(name[index]) && name[index] != '_') return false;
+    }
+
+    return true;
+  }
+
+  const std::string test_suite_name_;
+  CodeLocation code_location_;
+  TestInfoContainer tests_;
+  InstantiationContainer instantiations_;
+
+  ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete;
+  ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) =
+      delete;
+};  // class ParameterizedTestSuiteInfo
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+template <class TestCase>
+using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo<TestCase>;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteRegistry contains a map of
+// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P
+// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding
+// ParameterizedTestSuiteInfo descriptors.
+class ParameterizedTestSuiteRegistry {
+ public:
+  ParameterizedTestSuiteRegistry() {}
+  ~ParameterizedTestSuiteRegistry() {
+    for (auto& test_suite_info : test_suite_infos_) {
+      delete test_suite_info;
+    }
+  }
+
+  // Looks up or creates and returns a structure containing information about
+  // tests and instantiations of a particular test suite.
+  template <class TestSuite>
+  ParameterizedTestSuiteInfo<TestSuite>* GetTestSuitePatternHolder(
+      const char* test_suite_name, CodeLocation code_location) {
+    ParameterizedTestSuiteInfo<TestSuite>* typed_test_info = nullptr;
+    for (auto& test_suite_info : test_suite_infos_) {
+      if (test_suite_info->GetTestSuiteName() == test_suite_name) {
+        if (test_suite_info->GetTestSuiteTypeId() != GetTypeId<TestSuite>()) {
+          // Complain about incorrect usage of Google Test facilities
+          // and terminate the program since we cannot guaranty correct
+          // test suite setup and tear-down in this case.
+          ReportInvalidTestSuiteType(test_suite_name, code_location);
+          posix::Abort();
+        } else {
+          // At this point we are sure that the object we found is of the same
+          // type we are looking for, so we downcast it to that type
+          // without further checks.
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == nullptr) {
+      typed_test_info = new ParameterizedTestSuiteInfo<TestSuite>(
+          test_suite_name, code_location);
+      test_suite_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (auto& test_suite_info : test_suite_infos_) {
+      test_suite_info->RegisterTests();
+    }
+  }
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+      const char* test_case_name, CodeLocation code_location) {
+    return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
+  }
+
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ private:
+  using TestSuiteInfoContainer = ::std::vector<ParameterizedTestSuiteInfoBase*>;
+
+  TestSuiteInfoContainer test_suite_infos_;
+
+  ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) =
+      delete;
+  ParameterizedTestSuiteRegistry& operator=(
+      const ParameterizedTestSuiteRegistry&) = delete;
+};
+
+// Keep track of what type-parameterized test suite are defined and
+// where as well as which are intatiated. This allows susequently
+// identifying suits that are defined but never used.
+class TypeParameterizedTestSuiteRegistry {
+ public:
+  // Add a suite definition
+  void RegisterTestSuite(const char* test_suite_name,
+                         CodeLocation code_location);
+
+  // Add an instantiation of a suit.
+  void RegisterInstantiation(const char* test_suite_name);
+
+  // For each suit repored as defined but not reported as instantiation,
+  // emit a test that reports that fact (configurably, as an error).
+  void CheckForInstantiations();
+
+ private:
+  struct TypeParameterizedTestSuiteInfo {
+    explicit TypeParameterizedTestSuiteInfo(CodeLocation c)
+        : code_location(c), instantiated(false) {}
+
+    CodeLocation code_location;
+    bool instantiated;
+  };
+
+  std::map<std::string, TypeParameterizedTestSuiteInfo> suites_;
+};
+
+}  // namespace internal
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container& container);
+
+namespace internal {
+// Used in the Values() function to provide polymorphic capabilities.
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
+template <typename... Ts>
+class ValueArray {
+ public:
+  explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // NOLINT
+    return ValuesIn(MakeVector<T>(MakeIndexSequence<sizeof...(Ts)>()));
+  }
+
+ private:
+  template <typename T, size_t... I>
+  std::vector<T> MakeVector(IndexSequence<I...>) const {
+    return std::vector<T>{static_cast<T>(v_.template Get<I>())...};
+  }
+
+  FlatTuple<Ts...> v_;
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+template <typename... T>
+class CartesianProductGenerator
+    : public ParamGeneratorInterface<::std::tuple<T...>> {
+ public:
+  typedef ::std::tuple<T...> ParamType;
+
+  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...>& g)
+      : generators_(g) {}
+  ~CartesianProductGenerator() override {}
+
+  ParamIteratorInterface<ParamType>* Begin() const override {
+    return new Iterator(this, generators_, false);
+  }
+  ParamIteratorInterface<ParamType>* End() const override {
+    return new Iterator(this, generators_, true);
+  }
+
+ private:
+  template <class I>
+  class IteratorImpl;
+  template <size_t... I>
+  class IteratorImpl<IndexSequence<I...>>
+      : public ParamIteratorInterface<ParamType> {
+   public:
+    IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
+                 const std::tuple<ParamGenerator<T>...>& generators,
+                 bool is_end)
+        : base_(base),
+          begin_(std::get<I>(generators).begin()...),
+          end_(std::get<I>(generators).end()...),
+          current_(is_end ? end_ : begin_) {
+      ComputeCurrentValue();
+    }
+    ~IteratorImpl() override {}
+
+    const ParamGeneratorInterface<ParamType>* BaseGenerator() const override {
+      return base_;
+    }
+    // Advance should not be called on beyond-of-range iterators
+    // so no component iterators must be beyond end of range, either.
+    void Advance() override {
+      assert(!AtEnd());
+      // Advance the last iterator.
+      ++std::get<sizeof...(T) - 1>(current_);
+      // if that reaches end, propagate that up.
+      AdvanceIfEnd<sizeof...(T) - 1>();
+      ComputeCurrentValue();
+    }
+    ParamIteratorInterface<ParamType>* Clone() const override {
+      return new IteratorImpl(*this);
+    }
+
+    const ParamType* Current() const override { return current_value_.get(); }
+
+    bool Equals(const ParamIteratorInterface<ParamType>& other) const override {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      const IteratorImpl* typed_other =
+          CheckedDowncastToActualType<const IteratorImpl>(&other);
+
+      // We must report iterators equal if they both point beyond their
+      // respective ranges. That can happen in a variety of fashions,
+      // so we have to consult AtEnd().
+      if (AtEnd() && typed_other->AtEnd()) return true;
+
+      bool same = true;
+      bool dummy[] = {
+          (same = same && std::get<I>(current_) ==
+                              std::get<I>(typed_other->current_))...};
+      (void)dummy;
+      return same;
+    }
+
+   private:
+    template <size_t ThisI>
+    void AdvanceIfEnd() {
+      if (std::get<ThisI>(current_) != std::get<ThisI>(end_)) return;
+
+      bool last = ThisI == 0;
+      if (last) {
+        // We are done. Nothing else to propagate.
+        return;
+      }
+
+      constexpr size_t NextI = ThisI - (ThisI != 0);
+      std::get<ThisI>(current_) = std::get<ThisI>(begin_);
+      ++std::get<NextI>(current_);
+      AdvanceIfEnd<NextI>();
+    }
+
+    void ComputeCurrentValue() {
+      if (!AtEnd())
+        current_value_ = std::make_shared<ParamType>(*std::get<I>(current_)...);
+    }
+    bool AtEnd() const {
+      bool at_end = false;
+      bool dummy[] = {
+          (at_end = at_end || std::get<I>(current_) == std::get<I>(end_))...};
+      (void)dummy;
+      return at_end;
+    }
+
+    const ParamGeneratorInterface<ParamType>* const base_;
+    std::tuple<typename ParamGenerator<T>::iterator...> begin_;
+    std::tuple<typename ParamGenerator<T>::iterator...> end_;
+    std::tuple<typename ParamGenerator<T>::iterator...> current_;
+    std::shared_ptr<ParamType> current_value_;
+  };
+
+  using Iterator = IteratorImpl<typename MakeIndexSequence<sizeof...(T)>::type>;
+
+  std::tuple<ParamGenerator<T>...> generators_;
+};
+
+template <class... Gen>
+class CartesianProductHolder {
+ public:
+  CartesianProductHolder(const Gen&... g) : generators_(g...) {}
+  template <typename... T>
+  operator ParamGenerator<::std::tuple<T...>>() const {
+    return ParamGenerator<::std::tuple<T...>>(
+        new CartesianProductGenerator<T...>(generators_));
+  }
+
+ private:
+  std::tuple<Gen...> generators_;
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
new file mode 100644
index 0000000000..f025db76ad
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
@@ -0,0 +1,116 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines the GTEST_OS_* macro.
+// It is separate from gtest-port.h so that custom/gtest-port.h can include it.
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+
+// Determines the platform on which Google Test is compiled.
+#ifdef __CYGWIN__
+#define GTEST_OS_CYGWIN 1
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
+#elif defined _WIN32
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif  // _WIN32_WCE
+#elif defined __OS2__
+#define GTEST_OS_OS2 1
+#elif defined __APPLE__
+#define GTEST_OS_MAC 1
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
+#elif defined __DragonFly__
+#define GTEST_OS_DRAGONFLY 1
+#elif defined __FreeBSD__
+#define GTEST_OS_FREEBSD 1
+#elif defined __Fuchsia__
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GNU__)
+#define GTEST_OS_GNU_HURD 1
+#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
+#define GTEST_OS_GNU_KFREEBSD 1
+#elif defined __linux__
+#define GTEST_OS_LINUX 1
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
+#elif defined __MVS__
+#define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+#define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+#define GTEST_OS_AIX 1
+#elif defined(__hpux)
+#define GTEST_OS_HPUX 1
+#elif defined __native_client__
+#define GTEST_OS_NACL 1
+#elif defined __NetBSD__
+#define GTEST_OS_NETBSD 1
+#elif defined __OpenBSD__
+#define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+#define GTEST_OS_QNX 1
+#elif defined(__HAIKU__)
+#define GTEST_OS_HAIKU 1
+#elif defined ESP8266
+#define GTEST_OS_ESP8266 1
+#elif defined ESP32
+#define GTEST_OS_ESP32 1
+#elif defined(__XTENSA__)
+#define GTEST_OS_XTENSA 1
+#endif  // __CYGWIN__
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
new file mode 100644
index 0000000000..0003d27658
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
@@ -0,0 +1,2413 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Low-level types and utilities for porting Google Test to various
+// platforms.  All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice.  Code
+// outside Google Test MUST NOT USE THEM DIRECTLY.  Macros that don't
+// end with _ are part of Google Test's public API and can be used by
+// code outside Google Test.
+//
+// This file is fundamental to Google Test.  All other Google Test source
+// files are expected to #include this.  Therefore, it cannot #include
+// any other Google Test header.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// Environment-describing macros
+// -----------------------------
+//
+// Google Test can be used in many different environments.  Macros in
+// this section tell Google Test what kind of environment it is being
+// used in, such that Google Test can provide environment-specific
+// features and implementations.
+//
+// Google Test tries to automatically detect the properties of its
+// environment, so users usually don't need to worry about these
+// macros.  However, the automatic detection is not perfect.
+// Sometimes it's necessary for a user to define some of the following
+// macros in the build script to override Google Test's decisions.
+//
+// If the user doesn't define a macro in the list, Google Test will
+// provide a default definition.  After this header is #included, all
+// macros in this list will be defined to either 1 or 0.
+//
+// Notes to maintainers:
+//   - Each macro here is a user-tweakable knob; do not grow the list
+//     lightly.
+//   - Use #if to key off these macros.  Don't use #ifdef or "#if
+//     defined(...)", which will not work as these macros are ALWAYS
+//     defined.
+//
+//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
+//                              is/isn't available.
+//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
+//                              are enabled.
+//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
+//                              expressions are/aren't available.
+//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
+//                              is/isn't available.
+//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
+//                              enabled.
+//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
+//                              std::wstring does/doesn't work (Google Test can
+//                              be used where std::wstring is unavailable).
+//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
+//                              compiler supports Microsoft's "Structured
+//                              Exception Handling".
+//   GTEST_HAS_STREAM_REDIRECTION
+//                            - Define it to 1/0 to indicate whether the
+//                              platform supports I/O stream redirection using
+//                              dup() and dup2().
+//   GTEST_LINKED_AS_SHARED_LIBRARY
+//                            - Define to 1 when compiling tests that use
+//                              Google Test as a shared library (known as
+//                              DLL on Windows).
+//   GTEST_CREATE_SHARED_LIBRARY
+//                            - Define to 1 when compiling Google Test itself
+//                              as a shared library.
+//   GTEST_DEFAULT_DEATH_TEST_STYLE
+//                            - The default value of --gtest_death_test_style.
+//                              The legacy default has been "fast" in the open
+//                              source version since 2008. The recommended value
+//                              is "threadsafe", and can be set in
+//                              custom/gtest-port.h.
+
+// Platform-indicating macros
+// --------------------------
+//
+// Macros indicating the platform on which Google Test is being used
+// (a macro is defined to 1 if compiled on the given platform;
+// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
+// defines these macros automatically.  Code outside Google Test MUST
+// NOT define them.
+//
+//   GTEST_OS_AIX      - IBM AIX
+//   GTEST_OS_CYGWIN   - Cygwin
+//   GTEST_OS_DRAGONFLY - DragonFlyBSD
+//   GTEST_OS_FREEBSD  - FreeBSD
+//   GTEST_OS_FUCHSIA  - Fuchsia
+//   GTEST_OS_GNU_HURD - GNU/Hurd
+//   GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
+//   GTEST_OS_HAIKU    - Haiku
+//   GTEST_OS_HPUX     - HP-UX
+//   GTEST_OS_LINUX    - Linux
+//     GTEST_OS_LINUX_ANDROID - Google Android
+//   GTEST_OS_MAC      - Mac OS X
+//     GTEST_OS_IOS    - iOS
+//   GTEST_OS_NACL     - Google Native Client (NaCl)
+//   GTEST_OS_NETBSD   - NetBSD
+//   GTEST_OS_OPENBSD  - OpenBSD
+//   GTEST_OS_OS2      - OS/2
+//   GTEST_OS_QNX      - QNX
+//   GTEST_OS_SOLARIS  - Sun Solaris
+//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
+//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
+//     GTEST_OS_WINDOWS_MINGW    - MinGW
+//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
+//     GTEST_OS_WINDOWS_PHONE    - Windows Phone
+//     GTEST_OS_WINDOWS_RT       - Windows Store App/WinRT
+//   GTEST_OS_ZOS      - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support.  Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable.  If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// It is possible that none of the GTEST_OS_* macros are defined.
+
+// Feature-indicating macros
+// -------------------------
+//
+// Macros indicating which Google Test features are available (a macro
+// is defined to 1 if the corresponding feature is supported;
+// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
+// defines these macros automatically.  Code outside Google Test MUST
+// NOT define them.
+//
+// These macros are public so that portable tests can be written.
+// Such tests typically surround code using a feature with an #if
+// which controls that code.  For example:
+//
+// #if GTEST_HAS_DEATH_TEST
+//   EXPECT_DEATH(DoSomethingDeadly());
+// #endif
+//
+//   GTEST_HAS_DEATH_TEST   - death tests
+//   GTEST_HAS_TYPED_TEST   - typed tests
+//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+//   GTEST_IS_THREADSAFE    - Google Test is thread-safe.
+//   GTEST_USES_RE2         - the RE2 regular expression library is used
+//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
+//                            GTEST_HAS_POSIX_RE (see above) which users can
+//                            define themselves.
+//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
+//                            the above RE\b(s) are mutually exclusive.
+
+// Misc public macros
+// ------------------
+//
+//   GTEST_FLAG(flag_name)  - references the variable corresponding to
+//                            the given Google Test flag.
+
+// Internal utilities
+// ------------------
+//
+// The following macros and utilities are for Google Test's INTERNAL
+// use only.  Code outside Google Test MUST NOT USE THEM DIRECTLY.
+//
+// Macros for basic C++ coding:
+//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
+//                              variable don't have to be used.
+//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
+//   GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
+//                                        suppressed (constant conditional).
+//   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
+//                                        is suppressed.
+//   GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter<std::any> or
+//                            UniversalPrinter<absl::any> specializations.
+//   GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter<std::optional>
+//   or
+//                                 UniversalPrinter<absl::optional>
+//                                 specializations.
+//   GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher<std::string_view> or
+//                                    Matcher<absl::string_view>
+//                                    specializations.
+//   GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter<std::variant> or
+//                                UniversalPrinter<absl::variant>
+//                                specializations.
+//
+// Synchronization:
+//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
+//                            - synchronization primitives.
+//
+// Regular expressions:
+//   RE             - a simple regular expression class using
+//                     1) the RE2 syntax on all platforms when built with RE2
+//                        and Abseil as dependencies
+//                     2) the POSIX Extended Regular Expression syntax on
+//                        UNIX-like platforms,
+//                     3) A reduced regular exception syntax on other platforms,
+//                        including Windows.
+// Logging:
+//   GTEST_LOG_()   - logs messages at the specified severity level.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+//   CaptureStdout()     - starts capturing stdout.
+//   GetCapturedStdout() - stops capturing stdout and returns the captured
+//                         string.
+//   CaptureStderr()     - starts capturing stderr.
+//   GetCapturedStderr() - stops capturing stderr and returns the captured
+//                         string.
+//
+// Integer types:
+//   TypeWithSize   - maps an integer to a int type.
+//   TimeInMillis   - integers of known sizes.
+//   BiggestInt     - the biggest signed integer type.
+//
+// Command-line utilities:
+//   GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+//   GetEnv()             - gets the value of an environment variable.
+//   BoolFromGTestEnv()   - parses a bool environment variable.
+//   Int32FromGTestEnv()  - parses an int32_t environment variable.
+//   StringFromGTestEnv() - parses a string environment variable.
+//
+// Deprecation warnings:
+//   GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as
+//                                        deprecated; calling a marked function
+//                                        should generate a compiler warning
+
+#include <ctype.h>   // for isspace, etc
+#include <stddef.h>  // for ptrdiff_t
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cerrno>
+// #include <condition_variable>  // Guarded by GTEST_IS_THREADSAFE below
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <locale>
+#include <memory>
+#include <string>
+// #include <mutex>  // Guarded by GTEST_IS_THREADSAFE below
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#ifndef _WIN32_WCE
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif  // !_WIN32_WCE
+
+#if defined __APPLE__
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
+#endif
+
+#include "gtest/internal/custom/gtest-port.h"
+#include "gtest/internal/gtest-port-arch.h"
+
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/reflection.h"
+#endif
+
+#if !defined(GTEST_DEV_EMAIL_)
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#endif  // !defined(GTEST_DEV_EMAIL_)
+
+#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#endif  // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+#define GTEST_GCC_VER_ \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif  // __GNUC__
+
+// Macros for disabling Microsoft Visual C++ warnings.
+//
+//   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
+//   /* code that triggers warnings C4800 and C4385 */
+//   GTEST_DISABLE_MSC_WARNINGS_POP_()
+#if defined(_MSC_VER)
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+  __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
+#else
+// Not all compilers are MSVC
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+// Clang on Windows does not understand MSVC's pragma warning.
+// We need clang-specific way to disable function deprecation warning.
+#ifdef __clang__
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                            \
+  _Pragma("clang diagnostic push")                                      \
+      _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+          _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
+#else
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if GTEST_OS_WINDOWS
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
+// In order to avoid having to include <windows.h>, use forward declaration
+#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
+// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
+// separate (equivalent) structs, instead of using typedef
+typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#else
+// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
+// This assumption is verified by
+// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
+typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
+#endif
+#elif GTEST_OS_XTENSA
+#include <unistd.h>
+// Xtensa toolchains define strcasecmp in the string.h header instead of
+// strings.h. string.h is already included.
+#else
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+#include <strings.h>
+#include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+#include <android/api-level.h>  // NOLINT
+#endif
+
+// Defines this to true if and only if Google Test can use POSIX regular
+// expressions.
+#ifndef GTEST_HAS_POSIX_RE
+#if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
+#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
+#endif
+#endif
+
+// Select the regular expression implementation.
+#if GTEST_HAS_ABSL
+// When using Abseil, RE2 is required.
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+#define GTEST_USES_RE2 1
+#elif GTEST_HAS_POSIX_RE
+#include <regex.h>  // NOLINT
+#define GTEST_USES_POSIX_RE 1
+#else
+// Use our own simple regex implementation.
+#define GTEST_USES_SIMPLE_RE 1
+#endif
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
+// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
+// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif  // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
+// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
+// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
+// there can be cleanups for ObjC exceptions which also need cleanups, even if
+// C++ exceptions are disabled. clang has __has_feature(cxx_exceptions) which
+// checks for C++ exceptions starting at clang r206352, but which checked for
+// cleanups prior to that. To reliably check for C++ exception availability with
+// clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions.  However, there is no compile-time way of
+// detecting whether they are enabled or not.  Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned of by +noeh compiler option if desired.
+#define GTEST_HAS_EXCEPTIONS 1
+#else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#define GTEST_HAS_EXCEPTIONS 0
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either.  Android has
+// no support for it at least as recent as Froyo (2.2).
+#define GTEST_HAS_STD_WSTRING                                         \
+  (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+#ifdef _MSC_VER
+
+#ifdef _CPPRTTI  // MSVC defines this macro if and only if RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
+// enabled.
+#elif defined(__GNUC__)
+
+#ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
+// so disable RTTI when detected.
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif  // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+#elif defined(__clang__)
+
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
+
+#else
+
+// For all other compilers, we assume RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+
+#endif  // _MSC_VER
+
+#endif  // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+#include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we make reasonable assumptions about
+// which platforms have pthreads support.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+#define GTEST_HAS_PTHREAD                                                      \
+  (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX ||          \
+   GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
+   GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD ||          \
+   GTEST_OS_HAIKU || GTEST_OS_GNU_HURD)
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+#include <pthread.h>  // NOLINT
+
+// For timespec and nanosleep, used below.
+#include <time.h>  // NOLINT
+#endif
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
+// On Android, clone() became available at different API levels for each 32-bit
+// architecture.
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+    (defined(__mips__) && __ANDROID_API__ >= 12) ||                    \
+    (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif  // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif  // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif  // !GTEST_OS_WINDOWS_MOBILE
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS ||             \
+     (GTEST_OS_MAC && !GTEST_OS_IOS) ||                                   \
+     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW ||  \
+     GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
+     GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA ||           \
+     GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU ||     \
+     GTEST_OS_GNU_HURD)
+#define GTEST_HAS_DEATH_TEST 1
+#endif
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
+    defined(__IBMCPP__) || defined(__HP_aCC)
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+  (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
+    GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD ||       \
+    GTEST_OS_GNU_HURD
+#define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding.  This leads to problems with code like:
+//
+//   if (gate)
+//     ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+  switch (0)                          \
+  case 0:                             \
+  default:  // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used.  This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor.  Example:
+//
+//   struct Foo {
+//     Foo() { ... }
+//   } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#elif defined(__clang__)
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
+#endif
+#ifndef GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+// Use this annotation before a function that takes a printf format string.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
+#if defined(__MINGW_PRINTF_FORMAT)
+// MinGW has two different printf implementations. Ensure the format macro
+// matches the selected implementation. See
+// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+  __attribute__((                                             \
+      __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#endif
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro.  The macro should be used on function declarations
+// following the argument list:
+//
+//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
+#else
+#define GTEST_MUST_USE_RESULT_
+#endif  // __GNUC__ && !COMPILER_ICC
+
+// MS C++ compiler emits warning when a conditional expression is compile time
+// constant. In some contexts this warning is false positive and needs to be
+// suppressed. Use the following two macros in such cases:
+//
+// GTEST_INTENTIONAL_CONST_COND_PUSH_()
+// while (true) {
+// GTEST_INTENTIONAL_CONST_COND_POP_()
+// }
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling.  This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+#define GTEST_HAS_SEH 1
+#else
+// Assume no SEH.
+#define GTEST_HAS_SEH 0
+#endif
+
+#endif  // GTEST_HAS_SEH
+
+#ifndef GTEST_IS_THREADSAFE
+
+#define GTEST_IS_THREADSAFE                                                 \
+  (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ ||                                     \
+   (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \
+   GTEST_HAS_PTHREAD)
+
+#endif  // GTEST_IS_THREADSAFE
+
+#if GTEST_IS_THREADSAFE
+// Some platforms don't support including these threading related headers.
+#include <condition_variable>  // NOLINT
+#include <mutex>               // NOLINT
+#endif                         // GTEST_IS_THREADSAFE
+
+// GTEST_API_ qualifies all symbols that must be exported. The definitions below
+// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
+// gtest/internal/custom/gtest-port.h
+#ifndef GTEST_API_
+
+#ifdef _MSC_VER
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
+#elif __GNUC__ >= 4 || defined(__clang__)
+#define GTEST_API_ __attribute__((visibility("default")))
+#endif  // _MSC_VER
+
+#endif  // GTEST_API_
+
+#ifndef GTEST_API_
+#define GTEST_API_
+#endif  // GTEST_API_
+
+#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+#endif  // GTEST_DEFAULT_DEATH_TEST_STYLE
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+#define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+#define GTEST_NO_INLINE_
+#endif
+
+#if defined(__clang__)
+// Nested ifs to avoid triggering MSVC warning.
+#if __has_attribute(disable_tail_calls)
+// Ask the compiler not to perform tail call optimization inside
+// the marked function.
+#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls))
+#endif
+#elif __GNUC__
+#define GTEST_NO_TAIL_CALL_ \
+  __attribute__((optimize("no-optimize-sibling-calls")))
+#else
+#define GTEST_NO_TAIL_CALL_
+#endif
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if !defined(GTEST_HAS_CXXABI_H_)
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
+#else
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
+#endif
+
+// A function level attribute to disable checking for use of uninitialized
+// memory when built with MemorySanitizer.
+#if defined(__clang__)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif  // __has_feature(memory_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif  // __clang__
+
+// A function level attribute to disable AddressSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+  __attribute__((no_sanitize_address))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __has_feature(address_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif  // __clang__
+
+// A function level attribute to disable HWAddressSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+  __attribute__((no_sanitize("hwaddress")))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif  // __has_feature(hwaddress_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif  // __clang__
+
+// A function level attribute to disable ThreadSanitizer instrumentation.
+#if defined(__clang__)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif  // __has_feature(thread_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif  // __clang__
+
+namespace testing {
+
+class Message;
+
+// Legacy imports for backwards compatibility.
+// New code should use std:: names directly.
+using std::get;
+using std::make_tuple;
+using std::tuple;
+using std::tuple_element;
+using std::tuple_size;
+
+namespace internal {
+
+// A secret type that Google Test users don't know about.  It has no
+// definition on purpose.  Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// A helper for suppressing warnings on constant condition.  It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines RE.
+
+#if GTEST_USES_RE2
+
+// This is almost `using RE = ::RE2`, except it is copy-constructible, and it
+// needs to disambiguate the `std::string`, `absl::string_view`, and `const
+// char*` constructors.
+class GTEST_API_ RE {
+ public:
+  RE(absl::string_view regex) : regex_(regex) {}                  // NOLINT
+  RE(const char* regex) : RE(absl::string_view(regex)) {}         // NOLINT
+  RE(const std::string& regex) : RE(absl::string_view(regex)) {}  // NOLINT
+  RE(const RE& other) : RE(other.pattern()) {}
+
+  const std::string& pattern() const { return regex_.pattern(); }
+
+  static bool FullMatch(absl::string_view str, const RE& re) {
+    return RE2::FullMatch(str, re.regex_);
+  }
+  static bool PartialMatch(absl::string_view str, const RE& re) {
+    return RE2::PartialMatch(str, re.regex_);
+  }
+
+ private:
+  RE2 regex_;
+};
+
+#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
+
+// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+  // A copy constructor is required by the Standard to initialize object
+  // references from r-values.
+  RE(const RE& other) { Init(other.pattern()); }
+
+  // Constructs an RE from a string.
+  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
+
+  RE(const char* regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char* pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true if and only if regular expression re
+  // matches the entire str.
+  // PartialMatch(str, re) returns true if and only if regular expression re
+  // matches a substring of str (including str itself).
+  static bool FullMatch(const ::std::string& str, const RE& re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string& str, const RE& re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+  static bool FullMatch(const char* str, const RE& re);
+  static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+  void Init(const char* regex);
+  const char* pattern_;
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char* full_pattern_;  // For FullMatch();
+
+#endif
+};
+
+#endif  // ::testing::internal::RE implementation
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTestLog(const GTestLog&) = delete;
+  GTestLog& operator=(const GTestLog&) = delete;
+};
+
+#if !defined(GTEST_LOG_)
+
+#define GTEST_LOG_(severity)                                           \
+  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                __FILE__, __LINE__)                    \
+      .GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(nullptr); }
+
+#endif  // !defined(GTEST_LOG_)
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//  Synopsis:
+//    GTEST_CHECK_(boolean_condition);
+//     or
+//    GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//    This checks the condition and if the condition is not satisfied
+//    it prints message about the condition violation, including the
+//    condition itself, plus additional message streamed into it, if any,
+//    and then it aborts the program. It aborts the program irrespective of
+//    whether it is built in the debug mode or not.
+#define GTEST_CHECK_(condition)               \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
+  if (::testing::internal::IsTrue(condition)) \
+    ;                                         \
+  else                                        \
+    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif  // !defined(GTEST_CHECK_)
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success).  Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call))    \
+  GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error
+
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing). Specifically, it transforms:
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> char&
+//   const char&  ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
+template <typename T>
+struct ConstRef {
+  typedef const T& type;
+};
+template <typename T>
+struct ConstRef<T&> {
+  typedef T& type;
+};
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+  typename ::testing::internal::ConstRef<T>::type
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*).  When you use ImplicitCast_, the compiler checks that
+// the cast is safe.  Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late.  It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed.  When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo?  It
+// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
+// when you downcast, you should use this macro.  In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not).  In normal mode, we do the efficient static_cast<>
+// instead.  Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To, typename From>  // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) {         // so we only accept pointers
+  // Ensures that To is a sub-type of From *.  This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (false) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = nullptr;
+    ::testing::internal::ImplicitCast_<From*>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+#if GTEST_HAS_DOWNCAST_
+  return ::down_cast<Derived*>(base);
+#elif GTEST_HAS_RTTI
+  return dynamic_cast<Derived*>(base);  // NOLINT
+#else
+  return static_cast<Derived*>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE* file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE* file);
+
+// All command line arguments.
+GTEST_API_ std::vector<std::string> GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs);
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs);
+void ClearInjectableArgvs();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+
+#if GTEST_OS_WINDOWS
+// Provides leak-safe Windows kernel handle ownership.
+// Used in death tests and in threading support.
+class GTEST_API_ AutoHandle {
+ public:
+  // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to
+  // avoid including <windows.h> in this header file. Including <windows.h> is
+  // undesirable because it defines a lot of symbols and macros that tend to
+  // conflict with client code. This assumption is verified by
+  // WindowsTypesTest.HANDLEIsVoidStar.
+  typedef void* Handle;
+  AutoHandle();
+  explicit AutoHandle(Handle handle);
+
+  ~AutoHandle();
+
+  Handle Get() const;
+  void Reset();
+  void Reset(Handle handle);
+
+ private:
+  // Returns true if and only if the handle is a valid handle object that can be
+  // closed.
+  bool IsCloseable() const;
+
+  Handle handle_;
+
+  AutoHandle(const AutoHandle&) = delete;
+  AutoHandle& operator=(const AutoHandle&) = delete;
+};
+#endif
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#else
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified.  Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+// TODO(b/203539622): Replace unconditionally with absl::Notification.
+class GTEST_API_ Notification {
+ public:
+  Notification() : notified_(false) {}
+  Notification(const Notification&) = delete;
+  Notification& operator=(const Notification&) = delete;
+
+  // Notifies all threads created with this notification to start. Must
+  // be called from the controller thread.
+  void Notify() {
+    std::lock_guard<std::mutex> lock(mu_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  // Blocks until the controller thread notifies. Must be called from a test
+  // thread.
+  void WaitForNotification() {
+    std::unique_lock<std::mutex> lock(mu_);
+    cv_.wait(lock, [this]() { return notified_; });
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  bool notified_;
+};
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+#endif  // GTEST_HAS_NOTIFICATION_
+
+// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
+// defined, but we don't want to use MinGW's pthreads implementation, which
+// has conformance problems with some versions of the POSIX standard.
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+  virtual ~ThreadWithParamBase() {}
+  virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical.  Some compilers (for
+// example, SunStudio) treat them as different types.  Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+  static_cast<ThreadWithParamBase*>(thread)->Run();
+  return nullptr;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+//   void ThreadFunc(int param) { /* Do things with param */ }
+//   Notification thread_can_start;
+//   ...
+//   // The thread_can_start parameter is optional; you can supply NULL.
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+//   thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : func_(func),
+        param_(param),
+        thread_can_start_(thread_can_start),
+        finished_(false) {
+    ThreadWithParamBase* const base = this;
+    // The thread can be created only after all fields except thread_
+    // have been initialized.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base));
+  }
+  ~ThreadWithParam() override { Join(); }
+
+  void Join() {
+    if (!finished_) {
+      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr));
+      finished_ = true;
+    }
+  }
+
+  void Run() override {
+    if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification();
+    func_(param_);
+  }
+
+ private:
+  UserThreadFunc* const func_;  // User-supplied thread function.
+  const T param_;  // User-supplied parameter to the thread function.
+  // When non-NULL, used to block execution until the controller thread
+  // notifies.
+  Notification* const thread_can_start_;
+  bool finished_;  // true if and only if we know that the thread function has
+                   // finished.
+  pthread_t thread_;  // The native thread object.
+
+  ThreadWithParam(const ThreadWithParam&) = delete;
+  ThreadWithParam& operator=(const ThreadWithParam&) = delete;
+};
+#endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+        // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+// Mutex and ThreadLocal have already been imported into the namespace.
+// Nothing to do here.
+
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Mutex implements mutex on Windows platforms.  It is used in conjunction
+// with class MutexLock:
+//
+//   Mutex mutex;
+//   ...
+//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the
+//                            // end of the current scope.
+//
+// A static Mutex *must* be defined or declared using one of the following
+// macros:
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// (A non-static Mutex is defined/declared in the usual way).
+class GTEST_API_ Mutex {
+ public:
+  enum MutexType { kStatic = 0, kDynamic = 1 };
+  // We rely on kStaticMutex being 0 as it is to what the linker initializes
+  // type_ in static mutexes.  critical_section_ will be initialized lazily
+  // in ThreadSafeLazyInit().
+  enum StaticConstructorSelector { kStaticMutex = 0 };
+
+  // This constructor intentionally does nothing.  It relies on type_ being
+  // statically initialized to 0 (effectively setting it to kStatic) and on
+  // ThreadSafeLazyInit() to lazily initialize the rest of the members.
+  explicit Mutex(StaticConstructorSelector /*dummy*/) {}
+
+  Mutex();
+  ~Mutex();
+
+  void Lock();
+
+  void Unlock();
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld();
+
+ private:
+  // Initializes owner_thread_id_ and critical_section_ in static mutexes.
+  void ThreadSafeLazyInit();
+
+  // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503,
+  // we assume that 0 is an invalid value for thread IDs.
+  unsigned int owner_thread_id_;
+
+  // For static mutexes, we rely on these members being initialized to zeros
+  // by the linker.
+  MutexType type_;
+  long critical_section_init_phase_;  // NOLINT
+  GTEST_CRITICAL_SECTION* critical_section_;
+
+  Mutex(const Mutex&) = delete;
+  Mutex& operator=(const Mutex&) = delete;
+};
+
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+  ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); }
+
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  Mutex* const mutex_;
+
+  GTestMutexLock(const GTestMutexLock&) = delete;
+  GTestMutexLock& operator=(const GTestMutexLock&) = delete;
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Base class for ValueHolder<T>.  Allows a caller to hold and delete a value
+// without knowing its type.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Provides a way for a thread to send notifications to a ThreadLocal
+// regardless of its parameter type.
+class ThreadLocalBase {
+ public:
+  // Creates a new ValueHolder<T> object holding a default value passed to
+  // this ThreadLocal<T>'s constructor and returns it.  It is the caller's
+  // responsibility not to call this when the ThreadLocal<T> instance already
+  // has a value on the current thread.
+  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
+
+ protected:
+  ThreadLocalBase() {}
+  virtual ~ThreadLocalBase() {}
+
+ private:
+  ThreadLocalBase(const ThreadLocalBase&) = delete;
+  ThreadLocalBase& operator=(const ThreadLocalBase&) = delete;
+};
+
+// Maps a thread to a set of ThreadLocals that have values instantiated on that
+// thread and notifies them when the thread exits.  A ThreadLocal instance is
+// expected to persist until all threads it has values on have terminated.
+class GTEST_API_ ThreadLocalRegistry {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance);
+
+  // Invoked when a ThreadLocal instance is destroyed.
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance);
+};
+
+class GTEST_API_ ThreadWithParamBase {
+ public:
+  void Join();
+
+ protected:
+  class Runnable {
+   public:
+    virtual ~Runnable() {}
+    virtual void Run() = 0;
+  };
+
+  ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start);
+  virtual ~ThreadWithParamBase();
+
+ private:
+  AutoHandle thread_;
+};
+
+// Helper class for testing Google Test's multi-threading constructs.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+  typedef void UserThreadFunc(T);
+
+  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
+  virtual ~ThreadWithParam() {}
+
+ private:
+  class RunnableImpl : public Runnable {
+   public:
+    RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {}
+    virtual ~RunnableImpl() {}
+    virtual void Run() { func_(param_); }
+
+   private:
+    UserThreadFunc* const func_;
+    const T param_;
+
+    RunnableImpl(const RunnableImpl&) = delete;
+    RunnableImpl& operator=(const RunnableImpl&) = delete;
+  };
+
+  ThreadWithParam(const ThreadWithParam&) = delete;
+  ThreadWithParam& operator=(const ThreadWithParam&) = delete;
+};
+
+// Implements thread-local storage on Windows systems.
+//
+//   // Thread 1
+//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
+//
+//   // Thread 2
+//   tl.set(150);  // Changes the value for thread 2 only.
+//   EXPECT_EQ(150, tl.get());
+//
+//   // Thread 1
+//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
+//   tl.set(200);
+//   EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// The users of a TheadLocal instance have to make sure that all but one
+// threads (including the main one) using that instance have exited before
+// destroying it. Otherwise, the per-thread objects managed for them by the
+// ThreadLocal instance are not guaranteed to be destroyed on all platforms.
+//
+// Google Test only uses global ThreadLocal objects.  That means they
+// will die after main() has returned.  Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal : public ThreadLocalBase {
+ public:
+  ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
+  explicit ThreadLocal(const T& value)
+      : default_factory_(new InstanceValueHolderFactory(value)) {}
+
+  ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of T.  Can be deleted via its base class without the caller
+  // knowing the type of T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    ValueHolder() : value_() {}
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    ValueHolder(const ValueHolder&) = delete;
+    ValueHolder& operator=(const ValueHolder&) = delete;
+  };
+
+  T* GetOrCreateValue() const {
+    return static_cast<ValueHolder*>(
+               ThreadLocalRegistry::GetValueOnCurrentThread(this))
+        ->pointer();
+  }
+
+  ThreadLocalValueHolderBase* NewValueForCurrentThread() const override {
+    return default_factory_->MakeNewHolder();
+  }
+
+  class ValueHolderFactory {
+   public:
+    ValueHolderFactory() {}
+    virtual ~ValueHolderFactory() {}
+    virtual ValueHolder* MakeNewHolder() const = 0;
+
+   private:
+    ValueHolderFactory(const ValueHolderFactory&) = delete;
+    ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
+  };
+
+  class DefaultValueHolderFactory : public ValueHolderFactory {
+   public:
+    DefaultValueHolderFactory() {}
+    ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
+
+   private:
+    DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+    DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+        delete;
+  };
+
+  class InstanceValueHolderFactory : public ValueHolderFactory {
+   public:
+    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+    ValueHolder* MakeNewHolder() const override {
+      return new ValueHolder(value_);
+    }
+
+   private:
+    const T value_;  // The value for each thread.
+
+    InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+    InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+        delete;
+  };
+
+  std::unique_ptr<ValueHolderFactory> default_factory_;
+
+  ThreadLocal(const ThreadLocal&) = delete;
+  ThreadLocal& operator=(const ThreadLocal&) = delete;
+};
+
+#elif GTEST_HAS_PTHREAD
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms.
+class MutexBase {
+ public:
+  // Acquires this mutex.
+  void Lock() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+    owner_ = pthread_self();
+    has_owner_ = true;
+  }
+
+  // Releases this mutex.
+  void Unlock() {
+    // Since the lock is being released the owner_ field should no longer be
+    // considered valid. We don't protect writing to has_owner_ here, as it's
+    // the caller's responsibility to ensure that the current thread holds the
+    // mutex when this is called.
+    has_owner_ = false;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+  }
+
+  // Does nothing if the current thread holds the mutex. Otherwise, crashes
+  // with high probability.
+  void AssertHeld() const {
+    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+        << "The current thread is not holding the mutex @" << this;
+  }
+
+  // A static mutex may be used before main() is entered.  It may even
+  // be used before the dynamic initialization stage.  Therefore we
+  // must be able to initialize a static mutex object at link time.
+  // This means MutexBase has to be a POD and its member variables
+  // have to be public.
+ public:
+  pthread_mutex_t mutex_;  // The underlying pthread mutex.
+  // has_owner_ indicates whether the owner_ field below contains a valid thread
+  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+  // accesses to the owner_ field should be protected by a check of this field.
+  // An alternative might be to memset() owner_ to all zeros, but there's no
+  // guarantee that a zero'd pthread_t is necessarily invalid or even different
+  // from pthread_self().
+  bool has_owner_;
+  pthread_t owner_;  // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+// The initialization list here does not explicitly initialize each field,
+// instead relying on default initialization for the unspecified fields. In
+// particular, the owner_ field (a pthread_t) is not explicitly initialized.
+// This allows initialization to work whether pthread_t is a scalar or struct.
+// The flag -Wmissing-field-initializers must not be specified for this to work.
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+  ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0}
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+  Mutex() {
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
+    has_owner_ = false;
+  }
+  ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
+
+ private:
+  Mutex(const Mutex&) = delete;
+  Mutex& operator=(const Mutex&) = delete;
+};
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
+
+  ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+  MutexBase* const mutex_;
+
+  GTestMutexLock(const GTestMutexLock&) = delete;
+  GTestMutexLock& operator=(const GTestMutexLock&) = delete;
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage.  Therefore it cannot be templatized to access
+// ThreadLocal<T>.  Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+  virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+template <typename T>
+class GTEST_API_ ThreadLocal {
+ public:
+  ThreadLocal()
+      : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
+  explicit ThreadLocal(const T& value)
+      : key_(CreateKey()),
+        default_factory_(new InstanceValueHolderFactory(value)) {}
+
+  ~ThreadLocal() {
+    // Destroys the managed object for the current thread, if any.
+    DeleteThreadLocalValue(pthread_getspecific(key_));
+
+    // Releases resources associated with the key.  This will *not*
+    // delete managed objects for other threads.
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+  }
+
+  T* pointer() { return GetOrCreateValue(); }
+  const T* pointer() const { return GetOrCreateValue(); }
+  const T& get() const { return *pointer(); }
+  void set(const T& value) { *pointer() = value; }
+
+ private:
+  // Holds a value of type T.
+  class ValueHolder : public ThreadLocalValueHolderBase {
+   public:
+    ValueHolder() : value_() {}
+    explicit ValueHolder(const T& value) : value_(value) {}
+
+    T* pointer() { return &value_; }
+
+   private:
+    T value_;
+    ValueHolder(const ValueHolder&) = delete;
+    ValueHolder& operator=(const ValueHolder&) = delete;
+  };
+
+  static pthread_key_t CreateKey() {
+    pthread_key_t key;
+    // When a thread exits, DeleteThreadLocalValue() will be called on
+    // the object managed for that thread.
+    GTEST_CHECK_POSIX_SUCCESS_(
+        pthread_key_create(&key, &DeleteThreadLocalValue));
+    return key;
+  }
+
+  T* GetOrCreateValue() const {
+    ThreadLocalValueHolderBase* const holder =
+        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+    if (holder != nullptr) {
+      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+    }
+
+    ValueHolder* const new_holder = default_factory_->MakeNewHolder();
+    ThreadLocalValueHolderBase* const holder_base = new_holder;
+    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+    return new_holder->pointer();
+  }
+
+  class ValueHolderFactory {
+   public:
+    ValueHolderFactory() {}
+    virtual ~ValueHolderFactory() {}
+    virtual ValueHolder* MakeNewHolder() const = 0;
+
+   private:
+    ValueHolderFactory(const ValueHolderFactory&) = delete;
+    ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
+  };
+
+  class DefaultValueHolderFactory : public ValueHolderFactory {
+   public:
+    DefaultValueHolderFactory() {}
+    ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
+
+   private:
+    DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+    DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+        delete;
+  };
+
+  class InstanceValueHolderFactory : public ValueHolderFactory {
+   public:
+    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+    ValueHolder* MakeNewHolder() const override {
+      return new ValueHolder(value_);
+    }
+
+   private:
+    const T value_;  // The value for each thread.
+
+    InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+    InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+        delete;
+  };
+
+  // A key pthreads uses for looking up per-thread values.
+  const pthread_key_t key_;
+  std::unique_ptr<ValueHolderFactory> default_factory_;
+
+  ThreadLocal(const ThreadLocal&) = delete;
+  ThreadLocal& operator=(const ThreadLocal&) = delete;
+};
+
+#endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+#else  // GTEST_IS_THREADSAFE
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable).  Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+  Mutex() {}
+  void Lock() {}
+  void Unlock() {}
+  void AssertHeld() const {}
+};
+
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+  extern ::testing::internal::Mutex mutex
+
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)".  Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+  explicit GTestMutexLock(Mutex*) {}  // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class GTEST_API_ ThreadLocal {
+ public:
+  ThreadLocal() : value_() {}
+  explicit ThreadLocal(const T& value) : value_(value) {}
+  T* pointer() { return &value_; }
+  const T* pointer() const { return &value_; }
+  const T& get() const { return value_; }
+  void set(const T& value) { value_ = value; }
+
+ private:
+  T value_;
+};
+
+#endif  // GTEST_IS_THREADSAFE
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
+#else
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
+#endif  // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF.  char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#ifdef __cpp_char8_t
+inline bool IsXDigit(char8_t ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#endif
+inline bool IsXDigit(char16_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(char32_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+  std::string::iterator it = str.end();
+  while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
+  return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions.  These wrappers hide the differences between
+// Windows/MSVC and POSIX systems.  Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+#ifdef __BORLANDC__
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+#else  // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
+inline int DoIsATTY(int /* fd */) { return 0; }
+#else
+inline int DoIsATTY(int fd) { return _isatty(fd); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+#endif  // __BORLANDC__
+
+#if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+#else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#elif GTEST_OS_ESP8266
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) {
+  // stat function not implemented on ESP8266
+  return 0;
+}
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#else
+
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+  return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif  // GTEST_OS_WINDOWS
+
+inline int IsATTY(int fd) {
+  // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout
+  // to a file on Linux), which is unexpected, so save the previous value, and
+  // restore it after the call.
+  int savedErrno = errno;
+  int isAttyValue = DoIsATTY(fd);
+  errno = savedErrno;
+
+  return isAttyValue;
+}
+
+// Functions deprecated by MSVC 8.0.
+
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+  struct wchar_codecvt : public std::codecvt<wchar_t, char, std::mbstate_t> {};
+  std::wstring_convert<wchar_codecvt> converter;
+  std::wstring wide_path = converter.from_bytes(path);
+  std::wstring wide_mode = converter.from_bytes(mode);
+  return _wfopen(wide_path.c_str(), wide_mode.c_str());
+#else   // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+  return fopen(path, mode);
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE* FReopen(const char* path, const char* mode, FILE* stream) {
+  return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+  return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+  return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+  // We are on an embedded platform, which has no environment variables.
+  static_cast<void>(name);  // To prevent 'unused argument' warning.
+  return nullptr;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+  // Environment variables which we programmatically clear will be set to the
+  // empty string rather than unset (NULL).  Handle that case.
+  const char* const env = getenv(name);
+  return (env != nullptr && env[0] != '\0') ? env : nullptr;
+#else
+  return getenv(name);
+#endif
+}
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+[[noreturn]] void Abort();
+#else
+[[noreturn]] inline void Abort() { abort(); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+}  // namespace posix
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that.  We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+  _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s
+#define GTEST_SNPRINTF_ _snprintf
+#else
+#define GTEST_SNPRINTF_ snprintf
+#endif
+
+// The biggest signed integer type the compiler supports.
+//
+// long long is guaranteed to be at least 64-bits in C++11.
+using BiggestInt = long long;  // NOLINT
+
+// The maximum number a BiggestInt can represent.
+constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits<BiggestInt>::max)();
+
+// This template class serves as a compile-time function from size to
+// type.  It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+//   TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs.  Other types can be easily added in the future if need
+// arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+  // This prevents the user from using TypeWithSize<N> with incorrect
+  // values of N.
+  using UInt = void;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+  using Int = std::int32_t;
+  using UInt = std::uint32_t;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+  using Int = std::int64_t;
+  using UInt = std::uint64_t;
+};
+
+// Integer types of known sizes.
+using TimeInMillis = int64_t;  // Represents time in milliseconds.
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#if !defined(GTEST_FLAG)
+#define GTEST_FLAG_NAME_(name) gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
+#endif  // !defined(GTEST_FLAG)
+
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+  ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+  ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+  ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc)
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \
+  ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_int32_(name) \
+  ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_string_(name) \
+  ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name))
+
+#define GTEST_FLAG_SAVER_ ::absl::FlagSaver
+
+#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name))
+#define GTEST_FLAG_SET(name, value) \
+  (void)(::absl::SetFlag(&GTEST_FLAG(name), value))
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0
+
+#else  // GTEST_HAS_ABSL
+
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc)  \
+  namespace testing {                               \
+  GTEST_API_ bool GTEST_FLAG(name) = (default_val); \
+  }                                                 \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_int32_(name, default_val, doc)         \
+  namespace testing {                                       \
+  GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \
+  }                                                         \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_string_(name, default_val, doc)         \
+  namespace testing {                                        \
+  GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \
+  }                                                          \
+  static_assert(true, "no-op to require trailing semicolon")
+
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name)          \
+  namespace testing {                      \
+  GTEST_API_ extern bool GTEST_FLAG(name); \
+  }                                        \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_int32_(name)                 \
+  namespace testing {                              \
+  GTEST_API_ extern std::int32_t GTEST_FLAG(name); \
+  }                                                \
+  static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_string_(name)                 \
+  namespace testing {                               \
+  GTEST_API_ extern ::std::string GTEST_FLAG(name); \
+  }                                                 \
+  static_assert(true, "no-op to require trailing semicolon")
+
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
+#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+
+#endif  // GTEST_HAS_ABSL
+
+// Thread annotations
+#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
+#endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+GTEST_API_ bool ParseInt32(const Message& src_text, const char* str,
+                           int32_t* value);
+
+// Parses a bool/int32_t/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val);
+std::string OutputFlagAlsoCheckEnvVar();
+const char* StringFromGTestEnv(const char* flag, const char* default_val);
+
+}  // namespace internal
+}  // namespace testing
+
+#if !defined(GTEST_INTERNAL_DEPRECATED)
+
+// Internal Macro to mark an API deprecated, for googletest usage only
+// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or
+// GTEST_INTERNAL_DEPRECATED(message) <return_type> myFunction(); Every usage of
+// a deprecated entity will trigger a warning when compiled with
+// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler).
+// For msvc /W3 option will need to be used
+// Note that for 'other' compilers this macro evaluates to nothing to prevent
+// compilations errors.
+#if defined(_MSC_VER)
+#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message))
+#elif defined(__GNUC__)
+#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message)))
+#else
+#define GTEST_INTERNAL_DEPRECATED(message)
+#endif
+
+#endif  // !defined(GTEST_INTERNAL_DEPRECATED)
+
+#if GTEST_HAS_ABSL
+// Always use absl::any for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include "absl/types/any.h"
+namespace testing {
+namespace internal {
+using Any = ::absl::any;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<any>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::any for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include <any>
+namespace testing {
+namespace internal {
+using Any = ::std::any;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::any is not
+// supported.
+#endif  // __has_include(<any>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::optional for UniversalPrinter<> specializations if
+// googletest is built with absl support.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include "absl/types/optional.h"
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::absl::optional<T>;
+inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; }
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<optional>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::optional for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include <optional>
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::std::optional<T>;
+inline ::std::nullopt_t Nullopt() { return ::std::nullopt; }
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::optional is not
+// supported.
+#endif  // __has_include(<optional>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::string_view for Matcher<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include "absl/strings/string_view.h"
+namespace testing {
+namespace internal {
+using StringView = ::absl::string_view;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::string_view for Matcher<>
+// specializations.
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#include <string_view>
+namespace testing {
+namespace internal {
+using StringView = ::std::string_view;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::string_view is not
+// supported.
+#endif  // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::variant for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include "absl/types/variant.h"
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::absl::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<variant>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::variant for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include <variant>
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::std::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::variant is not supported.
+#endif  // __has_include(<variant>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
new file mode 100644
index 0000000000..cca2e1f2ad
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
@@ -0,0 +1,177 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test.  They are subject to change without notice. They should not used
+// by code external to Google Test.
+//
+// This header file is #included by gtest-internal.h.
+// It should not be #included by other files.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+#include <mem.h>
+#endif
+
+#include <string.h>
+
+#include <cstdint>
+#include <string>
+
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+namespace internal {
+
+// String - an abstract class holding static string utilities.
+class GTEST_API_ String {
+ public:
+  // Static utility methods
+
+  // Clones a 0-terminated C string, allocating memory using new.  The
+  // caller is responsible for deleting the return value using
+  // delete[].  Returns the cloned string, or NULL if the input is
+  // NULL.
+  //
+  // This is different from strdup() in string.h, which allocates
+  // memory using malloc().
+  static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+  // able to pass strings to Win32 APIs on CE we need to convert them
+  // to 'Unicode', UTF-16.
+
+  // Creates a UTF-16 wide string from the given ANSI string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the wide string, or NULL if the
+  // input is NULL.
+  //
+  // The wide string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static LPCWSTR AnsiToUtf16(const char* c_str);
+
+  // Creates an ANSI string from the given wide string, allocating
+  // memory using new. The caller is responsible for deleting the return
+  // value using delete[]. Returns the ANSI string, or NULL if the
+  // input is NULL.
+  //
+  // The returned string is created using the ANSI codepage (CP_ACP) to
+  // match the behaviour of the ANSI versions of Win32 calls and the
+  // C runtime.
+  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+  // Compares two C strings.  Returns true if and only if they have the same
+  // content.
+  //
+  // Unlike strcmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CStringEquals(const char* lhs, const char* rhs);
+
+  // Converts a wide C string to a String using the UTF-8 encoding.
+  // NULL will be converted to "(null)".  If an error occurred during
+  // the conversion, "(failed to convert from wide string)" is
+  // returned.
+  static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+  // Compares two wide C strings.  Returns true if and only if they have the
+  // same content.
+  //
+  // Unlike wcscmp(), this function can handle NULL argument(s).  A
+  // NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+  // Compares two C strings, ignoring case.  Returns true if and only if
+  // they have the same content.
+  //
+  // Unlike strcasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL C string,
+  // including the empty string.
+  static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs);
+
+  // Compares two wide C strings, ignoring case.  Returns true if and only if
+  // they have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                               const wchar_t* rhs);
+
+  // Returns true if and only if the given string ends with the given suffix,
+  // ignoring case. Any string is considered to end with an empty suffix.
+  static bool EndsWithCaseInsensitive(const std::string& str,
+                                      const std::string& suffix);
+
+  // Formats an int value as "%02d".
+  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
+
+  // Formats an int value to given width with leading zeros.
+  static std::string FormatIntWidthN(int value, int width);
+
+  // Formats an int value as "%X".
+  static std::string FormatHexInt(int value);
+
+  // Formats an int value as "%X".
+  static std::string FormatHexUInt32(uint32_t value);
+
+  // Formats a byte as "%02X".
+  static std::string FormatByte(unsigned char value);
+
+ private:
+  String();  // Not meant to be instantiated.
+};           // class String
+
+// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
new file mode 100644
index 0000000000..6bc02a7de3
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
@@ -0,0 +1,186 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif  // GTEST_HASH_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// Canonicalizes a given name with respect to the Standard C++ Library.
+// This handles removing the inline namespace within `std` that is
+// used by various standard libraries (e.g., `std::__1`).  Names outside
+// of namespace std are returned unmodified.
+inline std::string CanonicalizeForStdLibVersioning(std::string s) {
+  static const char prefix[] = "std::__";
+  if (s.compare(0, strlen(prefix), prefix) == 0) {
+    std::string::size_type end = s.find("::", strlen(prefix));
+    if (end != s.npos) {
+      // Erase everything between the initial `std` and the second `::`.
+      s.erase(strlen("std"), end - strlen("std"));
+    }
+  }
+  return s;
+}
+
+#if GTEST_HAS_RTTI
+// GetTypeName(const std::type_info&) returns a human-readable name of type T.
+inline std::string GetTypeName(const std::type_info& type) {
+  const char* const name = type.name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so we have to demangle it.
+#if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#endif  // GTEST_HAS_CXXABI_H_
+  char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);
+  return CanonicalizeForStdLibVersioning(name_str);
+#else
+  return name;
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+}
+#endif  // GTEST_HAS_RTTI
+
+// GetTypeName<T>() returns a human-readable name of type T if and only if
+// RTTI is enabled, otherwise it returns a dummy type name.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+  return GetTypeName(typeid(T));
+#else
+  return "<type>";
+#endif  // GTEST_HAS_RTTI
+}
+
+// A unique type indicating an empty node
+struct None {};
+
+#define GTEST_TEMPLATE_ \
+  template <typename T> \
+  class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>.  This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;
+  };
+};
+
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
+
+template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
+struct Templates {
+  using Head = TemplateSel<Head_>;
+  using Tail = Templates<Tail_...>;
+};
+
+template <GTEST_TEMPLATE_ Head_>
+struct Templates<Head_> {
+  using Head = TemplateSel<Head_>;
+  using Tail = None;
+};
+
+// Tuple-like type lists
+template <typename Head_, typename... Tail_>
+struct Types {
+  using Head = Head_;
+  using Tail = Types<Tail_...>;
+};
+
+template <typename Head_>
+struct Types<Head_> {
+  using Head = Head_;
+  using Tail = None;
+};
+
+// Helper metafunctions to tell apart a single type from types
+// generated by ::testing::Types
+template <typename... Ts>
+struct ProxyTypeList {
+  using type = Types<Ts...>;
+};
+
+template <typename>
+struct is_proxy_type_list : std::false_type {};
+
+template <typename... Ts>
+struct is_proxy_type_list<ProxyTypeList<Ts...>> : std::true_type {};
+
+// Generator which conditionally creates type lists.
+// It recognizes if a requested type list should be created
+// and prevents creating a new type list nested within another one.
+template <typename T>
+struct GenerateTypeList {
+ private:
+  using proxy = typename std::conditional<is_proxy_type_list<T>::value, T,
+                                          ProxyTypeList<T>>::type;
+
+ public:
+  using type = typename proxy::type;
+};
+
+}  // namespace internal
+
+template <typename... Ts>
+using Types = internal::ProxyTypeList<Ts...>;
+
+}  // namespace testing
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc
index 912868148e..2a70ed88c7 100644
--- a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc
@@ -26,10 +26,9 @@
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 //
-// Author: mheule@google.com (Markus Heule)
-//
-// Google C++ Testing Framework (Google Test)
+// Google C++ Testing and Mocking Framework (Google Test)
 //
 // Sometimes it's desirable to build Google Test by compiling a single file.
 // This file serves this purpose.
@@ -39,9554 +38,12 @@
 #include "gtest/gtest.h"
 
 // The following lines pull in the real gtest *.cc files.
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
-
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// Utilities for testing Google Test itself and code that uses Google Test
-// (e.g. frameworks built on top of Google Test).
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-
-
-namespace testing {
-
-// This helper class can be used to mock out Google Test failure reporting
-// so that we can test Google Test or code that builds on Google Test.
-//
-// An object of this class appends a TestPartResult object to the
-// TestPartResultArray object given in the constructor whenever a Google Test
-// failure is reported. It can either intercept only failures that are
-// generated in the same thread that created this object or it can intercept
-// all generated failures. The scope of this mock object can be controlled with
-// the second argument to the two arguments constructor.
-class GTEST_API_ ScopedFakeTestPartResultReporter
-    : public TestPartResultReporterInterface {
- public:
-  // The two possible mocking modes of this object.
-  enum InterceptMode {
-    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
-    INTERCEPT_ALL_THREADS           // Intercepts all failures.
-  };
-
-  // The c'tor sets this object as the test part result reporter used
-  // by Google Test.  The 'result' parameter specifies where to report the
-  // results. This reporter will only catch failures generated in the current
-  // thread. DEPRECATED
-  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
-
-  // Same as above, but you can choose the interception scope of this object.
-  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
-                                   TestPartResultArray* result);
-
-  // The d'tor restores the previous test part result reporter.
-  virtual ~ScopedFakeTestPartResultReporter();
-
-  // Appends the TestPartResult object to the TestPartResultArray
-  // received in the constructor.
-  //
-  // This method is from the TestPartResultReporterInterface
-  // interface.
-  virtual void ReportTestPartResult(const TestPartResult& result);
- private:
-  void Init();
-
-  const InterceptMode intercept_mode_;
-  TestPartResultReporterInterface* old_reporter_;
-  TestPartResultArray* const result_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
-};
-
-namespace internal {
-
-// A helper class for implementing EXPECT_FATAL_FAILURE() and
-// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
-// TestPartResultArray contains exactly one failure that has the given
-// type and contains the given substring.  If that's not the case, a
-// non-fatal failure will be generated.
-class GTEST_API_ SingleFailureChecker {
- public:
-  // The constructor remembers the arguments.
-  SingleFailureChecker(const TestPartResultArray* results,
-                       TestPartResult::Type type,
-                       const string& substr);
-  ~SingleFailureChecker();
- private:
-  const TestPartResultArray* const results_;
-  const TestPartResult::Type type_;
-  const string substr_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
-};
-
-}  // namespace internal
-
-}  // namespace testing
-
-// A set of macros for testing Google Test assertions or code that's expected
-// to generate Google Test fatal failures.  It verifies that the given
-// statement will cause exactly one fatal Google Test failure with 'substr'
-// being part of the failure message.
-//
-// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
-// affects and considers failures generated in the current thread and
-// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
-//
-// The verification of the assertion is done correctly even when the statement
-// throws an exception or aborts the current function.
-//
-// Known restrictions:
-//   - 'statement' cannot reference local non-static variables or
-//     non-static members of the current object.
-//   - 'statement' cannot return a value.
-//   - You cannot stream a failure message to this macro.
-//
-// Note that even though the implementations of the following two
-// macros are much alike, we cannot refactor them to use a common
-// helper macro, due to some peculiarity in how the preprocessor
-// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
-// gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ALL_THREADS, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-// A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures.  It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
-//
-// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
-// affects and considers failures generated in the current thread and
-// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
-//
-// 'statement' is allowed to reference local variables and members of
-// the current object.
-//
-// The verification of the assertion is done correctly even when the statement
-// throws an exception or aborts the current function.
-//
-// Known restrictions:
-//   - You cannot stream a failure message to this macro.
-//
-// Note that even though the implementations of the following two
-// macros are much alike, we cannot refactor them to use a common
-// helper macro, due to some peculiarity in how the preprocessor
-// works.  If we do that, the code won't compile when the user gives
-// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
-// expands to code containing an unprotected comma.  The
-// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
-// catches that.
-//
-// For the same reason, we have to write
-//   if (::testing::internal::AlwaysTrue()) { statement; }
-// instead of
-//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-// to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-
-#include <ctype.h>
-#include <math.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <wchar.h>
-#include <wctype.h>
-
-#include <algorithm>
-#include <iomanip>
-#include <limits>
-#include <ostream>  // NOLINT
-#include <sstream>
-#include <vector>
-
-#if GTEST_OS_LINUX
-
-// TODO(kenton@google.com): Use autoconf to detect availability of
-// gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
-# include <fcntl.h>  // NOLINT
-# include <limits.h>  // NOLINT
-# include <sched.h>  // NOLINT
-// Declares vsnprintf().  This header is not available on Windows.
-# include <strings.h>  // NOLINT
-# include <sys/mman.h>  // NOLINT
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-# include <string>
-
-#elif GTEST_OS_SYMBIAN
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
-
-#elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
-
-// On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h>  // NOLINT
-
-#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
-
-# include <windows.h>  // NOLINT
-
-#elif GTEST_OS_WINDOWS  // We are on Windows proper.
-
-# include <io.h>  // NOLINT
-# include <sys/timeb.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-# include <sys/stat.h>  // NOLINT
-
-# if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-// TODO(kenton@google.com): Use autoconf to detect availability of
-//   gettimeofday().
-// TODO(kenton@google.com): There are other ways to get the time on
-//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
-//   supports these.  consider using them instead.
-#  define GTEST_HAS_GETTIMEOFDAY_ 1
-#  include <sys/time.h>  // NOLINT
-# endif  // GTEST_OS_WINDOWS_MINGW
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <windows.h>  // NOLINT
-
-#else
-
-// Assume other platforms have gettimeofday().
-// TODO(kenton@google.com): Use autoconf to detect availability of
-//   gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-
-#endif  // GTEST_OS_LINUX
-
-#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
-#endif
-
-#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-#endif
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Utility functions and classes used by the Google C++ testing framework.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// This file contains purely Google Test's internal implementation.  Please
-// DO NOT #INCLUDE IT IN A USER PROGRAM.
-
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
-
-// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
-// part of Google Test's implementation; otherwise it's undefined.
-#if !GTEST_IMPLEMENTATION_
-// A user is trying to include this from his code - just say no.
-# error "gtest-internal-inl.h is part of Google Test's internal implementation."
-# error "It must not be included except by Google Test itself."
-#endif  // GTEST_IMPLEMENTATION_
-
-#ifndef _WIN32_WCE
-# include <errno.h>
-#endif  // !_WIN32_WCE
-#include <stddef.h>
-#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
-#include <string.h>  // For memmove.
-
-#include <algorithm>
-#include <string>
-#include <vector>
-
-
-#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-#endif
-
-#if GTEST_OS_WINDOWS
-# include <windows.h>  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-
-
-namespace testing {
-
-// Declares the flags.
-//
-// We don't want the users to modify this flag in the code, but want
-// Google Test's own unit tests to be able to access it. Therefore we
-// declare it here as opposed to in gtest.h.
-GTEST_DECLARE_bool_(death_test_use_fork);
-
-namespace internal {
-
-// The value of GetTestTypeId() as seen from within the Google Test
-// library.  This is solely for testing GetTestTypeId().
-GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
-
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kPrintTimeFlag[] = "print_time";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-
-// A valid random seed must be in [1, kMaxRandomSeed].
-const int kMaxRandomSeed = 99999;
-
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
-GTEST_API_ extern bool g_help_flag;
-
-// Returns the current time in milliseconds.
-GTEST_API_ TimeInMillis GetTimeInMillis();
-
-// Returns true iff Google Test should use colors in the output.
-GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
-
-// Formats the given time in milliseconds as seconds.
-GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
-
-// Converts the given time in milliseconds to a date string in the ISO 8601
-// format, without the timezone information.  N.B.: due to the use the
-// non-reentrant localtime() function, this function is not thread safe.  Do
-// not use it in any code that can be called from multiple threads.
-GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
-
-// Parses a string for an Int32 flag, in the form of "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
-    const char* str, const char* flag, Int32* value);
-
-// Returns a random seed in range [1, kMaxRandomSeed] based on the
-// given --gtest_random_seed flag value.
-inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
-  const unsigned int raw_seed = (random_seed_flag == 0) ?
-      static_cast<unsigned int>(GetTimeInMillis()) :
-      static_cast<unsigned int>(random_seed_flag);
-
-  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
-  // it's easy to type.
-  const int normalized_seed =
-      static_cast<int>((raw_seed - 1U) %
-                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
-  return normalized_seed;
-}
-
-// Returns the first valid random seed after 'seed'.  The behavior is
-// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
-// considered to be 1.
-inline int GetNextRandomSeed(int seed) {
-  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
-      << "Invalid random seed " << seed << " - must be in [1, "
-      << kMaxRandomSeed << "].";
-  const int next_seed = seed + 1;
-  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
-}
-
-// This class saves the values of all Google Test flags in its c'tor, and
-// restores them in its d'tor.
-class GTestFlagSaver {
- public:
-  // The c'tor.
-  GTestFlagSaver() {
-    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
-    break_on_failure_ = GTEST_FLAG(break_on_failure);
-    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
-    color_ = GTEST_FLAG(color);
-    death_test_style_ = GTEST_FLAG(death_test_style);
-    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
-    filter_ = GTEST_FLAG(filter);
-    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
-    list_tests_ = GTEST_FLAG(list_tests);
-    output_ = GTEST_FLAG(output);
-    print_time_ = GTEST_FLAG(print_time);
-    random_seed_ = GTEST_FLAG(random_seed);
-    repeat_ = GTEST_FLAG(repeat);
-    shuffle_ = GTEST_FLAG(shuffle);
-    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
-    stream_result_to_ = GTEST_FLAG(stream_result_to);
-    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
-  }
-
-  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
-  ~GTestFlagSaver() {
-    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
-    GTEST_FLAG(break_on_failure) = break_on_failure_;
-    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
-    GTEST_FLAG(color) = color_;
-    GTEST_FLAG(death_test_style) = death_test_style_;
-    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
-    GTEST_FLAG(filter) = filter_;
-    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
-    GTEST_FLAG(list_tests) = list_tests_;
-    GTEST_FLAG(output) = output_;
-    GTEST_FLAG(print_time) = print_time_;
-    GTEST_FLAG(random_seed) = random_seed_;
-    GTEST_FLAG(repeat) = repeat_;
-    GTEST_FLAG(shuffle) = shuffle_;
-    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
-    GTEST_FLAG(stream_result_to) = stream_result_to_;
-    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
-  }
-
- private:
-  // Fields for saving the original values of flags.
-  bool also_run_disabled_tests_;
-  bool break_on_failure_;
-  bool catch_exceptions_;
-  std::string color_;
-  std::string death_test_style_;
-  bool death_test_use_fork_;
-  std::string filter_;
-  std::string internal_run_death_test_;
-  bool list_tests_;
-  std::string output_;
-  bool print_time_;
-  internal::Int32 random_seed_;
-  internal::Int32 repeat_;
-  bool shuffle_;
-  internal::Int32 stack_trace_depth_;
-  std::string stream_result_to_;
-  bool throw_on_failure_;
-} GTEST_ATTRIBUTE_UNUSED_;
-
-// Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
-// wide enough to contain a code point.
-// If the code_point is not a valid Unicode code point
-// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
-// to "(Invalid Unicode 0xXXXXXXXX)".
-GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
-
-// Converts a wide string to a narrow string in UTF-8 encoding.
-// The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
-//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
-// Parameter str points to a null-terminated wide string.
-// Parameter num_chars may additionally limit the number
-// of wchar_t characters processed. -1 is used when the entire string
-// should be processed.
-// If the string contains code points that are not valid Unicode code points
-// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
-// and contains invalid UTF-16 surrogate pairs, values in those pairs
-// will be encoded as individual Unicode characters from Basic Normal Plane.
-GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
-
-// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
-// if the variable is present. If a file already exists at this location, this
-// function will write over it. If the variable is present, but the file cannot
-// be created, prints an error and exits.
-void WriteToShardStatusFileIfNeeded();
-
-// Checks whether sharding is enabled by examining the relevant
-// environment variable values. If the variables are present,
-// but inconsistent (e.g., shard_index >= total_shards), prints
-// an error and exits. If in_subprocess_for_death_test, sharding is
-// disabled because it must only be applied to the original test
-// process. Otherwise, we could filter out death tests we intended to execute.
-GTEST_API_ bool ShouldShard(const char* total_shards_str,
-                            const char* shard_index_str,
-                            bool in_subprocess_for_death_test);
-
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error and
-// and aborts.
-GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
-
-// Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
-// method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
-    int total_shards, int shard_index, int test_id);
-
-// STL container utilities.
-
-// Returns the number of elements in the given container that satisfy
-// the given predicate.
-template <class Container, typename Predicate>
-inline int CountIf(const Container& c, Predicate predicate) {
-  // Implemented as an explicit loop since std::count_if() in libCstd on
-  // Solaris has a non-standard signature.
-  int count = 0;
-  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
-    if (predicate(*it))
-      ++count;
-  }
-  return count;
-}
-
-// Applies a function/functor to each element in the container.
-template <class Container, typename Functor>
-void ForEach(const Container& c, Functor functor) {
-  std::for_each(c.begin(), c.end(), functor);
-}
-
-// Returns the i-th element of the vector, or default_value if i is not
-// in range [0, v.size()).
-template <typename E>
-inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
-  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
-}
-
-// Performs an in-place shuffle of a range of the vector's elements.
-// 'begin' and 'end' are element indices as an STL-style range;
-// i.e. [begin, end) are shuffled, where 'end' == size() means to
-// shuffle to the end of the vector.
-template <typename E>
-void ShuffleRange(internal::Random* random, int begin, int end,
-                  std::vector<E>* v) {
-  const int size = static_cast<int>(v->size());
-  GTEST_CHECK_(0 <= begin && begin <= size)
-      << "Invalid shuffle range start " << begin << ": must be in range [0, "
-      << size << "].";
-  GTEST_CHECK_(begin <= end && end <= size)
-      << "Invalid shuffle range finish " << end << ": must be in range ["
-      << begin << ", " << size << "].";
-
-  // Fisher-Yates shuffle, from
-  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
-  for (int range_width = end - begin; range_width >= 2; range_width--) {
-    const int last_in_range = begin + range_width - 1;
-    const int selected = begin + random->Generate(range_width);
-    std::swap((*v)[selected], (*v)[last_in_range]);
-  }
-}
-
-// Performs an in-place shuffle of the vector's elements.
-template <typename E>
-inline void Shuffle(internal::Random* random, std::vector<E>* v) {
-  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
-}
-
-// A function for deleting an object.  Handy for being used as a
-// functor.
-template <typename T>
-static void Delete(T* x) {
-  delete x;
-}
-
-// A predicate that checks the key of a TestProperty against a known key.
-//
-// TestPropertyKeyIs is copyable.
-class TestPropertyKeyIs {
- public:
-  // Constructor.
-  //
-  // TestPropertyKeyIs has NO default constructor.
-  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
-
-  // Returns true iff the test name of test property matches on key_.
-  bool operator()(const TestProperty& test_property) const {
-    return test_property.key() == key_;
-  }
-
- private:
-  std::string key_;
-};
-
-// Class UnitTestOptions.
-//
-// This class contains functions for processing options the user
-// specifies when running the tests.  It has only static members.
-//
-// In most cases, the user can specify an option using either an
-// environment variable or a command line flag.  E.g. you can set the
-// test filter using either GTEST_FILTER or --gtest_filter.  If both
-// the variable and the flag are present, the latter overrides the
-// former.
-class GTEST_API_ UnitTestOptions {
- public:
-  // Functions for processing the gtest_output flag.
-
-  // Returns the output format, or "" for normal printed output.
-  static std::string GetOutputFormat();
-
-  // Returns the absolute path of the requested output file, or the
-  // default (test_detail.xml in the original working directory) if
-  // none was explicitly specified.
-  static std::string GetAbsolutePathToOutputFile();
-
-  // Functions for processing the gtest_filter flag.
-
-  // Returns true iff the wildcard pattern matches the string.  The
-  // first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
-  // Returns true iff the user-specified filter matches the test case
-  // name and the test name.
-  static bool FilterMatchesTest(const std::string &test_case_name,
-                                const std::string &test_name);
-
-#if GTEST_OS_WINDOWS
-  // Function for supporting the gtest_catch_exception flag.
-
-  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
-  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
-  // This function is useful as an __except condition.
-  static int GTestShouldProcessSEH(DWORD exception_code);
-#endif  // GTEST_OS_WINDOWS
-
-  // Returns true if "name" matches the ':' separated list of glob-style
-  // filters in "filter".
-  static bool MatchesFilter(const std::string& name, const char* filter);
-};
-
-// Returns the current application's name, removing directory path if that
-// is present.  Used by UnitTestOptions::GetOutputFile.
-GTEST_API_ FilePath GetCurrentExecutableName();
-
-// The role interface for getting the OS stack trace as a string.
-class OsStackTraceGetterInterface {
- public:
-  OsStackTraceGetterInterface() {}
-  virtual ~OsStackTraceGetterInterface() {}
-
-  // Returns the current OS stack trace as an std::string.  Parameters:
-  //
-  //   max_depth  - the maximum number of stack frames to be included
-  //                in the trace.
-  //   skip_count - the number of top frames to be skipped; doesn't count
-  //                against max_depth.
-  virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
-
-  // UponLeavingGTest() should be called immediately before Google Test calls
-  // user code. It saves some information about the current stack that
-  // CurrentStackTrace() will use to find and hide Google Test stack frames.
-  virtual void UponLeavingGTest() = 0;
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
-};
-
-// A working implementation of the OsStackTraceGetterInterface interface.
-class OsStackTraceGetter : public OsStackTraceGetterInterface {
- public:
-  OsStackTraceGetter() : caller_frame_(NULL) {}
-
-  virtual string CurrentStackTrace(int max_depth, int skip_count)
-      GTEST_LOCK_EXCLUDED_(mutex_);
-
-  virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_);
-
-  // This string is inserted in place of stack frames that are part of
-  // Google Test's implementation.
-  static const char* const kElidedFramesMarker;
-
- private:
-  Mutex mutex_;  // protects all internal state
-
-  // We save the stack frame below the frame that calls user code.
-  // We do this because the address of the frame immediately below
-  // the user code changes between the call to UponLeavingGTest()
-  // and any calls to CurrentStackTrace() from within the user code.
-  void* caller_frame_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
-};
-
-// Information about a Google Test trace point.
-struct TraceInfo {
-  const char* file;
-  int line;
-  std::string message;
-};
-
-// This is the default global test part result reporter used in UnitTestImpl.
-// This class should only be used by UnitTestImpl.
-class DefaultGlobalTestPartResultReporter
-  : public TestPartResultReporterInterface {
- public:
-  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
-  // Implements the TestPartResultReporterInterface. Reports the test part
-  // result in the current test.
-  virtual void ReportTestPartResult(const TestPartResult& result);
-
- private:
-  UnitTestImpl* const unit_test_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
-};
-
-// This is the default per thread test part result reporter used in
-// UnitTestImpl. This class should only be used by UnitTestImpl.
-class DefaultPerThreadTestPartResultReporter
-    : public TestPartResultReporterInterface {
- public:
-  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
-  // Implements the TestPartResultReporterInterface. The implementation just
-  // delegates to the current global test part result reporter of *unit_test_.
-  virtual void ReportTestPartResult(const TestPartResult& result);
-
- private:
-  UnitTestImpl* const unit_test_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
-};
-
-// The private implementation of the UnitTest class.  We don't protect
-// the methods under a mutex, as this class is not accessible by a
-// user and the UnitTest class that delegates work to this class does
-// proper locking.
-class GTEST_API_ UnitTestImpl {
- public:
-  explicit UnitTestImpl(UnitTest* parent);
-  virtual ~UnitTestImpl();
-
-  // There are two different ways to register your own TestPartResultReporter.
-  // You can register your own repoter to listen either only for test results
-  // from the current thread or for results from all threads.
-  // By default, each per-thread test result repoter just passes a new
-  // TestPartResult to the global test result reporter, which registers the
-  // test part result for the currently running test.
-
-  // Returns the global test part result reporter.
-  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
-
-  // Sets the global test part result reporter.
-  void SetGlobalTestPartResultReporter(
-      TestPartResultReporterInterface* reporter);
-
-  // Returns the test part result reporter for the current thread.
-  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
-
-  // Sets the test part result reporter for the current thread.
-  void SetTestPartResultReporterForCurrentThread(
-      TestPartResultReporterInterface* reporter);
-
-  // Gets the number of successful test cases.
-  int successful_test_case_count() const;
-
-  // Gets the number of failed test cases.
-  int failed_test_case_count() const;
-
-  // Gets the number of all test cases.
-  int total_test_case_count() const;
-
-  // Gets the number of all test cases that contain at least one test
-  // that should run.
-  int test_case_to_run_count() const;
-
-  // Gets the number of successful tests.
-  int successful_test_count() const;
-
-  // Gets the number of failed tests.
-  int failed_test_count() const;
-
-  // Gets the number of disabled tests that will be reported in the XML report.
-  int reportable_disabled_test_count() const;
-
-  // Gets the number of disabled tests.
-  int disabled_test_count() const;
-
-  // Gets the number of tests to be printed in the XML report.
-  int reportable_test_count() const;
-
-  // Gets the number of all tests.
-  int total_test_count() const;
-
-  // Gets the number of tests that should run.
-  int test_to_run_count() const;
-
-  // Gets the time of the test program start, in ms from the start of the
-  // UNIX epoch.
-  TimeInMillis start_timestamp() const { return start_timestamp_; }
-
-  // Gets the elapsed time, in milliseconds.
-  TimeInMillis elapsed_time() const { return elapsed_time_; }
-
-  // Returns true iff the unit test passed (i.e. all test cases passed).
-  bool Passed() const { return !Failed(); }
-
-  // Returns true iff the unit test failed (i.e. some test case failed
-  // or something outside of all tests failed).
-  bool Failed() const {
-    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
-  }
-
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  const TestCase* GetTestCase(int i) const {
-    const int index = GetElementOr(test_case_indices_, i, -1);
-    return index < 0 ? NULL : test_cases_[i];
-  }
-
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  TestCase* GetMutableTestCase(int i) {
-    const int index = GetElementOr(test_case_indices_, i, -1);
-    return index < 0 ? NULL : test_cases_[index];
-  }
-
-  // Provides access to the event listener list.
-  TestEventListeners* listeners() { return &listeners_; }
-
-  // Returns the TestResult for the test that's currently running, or
-  // the TestResult for the ad hoc test if no test is running.
-  TestResult* current_test_result();
-
-  // Returns the TestResult for the ad hoc test.
-  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
-
-  // Sets the OS stack trace getter.
-  //
-  // Does nothing if the input and the current OS stack trace getter
-  // are the same; otherwise, deletes the old getter and makes the
-  // input the current getter.
-  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
-
-  // Returns the current OS stack trace getter if it is not NULL;
-  // otherwise, creates an OsStackTraceGetter, makes it the current
-  // getter, and returns it.
-  OsStackTraceGetterInterface* os_stack_trace_getter();
-
-  // Returns the current OS stack trace as an std::string.
-  //
-  // The maximum number of stack frames to be included is specified by
-  // the gtest_stack_trace_depth flag.  The skip_count parameter
-  // specifies the number of top frames to be skipped, which doesn't
-  // count against the number of frames to be included.
-  //
-  // For example, if Foo() calls Bar(), which in turn calls
-  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
-  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
-
-  // Finds and returns a TestCase with the given name.  If one doesn't
-  // exist, creates one and returns it.
-  //
-  // Arguments:
-  //
-  //   test_case_name: name of the test case
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test case
-  //   tear_down_tc:   pointer to the function that tears down the test case
-  TestCase* GetTestCase(const char* test_case_name,
-                        const char* type_param,
-                        Test::SetUpTestCaseFunc set_up_tc,
-                        Test::TearDownTestCaseFunc tear_down_tc);
-
-  // Adds a TestInfo to the unit test.
-  //
-  // Arguments:
-  //
-  //   set_up_tc:    pointer to the function that sets up the test case
-  //   tear_down_tc: pointer to the function that tears down the test case
-  //   test_info:    the TestInfo object
-  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
-                   Test::TearDownTestCaseFunc tear_down_tc,
-                   TestInfo* test_info) {
-    // In order to support thread-safe death tests, we need to
-    // remember the original working directory when the test program
-    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
-    // the user may have changed the current directory before calling
-    // RUN_ALL_TESTS().  Therefore we capture the current directory in
-    // AddTestInfo(), which is called to register a TEST or TEST_F
-    // before main() is reached.
-    if (original_working_dir_.IsEmpty()) {
-      original_working_dir_.Set(FilePath::GetCurrentDir());
-      GTEST_CHECK_(!original_working_dir_.IsEmpty())
-          << "Failed to get the current working directory.";
-    }
-
-    GetTestCase(test_info->test_case_name(),
-                test_info->type_param(),
-                set_up_tc,
-                tear_down_tc)->AddTestInfo(test_info);
-  }
-
-#if GTEST_HAS_PARAM_TEST
-  // Returns ParameterizedTestCaseRegistry object used to keep track of
-  // value-parameterized tests and instantiate and register them.
-  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
-    return parameterized_test_registry_;
-  }
-#endif  // GTEST_HAS_PARAM_TEST
-
-  // Sets the TestCase object for the test that's currently running.
-  void set_current_test_case(TestCase* a_current_test_case) {
-    current_test_case_ = a_current_test_case;
-  }
-
-  // Sets the TestInfo object for the test that's currently running.  If
-  // current_test_info is NULL, the assertion results will be stored in
-  // ad_hoc_test_result_.
-  void set_current_test_info(TestInfo* a_current_test_info) {
-    current_test_info_ = a_current_test_info;
-  }
-
-  // Registers all parameterized tests defined using TEST_P and
-  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
-  // combination. This method can be called more then once; it has guards
-  // protecting from registering the tests more then once.  If
-  // value-parameterized tests are disabled, RegisterParameterizedTests is
-  // present but does nothing.
-  void RegisterParameterizedTests();
-
-  // Runs all tests in this UnitTest object, prints the result, and
-  // returns true if all tests are successful.  If any exception is
-  // thrown during a test, this test is considered to be failed, but
-  // the rest of the tests will still be run.
-  bool RunAllTests();
-
-  // Clears the results of all tests, except the ad hoc tests.
-  void ClearNonAdHocTestResult() {
-    ForEach(test_cases_, TestCase::ClearTestCaseResult);
-  }
-
-  // Clears the results of ad-hoc test assertions.
-  void ClearAdHocTestResult() {
-    ad_hoc_test_result_.Clear();
-  }
-
-  // Adds a TestProperty to the current TestResult object when invoked in a
-  // context of a test or a test case, or to the global property set. If the
-  // result already contains a property with the same key, the value will be
-  // updated.
-  void RecordProperty(const TestProperty& test_property);
-
-  enum ReactionToSharding {
-    HONOR_SHARDING_PROTOCOL,
-    IGNORE_SHARDING_PROTOCOL
-  };
-
-  // Matches the full name of each test against the user-specified
-  // filter to decide whether the test should run, then records the
-  // result in each TestCase and TestInfo object.
-  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
-  // based on sharding variables in the environment.
-  // Returns the number of tests that should run.
-  int FilterTests(ReactionToSharding shard_tests);
-
-  // Prints the names of the tests matching the user-specified filter flag.
-  void ListTestsMatchingFilter();
-
-  const TestCase* current_test_case() const { return current_test_case_; }
-  TestInfo* current_test_info() { return current_test_info_; }
-  const TestInfo* current_test_info() const { return current_test_info_; }
-
-  // Returns the vector of environments that need to be set-up/torn-down
-  // before/after the tests are run.
-  std::vector<Environment*>& environments() { return environments_; }
-
-  // Getters for the per-thread Google Test trace stack.
-  std::vector<TraceInfo>& gtest_trace_stack() {
-    return *(gtest_trace_stack_.pointer());
-  }
-  const std::vector<TraceInfo>& gtest_trace_stack() const {
-    return gtest_trace_stack_.get();
-  }
-
-#if GTEST_HAS_DEATH_TEST
-  void InitDeathTestSubprocessControlInfo() {
-    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
-  }
-  // Returns a pointer to the parsed --gtest_internal_run_death_test
-  // flag, or NULL if that flag was not specified.
-  // This information is useful only in a death test child process.
-  // Must not be called before a call to InitGoogleTest.
-  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
-    return internal_run_death_test_flag_.get();
-  }
-
-  // Returns a pointer to the current death test factory.
-  internal::DeathTestFactory* death_test_factory() {
-    return death_test_factory_.get();
-  }
-
-  void SuppressTestEventsIfInSubprocess();
-
-  friend class ReplaceDeathTestFactory;
-#endif  // GTEST_HAS_DEATH_TEST
-
-  // Initializes the event listener performing XML output as specified by
-  // UnitTestOptions. Must not be called before InitGoogleTest.
-  void ConfigureXmlOutput();
-
-#if GTEST_CAN_STREAM_RESULTS_
-  // Initializes the event listener for streaming test results to a socket.
-  // Must not be called before InitGoogleTest.
-  void ConfigureStreamingOutput();
-#endif
-
-  // Performs initialization dependent upon flag values obtained in
-  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
-  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
-  // this function is also called from RunAllTests.  Since this function can be
-  // called more than once, it has to be idempotent.
-  void PostFlagParsingInit();
-
-  // Gets the random seed used at the start of the current test iteration.
-  int random_seed() const { return random_seed_; }
-
-  // Gets the random number generator.
-  internal::Random* random() { return &random_; }
-
-  // Shuffles all test cases, and the tests within each test case,
-  // making sure that death tests are still run first.
-  void ShuffleTests();
-
-  // Restores the test cases and tests to their order before the first shuffle.
-  void UnshuffleTests();
-
-  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
-  // UnitTest::Run() starts.
-  bool catch_exceptions() const { return catch_exceptions_; }
-
- private:
-  friend class ::testing::UnitTest;
-
-  // Used by UnitTest::Run() to capture the state of
-  // GTEST_FLAG(catch_exceptions) at the moment it starts.
-  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
-
-  // The UnitTest object that owns this implementation object.
-  UnitTest* const parent_;
-
-  // The working directory when the first TEST() or TEST_F() was
-  // executed.
-  internal::FilePath original_working_dir_;
-
-  // The default test part result reporters.
-  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
-  DefaultPerThreadTestPartResultReporter
-      default_per_thread_test_part_result_reporter_;
-
-  // Points to (but doesn't own) the global test part result reporter.
-  TestPartResultReporterInterface* global_test_part_result_repoter_;
-
-  // Protects read and write access to global_test_part_result_reporter_.
-  internal::Mutex global_test_part_result_reporter_mutex_;
-
-  // Points to (but doesn't own) the per-thread test part result reporter.
-  internal::ThreadLocal<TestPartResultReporterInterface*>
-      per_thread_test_part_result_reporter_;
-
-  // The vector of environments that need to be set-up/torn-down
-  // before/after the tests are run.
-  std::vector<Environment*> environments_;
-
-  // The vector of TestCases in their original order.  It owns the
-  // elements in the vector.
-  std::vector<TestCase*> test_cases_;
-
-  // Provides a level of indirection for the test case list to allow
-  // easy shuffling and restoring the test case order.  The i-th
-  // element of this vector is the index of the i-th test case in the
-  // shuffled order.
-  std::vector<int> test_case_indices_;
-
-#if GTEST_HAS_PARAM_TEST
-  // ParameterizedTestRegistry object used to register value-parameterized
-  // tests.
-  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
-
-  // Indicates whether RegisterParameterizedTests() has been called already.
-  bool parameterized_tests_registered_;
-#endif  // GTEST_HAS_PARAM_TEST
-
-  // Index of the last death test case registered.  Initially -1.
-  int last_death_test_case_;
-
-  // This points to the TestCase for the currently running test.  It
-  // changes as Google Test goes through one test case after another.
-  // When no test is running, this is set to NULL and Google Test
-  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestCase* current_test_case_;
-
-  // This points to the TestInfo for the currently running test.  It
-  // changes as Google Test goes through one test after another.  When
-  // no test is running, this is set to NULL and Google Test stores
-  // assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestInfo* current_test_info_;
-
-  // Normally, a user only writes assertions inside a TEST or TEST_F,
-  // or inside a function called by a TEST or TEST_F.  Since Google
-  // Test keeps track of which test is current running, it can
-  // associate such an assertion with the test it belongs to.
-  //
-  // If an assertion is encountered when no TEST or TEST_F is running,
-  // Google Test attributes the assertion result to an imaginary "ad hoc"
-  // test, and records the result in ad_hoc_test_result_.
-  TestResult ad_hoc_test_result_;
-
-  // The list of event listeners that can be used to track events inside
-  // Google Test.
-  TestEventListeners listeners_;
-
-  // The OS stack trace getter.  Will be deleted when the UnitTest
-  // object is destructed.  By default, an OsStackTraceGetter is used,
-  // but the user can set this field to use a custom getter if that is
-  // desired.
-  OsStackTraceGetterInterface* os_stack_trace_getter_;
-
-  // True iff PostFlagParsingInit() has been called.
-  bool post_flag_parse_init_performed_;
-
-  // The random number seed used at the beginning of the test run.
-  int random_seed_;
-
-  // Our random number generator.
-  internal::Random random_;
-
-  // The time of the test program start, in ms from the start of the
-  // UNIX epoch.
-  TimeInMillis start_timestamp_;
-
-  // How long the test took to run, in milliseconds.
-  TimeInMillis elapsed_time_;
-
-#if GTEST_HAS_DEATH_TEST
-  // The decomposed components of the gtest_internal_run_death_test flag,
-  // parsed when RUN_ALL_TESTS is called.
-  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
-  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
-#endif  // GTEST_HAS_DEATH_TEST
-
-  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
-  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
-
-  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
-  // starts.
-  bool catch_exceptions_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
-};  // class UnitTestImpl
-
-// Convenience function for accessing the global UnitTest
-// implementation object.
-inline UnitTestImpl* GetUnitTestImpl() {
-  return UnitTest::GetInstance()->impl();
-}
-
-#if GTEST_USES_SIMPLE_RE
-
-// Internal helper functions for implementing the simple regular
-// expression matcher.
-GTEST_API_ bool IsInSet(char ch, const char* str);
-GTEST_API_ bool IsAsciiDigit(char ch);
-GTEST_API_ bool IsAsciiPunct(char ch);
-GTEST_API_ bool IsRepeat(char ch);
-GTEST_API_ bool IsAsciiWhiteSpace(char ch);
-GTEST_API_ bool IsAsciiWordChar(char ch);
-GTEST_API_ bool IsValidEscape(char ch);
-GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
-GTEST_API_ bool ValidateRegex(const char* regex);
-GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char ch, char repeat, const char* regex, const char* str);
-GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
-
-#endif  // GTEST_USES_SIMPLE_RE
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
-
-#if GTEST_HAS_DEATH_TEST
-
-// Returns the message describing the last system error, regardless of the
-// platform.
-GTEST_API_ std::string GetLastErrnoDescription();
-
-# if GTEST_OS_WINDOWS
-// Provides leak-safe Windows kernel handle ownership.
-class AutoHandle {
- public:
-  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
-  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
-
-  ~AutoHandle() { Reset(); }
-
-  HANDLE Get() const { return handle_; }
-  void Reset() { Reset(INVALID_HANDLE_VALUE); }
-  void Reset(HANDLE handle) {
-    if (handle != handle_) {
-      if (handle_ != INVALID_HANDLE_VALUE)
-        ::CloseHandle(handle_);
-      handle_ = handle;
-    }
-  }
-
- private:
-  HANDLE handle_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
-};
-# endif  // GTEST_OS_WINDOWS
-
-// Attempts to parse a string into a positive integer pointed to by the
-// number parameter.  Returns true if that is possible.
-// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
-// it here.
-template <typename Integer>
-bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
-  // Fail fast if the given string does not begin with a digit;
-  // this bypasses strtoXXX's "optional leading whitespace and plus
-  // or minus sign" semantics, which are undesirable here.
-  if (str.empty() || !IsDigit(str[0])) {
-    return false;
-  }
-  errno = 0;
-
-  char* end;
-  // BiggestConvertible is the largest integer type that system-provided
-  // string-to-number conversion routines can return.
-
-# if GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  // MSVC and C++ Builder define __int64 instead of the standard long long.
-  typedef unsigned __int64 BiggestConvertible;
-  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
-
-# else
-
-  typedef unsigned long long BiggestConvertible;  // NOLINT
-  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
-
-# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  const bool parse_success = *end == '\0' && errno == 0;
-
-  // TODO(vladl@google.com): Convert this to compile time assertion when it is
-  // available.
-  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
-
-  const Integer result = static_cast<Integer>(parsed);
-  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
-    *number = result;
-    return true;
-  }
-  return false;
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-// TestResult contains some private methods that should be hidden from
-// Google Test user but are required for testing. This class allow our tests
-// to access them.
-//
-// This class is supplied only for the purpose of testing Google Test's own
-// constructs. Do not use it in user tests, either directly or indirectly.
-class TestResultAccessor {
- public:
-  static void RecordProperty(TestResult* test_result,
-                             const std::string& xml_element,
-                             const TestProperty& property) {
-    test_result->RecordProperty(xml_element, property);
-  }
-
-  static void ClearTestPartResults(TestResult* test_result) {
-    test_result->ClearTestPartResults();
-  }
-
-  static const std::vector<testing::TestPartResult>& test_part_results(
-      const TestResult& test_result) {
-    return test_result.test_part_results();
-  }
-};
-
-#if GTEST_CAN_STREAM_RESULTS_
-
-// Streams test results to the given port on the given host machine.
-class StreamingListener : public EmptyTestEventListener {
- public:
-  // Abstract base class for writing strings to a socket.
-  class AbstractSocketWriter {
-   public:
-    virtual ~AbstractSocketWriter() {}
-
-    // Sends a string to the socket.
-    virtual void Send(const string& message) = 0;
-
-    // Closes the socket.
-    virtual void CloseConnection() {}
-
-    // Sends a string and a newline to the socket.
-    void SendLn(const string& message) {
-      Send(message + "\n");
-    }
-  };
-
-  // Concrete class for actually writing strings to a socket.
-  class SocketWriter : public AbstractSocketWriter {
-   public:
-    SocketWriter(const string& host, const string& port)
-        : sockfd_(-1), host_name_(host), port_num_(port) {
-      MakeConnection();
-    }
-
-    virtual ~SocketWriter() {
-      if (sockfd_ != -1)
-        CloseConnection();
-    }
-
-    // Sends a string to the socket.
-    virtual void Send(const string& message) {
-      GTEST_CHECK_(sockfd_ != -1)
-          << "Send() can be called only when there is a connection.";
-
-      const int len = static_cast<int>(message.length());
-      if (write(sockfd_, message.c_str(), len) != len) {
-        GTEST_LOG_(WARNING)
-            << "stream_result_to: failed to stream to "
-            << host_name_ << ":" << port_num_;
-      }
-    }
-
-   private:
-    // Creates a client socket and connects to the server.
-    void MakeConnection();
-
-    // Closes the socket.
-    void CloseConnection() {
-      GTEST_CHECK_(sockfd_ != -1)
-          << "CloseConnection() can be called only when there is a connection.";
-
-      close(sockfd_);
-      sockfd_ = -1;
-    }
-
-    int sockfd_;  // socket file descriptor
-    const string host_name_;
-    const string port_num_;
-
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
-  };  // class SocketWriter
-
-  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
-  static string UrlEncode(const char* str);
-
-  StreamingListener(const string& host, const string& port)
-      : socket_writer_(new SocketWriter(host, port)) { Start(); }
-
-  explicit StreamingListener(AbstractSocketWriter* socket_writer)
-      : socket_writer_(socket_writer) { Start(); }
-
-  void OnTestProgramStart(const UnitTest& /* unit_test */) {
-    SendLn("event=TestProgramStart");
-  }
-
-  void OnTestProgramEnd(const UnitTest& unit_test) {
-    // Note that Google Test current only report elapsed time for each
-    // test iteration, not for the entire test program.
-    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
-
-    // Notify the streaming server to stop.
-    socket_writer_->CloseConnection();
-  }
-
-  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
-    SendLn("event=TestIterationStart&iteration=" +
-           StreamableToString(iteration));
-  }
-
-  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
-    SendLn("event=TestIterationEnd&passed=" +
-           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
-           StreamableToString(unit_test.elapsed_time()) + "ms");
-  }
-
-  void OnTestCaseStart(const TestCase& test_case) {
-    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
-  }
-
-  void OnTestCaseEnd(const TestCase& test_case) {
-    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
-           + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
-           + "ms");
-  }
-
-  void OnTestStart(const TestInfo& test_info) {
-    SendLn(std::string("event=TestStart&name=") + test_info.name());
-  }
-
-  void OnTestEnd(const TestInfo& test_info) {
-    SendLn("event=TestEnd&passed=" +
-           FormatBool((test_info.result())->Passed()) +
-           "&elapsed_time=" +
-           StreamableToString((test_info.result())->elapsed_time()) + "ms");
-  }
-
-  void OnTestPartResult(const TestPartResult& test_part_result) {
-    const char* file_name = test_part_result.file_name();
-    if (file_name == NULL)
-      file_name = "";
-    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
-           "&line=" + StreamableToString(test_part_result.line_number()) +
-           "&message=" + UrlEncode(test_part_result.message()));
-  }
-
- private:
-  // Sends the given message and a newline to the socket.
-  void SendLn(const string& message) { socket_writer_->SendLn(message); }
-
-  // Called at the start of streaming to notify the receiver what
-  // protocol we are using.
-  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
-
-  string FormatBool(bool value) { return value ? "1" : "0"; }
-
-  const scoped_ptr<AbstractSocketWriter> socket_writer_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
-};  // class StreamingListener
-
-#endif  // GTEST_CAN_STREAM_RESULTS_
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
-#undef GTEST_IMPLEMENTATION_
-
-#if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
-#endif  // GTEST_OS_WINDOWS
-
-namespace testing {
-
-using internal::CountIf;
-using internal::ForEach;
-using internal::GetElementOr;
-using internal::Shuffle;
-
-// Constants.
-
-// A test whose test case name or test name matches this filter is
-// disabled and not run.
-static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
-
-// A test case whose name matches this filter is considered a death
-// test case and will be run before test cases whose name doesn't
-// match this filter.
-static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
-
-// A test filter that matches everything.
-static const char kUniversalFilter[] = "*";
-
-// The default output file for XML output.
-static const char kDefaultOutputFile[] = "test_detail.xml";
-
-// The environment variable name for the test shard index.
-static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
-// The environment variable name for the total number of test shards.
-static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
-// The environment variable name for the test shard status file.
-static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
-
-namespace internal {
-
-// The text used in failure messages to indicate the start of the
-// stack trace.
-const char kStackTraceMarker[] = "\nStack trace:\n";
-
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
-bool g_help_flag = false;
-
-}  // namespace internal
-
-static const char* GetDefaultFilter() {
-  return kUniversalFilter;
-}
-
-GTEST_DEFINE_bool_(
-    also_run_disabled_tests,
-    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
-    "Run disabled tests too, in addition to the tests normally being run.");
-
-GTEST_DEFINE_bool_(
-    break_on_failure,
-    internal::BoolFromGTestEnv("break_on_failure", false),
-    "True iff a failed assertion should be a debugger break-point.");
-
-GTEST_DEFINE_bool_(
-    catch_exceptions,
-    internal::BoolFromGTestEnv("catch_exceptions", true),
-    "True iff " GTEST_NAME_
-    " should catch exceptions and treat them as test failures.");
-
-GTEST_DEFINE_string_(
-    color,
-    internal::StringFromGTestEnv("color", "auto"),
-    "Whether to use colors in the output.  Valid values: yes, no, "
-    "and auto.  'auto' means to use colors if the output is "
-    "being sent to a terminal and the TERM environment variable "
-    "is set to a terminal type that supports colors.");
-
-GTEST_DEFINE_string_(
-    filter,
-    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
-    "A colon-separated list of glob (not regex) patterns "
-    "for filtering the tests to run, optionally followed by a "
-    "'-' and a : separated list of negative patterns (tests to "
-    "exclude).  A test is run if it matches one of the positive "
-    "patterns and does not match any of the negative patterns.");
-
-GTEST_DEFINE_bool_(list_tests, false,
-                   "List all tests without running them.");
-
-GTEST_DEFINE_string_(
-    output,
-    internal::StringFromGTestEnv("output", ""),
-    "A format (currently must be \"xml\"), optionally followed "
-    "by a colon and an output file name or directory. A directory "
-    "is indicated by a trailing pathname separator. "
-    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
-    "If a directory is specified, output files will be created "
-    "within that directory, with file-names based on the test "
-    "executable's name and, if necessary, made unique by adding "
-    "digits.");
-
-GTEST_DEFINE_bool_(
-    print_time,
-    internal::BoolFromGTestEnv("print_time", true),
-    "True iff " GTEST_NAME_
-    " should display elapsed time in text output.");
-
-GTEST_DEFINE_int32_(
-    random_seed,
-    internal::Int32FromGTestEnv("random_seed", 0),
-    "Random number seed to use when shuffling test orders.  Must be in range "
-    "[1, 99999], or 0 to use a seed based on the current time.");
-
-GTEST_DEFINE_int32_(
-    repeat,
-    internal::Int32FromGTestEnv("repeat", 1),
-    "How many times to repeat each test.  Specify a negative number "
-    "for repeating forever.  Useful for shaking out flaky tests.");
-
-GTEST_DEFINE_bool_(
-    show_internal_stack_frames, false,
-    "True iff " GTEST_NAME_ " should include internal stack frames when "
-    "printing test failure stack traces.");
-
-GTEST_DEFINE_bool_(
-    shuffle,
-    internal::BoolFromGTestEnv("shuffle", false),
-    "True iff " GTEST_NAME_
-    " should randomize tests' order on every run.");
-
-GTEST_DEFINE_int32_(
-    stack_trace_depth,
-    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
-    "The maximum number of stack frames to print when an "
-    "assertion fails.  The valid range is 0 through 100, inclusive.");
-
-GTEST_DEFINE_string_(
-    stream_result_to,
-    internal::StringFromGTestEnv("stream_result_to", ""),
-    "This flag specifies the host name and the port number on which to stream "
-    "test results. Example: \"localhost:555\". The flag is effective only on "
-    "Linux.");
-
-GTEST_DEFINE_bool_(
-    throw_on_failure,
-    internal::BoolFromGTestEnv("throw_on_failure", false),
-    "When this flag is specified, a failed assertion will throw an exception "
-    "if exceptions are enabled or exit the program with a non-zero code "
-    "otherwise.");
-
-namespace internal {
-
-// Generates a random number from [0, range), using a Linear
-// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
-// than kMaxRange.
-UInt32 Random::Generate(UInt32 range) {
-  // These constants are the same as are used in glibc's rand(3).
-  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
-
-  GTEST_CHECK_(range > 0)
-      << "Cannot generate a number in the range [0, 0).";
-  GTEST_CHECK_(range <= kMaxRange)
-      << "Generation of a number in [0, " << range << ") was requested, "
-      << "but this can only generate numbers in [0, " << kMaxRange << ").";
-
-  // Converting via modulus introduces a bit of downward bias, but
-  // it's simple, and a linear congruential generator isn't too good
-  // to begin with.
-  return state_ % range;
-}
-
-// GTestIsInitialized() returns true iff the user has initialized
-// Google Test.  Useful for catching the user mistake of not initializing
-// Google Test before calling RUN_ALL_TESTS().
-//
-// A user must call testing::InitGoogleTest() to initialize Google
-// Test.  g_init_gtest_count is set to the number of times
-// InitGoogleTest() has been called.  We don't protect this variable
-// under a mutex as it is only accessed in the main thread.
-GTEST_API_ int g_init_gtest_count = 0;
-static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
-
-// Iterates over a vector of TestCases, keeping a running sum of the
-// results of calling a given int-returning method on each.
-// Returns the sum.
-static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
-                               int (TestCase::*method)() const) {
-  int sum = 0;
-  for (size_t i = 0; i < case_list.size(); i++) {
-    sum += (case_list[i]->*method)();
-  }
-  return sum;
-}
-
-// Returns true iff the test case passed.
-static bool TestCasePassed(const TestCase* test_case) {
-  return test_case->should_run() && test_case->Passed();
-}
-
-// Returns true iff the test case failed.
-static bool TestCaseFailed(const TestCase* test_case) {
-  return test_case->should_run() && test_case->Failed();
-}
-
-// Returns true iff test_case contains at least one test that should
-// run.
-static bool ShouldRunTestCase(const TestCase* test_case) {
-  return test_case->should_run();
-}
-
-// AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
-                           const char* file,
-                           int line,
-                           const char* message)
-    : data_(new AssertHelperData(type, file, line, message)) {
-}
-
-AssertHelper::~AssertHelper() {
-  delete data_;
-}
-
-// Message assignment, for assertion streaming support.
-void AssertHelper::operator=(const Message& message) const {
-  UnitTest::GetInstance()->
-    AddTestPartResult(data_->type, data_->file, data_->line,
-                      AppendUserMessage(data_->message, message),
-                      UnitTest::GetInstance()->impl()
-                      ->CurrentOsStackTraceExceptTop(1)
-                      // Skips the stack frame for this function itself.
-                      );  // NOLINT
-}
-
-// Mutex for linked pointers.
-GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
-
-// Application pathname gotten in InitGoogleTest.
-std::string g_executable_path;
-
-// Returns the current application's name, removing directory path if that
-// is present.
-FilePath GetCurrentExecutableName() {
-  FilePath result;
-
-#if GTEST_OS_WINDOWS
-  result.Set(FilePath(g_executable_path).RemoveExtension("exe"));
-#else
-  result.Set(FilePath(g_executable_path));
-#endif  // GTEST_OS_WINDOWS
-
-  return result.RemoveDirectoryName();
-}
-
-// Functions for processing the gtest_output flag.
-
-// Returns the output format, or "" for normal printed output.
-std::string UnitTestOptions::GetOutputFormat() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  if (gtest_output_flag == NULL) return std::string("");
-
-  const char* const colon = strchr(gtest_output_flag, ':');
-  return (colon == NULL) ?
-      std::string(gtest_output_flag) :
-      std::string(gtest_output_flag, colon - gtest_output_flag);
-}
-
-// Returns the name of the requested output file, or the default if none
-// was explicitly specified.
-std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  if (gtest_output_flag == NULL)
-    return "";
-
-  const char* const colon = strchr(gtest_output_flag, ':');
-  if (colon == NULL)
-    return internal::FilePath::ConcatPaths(
-        internal::FilePath(
-            UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(kDefaultOutputFile)).string();
-
-  internal::FilePath output_name(colon + 1);
-  if (!output_name.IsAbsolutePath())
-    // TODO(wan@google.com): on Windows \some\path is not an absolute
-    // path (as its meaning depends on the current drive), yet the
-    // following logic for turning it into an absolute path is wrong.
-    // Fix it.
-    output_name = internal::FilePath::ConcatPaths(
-        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(colon + 1));
-
-  if (!output_name.IsDirectory())
-    return output_name.string();
-
-  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
-      output_name, internal::GetCurrentExecutableName(),
-      GetOutputFormat().c_str()));
-  return result.string();
-}
-
-// Returns true iff the wildcard pattern matches the string.  The
-// first ':' or '\0' character in pattern marks the end of it.
-//
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
-  }
-}
-
-bool UnitTestOptions::MatchesFilter(
-    const std::string& name, const char* filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
-      return true;
-    }
-
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
-
-    // Returns if no more pattern can be found.
-    if (cur_pattern == NULL) {
-      return false;
-    }
-
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
-  }
-}
-
-// Returns true iff the user-specified filter matches the test case
-// name and the test name.
-bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
-                                        const std::string &test_name) {
-  const std::string& full_name = test_case_name + "." + test_name.c_str();
-
-  // Split --gtest_filter at '-', if there is one, to separate into
-  // positive filter and negative filter portions
-  const char* const p = GTEST_FLAG(filter).c_str();
-  const char* const dash = strchr(p, '-');
-  std::string positive;
-  std::string negative;
-  if (dash == NULL) {
-    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
-    negative = "";
-  } else {
-    positive = std::string(p, dash);   // Everything up to the dash
-    negative = std::string(dash + 1);  // Everything after the dash
-    if (positive.empty()) {
-      // Treat '-test1' as the same as '*-test1'
-      positive = kUniversalFilter;
-    }
-  }
-
-  // A filter is a colon-separated list of patterns.  It matches a
-  // test if any pattern in it matches the test.
-  return (MatchesFilter(full_name, positive.c_str()) &&
-          !MatchesFilter(full_name, negative.c_str()));
-}
-
-#if GTEST_HAS_SEH
-// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
-// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
-// This function is useful as an __except condition.
-int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
-  // Google Test should handle a SEH exception if:
-  //   1. the user wants it to, AND
-  //   2. this is not a breakpoint exception, AND
-  //   3. this is not a C++ exception (VC++ implements them via SEH,
-  //      apparently).
-  //
-  // SEH exception code for C++ exceptions.
-  // (see http://support.microsoft.com/kb/185294 for more information).
-  const DWORD kCxxExceptionCode = 0xe06d7363;
-
-  bool should_handle = true;
-
-  if (!GTEST_FLAG(catch_exceptions))
-    should_handle = false;
-  else if (exception_code == EXCEPTION_BREAKPOINT)
-    should_handle = false;
-  else if (exception_code == kCxxExceptionCode)
-    should_handle = false;
-
-  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
-}
-#endif  // GTEST_HAS_SEH
-
-}  // namespace internal
-
-// The c'tor sets this object as the test part result reporter used by
-// Google Test.  The 'result' parameter specifies where to report the
-// results. Intercepts only failures from the current thread.
-ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    TestPartResultArray* result)
-    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
-      result_(result) {
-  Init();
-}
-
-// The c'tor sets this object as the test part result reporter used by
-// Google Test.  The 'result' parameter specifies where to report the
-// results.
-ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    InterceptMode intercept_mode, TestPartResultArray* result)
-    : intercept_mode_(intercept_mode),
-      result_(result) {
-  Init();
-}
-
-void ScopedFakeTestPartResultReporter::Init() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
-    old_reporter_ = impl->GetGlobalTestPartResultReporter();
-    impl->SetGlobalTestPartResultReporter(this);
-  } else {
-    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
-    impl->SetTestPartResultReporterForCurrentThread(this);
-  }
-}
-
-// The d'tor restores the test part result reporter used by Google Test
-// before.
-ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
-    impl->SetGlobalTestPartResultReporter(old_reporter_);
-  } else {
-    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
-  }
-}
-
-// Increments the test part result count and remembers the result.
-// This method is from the TestPartResultReporterInterface interface.
-void ScopedFakeTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  result_->Append(result);
-}
-
-namespace internal {
-
-// Returns the type ID of ::testing::Test.  We should always call this
-// instead of GetTypeId< ::testing::Test>() to get the type ID of
-// testing::Test.  This is to work around a suspected linker bug when
-// using Google Test as a framework on Mac OS X.  The bug causes
-// GetTypeId< ::testing::Test>() to return different values depending
-// on whether the call is from the Google Test framework itself or
-// from user test code.  GetTestTypeId() is guaranteed to always
-// return the same value, as it always calls GetTypeId<>() from the
-// gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
-  return GetTypeId<Test>();
-}
-
-// The value of GetTestTypeId() as seen from within the Google Test
-// library.  This is solely for testing GetTestTypeId().
-extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
-
-// This predicate-formatter checks that 'results' contains a test part
-// failure of the given type and that the failure message contains the
-// given substring.
-AssertionResult HasOneFailure(const char* /* results_expr */,
-                              const char* /* type_expr */,
-                              const char* /* substr_expr */,
-                              const TestPartResultArray& results,
-                              TestPartResult::Type type,
-                              const string& substr) {
-  const std::string expected(type == TestPartResult::kFatalFailure ?
-                        "1 fatal failure" :
-                        "1 non-fatal failure");
-  Message msg;
-  if (results.size() != 1) {
-    msg << "Expected: " << expected << "\n"
-        << "  Actual: " << results.size() << " failures";
-    for (int i = 0; i < results.size(); i++) {
-      msg << "\n" << results.GetTestPartResult(i);
-    }
-    return AssertionFailure() << msg;
-  }
-
-  const TestPartResult& r = results.GetTestPartResult(0);
-  if (r.type() != type) {
-    return AssertionFailure() << "Expected: " << expected << "\n"
-                              << "  Actual:\n"
-                              << r;
-  }
-
-  if (strstr(r.message(), substr.c_str()) == NULL) {
-    return AssertionFailure() << "Expected: " << expected << " containing \""
-                              << substr << "\"\n"
-                              << "  Actual:\n"
-                              << r;
-  }
-
-  return AssertionSuccess();
-}
-
-// The constructor of SingleFailureChecker remembers where to look up
-// test part results, what type of failure we expect, and what
-// substring the failure message should contain.
-SingleFailureChecker:: SingleFailureChecker(
-    const TestPartResultArray* results,
-    TestPartResult::Type type,
-    const string& substr)
-    : results_(results),
-      type_(type),
-      substr_(substr) {}
-
-// The destructor of SingleFailureChecker verifies that the given
-// TestPartResultArray contains exactly one failure that has the given
-// type and contains the given substring.  If that's not the case, a
-// non-fatal failure will be generated.
-SingleFailureChecker::~SingleFailureChecker() {
-  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
-}
-
-DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
-
-void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  unit_test_->current_test_result()->AddTestPartResult(result);
-  unit_test_->listeners()->repeater()->OnTestPartResult(result);
-}
-
-DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
-
-void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
-}
-
-// Returns the global test part result reporter.
-TestPartResultReporterInterface*
-UnitTestImpl::GetGlobalTestPartResultReporter() {
-  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
-  return global_test_part_result_repoter_;
-}
-
-// Sets the global test part result reporter.
-void UnitTestImpl::SetGlobalTestPartResultReporter(
-    TestPartResultReporterInterface* reporter) {
-  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
-  global_test_part_result_repoter_ = reporter;
-}
-
-// Returns the test part result reporter for the current thread.
-TestPartResultReporterInterface*
-UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
-  return per_thread_test_part_result_reporter_.get();
-}
-
-// Sets the test part result reporter for the current thread.
-void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
-    TestPartResultReporterInterface* reporter) {
-  per_thread_test_part_result_reporter_.set(reporter);
-}
-
-// Gets the number of successful test cases.
-int UnitTestImpl::successful_test_case_count() const {
-  return CountIf(test_cases_, TestCasePassed);
-}
-
-// Gets the number of failed test cases.
-int UnitTestImpl::failed_test_case_count() const {
-  return CountIf(test_cases_, TestCaseFailed);
-}
-
-// Gets the number of all test cases.
-int UnitTestImpl::total_test_case_count() const {
-  return static_cast<int>(test_cases_.size());
-}
-
-// Gets the number of all test cases that contain at least one test
-// that should run.
-int UnitTestImpl::test_case_to_run_count() const {
-  return CountIf(test_cases_, ShouldRunTestCase);
-}
-
-// Gets the number of successful tests.
-int UnitTestImpl::successful_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
-}
-
-// Gets the number of failed tests.
-int UnitTestImpl::failed_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
-}
-
-// Gets the number of disabled tests that will be reported in the XML report.
-int UnitTestImpl::reportable_disabled_test_count() const {
-  return SumOverTestCaseList(test_cases_,
-                             &TestCase::reportable_disabled_test_count);
-}
-
-// Gets the number of disabled tests.
-int UnitTestImpl::disabled_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
-}
-
-// Gets the number of tests to be printed in the XML report.
-int UnitTestImpl::reportable_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
-}
-
-// Gets the number of all tests.
-int UnitTestImpl::total_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
-}
-
-// Gets the number of tests that should run.
-int UnitTestImpl::test_to_run_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
-}
-
-// Returns the current OS stack trace as an std::string.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
-// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
-  (void)skip_count;
-  return "";
-}
-
-// Returns the current time in milliseconds.
-TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
-
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  // TODO(kenton@google.com): Shouldn't this just use
-  //   GetSystemTimeAsFileTime()?
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
-  }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
-
-# ifdef _MSC_VER
-
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  // TODO(kenton@google.com): Use GetTickCount()?  Or use
-  //   SystemTimeToFileTime()
-#  pragma warning(push)          // Saves the current warning state.
-#  pragma warning(disable:4996)  // Temporarily disables warning 4996.
-  _ftime64(&now);
-#  pragma warning(pop)           // Restores the warning state.
-# else
-
-  _ftime64(&now);
-
-# endif  // _MSC_VER
-
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, NULL);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-# error "Don't know how to get the current time on your system."
-#endif
-}
-
-// Utilities
-
-// class String.
-
-#if GTEST_OS_WINDOWS_MOBILE
-// Creates a UTF-16 wide string from the given ANSI string, allocating
-// memory using new. The caller is responsible for deleting the return
-// value using delete[]. Returns the wide string, or NULL if the
-// input is NULL.
-LPCWSTR String::AnsiToUtf16(const char* ansi) {
-  if (!ansi) return NULL;
-  const int length = strlen(ansi);
-  const int unicode_length =
-      MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                          NULL, 0);
-  WCHAR* unicode = new WCHAR[unicode_length + 1];
-  MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                      unicode, unicode_length);
-  unicode[unicode_length] = 0;
-  return unicode;
-}
-
-// Creates an ANSI string from the given wide string, allocating
-// memory using new. The caller is responsible for deleting the return
-// value using delete[]. Returns the ANSI string, or NULL if the
-// input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
-  if (!utf16_str) return NULL;
-  const int ansi_length =
-      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
-                          NULL, 0, NULL, NULL);
-  char* ansi = new char[ansi_length + 1];
-  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
-                      ansi, ansi_length, NULL, NULL);
-  ansi[ansi_length] = 0;
-  return ansi;
-}
-
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-// Compares two C strings.  Returns true iff they have the same content.
-//
-// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
-// C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
-  if ( lhs == NULL ) return rhs == NULL;
-
-  if ( rhs == NULL ) return false;
-
-  return strcmp(lhs, rhs) == 0;
-}
-
-#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
-
-// Converts an array of wide chars to a narrow string using the UTF-8
-// encoding, and streams the result to the given Message object.
-static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
-                                     Message* msg) {
-  for (size_t i = 0; i != length; ) {  // NOLINT
-    if (wstr[i] != L'\0') {
-      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
-      while (i != length && wstr[i] != L'\0')
-        i++;
-    } else {
-      *msg << '\0';
-      i++;
-    }
-  }
-}
-
-#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
-
-}  // namespace internal
-
-// Constructs an empty Message.
-// We allocate the stringstream separately because otherwise each use of
-// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
-// stack frame leading to huge stack frames in some cases; gcc does not reuse
-// the stack space.
-Message::Message() : ss_(new ::std::stringstream) {
-  // By default, we want there to be enough precision when printing
-  // a double to a Message.
-  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
-}
-
-// These two overloads allow streaming a wide C string to a Message
-// using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
-  return *this << internal::String::ShowWideCString(wide_c_str);
-}
-Message& Message::operator <<(wchar_t* wide_c_str) {
-  return *this << internal::String::ShowWideCString(wide_c_str);
-}
-
-#if GTEST_HAS_STD_WSTRING
-// Converts the given wide string to a narrow string using the UTF-8
-// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
-  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
-  return *this;
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-#if GTEST_HAS_GLOBAL_WSTRING
-// Converts the given wide string to a narrow string using the UTF-8
-// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::wstring& wstr) {
-  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
-  return *this;
-}
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-// Gets the text streamed to this object so far as an std::string.
-// Each '\0' character in the buffer is replaced with "\\0".
-std::string Message::GetString() const {
-  return internal::StringStreamToString(ss_.get());
-}
-
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
-    : success_(other.success_),
-      message_(other.message_.get() != NULL ?
-               new ::std::string(*other.message_) :
-               static_cast< ::std::string*>(NULL)) {
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
-  AssertionResult negation(!success_);
-  if (message_.get() != NULL)
-    negation << *message_;
-  return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
-  return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
-  return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
-  return AssertionFailure() << message;
-}
-
-namespace internal {
-
-// Constructs and returns the message for an equality assertion
-// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
-//
-// The first four parameters are the expressions used in the assertion
-// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
-// where foo is 5 and bar is 6, we have:
-//
-//   expected_expression: "foo"
-//   actual_expression:   "bar"
-//   expected_value:      "5"
-//   actual_value:        "6"
-//
-// The ignoring_case parameter is true iff the assertion is a
-// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
-// be inserted into the message.
-AssertionResult EqFailure(const char* expected_expression,
-                          const char* actual_expression,
-                          const std::string& expected_value,
-                          const std::string& actual_value,
-                          bool ignoring_case) {
-  Message msg;
-  msg << "Value of: " << actual_expression;
-  if (actual_value != actual_expression) {
-    msg << "\n  Actual: " << actual_value;
-  }
-
-  msg << "\nExpected: " << expected_expression;
-  if (ignoring_case) {
-    msg << " (ignoring case)";
-  }
-  if (expected_value != expected_expression) {
-    msg << "\nWhich is: " << expected_value;
-  }
-
-  return AssertionFailure() << msg;
-}
-
-// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
-std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value) {
-  const char* actual_message = assertion_result.message();
-  Message msg;
-  msg << "Value of: " << expression_text
-      << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0')
-    msg << " (" << actual_message << ")";
-  msg << "\nExpected: " << expected_predicate_value;
-  return msg.GetString();
-}
-
-// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
-                                     const char* expr2,
-                                     const char* abs_error_expr,
-                                     double val1,
-                                     double val2,
-                                     double abs_error) {
-  const double diff = fabs(val1 - val2);
-  if (diff <= abs_error) return AssertionSuccess();
-
-  // TODO(wan): do not print the value of an expression if it's
-  // already a literal.
-  return AssertionFailure()
-      << "The difference between " << expr1 << " and " << expr2
-      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
-      << expr1 << " evaluates to " << val1 << ",\n"
-      << expr2 << " evaluates to " << val2 << ", and\n"
-      << abs_error_expr << " evaluates to " << abs_error << ".";
-}
-
-
-// Helper template for implementing FloatLE() and DoubleLE().
-template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
-  // Returns success if val1 is less than val2,
-  if (val1 < val2) {
-    return AssertionSuccess();
-  }
-
-  // or if val1 is almost equal to val2.
-  const FloatingPoint<RawType> lhs(val1), rhs(val2);
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  // Note that the above two checks will both fail if either val1 or
-  // val2 is NaN, as the IEEE floating-point standard requires that
-  // any predicate involving a NaN must return false.
-
-  ::std::stringstream val1_ss;
-  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val1;
-
-  ::std::stringstream val2_ss;
-  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val2;
-
-  return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
-}
-
-}  // namespace internal
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
-  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
-}
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
-  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
-}
-
-namespace internal {
-
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* expected_expression,
-                            const char* actual_expression,
-                            BiggestInt expected,
-                            BiggestInt actual) {
-  if (expected == actual) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   FormatForComparisonFailureMessage(expected, actual),
-                   FormatForComparisonFailureMessage(actual, expected),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
-// The helper function for {ASSERT|EXPECT}_STREQ.
-AssertionResult CmpHelperSTREQ(const char* expected_expression,
-                               const char* actual_expression,
-                               const char* expected,
-                               const char* actual) {
-  if (String::CStringEquals(expected, actual)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   PrintToString(expected),
-                   PrintToString(actual),
-                   false);
-}
-
-// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
-AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
-                                   const char* actual_expression,
-                                   const char* expected,
-                                   const char* actual) {
-  if (String::CaseInsensitiveCStringEquals(expected, actual)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   PrintToString(expected),
-                   PrintToString(actual),
-                   true);
-}
-
-// The helper function for {ASSERT|EXPECT}_STRNE.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const char* s1,
-                               const char* s2) {
-  if (!String::CStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  } else {
-    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                              << s2_expression << "), actual: \""
-                              << s1 << "\" vs \"" << s2 << "\"";
-  }
-}
-
-// The helper function for {ASSERT|EXPECT}_STRCASENE.
-AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                   const char* s2_expression,
-                                   const char* s1,
-                                   const char* s2) {
-  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  } else {
-    return AssertionFailure()
-        << "Expected: (" << s1_expression << ") != ("
-        << s2_expression << ") (ignoring case), actual: \""
-        << s1 << "\" vs \"" << s2 << "\"";
-  }
-}
-
-}  // namespace internal
-
-namespace {
-
-// Helper functions for implementing IsSubString() and IsNotSubstring().
-
-// This group of overloaded functions return true iff needle is a
-// substring of haystack.  NULL is considered a substring of itself
-// only.
-
-bool IsSubstringPred(const char* needle, const char* haystack) {
-  if (needle == NULL || haystack == NULL)
-    return needle == haystack;
-
-  return strstr(haystack, needle) != NULL;
-}
-
-bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
-  if (needle == NULL || haystack == NULL)
-    return needle == haystack;
-
-  return wcsstr(haystack, needle) != NULL;
-}
-
-// StringType here can be either ::std::string or ::std::wstring.
-template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
-                     const StringType& haystack) {
-  return haystack.find(needle) != StringType::npos;
-}
-
-// This function implements either IsSubstring() or IsNotSubstring(),
-// depending on the value of the expected_to_be_substring parameter.
-// StringType here can be const char*, const wchar_t*, ::std::string,
-// or ::std::wstring.
-template <typename StringType>
-AssertionResult IsSubstringImpl(
-    bool expected_to_be_substring,
-    const char* needle_expr, const char* haystack_expr,
-    const StringType& needle, const StringType& haystack) {
-  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
-    return AssertionSuccess();
-
-  const bool is_wide_string = sizeof(needle[0]) > 1;
-  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
-  return AssertionFailure()
-      << "Value of: " << needle_expr << "\n"
-      << "  Actual: " << begin_string_quote << needle << "\"\n"
-      << "Expected: " << (expected_to_be_substring ? "" : "not ")
-      << "a substring of " << haystack_expr << "\n"
-      << "Which is: " << begin_string_quote << haystack << "\"";
-}
-
-}  // namespace
-
-// IsSubstring() and IsNotSubstring() check whether needle is a
-// substring of haystack (NULL is considered a substring of itself
-// only), and return an appropriate error message when they fail.
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-#if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-namespace internal {
-
-#if GTEST_OS_WINDOWS
-
-namespace {
-
-// Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
-                                     const char* expected,
-                                     long hr) {  // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE
-
-  // Windows CE doesn't support FormatMessage.
-  const char error_text[] = "";
-
-# else
-
-  // Looks up the human-readable system message for the HRESULT code
-  // and since we're not passing any params to FormatMessage, we don't
-  // want inserts expanded.
-  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
-                       FORMAT_MESSAGE_IGNORE_INSERTS;
-  const DWORD kBufSize = 4096;
-  // Gets the system's human readable message string for this HRESULT.
-  char error_text[kBufSize] = { '\0' };
-  DWORD message_length = ::FormatMessageA(kFlags,
-                                          0,  // no source, we're asking system
-                                          hr,  // the error
-                                          0,  // no line width restrictions
-                                          error_text,  // output buffer
-                                          kBufSize,  // buf size
-                                          NULL);  // no arguments for inserts
-  // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
-  for (; message_length && IsSpace(error_text[message_length - 1]);
-          --message_length) {
-    error_text[message_length - 1] = '\0';
-  }
-
-# endif  // GTEST_OS_WINDOWS_MOBILE
-
-  const std::string error_hex("0x" + String::FormatHexInt(hr));
-  return ::testing::AssertionFailure()
-      << "Expected: " << expr << " " << expected << ".\n"
-      << "  Actual: " << error_hex << " " << error_text << "\n";
-}
-
-}  // namespace
-
-AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
-  if (SUCCEEDED(hr)) {
-    return AssertionSuccess();
-  }
-  return HRESULTFailureHelper(expr, "succeeds", hr);
-}
-
-AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
-  if (FAILED(hr)) {
-    return AssertionSuccess();
-  }
-  return HRESULTFailureHelper(expr, "fails", hr);
-}
-
-#endif  // GTEST_OS_WINDOWS
-
-// Utility functions for encoding Unicode text (wide strings) in
-// UTF-8.
-
-// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8
-// like this:
-//
-// Code-point length   Encoding
-//   0 -  7 bits       0xxxxxxx
-//   8 - 11 bits       110xxxxx 10xxxxxx
-//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
-//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-
-// The maximum code-point a one-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
-
-// The maximum code-point a two-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
-
-// The maximum code-point a three-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
-
-// The maximum code-point a four-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
-
-// Chops off the n lowest bits from a bit pattern.  Returns the n
-// lowest bits.  As a side effect, the original bit pattern will be
-// shifted to the right by n bits.
-inline UInt32 ChopLowBits(UInt32* bits, int n) {
-  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
-  *bits >>= n;
-  return low_bits;
-}
-
-// Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
-// wide enough to contain a code point.
-// If the code_point is not a valid Unicode code point
-// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
-// to "(Invalid Unicode 0xXXXXXXXX)".
-std::string CodePointToUtf8(UInt32 code_point) {
-  if (code_point > kMaxCodePoint4) {
-    return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
-  }
-
-  char str[5];  // Big enough for the largest valid code point.
-  if (code_point <= kMaxCodePoint1) {
-    str[1] = '\0';
-    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
-  } else if (code_point <= kMaxCodePoint2) {
-    str[2] = '\0';
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
-  } else if (code_point <= kMaxCodePoint3) {
-    str[3] = '\0';
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
-  } else {  // code_point <= kMaxCodePoint4
-    str[4] = '\0';
-    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
-  }
-  return str;
-}
-
-// The following two functions only make sense if the the system
-// uses UTF-16 for wide string encoding. All supported systems
-// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
-
-// Determines if the arguments constitute UTF-16 surrogate pair
-// and thus should be combined into a single Unicode code point
-// using CreateCodePointFromUtf16SurrogatePair.
-inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 &&
-      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
-}
-
-// Creates a Unicode code point from UTF16 surrogate pair.
-inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
-                                                    wchar_t second) {
-  const UInt32 mask = (1 << 10) - 1;
-  return (sizeof(wchar_t) == 2) ?
-      (((first & mask) << 10) | (second & mask)) + 0x10000 :
-      // This function should not be called when the condition is
-      // false, but we provide a sensible default in case it is.
-      static_cast<UInt32>(first);
-}
-
-// Converts a wide string to a narrow string in UTF-8 encoding.
-// The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
-//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
-// Parameter str points to a null-terminated wide string.
-// Parameter num_chars may additionally limit the number
-// of wchar_t characters processed. -1 is used when the entire string
-// should be processed.
-// If the string contains code points that are not valid Unicode code points
-// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
-// and contains invalid UTF-16 surrogate pairs, values in those pairs
-// will be encoded as individual Unicode characters from Basic Normal Plane.
-std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
-  if (num_chars == -1)
-    num_chars = static_cast<int>(wcslen(str));
-
-  ::std::stringstream stream;
-  for (int i = 0; i < num_chars; ++i) {
-    UInt32 unicode_code_point;
-
-    if (str[i] == L'\0') {
-      break;
-    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
-                                                                 str[i + 1]);
-      i++;
-    } else {
-      unicode_code_point = static_cast<UInt32>(str[i]);
-    }
-
-    stream << CodePointToUtf8(unicode_code_point);
-  }
-  return StringStreamToString(&stream);
-}
-
-// Converts a wide C string to an std::string using the UTF-8 encoding.
-// NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
-  if (wide_c_str == NULL)  return "(null)";
-
-  return internal::WideStringToUtf8(wide_c_str, -1);
-}
-
-// Compares two wide C strings.  Returns true iff they have the same
-// content.
-//
-// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
-// C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
-  if (lhs == NULL) return rhs == NULL;
-
-  if (rhs == NULL) return false;
-
-  return wcscmp(lhs, rhs) == 0;
-}
-
-// Helper function for *_STREQ on wide strings.
-AssertionResult CmpHelperSTREQ(const char* expected_expression,
-                               const char* actual_expression,
-                               const wchar_t* expected,
-                               const wchar_t* actual) {
-  if (String::WideCStringEquals(expected, actual)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   PrintToString(expected),
-                   PrintToString(actual),
-                   false);
-}
-
-// Helper function for *_STRNE on wide strings.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const wchar_t* s1,
-                               const wchar_t* s2) {
-  if (!String::WideCStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  }
-
-  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                            << s2_expression << "), actual: "
-                            << PrintToString(s1)
-                            << " vs " << PrintToString(s2);
-}
-
-// Compares two C strings, ignoring case.  Returns true iff they have
-// the same content.
-//
-// Unlike strcasecmp(), this function can handle NULL argument(s).  A
-// NULL C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
-  if (lhs == NULL)
-    return rhs == NULL;
-  if (rhs == NULL)
-    return false;
-  return posix::StrCaseCmp(lhs, rhs) == 0;
-}
-
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike wcscasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL wide C string,
-  // including the empty string.
-  // NB: The implementations on different platforms slightly differ.
-  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
-  // environment variable. On GNU platform this method uses wcscasecmp
-  // which compares according to LC_CTYPE category of the current locale.
-  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
-  // current locale.
-bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                              const wchar_t* rhs) {
-  if (lhs == NULL) return rhs == NULL;
-
-  if (rhs == NULL) return false;
-
-#if GTEST_OS_WINDOWS
-  return _wcsicmp(lhs, rhs) == 0;
-#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
-  return wcscasecmp(lhs, rhs) == 0;
-#else
-  // Android, Mac OS X and Cygwin don't define wcscasecmp.
-  // Other unknown OSes may not define it either.
-  wint_t left, right;
-  do {
-    left = towlower(*lhs++);
-    right = towlower(*rhs++);
-  } while (left && left == right);
-  return left == right;
-#endif  // OS selector
-}
-
-// Returns true iff str ends with the given suffix, ignoring case.
-// Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
-    const std::string& str, const std::string& suffix) {
-  const size_t str_len = str.length();
-  const size_t suffix_len = suffix.length();
-  return (str_len >= suffix_len) &&
-         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
-                                      suffix.c_str());
-}
-
-// Formats an int value as "%02d".
-std::string String::FormatIntWidth2(int value) {
-  std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << value;
-  return ss.str();
-}
-
-// Formats an int value as "%X".
-std::string String::FormatHexInt(int value) {
-  std::stringstream ss;
-  ss << std::hex << std::uppercase << value;
-  return ss.str();
-}
-
-// Formats a byte as "%02X".
-std::string String::FormatByte(unsigned char value) {
-  std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
-     << static_cast<unsigned int>(value);
-  return ss.str();
-}
-
-// Converts the buffer in a stringstream to an std::string, converting NUL
-// bytes to "\\0" along the way.
-std::string StringStreamToString(::std::stringstream* ss) {
-  const ::std::string& str = ss->str();
-  const char* const start = str.c_str();
-  const char* const end = start + str.length();
-
-  std::string result;
-  result.reserve(2 * (end - start));
-  for (const char* ch = start; ch != end; ++ch) {
-    if (*ch == '\0') {
-      result += "\\0";  // Replaces NUL with "\\0";
-    } else {
-      result += *ch;
-    }
-  }
-
-  return result;
-}
-
-// Appends the user-supplied message to the Google-Test-generated message.
-std::string AppendUserMessage(const std::string& gtest_msg,
-                              const Message& user_msg) {
-  // Appends the user message if it's non-empty.
-  const std::string user_msg_string = user_msg.GetString();
-  if (user_msg_string.empty()) {
-    return gtest_msg;
-  }
-
-  return gtest_msg + "\n" + user_msg_string;
-}
-
-}  // namespace internal
-
-// class TestResult
-
-// Creates an empty TestResult.
-TestResult::TestResult()
-    : death_test_count_(0),
-      elapsed_time_(0) {
-}
-
-// D'tor.
-TestResult::~TestResult() {
-}
-
-// Returns the i-th test part result among all the results. i can
-// range from 0 to total_part_count() - 1. If i is not in that range,
-// aborts the program.
-const TestPartResult& TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count())
-    internal::posix::Abort();
-  return test_part_results_.at(i);
-}
-
-// Returns the i-th test property. i can range from 0 to
-// test_property_count() - 1. If i is not in that range, aborts the
-// program.
-const TestProperty& TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count())
-    internal::posix::Abort();
-  return test_properties_.at(i);
-}
-
-// Clears the test part results.
-void TestResult::ClearTestPartResults() {
-  test_part_results_.clear();
-}
-
-// Adds a test part result to the list.
-void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
-  test_part_results_.push_back(test_part_result);
-}
-
-// Adds a test property to the list. If a property with the same key as the
-// supplied property is already represented, the value of this test_property
-// replaces the old value for that key.
-void TestResult::RecordProperty(const std::string& xml_element,
-                                const TestProperty& test_property) {
-  if (!ValidateTestProperty(xml_element, test_property)) {
-    return;
-  }
-  internal::MutexLock lock(&test_properites_mutex_);
-  const std::vector<TestProperty>::iterator property_with_matching_key =
-      std::find_if(test_properties_.begin(), test_properties_.end(),
-                   internal::TestPropertyKeyIs(test_property.key()));
-  if (property_with_matching_key == test_properties_.end()) {
-    test_properties_.push_back(test_property);
-    return;
-  }
-  property_with_matching_key->SetValue(test_property.value());
-}
-
-// The list of reserved attributes used in the <testsuites> element of XML
-// output.
-static const char* const kReservedTestSuitesAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "random_seed",
-  "tests",
-  "time",
-  "timestamp"
-};
-
-// The list of reserved attributes used in the <testsuite> element of XML
-// output.
-static const char* const kReservedTestSuiteAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "tests",
-  "time"
-};
-
-// The list of reserved attributes used in the <testcase> element of XML output.
-static const char* const kReservedTestCaseAttributes[] = {
-  "classname",
-  "name",
-  "status",
-  "time",
-  "type_param",
-  "value_param"
-};
-
-template <int kSize>
-std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
-  return std::vector<std::string>(array, array + kSize);
-}
-
-static std::vector<std::string> GetReservedAttributesForElement(
-    const std::string& xml_element) {
-  if (xml_element == "testsuites") {
-    return ArrayAsVector(kReservedTestSuitesAttributes);
-  } else if (xml_element == "testsuite") {
-    return ArrayAsVector(kReservedTestSuiteAttributes);
-  } else if (xml_element == "testcase") {
-    return ArrayAsVector(kReservedTestCaseAttributes);
-  } else {
-    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
-  }
-  // This code is unreachable but some compilers may not realizes that.
-  return std::vector<std::string>();
-}
-
-static std::string FormatWordList(const std::vector<std::string>& words) {
-  Message word_list;
-  for (size_t i = 0; i < words.size(); ++i) {
-    if (i > 0 && words.size() > 2) {
-      word_list << ", ";
-    }
-    if (i == words.size() - 1) {
-      word_list << "and ";
-    }
-    word_list << "'" << words[i] << "'";
-  }
-  return word_list.GetString();
-}
-
-bool ValidateTestPropertyName(const std::string& property_name,
-                              const std::vector<std::string>& reserved_names) {
-  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
-          reserved_names.end()) {
-    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
-                  << " (" << FormatWordList(reserved_names)
-                  << " are reserved by " << GTEST_NAME_ << ")";
-    return false;
-  }
-  return true;
-}
-
-// Adds a failure if the key is a reserved attribute of the element named
-// xml_element.  Returns true if the property is valid.
-bool TestResult::ValidateTestProperty(const std::string& xml_element,
-                                      const TestProperty& test_property) {
-  return ValidateTestPropertyName(test_property.key(),
-                                  GetReservedAttributesForElement(xml_element));
-}
-
-// Clears the object.
-void TestResult::Clear() {
-  test_part_results_.clear();
-  test_properties_.clear();
-  death_test_count_ = 0;
-  elapsed_time_ = 0;
-}
-
-// Returns true iff the test failed.
-bool TestResult::Failed() const {
-  for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed())
-      return true;
-  }
-  return false;
-}
-
-// Returns true iff the test part fatally failed.
-static bool TestPartFatallyFailed(const TestPartResult& result) {
-  return result.fatally_failed();
-}
-
-// Returns true iff the test fatally failed.
-bool TestResult::HasFatalFailure() const {
-  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
-}
-
-// Returns true iff the test part non-fatally failed.
-static bool TestPartNonfatallyFailed(const TestPartResult& result) {
-  return result.nonfatally_failed();
-}
-
-// Returns true iff the test has a non-fatal failure.
-bool TestResult::HasNonfatalFailure() const {
-  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
-}
-
-// Gets the number of all test parts.  This is the sum of the number
-// of successful test parts and the number of failed test parts.
-int TestResult::total_part_count() const {
-  return static_cast<int>(test_part_results_.size());
-}
-
-// Returns the number of the test properties.
-int TestResult::test_property_count() const {
-  return static_cast<int>(test_properties_.size());
-}
-
-// class Test
-
-// Creates a Test object.
-
-// The c'tor saves the values of all Google Test flags.
-Test::Test()
-    : gtest_flag_saver_(new internal::GTestFlagSaver) {
-}
-
-// The d'tor restores the values of all Google Test flags.
-Test::~Test() {
-  delete gtest_flag_saver_;
-}
-
-// Sets up the test fixture.
-//
-// A sub-class may override this.
-void Test::SetUp() {
-}
-
-// Tears down the test fixture.
-//
-// A sub-class may override this.
-void Test::TearDown() {
-}
-
-// Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string& key, const std::string& value) {
-  UnitTest::GetInstance()->RecordProperty(key, value);
-}
-
-// Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string& key, int value) {
-  Message value_message;
-  value_message << value;
-  RecordProperty(key, value_message.GetString().c_str());
-}
-
-namespace internal {
-
-void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string& message) {
-  // This function is a friend of UnitTest and as such has access to
-  // AddTestPartResult.
-  UnitTest::GetInstance()->AddTestPartResult(
-      result_type,
-      NULL,  // No info about the source file where the exception occurred.
-      -1,    // We have no info on which line caused the exception.
-      message,
-      "");   // No stack trace, either.
-}
-
-}  // namespace internal
-
-// Google Test requires all tests in the same test case to use the same test
-// fixture class.  This function checks if the current test has the
-// same fixture class as the first test in the current test case.  If
-// yes, it returns true; otherwise it generates a Google Test failure and
-// returns false.
-bool Test::HasSameFixtureClass() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  const TestCase* const test_case = impl->current_test_case();
-
-  // Info about the first test in the current test case.
-  const TestInfo* const first_test_info = test_case->test_info_list()[0];
-  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
-  const char* const first_test_name = first_test_info->name();
-
-  // Info about the current test.
-  const TestInfo* const this_test_info = impl->current_test_info();
-  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
-  const char* const this_test_name = this_test_info->name();
-
-  if (this_fixture_id != first_fixture_id) {
-    // Is the first test defined using TEST?
-    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
-    // Is this test defined using TEST?
-    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
-
-    if (first_is_TEST || this_is_TEST) {
-      // The user mixed TEST and TEST_F in this test case - we'll tell
-      // him/her how to fix it.
-
-      // Gets the name of the TEST and the name of the TEST_F.  Note
-      // that first_is_TEST and this_is_TEST cannot both be true, as
-      // the fixture IDs are different for the two tests.
-      const char* const TEST_name =
-          first_is_TEST ? first_test_name : this_test_name;
-      const char* const TEST_F_name =
-          first_is_TEST ? this_test_name : first_test_name;
-
-      ADD_FAILURE()
-          << "All tests in the same test case must use the same test fixture\n"
-          << "class, so mixing TEST_F and TEST in the same test case is\n"
-          << "illegal.  In test case " << this_test_info->test_case_name()
-          << ",\n"
-          << "test " << TEST_F_name << " is defined using TEST_F but\n"
-          << "test " << TEST_name << " is defined using TEST.  You probably\n"
-          << "want to change the TEST to TEST_F or move it to another test\n"
-          << "case.";
-    } else {
-      // The user defined two fixture classes with the same name in
-      // two namespaces - we'll tell him/her how to fix it.
-      ADD_FAILURE()
-          << "All tests in the same test case must use the same test fixture\n"
-          << "class.  However, in test case "
-          << this_test_info->test_case_name() << ",\n"
-          << "you defined test " << first_test_name
-          << " and test " << this_test_name << "\n"
-          << "using two different test fixture classes.  This can happen if\n"
-          << "the two classes are from different namespaces or translation\n"
-          << "units and have the same name.  You should probably rename one\n"
-          << "of the classes to put the tests into different test cases.";
-    }
-    return false;
-  }
-
-  return true;
-}
-
-#if GTEST_HAS_SEH
-
-// Adds an "exception thrown" fatal failure to the current test.  This
-// function returns its result via an output parameter pointer because VC++
-// prohibits creation of objects with destructors on stack in functions
-// using __try (see error C2712).
-static std::string* FormatSehExceptionMessage(DWORD exception_code,
-                                              const char* location) {
-  Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) <<
-    exception_code << std::setbase(10) << " thrown in " << location << ".";
-
-  return new std::string(message.GetString());
-}
-
-#endif  // GTEST_HAS_SEH
-
-namespace internal {
-
-#if GTEST_HAS_EXCEPTIONS
-
-// Adds an "exception thrown" fatal failure to the current test.
-static std::string FormatCxxExceptionMessage(const char* description,
-                                             const char* location) {
-  Message message;
-  if (description != NULL) {
-    message << "C++ exception with description \"" << description << "\"";
-  } else {
-    message << "Unknown C++ exception";
-  }
-  message << " thrown in " << location << ".";
-
-  return message.GetString();
-}
-
-static std::string PrintTestPartResultToString(
-    const TestPartResult& test_part_result);
-
-GoogleTestFailureException::GoogleTestFailureException(
-    const TestPartResult& failure)
-    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
-
-#endif  // GTEST_HAS_EXCEPTIONS
-
-// We put these helper functions in the internal namespace as IBM's xlC
-// compiler rejects the code if they were declared static.
-
-// Runs the given method and handles SEH exceptions it throws, when
-// SEH is supported; returns the 0-value for type Result in case of an
-// SEH exception.  (Microsoft compilers cannot handle SEH and C++
-// exceptions in the same function.  Therefore, we provide a separate
-// wrapper function for handling SEH exceptions.)
-template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
-#if GTEST_HAS_SEH
-  __try {
-    return (object->*method)();
-  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
-      GetExceptionCode())) {
-    // We create the exception message on the heap because VC++ prohibits
-    // creation of objects with destructors on stack in functions using __try
-    // (see error C2712).
-    std::string* exception_message = FormatSehExceptionMessage(
-        GetExceptionCode(), location);
-    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
-                                             *exception_message);
-    delete exception_message;
-    return static_cast<Result>(0);
-  }
-#else
-  (void)location;
-  return (object->*method)();
-#endif  // GTEST_HAS_SEH
-}
-
-// Runs the given method and catches and reports C++ and/or SEH-style
-// exceptions, if they are supported; returns the 0-value for type
-// Result in case of an SEH exception.
-template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
-  // NOTE: The user code can affect the way in which Google Test handles
-  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
-  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
-  // after the exception is caught and either report or re-throw the
-  // exception based on the flag's value:
-  //
-  // try {
-  //   // Perform the test method.
-  // } catch (...) {
-  //   if (GTEST_FLAG(catch_exceptions))
-  //     // Report the exception as failure.
-  //   else
-  //     throw;  // Re-throws the original exception.
-  // }
-  //
-  // However, the purpose of this flag is to allow the program to drop into
-  // the debugger when the exception is thrown. On most platforms, once the
-  // control enters the catch block, the exception origin information is
-  // lost and the debugger will stop the program at the point of the
-  // re-throw in this function -- instead of at the point of the original
-  // throw statement in the code under test.  For this reason, we perform
-  // the check early, sacrificing the ability to affect Google Test's
-  // exception handling in the method where the exception is thrown.
-  if (internal::GetUnitTestImpl()->catch_exceptions()) {
-#if GTEST_HAS_EXCEPTIONS
-    try {
-      return HandleSehExceptionsInMethodIfSupported(object, method, location);
-    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
-      // This exception type can only be thrown by a failed Google
-      // Test assertion with the intention of letting another testing
-      // framework catch it.  Therefore we just re-throw it.
-      throw;
-    } catch (const std::exception& e) {  // NOLINT
-      internal::ReportFailureInUnknownLocation(
-          TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(e.what(), location));
-    } catch (...) {  // NOLINT
-      internal::ReportFailureInUnknownLocation(
-          TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(NULL, location));
-    }
-    return static_cast<Result>(0);
-#else
-    return HandleSehExceptionsInMethodIfSupported(object, method, location);
-#endif  // GTEST_HAS_EXCEPTIONS
-  } else {
-    return (object->*method)();
-  }
-}
-
-}  // namespace internal
-
-// Runs the test and updates the test result.
-void Test::Run() {
-  if (!HasSameFixtureClass()) return;
-
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
-  // We will run the test only if SetUp() was successful.
-  if (!HasFatalFailure()) {
-    impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        this, &Test::TestBody, "the test body");
-  }
-
-  // However, we want to clean up as much as possible.  Hence we will
-  // always call TearDown(), even if SetUp() or the test body has
-  // failed.
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &Test::TearDown, "TearDown()");
-}
-
-// Returns true iff the current test has a fatal failure.
-bool Test::HasFatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
-}
-
-// Returns true iff the current test has a non-fatal failure.
-bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->
-      HasNonfatalFailure();
-}
-
-// class TestInfo
-
-// Constructs a TestInfo object. It assumes ownership of the test factory
-// object.
-TestInfo::TestInfo(const std::string& a_test_case_name,
-                   const std::string& a_name,
-                   const char* a_type_param,
-                   const char* a_value_param,
-                   internal::TypeId fixture_class_id,
-                   internal::TestFactoryBase* factory)
-    : test_case_name_(a_test_case_name),
-      name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
-      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
-      fixture_class_id_(fixture_class_id),
-      should_run_(false),
-      is_disabled_(false),
-      matches_filter_(false),
-      factory_(factory),
-      result_() {}
-
-// Destructs a TestInfo object.
-TestInfo::~TestInfo() { delete factory_; }
-
-namespace internal {
-
-// Creates a new TestInfo object and registers it with Google Test;
-// returns the created object.
-//
-// Arguments:
-//
-//   test_case_name:   name of the test case
-//   name:             name of the test
-//   type_param:       the name of the test's type parameter, or NULL if
-//                     this is not a typed or a type-parameterized test.
-//   value_param:      text representation of the test's value parameter,
-//                     or NULL if this is not a value-parameterized test.
-//   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test case
-//   tear_down_tc:     pointer to the function that tears down the test case
-//   factory:          pointer to the factory that creates a test object.
-//                     The newly created TestInfo instance will assume
-//                     ownership of the factory object.
-TestInfo* MakeAndRegisterTestInfo(
-    const char* test_case_name,
-    const char* name,
-    const char* type_param,
-    const char* value_param,
-    TypeId fixture_class_id,
-    SetUpTestCaseFunc set_up_tc,
-    TearDownTestCaseFunc tear_down_tc,
-    TestFactoryBase* factory) {
-  TestInfo* const test_info =
-      new TestInfo(test_case_name, name, type_param, value_param,
-                   fixture_class_id, factory);
-  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
-  return test_info;
-}
-
-#if GTEST_HAS_PARAM_TEST
-void ReportInvalidTestCaseType(const char* test_case_name,
-                               const char* file, int line) {
-  Message errors;
-  errors
-      << "Attempted redefinition of test case " << test_case_name << ".\n"
-      << "All tests in the same test case must use the same test fixture\n"
-      << "class.  However, in test case " << test_case_name << ", you tried\n"
-      << "to define a test using a fixture class different from the one\n"
-      << "used earlier. This can happen if the two fixture classes are\n"
-      << "from different namespaces and have the same name. You should\n"
-      << "probably rename one of the classes to put the tests into different\n"
-      << "test cases.";
-
-  fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
-          errors.GetString().c_str());
-}
-#endif  // GTEST_HAS_PARAM_TEST
-
-}  // namespace internal
-
-namespace {
-
-// A predicate that checks the test name of a TestInfo against a known
-// value.
-//
-// This is used for implementation of the TestCase class only.  We put
-// it in the anonymous namespace to prevent polluting the outer
-// namespace.
-//
-// TestNameIs is copyable.
-class TestNameIs {
- public:
-  // Constructor.
-  //
-  // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char* name)
-      : name_(name) {}
-
-  // Returns true iff the test name of test_info matches name_.
-  bool operator()(const TestInfo * test_info) const {
-    return test_info && test_info->name() == name_;
-  }
-
- private:
-  std::string name_;
-};
-
-}  // namespace
-
-namespace internal {
-
-// This method expands all parameterized tests registered with macros TEST_P
-// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
-// This will be done just once during the program runtime.
-void UnitTestImpl::RegisterParameterizedTests() {
-#if GTEST_HAS_PARAM_TEST
-  if (!parameterized_tests_registered_) {
-    parameterized_test_registry_.RegisterTests();
-    parameterized_tests_registered_ = true;
-  }
-#endif
-}
-
-}  // namespace internal
-
-// Creates the test object, runs it, records its result, and then
-// deletes it.
-void TestInfo::Run() {
-  if (!should_run_) return;
-
-  // Tells UnitTest where to store test result.
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_info(this);
-
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
-  // Notifies the unit test event listeners that a test is about to start.
-  repeater->OnTestStart(*this);
-
-  const TimeInMillis start = internal::GetTimeInMillis();
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-
-  // Creates the test object.
-  Test* const test = internal::HandleExceptionsInMethodIfSupported(
-      factory_, &internal::TestFactoryBase::CreateTest,
-      "the test fixture's constructor");
-
-  // Runs the test only if the test object was created and its
-  // constructor didn't generate a fatal failure.
-  if ((test != NULL) && !Test::HasFatalFailure()) {
-    // This doesn't throw as all user code that can throw are wrapped into
-    // exception handling code.
-    test->Run();
-  }
-
-  // Deletes the test object.
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      test, &Test::DeleteSelf_, "the test fixture's destructor");
-
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
-
-  // Notifies the unit test event listener that a test has just finished.
-  repeater->OnTestEnd(*this);
-
-  // Tells UnitTest to stop associating assertion results to this
-  // test.
-  impl->set_current_test_info(NULL);
-}
-
-// class TestCase
-
-// Gets the number of successful tests in this test case.
-int TestCase::successful_test_count() const {
-  return CountIf(test_info_list_, TestPassed);
-}
-
-// Gets the number of failed tests in this test case.
-int TestCase::failed_test_count() const {
-  return CountIf(test_info_list_, TestFailed);
-}
-
-// Gets the number of disabled tests that will be reported in the XML report.
-int TestCase::reportable_disabled_test_count() const {
-  return CountIf(test_info_list_, TestReportableDisabled);
-}
-
-// Gets the number of disabled tests in this test case.
-int TestCase::disabled_test_count() const {
-  return CountIf(test_info_list_, TestDisabled);
-}
-
-// Gets the number of tests to be printed in the XML report.
-int TestCase::reportable_test_count() const {
-  return CountIf(test_info_list_, TestReportable);
-}
-
-// Get the number of tests in this test case that should run.
-int TestCase::test_to_run_count() const {
-  return CountIf(test_info_list_, ShouldRunTest);
-}
-
-// Gets the number of all tests.
-int TestCase::total_test_count() const {
-  return static_cast<int>(test_info_list_.size());
-}
-
-// Creates a TestCase with the given name.
-//
-// Arguments:
-//
-//   name:         name of the test case
-//   a_type_param: the name of the test case's type parameter, or NULL if
-//                 this is not a typed or a type-parameterized test case.
-//   set_up_tc:    pointer to the function that sets up the test case
-//   tear_down_tc: pointer to the function that tears down the test case
-TestCase::TestCase(const char* a_name, const char* a_type_param,
-                   Test::SetUpTestCaseFunc set_up_tc,
-                   Test::TearDownTestCaseFunc tear_down_tc)
-    : name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
-      set_up_tc_(set_up_tc),
-      tear_down_tc_(tear_down_tc),
-      should_run_(false),
-      elapsed_time_(0) {
-}
-
-// Destructor of TestCase.
-TestCase::~TestCase() {
-  // Deletes every Test in the collection.
-  ForEach(test_info_list_, internal::Delete<TestInfo>);
-}
-
-// Returns the i-th test among all the tests. i can range from 0 to
-// total_test_count() - 1. If i is not in that range, returns NULL.
-const TestInfo* TestCase::GetTestInfo(int i) const {
-  const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? NULL : test_info_list_[index];
-}
-
-// Returns the i-th test among all the tests. i can range from 0 to
-// total_test_count() - 1. If i is not in that range, returns NULL.
-TestInfo* TestCase::GetMutableTestInfo(int i) {
-  const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? NULL : test_info_list_[index];
-}
-
-// Adds a test to this test case.  Will delete the test upon
-// destruction of the TestCase object.
-void TestCase::AddTestInfo(TestInfo * test_info) {
-  test_info_list_.push_back(test_info);
-  test_indices_.push_back(static_cast<int>(test_indices_.size()));
-}
-
-// Runs every test in this TestCase.
-void TestCase::Run() {
-  if (!should_run_) return;
-
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_case(this);
-
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
-  repeater->OnTestCaseStart(*this);
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
-
-  const internal::TimeInMillis start = internal::GetTimeInMillis();
-  for (int i = 0; i < total_test_count(); i++) {
-    GetMutableTestInfo(i)->Run();
-  }
-  elapsed_time_ = internal::GetTimeInMillis() - start;
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
-
-  repeater->OnTestCaseEnd(*this);
-  impl->set_current_test_case(NULL);
-}
-
-// Clears the results of all tests in this test case.
-void TestCase::ClearResult() {
-  ad_hoc_test_result_.Clear();
-  ForEach(test_info_list_, TestInfo::ClearTestResult);
-}
-
-// Shuffles the tests in this test case.
-void TestCase::ShuffleTests(internal::Random* random) {
-  Shuffle(random, &test_indices_);
-}
-
-// Restores the test order to before the first shuffle.
-void TestCase::UnshuffleTests() {
-  for (size_t i = 0; i < test_indices_.size(); i++) {
-    test_indices_[i] = static_cast<int>(i);
-  }
-}
-
-// Formats a countable noun.  Depending on its quantity, either the
-// singular form or the plural form is used. e.g.
-//
-// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
-// FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
-                                       const char * singular_form,
-                                       const char * plural_form) {
-  return internal::StreamableToString(count) + " " +
-      (count == 1 ? singular_form : plural_form);
-}
-
-// Formats the count of tests.
-static std::string FormatTestCount(int test_count) {
-  return FormatCountableNoun(test_count, "test", "tests");
-}
-
-// Formats the count of test cases.
-static std::string FormatTestCaseCount(int test_case_count) {
-  return FormatCountableNoun(test_case_count, "test case", "test cases");
-}
-
-// Converts a TestPartResult::Type enum to human-friendly string
-// representation.  Both kNonFatalFailure and kFatalFailure are translated
-// to "Failure", as the user usually doesn't care about the difference
-// between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
-  switch (type) {
-    case TestPartResult::kSuccess:
-      return "Success";
-
-    case TestPartResult::kNonFatalFailure:
-    case TestPartResult::kFatalFailure:
-#ifdef _MSC_VER
-      return "error: ";
-#else
-      return "Failure\n";
-#endif
-    default:
-      return "Unknown result type";
-  }
-}
-
-namespace internal {
-
-// Prints a TestPartResult to an std::string.
-static std::string PrintTestPartResultToString(
-    const TestPartResult& test_part_result) {
-  return (Message()
-          << internal::FormatFileLocation(test_part_result.file_name(),
-                                          test_part_result.line_number())
-          << " " << TestPartResultTypeToString(test_part_result.type())
-          << test_part_result.message()).GetString();
-}
-
-// Prints a TestPartResult.
-static void PrintTestPartResult(const TestPartResult& test_part_result) {
-  const std::string& result =
-      PrintTestPartResultToString(test_part_result);
-  printf("%s\n", result.c_str());
-  fflush(stdout);
-  // If the test program runs in Visual Studio or a debugger, the
-  // following statements add the test part result message to the Output
-  // window such that the user can double-click on it to jump to the
-  // corresponding source code location; otherwise they do nothing.
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  // We don't call OutputDebugString*() on Windows Mobile, as printing
-  // to stdout is done by OutputDebugString() there already - we don't
-  // want the same message printed twice.
-  ::OutputDebugStringA(result.c_str());
-  ::OutputDebugStringA("\n");
-#endif
-}
-
-// class PrettyUnitTestResultPrinter
-
-enum GTestColor {
-  COLOR_DEFAULT,
-  COLOR_RED,
-  COLOR_GREEN,
-  COLOR_YELLOW
-};
-
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-
-// Returns the character attribute for the given color.
-WORD GetColorAttribute(GTestColor color) {
-  switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default:           return 0;
-  }
-}
-
-#else
-
-// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
-// an invalid input.
-const char* GetAnsiColorCode(GTestColor color) {
-  switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
-    default:            return NULL;
-  };
-}
-
-#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-
-// Returns true iff Google Test should use colors in the output.
-bool ShouldUseColor(bool stdout_is_tty) {
-  const char* const gtest_color = GTEST_FLAG(color).c_str();
-
-  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
-#if GTEST_OS_WINDOWS
-    // On Windows the TERM variable is usually not set, but the
-    // console there does support colors.
-    return stdout_is_tty;
-#else
-    // On non-Windows platforms, we rely on the TERM variable.
-    const char* const term = posix::GetEnv("TERM");
-    const bool term_supports_color =
-        String::CStringEquals(term, "xterm") ||
-        String::CStringEquals(term, "xterm-color") ||
-        String::CStringEquals(term, "xterm-256color") ||
-        String::CStringEquals(term, "screen") ||
-        String::CStringEquals(term, "screen-256color") ||
-        String::CStringEquals(term, "linux") ||
-        String::CStringEquals(term, "cygwin");
-    return stdout_is_tty && term_supports_color;
-#endif  // GTEST_OS_WINDOWS
-  }
-
-  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
-      String::CStringEquals(gtest_color, "1");
-  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
-  // value is neither one of these nor "auto", we treat it as "no" to
-  // be conservative.
-}
-
-// Helpers for printing colored strings to stdout. Note that on Windows, we
-// cannot simply emit special characters and have the terminal change colors.
-// This routine must actually emit the characters rather than return a string
-// that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char* fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS
-  const bool use_color = false;
-#else
-  static const bool in_color_mode =
-      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
-  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
-#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
-  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
-
-  if (!use_color) {
-    vprintf(fmt, args);
-    va_end(args);
-    return;
-  }
-
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
-
-  // Gets the current text color.
-  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
-  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-  const WORD old_color_attrs = buffer_info.wAttributes;
-
-  // We need to flush the stream buffers into the console before each
-  // SetConsoleTextAttribute call lest it affect the text that is already
-  // printed but has not yet reached the console.
-  fflush(stdout);
-  SetConsoleTextAttribute(stdout_handle,
-                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
-  vprintf(fmt, args);
-
-  fflush(stdout);
-  // Restores the text color.
-  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
-#else
-  printf("\033[0;3%sm", GetAnsiColorCode(color));
-  vprintf(fmt, args);
-  printf("\033[m");  // Resets the terminal to default.
-#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  va_end(args);
-}
-
-// Text printed in Google Test's text output and --gunit_list_tests
-// output to label the type parameter and value parameter for a test.
-static const char kTypeParamLabel[] = "TypeParam";
-static const char kValueParamLabel[] = "GetParam()";
-
-void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
-  const char* const type_param = test_info.type_param();
-  const char* const value_param = test_info.value_param();
-
-  if (type_param != NULL || value_param != NULL) {
-    printf(", where ");
-    if (type_param != NULL) {
-      printf("%s = %s", kTypeParamLabel, type_param);
-      if (value_param != NULL)
-        printf(" and ");
-    }
-    if (value_param != NULL) {
-      printf("%s = %s", kValueParamLabel, value_param);
-    }
-  }
-}
-
-// This class implements the TestEventListener interface.
-//
-// Class PrettyUnitTestResultPrinter is copyable.
-class PrettyUnitTestResultPrinter : public TestEventListener {
- public:
-  PrettyUnitTestResultPrinter() {}
-  static void PrintTestName(const char * test_case, const char * test) {
-    printf("%s.%s", test_case, test);
-  }
-
-  // The following methods override what's in the TestEventListener class.
-  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestCaseStart(const TestCase& test_case);
-  virtual void OnTestStart(const TestInfo& test_info);
-  virtual void OnTestPartResult(const TestPartResult& result);
-  virtual void OnTestEnd(const TestInfo& test_info);
-  virtual void OnTestCaseEnd(const TestCase& test_case);
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
-
- private:
-  static void PrintFailedTests(const UnitTest& unit_test);
-};
-
-  // Fired before each iteration of tests starts.
-void PrettyUnitTestResultPrinter::OnTestIterationStart(
-    const UnitTest& unit_test, int iteration) {
-  if (GTEST_FLAG(repeat) != 1)
-    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
-
-  const char* const filter = GTEST_FLAG(filter).c_str();
-
-  // Prints the filter if it's not *.  This reminds the user that some
-  // tests may be skipped.
-  if (!String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
-  }
-
-  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
-    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: This is test shard %d of %s.\n",
-                  static_cast<int>(shard_index) + 1,
-                  internal::posix::GetEnv(kTestTotalShards));
-  }
-
-  if (GTEST_FLAG(shuffle)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: Randomizing tests' orders with a seed of %d .\n",
-                  unit_test.random_seed());
-  }
-
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
-  printf("Running %s from %s.\n",
-         FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
-  printf("Global test environment set-up.\n");
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
-  const std::string counts =
-      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s", counts.c_str(), test_case.name());
-  if (test_case.type_param() == NULL) {
-    printf("\n");
-  } else {
-    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
-  }
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
-  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
-  PrintTestName(test_info.test_case_name(), test_info.name());
-  printf("\n");
-  fflush(stdout);
-}
-
-// Called after an assertion failure.
-void PrettyUnitTestResultPrinter::OnTestPartResult(
-    const TestPartResult& result) {
-  // If the test part succeeded, we don't need to do anything.
-  if (result.type() == TestPartResult::kSuccess)
-    return;
-
-  // Print failure message from the assertion (e.g. expected this and got that).
-  PrintTestPartResult(result);
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
-  if (test_info.result()->Passed()) {
-    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
-  } else {
-    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-  }
-  PrintTestName(test_info.test_case_name(), test_info.name());
-  if (test_info.result()->Failed())
-    PrintFullTestCommentIfPresent(test_info);
-
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms)\n", internal::StreamableToString(
-           test_info.result()->elapsed_time()).c_str());
-  } else {
-    printf("\n");
-  }
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
-  if (!GTEST_FLAG(print_time)) return;
-
-  const std::string counts =
-      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s (%s ms total)\n\n",
-         counts.c_str(), test_case.name(),
-         internal::StreamableToString(test_case.elapsed_time()).c_str());
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
-  printf("Global test environment tear-down\n");
-  fflush(stdout);
-}
-
-// Internal helper for printing the list of failed tests.
-void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
-  const int failed_test_count = unit_test.failed_test_count();
-  if (failed_test_count == 0) {
-    return;
-  }
-
-  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    const TestCase& test_case = *unit_test.GetTestCase(i);
-    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
-      continue;
-    }
-    for (int j = 0; j < test_case.total_test_count(); ++j) {
-      const TestInfo& test_info = *test_case.GetTestInfo(j);
-      if (!test_info.should_run() || test_info.result()->Passed()) {
-        continue;
-      }
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-      printf("%s.%s", test_case.name(), test_info.name());
-      PrintFullTestCommentIfPresent(test_info);
-      printf("\n");
-    }
-  }
-}
-
-void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                     int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
-  printf("%s from %s ran.",
-         FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms total)",
-           internal::StreamableToString(unit_test.elapsed_time()).c_str());
-  }
-  printf("\n");
-  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
-  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
-
-  int num_failures = unit_test.failed_test_count();
-  if (!unit_test.Passed()) {
-    const int failed_test_count = unit_test.failed_test_count();
-    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
-    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
-    PrintFailedTests(unit_test);
-    printf("\n%2d FAILED %s\n", num_failures,
-                        num_failures == 1 ? "TEST" : "TESTS");
-  }
-
-  int num_disabled = unit_test.reportable_disabled_test_count();
-  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
-    if (!num_failures) {
-      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
-    }
-    ColoredPrintf(COLOR_YELLOW,
-                  "  YOU HAVE %d DISABLED %s\n\n",
-                  num_disabled,
-                  num_disabled == 1 ? "TEST" : "TESTS");
-  }
-  // Ensure that Google Test output is printed before, e.g., heapchecker output.
-  fflush(stdout);
-}
-
-// End PrettyUnitTestResultPrinter
-
-// class TestEventRepeater
-//
-// This class forwards events to other event listeners.
-class TestEventRepeater : public TestEventListener {
- public:
-  TestEventRepeater() : forwarding_enabled_(true) {}
-  virtual ~TestEventRepeater();
-  void Append(TestEventListener *listener);
-  TestEventListener* Release(TestEventListener* listener);
-
-  // Controls whether events will be forwarded to listeners_. Set to false
-  // in death test child processes.
-  bool forwarding_enabled() const { return forwarding_enabled_; }
-  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
-
-  virtual void OnTestProgramStart(const UnitTest& unit_test);
-  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
-  virtual void OnTestCaseStart(const TestCase& test_case);
-  virtual void OnTestStart(const TestInfo& test_info);
-  virtual void OnTestPartResult(const TestPartResult& result);
-  virtual void OnTestEnd(const TestInfo& test_info);
-  virtual void OnTestCaseEnd(const TestCase& test_case);
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-  virtual void OnTestProgramEnd(const UnitTest& unit_test);
-
- private:
-  // Controls whether events will be forwarded to listeners_. Set to false
-  // in death test child processes.
-  bool forwarding_enabled_;
-  // The list of listeners that receive events.
-  std::vector<TestEventListener*> listeners_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
-};
-
-TestEventRepeater::~TestEventRepeater() {
-  ForEach(listeners_, Delete<TestEventListener>);
-}
-
-void TestEventRepeater::Append(TestEventListener *listener) {
-  listeners_.push_back(listener);
-}
-
-// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
-  for (size_t i = 0; i < listeners_.size(); ++i) {
-    if (listeners_[i] == listener) {
-      listeners_.erase(listeners_.begin() + i);
-      return listener;
-    }
-  }
-
-  return NULL;
-}
-
-// Since most methods are very similar, use macros to reduce boilerplate.
-// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (size_t i = 0; i < listeners_.size(); i++) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
-// This defines a member that forwards the call to all listeners in reverse
-// order.
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
-
-GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
-GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
-GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
-GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
-GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
-GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
-
-#undef GTEST_REPEATER_METHOD_
-#undef GTEST_REVERSE_REPEATER_METHOD_
-
-void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
-                                             int iteration) {
-  if (forwarding_enabled_) {
-    for (size_t i = 0; i < listeners_.size(); i++) {
-      listeners_[i]->OnTestIterationStart(unit_test, iteration);
-    }
-  }
-}
-
-void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
-                                           int iteration) {
-  if (forwarding_enabled_) {
-    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
-      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
-    }
-  }
-}
-
-// End TestEventRepeater
-
-// This class generates an XML output file.
-class XmlUnitTestResultPrinter : public EmptyTestEventListener {
- public:
-  explicit XmlUnitTestResultPrinter(const char* output_file);
-
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-
- private:
-  // Is c a whitespace character that is normalized to a space character
-  // when it appears in an XML attribute value?
-  static bool IsNormalizableWhitespace(char c) {
-    return c == 0x9 || c == 0xA || c == 0xD;
-  }
-
-  // May c appear in a well-formed XML document?
-  static bool IsValidXmlCharacter(char c) {
-    return IsNormalizableWhitespace(c) || c >= 0x20;
-  }
-
-  // Returns an XML-escaped copy of the input string str.  If
-  // is_attribute is true, the text is meant to appear as an attribute
-  // value, and normalizable whitespace is preserved by replacing it
-  // with character references.
-  static std::string EscapeXml(const std::string& str, bool is_attribute);
-
-  // Returns the given string with all characters invalid in XML removed.
-  static std::string RemoveInvalidXmlCharacters(const std::string& str);
-
-  // Convenience wrapper around EscapeXml when str is an attribute value.
-  static std::string EscapeXmlAttribute(const std::string& str) {
-    return EscapeXml(str, true);
-  }
-
-  // Convenience wrapper around EscapeXml when str is not an attribute value.
-  static std::string EscapeXmlText(const char* str) {
-    return EscapeXml(str, false);
-  }
-
-  // Verifies that the given attribute belongs to the given element and
-  // streams the attribute as XML.
-  static void OutputXmlAttribute(std::ostream* stream,
-                                 const std::string& element_name,
-                                 const std::string& name,
-                                 const std::string& value);
-
-  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
-
-  // Streams an XML representation of a TestInfo object.
-  static void OutputXmlTestInfo(::std::ostream* stream,
-                                const char* test_case_name,
-                                const TestInfo& test_info);
-
-  // Prints an XML representation of a TestCase object
-  static void PrintXmlTestCase(::std::ostream* stream,
-                               const TestCase& test_case);
-
-  // Prints an XML summary of unit_test to output stream out.
-  static void PrintXmlUnitTest(::std::ostream* stream,
-                               const UnitTest& unit_test);
-
-  // Produces a string representing the test properties in a result as space
-  // delimited XML attributes based on the property key="value" pairs.
-  // When the std::string is not empty, it includes a space at the beginning,
-  // to delimit this attribute from prior attributes.
-  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
-
-  // The output file.
-  const std::string output_file_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
-};
-
-// Creates a new XmlUnitTestResultPrinter.
-XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
-    : output_file_(output_file) {
-  if (output_file_.c_str() == NULL || output_file_.empty()) {
-    fprintf(stderr, "XML output file may not be null\n");
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-  }
-}
-
-// Called after the unit test ends.
-void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                  int /*iteration*/) {
-  FILE* xmlout = NULL;
-  FilePath output_file(output_file_);
-  FilePath output_dir(output_file.RemoveFileName());
-
-  if (output_dir.CreateDirectoriesRecursively()) {
-    xmlout = posix::FOpen(output_file_.c_str(), "w");
-  }
-  if (xmlout == NULL) {
-    // TODO(wan): report the reason of the failure.
-    //
-    // We don't do it for now as:
-    //
-    //   1. There is no urgent need for it.
-    //   2. It's a bit involved to make the errno variable thread-safe on
-    //      all three operating systems (Linux, Windows, and Mac OS).
-    //   3. To interpret the meaning of errno in a thread-safe way,
-    //      we need the strerror_r() function, which is not available on
-    //      Windows.
-    fprintf(stderr,
-            "Unable to open file \"%s\"\n",
-            output_file_.c_str());
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-  }
-  std::stringstream stream;
-  PrintXmlUnitTest(&stream, unit_test);
-  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
-  fclose(xmlout);
-}
-
-// Returns an XML-escaped copy of the input string str.  If is_attribute
-// is true, the text is meant to appear as an attribute value, and
-// normalizable whitespace is preserved by replacing it with character
-// references.
-//
-// Invalid XML characters in str, if any, are stripped from the output.
-// It is expected that most, if not all, of the text processed by this
-// module will consist of ordinary English text.
-// If this module is ever modified to produce version 1.1 XML output,
-// most invalid characters can be retained using character references.
-// TODO(wan): It might be nice to have a minimally invasive, human-readable
-// escaping scheme for invalid characters, rather than dropping them.
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
-  Message m;
-
-  for (size_t i = 0; i < str.size(); ++i) {
-    const char ch = str[i];
-    switch (ch) {
-      case '<':
-        m << "&lt;";
-        break;
-      case '>':
-        m << "&gt;";
-        break;
-      case '&':
-        m << "&amp;";
-        break;
-      case '\'':
-        if (is_attribute)
-          m << "&apos;";
-        else
-          m << '\'';
-        break;
-      case '"':
-        if (is_attribute)
-          m << "&quot;";
-        else
-          m << '"';
-        break;
-      default:
-        if (IsValidXmlCharacter(ch)) {
-          if (is_attribute && IsNormalizableWhitespace(ch))
-            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
-              << ";";
-          else
-            m << ch;
-        }
-        break;
-    }
-  }
-
-  return m.GetString();
-}
-
-// Returns the given string with all characters invalid in XML removed.
-// Currently invalid characters are dropped from the string. An
-// alternative is to replace them with certain characters such as . or ?.
-std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
-    const std::string& str) {
-  std::string output;
-  output.reserve(str.size());
-  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
-      output.push_back(*it);
-
-  return output;
-}
-
-// The following routines generate an XML representation of a UnitTest
-// object.
-//
-// This is how Google Test concepts map to the DTD:
-//
-// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
-//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
-//     <testcase name="test-name">     <-- corresponds to a TestInfo object
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//                                     <-- individual assertion failures
-//     </testcase>
-//   </testsuite>
-// </testsuites>
-
-// Formats the given time in milliseconds as seconds.
-std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
-  ::std::stringstream ss;
-  ss << ms/1000.0;
-  return ss.str();
-}
-
-// Converts the given epoch time in milliseconds to a date string in the ISO
-// 8601 format, without the timezone information.
-std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
-  // Using non-reentrant version as localtime_r is not portable.
-  time_t seconds = static_cast<time_t>(ms / 1000);
-#ifdef _MSC_VER
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4996)  // Temporarily disables warning 4996
-                                // (function or variable may be unsafe).
-  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
-# pragma warning(pop)           // Restores the warning state again.
-#else
-  const struct tm* const time_struct = localtime(&seconds);  // NOLINT
-#endif
-  if (time_struct == NULL)
-    return "";  // Invalid ms value
-
-  // YYYY-MM-DDThh:mm:ss
-  return StreamableToString(time_struct->tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct->tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct->tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct->tm_min) + ":" +
-      String::FormatIntWidth2(time_struct->tm_sec);
-}
-
-// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
-                                                     const char* data) {
-  const char* segment = data;
-  *stream << "<![CDATA[";
-  for (;;) {
-    const char* const next_segment = strstr(segment, "]]>");
-    if (next_segment != NULL) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
-      *stream << "]]>]]&gt;<![CDATA[";
-      segment = next_segment + strlen("]]>");
-    } else {
-      *stream << segment;
-      break;
-    }
-  }
-  *stream << "]]>";
-}
-
-void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedAttributesForElement(element_name);
-
-  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
-      << "Attribute " << name << " is not allowed for element <" << element_name
-      << ">.";
-
-  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
-}
-
-// Prints an XML representation of a TestInfo object.
-// TODO(wan): There is also value in printing properties with the plain printer.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
-                                                 const char* test_case_name,
-                                                 const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  const std::string kTestcase = "testcase";
-
-  *stream << "    <testcase";
-  OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
-
-  if (test_info.value_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "value_param",
-                       test_info.value_param());
-  }
-  if (test_info.type_param() != NULL) {
-    OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
-  }
-
-  OutputXmlAttribute(stream, kTestcase, "status",
-                     test_info.should_run() ? "run" : "notrun");
-  OutputXmlAttribute(stream, kTestcase, "time",
-                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
-  OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
-  *stream << TestPropertiesAsXmlAttributes(result);
-
-  int failures = 0;
-  for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
-    if (part.failed()) {
-      if (++failures == 1) {
-        *stream << ">\n";
-      }
-      const string location = internal::FormatCompilerIndependentFileLocation(
-          part.file_name(), part.line_number());
-      const string summary = location + "\n" + part.summary();
-      *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
-              << "\" type=\"\">";
-      const string detail = location + "\n" + part.message();
-      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
-      *stream << "</failure>\n";
-    }
-  }
-
-  if (failures == 0)
-    *stream << " />\n";
-  else
-    *stream << "    </testcase>\n";
-}
-
-// Prints an XML representation of a TestCase object
-void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
-                                                const TestCase& test_case) {
-  const std::string kTestsuite = "testsuite";
-  *stream << "  <" << kTestsuite;
-  OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
-  OutputXmlAttribute(stream, kTestsuite, "tests",
-                     StreamableToString(test_case.reportable_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "failures",
-                     StreamableToString(test_case.failed_test_count()));
-  OutputXmlAttribute(
-      stream, kTestsuite, "disabled",
-      StreamableToString(test_case.reportable_disabled_test_count()));
-  OutputXmlAttribute(stream, kTestsuite, "errors", "0");
-  OutputXmlAttribute(stream, kTestsuite, "time",
-                     FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
-  *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
-          << ">\n";
-
-  for (int i = 0; i < test_case.total_test_count(); ++i) {
-    if (test_case.GetTestInfo(i)->is_reportable())
-      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
-  }
-  *stream << "  </" << kTestsuite << ">\n";
-}
-
-// Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
-                                                const UnitTest& unit_test) {
-  const std::string kTestsuites = "testsuites";
-
-  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-  *stream << "<" << kTestsuites;
-
-  OutputXmlAttribute(stream, kTestsuites, "tests",
-                     StreamableToString(unit_test.reportable_test_count()));
-  OutputXmlAttribute(stream, kTestsuites, "failures",
-                     StreamableToString(unit_test.failed_test_count()));
-  OutputXmlAttribute(
-      stream, kTestsuites, "disabled",
-      StreamableToString(unit_test.reportable_disabled_test_count()));
-  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
-  OutputXmlAttribute(
-      stream, kTestsuites, "timestamp",
-      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
-  OutputXmlAttribute(stream, kTestsuites, "time",
-                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
-
-  if (GTEST_FLAG(shuffle)) {
-    OutputXmlAttribute(stream, kTestsuites, "random_seed",
-                       StreamableToString(unit_test.random_seed()));
-  }
-
-  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
-
-  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
-  *stream << ">\n";
-
-  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
-      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
-  }
-  *stream << "</" << kTestsuites << ">\n";
-}
-
-// Produces a string representing the test properties in a result as space
-// delimited XML attributes based on the property key="value" pairs.
-std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
-    const TestResult& result) {
-  Message attributes;
-  for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty& property = result.GetTestProperty(i);
-    attributes << " " << property.key() << "="
-        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
-  }
-  return attributes.GetString();
-}
-
-// End XmlUnitTestResultPrinter
-
-#if GTEST_CAN_STREAM_RESULTS_
-
-// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
-// replaces them by "%xx" where xx is their hexadecimal value. For
-// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
-// in both time and space -- important as the input str may contain an
-// arbitrarily long test failure message and stack trace.
-string StreamingListener::UrlEncode(const char* str) {
-  string result;
-  result.reserve(strlen(str) + 1);
-  for (char ch = *str; ch != '\0'; ch = *++str) {
-    switch (ch) {
-      case '%':
-      case '=':
-      case '&':
-      case '\n':
-        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
-        break;
-      default:
-        result.push_back(ch);
-        break;
-    }
-  }
-  return result;
-}
-
-void StreamingListener::SocketWriter::MakeConnection() {
-  GTEST_CHECK_(sockfd_ == -1)
-      << "MakeConnection() can't be called when there is already a connection.";
-
-  addrinfo hints;
-  memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
-  hints.ai_socktype = SOCK_STREAM;
-  addrinfo* servinfo = NULL;
-
-  // Use the getaddrinfo() to get a linked list of IP addresses for
-  // the given host name.
-  const int error_num = getaddrinfo(
-      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
-  if (error_num != 0) {
-    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
-                        << gai_strerror(error_num);
-  }
-
-  // Loop through all the results and connect to the first we can.
-  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
-       cur_addr = cur_addr->ai_next) {
-    sockfd_ = socket(
-        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
-    if (sockfd_ != -1) {
-      // Connect the client socket to the server socket.
-      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
-        close(sockfd_);
-        sockfd_ = -1;
-      }
-    }
-  }
-
-  freeaddrinfo(servinfo);  // all done with this structure
-
-  if (sockfd_ == -1) {
-    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
-                        << host_name_ << ":" << port_num_;
-  }
-}
-
-// End of class Streaming Listener
-#endif  // GTEST_CAN_STREAM_RESULTS__
-
-// Class ScopedTrace
-
-// Pushes the given source file location and message onto a per-thread
-// trace stack maintained by Google Test.
-ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
-    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
-  TraceInfo trace;
-  trace.file = file;
-  trace.line = line;
-  trace.message = message.GetString();
-
-  UnitTest::GetInstance()->PushGTestTrace(trace);
-}
-
-// Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
-    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
-  UnitTest::GetInstance()->PopGTestTrace();
-}
-
-
-// class OsStackTraceGetter
-
-// Returns the current OS stack trace as an std::string.  Parameters:
-//
-//   max_depth  - the maximum number of stack frames to be included
-//                in the trace.
-//   skip_count - the number of top frames to be skipped; doesn't count
-//                against max_depth.
-//
-string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */,
-                                             int /* skip_count */)
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  return "";
-}
-
-void OsStackTraceGetter::UponLeavingGTest()
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-}
-
-const char* const
-OsStackTraceGetter::kElidedFramesMarker =
-    "... " GTEST_NAME_ " internal frames ...";
-
-// A helper class that creates the premature-exit file in its
-// constructor and deletes the file in its destructor.
-class ScopedPrematureExitFile {
- public:
-  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
-      : premature_exit_filepath_(premature_exit_filepath) {
-    // If a path to the premature-exit file is specified...
-    if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
-      // create the file with a single "0" character in it.  I/O
-      // errors are ignored as there's nothing better we can do and we
-      // don't want to fail the test because of this.
-      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
-      fwrite("0", 1, 1, pfile);
-      fclose(pfile);
-    }
-  }
-
-  ~ScopedPrematureExitFile() {
-    if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
-      remove(premature_exit_filepath_);
-    }
-  }
-
- private:
-  const char* const premature_exit_filepath_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
-};
-
-}  // namespace internal
-
-// class TestEventListeners
-
-TestEventListeners::TestEventListeners()
-    : repeater_(new internal::TestEventRepeater()),
-      default_result_printer_(NULL),
-      default_xml_generator_(NULL) {
-}
-
-TestEventListeners::~TestEventListeners() { delete repeater_; }
-
-// Returns the standard listener responsible for the default console
-// output.  Can be removed from the listeners list to shut down default
-// console output.  Note that removing this object from the listener list
-// with Release transfers its ownership to the user.
-void TestEventListeners::Append(TestEventListener* listener) {
-  repeater_->Append(listener);
-}
-
-// Removes the given event listener from the list and returns it.  It then
-// becomes the caller's responsibility to delete the listener. Returns
-// NULL if the listener is not found in the list.
-TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
-  if (listener == default_result_printer_)
-    default_result_printer_ = NULL;
-  else if (listener == default_xml_generator_)
-    default_xml_generator_ = NULL;
-  return repeater_->Release(listener);
-}
-
-// Returns repeater that broadcasts the TestEventListener events to all
-// subscribers.
-TestEventListener* TestEventListeners::repeater() { return repeater_; }
-
-// Sets the default_result_printer attribute to the provided listener.
-// The listener is also added to the listener list and previous
-// default_result_printer is removed from it and deleted. The listener can
-// also be NULL in which case it will not be added to the list. Does
-// nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
-  if (default_result_printer_ != listener) {
-    // It is an error to pass this method a listener that is already in the
-    // list.
-    delete Release(default_result_printer_);
-    default_result_printer_ = listener;
-    if (listener != NULL)
-      Append(listener);
-  }
-}
-
-// Sets the default_xml_generator attribute to the provided listener.  The
-// listener is also added to the listener list and previous
-// default_xml_generator is removed from it and deleted. The listener can
-// also be NULL in which case it will not be added to the list. Does
-// nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
-  if (default_xml_generator_ != listener) {
-    // It is an error to pass this method a listener that is already in the
-    // list.
-    delete Release(default_xml_generator_);
-    default_xml_generator_ = listener;
-    if (listener != NULL)
-      Append(listener);
-  }
-}
-
-// Controls whether events will be forwarded by the repeater to the
-// listeners in the list.
-bool TestEventListeners::EventForwardingEnabled() const {
-  return repeater_->forwarding_enabled();
-}
-
-void TestEventListeners::SuppressEventForwarding() {
-  repeater_->set_forwarding_enabled(false);
-}
-
-// class UnitTest
-
-// Gets the singleton UnitTest object.  The first time this method is
-// called, a UnitTest object is constructed and returned.  Consecutive
-// calls will return the same object.
-//
-// We don't protect this under mutex_ as a user is not supposed to
-// call this before main() starts, from which point on the return
-// value will never change.
-UnitTest* UnitTest::GetInstance() {
-  // When compiled with MSVC 7.1 in optimized mode, destroying the
-  // UnitTest object upon exiting the program messes up the exit code,
-  // causing successful tests to appear failed.  We have to use a
-  // different implementation in this case to bypass the compiler bug.
-  // This implementation makes the compiler happy, at the cost of
-  // leaking the UnitTest object.
-
-  // CodeGear C++Builder insists on a public destructor for the
-  // default implementation.  Use this implementation to keep good OO
-  // design with private destructor.
-
-#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
-  static UnitTest* const instance = new UnitTest;
-  return instance;
-#else
-  static UnitTest instance;
-  return &instance;
-#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
-}
-
-// Gets the number of successful test cases.
-int UnitTest::successful_test_case_count() const {
-  return impl()->successful_test_case_count();
-}
-
-// Gets the number of failed test cases.
-int UnitTest::failed_test_case_count() const {
-  return impl()->failed_test_case_count();
-}
-
-// Gets the number of all test cases.
-int UnitTest::total_test_case_count() const {
-  return impl()->total_test_case_count();
-}
-
-// Gets the number of all test cases that contain at least one test
-// that should run.
-int UnitTest::test_case_to_run_count() const {
-  return impl()->test_case_to_run_count();
-}
-
-// Gets the number of successful tests.
-int UnitTest::successful_test_count() const {
-  return impl()->successful_test_count();
-}
-
-// Gets the number of failed tests.
-int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
-
-// Gets the number of disabled tests that will be reported in the XML report.
-int UnitTest::reportable_disabled_test_count() const {
-  return impl()->reportable_disabled_test_count();
-}
-
-// Gets the number of disabled tests.
-int UnitTest::disabled_test_count() const {
-  return impl()->disabled_test_count();
-}
-
-// Gets the number of tests to be printed in the XML report.
-int UnitTest::reportable_test_count() const {
-  return impl()->reportable_test_count();
-}
-
-// Gets the number of all tests.
-int UnitTest::total_test_count() const { return impl()->total_test_count(); }
-
-// Gets the number of tests that should run.
-int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
-
-// Gets the time of the test program start, in ms from the start of the
-// UNIX epoch.
-internal::TimeInMillis UnitTest::start_timestamp() const {
-    return impl()->start_timestamp();
-}
-
-// Gets the elapsed time, in milliseconds.
-internal::TimeInMillis UnitTest::elapsed_time() const {
-  return impl()->elapsed_time();
-}
-
-// Returns true iff the unit test passed (i.e. all test cases passed).
-bool UnitTest::Passed() const { return impl()->Passed(); }
-
-// Returns true iff the unit test failed (i.e. some test case failed
-// or something outside of all tests failed).
-bool UnitTest::Failed() const { return impl()->Failed(); }
-
-// Gets the i-th test case among all the test cases. i can range from 0 to
-// total_test_case_count() - 1. If i is not in that range, returns NULL.
-const TestCase* UnitTest::GetTestCase(int i) const {
-  return impl()->GetTestCase(i);
-}
-
-// Returns the TestResult containing information on test failures and
-// properties logged outside of individual test cases.
-const TestResult& UnitTest::ad_hoc_test_result() const {
-  return *impl()->ad_hoc_test_result();
-}
-
-// Gets the i-th test case among all the test cases. i can range from 0 to
-// total_test_case_count() - 1. If i is not in that range, returns NULL.
-TestCase* UnitTest::GetMutableTestCase(int i) {
-  return impl()->GetMutableTestCase(i);
-}
-
-// Returns the list of event listeners that can be used to track events
-// inside Google Test.
-TestEventListeners& UnitTest::listeners() {
-  return *impl()->listeners();
-}
-
-// Registers and returns a global test environment.  When a test
-// program is run, all global test environments will be set-up in the
-// order they were registered.  After all tests in the program have
-// finished, all global test environments will be torn-down in the
-// *reverse* order they were registered.
-//
-// The UnitTest object takes ownership of the given environment.
-//
-// We don't protect this under mutex_, as we only support calling it
-// from the main thread.
-Environment* UnitTest::AddEnvironment(Environment* env) {
-  if (env == NULL) {
-    return NULL;
-  }
-
-  impl_->environments().push_back(env);
-  return env;
-}
-
-// Adds a TestPartResult to the current TestResult object.  All Google Test
-// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
-// this to report their results.  The user code should use the
-// assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
-    TestPartResult::Type result_type,
-    const char* file_name,
-    int line_number,
-    const std::string& message,
-    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
-  Message msg;
-  msg << message;
-
-  internal::MutexLock lock(&mutex_);
-  if (impl_->gtest_trace_stack().size() > 0) {
-    msg << "\n" << GTEST_NAME_ << " trace:";
-
-    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
-         i > 0; --i) {
-      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
-      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
-          << " " << trace.message;
-    }
-  }
-
-  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
-    msg << internal::kStackTraceMarker << os_stack_trace;
-  }
-
-  const TestPartResult result =
-    TestPartResult(result_type, file_name, line_number,
-                   msg.GetString().c_str());
-  impl_->GetTestPartResultReporterForCurrentThread()->
-      ReportTestPartResult(result);
-
-  if (result_type != TestPartResult::kSuccess) {
-    // gtest_break_on_failure takes precedence over
-    // gtest_throw_on_failure.  This allows a user to set the latter
-    // in the code (perhaps in order to use Google Test assertions
-    // with another testing framework) and specify the former on the
-    // command line for debugging.
-    if (GTEST_FLAG(break_on_failure)) {
-#if GTEST_OS_WINDOWS
-      // Using DebugBreak on Windows allows gtest to still break into a debugger
-      // when a failure happens and both the --gtest_break_on_failure and
-      // the --gtest_catch_exceptions flags are specified.
-      DebugBreak();
-#else
-      // Dereference NULL through a volatile pointer to prevent the compiler
-      // from removing. We use this rather than abort() or __builtin_trap() for
-      // portability: Symbian doesn't implement abort() well, and some debuggers
-      // don't correctly trap abort().
-      *static_cast<volatile int*>(NULL) = 1;
-#endif  // GTEST_OS_WINDOWS
-    } else if (GTEST_FLAG(throw_on_failure)) {
-#if GTEST_HAS_EXCEPTIONS
-      throw internal::GoogleTestFailureException(result);
-#else
-      // We cannot call abort() as it generates a pop-up in debug mode
-      // that cannot be suppressed in VC 7.1 or below.
-      exit(1);
-#endif
-    }
-  }
-}
-
-// Adds a TestProperty to the current TestResult object when invoked from
-// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
-// from SetUpTestCase or TearDownTestCase, or to the global property set
-// when invoked elsewhere.  If the result already contains a property with
-// the same key, the value will be updated.
-void UnitTest::RecordProperty(const std::string& key,
-                              const std::string& value) {
-  impl_->RecordProperty(TestProperty(key, value));
-}
-
-// Runs all tests in this UnitTest object and prints the result.
-// Returns 0 if successful, or 1 otherwise.
-//
-// We don't protect this under mutex_, as we only support calling it
-// from the main thread.
-int UnitTest::Run() {
-  const bool in_death_test_child_process =
-      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
-
-  // Google Test implements this protocol for catching that a test
-  // program exits before returning control to Google Test:
-  //
-  //   1. Upon start, Google Test creates a file whose absolute path
-  //      is specified by the environment variable
-  //      TEST_PREMATURE_EXIT_FILE.
-  //   2. When Google Test has finished its work, it deletes the file.
-  //
-  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
-  // running a Google-Test-based test program and check the existence
-  // of the file at the end of the test execution to see if it has
-  // exited prematurely.
-
-  // If we are in the child process of a death test, don't
-  // create/delete the premature exit file, as doing so is unnecessary
-  // and will confuse the parent process.  Otherwise, create/delete
-  // the file upon entering/leaving this function.  If the program
-  // somehow exits before this function has a chance to return, the
-  // premature-exit file will be left undeleted, causing a test runner
-  // that understands the premature-exit-file protocol to report the
-  // test as having failed.
-  const internal::ScopedPrematureExitFile premature_exit_file(
-      in_death_test_child_process ?
-      NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
-
-  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
-  // used for the duration of the program.
-  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
-
-#if GTEST_HAS_SEH
-  // Either the user wants Google Test to catch exceptions thrown by the
-  // tests or this is executing in the context of death test child
-  // process. In either case the user does not want to see pop-up dialogs
-  // about crashes - they are expected.
-  if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE
-    // SetErrorMode doesn't exist on CE.
-    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
-                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif  // !GTEST_OS_WINDOWS_MOBILE
-
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
-    // Death test children can be terminated with _abort().  On Windows,
-    // _abort() can show a dialog with a warning message.  This forces the
-    // abort message to go to stderr instead.
-    _set_error_mode(_OUT_TO_STDERR);
-# endif
-
-# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
-    // In the debug version, Visual Studio pops up a separate dialog
-    // offering a choice to debug the aborted program. We need to suppress
-    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
-    // executed. Google Test will notify the user of any unexpected
-    // failure via stderr.
-    //
-    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
-    // Users of prior VC versions shall suffer the agony and pain of
-    // clicking through the countless debug dialogs.
-    // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
-    // debug mode when compiled with VC 7.1 or lower.
-    if (!GTEST_FLAG(break_on_failure))
-      _set_abort_behavior(
-          0x0,                                    // Clear the following flags:
-          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
-# endif
-  }
-#endif  // GTEST_HAS_SEH
-
-  return internal::HandleExceptionsInMethodIfSupported(
-      impl(),
-      &internal::UnitTestImpl::RunAllTests,
-      "auxiliary test code (environments or event listeners)") ? 0 : 1;
-}
-
-// Returns the working directory when the first TEST() or TEST_F() was
-// executed.
-const char* UnitTest::original_working_dir() const {
-  return impl_->original_working_dir_.c_str();
-}
-
-// Returns the TestCase object for the test that's currently running,
-// or NULL if no test is running.
-const TestCase* UnitTest::current_test_case() const
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_case();
-}
-
-// Returns the TestInfo object for the test that's currently running,
-// or NULL if no test is running.
-const TestInfo* UnitTest::current_test_info() const
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_info();
-}
-
-// Returns the random seed used at the start of the current test run.
-int UnitTest::random_seed() const { return impl_->random_seed(); }
-
-#if GTEST_HAS_PARAM_TEST
-// Returns ParameterizedTestCaseRegistry object used to keep track of
-// value-parameterized tests and instantiate and register them.
-internal::ParameterizedTestCaseRegistry&
-    UnitTest::parameterized_test_registry()
-        GTEST_LOCK_EXCLUDED_(mutex_) {
-  return impl_->parameterized_test_registry();
-}
-#endif  // GTEST_HAS_PARAM_TEST
-
-// Creates an empty UnitTest.
-UnitTest::UnitTest() {
-  impl_ = new internal::UnitTestImpl(this);
-}
-
-// Destructor of UnitTest.
-UnitTest::~UnitTest() {
-  delete impl_;
-}
-
-// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
-// Google Test trace stack.
-void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  impl_->gtest_trace_stack().push_back(trace);
-}
-
-// Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  impl_->gtest_trace_stack().pop_back();
-}
-
-namespace internal {
-
-UnitTestImpl::UnitTestImpl(UnitTest* parent)
-    : parent_(parent),
-#ifdef _MSC_VER
-# pragma warning(push)                    // Saves the current warning state.
-# pragma warning(disable:4355)            // Temporarily disables warning 4355
-                                         // (using this in initializer).
-      default_global_test_part_result_reporter_(this),
-      default_per_thread_test_part_result_reporter_(this),
-# pragma warning(pop)                     // Restores the warning state again.
-#else
-      default_global_test_part_result_reporter_(this),
-      default_per_thread_test_part_result_reporter_(this),
-#endif  // _MSC_VER
-      global_test_part_result_repoter_(
-          &default_global_test_part_result_reporter_),
-      per_thread_test_part_result_reporter_(
-          &default_per_thread_test_part_result_reporter_),
-#if GTEST_HAS_PARAM_TEST
-      parameterized_test_registry_(),
-      parameterized_tests_registered_(false),
-#endif  // GTEST_HAS_PARAM_TEST
-      last_death_test_case_(-1),
-      current_test_case_(NULL),
-      current_test_info_(NULL),
-      ad_hoc_test_result_(),
-      os_stack_trace_getter_(NULL),
-      post_flag_parse_init_performed_(false),
-      random_seed_(0),  // Will be overridden by the flag before first use.
-      random_(0),  // Will be reseeded before first use.
-      start_timestamp_(0),
-      elapsed_time_(0),
-#if GTEST_HAS_DEATH_TEST
-      death_test_factory_(new DefaultDeathTestFactory),
-#endif
-      // Will be overridden by the flag before first use.
-      catch_exceptions_(false) {
-  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
-}
-
-UnitTestImpl::~UnitTestImpl() {
-  // Deletes every TestCase.
-  ForEach(test_cases_, internal::Delete<TestCase>);
-
-  // Deletes every Environment.
-  ForEach(environments_, internal::Delete<Environment>);
-
-  delete os_stack_trace_getter_;
-}
-
-// Adds a TestProperty to the current TestResult object when invoked in a
-// context of a test, to current test case's ad_hoc_test_result when invoke
-// from SetUpTestCase/TearDownTestCase, or to the global property set
-// otherwise.  If the result already contains a property with the same key,
-// the value will be updated.
-void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
-  std::string xml_element;
-  TestResult* test_result;  // TestResult appropriate for property recording.
-
-  if (current_test_info_ != NULL) {
-    xml_element = "testcase";
-    test_result = &(current_test_info_->result_);
-  } else if (current_test_case_ != NULL) {
-    xml_element = "testsuite";
-    test_result = &(current_test_case_->ad_hoc_test_result_);
-  } else {
-    xml_element = "testsuites";
-    test_result = &ad_hoc_test_result_;
-  }
-  test_result->RecordProperty(xml_element, test_property);
-}
-
-#if GTEST_HAS_DEATH_TEST
-// Disables event forwarding if the control is currently in a death test
-// subprocess. Must not be called before InitGoogleTest.
-void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
-  if (internal_run_death_test_flag_.get() != NULL)
-    listeners()->SuppressEventForwarding();
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-// Initializes event listeners performing XML output as specified by
-// UnitTestOptions. Must not be called before InitGoogleTest.
-void UnitTestImpl::ConfigureXmlOutput() {
-  const std::string& output_format = UnitTestOptions::GetOutputFormat();
-  if (output_format == "xml") {
-    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
-        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
-  } else if (output_format != "") {
-    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
-           output_format.c_str());
-    fflush(stdout);
-  }
-}
-
-#if GTEST_CAN_STREAM_RESULTS_
-// Initializes event listeners for streaming test results in string form.
-// Must not be called before InitGoogleTest.
-void UnitTestImpl::ConfigureStreamingOutput() {
-  const std::string& target = GTEST_FLAG(stream_result_to);
-  if (!target.empty()) {
-    const size_t pos = target.find(':');
-    if (pos != std::string::npos) {
-      listeners()->Append(new StreamingListener(target.substr(0, pos),
-                                                target.substr(pos+1)));
-    } else {
-      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
-             target.c_str());
-      fflush(stdout);
-    }
-  }
-}
-#endif  // GTEST_CAN_STREAM_RESULTS_
-
-// Performs initialization dependent upon flag values obtained in
-// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
-// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
-// this function is also called from RunAllTests.  Since this function can be
-// called more than once, it has to be idempotent.
-void UnitTestImpl::PostFlagParsingInit() {
-  // Ensures that this function does not execute more than once.
-  if (!post_flag_parse_init_performed_) {
-    post_flag_parse_init_performed_ = true;
-
-#if GTEST_HAS_DEATH_TEST
-    InitDeathTestSubprocessControlInfo();
-    SuppressTestEventsIfInSubprocess();
-#endif  // GTEST_HAS_DEATH_TEST
-
-    // Registers parameterized tests. This makes parameterized tests
-    // available to the UnitTest reflection API without running
-    // RUN_ALL_TESTS.
-    RegisterParameterizedTests();
-
-    // Configures listeners for XML output. This makes it possible for users
-    // to shut down the default XML output before invoking RUN_ALL_TESTS.
-    ConfigureXmlOutput();
-
-#if GTEST_CAN_STREAM_RESULTS_
-    // Configures listeners for streaming test results to the specified server.
-    ConfigureStreamingOutput();
-#endif  // GTEST_CAN_STREAM_RESULTS_
-  }
-}
-
-// A predicate that checks the name of a TestCase against a known
-// value.
-//
-// This is used for implementation of the UnitTest class only.  We put
-// it in the anonymous namespace to prevent polluting the outer
-// namespace.
-//
-// TestCaseNameIs is copyable.
-class TestCaseNameIs {
- public:
-  // Constructor.
-  explicit TestCaseNameIs(const std::string& name)
-      : name_(name) {}
-
-  // Returns true iff the name of test_case matches name_.
-  bool operator()(const TestCase* test_case) const {
-    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
-  }
-
- private:
-  std::string name_;
-};
-
-// Finds and returns a TestCase with the given name.  If one doesn't
-// exist, creates one and returns it.  It's the CALLER'S
-// RESPONSIBILITY to ensure that this function is only called WHEN THE
-// TESTS ARE NOT SHUFFLED.
-//
-// Arguments:
-//
-//   test_case_name: name of the test case
-//   type_param:     the name of the test case's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test case.
-//   set_up_tc:      pointer to the function that sets up the test case
-//   tear_down_tc:   pointer to the function that tears down the test case
-TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
-                                    const char* type_param,
-                                    Test::SetUpTestCaseFunc set_up_tc,
-                                    Test::TearDownTestCaseFunc tear_down_tc) {
-  // Can we find a TestCase with the given name?
-  const std::vector<TestCase*>::const_iterator test_case =
-      std::find_if(test_cases_.begin(), test_cases_.end(),
-                   TestCaseNameIs(test_case_name));
-
-  if (test_case != test_cases_.end())
-    return *test_case;
-
-  // No.  Let's create one.
-  TestCase* const new_test_case =
-      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
-
-  // Is this a death test case?
-  if (internal::UnitTestOptions::MatchesFilter(test_case_name,
-                                               kDeathTestCaseFilter)) {
-    // Yes.  Inserts the test case after the last death test case
-    // defined so far.  This only works when the test cases haven't
-    // been shuffled.  Otherwise we may end up running a death test
-    // after a non-death test.
-    ++last_death_test_case_;
-    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
-                       new_test_case);
-  } else {
-    // No.  Appends to the end of the list.
-    test_cases_.push_back(new_test_case);
-  }
-
-  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
-  return new_test_case;
-}
-
-// Helpers for setting up / tearing down the given environment.  They
-// are for use in the ForEach() function.
-static void SetUpEnvironment(Environment* env) { env->SetUp(); }
-static void TearDownEnvironment(Environment* env) { env->TearDown(); }
-
-// Runs all tests in this UnitTest object, prints the result, and
-// returns true if all tests are successful.  If any exception is
-// thrown during a test, the test is considered to be failed, but the
-// rest of the tests will still be run.
-//
-// When parameterized tests are enabled, it expands and registers
-// parameterized tests first in RegisterParameterizedTests().
-// All other functions called from RunAllTests() may safely assume that
-// parameterized tests are ready to be counted and run.
-bool UnitTestImpl::RunAllTests() {
-  // Makes sure InitGoogleTest() was called.
-  if (!GTestIsInitialized()) {
-    printf("%s",
-           "\nThis test program did NOT call ::testing::InitGoogleTest "
-           "before calling RUN_ALL_TESTS().  Please fix it.\n");
-    return false;
-  }
-
-  // Do not run any test if the --help flag was specified.
-  if (g_help_flag)
-    return true;
-
-  // Repeats the call to the post-flag parsing initialization in case the
-  // user didn't call InitGoogleTest.
-  PostFlagParsingInit();
-
-  // Even if sharding is not on, test runners may want to use the
-  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
-  // protocol.
-  internal::WriteToShardStatusFileIfNeeded();
-
-  // True iff we are in a subprocess for running a thread-safe-style
-  // death test.
-  bool in_subprocess_for_death_test = false;
-
-#if GTEST_HAS_DEATH_TEST
-  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
-#endif  // GTEST_HAS_DEATH_TEST
-
-  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
-                                        in_subprocess_for_death_test);
-
-  // Compares the full test names with the filter to decide which
-  // tests to run.
-  const bool has_tests_to_run = FilterTests(should_shard
-                                              ? HONOR_SHARDING_PROTOCOL
-                                              : IGNORE_SHARDING_PROTOCOL) > 0;
-
-  // Lists the tests and exits if the --gtest_list_tests flag was specified.
-  if (GTEST_FLAG(list_tests)) {
-    // This must be called *after* FilterTests() has been called.
-    ListTestsMatchingFilter();
-    return true;
-  }
-
-  random_seed_ = GTEST_FLAG(shuffle) ?
-      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
-
-  // True iff at least one test has failed.
-  bool failed = false;
-
-  TestEventListener* repeater = listeners()->repeater();
-
-  start_timestamp_ = GetTimeInMillis();
-  repeater->OnTestProgramStart(*parent_);
-
-  // How many times to repeat the tests?  We don't want to repeat them
-  // when we are inside the subprocess of a death test.
-  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
-  // Repeats forever if the repeat count is negative.
-  const bool forever = repeat < 0;
-  for (int i = 0; forever || i != repeat; i++) {
-    // We want to preserve failures generated by ad-hoc test
-    // assertions executed before RUN_ALL_TESTS().
-    ClearNonAdHocTestResult();
-
-    const TimeInMillis start = GetTimeInMillis();
-
-    // Shuffles test cases and tests if requested.
-    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
-      random()->Reseed(random_seed_);
-      // This should be done before calling OnTestIterationStart(),
-      // such that a test event listener can see the actual test order
-      // in the event.
-      ShuffleTests();
-    }
-
-    // Tells the unit test event listeners that the tests are about to start.
-    repeater->OnTestIterationStart(*parent_, i);
-
-    // Runs each test case if there is at least one test to run.
-    if (has_tests_to_run) {
-      // Sets up all environments beforehand.
-      repeater->OnEnvironmentsSetUpStart(*parent_);
-      ForEach(environments_, SetUpEnvironment);
-      repeater->OnEnvironmentsSetUpEnd(*parent_);
-
-      // Runs the tests only if there was no fatal failure during global
-      // set-up.
-      if (!Test::HasFatalFailure()) {
-        for (int test_index = 0; test_index < total_test_case_count();
-             test_index++) {
-          GetMutableTestCase(test_index)->Run();
-        }
-      }
-
-      // Tears down all environments in reverse order afterwards.
-      repeater->OnEnvironmentsTearDownStart(*parent_);
-      std::for_each(environments_.rbegin(), environments_.rend(),
-                    TearDownEnvironment);
-      repeater->OnEnvironmentsTearDownEnd(*parent_);
-    }
-
-    elapsed_time_ = GetTimeInMillis() - start;
-
-    // Tells the unit test event listener that the tests have just finished.
-    repeater->OnTestIterationEnd(*parent_, i);
-
-    // Gets the result and clears it.
-    if (!Passed()) {
-      failed = true;
-    }
-
-    // Restores the original test order after the iteration.  This
-    // allows the user to quickly repro a failure that happens in the
-    // N-th iteration without repeating the first (N - 1) iterations.
-    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
-    // case the user somehow changes the value of the flag somewhere
-    // (it's always safe to unshuffle the tests).
-    UnshuffleTests();
-
-    if (GTEST_FLAG(shuffle)) {
-      // Picks a new random seed for each iteration.
-      random_seed_ = GetNextRandomSeed(random_seed_);
-    }
-  }
-
-  repeater->OnTestProgramEnd(*parent_);
-
-  return !failed;
-}
-
-// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
-// if the variable is present. If a file already exists at this location, this
-// function will write over it. If the variable is present, but the file cannot
-// be created, prints an error and exits.
-void WriteToShardStatusFileIfNeeded() {
-  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
-  if (test_shard_file != NULL) {
-    FILE* const file = posix::FOpen(test_shard_file, "w");
-    if (file == NULL) {
-      ColoredPrintf(COLOR_RED,
-                    "Could not write to the test shard status file \"%s\" "
-                    "specified by the %s environment variable.\n",
-                    test_shard_file, kTestShardStatusFile);
-      fflush(stdout);
-      exit(EXIT_FAILURE);
-    }
-    fclose(file);
-  }
-}
-
-// Checks whether sharding is enabled by examining the relevant
-// environment variable values. If the variables are present,
-// but inconsistent (i.e., shard_index >= total_shards), prints
-// an error and exits. If in_subprocess_for_death_test, sharding is
-// disabled because it must only be applied to the original test
-// process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
-                 const char* shard_index_env,
-                 bool in_subprocess_for_death_test) {
-  if (in_subprocess_for_death_test) {
-    return false;
-  }
-
-  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
-  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
-
-  if (total_shards == -1 && shard_index == -1) {
-    return false;
-  } else if (total_shards == -1 && shard_index != -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestShardIndex << " = " << shard_index
-      << ", but have left " << kTestTotalShards << " unset.\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  } else if (total_shards != -1 && shard_index == -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestTotalShards << " = " << total_shards
-      << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  } else if (shard_index < 0 || shard_index >= total_shards) {
-    const Message msg = Message()
-      << "Invalid environment variables: we require 0 <= "
-      << kTestShardIndex << " < " << kTestTotalShards
-      << ", but you have " << kTestShardIndex << "=" << shard_index
-      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  }
-
-  return total_shards > 1;
-}
-
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error
-// and aborts.
-Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
-  const char* str_val = posix::GetEnv(var);
-  if (str_val == NULL) {
-    return default_val;
-  }
-
-  Int32 result;
-  if (!ParseInt32(Message() << "The value of environment variable " << var,
-                  str_val, &result)) {
-    exit(EXIT_FAILURE);
-  }
-  return result;
-}
-
-// Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
-// method. Assumes that 0 <= shard_index < total_shards.
-bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
-  return (test_id % total_shards) == shard_index;
-}
-
-// Compares the name of each test with the user-specified filter to
-// decide whether the test should be run, then records the result in
-// each TestCase and TestInfo object.
-// If shard_tests == true, further filters tests based on sharding
-// variables in the environment - see
-// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
-// Returns the number of tests that should run.
-int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
-  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
-  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
-  // num_runnable_tests are the number of tests that will
-  // run across all shards (i.e., match filter and are not disabled).
-  // num_selected_tests are the number of tests to be run on
-  // this shard.
-  int num_runnable_tests = 0;
-  int num_selected_tests = 0;
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    TestCase* const test_case = test_cases_[i];
-    const std::string &test_case_name = test_case->name();
-    test_case->set_should_run(false);
-
-    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
-      TestInfo* const test_info = test_case->test_info_list()[j];
-      const std::string test_name(test_info->name());
-      // A test is disabled if test case name or test name matches
-      // kDisableTestFilter.
-      const bool is_disabled =
-          internal::UnitTestOptions::MatchesFilter(test_case_name,
-                                                   kDisableTestFilter) ||
-          internal::UnitTestOptions::MatchesFilter(test_name,
-                                                   kDisableTestFilter);
-      test_info->is_disabled_ = is_disabled;
-
-      const bool matches_filter =
-          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
-                                                       test_name);
-      test_info->matches_filter_ = matches_filter;
-
-      const bool is_runnable =
-          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
-          matches_filter;
-
-      const bool is_selected = is_runnable &&
-          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
-           ShouldRunTestOnShard(total_shards, shard_index,
-                                num_runnable_tests));
-
-      num_runnable_tests += is_runnable;
-      num_selected_tests += is_selected;
-
-      test_info->should_run_ = is_selected;
-      test_case->set_should_run(test_case->should_run() || is_selected);
-    }
-  }
-  return num_selected_tests;
-}
-
-// Prints the given C-string on a single line by replacing all '\n'
-// characters with string "\\n".  If the output takes more than
-// max_length characters, only prints the first max_length characters
-// and "...".
-static void PrintOnOneLine(const char* str, int max_length) {
-  if (str != NULL) {
-    for (int i = 0; *str != '\0'; ++str) {
-      if (i >= max_length) {
-        printf("...");
-        break;
-      }
-      if (*str == '\n') {
-        printf("\\n");
-        i += 2;
-      } else {
-        printf("%c", *str);
-        ++i;
-      }
-    }
-  }
-}
-
-// Prints the names of the tests matching the user-specified filter flag.
-void UnitTestImpl::ListTestsMatchingFilter() {
-  // Print at most this many characters for each type/value parameter.
-  const int kMaxParamLength = 250;
-
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    const TestCase* const test_case = test_cases_[i];
-    bool printed_test_case_name = false;
-
-    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
-      const TestInfo* const test_info =
-          test_case->test_info_list()[j];
-      if (test_info->matches_filter_) {
-        if (!printed_test_case_name) {
-          printed_test_case_name = true;
-          printf("%s.", test_case->name());
-          if (test_case->type_param() != NULL) {
-            printf("  # %s = ", kTypeParamLabel);
-            // We print the type parameter on a single line to make
-            // the output easy to parse by a program.
-            PrintOnOneLine(test_case->type_param(), kMaxParamLength);
-          }
-          printf("\n");
-        }
-        printf("  %s", test_info->name());
-        if (test_info->value_param() != NULL) {
-          printf("  # %s = ", kValueParamLabel);
-          // We print the value parameter on a single line to make the
-          // output easy to parse by a program.
-          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
-        }
-        printf("\n");
-      }
-    }
-  }
-  fflush(stdout);
-}
-
-// Sets the OS stack trace getter.
-//
-// Does nothing if the input and the current OS stack trace getter are
-// the same; otherwise, deletes the old getter and makes the input the
-// current getter.
-void UnitTestImpl::set_os_stack_trace_getter(
-    OsStackTraceGetterInterface* getter) {
-  if (os_stack_trace_getter_ != getter) {
-    delete os_stack_trace_getter_;
-    os_stack_trace_getter_ = getter;
-  }
-}
-
-// Returns the current OS stack trace getter if it is not NULL;
-// otherwise, creates an OsStackTraceGetter, makes it the current
-// getter, and returns it.
-OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
-  if (os_stack_trace_getter_ == NULL) {
-    os_stack_trace_getter_ = new OsStackTraceGetter;
-  }
-
-  return os_stack_trace_getter_;
-}
-
-// Returns the TestResult for the test that's currently running, or
-// the TestResult for the ad hoc test if no test is running.
-TestResult* UnitTestImpl::current_test_result() {
-  return current_test_info_ ?
-      &(current_test_info_->result_) : &ad_hoc_test_result_;
-}
-
-// Shuffles all test cases, and the tests within each test case,
-// making sure that death tests are still run first.
-void UnitTestImpl::ShuffleTests() {
-  // Shuffles the death test cases.
-  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
-
-  // Shuffles the non-death test cases.
-  ShuffleRange(random(), last_death_test_case_ + 1,
-               static_cast<int>(test_cases_.size()), &test_case_indices_);
-
-  // Shuffles the tests inside each test case.
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    test_cases_[i]->ShuffleTests(random());
-  }
-}
-
-// Restores the test cases and tests to their order before the first shuffle.
-void UnitTestImpl::UnshuffleTests() {
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    // Unshuffles the tests in each test case.
-    test_cases_[i]->UnshuffleTests();
-    // Resets the index of each test case.
-    test_case_indices_[i] = static_cast<int>(i);
-  }
-}
-
-// Returns the current OS stack trace as an std::string.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
-// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
-                                            int skip_count) {
-  // We pass skip_count + 1 to skip this wrapper function in addition
-  // to what the user really wants to skip.
-  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
-}
-
-// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
-// suppress unreachable code warnings.
-namespace {
-class ClassUniqueToAlwaysTrue {};
-}
-
-bool IsTrue(bool condition) { return condition; }
-
-bool AlwaysTrue() {
-#if GTEST_HAS_EXCEPTIONS
-  // This condition is always false so AlwaysTrue() never actually throws,
-  // but it makes the compiler think that it may throw.
-  if (IsTrue(false))
-    throw ClassUniqueToAlwaysTrue();
-#endif  // GTEST_HAS_EXCEPTIONS
-  return true;
-}
-
-// If *pstr starts with the given prefix, modifies *pstr to be right
-// past the prefix and returns true; otherwise leaves *pstr unchanged
-// and returns false.  None of pstr, *pstr, and prefix can be NULL.
-bool SkipPrefix(const char* prefix, const char** pstr) {
-  const size_t prefix_len = strlen(prefix);
-  if (strncmp(*pstr, prefix, prefix_len) == 0) {
-    *pstr += prefix_len;
-    return true;
-  }
-  return false;
-}
-
-// Parses a string as a command line flag.  The string should have
-// the format "--flag=value".  When def_optional is true, the "=value"
-// part can be omitted.
-//
-// Returns the value of the flag, or NULL if the parsing failed.
-const char* ParseFlagValue(const char* str,
-                           const char* flag,
-                           bool def_optional) {
-  // str and flag must not be NULL.
-  if (str == NULL || flag == NULL) return NULL;
-
-  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
-  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
-  const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
-
-  // Skips the flag name.
-  const char* flag_end = str + flag_len;
-
-  // When def_optional is true, it's OK to not have a "=value" part.
-  if (def_optional && (flag_end[0] == '\0')) {
-    return flag_end;
-  }
-
-  // If def_optional is true and there are more characters after the
-  // flag name, or if def_optional is false, there must be a '=' after
-  // the flag name.
-  if (flag_end[0] != '=') return NULL;
-
-  // Returns the string after "=".
-  return flag_end + 1;
-}
-
-// Parses a string for a bool flag, in the form of either
-// "--flag=value" or "--flag".
-//
-// In the former case, the value is taken as true as long as it does
-// not start with '0', 'f', or 'F'.
-//
-// In the latter case, the value is taken as true.
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, true);
-
-  // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
-
-  // Converts the string value to a bool.
-  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
-  return true;
-}
-
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
-
-  // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
-
-  // Sets *value to the value of the flag.
-  return ParseInt32(Message() << "The value of flag --" << flag,
-                    value_str, value);
-}
-
-// Parses a string for a string flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
-
-  // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
-
-  // Sets *value to the value of the flag.
-  *value = value_str;
-  return true;
-}
-
-// Determines whether a string has a prefix that Google Test uses for its
-// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
-// If Google Test detects that a command line flag has its prefix but is not
-// recognized, it will print its help message. Flags starting with
-// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
-// internal flags and do not trigger the help message.
-static bool HasGoogleTestFlagPrefix(const char* str) {
-  return (SkipPrefix("--", &str) ||
-          SkipPrefix("-", &str) ||
-          SkipPrefix("/", &str)) &&
-         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
-         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
-          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
-}
-
-// Prints a string containing code-encoded text.  The following escape
-// sequences can be used in the string to control the text color:
-//
-//   @@    prints a single '@' character.
-//   @R    changes the color to red.
-//   @G    changes the color to green.
-//   @Y    changes the color to yellow.
-//   @D    changes to the default terminal text color.
-//
-// TODO(wan@google.com): Write tests for this once we add stdout
-// capturing to Google Test.
-static void PrintColorEncoded(const char* str) {
-  GTestColor color = COLOR_DEFAULT;  // The current color.
-
-  // Conceptually, we split the string into segments divided by escape
-  // sequences.  Then we print one segment at a time.  At the end of
-  // each iteration, the str pointer advances to the beginning of the
-  // next segment.
-  for (;;) {
-    const char* p = strchr(str, '@');
-    if (p == NULL) {
-      ColoredPrintf(color, "%s", str);
-      return;
-    }
-
-    ColoredPrintf(color, "%s", std::string(str, p).c_str());
-
-    const char ch = p[1];
-    str = p + 2;
-    if (ch == '@') {
-      ColoredPrintf(color, "@");
-    } else if (ch == 'D') {
-      color = COLOR_DEFAULT;
-    } else if (ch == 'R') {
-      color = COLOR_RED;
-    } else if (ch == 'G') {
-      color = COLOR_GREEN;
-    } else if (ch == 'Y') {
-      color = COLOR_YELLOW;
-    } else {
-      --str;
-    }
-  }
-}
-
-static const char kColorEncodedHelpMessage[] =
-"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
-"following command line flags to control its behavior:\n"
-"\n"
-"Test Selection:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
-"      List the names of all tests instead of running them. The name of\n"
-"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
-"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
-    "[@G-@YNEGATIVE_PATTERNS]@D\n"
-"      Run only the tests whose name matches one of the positive patterns but\n"
-"      none of the negative patterns. '?' matches any single character; '*'\n"
-"      matches any substring; ':' separates two patterns.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
-"      Run all disabled tests too.\n"
-"\n"
-"Test Execution:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
-"      Run the tests repeatedly; use a negative count to repeat forever.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
-"      Randomize tests' orders on every iteration.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
-"      Random number seed to use for shuffling test orders (between 1 and\n"
-"      99999, or 0 to use a seed based on the current time).\n"
-"\n"
-"Test Output:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
-"      Enable/disable colored output. The default is @Gauto@D.\n"
-"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
-"      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
-    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate an XML report in the given directory or with the given file\n"
-"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
-#if GTEST_CAN_STREAM_RESULTS_
-"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
-"      Stream test results to the given server.\n"
-#endif  // GTEST_CAN_STREAM_RESULTS_
-"\n"
-"Assertion Behavior:\n"
-#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
-"      Set the default death test style.\n"
-#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
-"      Turn assertion failures into debugger break-points.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
-"      Do not report exceptions as test failures. Instead, allow them\n"
-"      to crash the program or throw a pop-up (on Windows).\n"
-"\n"
-"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
-    "the corresponding\n"
-"environment variable of a flag (all letters in upper-case). For example, to\n"
-"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
-    "color=no@D or set\n"
-"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
-"\n"
-"For more information, please read the " GTEST_NAME_ " documentation at\n"
-"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.  The type parameter CharType can be
-// instantiated to either char or wchar_t.
-template <typename CharType>
-void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
-  for (int i = 1; i < *argc; i++) {
-    const std::string arg_string = StreamableToString(argv[i]);
-    const char* const arg = arg_string.c_str();
-
-    using internal::ParseBoolFlag;
-    using internal::ParseInt32Flag;
-    using internal::ParseStringFlag;
-
-    // Do we see a Google Test flag?
-    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
-                      &GTEST_FLAG(also_run_disabled_tests)) ||
-        ParseBoolFlag(arg, kBreakOnFailureFlag,
-                      &GTEST_FLAG(break_on_failure)) ||
-        ParseBoolFlag(arg, kCatchExceptionsFlag,
-                      &GTEST_FLAG(catch_exceptions)) ||
-        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-        ParseStringFlag(arg, kDeathTestStyleFlag,
-                        &GTEST_FLAG(death_test_style)) ||
-        ParseBoolFlag(arg, kDeathTestUseFork,
-                      &GTEST_FLAG(death_test_use_fork)) ||
-        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-        ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                        &GTEST_FLAG(internal_run_death_test)) ||
-        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-        ParseInt32Flag(arg, kStackTraceDepthFlag,
-                       &GTEST_FLAG(stack_trace_depth)) ||
-        ParseStringFlag(arg, kStreamResultToFlag,
-                        &GTEST_FLAG(stream_result_to)) ||
-        ParseBoolFlag(arg, kThrowOnFailureFlag,
-                      &GTEST_FLAG(throw_on_failure))
-        ) {
-      // Yes.  Shift the remainder of the argv list left by one.  Note
-      // that argv has (*argc + 1) elements, the last one always being
-      // NULL.  The following loop moves the trailing NULL element as
-      // well.
-      for (int j = i; j != *argc; j++) {
-        argv[j] = argv[j + 1];
-      }
-
-      // Decrements the argument count.
-      (*argc)--;
-
-      // We also need to decrement the iterator as we just removed
-      // an element.
-      i--;
-    } else if (arg_string == "--help" || arg_string == "-h" ||
-               arg_string == "-?" || arg_string == "/?" ||
-               HasGoogleTestFlagPrefix(arg)) {
-      // Both help flag and unrecognized Google Test flags (excluding
-      // internal ones) trigger help display.
-      g_help_flag = true;
-    }
-  }
-
-  if (g_help_flag) {
-    // We print the help here instead of in RUN_ALL_TESTS(), as the
-    // latter may not be called at all if the user is using Google
-    // Test with another testing framework.
-    PrintColorEncoded(kColorEncodedHelpMessage);
-  }
-}
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-
-// The internal implementation of InitGoogleTest().
-//
-// The type parameter CharType can be instantiated to either char or
-// wchar_t.
-template <typename CharType>
-void InitGoogleTestImpl(int* argc, CharType** argv) {
-  g_init_gtest_count++;
-
-  // We don't want to run the initialization code twice.
-  if (g_init_gtest_count != 1) return;
-
-  if (*argc <= 0) return;
-
-  internal::g_executable_path = internal::StreamableToString(argv[0]);
-
-#if GTEST_HAS_DEATH_TEST
-
-  g_argvs.clear();
-  for (int i = 0; i != *argc; i++) {
-    g_argvs.push_back(StreamableToString(argv[i]));
-  }
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-  ParseGoogleTestFlagsOnly(argc, argv);
-  GetUnitTestImpl()->PostFlagParsingInit();
-}
-
-}  // namespace internal
-
-// Initializes Google Test.  This must be called before calling
-// RUN_ALL_TESTS().  In particular, it parses a command line for the
-// flags that Google Test recognizes.  Whenever a Google Test flag is
-// seen, it is removed from argv, and *argc is decremented.
-//
-// No value is returned.  Instead, the Google Test flag variables are
-// updated.
-//
-// Calling the function for the second time has no user-visible effect.
-void InitGoogleTest(int* argc, char** argv) {
-  internal::InitGoogleTestImpl(argc, argv);
-}
-
-// This overloaded version can be used in Windows programs compiled in
-// UNICODE mode.
-void InitGoogleTest(int* argc, wchar_t** argv) {
-  internal::InitGoogleTestImpl(argc, argv);
-}
-
-}  // namespace testing
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
-//
-// This file implements death tests.
-
-
-#if GTEST_HAS_DEATH_TEST
-
-# if GTEST_OS_MAC
-#  include <crt_externs.h>
-# endif  // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-
-# if GTEST_OS_LINUX
-#  include <signal.h>
-# endif  // GTEST_OS_LINUX
-
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-#  include <windows.h>
-# else
-#  include <sys/mman.h>
-#  include <sys/wait.h>
-# endif  // GTEST_OS_WINDOWS
-
-# if GTEST_OS_QNX
-#  include <spawn.h>
-# endif  // GTEST_OS_QNX
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#undef GTEST_IMPLEMENTATION_
-
-namespace testing {
-
-// Constants.
-
-// The default death test style.
-static const char kDefaultDeathTestStyle[] = "fast";
-
-GTEST_DEFINE_string_(
-    death_test_style,
-    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
-    "Indicates how to run a death test in a forked child process: "
-    "\"threadsafe\" (child process re-executes the test binary "
-    "from the beginning, running only the specific death test) or "
-    "\"fast\" (child process runs the death test immediately "
-    "after forking).");
-
-GTEST_DEFINE_bool_(
-    death_test_use_fork,
-    internal::BoolFromGTestEnv("death_test_use_fork", false),
-    "Instructs to use fork()/_exit() instead of clone() in death tests. "
-    "Ignored and always uses fork() on POSIX systems where clone() is not "
-    "implemented. Useful when running under valgrind or similar tools if "
-    "those do not support clone(). Valgrind 3.3.1 will just fail if "
-    "it sees an unsupported combination of clone() flags. "
-    "It is not recommended to use this flag w/o valgrind though it will "
-    "work in 99% of the cases. Once valgrind is fixed, this flag will "
-    "most likely be removed.");
-
-namespace internal {
-GTEST_DEFINE_string_(
-    internal_run_death_test, "",
-    "Indicates the file, line number, temporal index of "
-    "the single death test to run, and a file descriptor to "
-    "which a success code may be sent, all separated by "
-    "the '|' characters.  This flag is specified if and only if the current "
-    "process is a sub-process launched for running a thread-safe "
-    "death test.  FOR INTERNAL USE ONLY.");
-}  // namespace internal
-
-#if GTEST_HAS_DEATH_TEST
-
-namespace internal {
-
-# if !GTEST_OS_WINDOWS
-// Valid only for fast death tests. Indicates the code is running in the
-// child process of a fast style death test.
-static bool g_in_fast_death_test_child = false;
-# endif  // !GTEST_OS_WINDOWS
-
-// Returns a Boolean value indicating whether the caller is currently
-// executing in the context of the death test child process.  Tools such as
-// Valgrind heap checkers may need this to modify their behavior in death
-// tests.  IMPORTANT: This is an internal utility.  Using it may break the
-// implementation of death tests.  User code MUST NOT use it.
-bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS
-
-  // On Windows, death tests are thread-safe regardless of the value of the
-  // death_test_style flag.
-  return !GTEST_FLAG(internal_run_death_test).empty();
-
-# else
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe")
-    return !GTEST_FLAG(internal_run_death_test).empty();
-  else
-    return g_in_fast_death_test_child;
-#endif
-}
-
-}  // namespace internal
-
-// ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
-
-// ExitedWithCode function-call operator.
-bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS
-
-  return exit_status == exit_code_;
-
-# else
-
-  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
-
-# endif  // GTEST_OS_WINDOWS
-}
-
-# if !GTEST_OS_WINDOWS
-// KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
-
-// KilledBySignal function-call operator.
-bool KilledBySignal::operator()(int exit_status) const {
-  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
-}
-# endif  // !GTEST_OS_WINDOWS
-
-namespace internal {
-
-// Utilities needed for death tests.
-
-// Generates a textual description of a given exit code, in the format
-// specified by wait(2).
-static std::string ExitSummary(int exit_code) {
-  Message m;
-
-# if GTEST_OS_WINDOWS
-
-  m << "Exited with exit status " << exit_code;
-
-# else
-
-  if (WIFEXITED(exit_code)) {
-    m << "Exited with exit status " << WEXITSTATUS(exit_code);
-  } else if (WIFSIGNALED(exit_code)) {
-    m << "Terminated by signal " << WTERMSIG(exit_code);
-  }
-#  ifdef WCOREDUMP
-  if (WCOREDUMP(exit_code)) {
-    m << " (core dumped)";
-  }
-#  endif
-# endif  // GTEST_OS_WINDOWS
-
-  return m.GetString();
-}
-
-// Returns true if exit_status describes a process that was terminated
-// by a signal, or exited normally with a nonzero exit code.
-bool ExitedUnsuccessfully(int exit_status) {
-  return !ExitedWithCode(0)(exit_status);
-}
-
-# if !GTEST_OS_WINDOWS
-// Generates a textual failure message when a death test finds more than
-// one thread running, or cannot determine the number of threads, prior
-// to executing the given statement.  It is the responsibility of the
-// caller not to pass a thread_count of 1.
-static std::string DeathTestThreadWarning(size_t thread_count) {
-  Message msg;
-  msg << "Death tests use fork(), which is unsafe particularly"
-      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
-  if (thread_count == 0)
-    msg << "couldn't detect the number of threads.";
-  else
-    msg << "detected " << thread_count << " threads.";
-  return msg.GetString();
-}
-# endif  // !GTEST_OS_WINDOWS
-
-// Flag characters for reporting a death test that did not die.
-static const char kDeathTestLived = 'L';
-static const char kDeathTestReturned = 'R';
-static const char kDeathTestThrew = 'T';
-static const char kDeathTestInternalError = 'I';
-
-// An enumeration describing all of the possible ways that a death test can
-// conclude.  DIED means that the process died while executing the test
-// code; LIVED means that process lived beyond the end of the test code;
-// RETURNED means that the test statement attempted to execute a return
-// statement, which is not allowed; THREW means that the test statement
-// returned control by throwing an exception.  IN_PROGRESS means the test
-// has not yet concluded.
-// TODO(vladl@google.com): Unify names and possibly values for
-// AbortReason, DeathTestOutcome, and flag characters above.
-enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
-
-// Routine for aborting the program which is safe to call from an
-// exec-style death test child process, in which case the error
-// message is propagated back to the parent process.  Otherwise, the
-// message is simply printed to stderr.  In either case, the program
-// then exits with status 1.
-void DeathTestAbort(const std::string& message) {
-  // On a POSIX system, this function may be called from a threadsafe-style
-  // death test child process, which operates on a very small stack.  Use
-  // the heap for any additional non-minuscule memory requirements.
-  const InternalRunDeathTestFlag* const flag =
-      GetUnitTestImpl()->internal_run_death_test_flag();
-  if (flag != NULL) {
-    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
-    fputc(kDeathTestInternalError, parent);
-    fprintf(parent, "%s", message.c_str());
-    fflush(parent);
-    _exit(1);
-  } else {
-    fprintf(stderr, "%s", message.c_str());
-    fflush(stderr);
-    posix::Abort();
-  }
-}
-
-// A replacement for CHECK that calls DeathTestAbort if the assertion
-// fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
-  do { \
-    if (!::testing::internal::IsTrue(expression)) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression); \
-    } \
-  } while (::testing::internal::AlwaysFalse())
-
-// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
-// evaluating any system call that fulfills two conditions: it must return
-// -1 on failure, and set errno to EINTR when it is interrupted and
-// should be tried again.  The macro expands to a loop that repeatedly
-// evaluates the expression as long as it evaluates to -1 and sets
-// errno to EINTR.  If the expression evaluates to -1 but errno is
-// something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
-  do { \
-    int gtest_retval; \
-    do { \
-      gtest_retval = (expression); \
-    } while (gtest_retval == -1 && errno == EINTR); \
-    if (gtest_retval == -1) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression + " != -1"); \
-    } \
-  } while (::testing::internal::AlwaysFalse())
-
-// Returns the message describing the last system error in errno.
-std::string GetLastErrnoDescription() {
-    return errno == 0 ? "" : posix::StrError(errno);
-}
-
-// This is called from a death test parent process to read a failure
-// message from the death test child process and log it with the FATAL
-// severity. On Windows, the message is read from a pipe handle. On other
-// platforms, it is read from a file descriptor.
-static void FailFromInternalError(int fd) {
-  Message error;
-  char buffer[256];
-  int num_read;
-
-  do {
-    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
-      buffer[num_read] = '\0';
-      error << buffer;
-    }
-  } while (num_read == -1 && errno == EINTR);
-
-  if (num_read == 0) {
-    GTEST_LOG_(FATAL) << error.GetString();
-  } else {
-    const int last_error = errno;
-    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
-                      << GetLastErrnoDescription() << " [" << last_error << "]";
-  }
-}
-
-// Death test constructor.  Increments the running death test count
-// for the current test.
-DeathTest::DeathTest() {
-  TestInfo* const info = GetUnitTestImpl()->current_test_info();
-  if (info == NULL) {
-    DeathTestAbort("Cannot run a death test outside of a TEST or "
-                   "TEST_F construct");
-  }
-}
-
-// Creates and returns a death test by dispatching to the current
-// death test factory.
-bool DeathTest::Create(const char* statement, const RE* regex,
-                       const char* file, int line, DeathTest** test) {
-  return GetUnitTestImpl()->death_test_factory()->Create(
-      statement, regex, file, line, test);
-}
-
-const char* DeathTest::LastMessage() {
-  return last_death_test_message_.c_str();
-}
-
-void DeathTest::set_last_death_test_message(const std::string& message) {
-  last_death_test_message_ = message;
-}
-
-std::string DeathTest::last_death_test_message_;
-
-// Provides cross platform implementation for some death functionality.
-class DeathTestImpl : public DeathTest {
- protected:
-  DeathTestImpl(const char* a_statement, const RE* a_regex)
-      : statement_(a_statement),
-        regex_(a_regex),
-        spawned_(false),
-        status_(-1),
-        outcome_(IN_PROGRESS),
-        read_fd_(-1),
-        write_fd_(-1) {}
-
-  // read_fd_ is expected to be closed and cleared by a derived class.
-  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
-
-  void Abort(AbortReason reason);
-  virtual bool Passed(bool status_ok);
-
-  const char* statement() const { return statement_; }
-  const RE* regex() const { return regex_; }
-  bool spawned() const { return spawned_; }
-  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
-  int status() const { return status_; }
-  void set_status(int a_status) { status_ = a_status; }
-  DeathTestOutcome outcome() const { return outcome_; }
-  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
-  int read_fd() const { return read_fd_; }
-  void set_read_fd(int fd) { read_fd_ = fd; }
-  int write_fd() const { return write_fd_; }
-  void set_write_fd(int fd) { write_fd_ = fd; }
-
-  // Called in the parent process only. Reads the result code of the death
-  // test child process via a pipe, interprets it to set the outcome_
-  // member, and closes read_fd_.  Outputs diagnostics and terminates in
-  // case of unexpected codes.
-  void ReadAndInterpretStatusByte();
-
- private:
-  // The textual content of the code this object is testing.  This class
-  // doesn't own this string and should not attempt to delete it.
-  const char* const statement_;
-  // The regular expression which test output must match.  DeathTestImpl
-  // doesn't own this object and should not attempt to delete it.
-  const RE* const regex_;
-  // True if the death test child process has been successfully spawned.
-  bool spawned_;
-  // The exit status of the child process.
-  int status_;
-  // How the death test concluded.
-  DeathTestOutcome outcome_;
-  // Descriptor to the read end of the pipe to the child process.  It is
-  // always -1 in the child process.  The child keeps its write end of the
-  // pipe in write_fd_.
-  int read_fd_;
-  // Descriptor to the child's write end of the pipe to the parent process.
-  // It is always -1 in the parent process.  The parent keeps its end of the
-  // pipe in read_fd_.
-  int write_fd_;
-};
-
-// Called in the parent process only. Reads the result code of the death
-// test child process via a pipe, interprets it to set the outcome_
-// member, and closes read_fd_.  Outputs diagnostics and terminates in
-// case of unexpected codes.
-void DeathTestImpl::ReadAndInterpretStatusByte() {
-  char flag;
-  int bytes_read;
-
-  // The read() here blocks until data is available (signifying the
-  // failure of the death test) or until the pipe is closed (signifying
-  // its success), so it's okay to call this in the parent before
-  // the child process has exited.
-  do {
-    bytes_read = posix::Read(read_fd(), &flag, 1);
-  } while (bytes_read == -1 && errno == EINTR);
-
-  if (bytes_read == 0) {
-    set_outcome(DIED);
-  } else if (bytes_read == 1) {
-    switch (flag) {
-      case kDeathTestReturned:
-        set_outcome(RETURNED);
-        break;
-      case kDeathTestThrew:
-        set_outcome(THREW);
-        break;
-      case kDeathTestLived:
-        set_outcome(LIVED);
-        break;
-      case kDeathTestInternalError:
-        FailFromInternalError(read_fd());  // Does not return.
-        break;
-      default:
-        GTEST_LOG_(FATAL) << "Death test child process reported "
-                          << "unexpected status byte ("
-                          << static_cast<unsigned int>(flag) << ")";
-    }
-  } else {
-    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
-                      << GetLastErrnoDescription();
-  }
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
-  set_read_fd(-1);
-}
-
-// Signals that the death test code which should have exited, didn't.
-// Should be called only in a death test child process.
-// Writes a status byte to the child's status file descriptor, then
-// calls _exit(1).
-void DeathTestImpl::Abort(AbortReason reason) {
-  // The parent process considers the death test to be a failure if
-  // it finds any data in our pipe.  So, here we write a single flag byte
-  // to the pipe, then exit.
-  const char status_ch =
-      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
-      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
-
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
-  // We are leaking the descriptor here because on some platforms (i.e.,
-  // when built as Windows DLL), destructors of global objects will still
-  // run after calling _exit(). On such systems, write_fd_ will be
-  // indirectly closed from the destructor of UnitTestImpl, causing double
-  // close if it is also closed here. On debug configurations, double close
-  // may assert. As there are no in-process buffers to flush here, we are
-  // relying on the OS to close the descriptor after the process terminates
-  // when the destructors are not run.
-  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
-}
-
-// Returns an indented copy of stderr output for a death test.
-// This makes distinguishing death test output lines from regular log lines
-// much easier.
-static ::std::string FormatDeathTestOutput(const ::std::string& output) {
-  ::std::string ret;
-  for (size_t at = 0; ; ) {
-    const size_t line_end = output.find('\n', at);
-    ret += "[  DEATH   ] ";
-    if (line_end == ::std::string::npos) {
-      ret += output.substr(at);
-      break;
-    }
-    ret += output.substr(at, line_end + 1 - at);
-    at = line_end + 1;
-  }
-  return ret;
-}
-
-// Assesses the success or failure of a death test, using both private
-// members which have previously been set, and one argument:
-//
-// Private data members:
-//   outcome:  An enumeration describing how the death test
-//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
-//             fails in the latter three cases.
-//   status:   The exit status of the child process. On *nix, it is in the
-//             in the format specified by wait(2). On Windows, this is the
-//             value supplied to the ExitProcess() API or a numeric code
-//             of the exception that terminated the program.
-//   regex:    A regular expression object to be applied to
-//             the test's captured standard error output; the death test
-//             fails if it does not match.
-//
-// Argument:
-//   status_ok: true if exit_status is acceptable in the context of
-//              this particular death test, which fails if it is false
-//
-// Returns true iff all of the above conditions are met.  Otherwise, the
-// first failing condition, in the order given above, is the one that is
-// reported. Also sets the last death test message string.
-bool DeathTestImpl::Passed(bool status_ok) {
-  if (!spawned())
-    return false;
-
-  const std::string error_message = GetCapturedStderr();
-
-  bool success = false;
-  Message buffer;
-
-  buffer << "Death test: " << statement() << "\n";
-  switch (outcome()) {
-    case LIVED:
-      buffer << "    Result: failed to die.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case THREW:
-      buffer << "    Result: threw an exception.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case RETURNED:
-      buffer << "    Result: illegal return in test statement.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case DIED:
-      if (status_ok) {
-        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
-        if (matched) {
-          success = true;
-        } else {
-          buffer << "    Result: died but not with expected error.\n"
-                 << "  Expected: " << regex()->pattern() << "\n"
-                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
-        }
-      } else {
-        buffer << "    Result: died but not with expected exit code:\n"
-               << "            " << ExitSummary(status()) << "\n"
-               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
-      }
-      break;
-    case IN_PROGRESS:
-    default:
-      GTEST_LOG_(FATAL)
-          << "DeathTest::Passed somehow called before conclusion of test";
-  }
-
-  DeathTest::set_last_death_test_message(buffer.GetString());
-  return success;
-}
-
-# if GTEST_OS_WINDOWS
-// WindowsDeathTest implements death tests on Windows. Due to the
-// specifics of starting new processes on Windows, death tests there are
-// always threadsafe, and Google Test considers the
-// --gtest_death_test_style=fast setting to be equivalent to
-// --gtest_death_test_style=threadsafe there.
-//
-// A few implementation notes:  Like the Linux version, the Windows
-// implementation uses pipes for child-to-parent communication. But due to
-// the specifics of pipes on Windows, some extra steps are required:
-//
-// 1. The parent creates a communication pipe and stores handles to both
-//    ends of it.
-// 2. The parent starts the child and provides it with the information
-//    necessary to acquire the handle to the write end of the pipe.
-// 3. The child acquires the write end of the pipe and signals the parent
-//    using a Windows event.
-// 4. Now the parent can release the write end of the pipe on its side. If
-//    this is done before step 3, the object's reference count goes down to
-//    0 and it is destroyed, preventing the child from acquiring it. The
-//    parent now has to release it, or read operations on the read end of
-//    the pipe will not return when the child terminates.
-// 5. The parent reads child's output through the pipe (outcome code and
-//    any possible error messages) from the pipe, and its stderr and then
-//    determines whether to fail the test.
-//
-// Note: to distinguish Win32 API calls from the local method and function
-// calls, the former are explicitly resolved in the global namespace.
-//
-class WindowsDeathTest : public DeathTestImpl {
- public:
-  WindowsDeathTest(const char* a_statement,
-                   const RE* a_regex,
-                   const char* file,
-                   int line)
-      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
-
-  // All of these virtual functions are inherited from DeathTest.
-  virtual int Wait();
-  virtual TestRole AssumeRole();
-
- private:
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-  // Handle to the write end of the pipe to the child process.
-  AutoHandle write_handle_;
-  // Child process handle.
-  AutoHandle child_handle_;
-  // Event the child process uses to signal the parent that it has
-  // acquired the handle to the write end of the pipe. After seeing this
-  // event the parent can release its own handles to make sure its
-  // ReadFile() calls return when the child terminates.
-  AutoHandle event_handle_;
-};
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int WindowsDeathTest::Wait() {
-  if (!spawned())
-    return 0;
-
-  // Wait until the child either signals that it has acquired the write end
-  // of the pipe or it dies.
-  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
-  switch (::WaitForMultipleObjects(2,
-                                   wait_handles,
-                                   FALSE,  // Waits for any of the handles.
-                                   INFINITE)) {
-    case WAIT_OBJECT_0:
-    case WAIT_OBJECT_0 + 1:
-      break;
-    default:
-      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
-  }
-
-  // The child has acquired the write end of the pipe or exited.
-  // We release the handle on our side and continue.
-  write_handle_.Reset();
-  event_handle_.Reset();
-
-  ReadAndInterpretStatusByte();
-
-  // Waits for the child process to exit if it haven't already. This
-  // returns immediately if the child has already exited, regardless of
-  // whether previous calls to WaitForMultipleObjects synchronized on this
-  // handle or not.
-  GTEST_DEATH_TEST_CHECK_(
-      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
-                                             INFINITE));
-  DWORD status_code;
-  GTEST_DEATH_TEST_CHECK_(
-      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
-  child_handle_.Reset();
-  set_status(static_cast<int>(status_code));
-  return status();
-}
-
-// The AssumeRole process for a Windows death test.  It creates a child
-// process with the same executable as the current process to run the
-// death test.  The child process is given the --gtest_filter and
-// --gtest_internal_run_death_test flags such that it knows to run the
-// current death test only.
-DeathTest::TestRole WindowsDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != NULL) {
-    // ParseInternalRunDeathTestFlag() has performed all the necessary
-    // processing.
-    set_write_fd(flag->write_fd());
-    return EXECUTE_TEST;
-  }
-
-  // WindowsDeathTest uses an anonymous pipe to communicate results of
-  // a death test.
-  SECURITY_ATTRIBUTES handles_are_inheritable = {
-    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
-  HANDLE read_handle, write_handle;
-  GTEST_DEATH_TEST_CHECK_(
-      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
-                   0)  // Default buffer size.
-      != FALSE);
-  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
-                                O_RDONLY));
-  write_handle_.Reset(write_handle);
-  event_handle_.Reset(::CreateEvent(
-      &handles_are_inheritable,
-      TRUE,    // The event will automatically reset to non-signaled state.
-      FALSE,   // The initial state is non-signalled.
-      NULL));  // The even is unnamed.
-  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
-  const std::string filter_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
-      info->test_case_name() + "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
-      "=" + file_ + "|" + StreamableToString(line_) + "|" +
-      StreamableToString(death_test_index) + "|" +
-      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
-      // size_t has the same width as pointers on both 32-bit and 64-bit
-      // Windows platforms.
-      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
-      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
-      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
-
-  char executable_path[_MAX_PATH + 1];  // NOLINT
-  GTEST_DEATH_TEST_CHECK_(
-      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
-                                            executable_path,
-                                            _MAX_PATH));
-
-  std::string command_line =
-      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
-      internal_flag + "\"";
-
-  DeathTest::set_last_death_test_message("");
-
-  CaptureStderr();
-  // Flush the log buffers since the log streams are shared with the child.
-  FlushInfoLog();
-
-  // The child process will share the standard handles with the parent.
-  STARTUPINFOA startup_info;
-  memset(&startup_info, 0, sizeof(STARTUPINFO));
-  startup_info.dwFlags = STARTF_USESTDHANDLES;
-  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
-  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
-  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
-
-  PROCESS_INFORMATION process_info;
-  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
-      executable_path,
-      const_cast<char*>(command_line.c_str()),
-      NULL,   // Retuned process handle is not inheritable.
-      NULL,   // Retuned thread handle is not inheritable.
-      TRUE,   // Child inherits all inheritable handles (for write_handle_).
-      0x0,    // Default creation flags.
-      NULL,   // Inherit the parent's environment.
-      UnitTest::GetInstance()->original_working_dir(),
-      &startup_info,
-      &process_info) != FALSE);
-  child_handle_.Reset(process_info.hProcess);
-  ::CloseHandle(process_info.hThread);
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-# else  // We are not on Windows.
-
-// ForkingDeathTest provides implementations for most of the abstract
-// methods of the DeathTest interface.  Only the AssumeRole method is
-// left undefined.
-class ForkingDeathTest : public DeathTestImpl {
- public:
-  ForkingDeathTest(const char* statement, const RE* regex);
-
-  // All of these virtual functions are inherited from DeathTest.
-  virtual int Wait();
-
- protected:
-  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
-
- private:
-  // PID of child process during death test; 0 in the child process itself.
-  pid_t child_pid_;
-};
-
-// Constructs a ForkingDeathTest.
-ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
-    : DeathTestImpl(a_statement, a_regex),
-      child_pid_(-1) {}
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int ForkingDeathTest::Wait() {
-  if (!spawned())
-    return 0;
-
-  ReadAndInterpretStatusByte();
-
-  int status_value;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
-  set_status(status_value);
-  return status_value;
-}
-
-// A concrete death test class that forks, then immediately runs the test
-// in the child process.
-class NoExecDeathTest : public ForkingDeathTest {
- public:
-  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
-      ForkingDeathTest(a_statement, a_regex) { }
-  virtual TestRole AssumeRole();
-};
-
-// The AssumeRole process for a fork-and-run death test.  It implements a
-// straightforward fork, with a simple pipe to transmit the status byte.
-DeathTest::TestRole NoExecDeathTest::AssumeRole() {
-  const size_t thread_count = GetThreadCount();
-  if (thread_count != 1) {
-    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
-  }
-
-  int pipe_fd[2];
-  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
-
-  DeathTest::set_last_death_test_message("");
-  CaptureStderr();
-  // When we fork the process below, the log file buffers are copied, but the
-  // file descriptors are shared.  We flush all log files here so that closing
-  // the file descriptors in the child process doesn't throw off the
-  // synchronization between descriptors and buffers in the parent process.
-  // This is as close to the fork as possible to avoid a race condition in case
-  // there are multiple threads running before the death test, and another
-  // thread writes to the log file.
-  FlushInfoLog();
-
-  const pid_t child_pid = fork();
-  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
-  set_child_pid(child_pid);
-  if (child_pid == 0) {
-    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
-    set_write_fd(pipe_fd[1]);
-    // Redirects all logging to stderr in the child process to prevent
-    // concurrent writes to the log files.  We capture stderr in the parent
-    // process and append the child process' output to a log.
-    LogToStderr();
-    // Event forwarding to the listeners of event listener API mush be shut
-    // down in death test subprocesses.
-    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
-    g_in_fast_death_test_child = true;
-    return EXECUTE_TEST;
-  } else {
-    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
-    set_read_fd(pipe_fd[0]);
-    set_spawned(true);
-    return OVERSEE_TEST;
-  }
-}
-
-// A concrete death test class that forks and re-executes the main
-// program from the beginning, with command-line flags set that cause
-// only this specific death test to be run.
-class ExecDeathTest : public ForkingDeathTest {
- public:
-  ExecDeathTest(const char* a_statement, const RE* a_regex,
-                const char* file, int line) :
-      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
-  virtual TestRole AssumeRole();
- private:
-  static ::std::vector<testing::internal::string>
-  GetArgvsForDeathTestChildProcess() {
-    ::std::vector<testing::internal::string> args = GetInjectableArgvs();
-    return args;
-  }
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-};
-
-// Utility class for accumulating command-line arguments.
-class Arguments {
- public:
-  Arguments() {
-    args_.push_back(NULL);
-  }
-
-  ~Arguments() {
-    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
-         ++i) {
-      free(*i);
-    }
-  }
-  void AddArgument(const char* argument) {
-    args_.insert(args_.end() - 1, posix::StrDup(argument));
-  }
-
-  template <typename Str>
-  void AddArguments(const ::std::vector<Str>& arguments) {
-    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
-      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
-    }
-  }
-  char* const* Argv() {
-    return &args_[0];
-  }
-
- private:
-  std::vector<char*> args_;
-};
-
-// A struct that encompasses the arguments to the child process of a
-// threadsafe-style death test process.
-struct ExecDeathTestArgs {
-  char* const* argv;  // Command-line arguments for the child's call to exec
-  int close_fd;       // File descriptor to close; the read end of a pipe
-};
-
-#  if GTEST_OS_MAC
-inline char** GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-#  else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
-extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-#  endif  // GTEST_OS_MAC
-
-#  if !GTEST_OS_QNX
-// The main function for a threadsafe-style death test child process.
-// This function is called in a clone()-ed process and thus must avoid
-// any potentially unsafe operations like malloc or libc functions.
-static int ExecDeathTestChildMain(void* child_arg) {
-  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
-
-  // We need to execute the test program in the same environment where
-  // it was originally invoked.  Therefore we change to the original
-  // working directory first.
-  const char* const original_dir =
-      UnitTest::GetInstance()->original_working_dir();
-  // We can safely call chdir() as it's a direct system call.
-  if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
-    return EXIT_FAILURE;
-  }
-
-  // We can safely call execve() as it's a direct system call.  We
-  // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
-  // invoke the test program via a valid path that contains at least
-  // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
-                 original_dir + " failed: " +
-                 GetLastErrnoDescription());
-  return EXIT_FAILURE;
-}
-#  endif  // !GTEST_OS_QNX
-
-// Two utility routines that together determine the direction the stack
-// grows.
-// This could be accomplished more elegantly by a single recursive
-// function, but we want to guard against the unlikely possibility of
-// a smart compiler optimizing the recursion away.
-//
-// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
-// StackLowerThanAddress into StackGrowsDown, which then doesn't give
-// correct answer.
-void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
-void StackLowerThanAddress(const void* ptr, bool* result) {
-  int dummy;
-  *result = (&dummy < ptr);
-}
-
-bool StackGrowsDown() {
-  int dummy;
-  bool result;
-  StackLowerThanAddress(&dummy, &result);
-  return result;
-}
-
-// Spawns a child process with the same executable as the current process in
-// a thread-safe manner and instructs it to run the death test.  The
-// implementation uses fork(2) + exec.  On systems where clone(2) is
-// available, it is used instead, being slightly more thread-safe.  On QNX,
-// fork supports only single-threaded environments, so this function uses
-// spawn(2) there instead.  The function dies with an error message if
-// anything goes wrong.
-static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
-  ExecDeathTestArgs args = { argv, close_fd };
-  pid_t child_pid = -1;
-
-#  if GTEST_OS_QNX
-  // Obtains the current directory and sets it to be closed in the child
-  // process.
-  const int cwd_fd = open(".", O_RDONLY);
-  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
-  // We need to execute the test program in the same environment where
-  // it was originally invoked.  Therefore we change to the original
-  // working directory first.
-  const char* const original_dir =
-      UnitTest::GetInstance()->original_working_dir();
-  // We can safely call chdir() as it's a direct system call.
-  if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
-    return EXIT_FAILURE;
-  }
-
-  int fd_flags;
-  // Set close_fd to be closed after spawn.
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
-                                        fd_flags | FD_CLOEXEC));
-  struct inheritance inherit = {0};
-  // spawn is a system call.
-  child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
-  // Restores the current working directory.
-  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
-
-#  else   // GTEST_OS_QNX
-#   if GTEST_OS_LINUX
-  // When a SIGPROF signal is received while fork() or clone() are executing,
-  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
-  // it after the call to fork()/clone() is complete.
-  struct sigaction saved_sigprof_action;
-  struct sigaction ignore_sigprof_action;
-  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
-  sigemptyset(&ignore_sigprof_action.sa_mask);
-  ignore_sigprof_action.sa_handler = SIG_IGN;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
-      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-#   endif  // GTEST_OS_LINUX
-
-#   if GTEST_HAS_CLONE
-  const bool use_fork = GTEST_FLAG(death_test_use_fork);
-
-  if (!use_fork) {
-    static const bool stack_grows_down = StackGrowsDown();
-    const size_t stack_size = getpagesize();
-    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
-    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
-                             MAP_ANON | MAP_PRIVATE, -1, 0);
-    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
-
-    // Maximum stack alignment in bytes:  For a downward-growing stack, this
-    // amount is subtracted from size of the stack space to get an address
-    // that is within the stack space and is aligned on all systems we care
-    // about.  As far as I know there is no ABI with stack alignment greater
-    // than 64.  We assume stack and stack_size already have alignment of
-    // kMaxStackAlignment.
-    const size_t kMaxStackAlignment = 64;
-    void* const stack_top =
-        static_cast<char*>(stack) +
-            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
-    GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
-        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
-
-    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
-
-    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
-  }
-#   else
-  const bool use_fork = true;
-#   endif  // GTEST_HAS_CLONE
-
-  if (use_fork && (child_pid = fork()) == 0) {
-      ExecDeathTestChildMain(&args);
-      _exit(0);
-  }
-#  endif  // GTEST_OS_QNX
-#  if GTEST_OS_LINUX
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(
-      sigaction(SIGPROF, &saved_sigprof_action, NULL));
-#  endif  // GTEST_OS_LINUX
-
-  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
-  return child_pid;
-}
-
-// The AssumeRole process for a fork-and-exec death test.  It re-executes the
-// main program from the beginning, setting the --gtest_filter
-// and --gtest_internal_run_death_test flags to cause only the current
-// death test to be re-run.
-DeathTest::TestRole ExecDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != NULL) {
-    set_write_fd(flag->write_fd());
-    return EXECUTE_TEST;
-  }
-
-  int pipe_fd[2];
-  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
-  // Clear the close-on-exec flag on the write end of the pipe, lest
-  // it be closed when the child process does an exec:
-  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
-
-  const std::string filter_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
-      + info->test_case_name() + "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
-      + file_ + "|" + StreamableToString(line_) + "|"
-      + StreamableToString(death_test_index) + "|"
-      + StreamableToString(pipe_fd[1]);
-  Arguments args;
-  args.AddArguments(GetArgvsForDeathTestChildProcess());
-  args.AddArgument(filter_flag.c_str());
-  args.AddArgument(internal_flag.c_str());
-
-  DeathTest::set_last_death_test_message("");
-
-  CaptureStderr();
-  // See the comment in NoExecDeathTest::AssumeRole for why the next line
-  // is necessary.
-  FlushInfoLog();
-
-  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
-  set_child_pid(child_pid);
-  set_read_fd(pipe_fd[0]);
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-
-# endif  // !GTEST_OS_WINDOWS
-
-// Creates a concrete DeathTest-derived class that depends on the
-// --gtest_death_test_style flag, and sets the pointer pointed to
-// by the "test" argument to its address.  If the test should be
-// skipped, sets that pointer to NULL.  Returns true, unless the
-// flag is set to an invalid value.
-bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
-                                     const char* file, int line,
-                                     DeathTest** test) {
-  UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const int death_test_index = impl->current_test_info()
-      ->increment_death_test_count();
-
-  if (flag != NULL) {
-    if (death_test_index > flag->index()) {
-      DeathTest::set_last_death_test_message(
-          "Death test count (" + StreamableToString(death_test_index)
-          + ") somehow exceeded expected maximum ("
-          + StreamableToString(flag->index()) + ")");
-      return false;
-    }
-
-    if (!(flag->file() == file && flag->line() == line &&
-          flag->index() == death_test_index)) {
-      *test = NULL;
-      return true;
-    }
-  }
-
-# if GTEST_OS_WINDOWS
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
-      GTEST_FLAG(death_test_style) == "fast") {
-    *test = new WindowsDeathTest(statement, regex, file, line);
-  }
-
-# else
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe") {
-    *test = new ExecDeathTest(statement, regex, file, line);
-  } else if (GTEST_FLAG(death_test_style) == "fast") {
-    *test = new NoExecDeathTest(statement, regex);
-  }
-
-# endif  // GTEST_OS_WINDOWS
-
-  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
-    DeathTest::set_last_death_test_message(
-        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
-        + "\" encountered");
-    return false;
-  }
-
-  return true;
-}
-
-// Splits a given string on a given delimiter, populating a given
-// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
-// ::std::string, so we can use it here.
-static void SplitString(const ::std::string& str, char delimiter,
-                        ::std::vector< ::std::string>* dest) {
-  ::std::vector< ::std::string> parsed;
-  ::std::string::size_type pos = 0;
-  while (::testing::internal::AlwaysTrue()) {
-    const ::std::string::size_type colon = str.find(delimiter, pos);
-    if (colon == ::std::string::npos) {
-      parsed.push_back(str.substr(pos));
-      break;
-    } else {
-      parsed.push_back(str.substr(pos, colon - pos));
-      pos = colon + 1;
-    }
-  }
-  dest->swap(parsed);
-}
-
-# if GTEST_OS_WINDOWS
-// Recreates the pipe and event handles from the provided parameters,
-// signals the event, and returns a file descriptor wrapped around the pipe
-// handle. This function is called in the child process only.
-int GetStatusFileDescriptor(unsigned int parent_process_id,
-                            size_t write_handle_as_size_t,
-                            size_t event_handle_as_size_t) {
-  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
-                                                   FALSE,  // Non-inheritable.
-                                                   parent_process_id));
-  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
-    DeathTestAbort("Unable to open parent process " +
-                   StreamableToString(parent_process_id));
-  }
-
-  // TODO(vladl@google.com): Replace the following check with a
-  // compile-time assertion when available.
-  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
-
-  const HANDLE write_handle =
-      reinterpret_cast<HANDLE>(write_handle_as_size_t);
-  HANDLE dup_write_handle;
-
-  // The newly initialized handle is accessible only in in the parent
-  // process. To obtain one accessible within the child, we need to use
-  // DuplicateHandle.
-  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
-                         ::GetCurrentProcess(), &dup_write_handle,
-                         0x0,    // Requested privileges ignored since
-                                 // DUPLICATE_SAME_ACCESS is used.
-                         FALSE,  // Request non-inheritable handler.
-                         DUPLICATE_SAME_ACCESS)) {
-    DeathTestAbort("Unable to duplicate the pipe handle " +
-                   StreamableToString(write_handle_as_size_t) +
-                   " from the parent process " +
-                   StreamableToString(parent_process_id));
-  }
-
-  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
-  HANDLE dup_event_handle;
-
-  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
-                         ::GetCurrentProcess(), &dup_event_handle,
-                         0x0,
-                         FALSE,
-                         DUPLICATE_SAME_ACCESS)) {
-    DeathTestAbort("Unable to duplicate the event handle " +
-                   StreamableToString(event_handle_as_size_t) +
-                   " from the parent process " +
-                   StreamableToString(parent_process_id));
-  }
-
-  const int write_fd =
-      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
-  if (write_fd == -1) {
-    DeathTestAbort("Unable to convert pipe handle " +
-                   StreamableToString(write_handle_as_size_t) +
-                   " to a file descriptor");
-  }
-
-  // Signals the parent that the write end of the pipe has been acquired
-  // so the parent can release its own write end.
-  ::SetEvent(dup_event_handle);
-
-  return write_fd;
-}
-# endif  // GTEST_OS_WINDOWS
-
-// Returns a newly created InternalRunDeathTestFlag object with fields
-// initialized from the GTEST_FLAG(internal_run_death_test) flag if
-// the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
-  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
-
-  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
-  // can use it here.
-  int line = -1;
-  int index = -1;
-  ::std::vector< ::std::string> fields;
-  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
-  int write_fd = -1;
-
-# if GTEST_OS_WINDOWS
-
-  unsigned int parent_process_id = 0;
-  size_t write_handle_as_size_t = 0;
-  size_t event_handle_as_size_t = 0;
-
-  if (fields.size() != 6
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &parent_process_id)
-      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
-      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
-                   GTEST_FLAG(internal_run_death_test));
-  }
-  write_fd = GetStatusFileDescriptor(parent_process_id,
-                                     write_handle_as_size_t,
-                                     event_handle_as_size_t);
-# else
-
-  if (fields.size() != 4
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &write_fd)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
-        + GTEST_FLAG(internal_run_death_test));
-  }
-
-# endif  // GTEST_OS_WINDOWS
-
-  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
-}
-
-}  // namespace internal
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-}  // namespace testing
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Authors: keith.ray@gmail.com (Keith Ray)
-
-
-#include <stdlib.h>
-
-#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
-#elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
-#elif GTEST_OS_SYMBIAN
-// Symbian OpenC has PATH_MAX in sys/syslimits.h
-# include <sys/syslimits.h>
-#else
-# include <limits.h>
-# include <climits>  // Some Linux distributions define PATH_MAX here.
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-#if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
-#elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
-#elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
-#else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
-#endif  // GTEST_OS_WINDOWS
-
-
-namespace testing {
-namespace internal {
-
-#if GTEST_OS_WINDOWS
-// On Windows, '\\' is the standard path separator, but many tools and the
-// Windows API also accept '/' as an alternate path separator. Unless otherwise
-// noted, a file path can contain either kind of path separators, or a mixture
-// of them.
-const char kPathSeparator = '\\';
-const char kAlternatePathSeparator = '/';
-const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
-// Windows CE doesn't have a current directory. You should not use
-// the current directory in tests on Windows CE, but this at least
-// provides a reasonable fallback.
-const char kCurrentDirectoryString[] = "\\";
-// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
-const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
-const char kCurrentDirectoryString[] = ".\\";
-# endif  // GTEST_OS_WINDOWS_MOBILE
-#else
-const char kPathSeparator = '/';
-const char kCurrentDirectoryString[] = "./";
-#endif  // GTEST_OS_WINDOWS
-
-// Returns whether the given character is a valid path separator.
-static bool IsPathSeparator(char c) {
-#if GTEST_HAS_ALT_PATH_SEP_
-  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
-#else
-  return c == kPathSeparator;
-#endif
-}
-
-// Returns the current working directory, or "" if unsuccessful.
-FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE
-  // Windows CE doesn't have a current directory, so we just return
-  // something reasonable.
-  return FilePath(kCurrentDirectoryString);
-#elif GTEST_OS_WINDOWS
-  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
-#else
-  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
-#endif  // GTEST_OS_WINDOWS_MOBILE
-}
-
-// Returns a copy of the FilePath with the case-insensitive extension removed.
-// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
-// FilePath("dir/file"). If a case-insensitive extension is not
-// found, returns a copy of the original FilePath.
-FilePath FilePath::RemoveExtension(const char* extension) const {
-  const std::string dot_extension = std::string(".") + extension;
-  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
-    return FilePath(pathname_.substr(
-        0, pathname_.length() - dot_extension.length()));
-  }
-  return *this;
-}
-
-// Returns a pointer to the last occurence of a valid path separator in
-// the FilePath. On Windows, for example, both '/' and '\' are valid path
-// separators. Returns NULL if no path separator was found.
-const char* FilePath::FindLastPathSeparator() const {
-  const char* const last_sep = strrchr(c_str(), kPathSeparator);
-#if GTEST_HAS_ALT_PATH_SEP_
-  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
-  // Comparing two pointers of which only one is NULL is undefined.
-  if (last_alt_sep != NULL &&
-      (last_sep == NULL || last_alt_sep > last_sep)) {
-    return last_alt_sep;
-  }
-#endif
-  return last_sep;
-}
-
-// Returns a copy of the FilePath with the directory part removed.
-// Example: FilePath("path/to/file").RemoveDirectoryName() returns
-// FilePath("file"). If there is no directory part ("just_a_file"), it returns
-// the FilePath unmodified. If there is no file part ("just_a_dir/") it
-// returns an empty FilePath ("").
-// On Windows platform, '\' is the path separator, otherwise it is '/'.
-FilePath FilePath::RemoveDirectoryName() const {
-  const char* const last_sep = FindLastPathSeparator();
-  return last_sep ? FilePath(last_sep + 1) : *this;
-}
-
-// RemoveFileName returns the directory path with the filename removed.
-// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
-// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
-// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
-// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
-// On Windows platform, '\' is the path separator, otherwise it is '/'.
-FilePath FilePath::RemoveFileName() const {
-  const char* const last_sep = FindLastPathSeparator();
-  std::string dir;
-  if (last_sep) {
-    dir = std::string(c_str(), last_sep + 1 - c_str());
-  } else {
-    dir = kCurrentDirectoryString;
-  }
-  return FilePath(dir);
-}
-
-// Helper functions for naming files in a directory for xml output.
-
-// Given directory = "dir", base_name = "test", number = 0,
-// extension = "xml", returns "dir/test.xml". If number is greater
-// than zero (e.g., 12), returns "dir/test_12.xml".
-// On Windows platform, uses \ as the separator rather than /.
-FilePath FilePath::MakeFileName(const FilePath& directory,
-                                const FilePath& base_name,
-                                int number,
-                                const char* extension) {
-  std::string file;
-  if (number == 0) {
-    file = base_name.string() + "." + extension;
-  } else {
-    file = base_name.string() + "_" + StreamableToString(number)
-        + "." + extension;
-  }
-  return ConcatPaths(directory, FilePath(file));
-}
-
-// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
-// On Windows, uses \ as the separator rather than /.
-FilePath FilePath::ConcatPaths(const FilePath& directory,
-                               const FilePath& relative_path) {
-  if (directory.IsEmpty())
-    return relative_path;
-  const FilePath dir(directory.RemoveTrailingPathSeparator());
-  return FilePath(dir.string() + kPathSeparator + relative_path.string());
-}
-
-// Returns true if pathname describes something findable in the file-system,
-// either a file, directory, or whatever.
-bool FilePath::FileOrDirectoryExists() const {
-#if GTEST_OS_WINDOWS_MOBILE
-  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
-  const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
-  return attributes != kInvalidFileAttributes;
-#else
-  posix::StatStruct file_stat;
-  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
-#endif  // GTEST_OS_WINDOWS_MOBILE
-}
-
-// Returns true if pathname describes a directory in the file-system
-// that exists.
-bool FilePath::DirectoryExists() const {
-  bool result = false;
-#if GTEST_OS_WINDOWS
-  // Don't strip off trailing separator if path is a root directory on
-  // Windows (like "C:\\").
-  const FilePath& path(IsRootDirectory() ? *this :
-                                           RemoveTrailingPathSeparator());
-#else
-  const FilePath& path(*this);
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
-  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
-  const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
-  if ((attributes != kInvalidFileAttributes) &&
-      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
-    result = true;
-  }
-#else
-  posix::StatStruct file_stat;
-  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
-      posix::IsDir(file_stat);
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-  return result;
-}
-
-// Returns true if pathname describes a root directory. (Windows has one
-// root directory per disk drive.)
-bool FilePath::IsRootDirectory() const {
-#if GTEST_OS_WINDOWS
-  // TODO(wan@google.com): on Windows a network share like
-  // \\server\share can be a root directory, although it cannot be the
-  // current directory.  Handle this properly.
-  return pathname_.length() == 3 && IsAbsolutePath();
-#else
-  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
-#endif
-}
-
-// Returns true if pathname describes an absolute path.
-bool FilePath::IsAbsolutePath() const {
-  const char* const name = pathname_.c_str();
-#if GTEST_OS_WINDOWS
-  return pathname_.length() >= 3 &&
-     ((name[0] >= 'a' && name[0] <= 'z') ||
-      (name[0] >= 'A' && name[0] <= 'Z')) &&
-     name[1] == ':' &&
-     IsPathSeparator(name[2]);
-#else
-  return IsPathSeparator(name[0]);
-#endif
-}
-
-// Returns a pathname for a file that does not currently exist. The pathname
-// will be directory/base_name.extension or
-// directory/base_name_<number>.extension if directory/base_name.extension
-// already exists. The number will be incremented until a pathname is found
-// that does not already exist.
-// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
-// There could be a race condition if two or more processes are calling this
-// function at the same time -- they could both pick the same filename.
-FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
-                                          const FilePath& base_name,
-                                          const char* extension) {
-  FilePath full_pathname;
-  int number = 0;
-  do {
-    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
-  } while (full_pathname.FileOrDirectoryExists());
-  return full_pathname;
-}
-
-// Returns true if FilePath ends with a path separator, which indicates that
-// it is intended to represent a directory. Returns false otherwise.
-// This does NOT check that a directory (or file) actually exists.
-bool FilePath::IsDirectory() const {
-  return !pathname_.empty() &&
-         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
-}
-
-// Create directories so that path exists. Returns true if successful or if
-// the directories already exist; returns false if unable to create directories
-// for any reason.
-bool FilePath::CreateDirectoriesRecursively() const {
-  if (!this->IsDirectory()) {
-    return false;
-  }
-
-  if (pathname_.length() == 0 || this->DirectoryExists()) {
-    return true;
-  }
-
-  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
-  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
-}
-
-// Create the directory so that path exists. Returns true if successful or
-// if the directory already exists; returns false if unable to create the
-// directory for any reason, including if the parent directory does not
-// exist. Not named "CreateDirectory" because that's a macro on Windows.
-bool FilePath::CreateFolder() const {
-#if GTEST_OS_WINDOWS_MOBILE
-  FilePath removed_sep(this->RemoveTrailingPathSeparator());
-  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
-  int result = CreateDirectory(unicode, NULL) ? 0 : -1;
-  delete [] unicode;
-#elif GTEST_OS_WINDOWS
-  int result = _mkdir(pathname_.c_str());
-#else
-  int result = mkdir(pathname_.c_str(), 0777);
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-  if (result == -1) {
-    return this->DirectoryExists();  // An error is OK if the directory exists.
-  }
-  return true;  // No error.
-}
-
-// If input name has a trailing separator character, remove it and return the
-// name, otherwise return the name string unmodified.
-// On Windows platform, uses \ as the separator, other platforms use /.
-FilePath FilePath::RemoveTrailingPathSeparator() const {
-  return IsDirectory()
-      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
-      : *this;
-}
-
-// Removes any redundant separators that might be in the pathname.
-// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
-// redundancies that might be in a pathname involving "." or "..".
-// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share).
-void FilePath::Normalize() {
-  if (pathname_.c_str() == NULL) {
-    pathname_ = "";
-    return;
-  }
-  const char* src = pathname_.c_str();
-  char* const dest = new char[pathname_.length() + 1];
-  char* dest_ptr = dest;
-  memset(dest_ptr, 0, pathname_.length() + 1);
-
-  while (*src != '\0') {
-    *dest_ptr = *src;
-    if (!IsPathSeparator(*src)) {
-      src++;
-    } else {
-#if GTEST_HAS_ALT_PATH_SEP_
-      if (*dest_ptr == kAlternatePathSeparator) {
-        *dest_ptr = kPathSeparator;
-      }
-#endif
-      while (IsPathSeparator(*src))
-        src++;
-    }
-    dest_ptr++;
-  }
-  *dest_ptr = '\0';
-  pathname_ = dest;
-  delete[] dest;
-}
-
-}  // namespace internal
-}  // namespace testing
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-
-#include <limits.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>  // For TerminateProcess()
-#elif GTEST_OS_WINDOWS
-# include <io.h>
-# include <sys/stat.h>
-#else
-# include <unistd.h>
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-#if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
-#endif  // GTEST_OS_MAC
-
-#if GTEST_OS_QNX
-# include <devctl.h>
-# include <sys/procfs.h>
-#endif  // GTEST_OS_QNX
-
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#undef GTEST_IMPLEMENTATION_
-
-namespace testing {
-namespace internal {
-
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif  // _MSC_VER
-
-#if GTEST_OS_MAC
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-size_t GetThreadCount() {
-  const task_t task = mach_task_self();
-  mach_msg_type_number_t thread_count;
-  thread_act_array_t thread_list;
-  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
-  if (status == KERN_SUCCESS) {
-    // task_threads allocates resources in thread_list and we need to free them
-    // to avoid leaks.
-    vm_deallocate(task,
-                  reinterpret_cast<vm_address_t>(thread_list),
-                  sizeof(thread_t) * thread_count);
-    return static_cast<size_t>(thread_count);
-  } else {
-    return 0;
-  }
-}
-
-#elif GTEST_OS_QNX
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-size_t GetThreadCount() {
-  const int fd = open("/proc/self/as", O_RDONLY);
-  if (fd < 0) {
-    return 0;
-  }
-  procfs_info process_info;
-  const int status =
-      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
-  close(fd);
-  if (status == EOK) {
-    return static_cast<size_t>(process_info.num_threads);
-  } else {
-    return 0;
-  }
-}
-
-#else
-
-size_t GetThreadCount() {
-  // There's no portable way to detect the number of threads, so we just
-  // return 0 to indicate that we cannot detect it.
-  return 0;
-}
-
-#endif  // GTEST_OS_MAC
-
-#if GTEST_USES_POSIX_RE
-
-// Implements RE.  Currently only needed for death tests.
-
-RE::~RE() {
-  if (is_valid_) {
-    // regfree'ing an invalid regex might crash because the content
-    // of the regex is undefined. Since the regex's are essentially
-    // the same, one cannot be valid (or invalid) without the other
-    // being so too.
-    regfree(&partial_regex_);
-    regfree(&full_regex_);
-  }
-  free(const_cast<char*>(pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  if (!re.is_valid_) return false;
-
-  regmatch_t match;
-  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  if (!re.is_valid_) return false;
-
-  regmatch_t match;
-  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = posix::StrDup(regex);
-
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match.
-  const size_t full_regex_len = strlen(regex) + 10;
-  char* const full_pattern = new char[full_regex_len];
-
-  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
-  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
-  // We want to call regcomp(&partial_regex_, ...) even if the
-  // previous expression returns false.  Otherwise partial_regex_ may
-  // not be properly initialized can may cause trouble when it's
-  // freed.
-  //
-  // Some implementation of POSIX regex (e.g. on at least some
-  // versions of Cygwin) doesn't accept the empty string as a valid
-  // regex.  We change it to an equivalent form "()" to be safe.
-  if (is_valid_) {
-    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
-    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
-  }
-  EXPECT_TRUE(is_valid_)
-      << "Regular expression \"" << regex
-      << "\" is not a valid POSIX Extended regular expression.";
-
-  delete[] full_pattern;
-}
-
-#elif GTEST_USES_SIMPLE_RE
-
-// Returns true iff ch appears anywhere in str (excluding the
-// terminating '\0' character).
-bool IsInSet(char ch, const char* str) {
-  return ch != '\0' && strchr(str, ch) != NULL;
-}
-
-// Returns true iff ch belongs to the given classification.  Unlike
-// similar functions in <ctype.h>, these aren't affected by the
-// current locale.
-bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
-bool IsAsciiPunct(char ch) {
-  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
-}
-bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
-bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
-bool IsAsciiWordChar(char ch) {
-  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
-      ('0' <= ch && ch <= '9') || ch == '_';
-}
-
-// Returns true iff "\\c" is a supported escape sequence.
-bool IsValidEscape(char c) {
-  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
-}
-
-// Returns true iff the given atom (specified by escaped and pattern)
-// matches ch.  The result is undefined if the atom is invalid.
-bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
-  if (escaped) {  // "\\p" where p is pattern_char.
-    switch (pattern_char) {
-      case 'd': return IsAsciiDigit(ch);
-      case 'D': return !IsAsciiDigit(ch);
-      case 'f': return ch == '\f';
-      case 'n': return ch == '\n';
-      case 'r': return ch == '\r';
-      case 's': return IsAsciiWhiteSpace(ch);
-      case 'S': return !IsAsciiWhiteSpace(ch);
-      case 't': return ch == '\t';
-      case 'v': return ch == '\v';
-      case 'w': return IsAsciiWordChar(ch);
-      case 'W': return !IsAsciiWordChar(ch);
-    }
-    return IsAsciiPunct(pattern_char) && pattern_char == ch;
-  }
-
-  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
-}
-
-// Helper function used by ValidateRegex() to format error messages.
-std::string FormatRegexSyntaxError(const char* regex, int index) {
-  return (Message() << "Syntax error at index " << index
-          << " in simple regular expression \"" << regex << "\": ").GetString();
-}
-
-// Generates non-fatal failures and returns false if regex is invalid;
-// otherwise returns true.
-bool ValidateRegex(const char* regex) {
-  if (regex == NULL) {
-    // TODO(wan@google.com): fix the source file location in the
-    // assertion failures to match where the regex is used in user
-    // code.
-    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
-    return false;
-  }
-
-  bool is_valid = true;
-
-  // True iff ?, *, or + can follow the previous atom.
-  bool prev_repeatable = false;
-  for (int i = 0; regex[i]; i++) {
-    if (regex[i] == '\\') {  // An escape sequence
-      i++;
-      if (regex[i] == '\0') {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
-                      << "'\\' cannot appear at the end.";
-        return false;
-      }
-
-      if (!IsValidEscape(regex[i])) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
-                      << "invalid escape sequence \"\\" << regex[i] << "\".";
-        is_valid = false;
-      }
-      prev_repeatable = true;
-    } else {  // Not an escape sequence.
-      const char ch = regex[i];
-
-      if (ch == '^' && i > 0) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'^' can only appear at the beginning.";
-        is_valid = false;
-      } else if (ch == '$' && regex[i + 1] != '\0') {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'$' can only appear at the end.";
-        is_valid = false;
-      } else if (IsInSet(ch, "()[]{}|")) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' is unsupported.";
-        is_valid = false;
-      } else if (IsRepeat(ch) && !prev_repeatable) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' can only follow a repeatable token.";
-        is_valid = false;
-      }
-
-      prev_repeatable = !IsInSet(ch, "^$?*+");
-    }
-  }
-
-  return is_valid;
-}
-
-// Matches a repeated regex atom followed by a valid simple regular
-// expression.  The regex atom is defined as c if escaped is false,
-// or \c otherwise.  repeat is the repetition meta character (?, *,
-// or +).  The behavior is undefined if str contains too many
-// characters to be indexable by size_t, in which case the test will
-// probably time out anyway.  We are fine with this limitation as
-// std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char c, char repeat, const char* regex,
-    const char* str) {
-  const size_t min_count = (repeat == '+') ? 1 : 0;
-  const size_t max_count = (repeat == '?') ? 1 :
-      static_cast<size_t>(-1) - 1;
-  // We cannot call numeric_limits::max() as it conflicts with the
-  // max() macro on Windows.
-
-  for (size_t i = 0; i <= max_count; ++i) {
-    // We know that the atom matches each of the first i characters in str.
-    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
-      // We have enough matches at the head, and the tail matches too.
-      // Since we only care about *whether* the pattern matches str
-      // (as opposed to *how* it matches), there is no need to find a
-      // greedy match.
-      return true;
-    }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
-  }
-  return false;
-}
-
-// Returns true iff regex matches a prefix of str.  regex must be a
-// valid simple regular expression and not start with "^", or the
-// result is undefined.
-bool MatchRegexAtHead(const char* regex, const char* str) {
-  if (*regex == '\0')  // An empty regex matches a prefix of anything.
-    return true;
-
-  // "$" only matches the end of a string.  Note that regex being
-  // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
-
-  // Is the first thing in regex an escape sequence?
-  const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
-  if (IsRepeat(regex[1])) {
-    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
-    // here's an indirect recursion.  It terminates as the regex gets
-    // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
-  } else {
-    // regex isn't empty, isn't "$", and doesn't start with a
-    // repetition.  We match the first atom of regex with the first
-    // character of str and recurse.
-    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
-  }
-}
-
-// Returns true iff regex matches any substring of str.  regex must be
-// a valid simple regular expression, or the result is undefined.
-//
-// The algorithm is recursive, but the recursion depth doesn't exceed
-// the regex length, so we won't need to worry about running out of
-// stack space normally.  In rare cases the time complexity can be
-// exponential with respect to the regex length + the string length,
-// but usually it's must faster (often close to linear).
-bool MatchRegexAnywhere(const char* regex, const char* str) {
-  if (regex == NULL || str == NULL)
-    return false;
-
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
-
-  // A successful match can be anywhere in str.
-  do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
-  } while (*str++ != '\0');
-  return false;
-}
-
-// Implements the RE class.
-
-RE::~RE() {
-  free(const_cast<char*>(pattern_));
-  free(const_cast<char*>(full_pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = full_pattern_ = NULL;
-  if (regex != NULL) {
-    pattern_ = posix::StrDup(regex);
-  }
-
-  is_valid_ = ValidateRegex(regex);
-  if (!is_valid_) {
-    // No need to calculate the full pattern when the regex is invalid.
-    return;
-  }
-
-  const size_t len = strlen(regex);
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match: we need space to prepend a '^', append a '$', and
-  // terminate the string with '\0'.
-  char* buffer = static_cast<char*>(malloc(len + 3));
-  full_pattern_ = buffer;
-
-  if (*regex != '^')
-    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
-
-  // We don't use snprintf or strncpy, as they trigger a warning when
-  // compiled with VC++ 8.0.
-  memcpy(buffer, regex, len);
-  buffer += len;
-
-  if (len == 0 || regex[len - 1] != '$')
-    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
-
-  *buffer = '\0';
-}
-
-#endif  // GTEST_USES_POSIX_RE
-
-const char kUnknownFile[] = "unknown file";
-
-// Formats a source file path and a line number as they would appear
-// in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
-  const std::string file_name(file == NULL ? kUnknownFile : file);
-
-  if (line < 0) {
-    return file_name + ":";
-  }
-#ifdef _MSC_VER
-  return file_name + "(" + StreamableToString(line) + "):";
-#else
-  return file_name + ":" + StreamableToString(line) + ":";
-#endif  // _MSC_VER
-}
-
-// Formats a file location for compiler-independent XML output.
-// Although this function is not platform dependent, we put it next to
-// FormatFileLocation in order to contrast the two functions.
-// Note that FormatCompilerIndependentFileLocation() does NOT append colon
-// to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
-    const char* file, int line) {
-  const std::string file_name(file == NULL ? kUnknownFile : file);
-
-  if (line < 0)
-    return file_name;
-  else
-    return file_name + ":" + StreamableToString(line);
-}
-
-
-GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
-    : severity_(severity) {
-  const char* const marker =
-      severity == GTEST_INFO ?    "[  INFO ]" :
-      severity == GTEST_WARNING ? "[WARNING]" :
-      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
-  GetStream() << ::std::endl << marker << " "
-              << FormatFileLocation(file, line).c_str() << ": ";
-}
-
-// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
-GTestLog::~GTestLog() {
-  GetStream() << ::std::endl;
-  if (severity_ == GTEST_FATAL) {
-    fflush(stderr);
-    posix::Abort();
-  }
-}
-// Disable Microsoft deprecation warnings for POSIX functions called from
-// this class (creat, dup, dup2, and close)
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4996)
-#endif  // _MSC_VER
-
-#if GTEST_HAS_STREAM_REDIRECTION
-
-// Object that captures an output stream (stdout/stderr).
-class CapturedStream {
- public:
-  // The ctor redirects the stream to a temporary file.
-  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-# if GTEST_OS_WINDOWS
-    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-
-    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
-    const UINT success = ::GetTempFileNameA(temp_dir_path,
-                                            "gtest_redir",
-                                            0,  // Generate unique file name.
-                                            temp_file_path);
-    GTEST_CHECK_(success != 0)
-        << "Unable to create a temporary file in " << temp_dir_path;
-    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
-    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
-                                    << temp_file_path;
-    filename_ = temp_file_path;
-# else
-    // There's no guarantee that a test has write access to the current
-    // directory, so we create the temporary file in the /tmp directory
-    // instead. We use /tmp on most systems, and /sdcard on Android.
-    // That's because Android doesn't have /tmp.
-#  if GTEST_OS_LINUX_ANDROID
-    // Note: Android applications are expected to call the framework's
-    // Context.getExternalStorageDirectory() method through JNI to get
-    // the location of the world-writable SD Card directory. However,
-    // this requires a Context handle, which cannot be retrieved
-    // globally from native code. Doing so also precludes running the
-    // code as part of a regular standalone executable, which doesn't
-    // run in a Dalvik process (e.g. when running it through 'adb shell').
-    //
-    // The location /sdcard is directly accessible from native code
-    // and is the only location (unofficially) supported by the Android
-    // team. It's generally a symlink to the real SD Card mount point
-    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
-    // other OEM-customized locations. Never rely on these, and always
-    // use /sdcard.
-    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
-#  else
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-#  endif  // GTEST_OS_LINUX_ANDROID
-    const int captured_fd = mkstemp(name_template);
-    filename_ = name_template;
-# endif  // GTEST_OS_WINDOWS
-    fflush(NULL);
-    dup2(captured_fd, fd_);
-    close(captured_fd);
-  }
-
-  ~CapturedStream() {
-    remove(filename_.c_str());
-  }
-
-  std::string GetCapturedString() {
-    if (uncaptured_fd_ != -1) {
-      // Restores the original stream.
-      fflush(NULL);
-      dup2(uncaptured_fd_, fd_);
-      close(uncaptured_fd_);
-      uncaptured_fd_ = -1;
-    }
-
-    FILE* const file = posix::FOpen(filename_.c_str(), "r");
-    const std::string content = ReadEntireFile(file);
-    posix::FClose(file);
-    return content;
-  }
-
- private:
-  // Reads the entire content of a file as an std::string.
-  static std::string ReadEntireFile(FILE* file);
-
-  // Returns the size (in bytes) of a file.
-  static size_t GetFileSize(FILE* file);
-
-  const int fd_;  // A stream to capture.
-  int uncaptured_fd_;
-  // Name of the temporary file holding the stderr output.
-  ::std::string filename_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
-};
-
-// Returns the size (in bytes) of a file.
-size_t CapturedStream::GetFileSize(FILE* file) {
-  fseek(file, 0, SEEK_END);
-  return static_cast<size_t>(ftell(file));
-}
-
-// Reads the entire content of a file as a string.
-std::string CapturedStream::ReadEntireFile(FILE* file) {
-  const size_t file_size = GetFileSize(file);
-  char* const buffer = new char[file_size];
-
-  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
-  size_t bytes_read = 0;       // # of bytes read so far
-
-  fseek(file, 0, SEEK_SET);
-
-  // Keeps reading the file until we cannot read further or the
-  // pre-determined file size is reached.
-  do {
-    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
-    bytes_read += bytes_last_read;
-  } while (bytes_last_read > 0 && bytes_read < file_size);
-
-  const std::string content(buffer, bytes_read);
-  delete[] buffer;
-
-  return content;
-}
-
-# ifdef _MSC_VER
-#  pragma warning(pop)
-# endif  // _MSC_VER
-
-static CapturedStream* g_captured_stderr = NULL;
-static CapturedStream* g_captured_stdout = NULL;
-
-// Starts capturing an output stream (stdout/stderr).
-void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
-  if (*stream != NULL) {
-    GTEST_LOG_(FATAL) << "Only one " << stream_name
-                      << " capturer can exist at a time.";
-  }
-  *stream = new CapturedStream(fd);
-}
-
-// Stops capturing the output stream and returns the captured string.
-std::string GetCapturedStream(CapturedStream** captured_stream) {
-  const std::string content = (*captured_stream)->GetCapturedString();
-
-  delete *captured_stream;
-  *captured_stream = NULL;
-
-  return content;
-}
-
-// Starts capturing stdout.
-void CaptureStdout() {
-  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
-}
-
-// Starts capturing stderr.
-void CaptureStderr() {
-  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
-}
-
-// Stops capturing stdout and returns the captured string.
-std::string GetCapturedStdout() {
-  return GetCapturedStream(&g_captured_stdout);
-}
-
-// Stops capturing stderr and returns the captured string.
-std::string GetCapturedStderr() {
-  return GetCapturedStream(&g_captured_stderr);
-}
-
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-
-#if GTEST_HAS_DEATH_TEST
-
-// A copy of all command line arguments.  Set by InitGoogleTest().
-::std::vector<testing::internal::string> g_argvs;
-
-static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
-                                        NULL;  // Owned.
-
-void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
-  if (g_injected_test_argvs != argvs)
-    delete g_injected_test_argvs;
-  g_injected_test_argvs = argvs;
-}
-
-const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
-  if (g_injected_test_argvs != NULL) {
-    return *g_injected_test_argvs;
-  }
-  return g_argvs;
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-#if GTEST_OS_WINDOWS_MOBILE
-namespace posix {
-void Abort() {
-  DebugBreak();
-  TerminateProcess(GetCurrentProcess(), 1);
-}
-}  // namespace posix
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-// Returns the name of the environment variable corresponding to the
-// given flag.  For example, FlagToEnvVar("foo") will return
-// "GTEST_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char* flag) {
-  const std::string full_flag =
-      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
-
-  Message env_var;
-  for (size_t i = 0; i != full_flag.length(); i++) {
-    env_var << ToUpper(full_flag.c_str()[i]);
-  }
-
-  return env_var.GetString();
-}
-
-// Parses 'str' for a 32-bit signed integer.  If successful, writes
-// the result to *value and returns true; otherwise leaves *value
-// unchanged and returns false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
-  // Parses the environment variable as a decimal integer.
-  char* end = NULL;
-  const long long_value = strtol(str, &end, 10);  // NOLINT
-
-  // Has strtol() consumed all characters in the string?
-  if (*end != '\0') {
-    // No - an invalid character was encountered.
-    Message msg;
-    msg << "WARNING: " << src_text
-        << " is expected to be a 32-bit integer, but actually"
-        << " has value \"" << str << "\".\n";
-    printf("%s", msg.GetString().c_str());
-    fflush(stdout);
-    return false;
-  }
-
-  // Is the parsed value in the range of an Int32?
-  const Int32 result = static_cast<Int32>(long_value);
-  if (long_value == LONG_MAX || long_value == LONG_MIN ||
-      // The parsed value overflows as a long.  (strtol() returns
-      // LONG_MAX or LONG_MIN when the input overflows.)
-      result != long_value
-      // The parsed value overflows as an Int32.
-      ) {
-    Message msg;
-    msg << "WARNING: " << src_text
-        << " is expected to be a 32-bit integer, but actually"
-        << " has value " << str << ", which overflows.\n";
-    printf("%s", msg.GetString().c_str());
-    fflush(stdout);
-    return false;
-  }
-
-  *value = result;
-  return true;
-}
-
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromGTestEnv(const char* flag, bool default_value) {
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  return string_value == NULL ?
-      default_value : strcmp(string_value, "0") != 0;
-}
-
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  if (string_value == NULL) {
-    // The environment variable is not set.
-    return default_value;
-  }
-
-  Int32 result = default_value;
-  if (!ParseInt32(Message() << "Environment variable " << env_var,
-                  string_value, &result)) {
-    printf("The default value %s is used.\n",
-           (Message() << default_value).GetString().c_str());
-    fflush(stdout);
-    return default_value;
-  }
-
-  return result;
-}
-
-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromGTestEnv(const char* flag, const char* default_value) {
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const value = posix::GetEnv(env_var.c_str());
-  return value == NULL ? default_value : value;
-}
-
-}  // namespace internal
-}  // namespace testing
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-// Google Test - The Google C++ Testing Framework
-//
-// This file implements a universal value printer that can print a
-// value of any type T:
-//
-//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
-//
-// It uses the << operator when possible, and prints the bytes in the
-// object otherwise.  A user can override its behavior for a class
-// type Foo by defining either operator<<(::std::ostream&, const Foo&)
-// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
-// defines Foo.
-
-#include <ctype.h>
-#include <stdio.h>
-#include <ostream>  // NOLINT
-#include <string>
-
-namespace testing {
-
-namespace {
-
-using ::std::ostream;
-
-// Prints a segment of bytes in the given object.
-void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
-                                size_t count, ostream* os) {
-  char text[5] = "";
-  for (size_t i = 0; i != count; i++) {
-    const size_t j = start + i;
-    if (i != 0) {
-      // Organizes the bytes into groups of 2 for easy parsing by
-      // human.
-      if ((j % 2) == 0)
-        *os << ' ';
-      else
-        *os << '-';
-    }
-    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
-    *os << text;
-  }
-}
-
-// Prints the bytes in the given value to the given ostream.
-void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
-                              ostream* os) {
-  // Tells the user how big the object is.
-  *os << count << "-byte object <";
-
-  const size_t kThreshold = 132;
-  const size_t kChunkSize = 64;
-  // If the object size is bigger than kThreshold, we'll have to omit
-  // some details by printing only the first and the last kChunkSize
-  // bytes.
-  // TODO(wan): let the user control the threshold using a flag.
-  if (count < kThreshold) {
-    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
-  } else {
-    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
-    *os << " ... ";
-    // Rounds up to 2-byte boundary.
-    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
-    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
-  }
-  *os << ">";
-}
-
-}  // namespace
-
-namespace internal2 {
-
-// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
-// given object.  The delegation simplifies the implementation, which
-// uses the << operator and thus is easier done outside of the
-// ::testing::internal namespace, which contains a << operator that
-// sometimes conflicts with the one in STL.
-void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
-                          ostream* os) {
-  PrintBytesInObjectToImpl(obj_bytes, count, os);
-}
-
-}  // namespace internal2
-
-namespace internal {
-
-// Depending on the value of a char (or wchar_t), we print it in one
-// of three formats:
-//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
-//   - as a hexidecimal escape sequence (e.g. '\x7F'), or
-//   - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
-  kAsIs,
-  kHexEscape,
-  kSpecialEscape
-};
-
-// Returns true if c is a printable ASCII character.  We test the
-// value of c directly instead of calling isprint(), which is buggy on
-// Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) {
-  return 0x20 <= c && c <= 0x7E;
-}
-
-// Prints a wide or narrow char c as a character literal without the
-// quotes, escaping it when necessary; returns how c was formatted.
-// The template argument UnsignedChar is the unsigned version of Char,
-// which is the type of c.
-template <typename UnsignedChar, typename Char>
-static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
-  switch (static_cast<wchar_t>(c)) {
-    case L'\0':
-      *os << "\\0";
-      break;
-    case L'\'':
-      *os << "\\'";
-      break;
-    case L'\\':
-      *os << "\\\\";
-      break;
-    case L'\a':
-      *os << "\\a";
-      break;
-    case L'\b':
-      *os << "\\b";
-      break;
-    case L'\f':
-      *os << "\\f";
-      break;
-    case L'\n':
-      *os << "\\n";
-      break;
-    case L'\r':
-      *os << "\\r";
-      break;
-    case L'\t':
-      *os << "\\t";
-      break;
-    case L'\v':
-      *os << "\\v";
-      break;
-    default:
-      if (IsPrintableAscii(c)) {
-        *os << static_cast<char>(c);
-        return kAsIs;
-      } else {
-        *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
-        return kHexEscape;
-      }
-  }
-  return kSpecialEscape;
-}
-
-// Prints a wchar_t c as if it's part of a string literal, escaping it when
-// necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
-  switch (c) {
-    case L'\'':
-      *os << "'";
-      return kAsIs;
-    case L'"':
-      *os << "\\\"";
-      return kSpecialEscape;
-    default:
-      return PrintAsCharLiteralTo<wchar_t>(c, os);
-  }
-}
-
-// Prints a char c as if it's part of a string literal, escaping it when
-// necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
-  return PrintAsStringLiteralTo(
-      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
-}
-
-// Prints a wide or narrow character c and its code.  '\0' is printed
-// as "'\\0'", other unprintable characters are also properly escaped
-// using the standard C++ escape sequence.  The template argument
-// UnsignedChar is the unsigned version of Char, which is the type of c.
-template <typename UnsignedChar, typename Char>
-void PrintCharAndCodeTo(Char c, ostream* os) {
-  // First, print c as a literal in the most readable form we can find.
-  *os << ((sizeof(c) > 1) ? "L'" : "'");
-  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
-  *os << "'";
-
-  // To aid user debugging, we also print c's code in decimal, unless
-  // it's 0 (in which case c was printed as '\\0', making the code
-  // obvious).
-  if (c == 0)
-    return;
-  *os << " (" << static_cast<int>(c);
-
-  // For more convenience, we print c's code again in hexidecimal,
-  // unless c was already printed in the form '\x##' or the code is in
-  // [1, 9].
-  if (format == kHexEscape || (1 <= c && c <= 9)) {
-    // Do nothing.
-  } else {
-    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
-  }
-  *os << ")";
-}
-
-void PrintTo(unsigned char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-void PrintTo(signed char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-
-// Prints a wchar_t as a symbol if it is printable or as its internal
-// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream* os) {
-  PrintCharAndCodeTo<wchar_t>(wc, os);
-}
-
-// Prints the given array of characters to the ostream.  CharType must be either
-// char or wchar_t.
-// The array starts at begin, the length is len, it may include '\0' characters
-// and may not be NUL-terminated.
-template <typename CharType>
-static void PrintCharsAsStringTo(
-    const CharType* begin, size_t len, ostream* os) {
-  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
-  *os << kQuoteBegin;
-  bool is_previous_hex = false;
-  for (size_t index = 0; index < len; ++index) {
-    const CharType cur = begin[index];
-    if (is_previous_hex && IsXDigit(cur)) {
-      // Previous character is of '\x..' form and this character can be
-      // interpreted as another hexadecimal digit in its number. Break string to
-      // disambiguate.
-      *os << "\" " << kQuoteBegin;
-    }
-    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
-  }
-  *os << "\"";
-}
-
-// Prints a (const) char/wchar_t array of 'len' elements, starting at address
-// 'begin'.  CharType must be either char or wchar_t.
-template <typename CharType>
-static void UniversalPrintCharArray(
-    const CharType* begin, size_t len, ostream* os) {
-  // The code
-  //   const char kFoo[] = "foo";
-  // generates an array of 4, not 3, elements, with the last one being '\0'.
-  //
-  // Therefore when printing a char array, we don't print the last element if
-  // it's '\0', such that the output matches the string literal as it's
-  // written in the source code.
-  if (len > 0 && begin[len - 1] == '\0') {
-    PrintCharsAsStringTo(begin, len - 1, os);
-    return;
-  }
-
-  // If, however, the last element in the array is not '\0', e.g.
-  //    const char kFoo[] = { 'f', 'o', 'o' };
-  // we must print the entire array.  We also print a message to indicate
-  // that the array is not NUL-terminated.
-  PrintCharsAsStringTo(begin, len, os);
-  *os << " (no terminating NUL)";
-}
-
-// Prints a (const) char array of 'len' elements, starting at address 'begin'.
-void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
-  UniversalPrintCharArray(begin, len, os);
-}
-
-// Prints a (const) wchar_t array of 'len' elements, starting at address
-// 'begin'.
-void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
-  UniversalPrintCharArray(begin, len, os);
-}
-
-// Prints the given C string to the ostream.
-void PrintTo(const char* s, ostream* os) {
-  if (s == NULL) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, strlen(s), os);
-  }
-}
-
-// MSVC compiler can be configured to define whar_t as a typedef
-// of unsigned short. Defining an overload for const wchar_t* in that case
-// would cause pointers to unsigned shorts be printed as wide strings,
-// possibly accessing more memory than intended and causing invalid
-// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
-// wchar_t is implemented as a native type.
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
-// Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t* s, ostream* os) {
-  if (s == NULL) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, wcslen(s), os);
-  }
-}
-#endif  // wchar_t is native
-
-// Prints a ::string object.
-#if GTEST_HAS_GLOBAL_STRING
-void PrintStringTo(const ::string& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
-}
-#endif  // GTEST_HAS_GLOBAL_STRING
-
-void PrintStringTo(const ::std::string& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
-}
-
-// Prints a ::wstring object.
-#if GTEST_HAS_GLOBAL_WSTRING
-void PrintWideStringTo(const ::wstring& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
-}
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-#if GTEST_HAS_STD_WSTRING
-void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-}  // namespace internal
-
-}  // namespace testing
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: mheule@google.com (Markus Heule)
-//
-// The Google C++ Testing Framework (Google Test)
-
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#undef GTEST_IMPLEMENTATION_
-
-namespace testing {
-
-using internal::GetUnitTestImpl;
-
-// Gets the summary of the failure message by omitting the stack trace
-// in it.
-std::string TestPartResult::ExtractSummary(const char* message) {
-  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
-  return stack_trace == NULL ? message :
-      std::string(message, stack_trace);
-}
-
-// Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
-  return os
-      << result.file_name() << ":" << result.line_number() << ": "
-      << (result.type() == TestPartResult::kSuccess ? "Success" :
-          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
-          "Non-fatal failure") << ":\n"
-      << result.message() << std::endl;
-}
-
-// Appends a TestPartResult to the array.
-void TestPartResultArray::Append(const TestPartResult& result) {
-  array_.push_back(result);
-}
-
-// Returns the TestPartResult at the given index (0-based).
-const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
-  if (index < 0 || index >= size()) {
-    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
-    internal::posix::Abort();
-  }
-
-  return array_[index];
-}
-
-// Returns the number of TestPartResult objects in the array.
-int TestPartResultArray::size() const {
-  return static_cast<int>(array_.size());
-}
-
-namespace internal {
-
-HasNewFatalFailureHelper::HasNewFatalFailureHelper()
-    : has_new_fatal_failure_(false),
-      original_reporter_(GetUnitTestImpl()->
-                         GetTestPartResultReporterForCurrentThread()) {
-  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
-}
-
-HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
-  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
-      original_reporter_);
-}
-
-void HasNewFatalFailureHelper::ReportTestPartResult(
-    const TestPartResult& result) {
-  if (result.fatally_failed())
-    has_new_fatal_failure_ = true;
-  original_reporter_->ReportTestPartResult(result);
-}
-
-}  // namespace internal
-
-}  // namespace testing
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-
-namespace testing {
-namespace internal {
-
-#if GTEST_HAS_TYPED_TEST_P
-
-// Skips to the first non-space char in str. Returns an empty string if str
-// contains only whitespace characters.
-static const char* SkipSpaces(const char* str) {
-  while (IsSpace(*str))
-    str++;
-  return str;
-}
-
-// Verifies that registered_tests match the test names in
-// defined_test_names_; returns registered_tests if successful, or
-// aborts the program otherwise.
-const char* TypedTestCasePState::VerifyRegisteredTestNames(
-    const char* file, int line, const char* registered_tests) {
-  typedef ::std::set<const char*>::const_iterator DefinedTestIter;
-  registered_ = true;
-
-  // Skip initial whitespace in registered_tests since some
-  // preprocessors prefix stringizied literals with whitespace.
-  registered_tests = SkipSpaces(registered_tests);
-
-  Message errors;
-  ::std::set<std::string> tests;
-  for (const char* names = registered_tests; names != NULL;
-       names = SkipComma(names)) {
-    const std::string name = GetPrefixUntilComma(names);
-    if (tests.count(name) != 0) {
-      errors << "Test " << name << " is listed more than once.\n";
-      continue;
-    }
-
-    bool found = false;
-    for (DefinedTestIter it = defined_test_names_.begin();
-         it != defined_test_names_.end();
-         ++it) {
-      if (name == *it) {
-        found = true;
-        break;
-      }
-    }
-
-    if (found) {
-      tests.insert(name);
-    } else {
-      errors << "No test named " << name
-             << " can be found in this test case.\n";
-    }
-  }
-
-  for (DefinedTestIter it = defined_test_names_.begin();
-       it != defined_test_names_.end();
-       ++it) {
-    if (tests.count(*it) == 0) {
-      errors << "You forgot to list test " << *it << ".\n";
-    }
-  }
-
-  const std::string& errors_str = errors.GetString();
-  if (errors_str != "") {
-    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
-            errors_str.c_str());
-    fflush(stderr);
-    posix::Abort();
-  }
-
-  return registered_tests;
-}
-
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
+#include "src/gtest-assertion-result.cc"
+#include "src/gtest-death-test.cc"
+#include "src/gtest-filepath.cc"
+#include "src/gtest-matchers.cc"
+#include "src/gtest-port.cc"
+#include "src/gtest-printers.cc"
+#include "src/gtest-test-part.cc"
+#include "src/gtest-typed-test.cc"
+#include "src/gtest.cc"
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc
new file mode 100644
index 0000000000..f1c0b10dc9
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc
@@ -0,0 +1,77 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file defines the AssertionResult type.
+
+#include "gtest/gtest-assertion-result.h"
+
+#include <string>
+#include <utility>
+
+#include "gtest/gtest-message.h"
+
+namespace testing {
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() != nullptr
+                   ? new ::std::string(*other.message_)
+                   : static_cast< ::std::string*>(nullptr)) {}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+  using std::swap;
+  swap(success_, other.success_);
+  swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult negation(!success_);
+  if (message_.get() != nullptr) negation << *message_;
+  return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() { return AssertionResult(true); }
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() { return AssertionResult(false); }
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc
new file mode 100644
index 0000000000..e6abc6278a
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc
@@ -0,0 +1,1620 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// This file implements death tests.
+
+#include "gtest/gtest-death-test.h"
+
+#include <functional>
+#include <utility>
+
+#include "gtest/internal/custom/gtest.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif  // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif  // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif  // GTEST_OS_FUCHSIA
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+//
+// This is defined in internal/gtest-port.h as "fast", but can be overridden by
+// a definition in internal/custom/gtest-port.h. The recommended value, which is
+// used internally at Google, is "threadsafe".
+static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+
+}  // namespace testing
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    testing::internal::StringFromGTestEnv("death_test_style",
+                                          testing::kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    testing::internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters.  This flag is specified if and only if the "
+    "current process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+
+namespace testing {
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+static bool g_in_fast_death_test_child = false;
+#endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process.  Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests.  IMPORTANT: This is an internal utility.  Using it may break the
+// implementation of death tests.  User code MUST NOT use it.
+bool InDeathTestChild() {
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  // On Windows and Fuchsia, death tests are thread-safe regardless of the value
+  // of the death_test_style flag.
+  return !GTEST_FLAG_GET(internal_run_death_test).empty();
+
+#else
+
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe")
+    return !GTEST_FLAG_GET(internal_run_death_test).empty();
+  else
+    return g_in_fast_death_test_child;
+#endif
+}
+
+}  // namespace internal
+
+// ExitedWithCode constructor.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
+
+// ExitedWithCode function-call operator.
+bool ExitedWithCode::operator()(int exit_status) const {
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  return exit_status == exit_code_;
+
+#else
+
+  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
+
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+}
+
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// KilledBySignal constructor.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
+
+// KilledBySignal function-call operator.
+bool KilledBySignal::operator()(int exit_status) const {
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+  {
+    bool result;
+    if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
+      return result;
+    }
+  }
+#endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
+}
+#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Generates a textual description of a given exit code, in the format
+// specified by wait(2).
+static std::string ExitSummary(int exit_code) {
+  Message m;
+
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  m << "Exited with exit status " << exit_code;
+
+#else
+
+  if (WIFEXITED(exit_code)) {
+    m << "Exited with exit status " << WEXITSTATUS(exit_code);
+  } else if (WIFSIGNALED(exit_code)) {
+    m << "Terminated by signal " << WTERMSIG(exit_code);
+  }
+#ifdef WCOREDUMP
+  if (WCOREDUMP(exit_code)) {
+    m << " (core dumped)";
+  }
+#endif
+#endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  return m.GetString();
+}
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+bool ExitedUnsuccessfully(int exit_status) {
+  return !ExitedWithCode(0)(exit_status);
+}
+
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+// Generates a textual failure message when a death test finds more than
+// one thread running, or cannot determine the number of threads, prior
+// to executing the given statement.  It is the responsibility of the
+// caller not to pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+  Message msg;
+  msg << "Death tests use fork(), which is unsafe particularly"
+      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count == 0) {
+    msg << "couldn't detect the number of threads.";
+  } else {
+    msg << "detected " << thread_count << " threads.";
+  }
+  msg << " See "
+         "https://github.com/google/googletest/blob/master/docs/"
+         "advanced.md#death-tests-and-threads"
+      << " for more explanation and suggested solutions, especially if"
+      << " this is the last message you see before your test times out.";
+  return msg.GetString();
+}
+#endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+
+// Flag characters for reporting a death test that did not die.
+static const char kDeathTestLived = 'L';
+static const char kDeathTestReturned = 'R';
+static const char kDeathTestThrew = 'T';
+static const char kDeathTestInternalError = 'I';
+
+#if GTEST_OS_FUCHSIA
+
+// File descriptor used for the pipe in the child process.
+static const int kFuchsiaReadPipeFd = 3;
+
+#endif
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude.  DIED means that the process died while executing the test
+// code; LIVED means that process lived beyond the end of the test code;
+// RETURNED means that the test statement attempted to execute a return
+// statement, which is not allowed; THREW means that the test statement
+// returned control by throwing an exception.  IN_PROGRESS means the test
+// has not yet concluded.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process.  Otherwise, the
+// message is simply printed to stderr.  In either case, the program
+// then exits with status 1.
+static void DeathTestAbort(const std::string& message) {
+  // On a POSIX system, this function may be called from a threadsafe-style
+  // death test child process, which operates on a very small stack.  Use
+  // the heap for any additional non-minuscule memory requirements.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != nullptr) {
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    fputc(kDeathTestInternalError, parent);
+    fprintf(parent, "%s", message.c_str());
+    fflush(parent);
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails.
+#define GTEST_DEATH_TEST_CHECK_(expression)                              \
+  do {                                                                   \
+    if (!::testing::internal::IsTrue(expression)) {                      \
+      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
+                     ", line " +                                         \
+                     ::testing::internal::StreamableToString(__LINE__) + \
+                     ": " + #expression);                                \
+    }                                                                    \
+  } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again.  The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR.  If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression)                      \
+  do {                                                                   \
+    int gtest_retval;                                                    \
+    do {                                                                 \
+      gtest_retval = (expression);                                       \
+    } while (gtest_retval == -1 && errno == EINTR);                      \
+    if (gtest_retval == -1) {                                            \
+      DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ +   \
+                     ", line " +                                         \
+                     ::testing::internal::StreamableToString(__LINE__) + \
+                     ": " + #expression + " != -1");                     \
+    }                                                                    \
+  } while (::testing::internal::AlwaysFalse())
+
+// Returns the message describing the last system error in errno.
+std::string GetLastErrnoDescription() {
+  return errno == 0 ? "" : posix::StrError(errno);
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor.  Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+  TestInfo* const info = GetUnitTestImpl()->current_test_info();
+  if (info == nullptr) {
+    DeathTestAbort(
+        "Cannot run a death test outside of a TEST or "
+        "TEST_F construct");
+  }
+}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char* statement,
+                       Matcher<const std::string&> matcher, const char* file,
+                       int line, DeathTest** test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, std::move(matcher), file, line, test);
+}
+
+const char* DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string& message) {
+  last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char* a_statement, Matcher<const std::string&> matcher)
+      : statement_(a_statement),
+        matcher_(std::move(matcher)),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason) override;
+  bool Passed(bool status_ok) override;
+
+  const char* statement() const { return statement_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_.  Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+  // Returns stderr output from the child process.
+  virtual std::string GetErrorLogs();
+
+ private:
+  // The textual content of the code this object is testing.  This class
+  // doesn't own this string and should not attempt to delete it.
+  const char* const statement_;
+  // A matcher that's expected to match the stderr output by the child process.
+  Matcher<const std::string&> matcher_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process.  It is
+  // always -1 in the child process.  The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process.  The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe.  So, here we write a single flag byte
+  // to the pipe, then exit.
+  const char status_ch = reason == TEST_DID_NOT_DIE       ? kDeathTestLived
+                         : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew
+                                                          : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (i.e.,
+  // when built as Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  ::std::string ret;
+  for (size_t at = 0;;) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             in the format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   matcher_: A matcher that's expected to match the stderr output by the child
+//             process.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true if and only if all of the above conditions are met.  Otherwise,
+// the first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned()) return false;
+
+  const std::string error_message = GetErrorLogs();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        if (matcher_.Matches(error_message)) {
+          success = true;
+        } else {
+          std::ostringstream stream;
+          matcher_.DescribeTo(&stream);
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << stream.str() << "\n"
+                 << "Actual msg:\n"
+                 << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n"
+               << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+#if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads child's output through the pipe (outcome code and
+//    any possible error messages) from the pipe, and its stderr and then
+//    determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+                   const char* file, int line)
+      : DeathTestImpl(a_statement, std::move(matcher)),
+        file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned()) return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()};
+  switch (::WaitForMultipleObjects(2, wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it haven't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+                          ::WaitForSingleObject(child_handle_.Get(), INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
+                                                 nullptr, TRUE};
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+                                       &handles_are_inheritable,
+                                       0)  // Default buffer size.
+                          != FALSE);
+  set_read_fd(
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,       // The event will automatically reset to non-signaled state.
+      FALSE,      // The initial state is non-signalled.
+      nullptr));  // The even is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  "filter=" + info->test_suite_name() + "." +
+                                  info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ +
+      "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) +
+      "|" + StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+      StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
+                                                                executable_path,
+                                                                _MAX_PATH));
+
+  std::string command_line = std::string(::GetCommandLineA()) + " " +
+                             filter_flag + " \"" + internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreateProcessA(
+          executable_path, const_cast<char*>(command_line.c_str()),
+          nullptr,  // Returned process handle is not inheritable.
+          nullptr,  // Returned thread handle is not inheritable.
+          TRUE,  // Child inherits all inheritable handles (for write_handle_).
+          0x0,   // Default creation flags.
+          nullptr,  // Inherit the parent's environment.
+          UnitTest::GetInstance()->original_working_dir(), &startup_info,
+          &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#elif GTEST_OS_FUCHSIA
+
+class FuchsiaDeathTest : public DeathTestImpl {
+ public:
+  FuchsiaDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+                   const char* file, int line)
+      : DeathTestImpl(a_statement, std::move(matcher)),
+        file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+  TestRole AssumeRole() override;
+  std::string GetErrorLogs() override;
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // The stderr data captured by the child process.
+  std::string captured_stderr_;
+
+  zx::process child_process_;
+  zx::channel exception_channel_;
+  zx::socket stderr_socket_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char* const* Argv() { return &args_[0]; }
+
+  int size() { return static_cast<int>(args_.size()) - 1; }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int FuchsiaDeathTest::Wait() {
+  const int kProcessKey = 0;
+  const int kSocketKey = 1;
+  const int kExceptionKey = 2;
+
+  if (!spawned()) return 0;
+
+  // Create a port to wait for socket/task/exception events.
+  zx_status_t status_zx;
+  zx::port port;
+  status_zx = zx::port::create(0, &port);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the child process to terminate.
+  status_zx =
+      child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the socket to be readable or closed.
+  status_zx = stderr_socket_.wait_async(
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for an exception.
+  status_zx = exception_channel_.wait_async(port, kExceptionKey,
+                                            ZX_CHANNEL_READABLE, 0);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  bool process_terminated = false;
+  bool socket_closed = false;
+  do {
+    zx_port_packet_t packet = {};
+    status_zx = port.wait(zx::time::infinite(), &packet);
+    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+    if (packet.key == kExceptionKey) {
+      // Process encountered an exception. Kill it directly rather than
+      // letting other handlers process the event. We will get a kProcessKey
+      // event when the process actually terminates.
+      status_zx = child_process_.kill();
+      GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+    } else if (packet.key == kProcessKey) {
+      // Process terminated.
+      GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+      GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED);
+      process_terminated = true;
+    } else if (packet.key == kSocketKey) {
+      GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
+      if (packet.signal.observed & ZX_SOCKET_READABLE) {
+        // Read data from the socket.
+        constexpr size_t kBufferSize = 1024;
+        do {
+          size_t old_length = captured_stderr_.length();
+          size_t bytes_read = 0;
+          captured_stderr_.resize(old_length + kBufferSize);
+          status_zx =
+              stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+                                  kBufferSize, &bytes_read);
+          captured_stderr_.resize(old_length + bytes_read);
+        } while (status_zx == ZX_OK);
+        if (status_zx == ZX_ERR_PEER_CLOSED) {
+          socket_closed = true;
+        } else {
+          GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
+          status_zx = stderr_socket_.wait_async(
+              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
+          GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+        }
+      } else {
+        GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED);
+        socket_closed = true;
+      }
+    }
+  } while (!process_terminated && !socket_closed);
+
+  ReadAndInterpretStatusByte();
+
+  zx_info_process_t buffer;
+  status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer),
+                                      nullptr, nullptr);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED);
+  set_status(static_cast<int>(buffer.return_code));
+  return status();
+}
+
+// The AssumeRole process for a Fuchsia death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(kFuchsiaReadPipeFd);
+    return EXECUTE_TEST;
+  }
+
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // Build the child process command line.
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  "filter=" + info->test_suite_name() + "." +
+                                  info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    kInternalRunDeathTestFlag + "=" + file_ +
+                                    "|" + StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index);
+  Arguments args;
+  args.AddArguments(GetInjectableArgvs());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  // Build the pipe for communication with the child.
+  zx_status_t status;
+  zx_handle_t child_pipe_handle;
+  int child_pipe_fd;
+  status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  set_read_fd(child_pipe_fd);
+
+  // Set the pipe handle for the child.
+  fdio_spawn_action_t spawn_actions[2] = {};
+  fdio_spawn_action_t* add_handle_action = &spawn_actions[0];
+  add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
+  add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
+  add_handle_action->h.handle = child_pipe_handle;
+
+  // Create a socket pair will be used to receive the child process' stderr.
+  zx::socket stderr_producer_socket;
+  status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+  int stderr_producer_fd = -1;
+  status =
+      fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+
+  // Make the stderr socket nonblocking.
+  GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
+
+  fdio_spawn_action_t* add_stderr_action = &spawn_actions[1];
+  add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
+  add_stderr_action->fd.local_fd = stderr_producer_fd;
+  add_stderr_action->fd.target_fd = STDERR_FILENO;
+
+  // Create a child job.
+  zx_handle_t child_job = ZX_HANDLE_INVALID;
+  status = zx_job_create(zx_job_default(), 0, &child_job);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  zx_policy_basic_t policy;
+  policy.condition = ZX_POL_NEW_ANY;
+  policy.policy = ZX_POL_ACTION_ALLOW;
+  status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+                             &policy, 1);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Create an exception channel attached to the |child_job|, to allow
+  // us to suppress the system default exception handler from firing.
+  status = zx_task_create_exception_channel(
+      child_job, 0, exception_channel_.reset_and_get_address());
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Spawn the child process.
+  status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+                          args.Argv(), nullptr, 2, spawn_actions,
+                          child_process_.reset_and_get_address(), nullptr);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
+
+#else  // We are neither on Windows, nor on Fuchsia.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char* statement, Matcher<const std::string&> matcher);
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement,
+                                   Matcher<const std::string&> matcher)
+    : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned()) return 0;
+
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher)
+      : ForkingDeathTest(a_statement, std::move(matcher)) {}
+  TestRole AssumeRole() override;
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of event listener API mush be shut
+    // down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
+                const char* file, int line)
+      : ForkingDeathTest(a_statement, std::move(matcher)),
+        file_(file),
+        line_(line) {}
+  TestRole AssumeRole() override;
+
+ private:
+  static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
+    ::std::vector<std::string> args = GetInjectableArgvs();
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    ::std::vector<std::string> extra_args =
+        GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
+    args.insert(args.end(), extra_args.begin(), extra_args.end());
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    return args;
+  }
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char* const* Argv() { return &args_[0]; }
+
+ private:
+  std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#if GTEST_OS_QNX
+extern "C" char** environ;
+#else   // GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execv() as it's almost a direct system call. We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execv() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execv(args->argv[0], args->argv);
+  DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " + GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#endif  // GTEST_OS_QNX
+
+#if GTEST_HAS_CLONE
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// correct answer.
+static void StackLowerThanAddress(const void* ptr,
+                                  bool* result) GTEST_NO_INLINE_;
+// Make sure sanitizers do not tamper with the stack here.
+// Ideally, we want to use `__builtin_frame_address` instead of a local variable
+// address with sanitizer disabled, but it does not work when the
+// compiler optimizes the stack frame out, which happens on PowerPC targets.
+// HWAddressSanitizer add a random tag to the MSB of the local variable address,
+// making comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static void StackLowerThanAddress(const void* ptr, bool* result) {
+  int dummy = 0;
+  *result = std::less<const void*>()(&dummy, ptr);
+}
+
+// Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static bool StackGrowsDown() {
+  int dummy = 0;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+#endif  // GTEST_HAS_CLONE
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test.  The
+// implementation uses fork(2) + exec.  On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe.  On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead.  The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = {argv, close_fd};
+  pid_t child_pid = -1;
+
+#if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = {0};
+  // spawn is a system call.
+  child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#else  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG_GET(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const auto stack_size = static_cast<size_t>(getpagesize() * 2);
+    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes:  For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about.  As far as I know there is no ABI with stack alignment greater
+    // than 64.  We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void* const stack_top =
+        static_cast<char*>(stack) +
+        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(
+        static_cast<size_t>(stack_size) > kMaxStackAlignment &&
+        reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#else
+  const bool use_fork = true;
+#endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+    ExecDeathTestChildMain(&args);
+    _exit(0);
+  }
+#endif  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, nullptr));
+#endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  "filter=" + info->test_suite_name() + "." +
+                                  info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    "internal_run_death_test=" + file_ + "|" +
+                                    StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index) + "|" +
+                                    StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement,
+                                     Matcher<const std::string&> matcher,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index =
+      impl->current_test_info()->increment_death_test_count();
+
+  if (flag != nullptr) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index) +
+          ") somehow exceeded expected maximum (" +
+          StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = nullptr;
+      return true;
+    }
+  }
+
+#if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+      GTEST_FLAG_GET(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#elif GTEST_OS_FUCHSIA
+
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+      GTEST_FLAG_GET(death_test_style) == "fast") {
+    *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#else
+
+  if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, std::move(matcher), file, line);
+  } else if (GTEST_FLAG_GET(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, std::move(matcher));
+  }
+
+#endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message("Unknown death test style \"" +
+                                           GTEST_FLAG_GET(death_test_style) +
+                                           "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handler.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+#endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
+  int write_fd = -1;
+
+#if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &parent_process_id) ||
+      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG_GET(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
+                                     event_handle_as_size_t);
+
+#elif GTEST_OS_FUCHSIA
+
+  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG_GET(internal_run_death_test));
+  }
+
+#else
+
+  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG_GET(internal_run_death_test));
+  }
+
+#endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc
new file mode 100644
index 0000000000..f6ee90cdb7
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc
@@ -0,0 +1,367 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-filepath.h"
+
+#include <stdlib.h>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_WINDOWS_MOBILE
+#include <windows.h>
+#elif GTEST_OS_WINDOWS
+#include <direct.h>
+#include <io.h>
+#else
+#include <limits.h>
+
+#include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif              // GTEST_OS_WINDOWS_MOBILE
+
+#include "gtest/internal/gtest-string.h"
+
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+#define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+#else
+const char kCurrentDirectoryString[] = ".\\";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+  return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE ||         \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \
+    GTEST_OS_XTENSA
+  // These platforms do not have a current directory, so we just return
+  // something reasonable.
+  return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+  char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
+#else
+  char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
+  char* result = getcwd(cwd, sizeof(cwd));
+#if GTEST_OS_NACL
+  // getcwd will likely fail in NaCl due to the sandbox, so return something
+  // reasonable. The user may have provided a shim implementation for getcwd,
+  // however, so fallback only when failure is detected.
+  return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
+#endif  // GTEST_OS_NACL
+  return FilePath(result == nullptr ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file"). If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+  const std::string dot_extension = std::string(".") + extension;
+  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
+    return FilePath(
+        pathname_.substr(0, pathname_.length() - dot_extension.length()));
+  }
+  return *this;
+}
+
+// Returns a pointer to the last occurrence of a valid path separator in
+// the FilePath. On Windows, for example, both '/' and '\' are valid path
+// separators. Returns NULL if no path separator was found.
+const char* FilePath::FindLastPathSeparator() const {
+  const char* const last_sep = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  // Comparing two pointers of which only one is NULL is undefined.
+  if (last_alt_sep != nullptr &&
+      (last_sep == nullptr || last_alt_sep > last_sep)) {
+    return last_alt_sep;
+  }
+#endif
+  return last_sep;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified. If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char* const last_sep = FindLastPathSeparator();
+  return last_sep ? FilePath(last_sep + 1) : *this;
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char* const last_sep = FindLastPathSeparator();
+  std::string dir;
+  if (last_sep) {
+    dir = std::string(c_str(), static_cast<size_t>(last_sep + 1 - c_str()));
+  } else {
+    dir = kCurrentDirectoryString;
+  }
+  return FilePath(dir);
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+                                const FilePath& base_name, int number,
+                                const char* extension) {
+  std::string file;
+  if (number == 0) {
+    file = base_name.string() + "." + extension;
+  } else {
+    file =
+        base_name.string() + "_" + StreamableToString(number) + "." + extension;
+  }
+  return ConcatPaths(directory, FilePath(file));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+                               const FilePath& relative_path) {
+  if (directory.IsEmpty()) return relative_path;
+  const FilePath dir(directory.RemoveTrailingPathSeparator());
+  return FilePath(dir.string() + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete[] unicode;
+  return attributes != kInvalidFileAttributes;
+#else
+  posix::StatStruct file_stat{};
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath& path(IsRootDirectory() ? *this
+                                         : RemoveTrailingPathSeparator());
+#else
+  const FilePath& path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete[] unicode;
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  posix::StatStruct file_stat{};
+  result =
+      posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Returns true if pathname describes a root directory. (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  return pathname_.length() == 3 && IsAbsolutePath();
+#else
+  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
+bool FilePath::IsAbsolutePath() const {
+  const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  return pathname_.length() >= 3 &&
+         ((name[0] >= 'a' && name[0] <= 'z') ||
+          (name[0] >= 'A' && name[0] <= 'Z')) &&
+         name[1] == ':' && IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
+// Returns a pathname for a file that does not currently exist. The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists. The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+                                          const FilePath& base_name,
+                                          const char* extension) {
+  FilePath full_pathname;
+  int number = 0;
+  do {
+    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
+  } while (full_pathname.FileOrDirectoryExists());
+  return full_pathname;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory. Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  return !pathname_.empty() &&
+         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists. Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!this->IsDirectory()) {
+    return false;
+  }
+
+  if (pathname_.length() == 0 || this->DirectoryExists()) {
+    return true;
+  }
+
+  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
+  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
+  delete[] unicode;
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
+  // do nothing
+  int result = 0;
+#else
+  int result = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+                       : *this;
+}
+
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+void FilePath::Normalize() {
+  auto out = pathname_.begin();
+
+  for (const char character : pathname_) {
+    if (!IsPathSeparator(character)) {
+      *(out++) = character;
+    } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) {
+      *(out++) = kPathSeparator;
+    } else {
+      continue;
+    }
+  }
+
+  pathname_.erase(out, pathname_.end());
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
new file mode 100644
index 0000000000..0b9e929c68
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
@@ -0,0 +1,1212 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+
+#ifndef _WIN32_WCE
+#include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_CAN_STREAM_RESULTS_
+#include <arpa/inet.h>  // NOLINT
+#include <netdb.h>      // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>  // NOLINT
+#endif                // GTEST_OS_WINDOWS
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace testing {
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true if and only if Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
+  const unsigned int raw_seed =
+      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+                              : static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) +
+      1;
+  return normalized_seed;
+}
+
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG_GET(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions);
+    color_ = GTEST_FLAG_GET(color);
+    death_test_style_ = GTEST_FLAG_GET(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork);
+    fail_fast_ = GTEST_FLAG_GET(fail_fast);
+    filter_ = GTEST_FLAG_GET(filter);
+    internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test);
+    list_tests_ = GTEST_FLAG_GET(list_tests);
+    output_ = GTEST_FLAG_GET(output);
+    brief_ = GTEST_FLAG_GET(brief);
+    print_time_ = GTEST_FLAG_GET(print_time);
+    print_utf8_ = GTEST_FLAG_GET(print_utf8);
+    random_seed_ = GTEST_FLAG_GET(random_seed);
+    repeat_ = GTEST_FLAG_GET(repeat);
+    recreate_environments_when_repeating_ =
+        GTEST_FLAG_GET(recreate_environments_when_repeating);
+    shuffle_ = GTEST_FLAG_GET(shuffle);
+    stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG_GET(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_);
+    GTEST_FLAG_SET(break_on_failure, break_on_failure_);
+    GTEST_FLAG_SET(catch_exceptions, catch_exceptions_);
+    GTEST_FLAG_SET(color, color_);
+    GTEST_FLAG_SET(death_test_style, death_test_style_);
+    GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_);
+    GTEST_FLAG_SET(filter, filter_);
+    GTEST_FLAG_SET(fail_fast, fail_fast_);
+    GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_);
+    GTEST_FLAG_SET(list_tests, list_tests_);
+    GTEST_FLAG_SET(output, output_);
+    GTEST_FLAG_SET(brief, brief_);
+    GTEST_FLAG_SET(print_time, print_time_);
+    GTEST_FLAG_SET(print_utf8, print_utf8_);
+    GTEST_FLAG_SET(random_seed, random_seed_);
+    GTEST_FLAG_SET(repeat, repeat_);
+    GTEST_FLAG_SET(recreate_environments_when_repeating,
+                   recreate_environments_when_repeating_);
+    GTEST_FLAG_SET(shuffle, shuffle_);
+    GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_);
+    GTEST_FLAG_SET(stream_result_to, stream_result_to_);
+    GTEST_FLAG_SET(throw_on_failure, throw_on_failure_);
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  bool fail_fast_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool brief_;
+  bool print_time_;
+  bool print_utf8_;
+  int32_t random_seed_;
+  int32_t repeat_;
+  bool recreate_environments_when_repeating_;
+  bool shuffle_;
+  int32_t stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as a 32-bit integer. If it is unset,
+// returns default_val. If it is not a 32-bit integer, prints an error and
+// and aborts.
+GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard. The test id
+// is some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+                                     int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (auto it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it)) ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value
+                                                    : v[static_cast<size_t>(i)];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected =
+        begin +
+        static_cast<int>(random->Generate(static_cast<uint32_t>(range_width)));
+    std::swap((*v)[static_cast<size_t>(selected)],
+              (*v)[static_cast<size_t>(last_in_range)]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+  // Returns true if and only if the test name of test property matches on key_.
+  bool operator()(const TestProperty& test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true if and only if the user-specified filter matches the test
+  // suite name and the test name.
+  static bool FilterMatchesTest(const std::string& test_suite_name,
+                                const std::string& test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete;
+  OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) =
+      delete;
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() {}
+
+  std::string CurrentStackTrace(int max_depth, int skip_count) override;
+  void UponLeavingGTest() override;
+
+ private:
+#if GTEST_HAS_ABSL
+  Mutex mutex_;  // Protects all internal state.
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to the stack trace code from within the user code.
+  void* caller_frame_ = nullptr;
+#endif  // GTEST_HAS_ABSL
+
+  OsStackTraceGetter(const OsStackTraceGetter&) = delete;
+  OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete;
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char* file;
+  int line;
+  std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  void ReportTestPartResult(const TestPartResult& result) override;
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  DefaultGlobalTestPartResultReporter(
+      const DefaultGlobalTestPartResultReporter&) = delete;
+  DefaultGlobalTestPartResultReporter& operator=(
+      const DefaultGlobalTestPartResultReporter&) = delete;
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  void ReportTestPartResult(const TestPartResult& result) override;
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  DefaultPerThreadTestPartResultReporter(
+      const DefaultPerThreadTestPartResultReporter&) = delete;
+  DefaultPerThreadTestPartResultReporter& operator=(
+      const DefaultPerThreadTestPartResultReporter&) = delete;
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own repoter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result repoter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
+
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
+
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
+
+  // Gets the number of all test suites that contain at least one test
+  // that should run.
+  int test_suite_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  const TestSuite* GetTestSuite(int i) const {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
+  }
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase* GetTestCase(int i) const { return GetTestSuite(i); }
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Gets the i-th test suite among all the test suites. i can range from 0 to
+  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
+  TestSuite* GetMutableSuiteCase(int i) {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count)
+      GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_;
+
+  // Finds and returns a TestSuite with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_suite_name: name of the test suite
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
+  TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
+                          internal::SetUpTestSuiteFunc set_up_tc,
+                          internal::TearDownTestSuiteFunc tear_down_tc);
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  TestCase* GetTestCase(const char* test_case_name, const char* type_param,
+                        internal::SetUpTestSuiteFunc set_up_tc,
+                        internal::TearDownTestSuiteFunc tear_down_tc) {
+    return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
+  }
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test suite
+  //   tear_down_tc: pointer to the function that tears down the test suite
+  //   test_info:    the TestInfo object
+  void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
+                   internal::TearDownTestSuiteFunc tear_down_tc,
+                   TestInfo* test_info) {
+#if GTEST_HAS_DEATH_TEST
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+#endif  // GTEST_HAS_DEATH_TEST
+
+    GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
+                 set_up_tc, tear_down_tc)
+        ->AddTestInfo(test_info);
+  }
+
+  // Returns ParameterizedTestSuiteRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+
+  std::set<std::string>* ignored_parameterized_test_suites() {
+    return &ignored_parameterized_test_suites_;
+  }
+
+  // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
+  // type-parameterized tests and instantiations of them.
+  internal::TypeParameterizedTestSuiteRegistry&
+  type_parameterized_test_registry() {
+    return type_parameterized_test_registry_;
+  }
+
+  // Sets the TestSuite object for the test that's currently running.
+  void set_current_test_suite(TestSuite* a_current_test_suite) {
+    current_test_suite_ = a_current_test_suite;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more then once; it has guards
+  // protecting from registering the tests more then once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test suite, or to the global property set. If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty& test_property);
+
+  enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestSuite and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestSuite* current_test_suite() const { return current_test_suite_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test suites, and the tests within each test suite,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test suites and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestSuites in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestSuite*> test_suites_;
+
+  // Provides a level of indirection for the test suite list to allow
+  // easy shuffling and restoring the test suite order.  The i-th
+  // element of this vector is the index of the i-th test suite in the
+  // shuffled order.
+  std::vector<int> test_suite_indices_;
+
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
+  internal::TypeParameterizedTestSuiteRegistry
+      type_parameterized_test_registry_;
+
+  // The set holding the name of parameterized
+  // test suites that may go uninstantiated.
+  std::set<std::string> ignored_parameterized_test_suites_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+
+  // Index of the last death test suite registered.  Initially -1.
+  int last_death_test_suite_;
+
+  // This points to the TestSuite for the currently running test.  It
+  // changes as Google Test goes through one test suite after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestSuite* current_test_suite_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is current running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True if and only if PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  std::unique_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  std::unique_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  UnitTestImpl(const UnitTestImpl&) = delete;
+  UnitTestImpl& operator=(const UnitTestImpl&) = delete;
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+                                              char repeat, const char* regex,
+                                              const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter.  Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+  using BiggestConvertible = unsigned long long;  // NOLINT
+
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);  // NOLINT
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing. This class allow our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const std::string& xml_element,
+                             const TestProperty& property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const std::string& message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const std::string& message) { Send(message + "\n"); }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const std::string& host, const std::string& port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    ~SocketWriter() override {
+      if (sockfd_ != -1) CloseConnection();
+    }
+
+    // Sends a string to the socket.
+    void Send(const std::string& message) override {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const auto len = static_cast<size_t>(message.length());
+      if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
+        GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+                            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() override {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const std::string host_name_;
+    const std::string port_num_;
+
+    SocketWriter(const SocketWriter&) = delete;
+    SocketWriter& operator=(const SocketWriter&) = delete;
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static std::string UrlEncode(const char* str);
+
+  StreamingListener(const std::string& host, const std::string& port)
+      : socket_writer_(new SocketWriter(host, port)) {
+    Start();
+  }
+
+  explicit StreamingListener(AbstractSocketWriter* socket_writer)
+      : socket_writer_(socket_writer) {
+    Start();
+  }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) override {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) override {
+    // Note that Google Test current only report elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */,
+                            int iteration) override {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test,
+                          int /* iteration */) override {
+    SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+           "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+           "ms");
+  }
+
+  // Note that "event=TestCaseStart" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestSuiteStart(const TestSuite& test_suite) override {
+    SendLn(std::string("event=TestCaseStart&name=") + test_suite.name());
+  }
+
+  // Note that "event=TestCaseEnd" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestSuiteEnd(const TestSuite& test_suite) override {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) +
+           "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) +
+           "ms");
+  }
+
+  void OnTestStart(const TestInfo& test_info) override {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo& test_info) override {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult& test_part_result) override {
+    const char* file_name = test_part_result.file_name();
+    if (file_name == nullptr) file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const std::string& message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  std::string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const std::unique_ptr<AbstractSocketWriter> socket_writer_;
+
+  StreamingListener(const StreamingListener&) = delete;
+  StreamingListener& operator=(const StreamingListener&) = delete;
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
+
+#endif  // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc
new file mode 100644
index 0000000000..7e3bcc0cff
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc
@@ -0,0 +1,98 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+#include "gtest/gtest-matchers.h"
+
+#include <string>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string&>::Matcher(const std::string& s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string&>::Matcher(const char* s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const std::string& s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const char* s) { *this = Eq(std::string(s)); }
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView&>::Matcher(const std::string& s) {
+  *this = Eq(s);
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView&>::Matcher(const char* s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView&>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const std::string& s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const char* s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc
new file mode 100644
index 0000000000..d797fe4d58
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc
@@ -0,0 +1,1394 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cstdint>
+#include <fstream>
+#include <memory>
+
+#if GTEST_OS_WINDOWS
+#include <io.h>
+#include <sys/stat.h>
+#include <windows.h>
+
+#include <map>  // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif  // _MSC_VER
+#else
+#include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD || GTEST_OS_OPENBSD
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
+#endif
+
+#if GTEST_OS_QNX
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+#include <procinfo.h>
+#include <sys/types.h>
+#endif  // GTEST_OS_AIX
+
+#if GTEST_OS_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
+#endif  // GTEST_OS_FUCHSIA
+
+#include "gtest/gtest-message.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD
+
+namespace {
+template <typename T>
+T ReadProcFileField(const std::string& filename, int field) {
+  std::string dummy;
+  std::ifstream file(filename.c_str());
+  while (field-- > 0) {
+    file >> dummy;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+}  // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+  const std::string filename =
+      (Message() << "/proc/" << getpid() << "/stat").GetString();
+  return ReadProcFileField<size_t>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD
+
+#if GTEST_OS_NETBSD
+#undef KERN_PROC
+#define KERN_PROC KERN_PROC2
+#define kinfo_proc kinfo_proc2
+#endif
+
+#if GTEST_OS_DRAGONFLY
+#define KP_NLWP(kp) (kp.kp_nthreads)
+#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#define KP_NLWP(kp) (kp.ki_numthreads)
+#elif GTEST_OS_NETBSD
+#define KP_NLWP(kp) (kp.p_nlwps)
+#endif
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID,
+    getpid(),
+#if GTEST_OS_NETBSD
+    sizeof(struct kinfo_proc),
+    1,
+#endif
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+  struct kinfo_proc info;
+  size_t size = sizeof(info);
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+  return static_cast<size_t>(KP_NLWP(info));
+}
+#elif GTEST_OS_OPENBSD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+      CTL_KERN,
+      KERN_PROC,
+      KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+      getpid(),
+      sizeof(struct kinfo_proc),
+      0,
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+
+  // get number of structs
+  size_t size;
+  if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
+    return 0;
+  }
+
+  mib[5] = static_cast<int>(size / static_cast<size_t>(mib[4]));
+
+  // populate array of structs
+  struct kinfo_proc info[mib[5]];
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+
+  // exclude empty members
+  size_t nthreads = 0;
+  for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
+    if (info[i].p_tid != -1) nthreads++;
+  }
+  return nthreads;
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr);
+  close(fd);
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_AIX
+
+size_t GetThreadCount() {
+  struct procentry64 entry;
+  pid_t pid = getpid();
+  int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1);
+  if (status == 1) {
+    return entry.pi_thcount;
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_FUCHSIA
+
+size_t GetThreadCount() {
+  int dummy_buffer;
+  size_t avail;
+  zx_status_t status =
+      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+                         &dummy_buffer, 0, nullptr, &avail);
+  if (status == ZX_OK) {
+    return avail;
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
+
+AutoHandle::~AutoHandle() { Reset(); }
+
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
+
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
+
+void AutoHandle::Reset(HANDLE handle) {
+  // Resetting with the same handle we already own is invalid.
+  if (handle_ != handle) {
+    if (IsCloseable()) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  } else {
+    GTEST_CHECK_(!IsCloseable())
+        << "Resetting a valid handle to itself is likely a programmer error "
+           "and thus not allowed.";
+  }
+}
+
+bool AutoHandle::IsCloseable() const {
+  // Different Windows APIs may use either of these values to represent an
+  // invalid handle.
+  return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
+}
+
+Mutex::Mutex()
+    : owner_thread_id_(0),
+      type_(kDynamic),
+      critical_section_init_phase_(0),
+      critical_section_(new CRITICAL_SECTION) {
+  ::InitializeCriticalSection(critical_section_);
+}
+
+Mutex::~Mutex() {
+  // Static mutexes are leaked intentionally. It is not thread-safe to try
+  // to clean them up.
+  if (type_ == kDynamic) {
+    ::DeleteCriticalSection(critical_section_);
+    delete critical_section_;
+    critical_section_ = nullptr;
+  }
+}
+
+void Mutex::Lock() {
+  ThreadSafeLazyInit();
+  ::EnterCriticalSection(critical_section_);
+  owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+void Mutex::Unlock() {
+  ThreadSafeLazyInit();
+  // We don't protect writing to owner_thread_id_ here, as it's the
+  // caller's responsibility to ensure that the current thread holds the
+  // mutex when this is called.
+  owner_thread_id_ = 0;
+  ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
+void Mutex::AssertHeld() {
+  ThreadSafeLazyInit();
+  GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId())
+      << "The current thread is not holding the mutex @" << this;
+}
+
+namespace {
+
+#ifdef _MSC_VER
+// Use the RAII idiom to flag mem allocs that are intentionally never
+// deallocated. The motivation is to silence the false positive mem leaks
+// that are reported by the debug version of MS's CRT which can only detect
+// if an alloc is missing a matching deallocation.
+// Example:
+//    MemoryIsNotDeallocated memory_is_not_deallocated;
+//    critical_section_ = new CRITICAL_SECTION;
+//
+class MemoryIsNotDeallocated {
+ public:
+  MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
+    old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
+    // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
+    // doesn't report mem leak if there's no matching deallocation.
+    (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+  }
+
+  ~MemoryIsNotDeallocated() {
+    // Restore the original _CRTDBG_ALLOC_MEM_DF flag
+    (void)_CrtSetDbgFlag(old_crtdbg_flag_);
+  }
+
+ private:
+  int old_crtdbg_flag_;
+
+  MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete;
+  MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete;
+};
+#endif  // _MSC_VER
+
+}  // namespace
+
+// Initializes owner_thread_id_ and critical_section_ in static mutexes.
+void Mutex::ThreadSafeLazyInit() {
+  // Dynamic mutexes are initialized in the constructor.
+  if (type_ == kStatic) {
+    switch (
+        ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) {
+      case 0:
+        // If critical_section_init_phase_ was 0 before the exchange, we
+        // are the first to test it and need to perform the initialization.
+        owner_thread_id_ = 0;
+        {
+          // Use RAII to flag that following mem alloc is never deallocated.
+#ifdef _MSC_VER
+          MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+          critical_section_ = new CRITICAL_SECTION;
+        }
+        ::InitializeCriticalSection(critical_section_);
+        // Updates the critical_section_init_phase_ to 2 to signal
+        // initialization complete.
+        GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+                                                  2L, 1L) == 1L);
+        break;
+      case 1:
+        // Somebody else is already initializing the mutex; spin until they
+        // are done.
+        while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
+                                            2L) != 2L) {
+          // Possibly yields the rest of the thread's time slice to other
+          // threads.
+          ::Sleep(0);
+        }
+        break;
+
+      case 2:
+        break;  // The mutex is already initialized and ready for use.
+
+      default:
+        GTEST_CHECK_(false)
+            << "Unexpected value of critical_section_init_phase_ "
+            << "while initializing a static mutex.";
+    }
+  }
+}
+
+namespace {
+
+class ThreadWithParamSupport : public ThreadWithParamBase {
+ public:
+  static HANDLE CreateThread(Runnable* runnable,
+                             Notification* thread_can_start) {
+    ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
+    DWORD thread_id;
+    HANDLE thread_handle = ::CreateThread(
+        nullptr,  // Default security.
+        0,        // Default stack size.
+        &ThreadWithParamSupport::ThreadMain,
+        param,        // Parameter to ThreadMainStatic
+        0x0,          // Default creation flags.
+        &thread_id);  // Need a valid pointer for the call to work under Win98.
+    GTEST_CHECK_(thread_handle != nullptr)
+        << "CreateThread failed with error " << ::GetLastError() << ".";
+    if (thread_handle == nullptr) {
+      delete param;
+    }
+    return thread_handle;
+  }
+
+ private:
+  struct ThreadMainParam {
+    ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
+        : runnable_(runnable), thread_can_start_(thread_can_start) {}
+    std::unique_ptr<Runnable> runnable_;
+    // Does not own.
+    Notification* thread_can_start_;
+  };
+
+  static DWORD WINAPI ThreadMain(void* ptr) {
+    // Transfers ownership.
+    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
+    if (param->thread_can_start_ != nullptr)
+      param->thread_can_start_->WaitForNotification();
+    param->runnable_->Run();
+    return 0;
+  }
+
+  // Prohibit instantiation.
+  ThreadWithParamSupport();
+
+  ThreadWithParamSupport(const ThreadWithParamSupport&) = delete;
+  ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete;
+};
+
+}  // namespace
+
+ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable,
+                                         Notification* thread_can_start)
+    : thread_(
+          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
+
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
+
+void ThreadWithParamBase::Join() {
+  GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
+      << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits.  A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+      const ThreadLocalBase* thread_local_instance) {
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    DWORD current_thread = ::GetCurrentThreadId();
+    MutexLock lock(&mutex_);
+    ThreadIdToThreadLocals* const thread_to_thread_locals =
+        GetThreadLocalsMapLocked();
+    ThreadIdToThreadLocals::iterator thread_local_pos =
+        thread_to_thread_locals->find(current_thread);
+    if (thread_local_pos == thread_to_thread_locals->end()) {
+      thread_local_pos =
+          thread_to_thread_locals
+              ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+              .first;
+      StartWatcherThreadFor(current_thread);
+    }
+    ThreadLocalValues& thread_local_values = thread_local_pos->second;
+    ThreadLocalValues::iterator value_pos =
+        thread_local_values.find(thread_local_instance);
+    if (value_pos == thread_local_values.end()) {
+      value_pos =
+          thread_local_values
+              .insert(std::make_pair(
+                  thread_local_instance,
+                  std::shared_ptr<ThreadLocalValueHolderBase>(
+                      thread_local_instance->NewValueForCurrentThread())))
+              .first;
+    }
+    return value_pos->second.get();
+  }
+
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase* thread_local_instance) {
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadLocalValues data structure while holding the lock, but
+    // defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      for (ThreadIdToThreadLocals::iterator it =
+               thread_to_thread_locals->begin();
+           it != thread_to_thread_locals->end(); ++it) {
+        ThreadLocalValues& thread_local_values = it->second;
+        ThreadLocalValues::iterator value_pos =
+            thread_local_values.find(thread_local_instance);
+        if (value_pos != thread_local_values.end()) {
+          value_holders.push_back(value_pos->second);
+          thread_local_values.erase(value_pos);
+          // This 'if' can only be successful at most once, so theoretically we
+          // could break out of the loop here, but we don't bother doing so.
+        }
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+  static void OnThreadExit(DWORD thread_id) {
+    GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadIdToThreadLocals data structure while holding the
+    // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals* const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      ThreadIdToThreadLocals::iterator thread_local_pos =
+          thread_to_thread_locals->find(thread_id);
+      if (thread_local_pos != thread_to_thread_locals->end()) {
+        ThreadLocalValues& thread_local_values = thread_local_pos->second;
+        for (ThreadLocalValues::iterator value_pos =
+                 thread_local_values.begin();
+             value_pos != thread_local_values.end(); ++value_pos) {
+          value_holders.push_back(value_pos->second);
+        }
+        thread_to_thread_locals->erase(thread_local_pos);
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+ private:
+  // In a particular thread, maps a ThreadLocal object to its value.
+  typedef std::map<const ThreadLocalBase*,
+                   std::shared_ptr<ThreadLocalValueHolderBase> >
+      ThreadLocalValues;
+  // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
+  // thread's ID.
+  typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+  // Holds the thread id and thread handle that we pass from
+  // StartWatcherThreadFor to WatcherThreadFunc.
+  typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+  static void StartWatcherThreadFor(DWORD thread_id) {
+    // The returned handle will be kept in thread_map and closed by
+    // watcher_thread in WatcherThreadFunc.
+    HANDLE thread =
+        ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
+    GTEST_CHECK_(thread != nullptr);
+    // We need to pass a valid thread ID pointer into CreateThread for it
+    // to work correctly under Win98.
+    DWORD watcher_thread_id;
+    HANDLE watcher_thread = ::CreateThread(
+        nullptr,  // Default security.
+        0,        // Default stack size
+        &ThreadLocalRegistryImpl::WatcherThreadFunc,
+        reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+        CREATE_SUSPENDED, &watcher_thread_id);
+    GTEST_CHECK_(watcher_thread != nullptr)
+        << "CreateThread failed with error " << ::GetLastError() << ".";
+    // Give the watcher thread the same priority as ours to avoid being
+    // blocked by it.
+    ::SetThreadPriority(watcher_thread,
+                        ::GetThreadPriority(::GetCurrentThread()));
+    ::ResumeThread(watcher_thread);
+    ::CloseHandle(watcher_thread);
+  }
+
+  // Monitors exit from a given thread and notifies those
+  // ThreadIdToThreadLocals about thread termination.
+  static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+    const ThreadIdAndHandle* tah =
+        reinterpret_cast<const ThreadIdAndHandle*>(param);
+    GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    OnThreadExit(tah->first);
+    ::CloseHandle(tah->second);
+    delete tah;
+    return 0;
+  }
+
+  // Returns map of thread local instances.
+  static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
+    mutex_.AssertHeld();
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals();
+    return map;
+  }
+
+  // Protects access to GetThreadLocalsMapLocked() and its return value.
+  static Mutex mutex_;
+  // Protects access to GetThreadMapLocked() and its return value.
+  static Mutex thread_map_mutex_;
+};
+
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);  // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(
+    Mutex::kStaticMutex);  // NOLINT
+
+ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
+    const ThreadLocalBase* thread_local_instance) {
+  return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+      thread_local_instance);
+}
+
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+    const ThreadLocalBase* thread_local_instance) {
+  ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif  // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regex's are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized can may cause trouble when it's
+  // freed.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true if and only if ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char* str) {
+  return ch != '\0' && strchr(str, ch) != nullptr;
+}
+
+// Returns true if and only if ch belongs to the given classification.
+// Unlike similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+         ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true if and only if "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true if and only if the given atom (specified by escaped and
+// pattern) matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd':
+        return IsAsciiDigit(ch);
+      case 'D':
+        return !IsAsciiDigit(ch);
+      case 'f':
+        return ch == '\f';
+      case 'n':
+        return ch == '\n';
+      case 'r':
+        return ch == '\r';
+      case 's':
+        return IsAsciiWhiteSpace(ch);
+      case 'S':
+        return !IsAsciiWhiteSpace(ch);
+      case 't':
+        return ch == '\t';
+      case 'v':
+        return ch == '\v';
+      case 'w':
+        return IsAsciiWordChar(ch);
+      case 'W':
+        return !IsAsciiWordChar(ch);
+    }
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+static std::string FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+                    << " in simple regular expression \"" << regex << "\": ")
+      .GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char* regex) {
+  if (regex == nullptr) {
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True if and only if ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+                                   const char* regex, const char* str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
+  }
+  return false;
+}
+
+// Returns true if and only if regex matches a prefix of str. regex must
+// be a valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$') return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped) ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+                                         str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+           MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true if and only if regex matches any substring of str.  regex must
+// be a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's must faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == nullptr || str == nullptr) return false;
+
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str)) return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  free(const_cast<char*>(pattern_));
+  free(const_cast<char*>(full_pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = full_pattern_ = nullptr;
+  if (regex != nullptr) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char* buffer = static_cast<char*>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
+
+  if (line < 0) {
+    return file_name + ":";
+  }
+#ifdef _MSC_VER
+  return file_name + "(" + StreamableToString(line) + "):";
+#else
+  return file_name + ":" + StreamableToString(line) + ":";
+#endif  // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+                                                               int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
+
+  if (line < 0)
+    return file_name;
+  else
+    return file_name + ":" + StreamableToString(line);
+}
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* const marker = severity == GTEST_INFO      ? "[  INFO ]"
+                             : severity == GTEST_WARNING ? "[WARNING]"
+                             : severity == GTEST_ERROR   ? "[ ERROR ]"
+                                                         : "[ FATAL ]";
+  GetStream() << ::std::endl
+              << marker << " " << FormatFileLocation(file, line).c_str()
+              << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+#if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = {'\0'};   // NOLINT
+    char temp_file_path[MAX_PATH + 1] = {'\0'};  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1)
+        << "Unable to open temporary file " << temp_file_path;
+    filename_ = temp_file_path;
+#else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in a temporary directory.
+    std::string name_template;
+
+#if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory. However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code. Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /data/local/tmp is directly accessible from native code.
+    // '/sdcard' and other variants cannot be relied on, as they are not
+    // guaranteed to be mounted, or may have a delay in mounting.
+    name_template = "/data/local/tmp/";
+#elif GTEST_OS_IOS
+    char user_temp_dir[PATH_MAX + 1];
+
+    // Documented alternative to NSTemporaryDirectory() (for obtaining creating
+    // a temporary directory) at
+    // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10
+    //
+    // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not
+    // documented in the confstr() man page at
+    // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr
+    // but are still available, according to the WebKit patches at
+    // https://trac.webkit.org/changeset/262004/webkit
+    // https://trac.webkit.org/changeset/263705/webkit
+    //
+    // The confstr() implementation falls back to getenv("TMPDIR"). See
+    // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html
+    ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir));
+
+    name_template = user_temp_dir;
+    if (name_template.back() != GTEST_PATH_SEP_[0])
+      name_template.push_back(GTEST_PATH_SEP_[0]);
+#else
+    name_template = "/tmp/";
+#endif
+    name_template.append("gtest_captured_stream.XXXXXX");
+
+    // mkstemp() modifies the string bytes in place, and does not go beyond the
+    // string's length. This results in well-defined behavior in C++17.
+    //
+    // The const_cast is needed below C++17. The constraints on std::string
+    // implementations in C++11 and above make assumption behind the const_cast
+    // fairly safe.
+    const int captured_fd = ::mkstemp(const_cast<char*>(name_template.data()));
+    if (captured_fd == -1) {
+      GTEST_LOG_(WARNING)
+          << "Failed to create tmp file " << name_template
+          << " for test; does the test have access to the /tmp directory?";
+    }
+    filename_ = std::move(name_template);
+#endif  // GTEST_OS_WINDOWS
+    fflush(nullptr);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  ~CapturedStream() { remove(filename_.c_str()); }
+
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(nullptr);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    if (file == nullptr) {
+      GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_
+                        << " for capturing stream.";
+    }
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the stderr output.
+  ::std::string filename_;
+
+  CapturedStream(const CapturedStream&) = delete;
+  CapturedStream& operator=(const CapturedStream&) = delete;
+};
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+static CapturedStream* g_captured_stderr = nullptr;
+static CapturedStream* g_captured_stdout = nullptr;
+
+// Starts capturing an output stream (stdout/stderr).
+static void CaptureStream(int fd, const char* stream_name,
+                          CapturedStream** stream) {
+  if (*stream != nullptr) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+static std::string GetCapturedStream(CapturedStream** captured_stream) {
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = nullptr;
+
+  return content;
+}
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+size_t GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+std::string ReadEntireFile(FILE* file) {
+  const size_t file_size = GetFileSize(file);
+  char* const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read =
+        fread(buffer + bytes_read, 1, file_size - bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const std::string content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+#if GTEST_HAS_DEATH_TEST
+static const std::vector<std::string>* g_injected_test_argvs =
+    nullptr;  // Owned.
+
+std::vector<std::string> GetInjectableArgvs() {
+  if (g_injected_test_argvs != nullptr) {
+    return *g_injected_test_argvs;
+  }
+  return GetArgvs();
+}
+
+void SetInjectableArgvs(const std::vector<std::string>* new_argvs) {
+  if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
+  g_injected_test_argvs = new_argvs;
+}
+
+void SetInjectableArgvs(const std::vector<std::string>& new_argvs) {
+  SetInjectableArgvs(
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
+}
+
+void ClearInjectableArgvs() {
+  delete g_injected_test_argvs;
+  g_injected_test_argvs = nullptr;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
+  // Parses the environment variable as a decimal integer.
+  char* end = nullptr;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an int32_t?
+  const auto result = static_cast<int32_t>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an int32_t.
+  ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true if and only if it's not "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+#if defined(GTEST_GET_BOOL_FROM_ENV_)
+  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == nullptr ? default_value
+                                 : strcmp(string_value, "0") != 0;
+#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
+#if defined(GTEST_GET_INT32_FROM_ENV_)
+  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == nullptr) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  int32_t result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+                  &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+}
+
+// As a special case for the 'output' flag, if GTEST_OUTPUT is not
+// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+// system.  The value of XML_OUTPUT_FILE is a filename without the
+// "xml:" prefix of GTEST_OUTPUT.
+// Note that this is meant to be called at the call site so it does
+// not check that the flag is 'output'
+// In essence this checks an env variable called XML_OUTPUT_FILE
+// and if it is set we prepend "xml:" to its value, if it not set we return ""
+std::string OutputFlagAlsoCheckEnvVar() {
+  std::string default_value_for_output_flag = "";
+  const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  if (nullptr != xml_output_file_env) {
+    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
+  }
+  return default_value_for_output_flag;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+#if defined(GTEST_GET_STRING_FROM_ENV_)
+  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = posix::GetEnv(env_var.c_str());
+  return value == nullptr ? default_value : value;
+#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc
new file mode 100644
index 0000000000..f3976d230d
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc
@@ -0,0 +1,553 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.  A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include "gtest/gtest-printers.h"
+
+#include <stdio.h>
+
+#include <cctype>
+#include <cstdint>
+#include <cwchar>
+#include <ostream>  // NOLINT
+#include <string>
+#include <type_traits>
+
+#include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+                                size_t count, ostream* os) {
+  char text[5] = "";
+  for (size_t i = 0; i != count; i++) {
+    const size_t j = start + i;
+    if (i != 0) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // human.
+      if ((j % 2) == 0)
+        *os << ' ';
+      else
+        *os << '-';
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+                              ostream* os) {
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+  // If the object size is bigger than kThreshold, we'll have to omit
+  // some details by printing only the first and the last kChunkSize
+  // bytes.
+  if (count < kThreshold) {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
+    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+  }
+  *os << ">";
+}
+
+// Helpers for widening a character to char32_t. Since the standard does not
+// specify if char / wchar_t is signed or unsigned, it is important to first
+// convert it to the unsigned type of the same width before widening it to
+// char32_t.
+template <typename CharType>
+char32_t ToChar32(CharType in) {
+  return static_cast<char32_t>(
+      static_cast<typename std::make_unsigned<CharType>::type>(in));
+}
+
+}  // namespace
+
+namespace internal {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object.  The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+                          ostream* os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; }
+
+// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a
+// character literal without the quotes, escaping it when necessary; returns how
+// c was formatted.
+template <typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  const char32_t u_c = ToChar32(c);
+  switch (u_c) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(u_c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        ostream::fmtflags flags = os->flags();
+        *os << "\\x" << std::hex << std::uppercase << static_cast<int>(u_c);
+        os->flags(flags);
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a char32_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo(c, os);
+  }
+}
+
+static const char* GetCharWidthPrefix(char) { return ""; }
+
+static const char* GetCharWidthPrefix(signed char) { return ""; }
+
+static const char* GetCharWidthPrefix(unsigned char) { return ""; }
+
+#ifdef __cpp_char8_t
+static const char* GetCharWidthPrefix(char8_t) { return "u8"; }
+#endif
+
+static const char* GetCharWidthPrefix(char16_t) { return "u"; }
+
+static const char* GetCharWidthPrefix(char32_t) { return "U"; }
+
+static const char* GetCharWidthPrefix(wchar_t) { return "L"; }
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+#ifdef __cpp_char8_t
+static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+#endif
+
+static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t)
+// and its code. '\0' is printed as "'\\0'", other unprintable characters are
+// also properly escaped using the standard C++ escape sequence.
+template <typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << GetCharWidthPrefix(c) << "'";
+  const CharFormat format = PrintAsCharLiteralTo(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0) return;
+  *os << " (" << static_cast<int>(c);
+
+  // For more convenience, we print c's code again in hexadecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << ", 0x" << String::FormatHexInt(static_cast<int>(c));
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
+void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); }
+
+// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well.
+void PrintTo(char32_t c, ::std::ostream* os) {
+  *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)
+      << static_cast<uint32_t>(c);
+}
+
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+void PrintTo(__uint128_t v, ::std::ostream* os) {
+  if (v == 0) {
+    *os << "0";
+    return;
+  }
+
+  // Buffer large enough for ceil(log10(2^128))==39 and the null terminator
+  char buf[40];
+  char* p = buf + sizeof(buf);
+
+  // Some configurations have a __uint128_t, but no support for built in
+  // division. Do manual long division instead.
+
+  uint64_t high = static_cast<uint64_t>(v >> 64);
+  uint64_t low = static_cast<uint64_t>(v);
+
+  *--p = 0;
+  while (high != 0 || low != 0) {
+    uint64_t high_mod = high % 10;
+    high = high / 10;
+    // This is the long division algorithm specialized for a divisor of 10 and
+    // only two elements.
+    // Notable values:
+    //   2^64 / 10 == 1844674407370955161
+    //   2^64 % 10 == 6
+    const uint64_t carry = 6 * high_mod + low % 10;
+    low = low / 10 + high_mod * 1844674407370955161 + carry / 10;
+
+    char digit = static_cast<char>(carry % 10);
+    *--p = '0' + digit;
+  }
+  *os << p;
+}
+void PrintTo(__int128_t v, ::std::ostream* os) {
+  __uint128_t uv = static_cast<__uint128_t>(v);
+  if (v < 0) {
+    *os << "-";
+    uv = -uv;
+  }
+  PrintTo(uv, os);
+}
+#endif  // __SIZEOF_INT128__
+
+// Prints the given array of characters to the ostream.  CharType must be either
+// char, char8_t, char16_t, char32_t, or wchar_t.
+// The array starts at begin, the length is len, it may include '\0' characters
+// and may not be NUL-terminated.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+        PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) {
+  const char* const quote_prefix = GetCharWidthPrefix(*begin);
+  *os << quote_prefix << "\"";
+  bool is_previous_hex = false;
+  CharFormat print_format = kAsIs;
+  for (size_t index = 0; index < len; ++index) {
+    const CharType cur = begin[index];
+    if (is_previous_hex && IsXDigit(cur)) {
+      // Previous character is of '\x..' form and this character can be
+      // interpreted as another hexadecimal digit in its number. Break string to
+      // disambiguate.
+      *os << "\" " << quote_prefix << "\"";
+    }
+    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+    // Remember if any characters required hex escaping.
+    if (is_previous_hex) {
+      print_format = kHexEscape;
+    }
+  }
+  *os << "\"";
+  return print_format;
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin'.  CharType must be either char or wchar_t.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+    GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+        GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+        UniversalPrintCharArray(const CharType* begin, size_t len,
+                                ostream* os) {
+  // The code
+  //   const char kFoo[] = "foo";
+  // generates an array of 4, not 3, elements, with the last one being '\0'.
+  //
+  // Therefore when printing a char array, we don't print the last element if
+  // it's '\0', such that the output matches the string literal as it's
+  // written in the source code.
+  if (len > 0 && begin[len - 1] == '\0') {
+    PrintCharsAsStringTo(begin, len - 1, os);
+    return;
+  }
+
+  // If, however, the last element in the array is not '\0', e.g.
+  //    const char kFoo[] = { 'f', 'o', 'o' };
+  // we must print the entire array.  We also print a message to indicate
+  // that the array is not NUL-terminated.
+  PrintCharsAsStringTo(begin, len, os);
+  *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+#ifdef __cpp_char8_t
+// Prints a (const) char8_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+#endif
+
+// Prints a (const) char16_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) char32_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+namespace {
+
+// Prints a null-terminated C-style string to the ostream.
+template <typename Char>
+void PrintCStringTo(const Char* s, ostream* os) {
+  if (s == nullptr) {
+    *os << "NULL";
+  } else {
+    *os << ImplicitCast_<const void*>(s) << " pointing to ";
+    PrintCharsAsStringTo(s, std::char_traits<Char>::length(s), os);
+  }
+}
+
+}  // anonymous namespace
+
+void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }
+
+#ifdef __cpp_char8_t
+void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif
+
+void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); }
+
+void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); }
+
+// MSVC compiler can be configured to define whar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream.
+void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif  // wchar_t is native
+
+namespace {
+
+bool ContainsUnprintableControlCodes(const char* str, size_t length) {
+  const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
+
+  for (size_t i = 0; i < length; i++) {
+    unsigned char ch = *s++;
+    if (std::iscntrl(ch)) {
+      switch (ch) {
+        case '\t':
+        case '\n':
+        case '\r':
+          break;
+        default:
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
+
+bool IsValidUTF8(const char* str, size_t length) {
+  const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
+
+  for (size_t i = 0; i < length;) {
+    unsigned char lead = s[i++];
+
+    if (lead <= 0x7f) {
+      continue;  // single-byte character (ASCII) 0..7F
+    }
+    if (lead < 0xc2) {
+      return false;  // trail byte or non-shortest form
+    } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
+      ++i;  // 2-byte character
+    } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
+               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+               // check for non-shortest form and surrogate
+               (lead != 0xe0 || s[i] >= 0xa0) &&
+               (lead != 0xed || s[i] < 0xa0)) {
+      i += 2;  // 3-byte character
+    } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
+               IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
+               IsUTF8TrailByte(s[i + 2]) &&
+               // check for non-shortest form
+               (lead != 0xf0 || s[i] >= 0x90) &&
+               (lead != 0xf4 || s[i] < 0x90)) {
+      i += 3;  // 4-byte character
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
+  if (!ContainsUnprintableControlCodes(str, length) &&
+      IsValidUTF8(str, length)) {
+    *os << "\n    As Text: \"" << str << "\"";
+  }
+}
+
+}  // anonymous namespace
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+  if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
+    if (GTEST_FLAG_GET(print_utf8)) {
+      ConditionalPrintAsText(s.data(), s.size(), os);
+    }
+  }
+}
+
+#ifdef __cpp_char8_t
+void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif
+
+void PrintU16StringTo(const ::std::u16string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+void PrintU32StringTo(const ::std::u32string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+}  // namespace internal
+
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc
new file mode 100644
index 0000000000..eb7c8d1cf9
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc
@@ -0,0 +1,105 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+
+#include "gtest/gtest-test-part.h"
+
+#include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message by omitting the stack trace
+// in it.
+std::string TestPartResult::ExtractSummary(const char* message) {
+  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
+  return stack_trace == nullptr ? message : std::string(message, stack_trace);
+}
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  return os << internal::FormatFileLocation(result.file_name(),
+                                            result.line_number())
+            << " "
+            << (result.type() == TestPartResult::kSuccess ? "Success"
+                : result.type() == TestPartResult::kSkip  ? "Skipped"
+                : result.type() == TestPartResult::kFatalFailure
+                    ? "Fatal failure"
+                    : "Non-fatal failure")
+            << ":\n"
+            << result.message() << std::endl;
+}
+
+// Appends a TestPartResult to the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  if (index < 0 || index >= size()) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+
+  return array_[static_cast<size_t>(index)];
+}
+
+// Returns the number of TestPartResult objects in the array.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),
+      original_reporter_(
+          GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  if (result.fatally_failed()) has_new_fatal_failure_ = true;
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
new file mode 100644
index 0000000000..a2828b83c6
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
@@ -0,0 +1,104 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/gtest-typed-test.h"
+
+#include "gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+// Skips to the first non-space char in str. Returns an empty string if str
+// contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+  while (IsSpace(*str)) str++;
+  return str;
+}
+
+static std::vector<std::string> SplitIntoTestNames(const char* src) {
+  std::vector<std::string> name_vec;
+  src = SkipSpaces(src);
+  for (; src != nullptr; src = SkipComma(src)) {
+    name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
+  }
+  return name_vec;
+}
+
+// Verifies that registered_tests match the test names in
+// registered_tests_; returns registered_tests if successful, or
+// aborts the program otherwise.
+const char* TypedTestSuitePState::VerifyRegisteredTestNames(
+    const char* test_suite_name, const char* file, int line,
+    const char* registered_tests) {
+  RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line));
+
+  typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
+  registered_ = true;
+
+  std::vector<std::string> name_vec = SplitIntoTestNames(registered_tests);
+
+  Message errors;
+
+  std::set<std::string> tests;
+  for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
+       name_it != name_vec.end(); ++name_it) {
+    const std::string& name = *name_it;
+    if (tests.count(name) != 0) {
+      errors << "Test " << name << " is listed more than once.\n";
+      continue;
+    }
+
+    if (registered_tests_.count(name) != 0) {
+      tests.insert(name);
+    } else {
+      errors << "No test named " << name
+             << " can be found in this test suite.\n";
+    }
+  }
+
+  for (RegisteredTestIter it = registered_tests_.begin();
+       it != registered_tests_.end(); ++it) {
+    if (tests.count(it->first) == 0) {
+      errors << "You forgot to list test " << it->first << ".\n";
+    }
+  }
+
+  const std::string& errors_str = errors.GetString();
+  if (errors_str != "") {
+    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+            errors_str.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+
+  return registered_tests;
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc
new file mode 100644
index 0000000000..6f31dd2260
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc
@@ -0,0 +1,6795 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+
+#include "gtest/gtest.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <chrono>  // NOLINT
+#include <cmath>
+#include <cstdint>
+#include <initializer_list>
+#include <iomanip>
+#include <iterator>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_OS_LINUX
+
+#include <fcntl.h>   // NOLINT
+#include <limits.h>  // NOLINT
+#include <sched.h>   // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+#include <strings.h>   // NOLINT
+#include <sys/mman.h>  // NOLINT
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
+
+#include <string>
+
+#elif GTEST_OS_ZOS
+#include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+#include <strings.h>   // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+#include <windows.h>  // NOLINT
+#undef min
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+#include <windows.h>  // NOLINT
+#undef min
+
+#ifdef _MSC_VER
+#include <crtdbg.h>  // NOLINT
+#endif
+
+#include <io.h>         // NOLINT
+#include <sys/stat.h>   // NOLINT
+#include <sys/timeb.h>  // NOLINT
+#include <sys/types.h>  // NOLINT
+
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h>  // NOLINT
+#endif                 // GTEST_OS_WINDOWS_MINGW
+
+#else
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+#include <sys/time.h>  // NOLINT
+#include <unistd.h>    // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+#include <arpa/inet.h>   // NOLINT
+#include <netdb.h>       // NOLINT
+#include <sys/socket.h>  // NOLINT
+#include <sys/types.h>   // NOLINT
+#endif
+
+#include "src/gtest-internal-inl.h"
+
+#if GTEST_OS_WINDOWS
+#define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+#include <crt_externs.h>
+#endif
+#endif
+
+#if GTEST_HAS_ABSL
+#include "absl/debugging/failure_signal_handler.h"
+#include "absl/debugging/stacktrace.h"
+#include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
+#endif  // GTEST_HAS_ABSL
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test suite name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test suite whose name matches this filter is considered a death
+// test suite and will be run before test suites whose name doesn't
+// match this filter.
+static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output format.
+static const char kDefaultOutputFormat[] = "xml";
+// The default output file.
+static const char kDefaultOutputFile[] = "test_detail";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
+bool g_help_flag = false;
+
+// Utility function to Open File for Writing
+static FILE* OpenFileForWriting(const std::string& output_file) {
+  FILE* fileout = nullptr;
+  FilePath output_file_path(output_file);
+  FilePath output_dir(output_file_path.RemoveFileName());
+
+  if (output_dir.CreateDirectoriesRecursively()) {
+    fileout = posix::FOpen(output_file.c_str(), "w");
+  }
+  if (fileout == nullptr) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\"";
+  }
+  return fileout;
+}
+
+}  // namespace internal
+
+// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY
+// environment variable.
+static const char* GetDefaultFilter() {
+  const char* const testbridge_test_only =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
+  if (testbridge_test_only != nullptr) {
+    return testbridge_test_only;
+  }
+  return kUniversalFilter;
+}
+
+// Bazel passes in the argument to '--test_runner_fail_fast' via the
+// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable.
+static bool GetDefaultFailFast() {
+  const char* const testbridge_test_runner_fail_fast =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
+  if (testbridge_test_runner_fail_fast != nullptr) {
+    return strcmp(testbridge_test_runner_fail_fast, "1") == 0;
+  }
+  return false;
+}
+
+}  // namespace testing
+
+GTEST_DEFINE_bool_(
+    fail_fast,
+    testing::internal::BoolFromGTestEnv("fail_fast",
+                                        testing::GetDefaultFailFast()),
+    "True if and only if a test failure should stop further test execution.");
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    testing::internal::BoolFromGTestEnv("break_on_failure", false),
+    "True if and only if a failed assertion should be a debugger "
+    "break-point.");
+
+GTEST_DEFINE_bool_(catch_exceptions,
+                   testing::internal::BoolFromGTestEnv("catch_exceptions",
+                                                       true),
+                   "True if and only if " GTEST_NAME_
+                   " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color, testing::internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+    filter,
+    testing::internal::StringFromGTestEnv("filter",
+                                          testing::GetDefaultFilter()),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(
+    install_failure_signal_handler,
+    testing::internal::BoolFromGTestEnv("install_failure_signal_handler",
+                                        false),
+    "If true and supported on the current platform, " GTEST_NAME_
+    " should "
+    "install a signal handler that dumps debugging information when fatal "
+    "signals are raised.");
+
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
+
+// The net priority order after flag processing is thus:
+//   --gtest_output command line flag
+//   GTEST_OUTPUT environment variable
+//   XML_OUTPUT_FILE environment variable
+//   ''
+GTEST_DEFINE_string_(
+    output,
+    testing::internal::StringFromGTestEnv(
+        "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()),
+    "A format (defaults to \"xml\" but can be specified to be \"json\"), "
+    "optionally followed by a colon and an output file name or directory. "
+    "A directory is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    brief, testing::internal::BoolFromGTestEnv("brief", false),
+    "True if only test failures should be displayed in text output.");
+
+GTEST_DEFINE_bool_(print_time,
+                   testing::internal::BoolFromGTestEnv("print_time", true),
+                   "True if and only if " GTEST_NAME_
+                   " should display elapsed time in text output.");
+
+GTEST_DEFINE_bool_(print_utf8,
+                   testing::internal::BoolFromGTestEnv("print_utf8", true),
+                   "True if and only if " GTEST_NAME_
+                   " prints UTF8 characters as text.");
+
+GTEST_DEFINE_int32_(
+    random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat, testing::internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    recreate_environments_when_repeating,
+    testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating",
+                                        false),
+    "Controls whether global test environments are recreated for each repeat "
+    "of the tests. If set to false the global test environments are only set "
+    "up once, for the first iteration, and only torn down once, for the last. "
+    "Useful for shaking out flaky tests with stable, expensive test "
+    "environments. If --gtest_repeat is set to a negative number, meaning "
+    "there is no last run, the environments will always be recreated to avoid "
+    "leaks.");
+
+GTEST_DEFINE_bool_(show_internal_stack_frames, false,
+                   "True if and only if " GTEST_NAME_
+                   " should include internal stack frames when "
+                   "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(shuffle,
+                   testing::internal::BoolFromGTestEnv("shuffle", false),
+                   "True if and only if " GTEST_NAME_
+                   " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    testing::internal::Int32FromGTestEnv("stack_trace_depth",
+                                         testing::kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    testing::internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    testing::internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise. For use with an external test framework.");
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DEFINE_string_(
+    flagfile, testing::internal::StringFromGTestEnv("flagfile", ""),
+    "This flag specifies the flagfile to read command-line flags from.");
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+namespace testing {
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
+// than kMaxRange.
+uint32_t Random::Generate(uint32_t range) {
+  // These constants are the same as are used in glibc's rand(3).
+  // Use wider types than necessary to prevent unsigned overflow diagnostics.
+  state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
+
+  GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Converting via modulus introduces a bit of downward bias, but
+  // it's simple, and a linear congruential generator isn't too good
+  // to begin with.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true if and only if the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
+
+// Iterates over a vector of TestSuites, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestSuiteList(const std::vector<TestSuite*>& case_list,
+                                int (TestSuite::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true if and only if the test suite passed.
+static bool TestSuitePassed(const TestSuite* test_suite) {
+  return test_suite->should_run() && test_suite->Passed();
+}
+
+// Returns true if and only if the test suite failed.
+static bool TestSuiteFailed(const TestSuite* test_suite) {
+  return test_suite->should_run() && test_suite->Failed();
+}
+
+// Returns true if and only if test_suite contains at least one test that
+// should run.
+static bool ShouldRunTestSuite(const TestSuite* test_suite) {
+  return test_suite->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type, const char* file,
+                           int line, const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {}
+
+AssertHelper::~AssertHelper() { delete data_; }
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+      // Skips the stack frame for this function itself.
+  );  // NOLINT
+}
+
+namespace {
+
+// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
+// to creates test cases for it, a synthetic test case is
+// inserted to report ether an error or a log message.
+//
+// This configuration bit will likely be removed at some point.
+constexpr bool kErrorOnUninstantiatedParameterizedTest = true;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true;
+
+// A test that fails at a given file/line location with a given message.
+class FailureTest : public Test {
+ public:
+  explicit FailureTest(const CodeLocation& loc, std::string error_message,
+                       bool as_error)
+      : loc_(loc),
+        error_message_(std::move(error_message)),
+        as_error_(as_error) {}
+
+  void TestBody() override {
+    if (as_error_) {
+      AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(),
+                   loc_.line, "") = Message() << error_message_;
+    } else {
+      std::cout << error_message_ << std::endl;
+    }
+  }
+
+ private:
+  const CodeLocation loc_;
+  const std::string error_message_;
+  const bool as_error_;
+};
+
+}  // namespace
+
+std::set<std::string>* GetIgnoredParameterizedTestSuites() {
+  return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
+}
+
+// Add a given test_suit to the list of them allow to go un-instantiated.
+MarkAsIgnored::MarkAsIgnored(const char* test_suite) {
+  GetIgnoredParameterizedTestSuites()->insert(test_suite);
+}
+
+// If this parameterized test suite has no instantiations (and that
+// has not been marked as okay), emit a test case reporting that.
+void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
+                             bool has_test_p) {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  if (ignored.find(name) != ignored.end()) return;
+
+  const char kMissingInstantiation[] =  //
+      " is defined via TEST_P, but never instantiated. None of the test cases "
+      "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only "
+      "ones provided expand to nothing."
+      "\n\n"
+      "Ideally, TEST_P definitions should only ever be included as part of "
+      "binaries that intend to use them. (As opposed to, for example, being "
+      "placed in a library that may be linked in to get other utilities.)";
+
+  const char kMissingTestCase[] =  //
+      " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are "
+      "defined via TEST_P . No test cases will run."
+      "\n\n"
+      "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from "
+      "code that always depend on code that provides TEST_P. Failing to do "
+      "so is often an indication of dead code, e.g. the last TEST_P was "
+      "removed but the rest got left behind.";
+
+  std::string message =
+      "Parameterized test suite " + name +
+      (has_test_p ? kMissingInstantiation : kMissingTestCase) +
+      "\n\n"
+      "To suppress this error for this test suite, insert the following line "
+      "(in a non-header) in the namespace it is defined in:"
+      "\n\n"
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+      name + ");";
+
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
+  RegisterTest(  //
+      "GoogleTestVerification", full_name.c_str(),
+      nullptr,  // No type parameter.
+      nullptr,  // No value parameter.
+      location.file.c_str(), location.line, [message, location] {
+        return new FailureTest(location, message,
+                               kErrorOnUninstantiatedParameterizedTest);
+      });
+}
+
+void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
+                                        CodeLocation code_location) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite(
+      test_suite_name, code_location);
+}
+
+void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+      case_name);
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
+    const char* test_suite_name, CodeLocation code_location) {
+  suites_.emplace(std::string(test_suite_name),
+                  TypeParameterizedTestSuiteInfo(code_location));
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
+    const char* test_suite_name) {
+  auto it = suites_.find(std::string(test_suite_name));
+  if (it != suites_.end()) {
+    it->second.instantiated = true;
+  } else {
+    GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '"
+                      << test_suite_name << "'";
+  }
+}
+
+void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
+  const auto& ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto& testcase : suites_) {
+    if (testcase.second.instantiated) continue;
+    if (ignored.find(testcase.first) != ignored.end()) continue;
+
+    std::string message =
+        "Type parameterized test suite " + testcase.first +
+        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
+        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
+        "\n\n"
+        "Ideally, TYPED_TEST_P definitions should only ever be included as "
+        "part of binaries that intend to use them. (As opposed to, for "
+        "example, being placed in a library that may be linked in to get other "
+        "utilities.)"
+        "\n\n"
+        "To suppress this error for this test suite, insert the following line "
+        "(in a non-header) in the namespace it is defined in:"
+        "\n\n"
+        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+        testcase.first + ");";
+
+    std::string full_name =
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
+    RegisterTest(  //
+        "GoogleTestVerification", full_name.c_str(),
+        nullptr,  // No type parameter.
+        nullptr,  // No value parameter.
+        testcase.second.code_location.file.c_str(),
+        testcase.second.code_location.line, [message, testcase] {
+          return new FailureTest(testcase.second.code_location, message,
+                                 kErrorOnUninstantiatedTypeParameterizedTest);
+        });
+  }
+}
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+static ::std::vector<std::string> g_argvs;
+
+::std::vector<std::string> GetArgvs() {
+#if defined(GTEST_CUSTOM_GET_ARGVS_)
+  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
+  // ::string. This code converts it to the appropriate type.
+  const auto& custom = GTEST_CUSTOM_GET_ARGVS_();
+  return ::std::vector<std::string>(custom.begin(), custom.end());
+#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
+  return g_argvs;
+#endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
+}
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS || GTEST_OS_OS2
+  result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(GetArgvs()[0]));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  std::string s = GTEST_FLAG_GET(output);
+  const char* const gtest_output_flag = s.c_str();
+  const char* const colon = strchr(gtest_output_flag, ':');
+  return (colon == nullptr)
+             ? std::string(gtest_output_flag)
+             : std::string(gtest_output_flag,
+                           static_cast<size_t>(colon - gtest_output_flag));
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  std::string s = GTEST_FLAG_GET(output);
+  const char* const gtest_output_flag = s.c_str();
+
+  std::string format = GetOutputFormat();
+  if (format.empty()) format = std::string(kDefaultOutputFormat);
+
+  const char* const colon = strchr(gtest_output_flag, ':');
+  if (colon == nullptr)
+    return internal::FilePath::MakeFileName(
+               internal::FilePath(
+                   UnitTest::GetInstance()->original_working_dir()),
+               internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+        .string();
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory()) return output_name.string();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.string();
+}
+
+// Returns true if and only if the wildcard pattern matches the string. Each
+// pattern consists of regular characters, single-character wildcards (?), and
+// multi-character wildcards (*).
+//
+// This function implements a linear-time string globbing algorithm based on
+// https://research.swtch.com/glob.
+static bool PatternMatchesString(const std::string& name_str,
+                                 const char* pattern, const char* pattern_end) {
+  const char* name = name_str.c_str();
+  const char* const name_begin = name;
+  const char* const name_end = name + name_str.size();
+
+  const char* pattern_next = pattern;
+  const char* name_next = name;
+
+  while (pattern < pattern_end || name < name_end) {
+    if (pattern < pattern_end) {
+      switch (*pattern) {
+        default:  // Match an ordinary character.
+          if (name < name_end && *name == *pattern) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '?':  // Match any single character.
+          if (name < name_end) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '*':
+          // Match zero or more characters. Start by skipping over the wildcard
+          // and matching zero characters from name. If that fails, restart and
+          // match one more character than the last attempt.
+          pattern_next = pattern;
+          name_next = name + 1;
+          ++pattern;
+          continue;
+      }
+    }
+    // Failed to match a character. Restart if possible.
+    if (name_begin < name_next && name_next <= name_end) {
+      pattern = pattern_next;
+      name = name_next;
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
+
+namespace {
+
+bool IsGlobPattern(const std::string& pattern) {
+  return std::any_of(pattern.begin(), pattern.end(),
+                     [](const char c) { return c == '?' || c == '*'; });
+}
+
+class UnitTestFilter {
+ public:
+  UnitTestFilter() = default;
+
+  // Constructs a filter from a string of patterns separated by `:`.
+  explicit UnitTestFilter(const std::string& filter) {
+    // By design "" filter matches "" string.
+    std::vector<std::string> all_patterns;
+    SplitString(filter, ':', &all_patterns);
+    const auto exact_match_patterns_begin = std::partition(
+        all_patterns.begin(), all_patterns.end(), &IsGlobPattern);
+
+    glob_patterns_.reserve(static_cast<size_t>(
+        std::distance(all_patterns.begin(), exact_match_patterns_begin)));
+    std::move(all_patterns.begin(), exact_match_patterns_begin,
+              std::inserter(glob_patterns_, glob_patterns_.begin()));
+    std::move(
+        exact_match_patterns_begin, all_patterns.end(),
+        std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
+  }
+
+  // Returns true if and only if name matches at least one of the patterns in
+  // the filter.
+  bool MatchesName(const std::string& name) const {
+    return exact_match_patterns_.count(name) > 0 ||
+           std::any_of(glob_patterns_.begin(), glob_patterns_.end(),
+                       [&name](const std::string& pattern) {
+                         return PatternMatchesString(
+                             name, pattern.c_str(),
+                             pattern.c_str() + pattern.size());
+                       });
+  }
+
+ private:
+  std::vector<std::string> glob_patterns_;
+  std::unordered_set<std::string> exact_match_patterns_;
+};
+
+class PositiveAndNegativeUnitTestFilter {
+ public:
+  // Constructs a positive and a negative filter from a string. The string
+  // contains a positive filter optionally followed by a '-' character and a
+  // negative filter. In case only a negative filter is provided the positive
+  // filter will be assumed "*".
+  // A filter is a list of patterns separated by ':'.
+  explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
+    std::vector<std::string> positive_and_negative_filters;
+
+    // NOTE: `SplitString` always returns a non-empty container.
+    SplitString(filter, '-', &positive_and_negative_filters);
+    const auto& positive_filter = positive_and_negative_filters.front();
+
+    if (positive_and_negative_filters.size() > 1) {
+      positive_filter_ = UnitTestFilter(
+          positive_filter.empty() ? kUniversalFilter : positive_filter);
+
+      // TODO(b/214626361): Fail on multiple '-' characters
+      // For the moment to preserve old behavior we concatenate the rest of the
+      // string parts with `-` as separator to generate the negative filter.
+      auto negative_filter_string = positive_and_negative_filters[1];
+      for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++)
+        negative_filter_string =
+            negative_filter_string + '-' + positive_and_negative_filters[i];
+      negative_filter_ = UnitTestFilter(negative_filter_string);
+    } else {
+      // In case we don't have a negative filter and positive filter is ""
+      // we do not use kUniversalFilter by design as opposed to when we have a
+      // negative filter.
+      positive_filter_ = UnitTestFilter(positive_filter);
+    }
+  }
+
+  // Returns true if and only if test name (this is generated by appending test
+  // suit name and test name via a '.' character) matches the positive filter
+  // and does not match the negative filter.
+  bool MatchesTest(const std::string& test_suite_name,
+                   const std::string& test_name) const {
+    return MatchesName(test_suite_name + "." + test_name);
+  }
+
+  // Returns true if and only if name matches the positive filter and does not
+  // match the negative filter.
+  bool MatchesName(const std::string& name) const {
+    return positive_filter_.MatchesName(name) &&
+           !negative_filter_.MatchesName(name);
+  }
+
+ private:
+  UnitTestFilter positive_filter_;
+  UnitTestFilter negative_filter_;
+};
+}  // namespace
+
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+                                    const char* filter) {
+  return UnitTestFilter(filter).MatchesName(name_str);
+}
+
+// Returns true if and only if the user-specified filter matches the test
+// suite name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
+                                        const std::string& test_name) {
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions
+  return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter))
+      .MatchesTest(test_suite_name, test_name);
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  bool should_handle = true;
+
+  if (!GTEST_FLAG_GET(catch_exceptions))
+    should_handle = false;
+  else if (exception_code == EXCEPTION_BREAKPOINT)
+    should_handle = false;
+  else if (exception_code == kCxxExceptionCode)
+    should_handle = false;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
+  Init();
+}
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode), result_(result) {
+  Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetGlobalTestPartResultReporter();
+    impl->SetGlobalTestPartResultReporter(this);
+  } else {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+  }
+}
+
+// The d'tor restores the test part result reporter used by Google Test
+// before.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    impl->SetGlobalTestPartResultReporter(old_reporter_);
+  } else {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+  }
+}
+
+// Increments the test part result count and remembers the result.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
+static AssertionResult HasOneFailure(const char* /* results_expr */,
+                                     const char* /* type_expr */,
+                                     const char* /* substr_expr */,
+                                     const TestPartResultArray& results,
+                                     TestPartResult::Type type,
+                                     const std::string& substr) {
+  const std::string expected(type == TestPartResult::kFatalFailure
+                                 ? "1 fatal failure"
+                                 : "1 non-fatal failure");
+  Message msg;
+  if (results.size() != 1) {
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int i = 0; i < results.size(); i++) {
+      msg << "\n" << results.GetTestPartResult(i);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& r = results.GetTestPartResult(0);
+  if (r.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  if (strstr(r.message(), substr.c_str()) == nullptr) {
+    return AssertionFailure()
+           << "Expected: " << expected << " containing \"" << substr << "\"\n"
+           << "  Actual:\n"
+           << r;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain.
+SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results,
+                                           TestPartResult::Type type,
+                                           const std::string& substr)
+    : results_(results), type_(type), substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test)
+    : unit_test_(unit_test) {}
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->current_test_result()->AddTestPartResult(result);
+  unit_test_->listeners()->repeater()->OnTestPartResult(result);
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test)
+    : unit_test_(unit_test) {}
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of successful test suites.
+int UnitTestImpl::successful_test_suite_count() const {
+  return CountIf(test_suites_, TestSuitePassed);
+}
+
+// Gets the number of failed test suites.
+int UnitTestImpl::failed_test_suite_count() const {
+  return CountIf(test_suites_, TestSuiteFailed);
+}
+
+// Gets the number of all test suites.
+int UnitTestImpl::total_test_suite_count() const {
+  return static_cast<int>(test_suites_.size());
+}
+
+// Gets the number of all test suites that contain at least one test
+// that should run.
+int UnitTestImpl::test_suite_to_run_count() const {
+  return CountIf(test_suites_, ShouldRunTestSuite);
+}
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count);
+}
+
+// Gets the number of skipped tests.
+int UnitTestImpl::skipped_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+  return SumOverTestSuiteList(test_suites_,
+                              &TestSuite::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  return os_stack_trace_getter()->CurrentStackTrace(
+      static_cast<int>(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1
+      // Skips the user-specified number of frames plus this function
+      // itself.
+  );  // NOLINT
+}
+
+// A helper class for measuring elapsed times.
+class Timer {
+ public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
+
+  // Return time elapsed in milliseconds since the timer was created.
+  TimeInMillis Elapsed() {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(
+               std::chrono::steady_clock::now() - start_)
+        .count();
+  }
+
+ private:
+  std::chrono::steady_clock::time_point start_;
+};
+
+// Returns a timestamp as milliseconds since the epoch. Note this time may jump
+// around subject to adjustments by the system, to measure elapsed time use
+// Timer instead.
+TimeInMillis GetTimeInMillis() {
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::system_clock::now() -
+             std::chrono::system_clock::from_time_t(0))
+      .count();
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+  if (!ansi) return nullptr;
+  const int length = strlen(ansi);
+  const int unicode_length =
+      MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
+  WCHAR* unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
+  unicode[unicode_length] = 0;
+  return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+  if (!utf16_str) return nullptr;
+  const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
+                                              0, nullptr, nullptr);
+  char* ansi = new char[ansi_length + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr,
+                      nullptr);
+  ansi[ansi_length] = 0;
+  return ansi;
+}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings.  Returns true if and only if they have the same
+// content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char* lhs, const char* rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  for (size_t i = 0; i != length;) {  // NOLINT
+    if (wstr[i] != L'\0') {
+      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
+      while (i != length && wstr[i] != L'\0') i++;
+    } else {
+      *msg << '\0';
+      i++;
+    }
+  }
+}
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+void SplitString(const ::std::string& str, char delimiter,
+                 ::std::vector< ::std::string>* dest) {
+  ::std::vector< ::std::string> parsed;
+  ::std::string::size_type pos = 0;
+  while (::testing::internal::AlwaysTrue()) {
+    const ::std::string::size_type colon = str.find(delimiter, pos);
+    if (colon == ::std::string::npos) {
+      parsed.push_back(str.substr(pos));
+      break;
+    } else {
+      parsed.push_back(str.substr(pos, colon - pos));
+      pos = colon + 1;
+    }
+  }
+  dest->swap(parsed);
+}
+
+}  // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+  // By default, we want there to be enough precision when printing
+  // a double to a Message.
+  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
+}
+
+// These two overloads allow streaming a wide C string to a Message
+// using the UTF-8 encoding.
+Message& Message::operator<<(const wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+Message& Message::operator<<(wchar_t* wide_c_str) {
+  return *this << internal::String::ShowWideCString(wide_c_str);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator<<(const ::std::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.
+// Each '\0' character in the buffer is replaced with "\\0".
+std::string Message::GetString() const {
+  return internal::StringStreamToString(ss_.get());
+}
+
+namespace internal {
+
+namespace edit_distance {
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
+                                            const std::vector<size_t>& right) {
+  std::vector<std::vector<double> > costs(
+      left.size() + 1, std::vector<double>(right.size() + 1));
+  std::vector<std::vector<EditType> > best_move(
+      left.size() + 1, std::vector<EditType>(right.size() + 1));
+
+  // Populate for empty right.
+  for (size_t l_i = 0; l_i < costs.size(); ++l_i) {
+    costs[l_i][0] = static_cast<double>(l_i);
+    best_move[l_i][0] = kRemove;
+  }
+  // Populate for empty left.
+  for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) {
+    costs[0][r_i] = static_cast<double>(r_i);
+    best_move[0][r_i] = kAdd;
+  }
+
+  for (size_t l_i = 0; l_i < left.size(); ++l_i) {
+    for (size_t r_i = 0; r_i < right.size(); ++r_i) {
+      if (left[l_i] == right[r_i]) {
+        // Found a match. Consume it.
+        costs[l_i + 1][r_i + 1] = costs[l_i][r_i];
+        best_move[l_i + 1][r_i + 1] = kMatch;
+        continue;
+      }
+
+      const double add = costs[l_i + 1][r_i];
+      const double remove = costs[l_i][r_i + 1];
+      const double replace = costs[l_i][r_i];
+      if (add < remove && add < replace) {
+        costs[l_i + 1][r_i + 1] = add + 1;
+        best_move[l_i + 1][r_i + 1] = kAdd;
+      } else if (remove < add && remove < replace) {
+        costs[l_i + 1][r_i + 1] = remove + 1;
+        best_move[l_i + 1][r_i + 1] = kRemove;
+      } else {
+        // We make replace a little more expensive than add/remove to lower
+        // their priority.
+        costs[l_i + 1][r_i + 1] = replace + 1.00001;
+        best_move[l_i + 1][r_i + 1] = kReplace;
+      }
+    }
+  }
+
+  // Reconstruct the best path. We do it in reverse order.
+  std::vector<EditType> best_path;
+  for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) {
+    EditType move = best_move[l_i][r_i];
+    best_path.push_back(move);
+    l_i -= move != kAdd;
+    r_i -= move != kRemove;
+  }
+  std::reverse(best_path.begin(), best_path.end());
+  return best_path;
+}
+
+namespace {
+
+// Helper class to convert string into ids with deduplication.
+class InternalStrings {
+ public:
+  size_t GetId(const std::string& str) {
+    IdMap::iterator it = ids_.find(str);
+    if (it != ids_.end()) return it->second;
+    size_t id = ids_.size();
+    return ids_[str] = id;
+  }
+
+ private:
+  typedef std::map<std::string, size_t> IdMap;
+  IdMap ids_;
+};
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right) {
+  std::vector<size_t> left_ids, right_ids;
+  {
+    InternalStrings intern_table;
+    for (size_t i = 0; i < left.size(); ++i) {
+      left_ids.push_back(intern_table.GetId(left[i]));
+    }
+    for (size_t i = 0; i < right.size(); ++i) {
+      right_ids.push_back(intern_table.GetId(right[i]));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
+// Helper class that holds the state for one hunk and prints it out to the
+// stream.
+// It reorders adds/removes when possible to group all removes before all
+// adds. It also adds the hunk header before printint into the stream.
+class Hunk {
+ public:
+  Hunk(size_t left_start, size_t right_start)
+      : left_start_(left_start),
+        right_start_(right_start),
+        adds_(),
+        removes_(),
+        common_() {}
+
+  void PushLine(char edit, const char* line) {
+    switch (edit) {
+      case ' ':
+        ++common_;
+        FlushEdits();
+        hunk_.push_back(std::make_pair(' ', line));
+        break;
+      case '-':
+        ++removes_;
+        hunk_removes_.push_back(std::make_pair('-', line));
+        break;
+      case '+':
+        ++adds_;
+        hunk_adds_.push_back(std::make_pair('+', line));
+        break;
+    }
+  }
+
+  void PrintTo(std::ostream* os) {
+    PrintHeader(os);
+    FlushEdits();
+    for (std::list<std::pair<char, const char*> >::const_iterator it =
+             hunk_.begin();
+         it != hunk_.end(); ++it) {
+      *os << it->first << it->second << "\n";
+    }
+  }
+
+  bool has_edits() const { return adds_ || removes_; }
+
+ private:
+  void FlushEdits() {
+    hunk_.splice(hunk_.end(), hunk_removes_);
+    hunk_.splice(hunk_.end(), hunk_adds_);
+  }
+
+  // Print a unified diff header for one hunk.
+  // The format is
+  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream* ss) const {
+    *ss << "@@ ";
+    if (removes_) {
+      *ss << "-" << left_start_ << "," << (removes_ + common_);
+    }
+    if (removes_ && adds_) {
+      *ss << " ";
+    }
+    if (adds_) {
+      *ss << "+" << right_start_ << "," << (adds_ + common_);
+    }
+    *ss << " @@\n";
+  }
+
+  size_t left_start_, right_start_;
+  size_t adds_, removes_, common_;
+  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
+};
+
+}  // namespace
+
+// Create a list of diff hunks in Unified diff format.
+// Each hunk has a header generated by PrintHeader above plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' represents the desired unchanged prefix/suffix around the diff.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk.
+std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                              const std::vector<std::string>& right,
+                              size_t context) {
+  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+  size_t l_i = 0, r_i = 0, edit_i = 0;
+  std::stringstream ss;
+  while (edit_i < edits.size()) {
+    // Find first edit.
+    while (edit_i < edits.size() && edits[edit_i] == kMatch) {
+      ++l_i;
+      ++r_i;
+      ++edit_i;
+    }
+
+    // Find the first line to include in the hunk.
+    const size_t prefix_context = std::min(l_i, context);
+    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
+    for (size_t i = prefix_context; i > 0; --i) {
+      hunk.PushLine(' ', left[l_i - i].c_str());
+    }
+
+    // Iterate the edits until we found enough suffix for the hunk or the input
+    // is over.
+    size_t n_suffix = 0;
+    for (; edit_i < edits.size(); ++edit_i) {
+      if (n_suffix >= context) {
+        // Continue only if the next hunk is very close.
+        auto it = edits.begin() + static_cast<int>(edit_i);
+        while (it != edits.end() && *it == kMatch) ++it;
+        if (it == edits.end() ||
+            static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
+          // There is no next edit or it is too far away.
+          break;
+        }
+      }
+
+      EditType edit = edits[edit_i];
+      // Reset count when a non match is found.
+      n_suffix = edit == kMatch ? n_suffix + 1 : 0;
+
+      if (edit == kMatch || edit == kRemove || edit == kReplace) {
+        hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
+      }
+      if (edit == kAdd || edit == kReplace) {
+        hunk.PushLine('+', right[r_i].c_str());
+      }
+
+      // Advance indices, depending on edit type.
+      l_i += edit != kAdd;
+      r_i += edit != kRemove;
+    }
+
+    if (!hunk.has_edits()) {
+      // We are done. We don't want this hunk.
+      break;
+    }
+
+    hunk.PrintTo(&ss);
+  }
+  return ss.str();
+}
+
+}  // namespace edit_distance
+
+namespace {
+
+// The string representation of the values received in EqFailure() are already
+// escaped. Split them on escaped '\n' boundaries. Leave all other escaped
+// characters the same.
+std::vector<std::string> SplitEscapedString(const std::string& str) {
+  std::vector<std::string> lines;
+  size_t start = 0, end = str.size();
+  if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
+    ++start;
+    --end;
+  }
+  bool escaped = false;
+  for (size_t i = start; i + 1 < end; ++i) {
+    if (escaped) {
+      escaped = false;
+      if (str[i] == 'n') {
+        lines.push_back(str.substr(start, i - start - 1));
+        start = i + 1;
+      }
+    } else {
+      escaped = str[i] == '\\';
+    }
+  }
+  lines.push_back(str.substr(start, end - start));
+  return lines;
+}
+
+}  // namespace
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   lhs_expression: "foo"
+//   rhs_expression: "bar"
+//   lhs_value:      "5"
+//   rhs_value:      "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*.  When it's true, the string "Ignoring case" will
+// be inserted into the message.
+AssertionResult EqFailure(const char* lhs_expression,
+                          const char* rhs_expression,
+                          const std::string& lhs_value,
+                          const std::string& rhs_value, bool ignoring_case) {
+  Message msg;
+  msg << "Expected equality of these values:";
+  msg << "\n  " << lhs_expression;
+  if (lhs_value != lhs_expression) {
+    msg << "\n    Which is: " << lhs_value;
+  }
+  msg << "\n  " << rhs_expression;
+  if (rhs_value != rhs_expression) {
+    msg << "\n    Which is: " << rhs_value;
+  }
+
+  if (ignoring_case) {
+    msg << "\nIgnoring case";
+  }
+
+  if (!lhs_value.empty() && !rhs_value.empty()) {
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
+    if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
+      msg << "\nWith diff:\n"
+          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
+    }
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult& assertion_result, const char* expression_text,
+    const char* actual_predicate_value, const char* expected_predicate_value) {
+  const char* actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+                                     const char* abs_error_expr, double val1,
+                                     double val2, double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  // Find the value which is closest to zero.
+  const double min_abs = std::min(fabs(val1), fabs(val2));
+  // Find the distance to the next double from that value.
+  const double epsilon =
+      nextafter(min_abs, std::numeric_limits<double>::infinity()) - min_abs;
+  // Detect the case where abs_error is so small that EXPECT_NEAR is
+  // effectively the same as EXPECT_EQUAL, and give an informative error
+  // message so that the situation can be more easily understood without
+  // requiring exotic floating-point knowledge.
+  // Don't do an epsilon check if abs_error is zero because that implies
+  // that an equality check was actually intended.
+  if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 &&
+      abs_error < epsilon) {
+    return AssertionFailure()
+           << "The difference between " << expr1 << " and " << expr2 << " is "
+           << diff << ", where\n"
+           << expr1 << " evaluates to " << val1 << ",\n"
+           << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter "
+           << abs_error_expr << " evaluates to " << abs_error
+           << " which is smaller than the minimum distance between doubles for "
+              "numbers of this magnitude which is "
+           << epsilon
+           << ", thus making this EXPECT_NEAR check equivalent to "
+              "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
+  }
+  return AssertionFailure()
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", which exceeds " << abs_error_expr << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ", and\n"
+         << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+                                RawType val1, RawType val2) {
+  // Returns success if val1 is less than val2,
+  if (val1 < val2) {
+    return AssertionSuccess();
+  }
+
+  // or if val1 is almost equal to val2.
+  const FloatingPoint<RawType> lhs(val1), rhs(val2);
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  ::std::stringstream val1_ss;
+  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val1;
+
+  ::std::stringstream val2_ss;
+  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val2;
+
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+         << StringStreamToString(&val2_ss);
+}
+
+}  // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+                        float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+                         double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression, const char* lhs,
+                               const char* rhs) {
+  if (String::CStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
+                                   const char* rhs_expression, const char* lhs,
+                                   const char* rhs) {
+  if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression, const char* s1,
+                               const char* s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression, const char* s1,
+                                   const char* s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubString() and IsNotSubstring().
+
+// This group of overloaded functions return true if and only if needle
+// is a substring of haystack.  NULL is considered a substring of
+// itself only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
+
+  return strstr(haystack, needle) != nullptr;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
+
+  return wcsstr(haystack, needle) != nullptr;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char* needle_expr,
+                                const char* haystack_expr,
+                                const StringType& needle,
+                                const StringType& haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+         << "Value of: " << needle_expr << "\n"
+         << "  Actual: " << begin_string_quote << needle << "\"\n"
+         << "Expected: " << (expected_to_be_substring ? "" : "not ")
+         << "a substring of " << haystack_expr << "\n"
+         << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr, const char* needle,
+                               const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr, const wchar_t* needle,
+                               const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const ::std::string& needle,
+                            const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr,
+                               const ::std::string& needle,
+                               const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+                            const ::std::wstring& needle,
+                            const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char* needle_expr,
+                               const char* haystack_expr,
+                               const ::std::wstring& needle,
+                               const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr, const char* expected,
+                                     long hr) {  // NOLINT
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+#else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags =
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = {'\0'};
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          static_cast<DWORD>(hr),  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,    // buf size
+                                          nullptr);  // no arguments for inserts
+  // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+       --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+         << "Expected: " << expr << " " << expected << ".\n"
+         << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint3 =
+    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint4 =
+    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern.  Returns the n
+// lowest bits.  As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline uint32_t ChopLowBits(uint32_t* bits, int n) {
+  const uint32_t low_bits = *bits & ((static_cast<uint32_t>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type uint32_t because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(uint32_t code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
+  }
+
+  char str[5];  // Big enough for the largest valid code point.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
+  }
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+         (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                      wchar_t second) {
+  const auto first_u = static_cast<uint32_t>(first);
+  const auto second_u = static_cast<uint32_t>(second);
+  const uint32_t mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2)
+             ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000
+             :
+             // This function should not be called when the condition is
+             // false, but we provide a sensible default in case it is.
+             first_u;
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    uint32_t unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<uint32_t>(str[i]);
+    }
+
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
+  if (wide_c_str == nullptr) return "(null)";
+
+  return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings.  Returns true if and only if they have the
+// same content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+                               const char* rhs_expression, const wchar_t* lhs,
+                               const wchar_t* rhs) {
+  if (String::WideCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression, const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure()
+         << "Expected: (" << s1_expression << ") != (" << s2_expression
+         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case.  Returns true if and only if they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s).  A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+  if (rhs == nullptr) return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+// Compares two wide C strings, ignoring case.  Returns true if and only if they
+// have the same content.
+//
+// Unlike wcscasecmp(), this function can handle NULL argument(s).
+// A NULL C string is considered different to any non-NULL wide C string,
+// including the empty string.
+// NB: The implementations on different platforms slightly differ.
+// On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+// environment variable. On GNU platform this method uses wcscasecmp
+// which compares according to LC_CTYPE category of the current locale.
+// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+// current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(static_cast<wint_t>(*lhs++));
+    right = towlower(static_cast<wint_t>(*rhs++));
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Returns true if and only if str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(const std::string& str,
+                                     const std::string& suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  return (str_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+                                      suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+  return FormatIntWidthN(value, 2);
+}
+
+// Formats an int value to given width with leading zeros.
+std::string String::FormatIntWidthN(int value, int width) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(width) << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexUInt32(uint32_t value) {
+  std::stringstream ss;
+  ss << std::hex << std::uppercase << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  return FormatHexUInt32(static_cast<uint32_t>(value));
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+     << static_cast<unsigned int>(value);
+  return ss.str();
+}
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream* ss) {
+  const ::std::string& str = ss->str();
+  const char* const start = str.c_str();
+  const char* const end = start + str.length();
+
+  std::string result;
+  result.reserve(static_cast<size_t>(2 * (end - start)));
+  for (const char* ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      result += "\\0";  // Replaces NUL with "\\0";
+    } else {
+      result += *ch;
+    }
+  }
+
+  return result;
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string& gtest_msg,
+                              const Message& user_msg) {
+  // Appends the user message if it's non-empty.
+  const std::string user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+  if (gtest_msg.empty()) {
+    return user_msg_string;
+  }
+  return gtest_msg + "\n" + user_msg_string;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
+
+// D'tor.
+TestResult::~TestResult() {}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
+  return test_part_results_.at(static_cast<size_t>(i));
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
+  return test_properties_.at(static_cast<size_t>(i));
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string& xml_element,
+                                const TestProperty& test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properties_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char* const kReservedTestSuitesAttributes[] = {
+    "disabled",    "errors", "failures", "name",
+    "random_seed", "tests",  "time",     "timestamp"};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char* const kReservedTestSuiteAttributes[] = {
+    "disabled", "errors", "failures",  "name",
+    "tests",    "time",   "timestamp", "skipped"};
+
+// The list of reserved attributes used in the <testcase> element of XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+    "classname",  "name",        "status", "time",
+    "type_param", "value_param", "file",   "line"};
+
+// Use a slightly different set for allowed output to ensure existing tests can
+// still RecordProperty("result") or "RecordProperty(timestamp")
+static const char* const kReservedOutputTestCaseAttributes[] = {
+    "classname",   "name", "status", "time",   "type_param",
+    "value_param", "file", "line",   "result", "timestamp"};
+
+template <size_t kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+  return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realizes that.
+  return std::vector<std::string>();
+}
+
+// TODO(jdesprez): Merge the two getReserved attributes once skip is improved
+static std::vector<std::string> GetReservedOutputAttributesForElement(
+    const std::string& xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedOutputTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realizes that.
+  return std::vector<std::string>();
+}
+
+static std::string FormatWordList(const std::vector<std::string>& words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    if (i > 0 && words.size() > 2) {
+      word_list << ", ";
+    }
+    if (i == words.size() - 1) {
+      word_list << "and ";
+    }
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+static bool ValidateTestPropertyName(
+    const std::string& property_name,
+    const std::vector<std::string>& reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+      reserved_names.end()) {
+    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                  << " (" << FormatWordList(reserved_names)
+                  << " are reserved by " << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element.  Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+                                      const TestProperty& test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true off the test part was skipped.
+static bool TestPartSkipped(const TestPartResult& result) {
+  return result.skipped();
+}
+
+// Returns true if and only if the test was skipped.
+bool TestResult::Skipped() const {
+  return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0;
+}
+
+// Returns true if and only if the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed()) return true;
+  }
+  return false;
+}
+
+// Returns true if and only if the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true if and only if the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true if and only if the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true if and only if the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts.  This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the states of all flags.
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
+
+// The d'tor restores the states of all flags.  The actual work is
+// done by the d'tor of the gtest_flag_saver_ field, and thus not
+// visible here.
+Test::~Test() {}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string& message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      nullptr,  // No info about the source file where the exception occurred.
+      -1,       // We have no info on which line caused the exception.
+      message,
+      "");  // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test suite to use the same test
+// fixture class.  This function checks if the current test has the
+// same fixture class as the first test in the current test suite.  If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  const TestSuite* const test_suite = impl->current_test_suite();
+
+  // Info about the first test in the current test suite.
+  const TestInfo* const first_test_info = test_suite->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char* const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo* const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char* const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // Both TEST and TEST_F appear in same test suite, which is incorrect.
+      // Tell the user how to fix this.
+
+      // Gets the name of the TEST and the name of the TEST_F.  Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char* const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char* const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test suite is\n"
+          << "illegal.  In test suite " << this_test_info->test_suite_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST.  You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // Two fixture classes with the same name appear in two different
+      // namespaces, which is not allowed. Tell the user how to fix this.
+      ADD_FAILURE()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class.  However, in test suite "
+          << this_test_info->test_suite_name() << ",\n"
+          << "you defined test " << first_test_name << " and test "
+          << this_test_name << "\n"
+          << "using two different test fixture classes.  This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name.  You should probably rename one\n"
+          << "of the classes to put the tests into different test suites.";
+    }
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test.  This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+                                              const char* location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+          << std::setbase(10) << " thrown in " << location << ".";
+
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char* description,
+                                             const char* location) {
+  Message message;
+  if (description != nullptr) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult& failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception.  (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function.  Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+                                              const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string* exception_message =
+        FormatSehExceptionMessage(GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+                                           const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG_GET(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const AssertionException&) {  // NOLINT
+      // This failure was reported already.
+    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it.  Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception& e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(nullptr, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful and didn't call
+  // GTEST_SKIP().
+  if (!HasFatalFailure() && !IsSkipped()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+                                                  "the test body");
+  }
+
+  // However, we want to clean up as much as possible.  Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+                                                "TearDown()");
+}
+
+// Returns true if and only if the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true if and only if the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()
+      ->current_test_result()
+      ->HasNonfatalFailure();
+}
+
+// Returns true if and only if the current test was skipped.
+bool Test::IsSkipped() {
+  return internal::GetUnitTestImpl()->current_test_result()->Skipped();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string& a_test_suite_name,
+                   const std::string& a_name, const char* a_type_param,
+                   const char* a_value_param,
+                   internal::CodeLocation a_code_location,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_suite_name_(a_test_suite_name),
+      name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
+      location_(a_code_location),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      is_in_another_shard_(false),
+      factory_(factory),
+      result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_suite_name:  name of the test suite
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_suite_name, const char* name, const char* type_param,
+    const char* value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) {
+  TestInfo* const test_info =
+      new TestInfo(test_suite_name, name, type_param, value_param,
+                   code_location, fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+void ReportInvalidTestSuiteType(const char* test_suite_name,
+                                CodeLocation code_location) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test suite " << test_suite_name << ".\n"
+      << "All tests in the same test suite must use the same test fixture\n"
+      << "class.  However, in test suite " << test_suite_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test suites.";
+
+  GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
+                                          code_location.line)
+                    << " " << errors.GetString();
+}
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestSuite class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+  // Constructor.
+  //
+  // TestNameIs has NO default constructor.
+  explicit TestNameIs(const char* name) : name_(name) {}
+
+  // Returns true if and only if the test name of test_info matches name_.
+  bool operator()(const TestInfo* test_info) const {
+    return test_info && test_info->name() == name_;
+  }
+
+ private:
+  std::string name_;
+};
+
+}  // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    type_parameterized_test_registry_.CheckForInstantiations();
+    parameterized_tests_registered_ = true;
+  }
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+  if (!should_run_) {
+    if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+    return;
+  }
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+  result_.set_start_timestamp(internal::GetTimeInMillis());
+  internal::Timer timer;
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test if the constructor didn't generate a fatal failure or invoke
+  // GTEST_SKIP().
+  // Note that the object will not be null
+  if (!Test::HasFatalFailure() && !Test::IsSkipped()) {
+    // This doesn't throw as all user code that can throw are wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  if (test != nullptr) {
+    // Deletes the test object.
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
+
+  result_.set_elapsed_time(timer.Elapsed());
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(nullptr);
+}
+
+// Skip and records a skipped test result for this object.
+void TestInfo::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TestPartResult test_part_result =
+      TestPartResult(TestPartResult::kSkip, this->file(), this->line(), "");
+  impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      test_part_result);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+  impl->set_current_test_info(nullptr);
+}
+
+// class TestSuite
+
+// Gets the number of successful tests in this test suite.
+int TestSuite::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of successful tests in this test suite.
+int TestSuite::skipped_test_count() const {
+  return CountIf(test_info_list_, TestSkipped);
+}
+
+// Gets the number of failed tests in this test suite.
+int TestSuite::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestSuite::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test suite.
+int TestSuite::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestSuite::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test suite that should run.
+int TestSuite::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestSuite::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestSuite with the given name.
+//
+// Arguments:
+//
+//   a_name:       name of the test suite
+//   a_type_param: the name of the test suite's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test suite.
+//   set_up_tc:    pointer to the function that sets up the test suite
+//   tear_down_tc: pointer to the function that tears down the test suite
+TestSuite::TestSuite(const char* a_name, const char* a_type_param,
+                     internal::SetUpTestSuiteFunc set_up_tc,
+                     internal::TearDownTestSuiteFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      start_timestamp_(0),
+      elapsed_time_(0) {}
+
+// Destructor of TestSuite.
+TestSuite::~TestSuite() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo* TestSuite::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo* TestSuite::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
+}
+
+// Adds a test to this test suite.  Will delete the test upon
+// destruction of the TestSuite object.
+void TestSuite::AddTestInfo(TestInfo* test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
+
+// Runs every test in this TestSuite.
+void TestSuite::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+
+  const bool skip_all = ad_hoc_test_result().Failed();
+
+  start_timestamp_ = internal::GetTimeInMillis();
+  internal::Timer timer;
+  for (int i = 0; i < total_test_count(); i++) {
+    if (skip_all) {
+      GetMutableTestInfo(i)->Skip();
+    } else {
+      GetMutableTestInfo(i)->Run();
+    }
+    if (GTEST_FLAG_GET(fail_fast) &&
+        GetMutableTestInfo(i)->result()->Failed()) {
+      for (int j = i + 1; j < total_test_count(); j++) {
+        GetMutableTestInfo(j)->Skip();
+      }
+      break;
+    }
+  }
+  elapsed_time_ = timer.Elapsed();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Skips all tests under this TestSuite.
+void TestSuite::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Skip();
+  }
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Clears the results of all tests in this test suite.
+void TestSuite::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test suite.
+void TestSuite::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestSuite::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun.  Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count, const char* singular_form,
+                                       const char* plural_form) {
+  return internal::StreamableToString(count) + " " +
+         (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test suites.
+static std::string FormatTestSuiteCount(int test_suite_count) {
+  return FormatCountableNoun(test_suite_count, "test suite", "test suites");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation.  Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char* TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSkip:
+      return "Skipped\n";
+    case TestPartResult::kSuccess:
+      return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+    default:
+      return "Unknown result type";
+  }
+}
+
+namespace internal {
+namespace {
+enum class GTestColor { kDefault, kRed, kGreen, kYellow };
+}  // namespace
+
+// Prints a TestPartResult to an std::string.
+static std::string PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  return (Message() << internal::FormatFileLocation(
+                           test_part_result.file_name(),
+                           test_part_result.line_number())
+                    << " "
+                    << TestPartResultTypeToString(test_part_result.type())
+                    << test_part_result.message())
+      .GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const std::string& result = PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+
+// Returns the character attribute for the given color.
+static WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case GTestColor::kRed:
+      return FOREGROUND_RED;
+    case GTestColor::kGreen:
+      return FOREGROUND_GREEN;
+    case GTestColor::kYellow:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:
+      return 0;
+  }
+}
+
+static int GetBitOffset(WORD color_mask) {
+  if (color_mask == 0) return 0;
+
+  int bitOffset = 0;
+  while ((color_mask & 1) == 0) {
+    color_mask >>= 1;
+    ++bitOffset;
+  }
+  return bitOffset;
+}
+
+static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
+  // Let's reuse the BG
+  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
+  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
+  const WORD existing_bg = old_color_attrs & background_mask;
+
+  WORD new_color =
+      GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY;
+  static const int bg_bitOffset = GetBitOffset(background_mask);
+  static const int fg_bitOffset = GetBitOffset(foreground_mask);
+
+  if (((new_color & background_mask) >> bg_bitOffset) ==
+      ((new_color & foreground_mask) >> fg_bitOffset)) {
+    new_color ^= FOREGROUND_INTENSITY;  // invert intensity
+  }
+  return new_color;
+}
+
+#else
+
+// Returns the ANSI color code for the given color. GTestColor::kDefault is
+// an invalid input.
+static const char* GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case GTestColor::kRed:
+      return "1";
+    case GTestColor::kGreen:
+      return "2";
+    case GTestColor::kYellow:
+      return "3";
+    default:
+      return nullptr;
+  }
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true if and only if Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  std::string c = GTEST_FLAG_GET(color);
+  const char* const gtest_color = c.c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char* const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "screen-256color") ||
+        String::CStringEquals(term, "tmux") ||
+        String::CStringEquals(term, "tmux-256color") ||
+        String::CStringEquals(term, "rxvt-unicode") ||
+        String::CStringEquals(term, "rxvt-unicode-256color") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+         String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+
+GTEST_ATTRIBUTE_PRINTF_(2, 3)
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != GTestColor::kDefault);
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD new_color = GetNewColor(color, old_color_attrs);
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle, new_color);
+
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+static void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+
+  if (type_param != nullptr || value_param != nullptr) {
+    printf(", where ");
+    if (type_param != nullptr) {
+      printf("%s = %s", kTypeParamLabel, type_param);
+      if (value_param != nullptr) printf(" and ");
+    }
+    if (value_param != nullptr) {
+      printf("%s = %s", kValueParamLabel, value_param);
+    }
+  }
+}
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char* test_suite, const char* test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase& test_case) override;
+#else
+  void OnTestSuiteStart(const TestSuite& test_suite) override;
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo& test_info) override;
+  void OnTestDisabled(const TestInfo& test_info) override;
+
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& test_case) override;
+#else
+  void OnTestSuiteEnd(const TestSuite& test_suite) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+
+ private:
+  static void PrintFailedTests(const UnitTest& unit_test);
+  static void PrintFailedTestSuites(const UnitTest& unit_test);
+  static void PrintSkippedTests(const UnitTest& unit_test);
+};
+
+// Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG_GET(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  std::string f = GTEST_FLAG_GET(filter);
+  const char* const filter = f.c_str();
+
+  // Prints the filter if it's not *.  This reminds the user that some
+  // tests may be skipped.
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_,
+                  filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG_GET(shuffle)) {
+    ColoredPrintf(GTestColor::kYellow,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  }
+  fflush(stdout);
+}
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteStart(
+    const TestSuite& test_suite) {
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_suite.name());
+  if (test_suite.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param());
+  }
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(GTestColor::kGreen, "[ RUN      ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) {
+  ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess:
+      return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(GTestColor::kGreen, "[       OK ] ");
+  } else if (test_info.result()->Skipped()) {
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+  } else {
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG_GET(print_time)) {
+    printf(" (%s ms)\n",
+           internal::StreamableToString(test_info.result()->elapsed_time())
+               .c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (!GTEST_FLAG_GET(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
+  if (!GTEST_FLAG_GET(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
+         internal::StreamableToString(test_suite.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+  printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo& test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Failed()) {
+        continue;
+      }
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+  printf("\n%2d FAILED %s\n", failed_test_count,
+         failed_test_count == 1 ? "TEST" : "TESTS");
+}
+
+// Internal helper for printing the list of test suite failures not covered by
+// PrintFailedTests.
+void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
+    const UnitTest& unit_test) {
+  int suite_failure_count = 0;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run()) {
+      continue;
+    }
+    if (test_suite.ad_hoc_test_result().Failed()) {
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+      printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
+      ++suite_failure_count;
+    }
+  }
+  if (suite_failure_count > 0) {
+    printf("\n%2d FAILED TEST %s\n", suite_failure_count,
+           suite_failure_count == 1 ? "SUITE" : "SUITES");
+  }
+}
+
+// Internal helper for printing the list of skipped tests.
+void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) {
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo& test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Skipped()) {
+        continue;
+      }
+      ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      printf("\n");
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG_GET(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+    printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
+    PrintSkippedTests(unit_test);
+  }
+
+  if (!unit_test.Passed()) {
+    PrintFailedTests(unit_test);
+    PrintFailedTestSuites(unit_test);
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// This class implements the TestEventListener interface.
+//
+// Class BriefUnitTestResultPrinter is copyable.
+class BriefUnitTestResultPrinter : public TestEventListener {
+ public:
+  BriefUnitTestResultPrinter() {}
+  static void PrintTestName(const char* test_suite, const char* test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo& /*test_info*/) override {}
+  void OnTestDisabled(const TestInfo& /*test_info*/) override {}
+
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+};
+
+// Called after an assertion failure.
+void BriefUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess:
+      return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Failed()) {
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+    PrintTestName(test_info.test_suite_name(), test_info.name());
+    PrintFullTestCommentIfPresent(test_info);
+
+    if (GTEST_FLAG_GET(print_time)) {
+      printf(" (%s ms)\n",
+             internal::StreamableToString(test_info.result()->elapsed_time())
+                 .c_str());
+    } else {
+      printf("\n");
+    }
+    fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                    int /*iteration*/) {
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG_GET(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+    printf("%s.\n", FormatTestCount(skipped_test_count).c_str());
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End BriefUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  ~TestEventRepeater() override;
+  void Append(TestEventListener* listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  void OnTestProgramStart(const UnitTest& unit_test) override;
+  void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override;
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestSuite& parameter) override;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteStart(const TestSuite& parameter) override;
+  void OnTestStart(const TestInfo& test_info) override;
+  void OnTestDisabled(const TestInfo& test_info) override;
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& parameter) override;
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteEnd(const TestSuite& parameter) override;
+  void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override;
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& unit_test) override;
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener*> listeners_;
+
+  TestEventRepeater(const TestEventRepeater&) = delete;
+  TestEventRepeater& operator=(const TestEventRepeater&) = delete;
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener* listener) {
+  listeners_.push_back(listener);
+}
+
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + static_cast<int>(i));
+      return listener;
+    }
+  }
+
+  return nullptr;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type)              \
+  void TestEventRepeater::Name(const Type& parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = 0; i < listeners_.size(); i++) {  \
+        listeners_[i]->Name(parameter);                 \
+      }                                                 \
+    }                                                   \
+  }
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
+  void TestEventRepeater::Name(const Type& parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = listeners_.size(); i != 0; i--) { \
+        listeners_[i - 1]->Name(parameter);             \
+      }                                                 \
+    }                                                   \
+  }
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite)
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = listeners_.size(); i > 0; i--) {
+      listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void ListTestsMatchingFilter(const std::vector<TestSuite*>& test_suites);
+
+  // Prints an XML summary of all unit tests.
+  static void PrintXmlTestsList(std::ostream* stream,
+                                const std::vector<TestSuite*>& test_suites);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(unsigned char c) {
+    return c == '\t' || c == '\n' || c == '\r';
+  }
+
+  // May c appear in a well-formed XML document?
+  // https://www.w3.org/TR/REC-xml/#charsets
+  static bool IsValidXmlCharacter(unsigned char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string& str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char* str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream* stream,
+                                 const std::string& element_name,
+                                 const std::string& name,
+                                 const std::string& value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams a test suite XML stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputXmlTestSuiteForTestResult(::std::ostream* stream,
+                                              const TestResult& result);
+
+  // Streams an XML representation of a TestResult object.
+  static void OutputXmlTestResult(::std::ostream* stream,
+                                  const TestResult& result);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_suite_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestSuite object
+  static void PrintXmlTestSuite(::std::ostream* stream,
+                                const TestSuite& test_suite);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream* stream,
+                               const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // Streams an XML representation of the test properties of a TestResult
+  // object.
+  static void OutputXmlTestProperties(std::ostream* stream,
+                                      const TestResult& result);
+
+  // The output file.
+  const std::string output_file_;
+
+  XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+  XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "XML output file may not be null";
+  }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlUnitTest(&stream, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
+    const std::vector<TestSuite*>& test_suites) {
+  FILE* xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlTestsList(&stream, test_suites);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str.  If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+                                                bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<':
+        m << "&lt;";
+        break;
+      case '>':
+        m << "&gt;";
+        break;
+      case '&':
+        m << "&amp;";
+        break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+          if (is_attribute &&
+              IsNormalizableWhitespace(static_cast<unsigned char>(ch)))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string& str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(static_cast<unsigned char>(*it)))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestSuite object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3);
+  return ss.str();
+}
+
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == nullptr) return false;
+  *out = *tm_ptr;
+  return true;
+#elif defined(__STDC_LIB_EXT1__)
+  // Uses localtime_s when available as localtime_r is only available from
+  // C23 standard.
+  return localtime_s(&seconds, out) != nullptr;
+#else
+  return localtime_r(&seconds, out) != nullptr;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss.sss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "." +
+         String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
+}
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != nullptr) {
+      stream->write(segment,
+                    static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, const std::string& value) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Streams a test suite XML stanza containing the given test result.
+void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a minimal test suite with one test.
+  *stream << "  <testsuite";
+  OutputXmlAttribute(stream, "testsuite", "name", "NonTestSuiteFailure");
+  OutputXmlAttribute(stream, "testsuite", "tests", "1");
+  OutputXmlAttribute(stream, "testsuite", "failures", "1");
+  OutputXmlAttribute(stream, "testsuite", "disabled", "0");
+  OutputXmlAttribute(stream, "testsuite", "skipped", "0");
+  OutputXmlAttribute(stream, "testsuite", "errors", "0");
+  OutputXmlAttribute(stream, "testsuite", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testsuite", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  *stream << ">";
+
+  // Output the boilerplate for a minimal test case with a single test.
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, "testcase", "name", "");
+  OutputXmlAttribute(stream, "testcase", "status", "run");
+  OutputXmlAttribute(stream, "testcase", "result", "completed");
+  OutputXmlAttribute(stream, "testcase", "classname", "");
+  OutputXmlAttribute(stream, "testcase", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testcase", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+
+  // Output the actual test result.
+  OutputXmlTestResult(stream, result);
+
+  // Complete the test suite.
+  *stream << "  </testsuite>\n";
+}
+
+// Prints an XML representation of a TestInfo object.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_suite_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+
+  if (test_info.is_in_another_shard()) {
+    return;
+  }
+
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, kTestsuite, "name", test_info.name());
+
+  if (test_info.value_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "type_param",
+                       test_info.type_param());
+  }
+
+  OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+  OutputXmlAttribute(stream, kTestsuite, "line",
+                     StreamableToString(test_info.line()));
+  if (GTEST_FLAG_GET(list_tests)) {
+    *stream << " />\n";
+    return;
+  }
+
+  OutputXmlAttribute(stream, kTestsuite, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestsuite, "result",
+                     test_info.should_run()
+                         ? (result.Skipped() ? "skipped" : "completed")
+                         : "suppressed");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
+
+  OutputXmlTestResult(stream, result);
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
+                                                   const TestResult& result) {
+  int failures = 0;
+  int skips = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1 && skips == 0) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\"" << EscapeXmlAttribute(summary)
+              << "\" type=\"\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    } else if (part.skipped()) {
+      if (++skips == 1 && failures == 0) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <skipped message=\""
+              << EscapeXmlAttribute(summary.c_str()) << "\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</skipped>\n";
+    }
+  }
+
+  if (failures == 0 && skips == 0 && result.test_property_count() == 0) {
+    *stream << " />\n";
+  } else {
+    if (failures == 0 && skips == 0) {
+      *stream << ">\n";
+    }
+    OutputXmlTestProperties(stream, result);
+    *stream << "    </testcase>\n";
+  }
+}
+
+// Prints an XML representation of a TestSuite object
+void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
+                                                 const TestSuite& test_suite) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_suite.reportable_test_count()));
+  if (!GTEST_FLAG_GET(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "failures",
+                       StreamableToString(test_suite.failed_test_count()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "disabled",
+        StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "skipped",
+                       StreamableToString(test_suite.skipped_test_count()));
+
+    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+
+    OutputXmlAttribute(stream, kTestsuite, "time",
+                       FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp()));
+    *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result());
+  }
+  *stream << ">\n";
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+                                                const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+
+  if (GTEST_FLAG_GET(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
+      PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
+  }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+void XmlUnitTestResultPrinter::PrintXmlTestsList(
+    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(total_tests));
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (auto test_suite : test_suites) {
+    PrintXmlTestSuite(stream, *test_suite);
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestProperties(
+    std::ostream* stream, const TestResult& result) {
+  const std::string kProperties = "properties";
+  const std::string kProperty = "property";
+
+  if (result.test_property_count() <= 0) {
+    return;
+  }
+
+  *stream << "      <" << kProperties << ">\n";
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    *stream << "        <" << kProperty;
+    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
+    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
+    *stream << "/>\n";
+  }
+  *stream << "      </" << kProperties << ">\n";
+}
+
+// End XmlUnitTestResultPrinter
+
+// This class generates an JSON output file.
+class JsonUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit JsonUnitTestResultPrinter(const char* output_file);
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+
+  // Prints an JSON summary of all unit tests.
+  static void PrintJsonTestList(::std::ostream* stream,
+                                const std::vector<TestSuite*>& test_suites);
+
+ private:
+  // Returns an JSON-escaped copy of the input string str.
+  static std::string EscapeJson(const std::string& str);
+
+  //// Verifies that the given attribute belongs to the given element and
+  //// streams the attribute as JSON.
+  static void OutputJsonKey(std::ostream* stream,
+                            const std::string& element_name,
+                            const std::string& name, const std::string& value,
+                            const std::string& indent, bool comma = true);
+  static void OutputJsonKey(std::ostream* stream,
+                            const std::string& element_name,
+                            const std::string& name, int value,
+                            const std::string& indent, bool comma = true);
+
+  // Streams a test suite JSON stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputJsonTestSuiteForTestResult(::std::ostream* stream,
+                                               const TestResult& result);
+
+  // Streams a JSON representation of a TestResult object.
+  static void OutputJsonTestResult(::std::ostream* stream,
+                                   const TestResult& result);
+
+  // Streams a JSON representation of a TestInfo object.
+  static void OutputJsonTestInfo(::std::ostream* stream,
+                                 const char* test_suite_name,
+                                 const TestInfo& test_info);
+
+  // Prints a JSON representation of a TestSuite object
+  static void PrintJsonTestSuite(::std::ostream* stream,
+                                 const TestSuite& test_suite);
+
+  // Prints a JSON summary of unit_test to output stream out.
+  static void PrintJsonUnitTest(::std::ostream* stream,
+                                const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as
+  // a JSON dictionary.
+  static std::string TestPropertiesAsJson(const TestResult& result,
+                                          const std::string& indent);
+
+  // The output file.
+  const std::string output_file_;
+
+  JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+  JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+      delete;
+};
+
+// Creates a new JsonUnitTestResultPrinter.
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "JSON output file may not be null";
+  }
+}
+
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                   int /*iteration*/) {
+  FILE* jsonout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintJsonUnitTest(&stream, unit_test);
+  fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
+  fclose(jsonout);
+}
+
+// Returns an JSON-escaped copy of the input string str.
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '\\':
+      case '"':
+      case '/':
+        m << '\\' << ch;
+        break;
+      case '\b':
+        m << "\\b";
+        break;
+      case '\t':
+        m << "\\t";
+        break;
+      case '\n':
+        m << "\\n";
+        break;
+      case '\f':
+        m << "\\f";
+        break;
+      case '\r':
+        m << "\\r";
+        break;
+      default:
+        if (ch < ' ') {
+          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
+        } else {
+          m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// The following routines generate an JSON representation of a UnitTest
+// object.
+
+// Formats the given time in milliseconds as seconds.
+static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3) << "s";
+  return ss.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the
+// RFC3339 format, without the timezone information.
+static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+}
+
+static inline std::string Indent(size_t width) {
+  return std::string(width, ' ');
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+                                              const std::string& element_name,
+                                              const std::string& name,
+                                              const std::string& value,
+                                              const std::string& indent,
+                                              bool comma) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
+  if (comma) *stream << ",\n";
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream* stream, const std::string& element_name,
+    const std::string& name, int value, const std::string& indent, bool comma) {
+  const std::vector<std::string>& allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
+  if (comma) *stream << ",\n";
+}
+
+// Streams a test suite JSON stanza containing the given test result.
+void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a new test suite.
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
+  OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
+  if (!GTEST_FLAG_GET(list_tests)) {
+    OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
+    OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "time",
+                  FormatTimeInMillisAsDuration(result.elapsed_time()),
+                  Indent(6));
+    OutputJsonKey(stream, "testsuite", "timestamp",
+                  FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                  Indent(6));
+  }
+  *stream << Indent(6) << "\"testsuite\": [\n";
+
+  // Output the boilerplate for a new test case.
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, "testcase", "name", "", Indent(10));
+  OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10));
+  OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10));
+  OutputJsonKey(stream, "testcase", "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false);
+  *stream << TestPropertiesAsJson(result, Indent(10));
+
+  // Output the actual test result.
+  OutputJsonTestResult(stream, result);
+
+  // Finish the test suite.
+  *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON representation of a TestInfo object.
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
+                                                   const char* test_suite_name,
+                                                   const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+  const std::string kIndent = Indent(10);
+
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent);
+
+  if (test_info.value_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(),
+                  kIndent);
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
+                  kIndent);
+  }
+
+  OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+  if (GTEST_FLAG_GET(list_tests)) {
+    *stream << "\n" << Indent(8) << "}";
+    return;
+  } else {
+    *stream << ",\n";
+  }
+
+  OutputJsonKey(stream, kTestsuite, "status",
+                test_info.should_run() ? "RUN" : "NOTRUN", kIndent);
+  OutputJsonKey(stream, kTestsuite, "result",
+                test_info.should_run()
+                    ? (result.Skipped() ? "SKIPPED" : "COMPLETED")
+                    : "SUPPRESSED",
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent);
+  OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent,
+                false);
+  *stream << TestPropertiesAsJson(result, kIndent);
+
+  OutputJsonTestResult(stream, result);
+}
+
+void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
+                                                     const TestResult& result) {
+  const std::string kIndent = Indent(10);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      *stream << ",\n";
+      if (++failures == 1) {
+        *stream << kIndent << "\""
+                << "failures"
+                << "\": [\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string message = EscapeJson(location + "\n" + part.message());
+      *stream << kIndent << "  {\n"
+              << kIndent << "    \"failure\": \"" << message << "\",\n"
+              << kIndent << "    \"type\": \"\"\n"
+              << kIndent << "  }";
+    }
+  }
+
+  if (failures > 0) *stream << "\n" << kIndent << "]";
+  *stream << "\n" << Indent(8) << "}";
+}
+
+// Prints an JSON representation of a TestSuite object
+void JsonUnitTestResultPrinter::PrintJsonTestSuite(
+    std::ostream* stream, const TestSuite& test_suite) {
+  const std::string kTestsuite = "testsuite";
+  const std::string kIndent = Indent(6);
+
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
+                kIndent);
+  if (!GTEST_FLAG_GET(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "failures",
+                  test_suite.failed_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "disabled",
+                  test_suite.reportable_disabled_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent);
+    OutputJsonKey(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()),
+        kIndent);
+    OutputJsonKey(stream, kTestsuite, "time",
+                  FormatTimeInMillisAsDuration(test_suite.elapsed_time()),
+                  kIndent, false);
+    *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent)
+            << ",\n";
+  }
+
+  *stream << kIndent << "\"" << kTestsuite << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable()) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+    }
+  }
+  *stream << "\n" << kIndent << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON summary of unit_test to output stream out.
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
+                                                  const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG_GET(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i));
+    }
+  }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment) include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  for (size_t i = 0; i < test_suites.size(); ++i) {
+    if (i != 0) {
+      *stream << ",\n";
+    }
+    PrintJsonTestSuite(stream, *test_suites[i]);
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+// Produces a string representing the test properties in a result as
+// a JSON dictionary.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult& result, const std::string& indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << ",\n"
+               << indent << "\"" << property.key() << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+std::string StreamingListener::UrlEncode(const char* str) {
+  std::string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default:
+        result.push_back(ch);
+        break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = nullptr;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num =
+      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+                     cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class Streaming Listener
+#endif  // GTEST_CAN_STREAM_RESULTS__
+
+// class OsStackTraceGetter
+
+const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  std::string result;
+
+  if (max_depth <= 0) {
+    return result;
+  }
+
+  max_depth = std::min(max_depth, kMaxStackTraceDepth);
+
+  std::vector<void*> raw_stack(max_depth);
+  // Skips the frames requested by the caller, plus this function.
+  const int raw_stack_size =
+      absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
+
+  void* caller_frame = nullptr;
+  {
+    MutexLock lock(&mutex_);
+    caller_frame = caller_frame_;
+  }
+
+  for (int i = 0; i < raw_stack_size; ++i) {
+    if (raw_stack[i] == caller_frame &&
+        !GTEST_FLAG_GET(show_internal_stack_frames)) {
+      // Add a marker to the trace and stop adding frames.
+      absl::StrAppend(&result, kElidedFramesMarker, "\n");
+      break;
+    }
+
+    char tmp[1024];
+    const char* symbol = "(unknown)";
+    if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
+      symbol = tmp;
+    }
+
+    char line[1024];
+    snprintf(line, sizeof(line), "  %p: %s\n", raw_stack[i], symbol);
+    result += line;
+  }
+
+  return result;
+
+#else   // !GTEST_HAS_ABSL
+  static_cast<void>(max_depth);
+  static_cast<void>(skip_count);
+  return "";
+#endif  // GTEST_HAS_ABSL
+}
+
+void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  void* caller_frame = nullptr;
+  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
+    caller_frame = nullptr;
+  }
+
+  MutexLock lock(&mutex_);
+  caller_frame_ = caller_frame;
+#endif  // GTEST_HAS_ABSL
+}
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+      : premature_exit_filepath_(
+            premature_exit_filepath ? premature_exit_filepath : "") {
+    // If a path to the premature-exit file is specified...
+    if (!premature_exit_filepath_.empty()) {
+      // create the file with a single "0" character in it.  I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
+      fwrite("0", 1, 1, pfile);
+      fclose(pfile);
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+#if !defined GTEST_OS_ESP8266
+    if (!premature_exit_filepath_.empty()) {
+      int retval = remove(premature_exit_filepath_.c_str());
+      if (retval) {
+        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
+                          << premature_exit_filepath_ << "\" with error "
+                          << retval;
+      }
+    }
+#endif
+  }
+
+ private:
+  const std::string premature_exit_filepath_;
+
+  ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete;
+  ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete;
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(nullptr),
+      default_xml_generator_(nullptr) {}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Returns the standard listener responsible for the default console
+// output.  Can be removed from the listeners list to shut down default
+// console output.  Note that removing this object from the listener list
+// with Release transfers its ownership to the user.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it.  It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  if (listener == default_result_printer_)
+    default_result_printer_ = nullptr;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = nullptr;
+  return repeater_->Release(listener);
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != nullptr) Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener.  The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != nullptr) Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if defined(__BORLANDC__)
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // defined(__BORLANDC__)
+}
+
+// Gets the number of successful test suites.
+int UnitTest::successful_test_suite_count() const {
+  return impl()->successful_test_suite_count();
+}
+
+// Gets the number of failed test suites.
+int UnitTest::failed_test_suite_count() const {
+  return impl()->failed_test_suite_count();
+}
+
+// Gets the number of all test suites.
+int UnitTest::total_test_suite_count() const {
+  return impl()->total_test_suite_count();
+}
+
+// Gets the number of all test suites that contain at least one test
+// that should run.
+int UnitTest::test_suite_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_suite_count();
+}
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_suite_count();
+}
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_suite_count();
+}
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of skipped tests.
+int UnitTest::skipped_test_count() const {
+  return impl()->skipped_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+  return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true if and only if the unit test passed (i.e. all test suites
+// passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true if and only if the unit test failed (i.e. some test suite
+// failed or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+const TestSuite* UnitTest::GetTestSuite(int i) const {
+  return impl()->GetTestSuite(i);
+}
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test suites.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+TestSuite* UnitTest::GetMutableTestSuite(int i) {
+  return impl()->GetMutableSuiteCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == nullptr) {
+    return nullptr;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char* file_name, int line_number,
+                                 const std::string& message,
+                                 const std::string& os_stack_trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n"
+          << internal::FormatFileLocation(trace.file, trace.line) << " "
+          << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result = TestPartResult(
+      result_type, file_name, line_number, msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      result);
+
+  if (result_type != TestPartResult::kSuccess &&
+      result_type != TestPartResult::kSkip) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG_GET(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#elif (!defined(__native_client__)) &&            \
+    ((defined(__clang__) || defined(__GNUC__)) && \
+     (defined(__x86_64__) || defined(__i386__)))
+      // with clang/gcc we can achieve the same effect on x86 by invoking int3
+      asm("int3");
+#else
+      // Dereference nullptr through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: some debuggers don't correctly trap abort().
+      *static_cast<volatile int*>(nullptr) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG_GET(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+// from SetUpTestSuite or TearDownTestSuite, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+                              const std::string& value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  const bool in_death_test_child_process =
+      GTEST_FLAG_GET(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process
+          ? nullptr
+          : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
+
+#if GTEST_OS_WINDOWS
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+#endif  // !GTEST_OS_WINDOWS_MOBILE
+
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+#endif
+
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    if (!GTEST_FLAG_GET(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+
+    // In debug mode, the Windows CRT can crash with an assertion over invalid
+    // input (e.g. passing an invalid file descriptor).  The default handling
+    // for these assertions is to pop up a dialog and wait for user input.
+    // Instead ask the CRT to dump such assertions to stderr non-interactively.
+    if (!IsDebuggerPresent()) {
+      (void)_CrtSetReportMode(_CRT_ASSERT,
+                              _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
+      (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
+    }
+#endif
+  }
+#endif  // GTEST_OS_WINDOWS
+
+  return internal::HandleExceptionsInMethodIfSupported(
+             impl(), &internal::UnitTestImpl::RunAllTests,
+             "auxiliary test code (environments or event listeners)")
+             ? 0
+             : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestSuite object for the test that's currently running,
+// or NULL if no test is running.
+const TestSuite* UnitTest::current_test_suite() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase* UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+#endif
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo* UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+// Returns ParameterizedTestSuiteRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestSuiteRegistry&
+UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() { delete impl_; }
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+          default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+      GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+      last_death_test_suite_(-1),
+      current_test_suite_(nullptr),
+      current_test_info_(nullptr),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(nullptr),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),       // Will be reseeded before first use.
+      start_timestamp_(0),
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestSuite.
+  ForEach(test_suites_, internal::Delete<TestSuite>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in a
+// context of a test, to current test suite's ad_hoc_test_result when invoke
+// from SetUpTestSuite/TearDownTestSuite, or to the global property set
+// otherwise.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+  std::string xml_element;
+  TestResult* test_result;  // TestResult appropriate for property recording.
+
+  if (current_test_info_ != nullptr) {
+    xml_element = "testcase";
+    test_result = &(current_test_info_->result_);
+  } else if (current_test_suite_ != nullptr) {
+    xml_element = "testsuite";
+    test_result = &(current_test_suite_->ad_hoc_test_result_);
+  } else {
+    xml_element = "testsuites";
+    test_result = &ad_hoc_test_result_;
+  }
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != nullptr)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format == "json") {
+    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                        << output_format << "\" ignored.";
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string& target = GTEST_FLAG_GET(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != std::string::npos) {
+      listeners()->Append(
+          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
+    } else {
+      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                          << "\" ignored.";
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+    // Register to send notifications about key process state changes.
+    listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
+#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests. This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+    if (GTEST_FLAG_GET(brief)) {
+      listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
+    }
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+    if (GTEST_FLAG_GET(install_failure_signal_handler)) {
+      absl::FailureSignalHandlerOptions options;
+      absl::InstallFailureSignalHandler(options);
+    }
+#endif  // GTEST_HAS_ABSL
+  }
+}
+
+// A predicate that checks the name of a TestSuite against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestSuiteNameIs is copyable.
+class TestSuiteNameIs {
+ public:
+  // Constructor.
+  explicit TestSuiteNameIs(const std::string& name) : name_(name) {}
+
+  // Returns true if and only if the name of test_suite matches name_.
+  bool operator()(const TestSuite* test_suite) const {
+    return test_suite != nullptr &&
+           strcmp(test_suite->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestSuite with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_suite_name: name of the test suite
+//   type_param:      the name of the test suite's type parameter, or NULL if
+//                    this is not a typed or a type-parameterized test suite.
+//   set_up_tc:       pointer to the function that sets up the test suite
+//   tear_down_tc:    pointer to the function that tears down the test suite
+TestSuite* UnitTestImpl::GetTestSuite(
+    const char* test_suite_name, const char* type_param,
+    internal::SetUpTestSuiteFunc set_up_tc,
+    internal::TearDownTestSuiteFunc tear_down_tc) {
+  // Can we find a TestSuite with the given name?
+  const auto test_suite =
+      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
+                   TestSuiteNameIs(test_suite_name));
+
+  if (test_suite != test_suites_.rend()) return *test_suite;
+
+  // No.  Let's create one.
+  auto* const new_test_suite =
+      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+
+  const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
+  // Is this a death test suite?
+  if (death_test_suite_filter.MatchesName(test_suite_name)) {
+    // Yes.  Inserts the test suite after the last death test suite
+    // defined so far.  This only works when the test suites haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_suite_;
+    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
+                        new_test_suite);
+  } else {
+    // No.  Appends to the end of the list.
+    test_suites_.push_back(new_test_suite);
+  }
+
+  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
+  return new_test_suite;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // True if and only if Google Test is initialized before RUN_ALL_TESTS() is
+  // called.
+  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag) return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True if and only if we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test =
+      (internal_run_death_test_flag_.get() != nullptr);
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+  if (in_subprocess_for_death_test) {
+    GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
+  }
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run =
+      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+                               : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG_GET(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
+
+  // True if and only if at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
+
+  // Repeats forever if the repeat count is negative.
+  const bool gtest_repeat_forever = repeat < 0;
+
+  // Should test environments be set up and torn down for each repeat, or only
+  // set up on the first and torn down on the last iteration? If there is no
+  // "last" iteration because the tests will repeat forever, always recreate the
+  // environments to avoid leaks in case one of the environments is using
+  // resources that are external to this process. Without this check there would
+  // be no way to clean up those external resources automatically.
+  const bool recreate_environments_when_repeating =
+      GTEST_FLAG_GET(recreate_environments_when_repeating) ||
+      gtest_repeat_forever;
+
+  for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    Timer timer;
+
+    // Shuffles test suites and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
+      random()->Reseed(static_cast<uint32_t>(random_seed_));
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test suite if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand. If test environments aren't
+      // recreated for each iteration, only do so on the first iteration.
+      if (i == 0 || recreate_environments_when_repeating) {
+        repeater->OnEnvironmentsSetUpStart(*parent_);
+        ForEach(environments_, SetUpEnvironment);
+        repeater->OnEnvironmentsSetUpEnd(*parent_);
+      }
+
+      // Runs the tests only if there was no fatal failure or skip triggered
+      // during global set-up.
+      if (Test::IsSkipped()) {
+        // Emit diagnostics when global set-up calls skip, as it will not be
+        // emitted by default.
+        TestResult& test_result =
+            *internal::GetUnitTestImpl()->current_test_result();
+        for (int j = 0; j < test_result.total_part_count(); ++j) {
+          const TestPartResult& test_part_result =
+              test_result.GetTestPartResult(j);
+          if (test_part_result.type() == TestPartResult::kSkip) {
+            const std::string& result = test_part_result.message();
+            printf("%s\n", result.c_str());
+          }
+        }
+        fflush(stdout);
+      } else if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Run();
+          if (GTEST_FLAG_GET(fail_fast) &&
+              GetMutableSuiteCase(test_index)->Failed()) {
+            for (int j = test_index + 1; j < total_test_suite_count(); j++) {
+              GetMutableSuiteCase(j)->Skip();
+            }
+            break;
+          }
+        }
+      } else if (Test::HasFatalFailure()) {
+        // If there was a fatal failure during the global setup then we know we
+        // aren't going to run any tests. Explicitly mark all of the tests as
+        // skipped to make this obvious in the output.
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Skip();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards. If test
+      // environments aren't recreated for each iteration, only do so on the
+      // last iteration.
+      if (i == repeat - 1 || recreate_environments_when_repeating) {
+        repeater->OnEnvironmentsTearDownStart(*parent_);
+        std::for_each(environments_.rbegin(), environments_.rend(),
+                      TearDownEnvironment);
+        repeater->OnEnvironmentsTearDownEnd(*parent_);
+      }
+    }
+
+    elapsed_time_ = timer.Elapsed();
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG_GET(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  if (!gtest_is_initialized_before_run_all_tests) {
+    ColoredPrintf(
+        GTestColor::kRed,
+        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+        " will start to enforce the valid usage. "
+        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
+#if GTEST_FOR_GOOGLE_
+    ColoredPrintf(GTestColor::kRed,
+                  "For more details, see http://wiki/Main/ValidGUnitMain.\n");
+#endif  // GTEST_FOR_GOOGLE_
+  }
+
+  return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != nullptr) {
+    FILE* const file = posix::FOpen(test_shard_file, "w");
+    if (file == nullptr) {
+      ColoredPrintf(GTestColor::kRed,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char* total_shards_env, const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message() << "Invalid environment variables: you have "
+                                  << kTestShardIndex << " = " << shard_index
+                                  << ", but have left " << kTestTotalShards
+                                  << " unset.\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+                        << "Invalid environment variables: you have "
+                        << kTestTotalShards << " = " << total_shards
+                        << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg =
+        Message() << "Invalid environment variables: we require 0 <= "
+                  << kTestShardIndex << " < " << kTestTotalShards
+                  << ", but you have " << kTestShardIndex << "=" << shard_index
+                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) {
+  const char* str_val = posix::GetEnv(var);
+  if (str_val == nullptr) {
+    return default_val;
+  }
+
+  int32_t result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard. The test id
+// is some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestSuite and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
+// . Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
+                                   : -1;
+  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
+                                  : -1;
+
+  const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
+      GTEST_FLAG_GET(filter));
+  const UnitTestFilter disable_test_filter(kDisableTestFilter);
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (auto* test_suite : test_suites_) {
+    const std::string& test_suite_name = test_suite->name();
+    test_suite->set_should_run(false);
+
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_suite->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test suite name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          disable_test_filter.MatchesName(test_suite_name) ||
+          disable_test_filter.MatchesName(test_name);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          gtest_flag_filter.MatchesTest(test_suite_name, test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_in_another_shard =
+          shard_tests != IGNORE_SHARDING_PROTOCOL &&
+          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
+      test_info->is_in_another_shard_ = is_in_another_shard;
+      const bool is_selected = is_runnable && !is_in_another_shard;
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_suite->set_should_run(test_suite->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n".  If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char* str, int max_length) {
+  if (str != nullptr) {
+    for (int i = 0; *str != '\0'; ++str) {
+      if (i >= max_length) {
+        printf("...");
+        break;
+      }
+      if (*str == '\n') {
+        printf("\\n");
+        i += 2;
+      } else {
+        printf("%c", *str);
+        ++i;
+      }
+    }
+  }
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (auto* test_suite : test_suites_) {
+    bool printed_test_suite_name = false;
+
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      const TestInfo* const test_info = test_suite->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_suite_name) {
+          printed_test_suite_name = true;
+          printf("%s.", test_suite->name());
+          if (test_suite->type_param() != nullptr) {
+            printf("  # %s = ", kTypeParamLabel);
+            // We print the type parameter on a single line to make
+            // the output easy to parse by a program.
+            PrintOnOneLine(test_suite->type_param(), kMaxParamLength);
+          }
+          printf("\n");
+        }
+        printf("  %s", test_info->name());
+        if (test_info->value_param() != nullptr) {
+          printf("  # %s = ", kValueParamLabel);
+          // We print the value parameter on a single line to make the
+          // output easy to parse by a program.
+          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+    }
+  }
+  fflush(stdout);
+  const std::string& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml" || output_format == "json") {
+    FILE* fileout = OpenFileForWriting(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+    std::stringstream stream;
+    if (output_format == "xml") {
+      XmlUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintXmlTestsList(&stream, test_suites_);
+    } else if (output_format == "json") {
+      JsonUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintJsonTestList(&stream, test_suites_);
+    }
+    fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
+    fclose(fileout);
+  }
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == nullptr) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the most specific TestResult currently running.
+TestResult* UnitTestImpl::current_test_result() {
+  if (current_test_info_ != nullptr) {
+    return &current_test_info_->result_;
+  }
+  if (current_test_suite_ != nullptr) {
+    return &current_test_suite_->ad_hoc_test_result_;
+  }
+  return &ad_hoc_test_result_;
+}
+
+// Shuffles all test suites, and the tests within each test suite,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test suites.
+  ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_);
+
+  // Shuffles the non-death test suites.
+  ShuffleRange(random(), last_death_test_suite_ + 1,
+               static_cast<int>(test_suites_.size()), &test_suite_indices_);
+
+  // Shuffles the tests inside each test suite.
+  for (auto& test_suite : test_suites_) {
+    test_suite->ShuffleTests(random());
+  }
+}
+
+// Restores the test suites and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_suites_.size(); i++) {
+    // Unshuffles the tests in each test suite.
+    test_suites_[i]->UnshuffleTests();
+    // Resets the index of each test suite.
+    test_suite_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string
+GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}  // namespace
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(*pstr, prefix, prefix_len) == 0) {
+    *pstr += prefix_len;
+    return true;
+  }
+  return false;
+}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+static const char* ParseFlagValue(const char* str, const char* flag_name,
+                                  bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == nullptr || flag_name == nullptr) return nullptr;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const std::string flag_str =
+      std::string("--") + GTEST_FLAG_PREFIX_ + flag_name;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return nullptr;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag_name, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for an int32_t flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag_name, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
+                    value);
+}
+
+// Parses a string for a string flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+template <typename String>
+static bool ParseFlag(const char* str, const char* flag_name, String* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag_name, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = GTestColor::kDefault;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char* p = strchr(str, '@');
+    if (p == nullptr) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = GTestColor::kDefault;
+    } else if (ch == 'R') {
+      color = GTestColor::kRed;
+    } else if (ch == 'G') {
+      color = GTestColor::kGreen;
+    } else if (ch == 'Y') {
+      color = GTestColor::kYellow;
+    } else {
+      --str;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =
+    "This program contains tests written using " GTEST_NAME_
+    ". You can use the\n"
+    "following command line flags to control its behavior:\n"
+    "\n"
+    "Test Selection:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D\n"
+    "      List the names of all tests instead of running them. The name of\n"
+    "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "filter=@YPOSITIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+    "      Run only the tests whose name matches one of the positive patterns "
+    "but\n"
+    "      none of the negative patterns. '?' matches any single character; "
+    "'*'\n"
+    "      matches any substring; ':' separates two patterns.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "also_run_disabled_tests@D\n"
+    "      Run all disabled tests too.\n"
+    "\n"
+    "Test Execution:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "repeat=@Y[COUNT]@D\n"
+    "      Run the tests repeatedly; use a negative count to repeat forever.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "shuffle@D\n"
+    "      Randomize tests' orders on every iteration.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "random_seed=@Y[NUMBER]@D\n"
+    "      Random number seed to use for shuffling test orders (between 1 and\n"
+    "      99999, or 0 to use a seed based on the current time).\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "recreate_environments_when_repeating@D\n"
+    "      Sets up and tears down the global test environment on each repeat\n"
+    "      of the test.\n"
+    "\n"
+    "Test Output:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+    "      Enable/disable colored output. The default is @Gauto@D.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "brief=1@D\n"
+    "      Only print test failures.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "print_time=0@D\n"
+    "      Don't print the elapsed time of each test.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+    "@Y|@G:@YFILE_PATH]@D\n"
+    "      Generate a JSON or XML report in the given directory or with the "
+    "given\n"
+    "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+    "  @G--" GTEST_FLAG_PREFIX_
+    "stream_result_to=@YHOST@G:@YPORT@D\n"
+    "      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+    "\n"
+    "Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+    "      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "break_on_failure@D\n"
+    "      Turn assertion failures into debugger break-points.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "throw_on_failure@D\n"
+    "      Turn assertion failures into C++ exceptions for use by an external\n"
+    "      test framework.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "catch_exceptions=0@D\n"
+    "      Do not report exceptions as test failures. Instead, allow them\n"
+    "      to crash the program or throw a pop-up (on Windows).\n"
+    "\n"
+    "Except for @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+    "environment variable of a flag (all letters in upper-case). For example, "
+    "to\n"
+    "disable colored text output, you can either specify "
+    "@G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+    "the @G" GTEST_FLAG_PREFIX_UPPER_
+    "COLOR@D environment variable to @Gno@D.\n"
+    "\n"
+    "For more information, please read the " GTEST_NAME_
+    " documentation at\n"
+    "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_
+    "\n"
+    "(not one in your own code or tests), please report it to\n"
+    "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+static bool ParseGoogleTestFlag(const char* const arg) {
+#define GTEST_INTERNAL_PARSE_FLAG(flag_name)  \
+  do {                                        \
+    auto value = GTEST_FLAG_GET(flag_name);   \
+    if (ParseFlag(arg, #flag_name, &value)) { \
+      GTEST_FLAG_SET(flag_name, value);       \
+      return true;                            \
+    }                                         \
+  } while (false)
+
+  GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
+  GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
+  GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
+  GTEST_INTERNAL_PARSE_FLAG(color);
+  GTEST_INTERNAL_PARSE_FLAG(death_test_style);
+  GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
+  GTEST_INTERNAL_PARSE_FLAG(fail_fast);
+  GTEST_INTERNAL_PARSE_FLAG(filter);
+  GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
+  GTEST_INTERNAL_PARSE_FLAG(list_tests);
+  GTEST_INTERNAL_PARSE_FLAG(output);
+  GTEST_INTERNAL_PARSE_FLAG(brief);
+  GTEST_INTERNAL_PARSE_FLAG(print_time);
+  GTEST_INTERNAL_PARSE_FLAG(print_utf8);
+  GTEST_INTERNAL_PARSE_FLAG(random_seed);
+  GTEST_INTERNAL_PARSE_FLAG(repeat);
+  GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
+  GTEST_INTERNAL_PARSE_FLAG(shuffle);
+  GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
+  GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
+  GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
+  return false;
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+static void LoadFlagsFromFile(const std::string& path) {
+  FILE* flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile)
+                      << "\"";
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (size_t i = 0; i < lines.size(); ++i) {
+    if (lines[i].empty()) continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
+  }
+}
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  std::string flagfile_value;
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseFlag;
+
+    bool remove_flag = false;
+    if (ParseGoogleTestFlag(arg)) {
+      remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
+      GTEST_FLAG_SET(flagfile, flagfile_value);
+      LoadFlagsFromFile(flagfile_value);
+      remove_flag = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+
+    if (remove_flag) {
+      // Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+#if GTEST_HAS_ABSL
+  if (*argc > 0) {
+    // absl::ParseCommandLine() requires *argc > 0.
+    auto positional_args = absl::flags_internal::ParseCommandLineImpl(
+        *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs,
+        absl::flags_internal::UsageFlagsAction::kHandleUsage,
+        absl::flags_internal::OnUndefinedFlag::kReportUndefined);
+    // Any command-line positional arguments not part of any command-line flag
+    // (or arguments to a flag) are copied back out to argv, with the program
+    // invocation name at position 0, and argc is resized. This includes
+    // positional arguments after the flag-terminating delimiter '--'.
+    // See https://abseil.io/docs/cpp/guides/flags.
+    std::copy(positional_args.begin(), positional_args.end(), argv);
+    if (static_cast<int>(positional_args.size()) < *argc) {
+      argv[positional_args.size()] = nullptr;
+      *argc = static_cast<int>(positional_args.size());
+    }
+  }
+#else
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+#endif
+
+  // Fix the value of *_NSGetArgc() on macOS, but if and only if
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+#endif
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  // We don't want to run the initialization code twice.
+  if (GTestIsInitialized()) return;
+
+  if (*argc <= 0) return;
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+
+  // When using the Abseil Flags library, set the program usage message to the
+  // help message, but remove the color-encoding from the message first.
+  absl::SetProgramUsageMessage(absl::StrReplaceAll(
+      kColorEncodedHelpMessage,
+      {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
+#endif  // GTEST_HAS_ABSL
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+void InitGoogleTest() {
+  // Since Arduino doesn't have a command line, fake out the argc/argv arguments
+  int argc = 1;
+  const auto arg0 = "dummy";
+  char* argv0 = const_cast<char*>(arg0);
+  char** argv = &argv0;
+
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(&argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+// Return value of first environment variable that is set and contains
+// a non-empty string. If there are none, return the "fallback" string.
+// Since we like the temporary directory to have a directory separator suffix,
+// add it if not provided in the environment variable value.
+static std::string GetTempDirFromEnv(
+    std::initializer_list<const char*> environment_variables,
+    const char* fallback, char separator) {
+  for (const char* variable_name : environment_variables) {
+    const char* value = internal::posix::GetEnv(variable_name);
+    if (value != nullptr && value[0] != '\0') {
+      if (value[strlen(value) - 1] != separator) {
+        return std::string(value).append(1, separator);
+      }
+      return value;
+    }
+  }
+  return fallback;
+}
+#endif
+
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE
+  return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\');
+#elif GTEST_OS_LINUX_ANDROID
+  return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/');
+#else
+  return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/');
+#endif
+}
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
+  internal::TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message.swap(message);
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+}  // namespace testing
diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc
index f302822552..44976375c9 100644
--- a/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc
+++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc
@@ -27,12 +27,27 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-#include <stdio.h>
+#include <cstdio>
 
 #include "gtest/gtest.h"
 
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+void setup() { testing::InitGoogleTest(); }
+
+void loop() { RUN_ALL_TESTS(); }
+
+#if GTEST_OS_ESP8266
+}
+#endif
+
+#else
+
 GTEST_API_ int main(int argc, char **argv) {
-  printf("Running main() from gtest_main.cc\n");
+  printf("Running main() from %s\n", __FILE__);
   testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+#endif
diff --git a/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT b/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT
index 9686ac13eb..59b648ca68 100644
--- a/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT
+++ b/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT
@@ -2,3 +2,4 @@
 # Name or Organization <email address>
 
 Google Inc.
+Elijah Cirioli <eli.cirioli@gmail.com>
diff --git a/media/libvpx/libvpx/third_party/libwebm/Android.mk b/media/libvpx/libvpx/third_party/libwebm/Android.mk
index 8149a083f4..e6c17df021 100644
--- a/media/libvpx/libvpx/third_party/libwebm/Android.mk
+++ b/media/libvpx/libvpx/third_party/libwebm/Android.mk
@@ -1,9 +1,11 @@
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
 LOCAL_PATH:= $(call my-dir)
 
 include $(CLEAR_VARS)
 LOCAL_MODULE:= libwebm
 LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
-LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
 LOCAL_C_INCLUDES:= $(LOCAL_PATH)
 LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
 
@@ -14,4 +16,8 @@ LOCAL_SRC_FILES:= common/file_util.cc \
                   mkvmuxer/mkvmuxer.cc \
                   mkvmuxer/mkvmuxerutil.cc \
                   mkvmuxer/mkvwriter.cc
+LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD
+LOCAL_LICENSE_CONDITIONS := notice
+LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT
 include $(BUILD_STATIC_LIBRARY)
+endif  # NDK_ROOT
diff --git a/media/libvpx/libvpx/third_party/libwebm/README.libvpx b/media/libvpx/libvpx/third_party/libwebm/README.libvpx
index 1f8a13d78c..6e43487540 100644
--- a/media/libvpx/libvpx/third_party/libwebm/README.libvpx
+++ b/media/libvpx/libvpx/third_party/libwebm/README.libvpx
@@ -1,10 +1,20 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 9732ae991efb71aced4267d4794918279e362d99
+Version: 3b630045052e1e4d563207ab9e3be8d137c26067
 License: BSD
-License File: LICENSE.txt
+License File: LICENSE.TXT
 
 Description:
 libwebm is used to handle WebM container I/O.
 
 Local Changes:
-* <none>
+Only keep:
+ - Android.mk
+ - AUTHORS.TXT
+ - common/
+    file_util.cc/h
+    hdr_util.cc/h
+    webmids.h
+ - LICENSE.TXT
+ - mkvmuxer/
+ - mkvparser/
+ - PATENTS.TXT
diff --git a/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc b/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc
index 6dab146dd9..6eb6428b98 100644
--- a/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc
@@ -17,14 +17,15 @@
 #include <cstring>
 #include <fstream>
 #include <ios>
+#include <string>
 
 namespace libwebm {
 
 std::string GetTempFileName() {
 #if !defined _MSC_VER && !defined __MINGW32__
   std::string temp_file_name_template_str =
-      std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") :
-                                               ".") +
+      std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR")
+                                             : ".") +
       "/libwebm_temp.XXXXXX";
   char* temp_file_name_template =
       new char[temp_file_name_template_str.length() + 1];
@@ -41,7 +42,12 @@ std::string GetTempFileName() {
   return temp_file_name;
 #else
   char tmp_file_name[_MAX_PATH];
+#if defined _MSC_VER || defined MINGW_HAS_SECURE_API
   errno_t err = tmpnam_s(tmp_file_name);
+#else
+  char* fname_pointer = tmpnam(tmp_file_name);
+  int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
+#endif
   if (err == 0) {
     return std::string(tmp_file_name);
   }
@@ -65,6 +71,15 @@ uint64_t GetFileSize(const std::string& file_name) {
   return file_size;
 }
 
+bool GetFileContents(const std::string& file_name, std::string* contents) {
+  std::ifstream file(file_name.c_str());
+  *contents = std::string(static_cast<size_t>(GetFileSize(file_name)), 0);
+  if (file.good() && contents->size()) {
+    file.read(&(*contents)[0], contents->size());
+  }
+  return !file.fail();
+}
+
 TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); }
 
 TempFileDeleter::~TempFileDeleter() {
diff --git a/media/libvpx/libvpx/third_party/libwebm/common/file_util.h b/media/libvpx/libvpx/third_party/libwebm/common/file_util.h
index 0e71eac11e..a873734641 100644
--- a/media/libvpx/libvpx/third_party/libwebm/common/file_util.h
+++ b/media/libvpx/libvpx/third_party/libwebm/common/file_util.h
@@ -22,6 +22,9 @@ std::string GetTempFileName();
 // Returns size of file specified by |file_name|, or 0 upon failure.
 uint64_t GetFileSize(const std::string& file_name);
 
+// Gets the contents file_name as a string. Returns false on error.
+bool GetFileContents(const std::string& file_name, std::string* contents);
+
 // Manages life of temporary file specified at time of construction. Deletes
 // file upon destruction.
 class TempFileDeleter {
@@ -38,4 +41,4 @@ class TempFileDeleter {
 
 }  // namespace libwebm
 
-#endif  // LIBWEBM_COMMON_FILE_UTIL_H_
\ No newline at end of file
+#endif  // LIBWEBM_COMMON_FILE_UTIL_H_
diff --git a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc
index e1618ce75a..f1320a5361 100644
--- a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc
@@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
   if (MasteringMetadataValuePresent(parser_mm.luminance_min))
     muxer_mm->set_luminance_min(parser_mm.luminance_min);
 
-  PrimaryChromaticityPtr r_ptr(NULL);
-  PrimaryChromaticityPtr g_ptr(NULL);
-  PrimaryChromaticityPtr b_ptr(NULL);
-  PrimaryChromaticityPtr wp_ptr(NULL);
+  PrimaryChromaticityPtr r_ptr(nullptr);
+  PrimaryChromaticityPtr g_ptr(nullptr);
+  PrimaryChromaticityPtr b_ptr(nullptr);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
 
   if (parser_mm.r) {
     if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
@@ -202,7 +202,8 @@ bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
       features->bit_depth = priv_profile;
     } else if (id_byte == kVp9ChromaSubsamplingId) {
       const int priv_profile = static_cast<int>(private_data[offset++]);
-      if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3)
+      if (priv_profile != 0 && priv_profile != 1 && priv_profile != 2 &&
+          priv_profile != 3)
         return false;
       if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent &&
           features->chroma_subsampling != priv_profile) {
diff --git a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h
index 689fb30a3f..78e2eeb705 100644
--- a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h
+++ b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h
@@ -47,7 +47,7 @@ struct Vp9CodecFeatures {
   int chroma_subsampling;
 };
 
-typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
 
 bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
                              PrimaryChromaticityPtr* muxer_pc);
diff --git a/media/libvpx/libvpx/third_party/libwebm/common/webmids.h b/media/libvpx/libvpx/third_party/libwebm/common/webmids.h
index 89d722a71b..fc0c208140 100644
--- a/media/libvpx/libvpx/third_party/libwebm/common/webmids.h
+++ b/media/libvpx/libvpx/third_party/libwebm/common/webmids.h
@@ -93,6 +93,7 @@ enum MkvId {
   kMkvDisplayHeight = 0x54BA,
   kMkvDisplayUnit = 0x54B2,
   kMkvAspectRatioType = 0x54B3,
+  kMkvColourSpace = 0x2EB524,
   kMkvFrameRate = 0x2383E3,
   // end video
   // colour
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 299b45c989..21e51be474 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -8,6 +8,8 @@
 
 #include "mkvmuxer/mkvmuxer.h"
 
+#include <stdint.h>
+
 #include <cfloat>
 #include <climits>
 #include <cstdio>
@@ -63,11 +65,12 @@ bool StrCpy(const char* src, char** dst_ptr) {
   if (dst == NULL)
     return false;
 
-  strcpy(dst, src);  // NOLINT
+  memcpy(dst, src, size - 1);
+  dst[size - 1] = '\0';
   return true;
 }
 
-typedef std::auto_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
 bool CopyChromaticity(const PrimaryChromaticity* src,
                       PrimaryChromaticityPtr* dst) {
   if (!dst)
@@ -605,10 +608,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const {
   return true;
 }
 
-uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+uint64_t ContentEncoding::EncodingSize(uint64_t compression_size,
                                        uint64_t encryption_size) const {
   // TODO(fgalligan): Add support for compression settings.
-  if (compresion_size != 0)
+  if (compression_size != 0)
     return 0;
 
   uint64_t encoding_size = 0;
@@ -771,6 +774,14 @@ bool Track::Write(IMkvWriter* writer) const {
   if (!type_ || !codec_id_)
     return false;
 
+  // AV1 tracks require a CodecPrivate. See
+  // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md
+  // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
+  // point to a stable version once it is finalized, or our own WebM mappings
+  // page on webmproject.org should we decide to release them.
+  if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_)
+    return false;
+
   // |size| may be bigger than what is written out in this function because
   // derived classes may write out more data in the Track element.
   const uint64_t payload_size = PayloadSize();
@@ -909,11 +920,8 @@ void Track::set_codec_id(const char* codec_id) {
     const size_t length = strlen(codec_id) + 1;
     codec_id_ = new (std::nothrow) char[length];  // NOLINT
     if (codec_id_) {
-#ifdef _MSC_VER
-      strcpy_s(codec_id_, length, codec_id);
-#else
-      strcpy(codec_id_, codec_id);
-#endif
+      memcpy(codec_id_, codec_id, length - 1);
+      codec_id_[length - 1] = '\0';
     }
   }
 }
@@ -926,11 +934,8 @@ void Track::set_language(const char* language) {
     const size_t length = strlen(language) + 1;
     language_ = new (std::nothrow) char[length];  // NOLINT
     if (language_) {
-#ifdef _MSC_VER
-      strcpy_s(language_, length, language);
-#else
-      strcpy(language_, language);
-#endif
+      memcpy(language_, language, length - 1);
+      language_[length - 1] = '\0';
     }
   }
 }
@@ -942,11 +947,8 @@ void Track::set_name(const char* name) {
     const size_t length = strlen(name) + 1;
     name_ = new (std::nothrow) char[length];  // NOLINT
     if (name_) {
-#ifdef _MSC_VER
-      strcpy_s(name_, length, name);
-#else
-      strcpy(name_, name);
-#endif
+      memcpy(name_, name, length - 1);
+      name_[length - 1] = '\0';
     }
   }
 }
@@ -1025,19 +1027,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const {
       !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) {
     return false;
   }
-  if (r_ &&
-      !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
-                 libwebm::kMkvPrimaryRChromaticityY)) {
+  if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
+                       libwebm::kMkvPrimaryRChromaticityY)) {
     return false;
   }
-  if (g_ &&
-      !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
-                 libwebm::kMkvPrimaryGChromaticityY)) {
+  if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
+                       libwebm::kMkvPrimaryGChromaticityY)) {
     return false;
   }
-  if (b_ &&
-      !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
-                 libwebm::kMkvPrimaryBChromaticityY)) {
+  if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
+                       libwebm::kMkvPrimaryBChromaticityY)) {
     return false;
   }
   if (white_point_ &&
@@ -1052,22 +1051,22 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const {
 bool MasteringMetadata::SetChromaticity(
     const PrimaryChromaticity* r, const PrimaryChromaticity* g,
     const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) {
-  PrimaryChromaticityPtr r_ptr(NULL);
+  PrimaryChromaticityPtr r_ptr(nullptr);
   if (r) {
     if (!CopyChromaticity(r, &r_ptr))
       return false;
   }
-  PrimaryChromaticityPtr g_ptr(NULL);
+  PrimaryChromaticityPtr g_ptr(nullptr);
   if (g) {
     if (!CopyChromaticity(g, &g_ptr))
       return false;
   }
-  PrimaryChromaticityPtr b_ptr(NULL);
+  PrimaryChromaticityPtr b_ptr(nullptr);
   if (b) {
     if (!CopyChromaticity(b, &b_ptr))
       return false;
   }
-  PrimaryChromaticityPtr wp_ptr(NULL);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
   if (white_point) {
     if (!CopyChromaticity(white_point, &wp_ptr))
       return false;
@@ -1233,7 +1232,7 @@ bool Colour::Write(IMkvWriter* writer) const {
 }
 
 bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
-  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
   if (!mm_ptr.get())
     return false;
 
@@ -1419,6 +1418,7 @@ VideoTrack::VideoTrack(unsigned int* seed)
       stereo_mode_(0),
       alpha_mode_(0),
       width_(0),
+      colour_space_(NULL),
       colour_(NULL),
       projection_(NULL) {}
 
@@ -1516,6 +1516,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
                           static_cast<uint64>(alpha_mode_)))
       return false;
   }
+  if (colour_space_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_))
+      return false;
+  }
   if (frame_rate_ > 0.0) {
     if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
                           static_cast<float>(frame_rate_))) {
@@ -1540,8 +1544,21 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
   return true;
 }
 
+void VideoTrack::set_colour_space(const char* colour_space) {
+  if (colour_space) {
+    delete[] colour_space_;
+
+    const size_t length = strlen(colour_space) + 1;
+    colour_space_ = new (std::nothrow) char[length];  // NOLINT
+    if (colour_space_) {
+      memcpy(colour_space_, colour_space, length - 1);
+      colour_space_[length - 1] = '\0';
+    }
+  }
+}
+
 bool VideoTrack::SetColour(const Colour& colour) {
-  std::auto_ptr<Colour> colour_ptr(new Colour());
+  std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
     return false;
 
@@ -1569,7 +1586,7 @@ bool VideoTrack::SetColour(const Colour& colour) {
 }
 
 bool VideoTrack::SetProjection(const Projection& projection) {
-  std::auto_ptr<Projection> projection_ptr(new Projection());
+  std::unique_ptr<Projection> projection_ptr(new Projection());
   if (!projection_ptr.get())
     return false;
 
@@ -1623,6 +1640,8 @@ uint64_t VideoTrack::VideoPayloadSize() const {
   if (frame_rate_ > 0.0)
     size += EbmlElementSize(libwebm::kMkvFrameRate,
                             static_cast<float>(frame_rate_));
+  if (colour_space_)
+    size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
   if (colour_)
     size += colour_->ColourSize();
   if (projection_)
@@ -1700,9 +1719,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
 
 const char Tracks::kOpusCodecId[] = "A_OPUS";
 const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kAv1CodecId[] = "V_AV1";
 const char Tracks::kVp8CodecId[] = "V_VP8";
 const char Tracks::kVp9CodecId[] = "V_VP9";
-const char Tracks::kVp10CodecId[] = "V_VP10";
 const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
 const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
 const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
@@ -2661,7 +2680,7 @@ bool Cluster::QueueOrWriteFrame(const Frame* const frame) {
   // and write it if it is okay to do so (i.e.) no other track has an held back
   // frame with timestamp <= the timestamp of the frame in question.
   std::vector<std::list<Frame*>::iterator> frames_to_erase;
-  for (std::list<Frame *>::iterator
+  for (std::list<Frame*>::iterator
            current_track_iterator = stored_frames_[track_number].begin(),
            end = --stored_frames_[track_number].end();
        current_track_iterator != end; ++current_track_iterator) {
@@ -2826,13 +2845,13 @@ bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) {
 
 uint32_t SeekHead::GetId(int index) const {
   if (index < 0 || index >= kSeekEntryCount)
-    return UINT_MAX;
+    return UINT32_MAX;
   return seek_entry_id_[index];
 }
 
 uint64_t SeekHead::GetPosition(int index) const {
   if (index < 0 || index >= kSeekEntryCount)
-    return ULLONG_MAX;
+    return UINT64_MAX;
   return seek_entry_pos_[index];
 }
 
@@ -2866,7 +2885,7 @@ SegmentInfo::SegmentInfo()
       muxing_app_(NULL),
       timecode_scale_(1000000ULL),
       writing_app_(NULL),
-      date_utc_(LLONG_MIN),
+      date_utc_(INT64_MIN),
       duration_pos_(-1) {}
 
 SegmentInfo::~SegmentInfo() {
@@ -2897,11 +2916,8 @@ bool SegmentInfo::Init() {
   if (!muxing_app_)
     return false;
 
-#ifdef _MSC_VER
-  strcpy_s(muxing_app_, app_len, temp);
-#else
-  strcpy(muxing_app_, temp);
-#endif
+  memcpy(muxing_app_, temp, app_len - 1);
+  muxing_app_[app_len - 1] = '\0';
 
   set_writing_app(temp);
   if (!writing_app_)
@@ -2944,7 +2960,7 @@ bool SegmentInfo::Write(IMkvWriter* writer) {
   if (duration_ > 0.0)
     size +=
         EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
-  if (date_utc_ != LLONG_MIN)
+  if (date_utc_ != INT64_MIN)
     size += EbmlDateElementSize(libwebm::kMkvDateUTC);
   size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_);
   size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_);
@@ -2969,7 +2985,7 @@ bool SegmentInfo::Write(IMkvWriter* writer) {
       return false;
   }
 
-  if (date_utc_ != LLONG_MIN)
+  if (date_utc_ != INT64_MIN)
     WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_);
 
   if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_))
@@ -2992,11 +3008,8 @@ void SegmentInfo::set_muxing_app(const char* app) {
     if (!temp_str)
       return;
 
-#ifdef _MSC_VER
-    strcpy_s(temp_str, length, app);
-#else
-    strcpy(temp_str, app);
-#endif
+    memcpy(temp_str, app, length - 1);
+    temp_str[length - 1] = '\0';
 
     delete[] muxing_app_;
     muxing_app_ = temp_str;
@@ -3010,11 +3023,8 @@ void SegmentInfo::set_writing_app(const char* app) {
     if (!temp_str)
       return;
 
-#ifdef _MSC_VER
-    strcpy_s(temp_str, length, app);
-#else
-    strcpy(temp_str, app);
-#endif
+    memcpy(temp_str, app, length - 1);
+    temp_str[length - 1] = '\0';
 
     delete[] writing_app_;
     writing_app_ = temp_str;
@@ -3053,7 +3063,8 @@ Segment::Segment()
       output_cues_(true),
       accurate_cluster_duration_(false),
       fixed_size_cluster_timecode_(false),
-      estimate_file_duration_(true),
+      estimate_file_duration_(false),
+      ebml_header_size_(0),
       payload_pos_(0),
       size_position_(0),
       doc_type_version_(kDefaultDocTypeVersion),
@@ -3361,7 +3372,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) {
   track->set_width(width);
   track->set_height(height);
 
-  tracks_.AddTrack(track, number);
+  if (!tracks_.AddTrack(track, number)) {
+    delete track;
+    return 0;
+  }
   has_video_ = true;
 
   return track->number();
@@ -3383,8 +3397,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) {
   cue->set_block_number(cluster->blocks_added());
   cue->set_cluster_pos(cluster->position_for_cues());
   cue->set_track(track);
-  if (!cues_.AddCue(cue))
+  if (!cues_.AddCue(cue)) {
+    delete cue;
     return false;
+  }
 
   new_cuepoint_ = false;
   return true;
@@ -3401,7 +3417,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels,
   track->set_sample_rate(sample_rate);
   track->set_channels(channels);
 
-  tracks_.AddTrack(track, number);
+  if (!tracks_.AddTrack(track, number)) {
+    delete track;
+    return 0;
+  }
 
   return track->number();
 }
@@ -3490,16 +3509,33 @@ bool Segment::AddGenericFrame(const Frame* frame) {
   if (frame->discard_padding() != 0)
     doc_type_version_ = 4;
 
+  if (cluster_list_size_ > 0) {
+    const uint64_t timecode_scale = segment_info_.timecode_scale();
+    const uint64_t frame_timecode = frame->timestamp() / timecode_scale;
+
+    const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+    const uint64_t last_cluster_timecode = last_cluster->timecode();
+
+    const uint64_t rel_timecode = frame_timecode - last_cluster_timecode;
+    if (rel_timecode > kMaxBlockTimecode) {
+      force_new_cluster_ = true;
+    }
+  }
+
   // If the segment has a video track hold onto audio frames to make sure the
   // audio that is associated with the start time of a video key-frame is
   // muxed into the same cluster.
   if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
       !force_new_cluster_) {
     Frame* const new_frame = new (std::nothrow) Frame();
-    if (!new_frame || !new_frame->CopyFrom(*frame))
+    if (!new_frame || !new_frame->CopyFrom(*frame)) {
+      delete new_frame;
       return false;
-    if (!QueueFrame(new_frame))
+    }
+    if (!QueueFrame(new_frame)) {
+      delete new_frame;
       return false;
+    }
     track_frames_written_[frame->track_number() - 1]++;
     return true;
   }
@@ -3522,8 +3558,10 @@ bool Segment::AddGenericFrame(const Frame* frame) {
   if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
       !frame->reference_block_timestamp_set()) {
     Frame* const new_frame = new (std::nothrow) Frame();
-    if (!new_frame->CopyFrom(*frame))
+    if (!new_frame || !new_frame->CopyFrom(*frame)) {
+      delete new_frame;
       return false;
+    }
     new_frame->set_reference_block_timestamp(
         last_track_timestamp_[frame->track_number() - 1]);
     frame = new_frame;
@@ -3570,19 +3608,17 @@ bool Segment::SetChunking(bool chunking, const char* filename) {
     if (chunking_ && !strcmp(filename, chunking_base_name_))
       return true;
 
-    const size_t name_length = strlen(filename) + 1;
-    char* const temp = new (std::nothrow) char[name_length];  // NOLINT
+    const size_t filename_length = strlen(filename);
+    char* const temp = new (std::nothrow) char[filename_length + 1];  // NOLINT
     if (!temp)
       return false;
 
-#ifdef _MSC_VER
-    strcpy_s(temp, name_length, filename);
-#else
-    strcpy(temp, filename);
-#endif
+    memcpy(temp, filename, filename_length);
+    temp[filename_length] = '\0';
 
     delete[] chunking_base_name_;
     chunking_base_name_ = temp;
+    // From this point, strlen(chunking_base_name_) == filename_length
 
     if (!UpdateChunkName("chk", &chunk_name_))
       return false;
@@ -3608,18 +3644,16 @@ bool Segment::SetChunking(bool chunking, const char* filename) {
     if (!chunk_writer_cluster_->Open(chunk_name_))
       return false;
 
-    const size_t header_length = strlen(filename) + strlen(".hdr") + 1;
+    const size_t hdr_length = strlen(".hdr");
+    const size_t header_length = filename_length + hdr_length + 1;
     char* const header = new (std::nothrow) char[header_length];  // NOLINT
     if (!header)
       return false;
 
-#ifdef _MSC_VER
-    strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_);
-    strcat_s(header, header_length, ".hdr");
-#else
-    strcpy(header, chunking_base_name_);
-    strcat(header, ".hdr");
-#endif
+    memcpy(header, chunking_base_name_, filename_length);
+    memcpy(&header[filename_length], ".hdr", hdr_length);
+    header[filename_length + hdr_length] = '\0';
+
     if (!chunk_writer_header_->Open(header)) {
       delete[] header;
       return false;
@@ -3964,18 +3998,16 @@ bool Segment::UpdateChunkName(const char* ext, char** name) const {
   snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext);
 #endif
 
-  const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1;
+  const size_t chunking_base_name_length = strlen(chunking_base_name_);
+  const size_t ext_chk_length = strlen(ext_chk);
+  const size_t length = chunking_base_name_length + ext_chk_length + 1;
   char* const str = new (std::nothrow) char[length];  // NOLINT
   if (!str)
     return false;
 
-#ifdef _MSC_VER
-  strcpy_s(str, length - strlen(ext_chk), chunking_base_name_);
-  strcat_s(str, length, ext_chk);
-#else
-  strcpy(str, chunking_base_name_);
-  strcat(str, ext_chk);
-#endif
+  memcpy(str, chunking_base_name_, chunking_base_name_length);
+  memcpy(&str[chunking_base_name_length], ext_chk, ext_chk_length);
+  str[chunking_base_name_length + ext_chk_length] = '\0';
 
   delete[] * name;
   *name = str;
@@ -4048,12 +4080,16 @@ int Segment::WriteFramesAll() {
     // places where |doc_type_version_| needs to be updated.
     if (frame->discard_padding() != 0)
       doc_type_version_ = 4;
-    if (!cluster->AddFrame(frame))
-      return -1;
+    if (!cluster->AddFrame(frame)) {
+      delete frame;
+      continue;
+    }
 
     if (new_cuepoint_ && cues_track_ == frame->track_number()) {
-      if (!AddCuePoint(frame->timestamp(), cues_track_))
-        return -1;
+      if (!AddCuePoint(frame->timestamp(), cues_track_)) {
+        delete frame;
+        continue;
+      }
     }
 
     if (frame->timestamp() > last_timestamp_) {
@@ -4096,12 +4132,16 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) {
       const Frame* const frame_prev = frames_[i - 1];
       if (frame_prev->discard_padding() != 0)
         doc_type_version_ = 4;
-      if (!cluster->AddFrame(frame_prev))
-        return false;
+      if (!cluster->AddFrame(frame_prev)) {
+        delete frame_prev;
+        continue;
+      }
 
       if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
-        if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
-          return false;
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) {
+          delete frame_prev;
+          continue;
+        }
       }
 
       ++shift_left;
@@ -4136,8 +4176,8 @@ bool Segment::DocTypeIsWebm() const {
   // TODO(vigneshv): Tweak .clang-format.
   const char* kWebmCodecIds[kNumCodecIds] = {
       Tracks::kOpusCodecId,          Tracks::kVorbisCodecId,
-      Tracks::kVp8CodecId,           Tracks::kVp9CodecId,
-      Tracks::kVp10CodecId,          Tracks::kWebVttCaptionsId,
+      Tracks::kAv1CodecId,           Tracks::kVp8CodecId,
+      Tracks::kVp9CodecId,           Tracks::kWebVttCaptionsId,
       Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId,
       Tracks::kWebVttSubtitlesId};
 
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h
index 46b0029dc4..2c4bb9e93e 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -330,7 +330,7 @@ class ContentEncoding {
 
  private:
   // Returns the size in bytes for the encoding elements.
-  uint64_t EncodingSize(uint64_t compresion_size,
+  uint64_t EncodingSize(uint64_t compression_size,
                         uint64_t encryption_size) const;
 
   // Returns the size in bytes for the encryption elements.
@@ -795,6 +795,8 @@ class VideoTrack : public Track {
   uint64_t alpha_mode() { return alpha_mode_; }
   void set_width(uint64_t width) { width_ = width; }
   uint64_t width() const { return width_; }
+  void set_colour_space(const char* colour_space);
+  const char* colour_space() const { return colour_space_; }
 
   Colour* colour() { return colour_; }
 
@@ -824,6 +826,7 @@ class VideoTrack : public Track {
   uint64_t stereo_mode_;
   uint64_t alpha_mode_;
   uint64_t width_;
+  char* colour_space_;
 
   Colour* colour_;
   Projection* projection_;
@@ -871,9 +874,9 @@ class Tracks {
 
   static const char kOpusCodecId[];
   static const char kVorbisCodecId[];
+  static const char kAv1CodecId[];
   static const char kVp8CodecId[];
   static const char kVp9CodecId[];
-  static const char kVp10CodecId[];
   static const char kWebVttCaptionsId[];
   static const char kWebVttDescriptionsId[];
   static const char kWebVttMetadataId[];
@@ -1422,7 +1425,7 @@ class SeekHead {
   bool Write(IMkvWriter* writer);
 
   // We are going to put a cap on the number of Seek Entries.
-  const static int32_t kSeekEntryCount = 5;
+  constexpr static int32_t kSeekEntryCount = 5;
 
  private:
   // Returns the maximum size in bytes of one seek entry.
@@ -1478,7 +1481,7 @@ class SegmentInfo {
   uint64_t timecode_scale_;
   // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
   char* writing_app_;
-  // LLONG_MIN when DateUTC is not set.
+  // INT64_MIN when DateUTC is not set.
   int64_t date_utc_;
 
   // The file position of the duration element.
@@ -1502,8 +1505,8 @@ class Segment {
     kBeforeClusters = 0x1  // Position Cues before Clusters
   };
 
-  static const uint32_t kDefaultDocTypeVersion = 4;
-  static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+  static constexpr uint32_t kDefaultDocTypeVersion = 4;
+  static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
 
   Segment();
   ~Segment();
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 1ba17ac1ba..d1e835cd00 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -10,6 +10,7 @@
 
 #ifdef __ANDROID__
 #include <fcntl.h>
+#include <unistd.h>
 #endif
 
 #include <cassert>
@@ -135,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
     return false;
   }
 
-  if (!frame->is_key() &&
-      !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
-                        reference_block_timestamp)) {
+  if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+                                            reference_block_timestamp)) {
     return false;
   }
 
@@ -288,7 +288,7 @@ uint64 EbmlElementSize(uint64 type, const char* value) {
   ebml_size += strlen(value);
 
   // Size of Datasize
-  ebml_size++;
+  ebml_size += GetCodedUIntSize(strlen(value));
 
   return ebml_size;
 }
@@ -508,7 +508,7 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
   if (WriteUInt(writer, length))
     return false;
 
-  if (writer->Write(value, static_cast<const uint32>(length)))
+  if (writer->Write(value, static_cast<uint32>(length)))
     return false;
 
   return true;
@@ -562,10 +562,10 @@ uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
   if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
     return 0;
 
-  return frame->CanBeSimpleBlock() ?
-             WriteSimpleBlock(writer, frame, relative_timecode) :
-             WriteBlock(writer, frame, relative_timecode,
-                        cluster->timecode_scale());
+  return frame->CanBeSimpleBlock()
+             ? WriteSimpleBlock(writer, frame, relative_timecode)
+             : WriteBlock(writer, frame, relative_timecode,
+                          cluster->timecode_scale());
 }
 
 uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
@@ -606,26 +606,22 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
 
 void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
   *major = 0;
-  *minor = 2;
-  *build = 1;
+  *minor = 3;
+  *build = 3;
   *revision = 0;
 }
 
 uint64 MakeUID(unsigned int* seed) {
   uint64 uid = 0;
 
-#ifdef __MINGW32__
-  srand(*seed);
-#endif
-
   for (int i = 0; i < 7; ++i) {  // avoid problems with 8-byte values
     uid <<= 8;
 
 // TODO(fgalligan): Move random number generation to platform specific code.
-#ifdef _MSC_VER
+#ifdef _WIN32
     (void)seed;
     const int32 nn = rand();
-#elif __ANDROID__
+#elif defined(__ANDROID__)
     (void)seed;
     int32 temp_num = 1;
     int fd = open("/dev/urandom", O_RDONLY);
@@ -634,8 +630,6 @@ uint64 MakeUID(unsigned int* seed) {
       close(fd);
     }
     const int32 nn = temp_num;
-#elif defined __MINGW32__
-    const int32 nn = rand();
 #else
     const int32 nn = rand_r(seed);
 #endif
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
index 132388da59..85fc2a209e 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -8,9 +8,9 @@
 #ifndef MKVMUXER_MKVMUXERUTIL_H_
 #define MKVMUXER_MKVMUXERUTIL_H_
 
-#include "mkvmuxertypes.h"
+#include <stdint.h>
 
-#include "stdint.h"
+#include "mkvmuxertypes.h"
 
 namespace mkvmuxer {
 class Cluster;
@@ -31,6 +31,9 @@ const int64 kMaxBlockTimecode = 0x07FFFLL;
 // Writes out |value| in Big Endian order. Returns 0 on success.
 int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
 
+// Writes out |f| in Big Endian order. Returns 0 on success.
+int32 SerializeFloat(IMkvWriter* writer, float f);
+
 // Returns the size in bytes of the element.
 int32 GetUIntSize(uint64 value);
 int32 GetIntSize(int64 value);
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
index ec34e4df81..9b714a5e7c 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -8,6 +8,8 @@
 
 #include "mkvmuxer/mkvwriter.h"
 
+#include <sys/types.h>
+
 #ifdef _MSC_VER
 #include <share.h>  // for _SH_DENYWR
 #endif
@@ -76,8 +78,16 @@ int32 MkvWriter::Position(int64 position) {
 
 #ifdef _MSC_VER
   return _fseeki64(file_, position, SEEK_SET);
-#else
+#elif defined(_WIN32)
+  return fseeko64(file_, static_cast<off_t>(position), SEEK_SET);
+#elif !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \
+        defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64)
+  // POSIX.1 has fseeko and ftello. fseeko and ftello are not available before
+  // Android API level 24. See
+  // https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md
   return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
+#else
+  return fseek(file_, static_cast<long>(position), SEEK_SET);
 #endif
 }
 
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
index e62d6f6075..4fa7b37887 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
@@ -8,7 +8,6 @@
 #include "mkvparser/mkvparser.h"
 
 #if defined(_MSC_VER) && _MSC_VER < 1800
-#include <float.h>  // _isnan() / _finite()
 #define MSC_COMPAT
 #endif
 
@@ -16,6 +15,7 @@
 #include <cfloat>
 #include <climits>
 #include <cmath>
+#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <new>
@@ -23,6 +23,7 @@
 #include "common/webmids.h"
 
 namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
 const float MasteringMetadata::kValueNotPresent = FLT_MAX;
 const long long Colour::kValueNotPresent = LLONG_MAX;
 const float Projection::kValueNotPresent = FLT_MAX;
@@ -35,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); }
 inline bool isinf(double val) { return std::isinf(val); }
 #endif  // MSC_COMPAT
 
-IMkvReader::~IMkvReader() {}
-
 template <typename Type>
 Type* SafeArrayAlloc(unsigned long long num_elements,
                      unsigned long long element_size) {
@@ -55,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements,
 
 void GetVersion(int& major, int& minor, int& build, int& revision) {
   major = 1;
-  minor = 0;
-  build = 0;
-  revision = 30;
+  minor = 1;
+  build = 3;
+  revision = 0;
 }
 
 long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
@@ -247,7 +246,8 @@ long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
   if (size == 4) {
     union {
       float f;
-      unsigned long ff;
+      uint32_t ff;
+      static_assert(sizeof(float) == sizeof(uint32_t), "");
     };
 
     ff = 0;
@@ -265,7 +265,8 @@ long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
   } else {
     union {
       double d;
-      unsigned long long dd;
+      uint64_t dd;
+      static_assert(sizeof(double) == sizeof(uint64_t), "");
     };
 
     dd = 0;
@@ -299,7 +300,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
   if (status < 0)
     return status;
 
-  unsigned long long result = first_byte;
+  unsigned long long result = static_cast<unsigned long long>(first_byte);
   ++pos;
 
   for (long i = 1; i < size; ++i) {
@@ -325,7 +326,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size,
   delete[] str;
   str = NULL;
 
-  if (size >= LONG_MAX || size < 0)
+  if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit)
     return E_FILE_FORMAT_INVALID;
 
   // +1 for '\0' terminator
@@ -1503,8 +1504,8 @@ long SeekHead::Parse() {
 
   // first count the seek head entries
 
-  int entry_count = 0;
-  int void_element_count = 0;
+  long long entry_count = 0;
+  long long void_element_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -1514,10 +1515,15 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvSeek)
+    if (id == libwebm::kMkvSeek) {
       ++entry_count;
-    else if (id == libwebm::kMkvVoid)
+      if (entry_count > INT_MAX)
+        return E_PARSE_FAILED;
+    } else if (id == libwebm::kMkvVoid) {
       ++void_element_count;
+      if (void_element_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
 
@@ -1528,15 +1534,20 @@ long SeekHead::Parse() {
   if (pos != stop)
     return E_FILE_FORMAT_INVALID;
 
-  m_entries = new (std::nothrow) Entry[entry_count];
+  if (entry_count > 0) {
+    m_entries = new (std::nothrow) Entry[static_cast<size_t>(entry_count)];
 
-  if (m_entries == NULL)
-    return -1;
+    if (m_entries == NULL)
+      return -1;
+  }
 
-  m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+  if (void_element_count > 0) {
+    m_void_elements =
+        new (std::nothrow) VoidElement[static_cast<size_t>(void_element_count)];
 
-  if (m_void_elements == NULL)
-    return -1;
+    if (m_void_elements == NULL)
+      return -1;
+  }
 
   // now parse the entries and void elements
 
@@ -1555,14 +1566,14 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvSeek) {
+    if (id == libwebm::kMkvSeek && entry_count > 0) {
       if (ParseEntry(pReader, pos, size, pEntry)) {
         Entry& e = *pEntry++;
 
         e.element_start = idpos;
         e.element_size = (pos + size) - idpos;
       }
-    } else if (id == libwebm::kMkvVoid) {
+    } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
       VoidElement& e = *pVoidElement++;
 
       e.element_start = idpos;
@@ -1579,13 +1590,13 @@ long SeekHead::Parse() {
 
   ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
   assert(count_ >= 0);
-  assert(count_ <= entry_count);
+  assert(static_cast<long long>(count_) <= entry_count);
 
   m_entry_count = static_cast<int>(count_);
 
   count_ = ptrdiff_t(pVoidElement - m_void_elements);
   assert(count_ >= 0);
-  assert(count_ <= void_element_count);
+  assert(static_cast<long long>(count_) <= void_element_count);
 
   m_void_element_count = static_cast<int>(count_);
 
@@ -2296,7 +2307,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
   long long pos = pos_;
 
   // First count number of track positions
-
+  unsigned long long track_positions_count = 0;
   while (pos < stop) {
     long len;
 
@@ -2320,12 +2331,17 @@ bool CuePoint::Load(IMkvReader* pReader) {
     if (id == libwebm::kMkvCueTime)
       m_timecode = UnserializeUInt(pReader, pos, size);
 
-    else if (id == libwebm::kMkvCueTrackPositions)
-      ++m_track_positions_count;
+    else if (id == libwebm::kMkvCueTrackPositions) {
+      ++track_positions_count;
+      if (track_positions_count > UINT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
   }
 
+  m_track_positions_count = static_cast<size_t>(track_positions_count);
+
   if (m_timecode < 0 || m_track_positions_count <= 0) {
     return false;
   }
@@ -2418,7 +2434,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
     pos += size;  // consume payload
   }
 
-  if ((m_pos < 0) || (m_track <= 0)) {
+  if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) {
     return false;
   }
 
@@ -2426,7 +2442,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
 }
 
 const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
-  assert(pTrack);
+  if (pTrack == NULL) {
+    return NULL;
+  }
 
   const long long n = pTrack->GetNumber();
 
@@ -4026,7 +4044,7 @@ long SegmentInfo::Parse() {
   }
 
   const double rollover_check = m_duration * m_timecodeScale;
-  if (rollover_check > LLONG_MAX)
+  if (rollover_check > static_cast<double>(LLONG_MAX))
     return E_FILE_FORMAT_INVALID;
 
   if (pos != stop)
@@ -4189,8 +4207,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
   const long long stop = start + size;
 
   // Count ContentCompression and ContentEncryption elements.
-  int compression_count = 0;
-  int encryption_count = 0;
+  long long compression_count = 0;
+  long long encryption_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -4198,11 +4216,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvContentCompression)
+    if (id == libwebm::kMkvContentCompression) {
       ++compression_count;
+      if (compression_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
-    if (id == libwebm::kMkvContentEncryption)
+    if (id == libwebm::kMkvContentEncryption) {
       ++encryption_count;
+      if (encryption_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4213,18 +4237,19 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     return -1;
 
   if (compression_count > 0) {
-    compression_entries_ =
-        new (std::nothrow) ContentCompression*[compression_count];
+    compression_entries_ = new (std::nothrow)
+        ContentCompression*[static_cast<size_t>(compression_count)];
     if (!compression_entries_)
       return -1;
     compression_entries_end_ = compression_entries_;
   }
 
   if (encryption_count > 0) {
-    encryption_entries_ =
-        new (std::nothrow) ContentEncryption*[encryption_count];
+    encryption_entries_ = new (std::nothrow)
+        ContentEncryption*[static_cast<size_t>(encryption_count)];
     if (!encryption_entries_) {
       delete[] compression_entries_;
+      compression_entries_ = NULL;
       return -1;
     }
     encryption_entries_end_ = encryption_entries_;
@@ -4256,6 +4281,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
         delete compression;
         return status;
       }
+      assert(compression_count > 0);
       *compression_entries_end_++ = compression;
     } else if (id == libwebm::kMkvContentEncryption) {
       ContentEncryption* const encryption =
@@ -4268,6 +4294,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
         delete encryption;
         return status;
       }
+      assert(encryption_count > 0);
       *encryption_entries_end_++ = encryption;
     }
 
@@ -4320,6 +4347,12 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size,
         return status;
       }
 
+      // There should be only one settings element per content compression.
+      if (compression->settings != NULL) {
+        delete[] buf;
+        return E_FILE_FORMAT_INVALID;
+      }
+
       compression->settings = buf;
       compression->settings_len = buflen;
     }
@@ -4538,7 +4571,8 @@ int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
   if (dst == NULL)
     return -1;
 
-  strcpy(dst, src);
+  memcpy(dst, src, len);
+  dst[len] = '\0';
 
   return 0;
 }
@@ -4904,7 +4938,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
   const long long stop = start + size;
 
   // Count ContentEncoding elements.
-  int count = 0;
+  long long count = 0;
   while (pos < stop) {
     long long id, size;
     const long status = ParseElementHeader(pReader, pos, stop, id, size);
@@ -4912,8 +4946,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == libwebm::kMkvContentEncoding)
+    if (id == libwebm::kMkvContentEncoding) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4923,7 +4960,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
   if (count <= 0)
     return -1;
 
-  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+  content_encoding_entries_ =
+      new (std::nothrow) ContentEncoding*[static_cast<size_t>(count)];
   if (!content_encoding_entries_)
     return -1;
 
@@ -4975,29 +5013,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos,
   if (!reader)
     return false;
 
-  std::auto_ptr<PrimaryChromaticity> chromaticity_ptr;
+  if (!*chromaticity)
+    *chromaticity = new PrimaryChromaticity();
 
-  if (!*chromaticity) {
-    chromaticity_ptr.reset(new PrimaryChromaticity());
-  } else {
-    chromaticity_ptr.reset(*chromaticity);
-  }
-
-  if (!chromaticity_ptr.get())
+  if (!*chromaticity)
     return false;
 
-  float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y;
+  PrimaryChromaticity* pc = *chromaticity;
+  float* value = is_x ? &pc->x : &pc->y;
 
   double parser_value = 0;
-  const long long value_parse_status =
+  const long long parse_status =
       UnserializeFloat(reader, read_pos, value_size, parser_value);
 
+  // Valid range is [0, 1]. Make sure the double is representable as a float
+  // before casting.
+  if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+      (parser_value > 0.0 && parser_value < FLT_MIN))
+    return false;
+
   *value = static_cast<float>(parser_value);
 
-  if (value_parse_status < 0 || *value < 0.0 || *value > 1.0)
-    return false;
-
-  *chromaticity = chromaticity_ptr.release();
   return true;
 }
 
@@ -5006,7 +5042,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
   if (!reader || *mm)
     return false;
 
-  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
   if (!mm_ptr.get())
     return false;
 
@@ -5026,6 +5062,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
       double value = 0;
       const long long value_parse_status =
           UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
       mm_ptr->luminance_max = static_cast<float>(value);
       if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
           mm_ptr->luminance_max > 9999.99) {
@@ -5035,6 +5075,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
       double value = 0;
       const long long value_parse_status =
           UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
       mm_ptr->luminance_min = static_cast<float>(value);
       if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
           mm_ptr->luminance_min > 999.9999) {
@@ -5087,7 +5131,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start,
   if (!reader || *colour)
     return false;
 
-  std::auto_ptr<Colour> colour_ptr(new Colour());
+  std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
     return false;
 
@@ -5185,7 +5229,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
   if (!reader || *projection)
     return false;
 
-  std::auto_ptr<Projection> projection_ptr(new Projection());
+  std::unique_ptr<Projection> projection_ptr(new Projection());
   if (!projection_ptr.get())
     return false;
 
@@ -5209,6 +5253,8 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
 
       projection_ptr->type = static_cast<ProjectionType>(projection_type);
     } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      if (projection_ptr->private_data != NULL)
+        return false;
       unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
 
       if (data == NULL)
@@ -5228,7 +5274,9 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
       double value = 0;
       const long long value_parse_status =
           UnserializeFloat(reader, read_pos, child_size, value);
-      if (value_parse_status < 0) {
+      // Make sure value is representable as a float before casting.
+      if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
         return false;
       }
 
@@ -5259,10 +5307,12 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
 VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
                        long long element_size)
     : Track(pSegment, element_start, element_size),
+      m_colour_space(NULL),
       m_colour(NULL),
       m_projection(NULL) {}
 
 VideoTrack::~VideoTrack() {
+  delete[] m_colour_space;
   delete m_colour;
   delete m_projection;
 }
@@ -5284,6 +5334,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   long long stereo_mode = 0;
 
   double rate = 0.0;
+  std::unique_ptr<char[]> colour_space_ptr;
 
   IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -5296,8 +5347,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
 
   const long long stop = pos + s.size;
 
-  Colour* colour = NULL;
-  Projection* projection = NULL;
+  std::unique_ptr<Colour> colour_ptr;
+  std::unique_ptr<Projection> projection_ptr;
 
   while (pos < stop) {
     long long id, size;
@@ -5346,11 +5397,25 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
       if (rate <= 0)
         return E_FILE_FORMAT_INVALID;
     } else if (id == libwebm::kMkvColour) {
-      if (!Colour::Parse(pReader, pos, size, &colour))
+      Colour* colour = NULL;
+      if (!Colour::Parse(pReader, pos, size, &colour)) {
         return E_FILE_FORMAT_INVALID;
+      } else {
+        colour_ptr.reset(colour);
+      }
     } else if (id == libwebm::kMkvProjection) {
-      if (!Projection::Parse(pReader, pos, size, &projection))
+      Projection* projection = NULL;
+      if (!Projection::Parse(pReader, pos, size, &projection)) {
         return E_FILE_FORMAT_INVALID;
+      } else {
+        projection_ptr.reset(projection);
+      }
+    } else if (id == libwebm::kMkvColourSpace) {
+      char* colour_space = NULL;
+      const long status = UnserializeString(pReader, pos, size, colour_space);
+      if (status < 0)
+        return status;
+      colour_space_ptr.reset(colour_space);
     }
 
     pos += size;  // consume payload
@@ -5381,8 +5446,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   pTrack->m_display_unit = display_unit;
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
-  pTrack->m_colour = colour;
-  pTrack->m_projection = projection;
+  pTrack->m_colour = colour_ptr.release();
+  pTrack->m_colour_space = colour_space_ptr.release();
+  pTrack->m_projection = projection_ptr.release();
 
   pResult = pTrack;
   return 0;  // success
@@ -5611,7 +5677,7 @@ long Tracks::Parse() {
   const long long stop = m_start + m_size;
   IMkvReader* const pReader = m_pSegment->m_pReader;
 
-  int count = 0;
+  long long count = 0;
   long long pos = m_start;
 
   while (pos < stop) {
@@ -5625,8 +5691,11 @@ long Tracks::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == libwebm::kMkvTrackEntry)
+    if (id == libwebm::kMkvTrackEntry) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -5639,7 +5708,7 @@ long Tracks::Parse() {
   if (count <= 0)
     return 0;  // success
 
-  m_trackEntries = new (std::nothrow) Track*[count];
+  m_trackEntries = new (std::nothrow) Track*[static_cast<size_t>(count)];
 
   if (m_trackEntries == NULL)
     return -1;
@@ -7821,8 +7890,10 @@ long Block::Parse(const Cluster* pCluster) {
     if (frame_size <= 0)
       return E_FILE_FORMAT_INVALID;
 
+#if LLONG_MAX > LONG_MAX
     if (frame_size > LONG_MAX)
       return E_FILE_FORMAT_INVALID;
+#endif
 
     if ((pos + len) > stop)
       return E_FILE_FORMAT_INVALID;
@@ -7888,10 +7959,16 @@ long Block::Parse(const Cluster* pCluster) {
       if (frame_size <= 0)
         return E_FILE_FORMAT_INVALID;
 
+#if LLONG_MAX > LONG_MAX
       if (frame_size > LONG_MAX)
         return E_FILE_FORMAT_INVALID;
+#endif
 
       curr.len = static_cast<long>(frame_size);
+      // Check if size + curr.len could overflow.
+      if (size > LLONG_MAX - curr.len) {
+        return E_FILE_FORMAT_INVALID;
+      }
       size += curr.len;  // contribution of this frame
 
       --frame_count;
@@ -7932,7 +8009,6 @@ long Block::Parse(const Cluster* pCluster) {
     pf = m_frames;
     while (pf != pf_end) {
       Frame& f = *pf++;
-      assert((pos + f.len) <= stop);
       if ((pos + f.len) > stop)
         return E_FILE_FORMAT_INVALID;
 
@@ -7954,6 +8030,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const {
   const long long tc0 = pCluster->GetTimeCode();
   assert(tc0 >= 0);
 
+  // Check if tc0 + m_timecode would overflow.
+  if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) {
+    return -1;
+  }
+
   const long long tc = tc0 + m_timecode;
 
   return tc;  // unscaled timecode units
@@ -7971,6 +8052,10 @@ long long Block::GetTime(const Cluster* pCluster) const {
   const long long scale = pInfo->GetTimeCodeScale();
   assert(scale >= 1);
 
+  // Check if tc * scale could overflow.
+  if (tc != 0 && scale > LLONG_MAX / tc) {
+    return -1;
+  }
   const long long ns = tc * scale;
 
   return ns;
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h
index 26c2b7e5eb..848d01f03e 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h
@@ -22,7 +22,7 @@ class IMkvReader {
   virtual int Length(long long* total, long long* available) = 0;
 
  protected:
-  virtual ~IMkvReader();
+  virtual ~IMkvReader() {}
 };
 
 template <typename Type>
@@ -527,6 +527,8 @@ class VideoTrack : public Track {
 
   Projection* GetProjection() const;
 
+  const char* GetColourSpace() const { return m_colour_space; }
+
  private:
   long long m_width;
   long long m_height;
@@ -534,7 +536,7 @@ class VideoTrack : public Track {
   long long m_display_height;
   long long m_display_unit;
   long long m_stereo_mode;
-
+  char* m_colour_space;
   double m_rate;
 
   Colour* m_colour;
diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
index b8fd00c263..467260402a 100644
--- a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
+++ b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc
@@ -7,6 +7,8 @@
 // be found in the AUTHORS file in the root of the source tree.
 #include "mkvparser/mkvreader.h"
 
+#include <sys/types.h>
+
 #include <cassert>
 
 namespace mkvparser {
@@ -116,8 +118,16 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
 
   if (status)
     return -1;  // error
-#else
+#elif defined(_WIN32)
+  fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET);
+#elif !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \
+        defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64)
+  // POSIX.1 has fseeko and ftello. fseeko and ftello are not available before
+  // Android API level 24. See
+  // https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md
   fseeko(m_file, static_cast<off_t>(offset), SEEK_SET);
+#else
+  fseek(m_file, static_cast<long>(offset), SEEK_SET);
 #endif
 
   const size_t size = fread(buffer, 1, len, m_file);
diff --git a/media/libvpx/libvpx/third_party/libyuv/LICENSE b/media/libvpx/libvpx/third_party/libyuv/LICENSE
new file mode 100644
index 0000000000..c911747a6b
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/LICENSE
@@ -0,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/media/libvpx/libvpx/third_party/libyuv/README.libvpx b/media/libvpx/libvpx/third_party/libyuv/README.libvpx
index 485f79c0ff..9519dc4bee 100644
--- a/media/libvpx/libvpx/third_party/libyuv/README.libvpx
+++ b/media/libvpx/libvpx/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv
-Version: de944ed8c74909ea6fbd743a22efe1e55e851b83
+Version: a37e7bfece9e0676ae90a1700b0ec85b0f4f22a1
 License: BSD
 License File: LICENSE
 
@@ -8,15 +8,16 @@ Description:
 libyuv is an open source project that includes YUV conversion and scaling
 functionality.
 
-The optimized scaler in libyuv is used in multiple resolution encoder example,
-which down-samples the original input video (f.g. 1280x720) a number of times
-in order to encode multiple resolution bit streams.
+The optimized scaler in libyuv is used in the multiple resolution encoder
+example which down-samples the original input video (f.g. 1280x720) a number of
+times in order to encode multiple resolution bit streams.
 
 Local Modifications:
-rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \
-  LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \
-  all.gyp build_overrides/ chromium/ codereview.settings docs/ \
-  download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \
-  include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \
-  libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \
-  third_party/ tools/ unit_test/ util/ winarm.mk
+Disable ARGBToRGB24Row_AVX512VBMI due to build failure on Mac.
+rm libyuv/include/libyuv.h libyuv/include/libyuv/compare_row.h
+mv libyuv/include tmp/
+mv libyuv/source tmp/
+mv libyuv/LICENSE tmp/
+rm -rf libyuv
+
+mv tmp/* third_party/libyuv/
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h
index 54a2181430..01d9dfc773 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h
@@ -8,82 +8,36 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
 #define INCLUDE_LIBYUV_BASIC_TYPES_H_
 
-#include <stddef.h>  // for NULL, size_t
+#include <stddef.h>  // For size_t and NULL
+
+#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG)
+#define INT_TYPES_DEFINED
 
 #if defined(_MSC_VER) && (_MSC_VER < 1600)
 #include <sys/types.h>  // for uintptr_t on x86
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+typedef unsigned int uint32_t;
+typedef int int32_t;
+typedef unsigned short uint16_t;
+typedef short int16_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
 #else
-#include <stdint.h>  // for uintptr_t
-#endif
-
-#ifndef GG_LONGLONG
-#ifndef INT_TYPES_DEFINED
-#define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef unsigned __int64 uint64;
-typedef __int64 int64;
-#ifndef INT64_C
-#define INT64_C(x) x ## I64
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UI64
-#endif
-#define INT64_F "I64"
-#else  // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64;  // NOLINT
-typedef long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## L
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UL
-#endif
-#define INT64_F "l"
-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64;  // NOLINT
-typedef long long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## LL
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## ULL
-#endif
-#define INT64_F "ll"
-#endif  // __LP64__
-#endif  // COMPILER_MSVC
-typedef unsigned int uint32;
-typedef int int32;
-typedef unsigned short uint16;  // NOLINT
-typedef short int16;  // NOLINT
-typedef unsigned char uint8;
-typedef signed char int8;
+#include <stdint.h>  // for uintptr_t and C99 types
+#endif               // defined(_MSC_VER) && (_MSC_VER < 1600)
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint8_t uint8;
+typedef int8_t int8;
 #endif  // INT_TYPES_DEFINED
-#endif  // GG_LONGLONG
-
-// Detect compiler is for x86 or x64.
-#if defined(__x86_64__) || defined(_M_X64) || \
-    defined(__i386__) || defined(_M_IX86)
-#define CPU_X86 1
-#endif
-// Detect compiler is for ARM.
-#if defined(__arm__) || defined(_M_ARM)
-#define CPU_ARM 1
-#endif
-
-#ifndef ALIGNP
-#ifdef __cplusplus
-#define ALIGNP(p, t) \
-    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
-    ((t) - 1)) & ~((t) - 1))))
-#else
-#define ALIGNP(p, t) \
-    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
-#endif
-#endif
 
 #if !defined(LIBYUV_API)
 #if defined(_WIN32) || defined(__CYGWIN__)
@@ -95,24 +49,17 @@ typedef signed char int8;
 #define LIBYUV_API
 #endif  // LIBYUV_BUILDING_SHARED_LIBRARY
 #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
-    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
-    defined(LIBYUV_USING_SHARED_LIBRARY))
-#define LIBYUV_API __attribute__ ((visibility ("default")))
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) ||                      \
+     defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
 #else
 #define LIBYUV_API
 #endif  // __GNUC__
 #endif  // LIBYUV_API
 
+// TODO(fbarchard): Remove bool macros.
 #define LIBYUV_BOOL int
 #define LIBYUV_FALSE 0
 #define LIBYUV_TRUE 1
 
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
-  defined(__i386__) || defined(_M_IX86) || \
-  defined(__arm__) || defined(_M_ARM) || \
-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h
index 08b2bb2ecf..3353ad71c6 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
 #define INCLUDE_LIBYUV_COMPARE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,59 +20,92 @@ extern "C" {
 
 // Compute a hash for specified memory. Seed of 5381 recommended.
 LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
+
+// Hamming Distance
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count);
 
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height);
 
 // Sum Square Error - used to compute Mean Square Error or PSNR.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a,
-                             const uint8* src_b, int count);
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count);
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height);
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height);
 
 static const int kMaxPsnr = 128;
 
 LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count);
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
 
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_COMPARE_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h
index fcfcf544e1..d12ef24f79 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
 #define INCLUDE_LIBYUV_CONVERT_H_
 
 #include "libyuv/basic_types.h"
@@ -16,8 +16,8 @@
 #include "libyuv/rotate.h"  // For enum RotationMode.
 
 // TODO(fbarchard): fix WebRTC source to include following libyuv headers:
-#include "libyuv/convert_argb.h"  // For WebRTC I420ToARGB. b/620
-#include "libyuv/convert_from.h"  // For WebRTC ConvertFromI420. b/620
+#include "libyuv/convert_argb.h"      // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h"      // For WebRTC ConvertFromI420. b/620
 #include "libyuv/planar_functions.h"  // For WebRTC I420Rect, CopyPlane. b/618
 
 #ifdef __cplusplus
@@ -27,195 +27,335 @@ extern "C" {
 
 // Convert I444 to I420.
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert I422 to I420.
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert I411 to I420.
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Copy I420 to I420.
 #define I420ToI420 I420Copy
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Convert 10 bit YUV to 8 bit
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert I400 (grey) to I420.
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 #define J400ToJ420 I400ToI420
 
 // Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert NV21 to I420.
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert M420 to I420.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int M420ToI420(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert Android420 to I420.
 LIBYUV_API
-int Android420ToI420(const uint8* src_y, int src_stride_y,
-                     const uint8* src_u, int src_stride_u,
-                     const uint8* src_v, int src_stride_v,
-                     int pixel_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height);
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height);
 
 // ARGB little endian (bgra in memory) to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // BGRA little endian (argb in memory) to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // ABGR little endian (rgba in memory) to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // RGB little endian (bgr in memory) to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height);
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height);
 
 // RGB big endian (rgb in memory) to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
 
 // RGB16 (RGBP fourcc) little endian to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height);
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height);
 
 // RGB15 (RGBO fourcc) little endian to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
 
 // RGB12 (R444 fourcc) little endian to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
 
 #ifdef HAVE_JPEG
 // src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToI420(const uint8* sample, size_t sample_size,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
+int MJPGToI420(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height);
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
+             int* width,
+             int* height);
 #endif
 
 // Convert camera sample to I420 with cropping, rotation and vertical flip.
@@ -238,22 +378,29 @@ int MJPGSize(const uint8* sample, size_t sample_size,
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
 // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
 LIBYUV_API
-int ConvertToI420(const uint8* src_frame, size_t src_size,
-                  uint8* dst_y, int dst_stride_y,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToI420(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 format);
+                  uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
index 19672f3269..ab772b6c32 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
 #define INCLUDE_LIBYUV_CONVERT_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -30,258 +30,621 @@ extern "C" {
 
 // Copy ARGB to ARGB.
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Duplicate prototype for function in convert_from.h for remoting.
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert I422 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert I444 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J444 to ARGB.
 LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert I444 to ABGR.
 LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert I420 with Alpha to preattenuated ARGB.
 LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate);
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate);
 
 // Convert I420 with Alpha to preattenuated ABGR.
 LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate);
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate);
 
 // Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Alias.
 #define YToARGB I400ToARGB
 
 // Convert NV12 to ARGB.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert NV21 to ARGB.
 LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert NV12 to ABGR.
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 // Convert M420 to ARGB.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int M420ToARGB(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J420 to ARGB.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J422 to ARGB.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J420 to ABGR.
 LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert J422 to ABGR.
 LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert H420 to ARGB.
 LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert H422 to ARGB.
 LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert H420 to ABGR.
 LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert H422 to ABGR.
 LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
 
 // BGRA little endian (argb in memory) to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // ABGR little endian (rgba in memory) to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // RGBA little endian (abgr in memory) to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Deprecated function name.
 #define BG24ToARGB RGB24ToARGB
 
 // RGB little endian (bgr in memory) to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
 
 // RGB big endian (rgb in memory) to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
 
 // RGB16 (RGBP fourcc) little endian to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // RGB15 (RGBO fourcc) little endian to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
 
 // RGB12 (R444 fourcc) little endian to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
+
+// Aliases
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
 
 #ifdef HAVE_JPEG
 // src_width/height provided by capture
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
+int MJPGToARGB(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
 #endif
 
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height);
+
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
+// "sample_size" is needed to parse MJPG.
 // "dst_stride_argb" number of bytes in a row of the dst_argb plane.
 //   Normally this would be the same as dst_width, with recommended alignment
 //   to 16 bytes for better efficiency.
@@ -300,20 +663,25 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
 // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
 LIBYUV_API
-int ConvertToARGB(const uint8* src_frame, size_t src_size,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 format);
+                  uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h
index 39e1578a0e..5cd8a4bfc0 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
 #define INCLUDE_LIBYUV_CONVERT_FROM_H_
 
 #include "libyuv/basic_types.h"
@@ -21,159 +21,322 @@ extern "C" {
 
 // See Also convert.h for conversions from formats to I420.
 
-// I420Copy in convert to I420ToI420.
+// Convert 8 bit YUV to 10 bit.
+#define H420ToH010 I420ToI010
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height);
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height);
 
 LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height);
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_frame, int dst_stride_frame,
-              int width, int height);
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
 LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_frame, int dst_stride_frame,
-                 int width, int height);
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
 
 // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
 // Values in dither matrix from 0 to 7 recommended.
 // The order of the dither matrix is first byte is upper left.
 
 LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_frame, int dst_stride_frame,
-                       const uint8* dither4x4, int width, int height);
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
 
 LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
 
 LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
 
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
 //    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 format);
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
index 1df53200dd..05c815a093 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
 #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -21,170 +21,267 @@ extern "C" {
 // Copy ARGB to ARGB.
 #define ARGBToARGB ARGBCopy
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Convert ARGB To BGRA.
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
+int ARGBToBGRA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
 
 // Convert ARGB To ABGR.
 LIBYUV_API
-int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int ARGBToABGR(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert ARGB To RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+
+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
 
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height);
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_rgb, int dst_stride_rgb,
-              int width, int height);
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
 // Convert ARGB To RGB565.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 // Values in dither matrix from 0 to 7 recommended.
 // The order of the dither matrix is first byte is upper left.
 // TODO(fbarchard): Consider pointer to 2d array for dither4x4.
-// const uint8(*dither)[4][4];
+// const uint8_t(*dither)[4][4];
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height);
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height);
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height);
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
 
 // Convert ARGB To I444.
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB To I422.
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB To I420. (also in convert.h)
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J422.
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J400. (JPeg full range).
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height);
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
 LIBYUV_API
-int ARGBToG(const uint8* src_argb, int src_stride_argb,
-            uint8* dst_g, int dst_stride_g,
-            int width, int height);
+int ARGBToG(const uint8_t* src_argb,
+            int src_stride_argb,
+            uint8_t* dst_g,
+            int dst_stride_g,
+            int width,
+            int height);
 
 // Convert ARGB To NV12.
 LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 // Convert ARGB To NV21.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Convert ARGB To NV21.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Convert ARGB To YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height);
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 // Convert ARGB To UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height);
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h
index dfb7445e2f..0229cb5e73 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
 #define INCLUDE_LIBYUV_CPU_ID_H_
 
 #include "libyuv/basic_types.h"
@@ -31,50 +31,89 @@ static const int kCpuHasX86 = 0x10;
 static const int kCpuHasSSE2 = 0x20;
 static const int kCpuHasSSSE3 = 0x40;
 static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasSSE42 = 0x100;  // unused at this time.
 static const int kCpuHasAVX = 0x200;
 static const int kCpuHasAVX2 = 0x400;
 static const int kCpuHasERMS = 0x800;
 static const int kCpuHasFMA3 = 0x1000;
-static const int kCpuHasAVX3 = 0x2000;
-// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VBMI = 0x20000;
+static const int kCpuHasAVX512VBMI2 = 0x40000;
+static const int kCpuHasAVX512VBITALG = 0x80000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
 
 // These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasDSPR2 = 0x20000;
+static const int kCpuHasMIPS = 0x200000;
+static const int kCpuHasMSA = 0x400000;
 
-// Internal function used to auto-init.
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
 LIBYUV_API
 int InitCpuFlags(void);
 
+// Detect CPU has SSE2 etc.
+// Test_flag parameter should be one of kCpuHas constants above.
+// Returns non-zero if instruction set is detected
+static __inline int TestCpuFlag(int test_flag) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+  int cpu_info = cpu_info_;
+#endif
+  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
+}
+
 // Internal function for parsing /proc/cpuinfo.
 LIBYUV_API
 int ArmCpuCaps(const char* cpuinfo_name);
 
-// Detect CPU has SSE2 etc.
-// Test_flag parameter should be one of kCpuHas constants above.
-// returns non-zero if instruction set is detected
-static __inline int TestCpuFlag(int test_flag) {
-  LIBYUV_API extern int cpu_info_;
-  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
-}
-
 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
 // MaskCpuFlags(-1) to enable all cpu specific optimizations.
 // MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
 LIBYUV_API
-void MaskCpuFlags(int enable_flags);
+int MaskCpuFlags(int enable_flags);
+
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+//   again.
+// - enabling CPU features that are not supported by the CPU will result in
+//   undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#else
+  cpu_info_ = cpu_flags;
+#endif
+}
 
 // Low level cpuid for X86. Returns zeros on other CPUs.
 // eax is the info type that you want.
 // ecx is typically the cpu number, and should normally be zero.
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+void CpuId(int info_eax, int info_ecx, int* cpu_info);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h
new file mode 100644
index 0000000000..bba0e8aeda
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h
@@ -0,0 +1,233 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
+#define INCLUDE_LIBYUV_MACROS_MSA_H_
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include <msa.h>
+#include <stdint.h>
+
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc);  \
+    uint32_t val_m;                                     \
+    asm volatile("lw  %[val_m],  %[psrc_lw_m]  \n"      \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_lw_m] "m"(*psrc_lw_m));        \
+    val_m;                                              \
+  })
+
+#if (__mips == 64)
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);  \
+    uint64_t val_m = 0;                                 \
+    asm volatile("ld  %[val_m],  %[psrc_ld_m]  \n"      \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_ld_m] "m"(*psrc_ld_m));        \
+    val_m;                                              \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                                         \
+  ({                                                                     \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
+    uint32_t val0_m, val1_m;                                             \
+    uint64_t val_m = 0;                                                  \
+    val0_m = LW(psrc_ld_m);                                              \
+    val1_m = LW(psrc_ld_m + 4);                                          \
+    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
+    val_m;                                                               \
+  })
+#endif  // (__mips == 64)
+
+#define SW(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint32_t val_m = (val);                             \
+    asm volatile("sw  %[val_m],  %[pdst_sw_m]  \n"      \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+
+#if (__mips == 64)
+#define SD(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint64_t val_m = (val);                             \
+    asm volatile("sd  %[val_m],  %[pdst_sd_m]  \n"      \
+                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+#else  // !(__mips == 64)
+#define SD(val, pdst)                                        \
+  ({                                                         \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
+    uint32_t val0_m, val1_m;                                 \
+    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+    SW(val0_m, pdst_sd_m);                                   \
+    SW(val1_m, pdst_sd_m + 4);                               \
+  })
+#endif  // !(__mips == 64)
+#else   // !(__mips_isa_rev >= 6)
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc);  \
+    uint32_t val_m;                                     \
+    asm volatile("ulw  %[val_m],  %[psrc_lw_m]  \n"     \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_lw_m] "m"(*psrc_lw_m));        \
+    val_m;                                              \
+  })
+
+#if (__mips == 64)
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);  \
+    uint64_t val_m = 0;                                 \
+    asm volatile("uld  %[val_m],  %[psrc_ld_m]  \n"     \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_ld_m] "m"(*psrc_ld_m));        \
+    val_m;                                              \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                                         \
+  ({                                                                     \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
+    uint32_t val0_m, val1_m;                                             \
+    uint64_t val_m = 0;                                                  \
+    val0_m = LW(psrc_ld_m);                                              \
+    val1_m = LW(psrc_ld_m + 4);                                          \
+    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
+    val_m;                                                               \
+  })
+#endif  // (__mips == 64)
+
+#define SW(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint32_t val_m = (val);                             \
+    asm volatile("usw  %[val_m],  %[pdst_sw_m]  \n"     \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+
+#define SD(val, pdst)                                        \
+  ({                                                         \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
+    uint32_t val0_m, val1_m;                                 \
+    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+    SW(val0_m, pdst_sd_m);                                   \
+    SW(val1_m, pdst_sd_m + 4);                               \
+  })
+#endif  // (__mips_isa_rev >= 6)
+
+// TODO(fbarchard): Consider removing __VAR_ARGS versions.
+#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+
+/* Description : Load two vectors with 16 'byte' sized elements
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+  {                                            \
+    out0 = LD_B(RTYPE, (psrc));                \
+    out1 = LD_B(RTYPE, (psrc) + stride);       \
+  }
+#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+  {                                                        \
+    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
+    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+  }
+#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
+
+/* Description : Store two vectors with stride each having 16 'byte' sized
+                 elements
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_B(RTYPE, in0, (pdst));                \
+    ST_B(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
+  {                                                      \
+    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+  }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_H(RTYPE, in0, (pdst));                \
+    ST_H(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+
+// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
+  {                                                                   \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+  }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+  }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+
+#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
+
+#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index 8423121d11..275f8d4c18 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
 #define INCLUDE_LIBYUV_MJPEG_DECODER_H_
 
 #include "libyuv/basic_types.h"
@@ -26,25 +26,24 @@ namespace libyuv {
 extern "C" {
 #endif
 
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+static const uint32_t kUnknownDataSize = 0xFFFFFFFF;
 
 enum JpegSubsamplingType {
   kJpegYuv420,
   kJpegYuv422,
-  kJpegYuv411,
   kJpegYuv444,
   kJpegYuv400,
   kJpegUnknown
 };
 
 struct Buffer {
-  const uint8* data;
+  const uint8_t* data;
   int len;
 };
 
@@ -66,7 +65,7 @@ struct SetJmpErrorMgr;
 class LIBYUV_API MJpegDecoder {
  public:
   typedef void (*CallbackFunction)(void* opaque,
-                                   const uint8* const* data,
+                                   const uint8_t* const* data,
                                    const int* strides,
                                    int rows);
 
@@ -86,7 +85,7 @@ class LIBYUV_API MJpegDecoder {
   // If return value is LIBYUV_TRUE, then the values for all the following
   // getters are populated.
   // src_len is the size of the compressed mjpeg frame in bytes.
-  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+  LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len);
 
   // Returns width of the last loaded frame in pixels.
   int GetWidth();
@@ -139,18 +138,22 @@ class LIBYUV_API MJpegDecoder {
   // at least GetComponentSize(i). The pointers in planes are incremented
   // to point to after the end of the written data.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height);
 
   // Decodes the entire image and passes the data via repeated calls to a
   // callback function. Each call will get the data for a whole number of
   // image scanlines.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
-                        int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+                               void* opaque,
+                               int dst_width,
+                               int dst_height);
 
   // The helper function which recognizes the jpeg sub-sampling type.
   static JpegSubsamplingType JpegSubsamplingTypeHelper(
-     int* subsample_x, int* subsample_y, int number_of_components);
+      int* subsample_x,
+      int* subsample_y,
+      int number_of_components);
 
  private:
   void AllocOutputBuffers(int num_outbufs);
@@ -159,7 +162,7 @@ class LIBYUV_API MJpegDecoder {
   LIBYUV_BOOL StartDecode();
   LIBYUV_BOOL FinishDecode();
 
-  void SetScanlinePointers(uint8** data);
+  void SetScanlinePointers(uint8_t** data);
   LIBYUV_BOOL DecodeImcuRow();
 
   int GetComponentScanlinePadding(int component);
@@ -178,15 +181,15 @@ class LIBYUV_API MJpegDecoder {
 
   // Temporaries used to point to scanline outputs.
   int num_outbufs_;  // Outermost size of all arrays below.
-  uint8*** scanlines_;
+  uint8_t*** scanlines_;
   int* scanlines_sizes_;
   // Temporary buffer used for decoding when we can't decode directly to the
   // output buffers. Large enough for just one iMCU row.
-  uint8** databuf_;
+  uint8_t** databuf_;
   int* databuf_strides_;
 };
 
 }  // namespace libyuv
 
 #endif  //  __cplusplus
-#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
index 9662516c57..91137baba2 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 
 #include "libyuv/basic_types.h"
@@ -22,449 +22,10 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Copy a plane of data.
-LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height);
-
-// Set a plane of data to a 32 bit value.
-LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value);
-
-// Split interleaved UV plane into separate U and V planes.
-LIBYUV_API
-void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int width, int height);
-
-// Merge separate U and V planes into one interleaved UV plane.
-LIBYUV_API
-void MergeUVPlane(const uint8* src_u, int src_stride_u,
-                  const uint8* src_v, int src_stride_v,
-                  uint8* dst_uv, int dst_stride_uv,
-                  int width, int height);
-
-// Copy I400.  Supports inverting.
-LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-#define J400ToJ400 I400ToI400
-
-// Copy I422 to I422.
-#define I422ToI422 I422Copy
-LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Copy I444 to I444.
-#define I444ToI444 I444Copy
-LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Convert YUY2 to I422.
-LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert UYVY to I422.
-LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
-
-// Convert I420 to I400. (calls CopyPlane ignoring u/v).
-LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Alias
-#define J420ToJ400 I420ToI400
-#define I420ToI420Mirror I420Mirror
-
-// I420 mirror.
-LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Alias
-#define I400ToI400Mirror I400Mirror
-
-// I400 mirror.  A single plane is mirrored horizontally.
-// Pass negative height to achieve 180 degree rotation.
-LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Alias
-#define ARGBToARGBMirror ARGBMirror
-
-// ARGB mirror.
-LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
-
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
-
-// Alias
-#define RGB24ToRAW RAWToRGB24
-
-LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height);
-
-// Draw a rectangle into I420.
-LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y, int width, int height,
-             int value_y, int value_u, int value_v);
-
-// Draw a rectangle into ARGB.
-LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height, uint32 value);
-
-// Convert ARGB to gray scale ARGB.
-LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
-// Make a rectangle of ARGB gray scale.
-LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height);
-
-// Make a rectangle of ARGB Sepia tone.
-LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int x, int y, int width, int height);
-
-// Apply a matrix rotation to each ARGB pixel.
-// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
-// The next 4 coefficients apply to B, G, R, A and produce R of the output.
-// The last 4 coefficients apply to B, G, R, A and produce A of the output.
-LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height);
-
-// Deprecated. Use ARGBColorMatrix instead.
-// Apply a matrix rotation to each ARGB pixel.
-// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
-// The last 4 coefficients apply to B, G, R, A and produce R of the output.
-LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int x, int y, int width, int height);
-
-// Apply a color table each ARGB pixel.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int x, int y, int width, int height);
-
-// Apply a color table each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int x, int y, int width, int height);
-
-// Apply a luma/color table each ARGB pixel but preserve destination alpha.
-// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
-// RGB (YJ style) and C is an 8 bit color component (R, G or B).
-LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma_rgb_table,
-                       int width, int height);
-
-// Apply a 3 term polynomial to ARGB values.
-// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
-// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
-// g squared, r squared and a squared.  The 4rd row is coefficients for b to
-// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
-// result clamped to 0 to 255.
-// A polynomial approximation can be dirived using software such as 'R'.
-
-LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
-                   const float* poly,
-                   int width, int height);
-
-// Quantize a rectangle of ARGB. Alpha unaffected.
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
-// interval_size should be a value between 1 and 255.
-// interval_offset should be a value between 0 and 255.
-LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int x, int y, int width, int height);
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
-
-// Copy Alpha channel of ARGB to alpha of ARGB.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
-
-// Extract the alpha channel from ARGB.
-LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_a, int dst_stride_a,
-                     int width, int height);
-
-// Copy Y channel to Alpha of ARGB.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height);
-
-typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
-                             uint8* dst_argb, int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
-// Alpha Blend ARGB images and store to destination.
-// Source is pre-multiplied by alpha using ARGBAttenuate.
-// Alpha of destination is set to 255.
-LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
-
-// Alpha Blend plane and store to destination.
-// Source is not pre-multiplied by alpha.
-LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
-
-// Alpha Blend YUV images and store to destination.
-// Source is not pre-multiplied by alpha.
-// Alpha is full width x height and subsampled to half size to apply to UV.
-LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
-
-// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
-LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
-
-// Add ARGB image with ARGB image. Saturates to 255.
-LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height);
-
-// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
-LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
-
-// Convert I422 to YUY2.
-LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-// Convert I422 to UYVY.
-LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
-
-// Convert unattentuated ARGB to preattenuated ARGB.
-LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
-
-// Convert preattentuated ARGB to unattenuated ARGB.
-LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
-
-// Internal function - do not call directly.
-// Computes table of cumulative sum for image where the value is the sum
-// of all values above and to the left of the entry. Used by ARGBBlur.
-LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height);
-
-// Blur ARGB image.
-// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
-//   16 byte boundary.
-// dst_stride32_cumsum is number of ints in a row (width * 4).
-// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
-// Blur is optimized for radius of 5 (11x11) or less.
-LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius);
-
-// Multiply ARGB image by ARGB value.
-LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value);
-
-// Interpolate between two images using specified amount of interpolation
-// (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
-// and 255 means 1% src0 and 99% src1.
-LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation);
-
-// Interpolate between two ARGB images using specified amount of interpolation
-// Internally calls InterpolatePlane with width * 4 (bpp).
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation);
-
-// Interpolate between two YUV images using specified amount of interpolation
-// Internally calls InterpolatePlane on each plane where the U and V planes
-// are half width and half height.
-LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation);
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+// TODO(fbarchard): Move cpu macros to row.h
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
@@ -479,43 +40,808 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y,
 #define HAS_ARGBAFFINEROW_SSE2
 #endif
 
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height);
+
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height);
+
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 1024 for 10 bits
+                       int width,
+                       int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value);
+
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height);
+
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height);
+
+// Split interleaved RGB plane into separate R, G and B planes.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height);
+
+// Merge separate R, G and B planes into one interleaved RGB plane.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height);
+
+// Copy I400.  Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror.  A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
+
+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
+
+// Apply a color table each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
+
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height);
+
+// Apply a luma/color table each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
+// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared.  The 4rd row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be dirived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const float* poly,
+                   int width,
+                   int height);
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height);
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
+
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+
+typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
+                             const uint8_t* src_argb1,
+                             uint8_t* dst_argb,
+                             int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
+
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
+
+// Convert unattentuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
+
+// Convert preattentuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+//   16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value);
+
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation);
+
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation);
+
 // Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
+// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* uv_dudv,
+                        int width);
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height);
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height);
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height);
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height);
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
 
 // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h
index 8af60b8955..76b692be8b 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
 #define INCLUDE_LIBYUV_ROTATE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,8 +20,8 @@ extern "C" {
 
 // Supported rotation.
 typedef enum RotationMode {
-  kRotate0 = 0,  // No rotation.
-  kRotate90 = 90,  // Rotate 90 degrees clockwise.
+  kRotate0 = 0,      // No rotation.
+  kRotate90 = 90,    // Rotate 90 degrees clockwise.
   kRotate180 = 180,  // Rotate 180 degrees.
   kRotate270 = 270,  // Rotate 270 degrees clockwise.
 
@@ -33,85 +33,132 @@ typedef enum RotationMode {
 
 // Rotate I420 frame.
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height, enum RotationMode mode);
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
 
 // Rotate NV12 input and store in I420.
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int src_width, int src_height, enum RotationMode mode);
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
+                     enum RotationMode mode);
 
 // Rotate a plane by 0, 90, 180, or 270.
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int src_width, int src_height, enum RotationMode mode);
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
+                enum RotationMode mode);
 
 // Rotate planes by 90, 180, 270. Deprecated.
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height);
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
 
 LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height);
+void RotateUV90(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst_a,
+                int dst_stride_a,
+                uint8_t* dst_b,
+                int dst_stride_b,
+                int width,
+                int height);
 
 // Rotations for when U and V are interleaved.
 // These functions take one input pointer and
 // split the data into two buffers while
 // rotating them. Deprecated.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void RotateUV180(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void RotateUV270(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 // The 90 and 270 functions are based on transposes.
 // Doing a transpose with reversing the read/write
 // order will result in a rotation by +- 90 degrees.
 // Deprecated.
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h
index 660ff5573e..20432949ab 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
 #define INCLUDE_LIBYUV_ROTATE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -21,13 +21,17 @@ extern "C" {
 
 // Rotate ARGB frame
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height, enum RotationMode mode);
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               enum RotationMode mode);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
index ebc487f9ab..5edc0fcf13 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
 #define INCLUDE_LIBYUV_ROTATE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -18,10 +18,14 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
@@ -29,93 +33,162 @@ extern "C" {
 #endif
 #endif
 // The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_TRANSPOSEWX8_SSSE3
 #define HAS_TRANSPOSEUVWX8_SSE2
 #endif
 
-// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+// The following are available for GCC 32 or 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
 #endif
 
-// The following are available for 64 bit GCC but not NaCL:
-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
-    defined(__x86_64__)
+// The following are available for 64 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
 #define HAS_TRANSPOSEWX8_FAST_SSSE3
 #define HAS_TRANSPOSEUVWX8_SSE2
 #endif
 
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_TRANSPOSEWX8_NEON
 #define HAS_TRANSPOSEUVWX8_NEON
 #endif
 
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSEWX8_DSPR2
-#define HAS_TRANSPOSEUVWX8_DSPR2
-#endif  // defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
 
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height);
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width);
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width);
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width);
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width);
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width);
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width);
+void TransposeWx16_MSA(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
 
-void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
-                                 uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_NEON(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+void TransposeWx8_Any_SSSE3(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst,
+                            int dst_stride,
+                            int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
+                                 int src_stride,
+                                 uint8_t* dst,
+                                 int dst_stride,
+                                 int width);
+void TransposeWx16_Any_MSA(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
 
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height);
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
 
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                          uint8* dst_a, int dst_stride_a,
-                          uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width);
+void TransposeUVWx16_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst_a,
+                       int dst_stride_a,
+                       uint8_t* dst_b,
+                       int dst_stride_b,
+                       int width);
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx16_MSA(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
 
-void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
-                              uint8* dst_a, int dst_stride_a,
-                              uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_SSE2(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx8_Any_NEON(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h
index 013a7e53e3..65ef448b8c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROW_H_
 #define INCLUDE_LIBYUV_ROW_H_
 
 #include <stdlib.h>  // For malloc.
@@ -20,41 +20,20 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#ifdef __cplusplus
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
-  uint8* var = reinterpret_cast<uint8*>                                        \
-      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
-#else
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
-  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
-#endif
-
-#define free_aligned_buffer_64(var) \
-  free(var##_mem);  \
-  var = 0
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
-// True if compiling for SSSE3 as a requirement.
-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
-#define LIBYUV_SSSE3_ONLY
-#endif
-
-#if defined(__native_client__)
-#define LIBYUV_DISABLE_NEON
-#endif
 // clang >= 3.5.0 required for Arm64.
 #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
 #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -76,9 +55,19 @@ extern "C" {
 #endif  // clang >= 3.4
 #endif  // __clang__
 
+// clang >= 6.0.0 required for AVX512.
+// TODO(fbarchard): fix xcode 9 ios b/789.
+#if 0  // Build fails in libvpx on Mac
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ >= 7) && !defined(__APPLE_EMBEDDED_SIMULATOR__)
+#define CLANG_HAS_AVX512 1
+#endif  // clang >= 7
+#endif  // __clang__
+#endif  // 0
+
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
@@ -90,8 +79,8 @@ extern "C" {
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_ARGBSETROW_X86
-#define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
@@ -104,12 +93,12 @@ extern "C" {
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
 #define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOARGB4444ROW_SSSE3
@@ -126,8 +115,10 @@ extern "C" {
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORUVROW_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB24ROW_SSSE3
 #define HAS_NV12TORGB565ROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB24ROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RAWTORGB24ROW_SSSE3
 #define HAS_RAWTOYROW_SSSE3
@@ -180,11 +171,8 @@ extern "C" {
 
 // The following functions fail on gcc/clang 32 bit with fpic and framepointer.
 // caveat: clangcl uses row_win.cc which works.
-#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
-    !defined(__i386__) || defined(_MSC_VER)
-// TODO(fbarchard): fix build error on x86 debug
-// https://code.google.com/p/libyuv/issues/detail?id=524
-#define HAS_I411TOARGBROW_SSSE3
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
 // TODO(fbarchard): fix build error on android_full_debug=1
 // https://code.google.com/p/libyuv/issues/detail?id=517
 #define HAS_I422ALPHATOARGBROW_SSSE3
@@ -193,11 +181,12 @@ extern "C" {
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
 #define HAS_ARGBCOPYALPHAROW_AVX2
 #define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBPOLYNOMIALROW_AVX2
 #define HAS_ARGBSHUFFLEROW_AVX2
@@ -208,13 +197,9 @@ extern "C" {
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_COPYROW_AVX
 #define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
+//  #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
 #define HAS_I400TOARGBROW_AVX2
-#if !(defined(_DEBUG) && defined(__i386__))
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_AVX2
-#endif
-#define HAS_I411TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
@@ -227,8 +212,10 @@ extern "C" {
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB24ROW_AVX2
 #define HAS_NV12TORGB565ROW_AVX2
 #define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB24ROW_AVX2
 #define HAS_SPLITUVROW_AVX2
 #define HAS_UYVYTOARGBROW_AVX2
 #define HAS_UYVYTOUV422ROW_AVX2
@@ -246,11 +233,18 @@ extern "C" {
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_BLENDPLANEROW_AVX2
+
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
+#endif
 #endif
 
 // The following are available for AVX2 Visual C and clangcl 32 bit:
 // TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
 #define HAS_ARGB1555TOARGBROW_AVX2
 #define HAS_ARGB4444TOARGBROW_AVX2
@@ -268,6 +262,51 @@ extern "C" {
 #define HAS_I422TOARGBROW_SSSE3
 #endif
 
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_CONVERT16TO8ROW_SSSE3
+#define HAS_CONVERT8TO16ROW_SSE2
+// I210 is for H010.  2 = 422.  I for 601 vs H for 709.
+#define HAS_I210TOAR30ROW_SSSE3
+#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGERGBROW_SSSE3
+#define HAS_SPLITRGBROW_SSSE3
+#endif
+
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTORAWROW_AVX2
+#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_I210TOAR30ROW_AVX2
+#define HAS_I210TOARGBROW_AVX2
+#define HAS_I422TOAR30ROW_AVX2
+#define HAS_I422TOUYVYROW_AVX2
+#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
+#endif
+
+// The following are available for AVX512 clang x86 platforms:
+// TODO(fbarchard): Port to GCC and Visual C
+// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX512))
+#define HAS_ARGBTORGB24ROW_AVX512VBMI
+#endif
+
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -279,6 +318,7 @@ extern "C" {
 #define HAS_ARGB4444TOARGBROW_NEON
 #define HAS_ARGB4444TOUVROW_NEON
 #define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
 #define HAS_ARGBSETROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
@@ -286,18 +326,17 @@ extern "C" {
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565DITHERROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
-#define HAS_ARGBTOUV411ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBEXTRACTALPHAROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
+#define HAS_HALFFLOATROW_NEON
 #define HAS_I400TOARGBROW_NEON
-#define HAS_I411TOARGBROW_NEON
 #define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
 #define HAS_I422TOARGB4444ROW_NEON
@@ -313,8 +352,10 @@ extern "C" {
 #define HAS_MIRRORROW_NEON
 #define HAS_MIRRORUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB24ROW_NEON
 #define HAS_NV12TORGB565ROW_NEON
 #define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
 #define HAS_RAWTOUVROW_NEON
@@ -328,6 +369,7 @@ extern "C" {
 #define HAS_RGBATOUVROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
+#define HAS_SPLITRGBROW_NEON
 #define HAS_SPLITUVROW_NEON
 #define HAS_UYVYTOARGBROW_NEON
 #define HAS_UYVYTOUV422ROW_NEON
@@ -359,17 +401,87 @@ extern "C" {
 #define HAS_SOBELYROW_NEON
 #endif
 
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-#define HAS_COPYROW_MIPS
-#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_I422TOARGBROW_DSPR2
-#define HAS_INTERPOLATEROW_DSPR2
-#define HAS_MIRRORROW_DSPR2
-#define HAS_MIRRORUVROW_DSPR2
-#define HAS_SPLITUVROW_DSPR2
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALESUMSAMPLES_NEON
 #endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
+#define HAS_ARGB4444TOARGBROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBBLENDROW_MSA
+#define HAS_ARGBCOLORMATRIXROW_MSA
+#define HAS_ARGBEXTRACTALPHAROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBQUANTIZEROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
+#define HAS_ARGBTOARGB1555ROW_MSA
+#define HAS_ARGBTOARGB4444ROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
+#define HAS_ARGBTORGB565DITHERROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_HALFFLOATROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_MERGEUVROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_SETROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXROW_MSA
+#define HAS_SOBELXYROW_MSA
+#define HAS_SOBELYROW_MSA
+#define HAS_SPLITUVROW_MSA
+#define HAS_UYVYTOARGBROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -378,18 +490,18 @@ extern "C" {
 #else
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
 #endif
-typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) int32 vec32[4];
-typedef __declspec(align(16)) int8 vec8[16];
-typedef __declspec(align(16)) uint16 uvec16[8];
-typedef __declspec(align(16)) uint32 uvec32[4];
-typedef __declspec(align(16)) uint8 uvec8[16];
-typedef __declspec(align(32)) int16 lvec16[16];
-typedef __declspec(align(32)) int32 lvec32[8];
-typedef __declspec(align(32)) int8 lvec8[32];
-typedef __declspec(align(32)) uint16 ulvec16[16];
-typedef __declspec(align(32)) uint32 ulvec32[8];
-typedef __declspec(align(32)) uint8 ulvec8[32];
+typedef __declspec(align(16)) int16_t vec16[8];
+typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) int8_t vec8[16];
+typedef __declspec(align(16)) uint16_t uvec16[8];
+typedef __declspec(align(16)) uint32_t uvec32[4];
+typedef __declspec(align(16)) uint8_t uvec8[16];
+typedef __declspec(align(32)) int16_t lvec16[16];
+typedef __declspec(align(32)) int32_t lvec32[8];
+typedef __declspec(align(32)) int8_t lvec8[32];
+typedef __declspec(align(32)) uint16_t ulvec16[16];
+typedef __declspec(align(32)) uint32_t ulvec32[8];
+typedef __declspec(align(32)) uint8_t ulvec8[32];
 #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
@@ -397,32 +509,32 @@ typedef __declspec(align(32)) uint8 ulvec8[32];
 #else
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 #endif
-typedef int16 __attribute__((vector_size(16))) vec16;
-typedef int32 __attribute__((vector_size(16))) vec32;
-typedef int8 __attribute__((vector_size(16))) vec8;
-typedef uint16 __attribute__((vector_size(16))) uvec16;
-typedef uint32 __attribute__((vector_size(16))) uvec32;
-typedef uint8 __attribute__((vector_size(16))) uvec8;
-typedef int16 __attribute__((vector_size(32))) lvec16;
-typedef int32 __attribute__((vector_size(32))) lvec32;
-typedef int8 __attribute__((vector_size(32))) lvec8;
-typedef uint16 __attribute__((vector_size(32))) ulvec16;
-typedef uint32 __attribute__((vector_size(32))) ulvec32;
-typedef uint8 __attribute__((vector_size(32))) ulvec8;
+typedef int16_t __attribute__((vector_size(16))) vec16;
+typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef int8_t __attribute__((vector_size(16))) vec8;
+typedef uint16_t __attribute__((vector_size(16))) uvec16;
+typedef uint32_t __attribute__((vector_size(16))) uvec32;
+typedef uint8_t __attribute__((vector_size(16))) uvec8;
+typedef int16_t __attribute__((vector_size(32))) lvec16;
+typedef int32_t __attribute__((vector_size(32))) lvec32;
+typedef int8_t __attribute__((vector_size(32))) lvec8;
+typedef uint16_t __attribute__((vector_size(32))) ulvec16;
+typedef uint32_t __attribute__((vector_size(32))) ulvec32;
+typedef uint8_t __attribute__((vector_size(32))) ulvec8;
 #else
 #define SIMD_ALIGNED(var) var
-typedef int16 vec16[8];
-typedef int32 vec32[4];
-typedef int8 vec8[16];
-typedef uint16 uvec16[8];
-typedef uint32 uvec32[4];
-typedef uint8 uvec8[16];
-typedef int16 lvec16[16];
-typedef int32 lvec32[8];
-typedef int8 lvec8[32];
-typedef uint16 ulvec16[16];
-typedef uint32 ulvec32[8];
-typedef uint8 ulvec8[32];
+typedef int16_t vec16[8];
+typedef int32_t vec32[4];
+typedef int8_t vec8[16];
+typedef uint16_t uvec16[8];
+typedef uint32_t uvec32[4];
+typedef uint8_t uvec8[16];
+typedef int16_t lvec16[16];
+typedef int32_t lvec32[8];
+typedef int8_t lvec8[32];
+typedef uint16_t ulvec16[16];
+typedef uint32_t ulvec32[8];
+typedef uint8_t ulvec8[32];
 #endif
 
 #if defined(__aarch64__)
@@ -446,23 +558,23 @@ struct YuvConstants {
 #else
 // This struct is for Intel color conversion.
 struct YuvConstants {
-  int8 kUVToB[32];
-  int8 kUVToG[32];
-  int8 kUVToR[32];
-  int16 kUVBiasB[16];
-  int16 kUVBiasG[16];
-  int16 kUVBiasR[16];
-  int16 kYToRgb[16];
+  int8_t kUVToB[32];
+  int8_t kUVToG[32];
+  int8_t kUVToR[32];
+  int16_t kUVBiasB[16];
+  int16_t kUVBiasG[16];
+  int16_t kUVBiasR[16];
+  int16_t kYToRgb[16];
 };
 
 // Offsets into YuvConstants structure
-#define KUVTOB   0
-#define KUVTOG   32
-#define KUVTOR   64
+#define KUVTOB 0
+#define KUVTOG 32
+#define KUVTOR 64
 #define KUVBIASB 96
 #define KUVBIASG 128
 #define KUVBIASR 160
-#define KYTORGB  192
+#define KYTORGB 192
 #endif
 
 // Conversion matrix for YUV to RGB
@@ -475,6 +587,16 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants);  // BT.601
 extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants);  // JPeg
 extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants);  // BT.709
 
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+#define align_buffer_64(var, size)                                           \
+  uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63));         /* NOLINT */ \
+  uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);                  \
+  var = 0
+
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
 #else
@@ -487,1458 +609,2863 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants);  // BT.709
 #else
 #define LABELALIGN
 #endif
-#if defined(__native_client__) && defined(__x86_64__)
-// r14 is used for MEMOP macros.
-#define NACL_R14 "r14",
-#define BUNDLELOCK ".bundle_lock\n"
-#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
-#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
-#define MEMLEA(offset, base) #offset "(%q" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%q" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%q" #base ",%q" #index "," #scale ")"
-#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
-#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg "\n" \
-    BUNDLEUNLOCK
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%" #arg "\n" \
-    BUNDLEUNLOCK
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
-    BUNDLEUNLOCK
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#else  // defined(__native_client__) && defined(__x86_64__)
-#define NACL_R14
-#define BUNDLEALIGN
-#define MEMACCESS(base) "(%" #base ")"
-#define MEMACCESS2(offset, base) #offset "(%" #base ")"
-#define MEMLEA(offset, base) #offset "(%" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%" #base ",%" #index "," #scale ")"
-#define MEMMOVESTRING(s, d)
-#define MEMSTORESTRING(reg, d)
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
-    #reg2 "\n"
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#endif  // defined(__native_client__) && defined(__x86_64__)
 
-#if defined(__arm__) || defined(__aarch64__)
-#undef MEMACCESS
-#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
-#else
-#define MEMACCESS(base)
-#endif
+// Intel Code Analizer markers.  Insert IACA_START IACA_END around code to be
+// measured and then run with iaca -64 libyuv_unittest.
+// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within
+// inline assembly blocks.
+// example of iaca:
+// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define IACA_ASM_START  \
+  ".byte 0x0F, 0x0B\n"  \
+  " movl $111, %%ebx\n" \
+  ".byte 0x64, 0x67, 0x90\n"
+
+#define IACA_ASM_END         \
+  " movl $222, %%ebx\n"      \
+  ".byte 0x64, 0x67, 0x90\n" \
+  ".byte 0x0F, 0x0B\n"
+
+#define IACA_SSC_MARK(MARK_ID)                        \
+  __asm__ __volatile__("\n\t  movl $" #MARK_ID        \
+                       ", %%ebx"                      \
+                       "\n\t  .byte 0x64, 0x67, 0x90" \
+                       :                              \
+                       :                              \
+                       : "memory");
+
+#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
+
+#else /* Visual C */
+#define IACA_UD_BYTES \
+  { __asm _emit 0x0F __asm _emit 0x0B }
+
+#define IACA_SSC_MARK(x) \
+  { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
 #endif
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+#define IACA_START     \
+  {                    \
+    IACA_UD_BYTES      \
+    IACA_SSC_MARK(111) \
+  }
+#define IACA_END       \
+  {                    \
+    IACA_SSC_MARK(222) \
+    IACA_UD_BYTES      \
+  }
+
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422AlphaToARGBRow_NEON(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width);
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
-                             int width);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
-                             int width);
-
-void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                            uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int width);
-void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                             int width);
-void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
-                          uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
-                          uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
-                         uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                            uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
-                              int src_stride_argb1555,
-                              uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
-                              int src_stride_argb4444,
-                              uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
-                    uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
-                  uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
-                              uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
-void MirrorRow_C(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-
-void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
                        int width);
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width);
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
                        int width);
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
 
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width);
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width);
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width);
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
                       int width);
-void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int width);
-void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int width);
-void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                         int width);
-void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
                           int width);
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width);
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+                       int src_stride_bgra,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+                       int src_stride_rgba,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+                            int src_stride_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                            int src_stride_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                              int src_stride_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                              int src_stride_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RAWToUVRow_Any_MSA(const uint8_t* src_ptr,
+                        int src_stride_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                             int src_stride_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void BGRAToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ABGRToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void RGBAToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RAWToUVRow_C(const uint8_t* src_rgb0,
+                  int src_stride_rgb,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
                   int width);
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void MirrorUVRow_SSSE3(const uint8_t* src,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MirrorUVRow_C(const uint8_t* src_uv,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width);
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void SplitUVRow_MSA(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_Any_AVX2(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void SplitUVRow_Any_NEON(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
+void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
 
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_NEON(const uint8* src, uint8* dst, int count);
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
-void CopyRow_C(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
+                  int width);
+void MergeUVRow_SSE2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width);
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width);
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width);
+void MergeUVRow_MSA(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
+void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void MergeUVRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void MergeUVRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
 
-void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width);
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width);
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width);
+void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                           uint8_t* dst_r,
+                           uint8_t* dst_g,
+                           uint8_t* dst_b,
+                           int width);
+void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
+                          uint8_t* dst_r,
+                          uint8_t* dst_g,
+                          uint8_t* dst_b,
+                          int width);
 
-void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width);
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_rgb,
+                       int width);
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width);
+void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void MergeRGBRow_Any_NEON(const uint8_t* src_r,
+                          const uint8_t* src_g,
+                          const uint8_t* src_b,
+                          uint8_t* dst_rgb,
+                          int width);
+
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale, /* 64 for 10 bit */
+                     int width);
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int scale,
+                        int width);
+
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+                         uint16_t* dst_y,
+                         int scale,
+                         int width);
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width);
+
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width);
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint16_t* dst_ptr,
+                              int scale,
+                              int width);
+void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
+                              uint16_t* dst_ptr,
+                              int scale,
+                              int width);
+
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
+                       int width);
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+                           uint8_t* dst_y,
+                           int scale,
+                           int width);
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width);
+void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int scale,
                                int width);
-void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int scale,
+                              int width);
+
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
 
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
-void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
 
-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
-                                  int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
-                                  int width);
+void SetRow_C(uint8_t* dst, uint8_t v8, int width);
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
 
-void SetRow_C(uint8* dst, uint8 v8, int count);
-void SetRow_X86(uint8* dst, uint8 v8, int count);
-void SetRow_ERMS(uint8* dst, uint8 v8, int count);
-void SetRow_NEON(uint8* dst, uint8 v8, int count);
-void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
-void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
-
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
+void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
 
 // ARGBShufflers for BGRAToARGB etc.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width);
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width);
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
-
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width);
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const uint8_t* shuffler,
+                          int width);
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width);
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width);
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              const uint8_t* param,
                               int width);
-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-
-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                              int width);
-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                                int width);
-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                                int width);
-void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                              int width);
-void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                                int width);
-void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                                int width);
-
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
+void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const uint8_t* param,
                              int width);
-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const uint8_t* param,
+                             int width);
+void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const uint8_t* param,
+                            int width);
+
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_argb,
+                          int width);
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width);
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width);
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width);
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width);
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
+                         int width);
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width);
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width);
+
+void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
 
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
 
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
+void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
 
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
 
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
 
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width);
 
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width);
+void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width);
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width);
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width);
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
+                            int width);
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
+                            int width);
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width);
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               const uint32_t dither4,
+                               int width);
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422AlphaToARGBRow_C(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          const uint8* a_buf,
-                          uint8* dst_argb,
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_argb,
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width);
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* dst_argb,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_rgba,
-                     const struct YuvConstants* yuvconstants,
-                     int width);
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* dst_rgb24,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width);
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
+
+void I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_ar30,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+                         const uint16_t* u_buf,
+                         const uint16_t* v_buf,
+                         uint8_t* dst_ar30,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+                         const uint16_t* u_buf,
+                         const uint16_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToAR30Row_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I210ToARGBRow_AVX2(const uint16_t* y_buf,
+                        const uint16_t* u_buf,
+                        const uint16_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                        const uint16_t* u_buf,
+                        const uint16_t* v_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             const uint8_t* a_buf,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I411ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void I411ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width);
-void NV12ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_argb,
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width);
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_argb,
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
+void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* vu_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* vu_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
+void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
+void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgba,
+void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_rgba,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_argb,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width);
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_argb,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToRGB24Row_SSSE3(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb24,
+void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+                          const uint8_t* u_buf,
+                          const uint8_t* v_buf,
+                          uint8_t* dst_rgb24,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I444ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  const uint8* a_buf,
-                                  uint8* dst_argb,
+void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+                             const uint16_t* u_buf,
+                             const uint16_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+                             const uint16_t* u_buf,
+                             const uint16_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                                  const uint8_t* u_buf,
+                                  const uint8_t* v_buf,
+                                  const uint8_t* a_buf,
+                                  uint8_t* dst_ptr,
                                   const struct YuvConstants* yuvconstants,
                                   int width);
-void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 const uint8* a_buf,
-                                 uint8* dst_argb,
+void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 const uint8_t* a_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I411ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_uv,
-                             uint8* dst_argb,
+void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
+void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_vu,
-                             uint8* dst_argb,
+void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_uv,
-                               uint8* dst_argb,
+void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_ptr,
                                const struct YuvConstants* yuvconstants,
                                int width);
-void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
-                             uint8* dst_argb,
+void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
-                             uint8* dst_argb,
+void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
-                            uint8* dst_argb,
+void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
-                            uint8* dst_argb,
+void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_rgba,
+void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_rgba,
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
                                const struct YuvConstants* yuvconstants,
                                int width);
-void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_rgba,
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
-                    uint8* dst_argb, int width);
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width);
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
 
 // Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
-                             const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
-                            const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width);
+void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width);
+void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width);
 
 // ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 
 // ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
-                  uint8* dst_argb, int width);
-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width);
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
+void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
 
 // ARGB subtract images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
+void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int width);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
 
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
+void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   const uint32_t param,
+                                   int width);
 
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                int width);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-
-void I444ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 const uint8* src_a,
-                                 uint8* dst_argb,
+void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 const uint8_t* a_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I411ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
+void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
+void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
+void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
+void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
+void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
-                            uint8* dst_argb,
+void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
-                            uint8* dst_argb,
+void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
+void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                const uint8_t* a_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
 
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_yuy2, int width);
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_uyvy, int width);
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width);
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width);
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width);
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width);
+void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
 
 // Effects related row functions.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
+void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 
 // Inverse table for unattenuate, shared by C and SSE2.
-extern const uint32 fixed_invtbl8[256];
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+extern const uint32_t fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width);
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
                                  int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
                                  int width);
 
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 
-void ARGBSepiaRow_C(uint8* dst_argb, int width);
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
 
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width);
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              const int8_t* matrix_argb,
+                              int width);
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width);
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const int8_t* matrix_argb,
+                            int width);
 
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width);
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                           const uint8_t* table_argb,
+                           int width);
 
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width);
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+                          const uint8_t* table_argb,
+                          int width);
 
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width);
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width);
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width);
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width);
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+                         int scale,
+                         int interval_size,
+                         int interval_offset,
+                         int width);
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value);
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value);
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value);
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value);
 
 // Used for blur.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
+                                    int count);
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width);
 
-void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count);
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width);
 
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* src_dudv,
+                        int width);
 
 // Used for I420Scale, ARGBScale, and ARGBInterpolate.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                      ptrdiff_t src_stride_ptr,
-                      int width, int source_y_fraction);
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width,
+                      int source_y_fraction);
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction);
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction);
-void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction);
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
-                          int source_y_fraction);
-void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             ptrdiff_t src_stride_ptr,
+                             int width,
                              int source_y_fraction);
-void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              ptrdiff_t src_stride_ptr,
+                              int width,
                               int source_y_fraction);
-void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             ptrdiff_t src_stride_ptr,
+                             int width,
                              int source_y_fraction);
-void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
-                              int source_y_fraction);
+void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            ptrdiff_t src_stride_ptr,
+                            int width,
+                            int source_y_fraction);
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         ptrdiff_t src_stride_ptr,
-                         int width, int source_y_fraction);
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width,
+                         int source_y_fraction);
 
 // Sobel images.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width);
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width);
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width);
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width);
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width);
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb, const float* poly,
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width);
+void SobelXRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width);
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width);
+void SobelXRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   const uint8_t* src_y2,
+                   uint8_t* dst_sobelx,
+                   int width);
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width);
+void SobelYRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width);
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width);
+void SobelYRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   uint8_t* dst_sobely,
+                   int width);
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width);
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width);
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width);
+void SobelRow_MSA(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width);
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width);
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width);
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
                          int width);
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width);
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width);
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width);
+void SobelRow_Any_SSE2(const uint8_t* y_buf,
+                       const uint8_t* uv_buf,
+                       uint8_t* dst_ptr,
+                       int width);
+void SobelRow_Any_NEON(const uint8_t* y_buf,
+                       const uint8_t* uv_buf,
+                       uint8_t* dst_ptr,
+                       int width);
+void SobelRow_Any_MSA(const uint8_t* y_buf,
+                      const uint8_t* uv_buf,
+                      uint8_t* dst_ptr,
+                      int width);
+void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void SobelXYRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void SobelXYRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
+
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const float* poly,
+                         int width);
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width);
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width);
 
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff);
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+// Scale and convert to half float.
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloatRow_AVX2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloatRow_F16C(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_F16C(const uint16_t* src,
+                           uint16_t* dst,
+                           float scale,
+                           int width);
+void HalfFloat1Row_F16C(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
+void HalfFloat1Row_Any_F16C(const uint16_t* src,
+                            uint16_t* dst,
+                            float scale,
+                            int width);
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
+void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
+                            uint16_t* dst_ptr,
+                            float param,
+                            int width);
+void HalfFloatRow_MSA(const uint16_t* src,
+                      uint16_t* dst,
+                      float scale,
+                      int width);
+void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          float param,
+                          int width);
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
+                             float* dst_ptr,
+                             float param,
+                             int width);
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                 uint8_t* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff);
+                                 const uint8_t* luma,
+                                 uint32_t lumacoeff);
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+void ScaleSamples_C(const float* src, float* dst, float scale, int width);
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROW_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h
index 102158d1ab..b937d348ca 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_H_
 #define INCLUDE_LIBYUV_SCALE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,25 +20,33 @@ extern "C" {
 
 // Supported filtering.
 typedef enum FilterMode {
-  kFilterNone = 0,  // Point sample; Fastest.
-  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
   kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 3  // Highest quality.
+  kFilterBox = 3        // Highest quality.
 } FilterModeEnum;
 
 // Scale a YUV plane.
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering);
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                   int src_width, int src_height,
-                   uint16* dst, int dst_stride,
-                   int dst_width, int dst_height,
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
                    enum FilterMode filtering);
 
 // Scales a YUV 4:2:0 image from the src width and height to the
@@ -52,44 +60,64 @@ void ScalePlane_16(const uint16* src, int src_stride,
 // Returns 0 if successful.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering);
 
 #ifdef __cplusplus
 // Legacy API.  Deprecated.
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
           LIBYUV_BOOL interpolate);
 
-// Legacy API.  Deprecated.
-LIBYUV_API
-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
-                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate);
-
 // For testing, allow disabling of specialized scalers.
 LIBYUV_API
 void SetUseReferenceImpl(LIBYUV_BOOL use);
@@ -100,4 +128,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use);
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h
index b56cf52099..7641f18e34 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
 #define INCLUDE_LIBYUV_SCALE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -20,32 +20,52 @@ extern "C" {
 #endif
 
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 // Clipped scale takes destination rectangle coordinates for clip values.
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering);
 
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering);
 
 #ifdef __cplusplus
@@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h
index df699e6c22..7194ba09f8 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -19,17 +19,20 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
-
 // GCC >= 4.7.0 required for AVX2.
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -45,8 +48,8 @@ extern "C" {
 #endif  // __clang__
 
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
@@ -72,15 +75,16 @@ extern "C" {
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
 // The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
 #define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
 #endif
 
 // The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SCALEARGBCOLS_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
@@ -93,33 +97,51 @@ extern "C" {
 #define HAS_SCALEARGBFILTERCOLS_NEON
 #endif
 
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_DSPR2
-#define HAS_SCALEROWDOWN4_DSPR2
-#define HAS_SCALEROWDOWN34_DSPR2
-#define HAS_SCALEROWDOWN38_DSPR2
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEARGBCOLS_MSA
+#define HAS_SCALEARGBFILTERCOLS_MSA
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEFILTERCOLS_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN34_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEROWDOWN4_MSA
 #endif
 
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering);
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering);
 
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering);
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering);
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering);
 
 // Divide num by div and return as 16.16 fixed point result.
@@ -137,367 +159,786 @@ int FixedDiv1_X86(int num, int div);
 #endif
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy);
+                int* x,
+                int* y,
+                int* dx,
+                int* dy);
 
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width);
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx);
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx);
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int, int);
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int, int);
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                          int dst_width, int x, int dx);
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x, int dx);
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                            int dst_width, int x, int dx);
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
-                               ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx);
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx);
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int,
+                    int);
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int,
+                       int);
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx);
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width);
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_argb,
+                         int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx);
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x, int dx);
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int, int);
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x, int dx);
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int,
+                        int);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx);
 
 // Specialized scalers for x86.
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
 
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
-void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
 
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx);
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
 
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
 
 // ARGB Column functions
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx);
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx);
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                                  int dst_width, int x, int dx);
-void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                            int dst_width, int x, int dx);
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx);
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx);
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  int dst_width,
+                                  int x,
+                                  int dx);
+void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            int dst_width,
+                            int x,
+                            int dx);
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr,
+                                 const uint8_t* src_ptr,
+                                 int dst_width,
+                                 int x,
+                                 int dx);
+void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
 
 // ARGB Row functions
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst, int dst_width);
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
 
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
                                    int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
                                    int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
-                                      uint8* dst_argb, int dst_width);
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
 
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
 
 // Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 //  to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
+                         uint8_t* dst_ptr,
+                         int dst_width);
 // 32x3 -> 12x1
-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
-void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst, int dst_width);
-void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 // 32 -> 12
-void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
 // 32x3 -> 12x1
-void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 
-void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx);
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
 
-void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int x, int dx);
+void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              int dst_width,
+                              int x,
+                              int dx);
 
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width);
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width);
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width);
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h
index 0fbdc022d5..7022785d8c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1616
+#define LIBYUV_VERSION 1711
 
-#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h
index ad934e4241..bcef378b5a 100644
--- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h
+++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h
@@ -10,7 +10,7 @@
 
 // Common definitions for video, including fourcc and VideoFormat.
 
-#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
 #define INCLUDE_LIBYUV_VIDEO_COMMON_H_
 
 #include "libyuv/basic_types.h"
@@ -28,13 +28,13 @@ extern "C" {
 // Needs to be a macro otherwise the OS X compiler complains when the kFormat*
 // constants are used in a switch.
 #ifdef __cplusplus
-#define FOURCC(a, b, c, d) ( \
-    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
-    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#define FOURCC(a, b, c, d)                                        \
+  ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
+   (static_cast<uint32_t>(c) << 16) | (static_cast<uint32_t>(d) << 24))
 #else
-#define FOURCC(a, b, c, d) ( \
-    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
-    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#define FOURCC(a, b, c, d)                                     \
+  (((uint32_t)(a)) | ((uint32_t)(b) << 8) |       /* NOLINT */ \
+   ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */
 #endif
 
 // Some pages discussing FourCC codes:
@@ -53,38 +53,33 @@ enum FourCC {
   FOURCC_I420 = FOURCC('I', '4', '2', '0'),
   FOURCC_I422 = FOURCC('I', '4', '2', '2'),
   FOURCC_I444 = FOURCC('I', '4', '4', '4'),
-  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
   FOURCC_I400 = FOURCC('I', '4', '0', '0'),
   FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
   FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
   FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
   FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+  FOURCC_H010 = FOURCC('H', '0', '1', '0'),  // unofficial fourcc. 10 bit lsb
 
-  // 2 Secondary YUV formats: row biplanar.
+  // 1 Secondary YUV format: row biplanar.
   FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
 
-  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
   FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
   FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_AR30 = FOURCC('A', 'R', '3', '0'),  // 10 bit per channel. 2101010.
+  FOURCC_AB30 = FOURCC('A', 'B', '3', '0'),  // ABGR version of 10 bit
   FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
-  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
   FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
   FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
   FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
   FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
 
-  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
-  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
-  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
-  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
-  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
   // 1 Primary Compressed YUV format.
   FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
 
-  // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 7 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
   FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
   FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
   FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -112,7 +107,13 @@ enum FourCC {
   FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
   FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
 
-  // 1 Auxiliary compressed YUV format set aside for capturer.
+  // deprecated formats.  Not supported, but defined for backward compatibility.
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
   FOURCC_H264 = FOURCC('H', '2', '6', '4'),
 
   // Match any fourcc.
@@ -136,8 +137,10 @@ enum FourCCBpp {
   FOURCC_BPP_BGRA = 32,
   FOURCC_BPP_ABGR = 32,
   FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_AR30 = 32,
+  FOURCC_BPP_AB30 = 32,
   FOURCC_BPP_24BG = 24,
-  FOURCC_BPP_RAW  = 24,
+  FOURCC_BPP_RAW = 24,
   FOURCC_BPP_RGBP = 16,
   FOURCC_BPP_RGBO = 16,
   FOURCC_BPP_R444 = 16,
@@ -152,6 +155,7 @@ enum FourCCBpp {
   FOURCC_BPP_J420 = 12,
   FOURCC_BPP_J400 = 8,
   FOURCC_BPP_H420 = 12,
+  FOURCC_BPP_H010 = 24,
   FOURCC_BPP_MJPG = 0,  // 0 means unknown.
   FOURCC_BPP_H264 = 0,
   FOURCC_BPP_IYUV = 12,
@@ -170,15 +174,15 @@ enum FourCCBpp {
   FOURCC_BPP_CM24 = 24,
 
   // Match any fourcc.
-  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
+  FOURCC_BPP_ANY = 0,  // 0 means unknown.
 };
 
 // Converts fourcc aliases into canonical ones.
-LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare.cc
index e3846bdfdd..50e3abd055 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/compare.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare.cc
@@ -29,10 +29,10 @@ extern "C" {
 
 // hash seed of 5381 recommended.
 LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
   const int kBlockSize = 1 << 15;  // 32768;
   int remainder;
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
+  uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
       HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
   if (TestCpuFlag(kCpuHasSSE41)) {
@@ -45,25 +45,25 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
   }
 #endif
 
-  while (count >= (uint64)(kBlockSize)) {
+  while (count >= (uint64_t)(kBlockSize)) {
     seed = HashDjb2_SSE(src, kBlockSize, seed);
     src += kBlockSize;
     count -= kBlockSize;
   }
-  remainder = (int)(count) & ~15;
+  remainder = (int)count & ~15;
   if (remainder) {
     seed = HashDjb2_SSE(src, remainder, seed);
     src += remainder;
     count -= remainder;
   }
-  remainder = (int)(count) & 15;
+  remainder = (int)count & 15;
   if (remainder) {
     seed = HashDjb2_C(src, remainder, seed);
   }
   return seed;
 }
 
-static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
@@ -94,8 +94,11 @@ static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
-  uint32 fourcc = 0;
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height) {
+  uint32_t fourcc = 0;
   int h;
 
   // Coalesce rows.
@@ -111,19 +114,80 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
   return fourcc;
 }
 
+// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
+// So actual maximum is 1 less loop, which is 64436 - 32 bytes.
+
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  const int kSimdSize = 64;
+  // SIMD for multiple of 64, and C for remainder
+  int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
+  uint64_t diff = 0;
+  int i;
+  uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
+                              int count) = HammingDistance_C;
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HammingDistance = HammingDistance_NEON;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    HammingDistance = HammingDistance_SSSE3;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSE42)
+  if (TestCpuFlag(kCpuHasSSE42)) {
+    HammingDistance = HammingDistance_SSE42;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HammingDistance = HammingDistance_AVX2;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    HammingDistance = HammingDistance_MSA;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : diff)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    diff += HammingDistance(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & (kSimdSize - 1);
+  if (remainder) {
+    diff += HammingDistance_C(src_a, src_b, remainder);
+  }
+  return diff;
+}
+
 // TODO(fbarchard): Refactor into row function.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
-                             int count) {
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
   // SumSquareError returns values 0 to 65535 for each squared difference.
-  // Up to 65536 of those can be summed and remain within a uint32.
-  // After each block of 65536 pixels, accumulate into a uint64.
+  // Up to 65536 of those can be summed and remain within a uint32_t.
+  // After each block of 65536 pixels, accumulate into a uint64_t.
   const int kBlockSize = 65536;
   int remainder = count & (kBlockSize - 1) & ~31;
-  uint64 sse = 0;
+  uint64_t sse = 0;
   int i;
-  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
-      SumSquareError_C;
+  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+                             int count) = SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
@@ -141,8 +205,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
     SumSquareError = SumSquareError_AVX2;
   }
 #endif
+#if defined(HAS_SUMSQUAREERROR_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SumSquareError = SumSquareError_MSA;
+  }
+#endif
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
+#pragma omp parallel for reduction(+ : sse)
 #endif
   for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
     sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
@@ -162,14 +231,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
 }
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height) {
-  uint64 sse = 0;
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height) {
+  uint64_t sse = 0;
   int h;
   // Coalesce rows.
-  if (stride_a == width &&
-      stride_b == width) {
+  if (stride_a == width && stride_b == width) {
     width *= height;
     height = 1;
     stride_a = stride_b = 0;
@@ -183,66 +254,76 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
 }
 
 LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
   double psnr;
   if (sse > 0) {
-    double mse = (double)(count) / (double)(sse);
+    double mse = (double)count / (double)sse;
     psnr = 10.0 * log10(255.0 * 255.0 * mse);
   } else {
-    psnr = kMaxPsnr;      // Limit to prevent divide by 0
+    psnr = kMaxPsnr;  // Limit to prevent divide by 0
   }
 
-  if (psnr > kMaxPsnr)
+  if (psnr > kMaxPsnr) {
     psnr = kMaxPsnr;
+  }
 
   return psnr;
 }
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
-  const uint64 samples = width * height;
-  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
-                                                src_b, stride_b,
-                                                width, height);
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
+  const uint64_t samples = (uint64_t)width * (uint64_t)height;
+  const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+                                                  stride_b, width, height);
   return SumSquareErrorToPsnr(sse, samples);
 }
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
-                                                  src_y_b, stride_y_b,
-                                                  width, height);
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const uint64_t sse_y = ComputeSumSquareErrorPlane(
+      src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
-                                                  src_u_b, stride_u_b,
-                                                  width_uv, height_uv);
-  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
-                                                  src_v_b, stride_v_b,
-                                                  width_uv, height_uv);
-  const uint64 samples = width * height + 2 * (width_uv * height_uv);
-  const uint64 sse = sse_y + sse_u + sse_v;
+  const uint64_t sse_u = ComputeSumSquareErrorPlane(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+  const uint64_t sse_v = ComputeSumSquareErrorPlane(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
+  const uint64_t samples = (uint64_t)width * (uint64_t)height +
+                           2 * ((uint64_t)width_uv * (uint64_t)height_uv);
+  const uint64_t sse = sse_y + sse_u + sse_v;
   return SumSquareErrorToPsnr(sse, samples);
 }
 
-static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
-static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634;   // (64^2*(.01*255)^2
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2
 
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
-                        const uint8* src_b, int stride_b) {
-  int64 sum_a = 0;
-  int64 sum_b = 0;
-  int64 sum_sq_a = 0;
-  int64 sum_sq_b = 0;
-  int64 sum_axb = 0;
+static double Ssim8x8_C(const uint8_t* src_a,
+                        int stride_a,
+                        const uint8_t* src_b,
+                        int stride_b) {
+  int64_t sum_a = 0;
+  int64_t sum_b = 0;
+  int64_t sum_sq_a = 0;
+  int64_t sum_sq_b = 0;
+  int64_t sum_axb = 0;
 
   int i;
   for (i = 0; i < 8; ++i) {
@@ -260,22 +341,22 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
   }
 
   {
-    const int64 count = 64;
+    const int64_t count = 64;
     // scale the constants by number of pixels
-    const int64 c1 = (cc1 * count * count) >> 12;
-    const int64 c2 = (cc2 * count * count) >> 12;
+    const int64_t c1 = (cc1 * count * count) >> 12;
+    const int64_t c2 = (cc2 * count * count) >> 12;
 
-    const int64 sum_a_x_sum_b = sum_a * sum_b;
+    const int64_t sum_a_x_sum_b = sum_a * sum_b;
 
-    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
-                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+    const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) *
+                           (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
 
-    const int64 sum_a_sq = sum_a*sum_a;
-    const int64 sum_b_sq = sum_b*sum_b;
+    const int64_t sum_a_sq = sum_a * sum_a;
+    const int64_t sum_b_sq = sum_b * sum_b;
 
-    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
-                         (count * sum_sq_a - sum_a_sq +
-                          count * sum_sq_b - sum_b_sq + c2);
+    const int64_t ssim_d =
+        (sum_a_sq + sum_b_sq + c1) *
+        (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
 
     if (ssim_d == 0.0) {
       return DBL_MAX;
@@ -288,13 +369,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a,
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
   int samples = 0;
   double ssim_total = 0;
-  double (*Ssim8x8)(const uint8* src_a, int stride_a,
-                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+  double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b,
+                    int stride_b) = Ssim8x8_C;
 
   // sample point start with each 4x4 location
   int i;
@@ -314,22 +398,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a,
 }
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
-                                      src_y_b, stride_y_b, width, height);
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const double ssim_y =
+      CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
-                                      src_u_b, stride_u_b,
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
                                       width_uv, height_uv);
-  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
-                                      src_v_b, stride_v_b,
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
                                       width_uv, height_uv);
   return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
 }
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc
index 42fc589354..d4b170ad98 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc
@@ -17,20 +17,80 @@ namespace libyuv {
 extern "C" {
 #endif
 
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse = 0u;
+#if ORIGINAL_OPT
+uint32_t HammingDistance_C1(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count; ++i) {
+    int x = src_a[i] ^ src_b[i];
+    if (x & 1)
+      ++diff;
+    if (x & 2)
+      ++diff;
+    if (x & 4)
+      ++diff;
+    if (x & 8)
+      ++diff;
+    if (x & 16)
+      ++diff;
+    if (x & 32)
+      ++diff;
+    if (x & 64)
+      ++diff;
+    if (x & 128)
+      ++diff;
+  }
+  return diff;
+}
+#endif
+
+// Hakmem method for hamming distance.
+uint32_t HammingDistance_C(const uint8_t* src_a,
+                           const uint8_t* src_b,
+                           int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b);
+    uint32_t u = x - ((x >> 1) & 0x55555555);
+    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
+    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
+    src_a += 4;
+    src_b += 4;
+  }
+
+  for (; i < count; ++i) {
+    uint32_t x = *src_a ^ *src_b;
+    uint32_t u = x - ((x >> 1) & 0x55);
+    u = ((u >> 2) & 0x33) + (u & 0x33);
+    diff += (u + (u >> 4)) & 0x0f;
+    src_a += 1;
+    src_b += 1;
+  }
+
+  return diff;
+}
+
+uint32_t SumSquareError_C(const uint8_t* src_a,
+                          const uint8_t* src_b,
+                          int count) {
+  uint32_t sse = 0u;
   int i;
   for (i = 0; i < count; ++i) {
     int diff = src_a[i] - src_b[i];
-    sse += (uint32)(diff * diff);
+    sse += (uint32_t)(diff * diff);
   }
   return sse;
 }
 
 // hash seed of 5381 recommended.
 // Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
   int i;
   for (i = 0; i < count; ++i) {
     hash += (hash << 5) + src[i];
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc
index 1b83edb166..676527c1b1 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc
@@ -22,124 +22,334 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse;
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10, 1) ",%1          \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psubusb   %%xmm2,%%xmm1                   \n"
-    "psubusb   %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpckhbw %%xmm5,%%xmm2                   \n"
-    "pmaddwd   %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm2,%%xmm2                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint64_t diff = 0u;
 
-    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0,%3                       \n"
+  asm volatile(
+      "xor        %3,%3                          \n"
+      "xor        %%r8,%%r8                      \n"
+      "xor        %%r9,%%r9                      \n"
+      "xor        %%r10,%%r10                    \n"
 
-  : "+r"(src_a),      // %0
-    "+r"(src_b),      // %1
-    "+r"(count),      // %2
-    "=g"(sse)         // %3
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // Process 32 bytes per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "mov        (%0),%%rcx                     \n"
+      "mov        0x8(%0),%%rdx                  \n"
+      "xor        (%1),%%rcx                     \n"
+      "xor        0x8(%1),%%rdx                  \n"
+      "popcnt     %%rcx,%%rcx                    \n"
+      "popcnt     %%rdx,%%rdx                    \n"
+      "mov        0x10(%0),%%rsi                 \n"
+      "mov        0x18(%0),%%rdi                 \n"
+      "xor        0x10(%1),%%rsi                 \n"
+      "xor        0x18(%1),%%rdi                 \n"
+      "popcnt     %%rsi,%%rsi                    \n"
+      "popcnt     %%rdi,%%rdi                    \n"
+      "add        $0x20,%0                       \n"
+      "add        $0x20,%1                       \n"
+      "add        %%rcx,%3                       \n"
+      "add        %%rdx,%%r8                     \n"
+      "add        %%rsi,%%r9                     \n"
+      "add        %%rdi,%%r10                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+
+      "add        %%r8, %3                       \n"
+      "add        %%r9, %3                       \n"
+      "add        %%r10, %3                      \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=r"(diff)    // %3
+      :
+      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+  return static_cast<uint32_t>(diff);
+}
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      // Process 16 bytes per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "mov        (%0),%%ecx                     \n"
+      "mov        0x4(%0),%%edx                  \n"
+      "xor        (%1),%%ecx                     \n"
+      "xor        0x4(%1),%%edx                  \n"
+      "popcnt     %%ecx,%%ecx                    \n"
+      "add        %%ecx,%3                       \n"
+      "popcnt     %%edx,%%edx                    \n"
+      "add        %%edx,%3                       \n"
+      "mov        0x8(%0),%%ecx                  \n"
+      "mov        0xc(%0),%%edx                  \n"
+      "xor        0x8(%1),%%ecx                  \n"
+      "xor        0xc(%1),%%edx                  \n"
+      "popcnt     %%ecx,%%ecx                    \n"
+      "add        %%ecx,%3                       \n"
+      "popcnt     %%edx,%%edx                    \n"
+      "add        %%edx,%3                       \n"
+      "add        $0x10,%0                       \n"
+      "add        $0x10,%1                       \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "+r"(diff)    // %3
+      :
+      : "memory", "cc", "ecx", "edx");
+
+  return diff;
+}
+#endif
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+                                 15, 15, 15, 15, 15, 15, 15, 15};
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      "movdqa     %4,%%xmm2                      \n"
+      "movdqa     %5,%%xmm3                      \n"
+      "pxor       %%xmm0,%%xmm0                  \n"
+      "pxor       %%xmm1,%%xmm1                  \n"
+      "sub        %0,%1                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqa     (%0),%%xmm4                    \n"
+      "movdqa     0x10(%0), %%xmm5               \n"
+      "pxor       (%0,%1), %%xmm4                \n"
+      "movdqa     %%xmm4,%%xmm6                  \n"
+      "pand       %%xmm2,%%xmm6                  \n"
+      "psrlw      $0x4,%%xmm4                    \n"
+      "movdqa     %%xmm3,%%xmm7                  \n"
+      "pshufb     %%xmm6,%%xmm7                  \n"
+      "pand       %%xmm2,%%xmm4                  \n"
+      "movdqa     %%xmm3,%%xmm6                  \n"
+      "pshufb     %%xmm4,%%xmm6                  \n"
+      "paddb      %%xmm7,%%xmm6                  \n"
+      "pxor       0x10(%0,%1),%%xmm5             \n"
+      "add        $0x20,%0                       \n"
+      "movdqa     %%xmm5,%%xmm4                  \n"
+      "pand       %%xmm2,%%xmm5                  \n"
+      "psrlw      $0x4,%%xmm4                    \n"
+      "movdqa     %%xmm3,%%xmm7                  \n"
+      "pshufb     %%xmm5,%%xmm7                  \n"
+      "pand       %%xmm2,%%xmm4                  \n"
+      "movdqa     %%xmm3,%%xmm5                  \n"
+      "pshufb     %%xmm4,%%xmm5                  \n"
+      "paddb      %%xmm7,%%xmm5                  \n"
+      "paddb      %%xmm5,%%xmm6                  \n"
+      "psadbw     %%xmm1,%%xmm6                  \n"
+      "paddd      %%xmm6,%%xmm0                  \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+
+      "pshufd     $0xaa,%%xmm0,%%xmm1            \n"
+      "paddd      %%xmm1,%%xmm0                  \n"
+      "movd       %%xmm0, %3                     \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=r"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+
+  return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      "vbroadcastf128 %4,%%ymm2                  \n"
+      "vbroadcastf128 %5,%%ymm3                  \n"
+      "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpxor      %%ymm1,%%ymm1,%%ymm1           \n"
+      "sub        %0,%1                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqa    (%0),%%ymm4                    \n"
+      "vmovdqa    0x20(%0), %%ymm5               \n"
+      "vpxor      (%0,%1), %%ymm4, %%ymm4        \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm6           \n"
+      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm6           \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
+      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
+      "vpaddb     %%ymm4,%%ymm6,%%ymm6           \n"
+      "vpxor      0x20(%0,%1),%%ymm5,%%ymm4      \n"
+      "add        $0x40,%0                       \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm5           \n"
+      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm5           \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
+      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
+      "vpaddb     %%ymm5,%%ymm4,%%ymm4           \n"
+      "vpaddb     %%ymm6,%%ymm4,%%ymm4           \n"
+      "vpsadbw    %%ymm1,%%ymm4,%%ymm4           \n"
+      "vpaddd     %%ymm0,%%ymm4,%%ymm0           \n"
+      "sub        $0x40,%2                       \n"
+      "jg         1b                             \n"
+
+      "vpermq     $0xb1,%%ymm0,%%ymm1            \n"
+      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xaa,%%ymm0,%%ymm1            \n"
+      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovd      %%xmm0, %3                     \n"
+      "vzeroupper                                \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=r"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+  return diff;
+}
+#endif  // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "pxor      %%xmm0,%%xmm0                   \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm2                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm1,%%xmm3                   \n"
+      "psubusb   %%xmm2,%%xmm1                   \n"
+      "psubusb   %%xmm3,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpckhbw %%xmm5,%%xmm2                   \n"
+      "pmaddwd   %%xmm1,%%xmm1                   \n"
+      "pmaddwd   %%xmm2,%%xmm2                   \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+
+      "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "movd      %%xmm0,%3                       \n"
+
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=g"(sse)     // %3
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
   return sse;
 }
 
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
+static const uvec32 kHashMul0 = {
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
-static uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+static const uvec32 kHashMul1 = {
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
-static uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+static const uvec32 kHashMul2 = {
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
-static uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+static const uvec32 kHashMul3 = {
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  uint32 hash;
-  asm volatile (
-    "movd      %2,%%xmm0                       \n"
-    "pxor      %%xmm7,%%xmm7                   \n"
-    "movdqa    %4,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "pmulld    %%xmm6,%%xmm0                   \n"
-    "movdqa    %5,%%xmm5                       \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm7,%%xmm3                   \n"
-    "pmulld    %%xmm5,%%xmm3                   \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpckhwd %%xmm7,%%xmm4                   \n"
-    "pmulld    %%xmm5,%%xmm4                   \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "punpckhbw %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm7,%%xmm2                   \n"
-    "pmulld    %%xmm5,%%xmm2                   \n"
-    "movdqa    %8,%%xmm5                       \n"
-    "punpckhwd %%xmm7,%%xmm1                   \n"
-    "pmulld    %%xmm5,%%xmm1                   \n"
-    "paddd     %%xmm4,%%xmm3                   \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm1                   \n"
-    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%1                        \n"
-    "jg        1b                              \n"
-    "movd      %%xmm0,%3                       \n"
-  : "+r"(src),        // %0
-    "+r"(count),      // %1
-    "+rm"(seed),      // %2
-    "=g"(hash)        // %3
-  : "m"(kHash16x33),  // %4
-    "m"(kHashMul0),   // %5
-    "m"(kHashMul1),   // %6
-    "m"(kHashMul2),   // %7
-    "m"(kHashMul3)    // %8
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash;
+  asm volatile(
+      "movd      %2,%%xmm0                       \n"
+      "pxor      %%xmm7,%%xmm7                   \n"
+      "movdqa    %4,%%xmm6                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "pmulld    %%xmm6,%%xmm0                   \n"
+      "movdqa    %5,%%xmm5                       \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "punpcklwd %%xmm7,%%xmm3                   \n"
+      "pmulld    %%xmm5,%%xmm3                   \n"
+      "movdqa    %6,%%xmm5                       \n"
+      "movdqa    %%xmm2,%%xmm4                   \n"
+      "punpckhwd %%xmm7,%%xmm4                   \n"
+      "pmulld    %%xmm5,%%xmm4                   \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "punpckhbw %%xmm7,%%xmm1                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklwd %%xmm7,%%xmm2                   \n"
+      "pmulld    %%xmm5,%%xmm2                   \n"
+      "movdqa    %8,%%xmm5                       \n"
+      "punpckhwd %%xmm7,%%xmm1                   \n"
+      "pmulld    %%xmm5,%%xmm1                   \n"
+      "paddd     %%xmm4,%%xmm3                   \n"
+      "paddd     %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm1                   \n"
+      "pshufd    $0xe,%%xmm1,%%xmm2              \n"
+      "paddd     %%xmm2,%%xmm1                   \n"
+      "pshufd    $0x1,%%xmm1,%%xmm2              \n"
+      "paddd     %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "sub       $0x10,%1                        \n"
+      "jg        1b                              \n"
+      "movd      %%xmm0,%3                       \n"
+      : "+r"(src),        // %0
+        "+r"(count),      // %1
+        "+rm"(seed),      // %2
+        "=g"(hash)        // %3
+      : "m"(kHash16x33),  // %4
+        "m"(kHashMul0),   // %5
+        "m"(kHashMul1),   // %6
+        "m"(kHashMul2),   // %7
+        "m"(kHashMul3)    // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
   return hash;
 }
 #endif  // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
@@ -148,4 +358,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc
new file mode 100644
index 0000000000..0b807d37be
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2017 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32_t HammingDistance_MSA(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t diff = 0u;
+  int i;
+  v16u8 src0, src1, src2, src3;
+  v2i64 vec0 = {0}, vec1 = {0};
+
+  for (i = 0; i < count; i += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+    src0 ^= src2;
+    src1 ^= src3;
+    vec0 += __msa_pcnt_d((v2i64)src0);
+    vec1 += __msa_pcnt_d((v2i64)src1);
+    src_a += 32;
+    src_b += 32;
+  }
+
+  vec0 += vec1;
+  diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
+  diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
+  return diff;
+}
+
+uint32_t SumSquareError_MSA(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count) {
+  uint32_t sse = 0u;
+  int i;
+  v16u8 src0, src1, src2, src3;
+  v8i16 vec0, vec1, vec2, vec3;
+  v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
+  v2i64 tmp0;
+
+  for (i = 0; i < count; i += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+    reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
+    reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
+    reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
+    reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
+    src_a += 32;
+    src_b += 32;
+  }
+
+  reg0 += reg1;
+  reg2 += reg3;
+  reg0 += reg2;
+  tmp0 = __msa_hadd_s_d(reg0, reg0);
+  sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
+  sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
+  return sse;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc
index 49aa3b4eef..2a2181e0cb 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc
@@ -21,40 +21,70 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "vmov.u8    q8, #0                         \n"
-    "vmov.u8    q10, #0                        \n"
-    "vmov.u8    q9, #0                         \n"
-    "vmov.u8    q11, #0                        \n"
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"
-    "subs       %2, %2, #16                    \n"
-    "vsubl.u8   q2, d0, d2                     \n"
-    "vsubl.u8   q3, d1, d3                     \n"
-    "vmlal.s16  q8, d4, d4                     \n"
-    "vmlal.s16  q9, d6, d6                     \n"
-    "vmlal.s16  q10, d5, d5                    \n"
-    "vmlal.s16  q11, d7, d7                    \n"
-    "bgt        1b                             \n"
+  asm volatile(
+      "vmov.u16   q4, #0                         \n"  // accumulator
 
-    "vadd.u32   q8, q8, q9                     \n"
-    "vadd.u32   q10, q10, q11                  \n"
-    "vadd.u32   q11, q8, q10                   \n"
-    "vpaddl.u32 q1, q11                        \n"
-    "vadd.u64   d0, d2, d3                     \n"
-    "vmov.32    %3, d0[0]                      \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+      "1:                                        \n"
+      "vld1.8     {q0, q1}, [%0]!                \n"
+      "vld1.8     {q2, q3}, [%1]!                \n"
+      "veor.32    q0, q0, q2                     \n"
+      "veor.32    q1, q1, q3                     \n"
+      "vcnt.i8    q0, q0                         \n"
+      "vcnt.i8    q1, q1                         \n"
+      "subs       %2, %2, #32                    \n"
+      "vadd.u8    q0, q0, q1                     \n"  // 16 byte counts
+      "vpadal.u8  q4, q0                         \n"  // 8 shorts
+      "bgt        1b                             \n"
+
+      "vpaddl.u16 q0, q4                         \n"  // 4 ints
+      "vpadd.u32  d0, d0, d1                     \n"
+      "vpadd.u32  d0, d0, d0                     \n"
+      "vmov.32    %3, d0[0]                      \n"
+
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "cc", "q0", "q1", "q2", "q3", "q4");
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "vmov.u8    q8, #0                         \n"
+      "vmov.u8    q10, #0                        \n"
+      "vmov.u8    q9, #0                         \n"
+      "vmov.u8    q11, #0                        \n"
+
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"
+      "vld1.8     {q1}, [%1]!                    \n"
+      "subs       %2, %2, #16                    \n"
+      "vsubl.u8   q2, d0, d2                     \n"
+      "vsubl.u8   q3, d1, d3                     \n"
+      "vmlal.s16  q8, d4, d4                     \n"
+      "vmlal.s16  q9, d6, d6                     \n"
+      "vmlal.s16  q10, d5, d5                    \n"
+      "vmlal.s16  q11, d7, d7                    \n"
+      "bgt        1b                             \n"
+
+      "vadd.u32   q8, q8, q9                     \n"
+      "vadd.u32   q10, q10, q11                  \n"
+      "vadd.u32   q11, q8, q10                   \n"
+      "vpaddl.u32 q1, q11                        \n"
+      "vadd.u64   d0, d2, d3                     \n"
+      "vmov.32    %3, d0[0]                      \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
   return sse;
 }
 
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc
index f9c7df98c8..6e8f672ab7 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc
@@ -20,39 +20,65 @@ extern "C" {
 
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "eor        v16.16b, v16.16b, v16.16b      \n"
-    "eor        v18.16b, v18.16b, v18.16b      \n"
-    "eor        v17.16b, v17.16b, v17.16b      \n"
-    "eor        v19.16b, v19.16b, v19.16b      \n"
+// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
+  asm volatile(
+      "movi       v4.8h, #0                      \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"
-    "subs       %w2, %w2, #16                  \n"
-    "usubl      v2.8h, v0.8b, v1.8b            \n"
-    "usubl2     v3.8h, v0.16b, v1.16b          \n"
-    "smlal      v16.4s, v2.4h, v2.4h           \n"
-    "smlal      v17.4s, v3.4h, v3.4h           \n"
-    "smlal2     v18.4s, v2.8h, v2.8h           \n"
-    "smlal2     v19.4s, v3.8h, v3.8h           \n"
-    "b.gt       1b                             \n"
+      "1:                                        \n"
+      "ld1        {v0.16b, v1.16b}, [%0], #32    \n"
+      "ld1        {v2.16b, v3.16b}, [%1], #32    \n"
+      "eor        v0.16b, v0.16b, v2.16b         \n"
+      "eor        v1.16b, v1.16b, v3.16b         \n"
+      "cnt        v0.16b, v0.16b                 \n"
+      "cnt        v1.16b, v1.16b                 \n"
+      "subs       %w2, %w2, #32                  \n"
+      "add        v0.16b, v0.16b, v1.16b         \n"
+      "uadalp     v4.8h, v0.16b                  \n"
+      "b.gt       1b                             \n"
 
-    "add        v16.4s, v16.4s, v17.4s         \n"
-    "add        v18.4s, v18.4s, v19.4s         \n"
-    "add        v19.4s, v16.4s, v18.4s         \n"
-    "addv       s0, v19.4s                     \n"
-    "fmov       %w3, s0                        \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+      "uaddlv     s4, v4.8h                      \n"
+      "fmov       %w3, s4                        \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v4");
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "eor        v16.16b, v16.16b, v16.16b      \n"
+      "eor        v18.16b, v18.16b, v18.16b      \n"
+      "eor        v17.16b, v17.16b, v17.16b      \n"
+      "eor        v19.16b, v19.16b, v19.16b      \n"
+
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"
+      "ld1        {v1.16b}, [%1], #16            \n"
+      "subs       %w2, %w2, #16                  \n"
+      "usubl      v2.8h, v0.8b, v1.8b            \n"
+      "usubl2     v3.8h, v0.16b, v1.16b          \n"
+      "smlal      v16.4s, v2.4h, v2.4h           \n"
+      "smlal      v17.4s, v3.4h, v3.4h           \n"
+      "smlal2     v18.4s, v2.8h, v2.8h           \n"
+      "smlal2     v19.4s, v3.8h, v3.8h           \n"
+      "b.gt       1b                             \n"
+
+      "add        v16.4s, v16.4s, v17.4s         \n"
+      "add        v18.4s, v18.4s, v19.4s         \n"
+      "add        v19.4s, v16.4s, v18.4s         \n"
+      "addv       s0, v19.4s                     \n"
+      "fmov       %w3, s0                        \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
   return sse;
 }
 
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc
index dc86fe25b1..d57d3d9d1c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc
@@ -13,20 +13,39 @@
 #include "libyuv/compare_row.h"
 #include "libyuv/row.h"
 
+#if defined(_MSC_VER)
+#include <intrin.h>  // For __popcnt
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
 // This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked)
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
+    src_a += 4;
+    src_b += 4;
+    diff += __popcnt(x);
+  }
+  return diff;
+}
+
+__declspec(naked) uint32_t
+    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
@@ -61,13 +80,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked)
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+#pragma warning(disable : 4752)
+__declspec(naked) uint32_t
+    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     vpxor      ymm0, ymm0, ymm0  // sum
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
@@ -101,65 +120,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
 }
 #endif  // _MSC_VER >= 1700
 
-uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
 uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
 uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
 uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
 uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-__declspec(naked)
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     movd       xmm0, [esp + 12]  // seed
 
-    pxor       xmm7, xmm7        // constant 0 for unpck
+    pxor       xmm7, xmm7  // constant 0 for unpck
     movdqa     xmm6, xmmword ptr kHash16x33
 
   wloop:
-    movdqu     xmm1, [eax]       // src[0-15]
+    movdqu     xmm1, [eax]  // src[0-15]
     lea        eax, [eax + 16]
-    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
     movdqa     xmm5, xmmword ptr kHashMul0
     movdqa     xmm2, xmm1
-    punpcklbw  xmm2, xmm7        // src[0-7]
+    punpcklbw  xmm2, xmm7  // src[0-7]
     movdqa     xmm3, xmm2
-    punpcklwd  xmm3, xmm7        // src[0-3]
+    punpcklwd  xmm3, xmm7  // src[0-3]
     pmulld     xmm3, xmm5
     movdqa     xmm5, xmmword ptr kHashMul1
     movdqa     xmm4, xmm2
-    punpckhwd  xmm4, xmm7        // src[4-7]
+    punpckhwd  xmm4, xmm7  // src[4-7]
     pmulld     xmm4, xmm5
     movdqa     xmm5, xmmword ptr kHashMul2
-    punpckhbw  xmm1, xmm7        // src[8-15]
+    punpckhbw  xmm1, xmm7  // src[8-15]
     movdqa     xmm2, xmm1
-    punpcklwd  xmm2, xmm7        // src[8-11]
+    punpcklwd  xmm2, xmm7  // src[8-11]
     pmulld     xmm2, xmm5
     movdqa     xmm5, xmmword ptr kHashMul3
-    punpckhwd  xmm1, xmm7        // src[12-15]
+    punpckhwd  xmm1, xmm7  // src[12-15]
     pmulld     xmm1, xmm5
-    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm3, xmm4  // add 16 results
     paddd      xmm1, xmm2
     paddd      xmm1, xmm3
 
@@ -171,18 +190,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     sub        ecx, 16
     jg         wloop
 
-    movd       eax, xmm0         // return hash
+    movd       eax, xmm0  // return hash
     ret
   }
 }
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked)
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     vmovd      xmm0, [esp + 12]  // seed
 
   wloop:
@@ -196,7 +215,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
     lea        eax, [eax + 16]
     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
-    vpaddd     xmm3, xmm3, xmm4        // add 16 results
+    vpaddd     xmm3, xmm3, xmm4  // add 16 results
     vpaddd     xmm1, xmm1, xmm2
     vpaddd     xmm1, xmm1, xmm3
     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
@@ -207,7 +226,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
     sub        ecx, 16
     jg         wloop
 
-    vmovd      eax, xmm0         // return hash
+    vmovd      eax, xmm0  // return hash
     vzeroupper
     ret
   }
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert.cc
index a33742d24d..375cc732c1 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert.cc
@@ -14,8 +14,8 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
-#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/row.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -28,14 +28,22 @@ static __inline int Abs(int v) {
 }
 
 // Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int src_uv_width, int src_uv_height) {
+static int I4xxToI420(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int src_uv_width,
+                      int src_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
@@ -44,35 +52,37 @@ static int I4xxToI420(const uint8* src_y, int src_stride_y,
     return -1;
   }
   if (dst_y) {
-    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-               dst_y, dst_stride_y, dst_y_width, dst_y_height,
-               kFilterBilinear);
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
   }
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
   return 0;
 }
 
-// Copy I420 with optional flipping
+// Copy I420 with optional flipping.
 // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
 // is does row coalescing.
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -96,79 +106,152 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Copy I010 with optional flipping.
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Copy UV planes.
+  CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
+// Convert 10 bit YUV to 8 bit.
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane.
+  Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
+                    height);
+  // Convert UV planes.
+  Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
+                    halfheight);
+  Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
+                    halfheight);
+  return 0;
+}
+
 // 422 chroma is 1/2 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int src_uv_width = SUBSAMPLE(width, 1, 1);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, src_uv_width, height);
 }
 
 // 444 chroma is 1x width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    width, height);
-}
-
-// 411 chroma is 1/4 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 3, 2);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, width, height);
 }
 
 // I400 is greyscale typically used in MJPG
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -186,11 +269,15 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride,
-                       int width, int height) {
+static void CopyPlane2(const uint8_t* src,
+                       int src_stride_0,
+                       int src_stride_1,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width,
+                       int height) {
   int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -211,11 +298,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height - 1; y += 2) {
@@ -238,17 +320,22 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
 // src_stride_m420 is row planar. Normally this will be the width in pixels.
 //   The UV plane is half width, but 2 values, so src_stride_m420 applies to
 //   this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
-                      int src_stride_y0, int src_stride_y1,
-                      const uint8* src_uv, int src_stride_uv,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int width, int height) {
+static int X420ToI420(const uint8_t* src_y,
+                      int src_stride_y0,
+                      int src_stride_y1,
+                      const uint8_t* src_uv,
+                      int src_stride_uv,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int width,
+                      int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_uv || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -265,16 +352,14 @@ static int X420ToI420(const uint8* src_y,
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
+  if (src_stride_y0 == width && src_stride_y1 == width &&
       dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
   }
   // Coalesce rows.
-  if (src_stride_uv == halfwidth * 2 &&
-      dst_stride_u == halfwidth &&
+  if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
       dst_stride_v == halfwidth) {
     halfwidth *= halfheight;
     halfheight = 1;
@@ -299,63 +384,78 @@ static int X420ToI420(const uint8* src_y,
 
 // Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_uv, src_stride_uv,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height);
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                    dst_stride_v, width, height);
 }
 
 // Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_vu, src_stride_vu,
-                    dst_y, dst_stride_y,
-                    dst_v, dst_stride_v,
-                    dst_u, dst_stride_u,
-                    width, height);
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
+                    dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
+                    dst_stride_u, width, height);
 }
 
 // Convert M420 to I420.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int M420ToI420(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
+                    dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                     width, height);
 }
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-      uint8* dst_y, int width) = YUY2ToYRow_C;
+  void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -392,6 +492,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUVRow = YUY2ToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -411,16 +521,22 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-      uint8* dst_y, int width) = UYVYToYRow_C;
+  void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -457,6 +573,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -476,19 +602,23 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
 
 // Convert ARGB to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -533,6 +663,22 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -552,19 +698,23 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
 
 // Convert BGRA to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+  void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
       BGRAToYRow_C;
-  if (!src_bgra ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -592,12 +742,28 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
   }
 #endif
 #if defined(HAS_BGRATOUVROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      BGRAToUVRow = BGRAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        BGRAToUVRow = BGRAToUVRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON;
     }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToYRow = BGRAToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToUVRow = BGRAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_MSA;
+    }
+  }
 #endif
 
   for (y = 0; y < height - 1; y += 2) {
@@ -618,19 +784,23 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
 
 // Convert ABGR to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
       ABGRToYRow_C;
-  if (!src_abgr ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -665,6 +835,22 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -684,19 +870,23 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
 
 // Convert RGBA to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+  void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
       RGBAToYRow_C;
-  if (!src_rgba ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -731,6 +921,22 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
     }
   }
 #endif
+#if defined(HAS_RGBATOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToYRow = RGBAToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToUVRow = RGBAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -750,27 +956,33 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
 
 // Convert RGB24 to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height) {
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
   int y;
-#if defined(HAS_RGB24TOYROW_NEON)
-  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+  void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
       RGB24ToYRow_C;
 #else
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -792,6 +1004,15 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       }
     }
   }
+#elif defined(HAS_RGB24TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+    RGB24ToYRow = RGB24ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYRow = RGB24ToYRow_MSA;
+      RGB24ToUVRow = RGB24ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RGB24 to ARGB.
 #else
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -822,14 +1043,17 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     }
   }
 #endif
+#endif
+
   {
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
       RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
       RGB24ToYRow(src_rgb24, dst_y, width);
       RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -846,7 +1070,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
       RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
       RGB24ToYRow(src_rgb24, dst_y, width);
 #else
@@ -855,36 +1079,41 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RGB24TOYROW_NEON)
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert RAW to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
-#if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+  void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+                     uint8_t* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
       RAWToYRow_C;
 #else
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_raw || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -906,6 +1135,15 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       }
     }
   }
+#elif defined(HAS_RAWTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToUVRow = RAWToUVRow_Any_MSA;
+    RAWToYRow = RAWToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYRow = RAWToYRow_MSA;
+      RAWToUVRow = RAWToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RAW to ARGB.
 #else
 #if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -936,14 +1174,17 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     }
   }
 #endif
+#endif
+
   {
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
       RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
       RAWToYRow(src_raw, dst_y, width);
       RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -960,7 +1201,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
       RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
       RAWToYRow(src_raw, dst_y, width);
 #else
@@ -969,36 +1210,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RAWTOYROW_NEON)
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert RGB565 to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height) {
   int y;
-#if defined(HAS_RGB565TOYROW_NEON)
-  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+  void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+                        uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
       RGB565ToYRow_C;
 #else
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RGB565ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1020,6 +1267,15 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       }
     }
   }
+#elif defined(HAS_RGB565TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+    RGB565ToYRow = RGB565ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToYRow = RGB565ToYRow_MSA;
+      RGB565ToUVRow = RGB565ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RGB565 to ARGB.
 #else
 #if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1057,15 +1313,16 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
+#endif
 #endif
   {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
-
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
       RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
       RGB565ToYRow(src_rgb565, dst_y, width);
       RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -1082,7 +1339,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
       RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
       RGB565ToYRow(src_rgb565, dst_y, width);
 #else
@@ -1091,36 +1348,43 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RGB565TOYROW_NEON)
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert ARGB1555 to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
-  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
-      ARGB1555ToYRow_C;
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+  void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+                          uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+                         int width) = ARGB1555ToYRow_C;
 #else
-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      ARGB1555ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1142,6 +1406,15 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       }
     }
   }
+#elif defined(HAS_ARGB1555TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+      ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from ARGB1555 to ARGB.
 #else
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -1179,15 +1452,17 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
+#endif
 #endif
   {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
       ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
       ARGB1555ToYRow(src_argb1555, dst_y, width);
       ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -1206,7 +1481,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
       ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
       ARGB1555ToYRow(src_argb1555, dst_y, width);
 #else
@@ -1215,36 +1490,43 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert ARGB4444 to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
 #if defined(HAS_ARGB4444TOYROW_NEON)
-  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
-      ARGB4444ToYRow_C;
+  void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+                          uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+                         int width) = ARGB4444ToYRow_C;
 #else
-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      ARGB4444ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1284,6 +1566,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1304,7 +1594,22 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUVRow = ARGBToUVRow_MSA;
+      }
+    }
+  }
+#endif
+#endif
+
   {
+#if !defined(HAS_ARGB4444TOYROW_NEON)
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
@@ -1341,13 +1646,15 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
     }
 #if !defined(HAS_ARGB4444TOYROW_NEON)
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
-static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv,
-                        uint8* dst_u, int width) {
+static void SplitPixels(const uint8_t* src_u,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_u,
+                        int width) {
   int i;
   for (i = 0; i < width; ++i) {
     *dst_u = *src_u;
@@ -1358,21 +1665,26 @@ static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv,
 
 // Convert Android420 to I420.
 LIBYUV_API
-int Android420ToI420(const uint8* src_y, int src_stride_y,
-                     const uint8* src_u, int src_stride_u,
-                     const uint8* src_v, int src_stride_v,
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
                      int src_pixel_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height) {
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height) {
   int y;
-  const int vu_off = src_v - src_u;
+  const ptrdiff_t vu_off = src_v - src_u;
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1396,15 +1708,16 @@ int Android420ToI420(const uint8* src_y, int src_stride_y,
     CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
     CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
     return 0;
-  // Split UV planes - NV21
-  } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
-             src_stride_u == src_stride_v) {
+    // Split UV planes - NV21
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
     SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
                  halfwidth, halfheight);
     return 0;
-  // Split UV planes - NV12
-  } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
-             src_stride_u == src_stride_v) {
+    // Split UV planes - NV12
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
     SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
                  halfwidth, halfheight);
     return 0;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc
index fb9582d627..f2fe474f70 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc
@@ -26,11 +26,13 @@ extern "C" {
 
 // Copy ARGB with optional flipping
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height) {
-  if (!src_argb || !dst_argb ||
-      width <= 0 || height == 0) {
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -40,27 +42,29 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
 
-  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-            width * 4, height);
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+            height);
   return 0;
 }
 
-// Convert I422 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+// Convert I420 to ARGB with matrix
+static int I420ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -93,13 +97,12 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 
@@ -117,111 +120,130 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to ABGR.
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert J420 to ARGB.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert J420 to ABGR.
 LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuJPEGConstants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert H420 to ARGB.
 LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
 // Convert H420 to ABGR.
 LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuH709Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+static int I422ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -231,10 +253,8 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
@@ -263,13 +283,12 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 
@@ -285,111 +304,380 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
 
 // Convert I422 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I422 to ABGR.
 LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert J422 to ARGB.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert J422 to ABGR.
 LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuJPEGConstants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert H422 to ARGB.
 LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
 // Convert H422 to ABGR.
 LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+static int I010ToAR30Matrix(const uint16_t* src_y,
+                            int src_stride_y,
+                            const uint16_t* src_u,
+                            int src_stride_u,
+                            const uint16_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_ar30,
+                            int dst_stride_ar30,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToAR30Row_C;
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToAR30Row = I210ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToAR30Row = I210ToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuI601Constants, width, height);
+}
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuH709Constants, width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix
+static int I010ToARGBMatrix(const uint16_t* src_y,
+                            int src_stride_y,
+                            const uint16_t* src_u,
+                            int src_stride_u,
+                            const uint16_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I210ToARGBRow = I210ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToARGBRow = I210ToARGBRow_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuH709Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+static int I444ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I444ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -399,9 +687,7 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u == width &&
-      src_stride_v == width &&
+  if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -431,6 +717,14 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I444ToARGBRow = I444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -444,138 +738,81 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
 
 // Convert I444 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I444 to ABGR.
 LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert J444 to ARGB.
 LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
-}
-
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  int y;
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I411ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 4 == width &&
-      src_stride_v * 4 == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I411TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_I411TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      I411ToARGBRow = I411ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_I411TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I411ToARGBRow = I411ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
-    dst_argb += dst_stride_argb;
-    src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
-  }
-  return 0;
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
-                                 const uint8* src_u, int src_stride_u,
-                                 const uint8* src_v, int src_stride_v,
-                                 const uint8* src_a, int src_stride_a,
-                                 uint8* dst_argb, int dst_stride_argb,
+static int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                                 int src_stride_y,
+                                 const uint8_t* src_u,
+                                 int src_stride_u,
+                                 const uint8_t* src_v,
+                                 int src_stride_v,
+                                 const uint8_t* src_a,
+                                 int src_stride_a,
+                                 uint8_t* dst_argb,
+                                 int dst_stride_argb,
                                  const struct YuvConstants* yuvconstants,
-                                 int width, int height, int attenuate) {
+                                 int width,
+                                 int height,
+                                 int attenuate) {
   int y;
-  void (*I422AlphaToARGBRow)(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
+  void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                             const uint8_t* v_buf, const uint8_t* a_buf,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) = I422AlphaToARGBRow_C;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -608,13 +845,12 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
@@ -641,6 +877,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -661,49 +905,59 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
 
 // Convert I420 with Alpha to ARGB.
 LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_u, src_stride_u,
-                               src_v, src_stride_v,
-                               src_a, src_stride_a,
-                               dst_argb, dst_stride_argb,
-                               &kYuvI601Constants,
-                               width, height, attenuate);
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                               src_stride_v, src_a, src_stride_a, dst_argb,
+                               dst_stride_argb, &kYuvI601Constants, width,
+                               height, attenuate);
 }
 
 // Convert I420 with Alpha to ABGR.
 LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_v, src_stride_v,  // Swap U and V
-                               src_u, src_stride_u,
-                               src_a, src_stride_a,
-                               dst_abgr, dst_stride_abgr,
-                               &kYvuI601Constants,  // Use Yvu matrix
-                               width, height, attenuate);
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(
+      src_y, src_stride_y, src_v, src_stride_v,  // Swap U and V
+      src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+      &kYvuI601Constants,  // Use Yvu matrix
+      width, height, attenuate);
 }
 
 // Convert I400 to ARGB.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*I400ToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = I400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
+      I400ToARGBRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -713,8 +967,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -743,6 +996,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I400ToARGBRow = I400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
@@ -754,14 +1015,16 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
 
 // Convert J400 to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+  void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
       J400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -771,8 +1034,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -800,6 +1062,14 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
       J400ToARGBRow = J400ToARGBRow_NEON;
     }
   }
+#endif
+#if defined(HAS_J400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    J400ToARGBRow = J400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_MSA;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     J400ToARGBRow(src_y, dst_argb, width);
@@ -810,85 +1080,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
 }
 
 // Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
+static const uvec8 kShuffleMaskBGRAToARGB = {
+    3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
 
 // Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
+static const uvec8 kShuffleMaskABGRToARGB = {
+    2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
 
 // Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
+static const uvec8 kShuffleMaskRGBAToARGB = {
+    1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
 
 // Convert BGRA to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ARGB to BGRA (same as BGRAToARGB).
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int ARGBToBGRA(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ABGR to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert ARGB to ABGR to (same as ABGRToARGB).
 LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ARGBToABGR(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert RGBA to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_rgba, src_stride_rgba,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskRGBAToARGB),
-                     width, height);
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
 }
 
 // Convert RGB24 to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
   int y;
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  if (!src_rgb24 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -898,8 +1172,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
     src_stride_rgb24 = -src_stride_rgb24;
   }
   // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb24 = dst_stride_argb = 0;
@@ -920,6 +1193,14 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
     }
   }
 #endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -931,14 +1212,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
 
 // Convert RAW to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  if (!src_raw || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -948,8 +1231,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_argb = 0;
@@ -970,6 +1252,14 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToARGBRow = RAWToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToARGBRow(src_raw, dst_argb, width);
@@ -981,14 +1271,16 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
 
 // Convert RGB565 to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
-      RGB565ToARGBRow_C;
-  if (!src_rgb565 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -998,8 +1290,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
     src_stride_rgb565 = -src_stride_rgb565;
   }
   // Coalesce rows.
-  if (src_stride_rgb565 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb565 = dst_stride_argb = 0;
@@ -1028,6 +1319,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
     }
   }
 #endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -1039,14 +1338,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
 
 // Convert ARGB1555 to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-      int width) = ARGB1555ToARGBRow_C;
-  if (!src_argb1555 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1056,8 +1357,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     src_stride_argb1555 = -src_stride_argb1555;
   }
   // Coalesce rows.
-  if (src_stride_argb1555 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb1555 = dst_stride_argb = 0;
@@ -1086,6 +1386,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     }
   }
 #endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -1097,14 +1405,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
 
 // Convert ARGB4444 to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-      int width) = ARGB4444ToARGBRow_C;
-  if (!src_argb4444 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1114,8 +1424,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     src_stride_argb4444 = -src_stride_argb4444;
   }
   // Coalesce rows.
-  if (src_stride_argb4444 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb4444 = dst_stride_argb = 0;
@@ -1144,6 +1453,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -1153,20 +1470,117 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
   return 0;
 }
 
-// Convert NV12 to ARGB.
+// Convert AR30 to ARGB.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_argb = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToARGBRow_C(src_ar30, dst_argb, width);
+    src_ar30 += src_stride_ar30;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert AR30 to ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  int y;
+  if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_abgr = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+    src_ar30 += src_stride_ar30;
+    dst_abgr += dst_stride_abgr;
+  }
+  return 0;
+}
+
+// Convert AR30 to AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  int y;
+  if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_ab30 = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+    src_ar30 += src_stride_ar30;
+    dst_ab30 += dst_stride_ab30;
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB with matrix
+static int NV12ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_uv,
+                            int src_stride_uv,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*NV12ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1199,9 +1613,17 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+    NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -1211,20 +1633,21 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+// Convert NV21 to ARGB with matrix
+static int NV21ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_vu,
+                            int src_stride_vu,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
   int y;
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV21ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV21ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+  if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1257,11 +1680,136 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+    NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix.
+// To swap the UV use NV12 instead of NV21.LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// Convert NV12 to RGB24 with matrix
+static int NV12ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_uv,
+                             int src_stride_uv,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width,
+                             int height) {
+  int y;
+  void (*NV12ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+  if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
     if (y & 1) {
       src_uv += src_stride_uv;
     }
@@ -1269,19 +1817,109 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Convert NV21 to RGB24 with matrix
+static int NV21ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_vu,
+                             int src_stride_vu,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width,
+                             int height) {
+  int y;
+  void (*NV21ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+  if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix.
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
 // Convert M420 to ARGB.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int M420ToARGB(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_m420 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV12ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+  if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1314,6 +1952,14 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
@@ -1332,17 +1978,17 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToARGBRow)(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
+  void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       YUY2ToARGBRow_C;
-  if (!src_yuy2 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1352,8 +1998,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_argb = 0;
@@ -1381,6 +2026,14 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
       YUY2ToARGBRow = YUY2ToARGBRow_NEON;
     }
   }
+#endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
@@ -1392,17 +2045,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToARGBRow)(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
+  void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       UYVYToARGBRow_C;
-  if (!src_uyvy || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1412,8 +2065,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_argb = 0;
@@ -1441,6 +2093,14 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
       UYVYToARGBRow = UYVYToARGBRow_NEON;
     }
   }
+#endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_MSA;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
@@ -1449,6 +2109,121 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
   }
   return 0;
 }
+static void WeavePixels(const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_uv,
+                        int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_uv[0] = *src_u;
+    dst_uv[1] = *src_v;
+    dst_uv += 2;
+    src_u += src_pixel_stride_uv;
+    src_v += src_pixel_stride_uv;
+  }
+}
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height) {
+  int y;
+  uint8_t* dst_uv;
+  const ptrdiff_t vu_off = src_v - src_u;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+
+  // I420
+  if (src_pixel_stride_uv == 1) {
+    return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+    // NV21
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
+    return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+    // NV12
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+    return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+  }
+
+  // General case fallback creates NV12
+  align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
+  dst_uv = plane_uv;
+  for (y = 0; y < halfheight; ++y) {
+    WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += halfwidth * 2;
+  }
+  NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
+                   dst_stride_argb, yuvconstants, width, height);
+  free_aligned_buffer_64(plane_uv);
+  return 0;
+}
+
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                                src_stride_v, src_pixel_stride_uv, dst_argb,
+                                dst_stride_argb, &kYuvI601Constants, width,
+                                height);
+}
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height) {
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                                src_stride_u, src_pixel_stride_uv, dst_abgr,
+                                dst_stride_abgr, &kYvuI601Constants, width,
+                                height);
+}
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc
index 3b2dca8163..6fa253237e 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc
@@ -15,9 +15,9 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/row.h"
 #include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
-#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -30,109 +30,144 @@ static __inline int Abs(int v) {
 }
 
 // I420 To any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int dst_uv_width, int dst_uv_height) {
+static int I420ToI4xx(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int dst_uv_width,
+                      int dst_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
   const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      dst_uv_width <= 0 || dst_uv_height <= 0) {
+  if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+      dst_uv_height <= 0) {
     return -1;
   }
   if (dst_y) {
-    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-               dst_y, dst_stride_y, dst_y_width, dst_y_height,
-               kFilterBilinear);
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
   }
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+  return 0;
+}
+
+// Convert 8 bit YUV to 10 bit.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane.
+  Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+                    height);
+  // Convert UV planes.
+  Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+                    halfheight);
+  Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+                    halfheight);
   return 0;
 }
 
 // 420 chroma is 1/2 width, 1/2 height
 // 422 chroma is 1/2 width, 1x height
 LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = (Abs(width) + 1) >> 1;
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
 // 420 chroma is 1/2 width, 1/2 height
 // 444 chroma is 1x width, 1x height
 LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = Abs(width);
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = (Abs(width) + 3) >> 2;
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
 // Copy to I400. Source can be I420,422,444,400,NV12,NV21
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -146,17 +181,21 @@ int I400Copy(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -166,10 +205,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -182,6 +219,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -202,17 +247,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -229,6 +278,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -237,6 +294,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -254,17 +319,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -274,10 +343,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -290,6 +357,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -298,6 +373,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -310,17 +393,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -337,6 +424,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -345,6 +440,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -363,14 +466,20 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
 
 // TODO(fbarchard): test negative height for invert.
 LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
     return -1;
   }
   int halfwidth = (width + 1) / 2;
@@ -378,44 +487,47 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
-  MergeUVPlane(src_u, src_stride_u,
-               src_v, src_stride_v,
-               dst_uv, dst_stride_uv,
+  MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
                halfwidth, halfheight);
   return 0;
 }
 
 LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height) {
-  return I420ToNV12(src_y, src_stride_y,
-                    src_v, src_stride_v,
-                    src_u, src_stride_u,
-                    dst_y, dst_stride_y,
-                    dst_vu, dst_stride_vu,
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
                     width, height);
 }
 
 // Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
+static int I420ToRGBAMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_rgba,
+                            int dst_stride_rgba,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -448,13 +560,12 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
-    I422ToRGBARow = I422ToRGBARow_DSPR2;
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
   }
 #endif
 
@@ -472,50 +583,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to RGBA.
 LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to BGRA.
 LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
-                             const uint8* src_u, int src_stride_u,
-                             const uint8* src_v, int src_stride_v,
-                             uint8* dst_rgb24, int dst_stride_rgb24,
+static int I420ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_u,
+                             int src_stride_u,
+                             const uint8_t* src_v,
+                             int src_stride_v,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
                              const struct YuvConstants* yuvconstants,
-                             int width, int height) {
+                             int width,
+                             int height) {
   int y;
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) = I422ToRGB24Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -548,6 +667,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@@ -563,50 +690,95 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to RGB24.
 LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_u, src_stride_u,
-                           src_v, src_stride_v,
-                           dst_rgb24, dst_stride_rgb24,
-                           &kYuvI601Constants,
-                           width, height);
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to RAW.
 LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_v, src_stride_v,  // Swap U and V
-                           src_u, src_stride_u,
-                           dst_raw, dst_stride_raw,
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
                            &kYvuI601Constants,  // Use Yvu matrix
                            width, height);
 }
 
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
+                           &kYvuH709Constants,  // Use Yvu matrix
+                           width, height);
+}
+
 // Convert I420 to ARGB1555.
 LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
-  void (*I422ToARGB1555Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+  void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB1555Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -639,6 +811,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
@@ -653,23 +833,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-
 // Convert I420 to ARGB4444.
 LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
-  void (*I422ToARGB4444Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+  void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB4444Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -702,6 +884,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
@@ -718,20 +908,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to RGB565.
 LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*I422ToRGB565Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = I422ToRGB565Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -764,6 +956,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
@@ -777,32 +977,102 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
 // Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
 
 // Convert I420 to RGB565 with dithering.
 LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -838,12 +1108,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
@@ -869,6 +1139,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
       ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
     }
   }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
 #endif
   {
     // Allocate a row of argb.
@@ -876,7 +1154,8 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
     for (y = 0; y < height; ++y) {
       I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
       ARGBToRGB565DitherRow(row_argb, dst_rgb565,
-                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                            *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                            width);
       dst_rgb565 += dst_stride_rgb565;
       src_y += src_stride_y;
       if (y & 1) {
@@ -889,220 +1168,254 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Convert I420 to AR30 with matrix
+static int I420ToAR30Matrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_ar30,
+                            int dst_stride_ar30,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToAR30Row_C;
+
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToAR30Row = I422ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToAR30Row = I422ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYvuH709Constants, width, height);
+}
+
 // Convert I420 to specified format
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int r = 0;
-  if (!y || !u|| !v || !dst_sample ||
-      width <= 0 || height == 0) {
+  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
     return -1;
   }
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
-      r = I420ToYUY2(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_UYVY:
-      r = I420ToUYVY(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_RGBP:
-      r = I420ToRGB565(y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       dst_sample,
-                       dst_sample_stride ? dst_sample_stride : width * 2,
-                       width, height);
+      r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2, width,
+                       height);
       break;
     case FOURCC_RGBO:
-      r = I420ToARGB1555(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_R444:
-      r = I420ToARGB4444(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_24BG:
-      r = I420ToRGB24(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width * 3,
-                      width, height);
+      r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3, width,
+                      height);
       break;
     case FOURCC_RAW:
-      r = I420ToRAW(y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_sample,
-                    dst_sample_stride ? dst_sample_stride : width * 3,
-                    width, height);
+      r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3, width,
+                    height);
       break;
     case FOURCC_ARGB:
-      r = I420ToARGB(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_BGRA:
-      r = I420ToBGRA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_ABGR:
-      r = I420ToABGR(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_RGBA:
-      r = I420ToRGBA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
+      break;
+    case FOURCC_AR30:
+      r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_I400:
-      r = I400Copy(y, y_stride,
-                   dst_sample,
-                   dst_sample_stride ? dst_sample_stride : width,
-                   width, height);
+      r = I400Copy(y, y_stride, dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width, width,
+                   height);
       break;
     case FOURCC_NV12: {
-      uint8* dst_uv = dst_sample + width * height;
-      r = I420ToNV12(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_uv,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      uint8_t* dst_uv = dst_sample + width * height;
+      r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     case FOURCC_NV21: {
-      uint8* dst_vu = dst_sample + width * height;
-      r = I420ToNV21(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_vu,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      uint8_t* dst_vu = dst_sample + width * height;
+      r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     // TODO(fbarchard): Add M420.
     // Triplanar formats
-    // TODO(fbarchard): halfstride instead of halfwidth
     case FOURCC_I420:
     case FOURCC_YV12: {
-      int halfwidth = (width + 1) / 2;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
       int halfheight = (height + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV12) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * halfheight;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * halfheight;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * halfheight;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * halfheight;
       }
-      r = I420Copy(y, y_stride,
-                   u, u_stride,
-                   v, v_stride,
-                   dst_sample, width,
-                   dst_u, halfwidth,
-                   dst_v, halfwidth,
+      r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                   dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                    width, height);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      int halfwidth = (width + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV16) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * height;
       }
-      r = I420ToI422(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, halfwidth,
-                     dst_v, halfwidth,
+      r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                      width, height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      uint8* dst_u;
-      uint8* dst_v;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV24) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + width * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + dst_sample_stride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + width * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + dst_sample_stride * height;
       }
-      r = I420ToI444(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, width,
-                     dst_v, width,
-                     width, height);
+      r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+                     dst_sample_stride, width, height);
       break;
     }
-    case FOURCC_I411: {
-      int quarterwidth = (width + 3) / 4;
-      uint8* dst_u = dst_sample + width * height;
-      uint8* dst_v = dst_u + quarterwidth * height;
-      r = I420ToI411(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, quarterwidth,
-                     dst_v, quarterwidth,
-                     width, height);
-      break;
-    }
-
     // Formats not supported - MJPG, biplanar, some rgb formats.
     default:
       return -1;  // unknown fourcc - return failure code.
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc
index 2a8682b7eb..c8d91252e9 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc
@@ -22,16 +22,21 @@ extern "C" {
 
 // ARGB little endian (bgra in memory) to I444
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV444Row_C;
+  void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = ARGBToUV444Row_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u == width &&
-      dst_stride_v == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u == width && dst_stride_v == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3)) {
-      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+    }
   }
 #endif
 #if defined(HAS_ARGBTOUV444ROW_NEON)
@@ -65,6 +68,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -89,6 +100,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -103,19 +122,23 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
 
 // ARGB little endian (bgra in memory) to I422
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -125,10 +148,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -170,6 +191,23 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
@@ -181,95 +219,25 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// ARGB little endian (bgra in memory) to I411
 LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV411Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 4 == width &&
-      dst_stride_v * 4 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUV411Row = ARGBToUV411Row_NEON;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -314,6 +282,22 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -337,11 +321,19 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
       MergeUVRow_ = MergeUVRow_NEON;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
 #endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
     for (y = 0; y < height - 1; y += 2) {
       ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -364,21 +356,24 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
 
 // Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -423,6 +418,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -446,24 +457,32 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
       MergeUVRow_ = MergeUVRow_NEON;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
 #endif
   {
     // Allocate a rows of uv.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
     for (y = 0; y < height - 1; y += 2) {
       ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
       ARGBToYRow(src_argb, dst_y, width);
       ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
       src_argb += src_stride_argb * 2;
       dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
+      dst_vu += dst_stride_vu;
     }
     if (height & 1) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
       ARGBToYRow(src_argb, dst_y, width);
     }
     free_aligned_buffer_64(row_u);
@@ -473,19 +492,23 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
 
-  if (!src_argb || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -495,8 +518,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yuy2 = 0;
@@ -537,6 +559,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -545,6 +583,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -553,12 +599,20 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate a rows of yuv.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
 
     for (y = 0; y < height; ++y) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -575,19 +629,23 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
 
-  if (!src_argb || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -597,8 +655,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_uyvy = 0;
@@ -639,6 +696,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -647,6 +720,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -655,12 +736,20 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate a rows of yuv.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
 
     for (y = 0; y < height; ++y) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -677,11 +766,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
   if (!src_argb || !dst_y || width <= 0 || height == 0) {
     return -1;
@@ -692,8 +784,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = 0;
@@ -722,6 +813,14 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYRow(src_argb, dst_y, width);
@@ -732,28 +831,31 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
 }
 
 // Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
+static const uvec8 kShuffleMaskARGBToRGBA = {
+    3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
 
 // Convert ARGB to RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return ARGBShuffle(src_argb, src_stride_argb,
-                     dst_rgba, dst_stride_rgba,
-                     (const uint8*)(&kShuffleMaskARGBToRGBA),
-                     width, height);
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+                     (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
 }
 
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
   int y;
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+  void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRGB24Row_C;
   if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
@@ -764,8 +866,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb24 = 0;
@@ -778,6 +879,22 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
@@ -786,6 +903,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -797,11 +922,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
   int y;
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+  void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRAWRow_C;
   if (!src_argb || !dst_raw || width <= 0 || height == 0) {
     return -1;
@@ -812,8 +940,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_raw == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_raw = 0;
@@ -826,6 +953,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRAWRow = ARGBToRAWRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORAWROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
@@ -834,6 +969,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRAWRow(src_argb, dst_raw, width);
@@ -844,21 +987,23 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
 }
 
 // Ordered 8x8 dither for 888 to 565.  Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -894,9 +1039,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBToRGB565DitherRow(src_argb, dst_rgb565,
-                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                          *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                          width);
     src_argb += src_stride_argb;
     dst_rgb565 += dst_stride_rgb565;
   }
@@ -906,12 +1061,15 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
 // Convert ARGB To RGB565.
 // TODO(fbarchard): Consider using dither function low level with zeros.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToRGB565Row_C;
+  void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                          int width) = ARGBToRGB565Row_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -921,8 +1079,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb565 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb565 = 0;
@@ -951,6 +1108,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -962,12 +1127,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToARGB1555Row_C;
+  void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                            int width) = ARGBToARGB1555Row_C;
   if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
     return -1;
   }
@@ -977,8 +1145,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb1555 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb1555 = 0;
@@ -1007,6 +1174,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1018,12 +1193,15 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToARGB4444Row_C;
+  void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                            int width) = ARGBToARGB4444Row_C;
   if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
     return -1;
   }
@@ -1033,8 +1211,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb4444 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb4444 = 0;
@@ -1063,6 +1240,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1072,21 +1257,123 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
+      ABGRToAR30Row_C;
+  if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  // Coalesce rows.
+  if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_abgr = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ABGRToAR30Row = ABGRToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToAR30Row = ABGRToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ABGRToAR30Row(src_abgr, dst_ar30, width);
+    src_abgr += src_stride_abgr;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+      ARGBToAR30Row_C;
+  if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToAR30Row = ARGBToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToAR30Row(src_argb, dst_ar30, width);
+    src_argb += src_stride_argb;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
+
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1129,6 +1416,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -1148,19 +1451,23 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to J422. (JPeg full range I422).
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1170,10 +1477,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
@@ -1212,6 +1517,22 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
@@ -1226,11 +1547,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to J400.
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height) {
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
   if (!src_argb || !dst_yj || width <= 0 || height == 0) {
     return -1;
@@ -1241,8 +1565,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = 0;
@@ -1271,6 +1594,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYJRow(src_argb, dst_yj, width);
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc
index 90f550a26a..ae3cc18cd2 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc
@@ -22,28 +22,24 @@ extern "C" {
 
 #ifdef HAVE_JPEG
 struct I420Buffers {
-  uint8* y;
+  uint8_t* y;
   int y_stride;
-  uint8* u;
+  uint8_t* u;
   int u_stride;
-  uint8* v;
+  uint8_t* v;
   int v_stride;
   int w;
   int h;
 };
 
 static void JpegCopyI420(void* opaque,
-                         const uint8* const* data,
+                         const uint8_t* const* data,
                          const int* strides,
                          int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I420Copy(data[0], strides[0],
-           data[1], strides[1],
-           data[2], strides[2],
-           dest->y, dest->y_stride,
-           dest->u, dest->u_stride,
-           dest->v, dest->v_stride,
-           dest->w, rows);
+  I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+           dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+           dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -51,17 +47,13 @@ static void JpegCopyI420(void* opaque,
 }
 
 static void JpegI422ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I422ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -69,35 +61,13 @@ static void JpegI422ToI420(void* opaque,
 }
 
 static void JpegI444ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I444ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I411ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -105,15 +75,12 @@ static void JpegI411ToI420(void* opaque,
 }
 
 static void JpegI400ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I400ToI420(data[0], strides[0],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+             dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -122,8 +89,10 @@ static void JpegI400ToI420(void* opaque,
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height) {
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
+             int* width,
+             int* height) {
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret) {
@@ -135,15 +104,21 @@ int MJPGSize(const uint8* sample, size_t sample_size,
 }
 
 // MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
 LIBYUV_API
-int MJPGToI420(const uint8* sample,
+int MJPGToI420(const uint8_t* sample,
                size_t sample_size,
-               uint8* y, int y_stride,
-               uint8* u, int u_stride,
-               uint8* v, int v_stride,
-               int w, int h,
-               int dw, int dh) {
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
   if (sample_size == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
@@ -152,17 +127,17 @@ int MJPGToI420(const uint8* sample,
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    I420Buffers bufs = {dst_y, dst_stride_y, dst_u,     dst_stride_u,
+                        dst_v, dst_stride_v, dst_width, dst_height};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -170,8 +145,9 @@ int MJPGToI420(const uint8* sample,
         mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
-    // YUV422
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -181,8 +157,9 @@ int MJPGToI420(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
-    // YUV444
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -192,28 +169,19 @@ int MJPGToI420(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
-    // YUV400
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
+                                           dst_height);
     } else {
       // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
+      // factors that occur in practice.
       // ERROR: Unable to convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
@@ -224,88 +192,67 @@ int MJPGToI420(const uint8* sample,
 
 #ifdef HAVE_JPEG
 struct ARGBBuffers {
-  uint8* argb;
+  uint8_t* argb;
   int argb_stride;
   int w;
   int h;
 };
 
 static void JpegI420ToARGB(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I420ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 static void JpegI422ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I422ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 static void JpegI444ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I444ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
-static void JpegI411ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I411ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 static void JpegI400ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I400ToARGB(data[0], strides[0],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 // MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
 LIBYUV_API
-int MJPGToARGB(const uint8* sample,
+int MJPGToARGB(const uint8_t* sample,
                size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h,
-               int dw, int dh) {
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
   if (sample_size == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
@@ -314,17 +261,16 @@ int MJPGToARGB(const uint8* sample,
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -332,8 +278,9 @@ int MJPGToARGB(const uint8* sample,
         mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
-    // YUV422
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -343,8 +290,9 @@ int MJPGToARGB(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
-    // YUV444
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -354,28 +302,19 @@ int MJPGToARGB(const uint8* sample,
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
-    // YUV411
-    } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
-    // YUV400
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
+                                           dst_height);
     } else {
       // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
+      // factors that occur in practice.
       // ERROR: Unable to convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc
index aecdc80fde..67484522c0 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc
@@ -28,36 +28,50 @@ extern "C" {
 // src_height is used to compute location of planes, and indicate inversion
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
+
+// TODO(fbarchard): Add the following:
+// H010ToARGB
+// H420ToARGB
+// H422ToARGB
+// I010ToARGB
+// J400ToARGB
+// J422ToARGB
+// J444ToARGB
+
 LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
-                  uint8* crop_argb, int argb_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
+  const uint8_t* src;
+  const uint8_t* src_uv;
   int abs_src_height = (src_height < 0) ? -src_height : src_height;
   int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
 
   // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination crop_argb is same as source sample,
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if destination dst_argb is same as source sample,
   // also enable temporary buffer.
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
-      crop_argb == sample;
-  uint8* dest_argb = crop_argb;
-  int dest_argb_stride = argb_stride;
-  uint8* rotate_buffer = NULL;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+  uint8_t* dest_argb = dst_argb;
+  int dest_dst_stride_argb = dst_stride_argb;
+  uint8_t* rotate_buffer = NULL;
   int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
 
-  if (crop_argb == NULL || sample == NULL ||
-      src_width <= 0 || crop_width <= 0 ||
+  if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 ||
       src_height == 0 || crop_height == 0) {
     return -1;
   }
@@ -67,187 +81,174 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
 
   if (need_buf) {
     int argb_size = crop_width * 4 * abs_crop_height;
-    rotate_buffer = (uint8*)malloc(argb_size);
+    rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
-    crop_argb = rotate_buffer;
-    argb_stride = crop_width * 4;
+    dst_argb = rotate_buffer;
+    dst_stride_argb = crop_width * 4;
   }
 
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToARGB(src, src_width * 3,
-                      crop_argb, argb_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToARGB(src, src_width * 3,
-                    crop_argb, argb_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      if (!need_buf && !rotation) {
+        src = sample + (src_width * crop_y + crop_x) * 4;
+        r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
+                       crop_width, inv_crop_height);
+      }
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AR30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AB30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToARGB(src, src_width * 2,
-                       crop_argb, argb_stride,
+      r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                        crop_width, inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
+      r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
+      r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
 
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
       // Call NV12 but with u and v parameters swapped.
-      r = NV21ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
       break;
     case FOURCC_M420:
       src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
+
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_J420: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       src_u = sample + src_width * abs_src_height +
-          (halfwidth * crop_y + crop_x) / 2;
+              (halfwidth * crop_y + crop_x) / 2;
       src_v = sample + src_width * abs_src_height +
-          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      r = J420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+              halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV24) {
         src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -255,32 +256,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToARGB(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToARGB(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToARGB(sample, sample_size,
-                     crop_argb, argb_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -289,11 +272,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
 
   if (need_buf) {
     if (!r) {
-      r = ARGBRotate(crop_argb, argb_stride,
-                     dest_argb, dest_argb_stride,
+      r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
                      crop_width, abs_crop_height, rotation);
     }
     free(rotate_buffer);
+  } else if (rotation) {
+    src = sample + (src_width * crop_y + crop_x) * 4;
+    r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                   inv_crop_height, rotation);
   }
 
   return r;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc
index e5f307c446..df08309f9b 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc
@@ -25,251 +25,216 @@ extern "C" {
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
 LIBYUV_API
-int ConvertToI420(const uint8* sample,
+int ConvertToI420(const uint8_t* sample,
                   size_t sample_size,
-                  uint8* y, int y_stride,
-                  uint8* u, int u_stride,
-                  uint8* v, int v_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
+  const uint8_t* src;
+  const uint8_t* src_uv;
   const int abs_src_height = (src_height < 0) ? -src_height : src_height;
   // TODO(nisse): Why allow crop_height < 0?
   const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YV12) || y == sample;
-  uint8* tmp_y = y;
-  uint8* tmp_u = u;
-  uint8* tmp_v = v;
-  int tmp_y_stride = y_stride;
-  int tmp_u_stride = u_stride;
-  int tmp_v_stride = v_stride;
-  uint8* rotate_buffer = NULL;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+       format != FOURCC_NV21 && format != FOURCC_YV12) ||
+      dst_y == sample;
+  uint8_t* tmp_y = dst_y;
+  uint8_t* tmp_u = dst_u;
+  uint8_t* tmp_v = dst_v;
+  int tmp_y_stride = dst_stride_y;
+  int tmp_u_stride = dst_stride_u;
+  int tmp_v_stride = dst_stride_v;
+  uint8_t* rotate_buffer = NULL;
   const int inv_crop_height =
       (src_height < 0) ? -abs_crop_height : abs_crop_height;
 
-  if (!y || !u || !v || !sample ||
-      src_width <= 0 || crop_width <= 0  ||
-      src_height == 0 || crop_height == 0) {
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      crop_width <= 0 || src_height == 0 || crop_height == 0) {
     return -1;
   }
 
   // One pass rotation is available for some formats. For the rest, convert
   // to I420 (with optional vertical flipping) into a temporary I420 buffer,
   // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination y is same as source sample,
+  // For in-place conversion, if destination dst_y is same as source sample,
   // also enable temporary buffer.
   if (need_buf) {
     int y_size = crop_width * abs_crop_height;
     int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
-    y = rotate_buffer;
-    u = y + y_size;
-    v = u + uv_size;
-    y_stride = crop_width;
-    u_stride = v_stride = ((crop_width + 1) / 2);
+    dst_y = rotate_buffer;
+    dst_u = dst_y + y_size;
+    dst_v = dst_u + uv_size;
+    dst_stride_y = crop_width;
+    dst_stride_u = dst_stride_v = ((crop_width + 1) / 2);
   }
 
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToI420(src, src_width * 2,
-                       y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       crop_width, inv_crop_height);
+      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                       dst_stride_u, dst_v, dst_stride_v, crop_width,
+                       inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToI420(src, src_width * 3,
-                      y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                      dst_stride_u, dst_v, dst_stride_v, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToI420(src, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                    dst_stride_u, dst_v, dst_stride_v, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
+    // TODO(fbarchard): Add AR30 and AB30
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                     dst_v, dst_stride_v, crop_width, inv_crop_height);
       break;
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           u, u_stride,
-                           v, v_stride,
-                           crop_width, inv_crop_height, rotation);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_u, dst_stride_u, dst_v,
+                           dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      // Call NV12 but with u and v parameters swapped.
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           v, v_stride,
-                           u, u_stride,
-                           crop_width, inv_crop_height, rotation);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with dst_u and dst_v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_v, dst_stride_v, dst_u,
+                           dst_stride_u, crop_width, inv_crop_height, rotation);
       break;
     case FOURCC_M420:
       src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                     dst_v, dst_stride_v, crop_width, inv_crop_height);
       break;
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420Rotate(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height, rotation);
+      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToI420(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV24) {
         src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -277,38 +242,16 @@ int ConvertToI420(const uint8* sample,
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToI420(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToI420(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
       break;
     }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToI420(sample, sample_size,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -317,13 +260,10 @@ int ConvertToI420(const uint8* sample,
 
   if (need_buf) {
     if (!r) {
-      r = I420Rotate(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     tmp_y, tmp_y_stride,
-                     tmp_u, tmp_u_stride,
-                     tmp_v, tmp_v_stride,
-                     crop_width, abs_crop_height, rotation);
+      r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+                     rotation);
     }
     free(rotate_buffer);
   }
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc b/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc
index 84927ebc3e..31e24b6739 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc
@@ -13,22 +13,16 @@
 #if defined(_MSC_VER)
 #include <intrin.h>  // For __cpuidex()
 #endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                           \
     !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
     defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
-#if !defined(__native_client__)
-#include <stdlib.h>  // For getenv()
-#endif
-
 // For ArmCpuCaps() but unittested on all platforms
 #include <stdio.h>
 #include <string.h>
 
-#include "libyuv/basic_types.h"  // For CPU_X86
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -43,16 +37,20 @@ extern "C" {
 #define SAFEBUFFERS
 #endif
 
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
 // Low level cpuid for X86.
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
 #if defined(_MSC_VER)
 // Visual C version uses intrinsic or inline x86 assembly.
 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
-  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+  __cpuidex(cpu_info, info_eax, info_ecx);
 #elif defined(_M_IX86)
   __asm {
     mov        eax, info_eax
@@ -66,26 +64,26 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
   }
 #else  // Visual C but not x86
   if (info_ecx == 0) {
-    __cpuid((int*)(cpu_info), info_eax);
+    __cpuid(cpu_info, info_eax);
   } else {
-    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
   }
 #endif
 // GCC version uses inline x86 assembly.
 #else  // defined(_MSC_VER)
-  uint32 info_ebx, info_edx;
-  asm volatile (
-#if defined( __i386__) && defined(__PIC__)
-    // Preserve ebx for fpic 32 bit.
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=D" (info_ebx),
+  int info_ebx, info_edx;
+  asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+      // Preserve ebx for fpic 32 bit.
+      "mov %%ebx, %%edi                          \n"
+      "cpuid                                     \n"
+      "xchg %%edi, %%ebx                         \n"
+      : "=D"(info_ebx),
 #else
-    "cpuid                                     \n"
-    : "=b" (info_ebx),
+      "cpuid                                     \n"
+      : "=b"(info_ebx),
 #endif  //  defined( __i386__) && defined(__PIC__)
-      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+        "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
   cpu_info[0] = info_eax;
   cpu_info[1] = info_ebx;
   cpu_info[2] = info_ecx;
@@ -94,7 +92,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
 }
 #else  // (defined(_M_IX86) || defined(_M_X64) ...
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+void CpuId(int eax, int ecx, int* cpu_info) {
+  (void)eax;
+  (void)ecx;
   cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
 }
 #endif
@@ -111,20 +111,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
 #if defined(_M_IX86) && (_MSC_VER < 1900)
 #pragma optimize("g", off)
 #endif
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
-#define HAS_XGETBV
 // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
 int GetXCR0() {
-  uint32 xcr0 = 0u;
+  int xcr0 = 0;
 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
-  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+  xcr0 = (int)_xgetbv(0);  // VS2010 SP1 required.  NOLINT
 #elif defined(__i386__) || defined(__x86_64__)
-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
 #endif  // defined(__i386__) || defined(__x86_64__)
   return xcr0;
 }
+#else
+// xgetbv unavailable to query for OSSave support.  Return 0.
+#define GetXCR0() 0
 #endif  // defined(_M_IX86) || defined(_M_X64) ..
 // Return optimization to previous setting.
 #if defined(_M_IX86) && (_MSC_VER < 1900)
@@ -133,8 +135,7 @@ int GetXCR0() {
 
 // based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
   FILE* f = fopen(cpuinfo_name, "r");
   if (!f) {
@@ -151,7 +152,7 @@ int ArmCpuCaps(const char* cpuinfo_name) {
       }
       // aarch64 uses asimd for Neon.
       p = strstr(cpuinfo_line, " asimd");
-      if (p && (p[6] == ' ' || p[6] == '\n')) {
+      if (p) {
         fclose(f);
         return kCpuHasNEON;
       }
@@ -161,103 +162,78 @@ int ArmCpuCaps(const char* cpuinfo_name) {
   return 0;
 }
 
-// CPU detect function for SIMD instruction sets.
-LIBYUV_API
-int cpu_info_ = 0;  // cpu_info is not initialized yet.
-
-// Test environment variable for disabling CPU features. Any non-zero value
-// to disable. Zero ignored to make it easy to set the variable on/off.
-#if !defined(__native_client__) && !defined(_M_ARM)
-
-static LIBYUV_BOOL TestEnv(const char* name) {
-  const char* var = getenv(name);
-  if (var) {
-    if (var[0] != '0') {
-      return LIBYUV_TRUE;
+// TODO(fbarchard): Consider read_msa_ir().
+// TODO(fbarchard): Add unittest.
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
+                                       const char ase[]) {
+  char cpuinfo_line[512];
+  FILE* f = fopen(cpuinfo_name, "r");
+  if (!f) {
+    // ase enabled if /proc/cpuinfo is unavailable.
+    if (strcmp(ase, " msa") == 0) {
+      return kCpuHasMSA;
+    }
+    return 0;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      char* p = strstr(cpuinfo_line, ase);
+      if (p) {
+        fclose(f);
+        if (strcmp(ase, " msa") == 0) {
+          return kCpuHasMSA;
+        }
+        return 0;
+      }
     }
   }
-  return LIBYUV_FALSE;
+  fclose(f);
+  return 0;
 }
-#else  // nacl does not support getenv().
-static LIBYUV_BOOL TestEnv(const char*) {
-  return LIBYUV_FALSE;
-}
-#endif
 
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
-  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+static SAFEBUFFERS int GetCpuFlags(void) {
   int cpu_info = 0;
-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                   \
+    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+     defined(_M_IX86))
+  int cpu_info0[4] = {0, 0, 0, 0};
+  int cpu_info1[4] = {0, 0, 0, 0};
+  int cpu_info7[4] = {0, 0, 0, 0};
   CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
   if (cpu_info0[0] >= 7) {
     CpuId(7, 0, cpu_info7);
   }
-  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+  cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
-             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
-             kCpuHasX86;
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
 
-#ifdef HAS_XGETBV
-  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  // AVX requires OS saves YMM registers.
   if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
       ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
-    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+    cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+                ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
 
     // Detect AVX512bw
     if ((GetXCR0() & 0xe0) == 0xe0) {
-      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
+      cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+      cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
+      cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
     }
   }
 #endif
-
-  // Environment variable overrides for testing.
-  if (TestEnv("LIBYUV_DISABLE_X86")) {
-    cpu_info &= ~kCpuHasX86;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
-    cpu_info &= ~kCpuHasSSE2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
-    cpu_info &= ~kCpuHasSSSE3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
-    cpu_info &= ~kCpuHasSSE41;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
-    cpu_info &= ~kCpuHasSSE42;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX")) {
-    cpu_info &= ~kCpuHasAVX;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
-    cpu_info &= ~kCpuHasAVX2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
-    cpu_info &= ~kCpuHasERMS;
-  }
-  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
-    cpu_info &= ~kCpuHasFMA3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX3")) {
-    cpu_info &= ~kCpuHasAVX3;
-  }
-#endif
 #if defined(__mips__) && defined(__linux__)
-#if defined(__mips_dspr2)
-  cpu_info |= kCpuHasDSPR2;
+#if defined(__mips_msa)
+  cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
 #endif
   cpu_info |= kCpuHasMIPS;
-  if (getenv("LIBYUV_DISABLE_DSPR2")) {
-    cpu_info &= ~kCpuHasDSPR2;
-  }
 #endif
 #if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
@@ -276,22 +252,22 @@ int InitCpuFlags(void) {
   cpu_info = ArmCpuCaps("/proc/cpuinfo");
 #endif
   cpu_info |= kCpuHasARM;
-  if (TestEnv("LIBYUV_DISABLE_NEON")) {
-    cpu_info &= ~kCpuHasNEON;
-  }
 #endif  // __arm__
-  if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info = 0;
-  }
-  cpu_info  |= kCpuInitialized;
-  cpu_info_ = cpu_info;
+  cpu_info |= kCpuInitialized;
   return cpu_info;
 }
 
 // Note that use of this function is not thread safe.
 LIBYUV_API
-void MaskCpuFlags(int enable_flags) {
-  cpu_info_ = InitCpuFlags() & enable_flags;
+int MaskCpuFlags(int enable_flags) {
+  int cpu_info = GetCpuFlags() & enable_flags;
+  SetCpuFlags(cpu_info);
+  return cpu_info;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+  return MaskCpuFlags(-1);
 }
 
 #ifdef __cplusplus
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
index 22025ad04a..eaf2530130 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc
@@ -21,7 +21,7 @@
 
 #if defined(_MSC_VER)
 // disable warning 4324: structure was padded due to __declspec(align())
-#pragma warning(disable:4324)
+#pragma warning(disable : 4324)
 #endif
 
 #endif
@@ -102,7 +102,7 @@ MJpegDecoder::~MJpegDecoder() {
   DestroyOutputBuffers();
 }
 
-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
   if (!ValidateJpeg(src, src_len)) {
     return LIBYUV_FALSE;
   }
@@ -129,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
       if (scanlines_[i]) {
         delete scanlines_[i];
       }
-      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_[i] = new uint8_t*[scanlines_size];
       scanlines_sizes_[i] = scanlines_size;
     }
 
@@ -145,7 +145,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
       if (databuf_[i]) {
         delete databuf_[i];
       }
-      databuf_[i] = new uint8[databuf_size];
+      databuf_[i] = new uint8_t[databuf_size];
       databuf_strides_[i] = databuf_stride;
     }
 
@@ -195,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) {
 }
 
 int MJpegDecoder::GetHorizSubSampFactor(int component) {
-  return decompress_struct_->max_h_samp_factor /
-      GetHorizSampFactor(component);
+  return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
 }
 
 int MJpegDecoder::GetVertSubSampFactor(int component) {
-  return decompress_struct_->max_v_samp_factor /
-      GetVertSampFactor(component);
+  return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
 }
 
 int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@@ -245,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
 }
 
 // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
-    uint8** planes, int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
+                                          int dst_width,
+                                          int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -289,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
       for (int i = 0; i < num_outbufs_; ++i) {
         // TODO(fbarchard): Compute skip to avoid this
         assert(skip % GetVertSubSampFactor(i) == 0);
-        int rows_to_skip =
-            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
-                                rows_to_skip;
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy =
+            GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
         int data_to_skip = rows_to_skip * GetComponentStride(i);
-        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
-                  planes[i], GetComponentWidth(i),
-                  GetComponentWidth(i), scanlines_to_copy);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+                  GetComponentWidth(i), GetComponentWidth(i),
+                  scanlines_to_copy);
         planes[i] += scanlines_to_copy * GetComponentWidth(i);
       }
       lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -305,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
 
   // Read full MCUs but cropped horizontally
   for (; lines_left > GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
     }
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
@@ -328,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy =
           DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
   return FinishDecode();
 }
 
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
-    int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+                                           void* opaque,
+                                           int dst_width,
+                                           int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -395,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
   }
   // Read full MCUs until we get to the crop point.
   for (; lines_left >= GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
@@ -435,22 +431,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
 }
 
 void term_source(j_decompress_ptr cinfo) {
-  // Nothing to do.
+  (void)cinfo;  // Nothing to do.
 }
 
 #ifdef HAVE_SETJMP
 void ErrorHandler(j_common_ptr cinfo) {
-  // This is called when a jpeglib command experiences an error. Unfortunately
-  // jpeglib's error handling model is not very flexible, because it expects the
-  // error handler to not return--i.e., it wants the program to terminate. To
-  // recover from errors we use setjmp() as shown in their example. setjmp() is
-  // C's implementation for the "call with current continuation" functionality
-  // seen in some functional programming languages.
-  // A formatted message can be output, but is unsafe for release.
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
 #ifdef DEBUG
   char buf[JMSG_LENGTH_MAX];
   (*cinfo->err->format_message)(cinfo, buf);
-  // ERROR: Error in jpeglib: buf
+// ERROR: Error in jpeglib: buf
 #endif
 
   SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@@ -459,8 +455,9 @@ void ErrorHandler(j_common_ptr cinfo) {
   longjmp(mgr->setjmp_buffer, 1);
 }
 
+// Suppress fprintf warnings.
 void OutputHandler(j_common_ptr cinfo) {
-  // Suppress fprintf warnings.
+  (void)cinfo;
 }
 
 #endif  // HAVE_SETJMP
@@ -472,9 +469,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
     // it.
     DestroyOutputBuffers();
 
-    scanlines_ = new uint8** [num_outbufs];
+    scanlines_ = new uint8_t**[num_outbufs];
     scanlines_sizes_ = new int[num_outbufs];
-    databuf_ = new uint8* [num_outbufs];
+    databuf_ = new uint8_t*[num_outbufs];
     databuf_strides_ = new int[num_outbufs];
 
     for (int i = 0; i < num_outbufs; ++i) {
@@ -490,13 +487,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
 
 void MJpegDecoder::DestroyOutputBuffers() {
   for (int i = 0; i < num_outbufs_; ++i) {
-    delete [] scanlines_[i];
-    delete [] databuf_[i];
+    delete[] scanlines_[i];
+    delete[] databuf_[i];
   }
-  delete [] scanlines_;
-  delete [] databuf_;
-  delete [] scanlines_sizes_;
-  delete [] databuf_strides_;
+  delete[] scanlines_;
+  delete[] databuf_;
+  delete[] scanlines_sizes_;
+  delete[] databuf_strides_;
   scanlines_ = NULL;
   databuf_ = NULL;
   scanlines_sizes_ = NULL;
@@ -530,9 +527,9 @@ LIBYUV_BOOL MJpegDecoder::FinishDecode() {
   return LIBYUV_TRUE;
 }
 
-void MJpegDecoder::SetScanlinePointers(uint8** data) {
+void MJpegDecoder::SetScanlinePointers(uint8_t** data) {
   for (int i = 0; i < num_outbufs_; ++i) {
-    uint8* data_i = data[i];
+    uint8_t* data_i = data[i];
     for (int j = 0; j < scanlines_sizes_[i]; ++j) {
       scanlines_[i][j] = data_i;
       data_i += GetComponentStride(i);
@@ -542,26 +539,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) {
 
 inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
   return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
-      jpeg_read_raw_data(decompress_struct_,
-                         scanlines_,
-                         GetImageScanlinesPerImcuRow());
+         jpeg_read_raw_data(decompress_struct_, scanlines_,
+                            GetImageScanlinesPerImcuRow());
 }
 
 // The helper function which recognizes the jpeg sub-sampling type.
 JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
-    int* subsample_x, int* subsample_y, int number_of_components) {
+    int* subsample_x,
+    int* subsample_y,
+    int number_of_components) {
   if (number_of_components == 3) {  // Color images.
-    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 2 &&
-        subsample_x[2] == 2 && subsample_y[2] == 2) {
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
       return kJpegYuv420;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 1 &&
-        subsample_x[2] == 2 && subsample_y[2] == 1) {
+    }
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) {
       return kJpegYuv422;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 1 && subsample_y[1] == 1 &&
-        subsample_x[2] == 1 && subsample_y[2] == 1) {
+    }
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 &&
+        subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) {
       return kJpegYuv444;
     }
   } else if (number_of_components == 1) {  // Grey-scale images.
@@ -574,4 +571,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
 
 }  // namespace libyuv
 #endif  // HAVE_JPEG
-
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc
index 9c48832045..80c2cc0cb9 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc
@@ -18,13 +18,13 @@ extern "C" {
 #endif
 
 // Helper function to scan for EOI marker (0xff 0xd9).
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) {
   if (sample_size >= 2) {
-    const uint8* end = sample + sample_size - 1;
-    const uint8* it = sample;
+    const uint8_t* end = sample + sample_size - 1;
+    const uint8_t* it = sample;
     while (it < end) {
       // TODO(fbarchard): scan for 0xd9 instead.
-      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+      it = (const uint8_t*)(memchr(it, 0xff, end - it));
       if (it == NULL) {
         break;
       }
@@ -39,7 +39,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
 }
 
 // Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) {
   // Maximum size that ValidateJpeg will consider valid.
   const size_t kMaxJpegSize = 0x7fffffffull;
   const size_t kBackSearchSize = 1024;
@@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc b/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc
index a764f8da47..5eae3f763a 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc
@@ -26,11 +26,14 @@ extern "C" {
 
 // Copy a plane of data
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -38,8 +41,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -48,6 +50,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
   if (src_y == dst_y && src_stride_y == dst_stride_y) {
     return;
   }
+
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -68,11 +71,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -83,15 +81,18 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
 }
 
 // TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
 LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height) {
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height) {
   int y;
-  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -111,11 +112,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
     CopyRow = CopyRow_16_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_16_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_16_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -125,19 +121,124 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
   }
 }
 
+// Convert a plane of 16 bit data to 8 bit
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
+                          int width) = Convert16To8Row_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT16TO8ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert16To8Row = Convert16To8Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert16To8Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert a plane of 8 bit data to 16 bit
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
+                          int width) = Convert8To16Row_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_CONVERT8TO16ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    Convert8To16Row = Convert8To16Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      Convert8To16Row = Convert8To16Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT8TO16ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert8To16Row = Convert8To16Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert8To16Row = Convert8To16Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert8To16Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
 // Copy I422.
 LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -161,16 +262,21 @@ int I422Copy(const uint8* src_y, int src_stride_y,
 
 // Copy I444.
 LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -194,9 +300,12 @@ int I444Copy(const uint8* src_y, int src_stride_y,
 
 // Copy I400.
 LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -212,11 +321,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to I400.
 LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  (void)src_u;
+  (void)src_stride_u;
+  (void)src_v;
+  (void)src_stride_v;
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -234,12 +352,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
 // Support function for NV12 etc UV channels.
 // Width and height are plane sizes (typically half pixel width).
 LIBYUV_API
-void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int width, int height) {
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height) {
   int y;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
   // Negative height means invert the image.
   if (height < 0) {
@@ -250,8 +372,7 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
-  if (src_stride_uv == width * 2 &&
-      dst_stride_u == width &&
+  if (src_stride_uv == width * 2 && dst_stride_u == width &&
       dst_stride_v == width) {
     width *= height;
     height = 1;
@@ -281,13 +402,11 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
     }
   }
 #endif
-#if defined(HAS_SPLITUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-    SplitUVRow = SplitUVRow_Any_DSPR2;
-    if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_DSPR2;
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
     }
   }
 #endif
@@ -302,13 +421,17 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
 }
 
 LIBYUV_API
-void MergeUVPlane(const uint8* src_u, int src_stride_u,
-                  const uint8* src_v, int src_stride_v,
-                  uint8* dst_uv, int dst_stride_uv,
-                  int width, int height) {
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height) {
   int y;
-  void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-      int width) = MergeUVRow_C;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
   // Coalesce rows.
   // Negative height means invert the image.
   if (height < 0) {
@@ -317,8 +440,7 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u,
     dst_stride_uv = -dst_stride_uv;
   }
   // Coalesce rows.
-  if (src_stride_u == width &&
-      src_stride_v == width &&
+  if (src_stride_u == width && src_stride_v == width &&
       dst_stride_uv == width * 2) {
     width *= height;
     height = 1;
@@ -348,6 +470,14 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of UV.
@@ -358,12 +488,131 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u,
   }
 }
 
-// Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
-                 uint8* dst_y, int dst_stride_y,
-                 int width, int height) {
+// Support function for NV12 etc RGB channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height) {
   int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+                      uint8_t* dst_b, int width) = SplitRGBRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_r = dst_r + (height - 1) * dst_stride_r;
+    dst_g = dst_g + (height - 1) * dst_stride_g;
+    dst_b = dst_b + (height - 1) * dst_stride_b;
+    dst_stride_r = -dst_stride_r;
+    dst_stride_g = -dst_stride_g;
+    dst_stride_b = -dst_stride_b;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb == width * 3 && dst_stride_r == width &&
+      dst_stride_g == width && dst_stride_b == width) {
+    width *= height;
+    height = 1;
+    src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+  }
+#if defined(HAS_SPLITRGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SplitRGBRow = SplitRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SPLITRGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitRGBRow = SplitRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Copy a row of RGB.
+    SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
+    dst_r += dst_stride_r;
+    dst_g += dst_stride_g;
+    dst_b += dst_stride_b;
+    src_rgb += src_stride_rgb;
+  }
+}
+
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height) {
+  int y;
+  void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+                      const uint8_t* src_b, uint8_t* dst_rgb, int width) =
+      MergeRGBRow_C;
+  // Coalesce rows.
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+    dst_stride_rgb = -dst_stride_rgb;
+  }
+  // Coalesce rows.
+  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+      dst_stride_rgb == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
+  }
+#if defined(HAS_MERGERGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MergeRGBRow = MergeRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeRGBRow = MergeRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge a row of U and V into a row of RGB.
+    MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+    src_r += src_stride_r;
+    src_g += src_stride_g;
+    src_b += src_stride_b;
+    dst_rgb += dst_stride_rgb;
+  }
+}
+
+// Mirror a plane of data.
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height) {
+  int y;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -394,12 +643,12 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
-    MirrorRow = MirrorRow_DSPR2;
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
   }
 #endif
 
@@ -413,17 +662,24 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
 
 // Convert YUY2 to I422.
 LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+  void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
       YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -431,10 +687,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -462,15 +717,23 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
 #if defined(HAS_YUY2TOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-    }
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
       YUY2ToUV422Row = YUY2ToUV422Row_NEON;
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -485,17 +748,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
 
 // Convert UYVY to I422.
 LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      UYVYToUV422Row_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int width) = UYVYToYRow_C;
+  void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -503,10 +773,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -534,15 +803,23 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
 #if defined(HAS_UYVYTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
-    if (width >= 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-    }
+    UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
       UYVYToUV422Row = UYVYToUV422Row_NEON;
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUV422Row = UYVYToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -555,13 +832,82 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
   return 0;
 }
 
+// Convert YUY2 to Y.
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height) {
+  int y;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = 0;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
 // Mirror I400 with optional flipping
 LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -577,17 +923,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y,
 
 // Mirror I420 with optional flipping
 LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -612,11 +965,14 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
 
 // ARGB mirror.
 LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
       ARGBMirrorRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -651,6 +1007,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 
   // Mirror plane
   for (y = 0; y < height; ++y) {
@@ -666,8 +1030,8 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
 // the same blend function for all pixels if possible.
 LIBYUV_API
 ARGBBlendRow GetARGBBlend() {
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                       uint8_t* dst_argb, int width) = ARGBBlendRow_C;
 #if defined(HAS_ARGBBLENDROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBBlendRow = ARGBBlendRow_SSSE3;
@@ -678,19 +1042,28 @@ ARGBBlendRow GetARGBBlend() {
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBBlendRow = ARGBBlendRow_NEON;
   }
+#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBBlendRow = ARGBBlendRow_MSA;
+  }
 #endif
   return ARGBBlendRow;
 }
 
 // Alpha Blend 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = GetARGBBlend();
+  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                       uint8_t* dst_argb, int width) = GetARGBBlend();
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -701,8 +1074,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -720,14 +1092,20 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
 
 // Alpha Blend plane and store to destination.
 LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
   if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -739,10 +1117,8 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
   }
 
   // Coalesce rows for Y plane.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
-      alpha_stride == width &&
-      dst_stride_y == width) {
+  if (src_stride_y0 == width && src_stride_y1 == width &&
+      alpha_stride == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
@@ -750,7 +1126,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
 
 #if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-  BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       BlendPlaneRow = BlendPlaneRow_SSSE3;
     }
@@ -758,7 +1134,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
 #endif
 #if defined(HAS_BLENDPLANEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       BlendPlaneRow = BlendPlaneRow_AVX2;
     }
@@ -778,24 +1154,36 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
 #define MAXTWIDTH 2048
 // Alpha Blend YUV images and store to destination.
 LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
   // Half width/height for UV.
   int halfwidth = (width + 1) >> 1;
-  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
   if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
       !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
@@ -809,11 +1197,8 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
   }
 
   // Blend Y plane.
-  BlendPlane(src_y0, src_stride_y0,
-             src_y1, src_stride_y1,
-             alpha, alpha_stride,
-             dst_y, dst_stride_y,
-             width, height);
+  BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+             dst_y, dst_stride_y, width, height);
 
 #if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -893,13 +1278,17 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
 
 // Multiply 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBMultiplyRow_C;
+  void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBMultiplyRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -910,8 +1299,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -941,6 +1329,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+    }
+  }
+#endif
 
   // Multiply plane
   for (y = 0; y < height; ++y) {
@@ -954,12 +1350,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
 
 // Add 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height) {
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height) {
   int y;
-  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+  void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
                      int width) = ARGBAddRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -971,8 +1371,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -1007,6 +1406,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAddRow = ARGBAddRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_MSA;
+    }
+  }
+#endif
 
   // Add plane
   for (y = 0; y < height; ++y) {
@@ -1020,13 +1427,17 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
 
 // Subtract 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBSubtractRow_C;
+  void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBSubtractRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1037,8 +1448,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -1068,6 +1478,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_MSA;
+    }
+  }
+#endif
 
   // Subtract plane
   for (y = 0; y < height; ++y) {
@@ -1079,21 +1497,23 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 // Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
+static int I422ToRGBAMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_rgba,
+                            int dst_stride_rgba,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1126,13 +1546,12 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
-    I422ToRGBARow = I422ToRGBARow_DSPR2;
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
   }
 #endif
 
@@ -1148,48 +1567,55 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
 
 // Convert I422 to RGBA.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I422 to BGRA.
 LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert NV12 to RGB565.
 LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*NV12ToRGB565Row)(const uint8* y_buf,
-                          const uint8* uv_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = NV12ToRGB565Row_C;
-  if (!src_y || !src_uv || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*NV12ToRGB565Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1222,6 +1648,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
@@ -1236,14 +1670,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
 
 // Convert RAW to RGB24.
 LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height) {
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height) {
   int y;
-  void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+  void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) =
       RAWToRGB24Row_C;
-  if (!src_raw || !dst_rgb24 ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1253,8 +1689,7 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_rgb24 = 0;
@@ -1275,6 +1710,14 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToRGB24Row = RAWToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1285,11 +1728,13 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
 }
 
 LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value) {
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value) {
   int y;
-  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+  void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
   if (height < 0) {
     height = -height;
     dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1322,6 +1767,11 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
     SetRow = SetRow_ERMS;
   }
 #endif
+#if defined(HAS_SETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
+    SetRow = SetRow_MSA;
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1332,22 +1782,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
 
 // Draw a rectangle into I420
 LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y,
-             int width, int height,
-             int value_y, int value_u, int value_v) {
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  uint8* start_y = dst_y + y * dst_stride_y + x;
-  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
-  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-  if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0 ||
-      x < 0 || y < 0 ||
-      value_y < 0 || value_y > 255 ||
-      value_u < 0 || value_u > 255 ||
+  uint8_t* start_y = dst_y + y * dst_stride_y + x;
+  uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+      y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
       value_v < 0 || value_v > 255) {
     return -1;
   }
@@ -1360,15 +1814,17 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
 
 // Draw a rectangle into ARGB
 LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height,
-             uint32 value) {
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value) {
   int y;
-  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
-  if (!dst_argb ||
-      width <= 0 || height == 0 ||
-      dst_x < 0 || dst_y < 0) {
+  void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+      ARGBSetRow_C;
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
   if (height < 0) {
@@ -1397,6 +1853,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
     ARGBSetRow = ARGBSetRow_X86;
   }
 #endif
+#if defined(HAS_ARGBSETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSetRow = ARGBSetRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MSA;
+    }
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1420,11 +1884,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
 //   f is foreground pixel premultiplied by alpha
 
 LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBAttenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1435,8 +1902,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1465,6 +1931,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -1476,11 +1950,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
 
 // Convert preattentuated ARGB to unattenuated ARGB.
 LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height) {
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height) {
   int y;
-  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1491,8 +1968,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1513,7 +1989,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-// TODO(fbarchard): Neon version.
+  // TODO(fbarchard): Neon version.
 
   for (y = 0; y < height; ++y) {
     ARGBUnattenuateRow(src_argb, dst_argb, width);
@@ -1525,12 +2001,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
 
 // Convert ARGB to Grayed ARGB.
 LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1540,8 +2019,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1556,6 +2034,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(src_argb, dst_argb, width);
@@ -1567,13 +2050,16 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
 
 // Make a rectangle of ARGB gray scale.
 LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height) {
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
@@ -1593,6 +2079,12 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(dst, dst, width);
     dst += dst_stride_argb;
@@ -1602,11 +2094,15 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
 
 // Make a rectangle of ARGB Sepia tone.
 LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int dst_x, int dst_y, int width, int height) {
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height) {
   int y;
-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
@@ -1626,6 +2122,12 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
     ARGBSepiaRow = ARGBSepiaRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBSepiaRow(dst, width);
     dst += dst_stride_argb;
@@ -1636,13 +2138,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
 // Apply a 4x4 matrix to each ARGB pixel.
 // Note: Normally for shading, but can be used to swizzle or invert.
 LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height) {
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height) {
   int y;
-  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
-      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                             const int8_t* matrix_argb, int width) =
+      ARGBColorMatrixRow_C;
   if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1652,8 +2158,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1667,6 +2172,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
   }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
+  }
 #endif
   for (y = 0; y < height; ++y) {
     ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
@@ -1679,13 +2189,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
 // Apply a 4x3 matrix to each ARGB pixel.
 // Deprecated.
 LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int dst_x, int dst_y, int width, int height) {
-  SIMD_ALIGNED(int8 matrix_argb[16]);
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
+  SIMD_ALIGNED(int8_t matrix_argb[16]);
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
 
@@ -1705,23 +2219,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
   matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
   matrix_argb[15] = 64;  // 1.0
 
-  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
-                         dst, dst_stride_argb,
-                         &matrix_argb[0], width, height);
+  return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst,
+                         dst_stride_argb, &matrix_argb[0], width, height);
 }
 
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int dst_x, int dst_y, int width, int height) {
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
                             int width) = ARGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1745,15 +2262,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
 // Apply a color table each ARGB pixel but preserve destination alpha.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int dst_x, int dst_y, int width, int height) {
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height) {
   int y;
-  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
                            int width) = RGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1784,13 +2305,19 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
 // Caveat - although SSE2 saturates, the C function does not and should be used
 // with care if doing anything but quantization.
 LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int dst_x, int dst_y, int width, int height) {
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+  void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
                           int interval_offset, int width) = ARGBQuantizeRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
       interval_size < 1 || interval_size > 255) {
     return -1;
@@ -1810,6 +2337,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBQuantizeRow = ARGBQuantizeRow_NEON;
   }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_MSA;
+  }
 #endif
   for (y = 0; y < height; ++y) {
     ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
@@ -1821,13 +2353,17 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
 LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height) {
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  int32* previous_cumsum = dst_cumsum;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  int32_t* previous_cumsum = dst_cumsum;
   if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
     return -1;
   }
@@ -1851,18 +2387,25 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
 // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
 // as the buffer is treated as circular.
 LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius) {
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
-      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
-  int32* cumsum_bot_row;
-  int32* max_cumsum_bot_row;
-  int32* cumsum_top_row;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(
+      const int32_t* topleft, const int32_t* botleft, int width, int area,
+      uint8_t* dst, int count) = CumulativeSumToAverageRow_C;
+  int32_t* cumsum_bot_row;
+  int32_t* max_cumsum_bot_row;
+  int32_t* cumsum_top_row;
 
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1889,9 +2432,8 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
 #endif
   // Compute enough CumulativeSum for first row to be blurred. After this
   // one row of CumulativeSum is updated at a time.
-  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
-                           dst_cumsum, dst_stride32_cumsum,
-                           width, radius);
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+                           dst_stride32_cumsum, width, radius);
 
   src_argb = src_argb + radius * src_stride_argb;
   cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
@@ -1917,7 +2459,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
     // Increment cumsum_bot_row pointer with circular buffer wrap around and
     // then fill in a row of CumulativeSum.
     if ((y + radius) < height) {
-      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      const int32_t* prev_cumsum_bot_row = cumsum_bot_row;
       cumsum_bot_row += dst_stride32_cumsum;
       if (cumsum_bot_row >= max_cumsum_bot_row) {
         cumsum_bot_row = dst_cumsum;
@@ -1929,24 +2471,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
 
     // Left clipped.
     for (x = 0; x < radius + 1; ++x) {
-      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                                &dst_argb[x * 4], 1);
       area += (bot_y - top_y);
       boxwidth += 4;
     }
 
     // Middle unclipped.
     n = (width - 1) - radius - x + 1;
-    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                              boxwidth, area, &dst_argb[x * 4], n);
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                              &dst_argb[x * 4], n);
 
     // Right clipped.
     for (x += n; x <= width - 1; ++x) {
       area -= (bot_y - top_y);
       boxwidth -= 4;
       CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
-                                cumsum_bot_row + (x - radius - 1) * 4,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+                                cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+                                area, &dst_argb[x * 4], 1);
     }
     dst_argb += dst_stride_argb;
   }
@@ -1955,12 +2497,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
 
 // Multiply ARGB image by a specified ARGB value.
 LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value) {
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value) {
   int y;
-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
-                       int width, uint32 value) = ARGBShadeRow_C;
+  void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
+                       uint32_t value) = ARGBShadeRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
     return -1;
   }
@@ -1970,8 +2516,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1986,6 +2531,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
     ARGBShadeRow = ARGBShadeRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+    ARGBShadeRow = ARGBShadeRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -1997,12 +2547,17 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
 
 // Interpolate 2 planes by specified amount (0 to 255).
 LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation) {
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation) {
   int y;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
   if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
@@ -2015,9 +2570,7 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
     dst_stride = -dst_stride;
   }
   // Coalesce rows.
-  if (src_stride0 == width &&
-      src_stride1 == width &&
-      dst_stride == width) {
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
     width *= height;
     height = 1;
     src_stride0 = src_stride1 = dst_stride = 0;
@@ -2046,13 +2599,12 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
-      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
-      IS_ALIGNED(width, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
 
@@ -2067,61 +2619,71 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
 
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation) {
-  return InterpolatePlane(src_argb0, src_stride_argb0,
-                          src_argb1, src_stride_argb1,
-                          dst_argb, dst_stride_argb,
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation) {
+  return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+                          src_stride_argb1, dst_argb, dst_stride_argb,
                           width * 4, height, interpolation);
 }
 
 // Interpolate 2 YUV images by specified amount (0 to 255).
 LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation) {
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src0_y || !src0_u || !src0_v ||
-      !src1_y || !src1_u || !src1_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
-  InterpolatePlane(src0_y, src0_stride_y,
-                   src1_y, src1_stride_y,
-                   dst_y, dst_stride_y,
-                   width, height, interpolation);
-  InterpolatePlane(src0_u, src0_stride_u,
-                   src1_u, src1_stride_u,
-                   dst_u, dst_stride_u,
-                   halfwidth, halfheight, interpolation);
-  InterpolatePlane(src0_v, src0_stride_v,
-                   src1_v, src1_stride_v,
-                   dst_v, dst_stride_v,
-                   halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+                   dst_stride_y, width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+                   dst_stride_u, halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+                   dst_stride_v, halfwidth, halfheight, interpolation);
   return 0;
 }
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height) {
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height) {
   int y;
-  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
-                         const uint8* shuffler, int width) = ARGBShuffleRow_C;
-  if (!src_bgra || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb,
+                         const uint8_t* shuffler, int width) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2131,20 +2693,11 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
     src_stride_bgra = -src_stride_bgra;
   }
   // Coalesce rows.
-  if (src_stride_bgra == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_bgra = dst_stride_argb = 0;
   }
-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBShuffleRow = ARGBShuffleRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBSHUFFLEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
@@ -2169,6 +2722,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
     }
   }
 #endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -2179,28 +2740,32 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
 }
 
 // Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_argb, int dst_stride_argb,
-                        int width, int height,
-                        void (*SobelRow)(const uint8* src_sobelx,
-                                         const uint8* src_sobely,
-                                         uint8* dst, int width)) {
+static int ARGBSobelize(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_argb,
+                        int dst_stride_argb,
+                        int width,
+                        int height,
+                        void (*SobelRow)(const uint8_t* src_sobelx,
+                                         const uint8_t* src_sobely,
+                                         uint8_t* dst,
+                                         int width)) {
   int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
       ARGBToYJRow_C;
-  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) = SobelYRow_C;
-  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobely, int width) =
+  void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+                    uint8_t* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+                    const uint8_t* src_y2, uint8_t* dst_sobely, int width) =
       SobelXRow_C;
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
-  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -2228,6 +2793,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
 #if defined(HAS_SOBELYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2239,6 +2812,11 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
     SobelYRow = SobelYRow_NEON;
   }
 #endif
+#if defined(HAS_SOBELYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelYRow = SobelYRow_MSA;
+  }
+#endif
 #if defined(HAS_SOBELXROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXRow = SobelXRow_SSE2;
@@ -2248,19 +2826,24 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasNEON)) {
     SobelXRow = SobelXRow_NEON;
   }
+#endif
+#if defined(HAS_SOBELXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXRow = SobelXRow_MSA;
+  }
 #endif
   {
     // 3 rows with edges before/after.
     const int kRowSize = (width + kEdge + 31) & ~31;
     align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
-    uint8* row_sobelx = rows;
-    uint8* row_sobely = rows + kRowSize;
-    uint8* row_y = rows + kRowSize * 2;
+    uint8_t* row_sobelx = rows;
+    uint8_t* row_sobely = rows + kRowSize;
+    uint8_t* row_y = rows + kRowSize * 2;
 
     // Convert first row.
-    uint8* row_y0 = row_y + kEdge;
-    uint8* row_y1 = row_y0 + kRowSize;
-    uint8* row_y2 = row_y1 + kRowSize;
+    uint8_t* row_y0 = row_y + kEdge;
+    uint8_t* row_y1 = row_y0 + kRowSize;
+    uint8_t* row_y2 = row_y1 + kRowSize;
     ARGBToYJRow(src_argb, row_y0, width);
     row_y0[-1] = row_y0[0];
     memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
@@ -2284,7 +2867,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
 
       // Cycle thru circular queue of 3 row_y buffers.
       {
-        uint8* row_yt = row_y0;
+        uint8_t* row_yt = row_y0;
         row_y0 = row_y1;
         row_y1 = row_y2;
         row_y2 = row_yt;
@@ -2299,11 +2882,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) = SobelRow_C;
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
+  void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                   uint8_t* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelRow = SobelRow_Any_SSE2;
@@ -2319,6 +2905,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
       SobelRow = SobelRow_NEON;
     }
   }
+#endif
+#if defined(HAS_SOBELROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelRow = SobelRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_MSA;
+    }
+  }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelRow);
@@ -2326,11 +2920,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height) {
-  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_, int width) = SobelToPlaneRow_C;
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height) {
+  void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                          uint8_t* dst_, int width) = SobelToPlaneRow_C;
 #if defined(HAS_SOBELTOPLANEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
@@ -2347,18 +2944,29 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                      width, height, SobelToPlaneRow);
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SobelToPlaneRow = SobelToPlaneRow_MSA;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+                      height, SobelToPlaneRow);
 }
 
 // SobelXY ARGB effect.
 // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
-  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) = SobelXYRow_C;
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
+  void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                     uint8_t* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXYRow = SobelXYRow_Any_SSE2;
@@ -2374,6 +2982,14 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
       SobelXYRow = SobelXYRow_NEON;
     }
   }
+#endif
+#if defined(HAS_SOBELXYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXYRow = SobelXYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_MSA;
+    }
+  }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelXYRow);
@@ -2381,26 +2997,27 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
 
 // Apply a 4x4 polynomial to each ARGB pixel.
 LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
                    const float* poly,
-                   int width, int height) {
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBPolynomialRow)(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) = ARGBPolynomialRow_C;
+  void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                            const float* poly, int width) = ARGBPolynomialRow_C;
   if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2425,28 +3042,132 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height) {
+  int y;
+  void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale,
+                       int width) = HalfFloatRow_C;
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  src_stride_y >>= 1;
+  dst_stride_y >>= 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    HalfFloatRow = HalfFloatRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = HalfFloatRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HalfFloatRow = HalfFloatRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    HalfFloatRow = HalfFloatRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      HalfFloatRow = HalfFloatRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    HalfFloatRow(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+  void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
+                         int width) = ByteToFloatRow_C;
+  if (!src_y || !dst_y || width <= 0) {
+    return -1;
+  }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ByteToFloatRow = ByteToFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ByteToFloatRow = ByteToFloatRow_NEON;
+    }
+  }
+#endif
+
+  ByteToFloatRow(src_y, dst_y, scale, width);
+  return 0;
+}
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma,
-                       int width, int height) {
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
-      int width, const uint8* luma, const uint32 lumacoeff) =
-      ARGBLumaColorTableRow_C;
+  void (*ARGBLumaColorTableRow)(
+      const uint8_t* src_argb, uint8_t* dst_argb, int width,
+      const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C;
   if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2467,12 +3188,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
 
 // Copy Alpha from one ARGB image to another.
 LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBCopyAlphaRow_C;
+  void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBCopyAlphaRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2483,8 +3207,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2516,55 +3239,73 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
 
 // Extract just the alpha channel from ARGB.
 LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
-                     uint8* dst_a, int dst_stride,
-                     int width, int height) {
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height) {
   if (!src_argb || !dst_a || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb += (height - 1) * src_stride;
-    src_stride = -src_stride;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride == width * 4 && dst_stride == width) {
+  if (src_stride_argb == width * 4 && dst_stride_a == width) {
     width *= height;
     height = 1;
-    src_stride = dst_stride = 0;
+    src_stride_argb = dst_stride_a = 0;
   }
-  void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
-      ARGBExtractAlphaRow_C;
+  void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
+                              int width) = ARGBExtractAlphaRow_C;
 #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
                                                : ARGBExtractAlphaRow_Any_SSE2;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+                                                : ARGBExtractAlphaRow_Any_AVX2;
+  }
+#endif
 #if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
                                                 : ARGBExtractAlphaRow_Any_NEON;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+                                                : ARGBExtractAlphaRow_Any_MSA;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBExtractAlphaRow(src_argb, dst_a, width);
-    src_argb += src_stride;
-    dst_a += dst_stride;
+    src_argb += src_stride_argb;
+    dst_a += dst_stride_a;
   }
   return 0;
 }
 
 // Copy a planar Y channel to the alpha channel of a destination ARGB image.
 LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height) {
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
   int y;
-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      ARGBCopyYToAlphaRow_C;
+  void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb,
+                              int width) = ARGBCopyYToAlphaRow_C;
   if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2575,8 +3316,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -2610,20 +3350,22 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
 // directly. A SplitUVRow_Odd function could copy the remaining chroma.
 
 LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_yuy2 ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2656,6 +3398,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2680,6 +3430,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
@@ -2708,20 +3466,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
 }
 
 LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_uyvy ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2754,6 +3514,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2778,6 +3546,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc
index 01ea5c4074..f2bed85b75 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc
@@ -10,8 +10,8 @@
 
 #include "libyuv/rotate.h"
 
-#include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate_row.h"
 #include "libyuv/row.h"
@@ -22,12 +22,20 @@ extern "C" {
 #endif
 
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i = height;
-  void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
+                        int dst_stride, int width) = TransposeWx16_C;
+#else
+  void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
+                       int dst_stride, int width) = TransposeWx8_C;
+#endif
 #if defined(HAS_TRANSPOSEWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeWx8 = TransposeWx8_NEON;
@@ -49,24 +57,32 @@ void TransposePlane(const uint8* src, int src_stride,
     }
   }
 #endif
-#if defined(HAS_TRANSPOSEWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-      TransposeWx8 = TransposeWx8_Fast_DSPR2;
-    } else {
-      TransposeWx8 = TransposeWx8_DSPR2;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeWx16 = TransposeWx16_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx16 = TransposeWx16_MSA;
     }
   }
 #endif
 
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  // Work across the source in 16x16 tiles
+  while (i >= 16) {
+    TransposeWx16(src, src_stride, dst, dst_stride, width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst += 16;               // Move over 16 columns.
+    i -= 16;
+  }
+#else
   // Work across the source in 8x8 tiles
   while (i >= 8) {
     TransposeWx8(src, src_stride, dst, dst_stride, width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst += 8;                 // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst += 8;               // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
     TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
@@ -74,9 +90,12 @@ void TransposePlane(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
   // Rotate by 90 is a transpose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -86,9 +105,12 @@ void RotatePlane90(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Rotate by 270 is a transpose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -98,17 +120,20 @@ void RotatePlane270(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src + src_stride * (height - 1);
+  uint8_t* dst_bot = dst + dst_stride * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
 #if defined(HAS_MIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MirrorRow = MirrorRow_Any_NEON;
@@ -133,12 +158,12 @@ void RotatePlane180(const uint8* src, int src_stride,
     }
   }
 #endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
-    MirrorRow = MirrorRow_DSPR2;
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
@@ -161,11 +186,6 @@ void RotatePlane180(const uint8* src, int src_stride,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
@@ -181,15 +201,24 @@ void RotatePlane180(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i = height;
-  void (*TransposeUVWx8)(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                          int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#else
+  void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
                          int width) = TransposeUVWx8_C;
+#endif
 #if defined(HAS_TRANSPOSEUVWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -203,72 +232,90 @@ void TransposeUV(const uint8* src, int src_stride,
     }
   }
 #endif
-#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    TransposeUVWx8 = TransposeUVWx8_DSPR2;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_MSA;
+    }
   }
 #endif
 
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  // Work through the source in 8x8 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 8 columns.
+    dst_b += 16;             // Move over 8 columns.
+    i -= 16;
+  }
+#else
   // Work through the source in 8x8 tiles.
   while (i >= 8) {
-    TransposeUVWx8(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
+    TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                    width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst_a += 8;               // Move over 8 columns.
-    dst_b += 8;               // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst_a += 8;             // Move over 8 columns.
+    dst_b += 8;             // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
-    TransposeUVWxH_C(src, src_stride,
-                     dst_a, dst_stride_a,
-                     dst_b, dst_stride_b,
+    TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                      width, i);
   }
 }
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height) {
+void RotateUV90(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst_a,
+                int dst_stride_a,
+                uint8_t* dst_b,
+                int dst_stride_b,
+                int width,
+                int height) {
   src += src_stride * (height - 1);
   src_stride = -src_stride;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV270(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   dst_a += dst_stride_a * (width - 1);
   dst_b += dst_stride_b * (width - 1);
   dst_stride_a = -dst_stride_a;
   dst_stride_b = -dst_stride_b;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 // Rotate 180 is a horizontal and vertical flip.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV180(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i;
-  void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
-      MirrorUVRow_C;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+                      int width) = MirrorUVRow_C;
 #if defined(HAS_MIRRORUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     MirrorUVRow = MirrorUVRow_NEON;
@@ -279,10 +326,9 @@ void RotateUV180(const uint8* src, int src_stride,
     MirrorUVRow = MirrorUVRow_SSSE3;
   }
 #endif
-#if defined(HAS_MIRRORUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    MirrorUVRow = MirrorUVRow_DSPR2;
+#if defined(HAS_MIRRORUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+    MirrorUVRow = MirrorUVRow_MSA;
   }
 #endif
 
@@ -298,9 +344,12 @@ void RotateUV180(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int width, int height,
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
                 enum RotationMode mode) {
   if (!src || width <= 0 || height == 0 || !dst) {
     return -1;
@@ -316,24 +365,16 @@ int RotatePlane(const uint8* src, int src_stride,
   switch (mode) {
     case kRotate0:
       // copy frame
-      CopyPlane(src, src_stride,
-                dst, dst_stride,
-                width, height);
+      CopyPlane(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate90:
-      RotatePlane90(src, src_stride,
-                    dst, dst_stride,
-                    width, height);
+      RotatePlane90(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate270:
-      RotatePlane270(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane270(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate180:
-      RotatePlane180(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane180(src, src_stride, dst, dst_stride, width, height);
       return 0;
     default:
       break;
@@ -342,18 +383,25 @@ int RotatePlane(const uint8* src, int src_stride,
 }
 
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height,
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
                enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
 
@@ -372,45 +420,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
   switch (mode) {
     case kRotate0:
       // copy frame
-      return I420Copy(src_y, src_stride_y,
-                      src_u, src_stride_u,
-                      src_v, src_stride_v,
-                      dst_y, dst_stride_y,
-                      dst_u, dst_stride_u,
-                      dst_v, dst_stride_v,
-                      width, height);
+      return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                      src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v, width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotatePlane90(src_u, src_stride_u,
-                    dst_u, dst_stride_u,
-                    halfwidth, halfheight);
-      RotatePlane90(src_v, src_stride_v,
-                    dst_v, dst_stride_v,
-                    halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                    halfheight);
+      RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                    halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane270(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane270(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane180(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane180(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     default:
       break;
@@ -419,17 +451,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
 }
 
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height,
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
                      enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+      !dst_v) {
     return -1;
   }
 
@@ -446,38 +484,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
   switch (mode) {
     case kRotate0:
       // copy frame
-      return NV12ToI420(src_y, src_stride_y,
-                        src_uv, src_stride_uv,
-                        dst_y, dst_stride_y,
-                        dst_u, dst_stride_u,
-                        dst_v, dst_stride_v,
+      return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+                        dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                         width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotateUV90(src_uv, src_stride_uv,
-                 dst_u, dst_stride_u,
-                 dst_v, dst_stride_v,
-                 halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                 dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV270(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV180(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     default:
       break;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc
index 31a74c3155..c2752e6222 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc
@@ -18,16 +18,16 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                 uint8* dst, int dst_stride, int width) {                      \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
-      }                                                                        \
-      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
-    }
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                        \
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst,              \
+               int dst_stride, int width) {                                   \
+    int r = width & MASK;                                                     \
+    int n = width - r;                                                        \
+    if (n > 0) {                                                              \
+      TPOS_SIMD(src, src_stride, dst, dst_stride, n);                         \
+    }                                                                         \
+    TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+  }
 
 #ifdef HAS_TRANSPOSEWX8_NEON
 TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
@@ -38,25 +38,23 @@ TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
 #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
 #endif
-#ifdef HAS_TRANSPOSEWX8_DSPR2
-TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
 #endif
 #undef TANY
 
 #define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                uint8* dst_a, int dst_stride_a,                                \
-                uint8* dst_b, int dst_stride_b, int width) {                   \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
-                  n);                                                          \
-      }                                                                        \
-      TransposeUVWx8_C(src + n * 2, src_stride,                                \
-                       dst_a + n * dst_stride_a, dst_stride_a,                 \
-                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
-    }
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a,             \
+               int dst_stride_a, uint8_t* dst_b, int dst_stride_b,             \
+               int width) {                                                    \
+    int r = width & MASK;                                                      \
+    int n = width - r;                                                         \
+    if (n > 0) {                                                               \
+      TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+    }                                                                          \
+    TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a,        \
+                     dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+  }
 
 #ifdef HAS_TRANSPOSEUVWX8_NEON
 TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
@@ -64,8 +62,8 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
 #ifdef HAS_TRANSPOSEUVWX8_SSE2
 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
 #endif
-#ifdef HAS_TRANSPOSEUVWX8_DSPR2
-TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
 #endif
 #undef TUVANY
 
@@ -73,8 +71,3 @@ TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc
index 787c0ad1be..5a6e05376f 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc
@@ -10,90 +10,106 @@
 
 #include "libyuv/rotate.h"
 
-#include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// ARGBScale has a function to copy pixels to a row, striding each source
-// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx, uint8* dst_ptr, int dst_width);
-
-static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride, int width, int height) {
+static void ARGBTranspose(const uint8_t* src_argb,
+                          int src_stride_argb,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          int width,
+                          int height) {
   int i;
-  int src_pixel_step = src_stride >> 2;
-  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
-      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+  int src_pixel_step = src_stride_argb >> 2;
+  void (*ScaleARGBRowDownEven)(
+      const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
+      uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+    }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+    }
   }
 #endif
 
   for (i = 0; i < width; ++i) {  // column of source to row of dest.
-    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
-    dst += dst_stride;
-    src += 4;
+    ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+    dst_argb += dst_stride_argb;
+    src_argb += 4;
   }
 }
 
-void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate90(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   // Rotate by 90 is a ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  src_argb += src_stride_argb * (height - 1);
+  src_stride_argb = -src_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
 }
 
-void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate270(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   // Rotate by 270 is a ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
-  dst += dst_stride * (width - 1);
-  dst_stride = -dst_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  dst_argb += dst_stride_argb * (width - 1);
+  dst_stride_argb = -dst_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
 }
 
-void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate180(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width * 4);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+  uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
       ARGBMirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      CopyRow_C;
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
@@ -118,6 +134,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -138,28 +162,27 @@ void ARGBRotate180(const uint8* src, int src_stride,
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
-    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
-    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    ARGBMirrorRow(src_argb, row, width);      // Mirror first row into a buffer
+    ARGBMirrorRow(src_bot, dst_argb, width);  // Mirror last row into first row
     CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
-    src += src_stride;
-    dst += dst_stride;
-    src_bot -= src_stride;
-    dst_bot -= dst_stride;
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+    src_bot -= src_stride_argb;
+    dst_bot -= dst_stride_argb;
   }
   free_aligned_buffer_64(row);
 }
 
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb, int width, int height,
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height,
                enum RotationMode mode) {
   if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
@@ -175,23 +198,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb,
   switch (mode) {
     case kRotate0:
       // copy frame
-      return ARGBCopy(src_argb, src_stride_argb,
-                      dst_argb, dst_stride_argb,
+      return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height);
     case kRotate90:
-      ARGBRotate90(src_argb, src_stride_argb,
-                   dst_argb, dst_stride_argb,
-                   width, height);
+      ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                   height);
       return 0;
     case kRotate270:
-      ARGBRotate270(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
+      ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                    height);
       return 0;
     case kRotate180:
-      ARGBRotate180(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
+      ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                    height);
       return 0;
     default:
       break;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc
index b33a9a0c6e..ff212adebc 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc
@@ -8,16 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width) {
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst[0] = src[0 * src_stride];
@@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride,
   }
 }
 
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width) {
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst_a[0] = src[0 * src_stride + 0];
@@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride,
   }
 }
 
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i;
   for (i = 0; i < width; ++i) {
     int j;
@@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride,
   }
 }
 
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height) {
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height) {
   int i;
   for (i = 0; i < width * 2; i += 2) {
     int j;
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc
index cbe870caa7..04e19e29ee 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -22,342 +22,348 @@ extern "C" {
 
 // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
 #if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movq       (%0),%%xmm0                      \n"
+      "movq       (%0,%3),%%xmm1                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "movq       (%0),%%xmm2                      \n"
+      "movdqa     %%xmm0,%%xmm1                    \n"
+      "palignr    $0x8,%%xmm1,%%xmm1               \n"
+      "movq       (%0,%3),%%xmm3                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm3                    \n"
+      "movq       (%0),%%xmm4                      \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "movq       (%0,%3),%%xmm5                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "movdqa     %%xmm4,%%xmm5                    \n"
+      "movq       (%0),%%xmm6                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       (%0,%3),%%xmm7                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "neg        %3                               \n"
+      "movdqa     %%xmm6,%%xmm7                    \n"
+      "lea        0x8(%0,%3,8),%0                  \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "neg        %3                               \n"
+      // Second round of bit swap.
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm0,%%xmm2                    \n"
+      "movdqa     %%xmm1,%%xmm3                    \n"
+      "palignr    $0x8,%%xmm2,%%xmm2               \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm4,%%xmm6                    \n"
+      "movdqa     %%xmm5,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movq       %%xmm0,(%1)                      \n"
+      "movdqa     %%xmm0,%%xmm4                    \n"
+      "palignr    $0x8,%%xmm4,%%xmm4               \n"
+      "movq       %%xmm4,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm6                    \n"
+      "movq       %%xmm2,(%1)                      \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movq       %%xmm6,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm1,%%xmm5                    \n"
+      "movq       %%xmm1,(%1)                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       %%xmm5,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movq       %%xmm3,(%1)                      \n"
+      "movdqa     %%xmm3,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "sub        $0x8,%2                          \n"
+      "movq       %%xmm7,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
 
 // Transpose 16x8. 64 bit
 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm9                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "palignr    $0x8,%%xmm9,%%xmm9               \n"
-    "movdqu     (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm10                   \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm10                   \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movdqa     %%xmm10,%%xmm11                  \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "movdqu     (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm12                   \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm12                   \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movdqa     %%xmm12,%%xmm13                  \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movdqu     (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm14                   \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "punpckhbw  %%xmm7,%%xmm14                   \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "movdqa     %%xmm14,%%xmm15                  \n"
-    "lea        0x10(%0,%3,8),%0                 \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "punpcklwd  %%xmm10,%%xmm8                   \n"
-    "punpcklwd  %%xmm11,%%xmm9                   \n"
-    "movdqa     %%xmm8,%%xmm10                   \n"
-    "movdqa     %%xmm9,%%xmm11                   \n"
-    "palignr    $0x8,%%xmm10,%%xmm10             \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "punpcklwd  %%xmm14,%%xmm12                  \n"
-    "punpcklwd  %%xmm15,%%xmm13                  \n"
-    "movdqa     %%xmm12,%%xmm14                  \n"
-    "movdqa     %%xmm13,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm12,%%xmm8                   \n"
-    "movq       %%xmm8,(%1)                      \n"
-    "movdqa     %%xmm8,%%xmm12                   \n"
-    "palignr    $0x8,%%xmm12,%%xmm12             \n"
-    "movq       %%xmm12,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm14,%%xmm10                  \n"
-    "movdqa     %%xmm10,%%xmm14                  \n"
-    "movq       %%xmm10,(%1)                     \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "punpckldq  %%xmm13,%%xmm9                   \n"
-    "movq       %%xmm14,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm9,%%xmm13                   \n"
-    "movq       %%xmm9,(%1)                      \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movq       %%xmm13,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm15,%%xmm11                  \n"
-    "movq       %%xmm11,(%1)                     \n"
-    "movdqa     %%xmm11,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "sub        $0x10,%2                         \n"
-    "movq       %%xmm15,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-  );
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movdqu     (%0),%%xmm0                      \n"
+      "movdqu     (%0,%3),%%xmm1                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "punpckhbw  %%xmm1,%%xmm8                    \n"
+      "movdqu     (%0),%%xmm2                      \n"
+      "movdqa     %%xmm0,%%xmm1                    \n"
+      "movdqa     %%xmm8,%%xmm9                    \n"
+      "palignr    $0x8,%%xmm1,%%xmm1               \n"
+      "palignr    $0x8,%%xmm9,%%xmm9               \n"
+      "movdqu     (%0,%3),%%xmm3                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm2,%%xmm10                   \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "punpckhbw  %%xmm3,%%xmm10                   \n"
+      "movdqa     %%xmm2,%%xmm3                    \n"
+      "movdqa     %%xmm10,%%xmm11                  \n"
+      "movdqu     (%0),%%xmm4                      \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "palignr    $0x8,%%xmm11,%%xmm11             \n"
+      "movdqu     (%0,%3),%%xmm5                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm4,%%xmm12                   \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "punpckhbw  %%xmm5,%%xmm12                   \n"
+      "movdqa     %%xmm4,%%xmm5                    \n"
+      "movdqa     %%xmm12,%%xmm13                  \n"
+      "movdqu     (%0),%%xmm6                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "palignr    $0x8,%%xmm13,%%xmm13             \n"
+      "movdqu     (%0,%3),%%xmm7                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm6,%%xmm14                   \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "punpckhbw  %%xmm7,%%xmm14                   \n"
+      "neg        %3                               \n"
+      "movdqa     %%xmm6,%%xmm7                    \n"
+      "movdqa     %%xmm14,%%xmm15                  \n"
+      "lea        0x10(%0,%3,8),%0                 \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      "neg        %3                               \n"
+      // Second round of bit swap.
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm0,%%xmm2                    \n"
+      "movdqa     %%xmm1,%%xmm3                    \n"
+      "palignr    $0x8,%%xmm2,%%xmm2               \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm4,%%xmm6                    \n"
+      "movdqa     %%xmm5,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "punpcklwd  %%xmm10,%%xmm8                   \n"
+      "punpcklwd  %%xmm11,%%xmm9                   \n"
+      "movdqa     %%xmm8,%%xmm10                   \n"
+      "movdqa     %%xmm9,%%xmm11                   \n"
+      "palignr    $0x8,%%xmm10,%%xmm10             \n"
+      "palignr    $0x8,%%xmm11,%%xmm11             \n"
+      "punpcklwd  %%xmm14,%%xmm12                  \n"
+      "punpcklwd  %%xmm15,%%xmm13                  \n"
+      "movdqa     %%xmm12,%%xmm14                  \n"
+      "movdqa     %%xmm13,%%xmm15                  \n"
+      "palignr    $0x8,%%xmm14,%%xmm14             \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movq       %%xmm0,(%1)                      \n"
+      "movdqa     %%xmm0,%%xmm4                    \n"
+      "palignr    $0x8,%%xmm4,%%xmm4               \n"
+      "movq       %%xmm4,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm6                    \n"
+      "movq       %%xmm2,(%1)                      \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movq       %%xmm6,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm1,%%xmm5                    \n"
+      "movq       %%xmm1,(%1)                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       %%xmm5,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movq       %%xmm3,(%1)                      \n"
+      "movdqa     %%xmm3,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "movq       %%xmm7,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm12,%%xmm8                   \n"
+      "movq       %%xmm8,(%1)                      \n"
+      "movdqa     %%xmm8,%%xmm12                   \n"
+      "palignr    $0x8,%%xmm12,%%xmm12             \n"
+      "movq       %%xmm12,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm14,%%xmm10                  \n"
+      "movdqa     %%xmm10,%%xmm14                  \n"
+      "movq       %%xmm10,(%1)                     \n"
+      "palignr    $0x8,%%xmm14,%%xmm14             \n"
+      "punpckldq  %%xmm13,%%xmm9                   \n"
+      "movq       %%xmm14,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm9,%%xmm13                   \n"
+      "movq       %%xmm9,(%1)                      \n"
+      "palignr    $0x8,%%xmm13,%%xmm13             \n"
+      "movq       %%xmm13,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm15,%%xmm11                  \n"
+      "movq       %%xmm11,(%1)                     \n"
+      "movdqa     %%xmm11,%%xmm15                  \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      "sub        $0x10,%2                         \n"
+      "movq       %%xmm15,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+        "xmm15");
 }
 #endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
 
 // Transpose UV 8x8.  64 bit.
 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%4),%%xmm1                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm1                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqu     (%0,%4),%%xmm3                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm3                    \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "movdqu     (%0,%4),%%xmm5                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm5                    \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "movdqu     (%0,%4),%%xmm7                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm8                    \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %4                               \n"
-    "lea        0x10(%0,%4,8),%0                 \n"
-    "punpckhbw  %%xmm7,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm7                    \n"
-    "neg        %4                               \n"
-     // Second round of bit swap.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "movdqa     %%xmm1,%%xmm9                    \n"
-    "punpckhwd  %%xmm2,%%xmm8                    \n"
-    "punpckhwd  %%xmm3,%%xmm9                    \n"
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm2                    \n"
-    "movdqa     %%xmm9,%%xmm3                    \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "movdqa     %%xmm5,%%xmm9                    \n"
-    "punpckhwd  %%xmm6,%%xmm8                    \n"
-    "punpckhwd  %%xmm7,%%xmm9                    \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm8,%%xmm6                    \n"
-    "movdqa     %%xmm9,%%xmm7                    \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-    "punpckhdq  %%xmm4,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movlpd     %%xmm2,(%1)                      \n"
-    "movhpd     %%xmm2,(%2)                      \n"
-    "punpckhdq  %%xmm6,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm1,%%xmm8                    \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movlpd     %%xmm1,(%1)                      \n"
-    "movhpd     %%xmm1,(%2)                      \n"
-    "punpckhdq  %%xmm5,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm3,%%xmm8                    \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movlpd     %%xmm3,(%1)                      \n"
-    "movhpd     %%xmm3,(%2)                      \n"
-    "punpckhdq  %%xmm7,%%xmm8                    \n"
-    "sub        $0x8,%3                          \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst_a),  // %1
-      "+r"(dst_b),  // %2
-      "+r"(width)   // %3
-    : "r"((intptr_t)(src_stride)),    // %4
-      "r"((intptr_t)(dst_stride_a)),  // %5
-      "r"((intptr_t)(dst_stride_b))   // %6
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9"
-  );
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movdqu     (%0),%%xmm0                      \n"
+      "movdqu     (%0,%4),%%xmm1                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "punpckhbw  %%xmm1,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm1                    \n"
+      "movdqu     (%0),%%xmm2                      \n"
+      "movdqu     (%0,%4),%%xmm3                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm2,%%xmm8                    \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "punpckhbw  %%xmm3,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm3                    \n"
+      "movdqu     (%0),%%xmm4                      \n"
+      "movdqu     (%0,%4),%%xmm5                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm4,%%xmm8                    \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "punpckhbw  %%xmm5,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm5                    \n"
+      "movdqu     (%0),%%xmm6                      \n"
+      "movdqu     (%0,%4),%%xmm7                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm6,%%xmm8                    \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "neg        %4                               \n"
+      "lea        0x10(%0,%4,8),%0                 \n"
+      "punpckhbw  %%xmm7,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm7                    \n"
+      "neg        %4                               \n"
+      // Second round of bit swap.
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "movdqa     %%xmm1,%%xmm9                    \n"
+      "punpckhwd  %%xmm2,%%xmm8                    \n"
+      "punpckhwd  %%xmm3,%%xmm9                    \n"
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm8,%%xmm2                    \n"
+      "movdqa     %%xmm9,%%xmm3                    \n"
+      "movdqa     %%xmm4,%%xmm8                    \n"
+      "movdqa     %%xmm5,%%xmm9                    \n"
+      "punpckhwd  %%xmm6,%%xmm8                    \n"
+      "punpckhwd  %%xmm7,%%xmm9                    \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm8,%%xmm6                    \n"
+      "movdqa     %%xmm9,%%xmm7                    \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+      "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+      "punpckhdq  %%xmm4,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm2,%%xmm8                    \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movlpd     %%xmm2,(%1)                      \n"
+      "movhpd     %%xmm2,(%2)                      \n"
+      "punpckhdq  %%xmm6,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm1,%%xmm8                    \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movlpd     %%xmm1,(%1)                      \n"
+      "movhpd     %%xmm1,(%2)                      \n"
+      "punpckhdq  %%xmm5,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm3,%%xmm8                    \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movlpd     %%xmm3,(%1)                      \n"
+      "movhpd     %%xmm3,(%2)                      \n"
+      "punpckhdq  %%xmm7,%%xmm8                    \n"
+      "sub        $0x8,%3                          \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                      // %0
+        "+r"(dst_a),                    // %1
+        "+r"(dst_b),                    // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride)),    // %4
+        "r"((intptr_t)(dst_stride_a)),  // %5
+        "r"((intptr_t)(dst_stride_b))   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9");
 }
 #endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
 #endif  // defined(__x86_64__) || defined(__i386__)
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc
deleted file mode 100644
index 1e8ce25197..0000000000
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc
+++ /dev/null
@@ -1,484 +0,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-   __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
-      "addu             $t3, $t2, %[src_stride]          \n"
-      "addu             $t5, $t4, %[src_stride]          \n"
-      "addu             $t6, $t2, $t4                    \n"
-      "andi             $t0, %[dst], 0x3                 \n"
-      "andi             $t1, %[dst_stride], 0x3          \n"
-      "or               $t0, $t0, $t1                    \n"
-      "bnez             $t0, 11f                         \n"
-      " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-    "1:                                                  \n"
-      "lbu              $t0, 0(%[src])                   \n"
-      "lbux             $t1, %[src_stride](%[src])       \n"
-      "lbux             $t8, $t2(%[src])                 \n"
-      "lbux             $t9, $t3(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s0, $t8, $t0                    \n"
-      "lbux             $t0, $t4(%[src])                 \n"
-      "lbux             $t1, $t5(%[src])                 \n"
-      "lbux             $t8, $t6(%[src])                 \n"
-      "lbux             $t9, $t7(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s1, $t8, $t0                    \n"
-      "sw               $s0, 0(%[dst])                   \n"
-      "addiu            %[width], -1                     \n"
-      "addiu            %[src], 1                        \n"
-      "sw               $s1, 4(%[dst])                   \n"
-      "bnez             %[width], 1b                     \n"
-      " addu            %[dst], %[dst], %[dst_stride]    \n"
-      "b                2f                               \n"
-//dst + dst_stride unaligned
-   "11:                                                  \n"
-      "lbu              $t0, 0(%[src])                   \n"
-      "lbux             $t1, %[src_stride](%[src])       \n"
-      "lbux             $t8, $t2(%[src])                 \n"
-      "lbux             $t9, $t3(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s0, $t8, $t0                    \n"
-      "lbux             $t0, $t4(%[src])                 \n"
-      "lbux             $t1, $t5(%[src])                 \n"
-      "lbux             $t8, $t6(%[src])                 \n"
-      "lbux             $t9, $t7(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s1, $t8, $t0                    \n"
-      "swr              $s0, 0(%[dst])                   \n"
-      "swl              $s0, 3(%[dst])                   \n"
-      "addiu            %[width], -1                     \n"
-      "addiu            %[src], 1                        \n"
-      "swr              $s1, 4(%[dst])                   \n"
-      "swl              $s1, 7(%[dst])                   \n"
-      "bnez             %[width], 11b                    \n"
-       "addu             %[dst], %[dst], %[dst_stride]   \n"
-    "2:                                                  \n"
-      ".set pop                                          \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1",  "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1"
-  );
-}
-
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  __asm__ __volatile__ (
-      ".set noat                                         \n"
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "beqz             %[width], 2f                     \n"
-      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
-      "addu             $t3, $t2, %[src_stride]          \n"
-      "addu             $t5, $t4, %[src_stride]          \n"
-      "addu             $t6, $t2, $t4                    \n"
-
-      "srl              $AT, %[width], 0x2               \n"
-      "andi             $t0, %[dst], 0x3                 \n"
-      "andi             $t1, %[dst_stride], 0x3          \n"
-      "or               $t0, $t0, $t1                    \n"
-      "bnez             $t0, 11f                         \n"
-      " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-      "1:                                                \n"
-      "lw               $t0, 0(%[src])                   \n"
-      "lwx              $t1, %[src_stride](%[src])       \n"
-      "lwx              $t8, $t2(%[src])                 \n"
-      "lwx              $t9, $t3(%[src])                 \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
-
-      "precr.qb.ph     $s4, $s1, $s0                     \n"
-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
-      "precr.qb.ph     $s6, $s3, $s2                     \n"
-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
-
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
-
-      "lwx              $t0, $t4(%[src])                 \n"
-      "lwx              $t1, $t5(%[src])                 \n"
-      "lwx              $t8, $t6(%[src])                 \n"
-      "lwx              $t9, $t7(%[src])                 \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
-
-      "precr.qb.ph     $t0, $s1, $s0                     \n"
-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
-      "precr.qb.ph     $t8, $s3, $s2                     \n"
-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
-
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
-
-      "addu            $s0, %[dst], %[dst_stride]        \n"
-      "addu            $s1, $s0, %[dst_stride]           \n"
-      "addu            $s2, $s1, %[dst_stride]           \n"
-
-      "sw              $s4, 0(%[dst])                    \n"
-      "sw              $t0, 4(%[dst])                    \n"
-      "sw              $s6, 0($s0)                       \n"
-      "sw              $t8, 4($s0)                       \n"
-      "sw              $s5, 0($s1)                       \n"
-      "sw              $t1, 4($s1)                       \n"
-      "sw              $s7, 0($s2)                       \n"
-      "sw              $t9, 4($s2)                       \n"
-
-      "addiu            $AT, -1                          \n"
-      "addiu            %[src], 4                        \n"
-
-      "bnez             $AT, 1b                          \n"
-      " addu            %[dst], $s2, %[dst_stride]       \n"
-      "b                2f                               \n"
-//dst + dst_stride unaligned
-      "11:                                               \n"
-      "lw               $t0, 0(%[src])                   \n"
-      "lwx              $t1, %[src_stride](%[src])       \n"
-      "lwx              $t8, $t2(%[src])                 \n"
-      "lwx              $t9, $t3(%[src])                 \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
-
-      "precr.qb.ph     $s4, $s1, $s0                     \n"
-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
-      "precr.qb.ph     $s6, $s3, $s2                     \n"
-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
-
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
-
-      "lwx              $t0, $t4(%[src])                 \n"
-      "lwx              $t1, $t5(%[src])                 \n"
-      "lwx              $t8, $t6(%[src])                 \n"
-      "lwx              $t9, $t7(%[src])                 \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
-
-      "precr.qb.ph     $t0, $s1, $s0                     \n"
-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
-      "precr.qb.ph     $t8, $s3, $s2                     \n"
-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
-
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
-
-      "addu            $s0, %[dst], %[dst_stride]        \n"
-      "addu            $s1, $s0, %[dst_stride]           \n"
-      "addu            $s2, $s1, %[dst_stride]           \n"
-
-      "swr              $s4, 0(%[dst])                   \n"
-      "swl              $s4, 3(%[dst])                   \n"
-      "swr              $t0, 4(%[dst])                   \n"
-      "swl              $t0, 7(%[dst])                   \n"
-      "swr              $s6, 0($s0)                      \n"
-      "swl              $s6, 3($s0)                      \n"
-      "swr              $t8, 4($s0)                      \n"
-      "swl              $t8, 7($s0)                      \n"
-      "swr              $s5, 0($s1)                      \n"
-      "swl              $s5, 3($s1)                      \n"
-      "swr              $t1, 4($s1)                      \n"
-      "swl              $t1, 7($s1)                      \n"
-      "swr              $s7, 0($s2)                      \n"
-      "swl              $s7, 3($s2)                      \n"
-      "swr              $t9, 4($s2)                      \n"
-      "swl              $t9, 7($s2)                      \n"
-
-      "addiu            $AT, -1                          \n"
-      "addiu            %[src], 4                        \n"
-
-      "bnez             $AT, 11b                         \n"
-      " addu            %[dst], $s2, %[dst_stride]       \n"
-      "2:                                                \n"
-      ".set pop                                          \n"
-      ".set at                                           \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
-  );
-}
-
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                          uint8* dst_a, int dst_stride_a,
-                          uint8* dst_b, int dst_stride_b,
-                          int width) {
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "beqz            %[width], 2f                      \n"
-      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
-      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
-      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
-      "addu            $t3, $t2, %[src_stride]           \n"
-      "addu            $t5, $t4, %[src_stride]           \n"
-      "addu            $t6, $t2, $t4                     \n"
-      "subu            $t7, $t9, %[src_stride]           \n"
-      "srl             $t1, %[width], 1                  \n"
-
-// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
-      "andi            $t0, %[dst_a], 0x3                \n"
-      "andi            $t8, %[dst_b], 0x3                \n"
-      "or              $t0, $t0, $t8                     \n"
-      "andi            $t8, %[dst_stride_a], 0x3         \n"
-      "andi            $s5, %[dst_stride_b], 0x3         \n"
-      "or              $t8, $t8, $s5                     \n"
-      "or              $t0, $t0, $t8                     \n"
-      "bnez            $t0, 11f                          \n"
-      " nop                                              \n"
-// dst + dst_stride word aligned (both, a & b dst addresses)
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
-
-      "sw              $s3, 0($s5)                       \n"
-      "sw              $s4, 0($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
-
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
-      "sw              $s3, 0(%[dst_a])                  \n"
-      "sw              $s4, 0(%[dst_b])                  \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
-      "sw              $s3, 4($s5)                       \n"
-      "sw              $s4, 4($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
-
-      "addiu           %[src], 4                         \n"
-      "addiu           $t1, -1                           \n"
-      "sll             $t0, %[dst_stride_a], 1           \n"
-      "sll             $t8, %[dst_stride_b], 1           \n"
-      "sw              $s3, 4(%[dst_a])                  \n"
-      "sw              $s4, 4(%[dst_b])                  \n"
-      "addu            %[dst_a], %[dst_a], $t0           \n"
-      "bnez            $t1, 1b                           \n"
-      " addu           %[dst_b], %[dst_b], $t8           \n"
-      "b               2f                                \n"
-      " nop                                              \n"
-
-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
-   "11:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
-
-      "swr             $s3, 0($s5)                       \n"
-      "swl             $s3, 3($s5)                       \n"
-      "swr             $s4, 0($s6)                       \n"
-      "swl             $s4, 3($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
-
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
-      "swr             $s3, 0(%[dst_a])                  \n"
-      "swl             $s3, 3(%[dst_a])                  \n"
-      "swr             $s4, 0(%[dst_b])                  \n"
-      "swl             $s4, 3(%[dst_b])                  \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
-
-      "swr             $s3, 4($s5)                       \n"
-      "swl             $s3, 7($s5)                       \n"
-      "swr             $s4, 4($s6)                       \n"
-      "swl             $s4, 7($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
-
-      "addiu           %[src], 4                         \n"
-      "addiu           $t1, -1                           \n"
-      "sll             $t0, %[dst_stride_a], 1           \n"
-      "sll             $t8, %[dst_stride_b], 1           \n"
-      "swr             $s3, 4(%[dst_a])                  \n"
-      "swl             $s3, 7(%[dst_a])                  \n"
-      "swr             $s4, 4(%[dst_b])                  \n"
-      "swl             $s4, 7(%[dst_b])                  \n"
-      "addu            %[dst_a], %[dst_a], $t0           \n"
-      "bnez            $t1, 11b                          \n"
-      " addu           %[dst_b], %[dst_b], $t8           \n"
-
-      "2:                                                \n"
-      ".set pop                                          \n"
-      : [src] "+r" (src),
-        [dst_a] "+r" (dst_a),
-        [dst_b] "+r" (dst_b),
-        [width] "+r" (width),
-        [src_stride] "+r" (src_stride)
-      : [dst_stride_a] "r" (dst_stride_a),
-        [dst_stride_b] "r" (dst_stride_b)
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3",
-        "s4", "s5", "s6"
-  );
-}
-
-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc
new file mode 100644
index 0000000000..99bdca65b3
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc
@@ -0,0 +1,250 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0);     \
+    out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0);     \
+    out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2);     \
+    out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2);     \
+  }
+
+#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0);     \
+    out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0);     \
+    out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2);     \
+    out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2);     \
+  }
+
+#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0);     \
+    out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0);     \
+    out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2);     \
+    out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2);     \
+  }
+
+#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0);     \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0);     \
+    out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2);     \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2);     \
+  }
+
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width) {
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+                 width);
+}
+
+void TransposeUVWx16_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst_a,
+                       int dst_stride_a,
+                       uint8_t* dst_b,
+                       int dst_stride_b,
+                       int width) {
+  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width);
+  TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+                   dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
+
+void TransposeWx16_MSA(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  int x;
+  const uint8_t* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 16) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    src += 16;
+    dst += dst_stride * 4;
+  }
+}
+
+void TransposeUVWx16_MSA(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  int x;
+  const uint8_t* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 8) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    src += 16;
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc
index 1c22b472bc..fdc0dd476c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -21,38 +21,32 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                        2, 6, 10, 14, 3, 7, 11, 15};
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
                        int width) {
-  const uint8* src_temp;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %5, #8                        \n"
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %5, #8                        \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
       "mov         %0, %1                      \n"
 
-      MEMACCESS(0)
       "vld1.8      {d0}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d1}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d2}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d3}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d4}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d5}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d6}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d7}, [%0]                  \n"
 
       "vtrn.8      d1, d0                      \n"
@@ -77,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
 
       "mov         %0, %3                      \n"
 
-    MEMACCESS(0)
       "vst1.8      {d1}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d0}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d3}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d2}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d5}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d4}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d7}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d6}, [%0]                  \n"
 
       "add         %1, #8                      \n"  // src += 8
@@ -99,180 +85,138 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
       "subs        %5,  #8                     \n"  // w   -= 8
       "bge         1b                          \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %5, #8                        \n"
-    "beq         4f                            \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %5, #8                        \n"
+      "beq         4f                            \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %5, #2                        \n"
+      "blt         3f                            \n"
 
-    "cmp         %5, #4                        \n"
-    "blt         2f                            \n"
+      "cmp         %5, #4                        \n"
+      "blt         2f                            \n"
 
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[1]}, [%0]                 \n"
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "vld1.32     {d0[0]}, [%0], %2             \n"
+      "vld1.32     {d0[1]}, [%0], %2             \n"
+      "vld1.32     {d1[0]}, [%0], %2             \n"
+      "vld1.32     {d1[1]}, [%0], %2             \n"
+      "vld1.32     {d2[0]}, [%0], %2             \n"
+      "vld1.32     {d2[1]}, [%0], %2             \n"
+      "vld1.32     {d3[0]}, [%0], %2             \n"
+      "vld1.32     {d3[1]}, [%0]                 \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(6)
-    "vld1.8      {q3}, [%6]                    \n"
+      "vld1.8      {q3}, [%6]                    \n"
 
-    "vtbl.8      d4, {d0, d1}, d6              \n"
-    "vtbl.8      d5, {d0, d1}, d7              \n"
-    "vtbl.8      d0, {d2, d3}, d6              \n"
-    "vtbl.8      d1, {d2, d3}, d7              \n"
+      "vtbl.8      d4, {d0, d1}, d6              \n"
+      "vtbl.8      d5, {d0, d1}, d7              \n"
+      "vtbl.8      d0, {d2, d3}, d6              \n"
+      "vtbl.8      d1, {d2, d3}, d7              \n"
 
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "vst1.32     {d4[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d4[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[1]}, [%0]                 \n"
+      // TODO(frkoenig): Rework shuffle above to
+      // write out with 4 instead of 8 writes.
+      "vst1.32     {d4[0]}, [%0], %4             \n"
+      "vst1.32     {d4[1]}, [%0], %4             \n"
+      "vst1.32     {d5[0]}, [%0], %4             \n"
+      "vst1.32     {d5[1]}, [%0]                 \n"
 
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[1]}, [%0]                 \n"
+      "add         %0, %3, #4                    \n"
+      "vst1.32     {d0[0]}, [%0], %4             \n"
+      "vst1.32     {d0[1]}, [%0], %4             \n"
+      "vst1.32     {d1[0]}, [%0], %4             \n"
+      "vst1.32     {d1[1]}, [%0]                 \n"
 
-    "add         %1, #4                        \n"  // src += 4
-    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
-    "subs        %5,  #4                       \n"  // w   -= 4
-    "beq         4f                            \n"
+      "add         %1, #4                        \n"  // src += 4
+      "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+      "subs        %5,  #4                       \n"  // w   -= 4
+      "beq         4f                            \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %5, #2                        \n"
+      "blt         3f                            \n"
 
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[3]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[3]}, [%0]                 \n"
+      // 2x8 block
+      "2:                                        \n"
+      "mov         %0, %1                        \n"
+      "vld1.16     {d0[0]}, [%0], %2             \n"
+      "vld1.16     {d1[0]}, [%0], %2             \n"
+      "vld1.16     {d0[1]}, [%0], %2             \n"
+      "vld1.16     {d1[1]}, [%0], %2             \n"
+      "vld1.16     {d0[2]}, [%0], %2             \n"
+      "vld1.16     {d1[2]}, [%0], %2             \n"
+      "vld1.16     {d0[3]}, [%0], %2             \n"
+      "vld1.16     {d1[3]}, [%0]                 \n"
 
-    "vtrn.8      d0, d1                        \n"
+      "vtrn.8      d0, d1                        \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0]                    \n"
+      "vst1.64     {d0}, [%0], %4                \n"
+      "vst1.64     {d1}, [%0]                    \n"
 
-    "add         %1, #2                        \n"  // src += 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
-    "subs        %5,  #2                       \n"  // w   -= 2
-    "beq         4f                            \n"
+      "add         %1, #2                        \n"  // src += 2
+      "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+      "subs        %5,  #2                       \n"  // w   -= 2
+      "beq         4f                            \n"
 
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[0]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[1]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[2]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[3]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[4]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[5]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[6]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[7]}, [%1]                 \n"
+      // 1x8 block
+      "3:                                        \n"
+      "vld1.8      {d0[0]}, [%1], %2             \n"
+      "vld1.8      {d0[1]}, [%1], %2             \n"
+      "vld1.8      {d0[2]}, [%1], %2             \n"
+      "vld1.8      {d0[3]}, [%1], %2             \n"
+      "vld1.8      {d0[4]}, [%1], %2             \n"
+      "vld1.8      {d0[5]}, [%1], %2             \n"
+      "vld1.8      {d0[6]}, [%1], %2             \n"
+      "vld1.8      {d0[7]}, [%1]                 \n"
 
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
+      "vst1.64     {d0}, [%3]                    \n"
 
-    "4:                                        \n"
+      "4:                                        \n"
 
-    : "=&r"(src_temp),         // %0
-      "+r"(src),               // %1
-      "+r"(src_stride),        // %2
-      "+r"(dst),               // %3
-      "+r"(dst_stride),        // %4
-      "+r"(width)              // %5
-    : "r"(&kVTbl4x4Transpose)  // %6
-    : "memory", "cc", "q0", "q1", "q2", "q3"
-  );
+      : "=&r"(src_temp),         // %0
+        "+r"(src),               // %1
+        "+r"(src_stride),        // %2
+        "+r"(dst),               // %3
+        "+r"(dst_stride),        // %4
+        "+r"(width)              // %5
+      : "r"(&kVTbl4x4Transpose)  // %6
+      : "memory", "cc", "q0", "q1", "q2", "q3");
 }
 
-static uvec8 kVTbl4x4TransposeDi =
-  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+static const uvec8 kVTbl4x4TransposeDi = {0, 8,  1, 9,  2, 10, 3, 11,
+                                          4, 12, 5, 13, 6, 14, 7, 15};
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
                          int width) {
-  const uint8* src_temp;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %7, #8                        \n"
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %7, #8                        \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
       "mov         %0, %1                      \n"
 
-      MEMACCESS(0)
       "vld2.8      {d0,  d1},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d2,  d3},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d4,  d5},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d6,  d7},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d16, d17}, [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d18, d19}, [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d20, d21}, [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d22, d23}, [%0]            \n"
 
       "vtrn.8      q1, q0                      \n"
@@ -301,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
 
       "mov         %0, %3                      \n"
 
-    MEMACCESS(0)
       "vst1.8      {d2},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d0},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d6},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d4},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d18}, [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d16}, [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d22}, [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d20}, [%0]                 \n"
 
       "mov         %0, %5                      \n"
 
-    MEMACCESS(0)
       "vst1.8      {d3},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d1},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d7},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d5},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d19}, [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d17}, [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d23}, [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d21}, [%0]                 \n"
 
       "add         %1, #8*2                    \n"  // src   += 8*2
@@ -343,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
       "subs        %7,  #8                     \n"  // w     -= 8
       "bge         1b                          \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %7, #8                        \n"
-    "beq         4f                            \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %7, #8                        \n"
+      "beq         4f                            \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %7, #2                        \n"
+      "blt         3f                            \n"
 
-    "cmp         %7, #4                        \n"
-    "blt         2f                            \n"
+      "cmp         %7, #4                        \n"
+      "blt         2f                            \n"
 
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.64     {d0}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d1}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d2}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d3}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d4}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d5}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d6}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d7}, [%0]                    \n"
+      // TODO(frkoenig): Clean this up
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "vld1.64     {d0}, [%0], %2                \n"
+      "vld1.64     {d1}, [%0], %2                \n"
+      "vld1.64     {d2}, [%0], %2                \n"
+      "vld1.64     {d3}, [%0], %2                \n"
+      "vld1.64     {d4}, [%0], %2                \n"
+      "vld1.64     {d5}, [%0], %2                \n"
+      "vld1.64     {d6}, [%0], %2                \n"
+      "vld1.64     {d7}, [%0]                    \n"
 
-    MEMACCESS(8)
-    "vld1.8      {q15}, [%8]                   \n"
+      "vld1.8      {q15}, [%8]                   \n"
 
-    "vtrn.8      q0, q1                        \n"
-    "vtrn.8      q2, q3                        \n"
+      "vtrn.8      q0, q1                        \n"
+      "vtrn.8      q2, q3                        \n"
 
-    "vtbl.8      d16, {d0, d1}, d30            \n"
-    "vtbl.8      d17, {d0, d1}, d31            \n"
-    "vtbl.8      d18, {d2, d3}, d30            \n"
-    "vtbl.8      d19, {d2, d3}, d31            \n"
-    "vtbl.8      d20, {d4, d5}, d30            \n"
-    "vtbl.8      d21, {d4, d5}, d31            \n"
-    "vtbl.8      d22, {d6, d7}, d30            \n"
-    "vtbl.8      d23, {d6, d7}, d31            \n"
+      "vtbl.8      d16, {d0, d1}, d30            \n"
+      "vtbl.8      d17, {d0, d1}, d31            \n"
+      "vtbl.8      d18, {d2, d3}, d30            \n"
+      "vtbl.8      d19, {d2, d3}, d31            \n"
+      "vtbl.8      d20, {d4, d5}, d30            \n"
+      "vtbl.8      d21, {d4, d5}, d31            \n"
+      "vtbl.8      d22, {d6, d7}, d30            \n"
+      "vtbl.8      d23, {d6, d7}, d31            \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(0)
-    "vst1.32     {d16[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d16[1]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[1]},  [%0], %4           \n"
+      "vst1.32     {d16[0]},  [%0], %4           \n"
+      "vst1.32     {d16[1]},  [%0], %4           \n"
+      "vst1.32     {d17[0]},  [%0], %4           \n"
+      "vst1.32     {d17[1]},  [%0], %4           \n"
 
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[1]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[1]}, [%0]                \n"
+      "add         %0, %3, #4                    \n"
+      "vst1.32     {d20[0]}, [%0], %4            \n"
+      "vst1.32     {d20[1]}, [%0], %4            \n"
+      "vst1.32     {d21[0]}, [%0], %4            \n"
+      "vst1.32     {d21[1]}, [%0]                \n"
 
-    "mov         %0, %5                        \n"
+      "mov         %0, %5                        \n"
 
-    MEMACCESS(0)
-    "vst1.32     {d18[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d18[1]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[1]}, [%0], %6            \n"
+      "vst1.32     {d18[0]}, [%0], %6            \n"
+      "vst1.32     {d18[1]}, [%0], %6            \n"
+      "vst1.32     {d19[0]}, [%0], %6            \n"
+      "vst1.32     {d19[1]}, [%0], %6            \n"
 
-    "add         %0, %5, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[1]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[1]},  [%0]               \n"
+      "add         %0, %5, #4                    \n"
+      "vst1.32     {d22[0]},  [%0], %6           \n"
+      "vst1.32     {d22[1]},  [%0], %6           \n"
+      "vst1.32     {d23[0]},  [%0], %6           \n"
+      "vst1.32     {d23[1]},  [%0]               \n"
 
-    "add         %1, #4*2                      \n"  // src   += 4 * 2
-    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
-    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
-    "subs        %7,  #4                       \n"  // w     -= 4
-    "beq         4f                            \n"
+      "add         %1, #4*2                      \n"  // src   += 4 * 2
+      "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 *
+                                                      // dst_stride_b
+      "subs        %7,  #4                       \n"  // w     -= 4
+      "beq         4f                            \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %7, #2                        \n"
+      "blt         3f                            \n"
 
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
+      // 2x8 block
+      "2:                                        \n"
+      "mov         %0, %1                        \n"
+      "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+      "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+      "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+      "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+      "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+      "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+      "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+      "vld2.16     {d1[3], d3[3]}, [%0]          \n"
 
-    "vtrn.8      d0, d1                        \n"
-    "vtrn.8      d2, d3                        \n"
+      "vtrn.8      d0, d1                        \n"
+      "vtrn.8      d2, d3                        \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d2}, [%0]                    \n"
+      "vst1.64     {d0}, [%0], %4                \n"
+      "vst1.64     {d2}, [%0]                    \n"
 
-    "mov         %0, %5                        \n"
+      "mov         %0, %5                        \n"
 
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0], %6                \n"
-    MEMACCESS(0)
-    "vst1.64     {d3}, [%0]                    \n"
+      "vst1.64     {d1}, [%0], %6                \n"
+      "vst1.64     {d3}, [%0]                    \n"
 
-    "add         %1, #2*2                      \n"  // src   += 2 * 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
-    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
-    "subs        %7,  #2                       \n"  // w     -= 2
-    "beq         4f                            \n"
+      "add         %1, #2*2                      \n"  // src   += 2 * 2
+      "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 *
+                                                      // dst_stride_b
+      "subs        %7,  #2                       \n"  // w     -= 2
+      "beq         4f                            \n"
 
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+      // 1x8 block
+      "3:                                        \n"
+      "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+      "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+      "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+      "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+      "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+      "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+      "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+      "vld2.8      {d0[7], d1[7]}, [%1]          \n"
 
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
-    MEMACCESS(5)
-    "vst1.64     {d1}, [%5]                    \n"
+      "vst1.64     {d0}, [%3]                    \n"
+      "vst1.64     {d1}, [%5]                    \n"
 
-    "4:                                        \n"
+      "4:                                        \n"
 
-    : "=&r"(src_temp),           // %0
-      "+r"(src),                 // %1
-      "+r"(src_stride),          // %2
-      "+r"(dst_a),               // %3
-      "+r"(dst_stride_a),        // %4
-      "+r"(dst_b),               // %5
-      "+r"(dst_stride_b),        // %6
-      "+r"(width)                // %7
-    : "r"(&kVTbl4x4TransposeDi)  // %8
-    : "memory", "cc",
-      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
+      : "=&r"(src_temp),           // %0
+        "+r"(src),                 // %1
+        "+r"(src_stride),          // %2
+        "+r"(dst_a),               // %3
+        "+r"(dst_stride_a),        // %4
+        "+r"(dst_b),               // %5
+        "+r"(dst_stride_b),        // %6
+        "+r"(width)                // %7
+      : "r"(&kVTbl4x4TransposeDi)  // %8
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc
index 1ab448f3ab..f469baacf6 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -21,38 +21,32 @@ extern "C" {
 // This module is for GCC Neon armv8 64 bit.
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                        2, 6, 10, 14, 3, 7, 11, 15};
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) {
-  const uint8* src_temp;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %3, %3, #8                      \n"
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub         %w3, %w3, #8                     \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                          \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                          \n"
       "mov         %0, %1                        \n"
 
-      MEMACCESS(0)
       "ld1        {v0.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v1.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v2.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v3.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v4.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v5.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v6.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v7.8b}, [%0]                  \n"
 
       "trn2     v16.8b, v0.8b, v1.8b             \n"
@@ -84,456 +78,345 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
 
       "mov         %0, %2                        \n"
 
-    MEMACCESS(0)
       "st1      {v17.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v16.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v19.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v18.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v21.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v20.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v23.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v22.8b}, [%0]                   \n"
 
       "add         %1, %1, #8                    \n"  // src += 8
       "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
-      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "subs        %w3, %w3, #8                  \n"  // w   -= 8
       "b.ge        1b                            \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %3, %3, #8                      \n"
-    "b.eq        4f                              \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %w3, %w3, #8                    \n"
+      "b.eq        4f                              \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %w3, #2                          \n"
+      "b.lt        3f                              \n"
 
-    "cmp         %3, #4                          \n"
-    "b.lt        2f                              \n"
+      "cmp         %w3, #4                          \n"
+      "b.lt        2f                              \n"
 
-    // 4x8 block
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[3], [%0]                     \n"
+      // 4x8 block
+      "mov         %0, %1                          \n"
+      "ld1     {v0.s}[0], [%0], %5                 \n"
+      "ld1     {v0.s}[1], [%0], %5                 \n"
+      "ld1     {v0.s}[2], [%0], %5                 \n"
+      "ld1     {v0.s}[3], [%0], %5                 \n"
+      "ld1     {v1.s}[0], [%0], %5                 \n"
+      "ld1     {v1.s}[1], [%0], %5                 \n"
+      "ld1     {v1.s}[2], [%0], %5                 \n"
+      "ld1     {v1.s}[3], [%0]                     \n"
 
-    "mov         %0, %2                          \n"
+      "mov         %0, %2                          \n"
 
-    MEMACCESS(4)
-    "ld1      {v2.16b}, [%4]                     \n"
+      "ld1      {v2.16b}, [%4]                     \n"
 
-    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
-    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+      "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+      "tbl      v0.16b, {v1.16b}, v2.16b           \n"
 
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "st1 {v3.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[3], [%0]                         \n"
+      // TODO(frkoenig): Rework shuffle above to
+      // write out with 4 instead of 8 writes.
+      "st1 {v3.s}[0], [%0], %6                     \n"
+      "st1 {v3.s}[1], [%0], %6                     \n"
+      "st1 {v3.s}[2], [%0], %6                     \n"
+      "st1 {v3.s}[3], [%0]                         \n"
 
-    "add         %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[3], [%0]                         \n"
+      "add         %0, %2, #4                      \n"
+      "st1 {v0.s}[0], [%0], %6                     \n"
+      "st1 {v0.s}[1], [%0], %6                     \n"
+      "st1 {v0.s}[2], [%0], %6                     \n"
+      "st1 {v0.s}[3], [%0]                         \n"
 
-    "add         %1, %1, #4                      \n"  // src += 4
-    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
-    "subs        %3, %3, #4                      \n"  // w   -= 4
-    "b.eq        4f                              \n"
+      "add         %1, %1, #4                      \n"  // src += 4
+      "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+      "subs        %w3, %w3, #4                    \n"  // w   -= 4
+      "b.eq        4f                              \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %w3, #2                         \n"
+      "b.lt        3f                              \n"
 
-    // 2x8 block
-    "2:                                          \n"
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[3], [%0]                     \n"
+      // 2x8 block
+      "2:                                          \n"
+      "mov         %0, %1                          \n"
+      "ld1     {v0.h}[0], [%0], %5                 \n"
+      "ld1     {v1.h}[0], [%0], %5                 \n"
+      "ld1     {v0.h}[1], [%0], %5                 \n"
+      "ld1     {v1.h}[1], [%0], %5                 \n"
+      "ld1     {v0.h}[2], [%0], %5                 \n"
+      "ld1     {v1.h}[2], [%0], %5                 \n"
+      "ld1     {v0.h}[3], [%0], %5                 \n"
+      "ld1     {v1.h}[3], [%0]                     \n"
 
-    "trn2    v2.8b, v0.8b, v1.8b                 \n"
-    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+      "trn2    v2.8b, v0.8b, v1.8b                 \n"
+      "trn1    v3.8b, v0.8b, v1.8b                 \n"
 
-    "mov         %0, %2                          \n"
+      "mov         %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1     {v3.8b}, [%0], %6                   \n"
-    MEMACCESS(0)
-    "st1     {v2.8b}, [%0]                       \n"
+      "st1     {v3.8b}, [%0], %6                   \n"
+      "st1     {v2.8b}, [%0]                       \n"
 
-    "add         %1, %1, #2                      \n"  // src += 2
-    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
-    "subs        %3, %3,  #2                     \n"  // w   -= 2
-    "b.eq        4f                              \n"
+      "add         %1, %1, #2                      \n"  // src += 2
+      "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+      "subs        %w3, %w3,  #2                   \n"  // w   -= 2
+      "b.eq        4f                              \n"
 
-    // 1x8 block
-    "3:                                          \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[0], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[1], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[2], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[3], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[4], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[5], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[6], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[7], [%1]                 \n"
+      // 1x8 block
+      "3:                                          \n"
+      "ld1         {v0.b}[0], [%1], %5             \n"
+      "ld1         {v0.b}[1], [%1], %5             \n"
+      "ld1         {v0.b}[2], [%1], %5             \n"
+      "ld1         {v0.b}[3], [%1], %5             \n"
+      "ld1         {v0.b}[4], [%1], %5             \n"
+      "ld1         {v0.b}[5], [%1], %5             \n"
+      "ld1         {v0.b}[6], [%1], %5             \n"
+      "ld1         {v0.b}[7], [%1]                 \n"
 
-    MEMACCESS(2)
-    "st1         {v0.8b}, [%2]                   \n"
+      "st1         {v0.8b}, [%2]                   \n"
 
-    "4:                                          \n"
+      "4:                                          \n"
 
-    : "=&r"(src_temp),                            // %0
-      "+r"(src),                                  // %1
-      "+r"(dst),                                  // %2
-      "+r"(width64)                               // %3
-    : "r"(&kVTbl4x4Transpose),                    // %4
-      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
-    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-  );
+      : "=&r"(src_temp),                          // %0
+        "+r"(src),                                // %1
+        "+r"(dst),                                // %2
+        "+r"(width)                               // %3
+      : "r"(&kVTbl4x4Transpose),                  // %4
+        "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
+        "r"(static_cast<ptrdiff_t>(dst_stride))   // %6
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23");
 }
 
-static uint8 kVTbl4x4TransposeDi[32] =
-  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
-    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+static const uint8_t kVTbl4x4TransposeDi[32] = {
+    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
                          int width) {
-  const uint8* src_temp;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub       %4, %4, #8                      \n"
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allow for this
+      "sub       %w4, %w4, #8                    \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
-    "mov       %0, %1                          \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
+      "mov       %0, %1                          \n"
 
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v1.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v2.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v3.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v4.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v5.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v6.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v7.16b}, [%0]                  \n"
+      "ld1       {v0.16b}, [%0], %5              \n"
+      "ld1       {v1.16b}, [%0], %5              \n"
+      "ld1       {v2.16b}, [%0], %5              \n"
+      "ld1       {v3.16b}, [%0], %5              \n"
+      "ld1       {v4.16b}, [%0], %5              \n"
+      "ld1       {v5.16b}, [%0], %5              \n"
+      "ld1       {v6.16b}, [%0], %5              \n"
+      "ld1       {v7.16b}, [%0]                  \n"
 
-    "trn1      v16.16b, v0.16b, v1.16b         \n"
-    "trn2      v17.16b, v0.16b, v1.16b         \n"
-    "trn1      v18.16b, v2.16b, v3.16b         \n"
-    "trn2      v19.16b, v2.16b, v3.16b         \n"
-    "trn1      v20.16b, v4.16b, v5.16b         \n"
-    "trn2      v21.16b, v4.16b, v5.16b         \n"
-    "trn1      v22.16b, v6.16b, v7.16b         \n"
-    "trn2      v23.16b, v6.16b, v7.16b         \n"
+      "trn1      v16.16b, v0.16b, v1.16b         \n"
+      "trn2      v17.16b, v0.16b, v1.16b         \n"
+      "trn1      v18.16b, v2.16b, v3.16b         \n"
+      "trn2      v19.16b, v2.16b, v3.16b         \n"
+      "trn1      v20.16b, v4.16b, v5.16b         \n"
+      "trn2      v21.16b, v4.16b, v5.16b         \n"
+      "trn1      v22.16b, v6.16b, v7.16b         \n"
+      "trn2      v23.16b, v6.16b, v7.16b         \n"
 
-    "trn1      v0.8h, v16.8h, v18.8h           \n"
-    "trn2      v1.8h, v16.8h, v18.8h           \n"
-    "trn1      v2.8h, v20.8h, v22.8h           \n"
-    "trn2      v3.8h, v20.8h, v22.8h           \n"
-    "trn1      v4.8h, v17.8h, v19.8h           \n"
-    "trn2      v5.8h, v17.8h, v19.8h           \n"
-    "trn1      v6.8h, v21.8h, v23.8h           \n"
-    "trn2      v7.8h, v21.8h, v23.8h           \n"
+      "trn1      v0.8h, v16.8h, v18.8h           \n"
+      "trn2      v1.8h, v16.8h, v18.8h           \n"
+      "trn1      v2.8h, v20.8h, v22.8h           \n"
+      "trn2      v3.8h, v20.8h, v22.8h           \n"
+      "trn1      v4.8h, v17.8h, v19.8h           \n"
+      "trn2      v5.8h, v17.8h, v19.8h           \n"
+      "trn1      v6.8h, v21.8h, v23.8h           \n"
+      "trn2      v7.8h, v21.8h, v23.8h           \n"
 
-    "trn1      v16.4s, v0.4s, v2.4s            \n"
-    "trn2      v17.4s, v0.4s, v2.4s            \n"
-    "trn1      v18.4s, v1.4s, v3.4s            \n"
-    "trn2      v19.4s, v1.4s, v3.4s            \n"
-    "trn1      v20.4s, v4.4s, v6.4s            \n"
-    "trn2      v21.4s, v4.4s, v6.4s            \n"
-    "trn1      v22.4s, v5.4s, v7.4s            \n"
-    "trn2      v23.4s, v5.4s, v7.4s            \n"
+      "trn1      v16.4s, v0.4s, v2.4s            \n"
+      "trn2      v17.4s, v0.4s, v2.4s            \n"
+      "trn1      v18.4s, v1.4s, v3.4s            \n"
+      "trn2      v19.4s, v1.4s, v3.4s            \n"
+      "trn1      v20.4s, v4.4s, v6.4s            \n"
+      "trn2      v21.4s, v4.4s, v6.4s            \n"
+      "trn1      v22.4s, v5.4s, v7.4s            \n"
+      "trn2      v23.4s, v5.4s, v7.4s            \n"
 
-    "mov       %0, %2                          \n"
+      "mov       %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1       {v16.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v16.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[1], [%0]                \n"
+      "st1       {v16.d}[0], [%0], %6            \n"
+      "st1       {v18.d}[0], [%0], %6            \n"
+      "st1       {v17.d}[0], [%0], %6            \n"
+      "st1       {v19.d}[0], [%0], %6            \n"
+      "st1       {v16.d}[1], [%0], %6            \n"
+      "st1       {v18.d}[1], [%0], %6            \n"
+      "st1       {v17.d}[1], [%0], %6            \n"
+      "st1       {v19.d}[1], [%0]                \n"
 
-    "mov       %0, %3                          \n"
+      "mov       %0, %3                          \n"
 
-    MEMACCESS(0)
-    "st1       {v20.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v20.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[1], [%0]                \n"
+      "st1       {v20.d}[0], [%0], %7            \n"
+      "st1       {v22.d}[0], [%0], %7            \n"
+      "st1       {v21.d}[0], [%0], %7            \n"
+      "st1       {v23.d}[0], [%0], %7            \n"
+      "st1       {v20.d}[1], [%0], %7            \n"
+      "st1       {v22.d}[1], [%0], %7            \n"
+      "st1       {v21.d}[1], [%0], %7            \n"
+      "st1       {v23.d}[1], [%0]                \n"
 
-    "add       %1, %1, #16                     \n"  // src   += 8*2
-    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
-    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
-    "subs      %4, %4,  #8                     \n"  // w     -= 8
-    "b.ge      1b                              \n"
+      "add       %1, %1, #16                     \n"  // src   += 8*2
+      "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 *
+                                                      // dst_stride_b
+      "subs      %w4, %w4,  #8                   \n"  // w     -= 8
+      "b.ge      1b                              \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds      %4, %4, #8                      \n"
-    "b.eq      4f                              \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds      %w4, %w4, #8                    \n"
+      "b.eq      4f                              \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp       %w4, #2                         \n"
+      "b.lt      3f                              \n"
 
-    "cmp       %4, #4                          \n"
-    "b.lt      2f                              \n"
+      "cmp       %w4, #4                         \n"
+      "b.lt      2f                              \n"
 
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1       {v0.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v1.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v2.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v3.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v4.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v5.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v6.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v7.8b}, [%0]                   \n"
+      // TODO(frkoenig): Clean this up
+      // 4x8 block
+      "mov       %0, %1                          \n"
+      "ld1       {v0.8b}, [%0], %5               \n"
+      "ld1       {v1.8b}, [%0], %5               \n"
+      "ld1       {v2.8b}, [%0], %5               \n"
+      "ld1       {v3.8b}, [%0], %5               \n"
+      "ld1       {v4.8b}, [%0], %5               \n"
+      "ld1       {v5.8b}, [%0], %5               \n"
+      "ld1       {v6.8b}, [%0], %5               \n"
+      "ld1       {v7.8b}, [%0]                   \n"
 
-    MEMACCESS(8)
-    "ld1       {v30.16b}, [%8], #16            \n"
-    "ld1       {v31.16b}, [%8]                 \n"
+      "ld1       {v30.16b}, [%8], #16            \n"
+      "ld1       {v31.16b}, [%8]                 \n"
 
-    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
-    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
-    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
-    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+      "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+      "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+      "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+      "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
 
-    "mov       %0, %2                          \n"
+      "mov       %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1       {v16.s}[0],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[1],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[2],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[3],  [%0], %6           \n"
+      "st1       {v16.s}[0],  [%0], %6           \n"
+      "st1       {v16.s}[1],  [%0], %6           \n"
+      "st1       {v16.s}[2],  [%0], %6           \n"
+      "st1       {v16.s}[3],  [%0], %6           \n"
 
-    "add       %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[2], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[3], [%0]                \n"
+      "add       %0, %2, #4                      \n"
+      "st1       {v18.s}[0], [%0], %6            \n"
+      "st1       {v18.s}[1], [%0], %6            \n"
+      "st1       {v18.s}[2], [%0], %6            \n"
+      "st1       {v18.s}[3], [%0]                \n"
 
-    "mov       %0, %3                          \n"
+      "mov       %0, %3                          \n"
 
-    MEMACCESS(0)
-    "st1       {v17.s}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[2], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[3], [%0], %7            \n"
+      "st1       {v17.s}[0], [%0], %7            \n"
+      "st1       {v17.s}[1], [%0], %7            \n"
+      "st1       {v17.s}[2], [%0], %7            \n"
+      "st1       {v17.s}[3], [%0], %7            \n"
 
-    "add       %0, %3, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[0],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[1],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[2],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[3],  [%0]               \n"
+      "add       %0, %3, #4                      \n"
+      "st1       {v19.s}[0],  [%0], %7           \n"
+      "st1       {v19.s}[1],  [%0], %7           \n"
+      "st1       {v19.s}[2],  [%0], %7           \n"
+      "st1       {v19.s}[3],  [%0]               \n"
 
-    "add       %1, %1, #8                      \n"  // src   += 4 * 2
-    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
-    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
-    "subs      %4,  %4,  #4                    \n"  // w     -= 4
-    "b.eq      4f                              \n"
+      "add       %1, %1, #8                      \n"  // src   += 4 * 2
+      "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 *
+                                                      // dst_stride_b
+      "subs      %w4,  %w4,  #4                  \n"  // w     -= 4
+      "b.eq      4f                              \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp       %w4, #2                         \n"
+      "b.lt      3f                              \n"
 
-    // 2x8 block
-    "2:                                        \n"
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+      // 2x8 block
+      "2:                                        \n"
+      "mov       %0, %1                          \n"
+      "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[3], [%0]           \n"
 
-    "trn1      v4.8b, v0.8b, v2.8b             \n"
-    "trn2      v5.8b, v0.8b, v2.8b             \n"
-    "trn1      v6.8b, v1.8b, v3.8b             \n"
-    "trn2      v7.8b, v1.8b, v3.8b             \n"
+      "trn1      v4.8b, v0.8b, v2.8b             \n"
+      "trn2      v5.8b, v0.8b, v2.8b             \n"
+      "trn1      v6.8b, v1.8b, v3.8b             \n"
+      "trn2      v7.8b, v1.8b, v3.8b             \n"
 
-    "mov       %0, %2                          \n"
+      "mov       %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1       {v4.d}[0], [%0], %6             \n"
-    MEMACCESS(0)
-    "st1       {v6.d}[0], [%0]                 \n"
+      "st1       {v4.d}[0], [%0], %6             \n"
+      "st1       {v6.d}[0], [%0]                 \n"
 
-    "mov       %0, %3                          \n"
+      "mov       %0, %3                          \n"
 
-    MEMACCESS(0)
-    "st1       {v5.d}[0], [%0], %7             \n"
-    MEMACCESS(0)
-    "st1       {v7.d}[0], [%0]                 \n"
+      "st1       {v5.d}[0], [%0], %7             \n"
+      "st1       {v7.d}[0], [%0]                 \n"
 
-    "add       %1, %1, #4                      \n"  // src   += 2 * 2
-    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
-    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
-    "subs      %4,  %4,  #2                    \n"  // w     -= 2
-    "b.eq      4f                              \n"
+      "add       %1, %1, #4                      \n"  // src   += 2 * 2
+      "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 *
+                                                      // dst_stride_b
+      "subs      %w4,  %w4,  #2                  \n"  // w     -= 2
+      "b.eq      4f                              \n"
 
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+      // 1x8 block
+      "3:                                        \n"
+      "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[7], [%1]           \n"
 
-    MEMACCESS(2)
-    "st1       {v0.d}[0], [%2]                 \n"
-    MEMACCESS(3)
-    "st1       {v1.d}[0], [%3]                 \n"
+      "st1       {v0.d}[0], [%2]                 \n"
+      "st1       {v1.d}[0], [%3]                 \n"
 
-    "4:                                        \n"
+      "4:                                        \n"
 
-    : "=&r"(src_temp),                            // %0
-      "+r"(src),                                  // %1
-      "+r"(dst_a),                                // %2
-      "+r"(dst_b),                                // %3
-      "+r"(width64)                               // %4
-    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
-      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
-      "r"(&kVTbl4x4TransposeDi)                   // %8
-    : "memory", "cc",
-      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
-      "v30", "v31"
-  );
+      : "=&r"(src_temp),                            // %0
+        "+r"(src),                                  // %1
+        "+r"(dst_a),                                // %2
+        "+r"(dst_b),                                // %3
+        "+r"(width)                                 // %4
+      : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+        "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+        "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+        "r"(&kVTbl4x4TransposeDi)                   // %8
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc
index 1300fc0feb..e887dd525c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -17,17 +17,19 @@ extern "C" {
 #endif
 
 // This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
+__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
+                                          int src_stride,
+                                          uint8_t* dst,
+                                          int dst_stride,
+                                          int width) {
   __asm {
     push      edi
     push      esi
     push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       eax, [esp + 12 + 4]  // src
+    mov       edi, [esp + 12 + 8]  // src_stride
     mov       edx, [esp + 12 + 12]  // dst
     mov       esi, [esp + 12 + 16]  // dst_stride
     mov       ecx, [esp + 12 + 20]  // width
@@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   }
 }
 
-__declspec(naked)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int w) {
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
+                                           int src_stride,
+                                           uint8_t* dst_a,
+                                           int dst_stride_a,
+                                           uint8_t* dst_b,
+                                           int dst_stride_b,
+                                           int w) {
   __asm {
     push      ebx
     push      esi
     push      edi
     push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       eax, [esp + 16 + 4]  // src
+    mov       edi, [esp + 16 + 8]  // src_stride
     mov       edx, [esp + 16 + 12]  // dst_a
     mov       esi, [esp + 16 + 16]  // dst_stride_a
     mov       ebx, [esp + 16 + 20]  // dst_b
@@ -133,9 +137,9 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     mov       ecx, [ecx + 16 + 28]  // w
 
     align      4
- convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
+  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + edi]
     lea       eax, [eax + 2 * edi]
@@ -162,13 +166,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       eax, [eax + 2 * edi]
     movdqu    [esp], xmm5  // backup xmm5
     neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    movdqa    xmm5, xmm6  // use xmm5 as temp register.
     punpcklbw xmm6, xmm7
     punpckhbw xmm5, xmm7
     movdqa    xmm7, xmm5
     lea       eax, [eax + 8 * edi + 16]
     neg       edi
-    // Second round of bit swap.
+        // Second round of bit swap.
     movdqa    xmm5, xmm0
     punpcklwd xmm0, xmm2
     punpckhwd xmm5, xmm2
@@ -183,12 +187,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     movdqa    xmm6, xmm5
     movdqu    xmm5, [esp]  // restore xmm5
     movdqu    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    movdqa    xmm6, xmm5  // use xmm6 as temp register.
     punpcklwd xmm5, xmm7
     punpckhwd xmm6, xmm7
     movdqa    xmm7, xmm6
-    // Third round of bit swap.
-    // Write to the destination pointer.
+
+        // Third round of bit swap.
+        // Write to the destination pointer.
     movdqa    xmm6, xmm0
     punpckldq xmm0, xmm4
     punpckhdq xmm6, xmm4
@@ -200,7 +205,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm4
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
     punpckldq xmm2, xmm6
     movlpd    qword ptr [edx], xmm2
     movhpd    qword ptr [ebx], xmm2
@@ -209,7 +214,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
     punpckldq xmm1, xmm5
     movlpd    qword ptr [edx], xmm1
     movhpd    qword ptr [ebx], xmm1
@@ -218,7 +223,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
     punpckldq xmm3, xmm7
     movlpd    qword ptr [edx], xmm3
     movhpd    qword ptr [ebx], xmm3
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc
index 494164fd02..e91560c44c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc
@@ -19,30 +19,38 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// memset for temp is meant to clear the source buffer (not dest) so that
+// SIMD that reads full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed but
+// not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// by the source type (e.g. ARGB) and the mask (last parameter), or by examining
+// the source code for how much the source pointers are advanced.
+
 // Subsampled source needs to be increase by 1 of not even.
 #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
 
 // Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 const uint8* a_buf, uint8* dst_ptr,                           \
-                 const struct YuvConstants* yuvconstants,  int width) {        \
-      SIMD_ALIGNED(uint8 temp[64 * 5]);                                        \
-      memset(temp, 0, 64 * 4);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      memcpy(temp + 192, a_buf + n, r);                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)              \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,                   \
+               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {         \
+    SIMD_ALIGNED(uint8_t temp[64 * 5]);                                      \
+    memset(temp, 0, 64 * 4); /* for msan */                                  \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
+    }                                                                        \
+    memcpy(temp, y_buf + n, r);                                              \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+    memcpy(temp + 192, a_buf + n, r);                                        \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
+             yuvconstants, MASK + 1);                                        \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
+           SS(r, DUVSHIFT) * BPP);                                           \
+  }
 
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
@@ -53,36 +61,57 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
 #ifdef HAS_I422ALPHATOARGBROW_NEON
 ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
 #endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
 #undef ANY41C
 
 // Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)      \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,          \
+               const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 4]);                             \
+    memset(temp, 0, 64 * 3); /* for YUY2 and msan */                \
+    int r = width & MASK;                                           \
+    int n = width & ~MASK;                                          \
+    if (n > 0) {                                                    \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                    \
+    }                                                               \
+    memcpy(temp, y_buf + n, r);                                     \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));     \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);    \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,             \
+           SS(r, DUVSHIFT) * BPP);                                  \
+  }
+
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
 #ifdef HAS_I422TOYUY2ROW_SSE2
 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
 ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOYUY2ROW_NEON
 ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOUYVYROW_NEON
 ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_BLENDPLANEROW_AVX2
 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
 #endif
@@ -94,35 +123,38 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
 // Note that odd width replication includes 444 due to implementation
 // on arm that subsamples 444 to 422 internally.
 // Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      if (width & 1) {                                                         \
-        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \
-        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)      \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,           \
+               const uint8_t* v_buf, uint8_t* dst_ptr,               \
+               const struct YuvConstants* yuvconstants, int width) { \
+    SIMD_ALIGNED(uint8_t temp[128 * 4]);                             \
+    memset(temp, 0, 128 * 3); /* for YUY2 and msan */                \
+    int r = width & MASK;                                            \
+    int n = width & ~MASK;                                           \
+    if (n > 0) {                                                     \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);       \
+    }                                                                \
+    memcpy(temp, y_buf + n, r);                                      \
+    memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    if (width & 1) {                                                 \
+      temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];   \
+      temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1];   \
+    }                                                                \
+    ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
+             MASK + 1);                                              \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384,              \
+           SS(r, DUVSHIFT) * BPP);                                   \
+  }
 
 #ifdef HAS_I422TOARGBROW_SSSE3
 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
 #endif
-#ifdef HAS_I411TOARGBROW_SSSE3
-ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
 #endif
 #ifdef HAS_I444TOARGBROW_SSSE3
 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
@@ -130,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
 ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
 ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
 ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
 #endif  // HAS_I444TOARGBROW_SSSE3
 #ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
 #endif
 #ifdef HAS_I422TOARGBROW_AVX2
 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
@@ -144,47 +176,87 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
 #ifdef HAS_I444TOARGBROW_AVX2
 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
 #endif
-#ifdef HAS_I411TOARGBROW_AVX2
-ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
-#endif
 #ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TORGB565ROW_AVX2
-ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TOARGBROW_NEON
 ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
 ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
 ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
 ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
 ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
 #endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
 #undef ANY31C
 
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf,            \
+               uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+               int width) {                                               \
+    SIMD_ALIGNED(T temp[16 * 3]);                                         \
+    SIMD_ALIGNED(uint8_t out[64]);                                        \
+    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */               \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);            \
+    }                                                                     \
+    memcpy(temp, y_buf + n, r * SBPP);                                    \
+    memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);     \
+    memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);     \
+    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1);    \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);  \
+  }
+
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#undef ANY31CT
+
 // Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+               int width) {                                                   \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                       \
+    memset(temp, 0, 64 * 2); /* for msan */                                   \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                    \
+    }                                                                         \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
+    memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                        \
+           SS(r, UVSHIFT) * SBPP2);                                           \
+    ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                          \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                           \
+  }
 
 // Merge functions.
 #ifdef HAS_MERGEUVROW_SSE2
@@ -196,6 +268,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
 
 // Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
@@ -225,44 +300,61 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
 #ifdef HAS_ARGBSUBTRACTROW_NEON
 ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
 #ifdef HAS_SOBELROW_SSE2
 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELROW_NEON
 ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
 #endif
 #ifdef HAS_SOBELTOPLANEROW_NEON
 ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
 #endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
 #ifdef HAS_SOBELXYROW_SSE2
 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELXYROW_NEON
 ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #undef ANY21
 
 // Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)            \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {          \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                      \
+    memset(temp, 0, 128 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                      \
+    }                                                                         \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
+    memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+           SS(r, UVSHIFT) * SBPP2);                                           \
+    ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1);           \
+    memcpy(dst_ptr + n * BPP, temp + 256, r * BPP);                           \
+  }
 
 // Biplanar to RGB.
 #ifdef HAS_NV12TOARGBROW_SSSE3
@@ -274,6 +366,9 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
 #ifdef HAS_NV12TOARGBROW_NEON
 ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV21TOARGBROW_SSSE3
 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
 #endif
@@ -283,6 +378,27 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
 #ifdef HAS_NV21TOARGBROW_NEON
 ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
 #ifdef HAS_NV12TORGB565ROW_SSSE3
 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
 #endif
@@ -292,22 +408,25 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
 #ifdef HAS_NV12TORGB565ROW_NEON
 ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
 #endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
 #undef ANY21C
 
 // Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 
 #ifdef HAS_COPYROW_AVX
 ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
@@ -325,6 +444,15 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
 ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
 ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
 #if defined(HAS_ARGBTORGB565ROW_AVX2)
 ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
 #endif
@@ -332,6 +460,18 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
 #if defined(HAS_J400TOARGBROW_SSE2)
 ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
 #endif
@@ -372,9 +512,21 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
 ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
 ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
 #if defined(HAS_RAWTORGB24ROW_NEON)
 ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
 #endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
 #ifdef HAS_ARGBTOYROW_AVX2
 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
 #endif
@@ -403,30 +555,57 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
 #ifdef HAS_ARGBTOYROW_NEON
 ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ARGBTOYJROW_NEON
 ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_BGRATOYROW_NEON
 ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ABGRTOYROW_NEON
 ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
 #ifdef HAS_RGBATOYROW_NEON
 ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_RGB24TOYROW_NEON
 ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
 #ifdef HAS_RAWTOYROW_NEON
 ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
 #ifdef HAS_RGB565TOYROW_NEON
 ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
 #ifdef HAS_ARGB1555TOYROW_NEON
 ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
 #ifdef HAS_ARGB4444TOYROW_NEON
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 #endif
@@ -434,23 +613,44 @@ ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #endif
 #ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
 #endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
 #ifdef HAS_RAWTOARGBROW_NEON
 ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
 #ifdef HAS_RGB565TOARGBROW_NEON
 ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGB1555TOARGBROW_NEON
 ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGB4444TOARGBROW_NEON
 ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
 #endif
@@ -466,29 +666,38 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
 #ifdef HAS_ARGBATTENUATEROW_NEON
 ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON
 ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
 #endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
 #undef ANY11
 
 // Any 1 to 1 blended.  Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128 * 2);  /* for YUY2 and msan */                       \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);                          \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                   \
+    memset(temp, 0, 64 * 2); /* for msan */                               \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    memcpy(temp + 64, dst_ptr + n * BPP, r * BPP);                        \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                        \
+  }
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
@@ -506,61 +715,184 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
 
 // Any 1 to 1 with parameter.
 #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 T shuffler, int width) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
-      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
-    }
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                        \
+    memset(temp, 0, 64); /* for msan */                                        \
+    int r = width & MASK;                                                      \
+    int n = width & ~MASK;                                                     \
+    if (n > 0) {                                                               \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                                    \
+    }                                                                          \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                                \
+    ANY_SIMD(temp, temp + 64, param, MASK + 1);                                \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                             \
+  }
 
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
-       const uint32, 4, 2, 3)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32_t,
+       4,
+       2,
+       3)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+       ARGBToRGB565DitherRow_AVX2,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+       ARGBToRGB565DitherRow_NEON,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+       ARGBToRGB565DitherRow_MSA,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
 #endif
 #undef ANY11P
 
+// Any 1 to 1 with parameter and shorts.  BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
+  void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(STYPE temp[32]);                                            \
+    SIMD_ALIGNED(DTYPE out[32]);                                             \
+    memset(temp, 0, 32 * SBPP); /* for msan */                               \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                  \
+    }                                                                        \
+    memcpy(temp, src_ptr + n, r * SBPP);                                     \
+    ANY_SIMD(temp, out, scale, MASK + 1);                                    \
+    memcpy(dst_ptr + n, out, r * BPP);                                       \
+  }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+       Convert16To8Row_SSSE3,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+       Convert16To8Row_AVX2,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       31)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+       Convert8To16Row_SSE2,
+       1,
+       2,
+       uint8_t,
+       uint16_t,
+       15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+       Convert8To16Row_AVX2,
+       1,
+       2,
+       uint8_t,
+       uint16_t,
+       31)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with parameter and shorts to byte.  BPP measures in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK)             \
+  void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+    SIMD_ALIGNED(ST temp[32]);                                          \
+    SIMD_ALIGNED(T out[32]);                                            \
+    memset(temp, 0, SBPP * 32); /* for msan */                          \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                             \
+    }                                                                   \
+    memcpy(temp, src_ptr + n, r * SBPP);                                \
+    ANY_SIMD(temp, out, param, MASK + 1);                               \
+    memcpy(dst_ptr + n, out, r * BPP);                                  \
+  }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+         HalfFloat1Row_F16C,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+         HalfFloat1Row_NEON,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
+#endif
+#undef ANY11P16
+
 // Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 const struct YuvConstants* yuvconstants, int width) {         \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                      \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr,                  \
+               const struct YuvConstants* yuvconstants, int width) {      \
+    SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                        \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
 ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
 ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
@@ -573,25 +905,28 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
 ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
 ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
 #endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
 #undef ANY11C
 
 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 ptrdiff_t src_stride_ptr, int width,                          \
-                 int source_y_fraction) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
-      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                           \
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr,                     \
+               ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                      \
+    memset(temp, 0, 64 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
+    }                                                                        \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+    memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
+    ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+  }
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
@@ -602,25 +937,25 @@ ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
 #ifdef HAS_INTERPOLATEROW_NEON
 ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
 #endif
-#ifdef HAS_INTERPOLATEROW_DSPR2
-ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
 #endif
 #undef ANY11T
 
 // Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr, r * BPP);                                          \
-      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
-      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
-    }
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                              \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                   \
+    memset(temp, 0, 64); /* for msan */                                   \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                            \
+    }                                                                     \
+    memcpy(temp, src_ptr, r* BPP);                                        \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+  }
 
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
@@ -631,6 +966,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
 #ifdef HAS_MIRRORROW_NEON
 ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
 #endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif
@@ -640,67 +978,54 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
 #ifdef HAS_ARGBMIRRORROW_NEON
 ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
 #endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
 #undef ANY11M
 
 // Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
-    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
-      SIMD_ALIGNED(uint8 temp[64]);                                            \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, v32, n);                                             \
-      }                                                                        \
-      ANY_SIMD(temp, v32, MASK + 1);                                           \
-      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
-    }
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)        \
+  void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64]);                  \
+    int r = width & MASK;                            \
+    int n = width & ~MASK;                           \
+    if (n > 0) {                                     \
+      ANY_SIMD(dst_ptr, v32, n);                     \
+    }                                                \
+    ANY_SIMD(temp, v32, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp, r * BPP);        \
+  }
 
 #ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
 #endif
 #ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
 #endif
 #ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
+#endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
 #endif
 #undef ANY1
 
 // Any 1 to 2.  Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
-      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
-      memset(temp, 0, 128);  /* for msan */                                    \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      /* repeat last 4 bytes for 422 subsampler */                             \
-      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      /* repeat last 4 - 12 bytes for 411 subsampler */                        \
-      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \
-      }                                                                        \
-      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \
-      }                                                                        \
-      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
-      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
-      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
-    }
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)          \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,  \
+               int width) {                                             \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                \
+    memset(temp, 0, 128); /* for msan */                                \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n);                               \
+    }                                                                   \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                   \
+    memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));       \
+    memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));       \
+  }
 
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
@@ -711,8 +1036,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
 #ifdef HAS_SPLITUVROW_NEON
 ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
 #endif
-#ifdef HAS_SPLITUVROW_DSPR2
-ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
 #endif
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -727,37 +1052,66 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
 #endif
 #ifdef HAS_YUY2TOUV422ROW_NEON
 ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
 #undef ANY12
 
+// Any 1 to 3.  Outputs RGB planes.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK)                                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g,     \
+               uint8_t* dst_b, int width) {                                \
+    SIMD_ALIGNED(uint8_t temp[16 * 6]);                                    \
+    memset(temp, 0, 16 * 3); /* for msan */                                \
+    int r = width & MASK;                                                  \
+    int n = width & ~MASK;                                                 \
+    if (n > 0) {                                                           \
+      ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n);                           \
+    }                                                                      \
+    memcpy(temp, src_ptr + n * BPP, r * BPP);                              \
+    ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
+    memcpy(dst_r + n, temp + 16 * 3, r);                                   \
+    memcpy(dst_g + n, temp + 16 * 4, r);                                   \
+    memcpy(dst_b + n, temp + 16 * 5, r);                                   \
+  }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
+
 // Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
 // 128 byte row allows for 32 avx ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
-      memset(temp, 0, 128 * 2);  /* for msan */                                \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
-             SS(r, UVSHIFT) * BPP);                                            \
-      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
-               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
-      }                                                                        \
-      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
-      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
-      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
-    }
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u,   \
+               uint8_t* dst_v, int width) {                                  \
+    SIMD_ALIGNED(uint8_t temp[128 * 4]);                                     \
+    memset(temp, 0, 128 * 2); /* for msan */                                 \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+    }                                                                        \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,      \
+           SS(r, UVSHIFT) * BPP);                                            \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+             BPP);                                                           \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+    }                                                                        \
+    ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+    memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+    memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+  }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
@@ -783,30 +1137,57 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_RGB24TOUVROW_NEON
 ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
 #ifdef HAS_RAWTOUVROW_NEON
 ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
 #ifdef HAS_RGB565TOUVROW_NEON
 ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB1555TOUVROW_NEON
 ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB4444TOUVROW_NEON
 ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
 #endif
@@ -816,6 +1197,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
 #ifdef HAS_UYVYTOUVROW_NEON
 ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
 #endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
 #undef ANY12S
 
 #ifdef __cplusplus
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc
index aefa38c495..2bbc5adbf1 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
@@ -23,59 +24,69 @@ extern "C" {
 
 #define USE_BRANCHLESS 1
 #if USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+static __inline int32_t clamp0(int32_t v) {
   return ((-(v) >> 31) & (v));
 }
 
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
   return (((255 - (v)) >> 31) | (v)) & 255;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+  return (((1023 - (v)) >> 31) | (v)) & 1023;
 }
 
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
   int m = v >> 31;
   return (v + m) ^ m;
 }
-#else  // USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+#else   // USE_BRANCHLESS
+static __inline int32_t clamp0(int32_t v) {
   return (v < 0) ? 0 : v;
 }
 
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
   return (v > 255) ? 255 : v;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+  return (v > 1023) ? 1023 : v;
 }
 
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
   return (v < 0) ? -v : v;
 }
 #endif  // USE_BRANCHLESS
+static __inline uint32_t Clamp(int32_t val) {
+  int v = clamp0(val);
+  return (uint32_t)(clamp255(v));
+}
 
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *(uint32*)(p) = v
+static __inline uint32_t Clamp10(int32_t val) {
+  int v = clamp0(val);
+  return (uint32_t)(clamp1023(v));
+}
+
+// Little Endian
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+    defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define WRITEWORD(p, v) *(uint32_t*)(p) = v
 #else
-static inline void WRITEWORD(uint8* p, uint32 v) {
-  p[0] = (uint8)(v & 255);
-  p[1] = (uint8)((v >> 8) & 255);
-  p[2] = (uint8)((v >> 16) & 255);
-  p[3] = (uint8)((v >> 24) & 255);
+static inline void WRITEWORD(uint8_t* p, uint32_t v) {
+  p[0] = (uint8_t)(v & 255);
+  p[1] = (uint8_t)((v >> 8) & 255);
+  p[2] = (uint8_t)((v >> 16) & 255);
+  p[3] = (uint8_t)((v >> 24) & 255);
 }
 #endif
 
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb24[0];
-    uint8 g = src_rgb24[1];
-    uint8 r = src_rgb24[2];
+    uint8_t b = src_rgb24[0];
+    uint8_t g = src_rgb24[1];
+    uint8_t r = src_rgb24[2];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -85,12 +96,12 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
   }
 }
 
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -100,12 +111,12 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
   }
 }
 
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
     dst_rgb24[0] = b;
     dst_rgb24[1] = g;
     dst_rgb24[2] = r;
@@ -114,12 +125,14 @@ void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
   }
 }
 
-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
+                       uint8_t* dst_argb,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
+    uint8_t b = src_rgb565[0] & 0x1f;
+    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r = src_rgb565[1] >> 3;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 2) | (g >> 4);
     dst_argb[2] = (r << 3) | (r >> 2);
@@ -129,14 +142,15 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 a = src_argb1555[1] >> 7;
+    uint8_t b = src_argb1555[0] & 0x1f;
+    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t a = src_argb1555[1] >> 7;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 3) | (g >> 2);
     dst_argb[2] = (r << 3) | (r >> 2);
@@ -146,14 +160,15 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
-    uint8 a = src_argb4444[1] >> 4;
+    uint8_t b = src_argb4444[0] & 0x0f;
+    uint8_t g = src_argb4444[0] >> 4;
+    uint8_t r = src_argb4444[1] & 0x0f;
+    uint8_t a = src_argb4444[1] >> 4;
     dst_argb[0] = (b << 4) | b;
     dst_argb[1] = (g << 4) | g;
     dst_argb[2] = (r << 4) | r;
@@ -163,12 +178,53 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
   }
 }
 
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
+    uint32_t ar30 = *(const uint32_t*)src_ar30;
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
+    dst_argb += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30 = *(const uint32_t*)src_ar30;
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
+    dst_abgr += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30 = *(const uint32_t*)src_ar30;
+    uint32_t b = ar30 & 0x3ff;
+    uint32_t ga = ar30 & 0xc00ffc00;
+    uint32_t r = (ar30 >> 20) & 0x3ff;
+    *(uint32_t*)(dst_ab30) = r | ga | (b << 20);
+    dst_ab30 += 4;
+    src_ar30 += 4;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
     dst_rgb[0] = b;
     dst_rgb[1] = g;
     dst_rgb[2] = r;
@@ -177,12 +233,12 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
     dst_rgb[0] = r;
     dst_rgb[1] = g;
     dst_rgb[2] = b;
@@ -191,25 +247,25 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 2;
-    uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 2;
+    uint8_t r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
@@ -221,132 +277,160 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
 // endian will not affect order of the original matrix.  But the dither4
 // will containing the first pixel in the lower byte for little endian
 // or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     int dither0 = ((const unsigned char*)(&dither4))[x & 3];
     int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
-    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
-    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
     int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 3;
-    uint8 r1 = src_argb[6] >> 3;
-    uint8 a1 = src_argb[7] >> 7;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 3;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t a0 = src_argb[3] >> 7;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 3;
+    uint8_t r1 = src_argb[6] >> 3;
+    uint8_t a1 = src_argb[7] >> 7;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+                            (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 3;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t a0 = src_argb[3] >> 7;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
   }
 }
 
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    uint8 b1 = src_argb[4] >> 4;
-    uint8 g1 = src_argb[5] >> 4;
-    uint8 r1 = src_argb[6] >> 4;
-    uint8 a1 = src_argb[7] >> 4;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    uint8_t b0 = src_argb[0] >> 4;
+    uint8_t g0 = src_argb[1] >> 4;
+    uint8_t r0 = src_argb[2] >> 4;
+    uint8_t a0 = src_argb[3] >> 4;
+    uint8_t b1 = src_argb[4] >> 4;
+    uint8_t g1 = src_argb[5] >> 4;
+    uint8_t r1 = src_argb[6] >> 4;
+    uint8_t a1 = src_argb[7] >> 4;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+                            (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+    uint8_t b0 = src_argb[0] >> 4;
+    uint8_t g0 = src_argb[1] >> 4;
+    uint8_t r0 = src_argb[2] >> 4;
+    uint8_t a0 = src_argb[3] >> 4;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
   }
 }
 
-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
+    uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
+    uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
+    uint32_t a0 = (src_abgr[3] >> 6);
+    *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
+    dst_ar30 += 4;
+    src_abgr += 4;
+  }
 }
 
-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
+    uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
+    uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
+    uint32_t a0 = (src_argb[3] >> 6);
+    *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+    dst_ar30 += 4;
+    src_argb += 4;
+  }
+}
+
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+}
+
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
   return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 }
-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 }
 
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
-                       uint8* dst_u, uint8* dst_v, int width) {                \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
-               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
-               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
-               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
-    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
-    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-  }                                                                            \
-}
+// ARGBToY_C and ARGBToUV_C
+#define MAKEROWY(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                   \
+    for (x = 0; x < width; ++x) {                                            \
+      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                      \
+      dst_y += 1;                                                            \
+    }                                                                        \
+  }                                                                          \
+  void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                       uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                     \
+    int x;                                                                   \
+    for (x = 0; x < width - 1; x += 2) {                                     \
+      uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] +          \
+                    src_rgb1[B + BPP]) >>                                    \
+                   2;                                                        \
+      uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] +          \
+                    src_rgb1[G + BPP]) >>                                    \
+                   2;                                                        \
+      uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] +          \
+                    src_rgb1[R + BPP]) >>                                    \
+                   2;                                                        \
+      dst_u[0] = RGBToU(ar, ag, ab);                                         \
+      dst_v[0] = RGBToV(ar, ag, ab);                                         \
+      src_rgb0 += BPP * 2;                                                   \
+      src_rgb1 += BPP * 2;                                                   \
+      dst_u += 1;                                                            \
+      dst_v += 1;                                                            \
+    }                                                                        \
+    if (width & 1) {                                                         \
+      uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                         \
+      uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                         \
+      uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                         \
+      dst_u[0] = RGBToU(ar, ag, ab);                                         \
+      dst_v[0] = RGBToV(ar, ag, ab);                                         \
+    }                                                                        \
+  }
 
 MAKEROWY(ARGB, 2, 1, 0, 4)
 MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -381,64 +465,65 @@ MAKEROWY(RAW, 0, 1, 2, 3)
 // g -0.41869 * 255 = -106.76595 = -107
 // r  0.50000 * 255 = 127.5 = 127
 
-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
-  return (38 * r + 75 * g +  15 * b + 64) >> 7;
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+  return (38 * r + 75 * g + 15 * b + 64) >> 7;
 }
 
-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 }
-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 }
 
 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
-                        uint8* dst_u, uint8* dst_v, int width) {               \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
-                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
-    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
-                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
-    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
-                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
-    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
-    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-  }                                                                            \
-}
+// ARGBToYJ_C and ARGBToUVJ_C
+#define MAKEROWYJ(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                    \
+    for (x = 0; x < width; ++x) {                                             \
+      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                       \
+      dst_y += 1;                                                             \
+    }                                                                         \
+  }                                                                           \
+  void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                        uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                      \
+    int x;                                                                    \
+    for (x = 0; x < width - 1; x += 2) {                                      \
+      uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                       \
+                        AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));          \
+      uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                       \
+                        AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));          \
+      uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                       \
+                        AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));          \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                         \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                         \
+      src_rgb0 += BPP * 2;                                                    \
+      src_rgb1 += BPP * 2;                                                    \
+      dst_u += 1;                                                             \
+      dst_v += 1;                                                             \
+    }                                                                         \
+    if (width & 1) {                                                          \
+      uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]);                            \
+      uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]);                            \
+      uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]);                            \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                         \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                         \
+    }                                                                         \
+  }
 
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 #undef MAKEROWYJ
 
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
+    uint8_t b = src_rgb565[0] & 0x1f;
+    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r = src_rgb565[1] >> 3;
     b = (b << 3) | (b >> 2);
     g = (g << 2) | (g >> 4);
     r = (r << 3) | (r >> 2);
@@ -448,12 +533,12 @@ void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
   }
 }
 
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b = src_argb1555[0] & 0x1f;
+    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
     b = (b << 3) | (b >> 2);
     g = (g << 3) | (g >> 2);
     r = (r << 3) | (r >> 2);
@@ -463,12 +548,12 @@ void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
   }
 }
 
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
+    uint8_t b = src_argb4444[0] & 0x0f;
+    uint8_t g = src_argb4444[0] >> 4;
+    uint8_t r = src_argb4444[1] & 0x0f;
     b = (b << 4) | b;
     g = (g << 4) | g;
     r = (r << 4) | r;
@@ -478,26 +563,29 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
   }
 }
 
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b1 = src_rgb565[2] & 0x1f;
-    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
-    uint8 r1 = src_rgb565[3] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b3 = next_rgb565[2] & 0x1f;
-    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
-    uint8 r3 = next_rgb565[3] >> 3;
-    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
+    uint8_t b0 = src_rgb565[0] & 0x1f;
+    uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r0 = src_rgb565[1] >> 3;
+    uint8_t b1 = src_rgb565[2] & 0x1f;
+    uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8_t r1 = src_rgb565[3] >> 3;
+    uint8_t b2 = next_rgb565[0] & 0x1f;
+    uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8_t r2 = next_rgb565[1] >> 3;
+    uint8_t b3 = next_rgb565[2] & 0x1f;
+    uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8_t r3 = next_rgb565[3] >> 3;
+    uint8_t b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
+    uint8_t g = (g0 + g1 + g2 + g3);
+    uint8_t r = (r0 + r1 + r2 + r3);
     b = (b << 1) | (b >> 6);  // 787 -> 888.
     r = (r << 1) | (r >> 6);
     dst_u[0] = RGBToU(r, g, b);
@@ -508,15 +596,15 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b = (b0 + b2);  // 565 * 2 = 676.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
+    uint8_t b0 = src_rgb565[0] & 0x1f;
+    uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r0 = src_rgb565[1] >> 3;
+    uint8_t b2 = next_rgb565[0] & 0x1f;
+    uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8_t r2 = next_rgb565[1] >> 3;
+    uint8_t b = (b0 + b2);  // 565 * 2 = 676.
+    uint8_t g = (g0 + g2);
+    uint8_t r = (r0 + r2);
     b = (b << 2) | (b >> 4);  // 676 -> 888
     g = (g << 1) | (g >> 6);
     r = (r << 2) | (r >> 4);
@@ -525,26 +613,29 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
   }
 }
 
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b1 = src_argb1555[2] & 0x1f;
-    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
-    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
-    uint8 b3 = next_argb1555[2] & 0x1f;
-    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
-    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
-    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
+    uint8_t b0 = src_argb1555[0] & 0x1f;
+    uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b1 = src_argb1555[2] & 0x1f;
+    uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8_t b2 = next_argb1555[0] & 0x1f;
+    uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8_t b3 = next_argb1555[2] & 0x1f;
+    uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8_t b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8_t g = (g0 + g1 + g2 + g3);
+    uint8_t r = (r0 + r1 + r2 + r3);
     b = (b << 1) | (b >> 6);  // 777 -> 888.
     g = (g << 1) | (g >> 6);
     r = (r << 1) | (r >> 6);
@@ -556,15 +647,15 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = next_argb1555[1] >> 3;
-    uint8 b = (b0 + b2);  // 555 * 2 = 666.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
+    uint8_t b0 = src_argb1555[0] & 0x1f;
+    uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b2 = next_argb1555[0] & 0x1f;
+    uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8_t r2 = next_argb1555[1] >> 3;
+    uint8_t b = (b0 + b2);  // 555 * 2 = 666.
+    uint8_t g = (g0 + g2);
+    uint8_t r = (r0 + r2);
     b = (b << 2) | (b >> 4);  // 666 -> 888.
     g = (g << 2) | (g >> 4);
     r = (r << 2) | (r >> 4);
@@ -573,26 +664,29 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
   }
 }
 
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b1 = src_argb4444[2] & 0x0f;
-    uint8 g1 = src_argb4444[2] >> 4;
-    uint8 r1 = src_argb4444[3] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b3 = next_argb4444[2] & 0x0f;
-    uint8 g3 = next_argb4444[2] >> 4;
-    uint8 r3 = next_argb4444[3] & 0x0f;
-    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b1 = src_argb4444[2] & 0x0f;
+    uint8_t g1 = src_argb4444[2] >> 4;
+    uint8_t r1 = src_argb4444[3] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+    uint8_t b3 = next_argb4444[2] & 0x0f;
+    uint8_t g3 = next_argb4444[2] >> 4;
+    uint8_t r3 = next_argb4444[3] & 0x0f;
+    uint8_t b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
+    uint8_t g = (g0 + g1 + g2 + g3);
+    uint8_t r = (r0 + r1 + r2 + r3);
     b = (b << 2) | (b >> 4);  // 666 -> 888.
     g = (g << 2) | (g >> 4);
     r = (r << 2) | (r >> 4);
@@ -604,15 +698,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b = (b0 + b2);  // 444 * 2 = 555.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+    uint8_t b = (b0 + b2);  // 444 * 2 = 555.
+    uint8_t g = (g0 + g2);
+    uint8_t r = (r0 + r2);
     b = (b << 3) | (b >> 2);  // 555 -> 888.
     g = (g << 3) | (g >> 2);
     r = (r << 3) | (r >> 2);
@@ -621,13 +715,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
   }
 }
 
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
+    uint8_t ab = src_argb[0];
+    uint8_t ag = src_argb[1];
+    uint8_t ar = src_argb[2];
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
     src_argb += 4;
@@ -636,45 +732,10 @@ void ARGBToUV444Row_C(const uint8* src_argb,
   }
 }
 
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  int x;
-  for (x = 0; x < width - 3; x += 4) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 16;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  // Odd width handling mimics 'any' function which replicates last pixel.
-  if ((width & 3) == 3) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  }
-}
-
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = src_argb[3];
     dst_argb += 4;
@@ -683,7 +744,7 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 }
 
 // Convert a row of image to Sepia tone.
-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -702,22 +763,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
 
 // Apply color matrix to a row of image. Matrix is signed.
 // TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = src_argb[0];
     int g = src_argb[1];
     int r = src_argb[2];
     int a = src_argb[3];
-    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
-    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
-    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
-    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
-              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+    int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
+              a * matrix_argb[3]) >>
+             6;
+    int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
+              a * matrix_argb[7]) >>
+             6;
+    int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
+              a * matrix_argb[11]) >>
+             6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
+              a * matrix_argb[15]) >>
+             6;
     dst_argb[0] = Clamp(sb);
     dst_argb[1] = Clamp(sg);
     dst_argb[2] = Clamp(sr);
@@ -728,7 +795,9 @@ void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
 }
 
 // Apply color table to a row of image.
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -744,7 +813,9 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
 }
 
 // Apply color table to a row of image.
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -757,8 +828,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
   }
 }
 
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width) {
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -772,21 +846,23 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
 }
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
+#define SHADE(f, v) v* f >> 24
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value) {
-  const uint32 b_scale = REPEAT8(value & 0xff);
-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
-  const uint32 a_scale = REPEAT8(value >> 24);
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value) {
+  const uint32_t b_scale = REPEAT8(value & 0xff);
+  const uint32_t g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32_t r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32_t a_scale = REPEAT8(value >> 24);
 
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
+    const uint32_t b = REPEAT8(src_argb[0]);
+    const uint32_t g = REPEAT8(src_argb[1]);
+    const uint32_t r = REPEAT8(src_argb[2]);
+    const uint32_t a = REPEAT8(src_argb[3]);
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
@@ -799,20 +875,22 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
 #undef SHADE
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
+#define SHADE(f, v) v* f >> 16
 
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb0[0]);
-    const uint32 g = REPEAT8(src_argb0[1]);
-    const uint32 r = REPEAT8(src_argb0[2]);
-    const uint32 a = REPEAT8(src_argb0[3]);
-    const uint32 b_scale = src_argb1[0];
-    const uint32 g_scale = src_argb1[1];
-    const uint32 r_scale = src_argb1[2];
-    const uint32 a_scale = src_argb1[3];
+    const uint32_t b = REPEAT8(src_argb0[0]);
+    const uint32_t g = REPEAT8(src_argb0[1]);
+    const uint32_t r = REPEAT8(src_argb0[2]);
+    const uint32_t a = REPEAT8(src_argb0[3]);
+    const uint32_t b_scale = src_argb1[0];
+    const uint32_t g_scale = src_argb1[1];
+    const uint32_t r_scale = src_argb1[2];
+    const uint32_t a_scale = src_argb1[3];
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
@@ -827,8 +905,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 
 #define SHADE(f, v) clamp255(v + f)
 
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                  uint8* dst_argb, int width) {
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -852,8 +932,10 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 
 #define SHADE(f, v) clamp0(f - v)
 
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -876,8 +958,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
 #undef SHADE
 
 // Sobel functions which mimics SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width) {
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i];
@@ -890,12 +975,14 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
     int b_diff = b - b_sub;
     int c_diff = c - c_sub;
     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobelx[i] = (uint8)(clamp255(sobel));
+    dst_sobelx[i] = (uint8_t)(clamp255(sobel));
   }
 }
 
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width) {
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i + 0];
@@ -908,56 +995,62 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
     int b_diff = b - b_sub;
     int c_diff = c - c_sub;
     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobely[i] = (uint8)(clamp255(sobel));
+    dst_sobely[i] = (uint8_t)(clamp255(sobel));
   }
 }
 
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width) {
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int s = clamp255(r + b);
-    dst_argb[0] = (uint8)(s);
-    dst_argb[1] = (uint8)(s);
-    dst_argb[2] = (uint8)(s);
-    dst_argb[3] = (uint8)(255u);
+    dst_argb[0] = (uint8_t)(s);
+    dst_argb[1] = (uint8_t)(s);
+    dst_argb[2] = (uint8_t)(s);
+    dst_argb[3] = (uint8_t)(255u);
     dst_argb += 4;
   }
 }
 
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width) {
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int s = clamp255(r + b);
-    dst_y[i] = (uint8)(s);
+    dst_y[i] = (uint8_t)(s);
   }
 }
 
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width) {
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int g = clamp255(r + b);
-    dst_argb[0] = (uint8)(b);
-    dst_argb[1] = (uint8)(g);
-    dst_argb[2] = (uint8)(r);
-    dst_argb[3] = (uint8)(255u);
+    dst_argb[0] = (uint8_t)(b);
+    dst_argb[1] = (uint8_t)(g);
+    dst_argb[2] = (uint8_t)(r);
+    dst_argb[3] = (uint8_t)(255u);
     dst_argb += 4;
   }
 }
 
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   // Copy a Y to RGB.
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 y = src_y[0];
+    uint8_t y = src_y[0];
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
     dst_argb += 4;
@@ -974,75 +1067,69 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 //  B = (Y - 16) * 1.164 - U * -2.018
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
+#define UG 25   /* round(0.391 * 64) */
+#define VG 52   /* round(0.813 * 64) */
 #define VR -102 /* round(-1.596 * 64) */
 
 // Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)  // 64 bit arm
 const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)  // 32 bit arm
 const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
 const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1062,74 +1149,68 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
 
 // Y contribution to R,G,B.  Scale and bias.
 #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
+#define YGB 32   /* 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414  * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UG 22   /* round(0.34414 * 64) */
+#define VG 46   /* round(0.71414  * 64) */
+#define VR -90  /* round(-1.40200 * 64) */
 
 // Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)
 const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)
 const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
 const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1143,81 +1224,76 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
 #undef YG
 
 // BT.709 YUV to RGB reference
-// *  R = Y                - V * -1.28033
-// *  G = Y - U *  0.21482 - V *  0.38059
-// *  B = Y - U * -2.12798
+//  R = (Y - 16) * 1.164              - V * -1.793
+//  G = (Y - 16) * 1.164 - U *  0.213 - V *  0.533
+//  B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
 // U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.12798 * 64)) */
-#define UG 14 /* round(0.21482 * 64) */
-#define VG 24 /* round(0.38059  * 64) */
-#define VR -82 /* round(-1.28033 * 64) */
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14   /* round(0.213 * 64) */
+#define VG 34   /* round(0.533  * 64) */
+#define VR -115 /* round(-1.793 * 64) */
 
 // Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)
 const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)
 const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
 const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1231,8 +1307,14 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
 #undef YG
 
 // C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
-                              uint8* b, uint8* g, uint8* r,
+// Reads 8 bit YUV and leaves result as 16 bit.
+
+static __inline void YuvPixel(uint8_t y,
+                              uint8_t u,
+                              uint8_t v,
+                              uint8_t* b,
+                              uint8_t* g,
+                              uint8_t* r,
                               const struct YuvConstants* yuvconstants) {
 #if defined(__aarch64__)
   int ub = -yuvconstants->kUVToRB[0];
@@ -1263,22 +1345,129 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
   int yg = yuvconstants->kYToRgb[0];
 #endif
 
-  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32)(-(u * ub)          + y1 + bb) >> 6);
-  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
-  *r = Clamp((int32)         (-(v * vr) + y1 + br) >> 6);
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
+  *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
+  *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
+}
+
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel8_16(uint8_t y,
+                                  uint8_t u,
+                                  uint8_t v,
+                                  int* b,
+                                  int* g,
+                                  int* r,
+                                  const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 10 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16(int16_t y,
+                                int16_t u,
+                                int16_t v,
+                                int* b,
+                                int* g,
+                                int* r,
+                                const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+  u = clamp255(u >> 2);
+  v = clamp255(v >> 2);
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 10 bit assembly.
+// Reads 10 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel10(uint16_t y,
+                                uint16_t u,
+                                uint16_t v,
+                                uint8_t* b,
+                                uint8_t* g,
+                                uint8_t* r,
+                                const struct YuvConstants* yuvconstants) {
+  int b16;
+  int g16;
+  int r16;
+  YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
+  *b = Clamp(b16 >> 6);
+  *g = Clamp(g16 >> 6);
+  *r = Clamp(r16 >> 6);
 }
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(y1 + YGB) >> 6);
-  *g = Clamp((int32)(y1 + YGB) >> 6);
-  *r = Clamp((int32)(y1 + YGB) >> 6);
+static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
+  uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32_t)(y1 + YGB) >> 6);
+  *g = Clamp((int32_t)(y1 + YGB) >> 6);
+  *r = Clamp((int32_t)(y1 + YGB) >> 6);
 }
 
 #undef YG
@@ -1288,16 +1477,16 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
     (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
 // C mimic assembly.
 // TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
     YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
              yuvconstants);
     rgb_buf[3] = 255;
@@ -1310,22 +1499,22 @@ void I444ToARGBRow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 #else
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
     src_y += 1;
     src_u += 1;
@@ -1336,19 +1525,19 @@ void I444ToARGBRow_C(const uint8* src_y,
 #endif
 
 // Also used for 420
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_u += 1;
@@ -1356,26 +1545,120 @@ void I422ToARGBRow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void I422AlphaToARGBRow_C(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          const uint8* src_a,
-                          uint8* rgb_buf,
+// 10 bit YUV to ARGB
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+               rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
+  uint32_t ar30;
+  b = b >> 4;  // convert 10.6 to 10 bit.
+  g = g >> 4;
+  r = r >> 4;
+  b = Clamp10(b);
+  g = Clamp10(g);
+  r = Clamp10(r);
+  ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000;
+  (*(uint32_t*)rgb_buf) = ar30;
+}
+
+// 10 bit YUV to 10 bit AR30
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
+// 8 bit YUV to 10 bit AR30
+// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
                           const struct YuvConstants* yuvconstants,
                           int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = src_a[0];
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = src_a[1];
     src_y += 2;
     src_u += 1;
@@ -1384,47 +1667,47 @@ void I422AlphaToARGBRow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = src_a[0];
   }
 }
 
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* rgb_buf,
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
     src_u += 1;
     src_v += 1;
     rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1435,8 +1718,8 @@ void I422ToARGB4444Row_C(const uint8* src_y,
     b1 = b1 >> 4;
     g1 = g1 >> 4;
     r1 = r1 >> 4;
-    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+    *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+                                 (g1 << 20) | (r1 << 24) | 0xf000f000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1447,23 +1730,22 @@ void I422ToARGB4444Row_C(const uint8* src_y,
     b0 = b0 >> 4;
     g0 = g0 >> 4;
     r0 = r0 >> 4;
-    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        0xf000;
+    *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
   }
 }
 
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb1555,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1474,8 +1756,8 @@ void I422ToARGB1555Row_C(const uint8* src_y,
     b1 = b1 >> 3;
     g1 = g1 >> 3;
     r1 = r1 >> 3;
-    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+    *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+                                 (g1 << 21) | (r1 << 26) | 0x80008000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1486,23 +1768,22 @@ void I422ToARGB1555Row_C(const uint8* src_y,
     b0 = b0 >> 3;
     g0 = g0 >> 3;
     r0 = r0 >> 3;
-    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        0x8000;
+    *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
   }
 }
 
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1513,8 +1794,8 @@ void I422ToRGB565Row_C(const uint8* src_y,
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32_t*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1525,111 +1806,111 @@ void I422ToRGB565Row_C(const uint8* src_y,
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
-  int x;
-  for (x = 0; x < width - 3; x += 4) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    YuvPixel(src_y[2], src_u[0], src_v[0],
-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
-    rgb_buf[11] = 255;
-    YuvPixel(src_y[3], src_u[0], src_v[0],
-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
-    rgb_buf[15] = 255;
-    src_y += 4;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 16;  // Advance 4 pixels.
-  }
-  if (width & 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
-    src_y += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
-  if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-  }
-}
-
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* rgb_buf,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_uv[0], src_uv[1],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_uv += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_vu,
-                     uint8* rgb_buf,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_vu[1], src_vu[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_vu += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_rgb565,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
+    src_y += 2;
+    src_uv += 2;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+  }
+}
+
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+  }
+}
+
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
@@ -1640,8 +1921,8 @@ void NV12ToRGB565Row_C(const uint8* src_y,
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32_t*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_uv += 2;
     dst_rgb565 += 4;  // Advance 2 pixels.
@@ -1651,67 +1932,67 @@ void NV12ToRGB565Row_C(const uint8* src_y,
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* rgb_buf,
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_yuy2 += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* rgb_buf,
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_uyvy += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+             rgb_buf + 7, yuvconstants);
     rgb_buf[4] = 255;
     src_y += 2;
     src_u += 1;
@@ -1719,13 +2000,13 @@ void I422ToRGBARow_C(const uint8* src_y,
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
   }
 }
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
@@ -1741,7 +2022,7 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
   }
 }
 
-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int x;
   src += width - 1;
   for (x = 0; x < width - 1; x += 2) {
@@ -1754,7 +2035,10 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8_t* src_uv,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   int x;
   src_uv += (width - 1) << 1;
   for (x = 0; x < width - 1; x += 2) {
@@ -1770,10 +2054,10 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int x;
-  const uint32* src32 = (const uint32*)(src);
-  uint32* dst32 = (uint32*)(dst);
+  const uint32_t* src32 = (const uint32_t*)(src);
+  uint32_t* dst32 = (uint32_t*)(dst);
   src32 += width - 1;
   for (x = 0; x < width - 1; x += 2) {
     dst32[x] = src32[0];
@@ -1785,7 +2069,10 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_u[x] = src_uv[0];
@@ -1800,7 +2087,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
                   int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1816,20 +2105,110 @@ void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   }
 }
 
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_r[x] = src_rgb[0];
+    dst_g[x] = src_rgb[1];
+    dst_b[x] = src_rgb[2];
+    src_rgb += 3;
+  }
+}
+
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_rgb[0] = src_r[x];
+    dst_rgb[1] = src_g[x];
+    dst_rgb[2] = src_b[x];
+    dst_rgb += 3;
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
+  }
+}
+
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_y[x] * scale;
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 1024 = 10 bits
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  scale *= 0x0101;  // replicates the byte.
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = (src_y[x] * scale) >> 16;
+  }
+}
+
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
   memcpy(dst, src, count);
 }
 
-void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
   memcpy(dst, src, count * 2);
 }
 
-void SetRow_C(uint8* dst, uint8 v8, int width) {
+void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
   memset(dst, v8, width);
 }
 
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
-  uint32* d = (uint32*)(dst_argb);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
+  uint32_t* d = (uint32_t*)(dst_argb);
   int x;
   for (x = 0; x < width; ++x) {
     d[x] = v32;
@@ -1837,8 +2216,11 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
 }
 
 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   // Output a row of UV values, filtering 2 rows of YUY2.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1851,8 +2233,10 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
 }
 
 // Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1865,7 +2249,7 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2,
 }
 
 // Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
   // Output a row of Y values.
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1879,8 +2263,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
 }
 
 // Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1893,8 +2280,10 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
 }
 
 // Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1907,7 +2296,7 @@ void UYVYToUV422Row_C(const uint8* src_uyvy,
 }
 
 // Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
   // Output a row of Y values.
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1925,17 +2314,19 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
 // Blend src_argb0 over src_argb1 and store to dst_argb.
 // dst_argb may be src_argb0 or src_argb1.
 // This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                    uint8* dst_argb, int width) {
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
+    uint32_t fb = src_argb0[0];
+    uint32_t fg = src_argb0[1];
+    uint32_t fr = src_argb0[2];
+    uint32_t a = src_argb0[3];
+    uint32_t bb = src_argb1[0];
+    uint32_t bg = src_argb1[1];
+    uint32_t br = src_argb1[2];
     dst_argb[0] = BLEND(fb, bb, a);
     dst_argb[1] = BLEND(fg, bg, a);
     dst_argb[2] = BLEND(fr, br, a);
@@ -1958,13 +2349,13 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
   }
 
   if (width & 1) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
+    uint32_t fb = src_argb0[0];
+    uint32_t fg = src_argb0[1];
+    uint32_t fr = src_argb0[2];
+    uint32_t a = src_argb0[3];
+    uint32_t bb = src_argb1[0];
+    uint32_t bg = src_argb1[1];
+    uint32_t br = src_argb1[2];
     dst_argb[0] = BLEND(fb, bb, a);
     dst_argb[1] = BLEND(fg, bg, a);
     dst_argb[2] = BLEND(fr, br, a);
@@ -1973,9 +2364,12 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
 }
 #undef BLEND
 
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width) {
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
@@ -1995,13 +2389,13 @@ void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
 
 // Multiply source RGB by alpha and store to destination.
 // This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    uint32 a = src_argb[3];
+    uint32_t b = src_argb[0];
+    uint32_t g = src_argb[1];
+    uint32_t r = src_argb[2];
+    uint32_t a = src_argb[3];
     dst_argb[0] = ATTENUATE(b, a);
     dst_argb[1] = ATTENUATE(g, a);
     dst_argb[2] = ATTENUATE(r, a);
@@ -2019,10 +2413,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   }
 
   if (width & 1) {
-    const uint32 b = src_argb[0];
-    const uint32 g = src_argb[1];
-    const uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
+    const uint32_t b = src_argb[0];
+    const uint32_t g = src_argb[1];
+    const uint32_t r = src_argb[2];
+    const uint32_t a = src_argb[3];
     dst_argb[0] = ATTENUATE(b, a);
     dst_argb[1] = ATTENUATE(g, a);
     dst_argb[2] = ATTENUATE(r, a);
@@ -2038,49 +2432,56 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 // Reciprocal method is off by 1 on some values. ie 125
 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
 #define T(a) 0x01000000 + (0x10000 / a)
-const uint32 fixed_invtbl8[256] = {
-  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
-  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
-  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
-  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
-  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
-  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
-  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
-  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
-  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
-  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
-  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
-  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
-  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
-  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
-  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
-  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
-  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
-  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
-  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
-  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
-  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
-  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
-  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
-  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
-  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
-  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
-  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
-  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
-  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
-  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
-  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+const uint32_t fixed_invtbl8[256] = {
+    0x01000000, 0x0100ffff, T(0x02), T(0x03),   T(0x04), T(0x05), T(0x06),
+    T(0x07),    T(0x08),    T(0x09), T(0x0a),   T(0x0b), T(0x0c), T(0x0d),
+    T(0x0e),    T(0x0f),    T(0x10), T(0x11),   T(0x12), T(0x13), T(0x14),
+    T(0x15),    T(0x16),    T(0x17), T(0x18),   T(0x19), T(0x1a), T(0x1b),
+    T(0x1c),    T(0x1d),    T(0x1e), T(0x1f),   T(0x20), T(0x21), T(0x22),
+    T(0x23),    T(0x24),    T(0x25), T(0x26),   T(0x27), T(0x28), T(0x29),
+    T(0x2a),    T(0x2b),    T(0x2c), T(0x2d),   T(0x2e), T(0x2f), T(0x30),
+    T(0x31),    T(0x32),    T(0x33), T(0x34),   T(0x35), T(0x36), T(0x37),
+    T(0x38),    T(0x39),    T(0x3a), T(0x3b),   T(0x3c), T(0x3d), T(0x3e),
+    T(0x3f),    T(0x40),    T(0x41), T(0x42),   T(0x43), T(0x44), T(0x45),
+    T(0x46),    T(0x47),    T(0x48), T(0x49),   T(0x4a), T(0x4b), T(0x4c),
+    T(0x4d),    T(0x4e),    T(0x4f), T(0x50),   T(0x51), T(0x52), T(0x53),
+    T(0x54),    T(0x55),    T(0x56), T(0x57),   T(0x58), T(0x59), T(0x5a),
+    T(0x5b),    T(0x5c),    T(0x5d), T(0x5e),   T(0x5f), T(0x60), T(0x61),
+    T(0x62),    T(0x63),    T(0x64), T(0x65),   T(0x66), T(0x67), T(0x68),
+    T(0x69),    T(0x6a),    T(0x6b), T(0x6c),   T(0x6d), T(0x6e), T(0x6f),
+    T(0x70),    T(0x71),    T(0x72), T(0x73),   T(0x74), T(0x75), T(0x76),
+    T(0x77),    T(0x78),    T(0x79), T(0x7a),   T(0x7b), T(0x7c), T(0x7d),
+    T(0x7e),    T(0x7f),    T(0x80), T(0x81),   T(0x82), T(0x83), T(0x84),
+    T(0x85),    T(0x86),    T(0x87), T(0x88),   T(0x89), T(0x8a), T(0x8b),
+    T(0x8c),    T(0x8d),    T(0x8e), T(0x8f),   T(0x90), T(0x91), T(0x92),
+    T(0x93),    T(0x94),    T(0x95), T(0x96),   T(0x97), T(0x98), T(0x99),
+    T(0x9a),    T(0x9b),    T(0x9c), T(0x9d),   T(0x9e), T(0x9f), T(0xa0),
+    T(0xa1),    T(0xa2),    T(0xa3), T(0xa4),   T(0xa5), T(0xa6), T(0xa7),
+    T(0xa8),    T(0xa9),    T(0xaa), T(0xab),   T(0xac), T(0xad), T(0xae),
+    T(0xaf),    T(0xb0),    T(0xb1), T(0xb2),   T(0xb3), T(0xb4), T(0xb5),
+    T(0xb6),    T(0xb7),    T(0xb8), T(0xb9),   T(0xba), T(0xbb), T(0xbc),
+    T(0xbd),    T(0xbe),    T(0xbf), T(0xc0),   T(0xc1), T(0xc2), T(0xc3),
+    T(0xc4),    T(0xc5),    T(0xc6), T(0xc7),   T(0xc8), T(0xc9), T(0xca),
+    T(0xcb),    T(0xcc),    T(0xcd), T(0xce),   T(0xcf), T(0xd0), T(0xd1),
+    T(0xd2),    T(0xd3),    T(0xd4), T(0xd5),   T(0xd6), T(0xd7), T(0xd8),
+    T(0xd9),    T(0xda),    T(0xdb), T(0xdc),   T(0xdd), T(0xde), T(0xdf),
+    T(0xe0),    T(0xe1),    T(0xe2), T(0xe3),   T(0xe4), T(0xe5), T(0xe6),
+    T(0xe7),    T(0xe8),    T(0xe9), T(0xea),   T(0xeb), T(0xec), T(0xed),
+    T(0xee),    T(0xef),    T(0xf0), T(0xf1),   T(0xf2), T(0xf3), T(0xf4),
+    T(0xf5),    T(0xf6),    T(0xf7), T(0xf8),   T(0xf9), T(0xfa), T(0xfb),
+    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
 #undef T
 
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
-    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
+    uint32_t b = src_argb[0];
+    uint32_t g = src_argb[1];
+    uint32_t r = src_argb[2];
+    const uint32_t a = src_argb[3];
+    const uint32_t ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
     b = (b * ia) >> 8;
     g = (g * ia) >> 8;
     r = (r * ia) >> 8;
@@ -2094,31 +2495,37 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   }
 }
 
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width) {
-  int32 row_sum[4] = {0, 0, 0, 0};
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width) {
+  int32_t row_sum[4] = {0, 0, 0, 0};
   int x;
   for (x = 0; x < width; ++x) {
     row_sum[0] += row[x * 4 + 0];
     row_sum[1] += row[x * 4 + 1];
     row_sum[2] += row[x * 4 + 2];
     row_sum[3] += row[x * 4 + 3];
-    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
-    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
-    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
-    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
+    cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+    cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+    cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+    cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
   }
 }
 
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
-                                int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count) {
   float ooa = 1.0f / area;
   int i;
   for (i = 0; i < count; ++i) {
-    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
-    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
-    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
-    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+    dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+    dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+    dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+    dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
     dst += 4;
     tl += 4;
     bl += 4;
@@ -2127,8 +2534,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
 
 // Copy pixels from rotated source to destination row with a slope.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width) {
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width) {
   int i;
   // Render a row of pixels from source into a buffer.
   float uv[2];
@@ -2137,9 +2547,8 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
   for (i = 0; i < width; ++i) {
     int x = (int)(uv[0]);
     int y = (int)(uv[1]);
-    *(uint32*)(dst_argb) =
-        *(const uint32*)(src_argb + y * src_argb_stride +
-                                         x * 4);
+    *(uint32_t*)(dst_argb) =
+        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
     dst_argb += 4;
     uv[0] += uv_dudv[2];
     uv[1] += uv_dudv[3];
@@ -2147,16 +2556,20 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
 }
 
 // Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
-                      uint8* dst_uv, int width) {
+static void HalfRow_C(const uint8_t* src_uv,
+                      ptrdiff_t src_uv_stride,
+                      uint8_t* dst_uv,
+                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
   }
 }
 
-static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
-                         uint16* dst_uv, int width) {
+static void HalfRow_16_C(const uint16_t* src_uv,
+                         ptrdiff_t src_uv_stride,
+                         uint16_t* dst_uv,
+                         int width) {
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -2164,12 +2577,14 @@ static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
 }
 
 // C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
-                      int width, int source_y_fraction) {
+                      int width,
+                      int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
   int x;
   if (y1_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width);
@@ -2194,12 +2609,14 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
-                         int width, int source_y_fraction) {
+                         int width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint16* src_ptr1 = src_ptr + src_stride;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
   int x;
   if (source_y_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width * 2);
@@ -2222,8 +2639,10 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 
 // Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width) {
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width) {
   int index0 = shuffler[0];
   int index1 = shuffler[1];
   int index2 = shuffler[2];
@@ -2232,10 +2651,10 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
   int x;
   for (x = 0; x < width; ++x) {
     // To support in-place conversion.
-    uint8 b = src_argb[index0];
-    uint8 g = src_argb[index1];
-    uint8 r = src_argb[index2];
-    uint8 a = src_argb[index3];
+    uint8_t b = src_argb[index0];
+    uint8_t g = src_argb[index1];
+    uint8_t r = src_argb[index2];
+    uint8_t a = src_argb[index3];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -2245,10 +2664,11 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
   }
 }
 
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_y[0];
@@ -2268,10 +2688,11 @@ void I422ToYUY2Row_C(const uint8* src_y,
   }
 }
 
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_u[0];
@@ -2291,9 +2712,8 @@ void I422ToUYVYRow_C(const uint8* src_y,
   }
 }
 
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb,
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
                          const float* poly,
                          int width) {
   int i;
@@ -2323,33 +2743,75 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
     dr += poly[14] * r3;
     da += poly[15] * a3;
 
-    dst_argb[0] = Clamp((int32)(db));
-    dst_argb[1] = Clamp((int32)(dg));
-    dst_argb[2] = Clamp((int32)(dr));
-    dst_argb[3] = Clamp((int32)(da));
+    dst_argb[0] = Clamp((int32_t)(db));
+    dst_argb[1] = Clamp((int32_t)(dg));
+    dst_argb[2] = Clamp((int32_t)(dr));
+    dst_argb[3] = Clamp((int32_t)(da));
     src_argb += 4;
     dst_argb += 4;
   }
 }
 
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff) {
-  uint32 bc = lumacoeff & 0xff;
-  uint32 gc = (lumacoeff >> 8) & 0xff;
-  uint32 rc = (lumacoeff >> 16) & 0xff;
+// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
+// adjust the source integer range to the half float range desired.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+// Work around GCC 7 punning warning -Wstrict-aliasing
+#if defined(__GNUC__)
+typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
+#else
+typedef uint32_t uint32_alias_t;
+#endif
+
+void HalfFloatRow_C(const uint16_t* src,
+                    uint16_t* dst,
+                    float scale,
+                    int width) {
+  int i;
+  float mult = 1.9259299444e-34f * scale;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * mult;
+    dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13);
+  }
+}
+
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * scale;
+    dst[i] = value;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff) {
+  uint32_t bc = lumacoeff & 0xff;
+  uint32_t gc = (lumacoeff >> 8) & 0xff;
+  uint32_t rc = (lumacoeff >> 16) & 0xff;
 
   int i;
   for (i = 0; i < width - 1; i += 2) {
     // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
-    const uint8* luma1;
+    const uint8_t* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
+    const uint8_t* luma1;
     dst_argb[0] = luma0[src_argb[0]];
     dst_argb[1] = luma0[src_argb[1]];
     dst_argb[2] = luma0[src_argb[2]];
     dst_argb[3] = src_argb[3];
-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
-              src_argb[6] * rc) & 0x7F00u) + luma;
+    luma1 =
+        ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
+        luma;
     dst_argb[4] = luma1[src_argb[4]];
     dst_argb[5] = luma1[src_argb[5]];
     dst_argb[6] = luma1[src_argb[6]];
@@ -2359,8 +2821,9 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
   }
   if (width & 1) {
     // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8_t* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
     dst_argb[0] = luma0[src_argb[0]];
     dst_argb[1] = luma0[src_argb[1]];
     dst_argb[2] = luma0[src_argb[2]];
@@ -2368,7 +2831,7 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
   }
 }
 
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
     dst[3] = src[3];
@@ -2381,7 +2844,7 @@ void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
   }
 }
 
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
     dst_a[0] = src_argb[3];
@@ -2394,7 +2857,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
   }
 }
 
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
     dst[3] = src[0];
@@ -2413,13 +2876,13 @@ void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
 #if !(defined(_MSC_VER) && defined(_M_IX86)) && \
     defined(HAS_I422TORGB565ROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_rgb565,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width) {
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2434,14 +2897,14 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb1555,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
                              const struct YuvConstants* yuvconstants,
                              int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2456,14 +2919,14 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB4444ROW_SSSE3)
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb4444,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
                              const struct YuvConstants* yuvconstants,
                              int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2478,13 +2941,13 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
 #endif
 
 #if defined(HAS_NV12TORGB565ROW_SSSE3)
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_rgb565,
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
@@ -2497,14 +2960,102 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y,
 }
 #endif
 
-#if defined(HAS_I422TORGB565ROW_AVX2)
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2523,14 +3074,14 @@ void I422ToRGB565Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB1555ROW_AVX2)
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2549,14 +3100,14 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TOARGB4444ROW_AVX2)
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2575,19 +3126,22 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_I422TORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_rgb24,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
     ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -2598,13 +3152,13 @@ void I422ToRGB24Row_AVX2(const uint8* src_y,
 #endif
 
 #if defined(HAS_NV12TORGB565ROW_AVX2)
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
@@ -2621,6 +3175,62 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
 }
 #endif
 
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
+  float fsum = 0.f;
+  int i;
+#if defined(__clang__)
+#pragma clang loop vectorize_width(4)
+#endif
+  for (i = 0; i < width; ++i) {
+    float v = *src++;
+    fsum += v * v;
+    *dst++ = v * scale;
+  }
+  return fsum;
+}
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
+  float fmax = 0.f;
+  int i;
+  for (i = 0; i < width; ++i) {
+    float v = *src++;
+    float vs = v * scale;
+    fmax = (v > fmax) ? v : fmax;
+    *dst++ = vs;
+  }
+  return fmax;
+}
+
+void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src++ * scale;
+  }
+}
+
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ =
+        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
+    ++src;
+  }
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_C(const uint16_t* src0,
+                const uint16_t* src1,
+                const uint16_t* src2,
+                const uint16_t* src3,
+                const uint16_t* src4,
+                uint32_t* dst,
+                int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc
index 1ac7ef1aa3..8d3cb81cec 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc
@@ -1,4 +1,3 @@
-// VERSION 2
 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
@@ -23,1663 +22,2001 @@ extern "C" {
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
-static vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                              13, 65, 33, 0, 13, 65, 33, 0};
 
 // JPeg full range.
-static vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                               15, 75, 38, 0, 15, 75, 38, 0};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
-static vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
-static vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+                              -18, -94, 112, 0, -18, -94, 112, 0};
 
-static vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // Constants for BGRA
-static vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                              0, 33, 65, 13, 0, 33, 65, 13};
 
-static vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR
-static vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                              33, 65, 13, 0, 33, 65, 13, 0};
 
-static vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                              0, 13, 65, 33, 0, 13, 65, 33};
 
-static vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 
 // Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
 // Shuffle table for converting RAW to RGB24.  First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
-static uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kShuffleMaskARGBToRAW = {
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
-static uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
 #ifdef HAS_J400TOARGBROW_SSE2
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0x18,%%xmm5                    \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm0,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm1                   \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_J400TOARGBROW_SSE2
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleMaskRGB24ToARGB)  // %3
-  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff000000
+      "pslld     $0x18,%%xmm5                    \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm3                 \n"
+      "lea       0x30(%0),%0                     \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "palignr   $0x8,%%xmm1,%%xmm2              \n"
+      "pshufb    %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm2                   \n"
+      "palignr   $0xc,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "palignr   $0x4,%%xmm3,%%xmm3              \n"
+      "pshufb    %%xmm4,%%xmm3                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm3,0x30(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_rgb24),              // %0
+        "+r"(dst_argb),               // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskRGB24ToARGB)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleMaskRAWToARGB)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff000000
+      "pslld     $0x18,%%xmm5                    \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm3                 \n"
+      "lea       0x30(%0),%0                     \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "palignr   $0x8,%%xmm1,%%xmm2              \n"
+      "pshufb    %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm2                   \n"
+      "palignr   $0xc,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "palignr   $0x4,%%xmm3,%%xmm3              \n"
+      "pshufb    %%xmm4,%%xmm3                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm3,0x30(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_raw),              // %0
+        "+r"(dst_argb),             // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskRAWToARGB)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-   "movdqa     %3,%%xmm3                       \n"
-   "movdqa     %4,%%xmm4                       \n"
-   "movdqa     %5,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
-    "lea       " MEMLEA(0x18,0) ",%0           \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
-    "m"(kShuffleMaskRAWToRGB24_1),  // %4
-    "m"(kShuffleMaskRAWToRGB24_2)   // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm3                       \n"
+      "movdqa     %4,%%xmm4                       \n"
+      "movdqa     %5,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x4(%0),%%xmm1                  \n"
+      "movdqu    0x8(%0),%%xmm2                  \n"
+      "lea       0x18(%0),%0                     \n"
+      "pshufb    %%xmm3,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq      %%xmm1,0x8(%1)                  \n"
+      "movq      %%xmm2,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_raw),                  // %0
+        "+r"(dst_rgb24),                // %1
+        "+r"(width)                     // %2
+      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
+        "m"(kShuffleMaskRAWToRGB24_1),  // %4
+        "m"(kShuffleMaskRAWToRGB24_2)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x20802080,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xa,%%xmm4                     \n"
-    "psrlw     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov       $0x1080108,%%eax                \n"
+      "movd      %%eax,%%xmm5                    \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "mov       $0x20802080,%%eax               \n"
+      "movd      %%eax,%%xmm6                    \n"
+      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "psllw     $0xb,%%xmm3                     \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psllw     $0xa,%%xmm4                     \n"
+      "psrlw     $0x5,%%xmm4                     \n"
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psllw     $0x8,%%xmm7                     \n"
+      "sub       %0,%1                           \n"
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm3,%%xmm1                   \n"
+      "psllw     $0xb,%%xmm2                     \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm2                   \n"
+      "psllw     $0x8,%%xmm1                     \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "pmulhuw   %%xmm6,%%xmm0                   \n"
+      "por       %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm0,%%xmm1                   \n"
+      "punpckhbw %%xmm0,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x00(%1,%0,2)            \n"
+      "movdqu    %%xmm2,0x10(%1,%0,2)            \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
 
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x42004200,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "movdqa    %%xmm3,%%xmm4                   \n"
-    "psrlw     $0x6,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psllw     $0x1,%%xmm1                     \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov       $0x1080108,%%eax                \n"
+      "movd      %%eax,%%xmm5                    \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "mov       $0x42004200,%%eax               \n"
+      "movd      %%eax,%%xmm6                    \n"
+      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "psllw     $0xb,%%xmm3                     \n"
+      "movdqa    %%xmm3,%%xmm4                   \n"
+      "psrlw     $0x6,%%xmm4                     \n"
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psllw     $0x8,%%xmm7                     \n"
+      "sub       %0,%1                           \n"
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "psllw     $0x1,%%xmm1                     \n"
+      "psllw     $0xb,%%xmm2                     \n"
+      "pand      %%xmm3,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm2                   \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "psllw     $0x8,%%xmm1                     \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "psraw     $0x8,%%xmm2                     \n"
+      "pmulhuw   %%xmm6,%%xmm0                   \n"
+      "pand      %%xmm7,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm0                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm0,%%xmm1                   \n"
+      "punpckhbw %%xmm0,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x00(%1,%0,2)            \n"
+      "movdqu    %%xmm2,0x10(%1,%0,2)            \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
 
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0xf0f0f0f,%%eax                \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x4,%%xmm5                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "psllw     $0x4,%%xmm1                     \n"
-    "psrlw     $0x4,%%xmm3                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov       $0xf0f0f0f,%%eax                \n"
+      "movd      %%eax,%%xmm4                    \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "pslld     $0x4,%%xmm5                     \n"
+      "sub       %0,%1                           \n"
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "psllw     $0x4,%%xmm1                     \n"
+      "psrlw     $0x4,%%xmm3                     \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm2,%%xmm0                   \n"
+      "punpckhbw %%xmm2,%%xmm1                   \n"
+      "movdqu    %%xmm0,0x00(%1,%0,2)            \n"
+      "movdqu    %%xmm1,0x10(%1,%0,2)            \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(kShuffleMaskARGBToRGB24)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      "movdqa    %3,%%xmm6                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "pshufb    %%xmm6,%%xmm0                   \n"
+      "pshufb    %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm6,%%xmm2                   \n"
+      "pshufb    %%xmm6,%%xmm3                   \n"
+      "movdqa    %%xmm1,%%xmm4                   \n"
+      "psrldq    $0x4,%%xmm1                     \n"
+      "pslldq    $0xc,%%xmm4                     \n"
+      "movdqa    %%xmm2,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pslldq    $0x8,%%xmm5                     \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "psrldq    $0x8,%%xmm2                     \n"
+      "pslldq    $0x4,%%xmm3                     \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "lea       0x30(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskARGBToRGB24)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(kShuffleMaskARGBToRAW)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      "movdqa    %3,%%xmm6                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "pshufb    %%xmm6,%%xmm0                   \n"
+      "pshufb    %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm6,%%xmm2                   \n"
+      "pshufb    %%xmm6,%%xmm3                   \n"
+      "movdqa    %%xmm1,%%xmm4                   \n"
+      "psrldq    $0x4,%%xmm1                     \n"
+      "pslldq    $0xc,%%xmm4                     \n"
+      "movdqa    %%xmm2,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pslldq    $0x8,%%xmm5                     \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "psrldq    $0x8,%%xmm2                     \n"
+      "pslldq    $0x4,%%xmm3                     \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "lea       0x30(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskARGBToRAW)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psrld     $0x1b,%%xmm3                    \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1a,%%xmm4                    \n"
-    "pslld     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0xb,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pslld     $0x8,%%xmm0                     \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x5,%%xmm2                     \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd for 12+12 to 24
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm6                  \n"
+      "vmovdqa    %4,%%ymm7                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "lea        0x80(%0),%0                    \n"
+      "vpshufb    %%ymm6,%%ymm0,%%ymm0           \n"  // xxx0yyy0
+      "vpshufb    %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpshufb    %%ymm6,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm3           \n"
+      "vpermd     %%ymm0,%%ymm7,%%ymm0           \n"  // pack to 24 bytes
+      "vpermd     %%ymm1,%%ymm7,%%ymm1           \n"
+      "vpermd     %%ymm2,%%ymm7,%%ymm2           \n"
+      "vpermd     %%ymm3,%%ymm7,%%ymm3           \n"
+      "vpermq     $0x3f,%%ymm1,%%ymm4            \n"  // combine 24 + 8
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vpermq     $0xf9,%%ymm1,%%ymm1            \n"  // combine 16 + 16
+      "vpermq     $0x4f,%%ymm2,%%ymm4            \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "vpermq     $0xfe,%%ymm2,%%ymm2            \n"  // combine 8 + 24
+      "vpermq     $0x93,%%ymm3,%%ymm3            \n"
+      "vpor       %%ymm3,%%ymm2,%%ymm2           \n"
+      "vmovdqu    %%ymm2,0x40(%1)                \n"
+      "lea        0x60(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+r"(width)                    // %2
+      : "m"(kShuffleMaskARGBToRGB24),  // %3
+        "m"(kPermdRGB24_AVX)           // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+static const ulvec8 kPermARGBToRGB24_0 = {
+    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
+    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vmovdqa    %3,%%ymm5                      \n"
+      "vmovdqa    %4,%%ymm6                      \n"
+      "vmovdqa    %5,%%ymm7                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "lea        0x80(%0),%0                    \n"
+      "vpermt2b   %%ymm1,%%ymm5,%%ymm0           \n"
+      "vpermt2b   %%ymm2,%%ymm6,%%ymm1           \n"
+      "vpermt2b   %%ymm3,%%ymm7,%%ymm2           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "vmovdqu    %%ymm2,0x40(%1)                \n"
+      "lea        0x60(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                // %0
+        "+r"(dst),                // %1
+        "+r"(width)               // %2
+      : "m"(kPermARGBToRGB24_0),  // %3
+        "m"(kPermARGBToRGB24_1),  // %4
+        "m"(kPermARGBToRGB24_2)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_AVX2
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm6                  \n"
+      "vmovdqa    %4,%%ymm7                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "lea        0x80(%0),%0                    \n"
+      "vpshufb    %%ymm6,%%ymm0,%%ymm0           \n"  // xxx0yyy0
+      "vpshufb    %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpshufb    %%ymm6,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm3           \n"
+      "vpermd     %%ymm0,%%ymm7,%%ymm0           \n"  // pack to 24 bytes
+      "vpermd     %%ymm1,%%ymm7,%%ymm1           \n"
+      "vpermd     %%ymm2,%%ymm7,%%ymm2           \n"
+      "vpermd     %%ymm3,%%ymm7,%%ymm3           \n"
+      "vpermq     $0x3f,%%ymm1,%%ymm4            \n"  // combine 24 + 8
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vpermq     $0xf9,%%ymm1,%%ymm1            \n"  // combine 16 + 16
+      "vpermq     $0x4f,%%ymm2,%%ymm4            \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "vpermq     $0xfe,%%ymm2,%%ymm2            \n"  // combine 8 + 24
+      "vpermq     $0x93,%%ymm3,%%ymm3            \n"
+      "vpor       %%ymm3,%%ymm2,%%ymm2           \n"
+      "vmovdqu    %%ymm2,0x40(%1)                \n"
+      "lea        0x60(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                   // %0
+        "+r"(dst),                   // %1
+        "+r"(width)                  // %2
+      : "m"(kShuffleMaskARGBToRAW),  // %3
+        "m"(kPermdRGB24_AVX)         // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "psrld     $0x1b,%%xmm3                    \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psrld     $0x1a,%%xmm4                    \n"
+      "pslld     $0x5,%%xmm4                     \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0xb,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pslld     $0x8,%%xmm0                     \n"
+      "psrld     $0x3,%%xmm1                     \n"
+      "psrld     $0x5,%%xmm2                     \n"
+      "psrad     $0x10,%%xmm0                    \n"
+      "pand      %%xmm3,%%xmm1                   \n"
+      "pand      %%xmm4,%%xmm2                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "movd       %3,%%xmm6                      \n"
-    "punpcklbw  %%xmm6,%%xmm6                  \n"
-    "movdqa     %%xmm6,%%xmm7                  \n"
-    "punpcklwd  %%xmm6,%%xmm6                  \n"
-    "punpckhwd  %%xmm7,%%xmm7                  \n"
-    "pcmpeqb    %%xmm3,%%xmm3                  \n"
-    "psrld      $0x1b,%%xmm3                   \n"
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrld      $0x1a,%%xmm4                   \n"
-    "pslld      $0x5,%%xmm4                    \n"
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "pslld      $0xb,%%xmm5                    \n"
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "movd       %3,%%xmm6                      \n"
+      "punpcklbw  %%xmm6,%%xmm6                  \n"
+      "movdqa     %%xmm6,%%xmm7                  \n"
+      "punpcklwd  %%xmm6,%%xmm6                  \n"
+      "punpckhwd  %%xmm7,%%xmm7                  \n"
+      "pcmpeqb    %%xmm3,%%xmm3                  \n"
+      "psrld      $0x1b,%%xmm3                   \n"
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrld      $0x1a,%%xmm4                   \n"
+      "pslld      $0x5,%%xmm4                    \n"
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "pslld      $0xb,%%xmm5                    \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu     (%0),%%xmm0                    \n"
-    "paddusb    %%xmm6,%%xmm0                  \n"
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "movdqa     %%xmm0,%%xmm2                  \n"
-    "pslld      $0x8,%%xmm0                    \n"
-    "psrld      $0x3,%%xmm1                    \n"
-    "psrld      $0x5,%%xmm2                    \n"
-    "psrad      $0x10,%%xmm0                   \n"
-    "pand       %%xmm3,%%xmm1                  \n"
-    "pand       %%xmm4,%%xmm2                  \n"
-    "pand       %%xmm5,%%xmm0                  \n"
-    "por        %%xmm2,%%xmm1                  \n"
-    "por        %%xmm1,%%xmm0                  \n"
-    "packssdw   %%xmm0,%%xmm0                  \n"
-    "lea        0x10(%0),%0                    \n"
-    "movq       %%xmm0,(%1)                    \n"
-    "lea        0x8(%1),%1                     \n"
-    "sub        $0x4,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "paddusb    %%xmm6,%%xmm0                  \n"
+      "movdqa     %%xmm0,%%xmm1                  \n"
+      "movdqa     %%xmm0,%%xmm2                  \n"
+      "pslld      $0x8,%%xmm0                    \n"
+      "psrld      $0x3,%%xmm1                    \n"
+      "psrld      $0x5,%%xmm2                    \n"
+      "psrad      $0x10,%%xmm0                   \n"
+      "pand       %%xmm3,%%xmm1                  \n"
+      "pand       %%xmm4,%%xmm2                  \n"
+      "pand       %%xmm5,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm1                  \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "packssdw   %%xmm0,%%xmm0                  \n"
+      "lea        0x10(%0),%0                    \n"
+      "movq       %%xmm0,(%1)                    \n"
+      "lea        0x8(%1),%1                     \n"
+      "sub        $0x4,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vbroadcastss %3,%%xmm6                    \n"
-    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
-    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
-    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
-    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
-    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
-    "vpslld     $0xb,%%ymm3,%%ymm5             \n"
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "vbroadcastss %3,%%xmm6                    \n"
+      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
+      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
+      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
+      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
+      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
+      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
+      "vpslld     $0xb,%%ymm3,%%ymm5             \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%0),%%ymm0                    \n"
-    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
-    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
-    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "lea        0x20(%0),%0                    \n"
-    "vmovdqu    %%xmm0,(%1)                    \n"
-    "lea        0x10(%1),%1                    \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
+      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
+      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
+      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "lea        0x20(%0),%0                    \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psrld     $0x1b,%%xmm4                    \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "pslld     $0x5,%%xmm5                     \n"
+      "movdqa    %%xmm4,%%xmm6                   \n"
+      "pslld     $0xa,%%xmm6                     \n"
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "pslld     $0xf,%%xmm7                     \n"
 
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1b,%%xmm4                    \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x5,%%xmm5                     \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "pslld     $0xa,%%xmm6                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "pslld     $0xf,%%xmm7                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x6,%%xmm2                     \n"
-    "psrld     $0x9,%%xmm3                     \n"
-    "pand      %%xmm7,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm6,%%xmm3                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "psrad     $0x10,%%xmm0                    \n"
+      "psrld     $0x3,%%xmm1                     \n"
+      "psrld     $0x6,%%xmm2                     \n"
+      "psrld     $0x9,%%xmm3                     \n"
+      "pand      %%xmm7,%%xmm0                   \n"
+      "pand      %%xmm4,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm2                   \n"
+      "pand      %%xmm6,%%xmm3                   \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm4,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm3,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "psrlq     $0x4,%%xmm0                     \n"
-    "psrlq     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psllw     $0xc,%%xmm4                     \n"
+      "movdqa    %%xmm4,%%xmm3                   \n"
+      "psrlw     $0x8,%%xmm3                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm3,%%xmm0                   \n"
+      "pand      %%xmm4,%%xmm1                   \n"
+      "psrlq     $0x4,%%xmm0                     \n"
+      "psrlq     $0x8,%%xmm1                     \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
+/*
+
+ARGBToAR30Row:
+
+Red Blue
+With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
+produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats
+wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
+(1024+4)*16 for red.
+
+Alpha Green
+Alpha and Green are already in the high bits so vpand can zero out the other
+bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
+could be used for Green - (1024+4) putting the 10 bit green in the lsb.  Alpha
+would be a simple multiplier to shift it into position.  It wants a gap of 10
+above the green.  Green is 10 bits, so there are 6 bits in the low short.  4
+more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
+and then a shift of 4 is a multiply of 16, so (4*16) = 64.  Then shift the
+result left 10 to position the A and G channels.
+*/
+
+// Shuffle table for converting RAW to RGB24.  Last 8.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
+                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
+                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm2                     \n"  // shuffler for RB
+      "movd       %4,%%xmm3                     \n"  // multipler for RB
+      "movd       %5,%%xmm4                     \n"  // mask for R10 B10
+      "movd       %6,%%xmm5                     \n"  // mask for AG
+      "movd       %7,%%xmm6                     \n"  // multipler for AG
+      "pshufd     $0x0,%%xmm3,%%xmm3            \n"
+      "pshufd     $0x0,%%xmm4,%%xmm4            \n"
+      "pshufd     $0x0,%%xmm5,%%xmm5            \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6            \n"
+      "sub        %0,%1                         \n"
+
+      "1:                                       \n"
+      "movdqu     (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
+      "movdqa     %%xmm0,%%xmm1                 \n"
+      "pshufb     %%xmm2,%%xmm1                 \n"  // R0B0
+      "pand       %%xmm5,%%xmm0                 \n"  // A0G0
+      "pmulhuw    %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
+      "pmulhuw    %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
+      "pand       %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
+      "pslld      $10,%%xmm0                    \n"  // A2 x10 G10 x10
+      "por        %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
+      "movdqu     %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
+      "add        $0x10,%0                      \n"
+      "sub        $0x4,%2                       \n"
+      "jg         1b                            \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm2                     \n"  // shuffler for RB
+      "movd       %4,%%xmm3                     \n"  // multipler for RB
+      "movd       %5,%%xmm4                     \n"  // mask for R10 B10
+      "movd       %6,%%xmm5                     \n"  // mask for AG
+      "movd       %7,%%xmm6                     \n"  // multipler for AG
+      "pshufd     $0x0,%%xmm3,%%xmm3            \n"
+      "pshufd     $0x0,%%xmm4,%%xmm4            \n"
+      "pshufd     $0x0,%%xmm5,%%xmm5            \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6            \n"
+      "sub        %0,%1                         \n"
+
+      "1:                                       \n"
+      "movdqu     (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
+      "movdqa     %%xmm0,%%xmm1                 \n"
+      "pshufb     %%xmm2,%%xmm1                 \n"  // R0B0
+      "pand       %%xmm5,%%xmm0                 \n"  // A0G0
+      "pmulhuw    %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
+      "pmulhuw    %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
+      "pand       %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
+      "pslld      $10,%%xmm0                    \n"  // A2 x10 G10 x10
+      "por        %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
+      "movdqu     %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
+      "add        $0x10,%0                      \n"
+      "sub        $0x4,%2                       \n"
+      "jg         1b                            \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleBR30),  // %3  reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
+      "vbroadcastss  %4,%%ymm3                   \n"  // multipler for RB
+      "vbroadcastss  %5,%%ymm4                   \n"  // mask for R10 B10
+      "vbroadcastss  %6,%%ymm5                   \n"  // mask for AG
+      "vbroadcastss  %7,%%ymm6                   \n"  // multipler for AG
+      "sub        %0,%1                          \n"
+
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"  // fetch 8 ARGB pixels
+      "vpshufb    %%ymm2,%%ymm0,%%ymm1           \n"  // R0B0
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // A0G0
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"  // X2 R16 X4  B10
+      "vpmulhuw   %%ymm6,%%ymm0,%%ymm0           \n"  // X10 A2 X10 G10
+      "vpand      %%ymm4,%%ymm1,%%ymm1           \n"  // X2 R10 X10 B10
+      "vpslld     $10,%%ymm0,%%ymm0              \n"  // A2 x10 G10 x10
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // A2 R10 G10 B10
+      "vmovdqu    %%ymm0,(%1,%0)                 \n"  // store 8 AR30 pixels
+      "add        $0x20,%0                       \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
+      "vbroadcastss  %4,%%ymm3                   \n"  // multipler for RB
+      "vbroadcastss  %5,%%ymm4                   \n"  // mask for R10 B10
+      "vbroadcastss  %6,%%ymm5                   \n"  // mask for AG
+      "vbroadcastss  %7,%%ymm6                   \n"  // multipler for AG
+      "sub        %0,%1                          \n"
+
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"  // fetch 8 ABGR pixels
+      "vpshufb    %%ymm2,%%ymm0,%%ymm1           \n"  // R0B0
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // A0G0
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"  // X2 R16 X4  B10
+      "vpmulhuw   %%ymm6,%%ymm0,%%ymm0           \n"  // X10 A2 X10 G10
+      "vpand      %%ymm4,%%ymm1,%%ymm1           \n"  // X2 R10 X10 B10
+      "vpslld     $10,%%ymm0,%%ymm0              \n"  // A2 x10 G10 x10
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // A2 R10 G10 B10
+      "vmovdqu    %%ymm0,(%1,%0)                 \n"  // store 8 AR30 pixels
+      "add        $0x20,%0                       \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleBR30),  // %3  reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 #ifdef HAS_ARGBTOYJROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToYJ),  // %3
-    "m"(kAddYJ64)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "paddw     %%xmm5,%%xmm0                   \n"
+      "paddw     %%xmm5,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToYJ),  // %3
+        "m"(kAddYJ64)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBTOYJROW_SSSE3
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vmovdqu    %5,%%ymm6                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "lea       0x80(%0),%0                     \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+      "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+      "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+      "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+      "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+      "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kARGBToY),          // %3
+        "m"(kAddY16),           // %4
+        "m"(kPermdARGBToY_AVX)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBTOYROW_AVX2
 
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
-    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vmovdqu    %5,%%ymm6                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "lea       0x80(%0),%0                     \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+      "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
+      "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
+      "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+      "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+      "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kARGBToYJ),         // %3
+        "m"(kAddYJ64),          // %4
+        "m"(kPermdARGBToY_AVX)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBTOYJROW_AVX2
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToV),  // %5
-    "m"(kARGBToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kARGBToV),                     // %5
+        "m"(kARGBToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVROW_SSSE3
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub        %1,%2                          \n"
 
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "vpavgb    0x40(%0,%4,1),%%ymm2,%%ymm2     \n"
+      "vpavgb    0x60(%0,%4,1),%%ymm3,%%ymm3     \n"
+      "lea        0x80(%0),%0                    \n"
+      "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+      "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+      "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
 
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUV128),  // %5
-    "m"(kARGBToV),   // %6
-    "m"(kARGBToU),   // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+      "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpshufb    %8,%%ymm0,%%ymm0               \n"
+      "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x20,%3                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kAddUV128),                    // %5
+        "m"(kARGBToV),                     // %6
+        "m"(kARGBToU),                     // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub        %1,%2                          \n"
 
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "vpavgb    0x40(%0,%4,1),%%ymm2,%%ymm2     \n"
+      "vpavgb    0x60(%0,%4,1),%%ymm3,%%ymm3     \n"
+      "lea       0x80(%0),%0                     \n"
+      "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+      "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+      "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
 
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUVJ128),  // %5
-    "m"(kARGBToVJ),  // %6
-    "m"(kARGBToUJ),  // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+      "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpshufb    %8,%%ymm0,%%ymm0               \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kAddUVJ128),                   // %5
+        "m"(kARGBToVJ),                    // %6
+        "m"(kARGBToUJ),                    // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOUVJROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToVJ),  // %5
-    "m"(kARGBToUJ),  // %6
-    "m"(kAddUVJ128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "paddw     %%xmm5,%%xmm0                   \n"
+      "paddw     %%xmm5,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kARGBToVJ),                    // %5
+        "m"(kARGBToUJ),                    // %6
+        "m"(kAddUVJ128)                    // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVJROW_SSSE3
 
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
                           int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm3                       \n"
-    "movdqa    %5,%%xmm4                       \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "m"(kARGBToV),  // %4
-    "m"(kARGBToU),  // %5
-    "m"(kAddUV128)  // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6"
-  );
+  asm volatile(
+      "movdqa    %4,%%xmm3                       \n"
+      "movdqa    %5,%%xmm4                       \n"
+      "movdqa    %6,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm6                   \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm2                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm2                     \n"
+      "packsswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "pmaddubsw %%xmm3,%%xmm0                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm2                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm2                     \n"
+      "packsswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "movdqu    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+rm"(width)     // %3
+      : "m"(kARGBToV),   // %4
+        "m"(kARGBToU),   // %5
+        "m"(kAddUV128)   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
 }
 #endif  // HAS_ARGBTOUV444ROW_SSSE3
 
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %4,%%xmm5                       \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kBGRAToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+                       int src_stride_bgra,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_bgra)), // %4
-    "m"(kBGRAToV),  // %5
-    "m"(kBGRAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_bgra0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_bgra)),  // %4
+        "m"(kBGRAToV),                     // %5
+        "m"(kBGRAToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %4,%%xmm5                       \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kABGRToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kRGBAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %4,%%xmm5                       \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kRGBAToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_abgr)), // %4
-    "m"(kABGRToV),  // %5
-    "m"(kABGRToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_abgr0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_abgr)),  // %4
+        "m"(kABGRToV),                     // %5
+        "m"(kABGRToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+                       int src_stride_rgba,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_rgba)), // %4
-    "m"(kRGBAToV),  // %5
-    "m"(kRGBAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_rgba0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_rgba)),  // %4
+        "m"(kRGBAToV),                     // %5
+        "m"(kRGBAToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
 
 // Read 8 UV from 444
-#define READYUV444                                                             \
-    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READYUV444                                                \
+  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 UV from 422, upsample to 8 UV
-#define READYUV422                                                             \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READYUV422                                                \
+  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
+
+// Read 4 UV from 422 10 bit, upsample to 8 UV
+// TODO(fbarchard): Consider shufb to replace pack/unpack
+// TODO(fbarchard): Consider pmulhuw to replace psraw
+// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
+#define READYUV210                                                \
+  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
+  "psraw      $0x2,%%xmm0                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movdqu     (%[y_buf]),%%xmm4                               \n" \
+  "psllw      $0x6,%%xmm4                                     \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
-    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
-
-// Read 2 UV from 411, upsample to 8 UV.
-// reading 4 bytes is an msan violation.
-//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
-//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
-// pinsrw fails with drmemory
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_TEMP                                                        \
-    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
-    "movd       %[temp],%%xmm0                                  \n"            \
-    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
-    "movd       %[temp],%%xmm1                                  \n"            \
-    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READYUVA422                                               \
+  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
+  "movq       (%[a_buf]),%%xmm5                               \n" \
+  "lea        0x8(%[a_buf]),%[a_buf]                          \n"
 
 // Read 4 UV from NV12, upsample to 8 UV
-#define READNV12                                                               \
-    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READNV12                                                  \
+  "movq       (%[uv_buf]),%%xmm0                              \n" \
+  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 VU from NV21, upsample to 8 UV
-#define READNV21                                                               \
-    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
-    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READNV21                                                  \
+  "movq       (%[vu_buf]),%%xmm0                              \n" \
+  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
+  "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2                                                               \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
-    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
-    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
-    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
+#define READYUY2                                                  \
+  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
+  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
+  "movdqu     (%[yuy2_buf]),%%xmm0                            \n" \
+  "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
+  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"
 
 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY                                                               \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
-    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
-    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
-    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
+#define READUYVY                                                  \
+  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
+  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
+  "movdqu     (%[uyvy_buf]),%%xmm0                            \n" \
+  "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
+  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants)                                           \
-    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
-    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
-    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
-    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
-    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
-    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
-    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
+#define YUVTORGB_SETUP(yuvconstants)                              \
+  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
+  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
+  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm12                    \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm13                    \n" \
+  "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     %%xmm11,%%xmm0                                  \n"            \
-    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     %%xmm12,%%xmm1                                  \n"            \
-    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     %%xmm13,%%xmm2                                  \n"            \
-    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB16(yuvconstants)                                  \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     %%xmm11,%%xmm0                                  \n" \
+  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     %%xmm12,%%xmm1                                  \n" \
+  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     %%xmm13,%%xmm2                                  \n" \
+  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
-    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
-    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
-    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB16(yuvconstants)                                  \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm0                      \n" \
+  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm1                     \n" \
+  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm2                     \n" \
+  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS
 #endif
 
+#define YUVTORGB(yuvconstants)                                    \
+  YUVTORGB16(yuvconstants)                                        \
+  "psraw      $0x6,%%xmm0                                     \n" \
+  "psraw      $0x6,%%xmm1                                     \n" \
+  "psraw      $0x6,%%xmm2                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm1                                   \n" \
+  "packuswb   %%xmm2,%%xmm2                                   \n"
+
 // Store 8 ARGB values.
-#define STOREARGB                                                              \
-    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
-    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
-    "movdqa     %%xmm0,%%xmm1                                    \n"           \
-    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
-    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
-    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
-    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
-    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
+#define STOREARGB                                                  \
+  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
+  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
+  "movdqa     %%xmm0,%%xmm1                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
+  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
+  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
+  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
 
 // Store 8 RGBA values.
-#define STORERGBA                                                              \
-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
-    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
-    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
-    "movdqa    %%xmm5,%%xmm0                                     \n"           \
-    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
-    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
-    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
-    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
+#define STORERGBA                                                  \
+  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
+  "punpcklbw %%xmm2,%%xmm1                                     \n" \
+  "punpcklbw %%xmm0,%%xmm5                                     \n" \
+  "movdqa    %%xmm5,%%xmm0                                     \n" \
+  "punpcklwd %%xmm1,%%xmm5                                     \n" \
+  "punpckhwd %%xmm1,%%xmm0                                     \n" \
+  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
+  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
+  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"
 
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+// Store 8 AR30 values.
+#define STOREAR30                                                  \
+  "psraw      $0x4,%%xmm0                                      \n" \
+  "psraw      $0x4,%%xmm1                                      \n" \
+  "psraw      $0x4,%%xmm2                                      \n" \
+  "pminsw     %%xmm7,%%xmm0                                    \n" \
+  "pminsw     %%xmm7,%%xmm1                                    \n" \
+  "pminsw     %%xmm7,%%xmm2                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
+  "psllw      $0x4,%%xmm2                                      \n" \
+  "movdqa     %%xmm0,%%xmm3                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
+  "movdqa     %%xmm1,%%xmm2                                    \n" \
+  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
+  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
+  "pslld      $0xa,%%xmm1                                      \n" \
+  "pslld      $0xa,%%xmm2                                      \n" \
+  "por        %%xmm1,%%xmm0                                    \n" \
+  "por        %%xmm2,%%xmm3                                    \n" \
+  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
+  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
+  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV444
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1691,15 +2028,15 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* dst_rgb24,
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_rgb24,
                                  const struct YuvConstants* yuvconstants,
                                  int width) {
   asm volatile (
@@ -1707,8 +2044,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
     "sub       %[u_buf],%[v_buf]               \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     "punpcklbw %%xmm1,%%xmm0                   \n"
@@ -1719,16 +2057,16 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
     "pshufb    %%xmm5,%%xmm0                   \n"
     "pshufb    %%xmm6,%%xmm1                   \n"
     "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
-    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+    "movq      %%xmm0,(%[dst_rgb24])           \n"
+    "movdqu    %%xmm1,0x8(%[dst_rgb24])        \n"
+    "lea       0x18(%[dst_rgb24]),%[dst_rgb24] \n"
     "subl      $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
@@ -1736,23 +2074,24 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1764,24 +2103,125 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                                     const uint8* u_buf,
-                                     const uint8* v_buf,
-                                     const uint8* a_buf,
-                                     uint8* dst_argb,
-                                     const struct YuvConstants* yuvconstants,
-                                     int width) {
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // AR30 constants
+    "psrlw     $14,%%xmm5                      \n"
+    "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
+    "pxor      %%xmm6,%%xmm6                   \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // 0 for min
+    "psrlw     $6,%%xmm7                       \n"  // 1023 for max
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// 10 bit YUV to ARGB
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+                                const uint16_t* u_buf,
+                                const uint16_t* v_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+// 10 bit YUV to AR30
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+                                const uint16_t* u_buf,
+                                const uint16_t* v_buf,
+                                uint8_t* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $14,%%xmm5                      \n"
+    "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
+    "pxor      %%xmm6,%%xmm6                   \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // 0 for min
+    "psrlw     $6,%%xmm7                       \n"  // 1023 for max
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                                     const uint8_t* u_buf,
+                                     const uint8_t* v_buf,
+                                     const uint8_t* a_buf,
+                                     uint8_t* dst_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+
+    LABELALIGN
+    "1:                                        \n"
     READYUVA422
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1792,64 +2232,31 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [a_buf]"+r"(a_buf),    // %[a_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
 #endif
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_I422ALPHATOARGBROW_SSSE3
 
-#ifdef HAS_I411TOARGBROW_SSSE3
-void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* uv_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
-  int temp;
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411_TEMP
-    YUVTORGB(yuvconstants)
-    STOREARGB
-    "subl      $0x8,%[width]                   \n"
-    "jg        1b                              \n"
-  : [y_buf]"+r"(y_buf),        // %[y_buf]
-    [u_buf]"+r"(u_buf),        // %[u_buf]
-    [v_buf]"+r"(v_buf),        // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [temp]"=&r"(temp),         // %[temp]
-#if defined(__i386__) && defined(__pic__)
-    [width]"+m"(width)         // %[width]
-#else
-    [width]"+rm"(width)        // %[width]
-#endif
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif
 
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* uv_buf,
-                                uint8* dst_argb,
-                                const struct YuvConstants* yuvconstants,
-                                int width) {
-  asm volatile (
-    YUVTORGB_SETUP(yuvconstants)
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV12
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1860,21 +2267,24 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* vu_buf,
-                                uint8* dst_argb,
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* vu_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV21
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1886,20 +2296,23 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleNV21]"m"(kShuffleNV21)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
-                                uint8* dst_argb,
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUY2
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1911,20 +2324,23 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
-                                uint8* dst_argb,
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READUYVY
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1936,23 +2352,25 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_rgba,
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_rgba,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     STORERGBA
@@ -1964,7 +2382,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
@@ -1972,179 +2390,211 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2                                                        \
-    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
-    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READYUV444_AVX2                                               \
+  "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
+  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2                                                        \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READYUV422_AVX2                                               \
+  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
+
+// Read 8 UV from 210 10 bit, upsample to 16 UV
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
+#define READYUV210_AVX2                                            \
+  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
+  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
+  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
+  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
+  "vpsraw     $0x2,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
+  "vpsllw     $0x6,%%ymm4,%%ymm4                               \n" \
+  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
 
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2                                                       \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
-    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
-    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
-    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2                                                        \
-    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READYUVA422_AVX2                                              \
+  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
+  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
+  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
+  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
 
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
-    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READNV12_AVX2                                                 \
+  "vmovdqu    (%[uv_buf]),%%xmm0                                  \n" \
+  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
-    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READNV21_AVX2                                                 \
+  "vmovdqu    (%[vu_buf]),%%xmm0                                  \n" \
+  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
-    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
-    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
-    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
+#define READYUY2_AVX2                                                 \
+  "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
+  "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
+  "vmovdqu    (%[yuy2_buf]),%%ymm0                                \n" \
+  "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n" \
+  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2                                                          \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
-    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
-    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
-    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
+#define READUYVY_AVX2                                                 \
+  "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
+  "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
+  "vmovdqu    (%[uyvy_buf]),%%ymm0                                \n" \
+  "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0                   \n" \
+  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
-    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
-    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
-    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
-    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
-    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
-    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
-    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
-    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
-    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
-    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
-    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
-    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
-    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_SETUP_AVX2(yuvconstants)                            \
+  "vmovdqa     (%[yuvconstants]),%%ymm8                          \n" \
+  "vmovdqa     32(%[yuvconstants]),%%ymm9                        \n" \
+  "vmovdqa     64(%[yuvconstants]),%%ymm10                       \n" \
+  "vmovdqa     96(%[yuvconstants]),%%ymm11                       \n" \
+  "vmovdqa     128(%[yuvconstants]),%%ymm12                      \n" \
+  "vmovdqa     160(%[yuvconstants]),%%ymm13                      \n" \
+  "vmovdqa     192(%[yuvconstants]),%%ymm14                      \n"
+
+#define YUVTORGB16_AVX2(yuvconstants)                                 \
+  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
+  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
+  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
+  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
+  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
+  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
+  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
+
 #define YUVTORGB_REGS_AVX2 \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
 #else  // Convert 16 pixels: 16 UV and 16 Y.
+
 #define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
-    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
-    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
-    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
-    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
-    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
-    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
-    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB16_AVX2(yuvconstants)                                 \
+  "vpmaddubsw  64(%[yuvconstants]),%%ymm0,%%ymm2                  \n" \
+  "vpmaddubsw  32(%[yuvconstants]),%%ymm0,%%ymm1                  \n" \
+  "vpmaddubsw  (%[yuvconstants]),%%ymm0,%%ymm0                    \n" \
+  "vmovdqu     160(%[yuvconstants]),%%ymm3                        \n" \
+  "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n" \
+  "vmovdqu     128(%[yuvconstants]),%%ymm3                        \n" \
+  "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n" \
+  "vmovdqu     96(%[yuvconstants]),%%ymm3                         \n" \
+  "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n" \
+  "vpmulhuw    192(%[yuvconstants]),%%ymm4,%%ymm4                 \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
 #define YUVTORGB_REGS_AVX2
 #endif
 
+#define YUVTORGB_AVX2(yuvconstants)                                   \
+  YUVTORGB16_AVX2(yuvconstants)                                       \
+  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
+  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
+  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
+  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
+  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+
 // Store 16 ARGB values.
-#define STOREARGB_AVX2                                                         \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
-    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
-    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
-    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
+#define STOREARGB_AVX2                                                \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
+  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
+  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
+  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
+  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
+  "lea       0x40(%[dst_argb]), %[dst_argb]                       \n"
+
+// Store 16 AR30 values.
+#define STOREAR30_AVX2                                                \
+  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
+  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
+  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
+  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
+  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
+  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
+  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
+  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
+  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
+  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
+  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
+  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
+  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
+  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
+  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
+  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
+  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
 
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV444_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2157,65 +2607,34 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_I444TOARGBROW_AVX2
 
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  asm volatile (
-    YUVTORGB_SETUP_AVX2(yuvconstants)
-    "sub       %[u_buf],%[v_buf]               \n"
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    LABELALIGN
-  "1:                                          \n"
-    READYUV411_AVX2
-    YUVTORGB_AVX2(yuvconstants)
-    STOREARGB_AVX2
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : [y_buf]"+r"(y_buf),    // %[y_buf]
-    [u_buf]"+r"(u_buf),    // %[u_buf]
-    [v_buf]"+r"(v_buf),    // %[v_buf]
-    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-    [width]"+rm"(width)    // %[width]
-  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
 #if defined(HAS_I422TOARGBROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
     "sub       $0x10,%[width]                  \n"
     "jg        1b                              \n"
+
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
@@ -2223,27 +2642,144 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_I422TOARGBROW_AVX2
 
-#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+#if defined(HAS_I422TOAR30ROW_AVX2)
 // 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               const uint8* a_buf,
-                               uint8* dst_argb,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // AR30 constants
+    "vpsrlw    $14,%%ymm5,%%ymm5               \n"
+    "vpsllw    $4,%%ymm5,%%ymm5                \n"  // 2 alpha bits
+    "vpxor     %%ymm6,%%ymm6,%%ymm6            \n"  // 0 for min
+    "vpcmpeqb  %%ymm7,%%ymm7,%%ymm7            \n"  // 1023 for max
+    "vpsrlw    $6,%%ymm7,%%ymm7                \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
+    READYUV422_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_I422TOAR30ROW_AVX2
+
+#if defined(HAS_I210TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_argb,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I210TOARGBROW_AVX2
+
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // AR30 constants
+    "vpsrlw    $14,%%ymm5,%%ymm5               \n"
+    "vpsllw    $4,%%ymm5,%%ymm5                \n"  // 2 alpha bits
+    "vpxor     %%ymm6,%%ymm6,%%ymm6            \n"  // 0 for min
+    "vpcmpeqb  %%ymm7,%%ymm7,%%ymm7            \n"  // 1023 for max
+    "vpsrlw    $6,%%ymm7,%%ymm7                \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I210TOAR30ROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+                                    const uint8_t* u_buf,
+                                    const uint8_t* v_buf,
+                                    const uint8_t* a_buf,
+                                    uint8_t* dst_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+
+    LABELALIGN
+    "1:                                        \n"
     READYUVA422_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2255,33 +2791,35 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [a_buf]"+r"(a_buf),    // %[a_buf]
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
 #endif
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_I422ALPHATOARGBROW_AVX2
 
 #if defined(HAS_I422TORGBAROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422_AVX2
     YUVTORGB_AVX2(yuvconstants)
 
@@ -2292,11 +2830,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
+    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
+    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
+    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
+    "sub        $0x10,%[width]                 \n"
+    "jg         1b                             \n"
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
@@ -2304,7 +2842,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
@@ -2313,16 +2851,18 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
 #if defined(HAS_NV12TOARGBROW_AVX2)
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* uv_buf,
-                               uint8* dst_argb,
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV12_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2334,25 +2874,28 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_NV12TOARGBROW_AVX2
 
 #if defined(HAS_NV21TOARGBROW_AVX2)
 // 16 pixels.
 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* vu_buf,
-                               uint8* dst_argb,
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* vu_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV21_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2365,24 +2908,27 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleNV21]"m"(kShuffleNV21)
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_NV21TOARGBROW_AVX2
 
 #if defined(HAS_YUY2TOARGBROW_AVX2)
 // 16 pixels.
 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
-                               uint8* dst_argb,
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUY2_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2395,24 +2941,27 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_YUY2TOARGBROW_AVX2
 
 #if defined(HAS_UYVYTOARGBROW_AVX2)
 // 16 pixels.
 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
-                               uint8* dst_argb,
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READUYVY_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2425,1131 +2974,1603 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_UYVYTOARGBROW_AVX2
 
 #ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
-  asm volatile (
-    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
-    "movd      %%eax,%%xmm2                    \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
-    "movd      %%eax,%%xmm3                    \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "psubusw   %%xmm3,%%xmm0                   \n"
-    "psrlw     $6, %%xmm0                      \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
+      "movd      %%eax,%%xmm2                    \n"
+      "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+      "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 *
+                                                      // 16
+      "movd      %%eax,%%xmm3                    \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "pslld     $0x18,%%xmm4                    \n"
 
-    // Step 2: Weave into ARGB
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "por       %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
+      LABELALIGN
+      "1:                                        \n"
+      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "psubusw   %%xmm3,%%xmm0                   \n"
+      "psrlw     $6, %%xmm0                      \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
 
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+      // Step 2: Weave into ARGB
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm0,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm1                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "por       %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(y_buf),     // %0
+        "+r"(dst_argb),  // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_I400TOARGBROW_SSE2
 
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
-  asm volatile (
-    "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
-    "vmovd      %%eax,%%xmm2                   \n"
-    "vbroadcastss %%xmm2,%%ymm2                \n"
-    "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
-    "vmovd      %%eax,%%xmm3                   \n"
-    "vbroadcastss %%xmm3,%%ymm3                \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpslld     $0x18,%%ymm4,%%ymm4            \n"
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "mov        $0x4a354a35,%%eax              \n"  // 0488 = 1160 = 1.164 *
+                                                      // 16
+      "vmovd      %%eax,%%xmm2                   \n"
+      "vbroadcastss %%xmm2,%%ymm2                \n"
+      "mov        $0x4880488,%%eax               \n"  // 4a35 = 18997 = 1.164
+      "vmovd      %%eax,%%xmm3                   \n"
+      "vbroadcastss %%xmm3,%%ymm3                \n"
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpslld     $0x18,%%ymm4,%%ymm4            \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
-    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
-    "lea        " MEMLEA(0x10,0) ",%0          \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub        $0x10,%2                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+      "vmovdqu    (%0),%%xmm0                    \n"
+      "lea        0x10(%0),%0                    \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
+      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
+      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub        $0x10,%2                       \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(y_buf),     // %0
+        "+r"(dst_argb),  // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_I400TOARGBROW_AVX2
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "movdqa    %3,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    -0x10(%0,%2,1),%%xmm0           \n"
+      "pshufb    %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(temp_width)     // %2
+      : "m"(kShuffleMirror)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm5                  \n"
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
-    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    -0x20(%0,%2,1),%%ymm0          \n"
+      "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(temp_width)     // %2
+      : "m"(kShuffleMirror)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_MIRRORROW_AVX2
 
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorUVRow_SSSE3(const uint8_t* src,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
                        int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "movdqa    %4,%%xmm1                       \n"
-    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "pshufb    %%xmm1,%%xmm0                   \n"
-    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $8,%3                           \n"
-    "jg        1b                              \n"
-  : "+r"(src),      // %0
-    "+r"(dst_u),    // %1
-    "+r"(dst_v),    // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+  asm volatile(
+      "movdqa    %4,%%xmm1                       \n"
+      "lea       -0x10(%0,%3,2),%0               \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       -0x10(%0),%0                    \n"
+      "pshufb    %%xmm1,%%xmm0                   \n"
+      "movlpd    %%xmm0,(%1)                     \n"
+      "movhpd    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $8,%3                           \n"
+      "jg        1b                              \n"
+      : "+r"(src),             // %0
+        "+r"(dst_u),           // %1
+        "+r"(dst_v),           // %2
+        "+r"(temp_width)       // %3
+      : "m"(kShuffleMirrorUV)  // %4
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_MIRRORUVROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
 
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+  asm volatile(
+
+      "lea       -0x10(%0,%2,4),%0               \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
+      "lea       -0x10(%0),%0                    \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),        // %0
+        "+r"(dst),        // %1
+        "+r"(temp_width)  // %2
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBMIRRORROW_SSE2
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "vmovdqu    %3,%%ymm5                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kARGBShuffleMirror_AVX2) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "vmovdqu    %3,%%ymm5                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpermd    -0x20(%0,%2,4),%%ymm5,%%ymm0    \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(temp_width)              // %2
+      : "m"(kARGBShuffleMirror_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
-    "lea        " MEMLEA(0x40,0) ",%0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
-    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
-    "lea        " MEMLEA(0x20,1) ",%1            \n"
-    "sub        $0x20,%3                         \n"
-    "jg         1b                               \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+      "sub        %1,%2                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm2,0x00(%1,%2,1)            \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%3                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
-    "psrlw      $0x8,%%xmm5                      \n"
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
-    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    "lea        " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
-    "lea        " MEMLEA(0x10,1) ",%1            \n"
-    "sub        $0x10,%3                         \n"
-    "jg         1b                               \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+  asm volatile(
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "psrlw      $0x8,%%xmm5                    \n"
+      "sub        %1,%2                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "lea        0x20(%0),%0                    \n"
+      "movdqa     %%xmm0,%%xmm2                  \n"
+      "movdqa     %%xmm1,%%xmm3                  \n"
+      "pand       %%xmm5,%%xmm0                  \n"
+      "pand       %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "psrlw      $0x8,%%xmm2                    \n"
+      "psrlw      $0x8,%%xmm3                    \n"
+      "packuswb   %%xmm3,%%xmm2                  \n"
+      "movdqu     %%xmm0,(%1)                    \n"
+      "movdqu    %%xmm2,0x00(%1,%2,1)            \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%3                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    "sub       %0,%1                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
-    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
-    "lea       " MEMLEA(0x20,0) ",%0             \n"
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
-    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
-    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
-    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
-    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
-    "lea       " MEMLEA(0x40,2) ",%2             \n"
-    "sub       $0x20,%3                          \n"
-    "jg        1b                                \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
+  asm volatile(
+
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu    0x00(%0,%1,1),%%ymm1           \n"
+      "lea       0x20(%0),%0                     \n"
+      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
+      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
+      "vextractf128 $0x0,%%ymm2,(%2)             \n"
+      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+      "lea       0x40(%2),%2                     \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    "sub       %0,%1                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm2                     \n"
-    "punpcklbw %%xmm1,%%xmm0                     \n"
-    "punpckhbw %%xmm1,%%xmm2                     \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
-    "lea       " MEMLEA(0x20,2) ",%2             \n"
-    "sub       $0x10,%3                          \n"
-    "jg        1b                                \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
+  asm volatile(
+
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "punpcklbw %%xmm1,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm2                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "movdqu    %%xmm2,0x10(%2)                 \n"
+      "lea       0x20(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int scale,
+                        int width) {
+  // clang-format off
   asm volatile (
-    "test       $0xf,%0                        \n"
-    "jne        2f                             \n"
-    "test       $0xf,%1                        \n"
-    "jne        2f                             \n"
+    "vmovd      %4,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "sub       %0,%1                           \n"
+
+    // 16 pixels per loop.
     LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vmovdqu   (%0,%1,1),%%ymm1                \n"
+    "add        $0x20,%0                       \n"
+
+    "vpmullw   %%ymm3,%%ymm0,%%ymm0            \n"
+    "vpmullw   %%ymm3,%%ymm1,%%ymm1            \n"
+    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
+    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
+    "vextractf128 $0x0,%%ymm2,(%2)             \n"
+    "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+    "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+    "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+    "add       $0x40,%2                        \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  : "r"(scale)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MERGEUVROW_AVX2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+                         uint16_t* dst_y,
+                         int scale,
+                         int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "sub       %0,%1                           \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vmovdqu   0x20(%0),%%ymm1                 \n"
+    "vpmullw   %%ymm3,%%ymm0,%%ymm0            \n"
+    "vpmullw   %%ymm3,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0,(%0,%1)                  \n"
+    "vmovdqu   %%ymm1,0x20(%0,%1)              \n"
+    "add        $0x40,%0                       \n"
     "sub       $0x20,%2                        \n"
     "jg        1b                              \n"
-    "jmp       9f                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MULTIPLYROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+                           uint8_t* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd      %3,%%xmm2                      \n"
+    "punpcklwd %%xmm2,%%xmm2                  \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2             \n"
+
+    // 32 pixels per loop.
     LABELALIGN
-  "2:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "1:                                       \n"
+    "movdqu    (%0),%%xmm0                    \n"
+    "movdqu    0x10(%0),%%xmm1                \n"
+    "add       $0x20,%0                       \n"
+    "pmulhuw   %%xmm2,%%xmm0                  \n"
+    "pmulhuw   %%xmm2,%%xmm1                  \n"
+    "packuswb  %%xmm1,%%xmm0                  \n"
+    "movdqu    %%xmm0,(%1)                    \n"
+    "add       $0x10,%1                       \n"
+    "sub       $0x10,%2                       \n"
+    "jg        1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm2                      \n"
+    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vmovdqu   0x20(%0),%%ymm1                 \n"
+    "add       $0x40,%0                        \n"
+    "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
+    "vpmulhuw  %%ymm2,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // mutates
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vmovdqu   %%ymm0,(%1)                     \n"
+    "add       $0x20,%1                        \n"
     "sub       $0x20,%2                        \n"
-    "jg        2b                              \n"
-  "9:                                          \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+#endif  // HAS_CONVERT16TO8ROW_AVX2
+
+// Use scale to convert to lsb formats depending how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// TODO(fbarchard): reduce to SSE2
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "movd      %3,%%xmm2                      \n"
+    "punpcklwd %%xmm2,%%xmm2                  \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2             \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                       \n"
+    "movdqu    (%0),%%xmm0                    \n"
+    "movdqa    %%xmm0,%%xmm1                  \n"
+    "punpcklbw %%xmm0,%%xmm0                  \n"
+    "punpckhbw %%xmm1,%%xmm1                  \n"
+    "add       $0x10,%0                       \n"
+    "pmulhuw   %%xmm2,%%xmm0                  \n"
+    "pmulhuw   %%xmm2,%%xmm1                  \n"
+    "movdqu    %%xmm0,(%1)                    \n"
+    "movdqu    %%xmm1,0x10(%1)                \n"
+    "add       $0x20,%1                       \n"
+    "sub       $0x10,%2                       \n"
+    "jg        1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm2                      \n"
+    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "add       $0x20,%0                        \n"
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm1           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
+    "vpmulhuw  %%ymm2,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0,(%1)                     \n"
+    "vmovdqu   %%ymm1,0x20(%1)                 \n"
+    "add       $0x40,%1                        \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+#endif  // HAS_CONVERT8TO16ROW_AVX2
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kShuffleMaskRGBToR0 = {0u,   3u,   6u,   9u,   12u,  15u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          2u,   5u,   8u,   11u,  14u,  128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 128u, 1u,
+                                          4u,   7u,   10u,  13u};
+
+static const uvec8 kShuffleMaskRGBToG0 = {1u,   4u,   7u,   10u,  13u,  128u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
+                                          3u,   6u,   9u,   12u,  15u,  128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 128u, 2u,
+                                          5u,   8u,   11u,  14u};
+
+static const uvec8 kShuffleMaskRGBToB0 = {2u,   5u,   8u,   11u,  14u,  128u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
+                                          4u,   7u,   10u,  13u,  128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 0u,   3u,
+                                          6u,   9u,   12u,  15u};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "movdqu     0x20(%0),%%xmm2                \n"
+      "pshufb     %5, %%xmm0                     \n"
+      "pshufb     %6, %%xmm1                     \n"
+      "pshufb     %7, %%xmm2                     \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "movdqu     0x20(%0),%%xmm2                \n"
+      "pshufb     %8, %%xmm0                     \n"
+      "pshufb     %9, %%xmm1                     \n"
+      "pshufb     %10, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%2)                    \n"
+      "lea        0x10(%2),%2                    \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "movdqu     0x20(%0),%%xmm2                \n"
+      "pshufb     %11, %%xmm0                    \n"
+      "pshufb     %12, %%xmm1                    \n"
+      "pshufb     %13, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%3)                    \n"
+      "lea        0x10(%3),%3                    \n"
+      "lea        0x30(%0),%0                    \n"
+      "sub        $0x10,%4                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_rgb),             // %0
+        "+r"(dst_r),               // %1
+        "+r"(dst_g),               // %2
+        "+r"(dst_b),               // %3
+        "+r"(width)                // %4
+      : "m"(kShuffleMaskRGBToR0),  // %5
+        "m"(kShuffleMaskRGBToR1),  // %6
+        "m"(kShuffleMaskRGBToR2),  // %7
+        "m"(kShuffleMaskRGBToG0),  // %8
+        "m"(kShuffleMaskRGBToG1),  // %9
+        "m"(kShuffleMaskRGBToG2),  // %10
+        "m"(kShuffleMaskRGBToB0),  // %11
+        "m"(kShuffleMaskRGBToB1),  // %12
+        "m"(kShuffleMaskRGBToB2)   // %13
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_SPLITRGBROW_SSSE3
+
+#ifdef HAS_MERGERGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
+                                          2u, 128u, 128u, 3u, 128u, 128u,
+                                          4u, 128u, 128u, 5u};
+static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
+                                          128u, 2u, 128u, 128u, 3u, 128u,
+                                          128u, 4u, 128u, 128u};
+static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
+                                          128u, 128u, 2u, 128u, 128u, 3u,
+                                          128u, 128u, 4u, 128u};
+
+static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
+                                          7u, 128u, 128u, 8u, 128u, 128u,
+                                          9u, 128u, 128u, 10u};
+static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
+                                          128u, 7u, 128u, 128u, 8u, 128u,
+                                          128u, 9u, 128u, 128u};
+static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u,  128u, 128u, 7u,
+                                          128u, 128u, 8u,  128u, 128u, 9u,
+                                          128u, 128u, 10u, 128u};
+
+static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
+                                          12u, 128u, 128u, 13u, 128u, 128u,
+                                          14u, 128u, 128u, 15u};
+static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
+                                          128u, 13u, 128u, 128u, 14u, 128u,
+                                          128u, 15u, 128u, 128u};
+static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
+                                          128u, 128u, 13u, 128u, 128u, 14u,
+                                          128u, 128u, 15u, 128u};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_rgb,
+                       int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     (%1),%%xmm1                    \n"
+      "movdqu     (%2),%%xmm2                    \n"
+      "pshufb     %5, %%xmm0                     \n"
+      "pshufb     %6, %%xmm1                     \n"
+      "pshufb     %7, %%xmm2                     \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%3)                    \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     (%1),%%xmm1                    \n"
+      "movdqu     (%2),%%xmm2                    \n"
+      "pshufb     %8, %%xmm0                     \n"
+      "pshufb     %9, %%xmm1                     \n"
+      "pshufb     %10, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,16(%3)                  \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     (%1),%%xmm1                    \n"
+      "movdqu     (%2),%%xmm2                    \n"
+      "pshufb     %11, %%xmm0                    \n"
+      "pshufb     %12, %%xmm1                    \n"
+      "pshufb     %13, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,32(%3)                  \n"
+
+      "lea        0x10(%0),%0                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "lea        0x10(%2),%2                    \n"
+      "lea        0x30(%3),%3                    \n"
+      "sub        $0x10,%4                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_r),               // %0
+        "+r"(src_g),               // %1
+        "+r"(src_b),               // %2
+        "+r"(dst_rgb),             // %3
+        "+r"(width)                // %4
+      : "m"(kShuffleMaskRToRGB0),  // %5
+        "m"(kShuffleMaskGToRGB0),  // %6
+        "m"(kShuffleMaskBToRGB0),  // %7
+        "m"(kShuffleMaskRToRGB1),  // %8
+        "m"(kShuffleMaskGToRGB1),  // %9
+        "m"(kShuffleMaskBToRGB1),  // %10
+        "m"(kShuffleMaskRToRGB2),  // %11
+        "m"(kShuffleMaskGToRGB2),  // %12
+        "m"(kShuffleMaskBToRGB2)   // %13
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_MERGERGBROW_SSSE3
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "test       $0xf,%0                        \n"
+      "jne        2f                             \n"
+      "test       $0xf,%1                        \n"
+      "jne        2f                             \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqa    (%0),%%xmm0                     \n"
+      "movdqa    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm0,(%1)                     \n"
+      "movdqa    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "jmp       9f                              \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        2b                              \n"
+
+      LABELALIGN "9:                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x40,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "vmovdqu   %%ymm1,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x40,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_COPYROW_AVX
 
 #ifdef HAS_COPYROW_ERMS
 // Multiple of 1.
-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep movsb " MEMMOVESTRING(0,1) "          \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
+  asm volatile(
+
+      "rep movsb                      \n"
+      : "+S"(src),       // %0
+        "+D"(dst),       // %1
+        "+c"(width_tmp)  // %2
+      :
+      : "memory", "cc");
 }
 #endif  // HAS_COPYROW_ERMS
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm0,%%xmm0                   \n"
+      "pslld     $0x18,%%xmm0                    \n"
+      "pcmpeqb   %%xmm1,%%xmm1                   \n"
+      "psrld     $0x8,%%xmm1                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "movdqu    0x10(%0),%%xmm3                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqu    (%1),%%xmm4                     \n"
+      "movdqu    0x10(%1),%%xmm5                 \n"
+      "pand      %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm0,%%xmm3                   \n"
+      "pand      %%xmm1,%%xmm4                   \n"
+      "pand      %%xmm1,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "movdqu    %%xmm3,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm1                     \n"
+      "vmovdqu   0x20(%0),%%ymm2                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1       \n"
+      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2   \n"
+      "vmovdqu   %%ymm1,(%1)                     \n"
+      "vmovdqu   %%ymm2,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
 
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 // width in pixels
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
-    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
-    "lea       " MEMLEA(0x20, 0) ", %0         \n"
-    "psrld     $0x18, %%xmm0                   \n"
-    "psrld     $0x18, %%xmm1                   \n"
-    "packssdw  %%xmm1, %%xmm0                  \n"
-    "packuswb  %%xmm0, %%xmm0                  \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8, 1) ", %1          \n"
-    "sub       $0x8, %2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_a),     // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0), %%xmm0                    \n"
+      "movdqu    0x10(%0), %%xmm1                \n"
+      "lea       0x20(%0), %0                    \n"
+      "psrld     $0x18, %%xmm0                   \n"
+      "psrld     $0x18, %%xmm1                   \n"
+      "packssdw  %%xmm1, %%xmm0                  \n"
+      "packuswb  %%xmm0, %%xmm0                  \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1), %1                     \n"
+      "sub       $0x8, %2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
+    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "vmovdqa    %3,%%ymm4                      \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0), %%ymm0                    \n"
+      "vmovdqu   0x20(%0), %%ymm1                \n"
+      "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // vpsrld $0x18, %%ymm0
+      "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
+      "vmovdqu   0x40(%0), %%ymm2                \n"
+      "vmovdqu   0x60(%0), %%ymm3                \n"
+      "lea       0x80(%0), %0                    \n"
+      "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
+      "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
+      "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+      "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub        $0x20, %2                      \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),               // %0
+        "+r"(dst_a),                  // %1
+        "+rm"(width)                  // %2
+      : "m"(kPermdARGBToY_AVX),       // %3
+        "m"(kShuffleAlphaShort_AVX2)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpckhwd %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm2,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm0,%%xmm0                   \n"
+      "pslld     $0x18,%%xmm0                    \n"
+      "pcmpeqb   %%xmm1,%%xmm1                   \n"
+      "psrld     $0x8,%%xmm1                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm2                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm2,%%xmm2                   \n"
+      "punpckhwd %%xmm2,%%xmm3                   \n"
+      "punpcklwd %%xmm2,%%xmm2                   \n"
+      "movdqu    (%1),%%xmm4                     \n"
+      "movdqu    0x10(%1),%%xmm5                 \n"
+      "pand      %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm0,%%xmm3                   \n"
+      "pand      %%xmm1,%%xmm4                   \n"
+      "pand      %%xmm1,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "movdqu    %%xmm3,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
-    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
-    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbd (%0),%%ymm1                     \n"
+      "vpmovzxbd 0x8(%0),%%ymm2                  \n"
+      "lea       0x10(%0),%0                     \n"
+      "vpslld    $0x18,%%ymm1,%%ymm1             \n"
+      "vpslld    $0x18,%%ymm2,%%ymm2             \n"
+      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1       \n"
+      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2   \n"
+      "vmovdqu   %%ymm1,(%1)                     \n"
+      "vmovdqu   %%ymm2,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint8 v8, int width) {
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
-  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
+  asm volatile(
+
+      "rep stosl                      \n"
+      : "+D"(dst),       // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
 }
 
-void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosb " MEMSTORESTRING(al,0) "        \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v8)          // %2
-    : "memory", "cc");
+  asm volatile(
+
+      "rep stosb                      \n"
+      : "+D"(dst),       // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v8)          // %2
+      : "memory", "cc");
 }
 
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst_argb),  // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  asm volatile(
+
+      "rep stosl                      \n"
+      : "+D"(dst_argb),  // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
 }
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_yuy2))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uyvy),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_uyvy))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
 #ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "lea      0x20(%1),%1                      \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_yuy2))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
-}
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "lea      0x20(%1),%1                      \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_uyvy))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psrlw     $0xf,%%xmm7                     \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x8,%%xmm6                     \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psllw     $0x8,%%xmm5                     \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "pslld     $0x18,%%xmm4                    \n"
+      "sub       $0x4,%3                         \n"
+      "jl        49f                             \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu    (%0),%%xmm3                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm3,%%xmm0                   \n"
+      "pxor      %%xmm4,%%xmm3                   \n"
+      "movdqu    (%1),%%xmm2                     \n"
+      "pshufb    %4,%%xmm3                       \n"
+      "pand      %%xmm6,%%xmm2                   \n"
+      "paddw     %%xmm7,%%xmm3                   \n"
+      "pmullw    %%xmm3,%%xmm2                   \n"
+      "movdqu    (%1),%%xmm1                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pmullw    %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm2                     \n"
+      "paddusb   %%xmm2,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%3                         \n"
+      "jl        99f                             \n"
 
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  : "m"(kShuffleAlpha)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 1 pixel loop.
+      "91:                                       \n"
+      "movd      (%0),%%xmm3                     \n"
+      "lea       0x4(%0),%0                      \n"
+      "movdqa    %%xmm3,%%xmm0                   \n"
+      "pxor      %%xmm4,%%xmm3                   \n"
+      "movd      (%1),%%xmm2                     \n"
+      "pshufb    %4,%%xmm3                       \n"
+      "pand      %%xmm6,%%xmm2                   \n"
+      "paddw     %%xmm7,%%xmm3                   \n"
+      "pmullw    %%xmm3,%%xmm2                   \n"
+      "movd      (%1),%%xmm1                     \n"
+      "lea       0x4(%1),%1                      \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pmullw    %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm2                     \n"
+      "paddusb   %%xmm2,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "lea       0x4(%2),%2                      \n"
+      "sub       $0x1,%3                         \n"
+      "jge       91b                             \n"
+      "99:                                       \n"
+      : "+r"(src_argb0),    // %0
+        "+r"(src_argb1),    // %1
+        "+r"(dst_argb),     // %2
+        "+r"(width)         // %3
+      : "m"(kShuffleAlpha)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
@@ -3559,46 +4580,49 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "psllw      $0x8,%%xmm5                    \n"
-    "mov        $0x80808080,%%eax              \n"
-    "movd       %%eax,%%xmm6                   \n"
-    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "movd       %%eax,%%xmm7                   \n"
-    "pshufd     $0x0,%%xmm7,%%xmm7             \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width) {
+  asm volatile(
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "psllw      $0x8,%%xmm5                    \n"
+      "mov        $0x80808080,%%eax              \n"
+      "movd       %%eax,%%xmm6                   \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6             \n"
+      "mov        $0x807f807f,%%eax              \n"
+      "movd       %%eax,%%xmm7                   \n"
+      "pshufd     $0x0,%%xmm7,%%xmm7             \n"
+      "sub        %2,%0                          \n"
+      "sub        %2,%1                          \n"
+      "sub        %2,%3                          \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq       (%2),%%xmm0                    \n"
-    "punpcklbw  %%xmm0,%%xmm0                  \n"
-    "pxor       %%xmm5,%%xmm0                  \n"
-    "movq       (%0,%2,1),%%xmm1               \n"
-    "movq       (%1,%2,1),%%xmm2               \n"
-    "punpcklbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm6,%%xmm1                  \n"
-    "pmaddubsw  %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm7,%%xmm0                  \n"
-    "psrlw      $0x8,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq       %%xmm0,(%3,%2,1)               \n"
-    "lea        0x8(%2),%2                     \n"
-    "sub        $0x8,%4                        \n"
-    "jg        1b                              \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+rm"(width)      // %4
-  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq       (%2),%%xmm0                    \n"
+      "punpcklbw  %%xmm0,%%xmm0                  \n"
+      "pxor       %%xmm5,%%xmm0                  \n"
+      "movq       (%0,%2,1),%%xmm1               \n"
+      "movq       (%1,%2,1),%%xmm2               \n"
+      "punpcklbw  %%xmm2,%%xmm1                  \n"
+      "psubb      %%xmm6,%%xmm1                  \n"
+      "pmaddubsw  %%xmm1,%%xmm0                  \n"
+      "paddw      %%xmm7,%%xmm0                  \n"
+      "psrlw      $0x8,%%xmm0                    \n"
+      "packuswb   %%xmm0,%%xmm0                  \n"
+      "movq       %%xmm0,(%3,%2,1)               \n"
+      "lea        0x8(%2),%2                     \n"
+      "sub        $0x8,%4                        \n"
+      "jg        1b                              \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_BLENDPLANEROW_SSSE3
 
@@ -3608,312 +4632,308 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm6                   \n"
-    "vbroadcastss %%xmm6,%%ymm6                \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "vmovd      %%eax,%%xmm7                   \n"
-    "vbroadcastss %%xmm7,%%ymm7                \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
+      "mov        $0x80808080,%%eax              \n"
+      "vmovd      %%eax,%%xmm6                   \n"
+      "vbroadcastss %%xmm6,%%ymm6                \n"
+      "mov        $0x807f807f,%%eax              \n"
+      "vmovd      %%eax,%%xmm7                   \n"
+      "vbroadcastss %%xmm7,%%ymm7                \n"
+      "sub        %2,%0                          \n"
+      "sub        %2,%1                          \n"
+      "sub        %2,%3                          \n"
 
-    // 32 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%2),%%ymm0                    \n"
-    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vmovdqu    (%0,%2,1),%%ymm1               \n"
-    "vmovdqu    (%1,%2,1),%%ymm2               \n"
-    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
-    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
-    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
-    "lea        0x20(%2),%2                    \n"
-    "sub        $0x20,%4                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+rm"(width)      // %4
-  :: "memory", "cc", "eax",
-     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 32 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%2),%%ymm0                    \n"
+      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vmovdqu    (%0,%2,1),%%ymm1               \n"
+      "vmovdqu    (%1,%2,1),%%ymm2               \n"
+      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
+      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
+      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x20,%4                       \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_BLENDPLANEROW_AVX2
 
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
-};
-static uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
 // Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "pslld     $0x18,%%xmm3                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "pslld     $0x18,%%xmm3                    \n"
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpcklbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "punpckhbw %%xmm2,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pand      %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha0),  // %3
-    "m"(kShuffleAlpha1)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "punpcklbw %%xmm1,%%xmm1                   \n"
+      "pmulhuw   %%xmm1,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "punpckhbw %%xmm2,%%xmm2                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "pand      %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "por       %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),       // %0
+        "+r"(dst_argb),       // %1
+        "+r"(width)           // %2
+      : "m"(kShuffleAlpha0),  // %3
+        "m"(kShuffleAlpha1)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
-    "sub        %0,%1                          \n"
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpslld     $0x18,%%ymm5,%%ymm5            \n"
+      "sub        %0,%1                          \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
-    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub        $0x8,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha_AVX2)  // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm6                    \n"
+      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+      "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
+      "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,0x00(%0,%1,1)           \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub        $0x8,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),          // %0
+        "+r"(dst_argb),          // %1
+        "+r"(width)              // %2
+      : "m"(kShuffleAlpha_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
                              int width) {
   uintptr_t alpha;
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),     // %0
-    "+r"(dst_argb),     // %1
-    "+r"(width),        // %2
-    "=&r"(alpha)        // %3
-  : "r"(fixed_invtbl8)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movzb     0x03(%0),%3                     \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movd      0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x07(%0),%3                     \n"
+      "movd      0x00(%4,%3,4),%%xmm3            \n"
+      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+      "movlhps   %%xmm3,%%xmm2                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "movzb     0x0b(%0),%3                     \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "movd      0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x0f(%0),%3                     \n"
+      "movd      0x00(%4,%3,4),%%xmm3            \n"
+      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+      "movlhps   %%xmm3,%%xmm2                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),     // %0
+        "+r"(dst_argb),     // %1
+        "+r"(width),        // %2
+        "=&r"(alpha)        // %3
+      : "r"(fixed_invtbl8)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
 
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
                              int width) {
   uintptr_t alpha;
-  asm volatile (
-    "sub        %0,%1                          \n"
-    "vbroadcastf128 %5,%%ymm5                  \n"
+  asm volatile(
+      "sub        %0,%1                          \n"
+      "vbroadcastf128 %5,%%ymm5                  \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    // replace VPGATHER
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
-    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
-    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
-    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
-    // end of VPGATHER
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      // replace VPGATHER
+      "movzb     0x03(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm0            \n"
+      "movzb     0x07(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm1            \n"
+      "movzb     0x0b(%0),%3                     \n"
+      "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
+      "vmovd     0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x0f(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm3            \n"
+      "movzb     0x13(%0),%3                     \n"
+      "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
+      "vmovd     0x00(%4,%3,4),%%xmm0            \n"
+      "movzb     0x17(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm1            \n"
+      "movzb     0x1b(%0),%3                     \n"
+      "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
+      "vmovd     0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x1f(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm3            \n"
+      "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
+      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+      // end of VPGATHER
 
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
-    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub        $0x8,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb),      // %1
-    "+r"(width),         // %2
-    "=&r"(alpha)         // %3
-  : "r"(fixed_invtbl8),  // %4
-    "m"(kUnattenShuffleAlpha_AVX2)  // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      "vmovdqu    (%0),%%ymm6                    \n"
+      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+      "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
+      "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
+      "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,0x00(%0,%1,1)           \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub        $0x8,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),                 // %0
+        "+r"(dst_argb),                 // %1
+        "+r"(width),                    // %2
+        "=&r"(alpha)                    // %3
+      : "r"(fixed_invtbl8),             // %4
+        "m"(kUnattenShuffleAlpha_AVX2)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrld     $0x18,%%xmm2                    \n"
-    "psrld     $0x18,%%xmm3                    \n"
-    "packuswb  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm3                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64)     // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "paddw     %%xmm5,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "movdqu    0x10(%0),%%xmm3                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrld     $0x18,%%xmm2                    \n"
+      "psrld     $0x18,%%xmm3                    \n"
+      "packuswb  %%xmm3,%%xmm2                   \n"
+      "packuswb  %%xmm2,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpcklbw %%xmm2,%%xmm3                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm3,%%xmm0                   \n"
+      "punpckhwd %%xmm3,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToYJ),  // %3
+        "m"(kAddYJ64)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBGRAYROW_SSSE3
 
@@ -3922,412 +4942,415 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %2,%%xmm2                       \n"
-    "movdqa    %3,%%xmm3                       \n"
-    "movdqa    %4,%%xmm4                       \n"
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movdqa    %2,%%xmm2                       \n"
+      "movdqa    %3,%%xmm3                       \n"
+      "movdqa    %4,%%xmm4                       \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "phaddw    %%xmm6,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "psrld     $0x18,%%xmm6                    \n"
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm5                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "punpckhwd %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%1                         \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "m"(kARGBToSepiaB),  // %2
-    "m"(kARGBToSepiaG),  // %3
-    "m"(kARGBToSepiaR)   // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "pmaddubsw %%xmm2,%%xmm6                   \n"
+      "phaddw    %%xmm6,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm5                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm3,%%xmm5                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "phaddw    %%xmm1,%%xmm5                   \n"
+      "psrlw     $0x7,%%xmm5                     \n"
+      "packuswb  %%xmm5,%%xmm5                   \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm5                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm4,%%xmm5                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "phaddw    %%xmm1,%%xmm5                   \n"
+      "psrlw     $0x7,%%xmm5                     \n"
+      "packuswb  %%xmm5,%%xmm5                   \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "psrld     $0x18,%%xmm6                    \n"
+      "psrld     $0x18,%%xmm1                    \n"
+      "packuswb  %%xmm1,%%xmm6                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "punpcklbw %%xmm6,%%xmm5                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm5,%%xmm0                   \n"
+      "punpckhwd %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x8,%1                         \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),      // %0
+        "+r"(width)          // %1
+      : "m"(kARGBToSepiaB),  // %2
+        "m"(kARGBToSepiaG),  // %3
+        "m"(kARGBToSepiaR)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
 
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
-    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
-    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
-    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              const int8_t* matrix_argb,
+                              int width) {
+  asm volatile(
+      "movdqu    (%3),%%xmm5                     \n"
+      "pshufd    $0x00,%%xmm5,%%xmm2             \n"
+      "pshufd    $0x55,%%xmm5,%%xmm3             \n"
+      "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
+      "pshufd    $0xff,%%xmm5,%%xmm5             \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm7                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddsw   %%xmm7,%%xmm0                   \n"
-    "phaddsw   %%xmm1,%%xmm6                   \n"
-    "psraw     $0x6,%%xmm0                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm6                   \n"
-    "psraw     $0x6,%%xmm1                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "punpcklwd %%xmm1,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm6                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb),      // %1
-    "+r"(width)          // %2
-  : "r"(matrix_argb)     // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm7                 \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "pmaddubsw %%xmm2,%%xmm7                   \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "phaddsw   %%xmm7,%%xmm0                   \n"
+      "phaddsw   %%xmm1,%%xmm6                   \n"
+      "psraw     $0x6,%%xmm0                     \n"
+      "psraw     $0x6,%%xmm6                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "punpcklbw %%xmm6,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "movdqu    0x10(%0),%%xmm7                 \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm7                   \n"
+      "phaddsw   %%xmm7,%%xmm1                   \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x10(%0),%%xmm7                 \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm7                   \n"
+      "phaddsw   %%xmm7,%%xmm6                   \n"
+      "psraw     $0x6,%%xmm1                     \n"
+      "psraw     $0x6,%%xmm6                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "punpcklbw %%xmm6,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "punpcklwd %%xmm1,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm6                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm6,0x10(%1)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "movd      %2,%%xmm2                       \n"
-    "movd      %3,%%xmm3                       \n"
-    "movd      %4,%%xmm4                       \n"
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
-    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "pslld     $0x18,%%xmm6                    \n"
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "movd      %2,%%xmm2                       \n"
+      "movd      %3,%%xmm3                       \n"
+      "movd      %4,%%xmm4                       \n"
+      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+      "pshufd    $0x44,%%xmm2,%%xmm2             \n"
+      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+      "pshufd    $0x44,%%xmm3,%%xmm3             \n"
+      "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
+      "pshufd    $0x44,%%xmm4,%%xmm4             \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "pslld     $0x18,%%xmm6                    \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "pmullw    %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm6,%%xmm7                   \n"
-    "paddw     %%xmm4,%%xmm0                   \n"
-    "paddw     %%xmm4,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x4,%1                         \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "punpckhbw %%xmm5,%%xmm1                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "pmullw    %%xmm3,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm7                     \n"
+      "pmullw    %%xmm3,%%xmm1                   \n"
+      "pand      %%xmm6,%%xmm7                   \n"
+      "paddw     %%xmm4,%%xmm0                   \n"
+      "paddw     %%xmm4,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "por       %%xmm7,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x4,%1                         \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "movd      %3,%%xmm2                       \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm2                  \n"
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "movd      %3,%%xmm2                       \n"
+      "punpcklbw %%xmm2,%%xmm2                   \n"
+      "punpcklqdq %%xmm2,%%xmm2                  \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(value)       // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBSHADEROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                  \n"
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqu    %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm2                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqu    %%xmm0,%%xmm1                   \n"
+      "movdqu    %%xmm2,%%xmm3                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpckhbw %%xmm5,%%xmm3                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "pmulhuw   %%xmm3,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea       " MEMLEA(0x20,2) ",%2           \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm1                    \n"
+      "lea        0x20(%0),%0                    \n"
+      "vmovdqu    (%1),%%ymm3                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
+      "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%2)                    \n"
+      "lea       0x20(%2),%2                     \n"
+      "sub        $0x8,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc"
 #if defined(__AVX2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+        ,
+        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
 #endif
-  );
+      );
 }
 #endif  // HAS_ARGBMULTIPLYROW_AVX2
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm1                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBADDROW_SSE2
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "lea        0x20(%0),%0                    \n"
+      "vpaddusb   (%1),%%ymm0,%%ymm0             \n"
+      "lea        0x20(%1),%1                    \n"
+      "vmovdqu    %%ymm0,(%2)                    \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x8,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBADDROW_AVX2
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psubusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm1                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "psubusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "lea        0x20(%0),%0                    \n"
+      "vpsubusb   (%1),%%ymm0,%%ymm0             \n"
+      "lea        0x20(%1),%1                    \n"
+      "vmovdqu    %%ymm0,(%2)                    \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x8,%3                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBSUBTRACTROW_AVX2
 
@@ -4336,52 +5359,53 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "sub       %0,%2                           \n"
-    "sub       %0,%3                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
+void SobelXRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "sub       %0,%2                           \n"
+      "sub       %0,%3                           \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"
-    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
-    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"
-    "pmaxsw    %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x8,%4                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movq      0x2(%0),%%xmm1                  \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "psubw     %%xmm1,%%xmm0                   \n"
+      "movq      0x00(%0,%1,1),%%xmm1            \n"
+      "movq      0x02(%0,%1,1),%%xmm2            \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "psubw     %%xmm2,%%xmm1                   \n"
+      "movq      0x00(%0,%2,1),%%xmm2            \n"
+      "movq      0x02(%0,%2,1),%%xmm3            \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm3                   \n"
+      "psubw     %%xmm3,%%xmm2                   \n"
+      "paddw     %%xmm2,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "pxor      %%xmm1,%%xmm1                   \n"
+      "psubw     %%xmm0,%%xmm1                   \n"
+      "pmaxsw    %%xmm1,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,0x00(%0,%3,1)            \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x8,%4                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_y0),      // %0
+        "+r"(src_y1),      // %1
+        "+r"(src_y2),      // %2
+        "+r"(dst_sobelx),  // %3
+        "+r"(width)        // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELXROW_SSE2
 
@@ -4390,50 +5414,50 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "sub       %0,%2                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
+void SobelYRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "sub       %0,%2                           \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"
-    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
-    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
-    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"
-    "pmaxsw    %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x8,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movq      0x00(%0,%1,1),%%xmm1            \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "psubw     %%xmm1,%%xmm0                   \n"
+      "movq      0x1(%0),%%xmm1                  \n"
+      "movq      0x01(%0,%1,1),%%xmm2            \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "psubw     %%xmm2,%%xmm1                   \n"
+      "movq      0x2(%0),%%xmm2                  \n"
+      "movq      0x02(%0,%1,1),%%xmm3            \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm3                   \n"
+      "psubw     %%xmm3,%%xmm2                   \n"
+      "paddw     %%xmm2,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "pxor      %%xmm1,%%xmm1                   \n"
+      "psubw     %%xmm0,%%xmm1                   \n"
+      "pmaxsw    %%xmm1,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,0x00(%0,%2,1)            \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x8,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_y0),      // %0
+        "+r"(src_y1),      // %1
+        "+r"(dst_sobely),  // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELYROW_SSE2
 
@@ -4443,79 +5467,79 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0x18,%%xmm5                    \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm2                   \n"
-    "punpckhbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm1                   \n"
-    "punpckhwd %%xmm2,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklwd %%xmm0,%%xmm3                   \n"
-    "punpckhwd %%xmm0,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "punpcklbw %%xmm0,%%xmm2                   \n"
+      "punpckhbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "punpcklwd %%xmm2,%%xmm1                   \n"
+      "punpckhwd %%xmm2,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "por       %%xmm5,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "punpcklwd %%xmm0,%%xmm3                   \n"
+      "punpckhwd %%xmm0,%%xmm0                   \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm1,(%2)                     \n"
+      "movdqu    %%xmm2,0x10(%2)                 \n"
+      "movdqu    %%xmm3,0x20(%2)                 \n"
+      "movdqu    %%xmm0,0x30(%2)                 \n"
+      "lea       0x40(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELROW_SSE2
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0x18,%%xmm5                    \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_SOBELTOPLANEROW_SSE2
 
@@ -4525,1004 +5549,1123 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "paddusb   %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "punpckhbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "punpcklbw %%xmm2,%%xmm4                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "punpcklwd %%xmm3,%%xmm6                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "punpcklwd %%xmm0,%%xmm7                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "paddusb   %%xmm1,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "punpcklbw %%xmm5,%%xmm3                   \n"
+      "punpckhbw %%xmm5,%%xmm0                   \n"
+      "movdqa    %%xmm1,%%xmm4                   \n"
+      "punpcklbw %%xmm2,%%xmm4                   \n"
+      "punpckhbw %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm4,%%xmm6                   \n"
+      "punpcklwd %%xmm3,%%xmm6                   \n"
+      "punpckhwd %%xmm3,%%xmm4                   \n"
+      "movdqa    %%xmm1,%%xmm7                   \n"
+      "punpcklwd %%xmm0,%%xmm7                   \n"
+      "punpckhwd %%xmm0,%%xmm1                   \n"
+      "movdqu    %%xmm6,(%2)                     \n"
+      "movdqu    %%xmm4,0x10(%2)                 \n"
+      "movdqu    %%xmm7,0x20(%2)                 \n"
+      "movdqu    %%xmm1,0x30(%2)                 \n"
+      "lea       0x40(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_SOBELXYROW_SSE2
 
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       49f                             \n"
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width) {
+  asm volatile(
+      "pxor      %%xmm0,%%xmm0                   \n"
+      "pxor      %%xmm1,%%xmm1                   \n"
+      "sub       $0x4,%3                         \n"
+      "jl        49f                             \n"
+      "test      $0xf,%1                         \n"
+      "jne       49f                             \n"
 
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "punpckhwd %%xmm1,%%xmm3                   \n"
-    "punpckhbw %%xmm1,%%xmm4                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "punpcklwd %%xmm1,%%xmm4                   \n"
-    "punpckhwd %%xmm1,%%xmm5                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
-    "paddd     %%xmm0,%%xmm3                   \n"
-    "paddd     %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
-    "paddd     %%xmm0,%%xmm4                   \n"
-    "paddd     %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm5                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
-    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm2,%%xmm4                   \n"
+      "punpcklbw %%xmm1,%%xmm2                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "punpcklwd %%xmm1,%%xmm2                   \n"
+      "punpckhwd %%xmm1,%%xmm3                   \n"
+      "punpckhbw %%xmm1,%%xmm4                   \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "punpcklwd %%xmm1,%%xmm4                   \n"
+      "punpckhwd %%xmm1,%%xmm5                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "movdqu    (%2),%%xmm2                     \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "movdqu    0x10(%2),%%xmm3                 \n"
+      "paddd     %%xmm0,%%xmm3                   \n"
+      "paddd     %%xmm4,%%xmm0                   \n"
+      "movdqu    0x20(%2),%%xmm4                 \n"
+      "paddd     %%xmm0,%%xmm4                   \n"
+      "paddd     %%xmm5,%%xmm0                   \n"
+      "movdqu    0x30(%2),%%xmm5                 \n"
+      "lea       0x40(%2),%2                     \n"
+      "paddd     %%xmm0,%%xmm5                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "movdqu    %%xmm3,0x10(%1)                 \n"
+      "movdqu    %%xmm4,0x20(%1)                 \n"
+      "movdqu    %%xmm5,0x30(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%3                         \n"
+      "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
+      // 1 pixel loop.
+      LABELALIGN
+      "10:                                       \n"
+      "movd      (%0),%%xmm2                     \n"
+      "lea       0x4(%0),%0                      \n"
+      "punpcklbw %%xmm1,%%xmm2                   \n"
+      "punpcklwd %%xmm1,%%xmm2                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "movdqu    (%2),%%xmm2                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x1,%3                         \n"
+      "jge       10b                             \n"
 
-  "19:                                         \n"
-  : "+r"(row),  // %0
-    "+r"(cumsum),  // %1
-    "+r"(previous_cumsum),  // %2
-    "+r"(width)  // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      "19:                                       \n"
+      : "+r"(row),              // %0
+        "+r"(cumsum),           // %1
+        "+r"(previous_cumsum),  // %2
+        "+r"(width)             // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
                                     int count) {
-  asm volatile (
-    "movd      %5,%%xmm5                       \n"
-    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
-    "rcpss     %%xmm5,%%xmm4                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "cmpl      $0x80,%5                        \n"
-    "ja        40f                             \n"
+  asm volatile(
+      "movd      %5,%%xmm5                       \n"
+      "cvtdq2ps  %%xmm5,%%xmm5                   \n"
+      "rcpss     %%xmm5,%%xmm4                   \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "sub       $0x4,%3                         \n"
+      "jl        49f                             \n"
+      "cmpl      $0x80,%5                        \n"
+      "ja        40f                             \n"
 
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrld     $0x10,%%xmm6                    \n"
-    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
-    "addps     %%xmm6,%%xmm5                   \n"
-    "mulps     %%xmm4,%%xmm5                   \n"
-    "cvtps2dq  %%xmm5,%%xmm5                   \n"
-    "packssdw  %%xmm5,%%xmm5                   \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrld     $0x10,%%xmm6                    \n"
+      "cvtdq2ps  %%xmm6,%%xmm6                   \n"
+      "addps     %%xmm6,%%xmm5                   \n"
+      "mulps     %%xmm4,%%xmm5                   \n"
+      "cvtps2dq  %%xmm5,%%xmm5                   \n"
+      "packssdw  %%xmm5,%%xmm5                   \n"
 
-  // 4 pixel small loop                        \n"
-    LABELALIGN
-  "4:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       4b                              \n"
-    "jmp       49f                             \n"
+      // 4 pixel small loop.
+      LABELALIGN
+      "4:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "psubd     0x00(%0,%4,4),%%xmm0            \n"
+      "psubd     0x10(%0,%4,4),%%xmm1            \n"
+      "psubd     0x20(%0,%4,4),%%xmm2            \n"
+      "psubd     0x30(%0,%4,4),%%xmm3            \n"
+      "lea       0x40(%0),%0                     \n"
+      "psubd     (%1),%%xmm0                     \n"
+      "psubd     0x10(%1),%%xmm1                 \n"
+      "psubd     0x20(%1),%%xmm2                 \n"
+      "psubd     0x30(%1),%%xmm3                 \n"
+      "paddd     0x00(%1,%4,4),%%xmm0            \n"
+      "paddd     0x10(%1,%4,4),%%xmm1            \n"
+      "paddd     0x20(%1,%4,4),%%xmm2            \n"
+      "paddd     0x30(%1,%4,4),%%xmm3            \n"
+      "lea       0x40(%1),%1                     \n"
+      "packssdw  %%xmm1,%%xmm0                   \n"
+      "packssdw  %%xmm3,%%xmm2                   \n"
+      "pmulhuw   %%xmm5,%%xmm0                   \n"
+      "pmulhuw   %%xmm5,%%xmm2                   \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       4b                              \n"
+      "jmp       49f                             \n"
 
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm1                   \n"
-    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
-    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
-    "mulps     %%xmm4,%%xmm2                   \n"
-    "mulps     %%xmm4,%%xmm3                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "cvtps2dq  %%xmm1,%%xmm1                   \n"
-    "cvtps2dq  %%xmm2,%%xmm2                   \n"
-    "cvtps2dq  %%xmm3,%%xmm3                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "psubd     0x00(%0,%4,4),%%xmm0            \n"
+      "psubd     0x10(%0,%4,4),%%xmm1            \n"
+      "psubd     0x20(%0,%4,4),%%xmm2            \n"
+      "psubd     0x30(%0,%4,4),%%xmm3            \n"
+      "lea       0x40(%0),%0                     \n"
+      "psubd     (%1),%%xmm0                     \n"
+      "psubd     0x10(%1),%%xmm1                 \n"
+      "psubd     0x20(%1),%%xmm2                 \n"
+      "psubd     0x30(%1),%%xmm3                 \n"
+      "paddd     0x00(%1,%4,4),%%xmm0            \n"
+      "paddd     0x10(%1,%4,4),%%xmm1            \n"
+      "paddd     0x20(%1,%4,4),%%xmm2            \n"
+      "paddd     0x30(%1,%4,4),%%xmm3            \n"
+      "lea       0x40(%1),%1                     \n"
+      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+      "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+      "mulps     %%xmm4,%%xmm0                   \n"
+      "mulps     %%xmm4,%%xmm1                   \n"
+      "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+      "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+      "mulps     %%xmm4,%%xmm2                   \n"
+      "mulps     %%xmm4,%%xmm3                   \n"
+      "cvtps2dq  %%xmm0,%%xmm0                   \n"
+      "cvtps2dq  %%xmm1,%%xmm1                   \n"
+      "cvtps2dq  %%xmm2,%%xmm2                   \n"
+      "cvtps2dq  %%xmm3,%%xmm3                   \n"
+      "packssdw  %%xmm1,%%xmm0                   \n"
+      "packssdw  %%xmm3,%%xmm2                   \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%3                         \n"
+      "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(topleft),  // %0
-    "+r"(botleft),  // %1
-    "+r"(dst),      // %2
-    "+rm"(count)    // %3
-  : "r"((intptr_t)(width)),  // %4
-    "rm"(area)     // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      // 1 pixel loop
+      LABELALIGN
+      "10:                                       \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "psubd     0x00(%0,%4,4),%%xmm0            \n"
+      "lea       0x10(%0),%0                     \n"
+      "psubd     (%1),%%xmm0                     \n"
+      "paddd     0x00(%1,%4,4),%%xmm0            \n"
+      "lea       0x10(%1),%1                     \n"
+      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+      "mulps     %%xmm4,%%xmm0                   \n"
+      "cvtps2dq  %%xmm0,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "lea       0x4(%2),%2                      \n"
+      "sub       $0x1,%3                         \n"
+      "jge       10b                             \n"
+      "19:                                       \n"
+      : "+r"(topleft),           // %0
+        "+r"(botleft),           // %1
+        "+r"(dst),               // %2
+        "+rm"(count)             // %3
+      : "r"((intptr_t)(width)),  // %4
+        "rm"(area)               // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* src_dudv, int width) {
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* src_dudv,
+                        int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
   intptr_t temp;
-  asm volatile (
-    "movq      " MEMACCESS(3) ",%%xmm2         \n"
-    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
-    "shl       $0x10,%1                        \n"
-    "add       $0x4,%1                         \n"
-    "movd      %1,%%xmm5                       \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
+  asm volatile(
+      "movq      (%3),%%xmm2                     \n"
+      "movq      0x08(%3),%%xmm7                 \n"
+      "shl       $0x10,%1                        \n"
+      "add       $0x4,%1                         \n"
+      "movd      %1,%%xmm5                       \n"
+      "sub       $0x4,%4                         \n"
+      "jl        49f                             \n"
 
-    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm0                   \n"
-    "movlhps   %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm7,%%xmm4                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
+      "pshufd    $0x44,%%xmm7,%%xmm7             \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "addps     %%xmm7,%%xmm0                   \n"
+      "movlhps   %%xmm0,%%xmm2                   \n"
+      "movdqa    %%xmm7,%%xmm4                   \n"
+      "addps     %%xmm4,%%xmm4                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "addps     %%xmm4,%%xmm3                   \n"
+      "addps     %%xmm4,%%xmm4                   \n"
 
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
-    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
-    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
-    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm1                   \n"
-    "addps     %%xmm4,%%xmm2                   \n"
-    "movq      %%xmm1," MEMACCESS(2) "         \n"
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm0                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop
+      LABELALIGN
+      "40:                                       \n"
+      "cvttps2dq %%xmm2,%%xmm0                   \n"  // x,y float->int first 2
+      "cvttps2dq %%xmm3,%%xmm1                   \n"  // x,y float->int next 2
+      "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
+      "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x*4 + y*stride
+      "movd      %%xmm0,%k1                      \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k5                      \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      0x00(%0,%1,1),%%xmm1            \n"
+      "movd      0x00(%0,%5,1),%%xmm6            \n"
+      "punpckldq %%xmm6,%%xmm1                   \n"
+      "addps     %%xmm4,%%xmm2                   \n"
+      "movq      %%xmm1,(%2)                     \n"
+      "movd      %%xmm0,%k1                      \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k5                      \n"
+      "movd      0x00(%0,%1,1),%%xmm0            \n"
+      "movd      0x00(%0,%5,1),%%xmm6            \n"
+      "punpckldq %%xmm6,%%xmm0                   \n"
+      "addps     %%xmm4,%%xmm3                   \n"
+      "movq      %%xmm0,0x08(%2)                 \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%4                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%4                         \n"
-    "jl        19f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%4                         \n"
+      "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "pmaddwd   %%xmm5,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm2                   \n"
-    "movd      %%xmm0,%k1                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x04,2) ",%2           \n"
-    "sub       $0x1,%4                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_stride_temp),  // %1
-    "+r"(dst_argb),  // %2
-    "+r"(src_dudv),  // %3
-    "+rm"(width),    // %4
-    "=&r"(temp)      // %5
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 1 pixel loop
+      LABELALIGN
+      "10:                                       \n"
+      "cvttps2dq %%xmm2,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "pmaddwd   %%xmm5,%%xmm0                   \n"
+      "addps     %%xmm7,%%xmm2                   \n"
+      "movd      %%xmm0,%k1                      \n"
+      "movd      0x00(%0,%1,1),%%xmm0            \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "lea       0x04(%2),%2                     \n"
+      "sub       $0x1,%4                         \n"
+      "jge       10b                             \n"
+      "19:                                       \n"
+      : "+r"(src_argb),              // %0
+        "+r"(src_argb_stride_temp),  // %1
+        "+r"(dst_argb),              // %2
+        "+r"(src_dudv),              // %3
+        "+rm"(width),                // %4
+        "=&r"(temp)                  // %5
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
 #ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x80,%3                        \n"
-    "je        50f                             \n"
+  asm volatile(
+      "sub       %1,%0                           \n"
+      "cmp       $0x0,%3                         \n"
+      "je        100f                            \n"
+      "cmp       $0x80,%3                        \n"
+      "je        50f                             \n"
 
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x100,%3                       \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x80808080,%%eax               \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "movd      %3,%%xmm0                       \n"
+      "neg       %3                              \n"
+      "add       $0x100,%3                       \n"
+      "movd      %3,%%xmm5                       \n"
+      "punpcklbw %%xmm0,%%xmm5                   \n"
+      "punpcklwd %%xmm5,%%xmm5                   \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "mov       $0x80808080,%%eax               \n"
+      "movd      %%eax,%%xmm4                    \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
 
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "punpcklbw  %%xmm2,%%xmm0                  \n"
-    "punpckhbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm4,%%xmm0                  \n"
-    "psubb      %%xmm4,%%xmm1                  \n"
-    "movdqa     %%xmm5,%%xmm2                  \n"
-    "movdqa     %%xmm5,%%xmm3                  \n"
-    "pmaddubsw  %%xmm0,%%xmm2                  \n"
-    "pmaddubsw  %%xmm1,%%xmm3                  \n"
-    "paddw      %%xmm4,%%xmm2                  \n"
-    "paddw      %%xmm4,%%xmm3                  \n"
-    "psrlw      $0x8,%%xmm2                    \n"
-    "psrlw      $0x8,%%xmm3                    \n"
-    "packuswb   %%xmm3,%%xmm2                  \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
+      // General purpose row blend.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x00(%1,%4,1),%%xmm2            \n"
+      "movdqa     %%xmm0,%%xmm1                  \n"
+      "punpcklbw  %%xmm2,%%xmm0                  \n"
+      "punpckhbw  %%xmm2,%%xmm1                  \n"
+      "psubb      %%xmm4,%%xmm0                  \n"
+      "psubb      %%xmm4,%%xmm1                  \n"
+      "movdqa     %%xmm5,%%xmm2                  \n"
+      "movdqa     %%xmm5,%%xmm3                  \n"
+      "pmaddubsw  %%xmm0,%%xmm2                  \n"
+      "pmaddubsw  %%xmm1,%%xmm3                  \n"
+      "paddw      %%xmm4,%%xmm2                  \n"
+      "paddw      %%xmm4,%%xmm3                  \n"
+      "psrlw      $0x8,%%xmm2                    \n"
+      "psrlw      $0x8,%%xmm3                    \n"
+      "packuswb   %%xmm3,%%xmm2                  \n"
+      "movdqu    %%xmm2,0x00(%1,%0,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "jmp       99f                             \n"
 
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:                                       \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x00(%1,%4,1),%%xmm1            \n"
+      "pavgb     %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,0x00(%1,%0,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        50b                             \n"
+      "jmp       99f                             \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        100b                            \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:                                      \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    %%xmm0,0x00(%1,%0,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        100b                            \n"
 
-  "99:                                         \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+rm"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      "99:                                       \n"
+      : "+r"(dst_ptr),               // %0
+        "+r"(src_ptr),               // %1
+        "+rm"(dst_width),            // %2
+        "+r"(source_y_fraction)      // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction) {
-  asm volatile (
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "sub       %1,%0                           \n"
-    "cmp       $0x80,%3                        \n"
-    "je        50f                             \n"
+  asm volatile(
+      "cmp       $0x0,%3                         \n"
+      "je        100f                            \n"
+      "sub       %1,%0                           \n"
+      "cmp       $0x80,%3                        \n"
+      "je        50f                             \n"
 
-    "vmovd      %3,%%xmm0                      \n"
-    "neg        %3                             \n"
-    "add        $0x100,%3                      \n"
-    "vmovd      %3,%%xmm5                      \n"
-    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
-    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
-    "vbroadcastss %%xmm5,%%ymm5                \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm4                   \n"
-    "vbroadcastss %%xmm4,%%ymm4                \n"
+      "vmovd      %3,%%xmm0                      \n"
+      "neg        %3                             \n"
+      "add        $0x100,%3                      \n"
+      "vmovd      %3,%%xmm5                      \n"
+      "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
+      "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
+      "vbroadcastss %%xmm5,%%ymm5                \n"
+      "mov        $0x80808080,%%eax              \n"
+      "vmovd      %%eax,%%xmm4                   \n"
+      "vbroadcastss %%xmm4,%%ymm4                \n"
 
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
-    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
-    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
-    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
-    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
+      // General purpose row blend.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%1),%%ymm0                    \n"
+      "vmovdqu    0x00(%1,%4,1),%%ymm2           \n"
+      "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
+      "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
+      "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
+      "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,0x00(%1,%0,1)           \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "jmp        99f                            \n"
 
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:                                       \n"
+      "vmovdqu   (%1),%%ymm0                     \n"
+      "vpavgb    0x00(%1,%4,1),%%ymm0,%%ymm0     \n"
+      "vmovdqu   %%ymm0,0x00(%1,%0,1)            \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        50b                             \n"
+      "jmp       99f                             \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "rep movsb " MEMMOVESTRING(1,0) "          \n"
-    "jmp       999f                            \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:                                      \n"
+      "rep movsb                                 \n"
+      "jmp       999f                            \n"
 
-  "99:                                         \n"
-    "vzeroupper                                \n"
-  "999:                                        \n"
-  : "+D"(dst_ptr),    // %0
-    "+S"(src_ptr),    // %1
-    "+cm"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
-  );
+      "99:                                       \n"
+      "vzeroupper                                \n"
+      "999:                                      \n"
+      : "+D"(dst_ptr),               // %0
+        "+S"(src_ptr),               // %1
+        "+cm"(dst_width),            // %2
+        "+r"(source_y_fraction)      // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const uint8_t* shuffler,
+                          int width) {
+  asm volatile(
+
+      "movdqu    (%3),%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pshufb    %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(shuffler)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+
+      "vbroadcastf128 (%3),%%ymm5                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "vmovdqu   %%ymm1,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(shuffler)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  uintptr_t pixel_temp;
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "mov       " MEMACCESS(4) ",%k2            \n"
-    "cmp       $0x3000102,%k2                  \n"
-    "je        3012f                           \n"
-    "cmp       $0x10203,%k2                    \n"
-    "je        123f                            \n"
-    "cmp       $0x30201,%k2                    \n"
-    "je        321f                            \n"
-    "cmp       $0x2010003,%k2                  \n"
-    "je        2103f                           \n"
-
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(4) ",%2             \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS(1) "            \n"
-    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
-    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
-    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "sub       $0x1,%3                         \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "123:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        123b                            \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "321:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        321b                            \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "2103:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        2103b                           \n"
-    "jmp       99f                             \n"
-
-    LABELALIGN
-  "3012:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        3012b                           \n"
-
-  "99:                                         \n"
-  : "+r"(src_argb),     // %0
-    "+r"(dst_argb),     // %1
-    "=&d"(pixel_temp),  // %2
-    "+r"(width)         // %3
-  : "r"(shuffler)       // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
-}
-#endif  // HAS_ARGBSHUFFLEROW_SSE2
-
 #ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub       %1,%2                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm1                     \n"
-    "punpcklbw %%xmm2,%%xmm0                     \n"
-    "punpckhbw %%xmm2,%%xmm1                     \n"
-    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+
+      "sub       %1,%2                             \n"
+
+      LABELALIGN
+      "1:                                          \n"
+      "movq      (%1),%%xmm2                       \n"
+      "movq      0x00(%1,%2,1),%%xmm1              \n"
+      "add       $0x8,%1                           \n"
+      "punpcklbw %%xmm1,%%xmm2                     \n"
+      "movdqu    (%0),%%xmm0                       \n"
+      "add       $0x10,%0                          \n"
+      "movdqa    %%xmm0,%%xmm1                     \n"
+      "punpcklbw %%xmm2,%%xmm0                     \n"
+      "punpckhbw %%xmm2,%%xmm1                     \n"
+      "movdqu    %%xmm0,(%3)                       \n"
+      "movdqu    %%xmm1,0x10(%3)                   \n"
+      "lea       0x20(%3),%3                       \n"
+      "sub       $0x10,%4                          \n"
+      "jg         1b                               \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_I422TOYUY2ROW_SSE2
 
 #ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "movdqa    %%xmm2,%%xmm1                     \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "punpcklbw %%xmm0,%%xmm1                     \n"
-    "punpckhbw %%xmm0,%%xmm2                     \n"
-    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+
+      "sub        %1,%2                            \n"
+
+      LABELALIGN
+      "1:                                          \n"
+      "movq      (%1),%%xmm2                       \n"
+      "movq      0x00(%1,%2,1),%%xmm1              \n"
+      "add       $0x8,%1                           \n"
+      "punpcklbw %%xmm1,%%xmm2                     \n"
+      "movdqu    (%0),%%xmm0                       \n"
+      "movdqa    %%xmm2,%%xmm1                     \n"
+      "add       $0x10,%0                          \n"
+      "punpcklbw %%xmm0,%%xmm1                     \n"
+      "punpckhbw %%xmm0,%%xmm2                     \n"
+      "movdqu    %%xmm1,(%3)                       \n"
+      "movdqu    %%xmm2,0x10(%3)                   \n"
+      "lea       0x20(%3),%3                       \n"
+      "sub       $0x10,%4                          \n"
+      "jg         1b                               \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_I422TOUYVYROW_SSE2
 
-#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
-  asm volatile (
-    "pxor      %%xmm3,%%xmm3                   \n"
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
 
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm3,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm4                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
-    "addps     " MEMACCESS(3) ",%%xmm0         \n"
-    "addps     " MEMACCESS(3) ",%%xmm4         \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm1,%%xmm2                   \n"
-    "mulps     %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm2,%%xmm1                   \n"
-    "mulps     %%xmm6,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
-    "addps     %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm6,%%xmm4                   \n"
-    "addps     %%xmm1,%%xmm0                   \n"
-    "addps     %%xmm5,%%xmm4                   \n"
-    "cvttps2dq %%xmm0,%%xmm0                   \n"
-    "cvttps2dq %%xmm4,%%xmm4                   \n"
-    "packuswb  %%xmm4,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x2,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      "sub       %1,%2                             \n"
+
+      LABELALIGN
+      "1:                                          \n"
+      "vpmovzxbw  (%1),%%ymm1                      \n"
+      "vpmovzxbw  0x00(%1,%2,1),%%ymm2             \n"
+      "add        $0x10,%1                         \n"
+      "vpsllw     $0x8,%%ymm2,%%ymm2               \n"
+      "vpor       %%ymm1,%%ymm2,%%ymm2             \n"
+      "vmovdqu    (%0),%%ymm0                      \n"
+      "add        $0x20,%0                         \n"
+      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1             \n"
+      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2             \n"
+      "vextractf128 $0x0,%%ymm1,(%3)               \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)           \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)           \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)           \n"
+      "lea        0x40(%3),%3                      \n"
+      "sub        $0x20,%4                         \n"
+      "jg         1b                               \n"
+      "vzeroupper                                  \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_I422TOYUY2ROW_AVX2
+
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+
+      "sub        %1,%2                            \n"
+
+      LABELALIGN
+      "1:                                          \n"
+      "vpmovzxbw  (%1),%%ymm1                      \n"
+      "vpmovzxbw  0x00(%1,%2,1),%%ymm2             \n"
+      "add        $0x10,%1                         \n"
+      "vpsllw     $0x8,%%ymm2,%%ymm2               \n"
+      "vpor       %%ymm1,%%ymm2,%%ymm2             \n"
+      "vmovdqu    (%0),%%ymm0                      \n"
+      "add        $0x20,%0                         \n"
+      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1             \n"
+      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2             \n"
+      "vextractf128 $0x0,%%ymm1,(%3)               \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)           \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)           \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)           \n"
+      "lea        0x40(%3),%3                      \n"
+      "sub        $0x20,%4                         \n"
+      "jg         1b                               \n"
+      "vzeroupper                                  \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_I422TOUYVYROW_AVX2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
+                            int width) {
+  asm volatile(
+
+      "pxor      %%xmm3,%%xmm3                   \n"
+
+      // 2 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm3,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm4                   \n"
+      "punpcklwd %%xmm3,%%xmm0                   \n"
+      "punpckhwd %%xmm3,%%xmm4                   \n"
+      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+      "cvtdq2ps  %%xmm4,%%xmm4                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "mulps     0x10(%3),%%xmm0                 \n"
+      "mulps     0x10(%3),%%xmm4                 \n"
+      "addps     (%3),%%xmm0                     \n"
+      "addps     (%3),%%xmm4                     \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "movdqa    %%xmm5,%%xmm6                   \n"
+      "mulps     %%xmm1,%%xmm2                   \n"
+      "mulps     %%xmm5,%%xmm6                   \n"
+      "mulps     %%xmm2,%%xmm1                   \n"
+      "mulps     %%xmm6,%%xmm5                   \n"
+      "mulps     0x20(%3),%%xmm2                 \n"
+      "mulps     0x20(%3),%%xmm6                 \n"
+      "mulps     0x30(%3),%%xmm1                 \n"
+      "mulps     0x30(%3),%%xmm5                 \n"
+      "addps     %%xmm2,%%xmm0                   \n"
+      "addps     %%xmm6,%%xmm4                   \n"
+      "addps     %%xmm1,%%xmm0                   \n"
+      "addps     %%xmm5,%%xmm4                   \n"
+      "cvttps2dq %%xmm0,%%xmm0                   \n"
+      "cvttps2dq %%xmm4,%%xmm4                   \n"
+      "packuswb  %%xmm4,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x2,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(poly)        // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
-    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
-    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
-    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+  asm volatile(
+      "vbroadcastf128 (%3),%%ymm4                \n"
+      "vbroadcastf128 0x10(%3),%%ymm5            \n"
+      "vbroadcastf128 0x20(%3),%%ymm6            \n"
+      "vbroadcastf128 0x30(%3),%%ymm7            \n"
 
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
-    "lea         " MEMLEA(0x8,0) ",%0          \n"
-    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
-    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
-    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
-    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
-    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
-    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
-    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
-    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
-    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
-    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
-    "lea         " MEMLEA(0x8,1) ",%1          \n"
-    "sub         $0x2,%2                       \n"
-    "jg          1b                            \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 2 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
+      "lea         0x8(%0),%0                    \n"
+      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
+                                                      // X
+      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
+      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
+      "vmovq       %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x2,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(poly)        // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
+void HalfFloatRow_SSE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  scale *= kScaleBias;
+  asm volatile(
+      "movd        %3,%%xmm4                     \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+      "sub         %0,%1                         \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
+      "add         $0x10,%0                      \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
+      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
+      "punpckhwd   %%xmm5,%%xmm3                 \n"
+      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
+      "mulps       %%xmm4,%%xmm2                 \n"
+      "mulps       %%xmm4,%%xmm3                 \n"
+      "psrld       $0xd,%%xmm2                   \n"
+      "psrld       $0xd,%%xmm3                   \n"
+      "packssdw    %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(scale)   // %3
+      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  scale *= kScaleBias;
+  asm volatile(
+      "vbroadcastss  %3, %%ymm4                  \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+      "sub        %0,%1                          \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm2                    \n"  // 16 shorts
+      "add        $0x20,%0                       \n"
+      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
+      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
+      "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
+      "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
+      "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
+      "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
+      "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
+      "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
+      "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
+      "vmovdqu    %%ymm2,-0x20(%0,%1,1)          \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+#if defined(__x86_64__)
+      : "x"(scale)  // %3
+#else
+      : "m"(scale)  // %3
+#endif
+      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+      "vbroadcastss  %3, %%ymm4                  \n"
+      "sub        %0,%1                          \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
+      "vpmovzxwd   0x10(%0),%%ymm3               \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
+      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
+      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
+      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
+      "add         $0x20,%0                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+#if defined(__x86_64__)
+      : "x"(scale)  // %3
+#else
+      : "m"(scale)  // %3
+#endif
+      : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+  asm volatile(
+      "sub        %0,%1                          \n"
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
+      "vpmovzxwd   0x10(%0),%%ymm3               \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
+      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
+      "add         $0x20,%0                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm2", "xmm3");
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                           const uint8_t* table_argb,
                            int width) {
   uintptr_t pixel_temp;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
-    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),     // %0
-    "=&d"(pixel_temp),  // %1
-    "+r"(width)         // %2
-  : "r"(table_argb)     // %3
-  : "memory", "cc");
+  asm volatile(
+      // 1 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movzb     (%0),%1                         \n"
+      "lea       0x4(%0),%0                      \n"
+      "movzb     0x00(%3,%1,4),%1                \n"
+      "mov       %b1,-0x4(%0)                    \n"
+      "movzb     -0x3(%0),%1                     \n"
+      "movzb     0x01(%3,%1,4),%1                \n"
+      "mov       %b1,-0x3(%0)                    \n"
+      "movzb     -0x2(%0),%1                     \n"
+      "movzb     0x02(%3,%1,4),%1                \n"
+      "mov       %b1,-0x2(%0)                    \n"
+      "movzb     -0x1(%0),%1                     \n"
+      "movzb     0x03(%3,%1,4),%1                \n"
+      "mov       %b1,-0x1(%0)                    \n"
+      "dec       %2                              \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),     // %0
+        "=&d"(pixel_temp),  // %1
+        "+r"(width)         // %2
+      : "r"(table_argb)     // %3
+      : "memory", "cc");
 }
 #endif  // HAS_ARGBCOLORTABLEROW_X86
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+                          const uint8_t* table_argb,
+                          int width) {
   uintptr_t pixel_temp;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),     // %0
-    "=&d"(pixel_temp),  // %1
-    "+r"(width)         // %2
-  : "r"(table_argb)     // %3
-  : "memory", "cc");
+  asm volatile(
+      // 1 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movzb     (%0),%1                         \n"
+      "lea       0x4(%0),%0                      \n"
+      "movzb     0x00(%3,%1,4),%1                \n"
+      "mov       %b1,-0x4(%0)                    \n"
+      "movzb     -0x3(%0),%1                     \n"
+      "movzb     0x01(%3,%1,4),%1                \n"
+      "mov       %b1,-0x3(%0)                    \n"
+      "movzb     -0x2(%0),%1                     \n"
+      "movzb     0x02(%3,%1,4),%1                \n"
+      "mov       %b1,-0x2(%0)                    \n"
+      "dec       %2                              \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),     // %0
+        "=&d"(pixel_temp),  // %1
+        "+r"(width)         // %2
+      : "r"(table_argb)     // %3
+      : "memory", "cc");
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                 uint8_t* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+                                 const uint8_t* luma,
+                                 uint32_t lumacoeff) {
   uintptr_t pixel_temp;
   uintptr_t table_temp;
-  asm volatile (
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0x8,%%xmm4                     \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
+  asm volatile(
+      "movd      %6,%%xmm3                       \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psllw     $0x8,%%xmm4                     \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "phaddw    %%xmm0,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%2),%%xmm0                     \n"
+      "pmaddubsw %%xmm3,%%xmm0                   \n"
+      "phaddw    %%xmm0,%%xmm0                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "punpcklwd %%xmm5,%%xmm0                   \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
 
-    "movzb     " MEMACCESS(2) ",%0             \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS(3) "            \n"
-    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
-    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
-    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
+      "movzb     (%2),%0                         \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,(%3)                        \n"
+      "movzb     0x1(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x1(%3)                     \n"
+      "movzb     0x2(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x2(%3)                     \n"
+      "movzb     0x3(%2),%0                      \n"
+      "mov       %b0,0x3(%3)                     \n"
 
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
 
-    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
-    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
-    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
-    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
+      "movzb     0x4(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x4(%3)                     \n"
+      "movzb     0x5(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x5(%3)                     \n"
+      "movzb     0x6(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x6(%3)                     \n"
+      "movzb     0x7(%2),%0                      \n"
+      "mov       %b0,0x7(%3)                     \n"
 
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
 
-    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
-    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
-    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
-    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
+      "movzb     0x8(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x8(%3)                     \n"
+      "movzb     0x9(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x9(%3)                     \n"
+      "movzb     0xa(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xa(%3)                     \n"
+      "movzb     0xb(%2),%0                      \n"
+      "mov       %b0,0xb(%3)                     \n"
 
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
 
-    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
-    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
-    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
-    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "lea       " MEMLEA(0x10,3) ",%3           \n"
-    "sub       $0x4,%4                         \n"
-    "jg        1b                              \n"
-  : "=&d"(pixel_temp),  // %0
-    "=&a"(table_temp),  // %1
-    "+r"(src_argb),     // %2
-    "+r"(dst_argb),     // %3
-    "+rm"(width)        // %4
-  : "r"(luma),          // %5
-    "rm"(lumacoeff)     // %6
-  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
-  );
+      "movzb     0xc(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xc(%3)                     \n"
+      "movzb     0xd(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xd(%3)                     \n"
+      "movzb     0xe(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xe(%3)                     \n"
+      "movzb     0xf(%2),%0                      \n"
+      "mov       %b0,0xf(%3)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "lea       0x10(%3),%3                     \n"
+      "sub       $0x4,%4                         \n"
+      "jg        1b                              \n"
+      : "=&d"(pixel_temp),  // %0
+        "=&a"(table_temp),  // %1
+        "+r"(src_argb),     // %2
+        "+r"(dst_argb),     // %3
+        "+rm"(width)        // %4
+      : "r"(luma),          // %5
+        "rm"(lumacoeff)     // %6
+      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc
deleted file mode 100644
index 285f0b5adc..0000000000
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
- *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
-  __asm__ __volatile__ (
-    ".set      noreorder                         \n"
-    ".set      noat                              \n"
-    "slti      $at, %[count], 8                  \n"
-    "bne       $at ,$zero, $last8                \n"
-    "xor       $t8, %[src], %[dst]               \n"
-    "andi      $t8, $t8, 0x3                     \n"
-
-    "bne       $t8, $zero, unaligned             \n"
-    "negu      $a3, %[dst]                       \n"
-    // make dst/src aligned
-    "andi      $a3, $a3, 0x3                     \n"
-    "beq       $a3, $zero, $chk16w               \n"
-    // word-aligned now count is the remining bytes count
-    "subu     %[count], %[count], $a3            \n"
-
-    "lwr       $t8, 0(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"
-    "swr       $t8, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-
-    // Now the dst/src are mutually word-aligned with word-aligned addresses
-    "$chk16w:                                    \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, chk8w              \n"
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"
-    // t0 is the "past the end" address
-
-    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
-    // the "t0-32" address
-    // This means: for x=128 the last "safe" a1 address is "t0-160"
-    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
-    // we will use "pref 30,128(a1)", so "t0-160" is the limit
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line of src
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $loop16w                     \n"
-    "nop                                         \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$loop16w:                                    \n"
-    "pref      0, 96(%[src])                     \n"
-    "lw        $t0, 0(%[src])                    \n"
-    "bgtz      $v1, $skip_pref30_96              \n"  // skip
-    "lw        $t1, 4(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"  // continue
-    "$skip_pref30_96:                            \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    //  bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lw        $t0, 32(%[src])                   \n"
-    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
-    "lw        $t1, 36(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
-    "$skip_pref30_128:                           \n"
-    "lw        $t2, 40(%[src])                   \n"
-    "lw        $t3, 44(%[src])                   \n"
-    "lw        $t4, 48(%[src])                   \n"
-    "lw        $t5, 52(%[src])                   \n"
-    "lw        $t6, 56(%[src])                   \n"
-    "lw        $t7, 60(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bne       %[dst], $a3, $loop16w             \n"
-    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
-    "move      %[count], $t8                     \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "chk8w:                                      \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // the t8 is the reminder count past 32-bytes
-    "beq       %[count], $t8, chk1w              \n"
-    // count=t8,no 32-byte chunk
-    " nop                                        \n"
-
-    "lw        $t0, 0(%[src])                    \n"
-    "lw        $t1, 4(%[src])                    \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "chk1w:                                      \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the reminder past 1w chunks
-    "beq       %[count], $t8, $last8             \n"
-    " subu     $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-    // copying in words (4-byte chunks)
-    "$wordCopy_loop:                             \n"
-    "lw        $t3, 0(%[src])                    \n"
-    // the first t3 may be equal t0 ... optimize?
-    "addiu     %[src], %[src],4                  \n"
-    "addiu     %[dst], %[dst],4                  \n"
-    "bne       %[dst], $a3,$wordCopy_loop        \n"
-    " sw       $t3, -4(%[dst])                   \n"
-
-    // For the last (<8) bytes
-    "$last8:                                     \n"
-    "blez      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
-    "$last8loop:                                 \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst], $a3, $last8loop           \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "leave:                                      \n"
-    "  j       $ra                               \n"
-    "  nop                                       \n"
-
-    //
-    // UNALIGNED case
-    //
-
-    "unaligned:                                  \n"
-    // got here with a3="negu a1"
-    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
-    "beqz      $a3, $ua_chk16w                   \n"
-    " subu     %[count], %[count], $a3           \n"
-    // bytes left after initial a3 bytes
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
-    "swr       $v1, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-    // below the dst will be word aligned (NOTE1)
-    "$ua_chk16w:                                 \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, ua_chk8w           \n"
-    // if a2==t8, no 64-byte chunks
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line  addr 32
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // safe, as we have at least 64 bytes ahead
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $ua_loop16w                  \n"
-    // skip "pref 30,64(a1)" for too short arrays
-    " nop                                        \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$ua_loop16w:                                \n"
-    "pref      0, 96(%[src])                     \n"
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "bgtz      $v1, $ua_skip_pref30_96           \n"
-    " lwl      $t1, 7(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"
-    // continue setting up the dest, addr 96
-    "$ua_skip_pref30_96:                         \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    // bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lwr       $t0, 32(%[src])                   \n"
-    "lwl       $t0, 35(%[src])                   \n"
-    "lwr       $t1, 36(%[src])                   \n"
-    "bgtz      $v1, ua_skip_pref30_128           \n"
-    " lwl      $t1, 39(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"
-    // continue setting up the dest, addr 128
-    "ua_skip_pref30_128:                         \n"
-
-    "lwr       $t2, 40(%[src])                   \n"
-    "lwl       $t2, 43(%[src])                   \n"
-    "lwr       $t3, 44(%[src])                   \n"
-    "lwl       $t3, 47(%[src])                   \n"
-    "lwr       $t4, 48(%[src])                   \n"
-    "lwl       $t4, 51(%[src])                   \n"
-    "lwr       $t5, 52(%[src])                   \n"
-    "lwl       $t5, 55(%[src])                   \n"
-    "lwr       $t6, 56(%[src])                   \n"
-    "lwl       $t6, 59(%[src])                   \n"
-    "lwr       $t7, 60(%[src])                   \n"
-    "lwl       $t7, 63(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
-    "sgtu      $v1,%[dst],$t9                    \n"
-    "bne       %[dst],$a3,$ua_loop16w            \n"
-    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
-    "move      %[count],$t8                      \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "ua_chk8w:                                   \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // the t8 is the reminder count
-    "beq       %[count], $t8, $ua_chk1w          \n"
-    // when count==t8, no 32-byte chunk
-
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "lwl       $t1, 7(%[src])                    \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "$ua_chk1w:                                  \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the reminder past 1w chunks
-    "beq       %[count], $t8, ua_smallCopy       \n"
-    "subu      $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-
-    // copying in words (4-byte chunks)
-    "$ua_wordCopy_loop:                          \n"
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addiu     %[src], %[src], 4                 \n"
-    "addiu     %[dst], %[dst], 4                 \n"
-    // note: dst=a1 is word aligned here, see NOTE1
-    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
-    " sw       $v1,-4(%[dst])                    \n"
-
-    // Now less than 4 bytes (value in count) left to copy
-    "ua_smallCopy:                               \n"
-    "beqz      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
-    "$ua_smallCopy_loop:                         \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "j         $ra                               \n"
-    " nop                                        \n"
-    ".set      at                                \n"
-    ".set      reorder                           \n"
-       : [dst] "+r" (dst), [src] "+r" (src)
-       : [count] "r" (count)
-       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
-       "t8", "t9", "a3", "v1", "at"
-  );
-}
-#endif  // HAS_COPYROW_MIPS
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width) {
-  __asm__ __volatile__ (
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-    "blez            $t4, 2f                       \n"
-    " andi           %[width], %[width], 0xf       \n"  // residual
-
-  "1:                                              \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
-    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-    "sw              $t9, 0(%[dst_v])              \n"
-    "sw              $t0, 0(%[dst_u])              \n"
-    "sw              $t1, 4(%[dst_v])              \n"
-    "sw              $t2, 4(%[dst_u])              \n"
-    "sw              $t3, 8(%[dst_v])              \n"
-    "sw              $t5, 8(%[dst_u])              \n"
-    "sw              $t6, 12(%[dst_v])             \n"
-    "sw              $t7, 12(%[dst_u])             \n"
-    "addiu           %[dst_v], %[dst_v], 16        \n"
-    "bgtz            $t4, 1b                       \n"
-    " addiu          %[dst_u], %[dst_u], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-  "2:                                              \n"
-    "lbu             $t0, 0(%[src_uv])             \n"
-    "lbu             $t1, 1(%[src_uv])             \n"
-    "addiu           %[src_uv], %[src_uv], 2       \n"
-    "addiu           %[width], %[width], -1        \n"
-    "sb              $t0, 0(%[dst_u])              \n"
-    "sb              $t1, 0(%[dst_v])              \n"
-    "addiu           %[dst_u], %[dst_u], 1         \n"
-    "bgtz            %[width], 2b                  \n"
-    " addiu          %[dst_v], %[dst_v], 1         \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-     : [src_uv] "+r" (src_uv),
-       [width] "+r" (width),
-       [dst_u] "+r" (dst_u),
-       [dst_v] "+r" (dst_v)
-     :
-     : "t0", "t1", "t2", "t3",
-     "t4", "t5", "t6", "t7", "t8", "t9"
-  );
-}
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
-  __asm__ __volatile__ (
-    ".set push                             \n"
-    ".set noreorder                        \n"
-
-    "srl       $t4, %[width], 4            \n"  // multiplies of 16
-    "andi      $t5, %[width], 0xf          \n"
-    "blez      $t4, 2f                     \n"
-    " addu     %[src], %[src], %[width]    \n"  // src += width
-
-   "1:                                     \n"
-    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
-    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
-    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
-    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
-    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
-    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
-    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
-    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
-    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
-    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
-    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
-    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
-    "addiu     %[src], %[src], -16         \n"
-    "addiu     $t4, $t4, -1                \n"
-    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
-    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
-    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
-    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
-    "bgtz      $t4, 1b                     \n"
-    " addiu    %[dst], %[dst], 16          \n"
-    "beqz      $t5, 3f                     \n"
-    " nop                                  \n"
-
-   "2:                                     \n"
-    "lbu       $t0, -1(%[src])             \n"
-    "addiu     $t5, $t5, -1                \n"
-    "addiu     %[src], %[src], -1          \n"
-    "sb        $t0, 0(%[dst])              \n"
-    "bgez      $t5, 2b                     \n"
-    " addiu    %[dst], %[dst], 1           \n"
-
-   "3:                                     \n"
-    ".set pop                              \n"
-      : [src] "+r" (src), [dst] "+r" (dst)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4", "t5"
-  );
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                       int width) {
-  int x;
-  int y;
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "addu            $t4, %[width], %[width]      \n"
-    "srl             %[x], %[width], 4            \n"
-    "andi            %[y], %[width], 0xf          \n"
-    "blez            %[x], 2f                     \n"
-    " addu           %[src_uv], %[src_uv], $t4    \n"
-
-   "1:                                            \n"
-    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
-    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
-    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
-    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
-    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
-    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
-    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
-    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
-
-    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
-    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
-    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
-    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
-    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
-    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
-    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
-    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
-    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
-    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
-    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
-    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
-    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
-    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
-    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
-    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
-    "addiu           %[src_uv], %[src_uv], -32    \n"
-    "addiu           %[x], %[x], -1               \n"
-    "swr             $t4, 0(%[dst_u])             \n"
-    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
-    "swr             $t6, 0(%[dst_v])             \n"
-    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
-    "swr             $t2, 4(%[dst_u])             \n"
-    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
-    "swr             $t3, 4(%[dst_v])             \n"
-    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
-    "swr             $t0, 8(%[dst_u])             \n"
-    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
-    "swr             $t1, 8(%[dst_v])             \n"
-    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
-    "swr             $t9, 12(%[dst_u])            \n"
-    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
-    "swr             $t5, 12(%[dst_v])            \n"
-    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
-    "addiu           %[dst_v], %[dst_v], 16       \n"
-    "bgtz            %[x], 1b                     \n"
-    " addiu          %[dst_u], %[dst_u], 16       \n"
-    "beqz            %[y], 3f                     \n"
-    " nop                                         \n"
-    "b               2f                           \n"
-    " nop                                         \n"
-
-   "2:                                            \n"
-    "lbu             $t0, -2(%[src_uv])           \n"
-    "lbu             $t1, -1(%[src_uv])           \n"
-    "addiu           %[src_uv], %[src_uv], -2     \n"
-    "addiu           %[y], %[y], -1               \n"
-    "sb              $t0, 0(%[dst_u])             \n"
-    "sb              $t1, 0(%[dst_v])             \n"
-    "addiu           %[dst_u], %[dst_u], 1        \n"
-    "bgtz            %[y], 2b                     \n"
-    " addiu          %[dst_v], %[dst_v], 1        \n"
-
-   "3:                                            \n"
-    ".set pop                                     \n"
-      : [src_uv] "+r" (src_uv),
-        [dst_u] "+r" (dst_u),
-        [dst_v] "+r" (dst_v),
-        [x] "=&r" (x),
-        [y] "=&r" (y)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4",
-      "t5", "t7", "t8", "t9"
-  );
-}
-
-// Convert (4 Y and 2 VU) I422 and arrange RGB values into
-// t5 = | 0 | B0 | 0 | b0 |
-// t4 = | 0 | B1 | 0 | b1 |
-// t9 = | 0 | G0 | 0 | g0 |
-// t8 = | 0 | G1 | 0 | g1 |
-// t2 = | 0 | R0 | 0 | r0 |
-// t1 = | 0 | R1 | 0 | r1 |
-#define YUVTORGB                                                               \
-      "lw                $t0, 0(%[y_buf])       \n"                            \
-      "lhu               $t1, 0(%[u_buf])       \n"                            \
-      "lhu               $t2, 0(%[v_buf])       \n"                            \
-      "preceu.ph.qbr     $t1, $t1               \n"                            \
-      "preceu.ph.qbr     $t2, $t2               \n"                            \
-      "preceu.ph.qbra    $t3, $t0               \n"                            \
-      "preceu.ph.qbla    $t0, $t0               \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t3, $t3, $s4          \n"                            \
-      "subu.ph           $t0, $t0, $s4          \n"                            \
-      "mul.ph            $t3, $t3, $s0          \n"                            \
-      "mul.ph            $t0, $t0, $s0          \n"                            \
-      "shll.ph           $t4, $t1, 0x7          \n"                            \
-      "subu.ph           $t4, $t4, $t1          \n"                            \
-      "mul.ph            $t6, $t1, $s1          \n"                            \
-      "mul.ph            $t1, $t2, $s2          \n"                            \
-      "addq_s.ph         $t5, $t4, $t3          \n"                            \
-      "addq_s.ph         $t4, $t4, $t0          \n"                            \
-      "shra.ph           $t5, $t5, 6            \n"                            \
-      "shra.ph           $t4, $t4, 6            \n"                            \
-      "addiu             %[u_buf], 2            \n"                            \
-      "addiu             %[v_buf], 2            \n"                            \
-      "addu.ph           $t6, $t6, $t1          \n"                            \
-      "mul.ph            $t1, $t2, $s3          \n"                            \
-      "addu.ph           $t9, $t6, $t3          \n"                            \
-      "addu.ph           $t8, $t6, $t0          \n"                            \
-      "shra.ph           $t9, $t9, 6            \n"                            \
-      "shra.ph           $t8, $t8, 6            \n"                            \
-      "addu.ph           $t2, $t1, $t3          \n"                            \
-      "addu.ph           $t1, $t1, $t0          \n"                            \
-      "shra.ph           $t2, $t2, 6            \n"                            \
-      "shra.ph           $t1, $t1, 6            \n"                            \
-      "subu.ph           $t5, $t5, $s5          \n"                            \
-      "subu.ph           $t4, $t4, $s5          \n"                            \
-      "subu.ph           $t9, $t9, $s5          \n"                            \
-      "subu.ph           $t8, $t8, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "shll_s.ph         $t5, $t5, 8            \n"                            \
-      "shll_s.ph         $t4, $t4, 8            \n"                            \
-      "shll_s.ph         $t9, $t9, 8            \n"                            \
-      "shll_s.ph         $t8, $t8, 8            \n"                            \
-      "shll_s.ph         $t2, $t2, 8            \n"                            \
-      "shll_s.ph         $t1, $t1, 8            \n"                            \
-      "shra.ph           $t5, $t5, 8            \n"                            \
-      "shra.ph           $t4, $t4, 8            \n"                            \
-      "shra.ph           $t9, $t9, 8            \n"                            \
-      "shra.ph           $t8, $t8, 8            \n"                            \
-      "shra.ph           $t2, $t2, 8            \n"                            \
-      "shra.ph           $t1, $t1, 8            \n"                            \
-      "addu.ph           $t5, $t5, $s5          \n"                            \
-      "addu.ph           $t4, $t4, $s5          \n"                            \
-      "addu.ph           $t9, $t9, $s5          \n"                            \
-      "addu.ph           $t8, $t8, $s5          \n"                            \
-      "addu.ph           $t2, $t2, $s5          \n"                            \
-      "addu.ph           $t1, $t1, $s5          \n"
-
-// TODO(fbarchard): accept yuv conversion constants.
-void I422ToARGBRow_DSPR2(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm__ __volatile__ (
-    ".set push                                \n"
-    ".set noreorder                           \n"
-    "beqz              %[width], 2f           \n"
-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
-    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
-    "lui               $s6, 0xff00            \n"
-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
-
-   "1:                                        \n"
-      YUVTORGB
-// Arranging into argb format
-    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
-    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
-    "addiu             %[width], -4           \n"
-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
-
-    "addiu             %[y_buf], 4            \n"
-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
-    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
-    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
-    "sll               $t9, $t9, 16           \n"
-    "sll               $t8, $t8, 16           \n"
-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
-// Store results.
-    "sw                $t2, 0(%[rgb_buf])     \n"
-    "sw                $t0, 4(%[rgb_buf])     \n"
-    "sw                $t1, 8(%[rgb_buf])     \n"
-    "sw                $t3, 12(%[rgb_buf])    \n"
-    "bnez              %[width], 1b           \n"
-    " addiu            %[rgb_buf], 16         \n"
-   "2:                                        \n"
-    ".set pop                                 \n"
-      :[y_buf] "+r" (y_buf),
-       [u_buf] "+r" (u_buf),
-       [v_buf] "+r" (v_buf),
-       [width] "+r" (width),
-       [rgb_buf] "+r" (rgb_buf)
-      :
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-      "t6", "t7", "t8", "t9",
-      "s0", "s1", "s2", "s3",
-      "s4", "s5", "s6"
-  );
-}
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
-    int y0_fraction = 256 - source_y_fraction;
-    const uint8* src_ptr1 = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-     ".set push                                           \n"
-     ".set noreorder                                      \n"
-
-     "replv.ph          $t0, %[y0_fraction]               \n"
-     "replv.ph          $t1, %[source_y_fraction]         \n"
-
-   "1:                                                    \n"
-     "lw                $t2, 0(%[src_ptr])                \n"
-     "lw                $t3, 0(%[src_ptr1])               \n"
-     "lw                $t4, 4(%[src_ptr])                \n"
-     "lw                $t5, 4(%[src_ptr1])               \n"
-     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
-     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
-     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
-     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
-     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
-     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
-     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
-     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
-     "addq.ph           $t6, $t6, $t8                     \n"
-     "addq.ph           $t7, $t7, $t9                     \n"
-     "addq.ph           $t2, $t2, $t4                     \n"
-     "addq.ph           $t3, $t3, $t5                     \n"
-     "shra.ph           $t6, $t6, 8                       \n"
-     "shra.ph           $t7, $t7, 8                       \n"
-     "shra.ph           $t2, $t2, 8                       \n"
-     "shra.ph           $t3, $t3, 8                       \n"
-     "precr.qb.ph       $t6, $t6, $t7                     \n"
-     "precr.qb.ph       $t2, $t2, $t3                     \n"
-     "addiu             %[src_ptr], %[src_ptr], 8         \n"
-     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
-     "addiu             %[dst_width], %[dst_width], -8    \n"
-     "sw                $t6, 0(%[dst_ptr])                \n"
-     "sw                $t2, 4(%[dst_ptr])                \n"
-     "bgtz              %[dst_width], 1b                  \n"
-     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
-
-     ".set pop                                            \n"
-  : [dst_ptr] "+r" (dst_ptr),
-    [src_ptr1] "+r" (src_ptr1),
-    [src_ptr] "+r" (src_ptr),
-    [dst_width] "+r" (dst_width)
-  : [source_y_fraction] "r" (source_y_fraction),
-    [y0_fraction] "r" (y0_fraction),
-    [src_stride] "r" (src_stride)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-#endif  // __mips_dsp_rev >= 2
-
-#endif  // defined(__mips__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc
new file mode 100644
index 0000000000..4fb2631f0b
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc
@@ -0,0 +1,3512 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
+  {                                                              \
+    ub = __msa_fill_w(yuvconst->kUVToB[0]);                      \
+    vr = __msa_fill_w(yuvconst->kUVToR[1]);                      \
+    ug = __msa_fill_w(yuvconst->kUVToG[0]);                      \
+    vg = __msa_fill_w(yuvconst->kUVToG[1]);                      \
+    bb = __msa_fill_w(yuvconst->kUVBiasB[0]);                    \
+    bg = __msa_fill_w(yuvconst->kUVBiasG[0]);                    \
+    br = __msa_fill_w(yuvconst->kUVBiasR[0]);                    \
+    yg = __msa_fill_w(yuvconst->kYToRgb[0]);                     \
+  }
+
+// Load YUV 422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)    \
+  {                                                                \
+    uint64_t y_m;                                                  \
+    uint32_t u_m, v_m;                                             \
+    v4i32 zero_m = {0};                                            \
+    y_m = LD(psrc_y);                                              \
+    u_m = LW(psrc_u);                                              \
+    v_m = LW(psrc_v);                                              \
+    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
+    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m);        \
+    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m);        \
+  }
+
+// Clip input vector elements between 0 to 255
+#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
+  {                                               \
+    v4i32 max_m = __msa_ldi_w(0xFF);              \
+                                                  \
+    in0 = __msa_maxi_s_w(in0, 0);                 \
+    in1 = __msa_maxi_s_w(in1, 0);                 \
+    in2 = __msa_maxi_s_w(in2, 0);                 \
+    in3 = __msa_maxi_s_w(in3, 0);                 \
+    in4 = __msa_maxi_s_w(in4, 0);                 \
+    in5 = __msa_maxi_s_w(in5, 0);                 \
+    in0 = __msa_min_s_w(max_m, in0);              \
+    in1 = __msa_min_s_w(max_m, in1);              \
+    in2 = __msa_min_s_w(max_m, in2);              \
+    in3 = __msa_min_s_w(max_m, in3);              \
+    in4 = __msa_min_s_w(max_m, in4);              \
+    in5 = __msa_min_s_w(max_m, in5);              \
+  }
+
+// Convert 8 pixels of YUV 420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+  {                                                                            \
+    v8i16 vec0_m, vec1_m;                                                      \
+    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
+    v4i32 reg5_m, reg6_m, reg7_m;                                              \
+    v16i8 zero_m = {0};                                                        \
+                                                                               \
+    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
+    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);                 \
+    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);                \
+    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);                \
+    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m);                \
+    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m);                \
+    reg0_m *= yg;                                                              \
+    reg1_m *= yg;                                                              \
+    reg2_m *= ubvr;                                                            \
+    reg3_m *= ubvr;                                                            \
+    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
+    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
+    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
+    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
+    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
+    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
+    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
+    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
+    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
+    reg5_m = reg0_m - reg5_m;                                                  \
+    reg6_m = reg1_m - reg6_m;                                                  \
+    reg2_m = reg0_m - reg2_m;                                                  \
+    reg3_m = reg1_m - reg3_m;                                                  \
+    reg7_m = reg0_m - reg7_m;                                                  \
+    reg4_m = reg1_m - reg4_m;                                                  \
+    reg5_m += bb;                                                              \
+    reg6_m += bb;                                                              \
+    reg7_m += bg;                                                              \
+    reg4_m += bg;                                                              \
+    reg2_m += br;                                                              \
+    reg3_m += br;                                                              \
+    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
+    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
+    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
+    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
+    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
+    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
+    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);               \
+    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
+    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
+    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
+  }
+
+// Pack and Store 8 ARGB values.
+#define STOREARGB(in0, in1, in2, in3, pdst_argb)           \
+  {                                                        \
+    v8i16 vec0_m, vec1_m;                                  \
+    v16u8 dst0_m, dst1_m;                                  \
+    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);          \
+    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);          \
+    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                 \
+  }
+
+// Takes ARGB input and calculates Y.
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
+                y_out)                                                     \
+  {                                                                        \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
+    v8u16 reg0_m, reg1_m;                                                  \
+                                                                           \
+    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
+    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
+    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
+    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
+    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
+    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
+    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
+    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
+    reg0_m += const2;                                                      \
+    reg1_m += const2;                                                      \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
+    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
+  }
+
+// Loads current and next row of ARGB input and averages it to calculate U and V
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)               \
+  {                                                                       \
+    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+    v16u8 vec8_m, vec9_m;                                                 \
+    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
+    v8u16 reg8_m, reg9_m;                                                 \
+                                                                          \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0);                             \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16);                            \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32);                            \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48);                            \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0);                             \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16);                            \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32);                            \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48);                            \
+    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
+    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
+    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
+    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
+    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
+    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
+    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
+    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
+    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
+    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
+    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
+    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
+    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
+    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
+    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
+    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
+    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
+    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
+    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
+    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
+    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
+    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64);                            \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80);                            \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96);                            \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112);                           \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64);                            \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80);                            \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96);                            \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112);                           \
+    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
+    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
+    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
+    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
+    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
+    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
+    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
+    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
+    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                              \
+    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                              \
+    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
+    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
+    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
+    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
+    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
+    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
+    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
+    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
+    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
+    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
+    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
+    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
+  }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
+  {                                                                          \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
+    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
+                                                                             \
+    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
+    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
+    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
+    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
+    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
+    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
+    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
+    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
+    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
+    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
+    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
+    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
+    reg0_m += const3;                                                        \
+    reg1_m += const3;                                                        \
+    reg2_m += const3;                                                        \
+    reg3_m += const3;                                                        \
+    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
+    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
+    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
+    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
+    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
+    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
+  }
+
+// Load I444 pixel data
+#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+  {                                                           \
+    uint64_t y_m, u_m, v_m;                                   \
+    v2i64 zero_m = {0};                                       \
+    y_m = LD(psrc_y);                                         \
+    u_m = LD(psrc_u);                                         \
+    v_m = LD(psrc_v);                                         \
+    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m);   \
+    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m);   \
+    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m);   \
+  }
+
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  src += width - 64;
+
+  for (x = 0; x < width; x += 64) {
+    LD_UB4(src, 16, src3, src2, src1, src0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += 64;
+    src -= 64;
+  }
+}
+
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+  src += width * 4 - 64;
+
+  for (x = 0; x < width; x += 16) {
+    LD_UB4(src, 16, src3, src2, src1, src0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += 64;
+    src -= 64;
+  }
+}
+
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_yuy2 += 64;
+  }
+}
+
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_uyvy += 64;
+  }
+}
+
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb += 32;
+  }
+}
+
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb += 32;
+  }
+}
+
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  int x;
+  int64_t data_a;
+  v16u8 src0, src1, src2, src3;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v4i32 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    data_a = LD(src_a);
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
+    STOREARGB(vec0, vec1, vec2, src3, dst_argb);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    src_a += 8;
+    dst_argb += 32;
+  }
+}
+
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int32_t width) {
+  int x;
+  int64_t data_u, data_v;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 reg0, reg1, reg2, reg3;
+  v2i64 zero = {0};
+  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
+  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
+  v16i8 shuffler2 = {26, 6,  7,  27, 8,  9,  28, 10,
+                     11, 29, 12, 13, 30, 14, 15, 31};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
+    data_u = LD(src_u);
+    data_v = LD(src_v);
+    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
+    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec3, vec4, vec5);
+    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
+    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
+    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    ST_UB(dst2, (dst_argb + 32));
+    src_y += 16;
+    src_u += 8;
+    src_v += 8;
+    dst_argb += 48;
+  }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec2, vec1);
+    vec0 = __msa_srai_h(vec0, 3);
+    vec1 = __msa_srai_h(vec1, 3);
+    vec2 = __msa_srai_h(vec2, 2);
+    vec1 = __msa_slli_h(vec1, 11);
+    vec2 = __msa_slli_h(vec2, 5);
+    vec0 |= vec1;
+    dst0 = (v16u8)(vec2 | vec0);
+    ST_UB(dst0, dst_rgb565);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_rgb565 += 16;
+  }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v8u16 reg0, reg1, reg2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    reg0 = (v8u16)__msa_srai_h(vec0, 4);
+    reg1 = (v8u16)__msa_srai_h(vec1, 4);
+    reg2 = (v8u16)__msa_srai_h(vec2, 4);
+    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
+    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+    reg1 |= const_0xF000;
+    reg0 |= reg2;
+    dst0 = (v16u8)(reg1 | reg0);
+    ST_UB(dst0, dst_argb4444);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb4444 += 16;
+  }
+}
+
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v8u16 reg0, reg1, reg2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    reg0 = (v8u16)__msa_srai_h(vec0, 3);
+    reg1 = (v8u16)__msa_srai_h(vec1, 3);
+    reg2 = (v8u16)__msa_srai_h(vec2, 3);
+    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
+    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
+    reg1 |= const_0x8000;
+    reg0 |= reg2;
+    dst0 = (v16u8)(reg1 | reg0);
+    ST_UB(dst0, dst_argb1555);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb1555 += 16;
+  }
+}
+
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_yuy2 += 64;
+    dst_y += 32;
+  }
+}
+
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    src_yuy2_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_uyvy += 64;
+    dst_y += 32;
+  }
+}
+
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    src_uyvy_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16i8 zero = {0};
+  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
+    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
+    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
+    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
+    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
+    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
+    reg0 *= const_0x19;
+    reg1 *= const_0x19;
+    reg2 *= const_0x81;
+    reg3 *= const_0x81;
+    reg4 *= const_0x42;
+    reg5 *= const_0x42;
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 += reg4;
+    reg1 += reg5;
+    reg0 += const_0x1080;
+    reg1 += const_0x1080;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v16u8 dst0, dst1;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 = __msa_hadd_u_h(vec8, vec8);
+    reg1 = __msa_hadd_u_h(vec9, vec9);
+    reg2 = __msa_hadd_u_h(vec4, vec4);
+    reg3 = __msa_hadd_u_h(vec5, vec5);
+    reg4 = __msa_hadd_u_h(vec0, vec0);
+    reg5 = __msa_hadd_u_h(vec1, vec1);
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 += __msa_hadd_u_h(vec8, vec8);
+    reg1 += __msa_hadd_u_h(vec9, vec9);
+    reg2 += __msa_hadd_u_h(vec4, vec4);
+    reg3 += __msa_hadd_u_h(vec5, vec5);
+    reg4 += __msa_hadd_u_h(vec0, vec0);
+    reg5 += __msa_hadd_u_h(vec1, vec1);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
+    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+    reg6 = reg0 * const_0x70;
+    reg7 = reg1 * const_0x70;
+    reg8 = reg2 * const_0x4A;
+    reg9 = reg3 * const_0x4A;
+    reg6 += const_0x8080;
+    reg7 += const_0x8080;
+    reg8 += reg4 * const_0x26;
+    reg9 += reg5 * const_0x26;
+    reg0 *= const_0x12;
+    reg1 *= const_0x12;
+    reg2 *= const_0x5E;
+    reg3 *= const_0x5E;
+    reg4 *= const_0x70;
+    reg5 *= const_0x70;
+    reg2 += reg0;
+    reg3 += reg1;
+    reg4 += const_0x8080;
+    reg5 += const_0x8080;
+    reg6 -= reg8;
+    reg7 -= reg9;
+    reg4 -= reg2;
+    reg5 -= reg3;
+    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
+    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb0 += 128;
+    src_argb0_next += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
+  v16i8 shuffler1 = {5,  6,  8,  9,  10, 12, 13, 14,
+                     16, 17, 18, 20, 21, 22, 24, 25};
+  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
+                     21, 22, 24, 25, 26, 28, 29, 30};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
+  v16i8 shuffler1 = {5,  4,  10, 9,  8,  14, 13, 12,
+                     18, 17, 16, 22, 21, 20, 26, 25};
+  v16i8 shuffler2 = {8,  14, 13, 12, 18, 17, 16, 22,
+                     21, 20, 26, 25, 24, 30, 29, 28};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
+    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
+    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 4);
+    vec4 = __msa_binsli_b(vec4, vec5, 2);
+    vec5 = __msa_binsli_b(vec6, vec7, 4);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
+    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
+    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
+    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
+    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
+    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
+    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
+    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec5 = __msa_binsli_b(vec5, vec6, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 5);
+    vec6 = __msa_binsli_b(vec7, vec8, 5);
+    vec1 = __msa_binsli_b(vec1, vec4, 0);
+    vec6 = __msa_binsli_b(vec6, vec9, 0);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width) {
+  int x;
+  v16u8 src0, src1;
+  v16u8 vec0, vec1;
+  v16u8 dst0;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
+    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
+    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
+    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
+    vec0 = __msa_binsli_b(vec0, src0, 3);
+    vec1 = __msa_binsli_b(vec1, src1, 3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int32_t width) {
+  int32_t x;
+  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 vec8, vec9, vec10, vec11;
+  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
+  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
+  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
+  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
+  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
+  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
+  v16i8 zero = {0};
+
+  for (x = width; x > 0; x -= 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
+    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec10 = vec0 * const_18;
+    vec11 = vec1 * const_18;
+    vec8 = vec2 * const_94;
+    vec9 = vec3 * const_94;
+    vec6 = vec4 * const_112;
+    vec7 = vec5 * const_112;
+    vec0 *= const_112;
+    vec1 *= const_112;
+    vec2 *= const_74;
+    vec3 *= const_74;
+    vec4 *= const_38;
+    vec5 *= const_38;
+    vec8 += vec10;
+    vec9 += vec11;
+    vec6 += const_32896;
+    vec7 += const_32896;
+    vec0 += const_32896;
+    vec1 += const_32896;
+    vec2 += vec4;
+    vec3 += vec5;
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6 -= vec8;
+    vec7 -= vec9;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
+    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb0 += 16;
+    src_argb1 += 16;
+    dst_argb += 16;
+  }
+}
+
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16);
+    dst0 = __msa_subs_u_b(src0, src2);
+    dst1 = __msa_subs_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 zero = {0};
+  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
+    vec4 = (v8u16)__msa_fill_h(vec0[3]);
+    vec5 = (v8u16)__msa_fill_h(vec0[7]);
+    vec6 = (v8u16)__msa_fill_h(vec1[3]);
+    vec7 = (v8u16)__msa_fill_h(vec1[7]);
+    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec6 = (v8u16)__msa_fill_h(vec2[3]);
+    vec7 = (v8u16)__msa_fill_h(vec2[7]);
+    vec8 = (v8u16)__msa_fill_h(vec3[3]);
+    vec9 = (v8u16)__msa_fill_h(vec3[7]);
+    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
+    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
+    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
+    reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
+    reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+    reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
+    reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
+    reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
+    reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst0 = __msa_bmnz_v(dst0, src0, mask);
+    dst1 = __msa_bmnz_v(dst1, src1, mask);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               uint32_t dither4,
+                               int width) {
+  int x;
+  v16u8 src0, src1, dst0, vec0, vec1;
+  v8i16 vec_d0;
+  v8i16 reg0, reg1, reg2;
+  v16i8 zero = {0};
+  v8i16 max = __msa_ldi_h(0xFF);
+
+  vec_d0 = (v8i16)__msa_fill_w(dither4);
+  vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
+    reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
+    reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
+    reg0 += vec_d0;
+    reg1 += vec_d0;
+    reg2 += vec_d0;
+    reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
+    reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
+    reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
+    reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
+    reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
+    reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
+    reg0 = __msa_srai_h(reg0, 3);
+    reg2 = __msa_srai_h(reg2, 3);
+    reg1 = __msa_srai_h(reg1, 2);
+    reg2 = __msa_slli_h(reg2, 11);
+    reg1 = __msa_slli_h(reg1, 5);
+    reg0 |= reg1;
+    dst0 = (v16u8)(reg0 | reg2);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v16i8 vec0;
+  v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+  int32_t val = LW((int32_t*)shuffler);
+
+  vec0 = (v16i8)__msa_fill_w(val);
+  shuffler_vec += vec0;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16);
+    dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value) {
+  int x;
+  v16u8 src0, dst0;
+  v8u16 vec0, vec1;
+  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
+  v8i16 zero = {0};
+
+  rgba_scale[0] = value;
+  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
+  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= rgba_scale;
+    reg1 *= rgba_scale;
+    reg2 *= rgba_scale;
+    reg3 *= rgba_scale;
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb += 16;
+    dst_argb += 16;
+  }
+}
+
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, dst0, dst1;
+  v8u16 reg0;
+  v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
+  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+    reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
+    reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
+    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
+    vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2;
+  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
+  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
+  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
+  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
+  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
+  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
+  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
+    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
+    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
+    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
+    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
+    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
+    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
+    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
+    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
+    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
+    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    dst_argb += 32;
+  }
+}
+
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width) {
+  int x;
+  v16u8 src0, src1;
+  v8u16 vec0, vec1, vec2, vec3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 0);
+    src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 16);
+    vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
+    vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
+    vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
+    vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
+    vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
+    vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
+    vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
+    vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_argb4444 += 32;
+    dst_argb += 64;
+  }
+}
+
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width) {
+  int x;
+  v8u16 src0, src1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
+  v16u8 dst0, dst1, dst2, dst3;
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 0);
+    src1 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
+    reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
+    reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
+    reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
+    reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
+    reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
+    reg3 = -reg3;
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
+    reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
+    reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_argb1555 += 32;
+    dst_argb += 64;
+  }
+}
+
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 0);
+    src1 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src0 & const_0x7E0;
+    vec2 = src0 & const_0xF800;
+    vec3 = src1 & const_0x1F;
+    vec4 = src1 & const_0x7E0;
+    vec5 = src1 & const_0xF800;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
+    res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
+    res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_rgb565 += 32;
+    dst_argb += 64;
+  }
+}
+
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24,
+                        uint8_t* dst_argb,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v16u8 vec0, vec1, vec2;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 32);
+    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+    dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
+    dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_rgb24 += 48;
+    dst_argb += 64;
+  }
+}
+
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v16u8 vec0, vec1, vec2;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32);
+    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
+    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_raw += 48;
+    dst_argb += 64;
+  }
+}
+
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
+                        uint8_t* dst_y,
+                        int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16u8 dst0;
+  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 0);
+    src1 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
+    reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
+    reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
+    reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
+    reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
+    reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
+    reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
+    reg0 *= const_0x19;
+    reg1 *= const_0x19;
+    reg2 *= const_0x81;
+    reg3 *= const_0x81;
+    reg4 *= const_0x42;
+    reg5 *= const_0x42;
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 += reg4;
+    reg1 += reg5;
+    reg0 += const_0x1080;
+    reg1 += const_0x1080;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_y);
+    src_argb1555 += 32;
+    dst_y += 16;
+  }
+}
+
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v4u32 res0, res1, res2, res3;
+  v16u8 dst0;
+  v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
+  v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
+  v8i16 const_0x1080 = __msa_fill_h(0x1080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 0);
+    src1 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src0 & const_0x7E0;
+    vec2 = src0 & const_0xF800;
+    vec3 = src1 & const_0x1F;
+    vec4 = src1 & const_0x7E0;
+    vec5 = src1 & const_0xF800;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+    vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
+    vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
+    vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
+    vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
+    vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
+    vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
+    vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
+    res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
+    res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
+    res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
+    res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
+    res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
+    res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
+    res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
+    res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
+    res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
+    res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
+    res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
+    res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_rgb565 += 32;
+    dst_y += 16;
+  }
+}
+
+void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
+  v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+                 18, 19, 20, 21, 21, 22, 23, 24};
+  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
+    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
+    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
+    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
+    vec0 += const_0x1080;
+    vec1 += const_0x1080;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 48;
+    dst_y += 16;
+  }
+}
+
+void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
+  v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+                 18, 19, 20, 21, 21, 22, 23, 24};
+  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
+    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
+    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
+    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
+    vec0 += const_0x1080;
+    vec1 += const_0x1080;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 48;
+    dst_y += 16;
+  }
+}
+
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  int x;
+  const uint16_t* s = (const uint16_t*)src_argb1555;
+  const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
+  int64_t res0, res1;
+  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    vec0 += src2 & const_0x1F;
+    vec1 += src3 & const_0x1F;
+    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    vec2 += src2 & const_0x1F;
+    vec3 += src3 & const_0x1F;
+    vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    vec4 += src2 & const_0x1F;
+    vec5 += src3 & const_0x1F;
+    vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+    vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+    vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+    vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+    vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
+    vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
+    reg0 = vec6 * const_0x70;
+    reg1 = vec0 * const_0x4A;
+    reg2 = vec2 * const_0x70;
+    reg3 = vec0 * const_0x5E;
+    reg0 += const_0x8080;
+    reg1 += vec2 * const_0x26;
+    reg2 += const_0x8080;
+    reg3 += vec6 * const_0x12;
+    reg0 -= reg1;
+    reg2 -= reg3;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    s += 16;
+    t += 16;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  int x;
+  const uint16_t* s = (const uint16_t*)src_rgb565;
+  const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
+  int64_t res0, res1;
+  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    vec0 += src2 & const_0x1F;
+    vec1 += src3 & const_0x1F;
+    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec2 = src0 & const_0x3F;
+    vec3 = src1 & const_0x3F;
+    vec2 += src2 & const_0x3F;
+    vec3 += src3 & const_0x3F;
+    vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    vec4 += src2 & const_0x1F;
+    vec5 += src3 & const_0x1F;
+    vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+    vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+    vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+    vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+    reg0 = vec3 * const_0x70;
+    reg1 = vec1 * const_0x4A;
+    reg2 = vec4 * const_0x70;
+    reg3 = vec1 * const_0x5E;
+    reg0 += const_32896;
+    reg1 += vec4 * const_0x26;
+    reg2 += const_32896;
+    reg3 += vec3 * const_0x12;
+    reg0 -= reg1;
+    reg2 -= reg3;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    s += 16;
+    t += 16;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  int64_t res0, res1;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 reg0, reg1, reg2, reg3;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32);
+    inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32);
+    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+    reg0 = __msa_srai_h((v8i16)reg0, 2);
+    reg1 = __msa_srai_h((v8i16)reg1, 2);
+    reg2 = __msa_srai_h((v8i16)reg2, 2);
+    reg3 = __msa_srai_h((v8i16)reg3, 2);
+    vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
+    vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
+    vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
+    vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+    vec3 = vec0 * const_0x70;
+    vec4 = vec1 * const_0x4A;
+    vec5 = vec2 * const_0x26;
+    vec2 *= const_0x70;
+    vec1 *= const_0x5E;
+    vec0 *= const_0x12;
+    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+    reg0 += reg1;
+    reg2 += reg3;
+    reg0 = __msa_srai_h(reg0, 8);
+    reg2 = __msa_srai_h(reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    t += 48;
+    s += 48;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  int64_t res0, res1;
+  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 reg0, reg1, reg2, reg3;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32);
+    inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32);
+    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+    reg0 = __msa_srai_h(reg0, 2);
+    reg1 = __msa_srai_h(reg1, 2);
+    reg2 = __msa_srai_h(reg2, 2);
+    reg3 = __msa_srai_h(reg3, 2);
+    vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    vec3 = vec0 * const_0x70;
+    vec4 = vec1 * const_0x4A;
+    vec5 = vec2 * const_0x26;
+    vec2 *= const_0x70;
+    vec1 *= const_0x5E;
+    vec0 *= const_0x12;
+    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+    reg0 += reg1;
+    reg2 += reg3;
+    reg0 = __msa_srai_h(reg0, 8);
+    reg2 = __msa_srai_h(reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    t += 48;
+    s += 48;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64_t val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_y += 8;
+    src_uv += 8;
+    dst_argb += 32;
+  }
+}
+
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint64_t val0, val1;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    vec0 = vec0 >> 3;
+    vec1 = (vec1 >> 2) << 5;
+    vec2 = (vec2 >> 3) << 11;
+    dst0 = (v16u8)(vec0 | vec1 | vec2);
+    ST_UB(dst0, dst_rgb565);
+    src_y += 8;
+    src_uv += 8;
+    dst_rgb565 += 16;
+  }
+}
+
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64_t val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16u8 zero = {0};
+  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_vu);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_y += 8;
+    src_vu += 8;
+    dst_argb += 32;
+  }
+}
+
+void SobelRow_MSA(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width) {
+  int x;
+  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+  v16i8 const_0x4 = __msa_ldi_b(0x4);
+  v16i8 mask1 = mask0 + const_0x4;
+  v16i8 mask2 = mask1 + const_0x4;
+  v16i8 mask3 = mask2 + const_0x4;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
+    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_sobelx += 32;
+    src_sobely += 32;
+    dst_y += 32;
+  }
+}
+
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, vec2;
+  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+  v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
+  v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
+  v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
+  v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
+  v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3;
+  v16u8 dst0, dst1;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+  v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
+  v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
+  v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((const v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((const v16i8*)t, 48);
+    src0 = __msa_aver_u_b(src0, src4);
+    src1 = __msa_aver_u_b(src1, src5);
+    src2 = __msa_aver_u_b(src2, src6);
+    src3 = __msa_aver_u_b(src3, src7);
+    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+    vec0 = __msa_aver_u_b(src4, src6);
+    vec1 = __msa_aver_u_b(src5, src7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
+    src0 = __msa_aver_u_b(src0, src4);
+    src1 = __msa_aver_u_b(src1, src5);
+    src2 = __msa_aver_u_b(src2, src6);
+    src3 = __msa_aver_u_b(src3, src7);
+    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+    vec2 = __msa_aver_u_b(src4, src6);
+    vec3 = __msa_aver_u_b(src5, src7);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
+             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_v);
+    ST_UB(dst1, dst_u);
+    s += 128;
+    t += 128;
+    dst_v += 16;
+    dst_u += 16;
+  }
+}
+
+void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_v);
+    ST_UB(dst1, dst_u);
+    s += 128;
+    t += 128;
+    dst_v += 16;
+    dst_u += 16;
+  }
+}
+
+void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+  v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
+  v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, src0, src1, src2, src3);
+    ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
+             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    s += 128;
+    t += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    s += 128;
+    t += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0, dst1;
+  v8u16 vec0, vec1, vec2;
+  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8i16 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+
+  for (x = 0; x < width; x += 8) {
+    READI444(src_y, src_u, src_v, src0, src1, src2);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg4 = reg0 + vec_br;
+    reg5 = reg1 + vec_br;
+    reg2 = reg0 + vec_bg;
+    reg3 = reg1 + vec_bg;
+    reg0 += vec_bb;
+    reg1 += vec_bb;
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
+    reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    reg0 -= reg6 * vec_ub;
+    reg1 -= reg7 * vec_ub;
+    reg2 -= reg6 * vec_ug;
+    reg3 -= reg7 * vec_ug;
+    reg4 -= reg8 * vec_vr;
+    reg5 -= reg9 * vec_vr;
+    reg2 -= reg8 * vec_vg;
+    reg3 -= reg9 * vec_vg;
+    reg0 = __msa_srai_w(reg0, 6);
+    reg1 = __msa_srai_w(reg1, 6);
+    reg2 = __msa_srai_w(reg2, 6);
+    reg3 = __msa_srai_w(reg3, 6);
+    reg4 = __msa_srai_w(reg4, 6);
+    reg5 = __msa_srai_w(reg5, 6);
+    CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
+    dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_y += 8;
+    src_u += 8;
+    src_v += 8;
+    dst_argb += 32;
+  }
+}
+
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
+  v8i16 vec0, vec1;
+  v4i32 reg0, reg1, reg2, reg3;
+  v4i32 vec_yg = __msa_fill_w(0x4A35);
+  v8i16 vec_ygb = __msa_fill_h(0xFB78);
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8i16 max = __msa_ldi_h(0xFF);
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
+    reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
+    reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
+    reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg2 *= vec_yg;
+    reg3 *= vec_yg;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg2 = __msa_srai_w(reg2, 16);
+    reg3 = __msa_srai_w(reg3, 16);
+    vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec0 += vec_ygb;
+    vec1 += vec_ygb;
+    vec0 = __msa_srai_h(vec0, 6);
+    vec1 = __msa_srai_h(vec1, 6);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
+    res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
+    res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
+    res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_y += 16;
+    dst_argb += 64;
+  }
+}
+
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0);
+    vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
+    vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_y += 16;
+    dst_argb += 64;
+  }
+}
+
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_yuy2, 0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+    src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
+    src_yuy2 += 16;
+    dst_argb += 32;
+  }
+}
+
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_uyvy, 0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+    src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
+    src_uyvy += 16;
+    dst_argb += 32;
+  }
+}
+
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int32_t source_y_fraction) {
+  int32_t y1_fraction = source_y_fraction;
+  int32_t y0_fraction = 256 - y1_fraction;
+  uint16_t y_fractions;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, y_frac;
+
+  if (0 == y1_fraction) {
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+
+  if (128 == y1_fraction) {
+    for (x = 0; x < width; x += 32) {
+      src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+      src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+      src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+      src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+      dst0 = __msa_aver_u_b(src0, src2);
+      dst1 = __msa_aver_u_b(src1, src3);
+      ST_UB2(dst0, dst1, dst_ptr, 16);
+      s += 32;
+      t += 32;
+      dst_ptr += 32;
+    }
+    return;
+  }
+
+  y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
+  y_frac = (v8u16)__msa_fill_h(y_fractions);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
+    vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
+    vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
+    vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst_ptr, 16);
+    s += 32;
+    t += 32;
+    dst_ptr += 32;
+  }
+}
+
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) {
+  int x;
+  v4i32 dst0 = __builtin_msa_fill_w(v32);
+
+  for (x = 0; x < width; x += 4) {
+    ST_UB(dst0, dst_argb);
+    dst_argb += 16;
+  }
+}
+
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
+  v16i8 shuffler1 = {8,  7,  12, 11, 10, 15, 14, 13,
+                     18, 17, 16, 21, 20, 19, 24, 23};
+  v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
+                     24, 23, 28, 27, 26, 31, 30, 29};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32);
+    src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
+    ST_UB2(dst0, dst1, dst_rgb24, 16);
+    ST_UB(dst2, (dst_rgb24 + 32));
+    src_raw += 48;
+    dst_rgb24 += 48;
+  }
+}
+
+void MergeUVRow_MSA(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_u, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_v, 0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
+    ST_UB2(dst0, dst1, dst_uv, 16);
+    src_u += 16;
+    src_v += 16;
+    dst_uv += 32;
+  }
+}
+
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width) {
+  int i;
+  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+
+  for (i = 0; i < width; i += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_a);
+    src_argb += 64;
+    dst_a += 16;
+  }
+}
+
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
+  v8u16 const_256 = (v8u16)__msa_ldi_h(256);
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16);
+    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
+    vec8 = (v8u16)__msa_fill_h(vec0[3]);
+    vec9 = (v8u16)__msa_fill_h(vec0[7]);
+    vec10 = (v8u16)__msa_fill_h(vec1[3]);
+    vec11 = (v8u16)__msa_fill_h(vec1[7]);
+    vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+    vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
+    vec10 = (v8u16)__msa_fill_h(vec2[3]);
+    vec11 = (v8u16)__msa_fill_h(vec2[7]);
+    vec12 = (v8u16)__msa_fill_h(vec3[3]);
+    vec13 = (v8u16)__msa_fill_h(vec3[7]);
+    vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
+    vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
+    vec8 = const_256 - vec8;
+    vec9 = const_256 - vec9;
+    vec10 = const_256 - vec10;
+    vec11 = const_256 - vec11;
+    vec8 *= vec4;
+    vec9 *= vec5;
+    vec10 *= vec6;
+    vec11 *= vec7;
+    vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
+    vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
+    vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
+    vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
+    vec0 += vec8;
+    vec1 += vec9;
+    vec2 += vec10;
+    vec3 += vec11;
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst0 = __msa_bmnz_v(dst0, const_255, mask);
+    dst1 = __msa_bmnz_v(dst1, const_255, mask);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+                         int scale,
+                         int interval_size,
+                         int interval_offset,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  v4i32 vec_scale = __msa_fill_w(scale);
+  v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
+  v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
+  v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48);
+    vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
+    vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
+    tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
+    tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
+    tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
+    tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
+    tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
+    tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
+    tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
+    tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
+    tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
+    tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
+    tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
+    tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
+    tmp0 *= vec_scale;
+    tmp1 *= vec_scale;
+    tmp2 *= vec_scale;
+    tmp3 *= vec_scale;
+    tmp4 *= vec_scale;
+    tmp5 *= vec_scale;
+    tmp6 *= vec_scale;
+    tmp7 *= vec_scale;
+    tmp8 *= vec_scale;
+    tmp9 *= vec_scale;
+    tmp10 *= vec_scale;
+    tmp11 *= vec_scale;
+    tmp12 *= vec_scale;
+    tmp13 *= vec_scale;
+    tmp14 *= vec_scale;
+    tmp15 *= vec_scale;
+    tmp0 >>= 16;
+    tmp1 >>= 16;
+    tmp2 >>= 16;
+    tmp3 >>= 16;
+    tmp4 >>= 16;
+    tmp5 >>= 16;
+    tmp6 >>= 16;
+    tmp7 >>= 16;
+    tmp8 >>= 16;
+    tmp9 >>= 16;
+    tmp10 >>= 16;
+    tmp11 >>= 16;
+    tmp12 >>= 16;
+    tmp13 >>= 16;
+    tmp14 >>= 16;
+    tmp15 >>= 16;
+    vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+    vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+    vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
+    vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
+    vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
+    vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    dst0 *= vec_int_sz;
+    dst1 *= vec_int_sz;
+    dst2 *= vec_int_sz;
+    dst3 *= vec_int_sz;
+    dst0 += vec_int_ofst;
+    dst1 += vec_int_ofst;
+    dst2 += vec_int_ofst;
+    dst3 += vec_int_ofst;
+    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
+    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
+    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
+    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    dst_argb += 64;
+  }
+}
+
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const int8_t* matrix_argb,
+                            int width) {
+  int32_t x;
+  v16i8 src0;
+  v16u8 src1, src2, dst0, dst1;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  v16i8 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  src0 = __msa_ld_b((v16i8*)matrix_argb, 0);
+  vec0 = (v8i16)__msa_ilvr_b(zero, src0);
+  vec1 = (v8i16)__msa_ilvl_b(zero, src0);
+
+  for (x = 0; x < width; x += 8) {
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
+    vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
+    vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
+    vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
+    vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
+    vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
+    vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
+    vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
+    vec10 = vec2 * vec0;
+    vec11 = vec2 * vec1;
+    vec12 = vec6 * vec0;
+    vec13 = vec6 * vec1;
+    tmp0 = __msa_hadd_s_w(vec10, vec10);
+    tmp1 = __msa_hadd_s_w(vec11, vec11);
+    tmp2 = __msa_hadd_s_w(vec12, vec12);
+    tmp3 = __msa_hadd_s_w(vec13, vec13);
+    vec14 = vec3 * vec0;
+    vec15 = vec3 * vec1;
+    vec16 = vec7 * vec0;
+    vec17 = vec7 * vec1;
+    tmp4 = __msa_hadd_s_w(vec14, vec14);
+    tmp5 = __msa_hadd_s_w(vec15, vec15);
+    tmp6 = __msa_hadd_s_w(vec16, vec16);
+    tmp7 = __msa_hadd_s_w(vec17, vec17);
+    vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+    vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+    tmp0 = __msa_hadd_s_w(vec10, vec10);
+    tmp1 = __msa_hadd_s_w(vec11, vec11);
+    tmp2 = __msa_hadd_s_w(vec12, vec12);
+    tmp3 = __msa_hadd_s_w(vec13, vec13);
+    tmp0 = __msa_srai_w(tmp0, 6);
+    tmp1 = __msa_srai_w(tmp1, 6);
+    tmp2 = __msa_srai_w(tmp2, 6);
+    tmp3 = __msa_srai_w(tmp3, 6);
+    vec2 = vec4 * vec0;
+    vec6 = vec4 * vec1;
+    vec3 = vec8 * vec0;
+    vec7 = vec8 * vec1;
+    tmp8 = __msa_hadd_s_w(vec2, vec2);
+    tmp9 = __msa_hadd_s_w(vec6, vec6);
+    tmp10 = __msa_hadd_s_w(vec3, vec3);
+    tmp11 = __msa_hadd_s_w(vec7, vec7);
+    vec4 = vec5 * vec0;
+    vec8 = vec5 * vec1;
+    vec5 = vec9 * vec0;
+    vec9 = vec9 * vec1;
+    tmp12 = __msa_hadd_s_w(vec4, vec4);
+    tmp13 = __msa_hadd_s_w(vec8, vec8);
+    tmp14 = __msa_hadd_s_w(vec5, vec5);
+    tmp15 = __msa_hadd_s_w(vec9, vec9);
+    vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
+    vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
+    vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
+    vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
+    tmp4 = __msa_hadd_s_w(vec14, vec14);
+    tmp5 = __msa_hadd_s_w(vec15, vec15);
+    tmp6 = __msa_hadd_s_w(vec16, vec16);
+    tmp7 = __msa_hadd_s_w(vec17, vec17);
+    tmp4 = __msa_srai_w(tmp4, 6);
+    tmp5 = __msa_srai_w(tmp5, 6);
+    tmp6 = __msa_srai_w(tmp6, 6);
+    tmp7 = __msa_srai_w(tmp7, 6);
+    vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+    vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+    vec10 = __msa_maxi_s_h(vec10, 0);
+    vec11 = __msa_maxi_s_h(vec11, 0);
+    vec12 = __msa_maxi_s_h(vec12, 0);
+    vec13 = __msa_maxi_s_h(vec13, 0);
+    vec10 = __msa_min_s_h(vec10, max);
+    vec11 = __msa_min_s_h(vec11, max);
+    vec12 = __msa_min_s_h(vec12, max);
+    vec13 = __msa_min_s_h(vec13, max);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
+void SplitUVRow_MSA(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_u, 16);
+    ST_UB2(dst2, dst3, dst_v, 16);
+    src_uv += 64;
+    dst_u += 32;
+    dst_v += 32;
+  }
+}
+
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
+  int x;
+  v16u8 dst0 = (v16u8)__msa_fill_b(v8);
+
+  for (x = 0; x < width; x += 16) {
+    ST_UB(dst0, dst);
+    dst += 16;
+  }
+}
+
+void MirrorUVRow_MSA(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
+  v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
+
+  src_uv += (2 * width);
+
+  for (x = 0; x < width; x += 32) {
+    src_uv -= 64;
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16);
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48);
+    dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_v, 16);
+    ST_UB2(dst2, dst3, dst_u, 16);
+    dst_u += 32;
+    dst_v += 32;
+  }
+}
+
+void SobelXRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   const uint8_t* src_y2,
+                   uint8_t* dst_sobelx,
+                   int32_t width) {
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, dst0;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
+  v16i8 tmp = __msa_ldi_b(8);
+  v16i8 mask1 = mask0 + tmp;
+  v8i16 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 16);
+    src4 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 0);
+    src5 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 16);
+    vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+    vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
+    vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
+    vec0 += vec2;
+    vec1 += vec3;
+    vec4 += vec2;
+    vec5 += vec3;
+    vec0 += vec4;
+    vec1 += vec5;
+    vec0 = __msa_add_a_h(zero, vec0);
+    vec1 = __msa_add_a_h(zero, vec1);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_sobelx);
+    src_y0 += 16;
+    src_y1 += 16;
+    src_y2 += 16;
+    dst_sobelx += 16;
+  }
+}
+
+void SobelYRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   uint8_t* dst_sobely,
+                   int32_t width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+  v8i16 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6[0] = src_y0[16] - src_y1[16];
+    vec6[1] = src_y0[17] - src_y1[17];
+    vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
+    vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
+    vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
+    vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
+    vec0 += vec2;
+    vec1 += vec3;
+    vec4 += vec2;
+    vec5 += vec3;
+    vec0 += vec4;
+    vec1 += vec5;
+    vec0 = __msa_add_a_h(zero, vec0);
+    vec1 = __msa_add_a_h(zero, vec1);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_sobely);
+    src_y0 += 16;
+    src_y1 += 16;
+    dst_sobely += 16;
+  }
+}
+
+void HalfFloatRow_MSA(const uint16_t* src,
+                      uint16_t* dst,
+                      float scale,
+                      int width) {
+  int i;
+  v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
+  v4f32 mult_vec;
+  v8i16 zero = {0};
+  mult_vec[0] = 1.9259299444e-34f * scale;
+  mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
+
+  for (i = 0; i < width; i += 32) {
+    src0 = (v8u16)__msa_ld_h((v8i16*)src, 0);
+    src1 = (v8u16)__msa_ld_h((v8i16*)src, 16);
+    src2 = (v8u16)__msa_ld_h((v8i16*)src, 32);
+    src3 = (v8u16)__msa_ld_h((v8i16*)src, 48);
+    vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
+    vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
+    vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
+    vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
+    vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
+    vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
+    vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
+    vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
+    fvec0 = __msa_ffint_u_w(vec0);
+    fvec1 = __msa_ffint_u_w(vec1);
+    fvec2 = __msa_ffint_u_w(vec2);
+    fvec3 = __msa_ffint_u_w(vec3);
+    fvec4 = __msa_ffint_u_w(vec4);
+    fvec5 = __msa_ffint_u_w(vec5);
+    fvec6 = __msa_ffint_u_w(vec6);
+    fvec7 = __msa_ffint_u_w(vec7);
+    fvec0 *= mult_vec;
+    fvec1 *= mult_vec;
+    fvec2 *= mult_vec;
+    fvec3 *= mult_vec;
+    fvec4 *= mult_vec;
+    fvec5 *= mult_vec;
+    fvec6 *= mult_vec;
+    fvec7 *= mult_vec;
+    vec0 = ((v4u32)fvec0) >> 13;
+    vec1 = ((v4u32)fvec1) >> 13;
+    vec2 = ((v4u32)fvec2) >> 13;
+    vec3 = ((v4u32)fvec3) >> 13;
+    vec4 = ((v4u32)fvec4) >> 13;
+    vec5 = ((v4u32)fvec5) >> 13;
+    vec6 = ((v4u32)fvec6) >> 13;
+    vec7 = ((v4u32)fvec7) >> 13;
+    dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
+    dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    ST_UH2(dst0, dst1, dst, 8);
+    ST_UH2(dst2, dst3, dst + 16, 8);
+    src += 32;
+    dst += 32;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc
index 909df060c6..ff87e74c62 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc
@@ -10,6 +10,8 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -20,1446 +22,1311 @@ extern "C" {
     !defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.32    {d2[1]}, [%2]!                 \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vzip.u8    d2, d3                         \n"
+#define READYUV422                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vld1.32    {d2[0]}, [%1]!                 \n" \
+  "vld1.32    {d2[1]}, [%2]!                 \n"
 
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.8     {d3}, [%2]!                    \n"                             \
-    "vpaddl.u8  q1, q1                         \n"                             \
-    "vrshrn.u16 d2, q1, #1                     \n"
+#define READYUV444                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vld1.8     {d2}, [%1]!                    \n" \
+  "vld1.8     {d3}, [%2]!                    \n" \
+  "vpaddl.u8  q1, q1                         \n" \
+  "vrshrn.u16 d2, q1, #1                     \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    "vmov.u8    d2, #128                       \n"
+#define READYUV400                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+  "vld1.8     {d0}, [%0]!                    \n"                               \
+  "vld1.8     {d2}, [%1]!                    \n"                               \
+  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
+  "vuzp.u8    d2, d3                         \n"                               \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d3, d2                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+  "vld1.8     {d0}, [%0]!                    \n"                               \
+  "vld1.8     {d2}, [%1]!                    \n"                               \
+  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
+  "vuzp.u8    d3, d2                         \n"                               \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READYUY2                                 \
+  "vld2.8     {d0, d2}, [%0]!                \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
-    "vmov.u8    d0, d3                         \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READUYVY                                 \
+  "vld2.8     {d2, d3}, [%0]!                \n" \
+  "vmov.u8    d0, d3                         \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
-#define YUVTORGB_SETUP                                                         \
-    MEMACCESS([kUVToRB])                                                       \
-    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
-    MEMACCESS([kUVToG])                                                        \
-    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
-    MEMACCESS([kYToRgb])                                                       \
-    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
+#define YUVTORGB_SETUP                             \
+  "vld1.8     {d24}, [%[kUVToRB]]            \n"   \
+  "vld1.8     {d25}, [%[kUVToG]]             \n"   \
+  "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+  "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n" \
+  "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n" \
+  "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
 
-#define YUVTORGB                                                               \
-    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
-    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
-    "vmovl.u8   q0, d0                         \n" /* Y                      */\
-    "vmovl.s16  q10, d1                        \n"                             \
-    "vmovl.s16  q0, d0                         \n"                             \
-    "vmul.s32   q10, q10, q15                  \n"                             \
-    "vmul.s32   q0, q0, q15                    \n"                             \
-    "vqshrun.s32 d0, q0, #16                   \n"                             \
-    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
-    "vadd.s16   d18, d19                       \n"                             \
-    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
-    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
-    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
-    "vaddw.u16  q1, q1, d16                    \n"                             \
-    "vaddw.u16  q10, q10, d17                  \n"                             \
-    "vaddw.u16  q3, q3, d18                    \n"                             \
-    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
-    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
-    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
-    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
-    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
-    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
-    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
-    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
-    "vqshrun.s16 d21, q0, #6                   \n" /* G */
+#define YUVTORGB                                                              \
+  "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */ \
+  "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */ \
+  "vmovl.u8   q0, d0                         \n" /* Y                      */ \
+  "vmovl.s16  q10, d1                        \n"                              \
+  "vmovl.s16  q0, d0                         \n"                              \
+  "vmul.s32   q10, q10, q15                  \n"                              \
+  "vmul.s32   q0, q0, q15                    \n"                              \
+  "vqshrun.s32 d0, q0, #16                   \n"                              \
+  "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */ \
+  "vadd.s16   d18, d19                       \n"                              \
+  "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */ \
+  "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */ \
+  "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/ \
+  "vaddw.u16  q1, q1, d16                    \n"                              \
+  "vaddw.u16  q10, q10, d17                  \n"                              \
+  "vaddw.u16  q3, q3, d18                    \n"                              \
+  "vqadd.s16  q8, q0, q13                    \n" /* B */                      \
+  "vqadd.s16  q9, q0, q14                    \n" /* R */                      \
+  "vqadd.s16  q0, q0, q4                     \n" /* G */                      \
+  "vqadd.s16  q8, q8, q1                     \n" /* B */                      \
+  "vqadd.s16  q9, q9, q10                    \n" /* R */                      \
+  "vqsub.s16  q0, q0, q3                     \n" /* G */                      \
+  "vqshrun.s16 d20, q8, #6                   \n" /* B */                      \
+  "vqshrun.s16 d22, q9, #6                   \n" /* R */                      \
+  "vqshrun.s16 d21, q0, #6                   \n" /* G */
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n" READYUV444 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_argb),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_argb),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             const uint8* src_a,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %5, %5, #8                     \n"
-    MEMACCESS(3)
-    "vld1.8     {d23}, [%3]!                   \n"
-    MEMACCESS(4)
-    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(src_a),     // %3
-      "+r"(dst_argb),  // %4
-      "+r"(width)      // %5
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %5, %5, #8                     \n"
+      "vld1.8     {d23}, [%3]!                   \n"
+      "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(src_a),     // %3
+        "+r"(dst_argb),  // %4
+        "+r"(width)      // %5
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vmov.u8    d19, #255                      \n"  // YUVTORGB modified d19
+      "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_rgba),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
-    MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_rgba),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),      // %0
-      "+r"(src_u),      // %1
-      "+r"(src_v),      // %2
-      "+r"(dst_rgb24),  // %3
-      "+r"(width)       // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vst3.8     {d20, d21, d22}, [%3]!         \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_u),      // %1
+        "+r"(src_v),      // %2
+        "+r"(dst_rgb24),  // %3
+        "+r"(width)       // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-#define ARGBTORGB565                                                           \
-    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \
-    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \
-    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */
+#define ARGBTORGB565                                                        \
+  "vshll.u8    q0, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q8, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q9, d20, #8                   \n" /* B                    */ \
+  "vsri.16     q0, q8, #5                    \n" /* RG                   */ \
+  "vsri.16     q0, q9, #11                   \n" /* RGB                  */
 
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n" ARGBTORGB565
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+      "bgt        1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_u),       // %1
+        "+r"(src_v),       // %2
+        "+r"(dst_rgb565),  // %3
+        "+r"(width)        // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-#define ARGBTOARGB1555                                                         \
-    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \
-    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \
-    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \
-    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \
-    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */
+#define ARGBTOARGB1555                                                      \
+  "vshll.u8    q0, d23, #8                   \n" /* A                    */ \
+  "vshll.u8    q8, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q9, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q10, d20, #8                  \n" /* B                    */ \
+  "vsri.16     q0, q8, #1                    \n" /* AR                   */ \
+  "vsri.16     q0, q9, #6                    \n" /* ARG                  */ \
+  "vsri.16     q0, q10, #11                  \n" /* ARGB                 */
 
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vmov.u8    d23, #255                      \n" ARGBTOARGB1555
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels
+      "bgt        1b                             \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb1555),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-#define ARGBTOARGB4444                                                         \
-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+#define ARGBTOARGB4444                                                      \
+  "vshr.u8    d20, d20, #4                   \n" /* B                    */ \
+  "vbic.32    d21, d21, d4                   \n" /* G                    */ \
+  "vshr.u8    d22, d22, #4                   \n" /* R                    */ \
+  "vbic.32    d23, d23, d4                   \n" /* A                    */ \
+  "vorr       d0, d20, d21                   \n" /* BG                   */ \
+  "vorr       d1, d22, d23                   \n" /* RA                   */ \
+  "vzip.u8    d0, d1                         \n" /* BGRA                 */
 
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB4444
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb4444),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d4, #0x0f                      \n"  // vbic bits to clear
+      "1:                                        \n"
+
+      READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vmov.u8    d23, #255                      \n" ARGBTOARGB4444
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels
+      "bgt        1b                             \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb4444),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV400
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
-      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
-      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
-      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n" READYUV400 YUVTORGB
+      "subs       %2, %2, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
+        [kUVToG] "r"(&kYuvI601Constants.kUVToG),
+        [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
+        [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d20}, [%0]!                   \n"
-    "vmov       d21, d20                       \n"
-    "vmov       d22, d20                       \n"
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "d20", "d21", "d22", "d23"
-  );
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n"
+      "vld1.8     {d20}, [%0]!                   \n"
+      "vmov       d21, d20                       \n"
+      "vmov       d22, d20                       \n"
+      "subs       %2, %2, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d20", "d21", "d22", "d23");
 }
 
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READNV12 YUVTORGB
+               "subs       %3, %3, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_y),     // %0
+                 "+r"(src_uv),    // %1
+                 "+r"(dst_argb),  // %2
+                 "+r"(width)      // %3
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READNV21
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_vu),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READNV21 YUVTORGB
+               "subs       %3, %3, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_y),     // %0
+                 "+r"(src_vu),    // %1
+                 "+r"(dst_argb),  // %2
+                 "+r"(width)      // %3
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+
+      YUVTORGB_SETUP
+
+      "1:                                        \n"
+
+      READNV12 YUVTORGB
+      "subs       %3, %3, #8                     \n"
+      "vst3.8     {d20, d21, d22}, [%2]!         \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_uv),     // %1
+        "+r"(dst_rgb24),  // %2
+        "+r"(width)       // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+
+      YUVTORGB_SETUP
+
+      "1:                                        \n"
+
+      READNV21 YUVTORGB
+      "subs       %3, %3, #8                     \n"
+      "vst3.8     {d20, d21, d22}, [%2]!         \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_rgb24),  // %2
+        "+r"(width)       // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READNV12 YUVTORGB
+      "subs       %3, %3, #8                     \n" ARGBTORGB565
+      "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+      "bgt        1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_uv),      // %1
+        "+r"(dst_rgb565),  // %2
+        "+r"(width)        // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUY2
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_yuy2),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READYUY2 YUVTORGB
+               "subs       %2, %2, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_yuy2),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READUYVY
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_uyvy),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READUYVY YUVTORGB
+               "subs       %2, %2, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_uyvy),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
-    MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
-    "bgt        1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q0}, [%1]!                    \n"  // store U
+      "vst1.8     {q1}, [%2]!                    \n"  // store V
+      "bgt        1b                             \n"
+      : "+r"(src_uv),               // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
-    "bgt        1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load U
+      "vld1.8     {q1}, [%1]!                    \n"  // load V
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop
+      "vst2.8     {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+      "bgt        1b                             \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
+}
+
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB
+      "vld3.8     {d1, d3, d5}, [%0]!            \n"  // next 8 RGB
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q0}, [%1]!                    \n"  // store R
+      "vst1.8     {q1}, [%2]!                    \n"  // store G
+      "vst1.8     {q2}, [%3]!                    \n"  // store B
+      "bgt        1b                             \n"
+      : "+r"(src_rgb),                    // %0
+        "+r"(dst_r),                      // %1
+        "+r"(dst_g),                      // %2
+        "+r"(dst_b),                      // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "d0", "d1", "d2"  // Clobber List
+      );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load R
+      "vld1.8     {q1}, [%1]!                    \n"  // load G
+      "vld1.8     {q2}, [%2]!                    \n"  // load B
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop
+      "vst3.8     {d0, d2, d4}, [%3]!            \n"  // store 8 RGB
+      "vst3.8     {d1, d3, d5}, [%3]!            \n"  // next 8 RGB
+      "bgt        1b                             \n"
+      : "+r"(src_r),                      // %0
+        "+r"(src_g),                      // %1
+        "+r"(src_b),                      // %2
+        "+r"(dst_rgb),                    // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+      );
 }
 
 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
-    "subs       %2, %2, #32                    \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+      "subs       %2, %2, #32                    \n"  // 32 processed per loop
+      "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+      "bgt        1b                             \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "q0"
-  );
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
+      "1:                                        \n"
+      "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+      "vst1.8    {q0}, [%0]!                     \n"  // store
+      "bgt       1b                              \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v8)      // %2
+      : "cc", "memory", "q0");
 }
 
-// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "q0"
-  );
+// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+      "1:                                        \n"
+      "subs      %1, %1, #4                      \n"  // 4 pixels per loop
+      "vst1.8    {q0}, [%0]!                     \n"  // store
+      "bgt       1b                              \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v32)     // %2
+      : "cc", "memory", "q0");
 }
 
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov        r3, #-16                       \n"
+      "add        %0, %0, %2                     \n"
+      "sub        %0, #16                        \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+      "subs       %2, #16                        \n"  // 16 pixels per loop.
+      "vrev64.8   q0, q0                         \n"
+      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+      "vst1.8     {d0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "r3", "q0");
 }
 
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
                       int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r12, #-16                      \n"
-    "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
+  asm volatile(
+      // Start at end of source row.
+      "mov        r12, #-16                      \n"
+      "add        %0, %0, %3, lsl #1             \n"
+      "sub        %0, #16                        \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
-  );
+      "1:                                        \n"
+      "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
+      "subs       %3, #8                         \n"  // 8 pixels per loop.
+      "vrev64.8   q0, q0                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+      "vst1.8     {d1}, [%2]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "cc", "memory", "r12", "q0");
 }
 
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov        r3, #-16                       \n"
+      "add        %0, %0, %2, lsl #2             \n"
+      "sub        %0, #16                        \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+      "subs       %2, #4                         \n"  // 4 pixels per loop.
+      "vrev64.32  q0, q0                         \n"
+      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+      "vst1.8     {d0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "r3", "q0");
 }
 
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "vmov.u8    d4, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d4, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vswp.u8    d1, d3                         \n"  // swap R, B
+      "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
-  );
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vswp.u8    d1, d3                         \n"  // swap R, B
+      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of
+                                                      // RGB24.
+      "bgt        1b                             \n"
+      : "+r"(src_raw),    // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
+      );
 }
 
-#define RGB565TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+#define RGB565TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
+  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
+  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
 
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_argb),    // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-#define ARGB1555TOARGB                                                         \
-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+#define ARGB1555TOARGB                                                      \
+  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx           */ \
+  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR           */ \
+  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB           */ \
+  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A           */ \
+  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8   */ \
+  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5   */ \
+  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
+  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
+  "vorr.u8    q1, q1, q3                     \n" /* R,A                  */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,G                  */
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+#define RGB555TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
+  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5   */ \
+  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
 
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-#define ARGB4444TOARGB                                                         \
-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+#define ARGB4444TOARGB                                                      \
+  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA         */ \
+  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000         */ \
+  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG         */ \
+  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB         */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB         */ \
+  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000         */ \
+  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG         */ \
+  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A   */
 
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+      );
 }
 
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of
+                                                      // RGB24.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vswp.u8    d1, d3                         \n"  // swap R, B
+      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_raw),   // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+      "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+      "bgt        1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+      "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+      "bgt        1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+      "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+      "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+      "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+      );
 }
 
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(stride_yuy2),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+      "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+      "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+      );
 }
 
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(stride_uyvy),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // stride + src_yuy2
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+      "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
+      "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
+      "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+      "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+      "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_yuy2),     // %0
+        "+r"(stride_yuy2),  // %1
+        "+r"(dst_u),        // %2
+        "+r"(dst_v),        // %3
+        "+r"(width)         // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List
+      );
+}
+
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // stride + src_uyvy
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+      "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
+      "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
+      "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+      "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_uyvy),     // %0
+        "+r"(stride_uyvy),  // %1
+        "+r"(dst_u),        // %2
+        "+r"(dst_v),        // %3
+        "+r"(width)         // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List
+      );
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "vld1.8     {q2}, [%3]                     \n"  // shuffler
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+      "subs       %2, %2, #4                     \n"  // 4 processed per loop
+      "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+      "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+      "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),                   // %0
+        "+r"(dst_argb),                   // %1
+        "+r"(width)                       // %2
+      : "r"(shuffler)                     // %3
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+      );
 }
 
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+      "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+      "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+      "subs       %4, %4, #16                    \n"  // 16 pixels
+      "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3");
 }
 
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+      "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+      "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+      "subs       %4, %4, #16                    \n"  // 16 pixels
+      "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3");
 }
 
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGBTORGB565
+      "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),    // %0
+        "+r"(dst_rgb565),  // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vdup.32    d2, %2                         \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d20, d20, d2                   \n"
-    "vqadd.u8   d21, d21, d2                   \n"
-    "vqadd.u8   d22, d22, d2                   \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
-  );
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "vdup.32    d2, %2                         \n"  // dither4
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   d20, d20, d2                   \n"
+      "vqadd.u8   d21, d21, d2                   \n"
+      "vqadd.u8   d22, d22, d2                   \n"  // add for dither
+      ARGBTORGB565
+      "vst1.8     {q0}, [%0]!                    \n"  // store 8 RGB565.
+      "bgt        1b                             \n"
+      : "+r"(dst_rgb)   // %0
+      : "r"(src_argb),  // %1
+        "r"(dither4),   // %2
+        "r"(width)      // %3
+      : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
                             int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGBTOARGB1555
+      "vst1.8     {q0}, [%1]!                    \n"  // store 8 ARGB1555.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb1555),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
                             int width) {
-  asm volatile (
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
+  asm volatile(
+      "vmov.u8    d4, #0x0f                      \n"  // bits to clear with
+                                                      // vbic.
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGBTOARGB4444
+      "vst1.8     {q0}, [%1]!                    \n"  // store 8 ARGB4444.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb4444),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
-    "bgt       1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_a),      // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
+      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
+      "bgt       1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+      "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+      "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
-    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
-    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
-    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
-    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlsl.u8   q2, d1, d25                    \n"  // G
-    "vmlsl.u8   q2, d2, d26                    \n"  // R
-    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+  asm volatile(
+      "vmov.u8    d24, #112                      \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+      "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+      "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+      "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlsl.u8   q2, d1, d25                    \n"  // G
+      "vmlsl.u8   q2, d2, d26                    \n"  // R
+      "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
 
-    "vmull.u8   q3, d2, d24                    \n"  // R
-    "vmlsl.u8   q3, d1, d28                    \n"  // G
-    "vmlsl.u8   q3, d0, d27                    \n"  // B
-    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+      "vmull.u8   q3, d2, d24                    \n"  // R
+      "vmlsl.u8   q3, d1, d28                    \n"  // G
+      "vmlsl.u8   q3, d0, d27                    \n"  // B
+      "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
 
-    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+      "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
 
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
-  );
-}
-
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
-    "vpadd.u16  d1, d8, d9                     \n"  // B
-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
-    "vpadd.u16  d3, d10, d11                   \n"  // G
-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
-    "vpadd.u16  d5, d12, d13                   \n"  // R
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+        "q15");
 }
 
+// clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "vmul.s16   q8, " #QB ", q10               \n" /* B                    */ \
+  "vmls.s16   q8, " #QG ", q11               \n" /* G                    */ \
+  "vmls.s16   q8, " #QR ", q12               \n" /* R                    */ \
+  "vadd.u16   q8, q8, q15                    \n" /* +128 -> unsigned     */ \
+  "vmul.s16   q9, " #QR ", q10               \n" /* R                    */ \
+  "vmls.s16   q9, " #QG ", q14               \n" /* G                    */ \
+  "vmls.s16   q9, " #QB ", q13               \n" /* B                    */ \
+  "vadd.u16   q9, q9, q15                    \n" /* +128 -> unsigned     */ \
+  "vqshrn.u16  d0, q8, #8                    \n" /* 16 bit to 8 bit U    */ \
+  "vqshrn.u16  d1, q9, #8                    \n" /* 16 bit to 8 bit V    */
+// clang-format on
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1468,17 +1335,13 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1490,9 +1353,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1507,8 +1368,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
 }
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
@@ -1517,17 +1381,13 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
     "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
     "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1539,9 +1399,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1555,8 +1413,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_bgra
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1565,17 +1426,13 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
     "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
     "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1587,9 +1444,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q3, q2, q1)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -1603,8 +1458,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_abgr
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1613,17 +1471,13 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1635,9 +1489,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -1651,8 +1503,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgba
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1661,17 +1516,13 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
     "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
     "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1683,9 +1534,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -1699,8 +1548,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1709,17 +1561,13 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
-    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
-    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1731,9 +1579,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -1747,8 +1593,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_raw
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1757,17 +1606,13 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
-    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
-    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1779,9 +1624,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_raw),  // %0
@@ -1796,875 +1639,815 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_argb
+      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+      "vrshr.u16  q5, q5, #1                     \n"
+      "vrshr.u16  q6, q6, #1                     \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_stride_rgb565),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+      "vmul.s16   q8, q4, q10                    \n"  // B
+      "vmls.s16   q8, q5, q11                    \n"  // G
+      "vmls.s16   q8, q6, q12                    \n"  // R
+      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+      "vmul.s16   q9, q6, q10                    \n"  // R
+      "vmls.s16   q9, q5, q14                    \n"  // G
+      "vmls.s16   q9, q4, q13                    \n"  // B
+      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb565),         // %0
+        "+r"(src_stride_rgb565),  // %1
+        "+r"(dst_u),              // %2
+        "+r"(dst_v),              // %3
+        "+r"(width)               // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_argb
+      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+      "vrshr.u16  q5, q5, #1                     \n"
+      "vrshr.u16  q6, q6, #1                     \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_stride_argb1555),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+      "vmul.s16   q8, q4, q10                    \n"  // B
+      "vmls.s16   q8, q5, q11                    \n"  // G
+      "vmls.s16   q8, q6, q12                    \n"  // R
+      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+      "vmul.s16   q9, q6, q10                    \n"  // R
+      "vmls.s16   q9, q5, q14                    \n"  // G
+      "vmls.s16   q9, q4, q13                    \n"  // B
+      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_argb1555),         // %0
+        "+r"(src_stride_argb1555),  // %1
+        "+r"(dst_u),                // %2
+        "+r"(dst_v),                // %3
+        "+r"(width)                 // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_argb
+      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+      "vrshr.u16  q5, q5, #1                     \n"
+      "vrshr.u16  q6, q6, #1                     \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_stride_argb4444),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+      "vmul.s16   q8, q4, q10                    \n"  // B
+      "vmls.s16   q8, q5, q11                    \n"  // G
+      "vmls.s16   q8, q6, q12                    \n"  // R
+      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+      "vmul.s16   q9, q6, q10                    \n"  // R
+      "vmls.s16   q9, q5, q14                    \n"  // G
+      "vmls.s16   q9, q4, q13                    \n"  // B
+      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_argb4444),         // %0
+        "+r"(src_stride_argb4444),  // %1
+        "+r"(dst_u),                // %2
+        "+r"(dst_v),                // %3
+        "+r"(width)                 // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
 }
 
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
 }
 
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
 }
 
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // R
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d1, d4                     \n"  // R
+      "vmlal.u8   q8, d2, d5                     \n"  // G
+      "vmlal.u8   q8, d3, d6                     \n"  // B
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // R
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d0, d4                     \n"  // R
+      "vmlal.u8   q8, d1, d5                     \n"  // G
+      "vmlal.u8   q8, d2, d6                     \n"  // B
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // B
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d1, d4                     \n"  // B
+      "vmlal.u8   q8, d2, d5                     \n"  // G
+      "vmlal.u8   q8, d3, d6                     \n"  // R
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d0, d4                     \n"  // B
+      "vmlal.u8   q8, d1, d5                     \n"  // G
+      "vmlal.u8   q8, d2, d6                     \n"  // R
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_y),      // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d0, d4                     \n"  // B
+      "vmlal.u8   q8, d1, d5                     \n"  // G
+      "vmlal.u8   q8, d2, d6                     \n"  // R
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
-  asm volatile (
-    "cmp        %4, #0                         \n"
-    "beq        100f                           \n"
-    "add        %2, %1                         \n"
-    "cmp        %4, #128                       \n"
-    "beq        50f                            \n"
+  asm volatile(
+      "cmp        %4, #0                         \n"
+      "beq        100f                           \n"
+      "add        %2, %1                         \n"
+      "cmp        %4, #128                       \n"
+      "beq        50f                            \n"
 
-    "vdup.8     d5, %4                         \n"
-    "rsb        %4, #256                       \n"
-    "vdup.8     d4, %4                         \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vmull.u8   q13, d0, d4                    \n"
-    "vmull.u8   q14, d1, d4                    \n"
-    "vmlal.u8   q13, d2, d5                    \n"
-    "vmlal.u8   q14, d3, d5                    \n"
-    "vrshrn.u16 d0, q13, #8                    \n"
-    "vrshrn.u16 d1, q14, #8                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        1b                             \n"
-    "b          99f                            \n"
+      "vdup.8     d5, %4                         \n"
+      "rsb        %4, #256                       \n"
+      "vdup.8     d4, %4                         \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "vld1.8     {q0}, [%1]!                    \n"
+      "vld1.8     {q1}, [%2]!                    \n"
+      "subs       %3, %3, #16                    \n"
+      "vmull.u8   q13, d0, d4                    \n"
+      "vmull.u8   q14, d1, d4                    \n"
+      "vmlal.u8   q13, d2, d5                    \n"
+      "vmlal.u8   q14, d3, d5                    \n"
+      "vrshrn.u16 d0, q13, #8                    \n"
+      "vrshrn.u16 d1, q14, #8                    \n"
+      "vst1.8     {q0}, [%0]!                    \n"
+      "bgt        1b                             \n"
+      "b          99f                            \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        50b                            \n"
-    "b          99f                            \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "vld1.8     {q0}, [%1]!                    \n"
+      "vld1.8     {q1}, [%2]!                    \n"
+      "subs       %3, %3, #16                    \n"
+      "vrhadd.u8  q0, q1                         \n"
+      "vst1.8     {q0}, [%0]!                    \n"
+      "bgt        50b                            \n"
+      "b          99f                            \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        100b                           \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "vld1.8     {q0}, [%1]!                    \n"
+      "subs       %3, %3, #16                    \n"
+      "vst1.8     {q0}, [%0]!                    \n"
+      "bgt        100b                           \n"
 
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction)       // %4
-  :
-  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
-  );
+      "99:                                       \n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(src_stride),  // %2
+        "+r"(dst_width),   // %3
+        "+r"(y1_fraction)  // %4
+      :
+      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
 }
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %3, #8                         \n"
-    "blt        89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
-    "bge        8b                             \n"
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "subs       %3, #8                         \n"
+      "blt        89f                            \n"
+      // Blend 8 pixels.
+      "8:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q10, d4, d3                    \n"  // db * a
+      "vmull.u8   q11, d5, d3                    \n"  // dg * a
+      "vmull.u8   q12, d6, d3                    \n"  // dr * a
+      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+      "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+      "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+      "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+      "vqadd.u8   d2, d2, d6                     \n"  // + sr
+      "vmov.u8    d3, #255                       \n"  // a = 255
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+      "bge        8b                             \n"
 
-  "89:                                         \n"
-    "adds       %3, #8-1                       \n"
-    "blt        99f                            \n"
+      "89:                                       \n"
+      "adds       %3, #8-1                       \n"
+      "blt        99f                            \n"
 
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
-    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
-    "bge        1b                             \n"
+      // Blend 1 pixels.
+      "1:                                        \n"
+      "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+      "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+      "subs       %3, %3, #1                     \n"    // 1 processed per loop.
+      "vmull.u8   q10, d4, d3                    \n"    // db * a
+      "vmull.u8   q11, d5, d3                    \n"    // dg * a
+      "vmull.u8   q12, d6, d3                    \n"    // dr * a
+      "vqrshrn.u16 d20, q10, #8                  \n"    // db >>= 8
+      "vqrshrn.u16 d21, q11, #8                  \n"    // dg >>= 8
+      "vqrshrn.u16 d22, q12, #8                  \n"    // dr >>= 8
+      "vqsub.u8   q2, q2, q10                    \n"    // dbg - dbg * a / 256
+      "vqsub.u8   d6, d6, d22                    \n"    // dr - dr * a / 256
+      "vqadd.u8   q0, q0, q2                     \n"    // + sbg
+      "vqadd.u8   d2, d2, d6                     \n"    // + sr
+      "vmov.u8    d3, #255                       \n"    // a = 255
+      "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+      "bge        1b                             \n"
 
-  "99:                                         \n"
+      "99:                                         \n"
 
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
-  );
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
 }
 
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d0, d3                    \n"  // b * a
-    "vmull.u8   q11, d1, d3                    \n"  // g * a
-    "vmull.u8   q12, d2, d3                    \n"  // r * a
-    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
-    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
-    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
-  );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      // Attenuate 8 pixels.
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q10, d0, d3                    \n"  // b * a
+      "vmull.u8   q11, d1, d3                    \n"  // g * a
+      "vmull.u8   q12, d2, d3                    \n"  // r * a
+      "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+      "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+      "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
 }
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "vdup.u16   q8, %2                         \n"
-    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
-    "vdup.u16   q9, %3                         \n"  // interval multiply.
-    "vdup.u16   q10, %4                        \n"  // interval add
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "vdup.u16   q8, %2                         \n"
+      "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+      "vdup.u16   q9, %3                         \n"  // interval multiply.
+      "vdup.u16   q10, %4                        \n"  // interval add
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
-    "vmovl.u8   q1, d2                         \n"
-    "vmovl.u8   q2, d4                         \n"
-    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
-    "vqdmulh.s16 q1, q1, q8                    \n"  // g
-    "vqdmulh.s16 q2, q2, q8                    \n"  // r
-    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
-    "vmul.u16   q1, q1, q9                     \n"  // g
-    "vmul.u16   q2, q2, q9                     \n"  // r
-    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
-    "vadd.u16   q1, q1, q10                    \n"  // g
-    "vadd.u16   q2, q2, q10                    \n"  // r
-    "vqmovn.u16 d0, q0                         \n"
-    "vqmovn.u16 d2, q1                         \n"
-    "vqmovn.u16 d4, q2                         \n"
-    MEMACCESS(0)
-    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+      "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+      "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+      "vmovl.u8   q1, d2                         \n"
+      "vmovl.u8   q2, d4                         \n"
+      "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+      "vqdmulh.s16 q1, q1, q8                    \n"  // g
+      "vqdmulh.s16 q2, q2, q8                    \n"  // r
+      "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+      "vmul.u16   q1, q1, q9                     \n"  // g
+      "vmul.u16   q2, q2, q9                     \n"  // r
+      "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+      "vadd.u16   q1, q1, q10                    \n"  // g
+      "vadd.u16   q2, q2, q10                    \n"  // r
+      "vqmovn.u16 d0, q0                         \n"
+      "vqmovn.u16 d2, q1                         \n"
+      "vqmovn.u16 d4, q2                         \n"
+      "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
 }
 
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
-    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
-    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+      "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+      "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
-    "vmovl.u8   q11, d22                       \n"
-    "vmovl.u8   q12, d24                       \n"
-    "vmovl.u8   q13, d26                       \n"
-    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
-    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
-    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
-    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
-    "vqmovn.u16 d20, q10                       \n"
-    "vqmovn.u16 d22, q11                       \n"
-    "vqmovn.u16 d24, q12                       \n"
-    "vqmovn.u16 d26, q13                       \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+      "vmovl.u8   q11, d22                       \n"
+      "vmovl.u8   q12, d24                       \n"
+      "vmovl.u8   q13, d26                       \n"
+      "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+      "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+      "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+      "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+      "vqmovn.u16 d20, q10                       \n"
+      "vqmovn.u16 d22, q11                       \n"
+      "vqmovn.u16 d24, q12                       \n"
+      "vqmovn.u16 d26, q13                       \n"
+      "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
 }
 
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
 // Similar to ARGBToYJ but stores ARGB.
 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
-    "vmov       d1, d0                         \n"  // G
-    "vmov       d2, d0                         \n"  // R
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+      "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+      "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+      "vmov       d1, d0                         \n"  // G
+      "vmov       d2, d0                         \n"  // R
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
 //    b = (r * 35 + g * 68 + b * 17) >> 7
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d20, #17                       \n"  // BB coefficient
-    "vmov.u8    d21, #68                       \n"  // BG coefficient
-    "vmov.u8    d22, #35                       \n"  // BR coefficient
-    "vmov.u8    d24, #22                       \n"  // GB coefficient
-    "vmov.u8    d25, #88                       \n"  // GG coefficient
-    "vmov.u8    d26, #45                       \n"  // GR coefficient
-    "vmov.u8    d28, #24                       \n"  // BB coefficient
-    "vmov.u8    d29, #98                       \n"  // BG coefficient
-    "vmov.u8    d30, #50                       \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
-    "vmlal.u8   q2, d1, d21                    \n"  // G
-    "vmlal.u8   q2, d2, d22                    \n"  // R
-    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
-    "vmlal.u8   q3, d1, d25                    \n"  // G
-    "vmlal.u8   q3, d2, d26                    \n"  // R
-    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
-    "vmlal.u8   q8, d1, d29                    \n"  // G
-    "vmlal.u8   q8, d2, d30                    \n"  // R
-    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
-    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
-    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d20, #17                       \n"  // BB coefficient
+      "vmov.u8    d21, #68                       \n"  // BG coefficient
+      "vmov.u8    d22, #35                       \n"  // BR coefficient
+      "vmov.u8    d24, #22                       \n"  // GB coefficient
+      "vmov.u8    d25, #88                       \n"  // GG coefficient
+      "vmov.u8    d26, #45                       \n"  // GR coefficient
+      "vmov.u8    d28, #24                       \n"  // BB coefficient
+      "vmov.u8    d29, #98                       \n"  // BG coefficient
+      "vmov.u8    d30, #50                       \n"  // BR coefficient
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+      "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+      "vmlal.u8   q2, d1, d21                    \n"  // G
+      "vmlal.u8   q2, d2, d22                    \n"  // R
+      "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+      "vmlal.u8   q3, d1, d25                    \n"  // G
+      "vmlal.u8   q3, d2, d26                    \n"  // R
+      "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+      "vmlal.u8   q8, d1, d29                    \n"  // G
+      "vmlal.u8   q8, d2, d30                    \n"  // R
+      "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+      "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+      "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+      "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+        "q14", "q15");
 }
 
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
-    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
-    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
+      "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+      "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
-    "vmovl.u8   q9, d18                        \n"  // g
-    "vmovl.u8   q10, d20                       \n"  // r
-    "vmovl.u8   q11, d22                       \n"  // a
-    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
-    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
-    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
-    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
-    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
-    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
-    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
-    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
-    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
-    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
-    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
-    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
-    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
-    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
-    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
-    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
-    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "1:                                        \n"
+      "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+      "vmovl.u8   q9, d18                        \n"  // g
+      "vmovl.u8   q10, d20                       \n"  // r
+      "vmovl.u8   q11, d22                       \n"  // a
+      "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+      "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+      "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+      "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+      "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+      "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+      "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+      "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+      "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+      "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+      "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+      "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+      "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
+      "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
+      "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
+      "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
+      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+      "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+      "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+      "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+      "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+      "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+        "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q0, d0, d1                     \n"  // multiply B
+      "vmull.u8   q1, d2, d3                     \n"  // multiply G
+      "vmull.u8   q2, d4, d5                     \n"  // multiply R
+      "vmull.u8   q3, d6, d7                     \n"  // multiply A
+      "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+      "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+      "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+      "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+      "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+      "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2672,54 +2455,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+      "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   d0, d0, d1                     \n"  // add
+      "vmov.u8    d1, d0                         \n"
+      "vmov.u8    d2, d0                         \n"
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      // 16 pixel loop.
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+      "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+      "vqadd.u8   q0, q0, q1                     \n"  // add
+      "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2727,115 +2506,186 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+      "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   d1, d0, d2                     \n"  // add
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // SobelX as a matrix is
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
-    MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
-    "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2),            // %5
-    "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0],%5                  \n"  // top
+      "vld1.8     {d1}, [%0],%6                  \n"
+      "vsubl.u8   q0, d0, d1                     \n"
+      "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+      "vld1.8     {d3}, [%1],%6                  \n"
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+      "vld1.8     {d3}, [%2],%6                  \n"
+      "subs       %4, %4, #8                     \n"  // 8 pixels
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vabs.s16   q0, q0                         \n"
+      "vqmovn.u16 d0, q0                         \n"
+      "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+      "bgt        1b                             \n"
+      : "+r"(src_y0),               // %0
+        "+r"(src_y1),               // %1
+        "+r"(src_y2),               // %2
+        "+r"(dst_sobelx),           // %3
+        "+r"(width)                 // %4
+      : "r"(2),                     // %5
+        "r"(6)                      // %6
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
 // SobelY as a matrix is
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
-    "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1),            // %4
-    "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0],%4                  \n"  // left
+      "vld1.8     {d1}, [%1],%4                  \n"
+      "vsubl.u8   q0, d0, d1                     \n"
+      "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+      "vld1.8     {d3}, [%1],%4                  \n"
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vld1.8     {d2}, [%0],%5                  \n"  // right
+      "vld1.8     {d3}, [%1],%5                  \n"
+      "subs       %3, %3, #8                     \n"  // 8 pixels
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vabs.s16   q0, q0                         \n"
+      "vqmovn.u16 d0, q0                         \n"
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+      "bgt        1b                             \n"
+      : "+r"(src_y0),               // %0
+        "+r"(src_y1),               // %1
+        "+r"(dst_sobely),           // %2
+        "+r"(width)                 // %3
+      : "r"(1),                     // %4
+        "r"(6)                      // %5
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// the regoster must be d0 to d15 and indexed with [0] or [1] to access
+// the float in the first or second float of the d-reg
+
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float /*unused*/,
+                        int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u16  q2, d2                         \n"  // 8 int's
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
+      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+      "vqshrn.u32 d3, q3, #13                    \n"
+      "vst1.8     {q1}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),              // %0
+        "+r"(dst),              // %1
+        "+r"(width)             // %2
+      : "w"(1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u16  q2, d2                         \n"  // 8 int's
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
+      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+      "vqshrn.u32 d3, q3, #13                    \n"
+      "vst1.8     {q1}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),                      // %0
+        "+r"(dst),                      // %1
+        "+r"(width)                     // %2
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8     {d2}, [%0]!                    \n"  // load 8 bytes
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u8   q1, d2                         \n"  // 8 shorts
+      "vmovl.u16  q2, d2                         \n"  // 8 ints
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // scale
+      "vmul.f32   q3, q3, %y3                    \n"
+      "vst1.8     {q2, q3}, [%1]!                \n"  // store 8 floats
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc
index 6375d4f55f..24b4520bab 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc
@@ -19,118 +19,103 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.s}[0], [%1], #4            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.s}[1], [%2], #4            \n"
-
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.h}[0], [%1], #2            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v2.h}[1], [%2], #2            \n"                             \
-    "zip1       v1.8b, v2.8b, v2.8b            \n"
+#define READYUV422                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v1.s}[0], [%1], #4            \n" \
+  "ld1        {v1.s}[1], [%2], #4            \n"
 
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.d}[0], [%1], #8            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.d}[1], [%2], #8            \n"                             \
-    "uaddlp     v1.8h, v1.16b                  \n"                             \
-    "rshrn      v1.8b, v1.8h, #1               \n"
+#define READYUV444                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v1.d}[0], [%1], #8            \n" \
+  "ld1        {v1.d}[1], [%2], #8            \n" \
+  "uaddlp     v1.8h, v1.16b                  \n" \
+  "rshrn      v1.8b, v1.8h, #1               \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    "movi       v1.8b , #128                   \n"
+#define READYUV400                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "movi       v1.8b , #128                   \n"
 
 // Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READNV12                                 \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v2.8b}, [%1], #8              \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READNV21                                 \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v2.8b}, [%1], #8              \n" \
+  "uzp1       v3.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v1.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
-    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
-    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READYUY2                                 \
+  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" \
+  "uzp2       v3.8b, v1.8b, v1.8b            \n" \
+  "uzp1       v1.8b, v1.8b, v1.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
-    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READUYVY                                 \
+  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
+  "orr        v0.8b, v3.8b, v3.8b            \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
-#define YUVTORGB_SETUP                                                         \
-    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
-    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
-    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
-    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
+#define YUVTORGB_SETUP                           \
+  "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n" \
+  "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n" \
+  "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n" \
+  "ld1r       {v31.4s}, [%[kYToRgb]]         \n" \
+  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
 
-#define YUVTORGB(vR, vG, vB)                                                   \
-    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
-    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
-    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
-    "ushll      v0.4s, v0.4h, #0               \n"                             \
-    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
-    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
-    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
-    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
-    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
-    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
-    "uxtl       v2.8h, v2.8b                   \n"                             \
-    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
-    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
-    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
-    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
-    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
-    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
-    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
-    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
-    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
-    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
-    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
-    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
-    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
-    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
-    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
+#define YUVTORGB(vR, vG, vB)                                        \
+  "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */ \
+  "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */ \
+  "ushll2     v3.4s, v0.8h, #0               \n" /* Y */            \
+  "ushll      v0.4s, v0.4h, #0               \n"                    \
+  "mul        v3.4s, v3.4s, v31.4s           \n"                    \
+  "mul        v0.4s, v0.4s, v31.4s           \n"                    \
+  "sqshrun    v0.4h, v0.4s, #16              \n"                    \
+  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */            \
+  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */ \
+  "mov        v2.d[0], v1.d[1]               \n" /* Extract V */    \
+  "uxtl       v2.8h, v2.8b                   \n"                    \
+  "uxtl       v1.8h, v1.8b                   \n" /* Extract U */    \
+  "mul        v3.8h, v1.8h, v27.8h           \n"                    \
+  "mul        v5.8h, v1.8h, v29.8h           \n"                    \
+  "mul        v6.8h, v2.8h, v30.8h           \n"                    \
+  "mul        v7.8h, v2.8h, v28.8h           \n"                    \
+  "sqadd      v6.8h, v6.8h, v5.8h            \n"                    \
+  "sqadd      " #vB                                                 \
+  ".8h, v24.8h, v0.8h      \n" /* B */                              \
+  "sqadd      " #vG                                                 \
+  ".8h, v25.8h, v0.8h      \n" /* G */                              \
+  "sqadd      " #vR                                                 \
+  ".8h, v26.8h, v0.8h      \n" /* R */                              \
+  "sqadd      " #vB ".8h, " #vB                                     \
+  ".8h, v3.8h  \n" /* B */                                          \
+  "sqsub      " #vG ".8h, " #vG                                     \
+  ".8h, v6.8h  \n" /* G */                                          \
+  "sqadd      " #vR ".8h, " #vR                                     \
+  ".8h, v7.8h  \n" /* R */                                          \
+  "sqshrun    " #vB ".8b, " #vB                                     \
+  ".8h, #6     \n" /* B */                                          \
+  "sqshrun    " #vG ".8b, " #vG                                     \
+  ".8h, #6     \n"                               /* G */            \
+  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -140,7 +125,6 @@ void I444ToARGBRow_NEON(const uint8* src_y,
     READYUV444
     YUVTORGB(v22, v21, v20)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -157,10 +141,10 @@ void I444ToARGBRow_NEON(const uint8* src_y,
   );
 }
 
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -170,7 +154,6 @@ void I422ToARGBRow_NEON(const uint8* src_y,
     READYUV422
     YUVTORGB(v22, v21, v20)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -187,11 +170,11 @@ void I422ToARGBRow_NEON(const uint8* src_y,
   );
 }
 
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             const uint8* src_a,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
   asm volatile (
@@ -199,10 +182,8 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
   "1:                                          \n"
     READYUV422
     YUVTORGB(v22, v21, v20)
-    MEMACCESS(3)
     "ld1        {v23.8b}, [%3], #8             \n"
     "subs       %w5, %w5, #8                   \n"
-    MEMACCESS(4)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -220,40 +201,10 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y,
   );
 }
 
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -263,7 +214,6 @@ void I422ToRGBARow_NEON(const uint8* src_y,
     READYUV422
     YUVTORGB(v23, v22, v21)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -280,10 +230,10 @@ void I422ToRGBARow_NEON(const uint8* src_y,
   );
 }
 
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
   asm volatile (
@@ -292,7 +242,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
     READYUV422
     YUVTORGB(v22, v21, v20)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -309,97 +258,91 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
   );
 }
 
-#define ARGBTORGB565                                                           \
-    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
+#define ARGBTORGB565                                                        \
+  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
 
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB(
+          v22, v21,
+          v20) "subs       %w4, %w4, #8                   \n" ARGBTORGB565
+               "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels
+                                                               // RGB565.
+               "b.gt       1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_u),       // %1
+        "+r"(src_v),       // %2
+        "+r"(dst_rgb565),  // %3
+        "+r"(width)        // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
 }
 
-#define ARGBTOARGB1555                                                         \
-    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
-    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
-    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
+#define ARGBTOARGB1555                                                      \
+  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
+  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
+  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
 
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "movi       v23.8b, #255                   \n"
+      "1:                                        \n" READYUV422 YUVTORGB(
+          v22, v21,
+          v20) "subs       %w4, %w4, #8                   \n" ARGBTOARGB1555
+               "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels
+                                                               // RGB565.
+               "b.gt       1b                             \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb1555),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
 }
 
-#define ARGBTOARGB4444                                                         \
-    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
-    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
-    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
-    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
-    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
-    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
-    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
-    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
+#define ARGBTOARGB4444                                                       \
+  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
+  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
+  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
+  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
+  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
+  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
+  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
 
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
   asm volatile (
@@ -411,7 +354,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
     "subs       %w4, %w4, #8                   \n"
     "movi       v23.8b, #255                   \n"
     ARGBTOARGB4444
-    MEMACCESS(3)
     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
     "b.gt       1b                             \n"
     : "+r"(src_y),    // %0
@@ -428,9 +370,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
   );
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   asm volatile (
     YUVTORGB_SETUP
     "movi       v23.8b, #255                   \n"
@@ -438,7 +378,6 @@ void I400ToARGBRow_NEON(const uint8* src_y,
     READYUV400
     YUVTORGB(v22, v21, v20)
     "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -453,31 +392,26 @@ void I400ToARGBRow_NEON(const uint8* src_y,
   );
 }
 
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v20.8b}, [%0], #8             \n"
-    "orr        v21.8b, v20.8b, v20.8b         \n"
-    "orr        v22.8b, v20.8b, v20.8b         \n"
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "v20", "v21", "v22", "v23"
-  );
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v23.8b, #255                   \n"
+      "1:                                        \n"
+      "ld1        {v20.8b}, [%0], #8             \n"
+      "orr        v21.8b, v20.8b, v20.8b         \n"
+      "orr        v22.8b, v20.8b, v20.8b         \n"
+      "subs       %w2, %w2, #8                   \n"
+      "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v20", "v21", "v22", "v23");
 }
 
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -487,7 +421,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
     READNV12
     YUVTORGB(v22, v21, v20)
     "subs       %w3, %w3, #8                   \n"
-    MEMACCESS(2)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -503,9 +436,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
   );
 }
 
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -515,7 +448,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
     READNV21
     YUVTORGB(v22, v21, v20)
     "subs       %w3, %w3, #8                   \n"
-    MEMACCESS(2)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -531,24 +463,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
   );
 }
 
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   asm volatile (
     YUVTORGB_SETUP
   "1:                                          \n"
     READNV12
     YUVTORGB(v22, v21, v20)
     "subs       %w3, %w3, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "st3        {v20.8b,v21.8b,v22.8b}, [%2], #24     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
+      "+r"(dst_rgb24),  // %2
       "+r"(width)      // %3
     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
       [kUVToG]"r"(&yuvconstants->kUVToG),
@@ -559,8 +489,59 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
   );
 }
 
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    "st3        {v20.8b,v21.8b,v22.8b}, [%2], #24     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_rgb24),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READNV12 YUVTORGB(
+          v22, v21,
+          v20) "subs       %w3, %w3, #8                   \n" ARGBTORGB565
+               "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels
+                                                               // RGB565.
+               "b.gt       1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_uv),      // %1
+        "+r"(dst_rgb565),  // %2
+        "+r"(width)        // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -570,7 +551,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
     READYUY2
     YUVTORGB(v22, v21, v20)
     "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
     "b.gt       1b                             \n"
     : "+r"(src_yuy2),  // %0
@@ -585,8 +565,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
   );
 }
 
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -596,7 +576,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
     READUYVY
     YUVTORGB(v22, v21, v20)
     "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
     "b.gt       1b                             \n"
     : "+r"(src_uyvy),  // %0
@@ -612,869 +591,819 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
 }
 
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store U
-    MEMACCESS(2)
-    "st1        {v1.16b}, [%2], #16            \n"  // store V
-    "b.gt       1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+      "st1        {v0.16b}, [%1], #16            \n"  // store U
+      "st1        {v1.16b}, [%2], #16            \n"  // store V
+      "b.gt       1b                             \n"
+      : "+r"(src_uv),               // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load U
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load V
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
-    "b.gt       1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load U
+      "ld1        {v1.16b}, [%1], #16            \n"  // load V
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+      "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
+      "b.gt       1b                             \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
 
-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
-    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
-}
-
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt       1b                             \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "v0"
-  );
-}
-
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt       1b                             \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "v0"
-  );
-}
-
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %w2, sxtw              \n"
-    "sub        %0, %0, #16                    \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
-    "rev64      v0.16b, v0.16b                 \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
-}
-
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
                       int width) {
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %w3, sxtw #1           \n"
-    "sub        %0, %0, #16                    \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
-    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
-    "rev64      v0.8b, v0.8b                   \n"
-    "rev64      v1.8b, v1.8b                   \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  : "r"((ptrdiff_t)-16)      // %4
-  : "cc", "memory", "v0", "v1"
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
+      "st1        {v0.16b}, [%1], #16            \n"  // store R
+      "st1        {v1.16b}, [%2], #16            \n"  // store G
+      "st1        {v2.16b}, [%3], #16            \n"  // store B
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb),                    // %0
+        "+r"(dst_r),                      // %1
+        "+r"(dst_g),                      // %2
+        "+r"(dst_b),                      // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+      );
 }
 
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-  // Start at end of source row.
-    "add        %0, %0, %w2, sxtw #2           \n"
-    "sub        %0, %0, #16                    \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
-    "rev64      v0.4s, v0.4s                   \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load R
+      "ld1        {v1.16b}, [%1], #16            \n"  // load G
+      "ld1        {v2.16b}, [%2], #16            \n"  // load B
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
+      "st3        {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
+      "b.gt       1b                             \n"
+      : "+r"(src_r),                      // %0
+        "+r"(src_g),                      // %1
+        "+r"(src_b),                      // %2
+        "+r"(dst_rgb),                    // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+      );
 }
 
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v4.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+// Copy multiple of 32.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ldp        q0, q1, [%0], #32              \n"
+      "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
+      "stp        q0, q1, [%1], #32              \n"
+      "b.gt       1b                             \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
 
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v5.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
+      "1:                                        \n"
+      "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
+      "st1        {v0.16b}, [%0], #16            \n"  // store
+      "b.gt       1b                             \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v8)      // %2
+      : "cc", "memory", "v0");
 }
 
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
+      "1:                                        \n"
+      "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
+      "st1        {v0.16b}, [%0], #16            \n"  // store
+      "b.gt       1b                             \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v32)     // %2
+      : "cc", "memory", "v0");
 }
 
-#define RGB565TOARGB                                                           \
-    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
-    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
-    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
-    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
-    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
-    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
-    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
-    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
-
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
-  );
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "add        %0, %0, %w2, sxtw              \n"
+      "sub        %0, %0, #16                    \n"
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+      "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
+      "rev64      v0.16b, v0.16b                 \n"
+      "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+      "st1        {v0.D}[0], [%1], #8            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(width)          // %2
+      : "r"((ptrdiff_t)-16)  // %3
+      : "cc", "memory", "v0");
 }
 
-#define ARGB1555TOARGB                                                         \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
-                                                                               \
-    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
-    "xtn2       v3.16b, v2.8h                  \n"                             \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
-    "dup        v1.2D, v0.D[1]                 \n"                             \
-    "dup        v3.2D, v2.D[1]                 \n"
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      // Start at end of source row.
+      "add        %0, %0, %w3, sxtw #1           \n"
+      "sub        %0, %0, #16                    \n"
+      "1:                                        \n"
+      "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+      "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
+      "rev64      v0.8b, v0.8b                   \n"
+      "rev64      v1.8b, v1.8b                   \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
+      "st1        {v1.8b}, [%2], #8              \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_uv),        // %0
+        "+r"(dst_u),         // %1
+        "+r"(dst_v),         // %2
+        "+r"(width)          // %3
+      : "r"((ptrdiff_t)-16)  // %4
+      : "cc", "memory", "v0", "v1");
+}
+
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "add        %0, %0, %w2, sxtw #2           \n"
+      "sub        %0, %0, #16                    \n"
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+      "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+      "rev64      v0.4s, v0.4s                   \n"
+      "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+      "st1        {v0.D}[0], [%1], #8            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(width)          // %2
+      : "r"((ptrdiff_t)-16)  // %3
+      : "cc", "memory", "v0");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "movi       v4.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+      );
+}
+
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v5.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+      "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+      "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+      "b.gt       1b                             \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+      );
+}
+
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+      "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+      "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
+      "b.gt       1b                             \n"
+      : "+r"(src_raw),    // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+      );
+}
+
+#define RGB565TOARGB                                                        \
+  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
+  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
+  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
+  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
+  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
+  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
+  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
+  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
+
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_argb),    // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
+      );
+}
+
+#define ARGB1555TOARGB                                                      \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
+                                                                            \
+  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
+  "xtn2       v3.16b, v2.8h                  \n"                            \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
+  "dup        v1.2D, v0.D[1]                 \n"                            \
+  "dup        v3.2D, v2.D[1]                 \n"
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
-    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
+#define RGB555TOARGB                                                        \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
+  "dup        v1.2D, v0.D[1]                 \n" /* G */
 
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-#define ARGB4444TOARGB                                                         \
-    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
-    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
-    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
-    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
-    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
-    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
-    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
-    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
-    "dup        v0.2D, v2.D[1]                 \n"                             \
-    "dup        v1.2D, v3.D[1]                 \n"
+#define ARGB4444TOARGB                                                      \
+  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
+  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
+  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
+  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
+  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
+  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
+  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
+  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
+  "dup        v0.2D, v2.D[1]                 \n"                            \
+  "dup        v1.2D, v3.D[1]                 \n"
 
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+      );
 }
 
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
-    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
-    MEMACCESS(1)
-    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
+                                                      // RGB24.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+      );
 }
 
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
+      "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
+      "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_raw),   // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+      );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+      "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+      "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
+      "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+      "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
+      "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
-    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(src_yuy2b),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
+      "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
+      "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
-    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(src_uyvyb),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+      "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+      "urhadd     v1.8b, v1.8b, v5.8b            \n"        // average rows of U
+      "urhadd     v3.8b, v3.8b, v7.8b            \n"        // average rows of V
+      "st1        {v1.8b}, [%2], #8              \n"        // store 8 U.
+      "st1        {v3.8b}, [%3], #8              \n"        // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_yuy2),   // %0
+        "+r"(src_yuy2b),  // %1
+        "+r"(dst_u),      // %2
+        "+r"(dst_v),      // %3
+        "+r"(width)       // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
+      );
+}
+
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+      "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+      "urhadd     v0.8b, v0.8b, v4.8b            \n"        // average rows of U
+      "urhadd     v2.8b, v2.8b, v6.8b            \n"        // average rows of V
+      "st1        {v0.8b}, [%2], #8              \n"        // store 8 U.
+      "st1        {v2.8b}, [%3], #8              \n"        // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_uyvy),   // %0
+        "+r"(src_uyvyb),  // %1
+        "+r"(dst_u),      // %2
+        "+r"(dst_v),      // %3
+        "+r"(width)       // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
+      );
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
-    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
-    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-  );
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "ld1        {v2.16b}, [%3]                 \n"  // shuffler
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
+      "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
+      "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
+      "st1        {v1.16b}, [%1], #16            \n"  // store 4.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),                   // %0
+        "+r"(dst_argb),                   // %1
+        "+r"(width)                       // %2
+      : "r"(shuffler)                     // %3
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+      );
 }
 
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
-    "orr        v2.8b, v1.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+      "orr        v2.8b, v1.8b, v1.8b            \n"
+      "ld1        {v1.8b}, [%1], #8              \n"        // load 8 Us
+      "ld1        {v3.8b}, [%2], #8              \n"        // load 8 Vs
+      "subs       %w4, %w4, #16                  \n"        // 16 pixels
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
-    "orr        v3.8b, v2.8b, v2.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
+      "orr        v3.8b, v2.8b, v2.8b            \n"
+      "ld1        {v0.8b}, [%1], #8              \n"        // load 8 Us
+      "ld1        {v2.8b}, [%2], #8              \n"        // load 8 Vs
+      "subs       %w4, %w4, #16                  \n"        // 16 pixels
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGBTORGB565
+      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),    // %0
+        "+r"(dst_rgb565),  // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "dup        v1.4s, %w2                     \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v20.8b, v20.8b, v1.8b          \n"
-    "uqadd      v21.8b, v21.8b, v1.8b          \n"
-    "uqadd      v22.8b, v22.8b, v1.8b          \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
-  );
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "dup        v1.4s, %w2                     \n"  // dither4
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v20.8b, v20.8b, v1.8b          \n"
+      "uqadd      v21.8b, v21.8b, v1.8b          \n"
+      "uqadd      v22.8b, v22.8b, v1.8b          \n" ARGBTORGB565
+      "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
+      "b.gt       1b                             \n"
+      : "+r"(dst_rgb)   // %0
+      : "r"(src_argb),  // %1
+        "r"(dither4),   // %2
+        "r"(width)      // %3
+      : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
                             int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGBTOARGB1555
+      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels
+                                                      // ARGB1555.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb1555),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
                             int width) {
-  asm volatile (
-    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
-  );
+  asm volatile(
+      "movi       v4.16b, #0x0f                  \n"  // bits to clear with
+                                                      // vbic.
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGBTOARGB4444
+      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels
+                                                      // ARGB4444.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb4444),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_a),      // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16
+                                                                // pixels
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+      "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
-    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
-    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+      "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+      "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
-    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
-    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
-    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
-    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
-    "movi       v29.16b,#0x80                  \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
-    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
+  asm volatile(
+      "movi       v24.8b, #112                   \n"  // UB / VR 0.875
+                                                      // coefficient
+      "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
+      "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
+      "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
+      "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
+      "movi       v29.16b,#0x80                  \n"  // 128.5
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+                                                            // pixels.
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+      "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
+      "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
+      "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
 
-    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
-    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
-    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
-    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
+      "umull      v3.8h, v2.8b, v24.8b           \n"  // R
+      "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
+      "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
+      "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
 
-    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+      "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
 
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v24", "v25", "v26", "v27", "v28", "v29"
-  );
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+        "v27", "v28", "v29");
 }
 
-#define RGBTOUV_SETUP_REG                                                      \
-    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
-    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
-    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
-    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
-    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
-    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
-
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
-    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
-    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
-    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
-    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
-    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
-    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
-    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
-    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
+#define RGBTOUV_SETUP_REG                                                  \
+  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
+  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
+  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
+  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
+  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
+  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
-    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
-    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
-    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
-    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
-    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
-    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
-    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+// clang-format off
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "mul        v3.8h, " #QB ",v20.8h          \n" /* B                    */ \
+  "mul        v4.8h, " #QR ",v20.8h          \n" /* R                    */ \
+  "mls        v3.8h, " #QG ",v21.8h          \n" /* G                    */ \
+  "mls        v4.8h, " #QG ",v24.8h          \n" /* G                    */ \
+  "mls        v3.8h, " #QR ",v22.8h          \n" /* R                    */ \
+  "mls        v4.8h, " #QB ",v23.8h          \n" /* B                    */ \
+  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
+  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
+// clang-format on
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
 
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1486,9 +1415,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1503,9 +1430,12 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
 }
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
@@ -1514,12 +1444,10 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1531,9 +1459,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1547,18 +1473,19 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1570,9 +1497,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -1586,18 +1511,19 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1609,9 +1535,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v2.8h, v1.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -1625,18 +1549,19 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1648,9 +1573,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -1664,18 +1587,19 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1687,9 +1611,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -1703,18 +1625,19 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_raw_1 = src_raw + src_stride_raw;
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_raw_1 = src_raw + src_stride_raw;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1726,9 +1649,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v2.8h, v1.8h, v0.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_raw),  // %0
@@ -1743,699 +1664,656 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
-  asm volatile (
-    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
-    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
-    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
-    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
-    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
-    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+  asm volatile(
+      "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) /
+                                                      // 2
+      "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
+      "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
+      "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
+      "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
+      "movi       v27.16b, #0x80                 \n"  // 128.5 0x8080 in 16bit
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "ins        v16.D[1], v17.D[0]             \n"
-    "ins        v18.D[1], v19.D[0]             \n"
-    "ins        v20.D[1], v21.D[0]             \n"
+      "ins        v16.D[1], v17.D[0]             \n"
+      "ins        v18.D[1], v19.D[0]             \n"
+      "ins        v20.D[1], v21.D[0]             \n"
 
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v18.8h, #1              \n"
-    "urshr      v6.8h, v20.8h, #1              \n"
+      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+      "urshr      v5.8h, v18.8h, #1              \n"
+      "urshr      v6.8h, v20.8h, #1              \n"
 
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
-    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
-    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
-    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
-    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
-    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
-    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
-    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_rgb565_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-    "v25", "v26", "v27"
-  );
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+      "mul        v16.8h, v4.8h, v22.8h          \n"  // B
+      "mls        v16.8h, v5.8h, v23.8h          \n"  // G
+      "mls        v16.8h, v6.8h, v24.8h          \n"  // R
+      "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
+      "mul        v17.8h, v6.8h, v22.8h          \n"  // R
+      "mls        v17.8h, v5.8h, v26.8h          \n"  // G
+      "mls        v17.8h, v4.8h, v25.8h          \n"  // B
+      "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
+      "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb565),    // %0
+        "+r"(src_rgb565_1),  // %1
+        "+r"(dst_u),         // %2
+        "+r"(dst_v),         // %3
+        "+r"(width)          // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+        "v27");
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+  asm volatile(
+      RGBTOUV_SETUP_REG
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "ins        v16.D[1], v26.D[0]             \n"
-    "ins        v17.D[1], v27.D[0]             \n"
-    "ins        v18.D[1], v28.D[0]             \n"
+      "ins        v16.D[1], v26.D[0]             \n"
+      "ins        v17.D[1], v27.D[0]             \n"
+      "ins        v18.D[1], v28.D[0]             \n"
 
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
+      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+      "urshr      v5.8h, v17.8h, #1              \n"
+      "urshr      v6.8h, v18.8h, #1              \n"
 
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_argb1555_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
-  );
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+      "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+      "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+      "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+      "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+      "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+      "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+      "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+      "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+      "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb1555),    // %0
+        "+r"(src_argb1555_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28");
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+  asm volatile(
+      RGBTOUV_SETUP_REG
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "ins        v16.D[1], v26.D[0]             \n"
-    "ins        v17.D[1], v27.D[0]             \n"
-    "ins        v18.D[1], v28.D[0]             \n"
+      "ins        v16.D[1], v26.D[0]             \n"
+      "ins        v17.D[1], v27.D[0]             \n"
+      "ins        v18.D[1], v28.D[0]             \n"
 
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
+      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+      "urshr      v5.8h, v17.8h, #1              \n"
+      "urshr      v6.8h, v18.8h, #1              \n"
 
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_argb4444_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+      "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+      "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+      "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+      "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+      "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+      "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+      "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+      "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+      "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb4444),    // %0
+        "+r"(src_argb4444_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28"
 
-  );
+      );
 }
 
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
-    "v24", "v25", "v26", "v27"
-  );
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+      "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+      "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+      "movi       v27.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+      "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+      "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v27.8b           \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+        "v27");
 }
 
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
-  );
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+      "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+      "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+      "movi       v27.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+      "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+      "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v27.8b           \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
 }
 
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v1.8b, v4.8b           \n"  // R
+      "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v0.8b, v4.8b           \n"  // R
+      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v1.8b, v4.8b           \n"  // B
+      "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),      // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_y),      // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),    // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  asm volatile (
-    "cmp        %w4, #0                        \n"
-    "b.eq       100f                           \n"
-    "cmp        %w4, #128                      \n"
-    "b.eq       50f                            \n"
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  asm volatile(
+      "cmp        %w4, #0                        \n"
+      "b.eq       100f                           \n"
+      "cmp        %w4, #128                      \n"
+      "b.eq       50f                            \n"
 
-    "dup        v5.16b, %w4                    \n"
-    "dup        v4.16b, %w5                    \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "umull      v2.8h, v0.8b,  v4.8b           \n"
-    "umull2     v3.8h, v0.16b, v4.16b          \n"
-    "umlal      v2.8h, v1.8b,  v5.8b           \n"
-    "umlal2     v3.8h, v1.16b, v5.16b          \n"
-    "rshrn      v0.8b,  v2.8h, #8              \n"
-    "rshrn2     v0.16b, v3.8h, #8              \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       1b                             \n"
-    "b          99f                            \n"
+      "dup        v5.16b, %w4                    \n"
+      "dup        v4.16b, %w5                    \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%1], #16            \n"
+      "ld1        {v1.16b}, [%2], #16            \n"
+      "subs       %w3, %w3, #16                  \n"
+      "umull      v2.8h, v0.8b,  v4.8b           \n"
+      "umull2     v3.8h, v0.16b, v4.16b          \n"
+      "umlal      v2.8h, v1.8b,  v5.8b           \n"
+      "umlal2     v3.8h, v1.16b, v5.16b          \n"
+      "rshrn      v0.8b,  v2.8h, #8              \n"
+      "rshrn2     v0.16b, v3.8h, #8              \n"
+      "st1        {v0.16b}, [%0], #16            \n"
+      "b.gt       1b                             \n"
+      "b          99f                            \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       50b                            \n"
-    "b          99f                            \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1        {v0.16b}, [%1], #16            \n"
+      "ld1        {v1.16b}, [%2], #16            \n"
+      "subs       %w3, %w3, #16                  \n"
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"
+      "st1        {v0.16b}, [%0], #16            \n"
+      "b.gt       50b                            \n"
+      "b          99f                            \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       100b                           \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "ld1        {v0.16b}, [%1], #16            \n"
+      "subs       %w3, %w3, #16                  \n"
+      "st1        {v0.16b}, [%0], #16            \n"
+      "b.gt       100b                           \n"
 
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_ptr1),         // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction),      // %4
-    "+r"(y0_fraction)       // %5
-  :
-  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
-  );
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "+r"(src_ptr1),     // %2
+        "+r"(dst_width),    // %3
+        "+r"(y1_fraction),  // %4
+        "+r"(y0_fraction)   // %5
+      :
+      : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
 }
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %w3, %w3, #8                   \n"
-    "b.lt       89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.ge       8b                             \n"
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "subs       %w3, %w3, #8                   \n"
+      "b.lt       89f                            \n"
+      // Blend 8 pixels.
+      "8:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
+                                                            // pixels
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
+                                                            // pixels
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+      "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+      "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+      "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+      "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+      "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+      "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+      "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+      "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+      "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+      "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+      "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+      "movi       v3.8b, #255                    \n"  // a = 255
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.ge       8b                             \n"
 
-  "89:                                         \n"
-    "adds       %w3, %w3, #8-1                 \n"
-    "b.lt       99f                            \n"
+      "89:                                       \n"
+      "adds       %w3, %w3, #8-1                 \n"
+      "b.lt       99f                            \n"
 
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
-    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
-    "b.ge       1b                             \n"
+      // Blend 1 pixels.
+      "1:                                        \n"
+      "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
+      "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+      "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
+      "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+      "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+      "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+      "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+      "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+      "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+      "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+      "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+      "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+      "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+      "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+      "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+      "movi       v3.8b, #255                    \n"  // a = 255
+      "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+      "b.ge       1b                             \n"
 
-  "99:                                         \n"
+      "99:                                       \n"
 
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18"
-  );
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18");
 }
 
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
-    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
-    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
-    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
-    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
-    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      // Attenuate 8 pixels.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
+      "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
+      "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
+      "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
+      "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
+      "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "dup        v4.8h, %w2                     \n"
-    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
-    "dup        v5.8h, %w3                     \n"  // interval multiply.
-    "dup        v6.8h, %w4                     \n"  // interval add
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "dup        v4.8h, %w2                     \n"
+      "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
+      "dup        v5.8h, %w3                     \n"  // interval multiply.
+      "dup        v6.8h, %w4                     \n"  // interval add
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
-    "uxtl       v1.8h, v1.8b                   \n"
-    "uxtl       v2.8h, v2.8b                   \n"
-    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
-    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
-    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
-    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
-    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
-    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
-    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
-    "add        v1.8h, v1.8h, v6.8h            \n"  // g
-    "add        v2.8h, v2.8h, v6.8h            \n"  // r
-    "uqxtn      v0.8b, v0.8h                   \n"
-    "uqxtn      v1.8b, v1.8h                   \n"
-    "uqxtn      v2.8b, v2.8h                   \n"
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8  ARGB.
+      "subs       %w1, %w1, #8                   \n"    // 8 processed per loop.
+      "uxtl       v0.8h, v0.8b                   \n"    // b (0 .. 255)
+      "uxtl       v1.8h, v1.8b                   \n"
+      "uxtl       v2.8h, v2.8b                   \n"
+      "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
+      "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
+      "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
+      "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
+      "mul        v1.8h, v1.8h, v5.8h            \n"  // g
+      "mul        v2.8h, v2.8h, v5.8h            \n"  // r
+      "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
+      "add        v1.8h, v1.8h, v6.8h            \n"  // g
+      "add        v2.8h, v2.8h, v6.8h            \n"  // r
+      "uqxtn      v0.8b, v0.8h                   \n"
+      "uqxtn      v1.8b, v1.8h                   \n"
+      "uqxtn      v2.8b, v2.8h                   \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
 
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
-    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
-    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "dup        v0.4s, %w3                     \n"  // duplicate scale value.
+      "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
+      "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
-    "uxtl       v5.8h, v5.8b                   \n"
-    "uxtl       v6.8h, v6.8b                   \n"
-    "uxtl       v7.8h, v7.8b                   \n"
-    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
-    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
-    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
-    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
-    "uqxtn      v4.8b, v4.8h                   \n"
-    "uqxtn      v5.8b, v5.8h                   \n"
-    "uqxtn      v6.8b, v6.8h                   \n"
-    "uqxtn      v7.8b, v7.8h                   \n"
-    MEMACCESS(1)
-    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
+      "uxtl       v5.8h, v5.8b                   \n"
+      "uxtl       v6.8h, v6.8b                   \n"
+      "uxtl       v7.8h, v7.8b                   \n"
+      "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
+      "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
+      "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
+      "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
+      "uqxtn      v4.8b, v4.8h                   \n"
+      "uqxtn      v5.8b, v5.8h                   \n"
+      "uqxtn      v6.8b, v6.8h                   \n"
+      "uqxtn      v7.8b, v7.8h                   \n"
+      "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
 }
 
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
 // Similar to ARGBToYJ but stores ARGB.
 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
-    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
-    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
-    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
-    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
-  );
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
+      "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
+      "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+      "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
+      "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
+      "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
+      "orr        v1.8b, v0.8b, v0.8b            \n"  // G
+      "orr        v2.8b, v0.8b, v0.8b            \n"  // R
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
 }
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
@@ -2443,194 +2321,180 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v20.8b, #17                    \n"  // BB coefficient
-    "movi       v21.8b, #68                    \n"  // BG coefficient
-    "movi       v22.8b, #35                    \n"  // BR coefficient
-    "movi       v24.8b, #22                    \n"  // GB coefficient
-    "movi       v25.8b, #88                    \n"  // GG coefficient
-    "movi       v26.8b, #45                    \n"  // GR coefficient
-    "movi       v28.8b, #24                    \n"  // BB coefficient
-    "movi       v29.8b, #98                    \n"  // BG coefficient
-    "movi       v30.8b, #50                    \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
-    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
-    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
-    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
-    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
-    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
-    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
-    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
-    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
-    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
-  );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v20.8b, #17                    \n"  // BB coefficient
+      "movi       v21.8b, #68                    \n"  // BG coefficient
+      "movi       v22.8b, #35                    \n"  // BR coefficient
+      "movi       v24.8b, #22                    \n"  // GB coefficient
+      "movi       v25.8b, #88                    \n"  // GG coefficient
+      "movi       v26.8b, #45                    \n"  // GR coefficient
+      "movi       v28.8b, #24                    \n"  // BB coefficient
+      "movi       v29.8b, #98                    \n"  // BG coefficient
+      "movi       v30.8b, #50                    \n"  // BR coefficient
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+      "subs       %w1, %w1, #8                   \n"   // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v20.8b           \n"   // B to Sepia B
+      "umlal      v4.8h, v1.8b, v21.8b           \n"   // G
+      "umlal      v4.8h, v2.8b, v22.8b           \n"   // R
+      "umull      v5.8h, v0.8b, v24.8b           \n"   // B to Sepia G
+      "umlal      v5.8h, v1.8b, v25.8b           \n"   // G
+      "umlal      v5.8h, v2.8b, v26.8b           \n"   // R
+      "umull      v6.8h, v0.8b, v28.8b           \n"   // B to Sepia R
+      "umlal      v6.8h, v1.8b, v29.8b           \n"   // G
+      "umlal      v6.8h, v2.8b, v30.8b           \n"   // R
+      "uqshrn     v0.8b, v4.8h, #7               \n"   // 16 bit to 8 bit B
+      "uqshrn     v1.8b, v5.8h, #7               \n"   // 16 bit to 8 bit G
+      "uqshrn     v2.8b, v6.8h, #7               \n"   // 16 bit to 8 bit R
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
 }
 
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
-    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
-    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
+      "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
+      "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
-    "uxtl       v17.8h, v17.8b                 \n"  // g
-    "uxtl       v18.8h, v18.8b                 \n"  // r
-    "uxtl       v19.8h, v19.8b                 \n"  // a
-    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
-    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
-    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
-    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
-    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
-    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
-    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
-    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
-    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
-    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
-    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
-    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
-    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
-    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
-    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
-    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
-    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v22", "v23", "v24", "v25"
-  );
+      "1:                                        \n"
+      "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
+      "uxtl       v17.8h, v17.8b                 \n"  // g
+      "uxtl       v18.8h, v18.8b                 \n"  // r
+      "uxtl       v19.8h, v19.8b                 \n"  // a
+      "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
+      "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
+      "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
+      "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
+      "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
+      "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
+      "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
+      "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
+      "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+      "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+      "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+      "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+      "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
+      "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
+      "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
+      "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
+      "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+      "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+      "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+      "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+      "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
+      "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
+      "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
+      "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
+      "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+      "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+      "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+      "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+      "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
+      "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
+      "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
+      "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
+      "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
 }
 
 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
-    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
-    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
-    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
-    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
-    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
-    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
-    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+      "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+      "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+      "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+      "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+      "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+      "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+      "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"
-    "uqadd      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v0.8b, v0.8b, v4.8b            \n"
+      "uqadd      v1.8b, v1.8b, v5.8b            \n"
+      "uqadd      v2.8b, v2.8b, v6.8b            \n"
+      "uqadd      v3.8b, v3.8b, v7.8b            \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqsub      v0.8b, v0.8b, v4.8b            \n"
-    "uqsub      v1.8b, v1.8b, v5.8b            \n"
-    "uqsub      v2.8b, v2.8b, v6.8b            \n"
-    "uqsub      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqsub      v0.8b, v0.8b, v4.8b            \n"
+      "uqsub      v1.8b, v1.8b, v5.8b            \n"
+      "uqsub      v2.8b, v2.8b, v6.8b            \n"
+      "uqsub      v3.8b, v3.8b, v7.8b            \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2638,54 +2502,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
-    "orr        v1.8b, v0.8b, v0.8b            \n"
-    "orr        v2.8b, v0.8b, v0.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
+      "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+      "orr        v1.8b, v0.8b, v0.8b            \n"
+      "orr        v2.8b, v0.8b, v0.8b            \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
-    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1"
-  );
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      // 16 pixel loop.
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
+      "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+      "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
+      "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1");
 }
 
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2693,114 +2553,329 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
+      "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
 // SobelX as a matrix is
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%5               \n"  // top
-    MEMACCESS(0)
-    "ld1        {v1.8b}, [%0],%6               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%6               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2],%6               \n"
-    "subs       %w4, %w4, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(3)
-    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2LL),          // %5
-    "r"(6LL)           // %6
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0],%5               \n"  // top
+      "ld1        {v1.8b}, [%0],%6               \n"
+      "usubl      v0.8h, v0.8b, v1.8b            \n"
+      "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
+      "ld1        {v3.8b}, [%1],%6               \n"
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "ld1        {v2.8b}, [%2],%5               \n"  // bottom
+      "ld1        {v3.8b}, [%2],%6               \n"
+      "subs       %w4, %w4, #8                   \n"  // 8 pixels
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "abs        v0.8h, v0.8h                   \n"
+      "uqxtn      v0.8b, v0.8h                   \n"
+      "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
+      "b.gt       1b                             \n"
+      : "+r"(src_y0),                           // %0
+        "+r"(src_y1),                           // %1
+        "+r"(src_y2),                           // %2
+        "+r"(dst_sobelx),                       // %3
+        "+r"(width)                             // %4
+      : "r"(2LL),                               // %5
+        "r"(6LL)                                // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
 // SobelY as a matrix is
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%4               \n"  // left
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1],%4               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%4               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%5               \n"  // right
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%5               \n"
-    "subs       %w3, %w3, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1LL),          // %4
-    "r"(6LL)           // %5
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0],%4               \n"  // left
+      "ld1        {v1.8b}, [%1],%4               \n"
+      "usubl      v0.8h, v0.8b, v1.8b            \n"
+      "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
+      "ld1        {v3.8b}, [%1],%4               \n"
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "ld1        {v2.8b}, [%0],%5               \n"  // right
+      "ld1        {v3.8b}, [%1],%5               \n"
+      "subs       %w3, %w3, #8                   \n"  // 8 pixels
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "abs        v0.8h, v0.8h                   \n"
+      "uqxtn      v0.8b, v0.8h                   \n"
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
+      "b.gt       1b                             \n"
+      : "+r"(src_y0),                           // %0
+        "+r"(src_y1),                           // %1
+        "+r"(dst_sobely),                       // %2
+        "+r"(width)                             // %3
+      : "r"(1LL),                               // %4
+        "r"(6LL)                                // %5
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
+
+// Caveat - rounds float to half float whereas scaling version truncates.
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float /*unused*/,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
+      "fcvtn2     v1.8h, v3.4s                   \n"
+      "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
+      "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+      "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
+      "uqshrn2    v1.8h, v3.4s, #13              \n"
+      "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
+      "b.gt       1b                             \n"
+      : "+r"(src),                      // %0
+        "+r"(dst),                      // %1
+        "+r"(width)                     // %2
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.8b}, [%0], #8              \n"  // load 8 bytes
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v1.8h, v1.8b                   \n"  // 8 shorts
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 ints
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
+      "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+      "st1        {v2.16b, v3.16b}, [%1], #32    \n"  // store 8 floats
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fmax;
+  asm volatile(
+      "movi       v5.4s, #0                      \n"  // max
+      "movi       v6.4s, #0                      \n"
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"  // scale
+      "fmax       v5.4s, v5.4s, v1.4s            \n"  // max
+      "fmax       v6.4s, v6.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      "fmax       v5.4s, v5.4s, v6.4s            \n"  // max
+      "fmaxv      %s3, v5.4s                     \n"  // signed max acculator
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fmax)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fsum;
+  asm volatile(
+      "movi       v5.4s, #0                      \n"  // max
+      "movi       v6.4s, #0                      \n"  // max
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"
+      "fmla       v5.4s, v1.4s, v1.4s            \n"  // sum of squares
+      "fmla       v6.4s, v2.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      "faddp      v5.4s, v5.4s, v6.4s            \n"
+      "faddp      v5.4s, v5.4s, v5.4s            \n"
+      "faddp      %3.4s, v5.4s, v5.4s            \n"  // sum
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fsum)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v1.4s, v1.4s, %3.s[0]          \n"  // scale
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
+      "st1        {v1.4s, v2.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+                   const uint16_t* src1,
+                   const uint16_t* src2,
+                   const uint16_t* src3,
+                   const uint16_t* src4,
+                   uint32_t* dst,
+                   int width) {
+  asm volatile(
+      "movi       v6.8h, #4                      \n"  // constant 4
+      "movi       v7.8h, #6                      \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1        {v1.8h}, [%0], #16             \n"  // load 8 samples, 5 rows
+      "ld1        {v2.8h}, [%4], #16             \n"
+      "uaddl      v0.4s, v1.4h, v2.4h            \n"  // * 1
+      "uaddl2     v1.4s, v1.8h, v2.8h            \n"  // * 1
+      "ld1        {v2.8h}, [%1], #16             \n"
+      "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "ld1        {v2.8h}, [%2], #16             \n"
+      "umlal      v0.4s, v2.4h, v7.4h            \n"  // * 6
+      "umlal2     v1.4s, v2.8h, v7.8h            \n"  // * 6
+      "ld1        {v2.8h}, [%3], #16             \n"
+      "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s,v1.4s}, [%5], #32       \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+  const uint32_t* src1 = src + 1;
+  const uint32_t* src2 = src + 2;
+  const uint32_t* src3 = src + 3;
+  asm volatile(
+      "movi       v6.4s, #4                      \n"  // constant 4
+      "movi       v7.4s, #6                      \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1        {v0.4s,v1.4s,v2.4s}, [%0], %6  \n"  // load 12 source samples
+      "add        v0.4s, v0.4s, v1.4s            \n"  // * 1
+      "add        v1.4s, v1.4s, v2.4s            \n"  // * 1
+      "ld1        {v2.4s,v3.4s}, [%2], #32       \n"
+      "mla        v0.4s, v2.4s, v7.4s            \n"  // * 6
+      "mla        v1.4s, v3.4s, v7.4s            \n"  // * 6
+      "ld1        {v2.4s,v3.4s}, [%1], #32       \n"
+      "ld1        {v4.4s,v5.4s}, [%3], #32       \n"
+      "add        v2.4s, v2.4s, v4.4s            \n"  // add rows for * 4
+      "add        v3.4s, v3.4s, v5.4s            \n"
+      "mla        v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "mla        v1.4s, v3.4s, v6.4s            \n"  // * 4
+      "subs       %w5, %w5, #8                   \n"  // 8 processed per loop
+      "uqrshrn    v0.4h, v0.4s, #8               \n"  // round and pack
+      "uqrshrn2   v0.8h, v1.4s, #8               \n"
+      "st1        {v0.8h}, [%4], #16             \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(width)  // %5
+      : "r"(32LL)    // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc
index 2a3da8969f..5500d7f5a6 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc
@@ -28,72 +28,71 @@ extern "C" {
 #if defined(_M_X64)
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422                                                             \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;
+#define READYUV422                                        \
+  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
+  u_buf += 4;                                             \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
+  y_buf += 8;
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;                                                                \
-    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
-    a_buf += 8;
+#define READYUVA422                                       \
+  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
+  u_buf += 4;                                             \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
+  y_buf += 8;                                             \
+  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                \
+  a_buf += 8;
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants)                                                 \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm2 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
-    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
-    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
-    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
-    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
-    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
-    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
-    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
-    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
-    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
-    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
-    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
-    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
-    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+#define YUVTORGB(yuvconstants)                                     \
+  xmm1 = _mm_loadu_si128(&xmm0);                                   \
+  xmm2 = _mm_loadu_si128(&xmm0);                                   \
+  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
+  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
+  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
+  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
+  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
+  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
+  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
+  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
+  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
+  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
+  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
+  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
+  xmm2 = _mm_packus_epi16(xmm2, xmm2);
 
 // Store 8 ARGB values.
-#define STOREARGB                                                              \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
-    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
-    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
-    dst_argb += 32;
-
+#define STOREARGB                                    \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
+  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
+  xmm1 = _mm_loadu_si128(&xmm0);                     \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
+  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
+  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
+  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+  dst_argb += 32;
 
 #if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width) {
   __m128i xmm0, xmm1, xmm2, xmm4;
   const __m128i xmm5 = _mm_set1_epi8(-1);
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
   while (width > 0) {
     READYUV422
     YUVTORGB(yuvconstants)
@@ -104,15 +103,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 #endif
 
 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
   __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
   while (width > 0) {
     READYUVA422
     YUVTORGB(yuvconstants)
@@ -127,175 +126,143 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                              13, 65, 33, 0, 13, 65, 33, 0};
 
 // JPeg full range.
-static const vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                               15, 75, 38, 0, 15, 75, 38, 0};
 
-static const vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static const vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
 static const vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
-static const vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
 
 // Constants for BGRA.
-static const vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                              0, 33, 65, 13, 0, 33, 65, 13};
 
-static const vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static const vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR.
-static const vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                              33, 65, 13, 0, 33, 65, 13, 0};
 
-static const vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static const vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static const vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                              0, 13, 65, 33, 0, 13, 65, 33};
 
-static const vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static const vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static const uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static const uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
 
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
 // Shuffle table for converting RAW to RGB24.  First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
 static const uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
 static const uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov        eax, [esp + 4]        // src_y
-    mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src_y
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
     pslld      xmm5, 24
 
   convertloop:
@@ -318,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
 
 #ifdef HAS_J400TOARGBROW_AVX2
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov         eax, [esp + 4]        // src_y
-    mov         edx, [esp + 8]        // dst_argb
-    mov         ecx, [esp + 12]       // width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    mov         eax, [esp + 4]  // src_y
+    mov         edx, [esp + 8]  // dst_argb
+    mov         ecx, [esp + 12]  // width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld      ymm5, ymm5, 24
 
   convertloop:
@@ -348,13 +316,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
 }
 #endif  // HAS_J400TOARGBROW_AVX2
 
-__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_rgb24
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_rgb24
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
 
@@ -364,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -386,14 +355,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
   }
 }
 
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
-                        int width) {
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
 
@@ -403,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -425,11 +394,12 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
   }
 }
 
-__declspec(naked)
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+                                           uint8_t* dst_rgb24,
+                                           int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_rgb24
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_rgb24
     mov       ecx, [esp + 12]  // width
     movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
@@ -460,9 +430,9 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -470,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
     psllw     xmm4, 10
     psrlw     xmm4, 5
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_rgb565
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    pand      xmm1, xmm3    // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pand      xmm1, xmm3  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
-    pand      xmm0, xmm4    // G in middle 6 bits
-    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
-    por       xmm0, xmm7    // AG
+    por       xmm1, xmm2  // RB
+    pand      xmm0, xmm4  // G in middle 6 bits
+    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
+    por       xmm0, xmm7  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -516,9 +486,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked)
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
@@ -526,32 +496,32 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
     vpsllw     ymm4, ymm4, 10
     vpsrlw     ymm4, ymm4, 5
-    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax, [esp + 4]   // src_rgb565
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_rgb565
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     sub        edx, eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
-    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
-    vpor       ymm0, ymm0, ymm7    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -567,9 +537,9 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
 #endif  // HAS_RGB565TOARGBROW_AVX2
 
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked)
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
@@ -577,33 +547,33 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
-    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax,  [esp + 4]   // src_argb1555
-    mov        edx,  [esp + 8]   // dst_argb
+    mov        eax,  [esp + 4]  // src_argb1555
+    mov        edx,  [esp + 8]  // dst_argb
     mov        ecx,  [esp + 12]  // width
     sub        edx,  eax
     sub        edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
-    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
     vpand      ymm1, ymm1, ymm3
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpsraw     ymm2, ymm0, 8       // A
-    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpsraw     ymm2, ymm0, 8  // A
+    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
     vpand      ymm2, ymm2, ymm7
-    vpor       ymm0, ymm0, ymm2    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm0, ymm0, ymm2  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -619,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
 #endif  // HAS_ARGB1555TOARGBROW_AVX2
 
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked)
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
     vmovd     xmm4, eax
     vbroadcastss ymm4, xmm4
-    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
-    mov       eax,  [esp + 4]   // src_argb4444
-    mov       edx,  [esp + 8]   // dst_argb
+    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]  // src_argb4444
+    mov       edx,  [esp + 8]  // dst_argb
     mov       ecx,  [esp + 12]  // width
     sub       edx,  eax
     sub       edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
-    vpand      ymm2, ymm0, ymm5    // mask high nibbles
-    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5  // mask high nibbles
+    vpand      ymm0, ymm0, ymm4  // mask low nibbles
     vpsrlw     ymm3, ymm2, 4
     vpsllw     ymm1, ymm0, 4
     vpor       ymm2, ymm2, ymm3
     vpor       ymm0, ymm0, ymm1
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm2, ymm2, 0xd8
     vpunpckhbw ymm1, ymm0, ymm2
     vpunpcklbw ymm0, ymm0, ymm2
@@ -657,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
 #endif  // HAS_ARGB4444TOARGBROW_AVX2
 
 // 24 instructions
-__declspec(naked)
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -667,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
     psrlw     xmm4, 6
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_argb1555
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    psllw     xmm1, 1       // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
+    psllw     xmm1, 1  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
     pand      xmm1, xmm3
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
+    por       xmm1, xmm2  // RB
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // G in middle 5 bits
-    psraw     xmm2, 8       // A
-    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm0, xmm4  // G in middle 5 bits
+    psraw     xmm2, 8  // A
+    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
     pand      xmm2, xmm7
-    por       xmm0, xmm2    // AG
+    por       xmm0, xmm2  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -710,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
 }
 
 // 18 instructions.
-__declspec(naked)
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
-    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
     pslld     xmm5, 4
-    mov       eax, [esp + 4]   // src_argb4444
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_argb4444
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // mask low nibbles
-    pand      xmm2, xmm5    // mask high nibbles
+    pand      xmm0, xmm4  // mask low nibbles
+    pand      xmm2, xmm5  // mask high nibbles
     movdqa    xmm1, xmm0
     movdqa    xmm3, xmm2
     psllw     xmm1, 4
@@ -748,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
   }
 }
 
-__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -786,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
+                                          uint8_t* dst_rgb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -824,33 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
@@ -861,41 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-__declspec(naked)
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
+                                                  uint8_t* dst_rgb,
+                                                  const uint32_t dither4,
+                                                  int width) {
   __asm {
 
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    movd      xmm6, [esp + 12] // dither4
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    movd      xmm6, [esp + 12]  // dither4
     mov       ecx, [esp + 16]  // width
-    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    punpcklbw xmm6, xmm6  // make dither 16 bytes
     movdqa    xmm7, xmm6
     punpcklwd xmm6, xmm6
     punpckhwd xmm7, xmm7
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    paddusb   xmm0, xmm6    // add dither
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6  // add dither
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
@@ -907,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
 }
 
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked)
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
+                                                  uint8_t* dst_rgb,
+                                                  const uint32_t dither4,
+                                                  int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     vbroadcastss xmm6, [esp + 12]  // dither4
-    mov        ecx, [esp + 16]     // width
-    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    mov        ecx, [esp + 16]  // width
+    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
     vpermq     ymm6, ymm6, 0xd8
     vpunpcklwd ymm6, ymm6, ymm6
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpaddusb   ymm0, ymm0, ymm6    // add dither
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6  // add dither
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -950,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
     psrld     xmm4, 27
-    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    movdqa    xmm5, xmm4  // generate mask 0x000003e0
     pslld     xmm5, 5
-    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    movdqa    xmm6, xmm4  // generate mask 0x00007c00
     pslld     xmm6, 10
-    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
     pslld     xmm7, 15
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    movdqa    xmm3, xmm0    // R
-    psrad     xmm0, 16      // A
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 6       // G
-    psrld     xmm3, 9       // R
-    pand      xmm0, xmm7    // A
-    pand      xmm1, xmm4    // B
-    pand      xmm2, xmm5    // G
-    pand      xmm3, xmm6    // R
-    por       xmm0, xmm1    // BA
-    por       xmm2, xmm3    // GR
-    por       xmm0, xmm2    // BGRA
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    movdqa    xmm3, xmm0  // R
+    psrad     xmm0, 16  // A
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 6  // G
+    psrld     xmm3, 9  // R
+    pand      xmm0, xmm7  // A
+    pand      xmm1, xmm4  // B
+    pand      xmm2, xmm5  // G
+    pand      xmm3, xmm6  // R
+    por       xmm0, xmm1  // BA
+    por       xmm2, xmm3  // GR
+    por       xmm0, xmm2  // BGRA
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
@@ -991,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
-__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
     psllw     xmm4, 12
-    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
     psrlw     xmm3, 8
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
     movdqa    xmm1, xmm0
-    pand      xmm0, xmm3    // low nibble
-    pand      xmm1, xmm4    // high nibble
+    pand      xmm0, xmm3  // low nibble
+    pand      xmm1, xmm4  // high nibble
     psrld     xmm0, 4
     psrld     xmm1, 8
     por       xmm0, xmm1
@@ -1021,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
 }
 
 #ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1058,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
 #endif  // HAS_ARGBTORGB565ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm4, ymm4, ymm4
-    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
-    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
-    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
-    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
     vpslld     ymm7, ymm7, 15
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm3, ymm0, 9       // R
-    vpsrld     ymm2, ymm0, 6       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrad     ymm0, ymm0, 16      // A
-    vpand      ymm3, ymm3, ymm6    // R
-    vpand      ymm2, ymm2, ymm5    // G
-    vpand      ymm1, ymm1, ymm4    // B
-    vpand      ymm0, ymm0, ymm7    // A
-    vpor       ymm0, ymm0, ymm1    // BA
-    vpor       ymm2, ymm2, ymm3    // GR
-    vpor       ymm0, ymm0, ymm2    // BGRA
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9  // R
+    vpsrld     ymm2, ymm0, 6  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrad     ymm0, ymm0, 16  // A
+    vpand      ymm3, ymm3, ymm6  // R
+    vpand      ymm2, ymm2, ymm5  // G
+    vpand      ymm1, ymm1, ymm4  // B
+    vpand      ymm0, ymm0, ymm7  // A
+    vpor       ymm0, ymm0, ymm1  // BA
+    vpor       ymm2, ymm2, ymm3  // GR
+    vpor       ymm0, ymm0, ymm2  // BGRA
     vpackssdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1098,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     mov        ecx, [esp + 12]  // width
-    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
     vpsllw     ymm4, ymm4, 12
-    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpand      ymm1, ymm0, ymm4    // high nibble
-    vpand      ymm0, ymm0, ymm3    // low nibble
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4  // high nibble
+    vpand      ymm0, ymm0, ymm3  // low nibble
     vpsrld     ymm1, ymm1, 8
     vpsrld     ymm0, ymm0, 4
     vpor       ymm0, ymm0, ymm1
     vpackuswb  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1129,12 +1109,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1164,12 +1145,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
 
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
+                                         uint8_t* dst_y,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToYJ
     movdqa     xmm5, xmmword ptr kAddYJ64
 
@@ -1200,17 +1182,16 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     vbroadcastf128 ymm4, xmmword ptr kARGBToY
     vbroadcastf128 ymm5, xmmword ptr kAddY16
     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1244,12 +1225,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
 
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1283,12 +1265,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
 
-__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kBGRAToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1316,12 +1299,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
   }
 }
 
-__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kABGRToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1349,12 +1333,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
   }
 }
 
-__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kRGBAToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1382,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
   }
 }
 
-__declspec(naked)
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kARGBToV
     movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1423,9 +1410,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1437,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1452,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
-__declspec(naked)
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                                          int src_stride_argb,
+                                          uint8_t* dst_u,
+                                          uint8_t* dst_v,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUVJ128
     movdqa     xmm6, xmmword ptr kARGBToVJ
     movdqa     xmm7, xmmword ptr kARGBToUJ
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1493,9 +1482,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1510,9 +1499,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1524,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked)
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                                        int src_stride_argb,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     vbroadcastf128 ymm5, xmmword ptr kAddUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1558,9 +1549,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     vshufps    ymm2, ymm2, ymm3, 0xdd
     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 32 different pixels, its 16 pixels of U and 16 of V
     vpmaddubsw ymm1, ymm0, ymm7  // U
     vpmaddubsw ymm3, ymm2, ymm7
     vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1574,9 +1565,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
 
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+        // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1590,24 +1581,26 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 #endif  // HAS_ARGBTOUVROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked)
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     vbroadcastf128 ymm5, xmmword ptr kAddUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1624,9 +1617,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     vshufps    ymm2, ymm2, ymm3, 0xdd
     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 32 different pixels, its 16 pixels of U and 16 of V
     vpmaddubsw ymm1, ymm0, ymm7  // U
     vpmaddubsw ymm3, ymm2, ymm7
     vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1641,9 +1634,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
 
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+        // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1656,23 +1649,24 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 }
 #endif  // HAS_ARGBTOUVJROW_AVX2
 
-__declspec(naked)
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+                                            uint8_t* dst_u,
+                                            uint8_t* dst_v,
+                                            int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        edx, [esp + 4 + 8]  // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kARGBToV
     movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx    // stride from u to v
 
  convertloop:
-    /* convert to U and V */
-    movdqu     xmm0, [eax]          // U
+        /* convert to U and V */
+    movdqu     xmm0, [eax]  // U
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1688,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
     paddb      xmm0, xmm5
     movdqu     [edx], xmm0
 
-    movdqu     xmm0, [eax]          // V
+    movdqu     xmm0, [eax]  // V
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1713,24 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
   }
 }
 
-__declspec(naked)
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kBGRAToV
     movdqa     xmm7, xmmword ptr kBGRAToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1754,9 +1750,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1768,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1783,24 +1779,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
-__declspec(naked)
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kABGRToV
     movdqa     xmm7, xmmword ptr kABGRToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1824,9 +1822,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1838,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1853,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
-__declspec(naked)
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kRGBAToV
     movdqa     xmm7, xmmword ptr kRGBAToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1894,9 +1894,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, its 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1908,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1925,109 +1925,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2 __asm {                                                \
-    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
-    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
+#define READYUV444_AVX2 \
+  __asm {                                                \
+    __asm vmovdqu    xmm0, [esi] /* U */                      \
+    __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 __asm {                                                \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+#define READYUV422_AVX2 \
+  __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2 __asm {                                               \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+#define READYUVA422_AVX2 \
+  __asm {                                               \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
     __asm lea        eax, [eax + 16]                                           \
-    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
+    __asm vmovdqu    xmm5, [ebp] /* A */                      \
     __asm vpermq     ymm5, ymm5, 0xd8                                          \
-    __asm lea        ebp, [ebp + 16]                                           \
-  }
-
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 __asm {                                                \
-    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
-    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        ebp, [ebp + 16]}
 
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV12_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV21_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
+#define READYUY2_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
+    __asm lea        eax, [eax + 32]}
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
+#define READUYVY_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
+    __asm lea        eax, [eax + 32]}
 
 // Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+#define YUVTORGB_AVX2(YuvConstants) \
+  __asm {                                    \
     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
@@ -2036,68 +2022,67 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
     __asm vpsubw     ymm1, ymm3, ymm1                                          \
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
-    __asm vpsubw     ymm0, ymm3, ymm0                                          \
-    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
-    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
-    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
-    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
+    __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
     __asm vpsraw     ymm0, ymm0, 6                                             \
     __asm vpsraw     ymm1, ymm1, 6                                             \
     __asm vpsraw     ymm2, ymm2, 6                                             \
-    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
-    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
-    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+    __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
   }
 
 // Store 16 ARGB values.
-#define STOREARGB_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+#define STOREARGB_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
-    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
     __asm vmovdqu    0[edx], ymm1                                              \
     __asm vmovdqu    32[edx], ymm0                                             \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
 
 // Store 16 RGBA values.
-#define STORERGBA_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
+#define STORERGBA_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
+    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
-    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
+    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
+    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
     __asm vmovdqu    [edx], ymm0                                               \
     __asm vmovdqu    [edx + 32], ymm1                                          \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
 
 #ifdef HAS_I422TOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I422ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
@@ -2119,21 +2104,21 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_I422ALPHATOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
     mov        edi, [esp + 16 + 12]  // V
     mov        ebp, [esp + 16 + 16]  // A
     mov        edx, [esp + 16 + 20]  // argb
@@ -2162,25 +2147,25 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I444ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I444ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
  convertloop:
     READYUV444_AVX2
     YUVTORGB_AVX2(ebx)
@@ -2198,64 +2183,24 @@ void I444ToARGBRow_AVX2(const uint8* y_buf,
 }
 #endif  // HAS_I444TOARGBROW_AVX2
 
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I411ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV411_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
 #ifdef HAS_NV12TOARGBROW_AVX2
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV12ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void NV12ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV12_AVX2
@@ -2276,21 +2221,21 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_NV21TOARGBROW_AVX2
 // 16 pixels.
 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV21ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* vu_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void NV21ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV21_AVX2
@@ -2311,18 +2256,18 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf,
 #ifdef HAS_YUY2TOARGBROW_AVX2
 // 16 pixels.
 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUY2_AVX2
@@ -2342,18 +2287,18 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
 #ifdef HAS_UYVYTOARGBROW_AVX2
 // 16 pixels.
 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void UYVYToARGBRow_AVX2(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READUYVY_AVX2
@@ -2373,25 +2318,25 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
 #ifdef HAS_I422TORGBAROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I422ToRGBARow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // abgr
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
@@ -2415,100 +2360,83 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
 // Allows a conversion with half size scaling.
 
 // Read 8 UV from 444.
-#define READYUV444 __asm {                                                     \
+#define READYUV444 \
+  __asm {                                                     \
     __asm movq       xmm0, qword ptr [esi] /* U */                             \
     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUV422 \
+  __asm {                                                     \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422 __asm {                                                    \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUVA422 \
+  __asm {                                                    \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax] /* Y */                           \
     __asm punpcklbw  xmm4, xmm4                                                \
     __asm lea        eax, [eax + 8]                                            \
-    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
-    __asm lea        ebp, [ebp + 8]                                            \
-  }
-
-// Read 2 UV from 411, upsample to 8 UV.
-// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_EBX __asm {                                                 \
-    __asm movzx      ebx, word ptr [esi]        /* U */                        \
-    __asm movd       xmm0, ebx                                                 \
-    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
-    __asm movd       xmm1, ebx                                                 \
-    __asm lea        esi,  [esi + 2]                                           \
-    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
-    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
-    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm movq       xmm5, qword ptr [ebp] /* A */                           \
+    __asm lea        ebp, [ebp + 8]}
 
 // Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm {                                                       \
+#define READNV12 \
+  __asm {                                                       \
     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 __asm {                                                       \
+#define READNV21 \
+  __asm {                                                       \
     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
+#define READYUY2 \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* YUY2 */                           \
     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
+#define READUYVY \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* UYVY */                           \
     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) __asm {                                         \
+#define YUVTORGB(YuvConstants) \
+  __asm {                                         \
     __asm movdqa     xmm1, xmm0                                                \
     __asm movdqa     xmm2, xmm0                                                \
     __asm movdqa     xmm3, xmm0                                                \
@@ -2522,129 +2450,125 @@ void I422ToRGBARow_AVX2(const uint8* y_buf,
     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
     __asm psubw      xmm2, xmm3                                                \
     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
-    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
+    __asm paddsw     xmm0, xmm4 /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4 /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4 /* R += Y */                         \
     __asm psraw      xmm0, 6                                                   \
     __asm psraw      xmm1, 6                                                   \
     __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+    __asm packuswb   xmm0, xmm0 /* B */                              \
+    __asm packuswb   xmm1, xmm1 /* G */                              \
+    __asm packuswb   xmm2, xmm2 /* R */             \
   }
 
 // Store 8 ARGB values.
-#define STOREARGB __asm {                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+#define STOREARGB \
+  __asm {                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5 /* RA */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm0                                              \
     __asm movdqu     16[edx], xmm1                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 BGRA values.
-#define STOREBGRA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
-    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+#define STOREBGRA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0 /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2 /* AR */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGBA values.
-#define STORERGBA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
-    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+#define STORERGBA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2 /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0 /* AB */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGB24 values.
-#define STORERGB24 __asm {                                                     \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB24 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB24 */                                                        \
-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
-    __asm lea        edx,  [edx + 24]                                          \
-  }
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
+    __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]}
 
 // Store 8 RGB565 values.
-#define STORERGB565 __asm {                                                    \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB565 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB565 */                                                       \
-    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
-    __asm movdqa     xmm2, xmm0    /* G */                                     \
-    __asm pslld      xmm0, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm0, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm0, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm0, xmm3    /* BGR */                                   \
-    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
-    __asm movdqa     xmm2, xmm1    /* G */                                     \
-    __asm pslld      xmm1, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm1, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm1, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
+    __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0 /* G */                                     \
+    __asm pslld      xmm0, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm0, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm0, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm0, xmm3 /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1 /* G */                                     \
+    __asm pslld      xmm1, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm1, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm1, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm1, xmm3 /* BGR */                                   \
     __asm packssdw   xmm0, xmm1                                                \
-    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
-    __asm lea        edx, [edx + 16]                                           \
-  }
+    __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]}
 
 // 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I444ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV444
@@ -2663,19 +2587,19 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked)
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* dst_rgb24,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgb24,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
@@ -2701,30 +2625,30 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
 
 // 8 pixels
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb565_buf,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* rgb565_buf,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
+    pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
     psrld      xmm5, 27
-    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
+    pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
     psrld      xmm6, 26
     pslld      xmm6, 5
-    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
+    pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
     pslld      xmm7, 11
 
  convertloop:
@@ -2744,25 +2668,25 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I422ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV422
@@ -2781,21 +2705,21 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
     mov        edi, [esp + 16 + 12]  // V
     mov        ebp, [esp + 16 + 16]  // A
     mov        edx, [esp + 16 + 20]  // argb
@@ -2819,63 +2743,23 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked)
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
-    mov        edi, [esp + 16 + 12]  // V
-    mov        edx, [esp + 16 + 16]  // abgr
-    mov        ebp, [esp + 16 + 20]  // yuvconstants
-    mov        ecx, [esp + 16 + 24]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV411_EBX
-    YUVTORGB(ebp)
-    STOREARGB
-
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READNV12
@@ -2893,21 +2777,21 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* vu_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READNV21
@@ -2925,18 +2809,18 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
 
 // 8 pixels.
 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUY2
@@ -2953,18 +2837,18 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
 
 // 8 pixels.
 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READUYVY
@@ -2979,19 +2863,19 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_rgba,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I422ToRGBARow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgba,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
@@ -3016,39 +2900,38 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
 
 #ifdef HAS_I400TOARGBROW_SSE2
 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked)
-void I400ToARGBRow_SSE2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     movd       xmm2, eax
     pshufd     xmm2, xmm2,0
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     movd       xmm3, eax
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm0           // Y.Y
+    punpcklbw  xmm0, xmm0  // Y.Y
     pmulhuw    xmm0, xmm2
     psubusw    xmm0, xmm3
     psrlw      xmm0, 6
-    packuswb   xmm0, xmm0           // G
+    packuswb   xmm0, xmm0        // G
 
-    // Step 2: Weave into ARGB
-    punpcklbw  xmm0, xmm0           // GG
+        // Step 2: Weave into ARGB
+    punpcklbw  xmm0, xmm0  // GG
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
+    punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
     por        xmm0, xmm4
     por        xmm1, xmm4
     movdqu     [edx], xmm0
@@ -3064,41 +2947,40 @@ void I400ToARGBRow_SSE2(const uint8* y_buf,
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked)
-void I400ToARGBRow_AVX2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     vmovd      xmm2, eax
     vbroadcastss ymm2, xmm2
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     vmovd      xmm3, eax
     vbroadcastss ymm3, xmm3
-    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
     vpslld     ymm4, ymm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
     vmovdqu    xmm0, [eax]
     lea        eax, [eax + 16]
-    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
-    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
     vpmulhuw   ymm0, ymm0, ymm2
     vpsubusw   ymm0, ymm0, ymm3
     vpsrlw     ymm0, ymm0, 6
-    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+    vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
 
-    // TODO(fbarchard): Weave alpha with unpack.
-    // Step 2: Weave into ARGB
-    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+        // TODO(fbarchard): Weave alpha with unpack.
+        // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
     vpermq     ymm1, ymm1, 0xd8
-    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
-    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
     vpor       ymm0, ymm0, ymm4
     vpor       ymm1, ymm1, ymm4
     vmovdqu    [edx], ymm0
@@ -3114,16 +2996,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf,
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
 // TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked)
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
+                                       uint8_t* dst,
+                                       int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, xmmword ptr kShuffleMirror
 
@@ -3140,11 +3022,12 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-__declspec(naked)
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
+                                      uint8_t* dst,
+                                      int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
 
@@ -3164,17 +3047,17 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
 
-__declspec(naked)
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push      edi
-    mov       eax, [esp + 4 + 4]   // src
-    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       eax, [esp + 4 + 4]  // src
+    mov       edx, [esp + 4 + 8]  // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
@@ -3198,11 +3081,12 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 #endif  // HAS_MIRRORUVROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked)
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
 
@@ -3221,15 +3105,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 
-__declspec(naked)
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
 
@@ -3246,16 +3129,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
+__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3265,10 +3149,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
+    pand       xmm0, xmm5  // even bytes
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm2, 8  // odd bytes
     psrlw      xmm3, 8
     packuswb   xmm2, xmm3
     movdqu     [edx], xmm0
@@ -3285,16 +3169,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
+__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3302,9 +3187,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm2, ymm0, 8  // odd bytes
     vpsrlw     ymm3, ymm1, 8
-    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm0, ymm0, ymm5  // even bytes
     vpand      ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpackuswb  ymm2, ymm2, ymm3
@@ -3324,24 +3209,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked)
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm0, [eax]  // read 16 U's
     movdqu     xmm1, [eax + edx]  // and 16 V's
     lea        eax,  [eax + 16]
     movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    punpcklbw  xmm0, xmm1  // first 8 UV pairs
+    punpckhbw  xmm2, xmm1  // next 8 UV pairs
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm2
     lea        edi, [edi + 32]
@@ -3355,24 +3241,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif  //  HAS_MERGEUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked)
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    vmovdqu    ymm0, [eax]           // read 32 U's
-    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    vmovdqu    ymm0, [eax]  // read 32 U's
+    vmovdqu    ymm1, [eax + edx]  // and 32 V's
     lea        eax,  [eax + 32]
-    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
-    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
-    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0  // bytes 0..15
     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
     vextractf128 [edi + 48], ymm0, 1  // bytes 47..63
@@ -3388,13 +3275,14 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 #endif  //  HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     test       eax, 15
     jne        convertloopu
     test       edx, 15
@@ -3426,13 +3314,14 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
 #endif  // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_AVX
-// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked)
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+                                   uint8_t* dst,
+                                   int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
@@ -3451,14 +3340,15 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
 #endif  // HAS_COPYROW_AVX
 
 // Multiple of 1.
-__declspec(naked)
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
   __asm {
     mov        eax, esi
     mov        edx, edi
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        esi, [esp + 4]  // src
+    mov        edi, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     rep movsb
     mov        edi, edx
     mov        esi, eax
@@ -3468,15 +3358,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3504,14 +3395,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vmovdqu    ymm1, [eax]
@@ -3533,11 +3425,12 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_a
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
     mov        ecx, [esp + 12]  // width
 
   extractloop:
@@ -3558,17 +3451,54 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
+    mov        ecx, [esp + 12]  // width
+    vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
+
+  extractloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpsrld     ymm0, ymm0, 24
+    vpsrld     ymm1, ymm1, 24
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    lea        eax, [eax + 128]
+    vpackssdw  ymm0, ymm0, ymm1  // mutates
+    vpsrld     ymm2, ymm2, 24
+    vpsrld     ymm3, ymm3, 24
+    vpackssdw  ymm2, ymm2, ymm3  // mutates
+    vpackuswb  ymm0, ymm0, ymm2  // mutates
+    vpermd     ymm0, ymm4, ymm0  // unmutate
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         extractloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
+                                                uint8_t* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3598,14 +3528,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
+                                                uint8_t* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vpmovzxbd  ymm1, qword ptr [eax]
@@ -3628,17 +3559,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-// Write 'count' bytes using an 8 bit value repeated.
-// Count should be multiple of 4.
-__declspec(naked)
-void SetRow_X86(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+// width should be multiple of 4.
+__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   __asm {
-    movzx      eax, byte ptr [esp + 8]    // v8
+    movzx      eax, byte ptr [esp + 8]  // v8
     mov        edx, 0x01010101  // Duplicate byte to all bytes.
-    mul        edx              // overwrites edx with upper part of result.
+    mul        edx  // overwrites edx with upper part of result.
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        ecx, [esp + 12]  // width
     shr        ecx, 2
     rep stosd
     mov        edi, edx
@@ -3646,28 +3576,28 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) {
   }
 }
 
-// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked)
-void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v8
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v8
+    mov        ecx, [esp + 12]  // width
     rep stosb
     mov        edi, edx
     ret
   }
 }
 
-// Write 'count' 32 bit values.
-__declspec(naked)
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+// Write 'width' 32 bit values.
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
+                                      uint32_t v32,
+                                      int width) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v32
+    mov        ecx, [esp + 12]  // width
     rep stosd
     mov        edi, edx
     ret
@@ -3676,12 +3606,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
 
@@ -3689,9 +3620,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm0, ymm0, ymm5  // even bytes are Y
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3702,18 +3633,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3723,18 +3656,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3746,16 +3679,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3763,18 +3697,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3785,21 +3719,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3810,18 +3744,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3831,18 +3767,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3854,16 +3790,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3871,18 +3808,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3895,21 +3832,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
 #endif  // HAS_YUY2TOYROW_AVX2
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
-                     uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm0, xmm5  // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -3920,18 +3857,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3943,13 +3882,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3963,16 +3902,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3980,13 +3920,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3999,19 +3939,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm0, 8  // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -4022,18 +3962,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -4045,13 +3987,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4065,16 +4007,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -4082,13 +4025,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4108,13 +4051,15 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                                           const uint8_t* src1,
+                                           const uint8_t* alpha,
+                                           uint8_t* dst,
+                                           int width) {
   __asm {
     push       esi
     push       edi
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
     mov        eax, 0x80808080  // 128 for biasing image to signed.
     movd       xmm6, eax
@@ -4123,8 +4068,8 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
     movd       xmm7, eax
     pshufd     xmm7, xmm7, 0x00
-    mov        eax, [esp + 8 + 4]   // src0
-    mov        edx, [esp + 8 + 8]   // src1
+    mov        eax, [esp + 8 + 4]  // src0
+    mov        edx, [esp + 8 + 8]  // src1
     mov        esi, [esp + 8 + 12]  // alpha
     mov        edi, [esp + 8 + 16]  // dst
     mov        ecx, [esp + 8 + 20]  // width
@@ -4132,17 +4077,17 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
     sub        edx, esi
     sub        edi, esi
 
-    // 8 pixel loop.
+        // 8 pixel loop.
   convertloop8:
-    movq       xmm0, qword ptr [esi]        // alpha
+    movq       xmm0, qword ptr [esi]  // alpha
     punpcklbw  xmm0, xmm0
-    pxor       xmm0, xmm5         // a, 255-a
+    pxor       xmm0, xmm5  // a, 255-a
     movq       xmm1, qword ptr [eax + esi]  // src0
     movq       xmm2, qword ptr [edx + esi]  // src1
     punpcklbw  xmm1, xmm2
-    psubb      xmm1, xmm6         // bias src0/1 - 128
+    psubb      xmm1, xmm6  // bias src0/1 - 128
     pmaddubsw  xmm0, xmm1
-    paddw      xmm0, xmm7         // unbias result - 32768 and round.
+    paddw      xmm0, xmm7  // unbias result - 32768 and round.
     psrlw      xmm0, 8
     packuswb   xmm0, xmm0
     movq       qword ptr [edi + esi], xmm0
@@ -4163,13 +4108,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
+                                          const uint8_t* src1,
+                                          const uint8_t* alpha,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
     push        esi
     push        edi
-    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
     vpsllw      ymm5, ymm5, 8
     mov         eax, 0x80808080  // 128 for biasing image to signed.
     vmovd       xmm6, eax
@@ -4177,8 +4124,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
     vmovd       xmm7, eax
     vbroadcastss ymm7, xmm7
-    mov         eax, [esp + 8 + 4]   // src0
-    mov         edx, [esp + 8 + 8]   // src1
+    mov         eax, [esp + 8 + 4]  // src0
+    mov         edx, [esp + 8 + 8]  // src1
     mov         esi, [esp + 8 + 12]  // alpha
     mov         edi, [esp + 8 + 16]  // dst
     mov         ecx, [esp + 8 + 20]  // width
@@ -4186,23 +4133,23 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
     sub         edx, esi
     sub         edi, esi
 
-    // 32 pixel loop.
+        // 32 pixel loop.
   convertloop32:
-    vmovdqu     ymm0, [esi]        // alpha
-    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
-    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
-    vpxor       ymm3, ymm3, ymm5   // a, 255-a
-    vpxor       ymm0, ymm0, ymm5   // a, 255-a
+    vmovdqu     ymm0, [esi]  // alpha
+    vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
+    vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
+    vpxor       ymm3, ymm3, ymm5  // a, 255-a
+    vpxor       ymm0, ymm0, ymm5  // a, 255-a
     vmovdqu     ymm1, [eax + esi]  // src0
     vmovdqu     ymm2, [edx + esi]  // src1
     vpunpckhbw  ymm4, ymm1, ymm2
     vpunpcklbw  ymm1, ymm1, ymm2
-    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
-    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
+    vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
     vpmaddubsw  ymm3, ymm3, ymm4
     vpmaddubsw  ymm0, ymm0, ymm1
-    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
-    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
+    vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
     vpsrlw      ymm3, ymm3, 8
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm3
@@ -4221,52 +4168,51 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time.
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                                          const uint8_t* src_argb1,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    pcmpeqb    xmm7, xmm7  // generate constant 0x0001
     psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
     psrlw      xmm6, 8
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
     sub        ecx, 4
-    jl         convertloop4b    // less than 4 pixels?
+    jl         convertloop4b  // less than 4 pixels?
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop4:
-    movdqu     xmm3, [eax]      // src argb
+    movdqu     xmm3, [eax]  // src argb
     lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movdqu     xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movdqu     xmm1, [esi]  // _a_g
     lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4276,26 +4222,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     add        ecx, 4 - 1
     jl         convertloop1b
 
-    // 1 pixel loop.
+        // 1 pixel loop.
   convertloop1:
-    movd       xmm3, [eax]      // src argb
+    movd       xmm3, [eax]  // src argb
     lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movd       xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movd       xmm1, [esi]  // _a_g
     lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4311,41 +4257,42 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
 };
 static const uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked)
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
     pslld      xmm3, 24
     movdqa     xmm4, xmmword ptr kShuffleAlpha0
     movdqa     xmm5, xmmword ptr kShuffleAlpha1
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqu     xmm1, [eax]      // read 4 pixels
-    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
-    pmulhuw    xmm0, xmm1       // rgb * a
-    movdqu     xmm1, [eax]      // read 4 pixels
-    pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqu     xmm2, [eax]      // read 4 pixels
-    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqu     xmm2, [eax]      // mask original alpha
+    movdqu     xmm0, [eax]  // read 4 pixels
+    pshufb     xmm0, xmm4  // isolate first 2 alphas
+    movdqu     xmm1, [eax]  // read 4 pixels
+    punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1  // rgb * a
+    movdqu     xmm1, [eax]  // read 4 pixels
+    pshufb     xmm1, xmm5  // isolate next 2 alphas
+    movdqu     xmm2, [eax]  // read 4 pixels
+    punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2  // rgb * a
+    movdqu     xmm2, [eax]  // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    por        xmm0, xmm2       // copy original alpha
+    por        xmm0, xmm2  // copy original alpha
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4358,22 +4305,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-__declspec(naked)
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                                             uint8_t* dst_argb,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
-    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld     ymm5, ymm5, 24
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
@@ -4398,40 +4346,40 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
     mov        ecx, [esp + 12 + 12]  // width
     lea        ebx, fixed_invtbl8
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
-    punpcklbw  xmm0, xmm0       // first 2
+    punpcklbw  xmm0, xmm0  // first 2
     movd       xmm2, dword ptr [ebx + esi * 4]
     movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm0, xmm2       // rgb * a
+    pmulhuw    xmm0, xmm2  // rgb * a
 
-    movdqu     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // forth alpha
-    punpckhbw  xmm1, xmm1       // next 2
+    punpckhbw  xmm1, xmm1  // next 2
     movd       xmm2, dword ptr [ebx + esi * 4]
     movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm1, xmm2       // rgb * a
+    pmulhuw    xmm1, xmm2  // rgb * a
     lea        eax, [eax + 16]
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -4450,25 +4398,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not on by default, due to being a slow instruction.
 #ifdef USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb0
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
-    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
@@ -4488,50 +4435,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     ret
   }
 }
-#else  // USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+#else   // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
 
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
     mov        ecx, [esp + 12 + 12]  // width
     sub        edx, eax
     lea        ebx, fixed_invtbl8
     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]                 // alpha0
-    movzx      edi, byte ptr [eax + 7]                 // alpha1
+        // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]  // alpha0
+    movzx      edi, byte ptr [eax + 7]  // alpha1
     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11]                // alpha2
-    movzx      edi, byte ptr [eax + 15]                // alpha3
-    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    movzx      esi, byte ptr [eax + 11]  // alpha2
+    movzx      edi, byte ptr [eax + 15]  // alpha3
+    vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19]                // alpha4
-    movzx      edi, byte ptr [eax + 23]                // alpha5
-    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    movzx      esi, byte ptr [eax + 19]  // alpha4
+    movzx      edi, byte ptr [eax + 23]  // alpha5
+    vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27]                // alpha6
-    movzx      edi, byte ptr [eax + 31]                // alpha7
-    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    movzx      esi, byte ptr [eax + 27]  // alpha6
+    movzx      edi, byte ptr [eax + 31]  // alpha7
+    vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
-    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
-    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
-    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
-    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
     // end of VPGATHER
 
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
@@ -4540,7 +4487,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpackuswb  ymm0, ymm0, ymm1             // unmutated.
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
     sub        ecx, 8
@@ -4558,12 +4505,13 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked)
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
+                                         uint8_t* dst_argb,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToYJ
     movdqa     xmm5, xmmword ptr kAddYJ64
 
@@ -4575,20 +4523,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     phaddw     xmm0, xmm1
     paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 G bytes
+    packuswb   xmm0, xmm0  // 8 G bytes
     movdqu     xmm2, [eax]  // A
     movdqu     xmm3, [eax + 16]
     lea        eax, [eax + 32]
     psrld      xmm2, 24
     psrld      xmm3, 24
     packuswb   xmm2, xmm3
-    packuswb   xmm2, xmm2   // 8 A bytes
-    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
-    punpcklbw  xmm0, xmm0   // 8 GG words
-    punpcklbw  xmm3, xmm2   // 8 GA words
+    packuswb   xmm2, xmm2  // 8 A bytes
+    movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0  // 8 GG words
+    punpcklbw  xmm3, xmm2  // 8 GA words
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm3   // GGGA first 4
-    punpckhwd  xmm1, xmm3   // GGGA next 4
+    punpcklwd  xmm0, xmm3  // GGGA first 4
+    punpckhwd  xmm1, xmm3  // GGGA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
@@ -4604,24 +4552,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static const vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static const vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked)
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        ecx, [esp + 8]   /* width */
+    mov        eax, [esp + 4] /* dst_argb */
+    mov        ecx, [esp + 8] /* width */
     movdqa     xmm2, xmmword ptr kARGBToSepiaB
     movdqa     xmm3, xmmword ptr kARGBToSepiaG
     movdqa     xmm4, xmmword ptr kARGBToSepiaR
@@ -4633,32 +4577,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm0, xmm0  // 8 B values
     movdqu     xmm5, [eax]  // G
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
+    packuswb   xmm5, xmm5  // 8 G values
+    punpcklbw  xmm0, xmm5  // 8 BG values
     movdqu     xmm5, [eax]  // R
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
+    packuswb   xmm5, xmm5  // 8 R values
     movdqu     xmm6, [eax]  // A
     movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
     psrld      xmm1, 24
     packuswb   xmm6, xmm1
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm5, xmm6  // 8 RA values
+    movdqa     xmm1, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm5  // BGRA first 4
+    punpckhwd  xmm1, xmm5  // BGRA next 4
     movdqu     [eax], xmm0
     movdqu     [eax + 16], xmm1
     lea        eax, [eax + 32]
@@ -4674,19 +4618,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked)
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                                                uint8_t* dst_argb,
+                                                const int8_t* matrix_argb,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* matrix_argb */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* matrix_argb */
     movdqu     xmm5, [ecx]
     pshufd     xmm2, xmm5, 0x00
     pshufd     xmm3, xmm5, 0x55
     pshufd     xmm4, xmm5, 0xaa
     pshufd     xmm5, xmm5, 0xff
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
  convertloop:
     movdqu     xmm0, [eax]  // B
@@ -4697,31 +4642,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm7   // B
-    phaddsw    xmm6, xmm1   // G
-    psraw      xmm0, 6      // B
-    psraw      xmm6, 6      // G
-    packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm6, xmm6   // 8 G values
-    punpcklbw  xmm0, xmm6   // 8 BG values
+    phaddsw    xmm0, xmm7  // B
+    phaddsw    xmm6, xmm1  // G
+    psraw      xmm0, 6  // B
+    psraw      xmm6, 6  // G
+    packuswb   xmm0, xmm0  // 8 B values
+    packuswb   xmm6, xmm6  // 8 G values
+    punpcklbw  xmm0, xmm6  // 8 BG values
     movdqu     xmm1, [eax]  // R
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm7, xmm4
-    phaddsw    xmm1, xmm7   // R
+    phaddsw    xmm1, xmm7  // R
     movdqu     xmm6, [eax]  // A
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm6, xmm5
     pmaddubsw  xmm7, xmm5
-    phaddsw    xmm6, xmm7   // A
-    psraw      xmm1, 6      // R
-    psraw      xmm6, 6      // A
-    packuswb   xmm1, xmm1   // 8 R values
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm1, xmm6   // 8 RA values
-    movdqa     xmm6, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm1   // BGRA first 4
-    punpckhwd  xmm6, xmm1   // BGRA next 4
+    phaddsw    xmm6, xmm7  // A
+    psraw      xmm1, 6  // R
+    psraw      xmm6, 6  // A
+    packuswb   xmm1, xmm1  // 8 R values
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm1, xmm6  // 8 RA values
+    movdqa     xmm6, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm1  // BGRA first 4
+    punpckhwd  xmm6, xmm1  // BGRA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm6
     lea        eax, [eax + 32]
@@ -4735,15 +4680,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked)
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                                            int scale,
+                                            int interval_size,
+                                            int interval_offset,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]    /* dst_argb */
-    movd       xmm2, [esp + 8]   /* scale */
-    movd       xmm3, [esp + 12]  /* interval_size */
-    movd       xmm4, [esp + 16]  /* interval_offset */
-    mov        ecx, [esp + 20]   /* width */
+    mov        eax, [esp + 4] /* dst_argb */
+    movd       xmm2, [esp + 8] /* scale */
+    movd       xmm3, [esp + 12] /* interval_size */
+    movd       xmm4, [esp + 16] /* interval_offset */
+    mov        ecx, [esp + 20] /* width */
     pshuflw    xmm2, xmm2, 040h
     pshufd     xmm2, xmm2, 044h
     pshuflw    xmm3, xmm3, 040h
@@ -4756,16 +4703,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 
  convertloop:
     movdqu     xmm0, [eax]  // read 4 pixels
-    punpcklbw  xmm0, xmm5   // first 2 pixels
-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    punpcklbw  xmm0, xmm5  // first 2 pixels
+    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
     movdqu     xmm1, [eax]  // read 4 pixels
-    punpckhbw  xmm1, xmm5   // next 2 pixels
+    punpckhbw  xmm1, xmm5  // next 2 pixels
     pmulhuw    xmm1, xmm2
-    pmullw     xmm0, xmm3   // * interval_size
+    pmullw     xmm0, xmm3  // * interval_size
     movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
-    pand       xmm7, xmm6   // mask alpha
-    paddw      xmm0, xmm4   // + interval_size / 2
+    pand       xmm7, xmm6  // mask alpha
+    paddw      xmm0, xmm4  // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
@@ -4780,25 +4727,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-__declspec(naked)
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                                         uint8_t* dst_argb,
+                                         int width,
+                                         uint32_t value) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    pmulhuw    xmm0, xmm2  // argb * value
+    pmulhuw    xmm1, xmm2  // argb * value
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
@@ -4814,28 +4762,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0         // first 2
-    punpckhbw  xmm1, xmm1         // next 2
-    punpcklbw  xmm2, xmm5         // first 2
-    punpckhbw  xmm3, xmm5         // next 2
-    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    punpcklbw  xmm2, xmm5  // first 2
+    punpckhbw  xmm3, xmm5  // next 2
+    pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
     lea        eax, [eax + 16]
     lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
@@ -4853,13 +4802,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked)
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                                       const uint8_t* src_argb1,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -4867,11 +4817,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     jl         convertloop49
 
  convertloop4:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4882,11 +4832,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     jl         convertloop19
 
  convertloop1:
-    movd       xmm0, [eax]        // read 1 pixels from src_argb0
+    movd       xmm0, [eax]  // read 1 pixels from src_argb0
     lea        eax, [eax + 4]
-    movd       xmm1, [esi]        // read 1 pixels from src_argb1
+    movd       xmm1, [esi]  // read 1 pixels from src_argb1
     lea        esi, [esi + 4]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4901,22 +4851,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    psubusb    xmm0, xmm1  // src_argb0 - src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4930,28 +4881,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
+    vpxor      ymm5, ymm5, ymm5  // constant 0
 
  convertloop:
-    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
     lea        esi, [esi + 32]
-    vpunpcklbw ymm0, ymm1, ymm1   // low 4
-    vpunpckhbw ymm1, ymm1, ymm1   // high 4
-    vpunpcklbw ymm2, ymm3, ymm5   // low 4
-    vpunpckhbw ymm3, ymm3, ymm5   // high 4
-    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
-    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpunpcklbw ymm0, ymm1, ymm1  // low 4
+    vpunpckhbw ymm1, ymm1, ymm1  // high 4
+    vpunpcklbw ymm2, ymm3, ymm5  // low 4
+    vpunpckhbw ymm3, ymm3, ymm5  // high 4
+    vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
     vpackuswb  ymm0, ymm0, ymm1
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4967,20 +4919,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                                       const uint8_t* src_argb1,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4996,20 +4949,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -5028,14 +4982,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked)
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
+                                      const uint8_t* src_y1,
+                                      const uint8_t* src_y2,
+                                      uint8_t* dst_sobelx,
+                                      int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y0
-    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        eax, [esp + 8 + 4]  // src_y0
+    mov        esi, [esp + 8 + 8]  // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
     mov        edx, [esp + 8 + 16]  // dst_sobelx
     mov        ecx, [esp + 8 + 20]  // width
@@ -5045,17 +5001,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5063,7 +5019,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
@@ -5084,13 +5040,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked)
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
+                                      const uint8_t* src_y1,
+                                      uint8_t* dst_sobely,
+                                      int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_y0
-    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        eax, [esp + 4 + 4]  // src_y0
+    mov        esi, [esp + 4 + 8]  // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5098,17 +5055,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5116,7 +5073,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
@@ -5137,36 +5094,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked)
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
+__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
+                                     const uint8_t* src_sobely,
+                                     uint8_t* dst_argb,
+                                     int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
-    pslld      xmm5, 24             // 0xff000000
+    pcmpeqb    xmm5, xmm5  // alpha 255
+    pslld      xmm5, 24  // 0xff000000
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm2, xmm0             // GG
-    punpcklbw  xmm2, xmm0             // First 8
-    punpckhbw  xmm0, xmm0             // Next 8
-    movdqa     xmm1, xmm2             // GGGG
-    punpcklwd  xmm1, xmm2             // First 4
-    punpckhwd  xmm2, xmm2             // Next 4
-    por        xmm1, xmm5             // GGGA
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0  // GG
+    punpcklbw  xmm2, xmm0  // First 8
+    punpckhbw  xmm0, xmm0  // Next 8
+    movdqa     xmm1, xmm2  // GGGG
+    punpcklwd  xmm1, xmm2  // First 4
+    punpckhwd  xmm2, xmm2  // Next 4
+    por        xmm1, xmm5  // GGGA
     por        xmm2, xmm5
-    movdqa     xmm3, xmm0             // GGGG
-    punpcklwd  xmm3, xmm0             // Next 4
-    punpckhwd  xmm0, xmm0             // Last 4
-    por        xmm3, xmm5             // GGGA
+    movdqa     xmm3, xmm0  // GGGG
+    punpcklwd  xmm3, xmm0  // Next 4
+    punpckhwd  xmm0, xmm0  // Last 4
+    por        xmm3, xmm5  // GGGA
     por        xmm0, xmm5
     movdqu     [edx], xmm1
     movdqu     [edx + 16], xmm2
@@ -5184,22 +5142,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked)
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                                            const uint8_t* src_sobely,
+                                            uint8_t* dst_y,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -5217,36 +5176,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked)
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                                       const uint8_t* src_sobely,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
+    pcmpeqb    xmm5, xmm5  // alpha 255
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     movdqa     xmm2, xmm0
-    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm3, xmm0             // XA
+    paddusb    xmm2, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0  // XA
     punpcklbw  xmm3, xmm5
     punpckhbw  xmm0, xmm5
-    movdqa     xmm4, xmm1             // YS
+    movdqa     xmm4, xmm1  // YS
     punpcklbw  xmm4, xmm2
     punpckhbw  xmm1, xmm2
-    movdqa     xmm6, xmm4             // YSXA
-    punpcklwd  xmm6, xmm3             // First 4
-    punpckhwd  xmm4, xmm3             // Next 4
-    movdqa     xmm7, xmm1             // YSXA
-    punpcklwd  xmm7, xmm0             // Next 4
-    punpckhwd  xmm1, xmm0             // Last 4
+    movdqa     xmm6, xmm4  // YSXA
+    punpcklwd  xmm6, xmm3  // First 4
+    punpckhwd  xmm4, xmm3  // Next 4
+    movdqa     xmm7, xmm1  // YSXA
+    punpcklwd  xmm7, xmm0  // Next 4
+    punpckhwd  xmm1, xmm0  // Last 4
     movdqu     [edx], xmm6
     movdqu     [edx + 16], xmm4
     movdqu     [edx + 32], xmm7
@@ -5275,8 +5235,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // count is number of averaged pixels to produce.
 // Does 4 pixels at a time.
 // This function requires alignment on accumulation buffer pointers.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
                                     int count) {
   __asm {
     mov        eax, topleft  // eax topleft
@@ -5294,18 +5257,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     cmp        area, 128  // 128 pixels will not overflow 15 bits.
     ja         l4
 
-    pshufd     xmm5, xmm5, 0        // area
-    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    pshufd     xmm5, xmm5, 0  // area
+    pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
     psrld      xmm6, 16
     cvtdq2ps   xmm6, xmm6
-    addps      xmm5, xmm6           // (65536.0 + area - 1)
-    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
-    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
-    packssdw   xmm5, xmm5           // 16 bit shorts
+    addps      xmm5, xmm6  // (65536.0 + area - 1)
+    mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5  // 0.16 fixed point
+    packssdw   xmm5, xmm5  // 16 bit shorts
 
-    // 4 pixel loop small blocks.
+        // 4 pixel loop small blocks.
   s4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5345,9 +5308,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 
     jmp        l4b
 
-    // 4 pixel loop
+            // 4 pixel loop
   l4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5373,7 +5336,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     paddd      xmm3, [esi + edx * 4 + 48]
     lea        esi, [esi + 64]
 
-    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
     cvtdq2ps   xmm1, xmm1
     mulps      xmm0, xmm4
     mulps      xmm1, xmm4
@@ -5397,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+        // 1 pixel loop
   l1:
     movdqu     xmm0, [eax]
     psubd      xmm0, [eax + edx * 4]
@@ -5422,8 +5385,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width) {
   __asm {
     mov        eax, row
     mov        edx, cumsum
@@ -5437,7 +5402,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     test       edx, 15
     jne        l4b
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
     lea        eax, [eax + 16]
@@ -5483,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+        // 1 pixel loop
   l1:
     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
     lea        eax, [eax + 4]
@@ -5505,10 +5470,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked)
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width) {
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                                                     int src_argb_stride,
+                                                     uint8_t* dst_argb,
+                                                     const float* uv_dudv,
+                                                     int width) {
   __asm {
     push       esi
     push       edi
@@ -5519,46 +5485,46 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     movq       xmm2, qword ptr [ecx]  // uv
     movq       xmm7, qword ptr [ecx + 8]  // dudv
     mov        ecx, [esp + 28]  // width
-    shl        esi, 16          // 4, stride
+    shl        esi, 16  // 4, stride
     add        esi, 4
     movd       xmm5, esi
     sub        ecx, 4
     jl         l4b
 
-    // setup for 4 pixel loop
+        // setup for 4 pixel loop
     pshufd     xmm7, xmm7, 0x44  // dup dudv
     pshufd     xmm5, xmm5, 0  // dup 4, stride
-    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    movdqa     xmm0, xmm2  // x0, y0, x1, y1
     addps      xmm0, xmm7
     movlhps    xmm2, xmm0
     movdqa     xmm4, xmm7
-    addps      xmm4, xmm4    // dudv *= 2
-    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm4, xmm4  // dudv *= 2
+    movdqa     xmm3, xmm2  // x2, y2, x3, y3
     addps      xmm3, xmm4
-    addps      xmm4, xmm4    // dudv *= 4
+    addps      xmm4, xmm4  // dudv *= 4
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
-    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
-    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
-    packssdw   xmm0, xmm1    // x, y as 8 shorts
-    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    cvttps2dq  xmm0, xmm2  // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3  // x, y float to int next 2
+    packssdw   xmm0, xmm1  // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       xmm1, [eax + esi]  // read pixel 0
     movd       xmm6, [eax + edi]  // read pixel 1
-    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
-    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    punpckldq  xmm1, xmm6  // combine pixel 0 and 1
+    addps      xmm2, xmm4  // x, y += dx, dy first 2
     movq       qword ptr [edx], xmm1
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
     movd       xmm6, [eax + esi]  // read pixel 2
     movd       xmm0, [eax + edi]  // read pixel 3
-    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
-    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    punpckldq  xmm6, xmm0  // combine pixel 2 and 3
+    addps      xmm3, xmm4  // x, y += dx, dy next 2
     movq       qword ptr 8[edx], xmm6
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -5568,12 +5534,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+        // 1 pixel loop
   l1:
-    cvttps2dq  xmm0, xmm2    // x, y float to int
-    packssdw   xmm0, xmm0    // x, y as shorts
-    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
-    addps      xmm2, xmm7    // x, y += dx, dy
+    cvttps2dq  xmm0, xmm2  // x, y float to int
+    packssdw   xmm0, xmm0  // x, y as shorts
+    pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
+    addps      xmm2, xmm7  // x, y += dx, dy
     movd       esi, xmm0
     movd       xmm0, [eax + esi]  // copy a pixel
     movd       [edx], xmm0
@@ -5590,15 +5556,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-__declspec(naked)
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
+__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                                           const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           int dst_width,
+                                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
@@ -5607,7 +5574,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     je         xloop100  // 0 / 256.  Blend 100 / 0.
     sub        edi, esi
     cmp        eax, 128
-    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 /256 is 0.50.  Blend 50 / 50.
 
     vmovd      xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5634,14 +5601,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     vpaddw     ymm0, ymm0, ymm4
     vpsrlw     ymm1, ymm1, 8
     vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vpackuswb  ymm0, ymm0, ymm1            // unmutates
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
     sub        ecx, 32
     jg         xloop
     jmp        xloop99
 
-   // Blend 50 / 50.
+        // Blend 50 / 50.
  xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
@@ -5651,7 +5618,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop50
    jmp        xloop99
 
-   // Blend 100 / 0 - Copy row unchanged.
+        // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb
 
@@ -5666,25 +5633,26 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
 
 // Bilinear filter 16x2 -> 16x1
 // TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked)
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                                            const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            int dst_width,
+                                            int source_y_fraction) {
   __asm {
     push       esi
     push       edi
 
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
-    // Dispatch to specialized filters if applicable.
+        // Dispatch to specialized filters if applicable.
     cmp        eax, 0
     je         xloop100  // 0 /256.  Blend 100 / 0.
     cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
 
     movd       xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5703,7 +5671,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm0, xmm4            // bias image by -128
     psubb      xmm1, xmm4
     movdqa     xmm2, xmm5
     movdqa     xmm3, xmm5
@@ -5720,7 +5688,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     jg         xloop
     jmp        xloop99
 
-    // Blend 50 / 50.
+        // Blend 50 / 50.
   xloop50:
     movdqu     xmm0, [esi]
     movdqu     xmm1, [esi + edx]
@@ -5731,7 +5699,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     jg         xloop50
     jmp        xloop99
 
-    // Blend 100 / 0 - Copy row unchanged.
+        // Blend 100 / 0 - Copy row unchanged.
   xloop100:
     movdqu     xmm0, [esi]
     movdqu     [esi + edi], xmm0
@@ -5747,15 +5715,16 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked)
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                                            uint8_t* dst_argb,
+                                            const uint8_t* shuffler,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
     movdqu     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // width
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -5773,15 +5742,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 }
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                                           uint8_t* dst_argb,
+                                           const uint8_t* shuffler,
+                                           int width) {
   __asm {
-    mov        eax, [esp + 4]     // src_argb
-    mov        edx, [esp + 8]     // dst_argb
-    mov        ecx, [esp + 12]    // shuffler
-    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
-    mov        ecx, [esp + 16]    // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
+    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     vmovdqu    ymm0, [eax]
@@ -5801,152 +5771,36 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked)
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  __asm {
-    push       ebx
-    push       esi
-    mov        eax, [esp + 8 + 4]    // src_argb
-    mov        edx, [esp + 8 + 8]    // dst_argb
-    mov        esi, [esp + 8 + 12]   // shuffler
-    mov        ecx, [esp + 8 + 16]   // width
-    pxor       xmm5, xmm5
-
-    mov        ebx, [esi]   // shuffler
-    cmp        ebx, 0x03000102
-    je         shuf_3012
-    cmp        ebx, 0x00010203
-    je         shuf_0123
-    cmp        ebx, 0x00030201
-    je         shuf_0321
-    cmp        ebx, 0x02010003
-    je         shuf_2103
-
-  // TODO(fbarchard): Use one source pointer and 3 offsets.
-  shuf_any1:
-    movzx      ebx, byte ptr [esi]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx], bl
-    movzx      ebx, byte ptr [esi + 1]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 1], bl
-    movzx      ebx, byte ptr [esi + 2]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 2], bl
-    movzx      ebx, byte ptr [esi + 3]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 3], bl
-    lea        eax, [eax + 4]
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jg         shuf_any1
-    jmp        shuf99
-
-  shuf_0123:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
-    pshuflw    xmm0, xmm0, 01Bh
-    pshufhw    xmm1, xmm1, 01Bh
-    pshuflw    xmm1, xmm1, 01Bh
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0123
-    jmp        shuf99
-
-  shuf_0321:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
-    pshuflw    xmm0, xmm0, 039h
-    pshufhw    xmm1, xmm1, 039h
-    pshuflw    xmm1, xmm1, 039h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0321
-    jmp        shuf99
-
-  shuf_2103:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
-    pshuflw    xmm0, xmm0, 093h
-    pshufhw    xmm1, xmm1, 093h
-    pshuflw    xmm1, xmm1, 093h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_2103
-    jmp        shuf99
-
-  shuf_3012:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
-    pshuflw    xmm0, xmm0, 0C6h
-    pshufhw    xmm1, xmm1, 0C6h
-    pshuflw    xmm1, xmm1, 0C6h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_3012
-
-  shuf99:
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
 
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-__declspec(naked)
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                                          const uint8_t* src_u,
+                                          const uint8_t* src_v,
+                                          uint8_t* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2 // YUYV
+    punpcklbw  xmm0, xmm2  // YUYV
     punpckhbw  xmm1, xmm2
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm1
@@ -5960,30 +5814,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
   }
 }
 
-__declspec(naked)
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                                          const uint8_t* src_u,
+                                          const uint8_t* src_v,
+                                          uint8_t* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     movdqa     xmm1, xmm2
     lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0 // UYVY
+    punpcklbw  xmm1, xmm0  // UYVY
     punpckhbw  xmm2, xmm0
     movdqu     [edi], xmm1
     movdqu     [edi + 16], xmm2
@@ -5998,22 +5852,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked)
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* src_argb */
-    mov        edx, [esp + 4 + 8]   /* dst_argb */
-    mov        esi, [esp + 4 + 12]  /* poly */
-    mov        ecx, [esp + 4 + 16]  /* width */
+    mov        eax, [esp + 4 + 4] /* src_argb */
+    mov        edx, [esp + 4 + 8] /* dst_argb */
+    mov        esi, [esp + 4 + 12] /* poly */
+    mov        ecx, [esp + 4 + 16] /* width */
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
-    // 2 pixel loop.
+        // 2 pixel loop.
  convertloop:
-//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+        //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+        //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
     movq       xmm0, qword ptr [eax]  // BGRABGRA
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
@@ -6057,25 +5911,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked)
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]   /* poly */
-    vbroadcastf128 ymm4, [ecx]       // C0
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* poly */
+    vbroadcastf128 ymm4, [ecx]  // C0
     vbroadcastf128 ymm5, [ecx + 16]  // C1
     vbroadcastf128 ymm6, [ecx + 32]  // C2
     vbroadcastf128 ymm7, [ecx + 48]  // C3
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
     // 2 pixel loop.
  convertloop:
     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
     lea         eax, [eax + 8]
-    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vcvtdq2ps   ymm0, ymm0  // X 8 floats
     vmulps      ymm2, ymm0, ymm0  // X * X
     vmulps      ymm3, ymm0, ymm7  // C3 * X
     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
@@ -6095,16 +5949,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+    sub        edx, eax
+
+        // 8 pixel loop.
+ convertloop:
+    movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
+    add         eax, 16
+    movdqa      xmm3, xmm2
+    punpcklwd   xmm2, xmm5
+    cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd   xmm3, xmm5
+    cvtdq2ps    xmm3, xmm3
+    mulps       xmm2, xmm4
+    mulps       xmm3, xmm4
+    psrld       xmm2, 13
+    psrld       xmm3, 13
+    packssdw    xmm2, xmm3
+    movdqu      [eax + edx - 16], xmm2
+    sub         ecx, 8
+    jg          convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
+#ifdef HAS_HALFFLOATROW_AVX2
+__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+
+    vmulss     xmm4, xmm4, kExpBias
+    vbroadcastss ymm4, xmm4
+    vpxor      ymm5, ymm5, ymm5
+    sub        edx, eax
+
+        // 16 pixel loop.
+ convertloop:
+    vmovdqu     ymm2, [eax]  // 16 shorts
+    add         eax, 32
+    vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
+    vpunpcklwd  ymm2, ymm2, ymm5
+    vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
+    vcvtdq2ps   ymm2, ymm2
+    vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
+    vmulps      ymm2, ymm2, ymm4
+    vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
+    vpsrld      ymm2, ymm2, 13
+    vpackssdw   ymm2, ymm2, ymm3
+    vmovdqu     [eax + edx - 32], ymm2
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    vbroadcastss ymm4, [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    sub        edx, eax
+
+        // 16 pixel loop.
+ convertloop:
+    vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
+    add         eax, 32
+    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps   ymm3, ymm3
+    vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps      ymm3, ymm3, ymm4
+    vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph   xmm3, ymm3, 3
+    vmovdqu     [eax + edx + 32], xmm2
+    vmovdqu     [eax + edx + 32 + 16], xmm3
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
-__declspec(naked)
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
+__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                                             const uint8_t* table_argb,
+                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6131,13 +6094,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Tranform RGB pixels with color table.
-__declspec(naked)
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
+                                            const uint8_t* table_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6162,27 +6126,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Tranform RGB pixels with luma table.
-__declspec(naked)
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                                   uint8_t* dst_argb,
+                                                   int width,
+                                                   const uint8_t* luma,
+                                                   uint32_t lumacoeff) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   /* src_argb */
-    mov        edi, [esp + 8 + 8]   /* dst_argb */
-    mov        ecx, [esp + 8 + 12]  /* width */
+    mov        eax, [esp + 8 + 4] /* src_argb */
+    mov        edi, [esp + 8 + 8] /* dst_argb */
+    mov        ecx, [esp + 8 + 12] /* width */
     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
     pshufd     xmm2, xmm2, 0
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
     psllw      xmm4, 8
     pxor       xmm5, xmm5
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop:
-    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
+    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
     pmaddubsw  xmm0, xmm3
     phaddw     xmm0, xmm0
     pand       xmm0, xmm4  // mask out low bits
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale.cc
index 36e3fe5281..2cfa1c6cb1 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale.cc
@@ -33,17 +33,25 @@ static __inline int Abs(int v) {
 // This is an optimized version for scaling down a plane to 1/2 of
 // its original size.
 
-static void ScalePlaneDown2(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown2(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C :
-      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleRowDown2_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
+                                        : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -51,46 +59,63 @@ static void ScalePlaneDown2(int src_width, int src_height,
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
-        ScaleRowDown2Box_Any_NEON);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
+                                          : ScaleRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
-          ScaleRowDown2Box_NEON);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_NEON
+                                                      : ScaleRowDown2Box_NEON);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
-        ScaleRowDown2Box_Any_SSSE3);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_SSSE3
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
+                                          : ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
-          ScaleRowDown2Box_SSSE3);
+      ScaleRowDown2 =
+          filtering == kFilterNone
+              ? ScaleRowDown2_SSSE3
+              : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
+                                            : ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
-        ScaleRowDown2Box_Any_AVX2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_AVX2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
+                                          : ScaleRowDown2Box_Any_AVX2);
     if (IS_ALIGNED(dst_width, 32)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
-          ScaleRowDown2Box_AVX2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_AVX2
+                                                      : ScaleRowDown2Box_AVX2);
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+#if defined(HAS_SCALEROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+                                          : ScaleRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_MSA
+                                                      : ScaleRowDown2Box_MSA);
+    }
   }
 #endif
 
@@ -105,18 +130,25 @@ static void ScalePlaneDown2(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown2_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown2_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
-    filtering == kFilterNone ? ScaleRowDown2_16_C :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
-        ScaleRowDown2Box_16_C);
+  void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                        uint16_t* dst_ptr, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleRowDown2_16_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
+                                        : ScaleRowDown2Box_16_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -124,23 +156,17 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
 
 #if defined(HAS_SCALEROWDOWN2_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
-        ScaleRowDown2_16_NEON;
+    ScaleRowDown2 =
+        filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-        ScaleRowDown2Box_16_SSE2);
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_16_SSE2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
+                                          : ScaleRowDown2Box_16_SSE2);
   }
 #endif
 
@@ -159,24 +185,30 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
 // This is an optimized version for scaling down a plane to 1/4 of
 // its original size.
 
-static void ScalePlaneDown4(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown4(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
+  void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
     }
@@ -184,8 +216,8 @@ static void ScalePlaneDown4(int src_width, int src_height,
 #endif
 #if defined(HAS_SCALEROWDOWN4_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
     }
@@ -193,19 +225,20 @@ static void ScalePlaneDown4(int src_width, int src_height,
 #endif
 #if defined(HAS_SCALEROWDOWN4_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
     if (IS_ALIGNED(dst_width, 16)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+#if defined(HAS_SCALEROWDOWN4_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+    }
   }
 #endif
 
@@ -219,38 +252,36 @@ static void ScalePlaneDown4(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown4_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown4_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
+  void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                        uint16_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
   }
 #if defined(HAS_SCALEROWDOWN4_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
-        ScaleRowDown4_16_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN4_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
-        ScaleRowDown4_16_SSE2;
-  }
-#endif
-#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
   }
 #endif
 
@@ -265,18 +296,23 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
 }
 
 // Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown34(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_C;
@@ -305,6 +341,26 @@ static void ScalePlaneDown34(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN34_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_MSA;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_MSA;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA;
+    }
+    if (dst_width % 48 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_MSA;
+        ScaleRowDown34_1 = ScaleRowDown34_MSA;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA;
+      }
+    }
+  }
+#endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
@@ -325,19 +381,6 @@ static void ScalePlaneDown34(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -346,8 +389,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -363,17 +405,23 @@ static void ScalePlaneDown34(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown34_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown34_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_16_C;
@@ -404,19 +452,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -425,8 +460,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -442,7 +476,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
   }
 }
 
-
 // Scale plane, 3/8
 // This is an optimized version for scaling down a plane to 3/8
 // of its original size.
@@ -458,18 +491,24 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
 // ggghhhii
 // Boxes are 3x3, 2x3, 3x2 and 2x2
 
-static void ScalePlaneDown38(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown38(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   assert(dst_width % 3 == 0);
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_C;
     ScaleRowDown38_2 = ScaleRowDown38_C;
@@ -517,16 +556,23 @@ static void ScalePlaneDown38(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN38_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+#if defined(HAS_SCALEROWDOWN38_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
     if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
+      ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
     } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_MSA;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+      }
     }
   }
 #endif
@@ -554,17 +600,23 @@ static void ScalePlaneDown38(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneDown38_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown38_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_16_C;
@@ -595,19 +647,6 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -634,8 +673,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
 
 #define MIN1(x) ((x) < 1 ? 1 : (x))
 
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
-  uint32 sum = 0u;
+static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
+  uint32_t sum = 0u;
   int x;
   assert(iboxwidth > 0);
   for (x = 0; x < iboxwidth; ++x) {
@@ -644,8 +683,8 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   return sum;
 }
 
-static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
-  uint32 sum = 0u;
+static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
+  uint32_t sum = 0u;
   int x;
   assert(iboxwidth > 0);
   for (x = 0; x < iboxwidth; ++x) {
@@ -654,8 +693,12 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
   return sum;
 }
 
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols2_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
@@ -666,13 +709,18 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
     int ix = x >> 16;
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth] >> 16;
+    *dst_ptr++ =
+        SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+        16;
   }
 }
 
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols2_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32_t* src_ptr,
+                               uint16_t* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
@@ -684,22 +732,32 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
     *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth]  >> 16;
+                     scaletbl[boxwidth - minboxwidth] >>
+                 16;
   }
 }
 
-static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols0_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int scaleval = 65536 / boxheight;
   int i;
+  (void)dx;
   src_ptr += (x >> 16);
   for (i = 0; i < dst_width; ++i) {
     *dst_ptr++ = src_ptr[i] * scaleval >> 16;
   }
 }
 
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols1_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -710,8 +768,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
   }
 }
 
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols1_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32_t* src_ptr,
+                               uint16_t* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -728,10 +790,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
 // one pixel of destination using fixed point (16.16) to step
 // through source, sampling a box of pixel with simple
 // averaging.
-static void ScalePlaneBox(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneBox(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_ptr,
+                          uint8_t* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -739,18 +805,18 @@ static void ScalePlaneBox(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
-    // Allocate a row buffer of uint16.
+    // Allocate a row buffer of uint16_t.
     align_buffer_64(row16, src_width * 2);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C:
-        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
-    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
-        ScaleAddRow_C;
+                         const uint16_t* src_ptr, uint8_t* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C
+                      : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+    void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
+                        int src_width) = ScaleAddRow_C;
 #if defined(HAS_SCALEADDROW_SSE2)
     if (TestCpuFlag(kCpuHasSSE2)) {
       ScaleAddRow = ScaleAddRow_Any_SSE2;
@@ -775,11 +841,19 @@ static void ScalePlaneBox(int src_width, int src_height,
       }
     }
 #endif
+#if defined(HAS_SCALEADDROW_MSA)
+    if (TestCpuFlag(kCpuHasMSA)) {
+      ScaleAddRow = ScaleAddRow_Any_MSA;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_MSA;
+      }
+    }
+#endif
 
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint8* src = src_ptr + iy * src_stride;
+      const uint8_t* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -787,20 +861,24 @@ static void ScalePlaneBox(int src_width, int src_height,
       boxheight = MIN1((y >> 16) - iy);
       memset(row16, 0, src_width * 2);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        ScaleAddRow(src, (uint16_t*)(row16), src_width);
         src += src_stride;
       }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row16);
   }
 }
 
-static void ScalePlaneBox_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneBox_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16_t* src_ptr,
+                             uint16_t* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -808,17 +886,17 @@ static void ScalePlaneBox_16(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
-    // Allocate a row buffer of uint32.
+    // Allocate a row buffer of uint32_t.
     align_buffer_64(row32, src_width * 4);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint32* src_ptr, uint16* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
-        ScaleAddRow_16_C;
+                         const uint32_t* src_ptr, uint16_t* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
+    void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr,
+                        int src_width) = ScaleAddRow_16_C;
 
 #if defined(HAS_SCALEADDROW_16_SSE2)
     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
@@ -829,7 +907,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint16* src = src_ptr + iy * src_stride;
+      const uint16_t* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -837,10 +915,10 @@ static void ScalePlaneBox_16(int src_width, int src_height,
       boxheight = MIN1((y >> 16) - iy);
       memset(row32, 0, src_width * 4);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        ScaleAddRow(src, (uint32_t*)(row32), src_width);
         src += src_stride;
       }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row32);
@@ -848,10 +926,14 @@ static void ScalePlaneBox_16(int src_width, int src_height,
 }
 
 // Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearDown(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -864,14 +946,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -898,16 +980,15 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
 
-
 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleFilterCols = ScaleFilterCols_SSSE3;
@@ -920,6 +1001,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
       ScaleFilterCols = ScaleFilterCols_NEON;
     }
   }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
 #endif
   if (y > max_y) {
     y = max_y;
@@ -927,7 +1016,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
@@ -944,10 +1033,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
   free_aligned_buffer_64(row);
 }
 
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearDown_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -960,14 +1053,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1002,15 +1095,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
-
 
 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1023,13 +1107,13 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
       int yf = (y >> 8) & 255;
-      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
-      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+      InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
     }
     dst_ptr += dst_stride;
     y += dy;
@@ -1041,10 +1125,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
 }
 
 // Scale up down with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearUp(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_ptr,
+                          uint8_t* dst_ptr,
                           enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1053,14 +1141,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_C : ScaleCols_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -1087,14 +1175,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
-    }
-  }
-#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_C;
@@ -1111,6 +1191,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
       ScaleFilterCols = ScaleFilterCols_NEON;
     }
   }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_C;
@@ -1126,13 +1214,13 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   }
   {
     int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
     const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
-    uint8* rowptr = row;
+    uint8_t* rowptr = row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -1172,10 +1260,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   }
 }
 
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearUp_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16_t* src_ptr,
+                             uint16_t* dst_ptr,
                              enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1184,14 +1276,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1226,14 +1318,6 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_16_C;
@@ -1257,13 +1341,13 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
   }
   {
     int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
     const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 4);
 
-    uint16* rowptr = (uint16*)row;
+    uint16_t* rowptr = (uint16_t*)row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -1308,20 +1392,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScalePlaneSimple(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneSimple(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_C;
+  void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1340,20 +1428,24 @@ static void ScalePlaneSimple(int src_width, int src_height,
   }
 }
 
-static void ScalePlaneSimple_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneSimple_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_16_C;
+  void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_16_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1366,8 +1458,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
-              dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1377,14 +1468,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
 // This function dispatches to a specialized scaler based on scale factor.
 
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1403,46 +1498,42 @@ void ScalePlane(const uint8* src, int src_stride,
   if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       dst_width, dst_height,
-                       src_stride, dst_stride, src, dst,
-                       0, 0, dy, 1, filtering);
+    ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
         (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src, dst);
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+                  dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1455,19 +1546,23 @@ void ScalePlane(const uint8* src, int src_stride,
                            src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst);
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+                   dst_stride, src, dst);
 }
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                  int src_width, int src_height,
-                  uint16* dst, int dst_stride,
-                  int dst_width, int dst_height,
-                  enum FilterMode filtering) {
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1483,19 +1578,16 @@ void ScalePlane_16(const uint16* src, int src_stride,
     CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
     return;
   }
-  if (dst_width == src_width) {
+  if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical_16(src_height,
-                          dst_width, dst_height,
-                          src_stride, dst_stride, src, dst,
-                          0, 0, dy, 1, filtering);
+    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+                          dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
@@ -1508,15 +1600,14 @@ void ScalePlane_16(const uint16* src, int src_stride,
       return;
     }
     // 3/8 rounded up for odd sized chroma height.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
       ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
+        (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
       ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
                          src_stride, dst_stride, src, dst, filtering);
@@ -1524,8 +1615,8 @@ void ScalePlane_16(const uint16* src, int src_stride,
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst);
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+                     dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1538,132 +1629,110 @@ void ScalePlane_16(const uint16* src, int src_stride,
                               src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst);
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst);
 }
 
 // Scale an I420 image.
 // This function in turn calls a scaling function for each plane.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering);
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+             dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+             dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
-                dst_y, dst_stride_y, dst_width, dst_height,
-                filtering);
-  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
-                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-                filtering);
-  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
-                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-                filtering);
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+                dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+                dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
 // Deprecated api
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
           LIBYUV_BOOL interpolate) {
-  return I420Scale(src_y, src_stride_y,
-                   src_u, src_stride_u,
-                   src_v, src_stride_v,
-                   src_width, src_height,
-                   dst_y, dst_stride_y,
-                   dst_u, dst_stride_u,
-                   dst_v, dst_stride_v,
-                   dst_width, dst_height,
-                   interpolate ? kFilterBox : kFilterNone);
-}
-
-// Deprecated api
-LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate) {
-  // Chroma requires offset to multiple of 2.
-  int dst_yoffset_even = dst_yoffset & ~1;
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
-  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
-  const uint8* src_y = src;
-  const uint8* src_u = src + src_width * src_height;
-  const uint8* src_v = src + src_width * src_height +
-                             src_halfwidth * src_halfheight;
-  uint8* dst_y = dst + dst_yoffset_even * dst_width;
-  uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
-      dst_yoffset_even >= dst_height) {
-    return -1;
-  }
-  return I420Scale(src_y, src_width,
-                   src_u, src_halfwidth,
-                   src_v, src_halfwidth,
-                   src_width, src_height,
-                   dst_y, dst_width,
-                   dst_u, dst_halfwidth,
-                   dst_v, dst_halfwidth,
-                   dst_width, aheight,
-                   interpolate ? kFilterBox : kFilterNone);
+  return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                   src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                   dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                   dst_height, interpolate ? kFilterBox : kFilterNone);
 }
 
 #ifdef __cplusplus
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc
index ed76a9e4c0..53ad136404 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc
@@ -20,184 +20,429 @@ extern "C" {
 
 // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
 #define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 int dst_width, int x, int dx) {                               \
-      int n = dst_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
-      }                                                                        \
-      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
-             dst_width & MASK, x + n * dx, dx);                                \
-    }
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+               int dx) {                                                       \
+    int r = dst_width & MASK;                                                  \
+    int n = dst_width & ~MASK;                                                 \
+    if (n > 0) {                                                               \
+      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                   \
+    }                                                                          \
+    TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx);                     \
+  }
 
 #ifdef HAS_SCALEFILTERCOLS_NEON
 CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
 #endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
 #ifdef HAS_SCALEARGBCOLS_NEON
 CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
 #endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
 #ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
-     ScaleARGBFilterCols_C, 4, 3)
+CANY(ScaleARGBFilterCols_Any_NEON,
+     ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C,
+     4,
+     3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+     ScaleARGBFilterCols_MSA,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
 #endif
 #undef CANY
 
 // Fixed scale down.
+// Mask may be non-power of 2, so use MOD
 #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */          \
+    int n = dst_width - r;                                                     \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r);                                      \
+  }
 
 // Fixed scale down for odd source width.  Used by I420Blend subsampling.
 // Since dst_width is (width + 1) / 2, this function scales one less pixel
 // and copies the last pixel.
 #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */    \
+    int n = (dst_width - 1) - r;                                               \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r + 1);                                  \
+  }
 
 #ifdef HAS_SCALEROWDOWN2_SSSE3
 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
-      2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+      ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
-      ScaleRowDown2Linear_C, 2, 1, 31)
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
-      2, 1, 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
-      2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+      ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      31)
 #endif
 #ifdef HAS_SCALEROWDOWN2_NEON
 SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_C, 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+      ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+      ScaleRowDown2Linear_MSA,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+      ScaleRowDown2Box_MSA,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
 #endif
 #ifdef HAS_SCALEROWDOWN4_SSSE3
 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+      ScaleRowDown4Box_SSSE3,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
 #endif
 #ifdef HAS_SCALEROWDOWN4_AVX2
 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
-      4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+      ScaleRowDown4Box_AVX2,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN4_NEON
 SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+      ScaleRowDown4Box_NEON,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+      ScaleRowDown4Box_MSA,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_SSSE3,
+      ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+      ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+      ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
 #endif
 #ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_NEON,
+      ScaleRowDown34_NEON,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+      ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+      ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+      ScaleRowDown34_MSA,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+      ScaleRowDown34_0_Box_MSA,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+      ScaleRowDown34_1_Box_MSA,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      47)
 #endif
 #ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_Any_SSSE3,
+      ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+      ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+      ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      5)
 #endif
 #ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_Any_NEON,
+      ScaleRowDown38_NEON,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+      ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+      ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
+#endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+      ScaleRowDown38_MSA,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+      ScaleRowDown38_3_Box_MSA,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+      ScaleRowDown38_2_Box_MSA,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
 #endif
 
 #ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
-      ScaleARGBRowDown2_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
-      ScaleARGBRowDown2Linear_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
-      ScaleARGBRowDown2Box_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+      ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+      ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+      ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
-      ScaleARGBRowDown2_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
-      ScaleARGBRowDown2Linear_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
-      ScaleARGBRowDown2Box_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2_Any_NEON,
+      ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+      ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+      ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      7)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+      ScaleARGBRowDown2_MSA,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+      ScaleARGBRowDown2Linear_MSA,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+      ScaleARGBRowDown2Box_MSA,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
 #endif
 #undef SDANY
 
 // Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
-                     src_stepx, dst_ptr + n * BPP, r);                         \
-    }
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)       \
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+               uint8_t* dst_ptr, int dst_width) {                           \
+    int r = dst_width & MASK;                                               \
+    int n = dst_width & ~MASK;                                              \
+    if (n > 0) {                                                            \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);        \
+    }                                                                       \
+    SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx,  \
+                   dst_ptr + n * BPP, r);                                   \
+  }
 
 #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+       ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+       ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+       ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+       ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+       ScaleARGBRowDownEven_MSA,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+       ScaleARGBRowDownEvenBox_MSA,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
 
 // Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
-  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
-      int n = src_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
-      }                                                                        \
-      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
-    }
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)              \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+    int n = src_width & ~MASK;                                             \
+    if (n > 0) {                                                           \
+      SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                               \
+    }                                                                      \
+    SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);             \
+  }
 
 #ifdef HAS_SCALEADDROW_SSE2
 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@@ -208,14 +453,12 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
 #ifdef HAS_SCALEADDROW_NEON
 SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
 #undef SAANY
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc
index 17f51ae9bf..53a22e8b41 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc
@@ -30,20 +30,31 @@ static __inline int Abs(int v) {
 // ScaleARGB ARGB, 1/2
 // This is an optimized version for scaling down a ARGB to 1/2 of
 // its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_argb, uint8* dst_argb,
-                           int x, int dx, int y, int dy,
+static void ScaleARGBDown2(int src_width,
+                           int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int x,
+                           int dx,
+                           int y,
+                           int dy,
                            enum FilterMode filtering) {
   int j;
   int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) =
-    filtering == kFilterNone ? ScaleARGBRowDown2_C :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
-        ScaleARGBRowDown2Box_C);
-  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleARGBRowDown2_C
+          : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+                                        : ScaleARGBRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row, even column.
   if (filtering == kFilterBilinear) {
@@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height,
 
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
-        ScaleARGBRowDown2Box_Any_SSE2);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_SSE2
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+                                          : ScaleARGBRowDown2Box_Any_SSE2);
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-          ScaleARGBRowDown2Box_SSE2);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_SSE2
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+                                            : ScaleARGBRowDown2Box_SSE2);
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
-        ScaleARGBRowDown2Box_Any_NEON);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+                                          : ScaleARGBRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
-          ScaleARGBRowDown2Box_NEON);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+                                            : ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+                                          : ScaleARGBRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_MSA
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+                                            : ScaleARGBRowDown2Box_MSA);
     }
   }
 #endif
@@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height,
 // ScaleARGB ARGB, 1/4
 // This is an optimized version for scaling down a ARGB to 1/4 of
 // its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy) {
+static void ScaleARGBDown4Box(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy) {
   int j;
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
   align_buffer_64(row, kRowSize * 2);
   int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      ScaleARGBRowDown2Box_C;
   // Advance to odd row, even column.
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 4);      // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
 
   for (j = 0; j < dst_height; ++j) {
     ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
-    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
-                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+                      dst_width * 2);
     ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
     src_argb += row_stride;
     dst_argb += dst_stride;
@@ -137,38 +183,57 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
 // ScaleARGB ARGB Even
 // This is an optimized version for scaling down a ARGB to even
 // multiple of its original size.
-static void ScaleARGBDownEven(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy,
+static void ScaleARGBDownEven(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy,
                               enum FilterMode filtering) {
   int j;
   int col_step = dx >> 16;
   int row_stride = (dy >> 16) * src_stride;
-  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_step, uint8* dst_argb, int dst_width) =
+  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8_t* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  (void)src_width;
+  (void)src_height;
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
-        ScaleARGBRowDownEven_Any_SSE2;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+                                     : ScaleARGBRowDownEven_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-          ScaleARGBRowDownEven_SSE2;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
-        ScaleARGBRowDownEven_Any_NEON;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+                                     : ScaleARGBRowDownEven_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-          ScaleARGBRowDownEven_NEON;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+                                     : ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
     }
   }
 #endif
@@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height,
 }
 
 // Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  int src_stride, int dst_stride,
-                                  const uint8* src_argb, uint8* dst_argb,
-                                  int x, int dx, int y, int dy,
+static void ScaleARGBBilinearDown(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  int src_stride,
+                                  int dst_stride,
+                                  const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  int x,
+                                  int dx,
+                                  int y,
+                                  int dy,
                                   enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
-  int64 xlast = x + (int64)(dst_width - 1) * dx;
-  int64 xl = (dx >= 0) ? x : xlast;
-  int64 xr = (dx >= 0) ? xlast : x;
+  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+  int64_t xl = (dx >= 0) ? x : xlast;
+  int64_t xr = (dx >= 0) ? xlast : x;
   int clip_src_width;
-  xl = (xl >> 16) & ~3;  // Left edge aligned.
-  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xl = (xl >> 16) & ~3;    // Left edge aligned.
+  xr = (xr >> 16) + 1;     // Right most pixel used.  Bilinear uses 2 pixels.
   xr = (xr + 1 + 3) & ~3;  // 1 beyond 4 pixel aligned right most pixel.
   if (xr > src_width) {
     xr = src_width;
@@ -234,12 +306,11 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
@@ -255,6 +326,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
       ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
 #endif
   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   // Allocate a row of ARGB.
@@ -267,7 +346,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
     }
     for (j = 0; j < dst_height; ++j) {
       int yi = y >> 16;
-      const uint8* src = src_argb + yi * src_stride;
+      const uint8_t* src = src_argb + yi * src_stride;
       if (filtering == kFilterLinear) {
         ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
       } else {
@@ -286,18 +365,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
 }
 
 // Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint8* src_argb, uint8* dst_argb,
-                                int x, int dx, int y, int dy,
+static void ScaleARGBBilinearUp(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint8_t* src_argb,
+                                uint8_t* dst_argb,
+                                int x,
+                                int dx,
+                                int y,
+                                int dy,
                                 enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   const int max_y = (src_height - 1) << 16;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -324,15 +410,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -347,6 +435,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -359,6 +455,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
       ScaleARGBFilterCols = ScaleARGBCols_NEON;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -375,13 +479,13 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
 
   {
     int yi = y >> 16;
-    const uint8* src = src_argb + yi * src_stride;
+    const uint8_t* src = src_argb + yi * src_stride;
 
     // Allocate 2 rows of ARGB.
     const int kRowSize = (dst_width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
-    uint8* rowptr = row;
+    uint8_t* rowptr = row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -423,24 +527,27 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
 
 #ifdef YUVSCALEUP
 // Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
-                                     int dst_width, int dst_height,
+static void ScaleYUVToARGBBilinearUp(int src_width,
+                                     int src_height,
+                                     int dst_width,
+                                     int dst_height,
                                      int src_stride_y,
                                      int src_stride_u,
                                      int src_stride_v,
                                      int dst_stride_argb,
-                                     const uint8* src_y,
-                                     const uint8* src_u,
-                                     const uint8* src_v,
-                                     uint8* dst_argb,
-                                     int x, int dx, int y, int dy,
+                                     const uint8_t* src_y,
+                                     const uint8_t* src_u,
+                                     const uint8_t* src_v,
+                                     uint8_t* dst_argb,
+                                     int x,
+                                     int dx,
+                                     int y,
+                                     int dy,
                                      enum FilterMode filtering) {
   int j;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
+      I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -465,19 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -502,19 +608,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
 
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -529,6 +637,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -541,6 +657,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
       ScaleARGBFilterCols = ScaleARGBCols_NEON;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -558,9 +682,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
   int yi = y >> 16;
   int uv_yi = yi >> kYShift;
-  const uint8* src_row_y = src_y + yi * src_stride_y;
-  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
-  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+  const uint8_t* src_row_y = src_y + yi * src_stride_y;
+  const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
 
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 4 + 31) & ~31;
@@ -569,7 +693,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
   // Allocate 1 row of ARGB for source conversion.
   align_buffer_64(argb_row, src_width * 4);
 
-  uint8* rowptr = row;
+  uint8_t* rowptr = row;
   int rowstride = kRowSize;
   int lasty = yi;
 
@@ -635,15 +759,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScaleARGBSimple(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_argb, uint8* dst_argb,
-                            int x, int dx, int y, int dy) {
+static void ScaleARGBSimple(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
   int j;
-  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                        int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+  (void)src_height;
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBCols = ScaleARGBCols_SSE2;
@@ -656,6 +788,14 @@ static void ScaleARGBSimple(int src_width, int src_height,
       ScaleARGBCols = ScaleARGBCols_NEON;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_MSA;
+    }
+  }
 #endif
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBCols = ScaleARGBColsUp2_C;
@@ -667,8 +807,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
   }
 
   for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
-                  dst_width, x, dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+                  dx);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -677,11 +817,18 @@ static void ScaleARGBSimple(int src_width, int src_height,
 // ScaleARGB a ARGB.
 // This function in turn calls a scaling function
 // suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
-                      int src_width, int src_height,
-                      uint8* dst, int dst_stride,
-                      int dst_width, int dst_height,
-                      int clip_x, int clip_y, int clip_width, int clip_height,
+static void ScaleARGB(const uint8_t* src,
+                      int src_stride,
+                      int src_width,
+                      int src_height,
+                      uint8_t* dst,
+                      int dst_stride,
+                      int dst_width,
+                      int dst_height,
+                      int clip_x,
+                      int clip_y,
+                      int clip_width,
+                      int clip_height,
                       enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -690,8 +837,7 @@ static void ScaleARGB(const uint8* src, int src_stride,
   int dy = 0;
   // ARGB does not support box filter yet, but allow the user to pass it.
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
 
   // Negative src_height means invert the image.
@@ -700,17 +846,17 @@ static void ScaleARGB(const uint8* src, int src_stride,
     src = src + (src_height - 1) * src_stride;
     src_stride = -src_stride;
   }
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   if (clip_x) {
-    int64 clipf = (int64)(clip_x) * dx;
+    int64_t clipf = (int64_t)(clip_x)*dx;
     x += (clipf & 0xffff);
     src += (clipf >> 16) * 4;
     dst += clip_x * 4;
   }
   if (clip_y) {
-    int64 clipf = (int64)(clip_y) * dy;
+    int64_t clipf = (int64_t)(clip_y)*dy;
     y += (clipf & 0xffff);
     src += (clipf >> 16) * src_stride;
     dst += clip_y * dst_stride;
@@ -725,24 +871,20 @@ static void ScaleARGB(const uint8* src, int src_stride,
       if (!(dx & 0x10000) && !(dy & 0x10000)) {
         if (dx == 0x20000) {
           // Optimized 1/2 downsample.
-          ScaleARGBDown2(src_width, src_height,
-                         clip_width, clip_height,
-                         src_stride, dst_stride, src, dst,
-                         x, dx, y, dy, filtering);
+          ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+                         src_stride, dst_stride, src, dst, x, dx, y, dy,
+                         filtering);
           return;
         }
         if (dx == 0x40000 && filtering == kFilterBox) {
           // Optimized 1/4 box downsample.
-          ScaleARGBDown4Box(src_width, src_height,
-                            clip_width, clip_height,
-                            src_stride, dst_stride, src, dst,
-                            x, dx, y, dy);
+          ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+                            src_stride, dst_stride, src, dst, x, dx, y, dy);
           return;
         }
-        ScaleARGBDownEven(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+        ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
         return;
       }
       // Optimized odd scale down. ie 3, 5, 7, 9x.
@@ -759,96 +901,105 @@ static void ScaleARGB(const uint8* src, int src_stride,
   }
   if (dx == 0x10000 && (x & 0xffff) == 0) {
     // Arbitrary scale vertically, but unscaled vertically.
-    ScalePlaneVertical(src_height,
-                       clip_width, clip_height,
-                       src_stride, dst_stride, src, dst,
-                       x, y, dy, 4, filtering);
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, 4, filtering);
     return;
   }
   if (filtering && dy < 65536) {
-    ScaleARGBBilinearUp(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy, filtering);
+    ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
     return;
   }
   if (filtering) {
-    ScaleARGBBilinearDown(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+    ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
     return;
   }
-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                  src_stride, dst_stride, src, dst,
-                  x, dx, y, dy);
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                  dst_stride, src, dst, x, dx, y, dy);
 }
 
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
-      clip_x < 0 || clip_y < 0 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
       clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            clip_x, clip_y, clip_width, clip_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+            clip_height, filtering);
   return 0;
 }
 
 // Scale an ARGB image.
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+  if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            0, 0, dst_width, dst_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+            filtering);
   return 0;
 }
 
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering) {
-  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+  uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
   int r;
-  I420ToARGB(src_y, src_stride_y,
-             src_u, src_stride_u,
-             src_v, src_stride_v,
-             argb_buffer, src_width * 4,
-             src_width, src_height);
+  (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
+  (void)dst_fourcc;
+  I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             argb_buffer, src_width * 4, src_width, src_height);
 
-  r = ARGBScaleClip(argb_buffer, src_width * 4,
-                    src_width, src_height,
-                    dst_argb, dst_stride_argb,
-                    dst_width, dst_height,
-                    clip_x, clip_y, clip_width, clip_height,
-                    filtering);
+  r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+                    dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+                    clip_width, clip_height, filtering);
   free(argb_buffer);
   return r;
 }
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc
index 3507aa4d9f..b28d7da41f 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc
@@ -28,9 +28,12 @@ static __inline int Abs(int v) {
 }
 
 // CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  const uint8_t* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  const uint16_t* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -86,10 +98,12 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -103,10 +117,12 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   dst_width -= 1;
   for (x = 0; x < dst_width - 1; x += 2) {
@@ -125,10 +141,12 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
   dst[0] = (s[0] + t[0] + 1) >> 1;
 }
 
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -269,19 +303,21 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 * 3 + b0 + 2) >> 2;
     d[1] = (a1 * 3 + b1 + 2) >> 2;
     d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -291,19 +327,21 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 * 3 + b0 + 2) >> 2;
     d[1] = (a1 * 3 + b1 + 2) >> 2;
     d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -314,19 +352,21 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 + b0 + 1) >> 1;
     d[1] = (a1 + b1 + 1) >> 1;
     d[2] = (a2 + b2 + 1) >> 1;
@@ -336,19 +376,21 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 + b0 + 1) >> 1;
     d[1] = (a1 + b1 + 1) >> 1;
     d[2] = (a2 + b2 + 1) >> 1;
@@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx) {
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -418,16 +476,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
 
 // (1-f)a + fb can be replaced with a + f(b-a)
 #if defined(__arm__) || defined(__aarch64__)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f) \
+  (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 #else
-// inteluses 7 bit math with rounding.
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+  (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
 #endif
 
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -450,12 +511,15 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx) {
+  int64_t x = (int64_t)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -468,7 +532,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
     dst_ptr += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -476,12 +540,15 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
 }
 #undef BLENDER
 
-// Same as 8 bit arm blender but return is cast to uint16
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
-    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+// Same as 8 bit arm blender but return is cast to uint16_t
+#define BLENDER(a, b, f) \
+  (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -504,12 +571,15 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
   }
 }
 
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx) {
+  int64_t x = (int64_t)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -522,7 +592,7 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
     dst_ptr += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -530,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 #undef BLENDER
 
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -543,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -557,100 +633,118 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
 }
 
 // 8x3 -> 3x1
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+                               uint16_t* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
 // 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
   for (x = 0; x < src_width - 1; x += 2) {
@@ -664,7 +758,9 @@ void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
   }
 }
 
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width) {
   int x;
   assert(src_width > 0);
   for (x = 0; x < src_width - 1; x += 2) {
@@ -678,13 +774,14 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
   }
 }
 
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
                          ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
+                         uint8_t* dst_argb,
+                         int dst_width) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[1];
     dst[1] = src[3];
@@ -696,10 +793,12 @@ void ScaleARGBRowDown2_C(const uint8* src_argb,
   }
 }
 
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+                               uint8_t* dst_argb,
+                               int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
     dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
@@ -710,29 +809,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
   }
 }
 
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += 8;
     dst_argb += 4;
   }
 }
 
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  (void)src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
@@ -745,30 +852,38 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
   }
 }
 
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+                               uint8_t* dst_argb,
+                               int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += src_stepx * 4;
     dst_argb += 4;
   }
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
@@ -782,11 +897,14 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
   }
 }
 
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
@@ -801,11 +919,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[1] = dst[0] = src[0];
     src += 1;
@@ -818,23 +941,26 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
 
 // TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+  (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f)                                                 \
+  BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+      BLENDERC(a, b, f, 0)
 
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
@@ -848,23 +974,26 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
   if (dst_width & 1) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }
 
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
@@ -876,10 +1005,10 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
     dst += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }
@@ -889,16 +1018,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
 
 // Scale plane vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering) {
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher bpp.
   int dst_width_bytes = dst_width * bpp;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(bpp >= 1 && bpp <= 4);
@@ -930,13 +1065,11 @@ void ScalePlaneVertical(int src_height,
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
@@ -948,23 +1081,29 @@ void ScalePlaneVertical(int src_height,
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_bytes, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_bytes, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering) {
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher wpp.
   int dst_width_words = dst_width * wpp;
-  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
+  void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(wpp >= 1 && wpp <= 2);
@@ -1003,16 +1142,6 @@ void ScalePlaneVertical_16(int src_height,
       InterpolateRow = InterpolateRow_16_NEON;
     }
   }
-#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
 #endif
   for (j = 0; j < dst_height; ++j) {
     int yi;
@@ -1022,16 +1151,18 @@ void ScalePlaneVertical_16(int src_height,
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_words, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_words, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering) {
   if (src_width < 0) {
     src_width = -src_width;
@@ -1073,22 +1204,26 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div) {
-  return (int)(((int64)(num) << 16) / div);
+  return (int)(((int64_t)(num) << 16) / div);
 }
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv1_C(int num, int div) {
-  return (int)((((int64)(num) << 16) - 0x00010001) /
-                          (div - 1));
+  return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
 }
 
 #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy) {
+                int* x,
+                int* y,
+                int* dx,
+                int* dy) {
   assert(x != NULL);
   assert(y != NULL);
   assert(dx != NULL);
@@ -1120,7 +1255,7 @@ void ScaleSlope(int src_width, int src_height,
       *x = 0;
     }
     if (dst_height <= src_height) {
-      *dy = FixedDiv(src_height,  dst_height);
+      *dy = FixedDiv(src_height, dst_height);
       *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
     } else if (dst_height > 1) {
       *dy = FixedDiv1(src_height, dst_height);
@@ -1153,6 +1288,35 @@ void ScaleSlope(int src_width, int src_height,
 }
 #undef CENTERSTART
 
+// Read 8x2 upsample with filtering and write 16x1.
+// actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16_t* dst,
+                      int dst_width) {
+  const uint16_t* src2 = src_ptr + src_stride;
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    uint16_t p0 = src_ptr[0];
+    uint16_t p1 = src_ptr[1];
+    uint16_t p2 = src2[0];
+    uint16_t p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+    ++src_ptr;
+    ++src2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    uint16_t p0 = src_ptr[0];
+    uint16_t p1 = src_ptr[1];
+    uint16_t p2 = src2[0];
+    uint16_t p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc
index e2f88544b7..312236d2df 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc
@@ -21,1296 +21,1348 @@ extern "C" {
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
 
 // GCC versions of row functions are verbatim conversions from Visual C.
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "pxor       %%xmm5,%%xmm5                  \n"
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pavgw      %%xmm5,%%xmm0                  \n"
-    "pavgw      %%xmm5,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pavgw      %%xmm5,%%xmm0                  \n"
+      "pavgw      %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "pxor       %%xmm5,%%xmm5                  \n"
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    "psrlw      $0x1,%%xmm0                    \n"
-    "psrlw      $0x1,%%xmm1                    \n"
-    "pavgw      %%xmm5,%%xmm0                  \n"
-    "pavgw      %%xmm5,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "psrlw      $0x1,%%xmm0                    \n"
+      "psrlw      $0x1,%%xmm1                    \n"
+      "pavgw      %%xmm5,%%xmm0                  \n"
+      "pavgw      %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
-    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x00(%0,%3,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,1),%%ymm3           \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
+      "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x18,%%xmm5                    \n"
-    "pslld     $0x10,%%xmm5                    \n"
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrld     $0x18,%%xmm5                    \n"
+      "pslld     $0x10,%%xmm5                    \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stridex3;
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                  \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "psllw      $0x3,%%xmm5                    \n"
-    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "movdqa     %%xmm4,%%xmm5                  \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "psllw      $0x3,%%xmm5                    \n"
+      "lea       0x00(%4,%4,2),%3                \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    "phaddw     %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm5,%%xmm0                  \n"
-    "psrlw      $0x4,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "=&r"(stridex3)    // %3
-  : "r"((intptr_t)(src_stride))    // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "movdqu    0x00(%0,%4,2),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,2),%%xmm3            \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "phaddw     %%xmm1,%%xmm0                  \n"
+      "paddw      %%xmm5,%%xmm0                  \n"
+      "psrlw      $0x4,%%xmm0                    \n"
+      "packuswb   %%xmm0,%%xmm0                  \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width),             // %2
+        "=&r"(stridex3)              // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-
 #ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
-    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
-    "sub        $0x10,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
+      "vpslld     $0x10,%%ymm5,%%ymm5            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
-    "sub        $0x10,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"((intptr_t)(src_stride * 3))   // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x00(%0,%3,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,1),%%ymm3           \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vmovdqu    0x00(%0,%3,2),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,2),%%ymm3           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vmovdqu    0x00(%0,%4,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%4,1),%%ymm3           \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),                   // %0
+        "+r"(dst_ptr),                   // %1
+        "+r"(dst_width)                  // %2
+      : "r"((intptr_t)(src_stride)),     // %3
+        "r"((intptr_t)(src_stride * 3))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_SCALEROWDOWN4_AVX2
 
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm3                       \n"
-    "movdqa    %1,%%xmm4                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kShuf0),  // %0
-    "m"(kShuf1),  // %1
-    "m"(kShuf2)   // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "palignr   $0x8,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %0,%%xmm3                       \n"
+      "movdqa    %1,%%xmm4                       \n"
+      "movdqa    %2,%%xmm5                       \n"
+      :
+      : "m"(kShuf0),  // %0
+        "m"(kShuf1),  // %1
+        "m"(kShuf2)   // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm2                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "palignr   $0x8,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm3,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq      %%xmm1,0x8(%1)                  \n"
+      "movq      %%xmm2,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "m"(kMadd21)     // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,(%1)                     \n"
+      "movdqu    0x8(%0),%%xmm6                  \n"
+      "movdqu    0x8(%0,%3,1),%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm0,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x8(%1)                  \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "movdqu    0x10(%0,%3,1),%%xmm7            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm4,%%xmm6                   \n"
+      "pmaddubsw %4,%%xmm6                       \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
 
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-    : "+r"(src_ptr),   // %0
-      "+r"(dst_ptr),   // %1
-      "+r"(dst_width)  // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "m"(kMadd21)     // %4
-    : "memory", "cc", NACL_R14
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm7            \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,(%1)                     \n"
+      "movdqu    0x8(%0),%%xmm6                  \n"
+      "movdqu    0x8(%0,%3,1),%%xmm7             \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm0,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x8(%1)                  \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "movdqu    0x10(%0,%3,1),%%xmm7            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm4,%%xmm6                   \n"
+      "pmaddubsw %4,%%xmm6                       \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "lea       " MEMLEA(0xc,1) ",%1            \n"
-    "sub       $0xc,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "m"(kShuf38a),   // %3
-    "m"(kShuf38b)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movhlps   %%xmm0,%%xmm1                   \n"
+      "movd      %%xmm1,0x8(%1)                  \n"
+      "lea       0xc(%1),%1                      \n"
+      "sub       $0xc,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "m"(kShuf38a),   // %3
+        "m"(kShuf38b)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "movdqa    %3,%%xmm5                       \n"
-  :
-  : "m"(kShufAb0),   // %0
-    "m"(kShufAb1),   // %1
-    "m"(kShufAb2),   // %2
-    "m"(kScaleAb2)   // %3
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pshufb    %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "paddusw   %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "paddusw   %%xmm0,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm1                    \n"
-    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride))  // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "movdqa    %3,%%xmm5                       \n"
+      :
+      : "m"(kShufAb0),  // %0
+        "m"(kShufAb1),  // %1
+        "m"(kShufAb2),  // %2
+        "m"(kScaleAb2)  // %3
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "pavgb     %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pshufb    %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "paddusw   %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "paddusw   %%xmm0,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movd      %%xmm1,(%1)                     \n"
+      "psrlq     $0x10,%%xmm1                    \n"
+      "movd      %%xmm1,0x2(%1)                  \n"
+      "lea       0x6(%1),%1                      \n"
+      "sub       $0x6,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-  :
-  : "m"(kShufAc),    // %0
-    "m"(kShufAc3),   // %1
-    "m"(kScaleAc33)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "pshufb    %%xmm3,%%xmm7                   \n"
-    "paddusw   %%xmm7,%%xmm6                   \n"
-    "pmulhuw   %%xmm4,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movd      %%xmm6," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm6                    \n"
-    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      :
+      : "m"(kShufAc),    // %0
+        "m"(kShufAc3),   // %1
+        "m"(kScaleAc33)  // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm6            \n"
+      "movhlps   %%xmm0,%%xmm1                   \n"
+      "movhlps   %%xmm6,%%xmm7                   \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm6                   \n"
+      "punpcklbw %%xmm5,%%xmm7                   \n"
+      "paddusw   %%xmm6,%%xmm0                   \n"
+      "paddusw   %%xmm7,%%xmm1                   \n"
+      "movdqu    0x00(%0,%3,2),%%xmm6            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movhlps   %%xmm6,%%xmm7                   \n"
+      "punpcklbw %%xmm5,%%xmm6                   \n"
+      "punpcklbw %%xmm5,%%xmm7                   \n"
+      "paddusw   %%xmm6,%%xmm0                   \n"
+      "paddusw   %%xmm7,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "psrldq    $0x2,%%xmm0                     \n"
+      "paddusw   %%xmm0,%%xmm6                   \n"
+      "psrldq    $0x2,%%xmm0                     \n"
+      "paddusw   %%xmm0,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "movdqa    %%xmm1,%%xmm7                   \n"
+      "psrldq    $0x2,%%xmm1                     \n"
+      "paddusw   %%xmm1,%%xmm7                   \n"
+      "psrldq    $0x2,%%xmm1                     \n"
+      "paddusw   %%xmm1,%%xmm7                   \n"
+      "pshufb    %%xmm3,%%xmm7                   \n"
+      "paddusw   %%xmm7,%%xmm6                   \n"
+      "pmulhuw   %%xmm4,%%xmm6                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movd      %%xmm6,(%1)                     \n"
+      "psrlq     $0x10,%%xmm6                    \n"
+      "movd      %%xmm6,0x2(%1)                  \n"
+      "lea       0x6(%1),%1                      \n"
+      "sub       $0x6,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
 // Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "paddusw   %%xmm2,%%xmm0                   \n"
-    "paddusw   %%xmm3,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width)    // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm3                     \n"
+      "lea       0x10(%0),%0                     \n"  // src_ptr += 16
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x10(%1),%%xmm1                 \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpckhbw %%xmm5,%%xmm3                   \n"
+      "paddusw   %%xmm2,%%xmm0                   \n"
+      "paddusw   %%xmm3,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
-    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
-    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width)    // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm3                    \n"
+      "lea        0x20(%0),%0                    \n"  // src_ptr += 32
+      "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
+      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpaddusw   (%1),%%ymm2,%%ymm0             \n"
+      "vpaddusw   0x20(%1),%%ymm3,%%ymm1         \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SCALEADDROW_AVX2
 
 // Constant for making pixels signed to avoid pmaddubsw
 // saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
 
 // Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx) {
   intptr_t x0, x1, temp_pixel;
-  asm volatile (
-    "movd      %6,%%xmm2                       \n"
-    "movd      %7,%%xmm3                       \n"
-    "movl      $0x04040000,%k2                 \n"
-    "movd      %k2,%%xmm5                      \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $15,%%xmm7                      \n"  // 0x00010001
+  asm volatile(
+      "movd      %6,%%xmm2                       \n"
+      "movd      %7,%%xmm3                       \n"
+      "movl      $0x04040000,%k2                 \n"
+      "movd      %k2,%%xmm5                      \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psrlw     $15,%%xmm7                      \n"  // 0x00010001
 
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "subl      $0x2,%5                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "subl      $0x2,%5                         \n"
+      "jl        29f                             \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "punpckldq %%xmm0,%%xmm2                   \n"
+      "punpckldq %%xmm3,%%xmm3                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
 
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
-    "movd      %k2,%%xmm4                      \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "punpcklwd %%xmm4,%%xmm0                   \n"
-    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
-    "pxor      %%xmm6,%%xmm1                   \n"  // 128 -f = (f ^ 127 ) + 1
-    "paddusb   %%xmm7,%%xmm1                   \n"
-    "pmaddubsw %%xmm0,%%xmm1                   \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movd      %%xmm1,%k2                      \n"
-    "mov       %w2," MEMACCESS(0) "            \n"
-    "lea       " MEMLEA(0x2,0) ",%0            \n"
-    "subl      $0x2,%5                         \n"
-    "jge       2b                              \n"
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "movzwl    0x00(%1,%3,1),%k2               \n"
+      "movd      %k2,%%xmm0                      \n"
+      "psrlw     $0x9,%%xmm1                     \n"
+      "movzwl    0x00(%1,%4,1),%k2               \n"
+      "movd      %k2,%%xmm4                      \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "punpcklwd %%xmm4,%%xmm0                   \n"
+      "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+      "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) +
+                                                      // 1
+      "paddusb   %%xmm7,%%xmm1                   \n"
+      "pmaddubsw %%xmm0,%%xmm1                   \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
+      "psrlw     $0x7,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movd      %%xmm1,%k2                      \n"
+      "mov       %w2,(%0)                        \n"
+      "lea       0x2(%0),%0                      \n"
+      "subl      $0x2,%5                         \n"
+      "jge       2b                              \n"
 
-    LABELALIGN
-  "29:                                         \n"
-    "addl      $0x1,%5                         \n"
-    "jl        99f                             \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "paddusb   %%xmm7,%%xmm2                   \n"
-    "pmaddubsw %%xmm0,%%xmm2                   \n"
-    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "movd      %%xmm2,%k2                      \n"
-    "mov       %b2," MEMACCESS(0) "            \n"
-  "99:                                         \n"
-  : "+r"(dst_ptr),      // %0
-    "+r"(src_ptr),      // %1
-    "=&a"(temp_pixel),  // %2
-    "=&r"(x0),          // %3
-    "=&r"(x1),          // %4
+      LABELALIGN
+      "29:                                       \n"
+      "addl      $0x1,%5                         \n"
+      "jl        99f                             \n"
+      "movzwl    0x00(%1,%3,1),%k2               \n"
+      "movd      %k2,%%xmm0                      \n"
+      "psrlw     $0x9,%%xmm2                     \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+      "pxor      %%xmm6,%%xmm2                   \n"
+      "paddusb   %%xmm7,%%xmm2                   \n"
+      "pmaddubsw %%xmm0,%%xmm2                   \n"
+      "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm2                   \n"
+      "movd      %%xmm2,%k2                      \n"
+      "mov       %b2,(%0)                        \n"
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "=&a"(temp_pixel),  // %2
+        "=&r"(x0),          // %3
+        "=&r"(x1),          // %4
 #if defined(__x86_64__)
-    "+rm"(dst_width)    // %5
+        "+rm"(dst_width)  // %5
 #else
-    "+m"(dst_width)    // %5
+        "+m"(dst_width)  // %5
 #endif
-  : "rm"(x),            // %6
-    "rm"(dx),           // %7
+      : "rm"(x),   // %6
+        "rm"(dx),  // %7
 #if defined(__x86_64__)
-    "x"(kFsub80),       // %8
-    "x"(kFadd40)        // %9
+        "x"(kFsub80),  // %8
+        "x"(kFadd40)   // %9
 #else
-    "m"(kFsub80),       // %8
-    "m"(kFadd40)        // %9
+        "m"(kFsub80),    // %8
+        "m"(kFadd40)     // %9
 #endif
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x20,%2                         \n"
-    "jg        1b                              \n"
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
 
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+
+      : "+r"(dst_ptr),   // %0
+        "+r"(src_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                             ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "shufps    $0xdd,%%xmm1,%%xmm0             \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                   ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),              // %0
+        "+r"(dst_argb),              // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12;
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movd      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
-    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "punpckldq %%xmm3,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(src_stepx_x4),   // %1
-    "+r"(dst_argb),       // %2
-    "+r"(dst_width),      // %3
-    "=&r"(src_stepx_x12)  // %4
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+  (void)src_stride;
+  asm volatile(
+      "lea       0x00(,%1,4),%1                  \n"
+      "lea       0x00(%1,%1,2),%4                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movd      (%0),%%xmm0                     \n"
+      "movd      0x00(%0,%1,1),%%xmm1            \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movd      0x00(%0,%1,2),%%xmm2            \n"
+      "movd      0x00(%0,%4,1),%%xmm3            \n"
+      "lea       0x00(%0,%1,4),%0                \n"
+      "punpckldq %%xmm3,%%xmm2                   \n"
+      "punpcklqdq %%xmm2,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),       // %0
+        "+r"(src_stepx_x4),   // %1
+        "+r"(dst_argb),       // %2
+        "+r"(dst_width),      // %3
+        "=&r"(src_stepx_x12)  // %4
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride, int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12;
   intptr_t row1 = (intptr_t)(src_stride);
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
+  asm volatile(
+      "lea       0x00(,%1,4),%1                  \n"
+      "lea       0x00(%1,%1,2),%4                \n"
+      "lea       0x00(%0,%5,1),%5                \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
-    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
-    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "movq      " MEMACCESS(5) ",%%xmm2         \n"
-    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
-    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
-    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(src_stepx_x4),    // %1
-    "+r"(dst_argb),        // %2
-    "+rm"(dst_width),      // %3
-    "=&r"(src_stepx_x12),  // %4
-    "+r"(row1)             // %5
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movhps    0x00(%0,%1,1),%%xmm0            \n"
+      "movq      0x00(%0,%1,2),%%xmm1            \n"
+      "movhps    0x00(%0,%4,1),%%xmm1            \n"
+      "lea       0x00(%0,%1,4),%0                \n"
+      "movq      (%5),%%xmm2                     \n"
+      "movhps    0x00(%5,%1,1),%%xmm2            \n"
+      "movq      0x00(%5,%1,2),%%xmm3            \n"
+      "movhps    0x00(%5,%4,1),%%xmm3            \n"
+      "lea       0x00(%5,%1,4),%5                \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),        // %0
+        "+r"(src_stepx_x4),    // %1
+        "+r"(dst_argb),        // %2
+        "+rm"(dst_width),      // %3
+        "=&r"(src_stepx_x12),  // %4
+        "+r"(row1)             // %5
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   intptr_t x0, x1;
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "cmp       $0x0,%4                         \n"
-    "jl        99f                             \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
+  asm volatile(
+      "movd      %5,%%xmm2                       \n"
+      "movd      %6,%%xmm3                       \n"
+      "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+      "pshufd    $0x11,%%xmm3,%%xmm0             \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pshufd    $0x5,%%xmm3,%%xmm0              \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pextrw    $0x1,%%xmm2,%k0                 \n"
+      "pextrw    $0x3,%%xmm2,%k1                 \n"
+      "cmp       $0x0,%4                         \n"
+      "jl        99f                             \n"
+      "sub       $0x4,%4                         \n"
+      "jl        49f                             \n"
 
-    LABELALIGN
-  "40:                                         \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "pextrw    $0x7,%%xmm2,%k1                 \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
-    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "punpckldq %%xmm4,%%xmm1                   \n"
-    "punpcklqdq %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
+      LABELALIGN
+      "40:                                       \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      0x00(%3,%1,4),%%xmm1            \n"
+      "pextrw    $0x5,%%xmm2,%k0                 \n"
+      "pextrw    $0x7,%%xmm2,%k1                 \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movd      0x00(%3,%0,4),%%xmm1            \n"
+      "movd      0x00(%3,%1,4),%%xmm4            \n"
+      "pextrw    $0x1,%%xmm2,%k0                 \n"
+      "pextrw    $0x3,%%xmm2,%k1                 \n"
+      "punpckldq %%xmm4,%%xmm1                   \n"
+      "punpcklqdq %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%4                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "test      $0x2,%4                         \n"
-    "je        29f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x8,2) ",%2            \n"
-  "29:                                         \n"
-    "test      $0x1,%4                         \n"
-    "je        99f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-  "99:                                         \n"
-  : "=&a"(x0),         // %0
-    "=&d"(x1),         // %1
-    "+r"(dst_argb),    // %2
-    "+r"(src_argb),    // %3
-    "+r"(dst_width)    // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+      "49:                                       \n"
+      "test      $0x2,%4                         \n"
+      "je        29f                             \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      0x00(%3,%1,4),%%xmm1            \n"
+      "pextrw    $0x5,%%xmm2,%k0                 \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movq      %%xmm0,(%2)                     \n"
+      "lea       0x8(%2),%2                      \n"
+      "29:                                       \n"
+      "test      $0x1,%4                         \n"
+      "je        99f                             \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "99:                                       \n"
+      : "=&a"(x0),       // %0
+        "=&d"(x1),       // %1
+        "+r"(dst_argb),  // %2
+        "+r"(src_argb),  // %3
+        "+r"(dst_width)  // %4
+      : "rm"(x),         // %5
+        "rm"(dx)         // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpckldq %%xmm0,%%xmm0                   \n"
-    "punpckhdq %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
 
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpckldq %%xmm0,%%xmm0                   \n"
+      "punpckhdq %%xmm1,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+
+      : "+r"(dst_argb),  // %0
+        "+r"(src_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx) {
   intptr_t x0, x1;
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm5                       \n"
-  :
-  : "m"(kShuffleColARGB),  // %0
-    "m"(kShuffleFractions)  // %1
-  );
+  asm volatile(
+      "movdqa    %0,%%xmm4                       \n"
+      "movdqa    %1,%%xmm5                       \n"
+      :
+      : "m"(kShuffleColARGB),   // %0
+        "m"(kShuffleFractions)  // %1
+      );
 
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "sub       $0x2,%2                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
+  asm volatile(
+      "movd      %5,%%xmm2                       \n"
+      "movd      %6,%%xmm3                       \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x9,%%xmm6                     \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "sub       $0x2,%2                         \n"
+      "jl        29f                             \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "punpckldq %%xmm0,%%xmm2                   \n"
+      "punpckldq %%xmm3,%%xmm3                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
 
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm1                   \n"
-    "pmaddubsw %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x2,%2                         \n"
-    "jge       2b                              \n"
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "movq      0x00(%1,%3,4),%%xmm0            \n"
+      "psrlw     $0x9,%%xmm1                     \n"
+      "movhps    0x00(%1,%4,4),%%xmm0            \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pxor      %%xmm6,%%xmm1                   \n"
+      "pmaddubsw %%xmm1,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%0)                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x2,%2                         \n"
+      "jge       2b                              \n"
 
-    LABELALIGN
-  "29:                                         \n"
-    "add       $0x1,%2                         \n"
-    "jl        99f                             \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(0) "         \n"
+      LABELALIGN
+      "29:                                       \n"
+      "add       $0x1,%2                         \n"
+      "jl        99f                             \n"
+      "psrlw     $0x9,%%xmm2                     \n"
+      "movq      0x00(%1,%3,4),%%xmm0            \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pxor      %%xmm6,%%xmm2                   \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movd      %%xmm0,(%0)                     \n"
 
-    LABELALIGN
-  "99:                                         \n"
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+rm"(dst_width),  // %2
-    "=&r"(x0),         // %3
-    "=&r"(x1)          // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      LABELALIGN "99:                            \n"  // clang-format error.
+
+      : "+r"(dst_argb),    // %0
+        "+r"(src_argb),    // %1
+        "+rm"(dst_width),  // %2
+        "=&r"(x0),         // %3
+        "=&r"(x1)          // %4
+      : "rm"(x),           // %5
+        "rm"(dx)           // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
 
 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
 int FixedDiv1_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "sub       $0x10001,%%eax                  \n"
-    "sbb       $0x0,%%edx                      \n"
-    "sub       $0x1,%1                         \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "sub       $0x10001,%%eax                  \n"
+      "sbb       $0x0,%%edx                      \n"
+      "sub       $0x1,%1                         \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
 
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc
deleted file mode 100644
index ae953073fa..0000000000
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc
+++ /dev/null
@@ -1,644 +0,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__(
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-
-    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
-    "beqz           $t9, 2f                        \n"
-    " nop                                          \n"
-
-  "1:                                              \n"
-    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
-    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
-    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
-    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
-    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
-    // TODO(fbarchard): Use odd pixels instead of even.
-    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
-    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
-    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
-    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
-    "addiu          %[src_ptr], %[src_ptr], 32     \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sw             $t8, 0(%[dst])                 \n"
-    "sw             $t0, 4(%[dst])                 \n"
-    "sw             $t1, 8(%[dst])                 \n"
-    "sw             $t2, 12(%[dst])                \n"
-    "bgtz           $t9, 1b                        \n"
-    " addiu         %[dst], %[dst], 16             \n"
-
-  "2:                                              \n"
-    "andi           $t9, %[dst_width], 0xf         \n"  // residue
-    "beqz           $t9, 3f                        \n"
-    " nop                                          \n"
-
-  "21:                                             \n"
-    "lbu            $t0, 0(%[src_ptr])             \n"
-    "addiu          %[src_ptr], %[src_ptr], 2      \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sb             $t0, 0(%[dst])                 \n"
-    "bgtz           $t9, 21b                       \n"
-    " addiu         %[dst], %[dst], 1              \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  const uint8* t = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
-    "bltz           $t9, 2f                       \n"
-    " nop                                         \n"
-
-  "1:                                             \n"
-    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
-    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
-    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
-    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
-    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
-    "addiu          $t9, $t9, -1                  \n"
-    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
-    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
-    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
-    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
-    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
-    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
-    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
-    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
-    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
-    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
-    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
-    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
-    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
-    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
-    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
-    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
-    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
-    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
-    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
-    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
-    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
-    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
-    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
-    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
-    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
-    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
-    "addiu          %[src_ptr], %[src_ptr], 16    \n"
-    "addiu          %[t], %[t], 16                \n"
-    "sb             $t0, 0(%[dst])                \n"
-    "sb             $t4, 1(%[dst])                \n"
-    "sb             $t1, 2(%[dst])                \n"
-    "sb             $t5, 3(%[dst])                \n"
-    "sb             $t2, 4(%[dst])                \n"
-    "sb             $t6, 5(%[dst])                \n"
-    "sb             $t3, 6(%[dst])                \n"
-    "sb             $t7, 7(%[dst])                \n"
-    "bgtz           $t9, 1b                       \n"
-    " addiu         %[dst], %[dst], 8             \n"
-
-  "2:                                             \n"
-    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
-    "beqz           $t9, 3f                       \n"
-    " nop                                         \n"
-
-    "21:                                          \n"
-    "lwr            $t1, 0(%[src_ptr])            \n"
-    "lwl            $t1, 3(%[src_ptr])            \n"
-    "lwr            $t2, 0(%[t])                  \n"
-    "lwl            $t2, 3(%[t])                  \n"
-    "srl            $t8, $t1, 16                  \n"
-    "ins            $t1, $t2, 16, 16              \n"
-    "ins            $t2, $t8, 0, 16               \n"
-    "raddu.w.qb     $t1, $t1                      \n"
-    "raddu.w.qb     $t2, $t2                      \n"
-    "shra_r.w       $t1, $t1, 2                   \n"
-    "shra_r.w       $t2, $t2, 2                   \n"
-    "sb             $t1, 0(%[dst])                \n"
-    "sb             $t2, 1(%[dst])                \n"
-    "addiu          %[src_ptr], %[src_ptr], 4     \n"
-    "addiu          $t9, $t9, -2                  \n"
-    "addiu          %[t], %[t], 4                 \n"
-    "bgtz           $t9, 21b                      \n"
-    " addiu         %[dst], %[dst], 2             \n"
-
-  "3:                                             \n"
-    ".set pop                                     \n"
-
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst), [t] "+r" (t)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                    \n"
-      ".set noreorder                               \n"
-
-      "srl            $t9, %[dst_width], 3          \n"
-      "beqz           $t9, 2f                       \n"
-      " nop                                         \n"
-
-     "1:                                            \n"
-      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
-      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
-      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
-      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
-      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
-      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
-      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
-      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
-      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
-      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
-      "addiu          %[src_ptr], %[src_ptr], 32    \n"
-      "addiu          $t9, $t9, -1                  \n"
-      "sw             $t1, 0(%[dst])                \n"
-      "sw             $t5, 4(%[dst])                \n"
-      "bgtz           $t9, 1b                       \n"
-      " addiu         %[dst], %[dst], 8             \n"
-
-    "2:                                             \n"
-      "andi           $t9, %[dst_width], 7          \n"  // residue
-      "beqz           $t9, 3f                       \n"
-      " nop                                         \n"
-
-    "21:                                            \n"
-      "lbu            $t1, 0(%[src_ptr])            \n"
-      "addiu          %[src_ptr], %[src_ptr], 4     \n"
-      "addiu          $t9, $t9, -1                  \n"
-      "sb             $t1, 0(%[dst])                \n"
-      "bgtz           $t9, 21b                      \n"
-      " addiu         %[dst], %[dst], 1             \n"
-
-    "3:                                             \n"
-      ".set pop                                     \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst)
-      : [dst_width] "r" (dst_width)
-      : "t1", "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* s1 = src_ptr + stride;
-  const uint8* s2 = s1 + stride;
-  const uint8* s3 = s2 + stride;
-
-  __asm__ __volatile__ (
-      ".set push                                  \n"
-      ".set noreorder                             \n"
-
-      "srl           $t9, %[dst_width], 1         \n"
-      "andi          $t8, %[dst_width], 1         \n"
-
-     "1:                                          \n"
-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
-      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
-      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
-      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
-      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
-      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
-      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
-      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
-      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
-      "add           $t0, $t0, $t1                \n"
-      "add           $t1, $t2, $t3                \n"
-      "add           $t0, $t0, $t1                \n"
-      "add           $t4, $t4, $t5                \n"
-      "add           $t6, $t6, $t7                \n"
-      "add           $t4, $t4, $t6                \n"
-      "shra_r.w      $t0, $t0, 4                  \n"
-      "shra_r.w      $t4, $t4, 4                  \n"
-      "sb            $t0, 0(%[dst])               \n"
-      "sb            $t4, 1(%[dst])               \n"
-      "addiu         %[src_ptr], %[src_ptr], 8    \n"
-      "addiu         %[s1], %[s1], 8              \n"
-      "addiu         %[s2], %[s2], 8              \n"
-      "addiu         %[s3], %[s3], 8              \n"
-      "addiu         $t9, $t9, -1                 \n"
-      "bgtz          $t9, 1b                      \n"
-      " addiu        %[dst], %[dst], 2            \n"
-      "beqz          $t8, 2f                      \n"
-      " nop                                       \n"
-
-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
-      "add           $t0, $t0, $t1                \n"
-      "add           $t1, $t2, $t3                \n"
-      "add           $t0, $t0, $t1                \n"
-      "shra_r.w      $t0, $t0, 4                  \n"
-      "sb            $t0, 0(%[dst])               \n"
-
-      "2:                                         \n"
-      ".set pop                                   \n"
-
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [s3] "+r" (s3)
-      : [dst_width] "r" (dst_width)
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                          \n"
-      ".set noreorder                                     \n"
-    "1:                                                   \n"
-      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
-      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
-      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
-      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
-      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
-      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
-      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
-      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
-      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
-      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
-      "addiu           %[dst_width], %[dst_width], -24    \n"
-      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
-      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
-      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
-      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
-      "addiu           %[src_ptr], %[src_ptr], 32         \n"
-      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
-      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
-      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
-      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
-      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
-      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
-      "sw              $t1, 0(%[dst])                     \n"
-      "sw              $t0, 4(%[dst])                     \n"
-      "sw              $t3, 8(%[dst])                     \n"
-      "sw              $t5, 12(%[dst])                    \n"
-      "sw              $t9, 16(%[dst])                    \n"
-      "sw              $t7, 20(%[dst])                    \n"
-      "bnez            %[dst_width], 1b                   \n"
-      " addiu          %[dst], %[dst], 24                 \n"
-      ".set pop                                           \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "repl.ph           $t3, 3                          \n"  // 0x00030003
-
-    "1:                                                  \n"
-      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
-      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
-      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
-      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
-      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
-      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
-      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
-      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
-      "raddu.w.qb        $t0, $t0                        \n"
-      "raddu.w.qb        $t1, $t1                        \n"
-      "shra_r.w          $t0, $t0, 1                     \n"
-      "shra_r.w          $t1, $t1, 1                     \n"
-      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
-      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
-      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
-      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
-      "addu.ph           $t2, $t2, $t4                   \n"
-      "addu.ph           $t6, $t6, $t5                   \n"
-      "sll               $t5, $t0, 1                     \n"
-      "add               $t0, $t5, $t0                   \n"
-      "shra_r.ph         $t2, $t2, 2                     \n"
-      "shra_r.ph         $t6, $t6, 2                     \n"
-      "shll.ph           $t4, $t2, 1                     \n"
-      "addq.ph           $t4, $t4, $t2                   \n"
-      "addu              $t0, $t0, $t1                   \n"
-      "addiu             %[src_ptr], %[src_ptr], 4       \n"
-      "shra_r.w          $t0, $t0, 2                     \n"
-      "addu.ph           $t6, $t6, $t4                   \n"
-      "shra_r.ph         $t6, $t6, 2                     \n"
-      "srl               $t1, $t6, 16                    \n"
-      "addiu             %[dst_width], %[dst_width], -3  \n"
-      "sb                $t1, 0(%[d])                    \n"
-      "sb                $t0, 1(%[d])                    \n"
-      "sb                $t6, 2(%[d])                    \n"
-      "bgtz              %[dst_width], 1b                \n"
-      " addiu            %[d], %[d], 3                   \n"
-    "3:                                                  \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                           \n"
-      ".set noreorder                                      \n"
-      "repl.ph           $t2, 3                            \n"  // 0x00030003
-
-    "1:                                                    \n"
-      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
-      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
-      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
-      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
-      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
-      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
-      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
-      "raddu.w.qb        $t0, $t0                          \n"
-      "raddu.w.qb        $t1, $t1                          \n"
-      "shra_r.w          $t0, $t0, 1                       \n"
-      "shra_r.w          $t1, $t1, 1                       \n"
-      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
-      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
-      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
-      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
-      "addu.ph           $t4, $t4, $t3                     \n"
-      "addu.ph           $t6, $t6, $t5                     \n"
-      "shra_r.ph         $t6, $t6, 2                       \n"
-      "shra_r.ph         $t4, $t4, 2                       \n"
-      "addu.ph           $t6, $t6, $t4                     \n"
-      "addiu             %[src_ptr], %[src_ptr], 4         \n"
-      "shra_r.ph         $t6, $t6, 1                       \n"
-      "addu              $t0, $t0, $t1                     \n"
-      "addiu             %[dst_width], %[dst_width], -3    \n"
-      "shra_r.w          $t0, $t0, 1                       \n"
-      "srl               $t1, $t6, 16                      \n"
-      "sb                $t1, 0(%[d])                      \n"
-      "sb                $t0, 1(%[d])                      \n"
-      "sb                $t6, 2(%[d])                      \n"
-      "bgtz              %[dst_width], 1b                  \n"
-      " addiu            %[d], %[d], 3                     \n"
-    "3:                                                    \n"
-      ".set pop                                            \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                     \n"
-      ".set noreorder                                \n"
-
-    "1:                                              \n"
-      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
-      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
-      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
-      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
-      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
-      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
-      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
-      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
-      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
-      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
-      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
-      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
-      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
-      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
-      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
-      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
-      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
-      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
-      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
-      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
-      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
-      "addiu      %[src_ptr], %[src_ptr], 32         \n"
-      "addiu      %[dst_width], %[dst_width], -12    \n"
-      "addiu      $t8,%[dst_width], -12              \n"
-      "sw         $t1, 0(%[dst])                     \n"
-      "sw         $t4, 4(%[dst])                     \n"
-      "sw         $t6, 8(%[dst])                     \n"
-      "bgez       $t8, 1b                            \n"
-      " addiu     %[dst], %[dst], 12                 \n"
-      ".set pop                                      \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
-}
-
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* t = src_ptr + stride;
-  const int c = 0x2AAA;
-
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
-      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
-      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
-      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
-      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
-      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
-      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
-      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
-      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
-      "srl             $t4, $t4, 2                       \n"  // t4 / 4
-      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
-      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
-      "addu            $t6, $t5, $t6                     \n"
-      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
-      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
-      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
-      "addu            $t0, $t0, $t2                     \n"
-      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
-      "addiu           %[t], %[t], 8                     \n"
-      "addiu           %[dst_width], %[dst_width], -3    \n"
-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
-      "srl             $t6, $t6, 16                      \n"
-      "srl             $t0, $t0, 16                      \n"
-      "sb              $t4, -1(%[dst_ptr])               \n"
-      "sb              $t6, -2(%[dst_ptr])               \n"
-      "bgtz            %[dst_width], 1b                  \n"
-      " sb             $t0, -3(%[dst_ptr])               \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [t] "+r" (t),
-        [dst_width] "+r" (dst_width)
-      : [c] "r" (c)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* s1 = src_ptr + stride;
-  stride += stride;
-  const uint8* s2 = src_ptr + stride;
-  const int c1 = 0x1C71;
-  const int c2 = 0x2AAA;
-
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
-      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
-      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
-      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
-      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
-      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
-      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
-      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
-      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
-      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
-      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
-      "addu            $t7, $t7, $t8                     \n"
-      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
-      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
-      "addu            $t6, $t6, $t8                     \n"
-      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
-      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
-      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
-      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
-      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
-      "addu            $t7, $t7, $t8                     \n"
-      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
-      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
-      "raddu.w.qb      $t0, $t0                          \n"
-      "raddu.w.qb      $t2, $t2                          \n"
-      "raddu.w.qb      $t4, $t4                          \n"
-      "addu            $t0, $t0, $t2                     \n"
-      "addu            $t0, $t0, $t4                     \n"
-      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
-      "addiu           %[s1], %[s1], 8                   \n"
-      "addiu           %[s2], %[s2], 8                   \n"
-      "addiu           %[dst_width], %[dst_width], -3    \n"
-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
-      "srl             $t6, $t6, 16                      \n"
-      "srl             $t7, $t7, 16                      \n"
-      "srl             $t0, $t0, 16                      \n"
-      "sb              $t6, -1(%[dst_ptr])               \n"
-      "sb              $t7, -2(%[dst_ptr])               \n"
-      "bgtz            %[dst_width], 1b                  \n"
-      " sb             $t0, -3(%[dst_ptr])               \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [dst_width] "+r" (dst_width)
-      : [c1] "r" (c1), [c2] "r" (c2)
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
-}
-
-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc
new file mode 100644
index 0000000000..482a521f0d
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc
@@ -0,0 +1,949 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define LOAD_INDEXED_DATA(srcp, indx0, out0) \
+  {                                          \
+    out0[0] = srcp[indx0[0]];                \
+    out0[1] = srcp[indx0[1]];                \
+    out0[2] = srcp[indx0[2]];                \
+    out0[3] = srcp[indx0[3]];                \
+  }
+
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    ST_UB(dst0, dst_argb);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
+    ST_UB(dst0, dst_argb);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  const uint8_t* s = src_argb;
+  const uint8_t* t = src_argb + src_stride;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+  v8u16 reg0, reg1, reg2, reg3;
+  v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
+    vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
+    reg0 = __msa_hadd_u_h(vec0, vec0);
+    reg1 = __msa_hadd_u_h(vec1, vec1);
+    reg2 = __msa_hadd_u_h(vec2, vec2);
+    reg3 = __msa_hadd_u_h(vec3, vec3);
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_argb);
+    s += 32;
+    t += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  int32_t stepx = src_stepx * 4;
+  int32_t data0, data1, data2, data3;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    data0 = LW(src_argb);
+    data1 = LW(src_argb + stepx);
+    data2 = LW(src_argb + stepx * 2);
+    data3 = LW(src_argb + stepx * 3);
+    SW(data0, dst_argb);
+    SW(data1, dst_argb + 4);
+    SW(data2, dst_argb + 8);
+    SW(data3, dst_argb + 12);
+    src_argb += stepx * 4;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  const uint8_t* nxt_argb = src_argb + src_stride;
+  int32_t stepx = src_stepx * 4;
+  int64_t data0, data1, data2, data3;
+  v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
+  v16u8 vec0, vec1, vec2, vec3;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 dst0;
+
+  for (x = 0; x < dst_width; x += 4) {
+    data0 = LD(src_argb);
+    data1 = LD(src_argb + stepx);
+    data2 = LD(src_argb + stepx * 2);
+    data3 = LD(src_argb + stepx * 3);
+    src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
+    src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
+    src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
+    src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
+    data0 = LD(nxt_argb);
+    data1 = LD(nxt_argb + stepx);
+    data2 = LD(nxt_argb + stepx * 2);
+    data3 = LD(nxt_argb + stepx * 3);
+    src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
+    src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
+    src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
+    src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
+    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    reg0 = __msa_hadd_u_h(vec0, vec0);
+    reg1 = __msa_hadd_u_h(vec1, vec1);
+    reg2 = __msa_hadd_u_h(vec2, vec2);
+    reg3 = __msa_hadd_u_h(vec3, vec3);
+    reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
+    reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
+    reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
+    reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
+    reg4 += reg6;
+    reg5 += reg7;
+    reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_argb);
+    src_argb += stepx * 4;
+    nxt_argb += stepx * 4;
+    dst_argb += 16;
+  }
+}
+
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = __msa_aver_u_b(vec1, vec0);
+    dst1 = __msa_aver_u_b(vec3, vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    s += 64;
+    t += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    src_ptr += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  const uint8_t* t2 = s + src_stride * 3;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
+    vec0 += __msa_hadd_u_h(src0, src0);
+    vec1 += __msa_hadd_u_h(src1, src1);
+    vec2 += __msa_hadd_u_h(src2, src2);
+    vec3 += __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    reg0 = __msa_hadd_u_w(vec0, vec0);
+    reg1 = __msa_hadd_u_w(vec1, vec1);
+    reg2 = __msa_hadd_u_w(vec2, vec2);
+    reg3 = __msa_hadd_u_w(vec3, vec3);
+    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
+    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
+    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
+    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    s += 64;
+    t0 += 64;
+    t1 += 64;
+    t2 += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x, width;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, vec0;
+  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+  (void)src_stride;
+
+  assert(dst_width % 3 == 0);
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
+    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
+    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
+    SD(dst0, dst);
+    SW(dst1, dst + 8);
+    src_ptr += 32;
+    dst += 12;
+  }
+}
+
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8i16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x2AAA;
+    tmp1 *= const_0x2AAA;
+    tmp4 *= const_0x4000;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t += 32;
+    dst_ptr += 12;
+  }
+}
+
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, src4, src5, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8u16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x1C71;
+    tmp1 *= const_0x1C71;
+    tmp4 *= const_0x2AAA;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t0 += 32;
+    t1 += 32;
+    dst_ptr += 12;
+  }
+}
+
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int x;
+  v16u8 src0;
+  v8u16 dst0, dst1;
+  v16i8 zero = {0};
+
+  assert(src_width > 0);
+
+  for (x = 0; x < src_width; x += 16) {
+    src0 = LD_UB(src_ptr);
+    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
+    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
+    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    ST_UH2(dst0, dst1, dst_ptr, 8);
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+}
+
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx) {
+  int j;
+  v4i32 vec_x = __msa_fill_w(x);
+  v4i32 vec_dx = __msa_fill_w(dx);
+  v4i32 vec_const = {0, 1, 2, 3};
+  v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8u16 reg0, reg1;
+  v16u8 dst0;
+  v4i32 const_0xFFFF = __msa_fill_w(0xFFFF);
+  v4i32 const_0x40 = __msa_fill_w(0x40);
+
+  vec0 = vec_dx * vec_const;
+  vec1 = vec_dx * 4;
+  vec_x += vec0;
+
+  for (j = 0; j < dst_width - 1; j += 16) {
+    vec2 = vec_x >> 16;
+    vec6 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec3 = vec_x >> 16;
+    vec7 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec4 = vec_x >> 16;
+    vec8 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec5 = vec_x >> 16;
+    vec9 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec6 >>= 9;
+    vec7 >>= 9;
+    vec8 >>= 9;
+    vec9 >>= 9;
+    LOAD_INDEXED_DATA(src_ptr, vec2, tmp0);
+    LOAD_INDEXED_DATA(src_ptr, vec3, tmp1);
+    LOAD_INDEXED_DATA(src_ptr, vec4, tmp2);
+    LOAD_INDEXED_DATA(src_ptr, vec5, tmp3);
+    vec2 += 1;
+    vec3 += 1;
+    vec4 += 1;
+    vec5 += 1;
+    LOAD_INDEXED_DATA(src_ptr, vec2, tmp4);
+    LOAD_INDEXED_DATA(src_ptr, vec3, tmp5);
+    LOAD_INDEXED_DATA(src_ptr, vec4, tmp6);
+    LOAD_INDEXED_DATA(src_ptr, vec5, tmp7);
+    tmp4 -= tmp0;
+    tmp5 -= tmp1;
+    tmp6 -= tmp2;
+    tmp7 -= tmp3;
+    tmp4 *= vec6;
+    tmp5 *= vec7;
+    tmp6 *= vec8;
+    tmp7 *= vec9;
+    tmp4 += const_0x40;
+    tmp5 += const_0x40;
+    tmp6 += const_0x40;
+    tmp7 += const_0x40;
+    tmp4 >>= 7;
+    tmp5 >>= 7;
+    tmp6 >>= 7;
+    tmp7 >>= 7;
+    tmp0 += tmp4;
+    tmp1 += tmp5;
+    tmp2 += tmp6;
+    tmp3 += tmp7;
+    reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    __msa_st_b(dst0, dst_ptr, 0);
+    dst_ptr += 16;
+  }
+}
+
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  int j;
+  v4i32 x_vec = __msa_fill_w(x);
+  v4i32 dx_vec = __msa_fill_w(dx);
+  v4i32 const_vec = {0, 1, 2, 3};
+  v4i32 vec0, vec1, vec2;
+  v4i32 dst0;
+
+  vec0 = dx_vec * const_vec;
+  vec1 = dx_vec * 4;
+  x_vec += vec0;
+
+  for (j = 0; j < dst_width; j += 4) {
+    vec2 = x_vec >> 16;
+    x_vec += vec1;
+    LOAD_INDEXED_DATA(src, vec2, dst0);
+    __msa_st_w(dst0, dst, 0);
+    dst += 4;
+  }
+}
+
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  int j;
+  v4u32 src0, src1, src2, src3;
+  v4u32 vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 mult0, mult1, mult2, mult3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 dst0, dst1;
+  v4u32 vec_x = (v4u32)__msa_fill_w(x);
+  v4u32 vec_dx = (v4u32)__msa_fill_w(dx);
+  v4u32 vec_const = {0, 1, 2, 3};
+  v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f);
+
+  vec0 = vec_dx * vec_const;
+  vec1 = vec_dx * 4;
+  vec_x += vec0;
+
+  for (j = 0; j < dst_width - 1; j += 8) {
+    vec2 = vec_x >> 16;
+    reg0 = (v16u8)(vec_x >> 9);
+    vec_x += vec1;
+    vec3 = vec_x >> 16;
+    reg1 = (v16u8)(vec_x >> 9);
+    vec_x += vec1;
+    reg0 = reg0 & const_0x7f;
+    reg1 = reg1 & const_0x7f;
+    reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0);
+    reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0);
+    reg2 = reg0 ^ const_0x7f;
+    reg3 = reg1 ^ const_0x7f;
+    mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2);
+    mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2);
+    mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3);
+    mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3);
+    LOAD_INDEXED_DATA(src, vec2, src0);
+    LOAD_INDEXED_DATA(src, vec3, src1);
+    vec2 += 1;
+    vec3 += 1;
+    LOAD_INDEXED_DATA(src, vec2, src2);
+    LOAD_INDEXED_DATA(src, vec3, src3);
+    reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    tmp0 = __msa_dotp_u_h(reg4, mult0);
+    tmp1 = __msa_dotp_u_h(reg5, mult1);
+    tmp2 = __msa_dotp_u_h(reg6, mult2);
+    tmp3 = __msa_dotp_u_h(reg7, mult3);
+    tmp0 >>= 7;
+    tmp1 >>= 7;
+    tmp2 >>= 7;
+    tmp3 >>= 7;
+    dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+    __msa_st_b(dst0, dst_argb, 0);
+    __msa_st_b(dst1, dst_argb, 16);
+    dst_argb += 32;
+  }
+}
+
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x;
+  (void)src_stride;
+  v16u8 src0, src1, src2, src3;
+  v16u8 vec0, vec1, vec2;
+  v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20};
+  v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25};
+  v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20,
+                 21, 23, 24, 25, 27, 28, 29, 31};
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1);
+    vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2);
+    __msa_st_b((v16i8)vec0, dst, 0);
+    __msa_st_b((v16i8)vec1, dst, 16);
+    __msa_st_b((v16i8)vec2, dst, 32);
+    src_ptr += 64;
+    dst += 48;
+  }
+}
+
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
+  v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
+  v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
+  v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
+  v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+  v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
+                 16, 17, 17, 18, 18, 19, 20, 21};
+  v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
+  v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
+  v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
+  v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
+    vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
+    vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
+    vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
+    vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
+    vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
+    vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
+    vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
+    reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
+    reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
+    reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
+    reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
+    reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
+    reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
+    reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
+    reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
+    reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
+    reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
+    reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
+    reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
+    reg0 = __msa_srar_h(reg0, shft0);
+    reg1 = __msa_srar_h(reg1, shft1);
+    reg2 = __msa_srar_h(reg2, shft2);
+    reg3 = __msa_srar_h(reg3, shft0);
+    reg4 = __msa_srar_h(reg4, shft1);
+    reg5 = __msa_srar_h(reg5, shft2);
+    reg6 = __msa_srar_h(reg6, shft0);
+    reg7 = __msa_srar_h(reg7, shft1);
+    reg8 = __msa_srar_h(reg8, shft2);
+    reg9 = __msa_srar_h(reg9, shft0);
+    reg10 = __msa_srar_h(reg10, shft1);
+    reg11 = __msa_srar_h(reg11, shft2);
+    reg0 = reg0 * 3 + reg6;
+    reg1 = reg1 * 3 + reg7;
+    reg2 = reg2 * 3 + reg8;
+    reg3 = reg3 * 3 + reg9;
+    reg4 = reg4 * 3 + reg10;
+    reg5 = reg5 * 3 + reg11;
+    reg0 = __msa_srari_h(reg0, 2);
+    reg1 = __msa_srari_h(reg1, 2);
+    reg2 = __msa_srari_h(reg2, 2);
+    reg3 = __msa_srari_h(reg3, 2);
+    reg4 = __msa_srari_h(reg4, 2);
+    reg5 = __msa_srari_h(reg5, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    __msa_st_b((v16i8)dst0, d, 0);
+    __msa_st_b((v16i8)dst1, d, 16);
+    __msa_st_b((v16i8)dst2, d, 32);
+    s += 64;
+    t += 64;
+    d += 48;
+  }
+}
+
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
+  v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
+  v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
+  v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
+  v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+  v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
+                 16, 17, 17, 18, 18, 19, 20, 21};
+  v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
+  v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
+  v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
+  v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
+    vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
+    vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
+    vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
+    vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
+    vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
+    vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
+    vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
+    reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
+    reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
+    reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
+    reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
+    reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
+    reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
+    reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
+    reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
+    reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
+    reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
+    reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
+    reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
+    reg0 = __msa_srar_h(reg0, shft0);
+    reg1 = __msa_srar_h(reg1, shft1);
+    reg2 = __msa_srar_h(reg2, shft2);
+    reg3 = __msa_srar_h(reg3, shft0);
+    reg4 = __msa_srar_h(reg4, shft1);
+    reg5 = __msa_srar_h(reg5, shft2);
+    reg6 = __msa_srar_h(reg6, shft0);
+    reg7 = __msa_srar_h(reg7, shft1);
+    reg8 = __msa_srar_h(reg8, shft2);
+    reg9 = __msa_srar_h(reg9, shft0);
+    reg10 = __msa_srar_h(reg10, shft1);
+    reg11 = __msa_srar_h(reg11, shft2);
+    reg0 += reg6;
+    reg1 += reg7;
+    reg2 += reg8;
+    reg3 += reg9;
+    reg4 += reg10;
+    reg5 += reg11;
+    reg0 = __msa_srari_h(reg0, 1);
+    reg1 = __msa_srari_h(reg1, 1);
+    reg2 = __msa_srari_h(reg2, 1);
+    reg3 = __msa_srari_h(reg3, 1);
+    reg4 = __msa_srari_h(reg4, 1);
+    reg5 = __msa_srari_h(reg5, 1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    __msa_st_b((v16i8)dst0, d, 0);
+    __msa_st_b((v16i8)dst1, d, 16);
+    __msa_st_b((v16i8)dst2, d, 32);
+    s += 64;
+    t += 64;
+    d += 48;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc
index 44b0c8080d..459a2995df 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc
@@ -23,564 +23,541 @@ extern "C" {
 // Provided by Fritz Koenig
 
 // Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"              // Clobber List
-  );
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into q0, odd into q1
+      "vld2.8     {q0, q1}, [%0]!                \n"
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1"  // Clobber List
+      );
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"     // Clobber List
-  );
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 32 pixels
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop
+      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
+      "vst1.8     {q0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1"  // Clobber List
+      );
 }
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
-    "vpadal.u8  q1, q3                         \n"
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "q0", "q1", "q2", "q3"     // Clobber List
-  );
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %0                         \n"
+      "1:                                        \n"
+      "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+      "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop
+      "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+      "vpaddl.u8  q1, q1                         \n"
+      "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent +
+                                                      // row1
+      "vpadal.u8  q1, q3                         \n"
+      "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and
+                                                      // pack
+      "vrshrn.u16 d1, q1, #2                     \n"
+      "vst1.8     {q0}, [%2]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
-    "subs       %2, %2, #8                     \n" // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d2}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1", "memory", "cc"
-  );
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "vst1.8     {d2}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1", "memory", "cc");
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    MEMACCESS(3)
-    "vld1.8     {q1}, [%3]!                    \n"
-    MEMACCESS(4)
-    "vld1.8     {q2}, [%4]!                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q3}, [%5]!                    \n"
-    "subs       %2, %2, #4                     \n"
-    "vpaddl.u8  q0, q0                         \n"
-    "vpadal.u8  q0, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"
-    "vpadal.u8  q0, q3                         \n"
-    "vpaddl.u16 q0, q0                         \n"
-    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
-    "vmovn.u16  d0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.32    {d0[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width), // %2
-    "+r"(src_ptr1),  // %3
-    "+r"(src_ptr2),  // %4
-    "+r"(src_ptr3)   // %5
-  :
-  : "q0", "q1", "q2", "q3", "memory", "cc"
-  );
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load up 16x4
+      "vld1.8     {q1}, [%3]!                    \n"
+      "vld1.8     {q2}, [%4]!                    \n"
+      "vld1.8     {q3}, [%5]!                    \n"
+      "subs       %2, %2, #4                     \n"
+      "vpaddl.u8  q0, q0                         \n"
+      "vpadal.u8  q0, q1                         \n"
+      "vpadal.u8  q0, q2                         \n"
+      "vpadal.u8  q0, q3                         \n"
+      "vpaddl.u16 q0, q0                         \n"
+      "vrshrn.u32 d0, q0, #4                     \n"  // divide by 16 w/rounding
+      "vmovn.u16  d0, q0                         \n"
+      "vst1.32    {d0[0]}, [%1]!                 \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_ptr1),   // %3
+        "+r"(src_ptr2),   // %4
+        "+r"(src_ptr3)    // %5
+      :
+      : "q0", "q1", "q2", "q3", "memory", "cc");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "subs       %2, %2, #24                  \n"
-    "vmov       d2, d3                       \n" // order d0, d1, d2
-    MEMACCESS(1)
-    "vst3.8     {d0, d1, d2}, [%1]!          \n"
-    "bgt        1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "d0", "d1", "d2", "d3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
+      "subs       %2, %2, #24                    \n"
+      "vmov       d2, d3                         \n"  // order d0, d1, d2
+      "vst3.8     {d0, d1, d2}, [%1]!            \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "d0", "d1", "d2", "d3", "memory", "cc");
 }
 
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vmov.u8    d24, #3                        \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"  // src line 1
+      "subs         %2, %2, #24                  \n"
 
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "vmovl.u8     q8, d4                       \n"
-    "vmovl.u8     q9, d5                       \n"
-    "vmovl.u8     q10, d6                      \n"
-    "vmovl.u8     q11, d7                      \n"
+      // filter src line 0 with src line 1
+      // expand chars to shorts to allow for room
+      // when adding lines together
+      "vmovl.u8     q8, d4                       \n"
+      "vmovl.u8     q9, d5                       \n"
+      "vmovl.u8     q10, d6                      \n"
+      "vmovl.u8     q11, d7                      \n"
 
-    // 3 * line_0 + line_1
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vmlal.u8     q9, d1, d24                  \n"
-    "vmlal.u8     q10, d2, d24                 \n"
-    "vmlal.u8     q11, d3, d24                 \n"
+      // 3 * line_0 + line_1
+      "vmlal.u8     q8, d0, d24                  \n"
+      "vmlal.u8     q9, d1, d24                  \n"
+      "vmlal.u8     q10, d2, d24                 \n"
+      "vmlal.u8     q11, d3, d24                 \n"
 
-    // (3 * line_0 + line_1) >> 2
-    "vqrshrn.u16  d0, q8, #2                   \n"
-    "vqrshrn.u16  d1, q9, #2                   \n"
-    "vqrshrn.u16  d2, q10, #2                  \n"
-    "vqrshrn.u16  d3, q11, #2                  \n"
+      // (3 * line_0 + line_1) >> 2
+      "vqrshrn.u16  d0, q8, #2                   \n"
+      "vqrshrn.u16  d1, q9, #2                   \n"
+      "vqrshrn.u16  d2, q10, #2                  \n"
+      "vqrshrn.u16  d3, q11, #2                  \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q8, d1                       \n"
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vqrshrn.u16  d0, q8, #2                   \n"
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "vmovl.u8     q8, d1                       \n"
+      "vmlal.u8     q8, d0, d24                  \n"
+      "vqrshrn.u16  d0, q8, #2                   \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "vrhadd.u8    d1, d1, d2                   \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q8, d2                       \n"
-    "vmlal.u8     q8, d3, d24                  \n"
-    "vqrshrn.u16  d2, q8, #2                   \n"
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "vmovl.u8     q8, d2                       \n"
+      "vmlal.u8     q8, d3, d24                  \n"
+      "vqrshrn.u16  d2, q8, #2                   \n"
 
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+      "vst3.8       {d0, d1, d2}, [%1]!          \n"
 
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
-  );
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+        "cc");
 }
 
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
-    // average src line 0 with src line 1
-    "vrhadd.u8    q0, q0, q2                   \n"
-    "vrhadd.u8    q1, q1, q3                   \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vmov.u8    d24, #3                        \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"  // src line 1
+      "subs         %2, %2, #24                  \n"
+      // average src line 0 with src line 1
+      "vrhadd.u8    q0, q0, q2                   \n"
+      "vrhadd.u8    q1, q1, q3                   \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q3, d1                       \n"
-    "vmlal.u8     q3, d0, d24                  \n"
-    "vqrshrn.u16  d0, q3, #2                   \n"
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "vmovl.u8     q3, d1                       \n"
+      "vmlal.u8     q3, d0, d24                  \n"
+      "vqrshrn.u16  d0, q3, #2                   \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "vrhadd.u8    d1, d1, d2                   \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q3, d2                       \n"
-    "vmlal.u8     q3, d3, d24                  \n"
-    "vqrshrn.u16  d2, q3, #2                   \n"
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "vmovl.u8     q3, d2                       \n"
+      "vmlal.u8     q3, d3, d24                  \n"
+      "vqrshrn.u16  d2, q3, #2                   \n"
 
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
-  );
+      "vst3.8       {d0, d1, d2}, [%1]!          \n"
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
 }
 
 #define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
+                              22, 24, 27, 30, 0,  0,  0,  0};
+static const uvec8 kShuf38_2 = {0,  8, 16, 2,  10, 17, 4, 12,
+                                18, 6, 14, 19, 0,  0,  0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18};
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q3}, [%3]                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
-    "subs       %2, %2, #12                    \n"
-    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
-    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
-    MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"
-    MEMACCESS(1)
-    "vst1.32    {d5[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vld1.8     {q3}, [%3]                     \n"
+      "1:                                        \n"
+      "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
+      "subs       %2, %2, #12                    \n"
+      "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
+      "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+      "vst1.8     {d4}, [%1]!                    \n"
+      "vst1.32    {d5[0]}, [%1]!                 \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "r"(&kShuf38)    // %3
+      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
 }
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
 
-  asm volatile (
-    MEMACCESS(5)
-    "vld1.16    {q13}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {q14}, [%6]                    \n"
-    MEMACCESS(7)
-    "vld1.8     {q15}, [%7]                    \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
+  asm volatile(
+      "vld1.16    {q13}, [%5]                    \n"
+      "vld1.8     {q14}, [%6]                    \n"
+      "vld1.8     {q15}, [%7]                    \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
 
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    MEMACCESS(4)
-    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
-    "subs         %2, %2, #12                  \n"
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+      "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+      "subs         %2, %2, #12                  \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-    "vtrn.u8      d16, d17                     \n"
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8      d0, d1                       \n"
+      "vtrn.u8      d4, d5                       \n"
+      "vtrn.u8      d16, d17                     \n"
 
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-    "vtrn.u8      d18, d19                     \n"
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8      d2, d3                       \n"
+      "vtrn.u8      d6, d7                       \n"
+      "vtrn.u8      d18, d19                     \n"
 
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-    "vpaddl.u8    q8, q8                       \n"
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8    q0, q0                       \n"
+      "vpaddl.u8    q2, q2                       \n"
+      "vpaddl.u8    q8, q8                       \n"
 
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-    "vpaddl.u8    d19, d19                     \n"
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8    d3, d3                       \n"
+      "vpaddl.u8    d7, d7                       \n"
+      "vpaddl.u8    d19, d19                     \n"
 
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     q0, q8                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-    "vadd.u16     d4, d19                      \n"
+      // combine source lines
+      "vadd.u16     q0, q2                       \n"
+      "vadd.u16     q0, q8                       \n"
+      "vadd.u16     d4, d3, d7                   \n"
+      "vadd.u16     d4, d19                      \n"
 
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "vqrdmulh.s16 q2, q2, q13                  \n"
-    "vmovn.u16    d4, q2                       \n"
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "vqrdmulh.s16 q2, q2, q13                  \n"
+      "vmovn.u16    d4, q2                       \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-    "vmovl.u8     q9, d18                      \n"
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8     q1, d2                       \n"
+      "vmovl.u8     q3, d6                       \n"
+      "vmovl.u8     q9, d18                      \n"
 
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-    "vadd.u16     q1, q9                       \n"
+      // combine source lines
+      "vadd.u16     q1, q3                       \n"
+      "vadd.u16     q1, q9                       \n"
 
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32     d2, d3                       \n"
 
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16     d2, d3                       \n"
 
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
+      // 0+1+2, 3+4+5
+      "vadd.u16     q0, q1                       \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q15                  \n"
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q15                  \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+      "vmov.u8      d2, d4                       \n"
 
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+      "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+      "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_ptr1)          // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
-  );
+      "vst1.8       {d3}, [%1]!                  \n"
+      "vst1.32      {d4[0]}, [%1]!               \n"
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride),    // %3
+        "+r"(src_ptr1)       // %4
+      : "r"(&kMult38_Div6),  // %5
+        "r"(&kShuf38_2),     // %6
+        "r"(&kMult38_Div9)   // %7
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+        "cc");
 }
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(4)
-    "vld1.16    {q13}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q14}, [%5]                    \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vld1.16    {q13}, [%4]                    \n"
+      "vld1.8     {q14}, [%5]                    \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
 
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "subs         %2, %2, #12                  \n"
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+      "subs         %2, %2, #12                  \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8      d0, d1                       \n"
+      "vtrn.u8      d4, d5                       \n"
 
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8      d2, d3                       \n"
+      "vtrn.u8      d6, d7                       \n"
 
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8    q0, q0                       \n"
+      "vpaddl.u8    q2, q2                       \n"
 
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8    d3, d3                       \n"
+      "vpaddl.u8    d7, d7                       \n"
 
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
+      // combine source lines
+      "vadd.u16     q0, q2                       \n"
+      "vadd.u16     d4, d3, d7                   \n"
 
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "vqrshrn.u16  d4, q2, #2                   \n"
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "vqrshrn.u16  d4, q2, #2                   \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8     q1, d2                       \n"
+      "vmovl.u8     q3, d6                       \n"
 
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
+      // combine source lines
+      "vadd.u16     q1, q3                       \n"
 
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32     d2, d3                       \n"
 
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16     d2, d3                       \n"
 
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
+      // 0+1+2, 3+4+5
+      "vadd.u16     q0, q1                       \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q13                  \n"
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q13                  \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+      "vmov.u8      d2, d4                       \n"
 
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+      "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+      "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),       // %0
-    "+r"(dst_ptr),       // %1
-    "+r"(dst_width),     // %2
-    "+r"(src_stride)     // %3
-  : "r"(&kMult38_Div6),  // %4
-    "r"(&kShuf38_2)      // %5
-  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
-  );
+      "vst1.8       {d3}, [%1]!                  \n"
+      "vst1.32      {d4[0]}, [%1]!               \n"
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride)     // %3
+      : "r"(&kMult38_Div6),  // %4
+        "r"(&kShuf38_2)      // %5
+      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       r12, %5                         \n"
-    "veor      q2, q2, q2                      \n"
-    "veor      q3, q3, q3                      \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], %3                 \n"
-    "vaddw.u8   q3, q3, d1                     \n"
-    "vaddw.u8   q2, q2, d0                     \n"
-    "subs       r12, r12, #1                   \n"
-    "bgt        2b                             \n"
-    MEMACCESS(2)
-    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
-    "add        %1, %1, #16                    \n"
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop
-    "bgt        1b                             \n"
-  : "=&r"(src_tmp),    // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_ptr),     // %2
-    "+r"(src_stride),  // %3
-    "+r"(src_width),   // %4
-    "+r"(src_height)   // %5
-  :
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void ScaleAddRows_NEON(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint16_t* dst_ptr,
+                       int src_width,
+                       int src_height) {
+  const uint8_t* src_tmp;
+  asm volatile(
+      "1:                                        \n"
+      "mov       %0, %1                          \n"
+      "mov       r12, %5                         \n"
+      "veor      q2, q2, q2                      \n"
+      "veor      q3, q3, q3                      \n"
+      "2:                                        \n"
+      // load 16 pixels into q0
+      "vld1.8     {q0}, [%0], %3                 \n"
+      "vaddw.u8   q3, q3, d1                     \n"
+      "vaddw.u8   q2, q2, d0                     \n"
+      "subs       r12, r12, #1                   \n"
+      "bgt        2b                             \n"
+      "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+      "add        %1, %1, #16                    \n"
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop
+      "bgt        1b                             \n"
+      : "=&r"(src_tmp),    // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_ptr),     // %2
+        "+r"(src_stride),  // %3
+        "+r"(src_width),   // %4
+        "+r"(src_height)   // %5
+      :
+      : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                     \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+#define LOAD2_DATA8_LANE(n)                      \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5                     \n" \
+  "add        %3, %3, %4                     \n" \
+  "vld2.8     {d6[" #n "], d7[" #n "]}, [%6] \n"
 
-// The NEON version mimics this formula:
-// #define BLENDER(a, b, f) (uint8)((int)(a) +
-//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
+  const uint8_t* src_tmp = src_ptr;
   asm volatile (
     "vdup.32    q0, %3                         \n"  // x
     "vdup.32    q1, %4                         \n"  // dx
@@ -617,7 +594,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     "vadd.s16   q8, q8, q9                     \n"
     "vmovn.s16  d6, q8                         \n"
 
-    MEMACCESS(0)
     "vst1.8     {d6}, [%0]!                    \n"  // store pixels
     "vadd.s32   q1, q1, q0                     \n"
     "vadd.s32   q2, q2, q0                     \n"
@@ -639,325 +615,299 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
 #undef LOAD2_DATA8_LANE
 
 // 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-  asm volatile (
-    "cmp          %4, #0                       \n"
-    "beq          100f                         \n"
-    "add          %2, %1                       \n"
-    "cmp          %4, #64                      \n"
-    "beq          75f                          \n"
-    "cmp          %4, #128                     \n"
-    "beq          50f                          \n"
-    "cmp          %4, #192                     \n"
-    "beq          25f                          \n"
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  asm volatile(
+      "cmp          %4, #0                       \n"
+      "beq          100f                         \n"
+      "add          %2, %1                       \n"
+      "cmp          %4, #64                      \n"
+      "beq          75f                          \n"
+      "cmp          %4, #128                     \n"
+      "beq          50f                          \n"
+      "cmp          %4, #192                     \n"
+      "beq          25f                          \n"
 
-    "vdup.8       d5, %4                       \n"
-    "rsb          %4, #256                     \n"
-    "vdup.8       d4, %4                       \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vmull.u8     q13, d0, d4                  \n"
-    "vmull.u8     q14, d1, d4                  \n"
-    "vmlal.u8     q13, d2, d5                  \n"
-    "vmlal.u8     q14, d3, d5                  \n"
-    "vrshrn.u16   d0, q13, #8                  \n"
-    "vrshrn.u16   d1, q14, #8                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          1b                           \n"
-    "b            99f                          \n"
+      "vdup.8       d5, %4                       \n"
+      "rsb          %4, #256                     \n"
+      "vdup.8       d4, %4                       \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "vld1.8       {q1}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vmull.u8     q13, d0, d4                  \n"
+      "vmull.u8     q14, d1, d4                  \n"
+      "vmlal.u8     q13, d2, d5                  \n"
+      "vmlal.u8     q14, d3, d5                  \n"
+      "vrshrn.u16   d0, q13, #8                  \n"
+      "vrshrn.u16   d1, q14, #8                  \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          1b                           \n"
+      "b            99f                          \n"
 
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          25b                          \n"
-    "b            99f                          \n"
+      // Blend 25 / 75.
+      "25:                                       \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "vld1.8       {q1}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          25b                          \n"
+      "b            99f                          \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          50b                          \n"
-    "b            99f                          \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "vld1.8       {q1}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          50b                          \n"
+      "b            99f                          \n"
 
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q1}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q0}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          75b                          \n"
-    "b            99f                          \n"
+      // Blend 75 / 25.
+      "75:                                       \n"
+      "vld1.8       {q1}, [%1]!                  \n"
+      "vld1.8       {q0}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          75b                          \n"
+      "b            99f                          \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          100b                         \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          100b                         \n"
 
-  "99:                                         \n"
-    MEMACCESS(0)
-    "vst1.8       {d1[7]}, [%0]                \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
-  :
-  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
-  );
+      "99:                                       \n"
+      "vst1.8       {d1[7]}, [%0]                \n"
+      : "+r"(dst_ptr),           // %0
+        "+r"(src_ptr),           // %1
+        "+r"(src_stride),        // %2
+        "+r"(dst_width),         // %3
+        "+r"(source_y_fraction)  // %4
+      :
+      : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
 }
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.32    {q0, q1}, [%0]!                \n"
-    MEMACCESS(0)
-    "vld2.32    {q2, q3}, [%0]!                \n"
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    MEMACCESS(1)
-    "vst1.8     {q3}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.32    {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.32    {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "vmov       q2, q1                         \n"  // load next 8 ARGB
+      "vst2.32    {q2, q3}, [%1]!                \n"  // store odd pixels
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    "vrshrn.u16 d2, q2, #1                     \n"
-    "vrshrn.u16 d3, q3, #1                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
-    "bgt       1b                              \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
-  );
+//  46:  f964 018d   vld4.32  {d16,d18,d20,d22}, [r4]!
+//  4a:  3e04        subs  r6, #4
+//  4c:  f964 118d   vld4.32  {d17,d19,d21,d23}, [r4]!
+//  50:  ef64 21f4   vorr  q9, q10, q10
+//  54:  f942 038d   vst2.32  {d16-d19}, [r2]!
+//  58:  d1f5        bne.n  46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.32    {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.32    {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
+      "vrhadd.u8  q1, q2, q3                     \n"  // rounding half add
+      "vst2.32    {q0, q1}, [%1]!                \n"
+      "bgt       1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
-    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    "vrshrn.u16 d2, q2, #2                     \n"
-    "vrshrn.u16 d3, q3, #2                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+      "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+      "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB
+      "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB
+      "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+      "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+      "vrshrn.u16 d0, q0, #2                     \n"  // round and pack to bytes
+      "vrshrn.u16 d1, q1, #2                     \n"
+      "vrshrn.u16 d2, q2, #2                     \n"
+      "vrshrn.u16 d3, q3, #2                     \n"
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %3, lsl #2                \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[1]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[1]}, [%0], r12             \n"
-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"(src_stepx)     // %3
-  : "memory", "cc", "r12", "q0"
-  );
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "mov        r12, %3, lsl #2                \n"
+      "1:                                        \n"
+      "vld1.32    {d0[0]}, [%0], r12             \n"
+      "vld1.32    {d0[1]}, [%0], r12             \n"
+      "vld1.32    {d1[0]}, [%0], r12             \n"
+      "vld1.32    {d1[1]}, [%0], r12             \n"
+      "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+      "vst1.8     {q0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      : "r"(src_stepx)   // %3
+      : "memory", "cc", "r12", "q0");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %4, lsl #2                \n"
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d4}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d5}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d6}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d7}, [%1], r12                \n"
-    "vaddl.u8   q0, d0, d1                     \n"
-    "vaddl.u8   q1, d2, d3                     \n"
-    "vaddl.u8   q2, d4, d5                     \n"
-    "vaddl.u8   q3, d6, d7                     \n"
-    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
-    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
-    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
-    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
-    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
-    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
-    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"(src_stepx)     // %4
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  asm volatile(
+      "mov        r12, %4, lsl #2                \n"
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0], r12                \n"  // 4 2x2 blocks -> 2x1
+      "vld1.8     {d1}, [%1], r12                \n"
+      "vld1.8     {d2}, [%0], r12                \n"
+      "vld1.8     {d3}, [%1], r12                \n"
+      "vld1.8     {d4}, [%0], r12                \n"
+      "vld1.8     {d5}, [%1], r12                \n"
+      "vld1.8     {d6}, [%0], r12                \n"
+      "vld1.8     {d7}, [%1], r12                \n"
+      "vaddl.u8   q0, d0, d1                     \n"
+      "vaddl.u8   q1, d2, d3                     \n"
+      "vaddl.u8   q2, d4, d5                     \n"
+      "vaddl.u8   q3, d6, d7                     \n"
+      "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+      "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+      "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+      "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+      "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+      "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+      "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+      "vst1.8     {q0}, [%2]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_argb),    // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(dst_width)    // %3
+      : "r"(src_stepx)     // %4
+      : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+#define LOAD1_DATA32_LANE(dn, n)                 \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5, lsl #2             \n" \
+  "add        %3, %3, %4                     \n" \
+  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"
 
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   int tmp;
-  const uint8* src_tmp = src_argb;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(d0, 0)
-    LOAD1_DATA32_LANE(d0, 1)
-    LOAD1_DATA32_LANE(d1, 0)
-    LOAD1_DATA32_LANE(d1, 1)
-    LOAD1_DATA32_LANE(d2, 0)
-    LOAD1_DATA32_LANE(d2, 1)
-    LOAD1_DATA32_LANE(d3, 0)
-    LOAD1_DATA32_LANE(d3, 1)
-
-    MEMACCESS(0)
-    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
-    "+r"(dst_width),  // %2
-    "+r"(x),          // %3
-    "+r"(dx),         // %4
-    "=&r"(tmp),       // %5
-    "+r"(src_tmp)     // %6
-  :
-  : "memory", "cc", "q0", "q1"
-  );
+  const uint8_t* src_tmp = src_argb;
+  asm volatile(
+      "1:                                        \n"
+      // clang-format off
+      LOAD1_DATA32_LANE(d0, 0)
+      LOAD1_DATA32_LANE(d0, 1)
+      LOAD1_DATA32_LANE(d1, 0)
+      LOAD1_DATA32_LANE(d1, 1)
+      LOAD1_DATA32_LANE(d2, 0)
+      LOAD1_DATA32_LANE(d2, 1)
+      LOAD1_DATA32_LANE(d3, 0)
+      LOAD1_DATA32_LANE(d3, 1)
+      // clang-format on
+      "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "bgt        1b                             \n"
+      : "+r"(dst_argb),   // %0
+        "+r"(src_argb),   // %1
+        "+r"(dst_width),  // %2
+        "+r"(x),          // %3
+        "+r"(dx),         // %4
+        "=&r"(tmp),       // %5
+        "+r"(src_tmp)     // %6
+      :
+      : "memory", "cc", "q0", "q1");
 }
 
 #undef LOAD1_DATA32_LANE
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
+  "lsr        %5, %3, #16                                \n" \
+  "add        %6, %1, %5, lsl #2                         \n" \
+  "add        %3, %3, %4                                 \n" \
+  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
 
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
+  const uint8_t* src_tmp = src_argb;
   asm volatile (
     "vdup.32    q0, %3                         \n"  // x
     "vdup.32    q1, %4                         \n"  // dx
@@ -993,7 +943,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
     "vshrn.i16   d0, q11, #7                   \n"
     "vshrn.i16   d1, q12, #7                   \n"
 
-    MEMACCESS(0)
     "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
     "vadd.s32    q8, q8, q9                    \n"
     "subs        %2, %2, #4                    \n"  // 4 processed per loop
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc
index ff277f26ff..494a9cfbfb 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/scale.h"
 #include "libyuv/row.h"
+#include "libyuv/scale.h"
 #include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
@@ -21,580 +21,556 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 32x1 throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into v0, odd into v1
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"              // Clobber List
-  );
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into v0, odd into v1
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+      "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1"  // Clobber List
+      );
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #1              \n"
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"     // Clobber List
-  );
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into v0, odd into v1
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
+      "st1        {v0.16b}, [%1], #16            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1"  // Clobber List
+      );
 }
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
-    "uadalp     v1.8h, v3.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #2              \n"
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "v0", "v1", "v2", "v3"     // Clobber List
-  );
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld1        {v0.16b, v1.16b}, [%0], #32    \n"  // load row 1 and post inc
+      "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+      "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
+      "uaddlp     v1.8h, v1.16b                  \n"
+      "uadalp     v0.8h, v2.16b                  \n"  // += row 2 add adjacent
+      "uadalp     v1.8h, v3.16b                  \n"
+      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
+      "rshrn2     v0.16b, v1.8h, #2              \n"
+      "st1        {v0.16b}, [%2], #16            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "st1     {v2.8b}, [%1], #8                 \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32  \n"  // src line 0
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "st1     {v2.8b}, [%1], #8                 \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
-    MEMACCESS(3)
-    "ld1     {v1.16b}, [%2], #16               \n"
-    MEMACCESS(4)
-    "ld1     {v2.16b}, [%3], #16               \n"
-    MEMACCESS(5)
-    "ld1     {v3.16b}, [%4], #16               \n"
-    "subs    %w5, %w5, #4                      \n"
-    "uaddlp  v0.8h, v0.16b                     \n"
-    "uadalp  v0.8h, v1.16b                     \n"
-    "uadalp  v0.8h, v2.16b                     \n"
-    "uadalp  v0.8h, v3.16b                     \n"
-    "addp    v0.8h, v0.8h, v0.8h               \n"
-    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
-    MEMACCESS(1)
-    "st1    {v0.s}[0], [%1], #4                \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(src_ptr1),  // %2
-    "+r"(src_ptr2),  // %3
-    "+r"(src_ptr3),  // %4
-    "+r"(dst_width)  // %5
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile(
+      "1:                                        \n"
+      "ld1     {v0.16b}, [%0], #16               \n"  // load up 16x4
+      "ld1     {v1.16b}, [%2], #16               \n"
+      "ld1     {v2.16b}, [%3], #16               \n"
+      "ld1     {v3.16b}, [%4], #16               \n"
+      "subs    %w5, %w5, #4                      \n"
+      "uaddlp  v0.8h, v0.16b                     \n"
+      "uadalp  v0.8h, v1.16b                     \n"
+      "uadalp  v0.8h, v2.16b                     \n"
+      "uadalp  v0.8h, v3.16b                     \n"
+      "addp    v0.8h, v0.8h, v0.8h               \n"
+      "rshrn   v0.8b, v0.8h, #4                  \n"  // divide by 16 w/rounding
+      "st1    {v0.s}[0], [%1], #4                \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_ptr1),  // %2
+        "+r"(src_ptr2),  // %3
+        "+r"(src_ptr3),  // %4
+        "+r"(dst_width)  // %5
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load up the every 4th pixel into a 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    "subs      %w2, %w2, #24                           \n"
-    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                                \n"
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
+      "subs      %w2, %w2, #24                           \n"
+      "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0,v1,v2
+      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "movi      v20.8b, #3                              \n"
+      "add       %3, %3, %0                              \n"
+      "1:                                                \n"
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
+      "subs         %w2, %w2, #24                        \n"
 
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "ushll     v16.8h, v4.8b, #0                       \n"
-    "ushll     v17.8h, v5.8b, #0                       \n"
-    "ushll     v18.8h, v6.8b, #0                       \n"
-    "ushll     v19.8h, v7.8b, #0                       \n"
+      // filter src line 0 with src line 1
+      // expand chars to shorts to allow for room
+      // when adding lines together
+      "ushll     v16.8h, v4.8b, #0                       \n"
+      "ushll     v17.8h, v5.8b, #0                       \n"
+      "ushll     v18.8h, v6.8b, #0                       \n"
+      "ushll     v19.8h, v7.8b, #0                       \n"
 
-    // 3 * line_0 + line_1
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "umlal     v17.8h, v1.8b, v20.8b                   \n"
-    "umlal     v18.8h, v2.8b, v20.8b                   \n"
-    "umlal     v19.8h, v3.8b, v20.8b                   \n"
+      // 3 * line_0 + line_1
+      "umlal     v16.8h, v0.8b, v20.8b                   \n"
+      "umlal     v17.8h, v1.8b, v20.8b                   \n"
+      "umlal     v18.8h, v2.8b, v20.8b                   \n"
+      "umlal     v19.8h, v3.8b, v20.8b                   \n"
 
-    // (3 * line_0 + line_1) >> 2
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
-    "uqrshrn   v1.8b, v17.8h, #2                       \n"
-    "uqrshrn   v2.8b, v18.8h, #2                       \n"
-    "uqrshrn   v3.8b, v19.8h, #2                       \n"
+      // (3 * line_0 + line_1) >> 2
+      "uqrshrn   v0.8b, v16.8h, #2                       \n"
+      "uqrshrn   v1.8b, v17.8h, #2                       \n"
+      "uqrshrn   v2.8b, v18.8h, #2                       \n"
+      "uqrshrn   v3.8b, v19.8h, #2                       \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v16.8h, v1.8b, #0                       \n"
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "ushll     v16.8h, v1.8b, #0                       \n"
+      "umlal     v16.8h, v0.8b, v20.8b                   \n"
+      "uqrshrn   v0.8b, v16.8h, #2                       \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "urhadd    v1.8b, v1.8b, v2.8b                     \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v16.8h, v2.8b, #0                       \n"
-    "umlal     v16.8h, v3.8b, v20.8b                   \n"
-    "uqrshrn   v2.8b, v16.8h, #2                       \n"
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "ushll     v16.8h, v2.8b, #0                       \n"
+      "umlal     v16.8h, v3.8b, v20.8b                   \n"
+      "uqrshrn   v2.8b, v16.8h, #2                       \n"
 
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
 
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
-    "v20", "memory", "cc"
-  );
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "memory", "cc");
 }
 
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
-    // average src line 0 with src line 1
-    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
-    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
-    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
-    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "movi      v20.8b, #3                              \n"
+      "add       %3, %3, %0                              \n"
+      "1:                                                \n"
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
+      "subs         %w2, %w2, #24                        \n"
+      // average src line 0 with src line 1
+      "urhadd    v0.8b, v0.8b, v4.8b                     \n"
+      "urhadd    v1.8b, v1.8b, v5.8b                     \n"
+      "urhadd    v2.8b, v2.8b, v6.8b                     \n"
+      "urhadd    v3.8b, v3.8b, v7.8b                     \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v4.8h, v1.8b, #0                        \n"
-    "umlal     v4.8h, v0.8b, v20.8b                    \n"
-    "uqrshrn   v0.8b, v4.8h, #2                        \n"
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "ushll     v4.8h, v1.8b, #0                        \n"
+      "umlal     v4.8h, v0.8b, v20.8b                    \n"
+      "uqrshrn   v0.8b, v4.8h, #2                        \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "urhadd    v1.8b, v1.8b, v2.8b                     \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v4.8h, v2.8b, #0                        \n"
-    "umlal     v4.8h, v3.8b, v20.8b                    \n"
-    "uqrshrn   v2.8b, v4.8h, #2                        \n"
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "ushll     v4.8h, v2.8b, #0                        \n"
+      "umlal     v4.8h, v3.8b, v20.8b                    \n"
+      "uqrshrn   v2.8b, v4.8h, #2                        \n"
 
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
-  );
+      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
 }
 
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
+                              22, 24, 27, 30, 0,  0,  0,  0};
+static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
+                                34, 6,  22, 35, 0,  0,  0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18};
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1       {v3.16b}, [%3]                          \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
-    "subs      %w2, %w2, #12                           \n"
-    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
-    MEMACCESS(1)
-    "st1       {v2.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v2.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "ld1       {v3.16b}, [%3]                          \n"
+      "1:                                                \n"
+      "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
+      "subs      %w2, %w2, #12                           \n"
+      "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
+      "st1       {v2.8b}, [%1], #8                       \n"
+      "st1       {v2.s}[2], [%1], #4                     \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "r"(&kShuf38)    // %3
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
   ptrdiff_t tmp_src_stride = src_stride;
 
-  asm volatile (
-    MEMACCESS(5)
-    "ld1       {v29.8h}, [%5]                          \n"
-    MEMACCESS(6)
-    "ld1       {v30.16b}, [%6]                         \n"
-    MEMACCESS(7)
-    "ld1       {v31.8h}, [%7]                          \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
+  asm volatile(
+      "ld1       {v29.8h}, [%5]                          \n"
+      "ld1       {v30.16b}, [%6]                         \n"
+      "ld1       {v31.8h}, [%7]                          \n"
+      "add       %2, %2, %0                              \n"
+      "1:                                                \n"
 
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    MEMACCESS(4)
-    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
-    "subs      %w4, %w4, #12                           \n"
+      // 00 40 01 41 02 42 03 43
+      // 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63
+      // 30 70 31 71 32 72 33 73
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
+      "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32  \n"
+      "subs      %w4, %w4, #12                           \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v20.8b, v0.8b, v1.8b                    \n"
-    "trn2      v21.8b, v0.8b, v1.8b                    \n"
-    "trn1      v22.8b, v4.8b, v5.8b                    \n"
-    "trn2      v23.8b, v4.8b, v5.8b                    \n"
-    "trn1      v24.8b, v16.8b, v17.8b                  \n"
-    "trn2      v25.8b, v16.8b, v17.8b                  \n"
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // 00 10 01 11 02 12 03 13
+      // 40 50 41 51 42 52 43 53
+      "trn1      v20.8b, v0.8b, v1.8b                    \n"
+      "trn2      v21.8b, v0.8b, v1.8b                    \n"
+      "trn1      v22.8b, v4.8b, v5.8b                    \n"
+      "trn2      v23.8b, v4.8b, v5.8b                    \n"
+      "trn1      v24.8b, v16.8b, v17.8b                  \n"
+      "trn2      v25.8b, v16.8b, v17.8b                  \n"
 
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
-    "trn1      v16.8b, v18.8b, v19.8b                  \n"
-    "trn2      v17.8b, v18.8b, v19.8b                  \n"
+      // 20 30 21 31 22 32 23 33
+      // 60 70 61 71 62 72 63 73
+      "trn1      v0.8b, v2.8b, v3.8b                     \n"
+      "trn2      v1.8b, v2.8b, v3.8b                     \n"
+      "trn1      v4.8b, v6.8b, v7.8b                     \n"
+      "trn2      v5.8b, v6.8b, v7.8b                     \n"
+      "trn1      v16.8b, v18.8b, v19.8b                  \n"
+      "trn2      v17.8b, v18.8b, v19.8b                  \n"
 
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v20.4h, v20.8b                          \n"
-    "uaddlp    v21.4h, v21.8b                          \n"
-    "uaddlp    v22.4h, v22.8b                          \n"
-    "uaddlp    v23.4h, v23.8b                          \n"
-    "uaddlp    v24.4h, v24.8b                          \n"
-    "uaddlp    v25.4h, v25.8b                          \n"
+      // 00+10 01+11 02+12 03+13
+      // 40+50 41+51 42+52 43+53
+      "uaddlp    v20.4h, v20.8b                          \n"
+      "uaddlp    v21.4h, v21.8b                          \n"
+      "uaddlp    v22.4h, v22.8b                          \n"
+      "uaddlp    v23.4h, v23.8b                          \n"
+      "uaddlp    v24.4h, v24.8b                          \n"
+      "uaddlp    v25.4h, v25.8b                          \n"
 
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
+      // 60+70 61+71 62+72 63+73
+      "uaddlp    v1.4h, v1.8b                            \n"
+      "uaddlp    v5.4h, v5.8b                            \n"
+      "uaddlp    v17.4h, v17.8b                          \n"
 
-    // combine source lines
-    "add       v20.4h, v20.4h, v22.4h                  \n"
-    "add       v21.4h, v21.4h, v23.4h                  \n"
-    "add       v20.4h, v20.4h, v24.4h                  \n"
-    "add       v21.4h, v21.4h, v25.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
-    "add       v2.4h, v2.4h, v17.4h                    \n"
+      // combine source lines
+      "add       v20.4h, v20.4h, v22.4h                  \n"
+      "add       v21.4h, v21.4h, v23.4h                  \n"
+      "add       v20.4h, v20.4h, v24.4h                  \n"
+      "add       v21.4h, v21.4h, v25.4h                  \n"
+      "add       v2.4h, v1.4h, v5.4h                     \n"
+      "add       v2.4h, v2.4h, v17.4h                    \n"
 
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
-    "xtn       v2.8b,  v2.8h                           \n"
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
+      "xtn       v2.8b,  v2.8h                           \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "ushll     v16.8h, v16.8b, #0                      \n"
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "ushll     v16.8h, v16.8b, #0                      \n"
+      "uaddl     v0.8h, v0.8b, v4.8b                     \n"
 
-    // combine source lines
-    "add       v0.8h, v0.8h, v16.8h                    \n"
+      // combine source lines
+      "add       v0.8h, v0.8h, v16.8h                    \n"
 
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
+      // xx 20 xx 21 xx 22 xx 23
+      // xx 30 xx 31 xx 32 xx 33
+      "trn1      v1.8h, v0.8h, v0.8h                     \n"
+      "trn2      v4.8h, v0.8h, v0.8h                     \n"
+      "xtn       v0.4h, v1.4s                            \n"
+      "xtn       v4.4h, v4.4s                            \n"
 
-    // 0+1+2, 3+4+5
-    "add       v20.8h, v20.8h, v0.8h                   \n"
-    "add       v21.8h, v21.8h, v4.8h                   \n"
+      // 0+1+2, 3+4+5
+      "add       v20.8h, v20.8h, v0.8h                   \n"
+      "add       v21.8h, v21.8h, v4.8h                   \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
-    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
+      "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+      // Align for table lookup, vtbl requires registers to be adjacent
+      "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
 
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(tmp_src_stride),   // %2
-    "+r"(src_ptr1),         // %3
-    "+r"(dst_width)         // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
-    "v30", "v31", "memory", "cc"
-  );
+      "st1       {v3.8b}, [%1], #8                       \n"
+      "st1       {v3.s}[2], [%1], #4                     \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),         // %0
+        "+r"(dst_ptr),         // %1
+        "+r"(tmp_src_stride),  // %2
+        "+r"(src_ptr1),        // %3
+        "+r"(dst_width)        // %4
+      : "r"(&kMult38_Div6),    // %5
+        "r"(&kShuf38_2),       // %6
+        "r"(&kMult38_Div9)     // %7
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+        "memory", "cc");
 }
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8_t* dst_ptr,
+                               int dst_width) {
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile (
-    MEMACCESS(4)
-    "ld1       {v30.8h}, [%4]                          \n"
-    MEMACCESS(5)
-    "ld1       {v31.16b}, [%5]                         \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
+  asm volatile(
+      "ld1       {v30.8h}, [%4]                          \n"
+      "ld1       {v31.16b}, [%5]                         \n"
+      "add       %2, %2, %0                              \n"
+      "1:                                                \n"
 
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    "subs      %w3, %w3, #12                           \n"
+      // 00 40 01 41 02 42 03 43
+      // 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63
+      // 30 70 31 71 32 72 33 73
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
+      "subs      %w3, %w3, #12                           \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v16.8b, v0.8b, v1.8b                    \n"
-    "trn2      v17.8b, v0.8b, v1.8b                    \n"
-    "trn1      v18.8b, v4.8b, v5.8b                    \n"
-    "trn2      v19.8b, v4.8b, v5.8b                    \n"
+      // Shuffle the input data around to get align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // 00 10 01 11 02 12 03 13
+      // 40 50 41 51 42 52 43 53
+      "trn1      v16.8b, v0.8b, v1.8b                    \n"
+      "trn2      v17.8b, v0.8b, v1.8b                    \n"
+      "trn1      v18.8b, v4.8b, v5.8b                    \n"
+      "trn2      v19.8b, v4.8b, v5.8b                    \n"
 
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+      // 20 30 21 31 22 32 23 33
+      // 60 70 61 71 62 72 63 73
+      "trn1      v0.8b, v2.8b, v3.8b                     \n"
+      "trn2      v1.8b, v2.8b, v3.8b                     \n"
+      "trn1      v4.8b, v6.8b, v7.8b                     \n"
+      "trn2      v5.8b, v6.8b, v7.8b                     \n"
 
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v16.4h, v16.8b                          \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
-    "uaddlp    v18.4h, v18.8b                          \n"
-    "uaddlp    v19.4h, v19.8b                          \n"
+      // 00+10 01+11 02+12 03+13
+      // 40+50 41+51 42+52 43+53
+      "uaddlp    v16.4h, v16.8b                          \n"
+      "uaddlp    v17.4h, v17.8b                          \n"
+      "uaddlp    v18.4h, v18.8b                          \n"
+      "uaddlp    v19.4h, v19.8b                          \n"
 
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
+      // 60+70 61+71 62+72 63+73
+      "uaddlp    v1.4h, v1.8b                            \n"
+      "uaddlp    v5.4h, v5.8b                            \n"
 
-    // combine source lines
-    "add       v16.4h, v16.4h, v18.4h                  \n"
-    "add       v17.4h, v17.4h, v19.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
+      // combine source lines
+      "add       v16.4h, v16.4h, v18.4h                  \n"
+      "add       v17.4h, v17.4h, v19.4h                  \n"
+      "add       v2.4h, v1.4h, v5.4h                     \n"
 
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "uqrshrn   v2.8b, v2.8h, #2                        \n"
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "uqrshrn   v2.8b, v2.8h, #2                        \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
 
-    // combine source lines
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+      // combine source lines
+      "uaddl     v0.8h, v0.8b, v4.8b                     \n"
 
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
+      // xx 20 xx 21 xx 22 xx 23
+      // xx 30 xx 31 xx 32 xx 33
+      "trn1      v1.8h, v0.8h, v0.8h                     \n"
+      "trn2      v4.8h, v0.8h, v0.8h                     \n"
+      "xtn       v0.4h, v1.4s                            \n"
+      "xtn       v4.4h, v4.4s                            \n"
 
-    // 0+1+2, 3+4+5
-    "add       v16.8h, v16.8h, v0.8h                   \n"
-    "add       v17.8h, v17.8h, v4.8h                   \n"
+      // 0+1+2, 3+4+5
+      "add       v16.8h, v16.8h, v0.8h                   \n"
+      "add       v17.8h, v17.8h, v4.8h                   \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
-    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
+      // Need to divide, but can't downshift as the the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
+      "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
 
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+      "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
 
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),         // %0
-    "+r"(dst_ptr),         // %1
-    "+r"(tmp_src_stride),  // %2
-    "+r"(dst_width)        // %3
-  : "r"(&kMult38_Div6),    // %4
-    "r"(&kShuf38_2)        // %5
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v30", "v31", "memory", "cc"
-  );
+      "st1       {v3.8b}, [%1], #8                       \n"
+      "st1       {v3.s}[2], [%1], #4                     \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),         // %0
+        "+r"(dst_ptr),         // %1
+        "+r"(tmp_src_stride),  // %2
+        "+r"(dst_width)        // %3
+      : "r"(&kMult38_Div6),    // %4
+        "r"(&kShuf38_2)        // %5
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v30", "v31", "memory", "cc");
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       w12, %w5                        \n"
-    "eor       v2.16b, v2.16b, v2.16b          \n"
-    "eor       v3.16b, v3.16b, v3.16b          \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %3              \n"
-    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
-    "uaddw     v2.8h, v2.8h, v0.8b             \n"
-    "subs      w12, w12, #1                    \n"
-    "b.gt      2b                              \n"
-    MEMACCESS(2)
-    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
-    "add      %1, %1, #16                      \n"
-    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
-    "b.gt     1b                               \n"
-  : "=&r"(src_tmp),    // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_ptr),     // %2
-    "+r"(src_stride),  // %3
-    "+r"(src_width),   // %4
-    "+r"(src_height)   // %5
-  :
-  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void ScaleAddRows_NEON(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint16_t* dst_ptr,
+                       int src_width,
+                       int src_height) {
+  const uint8_t* src_tmp;
+  asm volatile(
+      "1:                                        \n"
+      "mov       %0, %1                          \n"
+      "mov       w12, %w5                        \n"
+      "eor       v2.16b, v2.16b, v2.16b          \n"
+      "eor       v3.16b, v3.16b, v3.16b          \n"
+      "2:                                        \n"
+      // load 16 pixels into q0
+      "ld1       {v0.16b}, [%0], %3              \n"
+      "uaddw2    v3.8h, v3.8h, v0.16b            \n"
+      "uaddw     v2.8h, v2.8h, v0.8b             \n"
+      "subs      w12, w12, #1                    \n"
+      "b.gt      2b                              \n"
+      "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
+      "add      %1, %1, #16                      \n"
+      "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
+      "b.gt     1b                               \n"
+      : "=&r"(src_tmp),    // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_ptr),     // %2
+        "+r"(src_stride),  // %3
+        "+r"(src_width),   // %4
+        "+r"(src_height)   // %5
+      :
+      : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                    \n"              \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+#define LOAD2_DATA8_LANE(n)                      \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5                     \n" \
+  "add        %3, %3, %4                     \n" \
+  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  const uint8_t* src_tmp = src_ptr;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // x
     "dup        v1.4s, %w4                     \n"  // dx
@@ -626,12 +602,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     "ushll2    v6.4s, v6.8h, #0                \n"
     "mul       v16.4s, v16.4s, v7.4s           \n"
     "mul       v17.4s, v17.4s, v6.4s           \n"
-    "rshrn      v6.4h, v16.4s, #16             \n"
-    "rshrn2     v6.8h, v17.4s, #16             \n"
+    "rshrn     v6.4h, v16.4s, #16              \n"
+    "rshrn2    v6.8h, v17.4s, #16              \n"
     "add       v4.8h, v4.8h, v6.8h             \n"
     "xtn       v4.8b, v4.8h                    \n"
 
-    MEMACCESS(0)
     "st1       {v4.8b}, [%0], #8               \n"  // store pixels
     "add       v1.4s, v1.4s, v0.4s             \n"
     "add       v2.4s, v2.4s, v0.4s             \n"
@@ -639,7 +614,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     "b.gt      1b                              \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -653,331 +628,300 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
 #undef LOAD2_DATA8_LANE
 
 // 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-    int y_fraction = 256 - source_y_fraction;
-  asm volatile (
-    "cmp          %w4, #0                      \n"
-    "b.eq         100f                         \n"
-    "add          %2, %2, %1                   \n"
-    "cmp          %w4, #64                     \n"
-    "b.eq         75f                          \n"
-    "cmp          %w4, #128                    \n"
-    "b.eq         50f                          \n"
-    "cmp          %w4, #192                    \n"
-    "b.eq         25f                          \n"
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;
+  asm volatile(
+      "cmp          %w4, #0                      \n"
+      "b.eq         100f                         \n"
+      "add          %2, %2, %1                   \n"
+      "cmp          %w4, #64                     \n"
+      "b.eq         75f                          \n"
+      "cmp          %w4, #128                    \n"
+      "b.eq         50f                          \n"
+      "cmp          %w4, #192                    \n"
+      "b.eq         25f                          \n"
 
-    "dup          v5.8b, %w4                   \n"
-    "dup          v4.8b, %w5                   \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "umull        v6.8h, v0.8b, v4.8b          \n"
-    "umull2       v7.8h, v0.16b, v4.16b        \n"
-    "umlal        v6.8h, v1.8b, v5.8b          \n"
-    "umlal2       v7.8h, v1.16b, v5.16b        \n"
-    "rshrn        v0.8b, v6.8h, #8             \n"
-    "rshrn2       v0.16b, v7.8h, #8            \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         1b                           \n"
-    "b            99f                          \n"
+      "dup          v5.8b, %w4                   \n"
+      "dup          v4.8b, %w5                   \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "ld1          {v1.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "umull        v6.8h, v0.8b, v4.8b          \n"
+      "umull2       v7.8h, v0.16b, v4.16b        \n"
+      "umlal        v6.8h, v1.8b, v5.8b          \n"
+      "umlal2       v7.8h, v1.16b, v5.16b        \n"
+      "rshrn        v0.8b, v6.8h, #8             \n"
+      "rshrn2       v0.16b, v7.8h, #8            \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         1b                           \n"
+      "b            99f                          \n"
 
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         25b                          \n"
-    "b            99f                          \n"
+      // Blend 25 / 75.
+      "25:                                       \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "ld1          {v1.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         25b                          \n"
+      "b            99f                          \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         50b                          \n"
-    "b            99f                          \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "ld1          {v1.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         50b                          \n"
+      "b            99f                          \n"
 
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v1.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v0.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         75b                          \n"
-    "b            99f                          \n"
+      // Blend 75 / 25.
+      "75:                                       \n"
+      "ld1          {v1.16b}, [%1], #16          \n"
+      "ld1          {v0.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         75b                          \n"
+      "b            99f                          \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         100b                         \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         100b                         \n"
 
-  "99:                                         \n"
-    MEMACCESS(0)
-    "st1          {v0.b}[15], [%0]             \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction),// %4
-    "+r"(y_fraction)        // %5
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
-  );
+      "99:                                       \n"
+      "st1          {v0.b}[15], [%0]             \n"
+      : "+r"(dst_ptr),            // %0
+        "+r"(src_ptr),            // %1
+        "+r"(src_stride),         // %2
+        "+r"(dst_width),          // %3
+        "+r"(source_y_fraction),  // %4
+        "+r"(y_fraction)          // %5
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
 }
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS (0)
-    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
-    MEMACCESS (0)
-    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS (1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    MEMACCESS (1)
-    "st1        {v3.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (dst),              // %1
-    "+r" (dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
+      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "mov        v2.16b, v3.16b                 \n"
+      "st2        {v1.4s,v2.4s}, [%1], #32       \n"  // store 8 odd pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS (0)
-    // load 8 ARGB pixels.
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #1               \n"
-    "rshrn      v2.8b, v2.8h, #1               \n"
-    "rshrn      v3.8b, v3.8h, #1               \n"
-    MEMACCESS (1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
-  );
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
+      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
+      "urhadd     v1.16b, v2.16b, v3.16b         \n"
+      "st2        {v0.4s,v1.4s}, [%1], #32       \n"  // store 8 pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS (0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS (1)
-    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
-    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
-    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #2               \n"
-    "rshrn      v2.8b, v2.8h, #2               \n"
-    "rshrn      v3.8b, v3.8h, #2               \n"
-    MEMACCESS (2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (src_stride),       // %1
-    "+r" (dst),              // %2
-    "+r" (dst_width)         // %3
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
-  );
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 8 ARGB
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+      "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+      "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
+      "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
+      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
+      "rshrn      v1.8b, v1.8h, #2               \n"
+      "rshrn      v2.8b, v2.8h, #2               \n"
+      "rshrn      v3.8b, v3.8h, #2               \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[0], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[1], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[2], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[3], [%0], %3            \n"
-    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"((int64)(src_stepx * 4)) // %3
-  : "memory", "cc", "v0"
-  );
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.s}[0], [%0], %3            \n"
+      "ld1        {v0.s}[1], [%0], %3            \n"
+      "ld1        {v0.s}[2], [%0], %3            \n"
+      "ld1        {v0.s}[3], [%0], %3            \n"
+      "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+      "st1        {v0.16b}, [%1], #16            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),                // %0
+        "+r"(dst_argb),                // %1
+        "+r"(dst_width)                // %2
+      : "r"((int64_t)(src_stepx * 4))  // %3
+      : "memory", "cc", "v0");
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
 // TODO(Yang Zhang): Might be worth another optimization pass in future.
 // It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v4.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v5.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v6.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v7.8b}, [%1], %4              \n"
-    "uaddl      v0.8h, v0.8b, v1.8b            \n"
-    "uaddl      v2.8h, v2.8b, v3.8b            \n"
-    "uaddl      v4.8h, v4.8b, v5.8b            \n"
-    "uaddl      v6.8h, v6.8b, v7.8b            \n"
-    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
-    "mov        v0.d[1], v2.d[0]               \n"
-    "mov        v2.d[0], v16.d[1]              \n"
-    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
-    "mov        v4.d[1], v6.d[0]               \n"
-    "mov        v6.d[0], v16.d[1]              \n"
-    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
-    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
-    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
-    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
-    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "st1     {v0.16b}, [%2], #16               \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"((int64)(src_stepx * 4)) // %4
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  asm volatile(
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 -> 2x1
+      "ld1        {v1.8b}, [%1], %4              \n"
+      "ld1        {v2.8b}, [%0], %4              \n"
+      "ld1        {v3.8b}, [%1], %4              \n"
+      "ld1        {v4.8b}, [%0], %4              \n"
+      "ld1        {v5.8b}, [%1], %4              \n"
+      "ld1        {v6.8b}, [%0], %4              \n"
+      "ld1        {v7.8b}, [%1], %4              \n"
+      "uaddl      v0.8h, v0.8b, v1.8b            \n"
+      "uaddl      v2.8h, v2.8b, v3.8b            \n"
+      "uaddl      v4.8h, v4.8b, v5.8b            \n"
+      "uaddl      v6.8h, v6.8b, v7.8b            \n"
+      "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
+      "mov        v0.d[1], v2.d[0]               \n"
+      "mov        v2.d[0], v16.d[1]              \n"
+      "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
+      "mov        v4.d[1], v6.d[0]               \n"
+      "mov        v6.d[0], v16.d[1]              \n"
+      "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
+      "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
+      "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
+      "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
+      "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
+      "st1     {v0.16b}, [%2], #16               \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),                // %0
+        "+r"(src_stride),              // %1
+        "+r"(dst_argb),                // %2
+        "+r"(dst_width)                // %3
+      : "r"((int64_t)(src_stepx * 4))  // %4
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld1        {"#vn".s}["#n"], [%6]          \n"
+#define LOAD1_DATA32_LANE(vn, n)                 \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5, lsl #2             \n" \
+  "add        %3, %3, %4                     \n" \
+  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
 
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
-  int64 tmp64;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(v0, 0)
-    LOAD1_DATA32_LANE(v0, 1)
-    LOAD1_DATA32_LANE(v0, 2)
-    LOAD1_DATA32_LANE(v0, 3)
-    LOAD1_DATA32_LANE(v1, 0)
-    LOAD1_DATA32_LANE(v1, 1)
-    LOAD1_DATA32_LANE(v1, 2)
-    LOAD1_DATA32_LANE(v1, 3)
-
-    MEMACCESS(0)
-    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    "b.gt        1b                            \n"
-  : "+r"(dst_argb),     // %0
-    "+r"(src_argb),     // %1
-    "+r"(dst_width64),  // %2
-    "+r"(x64),          // %3
-    "+r"(dx64),         // %4
-    "=&r"(tmp64),       // %5
-    "+r"(src_tmp)       // %6
-  :
-  : "memory", "cc", "v0", "v1"
-  );
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  const uint8_t* src_tmp = src_argb;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
+  int64_t tmp64;
+  asm volatile(
+      "1:                                        \n"
+      // clang-format off
+      LOAD1_DATA32_LANE(v0, 0)
+      LOAD1_DATA32_LANE(v0, 1)
+      LOAD1_DATA32_LANE(v0, 2)
+      LOAD1_DATA32_LANE(v0, 3)
+      LOAD1_DATA32_LANE(v1, 0)
+      LOAD1_DATA32_LANE(v1, 1)
+      LOAD1_DATA32_LANE(v1, 2)
+      LOAD1_DATA32_LANE(v1, 3)
+      // clang-format on
+      "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "b.gt       1b                             \n"
+      : "+r"(dst_argb),   // %0
+        "+r"(src_argb),   // %1
+        "+r"(dst_width),  // %2
+        "+r"(x64),        // %3
+        "+r"(dx64),       // %4
+        "=&r"(tmp64),     // %5
+        "+r"(src_tmp)     // %6
+      :
+      : "memory", "cc", "v0", "v1");
 }
 
 #undef LOAD1_DATA32_LANE
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
+  "lsr        %5, %3, #16                           \n" \
+  "add        %6, %1, %5, lsl #2                    \n" \
+  "add        %3, %3, %4                            \n" \
+  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
 
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  const uint8_t* src_tmp = src_argb;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // x
     "dup        v1.4s, %w4                     \n"  // dx
@@ -1014,14 +958,13 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
     "shrn       v0.8b, v16.8h, #7              \n"
     "shrn2      v0.16b, v17.8h, #7             \n"
 
-    MEMACCESS(0)
     "st1     {v0.4s}, [%0], #16                \n"  // store pixels
     "add     v5.4s, v5.4s, v6.4s               \n"
     "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
     "b.gt    1b                                \n"
   : "+r"(dst_argb),         // %0
     "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -1034,6 +977,85 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
 
 #undef LOAD2_DATA32_LANE
 
+// Read 16x2 average down and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %0, %1, lsl #1             \n"  // ptr + stide * 2
+      "1:                                        \n"
+      "ld1        {v0.8h, v1.8h}, [%0], #32      \n"  // load row 1 and post inc
+      "ld1        {v2.8h, v3.8h}, [%1], #32      \n"  // load row 2 and post inc
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+      "uaddlp     v0.4s, v0.8h                   \n"  // row 1 add adjacent
+      "uaddlp     v1.4s, v1.8h                   \n"
+      "uadalp     v0.4s, v2.8h                   \n"  // +row 2 add adjacent
+      "uadalp     v1.4s, v3.8h                   \n"
+      "rshrn      v0.4h, v0.4s, #2               \n"  // round and pack
+      "rshrn2     v0.8h, v1.4s, #2               \n"
+      "st1        {v0.8h}, [%2], #16             \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
+      );
+}
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
+  asm volatile(
+      "add        %1, %0, %1, lsl #1             \n"  // ptr + stide * 2
+      "movi       v0.8h, #9                      \n"  // constants
+      "movi       v1.4s, #3                      \n"
+
+      "1:                                        \n"
+      "ld1        {v3.8h}, [%0], %4              \n"  // TL read first 8
+      "ld1        {v4.8h}, [%0], %5              \n"  // TR read 8 offset by 1
+      "ld1        {v5.8h}, [%1], %4              \n"  // BL read 8 from next row
+      "ld1        {v6.8h}, [%1], %5              \n"  // BR offset by 1
+      "subs       %w3, %w3, #16                  \n"  // 16 dst pixels per loop
+      "umull      v16.4s, v3.4h, v0.4h           \n"
+      "umull2     v7.4s, v3.8h, v0.8h            \n"
+      "umull      v18.4s, v4.4h, v0.4h           \n"
+      "umull2     v17.4s, v4.8h, v0.8h           \n"
+      "uaddw      v16.4s, v16.4s, v6.4h          \n"
+      "uaddl2     v19.4s, v6.8h, v3.8h           \n"
+      "uaddl      v3.4s, v6.4h, v3.4h            \n"
+      "uaddw2     v6.4s, v7.4s, v6.8h            \n"
+      "uaddl2     v7.4s, v5.8h, v4.8h            \n"
+      "uaddl      v4.4s, v5.4h, v4.4h            \n"
+      "uaddw      v18.4s, v18.4s, v5.4h          \n"
+      "mla        v16.4s, v4.4s, v1.4s           \n"
+      "mla        v18.4s, v3.4s, v1.4s           \n"
+      "mla        v6.4s, v7.4s, v1.4s            \n"
+      "uaddw2     v4.4s, v17.4s, v5.8h           \n"
+      "uqrshrn    v16.4h,  v16.4s, #4            \n"
+      "mla        v4.4s, v19.4s, v1.4s           \n"
+      "uqrshrn2   v16.8h, v6.4s, #4              \n"
+      "uqrshrn    v17.4h, v18.4s, #4             \n"
+      "uqrshrn2   v17.8h, v4.4s, #4              \n"
+      "st2        {v16.8h-v17.8h}, [%2], #32     \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      : "r"(2LL),          // %4
+        "r"(14LL)          // %5
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19"  // Clobber List
+      );
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc
index f17097365c..c5fc86f3e9 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc
@@ -17,97 +17,93 @@ extern "C" {
 #endif
 
 // This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8_t* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm0, 8          // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -120,27 +116,28 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 32x1 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t* dst_ptr,
+                                                 int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm0, xmm5       // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -153,20 +150,21 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 32x2 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
@@ -174,15 +172,15 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add
+    paddw      xmm0, xmm2  // vertical add
     paddw      xmm1, xmm3
     psrlw      xmm0, 1
     psrlw      xmm1, 1
-    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm0, xmm5  // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -197,23 +195,24 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
 // Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked)
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
     vpsrlw      ymm1, ymm1, 8
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -225,30 +224,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 64x1 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                                                ptrdiff_t src_stride,
+                                                uint8_t* dst_ptr,
+                                                int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -262,20 +262,21 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 // For rounding, average = (sum + 2) / 4
 // becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
-    mov         eax, [esp + 4 + 4]    // src_ptr
-    mov         esi, [esp + 4 + 8]    // src_stride
-    mov         edx, [esp + 4 + 12]   // dst_ptr
-    mov         ecx, [esp + 4 + 16]   // dst_width
+    mov         eax, [esp + 4 + 4]  // src_ptr
+    mov         esi, [esp + 4 + 8]  // src_stride
+    mov         edx, [esp + 4 + 12]  // dst_ptr
+    mov         ecx, [esp + 4 + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
@@ -283,18 +284,18 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vmovdqu     ymm2, [eax + esi]
     vmovdqu     ymm3, [eax + esi + 32]
     lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add
+    vpaddw      ymm0, ymm0, ymm2  // vertical add
     vpaddw      ymm1, ymm1, ymm3
-    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
+    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
     vpsrlw      ymm1, ymm1, 1
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -308,15 +309,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
 // Point samples 32 pixels to 8 pixels.
-__declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8_t* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
 
@@ -339,50 +341,51 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 32x4 rectangle to 8x1.
-__declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-    mov        esi, [esp + 8 + 8]    // src_stride
-    mov        edx, [esp + 8 + 12]   // dst_ptr
-    mov        ecx, [esp + 8 + 16]   // dst_width
+    mov        eax, [esp + 8 + 4]  // src_ptr
+    mov        esi, [esp + 8 + 8]  // src_stride
+    mov        edx, [esp + 8 + 12]  // dst_ptr
+    mov        ecx, [esp + 8 + 16]  // dst_width
     lea        edi, [esi + esi * 2]  // src_stride * 3
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     movdqa     xmm5, xmm4
     packuswb   xmm4, xmm4
-    psllw      xmm5, 3               // constant 0x0008
+    psllw      xmm5, 3  // constant 0x0008
 
   wloop:
-    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm0, [eax]  // average rows
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add rows 0, 1
+    paddw      xmm0, xmm2  // vertical add rows 0, 1
     paddw      xmm1, xmm3
     movdqu     xmm2, [eax + esi * 2]
     movdqu     xmm3, [eax + esi * 2 + 16]
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 2
+    paddw      xmm0, xmm2  // add row 2
     paddw      xmm1, xmm3
     movdqu     xmm2, [eax + edi]
     movdqu     xmm3, [eax + edi + 16]
     lea        eax, [eax + 32]
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 3
+    paddw      xmm0, xmm2  // add row 3
     paddw      xmm1, xmm3
     phaddw     xmm0, xmm1
-    paddw      xmm0, xmm5      // + 8 for round
-    psrlw      xmm0, 4         // /16 for average of 4 * 4
+    paddw      xmm0, xmm5  // + 8 for round
+    psrlw      xmm0, 4  // /16 for average of 4 * 4
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
@@ -397,15 +400,16 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
 #ifdef HAS_SCALEROWDOWN4_AVX2
 // Point samples 64 pixels to 16 pixels.
-__declspec(naked)
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride ignored
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
     vpsrld      ymm5, ymm5, 24
     vpslld      ymm5, ymm5, 16
 
@@ -416,10 +420,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpand       ymm0, ymm0, ymm5
     vpand       ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -431,52 +435,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Blends 64x4 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
     push        edi
-    mov         eax, [esp + 8 + 4]    // src_ptr
-    mov         esi, [esp + 8 + 8]    // src_stride
-    mov         edx, [esp + 8 + 12]   // dst_ptr
-    mov         ecx, [esp + 8 + 16]   // dst_width
+    mov         eax, [esp + 8 + 4]  // src_ptr
+    mov         esi, [esp + 8 + 8]  // src_stride
+    mov         edx, [esp + 8 + 12]  // dst_ptr
+    mov         ecx, [esp + 8 + 16]  // dst_width
     lea         edi, [esi + esi * 2]  // src_stride * 3
-    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101
+    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
     vpsrlw      ymm4, ymm4, 15
-    vpsllw      ymm5, ymm4, 3               // constant 0x0008
+    vpsllw      ymm5, ymm4, 3  // constant 0x0008
     vpackuswb   ymm4, ymm4, ymm4
 
   wloop:
-    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm0, [eax]  // average rows
     vmovdqu     ymm1, [eax + 32]
     vmovdqu     ymm2, [eax + esi]
     vmovdqu     ymm3, [eax + esi + 32]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
+    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
     vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + esi * 2]
     vmovdqu     ymm3, [eax + esi * 2 + 32]
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 2
+    vpaddw      ymm0, ymm0, ymm2  // add row 2
     vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + edi]
     vmovdqu     ymm3, [eax + edi + 32]
     lea         eax,  [eax + 64]
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 3
+    vpaddw      ymm0, ymm0, ymm2  // add row 3
     vpaddw      ymm1, ymm1, ymm3
-    vphaddw     ymm0, ymm0, ymm1      // mutates
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
-    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
-    vpsrlw      ymm0, ymm0, 4         // /32 for average of 4 * 4
+    vphaddw     ymm0, ymm0, ymm1  // mutates
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
+    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
+    vpsrlw      ymm0, ymm0, 4  // /32 for average of 4 * 4
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -494,14 +499,15 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.
 
-__declspec(naked)
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
     movdqa     xmm3, xmmword ptr kShuf0
     movdqa     xmm4, xmmword ptr kShuf1
     movdqa     xmm5, xmmword ptr kShuf2
@@ -541,16 +547,16 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShuf01
     movdqa     xmm3, xmmword ptr kShuf11
     movdqa     xmm4, xmmword ptr kShuf21
@@ -559,7 +565,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
@@ -568,7 +574,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm3
@@ -577,7 +583,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm0, xmm1
@@ -598,16 +604,16 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
 }
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShuf01
     movdqa     xmm3, xmmword ptr kShuf11
     movdqa     xmm4, xmmword ptr kShuf21
@@ -616,7 +622,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -626,7 +632,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -636,7 +642,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm1, xmm0
@@ -660,26 +666,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked)
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
     movdqa     xmm4, xmmword ptr kShuf38a
     movdqa     xmm5, xmmword ptr kShuf38b
 
   xloop:
-    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
     lea        eax, [eax + 32]
     pshufb     xmm0, xmm4
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movq       qword ptr [edx], xmm0       // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
@@ -691,23 +698,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShufAc
     movdqa     xmm3, xmmword ptr kShufAc3
     movdqa     xmm4, xmmword ptr kScaleAc33
     pxor       xmm5, xmm5
 
   xloop:
-    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
     movdqu     xmm6, [eax + esi]
     movhlps    xmm1, xmm0
     movhlps    xmm7, xmm6
@@ -725,14 +732,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     paddusw    xmm0, xmm6
     paddusw    xmm1, xmm7
 
-    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     pshufb     xmm6, xmm2
 
-    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
     psrldq     xmm1, 2
     paddusw    xmm7, xmm1
     psrldq     xmm1, 2
@@ -740,10 +747,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     pshufb     xmm7, xmm3
     paddusw    xmm6, xmm7
 
-    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
     packuswb   xmm6, xmm6
 
-    movd       [edx], xmm6           // write 6 pixels
+    movd       [edx], xmm6  // write 6 pixels
     psrlq      xmm6, 16
     movd       [edx + 2], xmm6
     lea        edx, [edx + 6]
@@ -756,28 +763,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 }
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShufAb0
     movdqa     xmm3, xmmword ptr kShufAb1
     movdqa     xmm4, xmmword ptr kShufAb2
     movdqa     xmm5, xmmword ptr kScaleAb2
 
   xloop:
-    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm0, [eax]  // average 2 rows into xmm0
     movdqu     xmm1, [eax + esi]
     lea        eax, [eax + 16]
     pavgb      xmm0, xmm1
 
-    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb     xmm1, xmm2
     movdqa     xmm6, xmm0
     pshufb     xmm6, xmm3
@@ -785,10 +792,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     pshufb     xmm0, xmm4
     paddusw    xmm1, xmm0
 
-    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
     packuswb   xmm1, xmm1
 
-    movd       [edx], xmm1           // write 6 pixels
+    movd       [edx], xmm1  // write 6 pixels
     psrlq      xmm1, 16
     movd       [edx + 2], xmm1
     lea        edx, [edx + 6]
@@ -801,26 +808,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
 }
 
 // Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                                        uint16_t* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov        eax, [esp + 4]   // src_ptr
-    mov        edx, [esp + 8]   // dst_ptr
+    mov        eax, [esp + 4]  // src_ptr
+    mov        edx, [esp + 8]  // dst_ptr
     mov        ecx, [esp + 12]  // src_width
     pxor       xmm5, xmm5
 
-  // sum rows
+        // sum rows
   xloop:
-    movdqu     xmm3, [eax]       // read 16 bytes
+    movdqu     xmm3, [eax]  // read 16 bytes
     lea        eax, [eax + 16]
-    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm0, [edx]  // read 16 words from destination
     movdqu     xmm1, [edx + 16]
     movdqa     xmm2, xmm3
     punpcklbw  xmm2, xmm5
     punpckhbw  xmm3, xmm5
-    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm0, xmm2  // sum 16 words
     paddusw    xmm1, xmm3
-    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx], xmm0  // write 16 words to destination
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 16
@@ -831,24 +839,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                                        uint16_t* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov         eax, [esp + 4]   // src_ptr
-    mov         edx, [esp + 8]   // dst_ptr
+    mov         eax, [esp + 4]  // src_ptr
+    mov         edx, [esp + 8]  // dst_ptr
     mov         ecx, [esp + 12]  // src_width
     vpxor       ymm5, ymm5, ymm5
 
-  // sum rows
+        // sum rows
   xloop:
-    vmovdqu     ymm3, [eax]       // read 32 bytes
+    vmovdqu     ymm3, [eax]  // read 32 bytes
     lea         eax, [eax + 32]
     vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
     vpunpcklbw  ymm2, ymm3, ymm5
     vpunpckhbw  ymm3, ymm3, ymm5
-    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
     vpaddusw    ymm1, ymm3, [edx + 32]
-    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx], ymm0  // write 32 words to destination
     vmovdqu     [edx + 32], ymm1
     lea         edx, [edx + 64]
     sub         ecx, 32
@@ -862,86 +871,87 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
 
 // Constant for making pixels signed to avoid pmaddubsw
 // saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
 
 // Bilinear column filtering. SSSE3 version.
-__declspec(naked)
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                                             const uint8_t* src_ptr,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        edi, [esp + 12 + 4]    // dst_ptr
-    mov        esi, [esp + 12 + 8]    // src_ptr
-    mov        ecx, [esp + 12 + 12]   // dst_width
+    mov        edi, [esp + 12 + 4]  // dst_ptr
+    mov        esi, [esp + 12 + 8]  // src_ptr
+    mov        ecx, [esp + 12 + 12]  // dst_width
     movd       xmm2, [esp + 12 + 16]  // x
     movd       xmm3, [esp + 12 + 20]  // dx
-    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
     movd       xmm5, eax
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pcmpeqb    xmm7, xmm7           // generate 0x0001
+    pcmpeqb    xmm7, xmm7  // generate 0x0001
     psrlw      xmm7, 15
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
     movd       xmm4, ebx
-    pshufb     xmm1, xmm5           // 0011
+    pshufb     xmm1, xmm5  // 0011
     punpcklwd  xmm0, xmm4
     psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1
-    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
     paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.
+    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
     movd       ebx, xmm1
     mov        [edi], bx
     lea        edi, [edi + 2]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
+            // 1 pixel remainder
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm2, 9              // 7 bit fractions.
-    pshufb     xmm2, xmm5           // 0011
+    psrlw      xmm2, 9  // 7 bit fractions.
+    pshufb     xmm2, xmm5  // 0011
     psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1
-    pmaddubsw  xmm2, xmm0           // 16 bit
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm2, xmm0  // 16 bit
     paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm2, xmm2           // 8 bits
+    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm2, xmm2  // 8 bits
     movd       ebx, xmm2
     mov        [edi], bl
 
@@ -955,13 +965,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 }
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked)
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                                         const uint8_t* src_ptr,
+                                         int dst_width,
+                                         int x,
+                                         int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_ptr
-    mov        eax, [esp + 8]    // src_ptr
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_ptr
+    mov        eax, [esp + 8]  // src_ptr
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -980,15 +992,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked)
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_argb,
+                                              int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1005,23 +1017,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
 }
 
 // Blends 8x1 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    uint8_t* dst_argb,
+                                                    int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1033,16 +1045,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
 }
 
 // Blends 8x2 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_argb
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_argb
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1050,11 +1062,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1067,18 +1079,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 }
 
 // Reads 4 pixels at a time.
-__declspec(naked)
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 int src_stepx,
+                                                 uint8_t* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       ebx
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_argb
-                                     // src_stride ignored
-    mov        ebx, [esp + 8 + 12]   // src_stepx
-    mov        edx, [esp + 8 + 16]   // dst_argb
-    mov        ecx, [esp + 8 + 20]   // dst_width
+    mov        eax, [esp + 8 + 4]   // src_argb
+    // src_stride ignored
+    mov        ebx, [esp + 8 + 12]  // src_stepx
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // dst_width
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1103,21 +1116,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
 }
 
 // Blends four 2x2 to 4x1.
-__declspec(naked)
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    int src_stepx,
+                                                    uint8_t* dst_argb,
+                                                    int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]    // src_argb
-    mov        esi, [esp + 12 + 8]    // src_stride
-    mov        ebx, [esp + 12 + 12]   // src_stepx
-    mov        edx, [esp + 12 + 16]   // dst_argb
-    mov        ecx, [esp + 12 + 20]   // dst_width
-    lea        esi, [eax + esi]       // row1 pointer
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        esi, [esp + 12 + 8]  // src_stride
+    mov        ebx, [esp + 12 + 12]  // src_stepx
+    mov        edx, [esp + 12 + 16]  // dst_argb
+    mov        ecx, [esp + 12 + 20]  // dst_width
+    lea        esi, [eax + esi]  // row1 pointer
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1132,11 +1145,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     movq       xmm3, qword ptr [esi + ebx * 2]
     movhps     xmm3, qword ptr [esi + edi]
     lea        esi,  [esi + ebx * 4]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1151,64 +1164,66 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
 }
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked)
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                                          const uint8_t* src_argb,
+                                          int dst_width,
+                                          int x,
+                                          int dx) {
   __asm {
     push       edi
     push       esi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
 
-    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
-    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
     paddd      xmm2, xmm0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
-    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
-    paddd      xmm2, xmm0            // x3 x2 x1 x0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
-    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0  // x3 x2 x1 x0
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
 
-    pextrw     eax, xmm2, 1          // get x0 integer.
-    pextrw     edx, xmm2, 3          // get x1 integer.
+    pextrw     eax, xmm2, 1  // get x0 integer.
+    pextrw     edx, xmm2, 3  // get x1 integer.
 
     cmp        ecx, 0
     jle        xloop99
     sub        ecx, 4
     jl         xloop49
 
-    // 4 Pixel loop.
+        // 4 Pixel loop.
  xloop4:
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    pextrw     edx, xmm2, 7           // get x3 integer.
-    paddd      xmm2, xmm3             // x += dx
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    pextrw     edx, xmm2, 7  // get x3 integer.
+    paddd      xmm2, xmm3  // x += dx
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
     movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
-    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
-    punpckldq  xmm1, xmm4             // x2 x3
-    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4  // x2 x3
+    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
     movdqu     [edi], xmm0
     lea        edi, [edi + 16]
-    sub        ecx, 4                 // 4 pixels
+    sub        ecx, 4  // 4 pixels
     jge        xloop4
 
  xloop49:
     test       ecx, 2
     je         xloop29
 
-    // 2 Pixels.
+        // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
@@ -1217,7 +1232,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     test       ecx, 1
     je         xloop99
 
-    // 1 Pixels.
+        // 1 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
  xloop99:
@@ -1232,60 +1247,62 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
 // TODO(fbarchard): Port to Neon
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
 
-__declspec(naked)
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                                                 const uint8_t* src_argb,
+                                                 int dst_width,
+                                                 int x,
+                                                 int dx) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
     movdqa     xmm4, xmmword ptr kShuffleColARGB
     movdqa     xmm5, xmmword ptr kShuffleFractions
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
-    pshufb     xmm1, xmm5           // 0000000011111111
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    pshufb     xmm1, xmm5  // 0000000011111111
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
@@ -1293,15 +1310,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
-    psrlw      xmm2, 9              // 7 bit fractions.
+            // 1 pixel remainder
+    psrlw      xmm2, 9  // 7 bit fractions.
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    pshufb     xmm2, xmm5           // 00000000
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    pshufb     xmm2, xmm5  // 00000000
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
 
  xloop99:
@@ -1313,13 +1330,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked)
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                                             const uint8_t* src_argb,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_argb
-    mov        eax, [esp + 8]    // src_argb
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_argb
+    mov        eax, [esp + 8]  // src_argb
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1338,12 +1357,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv_X86(int num, int div) {
+__declspec(naked) int FixedDiv_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
@@ -1351,13 +1369,12 @@ int FixedDiv_X86(int num, int div) {
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv1_X86(int num, int div) {
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    mov        ecx, [esp + 8]    // denom
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    mov        ecx, [esp + 8]  // denom
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     sub        eax, 0x00010001
     sbb        edx, 0
diff --git a/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc
index 00fb71e18b..92384c050c 100644
--- a/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc
+++ b/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "libyuv/video_common.h"
 
 #ifdef __cplusplus
@@ -16,40 +15,39 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
 struct FourCCAliasEntry {
-  uint32 alias;
-  uint32 canonical;
+  uint32_t alias;
+  uint32_t canonical;
 };
 
-static const struct FourCCAliasEntry kFourCCAliases[] = {
-  {FOURCC_IYUV, FOURCC_I420},
-  {FOURCC_YU12, FOURCC_I420},
-  {FOURCC_YU16, FOURCC_I422},
-  {FOURCC_YU24, FOURCC_I444},
-  {FOURCC_YUYV, FOURCC_YUY2},
-  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
-  {FOURCC_HDYC, FOURCC_UYVY},
-  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
-  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
-  {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
-  {FOURCC_RGB3, FOURCC_RAW },
-  {FOURCC_BGR3, FOURCC_24BG},
-  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
-  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
-  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
-  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
-  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+#define NUM_ALIASES 18
+static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
+    {FOURCC_IYUV, FOURCC_I420},
+    {FOURCC_YU12, FOURCC_I420},
+    {FOURCC_YU16, FOURCC_I422},
+    {FOURCC_YU24, FOURCC_I444},
+    {FOURCC_YUYV, FOURCC_YUY2},
+    {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+    {FOURCC_HDYC, FOURCC_UYVY},
+    {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+    {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+    {FOURCC_DMB1, FOURCC_MJPG},
+    {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+    {FOURCC_RGB3, FOURCC_RAW},
+    {FOURCC_BGR3, FOURCC_24BG},
+    {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+    {FOURCC_CM24, FOURCC_RAW},   // kCMPixelFormat_24RGB
+    {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+    {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+    {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
 };
 // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
 //  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
 
 LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
+uint32_t CanonicalFourCC(uint32_t fourcc) {
   int i;
-  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+  for (i = 0; i < NUM_ALIASES; ++i) {
     if (kFourCCAliases[i].alias == fourcc) {
       return kFourCCAliases[i].canonical;
     }
@@ -62,4 +60,3 @@ uint32 CanonicalFourCC(uint32 fourcc) {
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
diff --git a/media/libvpx/libvpx/third_party/nalloc/LICENSE b/media/libvpx/libvpx/third_party/nalloc/LICENSE
new file mode 100644
index 0000000000..14c2b9e737
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/nalloc/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Catena cyber
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/media/libvpx/libvpx/third_party/nalloc/README.libvpx b/media/libvpx/libvpx/third_party/nalloc/README.libvpx
new file mode 100644
index 0000000000..0acafdf2f7
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/nalloc/README.libvpx
@@ -0,0 +1,11 @@
+Name: nalloc
+URL: https://github.com/catenacyber/nallocfuzz
+Version: dc351a94bbded5ede5b7550d6d08e78e0cc6dcef
+License: MIT
+License File: LICENSE
+
+Description:
+Nalloc is a tool to inject allocation failures while fuzzing.
+
+Local Modifications:
+None
diff --git a/media/libvpx/libvpx/third_party/nalloc/nalloc.h b/media/libvpx/libvpx/third_party/nalloc/nalloc.h
new file mode 100644
index 0000000000..f910926b18
--- /dev/null
+++ b/media/libvpx/libvpx/third_party/nalloc/nalloc.h
@@ -0,0 +1,342 @@
+/*
+ MIT License
+
+ Copyright (c) 2025 Catena cyber
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef NALLOC_H_
+#define NALLOC_H_
+
+#if defined(__clang__) && defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define NALLOC_ASAN 1
+#endif
+#if __has_feature(memory_sanitizer)
+#define FUZZER_DISABLE_NALLOC 1
+#endif
+#endif
+
+#if defined(FUZZER_DISABLE_NALLOC)
+#define nalloc_init(x)
+#define nalloc_restrict_file_prefix(x)
+#define nalloc_start(x, y)
+#define nalloc_end()
+#else
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static const uint32_t nalloc_crc32_table[] = {
+  0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
+  0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61,
+  0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7,
+  0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
+  0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3,
+  0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
+  0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
+  0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d,
+  0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb,
+  0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
+  0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0,
+  0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
+  0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
+  0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde,
+  0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08,
+  0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
+  0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc,
+  0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6,
+  0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050,
+  0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2,
+  0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34,
+  0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
+  0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1,
+  0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53,
+  0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5,
+  0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
+  0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9,
+  0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
+  0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd,
+  0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7,
+  0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
+  0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3,
+  0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2,
+  0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
+  0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e,
+  0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
+  0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a,
+  0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0,
+  0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676,
+  0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
+  0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662,
+  0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668,
+  0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
+};
+
+// Nallocfuzz data to take a decision
+uint32_t nalloc_random_state = 0;
+__thread unsigned int nalloc_running = 0;
+bool nalloc_initialized = false;
+uint32_t nalloc_runs = 0;
+
+// Nalloc fuzz parameters
+uint32_t nalloc_bitmask = 0xFF;
+bool nalloc_random_bitmask = true;
+uint32_t nalloc_magic = 0x294cee63;
+bool nalloc_verbose = false;
+
+#ifdef NALLOC_ASAN
+extern void __sanitizer_print_stack_trace(void);
+#endif
+
+// Generic init, using env variables to get parameters
+void nalloc_init(const char *prog) {
+  if (nalloc_initialized) {
+    return;
+  }
+  nalloc_initialized = true;
+  char *bitmask = getenv("NALLOC_FREQ");
+  if (bitmask) {
+    int shift = atoi(bitmask);
+    if (shift > 0 && shift < 31) {
+      nalloc_bitmask = 1 << shift;
+      nalloc_random_bitmask = false;
+    } else if (shift == 0) {
+      nalloc_random_bitmask = false;
+      nalloc_bitmask = 0;
+    }
+  } else if (prog == NULL || strstr(prog, "nalloc") == NULL) {
+    nalloc_random_bitmask = false;
+    nalloc_bitmask = 0;
+    return;
+  }
+
+  char *magic = getenv("NALLOC_MAGIC");
+  if (magic) {
+    nalloc_magic = (uint32_t)strtol(magic, NULL, 0);
+  }
+
+  char *verbose = getenv("NALLOC_VERBOSE");
+  if (verbose) {
+    nalloc_verbose = true;
+  }
+}
+
+// add one byte to the CRC
+static inline void nalloc_random_update(uint8_t b) {
+  nalloc_random_state =
+      ((uint32_t)((uint32_t)nalloc_random_state << 8)) ^
+      nalloc_crc32_table[((nalloc_random_state >> 24) ^ b) & 0xFF];
+}
+
+// Start the failure injections, using a buffer as seed
+static int nalloc_start(const uint8_t *data, size_t size) {
+  if (nalloc_random_bitmask) {
+    if (nalloc_random_state & 0x10) {
+      nalloc_bitmask = 0xFFFFFFFF;
+    } else {
+      nalloc_bitmask = 1 << (5 + (nalloc_random_state & 0xF));
+    }
+  } else if (nalloc_bitmask == 0) {
+    // nalloc disabled
+    return 0;
+  }
+  nalloc_random_state = 0;
+  for (size_t i = 0; i < size; i++) {
+    nalloc_random_update(data[i]);
+  }
+  if (__sync_fetch_and_add(&nalloc_running, 1)) {
+    __sync_fetch_and_sub(&nalloc_running, 1);
+    return 0;
+  }
+  nalloc_runs++;
+  return 1;
+}
+
+// Stop the failure injections
+static void nalloc_end() { __sync_fetch_and_sub(&nalloc_running, 1); }
+
+static bool nalloc_backtrace_exclude(size_t size, const char *op) {
+  if (nalloc_verbose) {
+    fprintf(stderr, "failed %s(%zu) \n", op, size);
+#ifdef NALLOC_ASAN
+    __sanitizer_print_stack_trace();
+#endif
+  }
+
+  return false;
+}
+
+//
+static bool nalloc_fail(size_t size, const char *op) {
+  // do not fail before thread init
+  if (nalloc_runs == 0) {
+    return false;
+  }
+  if (__sync_fetch_and_add(&nalloc_running, 1) != 1) {
+    // do not fail allocations outside of fuzzer input
+    // and do not fail inside of this function
+    __sync_fetch_and_sub(&nalloc_running, 1);
+    return false;
+  }
+  nalloc_random_update((uint8_t)size);
+  if (size >= 0x100) {
+    nalloc_random_update((uint8_t)(size >> 8));
+    if (size >= 0x10000) {
+      nalloc_random_update((uint8_t)(size >> 16));
+      // bigger may already fail or oom
+    }
+  }
+  if (((nalloc_random_state ^ nalloc_magic) & nalloc_bitmask) == 0) {
+    if (nalloc_backtrace_exclude(size, op)) {
+      __sync_fetch_and_sub(&nalloc_running, 1);
+      return false;
+    }
+    __sync_fetch_and_sub(&nalloc_running, 1);
+    return true;
+  }
+  __sync_fetch_and_sub(&nalloc_running, 1);
+  return false;
+}
+
+// ASAN interceptor for libc routines
+#ifdef NALLOC_ASAN
+extern void *__interceptor_malloc(size_t);
+extern void *__interceptor_calloc(size_t, size_t);
+extern void *__interceptor_realloc(void *, size_t);
+extern void *__interceptor_reallocarray(void *, size_t, size_t);
+
+extern ssize_t __interceptor_read(int, void *, size_t);
+extern ssize_t __interceptor_write(int, const void *, size_t);
+extern ssize_t __interceptor_recv(int, void *, size_t, int);
+extern ssize_t __interceptor_send(int, const void *, size_t, int);
+
+#define nalloc_malloc(s) __interceptor_malloc(s)
+#define nalloc_calloc(s, n) __interceptor_calloc(s, n)
+#define nalloc_realloc(p, s) __interceptor_realloc(p, s)
+#define nalloc_reallocarray(p, s, n) __interceptor_reallocarray(p, s, n)
+
+#define nalloc_read(f, b, s) __interceptor_read(f, b, s)
+#define nalloc_write(f, b, s) __interceptor_write(f, b, s)
+#define nalloc_recv(f, b, s, x) __interceptor_recv(f, b, s, x)
+#define nalloc_send(f, b, s, x) __interceptor_send(f, b, s, x)
+
+#else
+extern void *__libc_malloc(size_t);
+extern void *__libc_calloc(size_t, size_t);
+extern void *__libc_realloc(void *, size_t);
+extern void *__libc_reallocarray(void *, size_t, size_t);
+
+extern ssize_t __read(int, void *, size_t);
+extern ssize_t __write(int, const void *, size_t);
+extern ssize_t __recv(int, void *, size_t, int);
+extern ssize_t __send(int, const void *, size_t, int);
+
+#define nalloc_malloc(s) __libc_malloc(s)
+#define nalloc_calloc(s, n) __libc_calloc(s, n)
+#define nalloc_realloc(p, s) __libc_realloc(p, s)
+#define nalloc_reallocarray(p, s, n) __libc_reallocarray(p, s, n)
+
+#define nalloc_read(f, b, s) __read(f, b, s)
+#define nalloc_write(f, b, s) __write(f, b, s)
+#define nalloc_recv(f, b, s, x) __recv(f, b, s, x)
+#define nalloc_send(f, b, s, x) __send(f, b, s, x)
+#endif
+
+// nalloc standard function overwrites with pseudo-random failures
+ssize_t read(int fd, void *buf, size_t count) {
+  if (nalloc_fail(count, "read")) {
+    errno = EIO;
+    return -1;
+  }
+  return nalloc_read(fd, buf, count);
+}
+
+ssize_t write(int fd, const void *buf, size_t count) {
+  if (nalloc_fail(count, "write")) {
+    errno = EIO;
+    return -1;
+  }
+  return nalloc_write(fd, buf, count);
+}
+
+ssize_t recv(int fd, void *buf, size_t count, int flags) {
+  if (nalloc_fail(count, "recv")) {
+    errno = EIO;
+    return -1;
+  }
+  return nalloc_recv(fd, buf, count, flags);
+}
+
+ssize_t send(int fd, const void *buf, size_t count, int flags) {
+  if (nalloc_fail(count, "send")) {
+    errno = EIO;
+    return -1;
+  }
+  return nalloc_send(fd, buf, count, flags);
+}
+
+void *calloc(size_t nmemb, size_t size) {
+  if (nalloc_fail(size, "calloc")) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  return nalloc_calloc(nmemb, size);
+}
+
+void *malloc(size_t size) {
+  if (nalloc_fail(size, "malloc")) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  return nalloc_malloc(size);
+}
+
+void *realloc(void *ptr, size_t size) {
+  if (nalloc_fail(size, "realloc")) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  return nalloc_realloc(ptr, size);
+}
+
+void *reallocarray(void *ptr, size_t nmemb, size_t size) {
+  if (nalloc_fail(size, "reallocarray")) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  return nalloc_reallocarray(ptr, nmemb, size);
+}
+
+#ifdef __cplusplus
+}  // extern "C" {
+#endif
+
+#endif // FUZZER_DISABLE_NALLOC
+
+#endif  // NALLOC_H_
diff --git a/media/libvpx/libvpx/third_party/x86inc/README.libvpx b/media/libvpx/libvpx/third_party/x86inc/README.libvpx
index 8d3cd966da..195654f7bb 100644
--- a/media/libvpx/libvpx/third_party/x86inc/README.libvpx
+++ b/media/libvpx/libvpx/third_party/x86inc/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://git.videolan.org/git/x264.git
-Version: d23d18655249944c1ca894b451e2c82c7a584c62
+Version: 3e5aed95cc470f37e2db3e6506a8deb89b527720
 License: ISC
 License File: LICENSE
 
@@ -12,9 +12,8 @@ Get configuration from vpx_config.asm.
 Prefix functions with vpx by default.
 Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
   exist in libvpx.
-Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
-Set 'private_extern' visibility for macho targets.
 Copy PIC 'GLOBAL' macros from x86_abi_support.asm
 Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
-Use .text with no alignment for aout
-Only use 'hidden' visibility with Chromium
+Use .text with no alignment for aout.
+Only use 'hidden' visibility with Chromium.
+Prefix ARCH_* with VPX_.
diff --git a/media/libvpx/libvpx/third_party/x86inc/x86inc.asm b/media/libvpx/libvpx/third_party/x86inc/x86inc.asm
index b647dff2f8..3d55e921c7 100644
--- a/media/libvpx/libvpx/third_party/x86inc/x86inc.asm
+++ b/media/libvpx/libvpx/third_party/x86inc/x86inc.asm
@@ -1,12 +1,12 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2016 x264 project
+;* Copyright (C) 2005-2019 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Henrik Gramner <henrik@gramner.com>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
 ;*          Fiona Glaser <fiona@x264.com>
-;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@@ -45,7 +45,7 @@
 %endif
 
 %ifndef STACK_ALIGNMENT
-    %if ARCH_X86_64
+    %if VPX_ARCH_X86_64
         %define STACK_ALIGNMENT 16
     %else
         %define STACK_ALIGNMENT 4
@@ -54,7 +54,7 @@
 
 %define WIN64  0
 %define UNIX64 0
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
     %ifidn __OUTPUT_FORMAT__,win32
         %define WIN64  1
     %elifidn __OUTPUT_FORMAT__,win64
@@ -67,19 +67,19 @@
 %endif
 
 %define FORMAT_ELF 0
+%define FORMAT_MACHO 0
 %ifidn __OUTPUT_FORMAT__,elf
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf32
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf64
     %define FORMAT_ELF 1
-%endif
-
-%define FORMAT_MACHO 0
-%ifidn __OUTPUT_FORMAT__,macho32
-     %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+    %define FORMAT_MACHO 1
 %elifidn __OUTPUT_FORMAT__,macho64
-     %define FORMAT_MACHO 1
+    %define FORMAT_MACHO 1
 %endif
 
 ; Set PREFIX for libvpx builds.
@@ -103,7 +103,11 @@
 ; works around the issue. It appears to be specific to the way libvpx
 ; handles the tables.
 %macro SECTION_RODATA 0-1 16
-    %ifidn __OUTPUT_FORMAT__,macho32
+    %ifidn __OUTPUT_FORMAT__,win32
+        SECTION .rdata align=%1
+    %elif WIN64
+        SECTION .rdata align=%1
+    %elifidn __OUTPUT_FORMAT__,macho32
         SECTION .text align=%1
         fakegot:
     %elifidn __OUTPUT_FORMAT__,aout
@@ -113,8 +117,7 @@
     %endif
 %endmacro
 
-; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
-; from original code is added in for 64bit.
+; PIC macros from vpx_ports/x86_abi_support.asm.
 %ifidn __OUTPUT_FORMAT__,elf32
 %define ABI_IS_32BIT 1
 %elifidn __OUTPUT_FORMAT__,macho32
@@ -165,7 +168,7 @@
         %endif
     %endif
 
-    %if ARCH_X86_64 == 0
+    %if VPX_ARCH_X86_64 == 0
         %undef PIC
     %endif
 
@@ -203,10 +206,24 @@
 %ifndef GET_GOT_DEFINED
     %define GET_GOT_DEFINED 0
 %endif
-; Done with PIC macros
+; End PIC macros from vpx_ports/x86_abi_support.asm.
+
+; libvpx explicitly sets visibilty in shared object builds. Avoid setting
+; visibility to hidden as it may break builds that split sources on e.g.,
+; directory boundaries.
+%ifdef CHROMIUM
+    %define VISIBILITY hidden
+    %define HAVE_PRIVATE_EXTERN 1
+%else
+    %define VISIBILITY
+    %define HAVE_PRIVATE_EXTERN 0
+%endif
 
 %ifdef __NASM_VER__
     %use smartalign
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+        %define HAVE_PRIVATE_EXTERN 0
+    %endif
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -260,7 +277,7 @@
     %if %0 == 2
         %define r%1m  %2d
         %define r%1mp %2
-    %elif ARCH_X86_64 ; memory
+    %elif VPX_ARCH_X86_64 ; memory
         %define r%1m [rstk + stack_offset + %3]
         %define r%1mp qword r %+ %1 %+ m
     %else
@@ -281,7 +298,7 @@
     %define e%1h %3
     %define r%1b %2
     %define e%1b %2
-    %if ARCH_X86_64 == 0
+    %if VPX_ARCH_X86_64 == 0
         %define r%1 e%1
     %endif
 %endmacro
@@ -318,12 +335,24 @@ DECLARE_REG_SIZE bp, bpl, null
 
 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
     %define gprsize 8
 %else
     %define gprsize 4
 %endif
 
+%macro LEA 2
+%if VPX_ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPU:s
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
@@ -385,6 +414,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %endif
 %endmacro
 
+%if VPX_ARCH_X86_64 == 0
+    %define movsxd movifnidn
+%endif
+
 %macro movsxdifnidn 2
     %ifnidn %1, %2
         movsxd %1, %2
@@ -433,6 +466,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %endmacro
 
 %define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (VPX_ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
 
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
@@ -483,10 +518,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %ifnum %1
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
+                ; Reserve an additional register for storing the original stack pointer, but avoid using
+                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                 %assign regs_used (regs_used + 1)
+                %if VPX_ARCH_X86_64 && regs_used == 7
+                    %assign regs_used 8
+                %elif VPX_ARCH_X86_64 == 0 && regs_used == 1
+                    %assign regs_used 2
+                %endif
             %endif
-            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
-                ; Ensure that we don't clobber any registers containing arguments
+            %if VPX_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
+                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                 %assign regs_used 5 + UNIX64 * 3
             %endif
         %endif
@@ -516,10 +559,10 @@ DECLARE_REG 7,  rdi, 64
 DECLARE_REG 8,  rsi, 72
 DECLARE_REG 9,  rbx, 80
 DECLARE_REG 10, rbp, 88
-DECLARE_REG 11, R12, 96
-DECLARE_REG 12, R13, 104
-DECLARE_REG 13, R14, 112
-DECLARE_REG 14, R15, 120
+DECLARE_REG 11, R14, 96
+DECLARE_REG 12, R15, 104
+DECLARE_REG 13, R12, 112
+DECLARE_REG 14, R13, 120
 
 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
@@ -538,15 +581,16 @@ DECLARE_REG 14, R15, 120
 
 %macro WIN64_PUSH_XMM 0
     ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
-    %if xmm_regs_used > 6
+    %if xmm_regs_used > 6 + high_mm_regs
         movaps [rstk + stack_offset +  8], xmm6
     %endif
-    %if xmm_regs_used > 7
+    %if xmm_regs_used > 7 + high_mm_regs
         movaps [rstk + stack_offset + 24], xmm7
     %endif
-    %if xmm_regs_used > 8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         %assign %%i 8
-        %rep xmm_regs_used-8
+        %rep %%xmm_regs_on_stack
             movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
             %assign %%i %%i+1
         %endrep
@@ -555,59 +599,62 @@ DECLARE_REG 14, R15, 120
 
 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
-    ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 8
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
         ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
-        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign %%pad %%xmm_regs_on_stack*16 + 32
         %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
         SUB rsp, stack_size_padded
     %endif
     WIN64_PUSH_XMM
 %endmacro
 
-%macro WIN64_RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 0
     %assign %%pad_size 0
-    %if xmm_regs_used > 8
-        %assign %%i xmm_regs_used
-        %rep xmm_regs_used-8
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+    %if %%xmm_regs_on_stack > 0
+        %assign %%i xmm_regs_used - high_mm_regs
+        %rep %%xmm_regs_on_stack
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
         %endrep
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
             mov rsp, rstkm
         %else
-            add %1, stack_size_padded
+            add rsp, stack_size_padded
             %assign %%pad_size stack_size_padded
         %endif
     %endif
-    %if xmm_regs_used > 7
-        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %if xmm_regs_used > 7 + high_mm_regs
+        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
     %endif
-    %if xmm_regs_used > 6
-        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
+    %if xmm_regs_used > 6 + high_mm_regs
+        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
     %endif
 %endmacro
 
-%macro WIN64_RESTORE_XMM 1
-    WIN64_RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 0
+    WIN64_RESTORE_XMM_INTERNAL
     %assign stack_offset (stack_offset-stack_size_padded)
+    %assign stack_size_padded 0
     %assign xmm_regs_used 0
 %endmacro
 
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
 
 %macro RET 0
-    WIN64_RESTORE_XMM_INTERNAL rsp
+    WIN64_RESTORE_XMM_INTERNAL
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
 %endmacro
 
-%elif ARCH_X86_64 ; *nix x64 ;=============================================
+%elif VPX_ARCH_X86_64 ; *nix x64 ;=============================================
 
 DECLARE_REG 0,  rdi
 DECLARE_REG 1,  rsi
@@ -620,14 +667,15 @@ DECLARE_REG 7,  R10, 16
 DECLARE_REG 8,  R11, 24
 DECLARE_REG 9,  rbx, 32
 DECLARE_REG 10, rbp, 40
-DECLARE_REG 11, R12, 48
-DECLARE_REG 12, R13, 56
-DECLARE_REG 13, R14, 64
-DECLARE_REG 14, R15, 72
+DECLARE_REG 11, R14, 48
+DECLARE_REG 12, R15, 56
+DECLARE_REG 13, R12, 64
+DECLARE_REG 14, R13, 72
 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
     %assign num_args %1
     %assign regs_used %2
+    %assign xmm_regs_used %3
     ASSERT regs_used >= num_args
     SETUP_STACK_POINTER %4
     ASSERT regs_used <= 15
@@ -637,7 +685,7 @@ DECLARE_REG 14, R15, 72
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -648,7 +696,7 @@ DECLARE_REG 14, R15, 72
         %endif
     %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -693,7 +741,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     DEFINE_ARGS_INTERNAL %0, %4, %5
 %endmacro
 
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
 
 %macro RET 0
     %if stack_size_padded > 0
@@ -704,7 +752,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
         %endif
     %endif
     POP_IF_USED 6, 5, 4, 3
-    %if mmsize == 32
+    %if vzeroupper_required
         vzeroupper
     %endif
     AUTO_REP_RET
@@ -715,7 +763,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %if WIN64 == 0
     %macro WIN64_SPILL_XMM 1
     %endmacro
-    %macro WIN64_RESTORE_XMM 1
+    %macro WIN64_RESTORE_XMM 0
     %endmacro
     %macro WIN64_PUSH_XMM 0
     %endmacro
@@ -726,7 +774,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 ; We can automatically detect "follows a branch", but not a branch target.
 ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
 %macro REP_RET 0
-    %if has_epilogue
+    %if has_epilogue || cpuflag(ssse3)
         RET
     %else
         rep ret
@@ -758,7 +806,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
         RET
@@ -788,35 +836,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %endmacro
 %macro cglobal_internal 2-3+
     annotate_function_size
-    %if %1
-        %xdefine %%FUNCTION_PREFIX private_prefix
-        ; libvpx explicitly sets visibility in shared object builds. Avoid
-        ; setting visibility to hidden as it may break builds that split
-        ; sources on e.g., directory boundaries.
-        %ifdef CHROMIUM
-            %xdefine %%VISIBILITY hidden
-        %else
-            %xdefine %%VISIBILITY
-        %endif
-    %else
-        %xdefine %%FUNCTION_PREFIX public_prefix
-        %xdefine %%VISIBILITY
-    %endif
     %ifndef cglobaled_%2
-        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %if %1
+            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+        %else
+            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+        %endif
         %xdefine %2.skip_prologue %2 %+ .skip_prologue
         CAT_XDEFINE cglobaled_, %2, 1
     %endif
     %xdefine current_function %2
     %xdefine current_function_section __SECT__
     %if FORMAT_ELF
-        global %2:function %%VISIBILITY
-    %elif FORMAT_MACHO
-        %ifdef __NASM_VER__
-            global %2
+        %if %1
+            global %2:function VISIBILITY
         %else
-            global %2:private_extern
+            global %2:function
         %endif
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+        global %2:private_extern
     %else
         global %2
     %endif
@@ -827,12 +865,24 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %assign stack_offset 0      ; stack pointer offset relative to the return address
     %assign stack_size 0        ; amount of stack space that can be freely used inside a function
     %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
 %endmacro
 
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+    %if FORMAT_ELF
+        global current_function %+ %1:function VISIBILITY
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global current_function %+ %1:private_extern
+    %else
+        global current_function %+ %1
+    %endif
+    %1:
+%endmacro
+
 %macro cextern 1
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     CAT_XDEFINE cglobaled_, %1, 1
@@ -851,7 +901,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     %if FORMAT_ELF
-        global %1:data hidden
+        global %1:data VISIBILITY
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global %1:private_extern
     %else
         global %1
     %endif
@@ -890,24 +942,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 %assign cpuflags_sse      (1<<4) | cpuflags_mmx2
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
-%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
-%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
-%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
-%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
-%assign cpuflags_avx      (1<<11)| cpuflags_sse42
-%assign cpuflags_xop      (1<<12)| cpuflags_avx
-%assign cpuflags_fma4     (1<<13)| cpuflags_avx
-%assign cpuflags_fma3     (1<<14)| cpuflags_avx
-%assign cpuflags_avx2     (1<<15)| cpuflags_fma3
+%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
+%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
+%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
+%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
+%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
+%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
+%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
+%assign cpuflags_avx      (1<<14)| cpuflags_sse42
+%assign cpuflags_xop      (1<<15)| cpuflags_avx
+%assign cpuflags_fma4     (1<<16)| cpuflags_avx
+%assign cpuflags_fma3     (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
 
-%assign cpuflags_cache32  (1<<16)
-%assign cpuflags_cache64  (1<<17)
-%assign cpuflags_slowctz  (1<<18)
-%assign cpuflags_lzcnt    (1<<19)
-%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<21)
-%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
+%assign cpuflags_cache32  (1<<22)
+%assign cpuflags_cache64  (1<<23)
+%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
@@ -948,9 +1002,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
         %endif
     %endif
 
-    %if ARCH_X86_64 || cpuflag(sse2)
+    %if VPX_ARCH_X86_64 || cpuflag(sse2)
         %ifdef __NASM_VER__
-            ALIGNMODE k8
+            ALIGNMODE p6
         %else
             CPU amdnop
         %endif
@@ -963,11 +1017,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
 %endmacro
 
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
 ; m# is a simd register of the currently selected size
 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
 
 %macro CAT_XDEFINE 3
     %xdefine %1%2 %3
@@ -977,69 +1032,99 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %undef %1%2
 %endmacro
 
+%macro DEFINE_MMREGS 1 ; mmtype
+    %assign %%prev_mmregs 0
+    %ifdef num_mmregs
+        %assign %%prev_mmregs num_mmregs
+    %endif
+
+    %assign num_mmregs 8
+    %if VPX_ARCH_X86_64 && mmsize >= 16
+        %assign num_mmregs 16
+        %if cpuflag(avx512) || mmsize == 64
+            %assign num_mmregs 32
+        %endif
+    %endif
+
+    %assign %%i 0
+    %rep num_mmregs
+        CAT_XDEFINE m, %%i, %1 %+ %%i
+        CAT_XDEFINE nn%1, %%i, %%i
+        %assign %%i %%i+1
+    %endrep
+    %if %%prev_mmregs > num_mmregs
+        %rep %%prev_mmregs - num_mmregs
+            CAT_UNDEF m, %%i
+            CAT_UNDEF nn %+ mmtype, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+    %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+    %if VPX_ARCH_X86_64 && cpuflag(avx512)
+        %assign %%i %1
+        %rep 16-%1
+            %assign %%i_high %%i+16
+            SWAP %%i, %%i_high
+            %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
 %macro INIT_MMX 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_MMX %1
     %define mmsize 8
-    %define num_mmregs 8
     %define mova movq
     %define movu movq
     %define movh movd
     %define movnta movntq
-    %assign %%i 0
-    %rep 8
-        CAT_XDEFINE m, %%i, mm %+ %%i
-        CAT_XDEFINE nnmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
-    %rep 8
-        CAT_UNDEF m, %%i
-        CAT_UNDEF nnmm, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS mm
 %endmacro
 
 %macro INIT_XMM 0-1+
     %assign avx_enabled 0
     %define RESET_MM_PERMUTATION INIT_XMM %1
     %define mmsize 16
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
     %define mova movdqa
     %define movu movdqu
     %define movh movq
     %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, xmm %+ %%i
-        CAT_XDEFINE nnxmm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS xmm
+    %if WIN64
+        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+    %endif
 %endmacro
 
 %macro INIT_YMM 0-1+
     %assign avx_enabled 1
     %define RESET_MM_PERMUTATION INIT_YMM %1
     %define mmsize 32
-    %define num_mmregs 8
-    %if ARCH_X86_64
-        %define num_mmregs 16
-    %endif
     %define mova movdqa
     %define movu movdqu
     %undef movh
     %define movnta movntdq
-    %assign %%i 0
-    %rep num_mmregs
-        CAT_XDEFINE m, %%i, ymm %+ %%i
-        CAT_XDEFINE nnymm, %%i, %%i
-        %assign %%i %%i+1
-    %endrep
     INIT_CPUFLAGS %1
+    DEFINE_MMREGS ymm
+    AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+    %assign avx_enabled 1
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
+    %define mmsize 64
+    %define mova movdqa
+    %define movu movdqu
+    %undef movh
+    %define movnta movntdq
+    INIT_CPUFLAGS %1
+    DEFINE_MMREGS zmm
+    AVX512_MM_PERMUTATION
 %endmacro
 
 INIT_XMM
@@ -1048,18 +1133,26 @@ INIT_XMM
     %define  mmmm%1   mm%1
     %define  mmxmm%1  mm%1
     %define  mmymm%1  mm%1
+    %define  mmzmm%1  mm%1
     %define xmmmm%1   mm%1
     %define xmmxmm%1 xmm%1
     %define xmmymm%1 xmm%1
+    %define xmmzmm%1 xmm%1
     %define ymmmm%1   mm%1
     %define ymmxmm%1 xmm%1
     %define ymmymm%1 ymm%1
+    %define ymmzmm%1 ymm%1
+    %define zmmmm%1   mm%1
+    %define zmmxmm%1 xmm%1
+    %define zmmymm%1 ymm%1
+    %define zmmzmm%1 zmm%1
     %define xm%1 xmm %+ m%1
     %define ym%1 ymm %+ m%1
+    %define zm%1 zmm %+ m%1
 %endmacro
 
 %assign i 0
-%rep 16
+%rep 32
     DECLARE_MMCAST i
     %assign i i+1
 %endrep
@@ -1129,25 +1222,42 @@ INIT_XMM
     %endif
     %assign %%i 0
     %rep num_mmregs
-        CAT_XDEFINE %%f, %%i, m %+ %%i
+        %xdefine %%tmp m %+ %%i
+        CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
         %assign %%i %%i+1
     %endrep
 %endmacro
 
-%macro LOAD_MM_PERMUTATION 1 ; name to load from
-    %ifdef %1_m0
+%macro LOAD_MM_PERMUTATION 0-1 ; name to load from
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %xdefine %%tmp %%f %+ 0
+    %ifnum %%tmp
+        RESET_MM_PERMUTATION
         %assign %%i 0
         %rep num_mmregs
-            CAT_XDEFINE m, %%i, %1_m %+ %%i
-            CAT_XDEFINE nn, m %+ %%i, %%i
+            %xdefine %%tmp %%f %+ %%i
+            CAT_XDEFINE %%m, %%i, m %+ %%tmp
             %assign %%i %%i+1
         %endrep
+        %rep num_mmregs
+            %assign %%i %%i-1
+            CAT_XDEFINE m, %%i, %%m %+ %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
+        %endrep
     %endif
 %endmacro
 
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    call_internal %1 %+ SUFFIX, %1
+    %ifid %1
+        call_internal %1 %+ SUFFIX, %1
+    %else
+        call %1
+    %endif
 %endmacro
 %macro call_internal 2
     %xdefine %%i %2
@@ -1190,12 +1300,17 @@ INIT_XMM
 ;=============================================================================
 
 %assign i 0
-%rep 16
+%rep 32
     %if i < 8
         CAT_XDEFINE sizeofmm, i, 8
+        CAT_XDEFINE regnumofmm, i, i
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
+    CAT_XDEFINE sizeofzmm, i, 64
+    CAT_XDEFINE regnumofxmm, i, i
+    CAT_XDEFINE regnumofymm, i, i
+    CAT_XDEFINE regnumofzmm, i, i
     %assign i i+1
 %endrep
 %undef i
@@ -1214,7 +1329,7 @@ INIT_XMM
 ;%1 == instruction
 ;%2 == minimal instruction set
 ;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
 ;%6+: operands
 %macro RUN_AVX_INSTR 6-9+
@@ -1238,8 +1353,22 @@ INIT_XMM
         %ifdef cpuname
             %if notcpuflag(%2)
                 %error use of ``%1'' %2 instruction in cpuname function: current_function
-            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                 %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+                %error use of ``%1'' avx2 instruction in cpuname function: current_function
+            %elif __sizeofreg == 16 && notcpuflag(sse)
+                %error use of ``%1'' sse instruction in cpuname function: current_function
+            %elif __sizeofreg == 32 && notcpuflag(avx)
+                %error use of ``%1'' avx instruction in cpuname function: current_function
+            %elif __sizeofreg == 64 && notcpuflag(avx512)
+                %error use of ``%1'' avx512 instruction in cpuname function: current_function
+            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+                %ifnid %6       ; but sse4 is required for memory operands
+                    %if notcpuflag(sse4)
+                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
+                    %endif
+                %endif
             %endif
         %endif
     %endif
@@ -1247,14 +1376,12 @@ INIT_XMM
     %if __emulate_avx
         %xdefine __src1 %7
         %xdefine __src2 %8
-        %ifnidn %6, %7
-            %if %0 >= 9
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
-            %else
-                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
-            %endif
-            %if %5 && %4 == 0
-                %ifnid %8
+        %if %5 && %4 == 0
+            %ifnidn %6, %7
+                %ifidn %6, %8
+                    %xdefine __src1 %8
+                    %xdefine __src2 %7
+                %elifnnum sizeof%8
                     ; 3-operand AVX instructions with a memory arg can only have it in src2,
                     ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                     ; So, if the instruction is commutative with a memory arg, swap them.
@@ -1262,6 +1389,13 @@ INIT_XMM
                     %xdefine __src2 %7
                 %endif
             %endif
+        %endif
+        %ifnidn %6, __src1
+            %if %0 >= 9
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
+            %endif
             %if __sizeofreg == 8
                 MOVQ %6, __src1
             %elif %3
@@ -1278,9 +1412,40 @@ INIT_XMM
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        __instr %6, %7, %8
+        %if avx_enabled && %5
+            %xdefine __src1 %7
+            %xdefine __src2 %8
+            %ifnum regnumof%7
+                %ifnum regnumof%8
+                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                        ; Most VEX-encoded instructions require an additional byte to encode when
+                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                        ; we can swap src1 and src2 when doing so reduces the instruction length.
+                        %xdefine __src1 %8
+                        %xdefine __src2 %7
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7, %8
+        %endif
     %elif %0 == 7
-        __instr %6, %7
+        %if avx_enabled && %5
+            %xdefine __src1 %6
+            %xdefine __src2 %7
+            %ifnum regnumof%6
+                %ifnum regnumof%7
+                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+                        %xdefine __src1 %7
+                        %xdefine __src2 %6
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7
+        %endif
     %else
         __instr %6
     %endif
@@ -1289,9 +1454,9 @@ INIT_XMM
 ;%1 == instruction
 ;%2 == minimal instruction set
 ;%3 == 1 if float, 0 if int
-;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation)
 ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+%macro AVX_INSTR 1-5 fnord, 0, 255, 0
     %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
         %ifidn %2, fnord
             RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
@@ -1307,77 +1472,112 @@ INIT_XMM
     %endmacro
 %endmacro
 
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
 ; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, sse2, 1, 0, 1
 AVX_INSTR addps, sse, 1, 0, 1
-AVX_INSTR addsd, sse2, 1, 0, 1
-AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 0
+AVX_INSTR addss, sse, 1, 0, 0
 AVX_INSTR addsubpd, sse3, 1, 0, 0
 AVX_INSTR addsubps, sse3, 1, 0, 0
-AVX_INSTR aesdec, fnord, 0, 0, 0
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
-AVX_INSTR aesenc, fnord, 0, 0, 0
-AVX_INSTR aesenclast, fnord, 0, 0, 0
-AVX_INSTR aesimc
-AVX_INSTR aeskeygenassist
+AVX_INSTR aesdec, aesni, 0, 0, 0
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
+AVX_INSTR aesenc, aesni, 0, 0, 0
+AVX_INSTR aesenclast, aesni, 0, 0, 0
+AVX_INSTR aesimc, aesni
+AVX_INSTR aeskeygenassist, aesni
 AVX_INSTR andnpd, sse2, 1, 0, 0
 AVX_INSTR andnps, sse, 1, 0, 0
 AVX_INSTR andpd, sse2, 1, 0, 1
 AVX_INSTR andps, sse, 1, 0, 1
-AVX_INSTR blendpd, sse4, 1, 0, 0
-AVX_INSTR blendps, sse4, 1, 0, 0
-AVX_INSTR blendvpd, sse4, 1, 0, 0
-AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR blendpd, sse4, 1, 1, 0
+AVX_INSTR blendps, sse4, 1, 1, 0
+AVX_INSTR blendvpd, sse4 ; can't be emulated
+AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2 1, 0, 1
+AVX_INSTR cmpordps, sse 1, 0, 1
+AVX_INSTR cmpordsd, sse2 1, 0, 0
+AVX_INSTR cmpordss, sse 1, 0, 0
 AVX_INSTR cmppd, sse2, 1, 1, 0
 AVX_INSTR cmpps, sse, 1, 1, 0
 AVX_INSTR cmpsd, sse2, 1, 1, 0
 AVX_INSTR cmpss, sse, 1, 1, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
-AVX_INSTR cvtsd2ss, sse2
-AVX_INSTR cvtsi2sd, sse2
-AVX_INSTR cvtsi2ss, sse
-AVX_INSTR cvtss2sd, sse2
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
+AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
+AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
+AVX_INSTR cvtsi2ss, sse, 1, 0, 0
+AVX_INSTR cvtss2sd, sse2, 1, 0, 0
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
 AVX_INSTR divpd, sse2, 1, 0, 0
 AVX_INSTR divps, sse, 1, 0, 0
 AVX_INSTR divsd, sse2, 1, 0, 0
 AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
 AVX_INSTR hsubps, sse3, 1, 0, 0
 AVX_INSTR insertps, sse4, 1, 1, 0
 AVX_INSTR lddqu, sse3
-AVX_INSTR ldmxcsr, sse
+AVX_INSTR ldmxcsr, sse, 1
 AVX_INSTR maskmovdqu, sse2
 AVX_INSTR maxpd, sse2, 1, 0, 1
 AVX_INSTR maxps, sse, 1, 0, 1
-AVX_INSTR maxsd, sse2, 1, 0, 1
-AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 0
+AVX_INSTR maxss, sse, 1, 0, 0
 AVX_INSTR minpd, sse2, 1, 0, 1
 AVX_INSTR minps, sse, 1, 0, 1
-AVX_INSTR minsd, sse2, 1, 0, 1
-AVX_INSTR minss, sse, 1, 0, 1
-AVX_INSTR movapd, sse2
-AVX_INSTR movaps, sse
+AVX_INSTR minsd, sse2, 1, 0, 0
+AVX_INSTR minss, sse, 1, 0, 0
+AVX_INSTR movapd, sse2, 1
+AVX_INSTR movaps, sse, 1
 AVX_INSTR movd, mmx
-AVX_INSTR movddup, sse3
+AVX_INSTR movddup, sse3, 1
 AVX_INSTR movdqa, sse2
 AVX_INSTR movdqu, sse2
 AVX_INSTR movhlps, sse, 1, 0, 0
@@ -1386,24 +1586,24 @@ AVX_INSTR movhps, sse, 1, 0, 0
 AVX_INSTR movlhps, sse, 1, 0, 0
 AVX_INSTR movlpd, sse2, 1, 0, 0
 AVX_INSTR movlps, sse, 1, 0, 0
-AVX_INSTR movmskpd, sse2
-AVX_INSTR movmskps, sse
+AVX_INSTR movmskpd, sse2, 1
+AVX_INSTR movmskps, sse, 1
 AVX_INSTR movntdq, sse2
 AVX_INSTR movntdqa, sse4
-AVX_INSTR movntpd, sse2
-AVX_INSTR movntps, sse
+AVX_INSTR movntpd, sse2, 1
+AVX_INSTR movntps, sse, 1
 AVX_INSTR movq, mmx
 AVX_INSTR movsd, sse2, 1, 0, 0
-AVX_INSTR movshdup, sse3
-AVX_INSTR movsldup, sse3
+AVX_INSTR movshdup, sse3, 1
+AVX_INSTR movsldup, sse3, 1
 AVX_INSTR movss, sse, 1, 0, 0
-AVX_INSTR movupd, sse2
-AVX_INSTR movups, sse
-AVX_INSTR mpsadbw, sse4
+AVX_INSTR movupd, sse2, 1
+AVX_INSTR movups, sse, 1
+AVX_INSTR mpsadbw, sse4, 0, 1, 0
 AVX_INSTR mulpd, sse2, 1, 0, 1
 AVX_INSTR mulps, sse, 1, 0, 1
-AVX_INSTR mulsd, sse2, 1, 0, 1
-AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 0
+AVX_INSTR mulss, sse, 1, 0, 0
 AVX_INSTR orpd, sse2, 1, 0, 1
 AVX_INSTR orps, sse, 1, 0, 1
 AVX_INSTR pabsb, ssse3
@@ -1421,14 +1621,18 @@ AVX_INSTR paddsb, mmx, 0, 0, 1
 AVX_INSTR paddsw, mmx, 0, 0, 1
 AVX_INSTR paddusb, mmx, 0, 0, 1
 AVX_INSTR paddusw, mmx, 0, 0, 1
-AVX_INSTR palignr, ssse3
+AVX_INSTR palignr, ssse3, 0, 1, 0
 AVX_INSTR pand, mmx, 0, 0, 1
 AVX_INSTR pandn, mmx, 0, 0, 0
 AVX_INSTR pavgb, mmx2, 0, 0, 1
 AVX_INSTR pavgw, mmx2, 0, 0, 1
-AVX_INSTR pblendvb, sse4, 0, 0, 0
-AVX_INSTR pblendw, sse4
-AVX_INSTR pclmulqdq
+AVX_INSTR pblendvb, sse4 ; can't be emulated
+AVX_INSTR pblendw, sse4, 0, 1, 0
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
+AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
 AVX_INSTR pcmpestri, sse42
 AVX_INSTR pcmpestrm, sse42
 AVX_INSTR pcmpistri, sse42
@@ -1452,10 +1656,10 @@ AVX_INSTR phminposuw, sse4
 AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR phsubd, ssse3, 0, 0, 0
 AVX_INSTR phsubsw, ssse3, 0, 0, 0
-AVX_INSTR pinsrb, sse4
-AVX_INSTR pinsrd, sse4
-AVX_INSTR pinsrq, sse4
-AVX_INSTR pinsrw, mmx2
+AVX_INSTR pinsrb, sse4, 0, 1, 0
+AVX_INSTR pinsrd, sse4, 0, 1, 0
+AVX_INSTR pinsrq, sse4, 0, 1, 0
+AVX_INSTR pinsrw, mmx2, 0, 1, 0
 AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
 AVX_INSTR pmaxsb, sse4, 0, 0, 1
@@ -1527,27 +1731,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
-AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpps, sse, 1
 AVX_INSTR rcpss, sse, 1, 0, 0
-AVX_INSTR roundpd, sse4
-AVX_INSTR roundps, sse4
-AVX_INSTR roundsd, sse4
-AVX_INSTR roundss, sse4
-AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4, 1
+AVX_INSTR roundps, sse4, 1
+AVX_INSTR roundsd, sse4, 1, 1, 0
+AVX_INSTR roundss, sse4, 1, 1, 0
+AVX_INSTR rsqrtps, sse, 1
 AVX_INSTR rsqrtss, sse, 1, 0, 0
 AVX_INSTR shufpd, sse2, 1, 1, 0
 AVX_INSTR shufps, sse, 1, 1, 0
-AVX_INSTR sqrtpd, sse2, 1, 0, 0
-AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtpd, sse2, 1
+AVX_INSTR sqrtps, sse, 1
 AVX_INSTR sqrtsd, sse2, 1, 0, 0
 AVX_INSTR sqrtss, sse, 1, 0, 0
-AVX_INSTR stmxcsr, sse
+AVX_INSTR stmxcsr, sse, 1
 AVX_INSTR subpd, sse2, 1, 0, 0
 AVX_INSTR subps, sse, 1, 0, 0
 AVX_INSTR subsd, sse2, 1, 0, 0
 AVX_INSTR subss, sse, 1, 0, 0
-AVX_INSTR ucomisd, sse2
-AVX_INSTR ucomiss, sse
+AVX_INSTR ucomisd, sse2, 1
+AVX_INSTR ucomiss, sse, 1
 AVX_INSTR unpckhpd, sse2, 1, 0, 0
 AVX_INSTR unpckhps, sse, 1, 0, 0
 AVX_INSTR unpcklpd, sse2, 1, 0, 0
@@ -1560,6 +1764,38 @@ AVX_INSTR pfadd, 3dnow, 1, 0, 1
 AVX_INSTR pfsub, 3dnow, 1, 0, 0
 AVX_INSTR pfmul, 3dnow, 1, 0, 1
 
+;%1 == instruction
+;%2 == minimal instruction set
+%macro GPR_INSTR 2
+    %macro %1 2-5 fnord, %1, %2
+        %ifdef cpuname
+            %if notcpuflag(%5)
+                %error use of ``%4'' %5 instruction in cpuname function: current_function
+            %endif
+        %endif
+        %ifidn %3, fnord
+            %4 %1, %2
+        %else
+            %4 %1, %2, %3
+        %endif
+    %endmacro
+%endmacro
+
+GPR_INSTR andn, bmi1
+GPR_INSTR bextr, bmi1
+GPR_INSTR blsi, bmi1
+GPR_INSTR blsr, bmi1
+GPR_INSTR blsmsk, bmi1
+GPR_INSTR bzhi, bmi2
+GPR_INSTR mulx, bmi2
+GPR_INSTR pdep, bmi2
+GPR_INSTR pext, bmi2
+GPR_INSTR popcnt, sse42
+GPR_INSTR rorx, bmi2
+GPR_INSTR sarx, bmi2
+GPR_INSTR shlx, bmi2
+GPR_INSTR shrx, bmi2
+
 ; base-4 constants for shuffles
 %assign i 0
 %rep 256
@@ -1610,7 +1846,7 @@ FMA_INSTR pmadcswd, pmaddwd, paddd
                 v%5%6 %1, %2, %3, %4
             %elifidn %1, %2
                 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
-                %ifid %3
+                %ifnum sizeof%3
                     v%{5}213%6 %2, %3, %4
                 %else
                     v%{5}132%6 %2, %4, %3
@@ -1635,15 +1871,53 @@ FMA4_INSTR fmsubadd, pd, ps
 FMA4_INSTR fnmadd,   pd, ps, sd, ss
 FMA4_INSTR fnmsub,   pd, ps, sd, ss
 
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
-%ifdef __YASM_VER__
-    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
-        %macro vpbroadcastq 2
-            %if sizeof%1 == 16
-                movddup %1, %2
-            %else
-                vbroadcastsd %1, %2
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
+        %ifidn %3, fnord
+            %define %%args %1, %2
+        %elifidn %4, fnord
+            %define %%args %1, %2, %3
+        %else
+            %define %%args %1, %2, %3, %4
+        %endif
+        %assign %%evex_required cpuflag(avx512) & %7
+        %ifnum regnumof%1
+            %if regnumof%1 >= 16 || sizeof%1 > 32
+                %assign %%evex_required 1
             %endif
-        %endmacro
-    %endif
-%endif
+        %endif
+        %ifnum regnumof%2
+            %if regnumof%2 >= 16 || sizeof%2 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %ifnum regnumof%3
+            %if regnumof%3 >= 16 || sizeof%3 > 32
+                %assign %%evex_required 1
+            %endif
+        %endif
+        %if %%evex_required
+            %6 %%args
+        %else
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+        %endif
+    %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128,   vextractf32x4
+EVEX_INSTR vextracti128,   vextracti32x4
+EVEX_INSTR vinsertf128,    vinsertf32x4
+EVEX_INSTR vinserti128,    vinserti32x4
+EVEX_INSTR vmovdqa,        vmovdqa32
+EVEX_INSTR vmovdqu,        vmovdqu32
+EVEX_INSTR vpand,          vpandd
+EVEX_INSTR vpandn,         vpandnd
+EVEX_INSTR vpor,           vpord
+EVEX_INSTR vpxor,          vpxord
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss,         vrcp14ss,   1
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
diff --git a/media/libvpx/libvpx/tools.mk b/media/libvpx/libvpx/tools.mk
index 3c660b1dfd..79bb0cb8d6 100644
--- a/media/libvpx/libvpx/tools.mk
+++ b/media/libvpx/libvpx/tools.mk
@@ -10,7 +10,12 @@
 
 # List of tools to build.
 TOOLS-yes            += tiny_ssim.c
-tiny_ssim.SRCS       += vpx/vpx_integer.h
+tiny_ssim.SRCS       += vpx/vpx_integer.h y4minput.c y4minput.h \
+                        vpx/vpx_codec.h vpx/src/vpx_image.c
+tiny_ssim.SRCS       += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h
+tiny_ssim.SRCS       += vpx_dsp/ssim.h vpx_scale/yv12config.h
+tiny_ssim.SRCS       += vpx_ports/mem.h vpx_ports/mem.h
+tiny_ssim.SRCS       += vpx_mem/include/vpx_mem_intrnl.h
 tiny_ssim.GUID        = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4
 tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files
 
@@ -23,7 +28,11 @@ tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files
 # Expand list of selected tools to build (as specified above)
 TOOLS           = $(addprefix tools/,$(call enabled,TOOLS))
 ALL_SRCS        = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS))
+CFLAGS += -I../include
 
+ifneq ($(CONFIG_CODEC_SRCS), yes)
+  CFLAGS += -I../include/vpx
+endif
 
 # Expand all tools sources into a variable containing all sources
 # for that tools (not just them main one specified in TOOLS)
@@ -39,15 +48,11 @@ DIST-SRCS-yes              += $(ALL_SRCS)
 OBJS-$(NOT_MSVS)           += $(call objs,$(ALL_SRCS))
 BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX)))
 
-
 # Instantiate linker template for all tools.
 $(foreach bin,$(BINS-yes),\
     $(eval $(bin):)\
     $(eval $(call linker_template,$(bin),\
-        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \
-				-lm\
-        )))
-
+        $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) -lm)))
 
 # The following pairs define a mapping of locations in the distribution
 # tree to locations in the source/build trees.
@@ -74,6 +79,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX)
             --ver=$$(CONFIG_VS_VERSION)\
             --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
             --src-path-bare="$(SRC_PATH_BARE)" \
+            --as=$$(AS) \
             $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
             $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^
@@ -85,6 +91,13 @@ INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
 $(foreach proj,$(call enabled,PROJECTS),\
     $(eval $(call vcproj_template,$(proj))))
 
+# Generate a list of all enabled sources, in particular for exporting to gyp
+# based build systems.
+tiny_ssim_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(tiny_ssim.SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
+CLEAN-OBJS += tiny_ssim_srcs.txt
+
 #
 # Documentation Rules
 #
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py
new file mode 100644
index 0000000000..5ff9e98932
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py
@@ -0,0 +1,193 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# coding: utf-8
+import numpy as np
+import numpy.linalg as LA
+from scipy.ndimage.filters import gaussian_filter
+from scipy.sparse import csc_matrix
+from scipy.sparse.linalg import inv
+from MotionEST import MotionEST
+"""Anandan Model"""
+
+
+class Anandan(MotionEST):
+  """
+    constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        beta: smooth constrain weight
+        k1,k2,k3: confidence coefficients
+        max_iter: maximum number of iterations
+    """
+
+  def __init__(self, cur_f, ref_f, blk_sz, beta, k1, k2, k3, max_iter=100):
+    super(Anandan, self).__init__(cur_f, ref_f, blk_sz)
+    self.levels = int(np.log2(blk_sz))
+    self.intensity_hierarchy()
+    self.c_maxs = []
+    self.c_mins = []
+    self.e_maxs = []
+    self.e_mins = []
+    for l in xrange(self.levels + 1):
+      c_max, c_min, e_max, e_min = self.get_curvature(self.cur_Is[l])
+      self.c_maxs.append(c_max)
+      self.c_mins.append(c_min)
+      self.e_maxs.append(e_max)
+      self.e_mins.append(e_min)
+    self.beta = beta
+    self.k1, self.k2, self.k3 = k1, k2, k3
+    self.max_iter = max_iter
+
+  """
+    build intensity hierarchy
+    """
+
+  def intensity_hierarchy(self):
+    level = 0
+    self.cur_Is = []
+    self.ref_Is = []
+    #build each level itensity by using gaussian filters
+    while level <= self.levels:
+      cur_I = gaussian_filter(self.cur_yuv[:, :, 0], sigma=(2**level) * 0.56)
+      ref_I = gaussian_filter(self.ref_yuv[:, :, 0], sigma=(2**level) * 0.56)
+      self.ref_Is.append(ref_I)
+      self.cur_Is.append(cur_I)
+      level += 1
+
+  """
+    get curvature of each block
+    """
+
+  def get_curvature(self, I):
+    c_max = np.zeros((self.num_row, self.num_col))
+    c_min = np.zeros((self.num_row, self.num_col))
+    e_max = np.zeros((self.num_row, self.num_col, 2))
+    e_min = np.zeros((self.num_row, self.num_col, 2))
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        h11, h12, h21, h22 = 0, 0, 0, 0
+        for i in xrange(r * self.blk_sz, r * self.blk_sz + self.blk_sz):
+          for j in xrange(c * self.blk_sz, c * self.blk_sz + self.blk_sz):
+            if 0 <= i < self.height - 1 and 0 <= j < self.width - 1:
+              Ix = I[i][j + 1] - I[i][j]
+              Iy = I[i + 1][j] - I[i][j]
+              h11 += Iy * Iy
+              h12 += Ix * Iy
+              h21 += Ix * Iy
+              h22 += Ix * Ix
+        U, S, _ = LA.svd(np.array([[h11, h12], [h21, h22]]))
+        c_max[r, c], c_min[r, c] = S[0], S[1]
+        e_max[r, c] = U[:, 0]
+        e_min[r, c] = U[:, 1]
+    return c_max, c_min, e_max, e_min
+
+  """
+    get ssd of motion vector:
+      cur_I: current intensity
+      ref_I: reference intensity
+      center: current position
+      mv: motion vector
+    """
+
+  def get_ssd(self, cur_I, ref_I, center, mv):
+    ssd = 0
+    for r in xrange(int(center[0]), int(center[0]) + self.blk_sz):
+      for c in xrange(int(center[1]), int(center[1]) + self.blk_sz):
+        if 0 <= r < self.height and 0 <= c < self.width:
+          tr, tc = r + int(mv[0]), c + int(mv[1])
+          if 0 <= tr < self.height and 0 <= tc < self.width:
+            ssd += (ref_I[tr, tc] - cur_I[r, c])**2
+          else:
+            ssd += cur_I[r, c]**2
+    return ssd
+
+  """
+    get region match of level l
+      l: current level
+      last_mvs: matchine results of last level
+      radius: movenment radius
+    """
+
+  def region_match(self, l, last_mvs, radius):
+    mvs = np.zeros((self.num_row, self.num_col, 2))
+    min_ssds = np.zeros((self.num_row, self.num_col))
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        center = np.array([r * self.blk_sz, c * self.blk_sz])
+        #use overlap hierarchy policy
+        init_mvs = []
+        if last_mvs is None:
+          init_mvs = [np.array([0, 0])]
+        else:
+          for i, j in {(r, c), (r, c + 1), (r + 1, c), (r + 1, c + 1)}:
+            if 0 <= i < last_mvs.shape[0] and 0 <= j < last_mvs.shape[1]:
+              init_mvs.append(last_mvs[i, j])
+        #use last matching results as the start position as current level
+        min_ssd = None
+        min_mv = None
+        for init_mv in init_mvs:
+          for i in xrange(-2, 3):
+            for j in xrange(-2, 3):
+              mv = init_mv + np.array([i, j]) * radius
+              ssd = self.get_ssd(self.cur_Is[l], self.ref_Is[l], center, mv)
+              if min_ssd is None or ssd < min_ssd:
+                min_ssd = ssd
+                min_mv = mv
+        min_ssds[r, c] = min_ssd
+        mvs[r, c] = min_mv
+    return mvs, min_ssds
+
+  """
+    smooth motion field based on neighbor constraint
+      uvs: current estimation
+      mvs: matching results
+      min_ssds: minimum ssd of matching results
+      l: current level
+    """
+
+  def smooth(self, uvs, mvs, min_ssds, l):
+    sm_uvs = np.zeros((self.num_row, self.num_col, 2))
+    c_max = self.c_maxs[l]
+    c_min = self.c_mins[l]
+    e_max = self.e_maxs[l]
+    e_min = self.e_mins[l]
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        w_max = c_max[r, c] / (
+            self.k1 + self.k2 * min_ssds[r, c] + self.k3 * c_max[r, c])
+        w_min = c_min[r, c] / (
+            self.k1 + self.k2 * min_ssds[r, c] + self.k3 * c_min[r, c])
+        w = w_max * w_min / (w_max + w_min + 1e-6)
+        if w < 0:
+          w = 0
+        avg_uv = np.array([0.0, 0.0])
+        for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            avg_uv += 0.25 * uvs[i, j]
+        sm_uvs[r, c] = (w * w * mvs[r, c] + self.beta * avg_uv) / (
+            self.beta + w * w)
+    return sm_uvs
+
+  """
+    motion field estimation
+    """
+
+  def motion_field_estimation(self):
+    last_mvs = None
+    for l in xrange(self.levels, -1, -1):
+      mvs, min_ssds = self.region_match(l, last_mvs, 2**l)
+      uvs = np.zeros(mvs.shape)
+      for _ in xrange(self.max_iter):
+        uvs = self.smooth(uvs, mvs, min_ssds, l)
+      last_mvs = uvs
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        self.mf[r, c] = uvs[r, c]
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py
new file mode 100644
index 0000000000..d763de8562
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py
@@ -0,0 +1,259 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# coding: utf-8
+import numpy as np
+import numpy.linalg as LA
+from Util import MSE
+from MotionEST import MotionEST
+"""Exhaust Search:"""
+
+
+class Exhaust(MotionEST):
+  """
+    Constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        wnd_size: search window size
+        metric: metric to compare the blocks distrotion
+    """
+
+  def __init__(self, cur_f, ref_f, blk_size, wnd_size, metric=MSE):
+    self.name = 'exhaust'
+    self.wnd_sz = wnd_size
+    self.metric = metric
+    super(Exhaust, self).__init__(cur_f, ref_f, blk_size)
+
+  """
+    search method:
+        cur_r: start row
+        cur_c: start column
+    """
+
+  def search(self, cur_r, cur_c):
+    min_loss = self.block_dist(cur_r, cur_c, [0, 0], self.metric)
+    cur_x = cur_c * self.blk_sz
+    cur_y = cur_r * self.blk_sz
+    ref_x = cur_x
+    ref_y = cur_y
+    #search all validate positions and select the one with minimum distortion
+    for y in xrange(cur_y - self.wnd_sz, cur_y + self.wnd_sz):
+      for x in xrange(cur_x - self.wnd_sz, cur_x + self.wnd_sz):
+        if 0 <= x < self.width - self.blk_sz and 0 <= y < self.height - self.blk_sz:
+          loss = self.block_dist(cur_r, cur_c, [y - cur_y, x - cur_x],
+                                 self.metric)
+          if loss < min_loss:
+            min_loss = loss
+            ref_x = x
+            ref_y = y
+    return ref_x, ref_y
+
+  def motion_field_estimation(self):
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        ref_x, ref_y = self.search(i, j)
+        self.mf[i, j] = np.array(
+            [ref_y - i * self.blk_sz, ref_x - j * self.blk_sz])
+
+
+"""Exhaust with Neighbor Constraint"""
+
+
+class ExhaustNeighbor(MotionEST):
+  """
+    Constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        wnd_size: search window size
+        beta: neigbor loss weight
+        metric: metric to compare the blocks distrotion
+    """
+
+  def __init__(self, cur_f, ref_f, blk_size, wnd_size, beta, metric=MSE):
+    self.name = 'exhaust + neighbor'
+    self.wnd_sz = wnd_size
+    self.beta = beta
+    self.metric = metric
+    super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size)
+    self.assign = np.zeros((self.num_row, self.num_col), dtype=bool)
+
+  """
+    estimate neighbor loss:
+        cur_r: current row
+        cur_c: current column
+        mv: current motion vector
+    """
+
+  def neighborLoss(self, cur_r, cur_c, mv):
+    loss = 0
+    #accumulate difference between current block's motion vector with neighbors'
+    for i, j in {(-1, 0), (1, 0), (0, 1), (0, -1)}:
+      nb_r = cur_r + i
+      nb_c = cur_c + j
+      if 0 <= nb_r < self.num_row and 0 <= nb_c < self.num_col and self.assign[
+          nb_r, nb_c]:
+        loss += LA.norm(mv - self.mf[nb_r, nb_c])
+    return loss
+
+  """
+    search method:
+        cur_r: start row
+        cur_c: start column
+    """
+
+  def search(self, cur_r, cur_c):
+    dist_loss = self.block_dist(cur_r, cur_c, [0, 0], self.metric)
+    nb_loss = self.neighborLoss(cur_r, cur_c, np.array([0, 0]))
+    min_loss = dist_loss + self.beta * nb_loss
+    cur_x = cur_c * self.blk_sz
+    cur_y = cur_r * self.blk_sz
+    ref_x = cur_x
+    ref_y = cur_y
+    #search all validate positions and select the one with minimum distortion
+    # as well as weighted neighbor loss
+    for y in xrange(cur_y - self.wnd_sz, cur_y + self.wnd_sz):
+      for x in xrange(cur_x - self.wnd_sz, cur_x + self.wnd_sz):
+        if 0 <= x < self.width - self.blk_sz and 0 <= y < self.height - self.blk_sz:
+          dist_loss = self.block_dist(cur_r, cur_c, [y - cur_y, x - cur_x],
+                                      self.metric)
+          nb_loss = self.neighborLoss(cur_r, cur_c, [y - cur_y, x - cur_x])
+          loss = dist_loss + self.beta * nb_loss
+          if loss < min_loss:
+            min_loss = loss
+            ref_x = x
+            ref_y = y
+    return ref_x, ref_y
+
+  def motion_field_estimation(self):
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        ref_x, ref_y = self.search(i, j)
+        self.mf[i, j] = np.array(
+            [ref_y - i * self.blk_sz, ref_x - j * self.blk_sz])
+        self.assign[i, j] = True
+
+
+"""Exhaust with Neighbor Constraint and Feature Score"""
+
+
+class ExhaustNeighborFeatureScore(MotionEST):
+  """
+    Constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        wnd_size: search window size
+        beta: neigbor loss weight
+        max_iter: maximum number of iterations
+        metric: metric to compare the blocks distrotion
+    """
+
+  def __init__(self,
+               cur_f,
+               ref_f,
+               blk_size,
+               wnd_size,
+               beta=1,
+               max_iter=100,
+               metric=MSE):
+    self.name = 'exhaust + neighbor+feature score'
+    self.wnd_sz = wnd_size
+    self.beta = beta
+    self.metric = metric
+    self.max_iter = max_iter
+    super(ExhaustNeighborFeatureScore, self).__init__(cur_f, ref_f, blk_size)
+    self.fs = self.getFeatureScore()
+
+  """
+    get feature score of each block
+    """
+
+  def getFeatureScore(self):
+    fs = np.zeros((self.num_row, self.num_col))
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        IxIx = 0
+        IyIy = 0
+        IxIy = 0
+        #get ssd surface
+        for x in xrange(self.blk_sz - 1):
+          for y in xrange(self.blk_sz - 1):
+            ox = c * self.blk_sz + x
+            oy = r * self.blk_sz + y
+            Ix = self.cur_yuv[oy, ox + 1, 0] - self.cur_yuv[oy, ox, 0]
+            Iy = self.cur_yuv[oy + 1, ox, 0] - self.cur_yuv[oy, ox, 0]
+            IxIx += Ix * Ix
+            IyIy += Iy * Iy
+            IxIy += Ix * Iy
+        #get maximum and minimum eigenvalues
+        lambda_max = 0.5 * ((IxIx + IyIy) + np.sqrt(4 * IxIy * IxIy +
+                                                    (IxIx - IyIy)**2))
+        lambda_min = 0.5 * ((IxIx + IyIy) - np.sqrt(4 * IxIy * IxIy +
+                                                    (IxIx - IyIy)**2))
+        fs[r, c] = lambda_max * lambda_min / (1e-6 + lambda_max + lambda_min)
+        if fs[r, c] < 0:
+          fs[r, c] = 0
+    return fs
+
+  """
+    do exhaust search
+    """
+
+  def search(self, cur_r, cur_c):
+    min_loss = self.block_dist(cur_r, cur_c, [0, 0], self.metric)
+    cur_x = cur_c * self.blk_sz
+    cur_y = cur_r * self.blk_sz
+    ref_x = cur_x
+    ref_y = cur_y
+    #search all validate positions and select the one with minimum distortion
+    for y in xrange(cur_y - self.wnd_sz, cur_y + self.wnd_sz):
+      for x in xrange(cur_x - self.wnd_sz, cur_x + self.wnd_sz):
+        if 0 <= x < self.width - self.blk_sz and 0 <= y < self.height - self.blk_sz:
+          loss = self.block_dist(cur_r, cur_c, [y - cur_y, x - cur_x],
+                                 self.metric)
+          if loss < min_loss:
+            min_loss = loss
+            ref_x = x
+            ref_y = y
+    return ref_x, ref_y
+
+  """
+    add smooth constraint
+    """
+
+  def smooth(self, uvs, mvs):
+    sm_uvs = np.zeros(uvs.shape)
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        avg_uv = np.array([0.0, 0.0])
+        for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            avg_uv += uvs[i, j] / 6.0
+        for i, j in {(r - 1, c - 1), (r - 1, c + 1), (r + 1, c - 1),
+                     (r + 1, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            avg_uv += uvs[i, j] / 12.0
+        sm_uvs[r, c] = (self.fs[r, c] * mvs[r, c] + self.beta * avg_uv) / (
+            self.beta + self.fs[r, c])
+    return sm_uvs
+
+  def motion_field_estimation(self):
+    #get matching results
+    mvs = np.zeros(self.mf.shape)
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        ref_x, ref_y = self.search(r, c)
+        mvs[r, c] = np.array([ref_y - r * self.blk_sz, ref_x - c * self.blk_sz])
+    #add smoothness constraint
+    uvs = np.zeros(self.mf.shape)
+    for _ in xrange(self.max_iter):
+      uvs = self.smooth(uvs, mvs)
+    self.mf = uvs
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py
new file mode 100644
index 0000000000..37305898a7
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py
@@ -0,0 +1,48 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+#coding : utf - 8
+import numpy as np
+import numpy.linalg as LA
+from MotionEST import MotionEST
+"""Ground Truth:
+
+  Load in ground truth motion field and mask
+"""
+
+
+class GroundTruth(MotionEST):
+  """constructor:
+
+    cur_f:current
+    frame ref_f:reference
+    frame blk_sz:block size
+    gt_path:ground truth motion field file path
+    """
+
+  def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None):
+    self.name = 'ground truth'
+    super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz)
+    self.mask = np.zeros((self.num_row, self.num_col), dtype=bool)
+    if gt_path:
+      with open(gt_path) as gt_file:
+        lines = gt_file.readlines()
+        for i in xrange(len(lines)):
+          info = lines[i].split(';')
+          for j in xrange(len(info)):
+            x, y = info[j].split(',')
+            #-, - stands for nothing
+            if x == '-' or y == '-':
+              self.mask[i, -j - 1] = True
+              continue
+            #the order of original file is flipped on the x axis
+            self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int)
+    else:
+      self.mf = mf
+      self.mask = mask
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py
new file mode 100644
index 0000000000..976bd4a178
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py
@@ -0,0 +1,212 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# coding: utf-8
+import numpy as np
+import numpy.linalg as LA
+from scipy.ndimage.filters import gaussian_filter
+from scipy.sparse import csc_matrix
+from scipy.sparse.linalg import inv
+from MotionEST import MotionEST
+"""Horn & Schunck Model"""
+
+
+class HornSchunck(MotionEST):
+  """
+    constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        alpha: smooth constrain weight
+        sigma: gaussian blur parameter
+    """
+
+  def __init__(self, cur_f, ref_f, blk_sz, alpha, sigma, max_iter=100):
+    super(HornSchunck, self).__init__(cur_f, ref_f, blk_sz)
+    self.cur_I, self.ref_I = self.getIntensity()
+    #perform gaussian blur to smooth the intensity
+    self.cur_I = gaussian_filter(self.cur_I, sigma=sigma)
+    self.ref_I = gaussian_filter(self.ref_I, sigma=sigma)
+    self.alpha = alpha
+    self.max_iter = max_iter
+    self.Ix, self.Iy, self.It = self.intensityDiff()
+
+  """
+    Build Frame Intensity
+    """
+
+  def getIntensity(self):
+    cur_I = np.zeros((self.num_row, self.num_col))
+    ref_I = np.zeros((self.num_row, self.num_col))
+    #use average intensity as block's intensity
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        r = i * self.blk_sz
+        c = j * self.blk_sz
+        cur_I[i, j] = np.mean(self.cur_yuv[r:r + self.blk_sz, c:c + self.blk_sz,
+                                           0])
+        ref_I[i, j] = np.mean(self.ref_yuv[r:r + self.blk_sz, c:c + self.blk_sz,
+                                           0])
+    return cur_I, ref_I
+
+  """
+    Get First Order Derivative
+    """
+
+  def intensityDiff(self):
+    Ix = np.zeros((self.num_row, self.num_col))
+    Iy = np.zeros((self.num_row, self.num_col))
+    It = np.zeros((self.num_row, self.num_col))
+    sz = self.blk_sz
+    for i in xrange(self.num_row - 1):
+      for j in xrange(self.num_col - 1):
+        """
+                Ix:
+                (i  ,j) <--- (i  ,j+1)
+                (i+1,j) <--- (i+1,j+1)
+                """
+        count = 0
+        for r, c in {(i, j + 1), (i + 1, j + 1)}:
+          if 0 <= r < self.num_row and 0 < c < self.num_col:
+            Ix[i, j] += (
+                self.cur_I[r, c] - self.cur_I[r, c - 1] + self.ref_I[r, c] -
+                self.ref_I[r, c - 1])
+            count += 2
+        Ix[i, j] /= count
+        """
+                Iy:
+                (i  ,j)      (i  ,j+1)
+                   ^             ^
+                   |             |
+                (i+1,j)      (i+1,j+1)
+                """
+        count = 0
+        for r, c in {(i + 1, j), (i + 1, j + 1)}:
+          if 0 < r < self.num_row and 0 <= c < self.num_col:
+            Iy[i, j] += (
+                self.cur_I[r, c] - self.cur_I[r - 1, c] + self.ref_I[r, c] -
+                self.ref_I[r - 1, c])
+            count += 2
+        Iy[i, j] /= count
+        count = 0
+        #It:
+        for r in xrange(i, i + 2):
+          for c in xrange(j, j + 2):
+            if 0 <= r < self.num_row and 0 <= c < self.num_col:
+              It[i, j] += (self.ref_I[r, c] - self.cur_I[r, c])
+              count += 1
+        It[i, j] /= count
+    return Ix, Iy, It
+
+  """
+    Get weighted average of neighbor motion vectors
+    for evaluation of laplacian
+    """
+
+  def averageMV(self):
+    avg = np.zeros((self.num_row, self.num_col, 2))
+    """
+        1/12 ---  1/6 --- 1/12
+         |         |       |
+        1/6  --- -1/8 --- 1/6
+         |         |       |
+        1/12 ---  1/6 --- 1/12
+        """
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        for r, c in {(-1, 0), (1, 0), (0, -1), (0, 1)}:
+          if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col:
+            avg[i, j] += self.mf[i + r, j + c] / 6.0
+        for r, c in {(-1, -1), (-1, 1), (1, -1), (1, 1)}:
+          if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col:
+            avg[i, j] += self.mf[i + r, j + c] / 12.0
+    return avg
+
+  def motion_field_estimation(self):
+    count = 0
+    """
+        u_{n+1} = ~u_n - Ix(Ix.~u_n+Iy.~v+It)/(IxIx+IyIy+alpha^2)
+        v_{n+1} = ~v_n - Iy(Ix.~u_n+Iy.~v+It)/(IxIx+IyIy+alpha^2)
+        """
+    denom = self.alpha**2 + np.power(self.Ix, 2) + np.power(self.Iy, 2)
+    while count < self.max_iter:
+      avg = self.averageMV()
+      self.mf[:, :, 1] = avg[:, :, 1] - self.Ix * (
+          self.Ix * avg[:, :, 1] + self.Iy * avg[:, :, 0] + self.It) / denom
+      self.mf[:, :, 0] = avg[:, :, 0] - self.Iy * (
+          self.Ix * avg[:, :, 1] + self.Iy * avg[:, :, 0] + self.It) / denom
+      count += 1
+    self.mf *= self.blk_sz
+
+  def motion_field_estimation_mat(self):
+    row_idx = []
+    col_idx = []
+    data = []
+
+    N = 2 * self.num_row * self.num_col
+    b = np.zeros((N, 1))
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        """(IxIx+alpha^2)u+IxIy.v-alpha^2~u IxIy.u+(IyIy+alpha^2)v-alpha^2~v"""
+        u_idx = i * 2 * self.num_col + 2 * j
+        v_idx = u_idx + 1
+        b[u_idx, 0] = -self.Ix[i, j] * self.It[i, j]
+        b[v_idx, 0] = -self.Iy[i, j] * self.It[i, j]
+        #u: (IxIx+alpha^2)u
+        row_idx.append(u_idx)
+        col_idx.append(u_idx)
+        data.append(self.Ix[i, j] * self.Ix[i, j] + self.alpha**2)
+        #IxIy.v
+        row_idx.append(u_idx)
+        col_idx.append(v_idx)
+        data.append(self.Ix[i, j] * self.Iy[i, j])
+
+        #v: IxIy.u
+        row_idx.append(v_idx)
+        col_idx.append(u_idx)
+        data.append(self.Ix[i, j] * self.Iy[i, j])
+        #(IyIy+alpha^2)v
+        row_idx.append(v_idx)
+        col_idx.append(v_idx)
+        data.append(self.Iy[i, j] * self.Iy[i, j] + self.alpha**2)
+
+        #-alpha^2~u
+        #-alpha^2~v
+        for r, c in {(-1, 0), (1, 0), (0, -1), (0, 1)}:
+          if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col:
+            u_nb = (i + r) * 2 * self.num_col + 2 * (j + c)
+            v_nb = u_nb + 1
+
+            row_idx.append(u_idx)
+            col_idx.append(u_nb)
+            data.append(-1 * self.alpha**2 / 6.0)
+
+            row_idx.append(v_idx)
+            col_idx.append(v_nb)
+            data.append(-1 * self.alpha**2 / 6.0)
+        for r, c in {(-1, -1), (-1, 1), (1, -1), (1, 1)}:
+          if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col:
+            u_nb = (i + r) * 2 * self.num_col + 2 * (j + c)
+            v_nb = u_nb + 1
+
+            row_idx.append(u_idx)
+            col_idx.append(u_nb)
+            data.append(-1 * self.alpha**2 / 12.0)
+
+            row_idx.append(v_idx)
+            col_idx.append(v_nb)
+            data.append(-1 * self.alpha**2 / 12.0)
+    M = csc_matrix((data, (row_idx, col_idx)), shape=(N, N))
+    M_inv = inv(M)
+    uv = M_inv.dot(b)
+
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        self.mf[i, j, 0] = uv[i * 2 * self.num_col + 2 * j + 1, 0] * self.blk_sz
+        self.mf[i, j, 1] = uv[i * 2 * self.num_col + 2 * j, 0] * self.blk_sz
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py
new file mode 100644
index 0000000000..fc393818d9
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py
@@ -0,0 +1,117 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+#coding : utf - 8
+import numpy as np
+import numpy.linalg as LA
+import matplotlib.pyplot as plt
+from Util import drawMF, MSE
+"""The Base Class of Estimators"""
+
+
+class MotionEST(object):
+  """
+    constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+    """
+
+  def __init__(self, cur_f, ref_f, blk_sz):
+    self.cur_f = cur_f
+    self.ref_f = ref_f
+    self.blk_sz = blk_sz
+    #convert RGB to YUV
+    self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int)
+    self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int)
+    #frame size
+    self.width = self.cur_f.size[0]
+    self.height = self.cur_f.size[1]
+    #motion field size
+    self.num_row = self.height // self.blk_sz
+    self.num_col = self.width // self.blk_sz
+    #initialize motion field
+    self.mf = np.zeros((self.num_row, self.num_col, 2))
+
+  """estimation function Override by child classes"""
+
+  def motion_field_estimation(self):
+    pass
+
+  """
+    distortion of a block:
+      cur_r: current row
+      cur_c: current column
+      mv: motion vector
+      metric: distortion metric
+  """
+
+  def block_dist(self, cur_r, cur_c, mv, metric=MSE):
+    cur_x = cur_c * self.blk_sz
+    cur_y = cur_r * self.blk_sz
+    h = min(self.blk_sz, self.height - cur_y)
+    w = min(self.blk_sz, self.width - cur_x)
+    cur_blk = self.cur_yuv[cur_y:cur_y + h, cur_x:cur_x + w, :]
+    ref_x = int(cur_x + mv[1])
+    ref_y = int(cur_y + mv[0])
+    if 0 <= ref_x < self.width - w and 0 <= ref_y < self.height - h:
+      ref_blk = self.ref_yuv[ref_y:ref_y + h, ref_x:ref_x + w, :]
+    else:
+      ref_blk = np.zeros((h, w, 3))
+    return metric(cur_blk, ref_blk)
+
+  """
+    distortion of motion field
+  """
+
+  def distortion(self, mask=None, metric=MSE):
+    loss = 0
+    count = 0
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        if mask is not None and mask[i, j]:
+          continue
+        loss += self.block_dist(i, j, self.mf[i, j], metric)
+        count += 1
+    return loss / count
+
+  """evaluation compare the difference with ground truth"""
+
+  def motion_field_evaluation(self, ground_truth):
+    loss = 0
+    count = 0
+    gt = ground_truth.mf
+    mask = ground_truth.mask
+    for i in xrange(self.num_row):
+      for j in xrange(self.num_col):
+        if mask is not None and mask[i][j]:
+          continue
+        loss += LA.norm(gt[i, j] - self.mf[i, j])
+        count += 1
+    return loss / count
+
+  """render the motion field"""
+
+  def show(self, ground_truth=None, size=10):
+    cur_mf = drawMF(self.cur_f, self.blk_sz, self.mf)
+    if ground_truth is None:
+      n_row = 1
+    else:
+      gt_mf = drawMF(self.cur_f, self.blk_sz, ground_truth)
+      n_row = 2
+    plt.figure(figsize=(n_row * size, size * self.height / self.width))
+    plt.subplot(1, n_row, 1)
+    plt.imshow(cur_mf)
+    plt.title('Estimated Motion Field')
+    if ground_truth is not None:
+      plt.subplot(1, n_row, 2)
+      plt.imshow(gt_mf)
+      plt.title('Ground Truth')
+    plt.tight_layout()
+    plt.show()
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py
new file mode 100644
index 0000000000..2dc6771ee5
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py
@@ -0,0 +1,221 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# coding: utf-8
+import numpy as np
+import numpy.linalg as LA
+from Util import MSE
+from MotionEST import MotionEST
+"""Search & Smooth Model with Adapt Weights"""
+
+
+class SearchSmoothAdapt(MotionEST):
+  """
+    Constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        wnd_size: search window size
+        beta: neigbor loss weight
+        max_iter: maximum number of iterations
+        metric: metric to compare the blocks distrotion
+    """
+
+  def __init__(self, cur_f, ref_f, blk_size, search, max_iter=100):
+    self.search = search
+    self.max_iter = max_iter
+    super(SearchSmoothAdapt, self).__init__(cur_f, ref_f, blk_size)
+
+  """
+    get local diffiencial of refernce
+    """
+
+  def getRefLocalDiff(self, mvs):
+    m, n = self.num_row, self.num_col
+    localDiff = [[] for _ in xrange(m)]
+    blk_sz = self.blk_sz
+    for r in xrange(m):
+      for c in xrange(n):
+        I_row = 0
+        I_col = 0
+        #get ssd surface
+        count = 0
+        center = self.cur_yuv[r * blk_sz:(r + 1) * blk_sz,
+                              c * blk_sz:(c + 1) * blk_sz, 0]
+        ty = np.clip(r * blk_sz + int(mvs[r, c, 0]), 0, self.height - blk_sz)
+        tx = np.clip(c * blk_sz + int(mvs[r, c, 1]), 0, self.width - blk_sz)
+        target = self.ref_yuv[ty:ty + blk_sz, tx:tx + blk_sz, 0]
+        for y, x in {(ty - blk_sz, tx), (ty + blk_sz, tx)}:
+          if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz:
+            nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0]
+            I_row += np.sum(np.abs(nb - center)) - np.sum(
+                np.abs(target - center))
+            count += 1
+        I_row //= (count * blk_sz * blk_sz)
+        count = 0
+        for y, x in {(ty, tx - blk_sz), (ty, tx + blk_sz)}:
+          if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz:
+            nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0]
+            I_col += np.sum(np.abs(nb - center)) - np.sum(
+                np.abs(target - center))
+            count += 1
+        I_col //= (count * blk_sz * blk_sz)
+        localDiff[r].append(
+            np.array([[I_row * I_row, I_row * I_col],
+                      [I_col * I_row, I_col * I_col]]))
+    return localDiff
+
+  """
+    add smooth constraint
+    """
+
+  def smooth(self, uvs, mvs):
+    sm_uvs = np.zeros(uvs.shape)
+    blk_sz = self.blk_sz
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        nb_uv = np.array([0.0, 0.0])
+        for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            nb_uv += uvs[i, j] / 6.0
+          else:
+            nb_uv += uvs[r, c] / 6.0
+        for i, j in {(r - 1, c - 1), (r - 1, c + 1), (r + 1, c - 1),
+                     (r + 1, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            nb_uv += uvs[i, j] / 12.0
+          else:
+            nb_uv += uvs[r, c] / 12.0
+        ssd_nb = self.block_dist(r, c, self.blk_sz * nb_uv)
+        mv = mvs[r, c]
+        ssd_mv = self.block_dist(r, c, mv)
+        alpha = (ssd_nb - ssd_mv) / (ssd_mv + 1e-6)
+        M = alpha * self.localDiff[r][c]
+        P = M + np.identity(2)
+        inv_P = LA.inv(P)
+        sm_uvs[r, c] = np.dot(inv_P, nb_uv) + np.dot(
+            np.matmul(inv_P, M), mv / blk_sz)
+    return sm_uvs
+
+  def block_matching(self):
+    self.search.motion_field_estimation()
+
+  def motion_field_estimation(self):
+    self.localDiff = self.getRefLocalDiff(self.search.mf)
+    #get matching results
+    mvs = self.search.mf
+    #add smoothness constraint
+    uvs = mvs / self.blk_sz
+    for _ in xrange(self.max_iter):
+      uvs = self.smooth(uvs, mvs)
+    self.mf = uvs * self.blk_sz
+
+
+"""Search & Smooth Model with Fixed Weights"""
+
+
+class SearchSmoothFix(MotionEST):
+  """
+    Constructor:
+        cur_f: current frame
+        ref_f: reference frame
+        blk_sz: block size
+        wnd_size: search window size
+        beta: neigbor loss weight
+        max_iter: maximum number of iterations
+        metric: metric to compare the blocks distrotion
+    """
+
+  def __init__(self, cur_f, ref_f, blk_size, search, beta, max_iter=100):
+    self.search = search
+    self.max_iter = max_iter
+    self.beta = beta
+    super(SearchSmoothFix, self).__init__(cur_f, ref_f, blk_size)
+
+  """
+    get local diffiencial of refernce
+    """
+
+  def getRefLocalDiff(self, mvs):
+    m, n = self.num_row, self.num_col
+    localDiff = [[] for _ in xrange(m)]
+    blk_sz = self.blk_sz
+    for r in xrange(m):
+      for c in xrange(n):
+        I_row = 0
+        I_col = 0
+        #get ssd surface
+        count = 0
+        center = self.cur_yuv[r * blk_sz:(r + 1) * blk_sz,
+                              c * blk_sz:(c + 1) * blk_sz, 0]
+        ty = np.clip(r * blk_sz + int(mvs[r, c, 0]), 0, self.height - blk_sz)
+        tx = np.clip(c * blk_sz + int(mvs[r, c, 1]), 0, self.width - blk_sz)
+        target = self.ref_yuv[ty:ty + blk_sz, tx:tx + blk_sz, 0]
+        for y, x in {(ty - blk_sz, tx), (ty + blk_sz, tx)}:
+          if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz:
+            nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0]
+            I_row += np.sum(np.abs(nb - center)) - np.sum(
+                np.abs(target - center))
+            count += 1
+        I_row //= (count * blk_sz * blk_sz)
+        count = 0
+        for y, x in {(ty, tx - blk_sz), (ty, tx + blk_sz)}:
+          if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz:
+            nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0]
+            I_col += np.sum(np.abs(nb - center)) - np.sum(
+                np.abs(target - center))
+            count += 1
+        I_col //= (count * blk_sz * blk_sz)
+        localDiff[r].append(
+            np.array([[I_row * I_row, I_row * I_col],
+                      [I_col * I_row, I_col * I_col]]))
+    return localDiff
+
+  """
+    add smooth constraint
+    """
+
+  def smooth(self, uvs, mvs):
+    sm_uvs = np.zeros(uvs.shape)
+    blk_sz = self.blk_sz
+    for r in xrange(self.num_row):
+      for c in xrange(self.num_col):
+        nb_uv = np.array([0.0, 0.0])
+        for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            nb_uv += uvs[i, j] / 6.0
+          else:
+            nb_uv += uvs[r, c] / 6.0
+        for i, j in {(r - 1, c - 1), (r - 1, c + 1), (r + 1, c - 1),
+                     (r + 1, c + 1)}:
+          if 0 <= i < self.num_row and 0 <= j < self.num_col:
+            nb_uv += uvs[i, j] / 12.0
+          else:
+            nb_uv += uvs[r, c] / 12.0
+        mv = mvs[r, c] / blk_sz
+        M = self.localDiff[r][c]
+        P = M + self.beta * np.identity(2)
+        inv_P = LA.inv(P)
+        sm_uvs[r, c] = np.dot(inv_P, self.beta * nb_uv) + np.dot(
+            np.matmul(inv_P, M), mv)
+    return sm_uvs
+
+  def block_matching(self):
+    self.search.motion_field_estimation()
+
+  def motion_field_estimation(self):
+    #get local structure
+    self.localDiff = self.getRefLocalDiff(self.search.mf)
+    #get matching results
+    mvs = self.search.mf
+    #add smoothness constraint
+    uvs = mvs / self.blk_sz
+    for _ in xrange(self.max_iter):
+      uvs = self.smooth(uvs, mvs)
+    self.mf = uvs * self.blk_sz
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py
new file mode 100644
index 0000000000..c2416163be
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py
@@ -0,0 +1,46 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+# coding: utf-8
+import numpy as np
+import numpy.linalg as LA
+import matplotlib.pyplot as plt
+from scipy.ndimage import filters
+from PIL import Image, ImageDraw
+
+
+def MSE(blk1, blk2):
+  return np.mean(
+      LA.norm(
+          np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2))
+
+
+def drawMF(img, blk_sz, mf):
+  img_rgba = img.convert('RGBA')
+  mf_layer = Image.new(mode='RGBA', size=img_rgba.size, color=(0, 0, 0, 0))
+  draw = ImageDraw.Draw(mf_layer)
+  width = img_rgba.size[0]
+  height = img_rgba.size[1]
+  num_row = height // blk_sz
+  num_col = width // blk_sz
+  for i in xrange(num_row):
+    left = (0, i * blk_sz)
+    right = (width, i * blk_sz)
+    draw.line([left, right], fill=(0, 0, 255, 255))
+  for j in xrange(num_col):
+    up = (j * blk_sz, 0)
+    down = (j * blk_sz, height)
+    draw.line([up, down], fill=(0, 0, 255, 255))
+  for i in xrange(num_row):
+    for j in xrange(num_col):
+      center = (j * blk_sz + 0.5 * blk_sz, i * blk_sz + 0.5 * blk_sz)
+      """mf[i,j][0] is the row shift and mf[i,j][1] is the column shift In PIL coordinates, head[0] is x (column shift) and head[1] is y (row shift)."""
+      head = (center[0] + mf[i, j][1], center[1] + mf[i, j][0])
+      draw.line([center, head], fill=(255, 0, 0, 255))
+  return Image.alpha_composite(img_rgba, mf_layer)
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py b/media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py
new file mode 100644
index 0000000000..8028102f0c
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py
@@ -0,0 +1,85 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+import argparse
+from os import listdir, path
+from PIL import Image
+import sys
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--frame_path", default="../data/frame/", type=str)
+parser.add_argument("--frame_rate", default="25:1", type=str)
+parser.add_argument("--interlacing", default="Ip", type=str)
+parser.add_argument("--pix_ratio", default="0:0", type=str)
+parser.add_argument("--color_space", default="4:2:0", type=str)
+parser.add_argument("--output", default="output.y4m", type=str)
+
+
+def generate(args, frames):
+  if len(frames) == 0:
+    return
+  #sort the frames based on the frame index
+  frames = sorted(frames, key=lambda x: x[0])
+  #convert the frames to YUV form
+  frames = [f.convert("YCbCr") for _, f in frames]
+  #write the header
+  header = "YUV4MPEG2 W%d H%d F%s %s A%s" % (frames[0].width, frames[0].height,
+                                             args.frame_rate, args.interlacing,
+                                             args.pix_ratio)
+  cs = args.color_space.split(":")
+  header += " C%s%s%s\n" % (cs[0], cs[1], cs[2])
+  #estimate the sample step based on subsample value
+  subsamples = [int(c) for c in cs]
+  r_step = [1, int(subsamples[2] == 0) + 1, int(subsamples[2] == 0) + 1]
+  c_step = [1, 4 // subsamples[1], 4 // subsamples[1]]
+  #write in frames
+  with open(args.output, "wb") as y4m:
+    y4m.write(header)
+    for f in frames:
+      y4m.write("FRAME\n")
+      px = f.load()
+      for k in xrange(3):
+        for i in xrange(0, f.height, r_step[k]):
+          for j in xrange(0, f.width, c_step[k]):
+            yuv = px[j, i]
+            y4m.write(chr(yuv[k]))
+
+
+if __name__ == "__main__":
+  args = parser.parse_args()
+  frames = []
+  frames_mv = []
+  for filename in listdir(args.frame_path):
+    name, ext = filename.split(".")
+    if ext == "png":
+      name_parse = name.split("_")
+      idx = int(name_parse[-1])
+      img = Image.open(path.join(args.frame_path, filename))
+      if name_parse[-2] == "mv":
+        frames_mv.append((idx, img))
+      else:
+        frames.append((idx, img))
+  if len(frames) == 0:
+    print("No frames in directory: " + args.frame_path)
+    sys.exit()
+  print("----------------------Y4M Info----------------------")
+  print("width:  %d" % frames[0][1].width)
+  print("height: %d" % frames[0][1].height)
+  print("#frame: %d" % len(frames))
+  print("frame rate: %s" % args.frame_rate)
+  print("interlacing: %s" % args.interlacing)
+  print("pixel ratio: %s" % args.pix_ratio)
+  print("color space: %s" % args.color_space)
+  print("----------------------------------------------------")
+
+  print("Generating ...")
+  generate(args, frames)
+  if len(frames_mv) != 0:
+    args.output = args.output.replace(".y4m", "_mv.y4m")
+    generate(args, frames_mv)
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde
new file mode 100644
index 0000000000..7249ee972e
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde
@@ -0,0 +1,163 @@
+/*
+ *AABB bounding box
+ *Bouding Volume Hierarchy
+ */
+class BoundingBox {
+  float min_x, min_y, min_z, max_x, max_y, max_z;
+  PVector center;
+  BoundingBox() {
+    min_x = Float.POSITIVE_INFINITY;
+    min_y = Float.POSITIVE_INFINITY;
+    min_z = Float.POSITIVE_INFINITY;
+    max_x = Float.NEGATIVE_INFINITY;
+    max_y = Float.NEGATIVE_INFINITY;
+    max_z = Float.NEGATIVE_INFINITY;
+    center = new PVector();
+  }
+  // build a bounding box for a triangle
+  void create(Triangle t) {
+    min_x = min(t.p1.x, min(t.p2.x, t.p3.x));
+    max_x = max(t.p1.x, max(t.p2.x, t.p3.x));
+
+    min_y = min(t.p1.y, min(t.p2.y, t.p3.y));
+    max_y = max(t.p1.y, max(t.p2.y, t.p3.y));
+
+    min_z = min(t.p1.z, min(t.p2.z, t.p3.z));
+    max_z = max(t.p1.z, max(t.p2.z, t.p3.z));
+    center.x = (max_x + min_x) / 2;
+    center.y = (max_y + min_y) / 2;
+    center.z = (max_z + min_z) / 2;
+  }
+  // merge two bounding boxes
+  void add(BoundingBox bbx) {
+    min_x = min(min_x, bbx.min_x);
+    min_y = min(min_y, bbx.min_y);
+    min_z = min(min_z, bbx.min_z);
+
+    max_x = max(max_x, bbx.max_x);
+    max_y = max(max_y, bbx.max_y);
+    max_z = max(max_z, bbx.max_z);
+    center.x = (max_x + min_x) / 2;
+    center.y = (max_y + min_y) / 2;
+    center.z = (max_z + min_z) / 2;
+  }
+  // get bounding box center axis value
+  float getCenterAxisValue(int axis) {
+    if (axis == 1) {
+      return center.x;
+    } else if (axis == 2) {
+      return center.y;
+    }
+    // when axis == 3
+    return center.z;
+  }
+  // check if a ray is intersected with the bounding box
+  boolean intersect(Ray r) {
+    float tmin, tmax;
+    if (r.dir.x >= 0) {
+      tmin = (min_x - r.ori.x) * (1.0f / r.dir.x);
+      tmax = (max_x - r.ori.x) * (1.0f / r.dir.x);
+    } else {
+      tmin = (max_x - r.ori.x) * (1.0f / r.dir.x);
+      tmax = (min_x - r.ori.x) * (1.0f / r.dir.x);
+    }
+
+    float tymin, tymax;
+    if (r.dir.y >= 0) {
+      tymin = (min_y - r.ori.y) * (1.0f / r.dir.y);
+      tymax = (max_y - r.ori.y) * (1.0f / r.dir.y);
+    } else {
+      tymin = (max_y - r.ori.y) * (1.0f / r.dir.y);
+      tymax = (min_y - r.ori.y) * (1.0f / r.dir.y);
+    }
+
+    if (tmax < tymin || tymax < tmin) {
+      return false;
+    }
+
+    tmin = tmin < tymin ? tymin : tmin;
+    tmax = tmax > tymax ? tymax : tmax;
+
+    float tzmin, tzmax;
+    if (r.dir.z >= 0) {
+      tzmin = (min_z - r.ori.z) * (1.0f / r.dir.z);
+      tzmax = (max_z - r.ori.z) * (1.0f / r.dir.z);
+    } else {
+      tzmin = (max_z - r.ori.z) * (1.0f / r.dir.z);
+      tzmax = (min_z - r.ori.z) * (1.0f / r.dir.z);
+    }
+    if (tmax < tzmin || tmin > tzmax) {
+      return false;
+    }
+    return true;
+  }
+}
+// Bounding Volume Hierarchy
+class BVH {
+  // Binary Tree
+  BVH left, right;
+  BoundingBox overall_bbx;
+  ArrayList<Triangle> mesh;
+  BVH(ArrayList<Triangle> mesh) {
+    this.mesh = mesh;
+    overall_bbx = new BoundingBox();
+    left = null;
+    right = null;
+    int mesh_size = this.mesh.size();
+    if (mesh_size <= 1) {
+      return;
+    }
+    // random select an axis
+    int axis = int(random(100)) % 3 + 1;
+    // build bounding box and save the selected center component
+    float[] axis_values = new float[mesh_size];
+    for (int i = 0; i < mesh_size; i++) {
+      Triangle t = this.mesh.get(i);
+      overall_bbx.add(t.bbx);
+      axis_values[i] = t.bbx.getCenterAxisValue(axis);
+    }
+    // find the median value of selected center component as pivot
+    axis_values = sort(axis_values);
+    float pivot;
+    if (mesh_size % 2 == 1) {
+      pivot = axis_values[mesh_size / 2];
+    } else {
+      pivot =
+          0.5f * (axis_values[mesh_size / 2 - 1] + axis_values[mesh_size / 2]);
+    }
+    // Build left node and right node by partitioning the mesh based on triangle
+    // bounding box center component value
+    ArrayList<Triangle> left_mesh = new ArrayList<Triangle>();
+    ArrayList<Triangle> right_mesh = new ArrayList<Triangle>();
+    for (int i = 0; i < mesh_size; i++) {
+      Triangle t = this.mesh.get(i);
+      if (t.bbx.getCenterAxisValue(axis) < pivot) {
+        left_mesh.add(t);
+      } else if (t.bbx.getCenterAxisValue(axis) > pivot) {
+        right_mesh.add(t);
+      } else if (left_mesh.size() < right_mesh.size()) {
+        left_mesh.add(t);
+      } else {
+        right_mesh.add(t);
+      }
+    }
+    left = new BVH(left_mesh);
+    right = new BVH(right_mesh);
+  }
+  // check if a ray intersect with current volume
+  boolean intersect(Ray r, float[] param) {
+    if (mesh.size() == 0) {
+      return false;
+    }
+    if (mesh.size() == 1) {
+      Triangle t = mesh.get(0);
+      return t.intersect(r, param);
+    }
+    if (!overall_bbx.intersect(r)) {
+      return false;
+    }
+    boolean left_res = left.intersect(r, param);
+    boolean right_res = right.intersect(r, param);
+    return left_res || right_res;
+  }
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde
new file mode 100644
index 0000000000..b39dae3a19
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde
@@ -0,0 +1,138 @@
+class Camera {
+  // camera's field of view
+  float fov;
+  // camera's position, look at point and axis
+  PVector pos, center, axis;
+  PVector init_pos, init_center, init_axis;
+  float move_speed;
+  float rot_speed;
+  Camera(float fov, PVector pos, PVector center, PVector axis) {
+    this.fov = fov;
+    this.pos = pos;
+    this.center = center;
+    this.axis = axis;
+    this.axis.normalize();
+    move_speed = 0.001;
+    rot_speed = 0.01 * PI;
+    init_pos = pos.copy();
+    init_center = center.copy();
+    init_axis = axis.copy();
+  }
+
+  Camera copy() {
+    Camera cam = new Camera(fov, pos.copy(), center.copy(), axis.copy());
+    return cam;
+  }
+
+  PVector project(PVector pos) {
+    PVector proj = MatxVec3(getCameraMat(), PVector.sub(pos, this.pos));
+    proj.x = (float)height / 2.0 * proj.x / proj.z / tan(fov / 2.0f);
+    proj.y = (float)height / 2.0 * proj.y / proj.z / tan(fov / 2.0f);
+    proj.z = proj.z;
+    return proj;
+  }
+
+  float[] getCameraMat() {
+    float[] mat = new float[9];
+    PVector dir = PVector.sub(center, pos);
+    dir.normalize();
+    PVector left = dir.cross(axis);
+    left.normalize();
+    // processing camera system does not follow right hand rule
+    mat[0] = -left.x;
+    mat[1] = -left.y;
+    mat[2] = -left.z;
+    mat[3] = axis.x;
+    mat[4] = axis.y;
+    mat[5] = axis.z;
+    mat[6] = dir.x;
+    mat[7] = dir.y;
+    mat[8] = dir.z;
+
+    return mat;
+  }
+
+  void run() {
+    PVector dir, left;
+    if (mousePressed) {
+      float angleX = (float)mouseX / width * PI - PI / 2;
+      float angleY = (float)mouseY / height * PI - PI;
+      PVector diff = PVector.sub(center, pos);
+      float radius = diff.mag();
+      pos.x = radius * sin(angleY) * sin(angleX) + center.x;
+      pos.y = radius * cos(angleY) + center.y;
+      pos.z = radius * sin(angleY) * cos(angleX) + center.z;
+      dir = PVector.sub(center, pos);
+      dir.normalize();
+      PVector up = new PVector(0, 1, 0);
+      left = up.cross(dir);
+      left.normalize();
+      axis = dir.cross(left);
+      axis.normalize();
+    }
+
+    if (keyPressed) {
+      switch (key) {
+        case 'w':
+          dir = PVector.sub(center, pos);
+          dir.normalize();
+          pos = PVector.add(pos, PVector.mult(dir, move_speed));
+          center = PVector.add(center, PVector.mult(dir, move_speed));
+          break;
+        case 's':
+          dir = PVector.sub(center, pos);
+          dir.normalize();
+          pos = PVector.sub(pos, PVector.mult(dir, move_speed));
+          center = PVector.sub(center, PVector.mult(dir, move_speed));
+          break;
+        case 'a':
+          dir = PVector.sub(center, pos);
+          dir.normalize();
+          left = axis.cross(dir);
+          left.normalize();
+          pos = PVector.add(pos, PVector.mult(left, move_speed));
+          center = PVector.add(center, PVector.mult(left, move_speed));
+          break;
+        case 'd':
+          dir = PVector.sub(center, pos);
+          dir.normalize();
+          left = axis.cross(dir);
+          left.normalize();
+          pos = PVector.sub(pos, PVector.mult(left, move_speed));
+          center = PVector.sub(center, PVector.mult(left, move_speed));
+          break;
+        case 'r':
+          dir = PVector.sub(center, pos);
+          dir.normalize();
+          float[] mat = getRotationMat3x3(rot_speed, dir.x, dir.y, dir.z);
+          axis = MatxVec3(mat, axis);
+          axis.normalize();
+          break;
+        case 'b':
+          pos = init_pos.copy();
+          center = init_center.copy();
+          axis = init_axis.copy();
+          break;
+        case '+': move_speed *= 2.0f; break;
+        case '-': move_speed /= 2.0; break;
+        case CODED:
+          if (keyCode == UP) {
+            pos = PVector.add(pos, PVector.mult(axis, move_speed));
+            center = PVector.add(center, PVector.mult(axis, move_speed));
+          } else if (keyCode == DOWN) {
+            pos = PVector.sub(pos, PVector.mult(axis, move_speed));
+            center = PVector.sub(center, PVector.mult(axis, move_speed));
+          }
+      }
+    }
+  }
+  void open() {
+    perspective(fov, float(width) / height, 1e-6, 1e5);
+    camera(pos.x, pos.y, pos.z, center.x, center.y, center.z, axis.x, axis.y,
+           axis.z);
+  }
+  void close() {
+    ortho(-width, 0, -height, 0);
+    camera(0, 0, 0, 0, 0, 1, 0, 1, 0);
+  }
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde
new file mode 100644
index 0000000000..a5e04b6a92
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde
@@ -0,0 +1,102 @@
+class MotionField {
+  int block_size;
+  ArrayList<PVector> motion_field;
+  MotionField(int block_size) {
+    this.block_size = block_size;
+    motion_field = new ArrayList<PVector>();
+  }
+
+  void update(Camera last_cam, Camera current_cam, PointCloud point_cloud,
+              BVH bvh) {
+    // clear motion field
+    motion_field = new ArrayList<PVector>();
+    int r_num = height / block_size, c_num = width / block_size;
+    for (int i = 0; i < r_num * c_num; i++)
+      motion_field.add(new PVector(0, 0, 0));
+    // estimate motion vector of each point in point cloud
+    for (int i = 0; i < point_cloud.size(); i++) {
+      PVector p = point_cloud.getPosition(i);
+      PVector p0 = current_cam.project(p);
+      PVector p1 = last_cam.project(p);
+      int row = int((p0.y + height / 2.0f) / block_size);
+      int col = int((p0.x + width / 2.0f) / block_size);
+      if (row >= 0 && row < r_num && col >= 0 && col < c_num) {
+        PVector accu = motion_field.get(row * c_num + col);
+        accu.x += p1.x - p0.x;
+        accu.y += p1.y - p0.y;
+        accu.z += 1;
+      }
+    }
+    // if some blocks do not have point, then use ray tracing to see if they are
+    // in triangles
+    for (int i = 0; i < r_num; i++)
+      for (int j = 0; j < c_num; j++) {
+        PVector accu = motion_field.get(i * c_num + j);
+        if (accu.z > 0) {
+          continue;
+        }
+        // use the center of the block to generate view ray
+        float cx = j * block_size + block_size / 2.0f - width / 2.0f;
+        float cy = i * block_size + block_size / 2.0f - height / 2.0f;
+        float cz = 0.5f * height / tan(current_cam.fov / 2.0f);
+        PVector dir = new PVector(cx, cy, cz);
+        float[] camMat = current_cam.getCameraMat();
+        dir = MatxVec3(transpose3x3(camMat), dir);
+        dir.normalize();
+        Ray r = new Ray(current_cam.pos, dir);
+        // ray tracing
+        float[] param = new float[4];
+        param[0] = Float.POSITIVE_INFINITY;
+        if (bvh.intersect(r, param)) {
+          PVector p = new PVector(param[1], param[2], param[3]);
+          PVector p0 = current_cam.project(p);
+          PVector p1 = last_cam.project(p);
+          accu.x += p1.x - p0.x;
+          accu.y += p1.y - p0.y;
+          accu.z += 1;
+        }
+      }
+    // estimate the motion vector of each block
+    for (int i = 0; i < r_num * c_num; i++) {
+      PVector mv = motion_field.get(i);
+      if (mv.z > 0) {
+        motion_field.set(i, new PVector(mv.x / mv.z, mv.y / mv.z, 0));
+      } else  // there is nothing in the block, use -1 to mark it.
+      {
+        motion_field.set(i, new PVector(0.0, 0.0, -1));
+      }
+    }
+  }
+
+  void render() {
+    int r_num = height / block_size, c_num = width / block_size;
+    for (int i = 0; i < r_num; i++)
+      for (int j = 0; j < c_num; j++) {
+        PVector mv = motion_field.get(i * c_num + j);
+        float ox = j * block_size + 0.5f * block_size;
+        float oy = i * block_size + 0.5f * block_size;
+        stroke(255, 0, 0);
+        line(ox, oy, ox + mv.x, oy + mv.y);
+      }
+  }
+
+  void save(String path) {
+    int r_num = height / block_size;
+    int c_num = width / block_size;
+    String[] mvs = new String[r_num];
+    for (int i = 0; i < r_num; i++) {
+      mvs[i] = "";
+      for (int j = 0; j < c_num; j++) {
+        PVector mv = motion_field.get(i * c_num + j);
+        if (mv.z != -1) {
+          mvs[i] += str(mv.x) + "," + str(mv.y);
+        } else  // there is nothing
+        {
+          mvs[i] += "-,-";
+        }
+        if (j != c_num - 1) mvs[i] += ";";
+      }
+    }
+    saveStrings(path, mvs);
+  }
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde
new file mode 100644
index 0000000000..714a6f3a0b
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde
@@ -0,0 +1,138 @@
+class PointCloud {
+  ArrayList<PVector> points;  // array to save points
+  IntList point_colors;       // array to save points color
+  PVector cloud_mass;
+  float[] depth;
+  boolean[] real;
+  PointCloud() {
+    // initialize
+    points = new ArrayList<PVector>();
+    point_colors = new IntList();
+    cloud_mass = new PVector(0, 0, 0);
+    depth = new float[width * height];
+    real = new boolean[width * height];
+  }
+
+  void generate(PImage rgb_img, PImage depth_img, Transform trans) {
+    if (depth_img.width != width || depth_img.height != height ||
+        rgb_img.width != width || rgb_img.height != height) {
+      println("rgb and depth file dimension should be same with window size");
+      exit();
+    }
+    // clear depth and real
+    for (int i = 0; i < width * height; i++) {
+      depth[i] = 0;
+      real[i] = false;
+    }
+    for (int v = 0; v < height; v++)
+      for (int u = 0; u < width; u++) {
+        // get depth value (red channel)
+        color depth_px = depth_img.get(u, v);
+        depth[v * width + u] = depth_px & 0x0000FFFF;
+        if (int(depth[v * width + u]) != 0) {
+          real[v * width + u] = true;
+        }
+        point_colors.append(rgb_img.get(u, v));
+      }
+    for (int v = 0; v < height; v++)
+      for (int u = 0; u < width; u++) {
+        if (int(depth[v * width + u]) == 0) {
+          interpolateDepth(v, u);
+        }
+        // add transformed pixel as well as pixel color to the list
+        PVector pos = trans.transform(u, v, int(depth[v * width + u]));
+        points.add(pos);
+        // accumulate z value
+        cloud_mass = PVector.add(cloud_mass, pos);
+      }
+  }
+  void fillInDepthAlongPath(float d, Node node) {
+    node = node.parent;
+    while (node != null) {
+      int i = node.row;
+      int j = node.col;
+      if (depth[i * width + j] == 0) {
+        depth[i * width + j] = d;
+      }
+      node = node.parent;
+    }
+  }
+  // interpolate
+  void interpolateDepth(int row, int col) {
+    if (row < 0 || row >= height || col < 0 || col >= width ||
+        int(depth[row * width + col]) != 0) {
+      return;
+    }
+    ArrayList<Node> queue = new ArrayList<Node>();
+    queue.add(new Node(row, col, null));
+    boolean[] visited = new boolean[width * height];
+    for (int i = 0; i < width * height; i++) visited[i] = false;
+    visited[row * width + col] = true;
+    // Using BFS to Find the Nearest Neighbor
+    while (queue.size() > 0) {
+      // pop
+      Node node = queue.get(0);
+      queue.remove(0);
+      int i = node.row;
+      int j = node.col;
+      // if current position have a real depth
+      if (depth[i * width + j] != 0 && real[i * width + j]) {
+        fillInDepthAlongPath(depth[i * width + j], node);
+        break;
+      } else {
+        // search unvisited 8 neighbors
+        for (int r = max(0, i - 1); r < min(height, i + 2); r++) {
+          for (int c = max(0, j - 1); c < min(width, j + 2); c++) {
+            if (!visited[r * width + c]) {
+              visited[r * width + c] = true;
+              queue.add(new Node(r, c, node));
+            }
+          }
+        }
+      }
+    }
+  }
+  // get point cloud size
+  int size() { return points.size(); }
+  // get ith position
+  PVector getPosition(int i) {
+    if (i >= points.size()) {
+      println("point position: index " + str(i) + " exceeds");
+      exit();
+    }
+    return points.get(i);
+  }
+  // get ith color
+  color getColor(int i) {
+    if (i >= point_colors.size()) {
+      println("point color: index " + str(i) + " exceeds");
+      exit();
+    }
+    return point_colors.get(i);
+  }
+  // get cloud center
+  PVector getCloudCenter() {
+    if (points.size() > 0) {
+      return PVector.div(cloud_mass, points.size());
+    }
+    return new PVector(0, 0, 0);
+  }
+  // merge two clouds
+  void merge(PointCloud point_cloud) {
+    for (int i = 0; i < point_cloud.size(); i++) {
+      points.add(point_cloud.getPosition(i));
+      point_colors.append(point_cloud.getColor(i));
+    }
+    cloud_mass = PVector.add(cloud_mass, point_cloud.cloud_mass);
+  }
+}
+
+class Node {
+  int row, col;
+  Node parent;
+  Node(int row, int col, Node parent) {
+    this.row = row;
+    this.col = col;
+    this.parent = parent;
+  }
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde
new file mode 100644
index 0000000000..ef4be691c2
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde
@@ -0,0 +1,61 @@
+// Triangle
+class Triangle {
+  // position
+  PVector p1, p2, p3;
+  // color
+  color c1, c2, c3;
+  BoundingBox bbx;
+  Triangle(PVector p1, PVector p2, PVector p3, color c1, color c2, color c3) {
+    this.p1 = p1;
+    this.p2 = p2;
+    this.p3 = p3;
+    this.c1 = c1;
+    this.c2 = c2;
+    this.c3 = c3;
+    bbx = new BoundingBox();
+    bbx.create(this);
+  }
+  // check to see if a ray intersects with the triangle
+  boolean intersect(Ray r, float[] param) {
+    PVector p21 = PVector.sub(p2, p1);
+    PVector p31 = PVector.sub(p3, p1);
+    PVector po1 = PVector.sub(r.ori, p1);
+
+    PVector dxp31 = r.dir.cross(p31);
+    PVector po1xp21 = po1.cross(p21);
+    float denom = p21.dot(dxp31);
+    float t = p31.dot(po1xp21) / denom;
+    float alpha = po1.dot(dxp31) / denom;
+    float beta = r.dir.dot(po1xp21) / denom;
+
+    boolean res = t > 0 && alpha > 0 && alpha < 1 && beta > 0 && beta < 1 &&
+                  alpha + beta < 1;
+    // depth test
+    if (res && t < param[0]) {
+      param[0] = t;
+      param[1] = alpha * p1.x + beta * p2.x + (1 - alpha - beta) * p3.x;
+      param[2] = alpha * p1.y + beta * p2.y + (1 - alpha - beta) * p3.y;
+      param[3] = alpha * p1.z + beta * p2.z + (1 - alpha - beta) * p3.z;
+    }
+    return res;
+  }
+  void render() {
+    beginShape(TRIANGLES);
+    fill(c1);
+    vertex(p1.x, p1.y, p1.z);
+    fill(c2);
+    vertex(p2.x, p2.y, p2.z);
+    fill(c3);
+    vertex(p3.x, p3.y, p3.z);
+    endShape();
+  }
+}
+// Ray
+class Ray {
+  // origin and direction
+  PVector ori, dir;
+  Ray(PVector ori, PVector dir) {
+    this.ori = ori;
+    this.dir = dir;
+  }
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde
new file mode 100644
index 0000000000..cf79ab7141
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde
@@ -0,0 +1,59 @@
+class Scene {
+  PointCloud point_cloud;
+  ArrayList<Triangle> mesh;
+  BVH bvh;
+  MotionField motion_field;
+  Camera last_cam;
+  Camera current_cam;
+  int frame_count;
+
+  Scene(Camera camera, PointCloud point_cloud, MotionField motion_field) {
+    this.point_cloud = point_cloud;
+    this.motion_field = motion_field;
+    mesh = new ArrayList<Triangle>();
+    for (int v = 0; v < height - 1; v++)
+      for (int u = 0; u < width - 1; u++) {
+        PVector p1 = point_cloud.getPosition(v * width + u);
+        PVector p2 = point_cloud.getPosition(v * width + u + 1);
+        PVector p3 = point_cloud.getPosition((v + 1) * width + u + 1);
+        PVector p4 = point_cloud.getPosition((v + 1) * width + u);
+        color c1 = point_cloud.getColor(v * width + u);
+        color c2 = point_cloud.getColor(v * width + u + 1);
+        color c3 = point_cloud.getColor((v + 1) * width + u + 1);
+        color c4 = point_cloud.getColor((v + 1) * width + u);
+        mesh.add(new Triangle(p1, p2, p3, c1, c2, c3));
+        mesh.add(new Triangle(p3, p4, p1, c3, c4, c1));
+      }
+    bvh = new BVH(mesh);
+    last_cam = camera.copy();
+    current_cam = camera;
+    frame_count = 0;
+  }
+
+  void run() {
+    last_cam = current_cam.copy();
+    current_cam.run();
+    motion_field.update(last_cam, current_cam, point_cloud, bvh);
+    frame_count += 1;
+  }
+
+  void render(boolean show_motion_field) {
+    // build mesh
+    current_cam.open();
+    noStroke();
+    for (int i = 0; i < mesh.size(); i++) {
+      Triangle t = mesh.get(i);
+      t.render();
+    }
+    if (show_motion_field) {
+      current_cam.close();
+      motion_field.render();
+    }
+  }
+
+  void save(String path) { saveFrame(path + "_" + str(frame_count) + ".png"); }
+
+  void saveMotionField(String path) {
+    motion_field.save(path + "_" + str(frame_count) + ".txt");
+  }
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde
new file mode 100644
index 0000000000..af2204e8cf
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde
@@ -0,0 +1,82 @@
+class Transform {
+  float[] inv_rot;  // inverse of rotation matrix
+  PVector inv_mov;  // inverse of movement vector
+  float focal;      // the focal distacne of real camera
+  int w, h;         // the width and height of the frame
+  float normalier;  // nomalization factor of depth
+  Transform(float tx, float ty, float tz, float qx, float qy, float qz,
+            float qw, float fov, int w, int h, float normalier) {
+    // currently, we did not use the info of real camera's position and
+    // quaternion maybe we will use it in the future when combine all frames
+    float[] rot = quaternion2Mat3x3(qx, qy, qz, qw);
+    inv_rot = transpose3x3(rot);
+    inv_mov = new PVector(-tx, -ty, -tz);
+    this.focal = 0.5f * h / tan(fov / 2.0);
+    this.w = w;
+    this.h = h;
+    this.normalier = normalier;
+  }
+
+  PVector transform(int i, int j, float d) {
+    // transfer from camera view to world view
+    float z = d / normalier;
+    float x = (i - w / 2.0f) * z / focal;
+    float y = (j - h / 2.0f) * z / focal;
+    return new PVector(x, y, z);
+  }
+}
+
+// get rotation matrix by using rotation axis and angle
+float[] getRotationMat3x3(float angle, float ax, float ay, float az) {
+  float[] mat = new float[9];
+  float c = cos(angle);
+  float s = sin(angle);
+  mat[0] = c + ax * ax * (1 - c);
+  mat[1] = ax * ay * (1 - c) - az * s;
+  mat[2] = ax * az * (1 - c) + ay * s;
+  mat[3] = ay * ax * (1 - c) + az * s;
+  mat[4] = c + ay * ay * (1 - c);
+  mat[5] = ay * az * (1 - c) - ax * s;
+  mat[6] = az * ax * (1 - c) - ay * s;
+  mat[7] = az * ay * (1 - c) + ax * s;
+  mat[8] = c + az * az * (1 - c);
+  return mat;
+}
+
+// get rotation matrix by using quaternion
+float[] quaternion2Mat3x3(float qx, float qy, float qz, float qw) {
+  float[] mat = new float[9];
+  mat[0] = 1 - 2 * qy * qy - 2 * qz * qz;
+  mat[1] = 2 * qx * qy - 2 * qz * qw;
+  mat[2] = 2 * qx * qz + 2 * qy * qw;
+  mat[3] = 2 * qx * qy + 2 * qz * qw;
+  mat[4] = 1 - 2 * qx * qx - 2 * qz * qz;
+  mat[5] = 2 * qy * qz - 2 * qx * qw;
+  mat[6] = 2 * qx * qz - 2 * qy * qw;
+  mat[7] = 2 * qy * qz + 2 * qx * qw;
+  mat[8] = 1 - 2 * qx * qx - 2 * qy * qy;
+  return mat;
+}
+
+// tranpose a 3x3 matrix
+float[] transpose3x3(float[] mat) {
+  float[] Tmat = new float[9];
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++) {
+      Tmat[i * 3 + j] = mat[j * 3 + i];
+    }
+  return Tmat;
+}
+
+// multiply a matrix with vector
+PVector MatxVec3(float[] mat, PVector v) {
+  float[] vec = v.array();
+  float[] res = new float[3];
+  for (int i = 0; i < 3; i++) {
+    res[i] = 0.0f;
+    for (int j = 0; j < 3; j++) {
+      res[i] += mat[i * 3 + j] * vec[j];
+    }
+  }
+  return new PVector(res[0], res[1], res[2]);
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
new file mode 100644
index 0000000000..19d124a0b3
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
@@ -0,0 +1,28 @@
+// show grids
+void showGrids(int block_size) {
+  ortho(-width, 0, -height, 0);
+  camera(0, 0, 0, 0, 0, 1, 0, 1, 0);
+  stroke(0, 0, 255);
+  for (int i = 0; i < height; i += block_size) {
+    line(0, i, width, i);
+  }
+  for (int i = 0; i < width; i += block_size) {
+    line(i, 0, i, height);
+  }
+}
+
+// save the point clould information
+void savePointCloud(PointCloud point_cloud, String file_name) {
+  String[] positions = new String[point_cloud.points.size()];
+  String[] colors = new String[point_cloud.points.size()];
+  for (int i = 0; i < point_cloud.points.size(); i++) {
+    PVector point = point_cloud.getPosition(i);
+    color point_color = point_cloud.getColor(i);
+    positions[i] = str(point.x) + ' ' + str(point.y) + ' ' + str(point.z);
+    colors[i] = str(((point_color >> 16) & 0xFF) / 255.0) + ' ' +
+                str(((point_color >> 8) & 0xFF) / 255.0) + ' ' +
+                str((point_color & 0xFF) / 255.0);
+  }
+  saveStrings(file_name + "_pos.txt", positions);
+  saveStrings(file_name + "_color.txt", colors);
+}
diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
new file mode 100644
index 0000000000..22a495432d
--- /dev/null
+++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
@@ -0,0 +1,74 @@
+/*The dataset is from
+ *Computer Vision Group
+ *TUM Department of Informatics Technical
+ *University of Munich
+ *https://vision.in.tum.de/data/datasets/rgbd-dataset/download#freiburg1_xyz
+ */
+Scene scene;
+void setup() {
+  size(640, 480, P3D);
+  // default settings
+  int frame_no = 0;            // frame number
+  float fov = PI / 3;          // field of view
+  int block_size = 8;          // block size
+  float normalizer = 5000.0f;  // normalizer
+  // initialize
+  PointCloud point_cloud = new PointCloud();
+  // synchronized rgb, depth and ground truth
+  String head = "../data/";
+  String[] rgb_depth_gt = loadStrings(head + "rgb_depth_groundtruth.txt");
+  // read in rgb and depth image file paths as well as corresponding camera
+  // posiiton and quaternion
+  String[] info = split(rgb_depth_gt[frame_no], ' ');
+  String rgb_path = head + info[1];
+  String depth_path = head + info[3];
+  float tx = float(info[7]), ty = float(info[8]),
+        tz = float(info[9]);  // real camera position
+  float qx = float(info[10]), qy = float(info[11]), qz = float(info[12]),
+        qw = float(info[13]);  // quaternion
+
+  // build transformer
+  Transform trans =
+      new Transform(tx, ty, tz, qx, qy, qz, qw, fov, width, height, normalizer);
+  PImage rgb = loadImage(rgb_path);
+  PImage depth = loadImage(depth_path);
+  // generate point cloud
+  point_cloud.generate(rgb, depth, trans);
+  // initialize camera
+  Camera camera = new Camera(fov, new PVector(0, 0, 0), new PVector(0, 0, 1),
+                             new PVector(0, 1, 0));
+  // initialize motion field
+  MotionField motion_field = new MotionField(block_size);
+  // initialize scene
+  scene = new Scene(camera, point_cloud, motion_field);
+}
+boolean inter = false;
+void draw() {
+  background(0);
+  // run camera dragged mouse to rotate camera
+  // w: go forward
+  // s: go backward
+  // a: go left
+  // d: go right
+  // up arrow: go up
+  // down arrow: go down
+  //+ increase move speed
+  //- decrease move speed
+  // r: rotate the camera
+  // b: reset to initial position
+  scene.run();  // true: make interpolation; false: do not make
+                // interpolation
+  if (keyPressed && key == 'o') {
+    inter = true;
+  }
+  scene.render(
+      false);  // true: turn on motion field; false: turn off motion field
+  // save frame with no motion field
+  scene.save("../data/frame/raw");
+  background(0);
+  scene.render(true);
+  showGrids(scene.motion_field.block_size);
+  // save frame with motion field
+  scene.save("../data/frame/raw_mv");
+  scene.saveMotionField("../data/frame/mv");
+}
diff --git a/media/libvpx/libvpx/tools/README.pgo.md b/media/libvpx/libvpx/tools/README.pgo.md
new file mode 100644
index 0000000000..414743f8fc
--- /dev/null
+++ b/media/libvpx/libvpx/tools/README.pgo.md
@@ -0,0 +1,24 @@
+# Using Profile Guided Optimizations to identify compiler optimization failures
+
+When using Clang, the `-Rpass-missed` flag enables the verbose log of failed
+compiler optimizations. However, the extensive log messages can obscure
+potential optimization opportunities.
+
+Use the following steps to generate a more transparent optimization report
+using a previously created PGO profile file. The report also includes code
+hotness diagnostics:
+
+```bash
+$ ../libvpx/configure --use-profile=perf.profdata \
+  --extra-cflags="-fsave-optimization-record -fdiagnostics-show-hotness"
+```
+
+Convert the generated YAML files into a detailed HTML report using the
+[optviewer2](https://github.com/OfekShilon/optview2) tool:
+
+```bash
+$ opt-viewer.py --output-dir=out/ --source-dir=libvpx .
+```
+
+The HTML report displays each code line's relative hotness, cross-referenced
+with the failed compiler optimizations.
diff --git a/media/libvpx/libvpx/tools/all_builds.py b/media/libvpx/libvpx/tools/all_builds.py
deleted file mode 100644
index 54176f58ab..0000000000
--- a/media/libvpx/libvpx/tools/all_builds.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import getopt
-import subprocess
-import sys
-
-LONG_OPTIONS = ["shard=", "shards="]
-BASE_COMMAND = "./configure --enable-internal-stats"
-
-def RunCommand(command):
-  run = subprocess.Popen(command, shell=True)
-  output = run.communicate()
-  if run.returncode:
-    print "Non-zero return code: " + str(run.returncode) + " => exiting!"
-    sys.exit(1)
-
-def list_of_experiments():
-  experiments = []
-  configure_file = open("configure")
-  list_start = False
-  for line in configure_file.read().split("\n"):
-    if line == 'EXPERIMENT_LIST="':
-      list_start = True
-    elif line == '"':
-      list_start = False
-    elif list_start:
-      currently_broken = ["csm"]
-      experiment = line[4:]
-      if experiment not in currently_broken:
-        experiments.append(experiment)
-  return experiments
-
-def main(argv):
-  # Parse arguments
-  options = {"--shard": 0, "--shards": 1}
-  if "--" in argv:
-    opt_end_index = argv.index("--")
-  else:
-    opt_end_index = len(argv)
-  try:
-    o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS)
-  except getopt.GetoptError, err:
-    print str(err)
-    print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0]
-    sys.exit(2)
-
-  options.update(o)
-  extra_args = argv[opt_end_index + 1:]
-
-  # Shard experiment list
-  shard = int(options["--shard"])
-  shards = int(options["--shards"])
-  experiments = list_of_experiments()
-  base_command = " ".join([BASE_COMMAND] + extra_args)
-  configs = [base_command]
-  configs += ["%s --enable-%s" % (base_command, e) for e in experiments]
-  my_configs = zip(configs, range(len(configs)))
-  my_configs = filter(lambda x: x[1] % shards == shard, my_configs)
-  my_configs = [e[0] for e in my_configs]
-
-  # Run configs for this shard
-  for config in my_configs:
-    test_build(config)
-
-def test_build(configure_command):
-  print "\033[34m\033[47mTesting %s\033[0m" % (configure_command)
-  RunCommand(configure_command)
-  RunCommand("make clean")
-  RunCommand("make")
-
-if __name__ == "__main__":
-  main(sys.argv)
diff --git a/media/libvpx/libvpx/tools/author_first_release.sh b/media/libvpx/libvpx/tools/author_first_release.sh
deleted file mode 100644
index 7b0b797212..0000000000
--- a/media/libvpx/libvpx/tools/author_first_release.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-##
-## List the release each author first contributed to.
-##
-## Usage: author_first_release.sh [TAGS]
-##
-## If the TAGS arguments are unspecified, all tags reported by `git tag`
-## will be considered.
-##
-tags=${@:-$(git tag)}
-for tag in $tags; do
-  git shortlog -n -e -s $tag |
-      cut -f2- |
-      awk "{print \"${tag#v}\t\"\$0}"
-done | sort -k2  | uniq -f2
diff --git a/media/libvpx/libvpx/tools/cpplint.py b/media/libvpx/libvpx/tools/cpplint.py
index 25fbef73d8..e3ebde2f5a 100644
--- a/media/libvpx/libvpx/tools/cpplint.py
+++ b/media/libvpx/libvpx/tools/cpplint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # Copyright (c) 2009 Google Inc. All rights reserved.
 #
@@ -51,16 +51,23 @@ import sre_compile
 import string
 import sys
 import unicodedata
+import sysconfig
+
+try:
+  xrange          # Python 2
+except NameError:
+  xrange = range  # Python 3
 
 
 _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
-                   [--linelength=digits]
+                   [--linelength=digits] [--headers=x,y,...]
+                   [--quiet]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
-    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+    https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
 
   Every problem is given a confidence score from 1-5, with 5 meaning we are
   certain of the problem, and 1 meaning it could be a legitimate construct.
@@ -83,6 +90,9 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
     verbose=#
       Specify a number 0-5 to restrict errors to certain verbosity levels.
 
+    quiet
+      Don't print anything if no errors are found.
+
     filter=-x,+y,...
       Specify a comma-separated list of category-filters to apply: only
       error messages whose category names pass the filters will be printed.
@@ -114,12 +124,13 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
       ignored.
 
       Examples:
-        Assuing that src/.git exists, the header guard CPP variables for
-        src/chrome/browser/ui/browser.h are:
+        Assuming that top/src/.git exists (and cwd=top/src), the header guard
+        CPP variables for top/src/chrome/browser/ui/browser.h are:
 
         No flag => CHROME_BROWSER_UI_BROWSER_H_
         --root=chrome => BROWSER_UI_BROWSER_H_
         --root=chrome/browser => UI_BROWSER_H_
+        --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_
 
     linelength=digits
       This is the allowed line length for the project. The default value is
@@ -133,6 +144,57 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
 
       Examples:
         --extensions=hpp,cpp
+
+    headers=x,y,...
+      The header extensions that cpplint will treat as .h in checks. Values are
+      automatically added to --extensions list.
+
+      Examples:
+        --headers=hpp,hxx
+        --headers=hpp
+
+    cpplint.py supports per-directory configurations specified in CPPLINT.cfg
+    files. CPPLINT.cfg file can contain a number of key=value pairs.
+    Currently the following options are supported:
+
+      set noparent
+      filter=+filter1,-filter2,...
+      exclude_files=regex
+      linelength=80
+      root=subdir
+      headers=x,y,...
+
+    "set noparent" option prevents cpplint from traversing directory tree
+    upwards looking for more .cfg files in parent directories. This option
+    is usually placed in the top-level project directory.
+
+    The "filter" option is similar in function to --filter flag. It specifies
+    message filters in addition to the |_DEFAULT_FILTERS| and those specified
+    through --filter command-line flag.
+
+    "exclude_files" allows to specify a regular expression to be matched against
+    a file name. If the expression matches, the file is skipped and not run
+    through liner.
+
+    "linelength" allows to specify the allowed line length for the project.
+
+    The "root" option is similar in function to the --root flag (see example
+    above). Paths are relative to the directory of the CPPLINT.cfg.
+
+    The "headers" option is similar in function to the --headers flag
+    (see example above).
+
+    CPPLINT.cfg has an effect on files in the same directory and all
+    sub-directories, unless overridden by a nested configuration file.
+
+      Example file:
+        filter=-build/include_order,+build/include_alpha
+        exclude_files=.*\.cc
+
+    The above example disables build/include_order warning and enables
+    build/include_alpha as well as excludes all .cc from being
+    processed by linter, in the current directory (where the .cfg
+    file is located) and all sub-directories.
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -140,81 +202,101 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
 # If you add a new error message with a new category, add it to the list
 # here!  cpplint_unittest.py should tell you if you forget to do this.
 _ERROR_CATEGORIES = [
-  'build/class',
-  'build/deprecated',
-  'build/endif_comment',
-  'build/explicit_make_pair',
-  'build/forward_decl',
-  'build/header_guard',
-  'build/include',
-  'build/include_alpha',
-  'build/include_order',
-  'build/include_what_you_use',
-  'build/namespaces',
-  'build/printf_format',
-  'build/storage_class',
-  'legal/copyright',
-  'readability/alt_tokens',
-  'readability/braces',
-  'readability/casting',
-  'readability/check',
-  'readability/constructors',
-  'readability/fn_size',
-  'readability/function',
-  'readability/multiline_comment',
-  'readability/multiline_string',
-  'readability/namespace',
-  'readability/nolint',
-  'readability/nul',
-  'readability/streams',
-  'readability/todo',
-  'readability/utf8',
-  'runtime/arrays',
-  'runtime/casting',
-  'runtime/explicit',
-  'runtime/int',
-  'runtime/init',
-  'runtime/invalid_increment',
-  'runtime/member_string_references',
-  'runtime/memset',
-  'runtime/operator',
-  'runtime/printf',
-  'runtime/printf_format',
-  'runtime/references',
-  'runtime/sizeof',
-  'runtime/string',
-  'runtime/threadsafe_fn',
-  'runtime/vlog',
-  'whitespace/blank_line',
-  'whitespace/braces',
-  'whitespace/comma',
-  'whitespace/comments',
-  'whitespace/empty_conditional_body',
-  'whitespace/empty_loop_body',
-  'whitespace/end_of_line',
-  'whitespace/ending_newline',
-  'whitespace/forcolon',
-  'whitespace/indent',
-  'whitespace/line_length',
-  'whitespace/newline',
-  'whitespace/operators',
-  'whitespace/parens',
-  'whitespace/semicolon',
-  'whitespace/tab',
-  'whitespace/todo'
-  ]
+    'build/class',
+    'build/c++11',
+    'build/c++14',
+    'build/c++tr1',
+    'build/deprecated',
+    'build/endif_comment',
+    'build/explicit_make_pair',
+    'build/forward_decl',
+    'build/header_guard',
+    'build/include',
+    'build/include_alpha',
+    'build/include_order',
+    'build/include_what_you_use',
+    'build/namespaces',
+    'build/printf_format',
+    'build/storage_class',
+    'legal/copyright',
+    'readability/alt_tokens',
+    'readability/braces',
+    'readability/casting',
+    'readability/check',
+    'readability/constructors',
+    'readability/fn_size',
+    'readability/inheritance',
+    'readability/multiline_comment',
+    'readability/multiline_string',
+    'readability/namespace',
+    'readability/nolint',
+    'readability/nul',
+    'readability/strings',
+    'readability/todo',
+    'readability/utf8',
+    'runtime/arrays',
+    'runtime/casting',
+    'runtime/explicit',
+    'runtime/int',
+    'runtime/init',
+    'runtime/invalid_increment',
+    'runtime/member_string_references',
+    'runtime/memset',
+    'runtime/indentation_namespace',
+    'runtime/operator',
+    'runtime/printf',
+    'runtime/printf_format',
+    'runtime/references',
+    'runtime/string',
+    'runtime/threadsafe_fn',
+    'runtime/vlog',
+    'whitespace/blank_line',
+    'whitespace/braces',
+    'whitespace/comma',
+    'whitespace/comments',
+    'whitespace/empty_conditional_body',
+    'whitespace/empty_if_body',
+    'whitespace/empty_loop_body',
+    'whitespace/end_of_line',
+    'whitespace/ending_newline',
+    'whitespace/forcolon',
+    'whitespace/indent',
+    'whitespace/line_length',
+    'whitespace/newline',
+    'whitespace/operators',
+    'whitespace/parens',
+    'whitespace/semicolon',
+    'whitespace/tab',
+    'whitespace/todo',
+    ]
 
-# The default state of the category filter. This is overrided by the --filter=
+# These error categories are no longer enforced by cpplint, but for backwards-
+# compatibility they may still appear in NOLINT comments.
+_LEGACY_ERROR_CATEGORIES = [
+    'readability/streams',
+    'readability/function',
+    ]
+
+# The default state of the category filter. This is overridden by the --filter=
 # flag. By default all errors are on, so only add here categories that should be
 # off by default (i.e., categories that must be enabled by the --filter= flags).
 # All entries here should start with a '-' or '+', as in the --filter= flag.
 _DEFAULT_FILTERS = ['-build/include_alpha']
 
+# The default list of categories suppressed for C (not C++) files.
+_DEFAULT_C_SUPPRESSED_CATEGORIES = [
+    'readability/casting',
+    ]
+
+# The default list of categories suppressed for Linux Kernel files.
+_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [
+    'whitespace/tab',
+    ]
+
 # We used to check for high-bit characters, but after much discussion we
 # decided those were OK, as long as they were in UTF-8 and didn't represent
 # hard-coded international strings, which belong in a separate i18n file.
 
-
 # C++ headers
 _CPP_HEADERS = frozenset([
     # Legacy
@@ -304,6 +386,7 @@ _CPP_HEADERS = frozenset([
     'random',
     'ratio',
     'regex',
+    'scoped_allocator',
     'set',
     'sstream',
     'stack',
@@ -351,15 +434,40 @@ _CPP_HEADERS = frozenset([
     'cwctype',
     ])
 
+# Type names
+_TYPES = re.compile(
+    r'^(?:'
+    # [dcl.type.simple]
+    r'(char(16_t|32_t)?)|wchar_t|'
+    r'bool|short|int|long|signed|unsigned|float|double|'
+    # [support.types]
+    r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|'
+    # [cstdint.syn]
+    r'(u?int(_fast|_least)?(8|16|32|64)_t)|'
+    r'(u?int(max|ptr)_t)|'
+    r')$')
+
+
+# These headers are excluded from [build/include] and [build/include_order]
+# checks:
+# - Anything not following google file name conventions (containing an
+#   uppercase character, such as Python.h or nsStringAPI.h, for example).
+# - Lua headers.
+_THIRD_PARTY_HEADERS_PATTERN = re.compile(
+    r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$')
+
+# Pattern for matching FileInfo.BaseName() against test file name
+_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$'
+
+# Pattern that matches only complete whitespace, possibly across multiple lines.
+_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL)
+
 # Assertion macros.  These are defined in base/logging.h and
-# testing/base/gunit.h.  Note that the _M versions need to come first
-# for substring matching to work.
+# testing/base/public/gunit.h.
 _CHECK_MACROS = [
     'DCHECK', 'CHECK',
-    'EXPECT_TRUE_M', 'EXPECT_TRUE',
-    'ASSERT_TRUE_M', 'ASSERT_TRUE',
-    'EXPECT_FALSE_M', 'EXPECT_FALSE',
-    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    'EXPECT_TRUE', 'ASSERT_TRUE',
+    'EXPECT_FALSE', 'ASSERT_FALSE',
     ]
 
 # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
@@ -372,16 +480,12 @@ for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
   _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
   _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
   _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
-  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
-  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
 
 for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
                             ('>=', 'LT'), ('>', 'LE'),
                             ('<=', 'GT'), ('<', 'GE')]:
   _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
-  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
-  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
 # Alternative tokens and their replacements.  For full list, see section 2.5
 # Alternative tokens [lex.digraph] in the C++ standard.
@@ -430,12 +534,15 @@ _MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
                         r'(?:\s+(volatile|__volatile__))?'
                         r'\s*[{(]')
 
+# Match strings that indicate we're working on a C (not C++) file.
+_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|'
+                            r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))')
+
+# Match string that indicates we're working on a Linux Kernel file.
+_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)')
 
 _regexp_compile_cache = {}
 
-# Finds occurrences of NOLINT or NOLINT(...).
-_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
-
 # {str, set(int)}: a map from error categories to sets of linenumbers
 # on which those errors are expected and should be suppressed.
 _error_suppressions = {}
@@ -443,6 +550,7 @@ _error_suppressions = {}
 # The root directory used for deriving header guard CPP variable.
 # This is set by --root flag.
 _root = None
+_root_debug = False
 
 # The allowed line length of files.
 # This is set by --linelength flag.
@@ -452,8 +560,28 @@ _line_length = 80
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc.
+# This is set by --headers flag.
+_hpp_headers = set(['h'])
+
+# {str, bool}: a map from error categories to booleans which indicate if the
+# category should be suppressed for every line.
+_global_error_suppressions = {}
+
+def ProcessHppHeadersOption(val):
+  global _hpp_headers
+  try:
+    _hpp_headers = set(val.split(','))
+    # Automatically append to extensions list so it does not have to be set 2 times
+    _valid_extensions.update(_hpp_headers)
+  except ValueError:
+    PrintUsage('Header extensions must be comma separated list.')
+
+def IsHeaderExtension(file_extension):
+  return file_extension in _hpp_headers
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
-  """Updates the global list of error-suppressions.
+  """Updates the global list of line error-suppressions.
 
   Parses any NOLINT comments on the current line, updating the global
   error_suppressions store.  Reports an error if the NOLINT comment
@@ -465,42 +593,67 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error):
     linenum: int, the number of the current line.
     error: function, an error handler.
   """
-  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
-  matched = _RE_SUPPRESSION.search(raw_line)
+  matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line)
   if matched:
-    category = matched.group(1)
+    if matched.group(1):
+      suppressed_line = linenum + 1
+    else:
+      suppressed_line = linenum
+    category = matched.group(2)
     if category in (None, '(*)'):  # => "suppress all"
-      _error_suppressions.setdefault(None, set()).add(linenum)
+      _error_suppressions.setdefault(None, set()).add(suppressed_line)
     else:
       if category.startswith('(') and category.endswith(')'):
         category = category[1:-1]
         if category in _ERROR_CATEGORIES:
-          _error_suppressions.setdefault(category, set()).add(linenum)
-        else:
+          _error_suppressions.setdefault(category, set()).add(suppressed_line)
+        elif category not in _LEGACY_ERROR_CATEGORIES:
           error(filename, linenum, 'readability/nolint', 5,
                 'Unknown NOLINT error category: %s' % category)
 
 
+def ProcessGlobalSuppresions(lines):
+  """Updates the list of global error suppressions.
+
+  Parses any lint directives in the file that have global effect.
+
+  Args:
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+  """
+  for line in lines:
+    if _SEARCH_C_FILE.search(line):
+      for category in _DEFAULT_C_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+    if _SEARCH_KERNEL_FILE.search(line):
+      for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+
+
 def ResetNolintSuppressions():
-  "Resets the set of NOLINT suppressions to empty."
+  """Resets the set of NOLINT suppressions to empty."""
   _error_suppressions.clear()
+  _global_error_suppressions.clear()
 
 
 def IsErrorSuppressedByNolint(category, linenum):
   """Returns true if the specified error category is suppressed on this line.
 
   Consults the global error_suppressions map populated by
-  ParseNolintSuppressions/ResetNolintSuppressions.
+  ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions.
 
   Args:
     category: str, the category of the error.
     linenum: int, the current line number.
   Returns:
-    bool, True iff the error should be suppressed due to a NOLINT comment.
+    bool, True iff the error should be suppressed due to a NOLINT comment or
+    global suppression.
   """
-  return (linenum in _error_suppressions.get(category, set()) or
+  return (_global_error_suppressions.get(category, False) or
+          linenum in _error_suppressions.get(category, set()) or
           linenum in _error_suppressions.get(None, set()))
 
+
 def Match(pattern, s):
   """Matches the string with the pattern, caching the compiled regexp."""
   # The regexp compilation caching is inlined in both Match and Search for
@@ -536,11 +689,17 @@ def Search(pattern, s):
   return _regexp_compile_cache[pattern].search(s)
 
 
-class _IncludeState(dict):
+def _IsSourceExtension(s):
+  """File extension (excluding dot) matches a source file extension."""
+  return s in ('c', 'cc', 'cpp', 'cxx')
+
+
+class _IncludeState(object):
   """Tracks line numbers for includes, and the order in which includes appear.
 
-  As a dict, an _IncludeState object serves as a mapping between include
-  filename and line number on which that file was included.
+  include_list contains list of lists of (header, line number) pairs.
+  It's a lists of lists rather than just one flat list to make it
+  easier to update across preprocessor boundaries.
 
   Call CheckNextIncludeOrder() once for each header in the file, passing
   in the type constants defined above. Calls in an illegal order will
@@ -571,15 +730,42 @@ class _IncludeState(dict):
       }
 
   def __init__(self):
-    dict.__init__(self)
-    self.ResetSection()
+    self.include_list = [[]]
+    self.ResetSection('')
 
-  def ResetSection(self):
+  def FindHeader(self, header):
+    """Check if a header has already been included.
+
+    Args:
+      header: header to check.
+    Returns:
+      Line number of previous occurrence, or -1 if the header has not
+      been seen before.
+    """
+    for section_list in self.include_list:
+      for f in section_list:
+        if f[0] == header:
+          return f[1]
+    return -1
+
+  def ResetSection(self, directive):
+    """Reset section checking for preprocessor directive.
+
+    Args:
+      directive: preprocessor directive (e.g. "if", "else").
+    """
     # The name of the current section.
     self._section = self._INITIAL_SECTION
     # The path of last found header.
     self._last_header = ''
 
+    # Update list of includes.  Note that we never pop from the
+    # include list.
+    if directive in ('if', 'ifdef', 'ifndef'):
+      self.include_list.append([])
+    elif directive in ('else', 'elif'):
+      self.include_list[-1] = []
+
   def SetLastHeader(self, header_path):
     self._last_header = header_path
 
@@ -615,7 +801,7 @@ class _IncludeState(dict):
     # If previous line was a blank line, assume that the headers are
     # intentionally sorted the way they are.
     if (self._last_header > header_path and
-        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+        Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])):
       return False
     return True
 
@@ -681,8 +867,11 @@ class _CppLintState(object):
     self.error_count = 0    # global count of reported errors
     # filters to apply when emitting error messages
     self.filters = _DEFAULT_FILTERS[:]
+    # backup of filter list. Used to restore the state after each file.
+    self._filters_backup = self.filters[:]
     self.counting = 'total'  # In what way are we counting errors?
     self.errors_by_category = {}  # string to int dict storing error counts
+    self.quiet = False  # Suppress non-error messagess?
 
     # output format:
     # "emacs" - format that emacs can parse (default)
@@ -693,6 +882,12 @@ class _CppLintState(object):
     """Sets the output format for errors."""
     self.output_format = output_format
 
+  def SetQuiet(self, quiet):
+    """Sets the module's quiet settings, and returns the previous setting."""
+    last_quiet = self.quiet
+    self.quiet = quiet
+    return last_quiet
+
   def SetVerboseLevel(self, level):
     """Sets the module's verbosity, and returns the previous setting."""
     last_verbose_level = self.verbose_level
@@ -719,6 +914,10 @@ class _CppLintState(object):
     """
     # Default filters always have less priority than the flag ones.
     self.filters = _DEFAULT_FILTERS[:]
+    self.AddFilters(filters)
+
+  def AddFilters(self, filters):
+    """ Adds more filters to the existing list of error-message filters. """
     for filt in filters.split(','):
       clean_filt = filt.strip()
       if clean_filt:
@@ -728,6 +927,14 @@ class _CppLintState(object):
         raise ValueError('Every filter in --filters must start with + or -'
                          ' (%s does not)' % filt)
 
+  def BackupFilters(self):
+    """ Saves the current filter list to backup storage."""
+    self._filters_backup = self.filters[:]
+
+  def RestoreFilters(self):
+    """ Restores filters previously backed up."""
+    self.filters = self._filters_backup[:]
+
   def ResetErrorCounts(self):
     """Sets the module's error statistic back to zero."""
     self.error_count = 0
@@ -748,7 +955,7 @@ class _CppLintState(object):
     for category, count in self.errors_by_category.iteritems():
       sys.stderr.write('Category \'%s\' errors found: %d\n' %
                        (category, count))
-    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+    sys.stdout.write('Total errors found: %d\n' % self.error_count)
 
 _cpplint_state = _CppLintState()
 
@@ -762,6 +969,14 @@ def _SetOutputFormat(output_format):
   """Sets the module's output format."""
   _cpplint_state.SetOutputFormat(output_format)
 
+def _Quiet():
+  """Return's the module's quiet setting."""
+  return _cpplint_state.quiet
+
+def _SetQuiet(quiet):
+  """Set the module's quiet status, and return previous setting."""
+  return _cpplint_state.SetQuiet(quiet)
+
 
 def _VerboseLevel():
   """Returns the module's verbosity setting."""
@@ -795,6 +1010,25 @@ def _SetFilters(filters):
   """
   _cpplint_state.SetFilters(filters)
 
+def _AddFilters(filters):
+  """Adds more filter overrides.
+
+  Unlike _SetFilters, this function does not reset the current list of filters
+  available.
+
+  Args:
+    filters: A string of comma-separated filters (eg "whitespace/indent").
+             Each filter should start with + or -; else we die.
+  """
+  _cpplint_state.AddFilters(filters)
+
+def _BackupFilters():
+  """ Saves the current filter list to backup storage."""
+  _cpplint_state.BackupFilters()
+
+def _RestoreFilters():
+  """ Restores filters previously backed up."""
+  _cpplint_state.RestoreFilters()
 
 class _FunctionState(object):
   """Tracks current function name and the number of lines in its body."""
@@ -830,6 +1064,9 @@ class _FunctionState(object):
       filename: The name of the current file.
       linenum: The number of the line to check.
     """
+    if not self.in_a_function:
+      return
+
     if Match(r'T(EST|est)', self.current_function):
       base_trigger = self._TEST_TRIGGER
     else:
@@ -857,7 +1094,7 @@ class _IncludeError(Exception):
   pass
 
 
-class FileInfo:
+class FileInfo(object):
   """Provides utility functions for filenames.
 
   FileInfo provides easy access to the components of a file's path
@@ -900,12 +1137,13 @@ class FileInfo:
 
       # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
       # searching up from the current path.
-      root_dir = os.path.dirname(fullname)
-      while (root_dir != os.path.dirname(root_dir) and
-             not os.path.exists(os.path.join(root_dir, ".git")) and
-             not os.path.exists(os.path.join(root_dir, ".hg")) and
-             not os.path.exists(os.path.join(root_dir, ".svn"))):
-        root_dir = os.path.dirname(root_dir)
+      root_dir = current_dir = os.path.dirname(fullname)
+      while current_dir != os.path.dirname(current_dir):
+        if (os.path.exists(os.path.join(current_dir, ".git")) or
+            os.path.exists(os.path.join(current_dir, ".hg")) or
+            os.path.exists(os.path.join(current_dir, ".svn"))):
+          root_dir = current_dir
+        current_dir = os.path.dirname(current_dir)
 
       if (os.path.exists(os.path.join(root_dir, ".git")) or
           os.path.exists(os.path.join(root_dir, ".hg")) or
@@ -944,7 +1182,7 @@ class FileInfo:
 
   def IsSource(self):
     """File has a source file extension."""
-    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+    return _IsSourceExtension(self.Extension()[1:])
 
 
 def _ShouldPrintError(category, confidence, linenum):
@@ -955,6 +1193,7 @@ def _ShouldPrintError(category, confidence, linenum):
   # the verbosity level isn't high enough, or the filters filter it out.
   if IsErrorSuppressedByNolint(category, linenum):
     return False
+
   if confidence < _cpplint_state.verbose_level:
     return False
 
@@ -999,8 +1238,8 @@ def Error(filename, linenum, category, confidence, message):
   if _ShouldPrintError(category, confidence, linenum):
     _cpplint_state.IncrementErrorCount(category)
     if _cpplint_state.output_format == 'vs7':
-      sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
+      sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % (
+          filename, linenum, category, message, confidence))
     elif _cpplint_state.output_format == 'eclipse':
       sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' % (
           filename, linenum, message, category, confidence))
@@ -1012,11 +1251,9 @@ def Error(filename, linenum, category, confidence, message):
 # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
 _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
     r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
-# Matches strings.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
-# Matches characters.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
-# Matches multi-line C++ comments.
+# Match a single C style comment on the same line.
+_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/'
+# Matches multi-line C style comments.
 # This RE is a little bit more complicated than one might expect, because we
 # have to take care of space removals tools so we can handle comments inside
 # statements better.
@@ -1025,10 +1262,10 @@ _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
 # if this doesn't work we try on left side but only if there's a non-character
 # on the right.
 _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
-    r"""(\s*/\*.*\*/\s*$|
-            /\*.*\*/\s+|
-         \s+/\*.*\*/(?=\W)|
-            /\*.*\*/)""", re.VERBOSE)
+    r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' +
+    _RE_PATTERN_C_COMMENTS + r'\s+|' +
+    r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
+    _RE_PATTERN_C_COMMENTS + r')')
 
 
 def IsCppString(line):
@@ -1083,13 +1320,26 @@ def CleanseRawStrings(raw_lines):
         delimiter = None
       else:
         # Haven't found the end yet, append a blank line.
-        line = ''
+        line = '""'
 
-    else:
+    # Look for beginning of a raw string, and replace them with
+    # empty strings.  This is done in a loop to handle multiple raw
+    # strings on the same line.
+    while delimiter is None:
       # Look for beginning of a raw string.
       # See 2.14.15 [lex.string] for syntax.
-      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
-      if matched:
+      #
+      # Once we have matched a raw string, we check the prefix of the
+      # line to make sure that the line is not part of a single line
+      # comment.  It's done this way because we remove raw strings
+      # before removing comments as opposed to removing comments
+      # before removing raw strings.  This is because there are some
+      # cpplint checks that requires the comments to be preserved, but
+      # we don't want to check comments that are inside raw strings.
+      matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if (matched and
+          not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//',
+                    matched.group(1))):
         delimiter = ')' + matched.group(2) + '"'
 
         end = matched.group(3).find(delimiter)
@@ -1101,6 +1351,8 @@ def CleanseRawStrings(raw_lines):
         else:
           # Start of a multi-line raw string
           line = matched.group(1) + '""'
+      else:
+        break
 
     lines_without_raw_strings.append(line)
 
@@ -1131,10 +1383,10 @@ def FindNextMultiLineCommentEnd(lines, lineix):
 
 def RemoveMultiLineCommentsFromRange(lines, begin, end):
   """Clears a range of lines for multi-line comments."""
-  # Having // dummy comments makes the lines non-empty, so we will not get
+  # Having // <empty> comments makes the lines non-empty, so we will not get
   # unnecessary blank line warnings later in the code.
   for i in range(begin, end):
-    lines[i] = '// dummy'
+    lines[i] = '/**/'
 
 
 def RemoveMultiLineComments(filename, lines, error):
@@ -1170,12 +1422,14 @@ def CleanseComments(line):
 
 
 class CleansedLines(object):
-  """Holds 3 copies of all lines with different preprocessing applied to them.
+  """Holds 4 copies of all lines with different preprocessing applied to them.
 
-  1) elided member contains lines without strings and comments,
-  2) lines member contains lines without comments, and
+  1) elided member contains lines without strings and comments.
+  2) lines member contains lines without comments.
   3) raw_lines member contains all the lines without processing.
-  All these three members are of <type 'list'>, and of the same length.
+  4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw
+     strings removed.
+  All these members are of <type 'list'>, and of the same length.
   """
 
   def __init__(self, lines):
@@ -1206,38 +1460,138 @@ class CleansedLines(object):
     Returns:
       The line with collapsed strings.
     """
-    if not _RE_PATTERN_INCLUDE.match(elided):
-      # Remove escaped characters first to make quote/single quote collapsing
-      # basic.  Things that look like escaped characters shouldn't occur
-      # outside of strings and chars.
-      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
-    return elided
+    if _RE_PATTERN_INCLUDE.match(elided):
+      return elided
+
+    # Remove escaped characters first to make quote/single quote collapsing
+    # basic.  Things that look like escaped characters shouldn't occur
+    # outside of strings and chars.
+    elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+
+    # Replace quoted strings and digit separators.  Both single quotes
+    # and double quotes are processed in the same loop, otherwise
+    # nested quotes wouldn't work.
+    collapsed = ''
+    while True:
+      # Find the first quote character
+      match = Match(r'^([^\'"]*)([\'"])(.*)$', elided)
+      if not match:
+        collapsed += elided
+        break
+      head, quote, tail = match.groups()
+
+      if quote == '"':
+        # Collapse double quoted strings
+        second_quote = tail.find('"')
+        if second_quote >= 0:
+          collapsed += head + '""'
+          elided = tail[second_quote + 1:]
+        else:
+          # Unmatched double quote, don't bother processing the rest
+          # of the line since this is probably a multiline string.
+          collapsed += elided
+          break
+      else:
+        # Found single quote, check nearby text to eliminate digit separators.
+        #
+        # There is no special handling for floating point here, because
+        # the integer/fractional/exponent parts would all be parsed
+        # correctly as long as there are digits on both sides of the
+        # separator.  So we are fine as long as we don't see something
+        # like "0.'3" (gcc 4.9.0 will not allow this literal).
+        if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head):
+          match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail)
+          collapsed += head + match_literal.group(1).replace("'", '')
+          elided = match_literal.group(2)
+        else:
+          second_quote = tail.find('\'')
+          if second_quote >= 0:
+            collapsed += head + "''"
+            elided = tail[second_quote + 1:]
+          else:
+            # Unmatched single quote
+            collapsed += elided
+            break
+
+    return collapsed
 
 
-def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
-  """Find the position just after the matching endchar.
+def FindEndOfExpressionInLine(line, startpos, stack):
+  """Find the position just after the end of current parenthesized expression.
 
   Args:
     line: a CleansedLines line.
     startpos: start searching at this position.
-    depth: nesting level at startpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at startpos.
 
   Returns:
-    On finding matching endchar: (index just after matching endchar, 0)
-    Otherwise: (-1, new depth at end of this line)
+    On finding matching end: (index just after matching end, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at end of this line)
   """
   for i in xrange(startpos, len(line)):
-    if line[i] == startchar:
-      depth += 1
-    elif line[i] == endchar:
-      depth -= 1
-      if depth == 0:
-        return (i + 1, 0)
-  return (-1, depth)
+    char = line[i]
+    if char in '([{':
+      # Found start of parenthesized expression, push to expression stack
+      stack.append(char)
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        if stack and stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (-1, None)
+      elif i > 0 and Search(r'\boperator\s*$', line[0:i]):
+        # operator<, don't add to stack
+        continue
+      else:
+        # Tentative start of template argument list
+        stack.append('<')
+    elif char in ')]}':
+      # Found end of parenthesized expression.
+      #
+      # If we are currently expecting a matching '>', the pending '<'
+      # must have been an operator.  Remove them from expression stack.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((stack[-1] == '(' and char == ')') or
+          (stack[-1] == '[' and char == ']') or
+          (stack[-1] == '{' and char == '}')):
+        stack.pop()
+        if not stack:
+          return (i + 1, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == '>':
+      # Found potential end of template argument list.
+
+      # Ignore "->" and operator functions
+      if (i > 0 and
+          (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))):
+        continue
+
+      # Pop the stack if there is a matching '<'.  Otherwise, ignore
+      # this '>' since it must be an operator.
+      if stack:
+        if stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (i + 1, None)
+    elif char == ';':
+      # Found something that look like end of statements.  If we are currently
+      # expecting a '>', the matching '<' must have been an operator, since
+      # template argument list should not contain statements.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+  # Did not find end of expression or unbalanced parentheses on this line
+  return (-1, stack)
 
 
 def CloseExpression(clean_lines, linenum, pos):
@@ -1246,6 +1600,11 @@ def CloseExpression(clean_lines, linenum, pos):
   If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
   linenum/pos that correspond to the closing of the expression.
 
+  TODO(unknown): cpplint spends a fair bit of time matching parentheses.
+  Ideally we would want to index all opening and closing parentheses once
+  and have CloseExpression be just a simple lookup, but due to preprocessor
+  tricks, this is not so easy.
+
   Args:
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
@@ -1259,35 +1618,28 @@ def CloseExpression(clean_lines, linenum, pos):
   """
 
   line = clean_lines.elided[linenum]
-  startchar = line[pos]
-  if startchar not in '({[<':
+  if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]):
     return (line, clean_lines.NumLines(), -1)
-  if startchar == '(': endchar = ')'
-  if startchar == '[': endchar = ']'
-  if startchar == '{': endchar = '}'
-  if startchar == '<': endchar = '>'
 
   # Check first line
-  (end_pos, num_open) = FindEndOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (end_pos, stack) = FindEndOfExpressionInLine(line, pos, [])
   if end_pos > -1:
     return (line, linenum, end_pos)
 
   # Continue scanning forward
-  while linenum < clean_lines.NumLines() - 1:
+  while stack and linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    (end_pos, num_open) = FindEndOfExpressionInLine(
-        line, 0, num_open, startchar, endchar)
+    (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack)
     if end_pos > -1:
       return (line, linenum, end_pos)
 
-  # Did not find endchar before end of file, give up
+  # Did not find end of expression before end of file, give up
   return (line, clean_lines.NumLines(), -1)
 
 
-def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
-  """Find position at the matching startchar.
+def FindStartOfExpressionInLine(line, endpos, stack):
+  """Find position at the matching start of current expression.
 
   This is almost the reverse of FindEndOfExpressionInLine, but note
   that the input position and returned position differs by 1.
@@ -1295,22 +1647,72 @@ def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
   Args:
     line: a CleansedLines line.
     endpos: start searching at this position.
-    depth: nesting level at endpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at endpos.
 
   Returns:
-    On finding matching startchar: (index at matching startchar, 0)
-    Otherwise: (-1, new depth at beginning of this line)
+    On finding matching start: (index at matching start, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at beginning of this line)
   """
-  for i in xrange(endpos, -1, -1):
-    if line[i] == endchar:
-      depth += 1
-    elif line[i] == startchar:
-      depth -= 1
-      if depth == 0:
-        return (i, 0)
-  return (-1, depth)
+  i = endpos
+  while i >= 0:
+    char = line[i]
+    if char in ')]}':
+      # Found end of expression, push to expression stack
+      stack.append(char)
+    elif char == '>':
+      # Found potential end of template argument list.
+      #
+      # Ignore it if it's a "->" or ">=" or "operator>"
+      if (i > 0 and
+          (line[i - 1] == '-' or
+           Match(r'\s>=\s', line[i - 1:]) or
+           Search(r'\boperator\s*$', line[0:i]))):
+        i -= 1
+      else:
+        stack.append('>')
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        i -= 1
+      else:
+        # If there is a matching '>', we can pop the expression stack.
+        # Otherwise, ignore this '<' since it must be an operator.
+        if stack and stack[-1] == '>':
+          stack.pop()
+          if not stack:
+            return (i, None)
+    elif char in '([{':
+      # Found start of expression.
+      #
+      # If there are any unmatched '>' on the stack, they must be
+      # operators.  Remove those.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((char == '(' and stack[-1] == ')') or
+          (char == '[' and stack[-1] == ']') or
+          (char == '{' and stack[-1] == '}')):
+        stack.pop()
+        if not stack:
+          return (i, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == ';':
+      # Found something that look like end of statements.  If we are currently
+      # expecting a '<', the matching '>' must have been an operator, since
+      # template argument list should not contain statements.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+    i -= 1
+
+  return (-1, stack)
 
 
 def ReverseCloseExpression(clean_lines, linenum, pos):
@@ -1331,30 +1733,23 @@ def ReverseCloseExpression(clean_lines, linenum, pos):
     return is the 'cleansed' line at linenum.
   """
   line = clean_lines.elided[linenum]
-  endchar = line[pos]
-  if endchar not in ')}]>':
+  if line[pos] not in ')}]>':
     return (line, 0, -1)
-  if endchar == ')': startchar = '('
-  if endchar == ']': startchar = '['
-  if endchar == '}': startchar = '{'
-  if endchar == '>': startchar = '<'
 
   # Check last line
-  (start_pos, num_open) = FindStartOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
   if start_pos > -1:
     return (line, linenum, start_pos)
 
   # Continue scanning backward
-  while linenum > 0:
+  while stack and linenum > 0:
     linenum -= 1
     line = clean_lines.elided[linenum]
-    (start_pos, num_open) = FindStartOfExpressionInLine(
-        line, len(line) - 1, num_open, startchar, endchar)
+    (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack)
     if start_pos > -1:
       return (line, linenum, start_pos)
 
-  # Did not find startchar before beginning of file, give up
+  # Did not find start of expression before beginning of file, give up
   return (line, 0, -1)
 
 
@@ -1362,7 +1757,7 @@ def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
 
   # We'll say it should occur by line 10. Don't forget there's a
-  # dummy line at the front.
+  # placeholder line at the front.
   for line in xrange(1, min(len(lines), 11)):
     if re.search(r'Copyright', lines[line], re.I): break
   else:                       # means no copyright line was found
@@ -1371,6 +1766,46 @@ def CheckForCopyright(filename, lines, error):
           'You should have a line: "Copyright [year] <Copyright Owner>"')
 
 
+def GetIndentLevel(line):
+  """Return the number of leading spaces in line.
+
+  Args:
+    line: A string to check.
+
+  Returns:
+    An integer count of leading spaces, possibly zero.
+  """
+  indent = Match(r'^( *)\S', line)
+  if indent:
+    return len(indent.group(1))
+  else:
+    return 0
+
+def PathSplitToList(path):
+  """Returns the path split into a list by the separator.
+
+  Args:
+    path: An absolute or relative path (e.g. '/a/b/c/' or '../a')
+
+  Returns:
+    A list of path components (e.g. ['a', 'b', 'c]).
+  """
+  lst = []
+  while True:
+    (head, tail) = os.path.split(path)
+    if head == path: # absolute paths end
+      lst.append(head)
+      break
+    if tail == path: # relative paths end
+      lst.append(tail)
+      break
+
+    path = head
+    lst.append(tail)
+
+  lst.reverse()
+  return lst
+
 def GetHeaderGuardCPPVariable(filename):
   """Returns the CPP variable that should be used as a header guard.
 
@@ -1387,15 +1822,67 @@ def GetHeaderGuardCPPVariable(filename):
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
   filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+  # Replace 'c++' with 'cpp'.
+  filename = filename.replace('C++', 'cpp').replace('c++', 'cpp')
 
   fileinfo = FileInfo(filename)
   file_path_from_root = fileinfo.RepositoryName()
-  if _root:
-    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
-  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+  def FixupPathFromRoot():
+    if _root_debug:
+      sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n"
+          %(_root, fileinfo.RepositoryName()))
+
+    # Process the file path with the --root flag if it was set.
+    if not _root:
+      if _root_debug:
+        sys.stderr.write("_root unspecified\n")
+      return file_path_from_root
+
+    def StripListPrefix(lst, prefix):
+      # f(['x', 'y'], ['w, z']) -> None  (not a valid prefix)
+      if lst[:len(prefix)] != prefix:
+        return None
+      # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd']
+      return lst[(len(prefix)):]
+
+    # root behavior:
+    #   --root=subdir , lstrips subdir from the header guard
+    maybe_path = StripListPrefix(PathSplitToList(file_path_from_root),
+                                 PathSplitToList(_root))
+
+    if _root_debug:
+      sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," +
+          " _root=%s)\n") %(maybe_path, file_path_from_root, _root))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    #   --root=.. , will prepend the outer directory to the header guard
+    full_path = fileinfo.FullName()
+    root_abspath = os.path.abspath(_root)
+
+    maybe_path = StripListPrefix(PathSplitToList(full_path),
+                                 PathSplitToList(root_abspath))
+
+    if _root_debug:
+      sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " +
+          "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    if _root_debug:
+      sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root))
+
+    #   --root=FAKE_DIR is ignored
+    return file_path_from_root
+
+  file_path_from_root = FixupPathFromRoot()
+  return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_'
 
 
-def CheckForHeaderGuard(filename, lines, error):
+def CheckForHeaderGuard(filename, clean_lines, error):
   """Checks that the file contains a header guard.
 
   Logs an error if no #ifndef header guard is present.  For other
@@ -1403,18 +1890,29 @@ def CheckForHeaderGuard(filename, lines, error):
 
   Args:
     filename: The name of the C++ header file.
-    lines: An array of strings, each representing a line of the file.
+    clean_lines: A CleansedLines instance containing the file.
     error: The function to call with any errors found.
   """
 
+  # Don't check for header guards if there are error suppression
+  # comments somewhere in this file.
+  #
+  # Because this is silencing a warning for a nonexistent line, we
+  # only support the very specific NOLINT(build/header_guard) syntax,
+  # and not the general NOLINT or NOLINT(*) syntax.
+  raw_lines = clean_lines.lines_without_raw_strings
+  for i in raw_lines:
+    if Search(r'//\s*NOLINT\(build/header_guard\)', i):
+      return
+
   cppvar = GetHeaderGuardCPPVariable(filename)
 
-  ifndef = None
+  ifndef = ''
   ifndef_linenum = 0
-  define = None
-  endif = None
+  define = ''
+  endif = ''
   endif_linenum = 0
-  for linenum, line in enumerate(lines):
+  for linenum, line in enumerate(raw_lines):
     linesplit = line.split()
     if len(linesplit) >= 2:
       # find the first occurrence of #ifndef and #define, save arg
@@ -1429,18 +1927,12 @@ def CheckForHeaderGuard(filename, lines, error):
       endif = line
       endif_linenum = linenum
 
-  if not ifndef:
+  if not ifndef or not define or ifndef != define:
     error(filename, 0, 'build/header_guard', 5,
           'No #ifndef header guard found, suggested CPP variable is: %s' %
           cppvar)
     return
 
-  if not define:
-    error(filename, 0, 'build/header_guard', 5,
-          'No #define header guard found, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
   # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
   # for backward compatibility.
   if ifndef != cppvar:
@@ -1448,26 +1940,69 @@ def CheckForHeaderGuard(filename, lines, error):
     if ifndef != cppvar + '_':
       error_level = 5
 
-    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
+    ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum,
                             error)
     error(filename, ifndef_linenum, 'build/header_guard', error_level,
           '#ifndef header guard has wrong style, please use: %s' % cppvar)
 
-  if define != ifndef:
-    error(filename, 0, 'build/header_guard', 5,
-          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
-          cppvar)
+  # Check for "//" comments on endif line.
+  ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum,
+                          error)
+  match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif)
+  if match:
+    if match.group(1) == '_':
+      # Issue low severity warning for deprecated double trailing underscore
+      error(filename, endif_linenum, 'build/header_guard', 0,
+            '#endif line should be "#endif  // %s"' % cppvar)
     return
 
-  if endif != ('#endif  // %s' % cppvar):
-    error_level = 0
-    if endif != ('#endif  // %s' % (cppvar + '_')):
-      error_level = 5
+  # Didn't find the corresponding "//" comment.  If this file does not
+  # contain any "//" comments at all, it could be that the compiler
+  # only wants "/**/" comments, look for those instead.
+  no_single_line_comments = True
+  for i in xrange(1, len(raw_lines) - 1):
+    line = raw_lines[i]
+    if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line):
+      no_single_line_comments = False
+      break
 
-    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
-                            error)
-    error(filename, endif_linenum, 'build/header_guard', error_level,
-          '#endif line should be "#endif  // %s"' % cppvar)
+  if no_single_line_comments:
+    match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif)
+    if match:
+      if match.group(1) == '_':
+        # Low severity warning for double trailing underscore
+        error(filename, endif_linenum, 'build/header_guard', 0,
+              '#endif line should be "#endif  /* %s */"' % cppvar)
+      return
+
+  # Didn't find anything
+  error(filename, endif_linenum, 'build/header_guard', 5,
+        '#endif line should be "#endif  // %s"' % cppvar)
+
+
+def CheckHeaderFileIncluded(filename, include_state, error):
+  """Logs an error if a .cc file does not include its header."""
+
+  # Do not check test files
+  fileinfo = FileInfo(filename)
+  if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()):
+    return
+
+  headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h'
+  if not os.path.exists(headerfile):
+    return
+  headername = FileInfo(headerfile).RepositoryName()
+  first_include = 0
+  for section_list in include_state.include_list:
+    for f in section_list:
+      if headername in f[0] or f[0] in headername:
+        return
+      if not first_include:
+        first_include = f[1]
+
+  error(filename, first_include, 'build/include', 5,
+        '%s should include its header file %s' % (fileinfo.RepositoryName(),
+                                                  headername))
 
 
 def CheckForBadCharacters(filename, lines, error):
@@ -1551,19 +2086,33 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
           'Use C++11 raw strings or concatenation instead.')
 
 
-threading_list = (
-    ('asctime(', 'asctime_r('),
-    ('ctime(', 'ctime_r('),
-    ('getgrgid(', 'getgrgid_r('),
-    ('getgrnam(', 'getgrnam_r('),
-    ('getlogin(', 'getlogin_r('),
-    ('getpwnam(', 'getpwnam_r('),
-    ('getpwuid(', 'getpwuid_r('),
-    ('gmtime(', 'gmtime_r('),
-    ('localtime(', 'localtime_r('),
-    ('rand(', 'rand_r('),
-    ('strtok(', 'strtok_r('),
-    ('ttyname(', 'ttyname_r('),
+# (non-threadsafe name, thread-safe alternative, validation pattern)
+#
+# The validation pattern is used to eliminate false positives such as:
+#  _rand();               // false positive due to substring match.
+#  ->rand();              // some member function rand().
+#  ACMRandom rand(seed);  // some variable named rand.
+#  ISAACRandom rand();    // another variable named rand.
+#
+# Basically we require the return value of these functions to be used
+# in some expression context on the same line by matching on some
+# operator before the function name.  This eliminates constructors and
+# member function calls.
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
+_THREADING_LIST = (
+    ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
+    ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
+    ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
+    ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
+    ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
+    ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
+    ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
+    ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
+    ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
+    ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
+    ('strtok(', 'strtok_r(',
+     _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
+    ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'),
     )
 
 
@@ -1583,14 +2132,13 @@ def CheckPosixThreading(filename, clean_lines, linenum, error):
     error: The function to call with any errors found.
   """
   line = clean_lines.elided[linenum]
-  for single_thread_function, multithread_safe_function in threading_list:
-    ix = line.find(single_thread_function)
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
-                                line[ix - 1] not in ('_', '.', '>'))):
+  for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST:
+    # Additional pattern matching check to confirm that this is the
+    # function we are looking for
+    if Search(pattern, line):
       error(filename, linenum, 'runtime/threadsafe_fn', 2,
-            'Consider using ' + multithread_safe_function +
-            '...) instead of ' + single_thread_function +
+            'Consider using ' + multithread_safe_func +
+            '...) instead of ' + single_thread_func +
             '...) for improved thread safety.')
 
 
@@ -1612,7 +2160,6 @@ def CheckVlogArguments(filename, clean_lines, linenum, error):
           'VLOG() should be used with numeric verbosity level.  '
           'Use LOG() if you want symbolic severity levels.')
 
-
 # Matches invalid increment: *count++, which moves pointer instead of
 # incrementing a value.
 _RE_PATTERN_INVALID_INCREMENT = re.compile(
@@ -1641,13 +2188,29 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error):
           'Changing pointer instead of value (or unused value of operator*).')
 
 
+def IsMacroDefinition(clean_lines, linenum):
+  if Search(r'^#define', clean_lines[linenum]):
+    return True
+
+  if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]):
+    return True
+
+  return False
+
+
+def IsForwardClassDeclaration(clean_lines, linenum):
+  return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum])
+
+
 class _BlockInfo(object):
   """Stores information about a generic block of code."""
 
-  def __init__(self, seen_open_brace):
+  def __init__(self, linenum, seen_open_brace):
+    self.starting_linenum = linenum
     self.seen_open_brace = seen_open_brace
     self.open_parentheses = 0
     self.inline_asm = _NO_ASM
+    self.check_namespace_indentation = False
 
   def CheckBegin(self, filename, clean_lines, linenum, error):
     """Run checks that applies to text up to the opening brace.
@@ -1677,15 +2240,33 @@ class _BlockInfo(object):
     """
     pass
 
+  def IsBlockInfo(self):
+    """Returns true if this block is a _BlockInfo.
+
+    This is convenient for verifying that an object is an instance of
+    a _BlockInfo, but not an instance of any of the derived classes.
+
+    Returns:
+      True for this class, False for derived classes.
+    """
+    return self.__class__ == _BlockInfo
+
+
+class _ExternCInfo(_BlockInfo):
+  """Stores information about an 'extern "C"' block."""
+
+  def __init__(self, linenum):
+    _BlockInfo.__init__(self, linenum, True)
+
 
 class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
   def __init__(self, name, class_or_struct, clean_lines, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name
-    self.starting_linenum = linenum
     self.is_derived = False
+    self.check_namespace_indentation = True
     if class_or_struct == 'struct':
       self.access = 'public'
       self.is_struct = True
@@ -1695,11 +2276,7 @@ class _ClassInfo(_BlockInfo):
 
     # Remember initial indentation level for this class.  Using raw_lines here
     # instead of elided to account for leading comments.
-    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
-    if initial_indent:
-      self.class_indent = len(initial_indent.group(1))
-    else:
-      self.class_indent = 0
+    self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum])
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1721,6 +2298,23 @@ class _ClassInfo(_BlockInfo):
       self.is_derived = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
+    # If there is a DISALLOW macro, it should appear near the end of
+    # the class.
+    seen_last_thing_in_class = False
+    for i in xrange(linenum - 1, self.starting_linenum, -1):
+      match = Search(
+          r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' +
+          self.name + r'\)',
+          clean_lines.elided[i])
+      if match:
+        if seen_last_thing_in_class:
+          error(filename, i, 'readability/constructors', 3,
+                match.group(1) + ' should be the last thing in the class')
+        break
+
+      if not Match(r'^\s*$', clean_lines.elided[i]):
+        seen_last_thing_in_class = True
+
     # Check that closing brace is aligned with beginning of the class.
     # Only do this if the closing brace is indented by only whitespaces.
     # This means we will not check single-line class definitions.
@@ -1738,9 +2332,9 @@ class _NamespaceInfo(_BlockInfo):
   """Stores information about a namespace."""
 
   def __init__(self, name, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name or ''
-    self.starting_linenum = linenum
+    self.check_namespace_indentation = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
     """Check end of namespace comments."""
@@ -1758,7 +2352,7 @@ class _NamespaceInfo(_BlockInfo):
     # deciding what these nontrivial things are, so this check is
     # triggered by namespace size only, which works most of the time.
     if (linenum - self.starting_linenum < 10
-        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+        and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)):
       return
 
     # Look for matching comment at end of namespace.
@@ -1775,17 +2369,24 @@ class _NamespaceInfo(_BlockInfo):
     # expected namespace.
     if self.name:
       # Named namespace
-      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
-                    r'[\*/\.\\\s]*$'),
+      if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' +
+                    re.escape(self.name) + r'[\*/\.\\\s]*$'),
                    line):
         error(filename, linenum, 'readability/namespace', 5,
               'Namespace should be terminated with "// namespace %s"' %
               self.name)
     else:
       # Anonymous namespace
-      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
-        error(filename, linenum, 'readability/namespace', 5,
-              'Namespace should be terminated with "// namespace"')
+      if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        # If "// namespace anonymous" or "// anonymous namespace (more text)",
+        # mention "// anonymous namespace" as an acceptable form
+        if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line):
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"'
+                ' or "// anonymous namespace"')
+        else:
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"')
 
 
 class _PreprocessorInfo(object):
@@ -1802,7 +2403,7 @@ class _PreprocessorInfo(object):
     self.seen_else = False
 
 
-class _NestingState(object):
+class NestingState(object):
   """Holds states related to parsing braces."""
 
   def __init__(self):
@@ -1814,6 +2415,17 @@ class _NestingState(object):
     # - _BlockInfo: some other type of block.
     self.stack = []
 
+    # Top of the previous stack before each Update().
+    #
+    # Because the nesting_stack is updated at the end of each line, we
+    # had to do some convoluted checks to find out what is the current
+    # scope at the beginning of the line.  This check is simplified by
+    # saving the previous top of nesting stack.
+    #
+    # We could save the full stack, but we only need the top.  Copying
+    # the full nesting stack would slow down cpplint by ~10%.
+    self.previous_stack_top = []
+
     # Stack of _PreprocessorInfo objects.
     self.pp_stack = []
 
@@ -1834,6 +2446,82 @@ class _NestingState(object):
     """
     return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
 
+  def InExternC(self):
+    """Check if we are currently one level inside an 'extern "C"' block.
+
+    Returns:
+      True if top of the stack is an extern block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ExternCInfo)
+
+  def InClassDeclaration(self):
+    """Check if we are currently one level inside a class or struct declaration.
+
+    Returns:
+      True if top of the stack is a class/struct, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ClassInfo)
+
+  def InAsmBlock(self):
+    """Check if we are currently one level inside an inline ASM block.
+
+    Returns:
+      True if the top of the stack is a block containing inline ASM.
+    """
+    return self.stack and self.stack[-1].inline_asm != _NO_ASM
+
+  def InTemplateArgumentList(self, clean_lines, linenum, pos):
+    """Check if current position is inside template argument list.
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      pos: position just after the suspected template argument.
+    Returns:
+      True if (linenum, pos) is inside template arguments.
+    """
+    while linenum < clean_lines.NumLines():
+      # Find the earliest character that might indicate a template argument
+      line = clean_lines.elided[linenum]
+      match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
+      if not match:
+        linenum += 1
+        pos = 0
+        continue
+      token = match.group(1)
+      pos += len(match.group(0))
+
+      # These things do not look like template argument list:
+      #   class Suspect {
+      #   class Suspect x; }
+      if token in ('{', '}', ';'): return False
+
+      # These things look like template argument list:
+      #   template <class Suspect>
+      #   template <class Suspect = default_value>
+      #   template <class Suspect[]>
+      #   template <class Suspect...>
+      if token in ('>', '=', '[', ']', '.'): return True
+
+      # Check if token is an unmatched '<'.
+      # If not, move on to the next character.
+      if token != '<':
+        pos += 1
+        if pos >= len(line):
+          linenum += 1
+          pos = 0
+        continue
+
+      # We can't be sure if we just find a single '<', and need to
+      # find the matching '>'.
+      (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1)
+      if end_pos < 0:
+        # Not sure if template argument list or syntax error in file
+        return False
+      linenum = end_line
+      pos = end_pos
+    return False
+
   def UpdatePreprocessor(self, line):
     """Update preprocessor stack.
 
@@ -1890,6 +2578,7 @@ class _NestingState(object):
         # TODO(unknown): unexpected #endif, issue warning?
         pass
 
+  # TODO(unknown): Update() is too long, but we will refactor later.
   def Update(self, filename, clean_lines, linenum, error):
     """Update nesting state with current line.
 
@@ -1901,7 +2590,17 @@ class _NestingState(object):
     """
     line = clean_lines.elided[linenum]
 
-    # Update pp_stack first
+    # Remember top of the previous nesting stack.
+    #
+    # The stack is always pushed/popped and not modified in place, so
+    # we can just do a shallow copy instead of copy.deepcopy.  Using
+    # deepcopy would slow down cpplint by ~28%.
+    if self.stack:
+      self.previous_stack_top = self.stack[-1]
+    else:
+      self.previous_stack_top = None
+
+    # Update pp_stack
     self.UpdatePreprocessor(line)
 
     # Count parentheses.  This is to avoid adding struct arguments to
@@ -1952,32 +2651,27 @@ class _NestingState(object):
     # such as in:
     #   class LOCKABLE API Object {
     #   };
-    #
-    # Templates with class arguments may confuse the parser, for example:
-    #   template <class T
-    #             class Comparator = less<T>,
-    #             class Vector = vector<T> >
-    #   class HeapQueue {
-    #
-    # Because this parser has no nesting state about templates, by the
-    # time it saw "class Comparator", it may think that it's a new class.
-    # Nested templates have a similar problem:
-    #   template <
-    #       typename ExportedType,
-    #       typename TupleType,
-    #       template <typename, typename> class ImplTemplate>
-    #
-    # To avoid these cases, we ignore classes that are followed by '=' or '>'
     class_decl_match = Match(
-        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
-        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+        r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
+        r'(.*)$', line)
     if (class_decl_match and
         (not self.stack or self.stack[-1].open_parentheses == 0)):
-      self.stack.append(_ClassInfo(
-          class_decl_match.group(4), class_decl_match.group(2),
-          clean_lines, linenum))
-      line = class_decl_match.group(5)
+      # We do not want to accept classes that are actually template arguments:
+      #   template <class Ignore1,
+      #             class Ignore2 = Default<Args>,
+      #             template <Args> class Ignore3>
+      #   void Function() {};
+      #
+      # To avoid template argument cases, we scan forward and look for
+      # an unmatched '>'.  If we see one, assume we are inside a
+      # template argument list.
+      end_declaration = len(class_decl_match.group(1))
+      if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration):
+        self.stack.append(_ClassInfo(
+            class_decl_match.group(3), class_decl_match.group(2),
+            clean_lines, linenum))
+        line = class_decl_match.group(4)
 
     # If we have not yet seen the opening brace for the innermost block,
     # run checks here.
@@ -2024,10 +2718,13 @@ class _NestingState(object):
         # stack otherwise.
         if not self.SeenOpenBrace():
           self.stack[-1].seen_open_brace = True
+        elif Match(r'^extern\s*"[^"]*"\s*\{', line):
+          self.stack.append(_ExternCInfo(linenum))
         else:
-          self.stack.append(_BlockInfo(True))
+          self.stack.append(_BlockInfo(linenum, True))
           if _MATCH_ASM.match(line):
             self.stack[-1].inline_asm = _BLOCK_ASM
+
       elif token == ';' or token == ')':
         # If we haven't seen an opening brace yet, but we already saw
         # a semicolon, this is probably a forward declaration.  Pop
@@ -2103,7 +2800,7 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -2136,7 +2833,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
             r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
-          'Storage class (static, extern, typedef, etc) should be first.')
+          'Storage-class specifier (static, extern, typedef, etc) should be '
+          'at the beginning of the declaration.')
 
   if Match(r'\s*#\s*endif\s*[^/\s]+', line):
     error(filename, linenum, 'build/endif_comment', 5,
@@ -2176,26 +2874,79 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum,
 
   # Look for single-argument constructors that aren't marked explicit.
   # Technically a valid construct, but against style.
-  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
-               % re.escape(base_classname),
-               line)
-  if (args and
-      args.group(1) != 'void' and
-      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
-                % re.escape(base_classname), args.group(1).strip())):
-    error(filename, linenum, 'runtime/explicit', 5,
-          'Single-argument constructors should be marked explicit.')
+  explicit_constructor_match = Match(
+      r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?'
+      r'(?:(?:inline|constexpr)\s+)*%s\s*'
+      r'\(((?:[^()]|\([^()]*\))*)\)'
+      % re.escape(base_classname),
+      line)
+
+  if explicit_constructor_match:
+    is_marked_explicit = explicit_constructor_match.group(1)
+
+    if not explicit_constructor_match.group(2):
+      constructor_args = []
+    else:
+      constructor_args = explicit_constructor_match.group(2).split(',')
+
+    # collapse arguments so that commas in template parameter lists and function
+    # argument parameter lists don't split arguments in two
+    i = 0
+    while i < len(constructor_args):
+      constructor_arg = constructor_args[i]
+      while (constructor_arg.count('<') > constructor_arg.count('>') or
+             constructor_arg.count('(') > constructor_arg.count(')')):
+        constructor_arg += ',' + constructor_args[i + 1]
+        del constructor_args[i + 1]
+      constructor_args[i] = constructor_arg
+      i += 1
+
+    defaulted_args = [arg for arg in constructor_args if '=' in arg]
+    noarg_constructor = (not constructor_args or  # empty arg list
+                         # 'void' arg specifier
+                         (len(constructor_args) == 1 and
+                          constructor_args[0].strip() == 'void'))
+    onearg_constructor = ((len(constructor_args) == 1 and  # exactly one arg
+                           not noarg_constructor) or
+                          # all but at most one arg defaulted
+                          (len(constructor_args) >= 1 and
+                           not noarg_constructor and
+                           len(defaulted_args) >= len(constructor_args) - 1))
+    initializer_list_constructor = bool(
+        onearg_constructor and
+        Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
+    copy_constructor = bool(
+        onearg_constructor and
+        Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&'
+              % re.escape(base_classname), constructor_args[0].strip()))
+
+    if (not is_marked_explicit and
+        onearg_constructor and
+        not initializer_list_constructor and
+        not copy_constructor):
+      if defaulted_args:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Constructors callable with one argument '
+              'should be marked explicit.')
+      else:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Single-parameter constructors should be marked explicit.')
+    elif is_marked_explicit and not onearg_constructor:
+      if noarg_constructor:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Zero-parameter constructors should not be marked explicit.')
 
 
-def CheckSpacingForFunctionCall(filename, line, linenum, error):
+def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
   Args:
     filename: The name of the current file.
-    line: The text of the line to check.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     error: The function to call with any errors found.
   """
+  line = clean_lines.elided[linenum]
 
   # Since function calls often occur inside if/for/while/switch
   # expressions - which have their own, more liberal conventions - we
@@ -2238,10 +2989,18 @@ def CheckSpacingForFunctionCall(filename, line, linenum, error):
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall) and
-        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
-      error(filename, linenum, 'whitespace/parens', 4,
-            'Extra space before ( in function call')
+        not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and
+        not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and
+        not Search(r'\bcase\s+\(', fncall)):
+      # TODO(unknown): Space after an operator function seem to be a common
+      # error, silence those for now by restricting them to highest verbosity.
+      if Search(r'\boperator_*\b', line):
+        error(filename, linenum, 'whitespace/parens', 0,
+              'Extra space before ( in function call')
+      else:
+        error(filename, linenum, 'whitespace/parens', 4,
+              'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
     # part of a control statement (if/while/etc), and don't complain
     if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
@@ -2270,12 +3029,26 @@ def IsBlankLine(line):
   return not line or line.isspace()
 
 
+def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                                 error):
+  is_namespace_indent_item = (
+      len(nesting_state.stack) > 1 and
+      nesting_state.stack[-1].check_namespace_indentation and
+      isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and
+      nesting_state.previous_stack_top == nesting_state.stack[-2])
+
+  if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                     clean_lines.elided, line):
+    CheckItemIndentationInNamespace(filename, clean_lines.elided,
+                                    line, error)
+
+
 def CheckForFunctionLengths(filename, clean_lines, linenum,
                             function_state, error):
   """Reports for long function bodies.
 
   For an overview why this is done, see:
-  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+  https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
 
   Uses a simplistic algorithm assuming other style guidelines
   (especially spacing) are followed.
@@ -2295,8 +3068,6 @@ def CheckForFunctionLengths(filename, clean_lines, linenum,
   """
   lines = clean_lines.lines
   line = lines[linenum]
-  raw = clean_lines.raw_lines
-  raw_line = raw[linenum]
   joined_line = ''
 
   starting_func = False
@@ -2343,190 +3114,58 @@ def CheckForFunctionLengths(filename, clean_lines, linenum,
 _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
 
 
-def CheckComment(comment, filename, linenum, error):
-  """Checks for common mistakes in TODO comments.
+def CheckComment(line, filename, linenum, next_line_start, error):
+  """Checks for common mistakes in comments.
 
   Args:
-    comment: The text of the comment from the line in question.
+    line: The line in question.
     filename: The name of the current file.
     linenum: The number of the line to check.
+    next_line_start: The first non-whitespace column of the next line.
     error: The function to call with any errors found.
   """
-  match = _RE_PATTERN_TODO.match(comment)
-  if match:
-    # One whitespace is correct; zero whitespace is handled elsewhere.
-    leading_whitespace = match.group(1)
-    if len(leading_whitespace) > 1:
-      error(filename, linenum, 'whitespace/todo', 2,
-            'Too many spaces before TODO')
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0:
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
 
-    username = match.group(2)
-    if not username:
-      error(filename, linenum, 'readability/todo', 2,
-            'Missing username in TODO; it should look like '
-            '"// TODO(my_username): Stuff."')
+      # Checks for common mistakes in TODO comments.
+      comment = line[commentpos:]
+      match = _RE_PATTERN_TODO.match(comment)
+      if match:
+        # One whitespace is correct; zero whitespace is handled elsewhere.
+        leading_whitespace = match.group(1)
+        if len(leading_whitespace) > 1:
+          error(filename, linenum, 'whitespace/todo', 2,
+                'Too many spaces before TODO')
 
-    middle_whitespace = match.group(3)
-    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
-    if middle_whitespace != ' ' and middle_whitespace != '':
-      error(filename, linenum, 'whitespace/todo', 2,
-            'TODO(my_username) should be followed by a space')
+        username = match.group(2)
+        if not username:
+          error(filename, linenum, 'readability/todo', 2,
+                'Missing username in TODO; it should look like '
+                '"// TODO(my_username): Stuff."')
 
-def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
-  """Checks for improper use of DISALLOW* macros.
+        middle_whitespace = match.group(3)
+        # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+        if middle_whitespace != ' ' and middle_whitespace != '':
+          error(filename, linenum, 'whitespace/todo', 2,
+                'TODO(my_username) should be followed by a space')
 
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                   r'DISALLOW_EVIL_CONSTRUCTORS|'
-                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-  if not matched:
-    return
-  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-    if nesting_state.stack[-1].access != 'private':
-      error(filename, linenum, 'readability/constructors', 3,
-            '%s must be in the private: section' % matched.group(1))
-
-  else:
-    # Found DISALLOW* macro outside a class declaration, or perhaps it
-    # was used inside a function when it should have been part of the
-    # class declaration.  We could issue a warning here, but it
-    # probably resulted in a compiler error already.
-    pass
-
-
-def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
-  """Find the corresponding > to close a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_suffix: Remainder of the current line after the initial <.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_suffix
-  nesting_stack = ['<']
-  while True:
-    # Find the next operator that can tell us whether < is used as an
-    # opening bracket or as a less-than operator.  We only want to
-    # warn on the latter case.
-    #
-    # We could also check all other operators and terminate the search
-    # early, e.g. if we got something like this "a<b+c", the "<" is
-    # most likely a less-than operator, but then we will get false
-    # positives for default arguments and other template expressions.
-    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(1)
-      line = match.group(2)
-
-      if nesting_stack[-1] == '<':
-        # Expecting closing angle bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator == '>':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma after a bracket, this is most likely a template
-          # argument.  We have not seen a closing angle bracket yet, but
-          # it's probably a few lines later if we look for it, so just
-          # return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting closing parenthesis or closing bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator in (')', ']'):
-          # We don't bother checking for matching () or [].  If we got
-          # something like (] or [), it would have been a syntax error.
-          nesting_stack.pop()
-
-    else:
-      # Scan the next line
-      linenum += 1
-      if linenum >= len(clean_lines.elided):
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all remaining lines and still no matching angle bracket.
-  # Most likely the input was incomplete, otherwise we should have
-  # seen a semicolon and returned early.
-  return True
-
-
-def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
-  """Find the corresponding < that started a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_prefix: Part of the current line before the initial >.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_prefix
-  nesting_stack = ['>']
-  while True:
-    # Find the previous operator
-    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(2)
-      line = match.group(1)
-
-      if nesting_stack[-1] == '>':
-        # Expecting opening angle bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator == '<':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma before a bracket, this is most likely a
-          # template argument.  The opening angle bracket is probably
-          # there if we look for it, so just return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting opening parenthesis or opening bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator in ('(', '['):
-          nesting_stack.pop()
-
-    else:
-      # Scan the previous line
-      linenum -= 1
-      if linenum < 0:
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all earlier lines and still no matching angle bracket.
-  return False
+      # If the comment contains an alphanumeric character, there
+      # should be a space somewhere between it and the // unless
+      # it's a /// or //! Doxygen comment.
+      if (Match(r'//[^ ]*\w', comment) and
+          not Match(r'(///|//\!)(\s+|$)', comment)):
+        error(filename, linenum, 'whitespace/comments', 4,
+              'Should have a space between // and comment')
 
 
 def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
@@ -2542,7 +3181,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -2565,7 +3204,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #   }
   #
   # A warning about missing end of namespace comments will be issued instead.
-  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+  #
+  # Also skip blank line checks for 'extern "C"' blocks, which are formatted
+  # like namespaces.
+  if (IsBlankLine(line) and
+      not nesting_state.InNamespaceBody() and
+      not nesting_state.InExternC()):
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -2628,54 +3272,64 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
       error(filename, linenum, 'whitespace/blank_line', 3,
             'Do not leave a blank line after "%s:"' % matched.group(1))
 
-  # Next, we complain if there's a comment too near the text
-  commentpos = line.find('//')
-  if commentpos != -1:
-    # Check if the // may be in quotes.  If so, ignore it
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if (line.count('"', 0, commentpos) -
-        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
-      # Allow one space for new scopes, two spaces otherwise:
-      if (not Match(r'^\s*{ //', line) and
-          ((commentpos >= 1 and
-            line[commentpos-1] not in string.whitespace) or
-           (commentpos >= 2 and
-            line[commentpos-2] not in string.whitespace))):
-        error(filename, linenum, 'whitespace/comments', 2,
-              'At least two spaces is best between code and comments')
-      # There should always be a space between the // and the comment
-      commentend = commentpos + 2
-      if commentend < len(line) and not line[commentend] == ' ':
-        # but some lines are exceptions -- e.g. if they're big
-        # comment delimiters like:
-        # //----------------------------------------------------------
-        # or are an empty C++ style Doxygen comment, like:
-        # ///
-        # or C++ style Doxygen comments placed after the variable:
-        # ///<  Header comment
-        # //!<  Header comment
-        # or they begin with multiple slashes followed by a space:
-        # //////// Header comment
-        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
-                 Search(r'^/$', line[commentend:]) or
-                 Search(r'^!< ', line[commentend:]) or
-                 Search(r'^/< ', line[commentend:]) or
-                 Search(r'^/+ ', line[commentend:]))
-        if not match:
-          error(filename, linenum, 'whitespace/comments', 4,
-                'Should have a space between // and comment')
-      CheckComment(line[commentpos:], filename, linenum, error)
+  # Next, check comments
+  next_line_start = 0
+  if linenum + 1 < clean_lines.NumLines():
+    next_line = raw[linenum + 1]
+    next_line_start = len(next_line) - len(next_line.lstrip())
+  CheckComment(line, filename, linenum, next_line_start, error)
 
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
+  # get rid of comments and strings
+  line = clean_lines.elided[linenum]
 
-  # Don't try to do spacing checks for operator methods
-  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'.
+  if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search(r'for *\(.*[^:]:[^: ]', line) or
+      Search(r'for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckOperatorSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around operators.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Don't try to do spacing checks for operator methods.  Do this by
+  # replacing the troublesome characters with something else,
+  # preserving column position for all other characters.
+  #
+  # The replacement is done repeatedly to avoid false positives from
+  # operators that call operators.
+  while True:
+    match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
+    if match:
+      line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)
+    else:
+      break
 
   # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
   # Otherwise not.  Note we only check for non-spaces on *both* sides;
   # sometimes people put non-spaces on one side when aligning ='s among
   # many lines (not that this is behavior that I approve of...)
-  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+  if ((Search(r'[\w.]=', line) or
+       Search(r'=[\w.]', line))
+      and not Search(r'\b(if|while|for) ', line)
+      # Operators taken from [lex.operators] in C++11 standard.
+      and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line)
+      and not Search(r'operator=', line)):
     error(filename, linenum, 'whitespace/operators', 4,
           'Missing spaces around =')
 
@@ -2687,42 +3341,51 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   #
   # Check <= and >= first to avoid false positives with < and >, then
   # check non-include lines for spacing around < and >.
-  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  #
+  # If the operator is followed by a comma, assume it's be used in a
+  # macro context and don't do any checks.  This avoids false
+  # positives.
+  #
+  # Note that && is not included here.  This is because there are too
+  # many false positives due to RValue references.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << when used like this: 10<<20, but
-  # not otherwise (particularly, not when used as streams)
-  # Also ignore using ns::operator<<;
-  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
-  if (match and
-      not (match.group(1).isdigit() and match.group(2).isdigit()) and
-      not (match.group(1) == 'operator' and match.group(2) == ';')):
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around <<')
   elif not Match(r'#.*include', line):
-    # Avoid false positives on ->
-    reduced_line = line.replace('->', '')
-
     # Look for < that is not surrounded by spaces.  This is only
     # triggered if both sides are missing spaces, even though
     # technically should should flag if at least one side is missing a
     # space.  This is done to avoid some false positives with shifts.
-    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
-    if (match and
-        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around <')
+    match = Match(r'^(.*[^\s<])<[^\s=<,]', line)
+    if match:
+      (_, _, end_pos) = CloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if end_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around <')
 
     # Look for > that is not surrounded by spaces.  Similar to the
     # above, we only trigger if both sides are missing spaces to avoid
     # false positives with shifts.
-    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
-    if (match and
-        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
-                                             match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around >')
+    match = Match(r'^(.*[^-\s>])>[^\s=>,]', line)
+    if match:
+      (_, _, start_pos) = ReverseCloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if start_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around >')
+
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  #
+  # We also allow operators following an opening parenthesis, since
+  # those tend to be macros that deal with operators.
+  match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line)
+  if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
 
   # We allow no-spaces around >> for almost anything.  This is because
   # C++11 allows ">>" to close nested templates, which accounts for
@@ -2747,7 +3410,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/operators', 4,
           'Extra space for operator %s' % match.group(1))
 
-  # A pet peeve of mine: no spaces after an if, while, switch, or for
+
+def CheckParenthesisSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around parentheses.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # No spaces after an if, while, switch, or for
   match = Search(r' (if\(|for\(|while\(|switch\()', line)
   if match:
     error(filename, linenum, 'whitespace/parens', 5,
@@ -2773,6 +3448,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
             'Should have zero or one spaces inside ( and ) in %s' %
             match.group(1))
 
+
+def CheckCommaSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing near commas and semicolons.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  raw = clean_lines.lines_without_raw_strings
+  line = clean_lines.elided[linenum]
+
   # You should always have a space after a comma (either as fn arg or operator)
   #
   # This does not apply when the non-space character following the
@@ -2783,7 +3471,8 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   # verify that lines contain missing whitespaces, second pass on raw
   # lines to confirm that those missing whitespaces are not due to
   # elided comments.
-  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+  if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and
+      Search(r',[^,\s]', raw[linenum])):
     error(filename, linenum, 'whitespace/comma', 3,
           'Missing space after ,')
 
@@ -2795,14 +3484,91 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/semicolon', 3,
           'Missing space after ;')
 
-  # Next we will look for issues with function calls.
-  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+def _IsType(clean_lines, nesting_state, expr):
+  """Check if expression looks like a type name, returns true if so.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    expr: The expression to check.
+  Returns:
+    True, if token looks like a type.
+  """
+  # Keep only the last token in the expression
+  last_word = Match(r'^.*(\b\S+)$', expr)
+  if last_word:
+    token = last_word.group(1)
+  else:
+    token = expr
+
+  # Match native types and stdint types
+  if _TYPES.match(token):
+    return True
+
+  # Try a bit harder to match templated types.  Walk up the nesting
+  # stack until we find something that resembles a typename
+  # declaration for what we are looking for.
+  typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) +
+                      r'\b')
+  block_index = len(nesting_state.stack) - 1
+  while block_index >= 0:
+    if isinstance(nesting_state.stack[block_index], _NamespaceInfo):
+      return False
+
+    # Found where the opening brace is.  We want to scan from this
+    # line up to the beginning of the function, minus a few lines.
+    #   template <typename Type1,  // stop scanning here
+    #             ...>
+    #   class C
+    #     : public ... {  // start scanning here
+    last_line = nesting_state.stack[block_index].starting_linenum
+
+    next_block_start = 0
+    if block_index > 0:
+      next_block_start = nesting_state.stack[block_index - 1].starting_linenum
+    first_line = last_line
+    while first_line >= next_block_start:
+      if clean_lines.elided[first_line].find('template') >= 0:
+        break
+      first_line -= 1
+    if first_line < next_block_start:
+      # Didn't find any "template" keyword before reaching the next block,
+      # there are probably no template things to check for this block
+      block_index -= 1
+      continue
+
+    # Look for typename in the specified range
+    for i in xrange(first_line, last_line + 1, 1):
+      if Search(typename_pattern, clean_lines.elided[i]):
+        return True
+    block_index -= 1
+
+  return False
+
+
+def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for horizontal spacing near commas.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
 
   # Except after an opening paren, or after another opening brace (in case of
   # an initializer list, for instance), you should have spaces before your
-  # braces. And since you should never have braces at the beginning of a line,
-  # this is an easy test.
-  match = Match(r'^(.*[^ ({]){', line)
+  # braces when they are delimiting blocks, classes, namespaces etc.
+  # And since you should never have braces at the beginning of a line,
+  # this is an easy test.  Except that braces used for initialization don't
+  # follow the same rule; we often don't want spaces before those.
+  match = Match(r'^(.*[^ ({>]){', line)
+
   if match:
     # Try a bit harder to check for brace initialization.  This
     # happens in one of the following forms:
@@ -2813,10 +3579,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     #   LastArgument(..., type{});
     #   LOG(INFO) << type{} << " ...";
     #   map_of_type[{...}] = ...;
+    #   ternary = expr ? new type{} : nullptr;
+    #   OuterTemplate<InnerTemplateConstructor<Type>{}>
     #
     # We check for the character following the closing brace, and
     # silence the warning if it's one of those listed above, i.e.
-    # "{.;,)<]".
+    # "{.;,)<>]:".
     #
     # To account for nested initializer list, we allow any number of
     # closing braces up to "{;,)<".  We can't simply silence the
@@ -2830,6 +3598,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     # There is a false negative with this approach if people inserted
     # spurious semicolons, e.g. "if (cond){};", but we will catch the
     # spurious semicolon with a separate check.
+    leading_text = match.group(1)
     (endline, endlinenum, endpos) = CloseExpression(
         clean_lines, linenum, len(match.group(1)))
     trailing_text = ''
@@ -2838,7 +3607,11 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     for offset in xrange(endlinenum + 1,
                          min(endlinenum + 3, clean_lines.NumLines() - 1)):
       trailing_text += clean_lines.elided[offset]
-    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+    # We also suppress warnings for `uint64_t{expression}` etc., as the style
+    # guide recommends brace initialization for integral types to avoid
+    # overflow/truncation.
+    if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text)
+        and not _IsType(clean_lines, nesting_state, leading_text)):
       error(filename, linenum, 'whitespace/braces', 5,
             'Missing space before {')
 
@@ -2847,12 +3620,6 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
     error(filename, linenum, 'whitespace/braces', 5,
           'Missing space before else')
 
-  # You shouldn't have spaces before your brackets, except maybe after
-  # 'delete []' or 'new char * []'.
-  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Extra space before [')
-
   # You shouldn't have a space before a semicolon at the end of the line.
   # There's a special case for "for" since the style guide allows space before
   # the semicolon there.
@@ -2869,12 +3636,23 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
           'Extra space before last semicolon. If this should be an empty '
           'statement, use {} instead.')
 
-  # In range-based for, we wanted spaces before and after the colon, but
-  # not around "::" tokens that might appear.
-  if (Search('for *\(.*[^:]:[^: ]', line) or
-      Search('for *\(.*[^: ]:[^:]', line)):
-    error(filename, linenum, 'whitespace/forcolon', 2,
-          'Missing space around colon in range-based for loop')
+
+def IsDecltype(clean_lines, linenum, column):
+  """Check if the token ending on (linenum, column) is decltype().
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: the number of the line to check.
+    column: end column of the token to check.
+  Returns:
+    True if this token is decltype() expression, False otherwise.
+  """
+  (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column)
+  if start_col < 0:
+    return False
+  if Search(r'\bdecltype\s*$', text[0:start_col]):
+    return True
+  return False
 
 
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
@@ -2974,15 +3752,18 @@ def CheckBraces(filename, clean_lines, linenum, error):
     # used for brace initializers inside function calls.  We don't detect this
     # perfectly: we just don't complain if the last non-whitespace character on
     # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
-    # previous line starts a preprocessor block.
+    # previous line starts a preprocessor block. We also allow a brace on the
+    # following line if it is part of an array initialization and would not fit
+    # within the 80 character limit of the preceding line.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if (not Search(r'[,;:}{(]\s*$', prevline) and
-        not Match(r'\s*#', prevline)):
+        not Match(r'\s*#', prevline) and
+        not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
   # An else clause should be on the same line as the preceding closing brace.
-  if Match(r'\s*else\s*', line):
+  if Match(r'\s*else\b\s*(?:if\b|\{|$)', line):
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if Match(r'\s*}\s*$', prevline):
       error(filename, linenum, 'whitespace/newline', 4,
@@ -2990,19 +3771,20 @@ def CheckBraces(filename, clean_lines, linenum, error):
 
   # If braces come on one side of an else, they should be on both.
   # However, we have to worry about "else if" that spans multiple lines!
-  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
-    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
-      # find the ( after the if
-      pos = line.find('else if')
-      pos = line.find('(', pos)
-      if pos > 0:
-        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
-        if endline[endpos:].find('{') == -1:    # must be brace after if
-          error(filename, linenum, 'readability/braces', 5,
-                'If an else has a brace on one side, it should have it on both')
-    else:            # common case: else not followed by a multi-line if
-      error(filename, linenum, 'readability/braces', 5,
-            'If an else has a brace on one side, it should have it on both')
+  if Search(r'else if\s*\(', line):       # could be multi-line if
+    brace_on_left = bool(Search(r'}\s*else if\s*\(', line))
+    # find the ( after the if
+    pos = line.find('else if')
+    pos = line.find('(', pos)
+    if pos > 0:
+      (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+      brace_on_right = endline[endpos:].find('{') != -1
+      if brace_on_left != brace_on_right:    # must be brace after if
+        error(filename, linenum, 'readability/braces', 5,
+              'If an else has a brace on one side, it should have it on both')
+  elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    error(filename, linenum, 'readability/braces', 5,
+          'If an else has a brace on one side, it should have it on both')
 
   # Likewise, an else should never have the else clause on the same line
   if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
@@ -3014,11 +3796,79 @@ def CheckBraces(filename, clean_lines, linenum, error):
     error(filename, linenum, 'whitespace/newline', 4,
           'do/while clauses should not be on a single line')
 
+  # Check single-line if/else bodies. The style guide says 'curly braces are not
+  # required for single-line statements'. We additionally allow multi-line,
+  # single statements, but we reject anything with more than one semicolon in
+  # it. This means that the first semicolon after the if should be at the end of
+  # its line, and the line after that should have an indent level equal to or
+  # lower than the if. We also check for ambiguous if/else nesting without
+  # braces.
+  if_else_match = Search(r'\b(if\s*\(|else\b)', line)
+  if if_else_match and not Match(r'\s*#', line):
+    if_indent = GetIndentLevel(line)
+    endline, endlinenum, endpos = line, linenum, if_else_match.end()
+    if_match = Search(r'\bif\s*\(', line)
+    if if_match:
+      # This could be a multiline if condition, so find the end first.
+      pos = if_match.end() - 1
+      (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos)
+    # Check for an opening brace, either directly after the if or on the next
+    # line. If found, this isn't a single-statement conditional.
+    if (not Match(r'\s*{', endline[endpos:])
+        and not (Match(r'\s*$', endline[endpos:])
+                 and endlinenum < (len(clean_lines.elided) - 1)
+                 and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))):
+      while (endlinenum < len(clean_lines.elided)
+             and ';' not in clean_lines.elided[endlinenum][endpos:]):
+        endlinenum += 1
+        endpos = 0
+      if endlinenum < len(clean_lines.elided):
+        endline = clean_lines.elided[endlinenum]
+        # We allow a mix of whitespace and closing braces (e.g. for one-liner
+        # methods) and a single \ after the semicolon (for macros)
+        endpos = endline.find(';')
+        if not Match(r';[\s}]*(\\?)$', endline[endpos:]):
+          # Semicolon isn't the last character, there's something trailing.
+          # Output a warning if the semicolon is not contained inside
+          # a lambda expression.
+          if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$',
+                       endline):
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+        elif endlinenum < len(clean_lines.elided) - 1:
+          # Make sure the next line is dedented
+          next_line = clean_lines.elided[endlinenum + 1]
+          next_indent = GetIndentLevel(next_line)
+          # With ambiguous nested if statements, this will error out on the
+          # if that *doesn't* match the else, regardless of whether it's the
+          # inner one or outer one.
+          if (if_match and Match(r'\s*else\b', next_line)
+              and next_indent != if_indent):
+            error(filename, linenum, 'readability/braces', 4,
+                  'Else clause should be indented at the same level as if. '
+                  'Ambiguous nested if/else chains require braces.')
+          elif next_indent > if_indent:
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+
+
+def CheckTrailingSemicolon(filename, clean_lines, linenum, error):
+  """Looks for redundant trailing semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]
+
   # Block bodies should not be followed by a semicolon.  Due to C++11
   # brace initialization, there are more places where semicolons are
-  # required than not, so we use a whitelist approach to check these
-  # rather than a blacklist.  These are the places where "};" should
-  # be replaced by just "}":
+  # required than not, so we explicitly list the allowed rules rather
+  # than listing the disallowed ones.  These are the places where "};"
+  # should be replaced by just "}":
   # 1. Some flavor of block following closing parenthesis:
   #    for (;;) {};
   #    while (...) {};
@@ -3074,28 +3924,40 @@ def CheckBraces(filename, clean_lines, linenum, error):
     #  - INTERFACE_DEF
     #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
     #
-    # We implement a whitelist of safe macros instead of a blacklist of
+    # We implement a list of safe macros instead of a list of
     # unsafe macros, even though the latter appears less frequently in
     # google code and would have been easier to implement.  This is because
-    # the downside for getting the whitelist wrong means some extra
-    # semicolons, while the downside for getting the blacklist wrong
+    # the downside for getting the allowed checks wrong means some extra
+    # semicolons, while the downside for getting disallowed checks wrong
     # would result in compile errors.
     #
-    # In addition to macros, we also don't want to warn on compound
-    # literals.
+    # In addition to macros, we also don't want to warn on
+    #  - Compound literals
+    #  - Lambdas
+    #  - alignas specifier with anonymous structs
+    #  - decltype
     closing_brace_pos = match.group(1).rfind(')')
     opening_parenthesis = ReverseCloseExpression(
         clean_lines, linenum, closing_brace_pos)
     if opening_parenthesis[2] > -1:
       line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
-      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix)
+      func = Match(r'^(.*\])\s*$', line_prefix)
       if ((macro and
            macro.group(1) not in (
                'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
                'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
                'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or
+          Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or
+          Search(r'\bdecltype$', line_prefix) or
           Search(r'\s+=\s*$', line_prefix)):
         match = None
+    if (match and
+        opening_parenthesis[1] > 1 and
+        Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])):
+      # Multi-line lambda-expression
+      match = None
 
   else:
     # Try matching cases 2-3.
@@ -3125,6 +3987,14 @@ def CheckBraces(filename, clean_lines, linenum, error):
       # outputting warnings for the matching closing brace, if there are
       # nested blocks with trailing semicolons, we will get the error
       # messages in reversed order.
+
+      # We need to check the line forward for NOLINT
+      raw_lines = clean_lines.raw_lines
+      ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1,
+                              error)
+      ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum,
+                              error)
+
       error(filename, endlinenum, 'readability/braces', 4,
             "You don't need a ; after a }")
 
@@ -3148,7 +4018,7 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
   line = clean_lines.elided[linenum]
   matched = Match(r'\s*(for|while|if)\s*\(', line)
   if matched:
-    # Find the end of the conditional expression
+    # Find the end of the conditional expression.
     (end_line, end_linenum, end_pos) = CloseExpression(
         clean_lines, linenum, line.find('('))
 
@@ -3163,6 +4033,98 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
         error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
               'Empty loop bodies should use {} or continue')
 
+    # Check for if statements that have completely empty bodies (no comments)
+    # and no else clauses.
+    if end_pos >= 0 and matched.group(1) == 'if':
+      # Find the position of the opening { for the if statement.
+      # Return without logging an error if it has no brackets.
+      opening_linenum = end_linenum
+      opening_line_fragment = end_line[end_pos:]
+      # Loop until EOF or find anything that's not whitespace or opening {.
+      while not Search(r'^\s*\{', opening_line_fragment):
+        if Search(r'^(?!\s*$)', opening_line_fragment):
+          # Conditional has no brackets.
+          return
+        opening_linenum += 1
+        if opening_linenum == len(clean_lines.elided):
+          # Couldn't find conditional's opening { or any code before EOF.
+          return
+        opening_line_fragment = clean_lines.elided[opening_linenum]
+      # Set opening_line (opening_line_fragment may not be entire opening line).
+      opening_line = clean_lines.elided[opening_linenum]
+
+      # Find the position of the closing }.
+      opening_pos = opening_line_fragment.find('{')
+      if opening_linenum == end_linenum:
+        # We need to make opening_pos relative to the start of the entire line.
+        opening_pos += end_pos
+      (closing_line, closing_linenum, closing_pos) = CloseExpression(
+          clean_lines, opening_linenum, opening_pos)
+      if closing_pos < 0:
+        return
+
+      # Now construct the body of the conditional. This consists of the portion
+      # of the opening line after the {, all lines until the closing line,
+      # and the portion of the closing line before the }.
+      if (clean_lines.raw_lines[opening_linenum] !=
+          CleanseComments(clean_lines.raw_lines[opening_linenum])):
+        # Opening line ends with a comment, so conditional isn't empty.
+        return
+      if closing_linenum > opening_linenum:
+        # Opening line after the {. Ignore comments here since we checked above.
+        body = list(opening_line[opening_pos+1:])
+        # All lines until closing line, excluding closing line, with comments.
+        body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum])
+        # Closing line before the }. Won't (and can't) have comments.
+        body.append(clean_lines.elided[closing_linenum][:closing_pos-1])
+        body = '\n'.join(body)
+      else:
+        # If statement has brackets and fits on a single line.
+        body = opening_line[opening_pos+1:closing_pos-1]
+
+      # Check if the body is empty
+      if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body):
+        return
+      # The body is empty. Now make sure there's not an else clause.
+      current_linenum = closing_linenum
+      current_line_fragment = closing_line[closing_pos:]
+      # Loop until EOF or find anything that's not whitespace or else clause.
+      while Search(r'^\s*$|^(?=\s*else)', current_line_fragment):
+        if Search(r'^(?=\s*else)', current_line_fragment):
+          # Found an else clause, so don't log an error.
+          return
+        current_linenum += 1
+        if current_linenum == len(clean_lines.elided):
+          break
+        current_line_fragment = clean_lines.elided[current_linenum]
+
+      # The body is empty and there's no else clause until EOF or other code.
+      error(filename, end_linenum, 'whitespace/empty_if_body', 4,
+            ('If statement had no body and no else clause'))
+
+
+def FindCheckMacro(line):
+  """Find a replaceable CHECK-like macro.
+
+  Args:
+    line: line to search on.
+  Returns:
+    (macro name, start position), or (None, -1) if no replaceable
+    macro is found.
+  """
+  for macro in _CHECK_MACROS:
+    i = line.find(macro)
+    if i >= 0:
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + macro + r'\s*)\(', line)
+      if not matched:
+        continue
+      return (macro, len(matched.group(1)))
+  return (None, -1)
+
 
 def CheckCheck(filename, clean_lines, linenum, error):
   """Checks the use of CHECK and EXPECT macros.
@@ -3176,24 +4138,8 @@ def CheckCheck(filename, clean_lines, linenum, error):
 
   # Decide the set of replacement macros that should be suggested
   lines = clean_lines.elided
-  check_macro = None
-  start_pos = -1
-  for macro in _CHECK_MACROS:
-    i = lines[linenum].find(macro)
-    if i >= 0:
-      check_macro = macro
-
-      # Find opening parenthesis.  Do a regular expression match here
-      # to make sure that we are matching the expected CHECK macro, as
-      # opposed to some other macro that happens to contain the CHECK
-      # substring.
-      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
-      if not matched:
-        continue
-      start_pos = len(matched.group(1))
-      break
-  if not check_macro or start_pos < 0:
-    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+  (check_macro, start_pos) = FindCheckMacro(lines[linenum])
+  if not check_macro:
     return
 
   # Find end of the boolean expression by matching parentheses
@@ -3201,6 +4147,13 @@ def CheckCheck(filename, clean_lines, linenum, error):
       clean_lines, linenum, start_pos)
   if end_pos < 0:
     return
+
+  # If the check macro is followed by something other than a
+  # semicolon, assume users will log their own custom error messages
+  # and don't suggest any replacements.
+  if not Match(r'\s*;', last_line[end_pos:]):
+    return
+
   if linenum == end_line:
     expression = lines[linenum][start_pos + 1:end_pos - 1]
   else:
@@ -3223,7 +4176,7 @@ def CheckCheck(filename, clean_lines, linenum, error):
       if token == '(':
         # Parenthesized operand
         expression = matched.group(2)
-        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        (end, _) = FindEndOfExpressionInLine(expression, 0, ['('])
         if end < 0:
           return  # Unmatched parenthesis
         lhs += '(' + expression[0:end]
@@ -3339,6 +4292,16 @@ def GetLineWidth(line):
       if unicodedata.east_asian_width(uc) in ('W', 'F'):
         width += 2
       elif not unicodedata.combining(uc):
+        # Issue 337
+        # https://mail.python.org/pipermail/python-list/2012-August/628809.html
+        if (sys.version_info.major, sys.version_info.minor) <= (3, 2):
+          # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81
+          is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4
+          # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564
+          is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF
+          if not is_wide_build and is_low_surrogate:
+            width -= 1
+
         width += 1
     return width
   else:
@@ -3358,7 +4321,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3368,6 +4331,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # raw strings,
   raw_lines = clean_lines.lines_without_raw_strings
   line = raw_lines[linenum]
+  prev = raw_lines[linenum - 1] if linenum > 0 else ''
 
   if line.find('\t') != -1:
     error(filename, linenum, 'whitespace/tab', 1,
@@ -3385,23 +4349,33 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # if(match($0, " <<")) complain = 0;
   # if(match(prev, " +for \\(")) complain = 0;
   # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$'
+  classinfo = nesting_state.InnermostClass()
   initial_spaces = 0
   cleansed_line = clean_lines.elided[linenum]
   while initial_spaces < len(line) and line[initial_spaces] == ' ':
     initial_spaces += 1
-  if line and line[-1].isspace():
-    error(filename, linenum, 'whitespace/end_of_line', 4,
-          'Line ends in whitespace.  Consider deleting these extra spaces.')
-  # There are certain situations we allow one space, notably for section labels
-  elif ((initial_spaces == 1 or initial_spaces == 3) and
-        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+  # There are certain situations we allow one space, notably for
+  # section labels, and also lines containing multi-line raw strings.
+  # We also don't check for lines that look like continuation lines
+  # (of lines ending in double quotes, commas, equals, or angle brackets)
+  # because the rules for how to indent those are non-trivial.
+  if (not Search(r'[",=><] *$', prev) and
+      (initial_spaces == 1 or initial_spaces == 3) and
+      not Match(scope_or_label_pattern, cleansed_line) and
+      not (clean_lines.raw_lines[linenum] != line and
+           Match(r'^\s*""', line))):
     error(filename, linenum, 'whitespace/indent', 3,
           'Weird number of spaces at line-start.  '
           'Are you using a 2-space indent?')
 
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+
   # Check if the line is a header guard.
   is_header_guard = False
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     cppvar = GetHeaderGuardCPPVariable(filename)
     if (line.startswith('#ifndef %s' % cppvar) or
         line.startswith('#define %s' % cppvar) or
@@ -3417,14 +4391,10 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
   # developers fault.
   if (not line.startswith('#include') and not is_header_guard and
       not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^\s*//\s*[^\s]*$', line) and
       not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
     line_width = GetLineWidth(line)
-    extended_length = int((_line_length * 1.25))
-    if line_width > extended_length:
-      error(filename, linenum, 'whitespace/line_length', 4,
-            'Lines should very rarely be longer than %i characters' %
-            extended_length)
-    elif line_width > _line_length:
+    if line_width > _line_length:
       error(filename, linenum, 'whitespace/line_length', 2,
             'Lines should be <= %i characters long' % _line_length)
 
@@ -3442,9 +4412,14 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
+  CheckTrailingSemicolon(filename, clean_lines, linenum, error)
   CheckEmptyBlockBody(filename, clean_lines, linenum, error)
-  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
   CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckOperatorSpacing(filename, clean_lines, linenum, error)
+  CheckParenthesisSpacing(filename, clean_lines, linenum, error)
+  CheckCommaSpacing(filename, clean_lines, linenum, error)
+  CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacingForFunctionCall(filename, clean_lines, linenum, error)
   CheckCheck(filename, clean_lines, linenum, error)
   CheckAltTokens(filename, clean_lines, linenum, error)
   classinfo = nesting_state.InnermostClass()
@@ -3452,7 +4427,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
     CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
-_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
 _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
 # Matches the first component of a filename delimited by -s and _s. That is:
 #  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
@@ -3489,23 +4463,6 @@ def _DropCommonSuffixes(filename):
   return os.path.splitext(filename)[0]
 
 
-def _IsTestFilename(filename):
-  """Determines if the given filename has a suffix that identifies it as a test.
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    True if 'filename' looks like a test, False otherwise.
-  """
-  if (filename.endswith('_test.cc') or
-      filename.endswith('_unittest.cc') or
-      filename.endswith('_regtest.cc')):
-    return True
-  else:
-    return False
-
-
 def _ClassifyInclude(fileinfo, include, is_system):
   """Figures out what kind of header 'include' is.
 
@@ -3581,11 +4538,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
     error: The function to call with any errors found.
   """
   fileinfo = FileInfo(filename)
-
   line = clean_lines.lines[linenum]
 
   # "include" should use the new style "foo/bar.h" instead of just "bar.h"
-  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+  # Only do this check if the included header follows google naming
+  # conventions.  If not, assume that it's a 3rd party API that
+  # requires special include conventions.
+  #
+  # We also make an exception for Lua headers, which follow google
+  # naming convention but not the include convention.
+  match = Match(r'#include\s*"([^/]+\.h)"', line)
+  if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)):
     error(filename, linenum, 'build/include', 4,
           'Include the directory when naming .h files')
 
@@ -3596,12 +4559,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
   if match:
     include = match.group(2)
     is_system = (match.group(1) == '<')
-    if include in include_state:
+    duplicate_line = include_state.FindHeader(include)
+    if duplicate_line >= 0:
       error(filename, linenum, 'build/include', 4,
             '"%s" already included at %s:%s' %
-            (include, filename, include_state[include]))
-    else:
-      include_state[include] = linenum
+            (include, filename, duplicate_line))
+    elif (include.endswith('.cc') and
+          os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)):
+      error(filename, linenum, 'build/include', 4,
+            'Do not include .cc files from other packages')
+    elif not _THIRD_PARTY_HEADERS_PATTERN.match(include):
+      include_state.include_list[-1].append((include, linenum))
 
       # We want to ensure that headers appear in the right order:
       # 1) for foo.cc, foo.h  (preferred location)
@@ -3627,15 +4595,6 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
               'Include "%s" not in alphabetical order' % include)
       include_state.SetLastHeader(canonical_include)
 
-  # Look for any of the stream classes that are part of standard C++.
-  match = _RE_PATTERN_INCLUDE.match(line)
-  if match:
-    include = match.group(2)
-    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
-      # Many unit tests use cout, so we exempt them.
-      if not _IsTestFilename(filename):
-        error(filename, linenum, 'readability/streams', 3,
-              'Streams are highly discouraged.')
 
 
 def _GetTextInside(text, start_pattern):
@@ -3658,7 +4617,7 @@ def _GetTextInside(text, start_pattern):
     The extracted text.
     None if either the opening string or ending punctuation could not be found.
   """
-  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # TODO(unknown): Audit cpplint.py to see what places could be profitably
   # rewritten to use _GetTextInside (and use inferior regexp matching today).
 
   # Give opening punctuations to get the matching close-punctuations.
@@ -3718,6 +4677,9 @@ _RE_PATTERN_REF_PARAM = re.compile(
 _RE_PATTERN_CONST_REF_PARAM = (
     r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
     r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+# Stream types.
+_RE_PATTERN_REF_STREAM_PARAM = (
+    r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')')
 
 
 def CheckLanguage(filename, clean_lines, linenum, file_extension,
@@ -3733,7 +4695,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
     include_state: An _IncludeState instance in which the headers are inserted.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3750,129 +4712,23 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
 
   # Reset include state across preprocessor directives.  This is meant
   # to silence warnings for conditional includes.
-  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
-    include_state.ResetSection()
+  match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line)
+  if match:
+    include_state.ResetSection(match.group(1))
 
   # Make Windows paths like Unix.
   fullname = os.path.abspath(filename).replace('\\', '/')
 
-  # TODO(unknown): figure out if they're using default arguments in fn proto.
+  # Perform other checks now that we are sure that this is not an include line
+  CheckCasts(filename, clean_lines, linenum, error)
+  CheckGlobalStatic(filename, clean_lines, linenum, error)
+  CheckPrintf(filename, clean_lines, linenum, error)
 
-  # Check to see if they're using an conversion function cast.
-  # I just try to capture the most common basic types, though there are more.
-  # Parameterless conversion functions, such as bool(), are allowed as they are
-  # probably a member operator declaration or default constructor.
-  match = Search(
-      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
-      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
-      r'(\([^)].*)', line)
-  if match:
-    matched_new = match.group(1)
-    matched_type = match.group(2)
-    matched_funcptr = match.group(3)
-
-    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
-    # where type may be float(), int(string), etc.  Without context they are
-    # virtually indistinguishable from int(x) casts. Likewise, gMock's
-    # MockCallback takes a template parameter of the form return_type(arg_type),
-    # which looks much like the cast we're trying to detect.
-    #
-    # std::function<> wrapper has a similar problem.
-    #
-    # Return types for function pointers also look like casts if they
-    # don't have an extra space.
-    if (matched_new is None and  # If new operator, then this isn't a cast
-        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-             Search(r'\bMockCallback<.*>', line) or
-             Search(r'\bstd::function<.*>', line)) and
-        not (matched_funcptr and
-             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
-                   matched_funcptr))):
-      # Try a bit harder to catch gmock lines: the only place where
-      # something looks like an old-style cast is where we declare the
-      # return type of the mocked method, and the only time when we
-      # are missing context is if MOCK_METHOD was split across
-      # multiple lines.  The missing MOCK_METHOD is usually one or two
-      # lines back, so scan back one or two lines.
-      #
-      # It's not possible for gmock macros to appear in the first 2
-      # lines, since the class head + section name takes up 2 lines.
-      if (linenum < 2 or
-          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                     clean_lines.elided[linenum - 1]) or
-               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                     clean_lines.elided[linenum - 2]))):
-        error(filename, linenum, 'readability/casting', 4,
-              'Using deprecated casting style.  '
-              'Use static_cast<%s>(...) instead' %
-              matched_type)
-
-  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                  'static_cast',
-                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
-
-  # This doesn't catch all cases. Consider (const char * const)"hello".
-  #
-  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
-  # compile).
-  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
-    pass
-  else:
-    # Check pointer casts for other than string constants
-    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
-
-  # In addition, we look for people taking the address of a cast.  This
-  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
-  # point where you think.
-  match = Search(
-      r'(?:&\(([^)]+)\)[\w(])|'
-      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
-  if match and match.group(1) != '*':
-    error(filename, linenum, 'runtime/casting', 4,
-          ('Are you taking an address of a cast?  '
-           'This is dangerous: could be a temp var.  '
-           'Take the address before doing the cast, rather than after'))
-
-  # Create an extended_line, which is the concatenation of the current and
-  # next lines, for more effective checking of code that may span more than one
-  # line.
-  if linenum + 1 < clean_lines.NumLines():
-    extended_line = line + clean_lines.elided[linenum + 1]
-  else:
-    extended_line = line
-
-  # Check for people declaring static/global STL strings at the top level.
-  # This is dangerous because the C++ language does not guarantee that
-  # globals with constructors are initialized before the first access.
-  match = Match(
-      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
-      line)
-  # Make sure it's not a function.
-  # Function template specialization looks like: "string foo<Type>(...".
-  # Class template definitions look like: "string Foo<Type>::Method(...".
-  #
-  # Also ignore things that look like operators.  These are matched separately
-  # because operator names cross non-word boundaries.  If we change the pattern
-  # above, we would decrease the accuracy of matching identifiers.
-  if (match and
-      not Search(r'\boperator\W', line) and
-      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
-    error(filename, linenum, 'runtime/string', 4,
-          'For a static/global string constant, use a C style string instead: '
-          '"%schar %s[]".' %
-          (match.group(1), match.group(2)))
-
-  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
-    error(filename, linenum, 'runtime/init', 4,
-          'You seem to be initializing a member variable with itself.')
-
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     # TODO(unknown): check that 1-arg constructors are explicit.
     #                How to tell it's a constructor?
     #                (handled in CheckForNonStandardConstructs for now)
-    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    # TODO(unknown): check that classes declare or disable copy/assign
     #                (level 1 error)
     pass
 
@@ -3888,23 +4744,6 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
       error(filename, linenum, 'runtime/int', 4,
             'Use int16/int64/etc, rather than the C type %s' % match.group(1))
 
-  # When snprintf is used, the second argument shouldn't be a literal.
-  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
-  if match and match.group(2) != '0':
-    # If 2nd arg is zero, snprintf is used to calculate size.
-    error(filename, linenum, 'runtime/printf', 3,
-          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
-          'to snprintf.' % (match.group(1), match.group(2)))
-
-  # Check if some verboten C functions are being used.
-  if Search(r'\bsprintf\b', line):
-    error(filename, linenum, 'runtime/printf', 5,
-          'Never use sprintf.  Use snprintf instead.')
-  match = Search(r'\b(strcpy|strcat)\b', line)
-  if match:
-    error(filename, linenum, 'runtime/printf', 4,
-          'Almost always, snprintf is better than %s' % match.group(1))
-
   # Check if some verboten operator overloading is going on
   # TODO(unknown): catch out-of-line unary operator&:
   #   class X {};
@@ -3924,7 +4763,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
   # Check for potential format string bugs like printf(foo).
   # We constrain the pattern not to pick things like DocidForPrintf(foo).
   # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
-  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # TODO(unknown): Catch the following case. Need to change the calling
   # convention of the whole function to process multiple line to handle it.
   #   printf(
   #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
@@ -3989,37 +4828,188 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension,
             'Do not use variable-length arrays.  Use an appropriately named '
             "('k' followed by CamelCase) compile-time constant for the size.")
 
-  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
-  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
-  # in the class declaration.
-  match = Match(
-      (r'\s*'
-       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
-       r'\(.*\);$'),
-      line)
-  if match and linenum + 1 < clean_lines.NumLines():
-    next_line = clean_lines.elided[linenum + 1]
-    # We allow some, but not all, declarations of variables to be present
-    # in the statement that defines the class.  The [\w\*,\s]* fragment of
-    # the regular expression below allows users to declare instances of
-    # the class or pointers to instances, but not less common types such
-    # as function pointers or arrays.  It's a tradeoff between allowing
-    # reasonable code and avoiding trying to parse more C++ using regexps.
-    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
-      error(filename, linenum, 'readability/constructors', 3,
-            match.group(1) + ' should be the last thing in the class')
-
   # Check for use of unnamed namespaces in header files.  Registration
   # macros are typically OK, so we allow use of "namespace {" on lines
   # that end with backslashes.
-  if (file_extension == 'h'
+  if (IsHeaderExtension(file_extension)
       and Search(r'\bnamespace\s*{', line)
       and line[-1] != '\\'):
     error(filename, linenum, 'build/namespaces', 4,
           'Do not use unnamed namespaces in header files.  See '
-          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
           ' for more information.')
 
+
+def CheckGlobalStatic(filename, clean_lines, linenum, error):
+  """Check for unsafe global or static objects.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Match two lines at a time to support multiline declarations
+  if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line):
+    line += clean_lines.elided[linenum + 1].strip()
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access, and
+  # also because globals can be destroyed when some threads are still running.
+  # TODO(unknown): Generalize this to also find static unique_ptr instances.
+  # TODO(unknown): File bugs for clang-tidy to find these.
+  match = Match(
+      r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +'
+      r'([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+
+  # Remove false positives:
+  # - String pointers (as opposed to values).
+  #    string *pointer
+  #    const string *pointer
+  #    string const *pointer
+  #    string *const pointer
+  #
+  # - Functions and template specializations.
+  #    string Function<Type>(...
+  #    string Class<Type>::Method(...
+  #
+  # - Operators.  These are matched separately because operator names
+  #   cross non-word boundaries, and trying to match both operators
+  #   and functions at the same time would decrease accuracy of
+  #   matching identifiers.
+  #    string Class::operator*()
+  if (match and
+      not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))):
+    if Search(r'\bconst\b', line):
+      error(filename, linenum, 'runtime/string', 4,
+            'For a static/global string constant, use a C style string '
+            'instead: "%schar%s %s[]".' %
+            (match.group(1), match.group(2) or '', match.group(3)))
+    else:
+      error(filename, linenum, 'runtime/string', 4,
+            'Static/global string variables are not permitted.')
+
+  if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or
+      Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+
+def CheckPrintf(filename, clean_lines, linenum, error):
+  """Check for printf related issues.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # When snprintf is used, the second argument shouldn't be a literal.
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\s*\(', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf. Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\s*\(', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+
+def IsDerivedFunction(clean_lines, linenum):
+  """Check if current line contains an inherited function.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains a function with "override"
+    virt-specifier.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i])
+    if match:
+      # Look for "override" after the matching closing parenthesis
+      line, _, closing_paren = CloseExpression(
+          clean_lines, i, len(match.group(1)))
+      return (closing_paren >= 0 and
+              Search(r'\boverride\b', line[closing_paren:]))
+  return False
+
+
+def IsOutOfLineMethodDefinition(clean_lines, linenum):
+  """Check if current line contains an out-of-line method definition.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains an out-of-line method definition.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]):
+      return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None
+  return False
+
+
+def IsInitializerList(clean_lines, linenum):
+  """Check if current line is inside constructor initializer list.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line appears to be inside constructor initializer
+    list, False otherwise.
+  """
+  for i in xrange(linenum, 1, -1):
+    line = clean_lines.elided[i]
+    if i == linenum:
+      remove_function_body = Match(r'^(.*)\{\s*$', line)
+      if remove_function_body:
+        line = remove_function_body.group(1)
+
+    if Search(r'\s:\s*\w+[({]', line):
+      # A lone colon tend to indicate the start of a constructor
+      # initializer list.  It could also be a ternary operator, which
+      # also tend to appear in constructor initializer lists as
+      # opposed to parameter lists.
+      return True
+    if Search(r'\}\s*,\s*$', line):
+      # A closing brace followed by a comma is probably the end of a
+      # brace-initialized member in constructor initializer list.
+      return True
+    if Search(r'[{};]\s*$', line):
+      # Found one of the following:
+      # - A closing brace or semicolon, probably the end of the previous
+      #   function.
+      # - An opening brace, probably the start of current class or namespace.
+      #
+      # Current line is probably not inside an initializer list since
+      # we saw one of those things without seeing the starting colon.
+      return False
+
+  # Got to the beginning of the file without seeing the start of
+  # constructor initializer list.
+  return False
+
+
 def CheckForNonConstReference(filename, clean_lines, linenum,
                               nesting_state, error):
   """Check for non-const references.
@@ -4031,7 +5021,7 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -4040,6 +5030,17 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
   if '&' not in line:
     return
 
+  # If a function is inherited, current function doesn't have much of
+  # a choice, so any non-const references should not be blamed on
+  # derived function.
+  if IsDerivedFunction(clean_lines, linenum):
+    return
+
+  # Don't warn on out-of-line method definitions, as we would warn on the
+  # in-line declaration, if it isn't marked with 'override'.
+  if IsOutOfLineMethodDefinition(clean_lines, linenum):
+    return
+
   # Long type names may be broken across multiple lines, usually in one
   # of these forms:
   #   LongType
@@ -4088,60 +5089,192 @@ def CheckForNonConstReference(filename, clean_lines, linenum,
   #   inside declarators: reference parameter
   # We will exclude the first two cases by checking that we are not inside a
   # function body, including one that was just introduced by a trailing '{'.
-  # TODO(unknwon): Doesn't account for preprocessor directives.
   # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
-  check_params = False
-  if not nesting_state.stack:
-    check_params = True  # top level
-  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
-        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
-    check_params = True  # within class or namespace
-  elif Match(r'.*{\s*$', line):
-    if (len(nesting_state.stack) == 1 or
-        isinstance(nesting_state.stack[-2], _ClassInfo) or
-        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
-      check_params = True  # just opened global/class/namespace block
+  if (nesting_state.previous_stack_top and
+      not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or
+           isinstance(nesting_state.previous_stack_top, _NamespaceInfo))):
+    # Not at toplevel, not within a class, and not within a namespace
+    return
+
+  # Avoid initializer lists.  We only need to scan back from the
+  # current line for something that starts with ':'.
+  #
+  # We don't need to check the current line, since the '&' would
+  # appear inside the second set of parentheses on the current line as
+  # opposed to the first set.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 10), -1):
+      previous_line = clean_lines.elided[i]
+      if not Search(r'[),]\s*$', previous_line):
+        break
+      if Match(r'^\s*:\s+\S', previous_line):
+        return
+
+  # Avoid preprocessors
+  if Search(r'\\\s*$', line):
+    return
+
+  # Avoid constructor initializer lists
+  if IsInitializerList(clean_lines, linenum):
+    return
+
   # We allow non-const references in a few standard places, like functions
   # called "swap()" or iostream operators like "<<" or ">>".  Do not check
   # those function parameters.
   #
   # We also accept & in static_assert, which looks like a function but
   # it's actually a declaration expression.
-  whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
+  allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
                            r'operator\s*[<>][<>]|'
                            r'static_assert|COMPILE_ASSERT'
                            r')\s*\(')
-  if Search(whitelisted_functions, line):
-    check_params = False
+  if Search(allowed_functions, line):
+    return
   elif not Search(r'\S+\([^)]*$', line):
-    # Don't see a whitelisted function on this line.  Actually we
+    # Don't see an allowed function on this line.  Actually we
     # didn't see any function name on this line, so this is likely a
     # multi-line parameter list.  Try a bit harder to catch this case.
     for i in xrange(2):
       if (linenum > i and
-          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
-        check_params = False
-        break
+          Search(allowed_functions, clean_lines.elided[linenum - i - 1])):
+        return
 
-  if check_params:
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-        error(filename, linenum, 'runtime/references', 2,
-              'Is this a non-const reference? '
-              'If so, make const or use a pointer: ' +
-              ReplaceAll(' *<', '<', parameter))
+  decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+  for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+    if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and
+        not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)):
+      error(filename, linenum, 'runtime/references', 2,
+            'Is this a non-const reference? '
+            'If so, make const or use a pointer: ' +
+            ReplaceAll(' *<', '<', parameter))
 
 
-def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
-                    error):
+def CheckCasts(filename, clean_lines, linenum, error):
+  """Various cast related checks.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Check to see if they're using an conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
+  match = Search(
+      r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
+  if match and not expecting_function:
+    matched_type = match.group(2)
+
+    # matched_new_or_template is used to silence two false positives:
+    # - New operators
+    # - Template arguments with function types
+    #
+    # For template arguments, we match on types immediately following
+    # an opening bracket without any spaces.  This is a fast way to
+    # silence the common case where the function type is the first
+    # template argument.  False negative with less-than comparison is
+    # avoided because those operators are usually followed by a space.
+    #
+    #   function<double(double)>   // bracket + no space = false positive
+    #   value < double(42)         // bracket + space = true positive
+    matched_new_or_template = match.group(1)
+
+    # Avoid arrays by looking for brackets that come after the closing
+    # parenthesis.
+    if Match(r'\([^()]+\)\s*\[', match.group(3)):
+      return
+
+    # Other things to ignore:
+    # - Function pointers
+    # - Casts to pointer types
+    # - Placement new
+    # - Alias declarations
+    matched_funcptr = match.group(3)
+    if (matched_new_or_template is None and
+        not (matched_funcptr and
+             (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                    matched_funcptr) or
+              matched_funcptr.startswith('(*)'))) and
+        not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
+        not Search(r'new\(\S+\)\s*' + matched_type, line)):
+      error(filename, linenum, 'readability/casting', 4,
+            'Using deprecated casting style.  '
+            'Use static_cast<%s>(...) instead' %
+            matched_type)
+
+  if not expecting_function:
+    CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
+                    r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
+                     r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
+                    r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  #
+  # Some non-identifier character is required before the '&' for the
+  # expression to be recognized as a cast.  These are casts:
+  #   expression = &static_cast<int*>(temporary());
+  #   function(&(int*)(temporary()));
+  #
+  # This is not a cast:
+  #   reference_type&(int* function_param);
+  match = Search(
+      r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
+      r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match:
+    # Try a better error message when the & is bound to something
+    # dereferenced by the casted pointer, as opposed to the casted
+    # pointer itself.
+    parenthesis_error = False
+    match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line)
+    if match:
+      _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1)))
+      if x1 >= 0 and clean_lines.elided[y1][x1] == '(':
+        _, y2, x2 = CloseExpression(clean_lines, y1, x1)
+        if x2 >= 0:
+          extended_line = clean_lines.elided[y2][x2:]
+          if y2 < clean_lines.NumLines() - 1:
+            extended_line += clean_lines.elided[y2 + 1]
+          if Match(r'\s*(?:->|\[)', extended_line):
+            parenthesis_error = True
+
+    if parenthesis_error:
+      error(filename, linenum, 'readability/casting', 4,
+            ('Are you taking an address of something dereferenced '
+             'from a cast?  Wrapping the dereferenced expression in '
+             'parentheses will make the binding more obvious'))
+    else:
+      error(filename, linenum, 'runtime/casting', 4,
+            ('Are you taking an address of a cast?  '
+             'This is dangerous: could be a temp var.  '
+             'Take the address before doing the cast, rather than after'))
+
+
+def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
   """Checks for a C-style cast by looking for the pattern.
 
   Args:
     filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    line: The line of code to check.
-    raw_line: The raw line of code to check, with comments.
     cast_type: The string for the C++ cast to recommend.  This is either
       reinterpret_cast, static_cast, or const_cast, depending.
     pattern: The regular expression used to find C-style casts.
@@ -4151,75 +5284,34 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
     True if an error was emitted.
     False otherwise.
   """
+  line = clean_lines.elided[linenum]
   match = Search(pattern, line)
   if not match:
     return False
 
-  # e.g., sizeof(int)
-  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
-  if sizeof_match:
-    error(filename, linenum, 'runtime/sizeof', 1,
-          'Using sizeof(type).  Use sizeof(varname) instead if possible')
-    return True
-
-  # operator++(int) and operator--(int)
-  if (line[0:match.start(1) - 1].endswith(' operator++') or
-      line[0:match.start(1) - 1].endswith(' operator--')):
+  # Exclude lines with keywords that tend to look like casts
+  context = line[0:match.start(1) - 1]
+  if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context):
     return False
 
-  # A single unnamed argument for a function tends to look like old
-  # style cast.  If we see those, don't issue warnings for deprecated
-  # casts, instead issue warnings for unnamed arguments where
-  # appropriate.
-  #
-  # These are things that we want warnings for, since the style guide
-  # explicitly require all parameters to be named:
-  #   Function(int);
-  #   Function(int) {
-  #   ConstMember(int) const;
-  #   ConstMember(int) const {
-  #   ExceptionMember(int) throw (...);
-  #   ExceptionMember(int) throw (...) {
-  #   PureVirtual(int) = 0;
-  #
-  # These are functions of some sort, where the compiler would be fine
-  # if they had named parameters, but people often omit those
-  # identifiers to reduce clutter:
-  #   (FunctionPointer)(int);
-  #   (FunctionPointer)(int) = value;
-  #   Function((function_pointer_arg)(int))
-  #   <TemplateArgument(int)>;
-  #   <(FunctionPointerTemplateArgument)(int)>;
+  # Try expanding current context to see if we one level of
+  # parentheses inside a macro.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 5), -1):
+      context = clean_lines.elided[i] + context
+  if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context):
+    return False
+
+  # operator++(int) and operator--(int)
+  if context.endswith(' operator++') or context.endswith(' operator--'):
+    return False
+
+  # A single unnamed argument for a function tends to look like old style cast.
+  # If we see those, don't issue warnings for deprecated casts.
   remainder = line[match.end(0):]
-  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
-    # Looks like an unnamed parameter.
-
-    # Don't warn on any kind of template arguments.
-    if Match(r'^\s*>', remainder):
-      return False
-
-    # Don't warn on assignments to function pointers, but keep warnings for
-    # unnamed parameters to pure virtual functions.  Note that this pattern
-    # will also pass on assignments of "0" to function pointers, but the
-    # preferred values for those would be "nullptr" or "NULL".
-    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-    if matched_zero and matched_zero.group(1) != '0':
-      return False
-
-    # Don't warn on function pointer declarations.  For this we need
-    # to check what came before the "(type)" string.
-    if Match(r'.*\)\s*$', line[0:match.start(0)]):
-      return False
-
-    # Don't warn if the parameter is named with block comments, e.g.:
-    #  Function(int /*unused_param*/);
-    if '/*' in raw_line:
-      return False
-
-    # Passed all filters, issue warning here.
-    error(filename, linenum, 'readability/function', 3,
-          'All parameters should be named in a function')
-    return True
+  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
+           remainder):
+    return False
 
   # At this point, all that should be left is actual casts.
   error(filename, linenum, 'readability/casting', 4,
@@ -4229,6 +5321,28 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
   return True
 
 
+def ExpectingFunctionArgs(clean_lines, linenum):
+  """Checks whether where function type arguments are expected.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+
+  Returns:
+    True if the line at 'linenum' is inside something that expects arguments
+    of function types.
+  """
+  line = clean_lines.elided[linenum]
+  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+          (linenum >= 2 and
+           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                  clean_lines.elided[linenum - 1]) or
+            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                  clean_lines.elided[linenum - 2]) or
+            Search(r'\bstd::m?function\s*\<\s*$',
+                   clean_lines.elided[linenum - 1]))))
+
+
 _HEADERS_CONTAINING_TEMPLATES = (
     ('<deque>', ('deque',)),
     ('<functional>', ('unary_function', 'binary_function',
@@ -4251,11 +5365,15 @@ _HEADERS_CONTAINING_TEMPLATES = (
     ('<limits>', ('numeric_limits',)),
     ('<list>', ('list',)),
     ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
+    ('<memory>', ('allocator', 'make_shared', 'make_unique', 'shared_ptr',
+                  'unique_ptr', 'weak_ptr')),
     ('<queue>', ('queue', 'priority_queue',)),
     ('<set>', ('set', 'multiset',)),
     ('<stack>', ('stack',)),
     ('<string>', ('char_traits', 'basic_string',)),
+    ('<tuple>', ('tuple',)),
+    ('<unordered_map>', ('unordered_map', 'unordered_multimap')),
+    ('<unordered_set>', ('unordered_set', 'unordered_multiset')),
     ('<utility>', ('pair',)),
     ('<vector>', ('vector',)),
 
@@ -4266,18 +5384,26 @@ _HEADERS_CONTAINING_TEMPLATES = (
     ('<slist>', ('slist',)),
     )
 
+_HEADERS_MAYBE_TEMPLATES = (
+    ('<algorithm>', ('copy', 'max', 'min', 'min_element', 'sort',
+                     'transform',
+                    )),
+    ('<utility>', ('forward', 'make_pair', 'move', 'swap')),
+    )
+
 _RE_PATTERN_STRING = re.compile(r'\bstring\b')
 
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
+_re_pattern_headers_maybe_templates = []
+for _header, _templates in _HEADERS_MAYBE_TEMPLATES:
+  for _template in _templates:
+    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+    # type::max().
+    _re_pattern_headers_maybe_templates.append(
+        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+            _template,
+            _header))
 
+# Other scripts may reach in and modify this pattern.
 _re_pattern_templates = []
 for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
   for _template in _templates:
@@ -4317,13 +5443,13 @@ def FilesBelongToSameModule(filename_cc, filename_h):
     string: the additional prefix needed to open the header file.
   """
 
-  if not filename_cc.endswith('.cc'):
+  fileinfo = FileInfo(filename_cc)
+  if not fileinfo.IsSource():
     return (False, '')
-  filename_cc = filename_cc[:-len('.cc')]
-  if filename_cc.endswith('_unittest'):
-    filename_cc = filename_cc[:-len('_unittest')]
-  elif filename_cc.endswith('_test'):
-    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc[:-len(fileinfo.Extension())]
+  matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName())
+  if matched_test_suffix:
+    filename_cc = filename_cc[:-len(matched_test_suffix.group(1))]
   filename_cc = filename_cc.replace('/public/', '/')
   filename_cc = filename_cc.replace('/internal/', '/')
 
@@ -4342,16 +5468,16 @@ def FilesBelongToSameModule(filename_cc, filename_h):
   return files_belong_to_same_module, common_path
 
 
-def UpdateIncludeState(filename, include_state, io=codecs):
-  """Fill up the include_state with new includes found from the file.
+def UpdateIncludeState(filename, include_dict, io=codecs):
+  """Fill up the include_dict with new includes found from the file.
 
   Args:
     filename: the name of the header to read.
-    include_state: an _IncludeState instance in which the headers are inserted.
+    include_dict: a dictionary in which the headers are inserted.
     io: The io factory to use to read the file. Provided for testability.
 
   Returns:
-    True if a header was succesfully added. False otherwise.
+    True if a header was successfully added. False otherwise.
   """
   headerfile = None
   try:
@@ -4365,9 +5491,7 @@ def UpdateIncludeState(filename, include_state, io=codecs):
     match = _RE_PATTERN_INCLUDE.search(clean_line)
     if match:
       include = match.group(2)
-      # The value formatting is cute, but not really used right now.
-      # What matters here is that the key is in include_state.
-      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+      include_dict.setdefault(include, linenum)
   return True
 
 
@@ -4406,7 +5530,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
       if prefix.endswith('std::') or not prefix.endswith('::'):
         required['<string>'] = (linenum, 'string')
 
-    for pattern, template, header in _re_pattern_algorithm_header:
+    for pattern, template, header in _re_pattern_headers_maybe_templates:
       if pattern.search(line):
         required[header] = (linenum, template)
 
@@ -4415,15 +5539,21 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
       continue
 
     for pattern, template, header in _re_pattern_templates:
-      if pattern.search(line):
-        required[header] = (linenum, template)
+      matched = pattern.search(line)
+      if matched:
+        # Don't warn about IWYU in non-STL namespaces:
+        # (We check only the first match per line; good enough.)
+        prefix = line[:matched.start()]
+        if prefix.endswith('std::') or not prefix.endswith('::'):
+          required[header] = (linenum, template)
 
   # The policy is that if you #include something in foo.h you don't need to
   # include it again in foo.cc. Here, we will look at possible includes.
-  # Let's copy the include_state so it is only messed up within this function.
-  include_state = include_state.copy()
+  # Let's flatten the include_state include_list and copy it into a dictionary.
+  include_dict = dict([item for sublist in include_state.include_list
+                       for item in sublist])
 
-  # Did we find the header for this file (if any) and succesfully load it?
+  # Did we find the header for this file (if any) and successfully load it?
   header_found = False
 
   # Use the absolute path so that matching works properly.
@@ -4438,13 +5568,13 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
   # instead of 'foo_flymake.h'
   abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
 
-  # include_state is modified during iteration, so we iterate over a copy of
+  # include_dict is modified during iteration, so we iterate over a copy of
   # the keys.
-  header_keys = include_state.keys()
+  header_keys = include_dict.keys()
   for header in header_keys:
     (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
     fullpath = common_path + header
-    if same_module and UpdateIncludeState(fullpath, include_state, io):
+    if same_module and UpdateIncludeState(fullpath, include_dict, io):
       header_found = True
 
   # If we can't find the header file for a .cc, assume it's because we don't
@@ -4458,7 +5588,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
   # All the lines have been processed, report the errors found.
   for required_header_unstripped in required:
     template = required[required_header_unstripped][1]
-    if required_header_unstripped.strip('<>"') not in include_state:
+    if required_header_unstripped.strip('<>"') not in include_dict:
       error(filename, required[required_header_unstripped][0],
             'build/include_what_you_use', 4,
             'Add #include ' + required_header_unstripped + ' for ' + template)
@@ -4470,7 +5600,7 @@ _RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
 def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
   """Check that make_pair's template arguments are deduced.
 
-  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
   specified explicitly, and such use isn't intended in any case.
 
   Args:
@@ -4488,6 +5618,165 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
           ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
+def CheckRedundantVirtual(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "virtual" function-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for "virtual" on current line.
+  line = clean_lines.elided[linenum]
+  virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
+  if not virtual: return
+
+  # Ignore "virtual" keywords that are near access-specifiers.  These
+  # are only used in class base-specifier and do not apply to member
+  # functions.
+  if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
+      Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
+    return
+
+  # Ignore the "virtual" keyword from virtual base classes.  Usually
+  # there is a column on the same line in these cases (virtual base
+  # classes are rare in google3 because multiple inheritance is rare).
+  if Match(r'^.*[^:]:[^:].*$', line): return
+
+  # Look for the next opening parenthesis.  This is the start of the
+  # parameter list (possibly on the next line shortly after virtual).
+  # TODO(unknown): doesn't work if there are virtual functions with
+  # decltype() or other things that use parentheses, but csearch suggests
+  # that this is rare.
+  end_col = -1
+  end_line = -1
+  start_col = len(virtual.group(2))
+  for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[start_line][start_col:]
+    parameter_list = Match(r'^([^(]*)\(', line)
+    if parameter_list:
+      # Match parentheses to find the end of the parameter list
+      (_, end_line, end_col) = CloseExpression(
+          clean_lines, start_line, start_col + len(parameter_list.group(1)))
+      break
+    start_col = 0
+
+  if end_col < 0:
+    return  # Couldn't find end of parameter list, give up
+
+  # Look for "override" or "final" after the parameter list
+  # (possibly on the next few lines).
+  for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[i][end_col:]
+    match = Search(r'\b(override|final)\b', line)
+    if match:
+      error(filename, linenum, 'readability/inheritance', 4,
+            ('"virtual" is redundant since function is '
+             'already declared as "%s"' % match.group(1)))
+
+    # Set end_col to check whole lines after we are done with the
+    # first line.
+    end_col = 0
+    if Search(r'[^\w]\s*$', line):
+      break
+
+
+def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "override" or "final" virt-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for closing parenthesis nearby.  We need one to confirm where
+  # the declarator ends and where the virt-specifier starts to avoid
+  # false positives.
+  line = clean_lines.elided[linenum]
+  declarator_end = line.rfind(')')
+  if declarator_end >= 0:
+    fragment = line[declarator_end:]
+  else:
+    if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0:
+      fragment = line
+    else:
+      return
+
+  # Check that at most one of "override" or "final" is present, not both
+  if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment):
+    error(filename, linenum, 'readability/inheritance', 4,
+          ('"override" is redundant since function is '
+           'already declared as "final"'))
+
+
+
+
+# Returns true if we are at a new block, and it is directly
+# inside of a namespace.
+def IsBlockInNameSpace(nesting_state, is_forward_declaration):
+  """Checks that the new block is directly in a namespace.
+
+  Args:
+    nesting_state: The _NestingState object that contains info about our state.
+    is_forward_declaration: If the class is a forward declared class.
+  Returns:
+    Whether or not the new block is directly in a namespace.
+  """
+  if is_forward_declaration:
+    if len(nesting_state.stack) >= 1 and (
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+      return True
+    else:
+      return False
+
+  return (len(nesting_state.stack) > 1 and
+          nesting_state.stack[-1].check_namespace_indentation and
+          isinstance(nesting_state.stack[-2], _NamespaceInfo))
+
+
+def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                    raw_lines_no_comments, linenum):
+  """This method determines if we should apply our namespace indentation check.
+
+  Args:
+    nesting_state: The current nesting state.
+    is_namespace_indent_item: If we just put a new class on the stack, True.
+      If the top of the stack is not a class, or we did not recently
+      add the class, False.
+    raw_lines_no_comments: The lines without the comments.
+    linenum: The current line number we are processing.
+
+  Returns:
+    True if we should apply our namespace indentation check. Currently, it
+    only works for classes and namespaces inside of a namespace.
+  """
+
+  is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments,
+                                                     linenum)
+
+  if not (is_namespace_indent_item or is_forward_declaration):
+    return False
+
+  # If we are in a macro, we do not want to check the namespace indentation.
+  if IsMacroDefinition(raw_lines_no_comments, linenum):
+    return False
+
+  return IsBlockInNameSpace(nesting_state, is_forward_declaration)
+
+
+# Call this method if the line is directly inside of a namespace.
+# If the line above is blank (excluding comments) or the start of
+# an inner namespace, it cannot be indented.
+def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum,
+                                    error):
+  line = raw_lines_no_comments[linenum]
+  if Match(r'^\s+', line):
+    error(filename, linenum, 'runtime/indentation_namespace', 4,
+          'Do not indent within a namespace')
+
+
 def ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions=[]):
@@ -4501,7 +5790,7 @@ def ProcessLine(filename, file_extension, clean_lines, line,
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -4512,8 +5801,9 @@ def ProcessLine(filename, file_extension, clean_lines, line,
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
   nesting_state.Update(filename, clean_lines, line, error)
-  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
-    return
+  CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                               error)
+  if nesting_state.InAsmBlock(): return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
   CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
@@ -4526,9 +5816,82 @@ def ProcessLine(filename, file_extension, clean_lines, line,
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  CheckRedundantVirtual(filename, clean_lines, line, error)
+  CheckRedundantOverrideOrFinal(filename, clean_lines, line, error)
   for check_fn in extra_check_functions:
     check_fn(filename, clean_lines, line, error)
 
+def FlagCxx11Features(filename, clean_lines, linenum, error):
+  """Flag those c++11 features that we only allow in certain places.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++ TR1 headers.
+  if include and include.group(1).startswith('tr1/'):
+    error(filename, linenum, 'build/c++tr1', 5,
+          ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1))
+
+  # Flag unapproved C++11 headers.
+  if include and include.group(1) in ('cfenv',
+                                      'condition_variable',
+                                      'fenv.h',
+                                      'future',
+                                      'mutex',
+                                      'thread',
+                                      'chrono',
+                                      'ratio',
+                                      'regex',
+                                      'system_error',
+                                     ):
+    error(filename, linenum, 'build/c++11', 5,
+          ('<%s> is an unapproved C++11 header.') % include.group(1))
+
+  # The only place where we need to worry about C++11 keywords and library
+  # features in preprocessor directives is in macro definitions.
+  if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return
+
+  # These are classes and free functions.  The classes are always
+  # mentioned as std::*, but we only catch the free functions if
+  # they're not found by ADL.  They're alphabetical by header.
+  for top_name in (
+      # type_traits
+      'alignment_of',
+      'aligned_union',
+      ):
+    if Search(r'\bstd::%s\b' % top_name, line):
+      error(filename, linenum, 'build/c++11', 5,
+            ('std::%s is an unapproved C++11 class or function.  Send c-style '
+             'an example of where it would make your code more readable, and '
+             'they may let you use it.') % top_name)
+
+
+def FlagCxx14Features(filename, clean_lines, linenum, error):
+  """Flag those C++14 features that we restrict.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++14 headers.
+  if include and include.group(1) in ('scoped_allocator', 'shared_mutex'):
+    error(filename, linenum, 'build/c++14', 5,
+          ('<%s> is an unapproved C++14 header.') % include.group(1))
+
+
 def ProcessFileData(filename, file_extension, lines, error,
                     extra_check_functions=[]):
   """Performs lint checks and reports any errors to the given error function.
@@ -4549,31 +5912,122 @@ def ProcessFileData(filename, file_extension, lines, error,
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  nesting_state = _NestingState()
+  nesting_state = NestingState()
 
   ResetNolintSuppressions()
 
   CheckForCopyright(filename, lines, error)
-
-  if file_extension == 'h':
-    CheckForHeaderGuard(filename, lines, error)
-
+  ProcessGlobalSuppresions(lines)
   RemoveMultiLineComments(filename, lines, error)
   clean_lines = CleansedLines(lines)
+
+  if IsHeaderExtension(file_extension):
+    CheckForHeaderGuard(filename, clean_lines, error)
+
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions)
+    FlagCxx11Features(filename, clean_lines, line, error)
   nesting_state.CheckCompletedBlocks(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
+  # Check that the .cc file has included its header if it exists.
+  if _IsSourceExtension(file_extension):
+    CheckHeaderFileIncluded(filename, include_state, error)
+
   # We check here rather than inside ProcessLine so that we see raw
   # lines rather than "cleaned" lines.
   CheckForBadCharacters(filename, lines, error)
 
   CheckForNewlineAtEOF(filename, lines, error)
 
+def ProcessConfigOverrides(filename):
+  """ Loads the configuration files and processes the config overrides.
+
+  Args:
+    filename: The name of the file being processed by the linter.
+
+  Returns:
+    False if the current |filename| should not be processed further.
+  """
+
+  abs_filename = os.path.abspath(filename)
+  cfg_filters = []
+  keep_looking = True
+  while keep_looking:
+    abs_path, base_name = os.path.split(abs_filename)
+    if not base_name:
+      break  # Reached the root directory.
+
+    cfg_file = os.path.join(abs_path, "CPPLINT.cfg")
+    abs_filename = abs_path
+    if not os.path.isfile(cfg_file):
+      continue
+
+    try:
+      with open(cfg_file) as file_handle:
+        for line in file_handle:
+          line, _, _ = line.partition('#')  # Remove comments.
+          if not line.strip():
+            continue
+
+          name, _, val = line.partition('=')
+          name = name.strip()
+          val = val.strip()
+          if name == 'set noparent':
+            keep_looking = False
+          elif name == 'filter':
+            cfg_filters.append(val)
+          elif name == 'exclude_files':
+            # When matching exclude_files pattern, use the base_name of
+            # the current file name or the directory name we are processing.
+            # For example, if we are checking for lint errors in /foo/bar/baz.cc
+            # and we found the .cfg file at /foo/CPPLINT.cfg, then the config
+            # file's "exclude_files" filter is meant to be checked against "bar"
+            # and not "baz" nor "bar/baz.cc".
+            if base_name:
+              pattern = re.compile(val)
+              if pattern.match(base_name):
+                if _cpplint_state.quiet:
+                  # Suppress "Ignoring file" warning when using --quiet.
+                  return False
+                sys.stderr.write('Ignoring "%s": file excluded by "%s". '
+                                 'File path component "%s" matches '
+                                 'pattern "%s"\n' %
+                                 (filename, cfg_file, base_name, val))
+                return False
+          elif name == 'linelength':
+            global _line_length
+            try:
+                _line_length = int(val)
+            except ValueError:
+                sys.stderr.write('Line length must be numeric.')
+          elif name == 'root':
+            global _root
+            # root directories are specified relative to CPPLINT.cfg dir.
+            _root = os.path.join(os.path.dirname(cfg_file), val)
+          elif name == 'headers':
+            ProcessHppHeadersOption(val)
+          else:
+            sys.stderr.write(
+                'Invalid configuration option (%s) in file %s\n' %
+                (name, cfg_file))
+
+    except IOError:
+      sys.stderr.write(
+          "Skipping config file '%s': Can't open for reading\n" % cfg_file)
+      keep_looking = False
+
+  # Apply all the accumulated filters in reverse order (top-level directory
+  # config options having the least priority).
+  for filter in reversed(cfg_filters):
+     _AddFilters(filter)
+
+  return True
+
+
 def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """Does google-lint on a single file.
 
@@ -4589,7 +6043,15 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """
 
   _SetVerboseLevel(vlevel)
+  _BackupFilters()
+  old_errors = _cpplint_state.error_count
 
+  if not ProcessConfigOverrides(filename):
+    _RestoreFilters()
+    return
+
+  lf_lines = []
+  crlf_lines = []
   try:
     # Support the UNIX convention of using "-" for stdin.  Note that
     # we are not opening the file with universal newline support
@@ -4597,10 +6059,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
     # contain trailing '\r' characters if we are reading a file that
     # has CRLF endings.
     # If after the split a trailing '\r' is present, it is removed
-    # below. If it is not expected to be present (i.e. os.linesep !=
-    # '\r\n' as in Windows), a warning is issued below if this file
-    # is processed.
-
+    # below.
     if filename == '-':
       lines = codecs.StreamReaderWriter(sys.stdin,
                                         codecs.getreader('utf8'),
@@ -4609,16 +6068,19 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
     else:
       lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
 
-    carriage_return_found = False
     # Remove trailing '\r'.
-    for linenum in range(len(lines)):
+    # The -1 accounts for the extra trailing blank line we get from split()
+    for linenum in range(len(lines) - 1):
       if lines[linenum].endswith('\r'):
         lines[linenum] = lines[linenum].rstrip('\r')
-        carriage_return_found = True
+        crlf_lines.append(linenum + 1)
+      else:
+        lf_lines.append(linenum + 1)
 
   except IOError:
     sys.stderr.write(
         "Skipping input '%s': Can't open for reading\n" % filename)
+    _RestoreFilters()
     return
 
   # Note, if no dot is found, this will give the entire filename as the ext.
@@ -4632,14 +6094,30 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]):
   else:
     ProcessFileData(filename, file_extension, lines, Error,
                     extra_check_functions)
-    if carriage_return_found and os.linesep != '\r\n':
-      # Use 0 for linenum since outputting only one error for potentially
-      # several lines.
-      Error(filename, 0, 'whitespace/newline', 1,
-            'One or more unexpected \\r (^M) found;'
-            'better to use only a \\n')
 
-  sys.stderr.write('Done processing %s\n' % filename)
+    # If end-of-line sequences are a mix of LF and CR-LF, issue
+    # warnings on the lines with CR.
+    #
+    # Don't issue any warnings if all lines are uniformly LF or CR-LF,
+    # since critique can handle these just fine, and the style guide
+    # doesn't dictate a particular end of line sequence.
+    #
+    # We can't depend on os.linesep to determine what the desired
+    # end-of-line sequence should be, since that will return the
+    # server-side end-of-line sequence.
+    if lf_lines and crlf_lines:
+      # Warn on every line with CR.  An alternative approach might be to
+      # check whether the file is mostly CRLF or just LF, and warn on the
+      # minority, we bias toward LF here since most tools prefer LF.
+      for linenum in crlf_lines:
+        Error(filename, linenum, 'whitespace/newline', 1,
+              'Unexpected \\r (^M) found; better to use only \\n')
+
+  # Suppress printing anything if --quiet was passed unless the error
+  # count has increased after processing this file.
+  if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count:
+    sys.stdout.write('Done processing %s\n' % filename)
+  _RestoreFilters()
 
 
 def PrintUsage(message):
@@ -4681,13 +6159,16 @@ def ParseArguments(args):
                                                  'filter=',
                                                  'root=',
                                                  'linelength=',
-                                                 'extensions='])
+                                                 'extensions=',
+                                                 'headers=',
+                                                 'quiet'])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
   verbosity = _VerboseLevel()
   output_format = _OutputFormat()
   filters = ''
+  quiet = _Quiet()
   counting_style = ''
 
   for (opt, val) in opts:
@@ -4697,6 +6178,8 @@ def ParseArguments(args):
       if val not in ('emacs', 'vs7', 'eclipse'):
         PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
       output_format = val
+    elif opt == '--quiet':
+      quiet = True
     elif opt == '--verbose':
       verbosity = int(val)
     elif opt == '--filter':
@@ -4721,12 +6204,15 @@ def ParseArguments(args):
       try:
           _valid_extensions = set(val.split(','))
       except ValueError:
-          PrintUsage('Extensions must be comma seperated list.')
+          PrintUsage('Extensions must be comma separated list.')
+    elif opt == '--headers':
+      ProcessHppHeadersOption(val)
 
   if not filenames:
     PrintUsage('No files were specified.')
 
   _SetOutputFormat(output_format)
+  _SetQuiet(quiet)
   _SetVerboseLevel(verbosity)
   _SetFilters(filters)
   _SetCountingStyle(counting_style)
@@ -4747,7 +6233,9 @@ def main():
   _cpplint_state.ResetErrorCounts()
   for filename in filenames:
     ProcessFile(filename, _cpplint_state.verbose_level)
-  _cpplint_state.PrintErrorCounts()
+  # If --quiet is passed, suppress printing error count unless there are errors.
+  if not _cpplint_state.quiet or _cpplint_state.error_count > 0:
+    _cpplint_state.PrintErrorCounts()
 
   sys.exit(_cpplint_state.error_count > 0)
 
diff --git a/media/libvpx/libvpx/tools/diff.py b/media/libvpx/libvpx/tools/diff.py
index a96c7db851..860a6b051b 100644
--- a/media/libvpx/libvpx/tools/diff.py
+++ b/media/libvpx/libvpx/tools/diff.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
diff --git a/media/libvpx/libvpx/tools/ftfy.sh b/media/libvpx/libvpx/tools/ftfy.sh
deleted file mode 100644
index c005918fe7..0000000000
--- a/media/libvpx/libvpx/tools/ftfy.sh
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/bin/sh
-self="$0"
-dirname_self=$(dirname "$self")
-
-usage() {
-  cat <<EOF >&2
-Usage: $self [option]
-
-This script applies a whitespace transformation to the commit at HEAD. If no
-options are given, then the modified files are left in the working tree.
-
-Options:
-  -h, --help     Shows this message
-  -n, --dry-run  Shows a diff of the changes to be made.
-  --amend        Squashes the changes into the commit at HEAD
-                     This option will also reformat the commit message.
-  --commit       Creates a new commit containing only the whitespace changes
-  --msg-only     Reformat the commit message only, ignore the patch itself.
-
-EOF
-  rm -f ${CLEAN_FILES}
-  exit 1
-}
-
-
-log() {
-  echo "${self##*/}: $@" >&2
-}
-
-
-vpx_style() {
-  for f; do
-    case "$f" in
-      *.h|*.c|*.cc)
-        clang-format -i --style=file "$f"
-        ;;
-    esac
-  done
-}
-
-
-apply() {
-  [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
-}
-
-
-commit() {
-  LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}')
-  if [ -z "$LAST_CHANGEID" ]; then
-    log "HEAD doesn't have a Change-Id, unable to generate a new commit"
-    exit 1
-  fi
-
-  # Build a deterministic Change-Id from the parent's
-  NEW_CHANGEID=${LAST_CHANGEID}-styled
-  NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin)
-
-  # Commit, preserving authorship from the parent commit.
-  git commit -a -C HEAD > /dev/null
-  git commit --amend -F- << EOF
-Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9}
-
-Change-Id: ${NEW_CHANGEID}
-EOF
-}
-
-
-show_commit_msg_diff() {
-  if [ $DIFF_MSG_RESULT -ne 0 ]; then
-    log "Modified commit message:"
-    diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
-  fi
-}
-
-
-amend() {
-  show_commit_msg_diff
-  if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
-    git commit -a --amend -F "$NEW_COMMIT_MSG"
-  fi
-}
-
-
-diff_msg() {
-  git log -1 --format=%B > "$ORIG_COMMIT_MSG"
-  "${dirname_self}"/wrap-commit-msg.py \
-      < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
-  cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
-  DIFF_MSG_RESULT=$?
-}
-
-
-# Temporary files
-ORIG_DIFF=orig.diff.$$
-MODIFIED_DIFF=modified.diff.$$
-FINAL_DIFF=final.diff.$$
-ORIG_COMMIT_MSG=orig.commit-msg.$$
-NEW_COMMIT_MSG=new.commit-msg.$$
-CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
-CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
-
-# Preconditions
-[ $# -lt 2 ] || usage
-
-if ! clang-format -version >/dev/null 2>&1; then
-  log "clang-format not found"
-  exit 1
-fi
-
-if ! git diff --quiet HEAD; then
-  log "Working tree is dirty, commit your changes first"
-  exit 1
-fi
-
-# Need to be in the root
-cd "$(git rev-parse --show-toplevel)"
-
-# Collect the original diff
-git show > "${ORIG_DIFF}"
-
-# Apply the style guide on new and modified files and collect its diff
-for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do
-  case "$f" in
-    third_party/*) continue;;
-  esac
-  vpx_style "$f"
-done
-git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
-
-# Intersect the two diffs
-"${dirname_self}"/intersect-diffs.py \
-    "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
-INTERSECT_RESULT=$?
-git reset --hard >/dev/null
-
-# Fixup the commit message
-diff_msg
-
-# Handle options
-if [ -n "$1" ]; then
-  case "$1" in
-    -h|--help) usage;;
-    -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
-    --commit) apply "${FINAL_DIFF}"; commit;;
-    --amend) apply "${FINAL_DIFF}"; amend;;
-    --msg-only) amend;;
-    *) usage;;
-  esac
-else
-  apply "${FINAL_DIFF}"
-  if ! git diff --quiet; then
-    log "Formatting changes applied, verify and commit."
-    log "See also: http://www.webmproject.org/code/contribute/conventions/"
-    git diff --stat
-  fi
-fi
-
-rm -f ${CLEAN_FILES}
diff --git a/media/libvpx/libvpx/tools/intersect-diffs.py b/media/libvpx/libvpx/tools/intersect-diffs.py
index 4dbafa90b7..590e687b47 100644
--- a/media/libvpx/libvpx/tools/intersect-diffs.py
+++ b/media/libvpx/libvpx/tools/intersect-diffs.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
@@ -69,7 +69,7 @@ def main():
                 break
 
     if out_hunks:
-        print FormatDiffHunks(out_hunks)
+        print(FormatDiffHunks(out_hunks))
         sys.exit(1)
 
 if __name__ == "__main__":
diff --git a/media/libvpx/libvpx/tools/lint-hunks.py b/media/libvpx/libvpx/tools/lint-hunks.py
index 6e25d93624..0a94afebb9 100644
--- a/media/libvpx/libvpx/tools/lint-hunks.py
+++ b/media/libvpx/libvpx/tools/lint-hunks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
@@ -10,7 +10,7 @@
 """Performs style checking on each diff hunk."""
 import getopt
 import os
-import StringIO
+import io
 import subprocess
 import sys
 
@@ -63,21 +63,21 @@ def main(argv=None):
     try:
         try:
             opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS)
-        except getopt.error, msg:
+        except getopt.error as msg:
             raise Usage(msg)
 
         # process options
         for o, _ in opts:
             if o in ("-h", "--help"):
-                print __doc__
+                print(__doc__)
                 sys.exit(0)
 
         if args and len(args) > 1:
-            print __doc__
+            print(__doc__)
             sys.exit(0)
 
         # Find the fully qualified path to the root of the tree
-        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE)
+        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True)
         tl = tl.communicate()[0].strip()
 
         # See if we're working on the index or not.
@@ -93,9 +93,9 @@ def main(argv=None):
 
         # Get a list of all affected lines
         file_affected_line_map = {}
-        p = Subprocess(diff_cmd, stdout=subprocess.PIPE)
+        p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True)
         stdout = p.communicate()[0]
-        for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)):
+        for hunk in diff.ParseDiffHunks(io.StringIO(stdout)):
             filename = hunk.right.filename[2:]
             if filename not in file_affected_line_map:
                 file_affected_line_map[filename] = set()
@@ -103,21 +103,25 @@ def main(argv=None):
 
         # Run each affected file through cpplint
         lint_failed = False
-        for filename, affected_lines in file_affected_line_map.iteritems():
+        for filename, affected_lines in file_affected_line_map.items():
             if filename.split(".")[-1] not in ("c", "h", "cc"):
                 continue
+            if filename.startswith("third_party"):
+                continue
 
             if args:
                 # File contents come from git
                 show_cmd = SHOW_CMD + [args[0] + ":" + filename]
-                show = Subprocess(show_cmd, stdout=subprocess.PIPE)
+                show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True)
                 lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
-                                  stdin=show.stdout, stderr=subprocess.PIPE)
+                                  stdin=show.stdout, stderr=subprocess.PIPE,
+                                  text=True)
                 lint_out = lint.communicate()[1]
             else:
                 # File contents come from the working tree
                 lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
-                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE,
+                                  text=True)
                 stdin = open(os.path.join(tl, filename)).read()
                 lint_out = lint.communicate(stdin)[1]
 
@@ -127,17 +131,17 @@ def main(argv=None):
                     continue
                 warning_line_num = int(fields[1])
                 if warning_line_num in affected_lines:
-                    print "%s:%d:%s"%(filename, warning_line_num,
-                                      ":".join(fields[2:]))
+                    print("%s:%d:%s"%(filename, warning_line_num,
+                                      ":".join(fields[2:])))
                     lint_failed = True
 
         # Set exit code if any relevant lint errors seen
         if lint_failed:
             return 1
 
-    except Usage, err:
-        print >>sys.stderr, err
-        print >>sys.stderr, "for help use --help"
+    except Usage as err:
+        print(err, file=sys.stderr)
+        print("for help use --help", file=sys.stderr)
         return 2
 
 if __name__ == "__main__":
diff --git a/media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py b/media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py
new file mode 100644
index 0000000000..a46b7e760c
--- /dev/null
+++ b/media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py
@@ -0,0 +1,195 @@
+##  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
+import sys
+import matplotlib.pyplot as plt
+from matplotlib.collections import LineCollection
+from matplotlib import colors as mcolors
+import numpy as np
+import math
+
+
+def draw_mv_ls(axis, mv_ls, mode=0):
+  colors = np.array([(1., 0., 0., 1.)])
+  segs = np.array([
+      np.array([[ptr[0], ptr[1]], [ptr[0] + ptr[2], ptr[1] + ptr[3]]])
+      for ptr in mv_ls
+  ])
+  line_segments = LineCollection(
+      segs, linewidths=(1.,), colors=colors, linestyle='solid')
+  axis.add_collection(line_segments)
+  if mode == 0:
+    axis.scatter(mv_ls[:, 0], mv_ls[:, 1], s=2, c='b')
+  else:
+    axis.scatter(
+        mv_ls[:, 0] + mv_ls[:, 2], mv_ls[:, 1] + mv_ls[:, 3], s=2, c='b')
+
+
+def draw_pred_block_ls(axis, mv_ls, bs, mode=0):
+  colors = np.array([(0., 0., 0., 1.)])
+  segs = []
+  for ptr in mv_ls:
+    if mode == 0:
+      x = ptr[0]
+      y = ptr[1]
+    else:
+      x = ptr[0] + ptr[2]
+      y = ptr[1] + ptr[3]
+    x_ls = [x, x + bs, x + bs, x, x]
+    y_ls = [y, y, y + bs, y + bs, y]
+
+    segs.append(np.column_stack([x_ls, y_ls]))
+  line_segments = LineCollection(
+      segs, linewidths=(.5,), colors=colors, linestyle='solid')
+  axis.add_collection(line_segments)
+
+
+def read_frame(fp, no_swap=0):
+  plane = [None, None, None]
+  for i in range(3):
+    line = fp.readline()
+    word_ls = line.split()
+    word_ls = [int(item) for item in word_ls]
+    rows = word_ls[0]
+    cols = word_ls[1]
+
+    line = fp.readline()
+    word_ls = line.split()
+    word_ls = [int(item) for item in word_ls]
+
+    plane[i] = np.array(word_ls).reshape(rows, cols)
+    if i > 0:
+      plane[i] = plane[i].repeat(2, axis=0).repeat(2, axis=1)
+  plane = np.array(plane)
+  if no_swap == 0:
+    plane = np.swapaxes(np.swapaxes(plane, 0, 1), 1, 2)
+  return plane
+
+
+def yuv_to_rgb(yuv):
+  #mat = np.array([
+  #    [1.164,   0   , 1.596  ],
+  #    [1.164, -0.391, -0.813],
+  #    [1.164, 2.018 , 0     ] ]
+  #               )
+  #c = np.array([[ -16 , -16 , -16  ],
+  #              [ 0   , -128, -128 ],
+  #              [ -128, -128,   0  ]])
+
+  mat = np.array([[1, 0, 1.4075], [1, -0.3445, -0.7169], [1, 1.7790, 0]])
+  c = np.array([[0, 0, 0], [0, -128, -128], [-128, -128, 0]])
+  mat_c = np.dot(mat, c)
+  v = np.array([mat_c[0, 0], mat_c[1, 1], mat_c[2, 2]])
+  mat = mat.transpose()
+  rgb = np.dot(yuv, mat) + v
+  rgb = rgb.astype(int)
+  rgb = rgb.clip(0, 255)
+  return rgb / 255.
+
+
+def read_feature_score(fp, mv_rows, mv_cols):
+  line = fp.readline()
+  word_ls = line.split()
+  feature_score = np.array([math.log(float(v) + 1, 2) for v in word_ls])
+  feature_score = feature_score.reshape(mv_rows, mv_cols)
+  return feature_score
+
+def read_mv_mode_arr(fp, mv_rows, mv_cols):
+  line = fp.readline()
+  word_ls = line.split()
+  mv_mode_arr = np.array([int(v) for v in word_ls])
+  mv_mode_arr = mv_mode_arr.reshape(mv_rows, mv_cols)
+  return mv_mode_arr
+
+
+def read_frame_dpl_stats(fp):
+  line = fp.readline()
+  word_ls = line.split()
+  frame_idx = int(word_ls[1])
+  mi_rows = int(word_ls[3])
+  mi_cols = int(word_ls[5])
+  bs = int(word_ls[7])
+  ref_frame_idx = int(word_ls[9])
+  rf_idx = int(word_ls[11])
+  gf_frame_offset = int(word_ls[13])
+  ref_gf_frame_offset = int(word_ls[15])
+  mi_size = bs / 8
+  mv_ls = []
+  mv_rows = int((math.ceil(mi_rows * 1. / mi_size)))
+  mv_cols = int((math.ceil(mi_cols * 1. / mi_size)))
+  for i in range(mv_rows * mv_cols):
+    line = fp.readline()
+    word_ls = line.split()
+    row = int(word_ls[0]) * 8.
+    col = int(word_ls[1]) * 8.
+    mv_row = int(word_ls[2]) / 8.
+    mv_col = int(word_ls[3]) / 8.
+    mv_ls.append([col, row, mv_col, mv_row])
+  mv_ls = np.array(mv_ls)
+  feature_score = read_feature_score(fp, mv_rows, mv_cols)
+  mv_mode_arr = read_mv_mode_arr(fp, mv_rows, mv_cols)
+  img = yuv_to_rgb(read_frame(fp))
+  ref = yuv_to_rgb(read_frame(fp))
+  return rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score, mv_mode_arr
+
+
+def read_dpl_stats_file(filename, frame_num=0):
+  fp = open(filename)
+  line = fp.readline()
+  width = 0
+  height = 0
+  data_ls = []
+  while (line):
+    if line[0] == '=':
+      data_ls.append(read_frame_dpl_stats(fp))
+    line = fp.readline()
+    if frame_num > 0 and len(data_ls) == frame_num:
+      break
+  return data_ls
+
+
+if __name__ == '__main__':
+  filename = sys.argv[1]
+  data_ls = read_dpl_stats_file(filename, frame_num=5)
+  for rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score, mv_mode_arr in data_ls:
+    fig, axes = plt.subplots(2, 2)
+
+    axes[0][0].imshow(img)
+    draw_mv_ls(axes[0][0], mv_ls)
+    draw_pred_block_ls(axes[0][0], mv_ls, bs, mode=0)
+    #axes[0].grid(color='k', linestyle='-')
+    axes[0][0].set_ylim(img.shape[0], 0)
+    axes[0][0].set_xlim(0, img.shape[1])
+
+    if ref is not None:
+      axes[0][1].imshow(ref)
+      draw_mv_ls(axes[0][1], mv_ls, mode=1)
+      draw_pred_block_ls(axes[0][1], mv_ls, bs, mode=1)
+      #axes[1].grid(color='k', linestyle='-')
+      axes[0][1].set_ylim(ref.shape[0], 0)
+      axes[0][1].set_xlim(0, ref.shape[1])
+
+    axes[1][0].imshow(feature_score)
+    #feature_score_arr = feature_score.flatten()
+    #feature_score_max = feature_score_arr.max()
+    #feature_score_min = feature_score_arr.min()
+    #step = (feature_score_max - feature_score_min) / 20.
+    #feature_score_bins = np.arange(feature_score_min, feature_score_max, step)
+    #axes[1][1].hist(feature_score_arr, bins=feature_score_bins)
+    im = axes[1][1].imshow(mv_mode_arr)
+    #axes[1][1].figure.colorbar(im, ax=axes[1][1])
+
+    print rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, len(mv_ls)
+
+    flatten_mv_mode = mv_mode_arr.flatten()
+    zero_mv_count = sum(flatten_mv_mode == 0);
+    new_mv_count = sum(flatten_mv_mode == 1);
+    ref_mv_count = sum(flatten_mv_mode == 2) + sum(flatten_mv_mode == 3);
+    print zero_mv_count, new_mv_count, ref_mv_count
+    plt.show()
diff --git a/media/libvpx/libvpx/tools/set_analyzer_env.sh b/media/libvpx/libvpx/tools/set_analyzer_env.sh
new file mode 100644
index 0000000000..a3e8a1ae99
--- /dev/null
+++ b/media/libvpx/libvpx/tools/set_analyzer_env.sh
@@ -0,0 +1,135 @@
+##  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  Sourcing this file sets environment variables to simplify setting up
+##  sanitizer builds and testing.
+
+sanitizer="${1}"
+
+case "${sanitizer}" in
+  address) ;;
+  cfi) ;;
+  integer) ;;
+  memory) ;;
+  thread) ;;
+  undefined) ;;
+  clear)
+    echo "Clearing environment:"
+    set -x
+    unset CC CXX LD AR
+    unset CFLAGS CXXFLAGS LDFLAGS
+    unset ASAN_OPTIONS MSAN_OPTIONS TSAN_OPTIONS UBSAN_OPTIONS
+    set +x
+    return
+    ;;
+  *)
+    echo "Usage: source set_analyzer_env.sh [<sanitizer>|clear]"
+    echo "  Supported sanitizers:"
+    echo "    address cfi integer memory thread undefined"
+    return 1
+    ;;
+esac
+
+if [ ! $(which clang) ]; then
+  # TODO(johannkoenig): Support gcc analyzers.
+  echo "ERROR: 'clang' must be in your PATH"
+  return 1
+fi
+
+# Warnings.
+if [ "${sanitizer}" = "undefined" -o "${sanitizer}" = "integer" ]; then
+  echo "WARNING: When building the ${sanitizer} sanitizer for 32 bit targets"
+  echo "you must run:"
+  echo "export LDFLAGS=\"\${LDFLAGS} --rtlib=compiler-rt -lgcc_s\""
+  echo "See http://llvm.org/bugs/show_bug.cgi?id=17693 for details."
+fi
+
+if [ "${sanitizer}" = "undefined" ]; then
+  major_version=$(clang --version | head -n 1 \
+    | grep -o -E "[[:digit:]]\.[[:digit:]]\.[[:digit:]]" | cut -f1 -d.)
+  if [ ${major_version} -eq 5 ]; then
+    echo "WARNING: clang v5 has a problem with vp9 x86_64 high bit depth"
+    echo "configurations. It can take ~40 minutes to compile"
+    echo "vpx_dsp/x86/fwd_txfm_sse2.c"
+    echo "clang v4 did not have this issue."
+  fi
+fi
+
+echo "It is recommended to configure with '--enable-debug' to improve stack"
+echo "traces. On mac builds, run 'dysmutil' on the output binaries (vpxenc,"
+echo "test_libvpx, etc) to link the stack traces to source code lines."
+
+# Build configuration.
+cflags="-fsanitize=${sanitizer}"
+ldflags="-fsanitize=${sanitizer}"
+
+# Useful backtraces.
+cflags="${cflags} -fno-omit-frame-pointer"
+# Exact backtraces.
+cflags="${cflags} -fno-optimize-sibling-calls"
+
+case "${sanitizer}" in
+  cfi)
+    # https://clang.llvm.org/docs/ControlFlowIntegrity.html
+    cflags="${cflags} -fno-sanitize-trap=cfi -flto -fvisibility=hidden"
+    ldflags="${ldflags} -fno-sanitize-trap=cfi -flto -fuse-ld=gold"
+    export AR="llvm-ar"
+    ;;
+  integer|undefined)
+    # https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html
+    cflags="${cflags} -fsanitize=float-cast-overflow"
+    ;;
+esac
+
+set -x
+export CC="clang"
+export CXX="clang++"
+export LD="clang++"
+
+export CFLAGS="${cflags}"
+export CXXFLAGS="${cflags}"
+export LDFLAGS="${ldflags}"
+set +x
+
+# Execution configuration.
+sanitizer_options=""
+sanitizer_options="${sanitizer_options}:handle_segv=1"
+sanitizer_options="${sanitizer_options}:handle_abort=1"
+sanitizer_options="${sanitizer_options}:handle_sigfpe=1"
+sanitizer_options="${sanitizer_options}:fast_unwind_on_fatal=1"
+sanitizer_options="${sanitizer_options}:allocator_may_return_null=1"
+
+case "${sanitizer}" in
+  address)
+    sanitizer_options="${sanitizer_options}:detect_stack_use_after_return=1"
+    sanitizer_options="${sanitizer_options}:max_uar_stack_size_log=17"
+    set -x
+    export ASAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+  cfi)
+    # No environment settings
+    ;;
+  memory)
+    set -x
+    export MSAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+  thread)
+    # The thread sanitizer uses an entirely independent set of options.
+    set -x
+    export TSAN_OPTIONS="halt_on_error=1"
+    set +x
+    ;;
+  undefined|integer)
+    sanitizer_options="${sanitizer_options}:print_stacktrace=1"
+    set -x
+    export UBSAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+esac
diff --git a/media/libvpx/libvpx/tools/tiny_ssim.c b/media/libvpx/libvpx/tools/tiny_ssim.c
index 28052e0a84..c07a9d2118 100644
--- a/media/libvpx/libvpx/tools/tiny_ssim.c
+++ b/media/libvpx/libvpx/tools/tiny_ssim.c
@@ -8,17 +8,188 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <errno.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "./y4minput.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/mem.h"
 
-void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
-                          uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
-                          uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
+static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
+static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2
+static const int64_t cc2_10 = 3857925;   // (64^2*(.03*1023)^2
+static const int64_t cc1_12 = 6868593;   // (64^2*(.01*4095)^2
+static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride,
+                                   uint16_t *recon, int recon_stride,
+                                   unsigned int cols, unsigned int rows) {
+  unsigned int row, col;
+  uint64_t total_sse = 0;
+  int diff;
+  if (orig == NULL || recon == NULL) {
+    assert(0);
+    return 0;
+  }
+
+  for (row = 0; row < rows; row++) {
+    for (col = 0; col < cols; col++) {
+      diff = orig[col] - recon[col];
+      total_sse += diff * diff;
+    }
+
+    orig += orig_stride;
+    recon += recon_stride;
+  }
+  return total_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
+                                 int recon_stride, unsigned int cols,
+                                 unsigned int rows) {
+  unsigned int row, col;
+  uint64_t total_sse = 0;
+  int diff;
+  if (orig == NULL || recon == NULL) {
+    assert(0);
+    return 0;
+  }
+
+  for (row = 0; row < rows; row++) {
+    for (col = 0; col < cols; col++) {
+      diff = orig[col] - recon[col];
+      total_sse += diff * diff;
+    }
+
+    orig += orig_stride;
+    recon += recon_stride;
+  }
+  return total_sse;
+}
+
+#define MAX_PSNR 100
+static double mse2psnr(double samples, double peak, double mse) {
+  double psnr;
+
+  if (mse > 0.0)
+    psnr = 10.0 * log10(peak * peak * samples / mse);
+  else
+    psnr = MAX_PSNR;  // Limit to prevent / 0
+
+  if (psnr > MAX_PSNR) psnr = MAX_PSNR;
+
+  return psnr;
+}
+
+typedef enum { RAW_YUV, Y4M } input_file_type;
+
+typedef struct input_file {
+  FILE *file;
+  input_file_type type;
+  unsigned char *buf;
+  y4m_input y4m;
+  vpx_image_t img;
+  int w;
+  int h;
+  int bit_depth;
+  int frame_size;
+} input_file_t;
+
+// Open a file and determine if its y4m or raw.  If y4m get the header.
+static int open_input_file(const char *file_name, input_file_t *input, int w,
+                           int h, int bit_depth) {
+  char y4m_buf[4];
+  input->w = w;
+  input->h = h;
+  input->bit_depth = bit_depth;
+  input->type = RAW_YUV;
+  input->buf = NULL;
+  input->file = strcmp(file_name, "-") ? fopen(file_name, "rb") : stdin;
+  if (input->file == NULL) return -1;
+  if (fread(y4m_buf, 1, 4, input->file) != 4) return -1;
+  if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M;
+  switch (input->type) {
+    case Y4M:
+      y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0);
+      input->w = input->y4m.pic_w;
+      input->h = input->y4m.pic_h;
+      input->bit_depth = input->y4m.bit_depth;
+      // Y4M alloc's its own buf. Init this to avoid problems if we never
+      // read frames.
+      memset(&input->img, 0, sizeof(input->img));
+      break;
+    case RAW_YUV:
+      fseek(input->file, 0, SEEK_SET);
+      input->w = w;
+      input->h = h;
+      // handle odd frame sizes
+      input->frame_size = w * h + ((w + 1) / 2) * ((h + 1) / 2) * 2;
+      if (bit_depth > 8) {
+        input->frame_size *= 2;
+      }
+      input->buf = malloc(input->frame_size);
+      break;
+  }
+  return 0;
+}
+
+static void close_input_file(input_file_t *in) {
+  if (in->file) fclose(in->file);
+  if (in->type == Y4M) {
+    vpx_img_free(&in->img);
+  } else {
+    free(in->buf);
+  }
+}
+
+// Returns 1 on success, 0 on failure due to a read error or eof (or format
+// error in the case of y4m).
+static int read_input_file(input_file_t *in, unsigned char **y,
+                           unsigned char **u, unsigned char **v, int bd) {
+  size_t r1 = 0;
+  switch (in->type) {
+    case Y4M:
+      r1 = y4m_input_fetch_frame(&in->y4m, in->file, &in->img);
+      if (r1 == (size_t)-1) return 0;
+      *y = in->img.planes[0];
+      *u = in->img.planes[1];
+      *v = in->img.planes[2];
+      break;
+    case RAW_YUV:
+      if (bd < 9) {
+        r1 = fread(in->buf, in->frame_size, 1, in->file);
+        *y = in->buf;
+        *u = in->buf + in->w * in->h;
+        *v = *u + ((1 + in->w) / 2) * ((1 + in->h) / 2);
+      } else {
+        r1 = fread(in->buf, in->frame_size, 1, in->file);
+        *y = in->buf;
+        *u = in->buf + (in->w * in->h) * 2;
+        *v = *u + 2 * ((1 + in->w) / 2) * ((1 + in->h) / 2);
+      }
+      break;
+  }
+
+  return r1 != 0;
+}
+
+static void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp,
+                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+                           uint32_t *sum_sq_r, uint32_t *sum_sxr) {
   int i, j;
+  if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL ||
+      sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) {
+    assert(0);
+    return;
+  }
   for (i = 0; i < 8; i++, s += sp, r += rp) {
     for (j = 0; j < 8; j++) {
       *sum_s += s[j];
@@ -30,40 +201,79 @@ void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
   }
 }
 
-static const int64_t cc1 = 26634;   // (64^2*(.01*255)^2
-static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                                  int rp, uint32_t *sum_s, uint32_t *sum_r,
+                                  uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                  uint32_t *sum_sxr) {
+  int i, j;
+  if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL ||
+      sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) {
+    assert(0);
+    return;
+  }
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
-                         uint32_t sum_sq_r, uint32_t sum_sxr, int count) {
-  int64_t ssim_n, ssim_d;
-  int64_t c1, c2;
+                         uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+                         uint32_t bd) {
+  double ssim_n, ssim_d;
+  int64_t c1 = 0, c2 = 0;
+  if (bd == 8) {
+    // scale the constants by number of pixels
+    c1 = (cc1 * count * count) >> 12;
+    c2 = (cc2 * count * count) >> 12;
+  } else if (bd == 10) {
+    c1 = (cc1_10 * count * count) >> 12;
+    c2 = (cc2_10 * count * count) >> 12;
+  } else if (bd == 12) {
+    c1 = (cc1_12 * count * count) >> 12;
+    c2 = (cc2_12 * count * count) >> 12;
+  } else {
+    assert(0);
+  }
 
-  // scale the constants by number of pixels
-  c1 = (cc1 * count * count) >> 12;
-  c2 = (cc2 * count * count) >> 12;
+  ssim_n = (2.0 * sum_s * sum_r + c1) *
+           (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
 
-  ssim_n = (2 * sum_s * sum_r + c1) *
-           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+  ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+           ((double)count * sum_sq_s - (double)sum_s * sum_s +
+            (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
 
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
-
-  return ssim_n * 1.0 / ssim_d;
+  return ssim_n / ssim_d;
 }
 
-static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                       &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+  ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                              int rp, uint32_t bd) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                        &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
-double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
-                 int stride_img2, int width, int height) {
+static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                    int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
   double ssim_total = 0;
@@ -81,120 +291,295 @@ double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
   return ssim_total;
 }
 
-static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
-                                 int recon_stride, unsigned int cols,
-                                 unsigned int rows) {
-  unsigned int row, col;
-  uint64_t total_sse = 0;
-  int diff;
+#if CONFIG_VP9_HIGHBITDEPTH
+static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                           int stride_img1, int stride_img2, int width,
+                           int height, uint32_t bd) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
 
-  for (row = 0; row < rows; row++) {
-    for (col = 0; col < cols; col++) {
-      diff = orig[col] - recon[col];
-      total_sse += diff * diff;
+  // sample point start with each 4x4 location
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v =
+          highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                          CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd);
+      ssim_total += v;
+      samples++;
     }
-
-    orig += orig_stride;
-    recon += recon_stride;
   }
-
-  return total_sse;
-}
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double samples, double peak, double mse) {
-  double psnr;
-
-  if (mse > 0.0)
-    psnr = 10.0 * log10(peak * peak * samples / mse);
-  else
-    psnr = MAX_PSNR;  // Limit to prevent / 0
-
-  if (psnr > MAX_PSNR) psnr = MAX_PSNR;
-
-  return psnr;
+  ssim_total /= samples;
+  return ssim_total;
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 int main(int argc, char *argv[]) {
-  FILE *f[2];
-  uint8_t *buf[2];
-  int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0;
-  double ssim = 0, psnravg = 0, psnrglb = 0;
-  double ssimy, ssimu, ssimv;
-  uint64_t psnry, psnru, psnrv;
+  FILE *framestats = NULL;
+  int bit_depth = 8;
+  int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0;
+  double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0;
+  double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0;
+  double psnravg = 0, psnryavg = 0, psnruavg = 0, psnrvavg = 0;
+  double *ssimy = NULL, *ssimu = NULL, *ssimv = NULL;
+  uint64_t *psnry = NULL, *psnru = NULL, *psnrv = NULL;
+  size_t i, n_frames = 0, allocated_frames = 0;
+  int return_value = 0;
+  input_file_t in[2];
+  double peak = 255.0;
 
-  if (argc < 4) {
-    fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n",
+  memset(in, 0, sizeof(in));
+
+  if (argc < 3) {
+    fprintf(stderr,
+            "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}"
+            " [WxH tl_skip={0,1,3} frame_stats_file bits]\n",
             argv[0]);
     return 1;
   }
-  f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin;
-  f[1] = strcmp(argv[2], "-") ? fopen(argv[2], "rb") : stdin;
-  sscanf(argv[3], "%dx%d", &w, &h);
-  // Number of frames to skip from file1.yuv for every frame used. Normal values
-  // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding
-  // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding.
-  if (argc > 4) {
-    sscanf(argv[4], "%d", &tl_skip);
-  }
-  if (!f[0] || !f[1]) {
-    fprintf(stderr, "Could not open input files: %s\n", strerror(errno));
-    return 1;
-  }
-  if (w <= 0 || h <= 0 || w & 1 || h & 1) {
-    fprintf(stderr, "Invalid size %dx%d\n", w, h);
-    return 1;
-  }
-  buf[0] = malloc(w * h * 3 / 2);
-  buf[1] = malloc(w * h * 3 / 2);
-  n_frames = 0;
-  while (1) {
-    size_t r1, r2;
-    r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]);
-    if (r1) {
-      // Reading parts of file1.yuv that were not used in temporal layer.
-      if (tl_skips_remaining > 0) {
-        --tl_skips_remaining;
-        continue;
-      }
-      // Use frame, but skip |tl_skip| after it.
-      tl_skips_remaining = tl_skip;
+
+  if (argc > 3) {
+    if (sscanf(argv[3], "%dx%d", &w, &h) != 2) {
+      fprintf(stderr, "arguments for w/h not assigned!\n");
+      goto clean_up;
     }
-    r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]);
-    if (r1 && r2 && r1 != r2) {
-      fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno),
-              (int)r1, (int)r2);
-      return 1;
-    } else if (r1 == 0 || r2 == 0) {
+    // Limit width/height to 4K. The frame_size set in the function
+    // open_input_file() will still be within range of int.
+    if (w < 1 || w > 4096 || h < 1 || h > 4096) {
+      fprintf(stderr,
+              "width or height is too large (above 4096) or below 1!\n");
+      goto clean_up;
+    }
+  }
+
+  if (argc > 6) {
+    if (sscanf(argv[6], "%d", &bit_depth) != 1) {
+      fprintf(stderr, "argument for bit_depth not assigned!\n");
+      goto clean_up;
+    }
+  }
+
+  if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) {
+    fprintf(stderr, "File %s can't be opened or parsed!\n", argv[1]);
+    goto clean_up;
+  }
+
+  if (w == 0 && h == 0) {
+    // If a y4m is the first file and w, h is not set grab from first file.
+    w = in[0].w;
+    h = in[0].h;
+    bit_depth = in[0].bit_depth;
+  }
+  if (bit_depth == 10) peak = 1023.0;
+
+  if (bit_depth == 12) peak = 4095.0;
+
+  if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) {
+    fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]);
+    goto clean_up;
+  }
+
+  if (in[0].w != in[1].w || in[0].h != in[1].h || in[0].w != w ||
+      in[0].h != h || w == 0 || h == 0) {
+    fprintf(stderr,
+            "Failing: Image dimensions don't match or are unspecified!\n");
+    return_value = 1;
+    goto clean_up;
+  }
+
+  if (in[0].bit_depth != in[1].bit_depth) {
+    fprintf(stderr,
+            "Failing: Image bit depths don't match or are unspecified!\n");
+    return_value = 1;
+    goto clean_up;
+  }
+
+  bit_depth = in[0].bit_depth;
+
+  // Number of frames to skip from file1.yuv for every frame used. Normal
+  // values 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL
+  // encoding in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer
+  // encoding.
+  if (argc > 4) {
+    if (sscanf(argv[4], "%d", &tl_skip) != 1) {
+      fprintf(stderr, "argument for tl_skip not assigned!\n");
+      goto clean_up;
+    }
+    if (argc > 5) {
+      framestats = fopen(argv[5], "w");
+      if (!framestats) {
+        fprintf(stderr, "Could not open \"%s\" for writing: %s\n", argv[5],
+                strerror(errno));
+        return_value = 1;
+        goto clean_up;
+      }
+    }
+  }
+
+  while (1) {
+    int r1, r2;
+    unsigned char *y[2], *u[2], *v[2];
+
+    r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth);
+    if (r1 == 0) {
+      if (ferror(in[0].file)) {
+        fprintf(stderr, "Failed to read data from '%s'\n", argv[1]);
+        return_value = 1;
+        goto clean_up;
+      }
       break;
     }
-#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
-  ssim = vp8_ssim2(buf0, buf1, w, w, w, h);         \
-  psnr = calc_plane_error(buf0, w, buf1, w, w, h);
-    psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h);
-    psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2);
-    psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4,
-                  w / 2, h / 2);
-    ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv);
-    psnravg +=
-        vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv);
-    psnrglb += psnry + psnru + psnrv;
+
+    // Reading parts of file1.yuv that were not used in temporal layer.
+    if (tl_skips_remaining > 0) {
+      --tl_skips_remaining;
+      continue;
+    }
+    // Use frame, but skip |tl_skip| after it.
+    tl_skips_remaining = tl_skip;
+
+    r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth);
+    if (r2 == 0) {
+      if (ferror(in[1].file)) {
+        fprintf(stderr, "Failed to read data from '%s'\n", argv[2]);
+        return_value = 1;
+        goto clean_up;
+      }
+      break;
+    }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)                           \
+  do {                                                                        \
+    if (bit_depth < 9) {                                                      \
+      ssim = ssim2(buf0, buf1, w, w, w, h);                                   \
+      psnr = calc_plane_error(buf0, w, buf1, w, w, h);                        \
+    } else {                                                                  \
+      ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), \
+                          w, w, w, h, bit_depth);                             \
+      psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w,                    \
+                                CAST_TO_SHORTPTR(buf1), w, w, h);             \
+    }                                                                         \
+  } while (0)
+#else
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)  \
+  do {                                               \
+    ssim = ssim2(buf0, buf1, w, w, w, h);            \
+    psnr = calc_plane_error(buf0, w, buf1, w, w, h); \
+  } while (0)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if (n_frames == allocated_frames) {
+      allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2;
+      ssimy = realloc(ssimy, allocated_frames * sizeof(*ssimy));
+      ssimu = realloc(ssimu, allocated_frames * sizeof(*ssimu));
+      ssimv = realloc(ssimv, allocated_frames * sizeof(*ssimv));
+      psnry = realloc(psnry, allocated_frames * sizeof(*psnry));
+      psnru = realloc(psnru, allocated_frames * sizeof(*psnru));
+      psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv));
+      if (!(ssimy && ssimu && ssimv && psnry && psnru && psnrv)) {
+        fprintf(stderr, "Error allocating SSIM/PSNR data.\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+    psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h);
+    psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2,
+                  (h + 1) / 2);
+    psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], (w + 1) / 2,
+                  (h + 1) / 2);
+
     n_frames++;
   }
-  free(buf[0]);
-  free(buf[1]);
-  ssim /= n_frames;
+
+  if (framestats) {
+    fprintf(framestats,
+            "ssim,ssim-y,ssim-u,ssim-v,psnr,psnr-y,psnr-u,psnr-v\n");
+  }
+
+  for (i = 0; i < n_frames; ++i) {
+    double frame_ssim;
+    double frame_psnr, frame_psnry, frame_psnru, frame_psnrv;
+
+    frame_ssim = 0.8 * ssimy[i] + 0.1 * (ssimu[i] + ssimv[i]);
+    ssimavg += frame_ssim;
+    ssimyavg += ssimy[i];
+    ssimuavg += ssimu[i];
+    ssimvavg += ssimv[i];
+
+    frame_psnr =
+        mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]);
+    frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]);
+    frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]);
+    frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]);
+
+    psnravg += frame_psnr;
+    psnryavg += frame_psnry;
+    psnruavg += frame_psnru;
+    psnrvavg += frame_psnrv;
+
+    psnryglb += psnry[i];
+    psnruglb += psnru[i];
+    psnrvglb += psnrv[i];
+
+    if (framestats) {
+      fprintf(framestats, "%lf,%lf,%lf,%lf,%lf,%lf,%lf,%lf\n", frame_ssim,
+              ssimy[i], ssimu[i], ssimv[i], frame_psnr, frame_psnry,
+              frame_psnru, frame_psnrv);
+    }
+  }
+
+  ssimavg /= n_frames;
+  ssimyavg /= n_frames;
+  ssimuavg /= n_frames;
+  ssimvavg /= n_frames;
+
+  printf("VpxSSIM: %lf\n", 100 * pow(ssimavg, 8.0));
+  printf("SSIM: %lf\n", ssimavg);
+  printf("SSIM-Y: %lf\n", ssimyavg);
+  printf("SSIM-U: %lf\n", ssimuavg);
+  printf("SSIM-V: %lf\n", ssimvavg);
+  puts("");
+
   psnravg /= n_frames;
-  psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
+  psnryavg /= n_frames;
+  psnruavg /= n_frames;
+  psnrvavg /= n_frames;
 
   printf("AvgPSNR: %lf\n", psnravg);
+  printf("AvgPSNR-Y: %lf\n", psnryavg);
+  printf("AvgPSNR-U: %lf\n", psnruavg);
+  printf("AvgPSNR-V: %lf\n", psnrvavg);
+  puts("");
+
+  psnrglb = psnryglb + psnruglb + psnrvglb;
+  psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb);
+  psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb);
+  psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb);
+  psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb);
+
   printf("GlbPSNR: %lf\n", psnrglb);
-  printf("SSIM: %lf\n", 100 * pow(ssim, 8.0));
-  printf("Nframes: %d\n", n_frames);
+  printf("GlbPSNR-Y: %lf\n", psnryglb);
+  printf("GlbPSNR-U: %lf\n", psnruglb);
+  printf("GlbPSNR-V: %lf\n", psnrvglb);
+  puts("");
 
-  if (strcmp(argv[1], "-")) fclose(f[0]);
-  if (strcmp(argv[2], "-")) fclose(f[1]);
+  printf("Nframes: %d\n", (int)n_frames);
 
-  return 0;
+clean_up:
+
+  close_input_file(&in[0]);
+  close_input_file(&in[1]);
+
+  if (framestats) fclose(framestats);
+
+  free(ssimy);
+  free(ssimu);
+  free(ssimv);
+
+  free(psnry);
+  free(psnru);
+  free(psnrv);
+
+  return return_value;
 }
diff --git a/media/libvpx/libvpx/tools/wrap-commit-msg.py b/media/libvpx/libvpx/tools/wrap-commit-msg.py
index d5b4b046b1..ba3fa58732 100644
--- a/media/libvpx/libvpx/tools/wrap-commit-msg.py
+++ b/media/libvpx/libvpx/tools/wrap-commit-msg.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c
index 6f14c25561..5af971f720 100644
--- a/media/libvpx/libvpx/tools_common.c
+++ b/media/libvpx/libvpx/tools_common.c
@@ -24,15 +24,11 @@
 #include "vpx/vp8dx.h"
 #endif
 
-#if defined(_WIN32) || defined(__OS2__)
+#include "vpx/vpx_codec.h"
+
+#if defined(_WIN32)
 #include <io.h>
 #include <fcntl.h>
-
-#ifdef __OS2__
-#define _setmode setmode
-#define _fileno fileno
-#define _O_BINARY O_BINARY
-#endif
 #endif
 
 #define LOG_ERROR(label)               \
@@ -46,9 +42,17 @@
     va_end(ap);                        \
   } while (0)
 
+#if CONFIG_ENCODERS
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+  return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+#endif
+
 FILE *set_binary_mode(FILE *stream) {
   (void)stream;
-#if defined(_WIN32) || defined(__OS2__)
+#if defined(_WIN32)
   _setmode(_fileno(stream), _O_BINARY);
 #endif
   return stream;
@@ -69,8 +73,8 @@ void warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
 void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
   const char *detail = vpx_codec_error_detail(ctx);
 
-  printf("%s: %s\n", s, vpx_codec_error(ctx));
-  if (detail) printf("    %s\n", detail);
+  fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx));
+  if (detail) fprintf(stderr, "    %s\n", detail);
   exit(EXIT_FAILURE);
 }
 
@@ -83,10 +87,13 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
 
   for (plane = 0; plane < 3; ++plane) {
     uint8_t *ptr;
-    const int w = vpx_img_plane_width(yuv_frame, plane);
+    int w = vpx_img_plane_width(yuv_frame, plane);
     const int h = vpx_img_plane_height(yuv_frame, plane);
     int r;
-
+    // Assuming that for nv12 we read all chroma data at once
+    if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+    // Fixing NV12 chroma width if it is odd
+    if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
     /* Determine the correct plane based on the image format. The for-loop
      * always counts in Y,U,V order, but this may not match the order of
      * the data on disk.
@@ -200,8 +207,6 @@ const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) {
 
 #endif  // CONFIG_DECODERS
 
-// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
-// of vpx_image_t support
 int vpx_img_plane_width(const vpx_image_t *img, int plane) {
   if (plane > 0 && img->x_chroma_shift > 0)
     return (img->d_w + 1) >> img->x_chroma_shift;
@@ -218,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) {
 
 void vpx_img_write(const vpx_image_t *img, FILE *file) {
   int plane;
+  const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     const unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane) *
-                  ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    int w = vpx_img_plane_width(img, plane);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
+    // Assuming that for nv12 we write all chroma data at once
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+    // Fixing NV12 chroma width if it is odd
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
     for (y = 0; y < h; ++y) {
-      fwrite(buf, 1, w, file);
+      fwrite(buf, bytespp, w, file);
       buf += stride;
     }
   }
@@ -236,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) {
 
 int vpx_img_read(vpx_image_t *img, FILE *file) {
   int plane;
+  const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
 
   for (plane = 0; plane < 3; ++plane) {
     unsigned char *buf = img->planes[plane];
     const int stride = img->stride[plane];
-    const int w = vpx_img_plane_width(img, plane) *
-                  ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    int w = vpx_img_plane_width(img, plane);
     const int h = vpx_img_plane_height(img, plane);
     int y;
 
+    // Assuming that for nv12 we read all chroma data at once
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break;
+    // Fixing NV12 chroma width if it is odd
+    if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1;
+
     for (y = 0; y < h; ++y) {
-      if (fread(buf, 1, w, file) != (size_t)w) return 0;
+      if (fread(buf, bytespp, w, file) != (size_t)w) return 0;
       buf += stride;
     }
   }
@@ -266,6 +281,88 @@ double sse_to_psnr(double samples, double peak, double sse) {
   }
 }
 
+#if CONFIG_ENCODERS
+int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
+  FILE *f = input_ctx->file;
+  y4m_input *y4m = &input_ctx->y4m;
+  int shortread = 0;
+
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
+    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+  } else {
+    shortread = read_yuv_frame(input_ctx, img);
+  }
+
+  return !shortread;
+}
+
+int file_is_y4m(const char detect[4]) {
+  if (memcmp(detect, "YUV4", 4) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
+int fourcc_is_ivf(const char detect[4]) {
+  if (memcmp(detect, "DKIF", 4) == 0) {
+    return 1;
+  }
+  return 0;
+}
+
+void open_input_file(struct VpxInputContext *input) {
+  /* Parse certain options from the input file, if possible */
+  input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb")
+                                             : set_binary_mode(stdin);
+
+  if (!input->file) fatal("Failed to open input file");
+
+  if (!fseeko(input->file, 0, SEEK_END)) {
+    /* Input file is seekable. Figure out how long it is, so we can get
+     * progress info.
+     */
+    input->length = ftello(input->file);
+    rewind(input->file);
+  }
+
+  /* Default to 1:1 pixel aspect ratio. */
+  input->pixel_aspect_ratio.numerator = 1;
+  input->pixel_aspect_ratio.denominator = 1;
+
+  /* For RAW input sources, these bytes will applied on the first frame
+   *  in read_frame().
+   */
+  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+  input->detect.position = 0;
+
+  if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
+      input->file_type = FILE_TYPE_Y4M;
+      input->width = input->y4m.pic_w;
+      input->height = input->y4m.pic_h;
+      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+      input->framerate.numerator = input->y4m.fps_n;
+      input->framerate.denominator = input->y4m.fps_d;
+      input->fmt = input->y4m.vpx_fmt;
+      input->bit_depth = input->y4m.bit_depth;
+    } else {
+      fatal("Unsupported Y4M stream.");
+    }
+  } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
+    fatal("IVF is not supported as input.");
+  } else {
+    input->file_type = FILE_TYPE_RAW;
+  }
+}
+
+void close_input_file(struct VpxInputContext *input) {
+  fclose(input->file);
+  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+#endif
+
 // TODO(debargha): Consolidate the functions below into a separate file.
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
@@ -284,7 +381,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I42216:
     case VPX_IMG_FMT_I44416:
     case VPX_IMG_FMT_I44016: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -320,7 +417,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -361,7 +458,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -396,7 +493,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I42216:
     case VPX_IMG_FMT_I44416:
     case VPX_IMG_FMT_I44016: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -430,7 +527,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src,
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440: break;
-    default: fatal("Unsupported image conversion"); break;
+    default: fatal("Unsupported image conversion");
   }
   for (plane = 0; plane < 3; plane++) {
     int w = src->d_w;
@@ -459,3 +556,225 @@ void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) {
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) {
+  uint32_t l_w = img1->d_w;
+  uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  uint32_t i;
+  int match = 1;
+
+  match &= (img1->fmt == img2->fmt);
+  match &= (img1->d_w == img2->d_w);
+  match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    l_w *= 2;
+    c_w *= 2;
+  }
+#endif
+
+  for (i = 0; i < img1->d_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     l_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     c_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     c_w) == 0);
+
+  return match;
+}
+
+#define mmin(a, b) ((a) < (b) ? (a) : (b))
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void find_mismatch_high(const vpx_image_t *const img1,
+                        const vpx_image_t *const img2, int yloc[4], int uloc[4],
+                        int vloc[4]) {
+  uint16_t *plane1, *plane2;
+  uint32_t stride1, stride2;
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y];
+  plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y];
+  stride1 = img1->stride[VPX_PLANE_Y] / 2;
+  stride2 = img2->stride[VPX_PLANE_Y] / 2;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  plane1 = (uint16_t *)img1->planes[VPX_PLANE_U];
+  plane2 = (uint16_t *)img2->planes[VPX_PLANE_U];
+  stride1 = img1->stride[VPX_PLANE_U] / 2;
+  stride2 = img2->stride[VPX_PLANE_U] / 2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  plane1 = (uint16_t *)img1->planes[VPX_PLANE_V];
+  plane2 = (uint16_t *)img2->planes[VPX_PLANE_V];
+  stride1 = img1->stride[VPX_PLANE_V] / 2;
+  stride2 = img2->stride[VPX_PLANE_V] / 2;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(plane1 + (i + k) * stride1 + j + l) !=
+              *(plane2 + (i + k) * stride2 + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+            vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2,
+                   int yloc[4], int uloc[4], int vloc[4]) {
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_Y] +
+                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
+              *(img2->planes[VPX_PLANE_Y] +
+                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
+                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
+            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
+                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_U] +
+                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
+              *(img2->planes[VPX_PLANE_U] +
+                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(img1->planes[VPX_PLANE_U] +
+                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
+            uloc[3] = *(img2->planes[VPX_PLANE_U] +
+                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h - i);
+      const int sj = mmin(j + bsizex, c_w - j);
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_V] +
+                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
+              *(img2->planes[VPX_PLANE_V] +
+                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(img1->planes[VPX_PLANE_V] +
+                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
+            vloc[3] = *(img2->planes[VPX_PLANE_V] +
+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/tools_common.h b/media/libvpx/libvpx/tools_common.h
index 73ba1bc03b..81453cc065 100644
--- a/media/libvpx/libvpx/tools_common.h
+++ b/media/libvpx/libvpx/tools_common.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TOOLS_COMMON_H_
-#define TOOLS_COMMON_H_
+#ifndef VPX_TOOLS_COMMON_H_
+#define VPX_TOOLS_COMMON_H_
 
 #include <stdio.h>
 
@@ -16,7 +16,6 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_ports/msvc.h"
 
 #if CONFIG_ENCODERS
 #include "./y4minput.h"
@@ -26,11 +25,27 @@
 /* MSVS uses _f{seek,tell}i64. */
 #define fseeko _fseeki64
 #define ftello _ftelli64
+typedef int64_t FileOffset;
 #elif defined(_WIN32)
 /* MinGW uses f{seek,tell}o64 for large files. */
 #define fseeko fseeko64
 #define ftello ftello64
-#endif /* _WIN32 */
+typedef off64_t FileOffset;
+#elif CONFIG_OS_SUPPORT &&                                                  \
+    !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \
+      defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64)
+/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before
+ * Android API level 24. See
+ * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */
+#include <sys/types.h> /* NOLINT */
+typedef off_t FileOffset;
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#else
+#define fseeko fseek
+#define ftello ftell
+typedef long FileOffset; /* NOLINT */
+#endif /* CONFIG_OS_SUPPORT */
 
 #if CONFIG_OS_SUPPORT
 #if defined(_MSC_VER)
@@ -42,13 +57,6 @@
 #endif              /* _MSC_VER */
 #endif              /* CONFIG_OS_SUPPORT */
 
-/* Use 32-bit file operations in WebM file format when building ARM
- * executables (.axf) with RVCT. */
-#if !CONFIG_OS_SUPPORT
-#define fseeko fseek
-#define ftello ftell
-#endif /* CONFIG_OS_SUPPORT */
-
 #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
 
 #ifndef PATH_MAX
@@ -106,30 +114,44 @@ extern "C" {
 
 #if defined(__GNUC__)
 #define VPX_NO_RETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define VPX_NO_RETURN __declspec(noreturn)
 #else
 #define VPX_NO_RETURN
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef VPX_TOOLS_FORMAT_PRINTF
+#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...) VPX_NO_RETURN;
-void fatal(const char *fmt, ...) VPX_NO_RETURN;
-void warn(const char *fmt, ...);
+VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
+VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
+void warn(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
 
-void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
+VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s);
 
 /* The tool including this file must define usage_exit() */
-void usage_exit(void) VPX_NO_RETURN;
+VPX_NO_RETURN void usage_exit(void);
 
 #undef VPX_NO_RETURN
 
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
 typedef struct VpxInterface {
-  const char *const name;
-  const uint32_t fourcc;
-  vpx_codec_iface_t *(*const codec_interface)();
+  const char *name;
+  uint32_t fourcc;
+  vpx_codec_iface_t *(*codec_interface)(void);
 } VpxInterface;
 
 int get_vpx_encoder_count(void);
@@ -141,8 +163,6 @@ const VpxInterface *get_vpx_decoder_by_index(int i);
 const VpxInterface *get_vpx_decoder_by_name(const char *name);
 const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc);
 
-// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part
-// of vpx_image_t support
 int vpx_img_plane_width(const vpx_image_t *img, int plane);
 int vpx_img_plane_height(const vpx_image_t *img, int plane);
 void vpx_img_write(const vpx_image_t *img, FILE *file);
@@ -150,14 +170,31 @@ int vpx_img_read(vpx_image_t *img, FILE *file);
 
 double sse_to_psnr(double samples, double peak, double mse);
 
+#if CONFIG_ENCODERS
+int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img);
+int file_is_y4m(const char detect[4]);
+int fourcc_is_ivf(const char detect[4]);
+void open_input_file(struct VpxInputContext *input);
+void close_input_file(struct VpxInputContext *input);
+#endif
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift);
 void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift);
 void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src);
 #endif
 
+int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2);
+#if CONFIG_VP9_HIGHBITDEPTH
+void find_mismatch_high(const vpx_image_t *const img1,
+                        const vpx_image_t *const img2, int yloc[4], int uloc[4],
+                        int vloc[4]);
+#endif
+void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2,
+                   int yloc[4], int uloc[4], int vloc[4]);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
 
-#endif  // TOOLS_COMMON_H_
+#endif  // VPX_TOOLS_COMMON_H_
diff --git a/media/libvpx/libvpx/usage_cx.dox b/media/libvpx/libvpx/usage_cx.dox
index 92b0d34ef4..b2220cfdde 100644
--- a/media/libvpx/libvpx/usage_cx.dox
+++ b/media/libvpx/libvpx/usage_cx.dox
@@ -8,6 +8,8 @@
     \ref usage_deadline.
 
 
+    \if samples
     \ref samples
+    \endif
 
 */
diff --git a/media/libvpx/libvpx/usage_dx.dox b/media/libvpx/libvpx/usage_dx.dox
index 883ce24926..85063f705b 100644
--- a/media/libvpx/libvpx/usage_dx.dox
+++ b/media/libvpx/libvpx/usage_dx.dox
@@ -11,7 +11,9 @@
     \ref usage_postproc based on the amount of free CPU time. For more
     information on the <code>deadline</code> parameter, see \ref usage_deadline.
 
+    \if samples
     \ref samples
+    \endif
 
 
     \section usage_cb Callback Based Decoding
diff --git a/media/libvpx/libvpx/video_common.h b/media/libvpx/libvpx/video_common.h
index 44b27a8390..77eb9fac0c 100644
--- a/media/libvpx/libvpx/video_common.h
+++ b/media/libvpx/libvpx/video_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VIDEO_COMMON_H_
-#define VIDEO_COMMON_H_
+#ifndef VPX_VIDEO_COMMON_H_
+#define VPX_VIDEO_COMMON_H_
 
 #include "./tools_common.h"
 
@@ -20,4 +20,4 @@ typedef struct {
   struct VpxRational time_base;
 } VpxVideoInfo;
 
-#endif  // VIDEO_COMMON_H_
+#endif  // VPX_VIDEO_COMMON_H_
diff --git a/media/libvpx/libvpx/video_reader.c b/media/libvpx/libvpx/video_reader.c
index a0ba2521c6..16822eff3c 100644
--- a/media/libvpx/libvpx/video_reader.c
+++ b/media/libvpx/libvpx/video_reader.c
@@ -30,17 +30,37 @@ VpxVideoReader *vpx_video_reader_open(const char *filename) {
   char header[32];
   VpxVideoReader *reader = NULL;
   FILE *const file = fopen(filename, "rb");
-  if (!file) return NULL;  // Can't open file
+  if (!file) {
+    fprintf(stderr, "%s can't be opened.\n", filename);  // Can't open file
+    return NULL;
+  }
 
-  if (fread(header, 1, 32, file) != 32) return NULL;  // Can't read file header
+  if (fread(header, 1, 32, file) != 32) {
+    fprintf(stderr, "File header on %s can't be read.\n",
+            filename);  // Can't read file header
+    return NULL;
+  }
+  if (memcmp(kIVFSignature, header, 4) != 0) {
+    fprintf(stderr, "The IVF signature on %s is wrong.\n",
+            filename);  // Wrong IVF signature
 
-  if (memcmp(kIVFSignature, header, 4) != 0)
-    return NULL;  // Wrong IVF signature
+    return NULL;
+  }
+  if (mem_get_le16(header + 4) != 0) {
+    fprintf(stderr, "%s uses the wrong IVF version.\n",
+            filename);  // Wrong IVF version
 
-  if (mem_get_le16(header + 4) != 0) return NULL;  // Wrong IVF version
+    return NULL;
+  }
 
   reader = calloc(1, sizeof(*reader));
-  if (!reader) return NULL;  // Can't allocate VpxVideoReader
+  if (!reader) {
+    fprintf(
+        stderr,
+        "Can't allocate VpxVideoReader\n");  // Can't allocate VpxVideoReader
+
+    return NULL;
+  }
 
   reader->file = file;
   reader->info.codec_fourcc = mem_get_le32(header + 8);
diff --git a/media/libvpx/libvpx/video_reader.h b/media/libvpx/libvpx/video_reader.h
index 73c25b00a7..1f5c8088bb 100644
--- a/media/libvpx/libvpx/video_reader.h
+++ b/media/libvpx/libvpx/video_reader.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VIDEO_READER_H_
-#define VIDEO_READER_H_
+#ifndef VPX_VIDEO_READER_H_
+#define VPX_VIDEO_READER_H_
 
 #include "./video_common.h"
 
@@ -48,4 +48,4 @@ const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader);
 }  // extern "C"
 #endif
 
-#endif  // VIDEO_READER_H_
+#endif  // VPX_VIDEO_READER_H_
diff --git a/media/libvpx/libvpx/video_writer.c b/media/libvpx/libvpx/video_writer.c
index 56d428b072..6e9a848bc3 100644
--- a/media/libvpx/libvpx/video_writer.c
+++ b/media/libvpx/libvpx/video_writer.c
@@ -37,11 +37,15 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename,
   if (container == kContainerIVF) {
     VpxVideoWriter *writer = NULL;
     FILE *const file = fopen(filename, "wb");
-    if (!file) return NULL;
-
+    if (!file) {
+      fprintf(stderr, "%s can't be written to.\n", filename);
+      return NULL;
+    }
     writer = malloc(sizeof(*writer));
-    if (!writer) return NULL;
-
+    if (!writer) {
+      fprintf(stderr, "Can't allocate VpxVideoWriter.\n");
+      return NULL;
+    }
     writer->frame_count = 0;
     writer->info = *info;
     writer->file = file;
@@ -50,7 +54,7 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename,
 
     return writer;
   }
-
+  fprintf(stderr, "VpxVideoWriter supports only IVF.\n");
   return NULL;
 }
 
diff --git a/media/libvpx/libvpx/video_writer.h b/media/libvpx/libvpx/video_writer.h
index a769811c44..b4d242b920 100644
--- a/media/libvpx/libvpx/video_writer.h
+++ b/media/libvpx/libvpx/video_writer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VIDEO_WRITER_H_
-#define VIDEO_WRITER_H_
+#ifndef VPX_VIDEO_WRITER_H_
+#define VPX_VIDEO_WRITER_H_
 
 #include "./video_common.h"
 
@@ -41,4 +41,4 @@ int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer,
 }  // extern "C"
 #endif
 
-#endif  // VIDEO_WRITER_H_
+#endif  // VPX_VIDEO_WRITER_H_
diff --git a/media/libvpx/libvpx/vp8/common/alloccommon.h b/media/libvpx/libvpx/vp8/common/alloccommon.h
index 5d0840c670..2d376bbac3 100644
--- a/media/libvpx/libvpx/vp8/common/alloccommon.h
+++ b/media/libvpx/libvpx/vp8/common/alloccommon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ALLOCCOMMON_H_
-#define VP8_COMMON_ALLOCCOMMON_H_
+#ifndef VPX_VP8_COMMON_ALLOCCOMMON_H_
+#define VPX_VP8_COMMON_ALLOCCOMMON_H_
 
 #include "onyxc_int.h"
 
@@ -21,10 +21,10 @@ void vp8_create_common(VP8_COMMON *oci);
 void vp8_remove_common(VP8_COMMON *oci);
 void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
 int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
-void vp8_setup_version(VP8_COMMON *oci);
+void vp8_setup_version(VP8_COMMON *cm);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_ALLOCCOMMON_H_
+#endif  // VPX_VP8_COMMON_ALLOCCOMMON_H_
diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c
index e12f65a042..48a1972048 100644
--- a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c
+++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -8,28 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vp8/common/arm/loopfilter_arm.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/onyxc_int.h"
 
-typedef void loopfilter_y_neon(unsigned char *src, int pitch,
-                               unsigned char blimit, unsigned char limit,
-                               unsigned char thresh);
-typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
-                                unsigned char blimit, unsigned char limit,
-                                unsigned char thresh, unsigned char *v);
-
-extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
-extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-
-extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
-extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
-
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr,
diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h
new file mode 100644
index 0000000000..6cf660d228
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
+
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+                               unsigned char blimit, unsigned char limit,
+                               unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+                                unsigned char blimit, unsigned char limit,
+                                unsigned char thresh, unsigned char *v);
+
+loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+
+loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+
+#endif  // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
index af566c2c41..590956dde1 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,10 @@
 
 #include <arm_neon.h>
 #include <string.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
 
 static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
                                                { 96, 32 }, { 80, 48 },
@@ -21,35 +24,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
   return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
 }
 
-static INLINE void store4x4(unsigned char *dst, int dst_stride,
-                            const uint8x8_t a0, const uint8x8_t a1) {
-  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
-  } else {
-    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
-    // which is really really slow.
-    uint32_t output_buffer[4];
-    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
-    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
-    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
-    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
-
-    memcpy(dst, output_buffer, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 1, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 2, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 3, 4);
-  }
-}
-
 void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset, unsigned char *dst_ptr,
@@ -122,7 +96,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
 
   // secondpass_filter
   if (yoffset == 0) {  // skip_2ndpass_filter
-    store4x4(dst_ptr, dst_pitch, e0, e1);
+    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
   } else {
     uint8x8_t f0, f1;
     const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
@@ -140,7 +114,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
     f0 = vqrshrn_n_u16(b0, 7);
     f1 = vqrshrn_n_u16(b1, 7);
 
-    store4x4(dst_ptr, dst_pitch, f0, f1);
+    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1));
   }
 }
 
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c
index c1d293b58d..c89b47d628 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride,
                           unsigned char *dst, int dst_stride) {
   uint8x8_t vtmp;
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
index 6edff3c69f..791aaea2ae 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
 #include "vp8/common/blockd.h"
 
 void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c
index d61dde86cf..5c26ce67a4 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -8,15 +8,226 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include <arm_neon.h>
 
-/* place these declarations here because we don't want to maintain them
- * outside of this scope
- */
-void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst,
-                               int stride);
-void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride);
+#include "./vp8_rtcd.h"
+
+static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
+                                   int stride) {
+  unsigned char *dst0;
+  int i, a0, a1;
+  int16x8x2_t q2Add;
+  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
+  uint8x8_t d2u8, d4u8;
+  uint16x8_t q1u16, q2u16;
+
+  a0 = ((q[0] * dq) + 4) >> 3;
+  a1 = ((q[16] * dq) + 4) >> 3;
+  q[0] = q[16] = 0;
+  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+  q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+  for (i = 0; i < 2; i++, dst += 4) {
+    dst0 = dst;
+    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+    dst0 += stride;
+    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+    dst0 += stride;
+    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+    dst0 += stride;
+    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                     vreinterpret_u8_s32(d2s32));
+    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+                     vreinterpret_u8_s32(d4s32));
+
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+    d2s32 = vreinterpret_s32_u8(d2u8);
+    d4s32 = vreinterpret_s32_u8(d4u8);
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+  }
+}
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 17734;
+// because the lowest bit in 0x8a8c is 0, we can pre-shift this
+
+static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
+                                      unsigned char *dst, int stride) {
+  unsigned char *dst0, *dst1;
+  int32x2_t d28, d29, d30, d31;
+  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+  int16x8_t qEmpty = vdupq_n_s16(0);
+  int32x4x2_t q2tmp0, q2tmp1;
+  int16x8x2_t q2tmp2, q2tmp3;
+  int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+  d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+  // load dq
+  q0 = vld1q_s16(dq);
+  dq += 8;
+  q1 = vld1q_s16(dq);
+
+  // load q
+  q2 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+  q += 8;
+  q3 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+  q += 8;
+  q4 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+  q += 8;
+  q5 = vld1q_s16(q);
+  vst1q_s16(q, qEmpty);
+
+  // load src from dst
+  dst0 = dst;
+  dst1 = dst + 4;
+  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+  dst0 += stride;
+  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+  dst1 += stride;
+  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+  dst0 += stride;
+  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+  dst1 += stride;
+
+  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+  dst0 += stride;
+  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+  dst1 += stride;
+  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+  q2 = vmulq_s16(q2, q0);
+  q3 = vmulq_s16(q3, q1);
+  q4 = vmulq_s16(q4, q0);
+  q5 = vmulq_s16(q5, q1);
+
+  // vswp
+  dLow0 = vget_low_s16(q2);
+  dHigh0 = vget_high_s16(q2);
+  dLow1 = vget_low_s16(q4);
+  dHigh1 = vget_high_s16(q4);
+  q2 = vcombine_s16(dLow0, dLow1);
+  q4 = vcombine_s16(dHigh0, dHigh1);
+
+  dLow0 = vget_low_s16(q3);
+  dHigh0 = vget_high_s16(q3);
+  dLow1 = vget_low_s16(q5);
+  dHigh1 = vget_high_s16(q5);
+  q3 = vcombine_s16(dLow0, dLow1);
+  q5 = vcombine_s16(dHigh0, dHigh1);
+
+  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+  q10 = vqaddq_s16(q2, q3);
+  q11 = vqsubq_s16(q2, q3);
+
+  q8 = vshrq_n_s16(q8, 1);
+  q9 = vshrq_n_s16(q9, 1);
+
+  q4 = vqaddq_s16(q4, q8);
+  q5 = vqaddq_s16(q5, q9);
+
+  q2 = vqsubq_s16(q6, q5);
+  q3 = vqaddq_s16(q7, q4);
+
+  q4 = vqaddq_s16(q10, q3);
+  q5 = vqaddq_s16(q11, q2);
+  q6 = vqsubq_s16(q11, q2);
+  q7 = vqsubq_s16(q10, q3);
+
+  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                     vreinterpretq_s16_s32(q2tmp1.val[0]));
+  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                     vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+  // loop 2
+  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+  q10 = vshrq_n_s16(q10, 1);
+  q11 = vshrq_n_s16(q11, 1);
+
+  q10 = vqaddq_s16(q2tmp2.val[1], q10);
+  q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+  q8 = vqsubq_s16(q8, q11);
+  q9 = vqaddq_s16(q9, q10);
+
+  q4 = vqaddq_s16(q2, q9);
+  q5 = vqaddq_s16(q3, q8);
+  q6 = vqsubq_s16(q3, q8);
+  q7 = vqsubq_s16(q2, q9);
+
+  q4 = vrshrq_n_s16(q4, 3);
+  q5 = vrshrq_n_s16(q5, 3);
+  q6 = vrshrq_n_s16(q6, 3);
+  q7 = vrshrq_n_s16(q7, 3);
+
+  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+                     vreinterpretq_s16_s32(q2tmp1.val[0]));
+  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+                     vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+  q4 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
+  q5 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
+  q6 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
+  q7 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));
+
+  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+  dst0 = dst;
+  dst1 = dst + 4;
+  vst1_lane_s32((int32_t *)dst0, d28, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst1, d28, 1);
+  dst1 += stride;
+  vst1_lane_s32((int32_t *)dst0, d29, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst1, d29, 1);
+  dst1 += stride;
+
+  vst1_lane_s32((int32_t *)dst0, d30, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst1, d30, 1);
+  dst1 += stride;
+  vst1_lane_s32((int32_t *)dst0, d31, 0);
+  vst1_lane_s32((int32_t *)dst1, d31, 1);
+}
 
 void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                        int stride, char *eobs) {
@@ -43,42 +254,42 @@ void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
 }
 
 void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
-                                        unsigned char *dstu,
-                                        unsigned char *dstv, int stride,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v, int stride,
                                         char *eobs) {
   if (((short *)(eobs))[0]) {
     if (((short *)eobs)[0] & 0xfefe)
-      idct_dequant_full_2x_neon(q, dq, dstu, stride);
+      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
     else
-      idct_dequant_0_2x_neon(q, dq[0], dstu, stride);
+      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
   }
 
   q += 32;
-  dstu += 4 * stride;
+  dst_u += 4 * stride;
 
   if (((short *)(eobs))[1]) {
     if (((short *)eobs)[1] & 0xfefe)
-      idct_dequant_full_2x_neon(q, dq, dstu, stride);
+      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
     else
-      idct_dequant_0_2x_neon(q, dq[0], dstu, stride);
+      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
   }
 
   q += 32;
 
   if (((short *)(eobs))[2]) {
     if (((short *)eobs)[2] & 0xfefe)
-      idct_dequant_full_2x_neon(q, dq, dstv, stride);
+      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
     else
-      idct_dequant_0_2x_neon(q, dq[0], dstv, stride);
+      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
   }
 
   q += 32;
-  dstv += 4 * stride;
+  dst_v += 4 * stride;
 
   if (((short *)(eobs))[3]) {
     if (((short *)eobs)[3] & 0xfefe)
-      idct_dequant_full_2x_neon(q, dq, dstv, stride);
+      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
     else
-      idct_dequant_0_2x_neon(q, dq[0], dstv, stride);
+      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
   }
 }
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
deleted file mode 100644
index c83102a5cc..0000000000
--- a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
-                            int stride) {
-  unsigned char *dst0;
-  int i, a0, a1;
-  int16x8x2_t q2Add;
-  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
-  uint8x8_t d2u8, d4u8;
-  uint16x8_t q1u16, q2u16;
-
-  a0 = ((q[0] * dq) + 4) >> 3;
-  a1 = ((q[16] * dq) + 4) >> 3;
-  q[0] = q[16] = 0;
-  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
-  q2Add.val[1] = vdupq_n_s16((int16_t)a1);
-
-  for (i = 0; i < 2; i++, dst += 4) {
-    dst0 = dst;
-    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
-    dst0 += stride;
-    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
-    dst0 += stride;
-    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
-    dst0 += stride;
-    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
-
-    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
-                     vreinterpret_u8_s32(d2s32));
-    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
-                     vreinterpret_u8_s32(d4s32));
-
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
-    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-
-    d2s32 = vreinterpret_s32_u8(d2u8);
-    d4s32 = vreinterpret_s32_u8(d4u8);
-
-    dst0 = dst;
-    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
-  }
-  return;
-}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
deleted file mode 100644
index f30671cc3f..0000000000
--- a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 17734;
-// because the lowest bit in 0x8a8c is 0, we can pre-shift this
-
-void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, unsigned char *dst,
-                               int stride) {
-  unsigned char *dst0, *dst1;
-  int32x2_t d28, d29, d30, d31;
-  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
-  int16x8_t qEmpty = vdupq_n_s16(0);
-  int32x4x2_t q2tmp0, q2tmp1;
-  int16x8x2_t q2tmp2, q2tmp3;
-  int16x4_t dLow0, dLow1, dHigh0, dHigh1;
-
-  d28 = d29 = d30 = d31 = vdup_n_s32(0);
-
-  // load dq
-  q0 = vld1q_s16(dq);
-  dq += 8;
-  q1 = vld1q_s16(dq);
-
-  // load q
-  q2 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-  q += 8;
-  q3 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-  q += 8;
-  q4 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-  q += 8;
-  q5 = vld1q_s16(q);
-  vst1q_s16(q, qEmpty);
-
-  // load src from dst
-  dst0 = dst;
-  dst1 = dst + 4;
-  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
-  dst0 += stride;
-  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
-  dst1 += stride;
-  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
-  dst0 += stride;
-  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
-  dst1 += stride;
-
-  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
-  dst0 += stride;
-  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
-  dst1 += stride;
-  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
-  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
-
-  q2 = vmulq_s16(q2, q0);
-  q3 = vmulq_s16(q3, q1);
-  q4 = vmulq_s16(q4, q0);
-  q5 = vmulq_s16(q5, q1);
-
-  // vswp
-  dLow0 = vget_low_s16(q2);
-  dHigh0 = vget_high_s16(q2);
-  dLow1 = vget_low_s16(q4);
-  dHigh1 = vget_high_s16(q4);
-  q2 = vcombine_s16(dLow0, dLow1);
-  q4 = vcombine_s16(dHigh0, dHigh1);
-
-  dLow0 = vget_low_s16(q3);
-  dHigh0 = vget_high_s16(q3);
-  dLow1 = vget_low_s16(q5);
-  dHigh1 = vget_high_s16(q5);
-  q3 = vcombine_s16(dLow0, dLow1);
-  q5 = vcombine_s16(dHigh0, dHigh1);
-
-  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
-  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
-  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
-  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
-
-  q10 = vqaddq_s16(q2, q3);
-  q11 = vqsubq_s16(q2, q3);
-
-  q8 = vshrq_n_s16(q8, 1);
-  q9 = vshrq_n_s16(q9, 1);
-
-  q4 = vqaddq_s16(q4, q8);
-  q5 = vqaddq_s16(q5, q9);
-
-  q2 = vqsubq_s16(q6, q5);
-  q3 = vqaddq_s16(q7, q4);
-
-  q4 = vqaddq_s16(q10, q3);
-  q5 = vqaddq_s16(q11, q2);
-  q6 = vqsubq_s16(q11, q2);
-  q7 = vqsubq_s16(q10, q3);
-
-  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
-  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
-  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
-                     vreinterpretq_s16_s32(q2tmp1.val[0]));
-  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
-                     vreinterpretq_s16_s32(q2tmp1.val[1]));
-
-  // loop 2
-  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
-  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
-  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
-  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
-
-  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
-  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
-
-  q10 = vshrq_n_s16(q10, 1);
-  q11 = vshrq_n_s16(q11, 1);
-
-  q10 = vqaddq_s16(q2tmp2.val[1], q10);
-  q11 = vqaddq_s16(q2tmp3.val[1], q11);
-
-  q8 = vqsubq_s16(q8, q11);
-  q9 = vqaddq_s16(q9, q10);
-
-  q4 = vqaddq_s16(q2, q9);
-  q5 = vqaddq_s16(q3, q8);
-  q6 = vqsubq_s16(q3, q8);
-  q7 = vqsubq_s16(q2, q9);
-
-  q4 = vrshrq_n_s16(q4, 3);
-  q5 = vrshrq_n_s16(q5, 3);
-  q6 = vrshrq_n_s16(q6, 3);
-  q7 = vrshrq_n_s16(q7, 3);
-
-  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
-  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
-  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
-                     vreinterpretq_s16_s32(q2tmp1.val[0]));
-  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
-                     vreinterpretq_s16_s32(q2tmp1.val[1]));
-
-  q4 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
-  q5 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
-  q6 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
-  q7 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));
-
-  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
-  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
-  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
-  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
-
-  dst0 = dst;
-  dst1 = dst + 4;
-  vst1_lane_s32((int32_t *)dst0, d28, 0);
-  dst0 += stride;
-  vst1_lane_s32((int32_t *)dst1, d28, 1);
-  dst1 += stride;
-  vst1_lane_s32((int32_t *)dst0, d29, 0);
-  dst0 += stride;
-  vst1_lane_s32((int32_t *)dst1, d29, 1);
-  dst1 += stride;
-
-  vst1_lane_s32((int32_t *)dst0, d30, 0);
-  dst0 += stride;
-  vst1_lane_s32((int32_t *)dst1, d30, 1);
-  dst1 += stride;
-  vst1_lane_s32((int32_t *)dst0, d31, 0);
-  vst1_lane_s32((int32_t *)dst1, d31, 1);
-  return;
-}
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c
index 6c4bcc134b..91600bfc00 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
   int16x8_t q0s16, q1s16, q2s16, q3s16;
   int16x4_t d4s16, d5s16, d6s16, d7s16;
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
index a168219705..df983b23a3 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 
 static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
     unsigned char *s, int p, const unsigned char *blimit) {
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
index 80a222d248..fbc83ae290 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 #include "vpx_ports/arm.h"
 
 #ifdef VPX_INCOMPATIBLE_GCC
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
index 65eec300ff..fafaf2d451 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "vp8/common/arm/loopfilter_arm.h"
 
 static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit,  // mblimit
                                           uint8x16_t qlimit,   // limit
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
index fbb552ebe2..a54e81084b 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c
@@ -11,10 +11,12 @@
 #include <arm_neon.h>
 #include <string.h>
 #include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_ports/mem.h"
 
 static const int8_t vp8_sub_pel_filters[8][8] = {
-  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positionyys are */
+  { 0, 0, -128, 0, 0, 0, 0, 0 },    /* note that 1/8 pel positions are */
   { 0, -6, 123, 12, -1, 0, 0, 0 },  /*    just as per alpha -0.5 bicubic */
   { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
   { 0, -9, 93, 50, -6, 0, 0, 0 },
@@ -42,35 +44,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
   return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
 }
 
-static INLINE void store4x4(unsigned char *dst, int dst_stride,
-                            const uint8x8_t a0, const uint8x8_t a1) {
-  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
-    dst += dst_stride;
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
-  } else {
-    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
-    // which is really really slow.
-    uint32_t output_buffer[4];
-    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
-    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
-    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
-    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
-
-    memcpy(dst, output_buffer, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 1, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 2, 4);
-    dst += dst_stride;
-    memcpy(dst, output_buffer + 3, 4);
-  }
-}
-
 static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
                                          const uint8x8_t filter, uint16x8_t *c,
                                          uint16x8_t *d) {
@@ -180,7 +153,7 @@ static INLINE void yonly4x4(const unsigned char *src, int src_stride,
   e0 = vqrshrun_n_s16(d0, 7);
   e1 = vqrshrun_n_s16(d1, 7);
 
-  store4x4(dst, dst_stride, e0, e1);
+  store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
 }
 
 void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
@@ -297,7 +270,7 @@ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
   b2 = vqrshrun_n_s16(e4567, 7);
 
   if (yoffset == 0) {  // firstpass_filter4x4_only
-    store4x4(dst_ptr, dst_pitch, b0, b2);
+    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
     return;
   }
 
@@ -411,7 +384,7 @@ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
   e0 = vqrshrun_n_s16(d0, 7);
   e1 = vqrshrun_n_s16(d1, 7);
 
-  store4x4(dst_ptr, dst_pitch, e0, e1);
+  store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
 }
 
 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
@@ -808,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
   vst1_u8(dst_ptr, d8u8);
   dst_ptr += dst_pitch;
   vst1_u8(dst_ptr, d9u8);
-  return;
 }
 
 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
@@ -1277,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
     vst1_u8(dst_ptr, d9u8);
     dst_ptr += dst_pitch;
   }
-  return;
 }
 
 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
@@ -1531,7 +1502,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
     src += src_pixels_per_line;
     d12u8 = vld1_u8(src);
     d13u8 = vld1_u8(src + 8);
-    d14u8 = vld1_u8(src + 16);
+    // Only 5 pixels are needed, avoid a potential out of bounds read.
+    d14u8 = vld1_u8(src + 13);
+    d14u8 = vext_u8(d14u8, d14u8, 3);
     src += src_pixels_per_line;
 
     __builtin_prefetch(src);
@@ -1753,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
       dst += dst_pitch;
     }
   }
-  return;
 }
diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
index d7286739da..ebc004a048 100644
--- a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
+++ b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c
@@ -9,7 +9,9 @@
  */
 
 #include <arm_neon.h>
+
 #include "./vpx_config.h"
+#include "vp8/common/arm/loopfilter_arm.h"
 #include "vpx_ports/arm.h"
 
 static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit,  // flimit
diff --git a/media/libvpx/libvpx/vp8/common/blockd.c b/media/libvpx/libvpx/vp8/common/blockd.c
index f47c5bae15..22905c10a6 100644
--- a/media/libvpx/libvpx/vp8/common/blockd.c
+++ b/media/libvpx/libvpx/vp8/common/blockd.c
@@ -11,9 +11,9 @@
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-const unsigned char vp8_block2left[25] = {
-  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-const unsigned char vp8_block2above[25] = {
-  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
+const unsigned char vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2,
+                                           2, 2, 2, 3, 3, 3, 3, 4, 4,
+                                           5, 5, 6, 6, 7, 7, 8 };
+const unsigned char vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0,
+                                            1, 2, 3, 0, 1, 2, 3, 4, 5,
+                                            4, 5, 6, 7, 6, 7, 8 };
diff --git a/media/libvpx/libvpx/vp8/common/blockd.h b/media/libvpx/libvpx/vp8/common/blockd.h
index 74fc5d6dbf..8300aad941 100644
--- a/media/libvpx/libvpx/vp8/common/blockd.h
+++ b/media/libvpx/libvpx/vp8/common/blockd.h
@@ -8,11 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_BLOCKD_H_
-#define VP8_COMMON_BLOCKD_H_
+#ifndef VPX_VP8_COMMON_BLOCKD_H_
+#define VPX_VP8_COMMON_BLOCKD_H_
 
 void vpx_log(const char *format, ...);
 
+#include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "mv.h"
@@ -37,7 +38,9 @@ extern "C" {
 #define SEGMENT_DELTADATA 0
 #define SEGMENT_ABSDATA 1
 
-typedef struct { int r, c; } POS;
+typedef struct {
+  int r, c;
+} POS;
 
 #define PLANE_TYPE_Y_NO_DC 0
 #define PLANE_TYPE_Y2 1
@@ -55,7 +58,7 @@ typedef struct {
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 
-#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B);
+#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B)
 
 typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE;
 
@@ -169,12 +172,20 @@ typedef struct {
 typedef struct {
   FRAME_TYPE frame_type;
   int is_frame_dropped;
+  // If frame is dropped due to overshoot after encode_frame. This triggers a
+  // drop and resets rate control with Q forced to max for following frame.
+  // The check for this dropping due to overshoot is only done on lowest stream,
+  // and if set will force drop on all spatial streams for that current frame.
+  int is_frame_dropped_overshoot_maxqp;
   // The frame rate for the lowest resolution.
   double low_res_framerate;
   /* The frame number of each reference frames */
   unsigned int low_res_ref_frames[MAX_REF_FRAMES];
   // The video frame counter value for the key frame, for lowest resolution.
   unsigned int key_frame_counter_value;
+  // Flags to signal skipped encoding of previous and base layer stream.
+  unsigned int skip_encoding_prev_stream;
+  unsigned int skip_encoding_base_stream;
   LOWER_RES_MB_INFO *mb_info;
 } LOWER_RES_FRAME_INFO;
 #endif
@@ -191,8 +202,9 @@ typedef struct blockd {
   union b_mode_info bmi;
 } BLOCKD;
 
-typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst,
-                                int yofst, unsigned char *dst, int dst_pitch);
+typedef void (*vp8_subpix_fn_t)(unsigned char *src_ptr, int src_pixels_per_line,
+                                int xoffset, int yoffset,
+                                unsigned char *dst_ptr, int dst_pitch);
 
 typedef struct macroblockd {
   DECLARE_ALIGNED(16, unsigned char, predictor[384]);
@@ -239,7 +251,7 @@ typedef struct macroblockd {
   unsigned char update_mb_segmentation_data;
 
   /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
-  unsigned char mb_segement_abs_delta;
+  unsigned char mb_segment_abs_delta;
 
   /* Per frame flags that define which MB level features (such as quantizer or
    * loop filter level) */
@@ -278,7 +290,9 @@ typedef struct macroblockd {
 
   int corrupted;
 
-#if ARCH_X86 || ARCH_X86_64
+  struct vpx_internal_error_info error_info;
+
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
   /* This is an intermediate buffer currently used in sub-pixel motion search
    * to keep a copy of the reference area. This buffer can be used for other
    * purpose.
@@ -294,4 +308,4 @@ extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_BLOCKD_H_
+#endif  // VPX_VP8_COMMON_BLOCKD_H_
diff --git a/media/libvpx/libvpx/vp8/common/coefupdateprobs.h b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h
index 9b01bba312..b342096b55 100644
--- a/media/libvpx/libvpx/vp8/common/coefupdateprobs.h
+++ b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
-#define VP8_COMMON_COEFUPDATEPROBS_H_
+#ifndef VPX_VP8_COMMON_COEFUPDATEPROBS_H_
+#define VPX_VP8_COMMON_COEFUPDATEPROBS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -194,4 +194,4 @@ const vp8_prob vp8_coef_update_probs
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_COEFUPDATEPROBS_H_
+#endif  // VPX_VP8_COMMON_COEFUPDATEPROBS_H_
diff --git a/media/libvpx/libvpx/vp8/common/common.h b/media/libvpx/libvpx/vp8/common/common.h
index bbfc4f3934..562569f9ab 100644
--- a/media/libvpx/libvpx/vp8/common/common.h
+++ b/media/libvpx/libvpx/vp8/common/common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_COMMON_H_
-#define VP8_COMMON_COMMON_H_
+#ifndef VPX_VP8_COMMON_COMMON_H_
+#define VPX_VP8_COMMON_COMMON_H_
 
 #include <assert.h>
 
@@ -24,25 +24,25 @@ extern "C" {
 /* Only need this for fixed-size arrays, for structs just assign. */
 
 #define vp8_copy(Dest, Src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(Dest) == sizeof(Src)); \
     memcpy(Dest, Src, sizeof(Src));      \
-  }
+  } while (0)
 
 /* Use this for variably-sized arrays. */
 
-#define vp8_copy_array(Dest, Src, N)       \
-  {                                        \
-    assert(sizeof(*Dest) == sizeof(*Src)); \
-    memcpy(Dest, Src, N * sizeof(*Src));   \
-  }
+#define vp8_copy_array(Dest, Src, N)           \
+  do {                                         \
+    assert(sizeof(*(Dest)) == sizeof(*(Src))); \
+    memcpy(Dest, Src, (N) * sizeof(*(Src)));   \
+  } while (0)
 
-#define vp8_zero(Dest) memset(&Dest, 0, sizeof(Dest));
+#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest))
 
-#define vp8_zero_array(Dest, N) memset(Dest, 0, N * sizeof(*Dest));
+#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)))
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_COMMON_H_
+#endif  // VPX_VP8_COMMON_COMMON_H_
diff --git a/media/libvpx/libvpx/vp8/common/default_coef_probs.h b/media/libvpx/libvpx/vp8/common/default_coef_probs.h
index 8c861ac876..b25e4a45a3 100644
--- a/media/libvpx/libvpx/vp8/common/default_coef_probs.h
+++ b/media/libvpx/libvpx/vp8/common/default_coef_probs.h
@@ -6,10 +6,10 @@
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
-*/
+ */
 
-#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
-#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#ifndef VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#define VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -157,4 +157,4 @@ static const vp8_prob default_coef_probs
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_DEFAULT_COEF_PROBS_H_
+#endif  // VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c
index f61fa9e8e4..b9efc0cc1f 100644
--- a/media/libvpx/libvpx/vp8/common/entropy.c
+++ b/media/libvpx/libvpx/vp8/common/entropy.c
@@ -28,9 +28,9 @@ DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7
-};
+DECLARE_ALIGNED(16, const unsigned char,
+                vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6,
+                                        6, 6, 6, 6, 6, 6, 6, 7 };
 
 DECLARE_ALIGNED(16, const unsigned char,
                 vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = {
@@ -41,9 +41,9 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = {
   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15,
 };
 
-DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = {
-  1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16
-};
+DECLARE_ALIGNED(16, const short,
+                vp8_default_inv_zig_zag[16]) = { 1, 2, 6,  7,  3,  5,  8,  13,
+                                                 4, 9, 12, 14, 10, 11, 15, 16 };
 
 /* vp8_default_zig_zag_mask generated with:
 
@@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177,
       p[0] = p[1] = 0;
     }
 
-    void init_bit_trees() {
+    void init_bit_trees(void) {
       init_bit_tree(cat1, 1);
       init_bit_tree(cat2, 2);
       init_bit_tree(cat3, 3);
@@ -129,9 +129,9 @@ static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
 static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
 static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
 static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
-static const vp8_tree_index cat6[22] = {
-  2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 0, 0
-};
+static const vp8_tree_index cat6[22] = { 2,  2,  4,  4,  6,  6,  8,  8,
+                                         10, 10, 12, 12, 14, 14, 16, 16,
+                                         18, 18, 20, 20, 0,  0 };
 
 const vp8_extra_bit_struct vp8_extra_bits[12] = {
   { 0, 0, 0, 0 },         { 0, 0, 0, 1 },          { 0, 0, 0, 2 },
diff --git a/media/libvpx/libvpx/vp8/common/entropy.h b/media/libvpx/libvpx/vp8/common/entropy.h
index d088560011..fbdb7bcfca 100644
--- a/media/libvpx/libvpx/vp8/common/entropy.h
+++ b/media/libvpx/libvpx/vp8/common/entropy.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ENTROPY_H_
-#define VP8_COMMON_ENTROPY_H_
+#ifndef VPX_VP8_COMMON_ENTROPY_H_
+#define VPX_VP8_COMMON_ENTROPY_H_
 
 #include "treecoder.h"
 #include "blockd.h"
@@ -105,4 +105,4 @@ void vp8_coef_tree_initialize(void);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_ENTROPY_H_
+#endif  // VPX_VP8_COMMON_ENTROPY_H_
diff --git a/media/libvpx/libvpx/vp8/common/entropymode.c b/media/libvpx/libvpx/vp8/common/entropymode.c
index 30c2fa86ae..f61e0c2e2b 100644
--- a/media/libvpx/libvpx/vp8/common/entropymode.c
+++ b/media/libvpx/libvpx/vp8/common/entropymode.c
@@ -34,12 +34,13 @@ int vp8_mv_cont(const int_mv *l, const int_mv *a) {
 
 static const vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1] = { 180, 162, 25 };
 
-const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT]
-                                   [VP8_SUBMVREFS - 1] = { { 147, 136, 18 },
-                                                           { 106, 145, 1 },
-                                                           { 179, 121, 1 },
-                                                           { 223, 1, 34 },
-                                                           { 208, 1, 1 } };
+const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1] = {
+  { 147, 136, 18 },
+  { 106, 145, 1 },
+  { 179, 121, 1 },
+  { 223, 1, 34 },
+  { 208, 1, 1 }
+};
 
 const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS] = {
   { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
@@ -74,9 +75,9 @@ const vp8_tree_index vp8_ymode_tree[8] = {
   -DC_PRED, 2, 4, 6, -V_PRED, -H_PRED, -TM_PRED, -B_PRED
 };
 
-const vp8_tree_index vp8_kf_ymode_tree[8] = {
-  -B_PRED, 2, 4, 6, -DC_PRED, -V_PRED, -H_PRED, -TM_PRED
-};
+const vp8_tree_index vp8_kf_ymode_tree[8] = { -B_PRED, 2,        4,
+                                              6,       -DC_PRED, -V_PRED,
+                                              -H_PRED, -TM_PRED };
 
 const vp8_tree_index vp8_uv_mode_tree[6] = { -DC_PRED, 2,       -V_PRED,
                                              4,        -H_PRED, -TM_PRED };
@@ -98,6 +99,6 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) {
   memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
 }
 
-void vp8_default_bmode_probs(vp8_prob p[VP8_BINTRAMODES - 1]) {
-  memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]) {
+  memcpy(dest, vp8_bmode_prob, sizeof(vp8_bmode_prob));
 }
diff --git a/media/libvpx/libvpx/vp8/common/entropymode.h b/media/libvpx/libvpx/vp8/common/entropymode.h
index e0a17df10c..c772cece57 100644
--- a/media/libvpx/libvpx/vp8/common/entropymode.h
+++ b/media/libvpx/libvpx/vp8/common/entropymode.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ENTROPYMODE_H_
-#define VP8_COMMON_ENTROPYMODE_H_
+#ifndef VPX_VP8_COMMON_ENTROPYMODE_H_
+#define VPX_VP8_COMMON_ENTROPYMODE_H_
 
 #include "onyxc_int.h"
 #include "treecoder.h"
@@ -78,11 +78,11 @@ extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1];
 
 void vp8_init_mbmode_probs(VP8_COMMON *x);
 void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]);
-void vp8_kf_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES]
-                                             [VP8_BINTRAMODES - 1]);
+void vp8_kf_default_bmode_probs(
+    vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1]);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_ENTROPYMODE_H_
+#endif  // VPX_VP8_COMMON_ENTROPYMODE_H_
diff --git a/media/libvpx/libvpx/vp8/common/entropymv.h b/media/libvpx/libvpx/vp8/common/entropymv.h
index 6373000903..40039f5b2c 100644
--- a/media/libvpx/libvpx/vp8/common/entropymv.h
+++ b/media/libvpx/libvpx/vp8/common/entropymv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ENTROPYMV_H_
-#define VP8_COMMON_ENTROPYMV_H_
+#ifndef VPX_VP8_COMMON_ENTROPYMV_H_
+#define VPX_VP8_COMMON_ENTROPYMV_H_
 
 #include "treecoder.h"
 
@@ -46,4 +46,4 @@ extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_ENTROPYMV_H_
+#endif  // VPX_VP8_COMMON_ENTROPYMV_H_
diff --git a/media/libvpx/libvpx/vp8/common/extend.c b/media/libvpx/libvpx/vp8/common/extend.c
index 2d67b516be..b52e9fe93c 100644
--- a/media/libvpx/libvpx/vp8/common/extend.c
+++ b/media/libvpx/libvpx/vp8/common/extend.c
@@ -11,31 +11,40 @@
 #include "extend.h"
 #include "vpx_mem/vpx_mem.h"
 
-static void copy_and_extend_plane(unsigned char *s, /* source */
-                                  int sp,           /* source pitch */
-                                  unsigned char *d, /* destination */
-                                  int dp,           /* destination pitch */
-                                  int h,            /* height */
-                                  int w,            /* width */
-                                  int et,           /* extend top border */
-                                  int el,           /* extend left border */
-                                  int eb,           /* extend bottom border */
-                                  int er            /* extend right border */
-                                  ) {
-  int i;
+static void copy_and_extend_plane(
+    unsigned char *s,      /* source */
+    int sp,                /* source pitch */
+    unsigned char *d,      /* destination */
+    int dp,                /* destination pitch */
+    int h,                 /* height */
+    int w,                 /* width */
+    int et,                /* extend top border */
+    int el,                /* extend left border */
+    int eb,                /* extend bottom border */
+    int er,                /* extend right border */
+    int interleave_step) { /* step between pixels of the current plane */
+  int i, j;
   unsigned char *src_ptr1, *src_ptr2;
   unsigned char *dest_ptr1, *dest_ptr2;
   int linesize;
 
+  if (interleave_step < 1) interleave_step = 1;
+
   /* copy the left and right most columns out */
   src_ptr1 = s;
-  src_ptr2 = s + w - 1;
+  src_ptr2 = s + (w - 1) * interleave_step;
   dest_ptr1 = d - el;
   dest_ptr2 = d + w;
 
   for (i = 0; i < h; ++i) {
     memset(dest_ptr1, src_ptr1[0], el);
-    memcpy(dest_ptr1 + el, src_ptr1, w);
+    if (interleave_step == 1) {
+      memcpy(dest_ptr1 + el, src_ptr1, w);
+    } else {
+      for (j = 0; j < w; j++) {
+        dest_ptr1[el + j] = src_ptr1[interleave_step * j];
+      }
+    }
     memset(dest_ptr2, src_ptr2[0], er);
     src_ptr1 += sp;
     src_ptr2 += sp;
@@ -70,9 +79,12 @@ void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
   int eb = dst->border + dst->y_height - src->y_height;
   int er = dst->border + dst->y_width - src->y_width;
 
+  // detect nv12 colorspace
+  int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
+
   copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                         dst->y_stride, src->y_height, src->y_width, et, el, eb,
-                        er);
+                        er, 1);
 
   et = dst->border >> 1;
   el = dst->border >> 1;
@@ -81,11 +93,11 @@ void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
 
   copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
                         dst->uv_stride, src->uv_height, src->uv_width, et, el,
-                        eb, er);
+                        eb, er, chroma_step);
 
   copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
                         dst->uv_stride, src->uv_height, src->uv_width, et, el,
-                        eb, er);
+                        eb, er, chroma_step);
 }
 
 void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
@@ -99,6 +111,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
   int dst_y_offset = srcy * dst->y_stride + srcx;
   int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
   int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+  // detect nv12 colorspace
+  int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
 
   /* If the side is not touching the bounder then don't extend. */
   if (srcy) et = 0;
@@ -108,7 +122,7 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
 
   copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
                         dst->y_buffer + dst_y_offset, dst->y_stride, srch, srcw,
-                        et, el, eb, er);
+                        et, el, eb, er, 1);
 
   et = (et + 1) >> 1;
   el = (el + 1) >> 1;
@@ -119,11 +133,11 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
 
   copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
                         dst->u_buffer + dst_uv_offset, dst->uv_stride, srch,
-                        srcw, et, el, eb, er);
+                        srcw, et, el, eb, er, chroma_step);
 
   copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
                         dst->v_buffer + dst_uv_offset, dst->uv_stride, srch,
-                        srcw, et, el, eb, er);
+                        srcw, et, el, eb, er, chroma_step);
 }
 
 /* note the extension is only for the last row, for intra prediction purpose */
diff --git a/media/libvpx/libvpx/vp8/common/extend.h b/media/libvpx/libvpx/vp8/common/extend.h
index 7da5ce31da..586a38a4f3 100644
--- a/media/libvpx/libvpx/vp8/common/extend.h
+++ b/media/libvpx/libvpx/vp8/common/extend.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_EXTEND_H_
-#define VP8_COMMON_EXTEND_H_
+#ifndef VPX_VP8_COMMON_EXTEND_H_
+#define VPX_VP8_COMMON_EXTEND_H_
 
 #include "vpx_scale/yv12config.h"
 
@@ -29,4 +29,4 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_EXTEND_H_
+#endif  // VPX_VP8_COMMON_EXTEND_H_
diff --git a/media/libvpx/libvpx/vp8/common/filter.h b/media/libvpx/libvpx/vp8/common/filter.h
index f1d5ece4a5..6acee22b21 100644
--- a/media/libvpx/libvpx/vp8/common/filter.h
+++ b/media/libvpx/libvpx/vp8/common/filter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_FILTER_H_
-#define VP8_COMMON_FILTER_H_
+#ifndef VPX_VP8_COMMON_FILTER_H_
+#define VPX_VP8_COMMON_FILTER_H_
 
 #include "vpx_ports/mem.h"
 
@@ -28,4 +28,4 @@ extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_FILTER_H_
+#endif  // VPX_VP8_COMMON_FILTER_H_
diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.c b/media/libvpx/libvpx/vp8/common/findnearmv.c
index f40d2c6bde..3b31923621 100644
--- a/media/libvpx/libvpx/vp8/common/findnearmv.c
+++ b/media/libvpx/libvpx/vp8/common/findnearmv.c
@@ -21,19 +21,20 @@ const unsigned char vp8_mbsplit_offset[4][16] = {
    Note that we only consider one 4x4 subblock from each candidate 16x16
    macroblock.   */
 void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
-                       int_mv *nearby, int_mv *best_mv, int cnt[4],
+                       int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4],
                        int refframe, int *ref_frame_sign_bias) {
   const MODE_INFO *above = here - xd->mode_info_stride;
   const MODE_INFO *left = here - 1;
   const MODE_INFO *aboveleft = above - 1;
   int_mv near_mvs[4];
   int_mv *mv = near_mvs;
-  int *cntx = cnt;
+  int *cntx = near_mv_ref_cnts;
   enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 
   /* Zero accumulators */
   mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
-  cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+  near_mv_ref_cnts[0] = near_mv_ref_cnts[1] = near_mv_ref_cnts[2] =
+      near_mv_ref_cnts[3] = 0;
 
   /* Process above */
   if (above->mbmi.ref_frame != INTRA_FRAME) {
@@ -63,7 +64,7 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
 
       *cntx += 2;
     } else {
-      cnt[CNT_INTRA] += 2;
+      near_mv_ref_cnts[CNT_INTRA] += 2;
     }
   }
 
@@ -83,33 +84,34 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
 
       *cntx += 1;
     } else {
-      cnt[CNT_INTRA] += 1;
+      near_mv_ref_cnts[CNT_INTRA] += 1;
     }
   }
 
   /* If we have three distinct MV's ... */
-  if (cnt[CNT_SPLITMV]) {
+  if (near_mv_ref_cnts[CNT_SPLITMV]) {
     /* See if above-left MV can be merged with NEAREST */
-    if (mv->as_int == near_mvs[CNT_NEAREST].as_int) cnt[CNT_NEAREST] += 1;
+    if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+      near_mv_ref_cnts[CNT_NEAREST] += 1;
   }
 
-  cnt[CNT_SPLITMV] =
+  near_mv_ref_cnts[CNT_SPLITMV] =
       ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * 2 +
       (aboveleft->mbmi.mode == SPLITMV);
 
   /* Swap near and nearest if necessary */
-  if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
+  if (near_mv_ref_cnts[CNT_NEAR] > near_mv_ref_cnts[CNT_NEAREST]) {
     int tmp;
-    tmp = cnt[CNT_NEAREST];
-    cnt[CNT_NEAREST] = cnt[CNT_NEAR];
-    cnt[CNT_NEAR] = tmp;
-    tmp = near_mvs[CNT_NEAREST].as_int;
+    tmp = near_mv_ref_cnts[CNT_NEAREST];
+    near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR];
+    near_mv_ref_cnts[CNT_NEAR] = tmp;
+    tmp = (int)near_mvs[CNT_NEAREST].as_int;
     near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
-    near_mvs[CNT_NEAR].as_int = tmp;
+    near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
   }
 
   /* Use near_mvs[0] to store the "best" MV */
-  if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) {
+  if (near_mv_ref_cnts[CNT_NEAREST] >= near_mv_ref_cnts[CNT_INTRA]) {
     near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
   }
 
diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.h b/media/libvpx/libvpx/vp8/common/findnearmv.h
index c1eaa26980..d7db9544aa 100644
--- a/media/libvpx/libvpx/vp8/common/findnearmv.h
+++ b/media/libvpx/libvpx/vp8/common/findnearmv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_FINDNEARMV_H_
-#define VP8_COMMON_FINDNEARMV_H_
+#ifndef VPX_VP8_COMMON_FINDNEARMV_H_
+#define VPX_VP8_COMMON_FINDNEARMV_H_
 
 #include "./vpx_config.h"
 #include "mv.h"
@@ -70,7 +70,7 @@ static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
 }
 
 void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
-                       int_mv *nearby, int_mv *best, int near_mv_ref_cts[4],
+                       int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4],
                        int refframe, int *ref_frame_sign_bias);
 
 int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here,
@@ -148,4 +148,4 @@ static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b,
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_FINDNEARMV_H_
+#endif  // VPX_VP8_COMMON_FINDNEARMV_H_
diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
index 28c981a607..58c778aba5 100644
--- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
+++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c
@@ -10,32 +10,34 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
-#if ARCH_ARM
+#if VPX_ARCH_ARM
 #include "vpx_ports/arm.h"
-#elif ARCH_X86 || ARCH_X86_64
+#elif VPX_ARCH_X86 || VPX_ARCH_X86_64
 #include "vpx_ports/x86.h"
+#elif VPX_ARCH_PPC
+#include "vpx_ports/ppc.h"
+#elif VPX_ARCH_MIPS
+#include "vpx_ports/mips.h"
+#elif VPX_ARCH_LOONGARCH
+#include "vpx_ports/loongarch.h"
 #endif
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/systemdependent.h"
 
 #if CONFIG_MULTITHREAD
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
 #include <unistd.h>
 #elif defined(_WIN32)
 #include <windows.h>
 typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO);
-#elif defined(__OS2__)
-#define INCL_DOS
-#define INCL_DOSSPINLOCK
-#include <os2.h>
 #endif
 #endif
 
 #if CONFIG_MULTITHREAD
-static int get_cpu_count() {
+static int get_cpu_count(void) {
   int core_count = 16;
 
-#if HAVE_UNISTD_H && !defined(__OS2__)
+#if HAVE_UNISTD_H
 #if defined(_SC_NPROCESSORS_ONLN)
   core_count = (int)sysconf(_SC_NPROCESSORS_ONLN);
 #elif defined(_SC_NPROC_ONLN)
@@ -43,38 +45,13 @@ static int get_cpu_count() {
 #endif
 #elif defined(_WIN32)
   {
-#if _WIN32_WINNT >= 0x0501
+#if _WIN32_WINNT < 0x0501
+#error _WIN32_WINNT must target Windows XP or newer.
+#endif
     SYSTEM_INFO sysinfo;
     GetNativeSystemInfo(&sysinfo);
-#else
-    PGNSI pGNSI;
-    SYSTEM_INFO sysinfo;
-
-    /* Call GetNativeSystemInfo if supported or
-     * GetSystemInfo otherwise. */
-
-    pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
-                                  "GetNativeSystemInfo");
-    if (pGNSI != NULL)
-      pGNSI(&sysinfo);
-    else
-      GetSystemInfo(&sysinfo);
-#endif
-
     core_count = (int)sysinfo.dwNumberOfProcessors;
   }
-#elif defined(__OS2__)
-  {
-    ULONG proc_id;
-    ULONG status;
-
-    core_count = 0;
-    for (proc_id = 1;; ++proc_id) {
-      if (DosGetProcessorStatus(proc_id, &status)) break;
-
-      if (status == PROC_ONLINE) core_count++;
-    }
-  }
 #else
 /* other platforms */
 #endif
@@ -89,10 +66,4 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) {
 #else
   (void)ctx;
 #endif /* CONFIG_MULTITHREAD */
-
-#if ARCH_ARM
-  ctx->cpu_caps = arm_cpu_caps();
-#elif ARCH_X86 || ARCH_X86_64
-  ctx->cpu_caps = x86_simd_caps();
-#endif
 }
diff --git a/media/libvpx/libvpx/vp8/common/header.h b/media/libvpx/libvpx/vp8/common/header.h
index 1df01fc6fa..e64e241908 100644
--- a/media/libvpx/libvpx/vp8/common/header.h
+++ b/media/libvpx/libvpx/vp8/common/header.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_HEADER_H_
-#define VP8_COMMON_HEADER_H_
+#ifndef VPX_VP8_COMMON_HEADER_H_
+#define VPX_VP8_COMMON_HEADER_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -45,4 +45,4 @@ typedef struct {
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_HEADER_H_
+#endif  // VPX_VP8_COMMON_HEADER_H_
diff --git a/media/libvpx/libvpx/vp8/common/idct_blk.c b/media/libvpx/libvpx/vp8/common/idct_blk.c
index ff9f3eb7f2..ebe1774f56 100644
--- a/media/libvpx/libvpx/vp8/common/idct_blk.c
+++ b/media/libvpx/libvpx/vp8/common/idct_blk.c
@@ -12,12 +12,6 @@
 #include "vp8_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest,
-                            int stride);
-void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred,
-                            int pred_stride, unsigned char *dst_ptr,
-                            int dst_stride);
-
 void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst,
                                     int stride, char *eobs) {
   int i, j;
@@ -39,40 +33,40 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst,
   }
 }
 
-void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dstu,
-                                     unsigned char *dstv, int stride,
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u,
+                                     unsigned char *dst_v, int stride,
                                      char *eobs) {
   int i, j;
 
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < 2; ++j) {
       if (*eobs++ > 1) {
-        vp8_dequant_idct_add_c(q, dq, dstu, stride);
+        vp8_dequant_idct_add_c(q, dq, dst_u, stride);
       } else {
-        vp8_dc_only_idct_add_c(q[0] * dq[0], dstu, stride, dstu, stride);
+        vp8_dc_only_idct_add_c(q[0] * dq[0], dst_u, stride, dst_u, stride);
         memset(q, 0, 2 * sizeof(q[0]));
       }
 
       q += 16;
-      dstu += 4;
+      dst_u += 4;
     }
 
-    dstu += 4 * stride - 8;
+    dst_u += 4 * stride - 8;
   }
 
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < 2; ++j) {
       if (*eobs++ > 1) {
-        vp8_dequant_idct_add_c(q, dq, dstv, stride);
+        vp8_dequant_idct_add_c(q, dq, dst_v, stride);
       } else {
-        vp8_dc_only_idct_add_c(q[0] * dq[0], dstv, stride, dstv, stride);
+        vp8_dc_only_idct_add_c(q[0] * dq[0], dst_v, stride, dst_v, stride);
         memset(q, 0, 2 * sizeof(q[0]));
       }
 
       q += 16;
-      dstv += 4;
+      dst_v += 4;
     }
 
-    dstv += 4 * stride - 8;
+    dst_v += 4 * stride - 8;
   }
 }
diff --git a/media/libvpx/libvpx/vp8/common/invtrans.h b/media/libvpx/libvpx/vp8/common/invtrans.h
index c7af32fb67..aed7bb0600 100644
--- a/media/libvpx/libvpx/vp8/common/invtrans.h
+++ b/media/libvpx/libvpx/vp8/common/invtrans.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_INVTRANS_H_
-#define VP8_COMMON_INVTRANS_H_
+#ifndef VPX_VP8_COMMON_INVTRANS_H_
+#define VPX_VP8_COMMON_INVTRANS_H_
 
 #include "./vpx_config.h"
 #include "vp8_rtcd.h"
@@ -54,4 +54,4 @@ static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) {
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_INVTRANS_H_
+#endif  // VPX_VP8_COMMON_INVTRANS_H_
diff --git a/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c
new file mode 100644
index 0000000000..eee871eec4
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c
@@ -0,0 +1,322 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static const int32_t cospi8sqrt2minus1 = 20091;
+static const int32_t sinpi8sqrt2 = 35468;
+
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)    \
+  do {                                                                    \
+    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+                                                                          \
+    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m);         \
+    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m);         \
+    DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+  } while (0)
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do {                                                                  \
+    __m128i s4_m, s5_m, s6_m, s7_m;                                     \
+                                                                        \
+    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);     \
+    DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2);       \
+    out1 = __lsx_vilvh_d(s6_m, s4_m);                                   \
+    out3 = __lsx_vilvh_d(s7_m, s5_m);                                   \
+  } while (0)
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1)   \
+  do {                                                        \
+    __m128i zero_m = __lsx_vldi(0);                           \
+    __m128i tmp1_m, tmp2_m;                                   \
+    __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
+                                                              \
+    tmp1_m = __lsx_vilvl_h(in0, zero_m);                      \
+    tmp2_m = __lsx_vilvh_h(in0, zero_m);                      \
+    tmp1_m = __lsx_vsrai_w(tmp1_m, 16);                       \
+    tmp2_m = __lsx_vsrai_w(tmp2_m, 16);                       \
+    tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m);            \
+    tmp1_m = __lsx_vsrai_w(tmp1_m, 16);                       \
+    tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m);            \
+    tmp2_m = __lsx_vsrai_w(tmp2_m, 16);                       \
+    in1 = __lsx_vpickev_h(tmp2_m, tmp1_m);                    \
+  } while (0)
+
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)      \
+  do {                                                                 \
+    __m128i a1_m, b1_m, c1_m, d1_m;                                    \
+    __m128i c_tmp1_m, c_tmp2_m;                                        \
+    __m128i d_tmp1_m, d_tmp2_m;                                        \
+    __m128i const_cospi8sqrt2minus1_m;                                 \
+                                                                       \
+    const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \
+    a1_m = __lsx_vadd_h(in0, in2);                                     \
+    b1_m = __lsx_vsub_h(in0, in2);                                     \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m);          \
+                                                                       \
+    c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m);           \
+    c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1);                             \
+    c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1);                             \
+    c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m);                            \
+    c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m);                           \
+                                                                       \
+    d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m);           \
+    d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1);                             \
+    d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1);                             \
+    d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m);                            \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m);          \
+    d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m);                           \
+    LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+  } while (0)
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)      \
+  do {                                                                 \
+    __m128i a1_m, b1_m, c1_m, d1_m;                                    \
+    __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                    \
+    __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;                 \
+                                                                       \
+    const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \
+    sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2);                  \
+    a1_m = __lsx_vadd_w(in0, in2);                                     \
+    b1_m = __lsx_vsub_w(in0, in2);                                     \
+    c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m);                      \
+    c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16);                            \
+    c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m);           \
+    c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16);                            \
+    c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m);                            \
+    c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m);                           \
+    d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m);           \
+    d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16);                            \
+    d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m);                            \
+    d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m);                      \
+    d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16);                            \
+    d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m);                           \
+    LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+  } while (0)
+
+#define UNPCK_SH_SW(in, out0, out1)  \
+  do {                               \
+    out0 = __lsx_vsllwil_w_h(in, 0); \
+    out1 = __lsx_vexth_w_h(in);      \
+  } while (0)
+
+static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred,
+                                 int32_t pred_stride, uint8_t *dest,
+                                 int32_t dest_stride) {
+  __m128i vec, res0, res1, res2, res3, dst0, dst1;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+
+  vec = __lsx_vreplgr2vr_h(in_dc);
+  vec = __lsx_vsrari_h(vec, 3);
+  pred0 = __lsx_vld(pred, 0);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2);
+  pred3 = __lsx_vldx(pred, pred_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+            res1, res2, res3);
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1);
+  dst0 = __lsx_vpickev_w(dst1, dst0);
+  __lsx_vstelm_w(dst0, dest, 0, 0);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 1);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 2);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 3);
+}
+
+void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride) {
+  idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+static void dequant_idct4x4_addblk_2x_lsx(int16_t *input,
+                                          int16_t *dequant_input, uint8_t *dest,
+                                          int32_t dest_stride) {
+  __m128i dest0, dest1, dest2, dest3;
+  __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+  __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+  __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+  __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t dest_stride2 = dest_stride << 1;
+  int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0,
+            dequant_in1);
+
+  DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0,
+            in3, dequant_in1, mul0, mul1, mul2, mul3);
+  DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2);
+  DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3);
+
+  VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  UNPCK_SH_SW(hz0, hz0r, hz0l);
+  UNPCK_SH_SW(hz1, hz1r, hz1l);
+  UNPCK_SH_SW(hz2, hz2r, hz2l);
+  UNPCK_SH_SW(hz3, hz3r, hz3l);
+  VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
+  DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l,
+            vt2l, vt3l);
+  VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+  DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r,
+            vt2r, vt3r);
+  DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r,
+            vt0, vt1, vt2, vt3);
+  TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  dest0 = __lsx_vld(dest, 0);
+  DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+  dest3 = __lsx_vldx(dest, dest_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0,
+            res1, res2, res3);
+
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l);
+
+  __lsx_vstelm_d(vt0l, dest, 0, 0);
+  __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1);
+  __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0);
+  __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1);
+
+  __lsx_vst(zero, input, 0);
+  __lsx_vst(zero, input, 16);
+  __lsx_vst(zero, input, 32);
+  __lsx_vst(zero, input, 48);
+}
+
+static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input,
+                                         uint8_t *dest, int32_t dest_stride) {
+  __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3;
+  __m128i dest0, dest1, dest2, dest3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t dest_stride2 = dest_stride << 1;
+  int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+  input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]);
+  input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]);
+  DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1);
+  vec = __lsx_vpickev_d(input_dc1, input_dc0);
+  input[0] = 0;
+  input[16] = 0;
+  dest0 = __lsx_vld(dest, 0);
+  DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+  dest3 = __lsx_vldx(dest, dest_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+            res1, res2, res3);
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1);
+  __lsx_vstelm_d(res0, dest, 0, 0);
+  __lsx_vstelm_d(res0, dest + dest_stride, 0, 1);
+  __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0);
+  __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1);
+}
+
+void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int32_t stride, char *eobs) {
+  int16_t *eobs_h = (int16_t *)eobs;
+  uint8_t i;
+
+  for (i = 4; i--;) {
+    if (eobs_h[0]) {
+      if (eobs_h[0] & 0xfefe) {
+        dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride);
+      } else {
+        dequant_idct_addconst_2x_lsx(q, dq, dst, stride);
+      }
+    }
+
+    q += 32;
+
+    if (eobs_h[1]) {
+      if (eobs_h[1] & 0xfefe) {
+        dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride);
+      } else {
+        dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride);
+      }
+    }
+
+    q += 32;
+    dst += (4 * stride);
+    eobs_h += 2;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int32_t stride,
+                                       char *eobs) {
+  int16_t *eobs_h = (int16_t *)eobs;
+  if (eobs_h[0]) {
+    if (eobs_h[0] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+  dst_u += (stride * 4);
+
+  if (eobs_h[1]) {
+    if (eobs_h[1] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+
+  if (eobs_h[2]) {
+    if (eobs_h[2] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+    }
+  }
+  q += 32;
+  dst_v += (stride * 4);
+
+  if (eobs_h[3]) {
+    if (eobs_h[3] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
new file mode 100644
index 0000000000..79c3ea6dbb
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev)        \
+  do {                                                       \
+    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+    const __m128i cnst4b = __lsx_vldi(4);                    \
+    const __m128i cnst3b = __lsx_vldi(3);                    \
+                                                             \
+    p1_m = __lsx_vxori_b(p1, 0x80);                          \
+    p0_m = __lsx_vxori_b(p0, 0x80);                          \
+    q0_m = __lsx_vxori_b(q0, 0x80);                          \
+    q1_m = __lsx_vxori_b(q1, 0x80);                          \
+                                                             \
+    filt = __lsx_vssub_b(p1_m, q1_m);                        \
+    filt = __lsx_vand_v(filt, hev);                          \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vand_v(filt, mask);                         \
+    t1 = __lsx_vsadd_b(filt, cnst4b);                        \
+    t1 = __lsx_vsra_b(t1, cnst3b);                           \
+    t2 = __lsx_vsadd_b(filt, cnst3b);                        \
+    t2 = __lsx_vsra_b(t2, cnst3b);                           \
+    q0_m = __lsx_vssub_b(q0_m, t1);                          \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                          \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                          \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                          \
+    filt = __lsx_vsrari_b(t1, 1);                            \
+    hev = __lsx_vxori_b(hev, 0xff);                          \
+    filt = __lsx_vand_v(filt, hev);                          \
+    q1_m = __lsx_vssub_b(q1_m, filt);                        \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                          \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                        \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                          \
+  } while (0)
+
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
+  do {                                                  \
+    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;         \
+    __m128i u, filt, t1, t2, filt_sign, q0_sub_p0;      \
+    __m128i filt_r, filt_l;                             \
+    __m128i temp0, temp1, temp2, temp3;                 \
+    const __m128i cnst4b = __lsx_vldi(4);               \
+    const __m128i cnst3b = __lsx_vldi(3);               \
+    const __m128i cnst9h = __lsx_vldi(1033);            \
+    const __m128i cnst63h = __lsx_vldi(1087);           \
+                                                        \
+    p2_m = __lsx_vxori_b(p2, 0x80);                     \
+    p1_m = __lsx_vxori_b(p1, 0x80);                     \
+    p0_m = __lsx_vxori_b(p0, 0x80);                     \
+    q0_m = __lsx_vxori_b(q0, 0x80);                     \
+    q1_m = __lsx_vxori_b(q1, 0x80);                     \
+    q2_m = __lsx_vxori_b(q2, 0x80);                     \
+                                                        \
+    filt = __lsx_vssub_b(p1_m, q1_m);                   \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vand_v(filt, mask);                    \
+                                                        \
+    t2 = __lsx_vand_v(filt, hev);                       \
+    hev = __lsx_vxori_b(hev, 0xff);                     \
+    filt = __lsx_vand_v(hev, filt);                     \
+    t1 = __lsx_vsadd_b(t2, cnst4b);                     \
+    t1 = __lsx_vsra_b(t1, cnst3b);                      \
+    t2 = __lsx_vsadd_b(t2, cnst3b);                     \
+    t2 = __lsx_vsra_b(t2, cnst3b);                      \
+    q0_m = __lsx_vssub_b(q0_m, t1);                     \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                     \
+    filt_sign = __lsx_vslti_b(filt, 0);                 \
+    filt_r = __lsx_vilvl_b(filt_sign, filt);            \
+    filt_l = __lsx_vilvh_b(filt_sign, filt);            \
+    temp0 = __lsx_vmul_h(filt_r, cnst9h);               \
+    temp1 = __lsx_vadd_h(temp0, cnst63h);               \
+    temp2 = __lsx_vmul_h(filt_l, cnst9h);               \
+    temp3 = __lsx_vadd_h(temp2, cnst63h);               \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q2_m = __lsx_vssub_b(q2_m, u);                      \
+    p2_m = __lsx_vsadd_b(p2_m, u);                      \
+    q2 = __lsx_vxori_b(q2_m, 0x80);                     \
+    p2 = __lsx_vxori_b(p2_m, 0x80);                     \
+                                                        \
+    temp1 = __lsx_vadd_h(temp1, temp0);                 \
+    temp3 = __lsx_vadd_h(temp3, temp2);                 \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q1_m = __lsx_vssub_b(q1_m, u);                      \
+    p1_m = __lsx_vsadd_b(p1_m, u);                      \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                     \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                     \
+                                                        \
+    temp1 = __lsx_vadd_h(temp1, temp0);                 \
+    temp3 = __lsx_vadd_h(temp3, temp2);                 \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q0_m = __lsx_vssub_b(q0_m, u);                      \
+    p0_m = __lsx_vsadd_b(p0_m, u);                      \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                     \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                     \
+  } while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
+                     flat_out)                                               \
+  do {                                                                       \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
+                                                                             \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
+    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
+    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
+    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
+    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
+    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
+    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
+    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
+    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
+  } while (0)
+
+#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \
+  do {                                                        \
+    __lsx_vstelm_w(in0, pdst, 0, in0_idx);                    \
+    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx);           \
+  } while (0)
+
+static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit0_ptr,
+                                              const uint8_t *limit0_ptr,
+                                              const uint8_t *thresh0_ptr,
+                                              const uint8_t *b_limit1_ptr,
+                                              const uint8_t *limit1_ptr,
+                                              const uint8_t *thresh1_ptr) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src,
+            -pitch, p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2);
+  q3 = __lsx_vldx(src, pitch_x3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  __lsx_vstx(p1, src, -pitch_x2);
+  __lsx_vstx(p0, src, -pitch);
+  __lsx_vst(q0, src, 0);
+  __lsx_vstx(q1, src, pitch);
+}
+
+static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                            const uint8_t *b_limit0_ptr,
+                                            const uint8_t *limit0_ptr,
+                                            const uint8_t *thresh0_ptr,
+                                            const uint8_t *b_limit1_ptr,
+                                            const uint8_t *limit1_ptr,
+                                            const uint8_t *thresh1_ptr) {
+  uint8_t *src_tmp0 = src - 4;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  row0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2);
+  row3 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+  row4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6);
+  row7 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+
+  row8 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10);
+  row11 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+  row12 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14);
+  row15 = __lsx_vldx(src_tmp0, pitch_x3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src -= 2;
+  __lsx_vstelm_w(tmp2, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp3, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp4, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp5, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 3);
+}
+
+static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+  thresh = __lsx_vreplgr2vr_b(thresh_in);
+  limit = __lsx_vreplgr2vr_b(limit_in);
+  b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+  DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2,
+            src_u, -pitch, p3_u, p2_u, p1_u, p0_u);
+  q0_u = __lsx_vld(src_u, 0);
+  DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u);
+  q3_u = __lsx_vldx(src_u, pitch_x3);
+
+  DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2,
+            src_v, -pitch, p3_v, p2_v, p1_v, p0_v);
+  q0_v = __lsx_vld(src_v, 0);
+  DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v);
+  q3_v = __lsx_vldx(src_v, pitch_x3);
+
+  /* right 8 element of p3 are u pixel and
+     left 8 element of p3 are v pixel */
+  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+            q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  __lsx_vstelm_d(q1, src_u + pitch, 0, 0);
+  __lsx_vstelm_d(q0, src_u, 0, 0);
+  __lsx_vstelm_d(p0, src_u - pitch, 0, 0);
+  __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0);
+
+  __lsx_vstelm_d(q1, src_v + pitch, 0, 1);
+  __lsx_vstelm_d(q0, src_v, 0, 1);
+  __lsx_vstelm_d(p0, src_v - pitch, 0, 1);
+  __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1);
+}
+
+static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+                                             int32_t pitch,
+                                             const uint8_t b_limit_in,
+                                             const uint8_t limit_in,
+                                             const uint8_t thresh_in) {
+  uint8_t *src_u_tmp, *src_v_tmp;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  thresh = __lsx_vreplgr2vr_b(thresh_in);
+  limit = __lsx_vreplgr2vr_b(limit_in);
+  b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+  src_u_tmp = src_u - 4;
+  row0 = __lsx_vld(src_u_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2);
+  row3 = __lsx_vldx(src_u_tmp, pitch_x3);
+  src_u_tmp += pitch_x4;
+  row4 = __lsx_vld(src_u_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6);
+  row7 = __lsx_vldx(src_u_tmp, pitch_x3);
+
+  src_v_tmp = src_v - 4;
+  row8 = __lsx_vld(src_v_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10);
+  row11 = __lsx_vldx(src_v_tmp, pitch_x3);
+  src_v_tmp += pitch_x4;
+  row12 = __lsx_vld(src_v_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14);
+  row15 = __lsx_vldx(src_v_tmp, pitch_x3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+
+  tmp0 = __lsx_vilvh_b(p0, p1);
+  tmp1 = __lsx_vilvh_b(q1, q0);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src_u_tmp += 2;
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3);
+
+  __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3);
+
+  src_v_tmp += 2;
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3);
+
+  __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3);
+}
+
+static inline void mbloop_filter_horizontal_edge_y_lsx(
+    uint8_t *src, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  temp_src = src - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3, p2, p1, p0);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  temp_src = src - pitch_x3;
+  __lsx_vstx(p2, temp_src, 0);
+  __lsx_vstx(p1, temp_src, pitch);
+  __lsx_vstx(p0, temp_src, pitch_x2);
+  __lsx_vstx(q0, temp_src, pitch_x3);
+  temp_src += pitch_x4;
+  __lsx_vstx(q1, temp_src, 0);
+  __lsx_vstx(q2, temp_src, pitch);
+}
+
+static inline void mbloop_filter_horizontal_edge_uv_lsx(
+    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  temp_src = src_u - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u);
+  temp_src = src_v - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v);
+
+  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+            q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  src_u -= pitch_x3;
+  __lsx_vstelm_d(p2, src_u, 0, 0);
+  __lsx_vstelm_d(p1, src_u + pitch, 0, 0);
+  __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0);
+  __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0);
+  src_u += pitch_x4;
+  __lsx_vstelm_d(q1, src_u, 0, 0);
+  src_u += pitch;
+  __lsx_vstelm_d(q2, src_u, 0, 0);
+
+  src_v -= pitch_x3;
+  __lsx_vstelm_d(p2, src_v, 0, 1);
+  __lsx_vstelm_d(p1, src_v + pitch, 0, 1);
+  __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1);
+  __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1);
+  src_v += pitch_x4;
+  __lsx_vstelm_d(q1, src_v, 0, 1);
+  src_v += pitch;
+  __lsx_vstelm_d(q2, src_v, 0, 1);
+}
+
+static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src,
+                                                     int32_t pitch,
+                                                     const uint8_t b_limit_in,
+                                                     const uint8_t limit_in,
+                                                     const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+  temp_src = src - 4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row0, row1, row2, row3);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row4, row5, row6, row7);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row8, row9, row10, row11);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row12, row13, row14, row15);
+  temp_src -= pitch_x4;
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+  tmp2 = __lsx_vilvl_b(q2, q1);
+  tmp5 = __lsx_vilvh_b(q2, q1);
+
+  temp_src = src - 3;
+  VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static inline void mbloop_filter_vertical_edge_uv_lsx(
+    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  src_u -= 4;
+  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+            pitch_x3, row0, row1, row2, row3);
+  src_u += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+            pitch_x3, row4, row5, row6, row7);
+  src_v -= 4;
+  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+            pitch_x3, row8, row9, row10, row11);
+  src_v += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+            pitch_x3, row12, row13, row14, row15);
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+  tmp2 = __lsx_vilvl_b(q2, q1);
+  tmp5 = __lsx_vilvh_b(q2, q1);
+
+  src_u += 1 - pitch_x4;
+  VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4);
+
+  src_v += 1 - pitch_x4;
+  VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                             int32_t pitch_y, int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr) {
+  mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+                                      *lpf_info_ptr->lim,
+                                      *lpf_info_ptr->hev_thr);
+  if (src_u) {
+    mbloop_filter_horizontal_edge_uv_lsx(
+        src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+        *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                             int32_t pitch_y, int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr) {
+  mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+                                    *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+  if (src_u) {
+    mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v,
+                                       *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+                                       *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                            int32_t pitch_y, int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr) {
+  loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  if (src_u) {
+    loop_filter_horizontal_edge_uv_lsx(
+        src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
+        *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                            int32_t pitch_y, int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr) {
+  loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  if (src_u) {
+    loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v,
+                                     *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+                                     *lpf_info_ptr->hev_thr);
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
new file mode 100644
index 0000000000..9867633415
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = {
+  { 0, -6, 123, 12, -1, 0, 0, 0 },
+  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
+  { 0, -9, 93, 50, -6, 0, 0, 0 },
+  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
+  { 0, -6, 50, 93, -9, 0, 0, 0 },
+  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
+  { 0, -1, 12, 123, -6, 0, 0, 0 },
+};
+
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
+                               __m128i coeff0, __m128i coeff1, __m128i coeff2) {
+  __m128i out0_m;
+
+  out0_m = __lsx_vdp2_h_b(in0, coeff0);
+  out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);
+  out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);
+
+  return out0_m;
+}
+
+static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i mask2,
+                                      __m128i filt_h0, __m128i filt_h1,
+                                      __m128i filt_h2) {
+  __m128i vec0_m, vec1_m, vec2_m;
+  __m128i hz_out_m;
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+            vec1_m);
+  vec2_m = __lsx_vshuf_b(src1, src0, mask2);
+  hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);
+  hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+  hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+  return hz_out_m;
+}
+
+static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
+                                        __m128i filt0, __m128i filt1) {
+  __m128i tmp_m;
+
+  tmp_m = __lsx_vdp2_h_b(vec0, filt0);
+  tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1);
+
+  return tmp_m;
+}
+
+static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i filt_h0,
+                                      __m128i filt_h1) {
+  __m128i vec0_m, vec1_m, hz_out_m;
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+            vec1_m);
+  hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1);
+  hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+  hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+  return hz_out_m;
+}
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   mask2, filt0, filt1, filt2, out0, out1) \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+              out0, out1);                                                 \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \
+              vec5_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
+              out0, out1);                                                 \
+  } while (0)
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   mask2, filt0, filt1, filt2, out0, out1,  \
+                                   out2, out3)                              \
+  do {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+                                                                            \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m,  \
+              vec1_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m,  \
+              vec3_m);                                                      \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,  \
+              vec3_m, filt0, out0, out1, out2, out3);                       \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m,  \
+              vec1_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m,  \
+              vec3_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m,  \
+              vec5_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m,  \
+              vec7_m);                                                      \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,  \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,   \
+              out3);                                                        \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,  \
+              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2,   \
+              out3);                                                        \
+  } while (0)
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1)               \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+              out0, out1);                                                 \
+  } while (0)
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1, out2, out3)   \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
+              vec3_m);                                                     \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
+              vec3_m, filt0, out0, out1, out2, out3);                      \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,  \
+              out3);                                                       \
+  } while (0)
+
+static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
+                                        int32_t src_stride,
+                                        uint8_t *RESTRICT dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
+static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1, out2, out3;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride_x2 << 1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src += src_stride_x4;
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out2, out3);
+
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src += src_stride_x4;
+  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1, out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+  dst += dst_stride_x4;
+
+  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    src += src_stride_x4;
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out;
+  __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src2, src4, src6);
+    src += 8;
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src1, src3, src5, src7);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+              src5, src6, src7);
+    src += src_stride_x4 - 8;
+
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out4, out5, out6, out7);
+    DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT,
+              out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2,
+              out3);
+    DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT,
+              out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6,
+              out7);
+    DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1,
+              out2, out3);
+    DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5,
+              out6, out7);
+    out = __lsx_vpickev_b(out1, out0);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vst(out, dst, 0);
+    out = __lsx_vpickev_b(out3, out2);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride);
+    out = __lsx_vpickev_b(out5, out4);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride_x2);
+    out = __lsx_vpickev_b(out7, out6);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+  __m128i out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+            src4332);
+  DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_r, src65_r, src76_r, src87_r);
+    DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554,
+              src8776);
+    DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776);
+    out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2);
+    out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2);
+
+    out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+    out0 = __lsx_vxori_b(out0, 128);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src4 = src8;
+  }
+}
+
+static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
+  __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+  __m128i src109_r, filt0, filt1, filt2;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= src_stride_x2;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3,
+            src10_r, src32_r, src21_r, src43_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src7, src8, src9, src10);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
+              src76_r, src87_r, src98_r, src109_r);
+    out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    src10_r = src76_r;
+    src32_r = src98_r;
+    src21_r = src87_r;
+    src43_r = src109_r;
+    src4 = src10;
+  }
+}
+
+static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+  __m128i src65_l, src87_l, filt0, filt1, filt2;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= src_stride_x2;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vldx(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
+            src10_r, src32_r, src43_r, src21_r);
+  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
+            src10_l, src32_l, src43_l, src21_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_r, src65_r, src76_r, src87_r);
+    DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_l, src65_l, src76_l, src87_l);
+    out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+              tmp1, tmp2, tmp3);
+    __lsx_vstx(tmp0, dst, 0);
+    __lsx_vstx(tmp1, dst, dst_stride);
+    __lsx_vstx(tmp2, dst, dst_stride_x2);
+    __lsx_vstx(tmp3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src4 = src8;
+  }
+}
+
+static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    src6 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+
+    src7 = __lsx_vld(src, 0);
+    src8 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
+    hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_hz2;
+  __m128i mask0, mask1, mask2, vec0, vec1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= (2 + src_stride_x2);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vldx(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2,
+            hz_out1, hz_out4, hz_out3, out0, out1, out3, out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out7 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out6 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out7;
+    out3 = out5;
+    out4 = out6;
+  }
+}
+
+static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out0, out1);
+
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
+static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  src += src_stride_x4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out0, out1);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i filt0, filt1, mask0, mask1;
+  __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src2, src4, src6);
+    src += 8;
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src1, src3, src5, src7);
+    src += src_stride_x4 - 8;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+              src5, src6, src7);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                               filt1, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6,
+              VP8_FILTER_SHIFT, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0,
+              out1, out2, out3);
+    __lsx_vstx(out0, dst, 0);
+    __lsx_vstx(out1, dst, dst_stride);
+    __lsx_vstx(out2, dst, dst_stride_x2);
+    __lsx_vstx(out3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+  __m128i src2110, src4332, filt0, filt1, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src1 = __lsx_vld(src, 0);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+  src2110 = __lsx_vilvl_d(src21_r, src10_r);
+  src2110 = __lsx_vxori_b(src2110, 128);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src += src_stride_x3;
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = __lsx_vilvl_d(src43_r, src32_r);
+    src4332 = __lsx_vxori_b(src4332, 128);
+    out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1);
+
+    src2 = __lsx_vld(src, 0);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r);
+    src2110 = __lsx_vilvl_d(src65_r, src54_r);
+    src2110 = __lsx_vxori_b(src2110, 128);
+    out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1);
+    out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+    out0 = __lsx_vxori_b(out0, 128);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+  }
+}
+
+static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src7, src8, src9, src10;
+  __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= src_stride;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src7, src8, src9, src10);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
+              src72_r, src87_r, src98_r, src109_r);
+    out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1);
+    out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1);
+    out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1);
+    out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    src10_r = src98_r;
+    src21_r = src109_r;
+    src2 = src10;
+  }
+}
+
+static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+  __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= src_stride;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5,
+              src32_r, src43_r, src54_r, src65_r);
+    DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5,
+              src32_l, src43_l, src54_l, src65_l);
+    out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1);
+    out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1);
+    out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1);
+    out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1);
+    out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1);
+    out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1);
+    out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1);
+    out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+              tmp1, tmp2, tmp3);
+    __lsx_vstx(tmp0, dst, 0);
+    __lsx_vstx(tmp1, dst, dst_stride);
+    __lsx_vstx(tmp2, dst, dst_stride_x2);
+    __lsx_vstx(tmp3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+
+    src10_r = src54_r;
+    src21_r = src65_r;
+    src10_l = src54_l;
+    src21_l = src65_l;
+    src2 = src6;
+  }
+}
+
+static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
+    hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
+static inline void common_hv_4ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, out0, out1;
+  __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i vec0, vec1, vec2, vec3, vec4;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1 + src_stride;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
+    tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    vec0 = vec4;
+    vec2 = vec1;
+  }
+}
+
+static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 1);
+    dst += dst_stride;
+
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
+static inline void common_hv_6ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= (2 + src_stride);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
+    tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
+static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride,
+            src, src_stride_x2, src0, src1, src3, src4);
+  src2 = __lsx_vld(src, 0);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    src += src_stride_x4;
+
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+static inline void common_hv_4ht_6vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, mask0, mask1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec0, vec1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1 + src_stride_x2;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+    out6 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+    out7 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out6;
+    out3 = out5;
+    out4 = out7;
+  }
+}
+
+static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+typedef void (*PVp8SixtapPredictFunc1)(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height);
+
+typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src,
+                                       int32_t src_stride,
+                                       uint8_t *RESTRICT dst,
+                                       int32_t dst_stride, const int8_t *filter,
+                                       int32_t height);
+
+void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = {
+    common_hv_6ht_6vt_4w_lsx,
+    common_hv_6ht_4vt_4w_lsx,
+    common_hv_4ht_6vt_4w_lsx,
+    common_hv_4ht_4vt_4w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx,
+                                                        common_vt_4t_4w_lsx,
+                                                        common_hz_6t_4w_lsx,
+                                                        common_hz_4t_4w_lsx };
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter, 4);
+                break;
+              case 1:
+                Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter + 1, 4);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict4x4Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 4);
+                break;
+
+              case 1:
+                Predict4x4Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 4);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4);
+            break;
+
+          case 1:
+            Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
+                                4);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset) {
+        case 0: {
+          __m128i tp0;
+
+          tp0 = __lsx_vldrepl_w(src, 0);
+          src += src_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          tp0 = __lsx_vldrepl_w(src, 0);
+          src += src_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          tp0 = __lsx_vldrepl_w(src, 0);
+          src += src_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          tp0 = __lsx_vldrepl_w(src, 0);
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+
+          break;
+        }
+        case 2:
+        case 4:
+        case 6:
+          Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4);
+          break;
+      }
+      switch (xoffset & 1) {
+        case 1:
+          Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              4);
+          break;
+      }
+    }
+  }
+}
+
+void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = {
+    common_hv_6ht_6vt_8w_lsx,
+    common_hv_6ht_4vt_8w_lsx,
+    common_hv_4ht_6vt_8w_lsx,
+    common_hv_4ht_4vt_8w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx,
+                                                        common_vt_4t_8w_lsx,
+                                                        common_hz_6t_8w_lsx,
+                                                        common_hz_4t_8w_lsx };
+
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter, 8);
+                break;
+
+              case 1:
+                Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter + 1, 8);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict8x8Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 8);
+                break;
+
+              case 1:
+                Predict8x8Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 8);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8);
+            break;
+
+          case 1:
+            Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
+                                8);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset & 1) {
+        case 1:
+          Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              8);
+          break;
+      }
+      switch (xoffset) {
+        case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
+        case 2:
+        case 4:
+        case 6:
+          Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8);
+          break;
+      }
+    }
+  }
+}
+
+void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = {
+    common_hv_6ht_6vt_16w_lsx,
+    common_hv_6ht_4vt_16w_lsx,
+    common_hv_4ht_6vt_16w_lsx,
+    common_hv_4ht_4vt_16w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = {
+    common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx,
+    common_hz_4t_16w_lsx
+  };
+
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict16x16Funcs1[0](src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter, 16);
+                break;
+
+              case 1:
+                Predict16x16Funcs1[1](src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter + 1, 16);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict16x16Funcs1[2](src, src_stride, dst, dst_stride,
+                                      h_filter + 1, v_filter, 16);
+                break;
+
+              case 1:
+                Predict16x16Funcs1[3](src, src_stride, dst, dst_stride,
+                                      h_filter + 1, v_filter + 1, 16);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter,
+                                  16);
+            break;
+
+          case 1:
+            Predict16x16Funcs2[1](src, src_stride, dst, dst_stride,
+                                  v_filter + 1, 16);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset & 1) {
+        case 1:
+          Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                                16);
+          break;
+      }
+      switch (xoffset) {
+        case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
+        case 2:
+        case 4:
+        case 6:
+          Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16);
+          break;
+      }
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/loopfilter.h b/media/libvpx/libvpx/vp8/common/loopfilter.h
index 7484563e06..909e8df512 100644
--- a/media/libvpx/libvpx/vp8/common/loopfilter.h
+++ b/media/libvpx/libvpx/vp8/common/loopfilter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_LOOPFILTER_H_
-#define VP8_COMMON_LOOPFILTER_H_
+#ifndef VPX_VP8_COMMON_LOOPFILTER_H_
+#define VPX_VP8_COMMON_LOOPFILTER_H_
 
 #include "vpx_ports/mem.h"
 #include "vpx_config.h"
@@ -26,7 +26,7 @@ extern "C" {
 
 typedef enum { NORMAL_LOOPFILTER = 0, SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE;
 
-#if ARCH_ARM
+#if VPX_ARCH_ARM
 #define SIMD_WIDTH 1
 #else
 #define SIMD_WIDTH 16
@@ -93,11 +93,9 @@ void vp8_loop_filter_row_normal(struct VP8Common *cm,
 
 void vp8_loop_filter_row_simple(struct VP8Common *cm,
                                 struct modeinfo *mode_info_context, int mb_row,
-                                int post_ystride, int post_uvstride,
-                                unsigned char *y_ptr, unsigned char *u_ptr,
-                                unsigned char *v_ptr);
+                                int post_ystride, unsigned char *y_ptr);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_LOOPFILTER_H_
+#endif  // VPX_VP8_COMMON_LOOPFILTER_H_
diff --git a/media/libvpx/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c
index 2a7cde8788..61a55d3c92 100644
--- a/media/libvpx/libvpx/vp8/common/loopfilter_filters.c
+++ b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c
@@ -86,10 +86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0,
   u = vp8_signed_char_clamp(ps1 + filter_value);
   *op1 = u ^ 0x80;
 }
-void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh, int count) {
+
+static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh,
+                                          int count) {
   int hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
   } while (++i < count * 8);
 }
 
-void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh, int count) {
+static void loop_filter_vertical_edge_c(unsigned char *s, int p,
+                                        const unsigned char *blimit,
+                                        const unsigned char *limit,
+                                        const unsigned char *thresh,
+                                        int count) {
   int hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0,
   *op2 = s ^ 0x80;
 }
 
-void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
-                                         const unsigned char *blimit,
-                                         const unsigned char *limit,
-                                         const unsigned char *thresh,
-                                         int count) {
+static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
+                                            const unsigned char *blimit,
+                                            const unsigned char *limit,
+                                            const unsigned char *thresh,
+                                            int count) {
   signed char hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
   } while (++i < count * 8);
 }
 
-void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p,
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh, int count) {
+static void mbloop_filter_vertical_edge_c(unsigned char *s, int p,
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh,
+                                          int count) {
   signed char hev = 0; /* high edge variance */
   signed char mask = 0;
   int i = 0;
@@ -266,28 +270,32 @@ static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0,
   *op0 = u ^ 0x80;
 }
 
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *s, int p,
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr,
+                                              int y_stride,
                                               const unsigned char *blimit) {
   signed char mask = 0;
   int i = 0;
 
   do {
-    mask = vp8_simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], s[0 * p],
-                                  s[1 * p]);
-    vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
-    ++s;
+    mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2 * y_stride],
+                                  y_ptr[-1 * y_stride], y_ptr[0 * y_stride],
+                                  y_ptr[1 * y_stride]);
+    vp8_simple_filter(mask, y_ptr - 2 * y_stride, y_ptr - 1 * y_stride, y_ptr,
+                      y_ptr + 1 * y_stride);
+    ++y_ptr;
   } while (++i < 16);
 }
 
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p,
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride,
                                             const unsigned char *blimit) {
   signed char mask = 0;
   int i = 0;
 
   do {
-    mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
+    mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2], y_ptr[-1], y_ptr[0],
+                                  y_ptr[1]);
+    vp8_simple_filter(mask, y_ptr - 2, y_ptr - 1, y_ptr, y_ptr + 1);
+    y_ptr += y_stride;
   } while (++i < 16);
 }
 
@@ -295,17 +303,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p,
 void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
-  vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 2);
+  mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                        lfi->hev_thr, 1);
+    mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                        lfi->hev_thr, 1);
+    mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr, 1);
   }
 }
 
@@ -313,17 +321,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
 void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
                            unsigned char *v_ptr, int y_stride, int uv_stride,
                            loop_filter_info *lfi) {
-  vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, 2);
+  mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1);
+    mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1);
+    mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                  lfi->hev_thr, 1);
   }
 }
 
@@ -331,21 +339,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
                           unsigned char *v_ptr, int y_stride, int uv_stride,
                           loop_filter_info *lfi) {
-  vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
-                                    lfi->lim, lfi->hev_thr, 2);
-  vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
-                                    lfi->lim, lfi->hev_thr, 2);
-  vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
-                                    lfi->lim, lfi->hev_thr, 2);
+  loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+                                lfi->lim, lfi->hev_thr, 2);
+  loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+                                lfi->lim, lfi->hev_thr, 2);
+  loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
+                                lfi->lim, lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+    loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+                                  lfi->lim, lfi->hev_thr, 1);
   }
 }
 
@@ -363,21 +371,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
 void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
                           unsigned char *v_ptr, int y_stride, int uv_stride,
                           loop_filter_info *lfi) {
-  vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
-                                  lfi->hev_thr, 2);
-  vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
-                                  lfi->hev_thr, 2);
-  vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
-                                  lfi->hev_thr, 2);
+  loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+                              lfi->hev_thr, 2);
+  loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+                              lfi->hev_thr, 2);
+  loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+                              lfi->hev_thr, 2);
 
   if (u_ptr) {
-    vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
-                                    lfi->hev_thr, 1);
+    loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 1);
   }
 
   if (v_ptr) {
-    vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
-                                    lfi->hev_thr, 1);
+    loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                lfi->hev_thr, 1);
   }
 }
 
diff --git a/media/libvpx/libvpx/vp8/common/mfqe.c b/media/libvpx/libvpx/vp8/common/mfqe.c
index 5aace8c99d..3d801e72e2 100644
--- a/media/libvpx/libvpx/vp8/common/mfqe.c
+++ b/media/libvpx/libvpx/vp8/common/mfqe.c
@@ -18,6 +18,7 @@
 
 #include "./vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vp8/common/common.h"
 #include "vp8/common/postproc.h"
 #include "vpx_dsp/variance.h"
 #include "vpx_mem/vpx_mem.h"
@@ -74,8 +75,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride,
                             src_weight);
     vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride,
                             src_weight);
-  } else /* if (block_size == 8) */
-  {
+  } else {
     vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride,
                             src_weight);
     vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride,
@@ -136,8 +136,7 @@ static void multiframe_quality_enhance_block(
     usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
     vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6;
 #endif
-  } else /* if (blksize == 8) */
-  {
+  } else {
     actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6;
     act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6;
 #ifdef USE_SSD
@@ -186,14 +185,12 @@ static void multiframe_quality_enhance_block(
       apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
                     uvd_stride, blksize, ifactor);
     }
-  } else /* else implicitly copy from previous frame */
-  {
+  } else { /* else implicitly copy from previous frame */
     if (blksize == 16) {
       vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
       vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
       vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
-    } else /* if (blksize == 8) */
-    {
+    } else {
       vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
       for (up = u, udp = ud, i = 0; i < uvblksize;
            ++i, up += uv_stride, udp += uvd_stride) {
@@ -215,6 +212,7 @@ static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) {
       { 0, 1, 4, 5 }, { 2, 3, 6, 7 }, { 8, 9, 12, 13 }, { 10, 11, 14, 15 }
     };
     int i, j;
+    vp8_zero_array(map, 4);
     for (i = 0; i < 4; ++i) {
       map[i] = 1;
       for (j = 0; j < 4 && map[j]; ++j) {
@@ -237,7 +235,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) {
 
   FRAME_TYPE frame_type = cm->frame_type;
   /* Point at base of Mb MODE_INFO list has motion vectors etc */
-  const MODE_INFO *mode_info_context = cm->show_frame_mi;
+  const MODE_INFO *mode_info_context = cm->mi;
   int mb_row;
   int mb_col;
   int totmap, map[4];
@@ -297,8 +295,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) {
               }
             }
           }
-        } else /* totmap = 4 */
-        {
+        } else { /* totmap = 4 */
           multiframe_quality_enhance_block(
               16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride,
               show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride,
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
index 2de343419a..b9da52084d 100644
--- a/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
 
         : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
+          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1)
         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
-          [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2),
+          [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2),
           [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
           [src_ptr] "r"(src_ptr));
 
@@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
 
         : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
           [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
+          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4)
         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
-          [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1),
+          [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1),
           [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
           [vector3b] "r"(vector3b));
 
@@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
 
         : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-          [Temp4] "=r"(Temp4)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
-          [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
+          [Temp4] "=r"(Temp4), [tp1] "+r"(tp1)
+        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4),
+          [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
           [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
           [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
 
@@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
 
         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
-        : [src_pixels_per_line] "r"(src_pixels_per_line),
-          [output_ptr] "r"(output_ptr));
+        : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+                                                              output_ptr));
 
     __asm__ __volatile__(
         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
@@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
 
         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
-        : [src_pixels_per_line] "r"(src_pixels_per_line),
-          [output_ptr] "r"(output_ptr));
+        : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+                                                              output_ptr));
 
     __asm__ __volatile__(
         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
@@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
 
         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
-        : [src_pixels_per_line] "r"(src_pixels_per_line),
-          [output_ptr] "r"(output_ptr));
+        : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+                                                              output_ptr));
 
     output_ptr += 48;
   }
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
index 899dc10ad9..eae852d592 100644
--- a/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -35,41 +35,41 @@ void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst,
 }
 
 void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq,
-                                         unsigned char *dstu,
-                                         unsigned char *dstv, int stride,
+                                         unsigned char *dst_u,
+                                         unsigned char *dst_v, int stride,
                                          char *eobs) {
   int i, j;
 
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < 2; ++j) {
       if (*eobs++ > 1)
-        vp8_dequant_idct_add_dspr2(q, dq, dstu, stride);
+        vp8_dequant_idct_add_dspr2(q, dq, dst_u, stride);
       else {
-        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstu, stride, dstu, stride);
+        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_u, stride, dst_u, stride);
         ((int *)q)[0] = 0;
       }
 
       q += 16;
-      dstu += 4;
+      dst_u += 4;
     }
 
-    dstu += 4 * stride - 8;
+    dst_u += 4 * stride - 8;
   }
 
   for (i = 0; i < 2; ++i) {
     for (j = 0; j < 2; ++j) {
       if (*eobs++ > 1)
-        vp8_dequant_idct_add_dspr2(q, dq, dstv, stride);
+        vp8_dequant_idct_add_dspr2(q, dq, dst_v, stride);
       else {
-        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstv, stride, dstv, stride);
+        vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_v, stride, dst_v, stride);
         ((int *)q)[0] = 0;
       }
 
       q += 16;
-      dstv += 4;
+      dst_v += 4;
     }
 
-    dstv += 4 * stride - 8;
+    dst_v += 4 * stride - 8;
   }
 }
 
diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
index d2c3442515..21446fb413 100644
--- a/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
+++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
@@ -934,8 +934,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
   s4 = s3 + p;
 
   /* load quad-byte vectors
-  * memory is 4 byte aligned
-  */
+   * memory is 4 byte aligned
+   */
   p2 = *((uint32_t *)(s1 - 4));
   p6 = *((uint32_t *)(s1));
   p1 = *((uint32_t *)(s2 - 4));
@@ -990,8 +990,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
       :);
 
   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
-  * mask will be zero and filtering is not needed
-  */
+   * mask will be zero and filtering is not needed
+   */
   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                              thresh, &hev, &mask);
@@ -2102,8 +2102,8 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
   s4 = s3 + p;
 
   /* load quad-byte vectors
-  * memory is 4 byte aligned
-  */
+   * memory is 4 byte aligned
+   */
   p2 = *((uint32_t *)(s1 - 4));
   p6 = *((uint32_t *)(s1));
   p1 = *((uint32_t *)(s2 - 4));
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c
new file mode 100644
index 0000000000..86a32aa9ef
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c
@@ -0,0 +1,114 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+#define COPY_MEM_16X2 \
+  "gsldlc1    %[ftmp0],   0x07(%[src])                    \n\t" \
+  "gsldrc1    %[ftmp0],   0x00(%[src])                    \n\t" \
+  "ldl        %[tmp0],    0x0f(%[src])                    \n\t" \
+  "ldr        %[tmp0],    0x08(%[src])                    \n\t" \
+  MMI_ADDU(%[src],     %[src],         %[src_stride])           \
+  "gssdlc1    %[ftmp0],   0x07(%[dst])                    \n\t" \
+  "gssdrc1    %[ftmp0],   0x00(%[dst])                    \n\t" \
+  "sdl        %[tmp0],    0x0f(%[dst])                    \n\t" \
+  "sdr        %[tmp0],    0x08(%[dst])                    \n\t" \
+  MMI_ADDU(%[dst],      %[dst],        %[dst_stride])           \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                    \n\t" \
+  "ldl        %[tmp1],    0x0f(%[src])                    \n\t" \
+  "ldr        %[tmp1],    0x08(%[src])                    \n\t" \
+  MMI_ADDU(%[src],     %[src],         %[src_stride])           \
+  "gssdlc1    %[ftmp1],   0x07(%[dst])                    \n\t" \
+  "gssdrc1    %[ftmp1],   0x00(%[dst])                    \n\t" \
+  "sdl        %[tmp1],    0x0f(%[dst])                    \n\t" \
+  "sdr        %[tmp1],    0x08(%[dst])                    \n\t" \
+  MMI_ADDU(%[dst],     %[dst],         %[dst_stride])
+
+#define COPY_MEM_8X2 \
+  "gsldlc1    %[ftmp0],   0x07(%[src])                    \n\t" \
+  "gsldrc1    %[ftmp0],   0x00(%[src])                    \n\t" \
+  MMI_ADDU(%[src],     %[src],         %[src_stride])           \
+  "ldl        %[tmp0],    0x07(%[src])                    \n\t" \
+  "ldr        %[tmp0],    0x00(%[src])                    \n\t" \
+  MMI_ADDU(%[src],     %[src],         %[src_stride])           \
+                                                                \
+  "gssdlc1    %[ftmp0],   0x07(%[dst])                    \n\t" \
+  "gssdrc1    %[ftmp0],   0x00(%[dst])                    \n\t" \
+  MMI_ADDU(%[dst],      %[dst],        %[dst_stride])           \
+  "sdl        %[tmp0],    0x07(%[dst])                    \n\t" \
+  "sdr        %[tmp0],    0x00(%[dst])                    \n\t" \
+  MMI_ADDU(%[dst],     %[dst],         %[dst_stride])
+
+void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride,
+                           unsigned char *dst, int dst_stride) {
+  double ftmp[2];
+  uint64_t tmp[2];
+  uint8_t loop_count = 4;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "1:                                                     \n\t"
+    COPY_MEM_16X2
+    COPY_MEM_16X2
+    MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
+    "bnez       %[loop_count],    1b                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+      [loop_count]"+&r"(loop_count),
+      [dst]"+&r"(dst),                  [src]"+&r"(src)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst,
+                         int dst_stride) {
+  double ftmp[2];
+  uint64_t tmp[1];
+  uint8_t loop_count = 4;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "1:                                                     \n\t"
+    COPY_MEM_8X2
+    MMI_ADDIU(%[loop_count], %[loop_count], -0x01)
+    "bnez       %[loop_count],    1b                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [tmp0]"=&r"(tmp[0]),              [loop_count]"+&r"(loop_count),
+      [dst]"+&r"(dst),                  [src]"+&r"(src)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst,
+                         int dst_stride) {
+  double ftmp[2];
+  uint64_t tmp[1];
+
+  /* clang-format off */
+  __asm__ volatile (
+    COPY_MEM_8X2
+    COPY_MEM_8X2
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [tmp0]"=&r"(tmp[0]),
+      [dst]"+&r"(dst),                  [src]"+&r"(src)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+  /* clang-format on */
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
new file mode 100644
index 0000000000..b9330a6663
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) {
+  double ftmp[8];
+
+  __asm__ volatile(
+      "gsldlc1    %[ftmp0],   0x07(%[qcoeff])                 \n\t"
+      "gsldrc1    %[ftmp0],   0x00(%[qcoeff])                 \n\t"
+      "gsldlc1    %[ftmp1],   0x0f(%[qcoeff])                 \n\t"
+      "gsldrc1    %[ftmp1],   0x08(%[qcoeff])                 \n\t"
+      "gsldlc1    %[ftmp2],   0x17(%[qcoeff])                 \n\t"
+      "gsldrc1    %[ftmp2],   0x10(%[qcoeff])                 \n\t"
+      "gsldlc1    %[ftmp3],   0x1f(%[qcoeff])                 \n\t"
+      "gsldrc1    %[ftmp3],   0x18(%[qcoeff])                 \n\t"
+
+      "gsldlc1    %[ftmp4],   0x07(%[DQC])                    \n\t"
+      "gsldrc1    %[ftmp4],   0x00(%[DQC])                    \n\t"
+      "gsldlc1    %[ftmp5],   0x0f(%[DQC])                    \n\t"
+      "gsldrc1    %[ftmp5],   0x08(%[DQC])                    \n\t"
+      "gsldlc1    %[ftmp6],   0x17(%[DQC])                    \n\t"
+      "gsldrc1    %[ftmp6],   0x10(%[DQC])                    \n\t"
+      "gsldlc1    %[ftmp7],   0x1f(%[DQC])                    \n\t"
+      "gsldrc1    %[ftmp7],   0x18(%[DQC])                    \n\t"
+
+      "pmullh     %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
+      "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp5]        \n\t"
+      "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]        \n\t"
+      "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp7]        \n\t"
+
+      "gssdlc1    %[ftmp0],   0x07(%[dqcoeff])                \n\t"
+      "gssdrc1    %[ftmp0],   0x00(%[dqcoeff])                \n\t"
+      "gssdlc1    %[ftmp1],   0x0f(%[dqcoeff])                \n\t"
+      "gssdrc1    %[ftmp1],   0x08(%[dqcoeff])                \n\t"
+      "gssdlc1    %[ftmp2],   0x17(%[dqcoeff])                \n\t"
+      "gssdrc1    %[ftmp2],   0x10(%[dqcoeff])                \n\t"
+      "gssdlc1    %[ftmp3],   0x1f(%[dqcoeff])                \n\t"
+      "gssdrc1    %[ftmp3],   0x18(%[dqcoeff])                \n\t"
+      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
+      : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC)
+      : "memory");
+}
+
+void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest,
+                              int stride) {
+  double ftmp[8];
+
+  __asm__ volatile(
+      "gsldlc1    %[ftmp0],   0x07(%[dq])                     \n\t"
+      "gsldrc1    %[ftmp0],   0x00(%[dq])                     \n\t"
+      "gsldlc1    %[ftmp1],   0x0f(%[dq])                     \n\t"
+      "gsldrc1    %[ftmp1],   0x08(%[dq])                     \n\t"
+      "gsldlc1    %[ftmp2],   0x17(%[dq])                     \n\t"
+      "gsldrc1    %[ftmp2],   0x10(%[dq])                     \n\t"
+      "gsldlc1    %[ftmp3],   0x1f(%[dq])                     \n\t"
+      "gsldrc1    %[ftmp3],   0x18(%[dq])                     \n\t"
+
+      "gsldlc1    %[ftmp4],   0x07(%[input])                  \n\t"
+      "gsldrc1    %[ftmp4],   0x00(%[input])                  \n\t"
+      "gsldlc1    %[ftmp5],   0x0f(%[input])                  \n\t"
+      "gsldrc1    %[ftmp5],   0x08(%[input])                  \n\t"
+      "gsldlc1    %[ftmp6],   0x17(%[input])                  \n\t"
+      "gsldrc1    %[ftmp6],   0x10(%[input])                  \n\t"
+      "gsldlc1    %[ftmp7],   0x1f(%[input])                  \n\t"
+      "gsldrc1    %[ftmp7],   0x18(%[input])                  \n\t"
+
+      "pmullh     %[ftmp0],   %[ftmp0],       %[ftmp4]        \n\t"
+      "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp5]        \n\t"
+      "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp6]        \n\t"
+      "pmullh     %[ftmp3],   %[ftmp3],       %[ftmp7]        \n\t"
+
+      "gssdlc1    %[ftmp0],   0x07(%[input])                  \n\t"
+      "gssdrc1    %[ftmp0],   0x00(%[input])                  \n\t"
+      "gssdlc1    %[ftmp1],   0x0f(%[input])                  \n\t"
+      "gssdrc1    %[ftmp1],   0x08(%[input])                  \n\t"
+      "gssdlc1    %[ftmp2],   0x17(%[input])                  \n\t"
+      "gssdrc1    %[ftmp2],   0x10(%[input])                  \n\t"
+      "gssdlc1    %[ftmp3],   0x1f(%[input])                  \n\t"
+      "gssdrc1    %[ftmp3],   0x18(%[input])                  \n\t"
+      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7])
+      : [dq] "r"(dq), [input] "r"(input)
+      : "memory");
+
+  vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride);
+
+  __asm__ volatile(
+      "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "gssdlc1    %[ftmp0],   0x07(%[input])                  \n\t"
+      "gssdrc1    %[ftmp0],   0x00(%[input])                  \n\t"
+      "sdl        $0,         0x0f(%[input])                  \n\t"
+      "sdr        $0,         0x08(%[input])                  \n\t"
+      "gssdlc1    %[ftmp0],   0x17(%[input])                  \n\t"
+      "gssdrc1    %[ftmp0],   0x10(%[input])                  \n\t"
+      "sdl        $0,         0x1f(%[input])                  \n\t"
+      "sdr        $0,         0x18(%[input])                  \n\t"
+      : [ftmp0] "=&f"(ftmp[0])
+      : [input] "r"(input)
+      : "memory");
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
new file mode 100644
index 0000000000..4fd6854c52
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_mmi(q, dq, dst, stride);
+      } else {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+      }
+
+      q += 16;
+      dst += 4;
+    }
+
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int stride, char *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_mmi(q, dq, dst_u, stride);
+      } else {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+      }
+
+      q += 16;
+      dst_u += 4;
+    }
+
+    dst_u += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_mmi(q, dq, dst_v, stride);
+      } else {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+      }
+
+      q += 16;
+      dst_v += 4;
+    }
+
+    dst_v += 4 * stride - 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
new file mode 100644
index 0000000000..a35689dd30
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c
@@ -0,0 +1,335 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+#define TRANSPOSE_4H \
+  "pxor          %[ftmp0],    %[ftmp0],    %[ftmp0]          \n\t" \
+  MMI_LI(%[tmp0], 0x93)                                            \
+  "mtc1          %[tmp0],     %[ftmp10]                      \n\t" \
+  "punpcklhw     %[ftmp5],    %[ftmp1],    %[ftmp0]          \n\t" \
+  "punpcklhw     %[ftmp9],    %[ftmp2],    %[ftmp0]          \n\t" \
+  "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
+  "por           %[ftmp5],    %[ftmp5],    %[ftmp9]          \n\t" \
+  "punpckhhw     %[ftmp6],    %[ftmp1],    %[ftmp0]          \n\t" \
+  "punpckhhw     %[ftmp9],    %[ftmp2],    %[ftmp0]          \n\t" \
+  "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
+  "por           %[ftmp6],    %[ftmp6],    %[ftmp9]          \n\t" \
+  "punpcklhw     %[ftmp7],    %[ftmp3],    %[ftmp0]          \n\t" \
+  "punpcklhw     %[ftmp9],    %[ftmp4],    %[ftmp0]          \n\t" \
+  "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
+  "por           %[ftmp7],    %[ftmp7],    %[ftmp9]          \n\t" \
+  "punpckhhw     %[ftmp8],    %[ftmp3],    %[ftmp0]          \n\t" \
+  "punpckhhw     %[ftmp9],    %[ftmp4],    %[ftmp0]          \n\t" \
+  "pshufh        %[ftmp9],    %[ftmp9],    %[ftmp10]         \n\t" \
+  "por           %[ftmp8],    %[ftmp8],    %[ftmp9]          \n\t" \
+  "punpcklwd     %[ftmp1],    %[ftmp5],    %[ftmp7]          \n\t" \
+  "punpckhwd     %[ftmp2],    %[ftmp5],    %[ftmp7]          \n\t" \
+  "punpcklwd     %[ftmp3],    %[ftmp6],    %[ftmp8]          \n\t" \
+  "punpckhwd     %[ftmp4],    %[ftmp6],    %[ftmp8]          \n\t"
+
+void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr,
+                              int pred_stride, unsigned char *dst_ptr,
+                              int dst_stride) {
+  double ftmp[12];
+  uint64_t tmp[1];
+  double ff_ph_04, ff_ph_4e7b, ff_ph_22a3;
+
+  __asm__ volatile (
+    "dli        %[tmp0],    0x0004000400040004                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_04]                         \n\t"
+    "dli        %[tmp0],    0x4e7b4e7b4e7b4e7b                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_4e7b]                       \n\t"
+    "dli        %[tmp0],    0x22a322a322a322a3                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_22a3]                       \n\t"
+    MMI_LI(%[tmp0], 0x02)
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+
+    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
+    "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp2],   0x08(%[ip])                         \n\t"
+    "gsldlc1    %[ftmp3],   0x17(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp3],   0x10(%[ip])                         \n\t"
+    "gsldlc1    %[ftmp4],   0x1f(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp4],   0x18(%[ip])                         \n\t"
+
+    // ip[0...3] + ip[8...11]
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
+    // ip[0...3] - ip[8...11]
+    "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
+    // (ip[12...15] * sinpi8sqrt2) >> 16
+    "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
+    "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+    // (ip[ 4... 7] * sinpi8sqrt2) >> 16
+    "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
+    "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+    // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16)
+    "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
+    "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
+    // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16)
+    "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
+    "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"
+
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
+    "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
+    "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
+    "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
+    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
+    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
+    "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
+
+    TRANSPOSE_4H
+    // a
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
+    // b
+    "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
+    // c
+    "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
+    "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+    "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
+    "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
+    "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
+    // d
+    "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
+    "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
+    "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
+    "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
+    "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"
+
+    MMI_LI(%[tmp0], 0x03)
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    // a + d
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],       %[ff_ph_04]         \n\t"
+    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+    // b + c
+    "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
+    "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_04]         \n\t"
+    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+    // b - c
+    "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
+    "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_04]         \n\t"
+    "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+    // a - d
+    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
+    "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_04]         \n\t"
+    "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+    TRANSPOSE_4H
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
+    "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+#else
+    "gslwlc1    %[ftmp5],   0x03(%[pred_ptr])                   \n\t"
+    "gslwrc1    %[ftmp5],   0x00(%[pred_ptr])                   \n\t"
+#endif
+    "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+    "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                    \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                    \n\t"
+    MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+    MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
+    "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
+#else
+    "gslwlc1    %[ftmp6],   0x03(%[pred_ptr])                   \n\t"
+    "gslwrc1    %[ftmp6],   0x00(%[pred_ptr])                   \n\t"
+#endif
+    "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+    "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+    "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+    "gsswlc1    %[ftmp2],   0x03(%[dst_ptr])                    \n\t"
+    "gsswrc1    %[ftmp2],   0x00(%[dst_ptr])                    \n\t"
+    MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+    MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
+    "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+#else
+    "gslwlc1    %[ftmp7],   0x03(%[pred_ptr])                   \n\t"
+    "gslwrc1    %[ftmp7],   0x00(%[pred_ptr])                   \n\t"
+#endif
+    "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
+    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+    "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
+    "gsswlc1    %[ftmp3],   0x03(%[dst_ptr])                    \n\t"
+    "gsswrc1    %[ftmp3],   0x00(%[dst_ptr])                    \n\t"
+    MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+    MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+
+#if _MIPS_SIM == _ABIO32
+    "ulw        %[tmp0],    0x00(%[pred_prt])                   \n\t"
+    "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
+#else
+    "gslwlc1    %[ftmp8],   0x03(%[pred_ptr])                   \n\t"
+    "gslwrc1    %[ftmp8],   0x00(%[pred_ptr])                   \n\t"
+#endif
+    "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"
+    "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
+    "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
+    "gsswlc1    %[ftmp4],   0x03(%[dst_ptr])                    \n\t"
+    "gsswrc1    %[ftmp4],   0x00(%[dst_ptr])                    \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+      [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+      [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
+      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]),
+      [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr),
+      [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04),
+      [ff_ph_22a3]"=&f"(ff_ph_22a3)
+    : [ip]"r"(input),
+      [pred_stride]"r"((mips_reg)pred_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+}
+
+void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr,
+                              int pred_stride, unsigned char *dst_ptr,
+                              int dst_stride) {
+  int a0 = ((input_dc + 4) >> 3);
+  double a1, ftmp[5];
+  int low32;
+
+  __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "dmtc1      %[a0],      %[a1]                           \n\t"
+    "pshufh     %[a1],      %[a1],          %[ftmp0]        \n\t"
+    "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
+    "mtc1       %[low32],   %[ftmp1]                        \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
+    "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
+    "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
+
+    MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+    MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+    "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
+    "mtc1       %[low32],   %[ftmp1]                        \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
+    "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
+    "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
+
+    MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+    MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+    "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
+    "mtc1       %[low32],   %[ftmp1]                        \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
+    "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
+    "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
+
+    MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride])
+    MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride])
+    "ulw        %[low32],   0x00(%[pred_ptr])               \n\t"
+    "mtc1       %[low32],   %[ftmp1]                        \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]        \n\t"
+    "paddsh     %[ftmp2],   %[ftmp2],       %[a1]           \n\t"
+    "packushb   %[ftmp1],   %[ftmp2],       %[ftmp0]        \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[dst_ptr])                \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[dst_ptr])                \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+      [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32),
+      [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1)
+    : [dst_stride]"r"((mips_reg)dst_stride),
+      [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0)
+    : "memory"
+  );
+}
+
+void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) {
+  int i;
+  int16_t output[16];
+  double ff_ph_03, ftmp[12];
+  uint64_t tmp[1];
+
+  __asm__ volatile (
+    "dli        %[tmp0],    0x0003000300030003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_03]                         \n\t"
+    MMI_LI(%[tmp0], 0x03)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
+    "gsldlc1    %[ftmp2],   0x0f(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp2],   0x08(%[ip])                         \n\t"
+    "gsldlc1    %[ftmp3],   0x17(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp3],   0x10(%[ip])                         \n\t"
+    "gsldlc1    %[ftmp4],   0x1f(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp4],   0x18(%[ip])                         \n\t"
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp2]            \n\t"
+    "psubh      %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
+    "paddh      %[ftmp7],   %[ftmp3],       %[ftmp4]            \n\t"
+    "psubh      %[ftmp8],   %[ftmp3],       %[ftmp4]            \n\t"
+
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+    "psubh      %[ftmp2],   %[ftmp5],       %[ftmp7]            \n\t"
+    "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
+    "paddh      %[ftmp4],   %[ftmp6],       %[ftmp8]            \n\t"
+
+    TRANSPOSE_4H
+    // a
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]            \n\t"
+    // d
+    "psubh      %[ftmp6],   %[ftmp1],       %[ftmp4]            \n\t"
+    // b
+    "paddh      %[ftmp7],   %[ftmp2],       %[ftmp3]            \n\t"
+    // c
+    "psubh      %[ftmp8],   %[ftmp2],       %[ftmp3]            \n\t"
+
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
+    "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
+    "psubh      %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
+    "psubh      %[ftmp4],   %[ftmp6],       %[ftmp8]            \n\t"
+
+    "paddh      %[ftmp1],   %[ftmp1],       %[ff_ph_03]         \n\t"
+    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+    "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_03]         \n\t"
+    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+    "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_03]         \n\t"
+    "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+    "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_03]         \n\t"
+    "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+    TRANSPOSE_4H
+    "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
+    "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
+    "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
+    "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
+    "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
+    "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
+    "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
+    : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]),
+      [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]),
+      [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]),
+      [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03)
+    : [ip]"r"(input), [op]"r"(output)
+    : "memory"
+  );
+
+  for (i = 0; i < 16; i++) {
+    mb_dqcoeff[i * 16] = output[i];
+  }
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
new file mode 100644
index 0000000000..a07a7e3b41
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c
@@ -0,0 +1,1415 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vp8_loop_filter_horizontal_edge_mmi(
+    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+    const unsigned char *limit, const unsigned char *thresh, int count) {
+  uint64_t tmp[1];
+  mips_reg addr[2];
+  double ftmp[12];
+  double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "1:                                                             \n\t"
+    "gsldlc1    %[ftmp10],  0x07(%[limit])                          \n\t"
+    "gsldrc1    %[ftmp10],  0x00(%[limit])                          \n\t"
+
+    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+    "gsldlc1    %[ftmp1],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+
+    MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+    "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+    "pasubub    %[ftmp0],   %[ftmp1],           %[ftmp3]            \n\t"
+    "psubusb    %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gsldlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
+    "pasubub    %[ftmp1],   %[ftmp3],           %[ftmp4]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
+    "pasubub    %[ftmp9],   %[ftmp4],           %[ftmp5]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp9],           %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+
+    "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
+    "pasubub    %[ftmp11],  %[ftmp7],           %[ftmp6]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp11],          %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gsldlc1    %[ftmp8],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[addr1])                          \n\t"
+    "pasubub    %[ftmp1],   %[ftmp8],           %[ftmp7]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+    "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+    "pasubub    %[ftmp1],   %[ftmp2],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    "pasubub    %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
+    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp10]           \n\t"
+    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "gsldlc1    %[ftmp10],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp10],  0x00(%[blimit])                         \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ftmp10]           \n\t"
+    "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
+
+    "gsldlc1    %[ftmp10],  0x07(%[thresh])                         \n\t"
+    "gsldrc1    %[ftmp10],  0x00(%[thresh])                         \n\t"
+    "psubusb    %[ftmp1],   %[ftmp9],           %[ftmp10]           \n\t"
+    "psubusb    %[ftmp2],   %[ftmp11],          %[ftmp10]           \n\t"
+    "paddb      %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
+    "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+
+    "psubsb     %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp3],   %[ftmp6],           %[ftmp5]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+
+    "paddsb     %[ftmp8],   %[ftmp2],           %[ff_pb_03]         \n\t"
+    "paddsb     %[ftmp9],   %[ftmp2],           %[ff_pb_04]         \n\t"
+
+    "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
+    "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp8]            \n\t"
+
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "psrah      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
+    "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
+    "packsshb   %[ftmp8],   %[ftmp0],           %[ftmp11]           \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+    "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
+    "psrah      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
+    "pxor       %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
+    "punpckhbh  %[ftmp9],   %[ftmp11],          %[ftmp9]            \n\t"
+    "psrah      %[ftmp9],   %[ftmp9],           %[ftmp10]           \n\t"
+    "paddsh     %[ftmp11],  %[ftmp0],           %[ff_ph_01]         \n\t"
+    "packsshb   %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
+    "paddsh     %[ftmp9],   %[ftmp9],           %[ff_ph_01]         \n\t"
+
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "psrah      %[ftmp11],  %[ftmp11],          %[ftmp10]           \n\t"
+    "psrah      %[ftmp9],   %[ftmp9],           %[ftmp10]           \n\t"
+    "packsshb   %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t"
+    "pandn      %[ftmp1],   %[ftmp1],           %[ftmp11]           \n\t"
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp8]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp5],   0x07(%[addr1])                          \n\t"
+    "gssdrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "gssdlc1    %[ftmp4],   0x07(%[addr1])                          \n\t"
+    "gssdrc1    %[ftmp4],   0x00(%[addr1])                          \n\t"
+
+    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "gssdlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+
+    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "gssdlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
+    "gssdrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
+
+    "addiu      %[count],   %[count],           -0x01               \n\t"
+    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+    "bnez       %[count],   1b                                      \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_fe]"=&f"(ff_pb_fe),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_03]"=&f"(ff_pb_03)
+    : [limit]"r"(limit),                [blimit]"r"(blimit),
+      [thresh]"r"(thresh),
+      [src_pixel_step]"r"((mips_reg)src_pixel_step),
+      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2))
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr,
+                                       int src_pixel_step,
+                                       const unsigned char *blimit,
+                                       const unsigned char *limit,
+                                       const unsigned char *thresh, int count) {
+  uint64_t tmp[1];
+  mips_reg addr[2];
+  double ftmp[13];
+  double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x0001000100010001                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+    "1:                                                             \n\t"
+    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+    MMI_SLL (%[tmp0], %[src_pixel_step], 0x01)
+    MMI_ADDU(%[addr1], %[src_ptr], %[tmp0])
+    "gsldlc1    %[ftmp11],  0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp11],  0x00(%[addr1])                          \n\t"
+    MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+    "gsldlc1    %[ftmp12],  0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[addr1])                          \n\t"
+    "punpcklbh  %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpckhbh  %[ftmp2],   %[ftmp11],          %[ftmp12]           \n\t"
+
+    "gsldlc1    %[ftmp11],  0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp11],  0x00(%[src_ptr])                        \n\t"
+    "gsldlc1    %[ftmp12],  0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[addr0])                          \n\t"
+    "punpcklbh  %[ftmp3],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpckhbh  %[ftmp4],   %[ftmp11],          %[ftmp12]           \n\t"
+
+    "punpcklhw  %[ftmp5],   %[ftmp4],           %[ftmp2]            \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp4],           %[ftmp2]            \n\t"
+    "punpcklhw  %[ftmp7],   %[ftmp3],           %[ftmp1]            \n\t"
+    "punpckhhw  %[ftmp8],   %[ftmp3],           %[ftmp1]            \n\t"
+
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+    "gsldlc1    %[ftmp11],  0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp11],  0x00(%[addr1])                          \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp12],  0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[addr1])                          \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp11],          %[ftmp12]           \n\t"
+
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+    "gsldlc1    %[ftmp11],  0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp11],  0x00(%[addr1])                          \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+    "gsldlc1    %[ftmp12],  0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[addr1])                          \n\t"
+    "punpcklbh  %[ftmp0],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp12]           \n\t"
+
+    "punpcklhw  %[ftmp1],   %[ftmp11],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp2],   %[ftmp11],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp3],   %[ftmp0],           %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp0],           %[ftmp9]            \n\t"
+
+    /* ftmp9:q0  ftmp10:q1 */
+    "punpcklwd  %[ftmp9],   %[ftmp1],           %[ftmp5]            \n\t"
+    "punpckhwd  %[ftmp10],  %[ftmp1],           %[ftmp5]            \n\t"
+    /* ftmp11:q2  ftmp12:q3 */
+    "punpcklwd  %[ftmp11],  %[ftmp2],           %[ftmp6]            \n\t"
+    "punpckhwd  %[ftmp12],  %[ftmp2],           %[ftmp6]            \n\t"
+    /* ftmp1:p3  ftmp2:p2 */
+    "punpcklwd  %[ftmp1],   %[ftmp3],           %[ftmp7]            \n\t"
+    "punpckhwd  %[ftmp2],   %[ftmp3],           %[ftmp7]            \n\t"
+    /* ftmp5:p1  ftmp6:p0 */
+    "punpcklwd  %[ftmp5],   %[ftmp4],           %[ftmp8]            \n\t"
+    "punpckhwd  %[ftmp6],   %[ftmp4],           %[ftmp8]            \n\t"
+
+    "gsldlc1    %[ftmp8],   0x07(%[limit])                          \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[limit])                          \n\t"
+
+    /* abs (q3-q2) */
+    "pasubub    %[ftmp7],   %[ftmp12],          %[ftmp11]           \n\t"
+    "psubusb    %[ftmp0],   %[ftmp7],           %[ftmp8]            \n\t"
+    /* abs (q2-q1) */
+    "pasubub    %[ftmp7],   %[ftmp11],          %[ftmp10]           \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* ftmp3: abs(q1-q0) */
+    "pasubub    %[ftmp3],   %[ftmp10],          %[ftmp9]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* ftmp4: abs(p1-p0) */
+    "pasubub    %[ftmp4],   %[ftmp5],           %[ftmp6]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* abs (p2-p1) */
+    "pasubub    %[ftmp7],   %[ftmp2],           %[ftmp5]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* abs (p3-p2) */
+    "pasubub    %[ftmp7],   %[ftmp1],           %[ftmp2]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+
+    "gsldlc1    %[ftmp8],   0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[blimit])                         \n\t"
+
+    /* abs (p0-q0) */
+    "pasubub    %[ftmp11],  %[ftmp9],           %[ftmp6]            \n\t"
+    "paddusb    %[ftmp11],  %[ftmp11],          %[ftmp11]           \n\t"
+    /* abs (p1-q1) */
+    "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
+    "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp1]                                \n\t"
+    "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp1]            \n\t"
+    "paddusb    %[ftmp1],   %[ftmp11],          %[ftmp12]           \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    /* ftmp0:mask */
+    "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    "gsldlc1    %[ftmp8],   0x07(%[thresh])                         \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[thresh])                         \n\t"
+
+    /* ftmp3: abs(q1-q0)  ftmp4: abs(p1-p0) */
+    "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp8]            \n\t"
+    "por        %[ftmp2],   %[ftmp4],           %[ftmp3]            \n\t"
+    "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+    "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    /* ftmp1:hev */
+    "pxor       %[ftmp1],   %[ftmp2],           %[ftmp1]            \n\t"
+
+    "pxor       %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+
+    "psubsb     %[ftmp2],   %[ftmp5],           %[ftmp10]           \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp3],   %[ftmp9],           %[ftmp6]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp3]            \n\t"
+    /* ftmp2:filter_value */
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+
+    "paddsb     %[ftmp11],  %[ftmp2],           %[ff_pb_04]         \n\t"
+    "paddsb     %[ftmp12],  %[ftmp2],           %[ff_pb_03]         \n\t"
+
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
+    "pxor      %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor      %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
+    "punpcklbh %[ftmp0],    %[ftmp0],           %[ftmp12]           \n\t"
+    "punpckhbh %[ftmp8],    %[ftmp8],           %[ftmp12]           \n\t"
+    "psrah     %[ftmp0],    %[ftmp0],           %[ftmp7]            \n\t"
+    "psrah     %[ftmp8],    %[ftmp8],           %[ftmp7]            \n\t"
+    "packsshb  %[ftmp12],   %[ftmp0],           %[ftmp8]            \n\t"
+
+    "pxor      %[ftmp0],    %[ftmp0],           %[ftmp0]            \n\t"
+    "pxor      %[ftmp8],    %[ftmp8],           %[ftmp8]            \n\t"
+    "punpcklbh %[ftmp0],    %[ftmp0],           %[ftmp11]           \n\t"
+    "punpckhbh %[ftmp8],    %[ftmp8],           %[ftmp11]           \n\t"
+    "psrah     %[ftmp0],    %[ftmp0],           %[ftmp7]            \n\t"
+    "psrah     %[ftmp8],    %[ftmp8],           %[ftmp7]            \n\t"
+    "packsshb  %[ftmp11],   %[ftmp0],           %[ftmp8]            \n\t"
+
+    "psubsb     %[ftmp9],   %[ftmp9],           %[ftmp11]           \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp12]           \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "paddsh     %[ftmp0],   %[ftmp0],           %[ff_ph_01]         \n\t"
+    "paddsh     %[ftmp8],   %[ftmp8],           %[ff_ph_01]         \n\t"
+
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp7]                                \n\t"
+    "psrah      %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "psrah      %[ftmp8],   %[ftmp8],           %[ftmp7]            \n\t"
+    "packsshb   %[ftmp2],   %[ftmp0],           %[ftmp8]            \n\t"
+    "pandn      %[ftmp2],   %[ftmp1],           %[ftmp2]            \n\t"
+    "psubsb     %[ftmp10],  %[ftmp10],          %[ftmp2]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+
+    /* ftmp5: *op1 ; ftmp6: *op0 */
+    "punpcklbh  %[ftmp2],   %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
+    /* ftmp9: *oq0 ; ftmp10: *oq1 */
+    "punpcklbh  %[ftmp4],   %[ftmp9],           %[ftmp10]           \n\t"
+    "punpckhbh  %[ftmp3],   %[ftmp9],           %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp2],           %[ftmp4]            \n\t"
+    "punpcklhw  %[ftmp2],   %[ftmp2],           %[ftmp4]            \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp1],           %[ftmp3]            \n\t"
+    "punpcklhw  %[ftmp1],   %[ftmp1],           %[ftmp3]            \n\t"
+
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+    "gsswlc1    %[ftmp2],   0x05(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp2],   0x02(%[addr1])                          \n\t"
+
+    "li         %[tmp0],    0x20                                    \n\t"
+    "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+    "ssrld      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[addr1], %[addr0], %[tmp0])
+    "gsswlc1    %[ftmp2],   0x05(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp2],   0x02(%[addr1])                          \n\t"
+
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x01)
+    MMI_SUBU(%[addr1], %[src_ptr], %[tmp0])
+    "gsswlc1    %[ftmp6],   0x05(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp6],   0x02(%[addr1])                          \n\t"
+
+    "ssrld      %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gsswlc1    %[ftmp6],   0x05(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp6],   0x02(%[addr1])                          \n\t"
+    "gsswlc1    %[ftmp1],   0x05(%[src_ptr])                        \n\t"
+    "gsswrc1    %[ftmp1],   0x02(%[src_ptr])                        \n\t"
+
+    "ssrld      %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "gsswlc1    %[ftmp1],   0x05(%[addr0])                          \n\t"
+    "gsswrc1    %[ftmp1],   0x02(%[addr0])                          \n\t"
+    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step])
+    "gsswlc1    %[ftmp5],   0x05(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp5],   0x02(%[addr1])                          \n\t"
+
+    "ssrld      %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
+    MMI_ADDU(%[addr1], %[addr0], %[tmp0])
+    "gsswlc1    %[ftmp5],   0x05(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp5],   0x02(%[addr1])                          \n\t"
+
+    MMI_ADDIU(%[count], %[count], -0x01)
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+    "bnez       %[count],   1b                                      \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_ph_01]"=&f"(ff_ph_01),        [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_fe]"=&f"(ff_pb_fe)
+    : [limit]"r"(limit),                [blimit]"r"(blimit),
+      [thresh]"r"(thresh),
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+/* clang-format off */
+#define VP8_MBLOOP_HPSRAB                                               \
+  "punpcklbh  %[ftmp10],  %[ftmp10],          %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp11],  %[ftmp11],          %[ftmp0]            \n\t" \
+  "psrah      %[ftmp10],  %[ftmp10],          %[ftmp9]            \n\t" \
+  "psrah      %[ftmp11],  %[ftmp11],          %[ftmp9]            \n\t" \
+  "packsshb   %[ftmp0],   %[ftmp10],          %[ftmp11]            \n\t"
+
+#define VP8_MBLOOP_HPSRAB_ADD(reg)                                      \
+  "punpcklbh  %[ftmp1],   %[ftmp0],           %[ftmp12]           \n\t" \
+  "punpckhbh  %[ftmp2],   %[ftmp0],           %[ftmp12]           \n\t" \
+  "pmulhh     %[ftmp1],   %[ftmp1],         " #reg "              \n\t" \
+  "pmulhh     %[ftmp2],   %[ftmp2],         " #reg "              \n\t" \
+  "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_003f]       \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],           %[ff_ph_003f]       \n\t" \
+  "psrah      %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t" \
+  "psrah      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t" \
+  "packsshb   %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+/* clang-format on */
+
+void vp8_mbloop_filter_horizontal_edge_mmi(
+    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+    const unsigned char *limit, const unsigned char *thresh, int count) {
+  uint64_t tmp[1];
+  double ftmp[13];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900,
+      ff_ph_1200, ff_ph_1b00;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0x1200120012001200                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1200]                           \n\t"
+    "dli        %[tmp0],    0x1b001b001b001b00                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_1b00]                           \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    "1:                                                             \n\t"
+    "gsldlc1    %[ftmp9],   0x07(%[limit])                          \n\t"
+    "gsldrc1    %[ftmp9],   0x00(%[limit])                          \n\t"
+    /* ftmp1: p3 */
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp3: p2 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp4: p1 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp4],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp5: p0 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp6: q0 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp7: q1 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp8: q2 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+    /* ftmp2: q3 */
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp2],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[src_ptr])                        \n\t"
+
+    "gsldlc1    %[ftmp12],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp12],  0x00(%[blimit])                         \n\t"
+
+    "pasubub    %[ftmp0],   %[ftmp1],           %[ftmp3]            \n\t"
+    "psubusb    %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
+    "pasubub    %[ftmp1],   %[ftmp3],           %[ftmp4]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pasubub    %[ftmp10],  %[ftmp4],           %[ftmp5]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pasubub    %[ftmp11],  %[ftmp7],           %[ftmp6]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp11],          %[ftmp9]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pasubub    %[ftmp1],   %[ftmp8],           %[ftmp7]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pasubub    %[ftmp1],   %[ftmp2],           %[ftmp8]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp9]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    "pasubub    %[ftmp1],   %[ftmp5],           %[ftmp6]            \n\t"
+    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    "pasubub    %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "psrlh      %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "psubusb    %[ftmp1],   %[ftmp1],           %[ftmp12]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ftmp9]            \n\t"
+    /* ftmp0: mask */
+    "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
+
+    "gsldlc1    %[ftmp9],   0x07(%[thresh])                         \n\t"
+    "gsldrc1    %[ftmp9],   0x00(%[thresh])                         \n\t"
+    "psubusb    %[ftmp1],   %[ftmp10],          %[ftmp9]            \n\t"
+    "psubusb    %[ftmp2],   %[ftmp11],          %[ftmp9]            \n\t"
+    "paddb      %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
+    "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+    "pcmpeqb    %[ftmp2],   %[ftmp2],           %[ftmp2]            \n\t"
+    /* ftmp1: hev */
+    "pxor       %[ftmp1],   %[ftmp1],           %[ftmp2]            \n\t"
+
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "psubsb     %[ftmp2],   %[ftmp4],           %[ftmp7]            \n\t"
+    "psubsb     %[ftmp9],   %[ftmp6],           %[ftmp5]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp9]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "pandn      %[ftmp12],  %[ftmp1],           %[ftmp2]            \n\t"
+    "pand       %[ftmp2],   %[ftmp2],           %[ftmp1]            \n\t"
+
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_03]         \n\t"
+    VP8_MBLOOP_HPSRAB
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
+    "paddsb     %[ftmp0],   %[ftmp2],           %[ff_pb_04]         \n\t"
+    VP8_MBLOOP_HPSRAB
+    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
+
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]            \n\t"
+
+    VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00])
+    "psubsb     %[ftmp6],   %[ftmp6],           %[ftmp1]            \n\t"
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x02)
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    "gssdlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+
+    VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200])
+    "paddsb     %[ftmp4],   %[ftmp4],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp4],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp4],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
+
+    VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900])
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp1]            \n\t"
+    "psubsb     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],           %[ff_pb_80]         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+    "gssdlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0])
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
+
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+    "addiu      %[count],   %[count],           -0x01               \n\t"
+    "bnez       %[count],   1b                                      \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),            [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),            [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),          [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),          [ff_pb_03]"=&f"(ff_pb_03),
+      [ff_ph_0900]"=&f"(ff_ph_0900),      [ff_ph_1b00]"=&f"(ff_ph_1b00),
+      [ff_ph_1200]"=&f"(ff_ph_1200),      [ff_ph_003f]"=&f"(ff_ph_003f)
+    : [limit]"r"(limit),                  [blimit]"r"(blimit),
+      [thresh]"r"(thresh),
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+/* clang-format off */
+#define VP8_MBLOOP_VPSRAB_ADDH                                          \
+  "pxor       %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t" \
+  "pxor       %[ftmp8],   %[ftmp8],           %[ftmp8]            \n\t" \
+  "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp0]            \n\t"
+
+#define VP8_MBLOOP_VPSRAB_ADDT                                          \
+  "paddh      %[ftmp7],   %[ftmp7],           %[ff_ph_003f]       \n\t" \
+  "paddh      %[ftmp8],   %[ftmp8],           %[ff_ph_003f]       \n\t" \
+  "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t" \
+  "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t" \
+  "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
+/* clang-format on */
+
+void vp8_mbloop_filter_vertical_edge_mmi(
+    unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit,
+    const unsigned char *limit, const unsigned char *thresh, int count) {
+  mips_reg tmp[1];
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+  double ftmp[14];
+  double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0x003f003f003f003f                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_003f]                           \n\t"
+    "dli        %[tmp0],    0x0900090009000900                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_0900]                           \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0303030303030303                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_03]                             \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], 0x04)
+
+    "1:                                                             \n\t"
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklbh  %[ftmp11],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp12],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp8]            \n\t"
+
+    "punpcklhw  %[ftmp1],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp2],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp3],   %[ftmp11],          %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp11],          %[ftmp9]            \n\t"
+
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp5],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp7],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp8],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp8],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklbh  %[ftmp11],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp12],  %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp8]            \n\t"
+    "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp8]            \n\t"
+
+    "punpcklhw  %[ftmp5],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp12],          %[ftmp10]           \n\t"
+    "punpcklhw  %[ftmp7],   %[ftmp11],          %[ftmp9]            \n\t"
+    "punpckhhw  %[ftmp8],   %[ftmp11],          %[ftmp9]            \n\t"
+
+    "gsldlc1    %[ftmp13],  0x07(%[limit])                          \n\t"
+    "gsldrc1    %[ftmp13],  0x00(%[limit])                          \n\t"
+    /* ftmp9:q0  ftmp10:q1 */
+    "punpcklwd  %[ftmp9],   %[ftmp1],           %[ftmp5]            \n\t"
+    "punpckhwd  %[ftmp10],  %[ftmp1],           %[ftmp5]            \n\t"
+    /* ftmp11:q2  ftmp12:q3 */
+    "punpcklwd  %[ftmp11],  %[ftmp2],           %[ftmp6]            \n\t"
+    "punpckhwd  %[ftmp12],  %[ftmp2],           %[ftmp6]            \n\t"
+    /* srct[0x00]: q3 */
+    "sdc1       %[ftmp12],  0x00(%[srct])                           \n\t"
+    /* ftmp1:p3  ftmp2:p2 */
+    "punpcklwd  %[ftmp1],   %[ftmp3],           %[ftmp7]            \n\t"
+    "punpckhwd  %[ftmp2],   %[ftmp3],           %[ftmp7]            \n\t"
+    /* srct[0x08]: p3 */
+    "sdc1       %[ftmp1],   0x08(%[srct])                           \n\t"
+    /* ftmp5:p1  ftmp6:p0 */
+    "punpcklwd  %[ftmp5],   %[ftmp4],           %[ftmp8]            \n\t"
+    "punpckhwd  %[ftmp6],   %[ftmp4],           %[ftmp8]            \n\t"
+
+    /* abs (q3-q2) */
+    "pasubub    %[ftmp7],   %[ftmp12],          %[ftmp11]           \n\t"
+    "psubusb    %[ftmp0],   %[ftmp7],           %[ftmp13]           \n\t"
+    /* abs (q2-q1) */
+    "pasubub    %[ftmp7],   %[ftmp11],          %[ftmp10]           \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* ftmp3: abs(q1-q0) */
+    "pasubub    %[ftmp3],   %[ftmp10],          %[ftmp9]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp3],           %[ftmp13]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* ftmp4: abs(p1-p0) */
+    "pasubub    %[ftmp4],   %[ftmp5],           %[ftmp6]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp4],           %[ftmp13]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* abs (p2-p1) */
+    "pasubub    %[ftmp7],   %[ftmp2],           %[ftmp5]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    /* abs (p3-p2) */
+    "pasubub    %[ftmp7],   %[ftmp1],           %[ftmp2]            \n\t"
+    "psubusb    %[ftmp7],   %[ftmp7],           %[ftmp13]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+
+    "gsldlc1    %[ftmp13],  0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp13],  0x00(%[blimit])                         \n\t"
+    "gsldlc1    %[ftmp7],   0x07(%[thresh])                         \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[thresh])                         \n\t"
+    /* abs (p0-q0) * 2 */
+    "pasubub    %[ftmp1],   %[ftmp9],           %[ftmp6]            \n\t"
+    "paddusb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    /* abs (p1-q1) / 2 */
+    "pasubub    %[ftmp12],  %[ftmp10],          %[ftmp5]            \n\t"
+    "pand       %[ftmp12],  %[ftmp12],          %[ff_pb_fe]         \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "psrlh      %[ftmp12],  %[ftmp12],          %[ftmp8]            \n\t"
+    "paddusb    %[ftmp12],  %[ftmp1],           %[ftmp12]           \n\t"
+    "psubusb    %[ftmp12],  %[ftmp12],          %[ftmp13]           \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],          %[ftmp12]           \n\t"
+    /* ftmp0: mask */
+    "pcmpeqb    %[ftmp0],   %[ftmp0],           %[ftmp12]           \n\t"
+
+    /* abs(p1-p0) - thresh */
+    "psubusb    %[ftmp4],   %[ftmp4],           %[ftmp7]            \n\t"
+    /* abs(q1-q0) - thresh */
+    "psubusb    %[ftmp3],   %[ftmp3],           %[ftmp7]            \n\t"
+    "por        %[ftmp3],   %[ftmp4],           %[ftmp3]            \n\t"
+    "pcmpeqb    %[ftmp3],   %[ftmp3],           %[ftmp12]           \n\t"
+    "pcmpeqb    %[ftmp1],   %[ftmp1],           %[ftmp1]            \n\t"
+    /* ftmp1: hev */
+    "pxor       %[ftmp1],   %[ftmp3],           %[ftmp1]            \n\t"
+
+    /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */
+    "pxor       %[ftmp11],  %[ftmp11],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],          %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp5],   %[ftmp5],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp2],   %[ftmp2],           %[ff_pb_80]         \n\t"
+
+    "psubsb     %[ftmp3],   %[ftmp5],           %[ftmp10]           \n\t"
+    "psubsb     %[ftmp4],   %[ftmp9],           %[ftmp6]            \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
+    /* filter_value &= mask */
+    "pand       %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
+    /* Filter2 = filter_value & hev */
+    "pand       %[ftmp3],   %[ftmp1],           %[ftmp0]            \n\t"
+    /* filter_value &= ~hev */
+    "pandn      %[ftmp0],   %[ftmp1],           %[ftmp0]            \n\t"
+
+    "paddsb     %[ftmp4],   %[ftmp3],           %[ff_pb_04]         \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
+    "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp4]            \n\t"
+    "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp4]            \n\t"
+    "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
+    "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t"
+    "packsshb   %[ftmp4],   %[ftmp7],           %[ftmp8]            \n\t"
+    /* ftmp9: qs0 */
+    "psubsb     %[ftmp9],   %[ftmp9],           %[ftmp4]            \n\t"
+    "paddsb     %[ftmp3],   %[ftmp3],           %[ff_pb_03]         \n\t"
+    "punpcklbh  %[ftmp7],   %[ftmp7],           %[ftmp3]            \n\t"
+    "punpckhbh  %[ftmp8],   %[ftmp8],           %[ftmp3]            \n\t"
+    "psrah      %[ftmp7],   %[ftmp7],           %[ftmp12]           \n\t"
+    "psrah      %[ftmp8],   %[ftmp8],           %[ftmp12]           \n\t"
+    "packsshb   %[ftmp3],   %[ftmp7],           %[ftmp8]            \n\t"
+    /* ftmp6: ps0 */
+    "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp3]            \n\t"
+
+    "dli        %[tmp0],    0x07                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp12]                               \n\t"
+    VP8_MBLOOP_VPSRAB_ADDH
+    "paddh      %[ftmp1],   %[ff_ph_0900],      %[ff_ph_0900]       \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],           %[ff_ph_0900]       \n\t"
+    "pmulhh     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
+    "pmulhh     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
+    VP8_MBLOOP_VPSRAB_ADDT
+    "psubsb     %[ftmp4],   %[ftmp9],           %[ftmp3]            \n\t"
+    /* ftmp9: oq0 */
+    "pxor       %[ftmp9],   %[ftmp4],           %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp4],   %[ftmp6],           %[ftmp3]            \n\t"
+    /* ftmp6: op0 */
+    "pxor       %[ftmp6],   %[ftmp4],           %[ff_pb_80]         \n\t"
+
+    VP8_MBLOOP_VPSRAB_ADDH
+    "paddh      %[ftmp1],   %[ff_ph_0900],      %[ff_ph_0900]       \n\t"
+    "pmulhh     %[ftmp7],   %[ftmp7],           %[ftmp1]            \n\t"
+    "pmulhh     %[ftmp8],   %[ftmp8],           %[ftmp1]            \n\t"
+    VP8_MBLOOP_VPSRAB_ADDT
+    "psubsb     %[ftmp4],   %[ftmp10],          %[ftmp3]            \n\t"
+    /* ftmp10: oq1 */
+    "pxor       %[ftmp10],   %[ftmp4],          %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp4],   %[ftmp5],           %[ftmp3]            \n\t"
+    /* ftmp5: op1 */
+    "pxor       %[ftmp5],   %[ftmp4],           %[ff_pb_80]         \n\t"
+
+    VP8_MBLOOP_VPSRAB_ADDH
+    "pmulhh     %[ftmp7],   %[ftmp7],           %[ff_ph_0900]       \n\t"
+    "pmulhh     %[ftmp8],   %[ftmp8],           %[ff_ph_0900]       \n\t"
+    VP8_MBLOOP_VPSRAB_ADDT
+    "psubsb     %[ftmp4],   %[ftmp11],          %[ftmp3]            \n\t"
+    /* ftmp11: oq2 */
+    "pxor       %[ftmp11],  %[ftmp4],           %[ff_pb_80]         \n\t"
+    "paddsb     %[ftmp4],   %[ftmp2],           %[ftmp3]            \n\t"
+    /* ftmp2: op2 */
+    "pxor       %[ftmp2],   %[ftmp4],           %[ff_pb_80]         \n\t"
+
+    "ldc1       %[ftmp12],  0x00(%[srct])                           \n\t"
+    "ldc1       %[ftmp8],   0x08(%[srct])                           \n\t"
+
+    "punpcklbh  %[ftmp0],   %[ftmp8],           %[ftmp2]            \n\t"
+    "punpckhbh  %[ftmp1],   %[ftmp8],           %[ftmp2]            \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp5],           %[ftmp6]            \n\t"
+    "punpckhbh  %[ftmp3],   %[ftmp5],           %[ftmp6]            \n\t"
+    "punpcklhw  %[ftmp4],   %[ftmp0],           %[ftmp2]            \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp0],           %[ftmp2]            \n\t"
+    "punpcklhw  %[ftmp6],   %[ftmp1],           %[ftmp3]            \n\t"
+    "punpckhhw  %[ftmp7],   %[ftmp1],           %[ftmp3]            \n\t"
+
+    "punpcklbh  %[ftmp0],   %[ftmp9],           %[ftmp10]           \n\t"
+    "punpckhbh  %[ftmp1],   %[ftmp9],           %[ftmp10]           \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpckhbh  %[ftmp3],   %[ftmp11],          %[ftmp12]           \n\t"
+    "punpcklhw  %[ftmp8],   %[ftmp0],           %[ftmp2]            \n\t"
+    "punpckhhw  %[ftmp9],   %[ftmp0],           %[ftmp2]            \n\t"
+    "punpcklhw  %[ftmp10],  %[ftmp1],           %[ftmp3]            \n\t"
+    "punpckhhw  %[ftmp11],  %[ftmp1],           %[ftmp3]            \n\t"
+
+    "punpcklwd  %[ftmp0],   %[ftmp7],           %[ftmp11]           \n\t"
+    "punpckhwd  %[ftmp1],   %[ftmp7],           %[ftmp11]           \n\t"
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklwd  %[ftmp0],   %[ftmp6],           %[ftmp10]           \n\t"
+    "punpckhwd  %[ftmp1],   %[ftmp6],           %[ftmp10]           \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklwd  %[ftmp1],   %[ftmp5],           %[ftmp9]            \n\t"
+    "punpckhwd  %[ftmp0],   %[ftmp5],           %[ftmp9]            \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklwd  %[ftmp1],   %[ftmp4],           %[ftmp8]            \n\t"
+    "punpckhwd  %[ftmp0],   %[ftmp4],           %[ftmp8]            \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp1],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+    "addiu      %[count],   %[count],           -0x01               \n\t"
+
+    MMI_SLL(%[tmp0], %[src_pixel_step], 0x03)
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0])
+    "bnez       %[count],   1b                                      \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),            [ftmp13]"=&f"(ftmp[13]),
+      [tmp0]"=&r"(tmp[0]),                [src_ptr]"+&r"(src_ptr),
+      [count]"+&r"(count),
+      [ff_ph_003f]"=&f"(ff_ph_003f),    [ff_ph_0900]"=&f"(ff_ph_0900),
+      [ff_pb_03]"=&f"(ff_pb_03),        [ff_pb_04]"=&f"(ff_pb_04),
+      [ff_pb_80]"=&f"(ff_pb_80),        [ff_pb_fe]"=&f"(ff_pb_fe)
+    : [limit]"r"(limit),                [blimit]"r"(blimit),
+      [srct]"r"(srct),                  [thresh]"r"(thresh),
+      [src_pixel_step]"r"((mips_reg)src_pixel_step)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+/* clang-format off */
+#define VP8_SIMPLE_HPSRAB                                               \
+  "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t" \
+  "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t" \
+  "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t" \
+  "psrah      %[ftmp1],   %[ftmp5],           %[ftmp10]           \n\t" \
+  "psllh      %[ftmp1],   %[ftmp1],           %[ftmp8]            \n\t" \
+  "por        %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+/* clang-format on */
+
+void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr,
+                                                int src_pixel_step,
+                                                const unsigned char *blimit) {
+  uint64_t tmp[1], count = 2;
+  mips_reg addr[2];
+  double ftmp[12];
+  double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
+
+    "1:                                                             \n\t"
+    "gsldlc1    %[ftmp3],   0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[blimit])                         \n\t"
+
+    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+    "gsldlc1    %[ftmp7],   0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[addr0])                          \n\t"
+    "pasubub    %[ftmp1],   %[ftmp7],           %[ftmp2]            \n\t"
+    "pand       %[ftmp1],   %[ftmp1],           %[ff_pb_fe]         \n\t"
+    "psrlh      %[ftmp1],   %[ftmp1],           %[ftmp11]           \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gsldlc1    %[ftmp6],   0x07(%[addr1])                          \n\t"
+    "gsldrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
+    "gsldlc1    %[ftmp0],   0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp0],   0x00(%[src_ptr])                        \n\t"
+    "pasubub    %[ftmp5],   %[ftmp6],           %[ftmp0]            \n\t"
+    "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp5]            \n\t"
+    "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp1]            \n\t"
+    "psubusb    %[ftmp5],   %[ftmp5],           %[ftmp3]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ftmp3]            \n\t"
+    "pcmpeqb    %[ftmp5],   %[ftmp5],           %[ftmp3]            \n\t"
+
+    "pxor       %[ftmp2],   %[ftmp2],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ff_pb_80]         \n\t"
+    "psubsb     %[ftmp2],   %[ftmp2],           %[ftmp7]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp0],           %[ff_pb_80]         \n\t"
+    "psubsb     %[ftmp0],   %[ftmp3],           %[ftmp6]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "paddsb     %[ftmp2],   %[ftmp2],           %[ftmp0]            \n\t"
+    "pand       %[ftmp5],   %[ftmp5],           %[ftmp2]            \n\t"
+
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
+    VP8_SIMPLE_HPSRAB
+    "psubsb     %[ftmp3],   %[ftmp3],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "gssdlc1    %[ftmp3],   0x07(%[src_ptr])                        \n\t"
+    "gssdrc1    %[ftmp3],   0x00(%[src_ptr])                        \n\t"
+
+    "psubsb     %[ftmp5],   %[ftmp5],           %[ff_pb_01]         \n\t"
+    VP8_SIMPLE_HPSRAB
+    "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gssdlc1    %[ftmp6],   0x07(%[addr1])                          \n\t"
+    "gssdrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
+
+    "addiu      %[count],   %[count],           -0x01               \n\t"
+    MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08)
+    "bnez       %[count],   1b                                      \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
+    : [blimit]"r"(blimit),
+      [src_pixel_step]"r"((mips_reg)src_pixel_step),
+      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1))
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr,
+                                              int src_pixel_step,
+                                              const unsigned char *blimit) {
+  uint64_t tmp[1], count = 2;
+  mips_reg addr[2];
+  DECLARE_ALIGNED(8, const uint64_t, srct[2]);
+  double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0x08                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp8]                                \n\t"
+    "dli        %[tmp0],    0x20                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp10]                               \n\t"
+    "dli        %[tmp0],    0xfefefefefefefefe                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_fe]                             \n\t"
+    "dli        %[tmp0],    0x8080808080808080                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_80]                             \n\t"
+    "dli        %[tmp0],    0x0404040404040404                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_04]                             \n\t"
+    "dli        %[tmp0],    0x0101010101010101                      \n\t"
+    "dmtc1      %[tmp0],    %[ff_pb_01]                             \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4])
+    MMI_SUBU(%[src_ptr], %[src_ptr], 0x02)
+
+    "1:                                                             \n\t"
+    MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step])
+    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+    "gslwlc1    %[ftmp0],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+    MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gslwlc1    %[ftmp6],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
+    "punpcklbh  %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
+
+    MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gslwlc1    %[ftmp0],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+    "gslwlc1    %[ftmp4],   0x03(%[src_ptr])                        \n\t"
+    "gslwrc1    %[ftmp4],   0x00(%[src_ptr])                        \n\t"
+
+    "punpcklbh  %[ftmp4],   %[ftmp4],           %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp4],           %[ftmp6]            \n\t"
+    "punpcklhw  %[ftmp4],   %[ftmp4],           %[ftmp6]            \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gslwlc1    %[ftmp7],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp7],   0x00(%[addr1])                          \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gslwlc1    %[ftmp6],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
+    "punpcklbh  %[ftmp6],   %[ftmp6],           %[ftmp7]            \n\t"
+
+    MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+    "gslwlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+    "gslwlc1    %[ftmp0],   0x03(%[addr1])                          \n\t"
+    "gslwrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+    "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp1]            \n\t"
+
+    "punpckhhw  %[ftmp2],   %[ftmp0],           %[ftmp6]            \n\t"
+    "punpcklhw  %[ftmp0],   %[ftmp0],           %[ftmp6]            \n\t"
+    "punpckhwd  %[ftmp1],   %[ftmp0],           %[ftmp4]            \n\t"
+    "punpcklwd  %[ftmp0],   %[ftmp0],           %[ftmp4]            \n\t"
+    "punpckhwd  %[ftmp3],   %[ftmp2],           %[ftmp5]            \n\t"
+    "punpcklwd  %[ftmp2],   %[ftmp2],           %[ftmp5]            \n\t"
+
+    "dli        %[tmp0],    0x01                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "pasubub    %[ftmp6],   %[ftmp3],           %[ftmp0]            \n\t"
+    "pand       %[ftmp6],   %[ftmp6],           %[ff_pb_fe]         \n\t"
+    "psrlh      %[ftmp6],   %[ftmp6],           %[ftmp9]            \n\t"
+    "pasubub    %[ftmp5],   %[ftmp1],           %[ftmp2]            \n\t"
+    "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp5]            \n\t"
+    "paddusb    %[ftmp5],   %[ftmp5],           %[ftmp6]            \n\t"
+
+    "gsldlc1    %[ftmp7],   0x07(%[blimit])                         \n\t"
+    "gsldrc1    %[ftmp7],   0x00(%[blimit])                         \n\t"
+    "psubusb    %[ftmp5],   %[ftmp5],           %[ftmp7]            \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],           %[ftmp7]            \n\t"
+    "pcmpeqb    %[ftmp5],   %[ftmp5],           %[ftmp7]            \n\t"
+
+    "sdc1       %[ftmp0],   0x00(%[srct])                           \n\t"
+    "sdc1       %[ftmp3],   0x08(%[srct])                           \n\t"
+
+    "pxor       %[ftmp0],   %[ftmp0],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "psubsb     %[ftmp0],   %[ftmp0],           %[ftmp3]            \n\t"
+
+    "pxor       %[ftmp6],   %[ftmp1],           %[ff_pb_80]         \n\t"
+    "pxor       %[ftmp3],   %[ftmp2],           %[ff_pb_80]         \n\t"
+    "psubsb     %[ftmp7],   %[ftmp3],           %[ftmp6]            \n\t"
+    "paddsb     %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "paddsb     %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "paddsb     %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "pand       %[ftmp5],   %[ftmp5],           %[ftmp0]            \n\t"
+    "paddsb     %[ftmp5],   %[ftmp5],           %[ff_pb_04]         \n\t"
+
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
+    "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
+    "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
+
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "psrah      %[ftmp7],   %[ftmp5],           %[ftmp9]            \n\t"
+    "psllh      %[ftmp7],   %[ftmp7],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp7]            \n\t"
+    "psubsb     %[ftmp3],   %[ftmp3],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp3],   %[ftmp3],           %[ff_pb_80]         \n\t"
+    "psubsb     %[ftmp5],   %[ftmp5],           %[ff_pb_01]         \n\t"
+
+    "dli        %[tmp0],    0x03                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "psllh      %[ftmp0],   %[ftmp5],           %[ftmp8]            \n\t"
+    "psrah      %[ftmp0],   %[ftmp0],           %[ftmp9]            \n\t"
+    "psrlh      %[ftmp0],   %[ftmp0],           %[ftmp8]            \n\t"
+
+    "dli        %[tmp0],    0x0b                                    \n\t"
+    "dmtc1      %[tmp0],    %[ftmp9]                                \n\t"
+    "psrah      %[ftmp5],   %[ftmp5],           %[ftmp9]            \n\t"
+    "psllh      %[ftmp5],   %[ftmp5],           %[ftmp8]            \n\t"
+    "por        %[ftmp0],   %[ftmp0],           %[ftmp5]            \n\t"
+    "paddsb     %[ftmp6],   %[ftmp6],           %[ftmp0]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],           %[ff_pb_80]         \n\t"
+
+    "ldc1       %[ftmp0],   0x00(%[srct])                           \n\t"
+    "ldc1       %[ftmp4],   0x08(%[srct])                           \n\t"
+
+    "punpckhbh  %[ftmp1],   %[ftmp0],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp0],   %[ftmp0],           %[ftmp6]            \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp3],           %[ftmp4]            \n\t"
+    "punpckhbh  %[ftmp3],   %[ftmp3],           %[ftmp4]            \n\t"
+
+    "punpckhhw  %[ftmp6],   %[ftmp0],           %[ftmp2]            \n\t"
+    "punpcklhw  %[ftmp0],   %[ftmp0],           %[ftmp2]            \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4])
+    "gsswlc1    %[ftmp0],   0x03(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp1],           %[ftmp3]            \n\t"
+    "punpcklhw  %[ftmp1],   %[ftmp1],           %[ftmp3]            \n\t"
+
+    "ssrld      %[ftmp0],   %[ftmp0],           %[ftmp10]           \n\t"
+    MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4])
+    "gsswlc1    %[ftmp0],   0x03(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gsswlc1    %[ftmp6],   0x03(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
+
+    "ssrld      %[ftmp6],   %[ftmp6],           %[ftmp10]           \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[src_ptr])                        \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[src_ptr])                        \n\t"
+
+    MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step])
+    "gsswlc1    %[ftmp6],   0x03(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp6],   0x00(%[addr1])                          \n\t"
+
+    MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2])
+    "gsswlc1    %[ftmp5],   0x03(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
+
+    "ssrld      %[ftmp1],   %[ftmp1],           %[ftmp10]           \n\t"
+    "gsswlc1    %[ftmp1],   0x03(%[addr0])                          \n\t"
+    "gsswrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+
+    "ssrld      %[ftmp5],   %[ftmp5],           %[ftmp10]           \n\t"
+    MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2])
+    "gsswlc1    %[ftmp5],   0x03(%[addr1])                          \n\t"
+    "gsswrc1    %[ftmp5],   0x00(%[addr1])                          \n\t"
+
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8])
+    "addiu      %[count],   %[count],           -0x01               \n\t"
+    "bnez       %[count],   1b                                      \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),              [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),              [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),              [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),              [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),              [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),            [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+      [src_ptr]"+&r"(src_ptr),          [count]"+&r"(count),
+      [ff_pb_fe]"=&f"(ff_pb_fe),        [ff_pb_80]"=&f"(ff_pb_80),
+      [ff_pb_04]"=&f"(ff_pb_04),        [ff_pb_01]"=&f"(ff_pb_01)
+    : [blimit]"r"(blimit),              [srct]"r"(srct),
+      [src_pixel_step]"r"((mips_reg)src_pixel_step),
+      [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)),
+      [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)),
+      [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3))
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             loop_filter_info *lfi) {
+  vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                        lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim,
+                                          lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                             unsigned char *v_ptr, int y_stride, int uv_stride,
+                             loop_filter_info *lfi) {
+  vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                        lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+                                        lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                            unsigned char *v_ptr, int y_stride, int uv_stride,
+                            loop_filter_info *lfi) {
+  vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+                                      lfi->lim, lfi->hev_thr, 2);
+  vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+                                      lfi->lim, lfi->hev_thr, 2);
+  vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride,
+                                        lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride,
+                                        lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr,
+                            unsigned char *v_ptr, int y_stride, int uv_stride,
+                            loop_filter_info *lfi) {
+  vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+                                    lfi->hev_thr, 2);
+  vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+                                    lfi->hev_thr, 2);
+  vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+                                    lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                      lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+                                      lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride,
+                                             blimit);
+  vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride,
+                                             blimit);
+  vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride,
+                                             blimit);
+}
+
+void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit);
+  vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit);
+  vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit);
+}
diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
new file mode 100644
index 0000000000..b85f73fdff
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c
@@ -0,0 +1,427 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/filter.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = {
+  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+    0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
+    0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+  { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
+    0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
+    0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
+    0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
+    0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
+    0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 },
+  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
+    0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
+    0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
+    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+  { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003,
+    0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
+    0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
+    0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
+    0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
+    0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 },
+  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+    0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
+    0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
+    0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+  { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
+    0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
+    0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
+    0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
+    0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
+    0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 },
+  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
+    0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
+    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
+};
+
+/* Horizontal filter:  pixel_step is 1, output_height and output_width are
+   the size of horizontal filtering output, output_height is always H + 5 */
+static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
+                                             uint16_t *output_ptr,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const int16_t *vp8_filter) {
+  uint64_t tmp[1];
+  double ff_ph_40;
+#if _MIPS_SIM == _ABIO32
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f2");
+  register double ftmp1 asm("$f4");
+  register double ftmp2 asm("$f6");
+  register double ftmp3 asm("$f8");
+  register double ftmp4 asm("$f10");
+  register double ftmp5 asm("$f12");
+  register double ftmp6 asm("$f14");
+  register double ftmp7 asm("$f16");
+  register double ftmp8 asm("$f18");
+  register double ftmp9 asm("$f20");
+  register double ftmp10 asm("$f22");
+  register double ftmp11 asm("$f24");
+#else
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f1");
+  register double ftmp1 asm("$f2");
+  register double ftmp2 asm("$f3");
+  register double ftmp3 asm("$f4");
+  register double ftmp4 asm("$f5");
+  register double ftmp5 asm("$f6");
+  register double ftmp6 asm("$f7");
+  register double ftmp7 asm("$f8");
+  register double ftmp8 asm("$f9");
+  register double ftmp9 asm("$f10");
+  register double ftmp10 asm("$f11");
+  register double ftmp11 asm("$f12");
+#endif  // _MIPS_SIM == _ABIO32
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],        0x0040004000400040                    \n\t"
+    "dmtc1      %[tmp0],        %[ff_ph_40]                           \n\t"
+    "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t"
+    "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t"
+    "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t"
+    "ldc1       %[ftmp3],       0x30(%[vp8_filter])                   \n\t"
+    "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t"
+    "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t"
+    "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
+    "dli        %[tmp0],        0x07                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp7]                              \n\t"
+    "dli        %[tmp0],        0x08                                  \n\t"
+    "dmtc1      %[tmp0],        %[ftmp11]                             \n\t"
+
+    "1:                                                               \n\t"
+    "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
+    "gsldrc1    %[ftmp9],       -0x02(%[src_ptr])                     \n\t"
+    "gsldlc1    %[ftmp10],      0x06(%[src_ptr])                      \n\t"
+    "gsldrc1    %[ftmp10],      -0x01(%[src_ptr])                     \n\t"
+
+    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
+    "pmullh     %[ftmp8],       %[ftmp6],          %[ftmp0]           \n\t"
+
+    "punpckhbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
+    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp4]           \n\t"
+    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
+
+    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
+    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp1]           \n\t"
+    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
+
+    "punpckhbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
+    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp5]           \n\t"
+    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
+
+    "ssrld      %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
+    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
+    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp2]           \n\t"
+    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
+
+    "ssrld      %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
+    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
+    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp3]           \n\t"
+    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"
+
+    "paddsh     %[ftmp8],       %[ftmp8],          %[ff_ph_40]        \n\t"
+    "psrah      %[ftmp8],       %[ftmp8],          %[ftmp7]           \n\t"
+    "packushb   %[ftmp8],       %[ftmp8],          %[fzero]           \n\t"
+    "punpcklbh  %[ftmp8],       %[ftmp8],          %[fzero]           \n\t"
+    "gssdlc1    %[ftmp8],       0x07(%[output_ptr])                   \n\t"
+    "gssdrc1    %[ftmp8],       0x00(%[output_ptr])                   \n\t"
+
+    "addiu      %[output_height], %[output_height], -0x01             \n\t"
+    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
+    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])
+    "bnez       %[output_height],               1b                    \n\t"
+    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
+      [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2),
+      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
+      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
+      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
+      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
+      [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
+      [src_ptr]"+&r"(src_ptr),          [ff_ph_40]"=&f"(ff_ph_40)
+    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
+      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width)
+    : "memory"
+    );
+  /* clang-format on */
+}
+
+/* Horizontal filter:  pixel_step is always W */
+static INLINE void vp8_filter_block1dc_v6_mmi(
+    uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
+    int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
+  double ff_ph_40;
+  uint64_t tmp[1];
+  mips_reg addr[1];
+
+#if _MIPS_SIM == _ABIO32
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f2");
+  register double ftmp1 asm("$f4");
+  register double ftmp2 asm("$f6");
+  register double ftmp3 asm("$f8");
+  register double ftmp4 asm("$f10");
+  register double ftmp5 asm("$f12");
+  register double ftmp6 asm("$f14");
+  register double ftmp7 asm("$f16");
+  register double ftmp8 asm("$f18");
+  register double ftmp9 asm("$f20");
+  register double ftmp10 asm("$f22");
+  register double ftmp11 asm("$f24");
+  register double ftmp12 asm("$f26");
+  register double ftmp13 asm("$f28");
+#else
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f1");
+  register double ftmp1 asm("$f2");
+  register double ftmp2 asm("$f3");
+  register double ftmp3 asm("$f4");
+  register double ftmp4 asm("$f5");
+  register double ftmp5 asm("$f6");
+  register double ftmp6 asm("$f7");
+  register double ftmp7 asm("$f8");
+  register double ftmp8 asm("$f9");
+  register double ftmp9 asm("$f10");
+  register double ftmp10 asm("$f11");
+  register double ftmp11 asm("$f12");
+  register double ftmp12 asm("$f13");
+  register double ftmp13 asm("$f14");
+#endif  // _MIPS_SIM == _ABIO32
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],      0x0040004000400040                      \n\t"
+    "dmtc1      %[tmp0],      %[ff_ph_40]                             \n\t"
+    "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t"
+    "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t"
+    "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t"
+    "ldc1       %[ftmp3],     0x30(%[vp8_filter])                     \n\t"
+    "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
+    "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
+    "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
+    "dli        %[tmp0],      0x07                                    \n\t"
+    "dmtc1      %[tmp0],      %[ftmp13]                               \n\t"
+
+    /* In order to make full use of memory load delay slot,
+     * Operation of memory loading and calculating has been rearranged.
+     */
+    "1:                                                               \n\t"
+    "gsldlc1    %[ftmp6],     0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp6],     0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line])
+    "gsldlc1    %[ftmp7],     0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp7],     0x00(%[addr0])                          \n\t"
+    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
+    "gsldlc1    %[ftmp8],     0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp8],     0x00(%[addr0])                          \n\t"
+
+    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
+    "gsldlc1    %[ftmp9],     0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp9],     0x00(%[addr0])                          \n\t"
+    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
+    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
+    "gsldlc1    %[ftmp10],    0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp10],    0x00(%[addr0])                          \n\t"
+    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
+    "gsldlc1    %[ftmp11],    0x07(%[addr0])                          \n\t"
+    "gsldrc1    %[ftmp11],    0x00(%[addr0])                          \n\t"
+
+    "pmullh     %[ftmp12],    %[ftmp6],        %[ftmp0]               \n\t"
+
+    "pmullh     %[ftmp7],     %[ftmp7],        %[ftmp1]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp7]               \n\t"
+
+    "pmullh     %[ftmp8],     %[ftmp8],        %[ftmp2]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp8]               \n\t"
+
+    "pmullh     %[ftmp9],     %[ftmp9],        %[ftmp4]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp9]               \n\t"
+
+    "pmullh     %[ftmp10],    %[ftmp10],       %[ftmp3]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp10]              \n\t"
+
+    "pmullh     %[ftmp11],    %[ftmp11],       %[ftmp5]               \n\t"
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp11]              \n\t"
+
+    "paddsh     %[ftmp12],    %[ftmp12],       %[ff_ph_40]            \n\t"
+    "psrah      %[ftmp12],    %[ftmp12],       %[ftmp13]              \n\t"
+    "packushb   %[ftmp12],    %[ftmp12],       %[fzero]               \n\t"
+    "gsswlc1    %[ftmp12],    0x03(%[output_ptr])                     \n\t"
+    "gsswrc1    %[ftmp12],    0x00(%[output_ptr])                     \n\t"
+
+    MMI_ADDIU(%[output_height], %[output_height], -0x01)
+    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
+    "bnez       %[output_height], 1b                                  \n\t"
+    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
+      [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2),
+      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
+      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
+      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
+      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
+      [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
+      [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
+      [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
+      [ff_ph_40]"=&f"(ff_ph_40)
+    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
+      [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
+      [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
+      [vp8_filter]"r"(vp8_filter),
+      [output_pitch]"r"((mips_reg)output_pitch)
+    : "memory"
+    );
+  /* clang-format on */
+}
+
+/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
+   function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can
+   be simplified */
+static INLINE void vp8_filter_block1d_h6_filter0_mmi(
+    unsigned char *src_ptr, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, unsigned int output_height,
+    unsigned int output_width) {
+#if _MIPS_SIM == _ABIO32
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f2");
+  register double ftmp1 asm("$f4");
+#else
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f1");
+  register double ftmp1 asm("$f2");
+#endif  // _MIPS_SIM == _ABIO32
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[fzero],       %[fzero],           %[fzero]          \n\t"
+
+    "1:                                                               \n\t"
+    "gsldlc1    %[ftmp0],       0x07(%[src_ptr])                      \n\t"
+    "gsldrc1    %[ftmp0],       0x00(%[src_ptr])                      \n\t"
+    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])
+
+    "punpcklbh  %[ftmp1],       %[ftmp0],          %[fzero]           \n\t"
+    "gssdlc1    %[ftmp1],       0x07(%[output_ptr])                   \n\t"
+    "gssdrc1    %[ftmp1],       0x00(%[output_ptr])                   \n\t"
+
+    "addiu      %[output_height], %[output_height], -0x01             \n\t"
+    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
+    "bnez       %[output_height],               1b                    \n\t"
+    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
+      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
+    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
+      [output_width]"r"(output_width)
+    : "memory"
+    );
+  /* clang-format on */
+}
+
+static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
+    uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
+    int output_pitch, unsigned int pixels_per_line) {
+#if _MIPS_SIM == _ABIO32
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f2");
+  register double ftmp1 asm("$f4");
+#else
+  register double fzero asm("$f0");
+  register double ftmp0 asm("$f1");
+  register double ftmp1 asm("$f2");
+#endif  // _MIPS_SIM == _ABIO32
+
+  /* clang-format on */
+  __asm__ volatile (
+    "pxor       %[fzero],     %[fzero],        %[fzero]               \n\t"
+
+    "1:                                                               \n\t"
+    "gsldlc1    %[ftmp0],     0x07(%[src_ptr])                        \n\t"
+    "gsldrc1    %[ftmp0],     0x00(%[src_ptr])                        \n\t"
+    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
+    MMI_ADDIU(%[output_height], %[output_height], -0x01)
+    "packushb   %[ftmp1],     %[ftmp0],        %[fzero]               \n\t"
+    "gsswlc1    %[ftmp1],     0x03(%[output_ptr])                     \n\t"
+    "gsswrc1    %[ftmp1],     0x00(%[output_ptr])                     \n\t"
+
+    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
+    "bnez       %[output_height], 1b                                  \n\t"
+    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
+      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
+      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
+    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
+      [output_pitch]"r"((mips_reg)output_pitch)
+    : "memory"
+    );
+  /* clang-format on */
+}
+
+#define sixtapNxM(n, m)                                                        \
+  void vp8_sixtap_predict##n##x##m##_mmi(                                      \
+      unsigned char *src_ptr, int src_pixels_per_line, int xoffset,            \
+      int yoffset, unsigned char *dst_ptr, int dst_pitch) {                    \
+    DECLARE_ALIGNED(16, uint16_t,                                              \
+                    FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]);     \
+    const int16_t *HFilter, *VFilter;                                          \
+    int i, loop = n / 4;                                                       \
+    HFilter = vp8_six_tap_mmi[xoffset];                                        \
+    VFilter = vp8_six_tap_mmi[yoffset];                                        \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      for (i = 0; i < loop; ++i) {                                             \
+        vp8_filter_block1d_h6_filter0_mmi(                                     \
+            src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4,       \
+            src_pixels_per_line, m + 5, n * 2);                                \
+      }                                                                        \
+    } else {                                                                   \
+      for (i = 0; i < loop; ++i) {                                             \
+        vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \
+                                  FData2 + i * 4, src_pixels_per_line, m + 5,  \
+                                  n * 2, HFilter);                             \
+      }                                                                        \
+    }                                                                          \
+    if (yoffset == 0) {                                                        \
+      for (i = 0; i < loop; ++i) {                                             \
+        vp8_filter_block1dc_v6_filter0_mmi(                                    \
+            FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2);     \
+      }                                                                        \
+    } else {                                                                   \
+      for (i = 0; i < loop; ++i) {                                             \
+        vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m,         \
+                                   dst_pitch, n * 2, VFilter);                 \
+      }                                                                        \
+    }                                                                          \
+  }
+
+sixtapNxM(4, 4);
+sixtapNxM(8, 8);
+sixtapNxM(8, 4);
+sixtapNxM(16, 16);
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c
index 3d516d0f81..efad0c29f8 100644
--- a/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c
@@ -134,7 +134,7 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
   ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride);
 }
 
-void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) {
+void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dqcoeff) {
   v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1;
   const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
   const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
@@ -157,22 +157,22 @@ void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) {
   ADD2(tmp0, 3, tmp1, 3, out0, out1);
   out0 >>= 3;
   out1 >>= 3;
-  mb_dq_coeff[0] = __msa_copy_s_h(out0, 0);
-  mb_dq_coeff[16] = __msa_copy_s_h(out0, 4);
-  mb_dq_coeff[32] = __msa_copy_s_h(out1, 0);
-  mb_dq_coeff[48] = __msa_copy_s_h(out1, 4);
-  mb_dq_coeff[64] = __msa_copy_s_h(out0, 1);
-  mb_dq_coeff[80] = __msa_copy_s_h(out0, 5);
-  mb_dq_coeff[96] = __msa_copy_s_h(out1, 1);
-  mb_dq_coeff[112] = __msa_copy_s_h(out1, 5);
-  mb_dq_coeff[128] = __msa_copy_s_h(out0, 2);
-  mb_dq_coeff[144] = __msa_copy_s_h(out0, 6);
-  mb_dq_coeff[160] = __msa_copy_s_h(out1, 2);
-  mb_dq_coeff[176] = __msa_copy_s_h(out1, 6);
-  mb_dq_coeff[192] = __msa_copy_s_h(out0, 3);
-  mb_dq_coeff[208] = __msa_copy_s_h(out0, 7);
-  mb_dq_coeff[224] = __msa_copy_s_h(out1, 3);
-  mb_dq_coeff[240] = __msa_copy_s_h(out1, 7);
+  mb_dqcoeff[0] = __msa_copy_s_h(out0, 0);
+  mb_dqcoeff[16] = __msa_copy_s_h(out0, 4);
+  mb_dqcoeff[32] = __msa_copy_s_h(out1, 0);
+  mb_dqcoeff[48] = __msa_copy_s_h(out1, 4);
+  mb_dqcoeff[64] = __msa_copy_s_h(out0, 1);
+  mb_dqcoeff[80] = __msa_copy_s_h(out0, 5);
+  mb_dqcoeff[96] = __msa_copy_s_h(out1, 1);
+  mb_dqcoeff[112] = __msa_copy_s_h(out1, 5);
+  mb_dqcoeff[128] = __msa_copy_s_h(out0, 2);
+  mb_dqcoeff[144] = __msa_copy_s_h(out0, 6);
+  mb_dqcoeff[160] = __msa_copy_s_h(out1, 2);
+  mb_dqcoeff[176] = __msa_copy_s_h(out1, 6);
+  mb_dqcoeff[192] = __msa_copy_s_h(out0, 3);
+  mb_dqcoeff[208] = __msa_copy_s_h(out0, 7);
+  mb_dqcoeff[224] = __msa_copy_s_h(out1, 3);
+  mb_dqcoeff[240] = __msa_copy_s_h(out1, 7);
 }
 
 static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
@@ -359,27 +359,27 @@ void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq, uint8_t *dst,
   }
 }
 
-void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu,
-                                       uint8_t *dstv, int32_t stride,
+void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int32_t stride,
                                        char *eobs) {
   int16_t *eobs_h = (int16_t *)eobs;
 
   if (eobs_h[0]) {
     if (eobs_h[0] & 0xfefe) {
-      dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride);
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride);
     } else {
-      dequant_idct_addconst_2x_msa(q, dq, dstu, stride);
+      dequant_idct_addconst_2x_msa(q, dq, dst_u, stride);
     }
   }
 
   q += 32;
-  dstu += (stride * 4);
+  dst_u += (stride * 4);
 
   if (eobs_h[1]) {
     if (eobs_h[1] & 0xfefe) {
-      dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride);
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride);
     } else {
-      dequant_idct_addconst_2x_msa(q, dq, dstu, stride);
+      dequant_idct_addconst_2x_msa(q, dq, dst_u, stride);
     }
   }
 
@@ -387,20 +387,20 @@ void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu,
 
   if (eobs_h[2]) {
     if (eobs_h[2] & 0xfefe) {
-      dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride);
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride);
     } else {
-      dequant_idct_addconst_2x_msa(q, dq, dstv, stride);
+      dequant_idct_addconst_2x_msa(q, dq, dst_v, stride);
     }
   }
 
   q += 32;
-  dstv += (stride * 4);
+  dst_v += (stride * 4);
 
   if (eobs_h[3]) {
     if (eobs_h[3] & 0xfefe) {
-      dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride);
+      dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride);
     } else {
-      dequant_idct_addconst_2x_msa(q, dq, dstv, stride);
+      dequant_idct_addconst_2x_msa(q, dq, dst_v, stride);
     }
   }
 }
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
index b0affcff01..3a1bb7cd57 100644
--- a/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c
@@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \
                         filt_h2)                                           \
   ({                                                                       \
-    v16i8 vec0_m, vec1_m, vec2_m;                                          \
-    v8i16 hz_out_m;                                                        \
+    v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m;                        \
+    v8i16 _6tap_out_m;                                                     \
                                                                            \
     VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
-               vec0_m, vec1_m, vec2_m);                                    \
-    hz_out_m =                                                             \
-        DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);   \
+               _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m);                  \
+    _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m,   \
+                               filt_h0, filt_h1, filt_h2);                 \
                                                                            \
-    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);                  \
-    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                                 \
+    _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT);            \
+    _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7);                           \
                                                                            \
-    hz_out_m;                                                              \
+    _6tap_out_m;                                                           \
   })
 
 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                    mask2, filt0, filt1, filt2, out0, out1) \
   {                                                                        \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                  \
+    v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m,         \
+        _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m;           \
                                                                            \
-    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
-    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
-    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
-    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
-    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
-    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m,    \
+               _6tap_4wid_vec1_m);                                         \
+    DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0,  \
+                out1);                                                     \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m,    \
+               _6tap_4wid_vec3_m);                                         \
+    DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \
+                 out1);                                                    \
+    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m,    \
+               _6tap_4wid_vec5_m);                                         \
+    DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \
+                 out1);                                                    \
   }
 
-#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
-                                   mask2, filt0, filt1, filt2, out0, out1,   \
-                                   out2, out3)                               \
-  {                                                                          \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
-                                                                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
-    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
-                out0, out1, out2, out3);                                     \
-    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
-    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);        \
-    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
-                 out0, out1, out2, out3);                                    \
-    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \
-                 out0, out1, out2, out3);                                    \
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   mask2, filt0, filt1, filt2, out0, out1,  \
+                                   out2, out3)                              \
+  {                                                                         \
+    v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,          \
+        _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m,            \
+        _6tap_8wid_vec6_m, _6tap_8wid_vec7_m;                               \
+                                                                            \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m,     \
+               _6tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m,     \
+               _6tap_8wid_vec3_m);                                          \
+    DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,    \
+                _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1,  \
+                out2, out3);                                                \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m,     \
+               _6tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m,     \
+               _6tap_8wid_vec3_m);                                          \
+    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m,     \
+               _6tap_8wid_vec5_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m,     \
+               _6tap_8wid_vec7_m);                                          \
+    DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m,   \
+                 _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
+                 out2, out3);                                               \
+    DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m,   \
+                 _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \
+                 out2, out3);                                               \
   }
 
-#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)        \
-  ({                                                         \
-    v8i16 tmp0;                                              \
-                                                             \
-    tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);        \
-    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
-                                                             \
-    tmp0;                                                    \
-  })
-
-#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)   \
+#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                 \
   ({                                                                  \
-    v16i8 vec0_m, vec1_m;                                             \
-    v8i16 hz_out_m;                                                   \
+    v8i16 _4tap_dpadd_tmp0;                                           \
                                                                       \
-    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \
-    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \
+    _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);     \
+    _4tap_dpadd_tmp0 =                                                \
+        __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \
                                                                       \
-    hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT);             \
-    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                            \
-                                                                      \
-    hz_out_m;                                                         \
+    _4tap_dpadd_tmp0;                                                 \
   })
 
-#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
-                                   filt0, filt1, out0, out1)             \
-  {                                                                      \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
-                                                                         \
-    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);    \
-    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);               \
-    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);    \
-    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);              \
+#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)        \
+  ({                                                                       \
+    v16i8 _4tap_vec0_m, _4tap_vec1_m;                                      \
+    v8i16 _4tap_out_m;                                                     \
+                                                                           \
+    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m,         \
+               _4tap_vec1_m);                                              \
+    _4tap_out_m =                                                          \
+        FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \
+                                                                           \
+    _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT);            \
+    _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7);                           \
+                                                                           \
+    _4tap_out_m;                                                           \
+  })
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1)               \
+  {                                                                        \
+    v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m,         \
+        _4tap_4wid_vec3_m;                                                 \
+                                                                           \
+    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m,    \
+               _4tap_4wid_vec1_m);                                         \
+    DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0,  \
+                out1);                                                     \
+    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m,    \
+               _4tap_4wid_vec3_m);                                         \
+    DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \
+                 out1);                                                    \
   }
 
-#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
-                                   filt0, filt1, out0, out1, out2, out3)     \
-  {                                                                          \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
-                                                                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
-    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
-                out0, out1, out2, out3);                                     \
-    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
-    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
-                 out0, out1, out2, out3);                                    \
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   filt0, filt1, out0, out1, out2, out3)    \
+  {                                                                         \
+    v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,          \
+        _4tap_8wid_vec3_m;                                                  \
+                                                                            \
+    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m,     \
+               _4tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m,     \
+               _4tap_8wid_vec3_m);                                          \
+    DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,    \
+                _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1,  \
+                out2, out3);                                                \
+    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m,     \
+               _4tap_8wid_vec1_m);                                          \
+    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m,     \
+               _4tap_8wid_vec3_m);                                          \
+    DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m,   \
+                 _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \
+                 out2, out3);                                               \
   }
 
 static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
index 6bec3adec3..75b99146d2 100644
--- a/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
-#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
 
 #include <msa.h>
 
@@ -40,158 +40,160 @@
 #define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
 
 #if (__mips_isa_rev >= 6)
-#define LW(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint32_t val_m;                                  \
-                                                     \
-    asm volatile("lw  %[val_m],  %[psrc_m]  \n\t"    \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+    uint32_t lw_val_m;                                  \
+                                                        \
+    asm volatile("lw  %[lw_val_m],  %[lw_psrc_m]  \n\t" \
+                                                        \
+                 : [lw_val_m] "=r"(lw_val_m)            \
+                 : [lw_psrc_m] "m"(*lw_psrc_m));        \
+                                                        \
+    lw_val_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint64_t val_m = 0;                              \
-                                                     \
-    asm volatile("ld  %[val_m],  %[psrc_m]  \n\t"    \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+    uint64_t ld_val_m = 0;                              \
+                                                        \
+    asm volatile("ld  %[ld_val_m],  %[ld_psrc_m]  \n\t" \
+                                                        \
+                 : [ld_val_m] "=r"(ld_val_m)            \
+                 : [ld_psrc_m] "m"(*ld_psrc_m));        \
+                                                        \
+    ld_val_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m);                                    \
-    val1_m = LW(psrc_m + 4);                                \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc);           \
+    uint32_t ld_val0_m, ld_val1_m;                                \
+    uint64_t ld_val_m = 0;                                        \
+                                                                  \
+    ld_val0_m = LW(ld_psrc_m);                                    \
+    ld_val1_m = LW(ld_psrc_m + 4);                                \
+                                                                  \
+    ld_val_m = (uint64_t)(ld_val1_m);                             \
+    ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+    ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m);        \
+                                                                  \
+    ld_val_m;                                                     \
   })
 #endif  // (__mips == 64)
 
-#define SH(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint16_t val_m = (val);                 \
-                                                  \
-    asm volatile("sh  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SH(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sh_pdst_m = (uint8_t *)(pdst);             \
+    const uint16_t sh_val_m = (val);                    \
+                                                        \
+    asm volatile("sh  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
+                                                        \
+                 : [sh_pdst_m] "=m"(*sh_pdst_m)         \
+                 : [sh_val_m] "r"(sh_val_m));           \
   }
 
-#define SW(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint32_t val_m = (val);                 \
-                                                  \
-    asm volatile("sw  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SW(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sw_pdst_m = (uint8_t *)(pdst);             \
+    const uint32_t sw_val_m = (val);                    \
+                                                        \
+    asm volatile("sw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
+                                                        \
+                 : [sw_pdst_m] "=m"(*sw_pdst_m)         \
+                 : [sw_val_m] "r"(sw_val_m));           \
   }
 
-#define SD(val, pdst)                             \
-  {                                               \
-    uint8_t *pdst_m = (uint8_t *)(pdst);          \
-    const uint64_t val_m = (val);                 \
-                                                  \
-    asm volatile("sd  %[val_m],  %[pdst_m]  \n\t" \
-                                                  \
-                 : [pdst_m] "=m"(*pdst_m)         \
-                 : [val_m] "r"(val_m));           \
+#define SD(val, pdst)                                   \
+  {                                                     \
+    uint8_t *sd_pdst_m = (uint8_t *)(pdst);             \
+    const uint64_t sd_val_m = (val);                    \
+                                                        \
+    asm volatile("sd  %[sd_val_m],  %[sd_pdst_m]  \n\t" \
+                                                        \
+                 : [sd_pdst_m] "=m"(*sd_pdst_m)         \
+                 : [sd_val_m] "r"(sd_val_m));           \
   }
 #else  // !(__mips_isa_rev >= 6)
-#define LW(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint32_t val_m;                                  \
-                                                     \
-    asm volatile("ulw  %[val_m],  %[psrc_m]  \n\t"   \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
+    uint32_t lw_val_m;                                  \
+                                                        \
+    asm volatile(                                       \
+        "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t"         \
+        "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t"         \
+        : [lw_val_m] "=&r"(lw_val_m)                    \
+        : [lw_psrc_m] "r"(lw_psrc_m));                  \
+                                                        \
+    lw_val_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                     \
-  ({                                                 \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc); \
-    uint64_t val_m = 0;                              \
-                                                     \
-    asm volatile("uld  %[val_m],  %[psrc_m]  \n\t"   \
-                                                     \
-                 : [val_m] "=r"(val_m)               \
-                 : [psrc_m] "m"(*psrc_m));           \
-                                                     \
-    val_m;                                           \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
+    uint64_t ld_val_m = 0;                              \
+                                                        \
+    asm volatile(                                       \
+        "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t"         \
+        "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t"         \
+        : [ld_val_m] "=&r"(ld_val_m)                    \
+        : [ld_psrc_m] "r"(ld_psrc_m));                  \
+                                                        \
+    ld_val_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);       \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m1);                                   \
-    val1_m = LW(psrc_m1 + 4);                               \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc);          \
+    uint32_t ld_val0_m, ld_val1_m;                                \
+    uint64_t ld_val_m = 0;                                        \
+                                                                  \
+    ld_val0_m = LW(ld_psrc_m1);                                   \
+    ld_val1_m = LW(ld_psrc_m1 + 4);                               \
+                                                                  \
+    ld_val_m = (uint64_t)(ld_val1_m);                             \
+    ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \
+    ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m);        \
+                                                                  \
+    ld_val_m;                                                     \
   })
 #endif  // (__mips == 64)
-#define SH(val, pdst)                              \
-  {                                                \
-    uint8_t *pdst_m = (uint8_t *)(pdst);           \
-    const uint16_t val_m = (val);                  \
-                                                   \
-    asm volatile("ush  %[val_m],  %[pdst_m]  \n\t" \
-                                                   \
-                 : [pdst_m] "=m"(*pdst_m)          \
-                 : [val_m] "r"(val_m));            \
+#define SH(val, pdst)                                    \
+  {                                                      \
+    uint8_t *sh_pdst_m = (uint8_t *)(pdst);              \
+    const uint16_t sh_val_m = (val);                     \
+                                                         \
+    asm volatile("ush  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
+                                                         \
+                 : [sh_pdst_m] "=m"(*sh_pdst_m)          \
+                 : [sh_val_m] "r"(sh_val_m));            \
   }
 
-#define SW(val, pdst)                              \
-  {                                                \
-    uint8_t *pdst_m = (uint8_t *)(pdst);           \
-    const uint32_t val_m = (val);                  \
-                                                   \
-    asm volatile("usw  %[val_m],  %[pdst_m]  \n\t" \
-                                                   \
-                 : [pdst_m] "=m"(*pdst_m)          \
-                 : [val_m] "r"(val_m));            \
+#define SW(val, pdst)                                    \
+  {                                                      \
+    uint8_t *sw_pdst_m = (uint8_t *)(pdst);              \
+    const uint32_t sw_val_m = (val);                     \
+                                                         \
+    asm volatile("usw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
+                                                         \
+                 : [sw_pdst_m] "=m"(*sw_pdst_m)          \
+                 : [sw_val_m] "r"(sw_val_m));            \
   }
 
-#define SD(val, pdst)                                        \
-  {                                                          \
-    uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
-    uint32_t val0_m, val1_m;                                 \
-                                                             \
-    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
-    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
-                                                             \
-    SW(val0_m, pdst_m1);                                     \
-    SW(val1_m, pdst_m1 + 4);                                 \
+#define SD(val, pdst)                                           \
+  {                                                             \
+    uint8_t *sd_pdst_m1 = (uint8_t *)(pdst);                    \
+    uint32_t sd_val0_m, sd_val1_m;                              \
+                                                                \
+    sd_val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);         \
+    sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+                                                                \
+    SW(sd_val0_m, sd_pdst_m1);                                  \
+    SW(sd_val1_m, sd_pdst_m1 + 4);                              \
   }
 #endif  // (__mips_isa_rev >= 6)
 
@@ -1757,4 +1759,4 @@
                                                                 \
     tmp1_m;                                                     \
   })
-#endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */
+#endif  // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/vp8/common/modecont.c b/media/libvpx/libvpx/vp8/common/modecont.c
index d6ad9bb99a..bab410374f 100644
--- a/media/libvpx/libvpx/vp8/common/modecont.c
+++ b/media/libvpx/libvpx/vp8/common/modecont.c
@@ -11,28 +11,16 @@
 #include "entropy.h"
 
 const int vp8_mode_contexts[6][4] = {
-  {
-      /* 0 */
-      7, 1, 1, 143,
-  },
-  {
-      /* 1 */
-      14, 18, 14, 107,
-  },
-  {
-      /* 2 */
-      135, 64, 57, 68,
-  },
-  {
-      /* 3 */
-      60, 56, 128, 65,
-  },
-  {
-      /* 4 */
-      159, 134, 128, 34,
-  },
-  {
-      /* 5 */
-      234, 188, 128, 28,
-  },
+  { /* 0 */
+    7, 1, 1, 143 },
+  { /* 1 */
+    14, 18, 14, 107 },
+  { /* 2 */
+    135, 64, 57, 68 },
+  { /* 3 */
+    60, 56, 128, 65 },
+  { /* 4 */
+    159, 134, 128, 34 },
+  { /* 5 */
+    234, 188, 128, 28 },
 };
diff --git a/media/libvpx/libvpx/vp8/common/modecont.h b/media/libvpx/libvpx/vp8/common/modecont.h
index b58c7dc2d3..031f74f2ff 100644
--- a/media/libvpx/libvpx/vp8/common/modecont.h
+++ b/media/libvpx/libvpx/vp8/common/modecont.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_MODECONT_H_
-#define VP8_COMMON_MODECONT_H_
+#ifndef VPX_VP8_COMMON_MODECONT_H_
+#define VPX_VP8_COMMON_MODECONT_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -21,4 +21,4 @@ extern const int vp8_mode_contexts[6][4];
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_MODECONT_H_
+#endif  // VPX_VP8_COMMON_MODECONT_H_
diff --git a/media/libvpx/libvpx/vp8/common/mv.h b/media/libvpx/libvpx/vp8/common/mv.h
index b6d2147af8..4cde12f201 100644
--- a/media/libvpx/libvpx/vp8/common/mv.h
+++ b/media/libvpx/libvpx/vp8/common/mv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_MV_H_
-#define VP8_COMMON_MV_H_
+#ifndef VPX_VP8_COMMON_MV_H_
+#define VPX_VP8_COMMON_MV_H_
 #include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
@@ -30,4 +30,4 @@ typedef union int_mv {
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_MV_H_
+#endif  // VPX_VP8_COMMON_MV_H_
diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h
index 72fba2ec56..2038c000b0 100644
--- a/media/libvpx/libvpx/vp8/common/onyx.h
+++ b/media/libvpx/libvpx/vp8/common/onyx.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ONYX_H_
-#define VP8_COMMON_ONYX_H_
+#ifndef VPX_VP8_COMMON_ONYX_H_
+#define VPX_VP8_COMMON_ONYX_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -26,13 +26,6 @@ struct VP8_COMP;
 
 /* Create/destroy static data structures. */
 
-typedef enum {
-  NORMAL = 0,
-  FOURFIVE = 1,
-  THREEFIVE = 2,
-  ONETWO = 3
-} VPX_SCALING;
-
 typedef enum {
   USAGE_LOCAL_FILE_PLAYBACK = 0x0,
   USAGE_STREAM_FROM_SERVER = 0x1,
@@ -58,19 +51,19 @@ typedef enum {
 #include <assert.h>
 static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
   switch (mode) {
-    case NORMAL:
+    case VP8E_NORMAL:
       *hr = 1;
       *hs = 1;
       break;
-    case FOURFIVE:
+    case VP8E_FOURFIVE:
       *hr = 4;
       *hs = 5;
       break;
-    case THREEFIVE:
+    case VP8E_THREEFIVE:
       *hr = 3;
       *hs = 5;
       break;
-    case ONETWO:
+    case VP8E_ONETWO:
       *hr = 1;
       *hs = 2;
       break;
@@ -90,7 +83,14 @@ typedef struct {
   int Width;
   int Height;
   struct vpx_rational timebase;
-  unsigned int target_bandwidth; /* kilobits per second */
+  /* In either kilobits per second or bits per second, depending on which
+   * copy of oxcf this is in.
+   * - ctx->oxcf.target_bandwidth is in kilobits per second. See
+   *   set_vp8e_config().
+   * - ctx->cpi->oxcf.target_bandwidth in is bits per second. See
+   *   vp8_change_config().
+   */
+  unsigned int target_bandwidth;
 
   /* Parameter used for applying denoiser.
    * For temporal denoiser: noise_sensitivity = 0 means off,
@@ -221,6 +221,7 @@ typedef struct {
 
   /* Temporal scaling parameters */
   unsigned int number_of_layers;
+  /* kilobits per second */
   unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
   unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
   unsigned int periodicity;
@@ -241,44 +242,44 @@ typedef struct {
 #endif
 } VP8_CONFIG;
 
-void vp8_initialize();
+void vp8_initialize(void);
 
-struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf);
+struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf);
 void vp8_remove_compressor(struct VP8_COMP **comp);
 
 void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf);
-void vp8_change_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf);
+void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf);
 
-int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags,
+int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time_stamp);
-int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags,
+                          int64_t end_time);
+int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, unsigned char *dest,
                             unsigned char *dest_end, int64_t *time_stamp,
                             int64_t *time_end, int flush);
-int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest,
+int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest,
                               vp8_ppflags_t *flags);
 
-int vp8_use_as_reference(struct VP8_COMP *comp, int ref_frame_flags);
-int vp8_update_reference(struct VP8_COMP *comp, int ref_frame_flags);
-int vp8_get_reference(struct VP8_COMP *comp,
+int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags);
+int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags);
+int vp8_get_reference(struct VP8_COMP *cpi,
                       enum vpx_ref_frame_type ref_frame_flag,
                       YV12_BUFFER_CONFIG *sd);
-int vp8_set_reference(struct VP8_COMP *comp,
+int vp8_set_reference(struct VP8_COMP *cpi,
                       enum vpx_ref_frame_type ref_frame_flag,
                       YV12_BUFFER_CONFIG *sd);
-int vp8_update_entropy(struct VP8_COMP *comp, int update);
-int vp8_set_roimap(struct VP8_COMP *comp, unsigned char *map, unsigned int rows,
+int vp8_update_entropy(struct VP8_COMP *cpi, int update);
+int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
                    unsigned int cols, int delta_q[4], int delta_lf[4],
                    unsigned int threshold[4]);
-int vp8_set_active_map(struct VP8_COMP *comp, unsigned char *map,
+int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
                        unsigned int rows, unsigned int cols);
-int vp8_set_internal_size(struct VP8_COMP *comp, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode);
-int vp8_get_quantizer(struct VP8_COMP *c);
+int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode);
+int vp8_get_quantizer(struct VP8_COMP *cpi);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif  // VP8_COMMON_ONYX_H_
+#endif  // VPX_VP8_COMMON_ONYX_H_
diff --git a/media/libvpx/libvpx/vp8/common/onyxc_int.h b/media/libvpx/libvpx/vp8/common/onyxc_int.h
index 9a12c7fb67..d4824d24e4 100644
--- a/media/libvpx/libvpx/vp8/common/onyxc_int.h
+++ b/media/libvpx/libvpx/vp8/common/onyxc_int.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ONYXC_INT_H_
-#define VP8_COMMON_ONYXC_INT_H_
+#ifndef VPX_VP8_COMMON_ONYXC_INT_H_
+#define VPX_VP8_COMMON_ONYXC_INT_H_
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -167,11 +167,10 @@ typedef struct VP8Common {
 #if CONFIG_POSTPROC
   struct postproc_state postproc_state;
 #endif
-  int cpu_caps;
 } VP8_COMMON;
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_ONYXC_INT_H_
+#endif  // VPX_VP8_COMMON_ONYXC_INT_H_
diff --git a/media/libvpx/libvpx/vp8/common/onyxd.h b/media/libvpx/libvpx/vp8/common/onyxd.h
index e05461aad0..217a598de7 100644
--- a/media/libvpx/libvpx/vp8/common/onyxd.h
+++ b/media/libvpx/libvpx/vp8/common/onyxd.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_ONYXD_H_
-#define VP8_COMMON_ONYXD_H_
+#ifndef VPX_VP8_COMMON_ONYXD_H_
+#define VPX_VP8_COMMON_ONYXD_H_
 
 /* Create/destroy static data structures. */
 #ifdef __cplusplus
@@ -22,6 +22,7 @@ extern "C" {
 #include "vpx/vp8.h"
 
 struct VP8D_COMP;
+struct VP8Common;
 
 typedef struct {
   int Width;
@@ -40,21 +41,21 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x);
 
 int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst);
 
-int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size,
-                                  const uint8_t *dest, int64_t time_stamp);
-int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
+int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi);
+int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
                         vp8_ppflags_t *flags);
+int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame);
 
-vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp,
+vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *pbi,
                                     enum vpx_ref_frame_type ref_frame_flag,
                                     YV12_BUFFER_CONFIG *sd);
-vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *comp,
+vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *pbi,
                                     enum vpx_ref_frame_type ref_frame_flag,
                                     YV12_BUFFER_CONFIG *sd);
+int vp8dx_get_quantizer(const struct VP8D_COMP *pbi);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif  // VP8_COMMON_ONYXD_H_
+#endif  // VPX_VP8_COMMON_ONYXD_H_
diff --git a/media/libvpx/libvpx/vp8/common/postproc.c b/media/libvpx/libvpx/vp8/common/postproc.c
index 8c292d6161..c03b16b2f5 100644
--- a/media/libvpx/libvpx/vp8/common/postproc.c
+++ b/media/libvpx/libvpx/vp8/common/postproc.c
@@ -60,8 +60,7 @@ static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q) {
 }
 
 void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
-                 YV12_BUFFER_CONFIG *post, int q, int low_var_thresh,
-                 int flag) {
+                 YV12_BUFFER_CONFIG *post, int q) {
   double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
   int ppl = (int)(level + .5);
 
@@ -72,8 +71,6 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
    * is a skipped block.  */
   unsigned char *ylimits = cm->pp_limits_buffer;
   unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols;
-  (void)low_var_thresh;
-  (void)flag;
 
   if (ppl > 0) {
     for (mbr = 0; mbr < cm->mb_rows; ++mbr) {
@@ -116,8 +113,7 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
   }
 }
 
-void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
-                  YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag,
+void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, int q,
                   int uvfilter) {
   int mbr;
   double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
@@ -125,9 +121,6 @@ void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
   int mb_rows = cm->mb_rows;
   int mb_cols = cm->mb_cols;
   unsigned char *limits = cm->pp_limits_buffer;
-  (void)post;
-  (void)low_var_thresh;
-  (void)flag;
 
   memset(limits, (unsigned char)ppl, 16 * mb_cols);
 
@@ -151,124 +144,6 @@ void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source,
 }
 #endif  // CONFIG_POSTPROC
 
-/* Blend the macro block with a solid colored square.  Leave the
- * edges unblended to give distinction to macro blocks in areas
- * filled with the same color block.
- */
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y_1, int u_1, int v_1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y_1 * ((1 << 16) - alpha);
-  int u1_const = u_1 * ((1 << 16) - alpha);
-  int v1_const = v_1 * ((1 << 16) - alpha);
-
-  y += 2 * stride + 2;
-  for (i = 0; i < 12; ++i) {
-    for (j = 0; j < 12; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  u += stride + 1;
-  v += stride + 1;
-
-  for (i = 0; i < 6; ++i) {
-    for (j = 0; j < 6; ++j) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
-
-/* Blend only the edge of the macro block.  Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                          int y_1, int u_1, int v_1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y_1 * ((1 << 16) - alpha);
-  int u1_const = u_1 * ((1 << 16) - alpha);
-  int v1_const = v_1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 16; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  for (i = 0; i < 12; ++i) {
-    y[0] = (y[0] * alpha + y1_const) >> 16;
-    y[1] = (y[1] * alpha + y1_const) >> 16;
-    y[14] = (y[14] * alpha + y1_const) >> 16;
-    y[15] = (y[15] * alpha + y1_const) >> 16;
-    y += stride;
-  }
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 16; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (j = 0; j < 8; ++j) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-  u += stride;
-  v += stride;
-
-  for (i = 0; i < 6; ++i) {
-    u[0] = (u[0] * alpha + u1_const) >> 16;
-    v[0] = (v[0] * alpha + v1_const) >> 16;
-
-    u[7] = (u[7] * alpha + u1_const) >> 16;
-    v[7] = (v[7] * alpha + v1_const) >> 16;
-
-    u += stride;
-    v += stride;
-  }
-
-  for (j = 0; j < 8; ++j) {
-    u[j] = (u[j] * alpha + u1_const) >> 16;
-    v[j] = (v[j] * alpha + v1_const) >> 16;
-  }
-}
-
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
-                   int y_1, int u_1, int v_1, int alpha, int stride) {
-  int i, j;
-  int y1_const = y_1 * ((1 << 16) - alpha);
-  int u1_const = u_1 * ((1 << 16) - alpha);
-  int v1_const = v_1 * ((1 << 16) - alpha);
-
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) {
-      y[j] = (y[j] * alpha + y1_const) >> 16;
-    }
-    y += stride;
-  }
-
-  stride >>= 1;
-
-  for (i = 0; i < 2; ++i) {
-    for (j = 0; j < 2; ++j) {
-      u[j] = (u[j] * alpha + u1_const) >> 16;
-      v[j] = (v[j] * alpha + v1_const) >> 16;
-    }
-    u += stride;
-    v += stride;
-  }
-}
-
 #if CONFIG_POSTPROC
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
                         vp8_ppflags_t *ppflags) {
@@ -325,7 +200,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
   vpx_clear_system_state();
 
   if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid &&
-      oci->current_video_frame >= 2 &&
+      oci->current_video_frame > 10 &&
       oci->postproc_state.last_base_qindex < 60 &&
       oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) {
     vp8_multiframe_quality_enhance(oci);
@@ -334,11 +209,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
       vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
       if (flags & VP8D_DEMACROBLOCK) {
         vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
-                    q + (deblock_level - 5) * 10, 1, 0);
+                    q + (deblock_level - 5) * 10);
         vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
       } else if (flags & VP8D_DEBLOCK) {
-        vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q,
-                    1, 0);
+        vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q);
       }
     }
     /* Move partially towards the base q of the previous frame */
@@ -346,12 +220,12 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest,
         (3 * oci->postproc_state.last_base_qindex + oci->base_qindex) >> 2;
   } else if (flags & VP8D_DEMACROBLOCK) {
     vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
-                q + (deblock_level - 5) * 10, 1, 0);
+                q + (deblock_level - 5) * 10);
     vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
 
     oci->postproc_state.last_base_qindex = oci->base_qindex;
   } else if (flags & VP8D_DEBLOCK) {
-    vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0);
+    vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q);
     oci->postproc_state.last_base_qindex = oci->base_qindex;
   } else {
     vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
diff --git a/media/libvpx/libvpx/vp8/common/postproc.h b/media/libvpx/libvpx/vp8/common/postproc.h
index 7be112b163..492c52aef6 100644
--- a/media/libvpx/libvpx/vp8/common/postproc.h
+++ b/media/libvpx/libvpx/vp8/common/postproc.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_POSTPROC_H_
-#define VP8_COMMON_POSTPROC_H_
+#ifndef VPX_VP8_COMMON_POSTPROC_H_
+#define VPX_VP8_COMMON_POSTPROC_H_
 
 #include "vpx_ports/mem.h"
 struct postproc_state {
@@ -27,14 +27,13 @@ struct postproc_state {
 extern "C" {
 #endif
 int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
-                        vp8_ppflags_t *flags);
+                        vp8_ppflags_t *ppflags);
 
-void vp8_de_noise(struct VP8Common *oci, YV12_BUFFER_CONFIG *source,
-                  YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag,
+void vp8_de_noise(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, int q,
                   int uvfilter);
 
-void vp8_deblock(struct VP8Common *oci, YV12_BUFFER_CONFIG *source,
-                 YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag);
+void vp8_deblock(struct VP8Common *cm, YV12_BUFFER_CONFIG *source,
+                 YV12_BUFFER_CONFIG *post, int q);
 
 #define MFQE_PRECISION 4
 
@@ -43,4 +42,4 @@ void vp8_multiframe_quality_enhance(struct VP8Common *cm);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_POSTPROC_H_
+#endif  // VPX_VP8_COMMON_POSTPROC_H_
diff --git a/media/libvpx/libvpx/vp8/common/ppflags.h b/media/libvpx/libvpx/vp8/common/ppflags.h
index 96e3af6c9c..bdf08734b9 100644
--- a/media/libvpx/libvpx/vp8/common/ppflags.h
+++ b/media/libvpx/libvpx/vp8/common/ppflags.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_PPFLAGS_H_
-#define VP8_COMMON_PPFLAGS_H_
+#ifndef VPX_VP8_COMMON_PPFLAGS_H_
+#define VPX_VP8_COMMON_PPFLAGS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -36,4 +36,4 @@ typedef struct {
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_PPFLAGS_H_
+#endif  // VPX_VP8_COMMON_PPFLAGS_H_
diff --git a/media/libvpx/libvpx/vp8/common/quant_common.h b/media/libvpx/libvpx/vp8/common/quant_common.h
index ff4203df87..049840a272 100644
--- a/media/libvpx/libvpx/vp8/common/quant_common.h
+++ b/media/libvpx/libvpx/vp8/common/quant_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_QUANT_COMMON_H_
-#define VP8_COMMON_QUANT_COMMON_H_
+#ifndef VPX_VP8_COMMON_QUANT_COMMON_H_
+#define VPX_VP8_COMMON_QUANT_COMMON_H_
 
 #include "string.h"
 #include "blockd.h"
@@ -30,4 +30,4 @@ extern int vp8_ac_uv_quant(int QIndex, int Delta);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_QUANT_COMMON_H_
+#endif  // VPX_VP8_COMMON_QUANT_COMMON_H_
diff --git a/media/libvpx/libvpx/vp8/common/reconinter.c b/media/libvpx/libvpx/vp8/common/reconinter.c
index 48892c9b8e..2cb0709318 100644
--- a/media/libvpx/libvpx/vp8/common/reconinter.c
+++ b/media/libvpx/libvpx/vp8/common/reconinter.c
@@ -333,6 +333,13 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y,
   _16x16mv.as_mv.row &= x->fullpixel_mask;
   _16x16mv.as_mv.col &= x->fullpixel_mask;
 
+  if (2 * _16x16mv.as_mv.col < (x->mb_to_left_edge - (19 << 3)) ||
+      2 * _16x16mv.as_mv.col > x->mb_to_right_edge + (18 << 3) ||
+      2 * _16x16mv.as_mv.row < (x->mb_to_top_edge - (19 << 3)) ||
+      2 * _16x16mv.as_mv.row > x->mb_to_bottom_edge + (18 << 3)) {
+    return;
+  }
+
   pre_stride >>= 1;
   offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
   uptr = x->pre.u_buffer + offset;
diff --git a/media/libvpx/libvpx/vp8/common/reconinter.h b/media/libvpx/libvpx/vp8/common/reconinter.h
index 4cdd4fee0f..974e7ce754 100644
--- a/media/libvpx/libvpx/vp8/common/reconinter.h
+++ b/media/libvpx/libvpx/vp8/common/reconinter.h
@@ -8,30 +8,29 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_RECONINTER_H_
-#define VP8_COMMON_RECONINTER_H_
+#ifndef VPX_VP8_COMMON_RECONINTER_H_
+#define VPX_VP8_COMMON_RECONINTER_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
-extern void vp8_build_inter16x16_predictors_mb(
-    MACROBLOCKD *x, unsigned char *dst_y, unsigned char *dst_u,
-    unsigned char *dst_v, int dst_ystride, int dst_uvstride);
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd);
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v, int dst_ystride,
+                                        int dst_uvstride);
 
-extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
-                                                unsigned char *dst_y,
-                                                int dst_ystride);
-extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                         unsigned char *base_pre,
-                                         int pre_stride, vp8_subpix_fn_t sppf);
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y,
+                                         int dst_ystride);
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre,
+                                  int pre_stride, vp8_subpix_fn_t sppf);
 
-extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
-extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_RECONINTER_H_
+#endif  // VPX_VP8_COMMON_RECONINTER_H_
diff --git a/media/libvpx/libvpx/vp8/common/reconintra.c b/media/libvpx/libvpx/vp8/common/reconintra.c
index 986074ec7e..8e2094da87 100644
--- a/media/libvpx/libvpx/vp8/common/reconintra.c
+++ b/media/libvpx/libvpx/vp8/common/reconintra.c
@@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s(
     unsigned char *uleft, unsigned char *vleft, int left_stride,
     unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) {
   MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode;
+#if HAVE_VSX
+  /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from
+     uleft_col and vleft_col. Play it safe by reserving enough stack
+     space here. */
+  unsigned char uleft_col[16];
+  unsigned char vleft_col[16];
+#else
   unsigned char uleft_col[8];
   unsigned char vleft_col[8];
+#endif
   int i;
   intra_pred_fn fn;
 
diff --git a/media/libvpx/libvpx/vp8/common/reconintra.h b/media/libvpx/libvpx/vp8/common/reconintra.h
index fd7c725f35..029ac00a24 100644
--- a/media/libvpx/libvpx/vp8/common/reconintra.h
+++ b/media/libvpx/libvpx/vp8/common/reconintra.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_RECONINTRA_H_
-#define VP8_COMMON_RECONINTRA_H_
+#ifndef VPX_VP8_COMMON_RECONINTRA_H_
+#define VPX_VP8_COMMON_RECONINTRA_H_
 
 #include "vp8/common/blockd.h"
 
@@ -32,4 +32,4 @@ void vp8_init_intra_predictors(void);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_RECONINTRA_H_
+#endif  // VPX_VP8_COMMON_RECONINTRA_H_
diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.c b/media/libvpx/libvpx/vp8/common/reconintra4x4.c
index 07c9223331..be936df5e0 100644
--- a/media/libvpx/libvpx/vp8/common/reconintra4x4.c
+++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.c
@@ -16,7 +16,7 @@
 #include "blockd.h"
 #include "reconintra4x4.h"
 #include "vp8/common/common.h"
-#include "vpx_ports/mem.h"
+#include "vpx_ports/compiler_attributes.h"
 
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
@@ -31,7 +31,7 @@ void vp8_init_intra4x4_predictors_internal(void) {
   pred[B_LD_PRED] = vpx_d45e_predictor_4x4;
   pred[B_RD_PRED] = vpx_d135_predictor_4x4;
   pred[B_VR_PRED] = vpx_d117_predictor_4x4;
-  pred[B_VL_PRED] = vpx_d63f_predictor_4x4;
+  pred[B_VL_PRED] = vpx_d63e_predictor_4x4;
   pred[B_HD_PRED] = vpx_d153_predictor_4x4;
   pred[B_HU_PRED] = vpx_d207_predictor_4x4;
 }
@@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
                           int left_stride, B_PREDICTION_MODE b_mode,
                           unsigned char *dst, int dst_stride,
                           unsigned char top_left) {
-  unsigned char Aboveb[12], *Above = Aboveb + 4;
+/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from
+   Above (aka, Aboveb + 4). Play it safe by reserving enough stack
+   space here. Similary for "Left". */
+#if HAVE_VSX
+  unsigned char Aboveb[20];
+#else
+  unsigned char Aboveb[12];
+#endif
+  unsigned char *Above = Aboveb + 4;
 #if HAVE_NEON
   // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it
   // over reads but does not use the extra 4 values.
@@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
   // indeed read, they are not used.
   vp8_zero_array(Left, 8);
 #endif  // VPX_WITH_ASAN
+#elif HAVE_VSX
+  unsigned char Left[16];
 #else
   unsigned char Left[4];
 #endif  // HAVE_NEON
diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.h b/media/libvpx/libvpx/vp8/common/reconintra4x4.h
index e17fc58c01..3618ec5cbe 100644
--- a/media/libvpx/libvpx/vp8/common/reconintra4x4.h
+++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_RECONINTRA4X4_H_
-#define VP8_COMMON_RECONINTRA4X4_H_
+#ifndef VPX_VP8_COMMON_RECONINTRA4X4_H_
+#define VPX_VP8_COMMON_RECONINTRA4X4_H_
 #include "vp8/common/blockd.h"
 
 #ifdef __cplusplus
@@ -31,7 +31,7 @@ static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
   *dst_ptr2 = *src_ptr;
 }
 
-void vp8_intra4x4_predict(unsigned char *Above, unsigned char *yleft,
+void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft,
                           int left_stride, B_PREDICTION_MODE b_mode,
                           unsigned char *dst, int dst_stride,
                           unsigned char top_left);
@@ -42,4 +42,4 @@ void vp8_init_intra4x4_predictors_internal(void);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_RECONINTRA4X4_H_
+#endif  // VPX_VP8_COMMON_RECONINTRA4X4_H_
diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c
index 09a0e2b4b3..102b7ccd54 100644
--- a/media/libvpx/libvpx/vp8/common/rtcd.c
+++ b/media/libvpx/libvpx/vp8/common/rtcd.c
@@ -12,4 +12,4 @@
 #include "./vp8_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vp8_rtcd() { once(setup_rtcd_internal); }
+void vp8_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp8/common/rtcd_defs.pl b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl
index bc5e057999..12b474d939 100644
--- a/media/libvpx/libvpx/vp8/common/rtcd_defs.pl
+++ b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
 sub vp8_common_forward_decls() {
 print <<EOF
 /*
@@ -21,104 +31,102 @@ forward_decls qw/vp8_common_forward_decls/;
 #
 # Dequant
 #
-add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx neon msa/;
+add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *DQC";
+specialize qw/vp8_dequantize_b mmx neon msa mmi/;
 
-add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
-specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
+add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *dest, int stride";
+specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi lsx/;
 
 #
 # Loopfilter
 #
-add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa/;
+add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/;
 
-add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa/;
+add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi lsx/;
 
-add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa/;
+add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/;
 
-add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa/;
+add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi lsx/;
 
 
-add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa/;
+add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/;
 $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
 $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
 $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
 $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
+$vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi;
 
-add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa/;
+add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/;
 $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
 $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
 $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
 $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
+$vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi;
 
-add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv sse2 neon msa/;
+add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/;
 $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
 $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
 $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
 $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
+$vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi;
 
-add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh sse2 neon msa/;
+add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/;
 $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
 $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
 $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
 $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
+$vp8_loop_filter_simple_bh_mmi=vp8_loop_filter_bhs_mmi;
 
 #
 # IDCT
 #
 #idct16
-add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
-specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/;
+add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride";
+specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/;
 
 #iwalsh1
-add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output";
+add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *mb_dqcoeff";
 specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
 
 #iwalsh16
-add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
-specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa/;
+add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *mb_dqcoeff";
+specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/;
 
 #idct1_scalar_add
-add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
+add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride";
+specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/;
 
 #
 # RECON
 #
-add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa/;
+add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/;
 
-add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
+add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/;
 
-add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/;
+add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/;
 
 #
 # Postproc
 #
 if (vpx_config("CONFIG_POSTPROC") eq "yes") {
 
-    add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
-
-    add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
-
-    add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
-
     add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
     specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
 
@@ -131,29 +139,29 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
 #
 # Subpixel
 #
-add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/;
 
-add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/;
 
-add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/;
 
-add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/;
+add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi lsx/;
 
-add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
 
-add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/;
 
-add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
+add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
 
-add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;
+add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
+specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/;
 
 #
 # Encoder functions below this point.
@@ -163,40 +171,38 @@ if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
 #
 # Block copy
 #
-if ($opts{arch} =~ /x86/) {
-    add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n";
-    specialize qw/vp8_copy32xn sse2 sse3/;
-}
+add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height";
+specialize qw/vp8_copy32xn sse2 sse3/;
 
 #
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
+specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
 
 #
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
 
 #
 # Block subtraction
 #
 add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
-specialize qw/vp8_block_error sse2 msa/;
+specialize qw/vp8_block_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
-specialize qw/vp8_mbblock_error sse2 msa/;
+specialize qw/vp8_mbblock_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
 specialize qw/vp8_mbuverror sse2 msa/;
@@ -204,20 +210,16 @@ specialize qw/vp8_mbuverror sse2 msa/;
 #
 # Motion search
 #
-add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_full_search_sad sse3 sse4_1/;
-$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
-$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
-
-add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
 specialize qw/vp8_refining_search_sad sse2 msa/;
 $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
 $vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
 
 add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_diamond_search_sad sse2 msa/;
+specialize qw/vp8_diamond_search_sad sse2 msa lsx/;
 $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4;
 $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4;
+$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4;
 
 #
 # Alt-ref Noise Reduction (ARNR)
diff --git a/media/libvpx/libvpx/vp8/common/setupintrarecon.h b/media/libvpx/libvpx/vp8/common/setupintrarecon.h
index f3ffa16607..903a536aed 100644
--- a/media/libvpx/libvpx/vp8/common/setupintrarecon.h
+++ b/media/libvpx/libvpx/vp8/common/setupintrarecon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_SETUPINTRARECON_H_
-#define VP8_COMMON_SETUPINTRARECON_H_
+#ifndef VPX_VP8_COMMON_SETUPINTRARECON_H_
+#define VPX_VP8_COMMON_SETUPINTRARECON_H_
 
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
@@ -37,4 +37,4 @@ static INLINE void setup_intra_recon_left(unsigned char *y_buffer,
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_SETUPINTRARECON_H_
+#endif  // VPX_VP8_COMMON_SETUPINTRARECON_H_
diff --git a/media/libvpx/libvpx/vp8/common/swapyv12buffer.h b/media/libvpx/libvpx/vp8/common/swapyv12buffer.h
index 0ee9a52ceb..e37c471f63 100644
--- a/media/libvpx/libvpx/vp8/common/swapyv12buffer.h
+++ b/media/libvpx/libvpx/vp8/common/swapyv12buffer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
-#define VP8_COMMON_SWAPYV12BUFFER_H_
+#ifndef VPX_VP8_COMMON_SWAPYV12BUFFER_H_
+#define VPX_VP8_COMMON_SWAPYV12BUFFER_H_
 
 #include "vpx_scale/yv12config.h"
 
@@ -24,4 +24,4 @@ void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_SWAPYV12BUFFER_H_
+#endif  // VPX_VP8_COMMON_SWAPYV12BUFFER_H_
diff --git a/media/libvpx/libvpx/vp8/common/systemdependent.h b/media/libvpx/libvpx/vp8/common/systemdependent.h
index 3d44e37cf2..83a5513aae 100644
--- a/media/libvpx/libvpx/vp8/common/systemdependent.h
+++ b/media/libvpx/libvpx/vp8/common/systemdependent.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
-#define VP8_COMMON_SYSTEMDEPENDENT_H_
+#ifndef VPX_VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VPX_VP8_COMMON_SYSTEMDEPENDENT_H_
 
 #include "vpx_config.h"
 
@@ -24,4 +24,4 @@ void vp8_machine_specific_config(struct VP8Common *);
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_SYSTEMDEPENDENT_H_
+#endif  // VPX_VP8_COMMON_SYSTEMDEPENDENT_H_
diff --git a/media/libvpx/libvpx/vp8/common/threading.h b/media/libvpx/libvpx/vp8/common/threading.h
index ece64f3fb4..0de75cfde3 100644
--- a/media/libvpx/libvpx/vp8/common/threading.h
+++ b/media/libvpx/libvpx/vp8/common/threading.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_THREADING_H_
-#define VP8_COMMON_THREADING_H_
+#ifndef VPX_VP8_COMMON_THREADING_H_
+#define VPX_VP8_COMMON_THREADING_H_
 
 #include "./vpx_config.h"
 
@@ -19,223 +19,92 @@ extern "C" {
 
 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
-/* Thread management macros */
 #if defined(_WIN32) && !HAVE_PTHREAD_H
 /* Win32 */
-#include <process.h>
 #include <windows.h>
-#if defined(__GNUC__) && \
-    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREAD_FUNCTION \
-  __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREAD_FUNCTION unsigned int __stdcall
-#endif
-#define THREAD_FUNCTION_RETURN DWORD
-#define THREAD_SPECIFIC_INDEX DWORD
-#define pthread_t HANDLE
-#define pthread_attr_t DWORD
-#define pthread_detach(thread) \
-  if (thread != NULL) CloseHandle(thread)
-#define thread_sleep(nms) Sleep(nms)
-#define pthread_cancel(thread) terminate_thread(thread, 0)
-#define ts_key_create(ts_key, destructor) \
-  { ts_key = TlsAlloc(); };
-#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
-#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
-#define pthread_self() GetCurrentThreadId()
-
-#elif defined(__OS2__)
-/* OS/2 */
-#define INCL_DOS
-#include <os2.h>
-
-#include <stdlib.h>
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX PULONG
-#define pthread_t TID
-#define pthread_attr_t ULONG
-#define pthread_detach(thread) 0
-#define thread_sleep(nms) DosSleep(nms)
-#define pthread_cancel(thread) DosKillThread(thread)
-#define ts_key_create(ts_key, destructor) \
-  DosAllocThreadLocalMemory(1, &(ts_key));
-#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
-#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value))
-#define pthread_self() _gettid()
 #else
+/* pthreads */
 #ifdef __APPLE__
 #include <mach/mach_init.h>
 #include <mach/semaphore.h>
 #include <mach/task.h>
 #include <time.h>
 #include <unistd.h>
-
 #else
 #include <semaphore.h>
 #endif
-
-#include <pthread.h>
-/* pthreads */
-/* Nearly everything is already defined */
-#define THREAD_FUNCTION void *
-#define THREAD_FUNCTION_RETURN void *
-#define THREAD_SPECIFIC_INDEX pthread_key_t
-#define ts_key_create(ts_key, destructor) \
-  pthread_key_create(&(ts_key), destructor);
 #endif
 
 /* Synchronization macros: Win32 and Pthreads */
 #if defined(_WIN32) && !HAVE_PTHREAD_H
-#define sem_t HANDLE
-#define pause(voidpara) __asm PAUSE
-#define sem_init(sem, sem_attr1, sem_init_value) \
-  (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL)
-#define sem_wait(sem) \
+#define vp8_sem_t HANDLE
+#define vp8_sem_init(sem, pshared, value) \
+  (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL)
+#define vp8_sem_wait(sem) \
   (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE))
-#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
-#define sem_destroy(sem) \
+#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL)
+#define vp8_sem_destroy(sem) \
   if (*sem) ((int)(CloseHandle(*sem)) == TRUE)
 #define thread_sleep(nms) Sleep(nms)
 
-#elif defined(__OS2__)
-typedef struct {
-  HEV event;
-  HMTX wait_mutex;
-  HMTX count_mutex;
-  int count;
-} sem_t;
-
-static inline int sem_init(sem_t *sem, int pshared, unsigned int value) {
-  DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
-                    value > 0 ? TRUE : FALSE);
-  DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
-  DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
-
-  sem->count = value;
-
-  return 0;
-}
-
-static inline int sem_wait(sem_t *sem) {
-  DosRequestMutexSem(sem->wait_mutex, -1);
-
-  DosWaitEventSem(sem->event, -1);
-
-  DosRequestMutexSem(sem->count_mutex, -1);
-
-  sem->count--;
-  if (sem->count == 0) {
-    ULONG post_count;
-
-    DosResetEventSem(sem->event, &post_count);
-  }
-
-  DosReleaseMutexSem(sem->count_mutex);
-
-  DosReleaseMutexSem(sem->wait_mutex);
-
-  return 0;
-}
-
-static inline int sem_post(sem_t *sem) {
-  DosRequestMutexSem(sem->count_mutex, -1);
-
-  if (sem->count < 32768) {
-    sem->count++;
-    DosPostEventSem(sem->event);
-  }
-
-  DosReleaseMutexSem(sem->count_mutex);
-
-  return 0;
-}
-
-static inline int sem_destroy(sem_t *sem) {
-  DosCloseEventSem(sem->event);
-  DosCloseMutexSem(sem->wait_mutex);
-  DosCloseMutexSem(sem->count_mutex);
-
-  return 0;
-}
-
-#define thread_sleep(nms) DosSleep(nms)
-
 #else
 
 #ifdef __APPLE__
-#define sem_t semaphore_t
-#define sem_init(X, Y, Z) \
-  semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
-#define sem_wait(sem) (semaphore_wait(*sem))
-#define sem_post(sem) semaphore_signal(*sem)
-#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
-#define thread_sleep(nms)
-/* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec =
-   1000*nms;nanosleep(&ts, NULL);} */
+#define vp8_sem_t semaphore_t
+#define vp8_sem_init(sem, pshared, value) \
+  semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value)
+#define vp8_sem_wait(sem) semaphore_wait(*sem)
+#define vp8_sem_post(sem) semaphore_signal(*sem)
+#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem)
 #else
+#include <errno.h>
 #include <unistd.h>
 #include <sched.h>
-#define thread_sleep(nms) sched_yield();
-/* {struct timespec ts;ts.tv_sec=0;
-    ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
-#endif
+#define vp8_sem_t sem_t
+#define vp8_sem_init sem_init
+static INLINE int vp8_sem_wait(vp8_sem_t *sem) {
+  int ret;
+  while ((ret = sem_wait(sem)) == -1 && errno == EINTR) {
+  }
+  return ret;
+}
+#define vp8_sem_post sem_post
+#define vp8_sem_destroy sem_destroy
+#endif /* __APPLE__ */
 /* Not Windows. Assume pthreads */
 
+/* thread_sleep implementation: yield unless Linux/Unix. */
+#if defined(__unix__) || defined(__APPLE__)
+#define thread_sleep(nms)
+/* {struct timespec ts;ts.tv_sec=0;
+    ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#define thread_sleep(nms) sched_yield();
+#endif /* __unix__ || __APPLE__ */
+
 #endif
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
 #include "vpx_ports/x86.h"
 #else
 #define x86_pause_hint()
 #endif
 
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define USE_MUTEX_LOCK 1
-#endif
-#endif
+#include "vpx_util/vpx_atomics.h"
 
-#include "vpx_util/vpx_thread.h"
-
-static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
-  (void)mutex;
-#if defined(USE_MUTEX_LOCK)
-  int ret;
-  pthread_mutex_lock(mutex);
-  ret = *p;
-  pthread_mutex_unlock(mutex);
-  return ret;
-#endif
-  return *p;
-}
-
-static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
-                             const int *last_row_current_mb_col,
-                             const int nsync) {
-  while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+static INLINE void vp8_atomic_spin_wait(
+    int mb_col, const vpx_atomic_int *last_row_current_mb_col,
+    const int nsync) {
+  while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) {
     x86_pause_hint();
     thread_sleep(0);
   }
 }
 
-static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
-  (void)mutex;
-#if defined(USE_MUTEX_LOCK)
-  pthread_mutex_lock(mutex);
-  *p = v;
-  pthread_mutex_unlock(mutex);
-  return;
-#endif
-  *p = v;
-}
-
-#undef USE_MUTEX_LOCK
 #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_THREADING_H_
+#endif  // VPX_VP8_COMMON_THREADING_H_
diff --git a/media/libvpx/libvpx/vp8/common/treecoder.c b/media/libvpx/libvpx/vp8/common/treecoder.c
index 9feb40a5a7..f1e78f4321 100644
--- a/media/libvpx/libvpx/vp8/common/treecoder.c
+++ b/media/libvpx/libvpx/vp8/common/treecoder.c
@@ -12,6 +12,7 @@
 #include <stdio.h>
 
 #include "vp8/common/treecoder.h"
+#include "vpx/vpx_integer.h"
 
 static void tree2tok(struct vp8_token_struct *const p, vp8_tree t, int i, int v,
                      int L) {
@@ -79,7 +80,7 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */
                                       vp8_prob probs[/* n-1 */],
                                       unsigned int branch_ct[/* n-1 */][2],
                                       const unsigned int num_events[/* n */],
-                                      unsigned int Pfac, int rd) {
+                                      unsigned int Pfactor, int Round) {
   const int tree_len = n - 1;
   int t = 0;
 
@@ -89,10 +90,10 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */
     const unsigned int *const c = branch_ct[t];
     const unsigned int tot = c[0] + c[1];
 
-    assert(tot < (1 << 24)); /* no overflow below */
-
     if (tot) {
-      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+      const unsigned int p =
+          (unsigned int)(((uint64_t)c[0] * Pfactor) + (Round ? tot >> 1 : 0)) /
+          tot;
       probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
     } else {
       probs[t] = vp8_prob_half;
diff --git a/media/libvpx/libvpx/vp8/common/treecoder.h b/media/libvpx/libvpx/vp8/common/treecoder.h
index d8503cf3f8..d7d8d0ead0 100644
--- a/media/libvpx/libvpx/vp8/common/treecoder.h
+++ b/media/libvpx/libvpx/vp8/common/treecoder.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_COMMON_TREECODER_H_
-#define VP8_COMMON_TREECODER_H_
+#ifndef VPX_VP8_COMMON_TREECODER_H_
+#define VPX_VP8_COMMON_TREECODER_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,7 @@ typedef const bool_coder_spec c_bool_coder_spec;
 typedef const bool_writer c_bool_writer;
 typedef const bool_reader c_bool_reader;
 
-#define vp8_complement(x) (255 - x)
+#define vp8_complement(x) (255 - (x))
 
 /* We build coding trees compactly in arrays.
    Each node of the tree is a pair of vp8_tree_indices.
@@ -79,4 +79,4 @@ void vp8bc_tree_probs_from_distribution(int n, /* n = size of alphabet */
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_TREECODER_H_
+#endif  // VPX_VP8_COMMON_TREECODER_H_
diff --git a/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h
index 9a81ebfe62..3fc942e050 100644
--- a/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h
+++ b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h
@@ -6,10 +6,10 @@
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
-*/
+ */
 
-#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
-#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#ifndef VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -169,4 +169,4 @@ const vp8_prob
 }  // extern "C"
 #endif
 
-#endif  // VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#endif  // VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_
diff --git a/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c
index c6430be465..4576c18537 100644
--- a/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c
+++ b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c
@@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd,
 
     /* Note the baseline filter values for each segment */
     if (mbd->segmentation_enabled) {
-      /* Abs value */
-      if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+      if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
         lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
-      } else /* Delta Value */
-      {
+      } else { /* Delta Value */
         lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
       }
       lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;
@@ -221,13 +219,11 @@ void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
 }
 
 void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
-                                int mb_row, int post_ystride, int post_uvstride,
-                                unsigned char *y_ptr, unsigned char *u_ptr,
-                                unsigned char *v_ptr) {
+                                int mb_row, int post_ystride,
+                                unsigned char *y_ptr) {
   int mb_col;
   int filter_level;
   loop_filter_info_n *lfi_n = &cm->lf_info;
-  (void)post_uvstride;
 
   for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
     int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
@@ -260,8 +256,6 @@ void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
     }
 
     y_ptr += 16;
-    u_ptr += 8;
-    v_ptr += 8;
 
     mode_info_context++; /* step to next MB */
   }
@@ -344,8 +338,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) {
 
       mode_info_context++; /* Skip border mb */
     }
-  } else /* SIMPLE_LOOPFILTER */
-  {
+  } else { /* SIMPLE_LOOPFILTER */
     for (mb_row = 0; mb_row < mb_rows; ++mb_row) {
       for (mb_col = 0; mb_col < mb_cols; ++mb_col) {
         int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c
new file mode 100644
index 0000000000..6739efa5fe
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/vp8_skin_detection.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+static int avg_2x2(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 2; ++i, s += p) {
+    for (j = 0; j < 2; ++j) {
+      sum += s[j];
+    }
+  }
+  return (sum + 2) >> 2;
+}
+
+int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv,
+                           SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv,
+                           int curr_motion_magn) {
+  // No skin if block has been zero/small motion for long consecutive time.
+  if (consec_zeromv > 60 && curr_motion_magn == 0) {
+    return 0;
+  } else {
+    int motion = 1;
+    if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
+    if (bsize == SKIN_16X16) {
+      // Take the average of center 2x2 pixels.
+      const int ysource = avg_2x2(y + 7 * stride + 7, stride);
+      const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv);
+      const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv);
+      return vpx_skin_pixel(ysource, usource, vsource, motion);
+    } else {
+      int num_skin = 0;
+      int i, j;
+      for (i = 0; i < 2; i++) {
+        for (j = 0; j < 2; j++) {
+          // Take the average of center 2x2 pixels.
+          const int ysource = avg_2x2(y + 3 * stride + 3, stride);
+          const int usource = avg_2x2(u + strideuv + 1, strideuv);
+          const int vsource = avg_2x2(v + strideuv + 1, strideuv);
+          num_skin += vpx_skin_pixel(ysource, usource, vsource, motion);
+          if (num_skin >= 2) return 1;
+          y += 8;
+          u += 4;
+          v += 4;
+        }
+        y += (stride << 3) - 16;
+        u += (strideuv << 2) - 8;
+        v += (strideuv << 2) - 8;
+      }
+
+      return 0;
+    }
+  }
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mb_row, mb_col, num_bl;
+  VP8_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  int offset = 0;
+
+  YV12_BUFFER_CONFIG skinmap;
+  memset(&skinmap, 0, sizeof(skinmap));
+  if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height,
+                                  VP8BORDERINPIXELS) < 0) {
+    vpx_free_frame_buffer(&skinmap);
+    return;
+  }
+  memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+  y = skinmap.y_buffer;
+  // Loop through blocks and set skin map based on center pixel of block.
+  // Set y to white for skin block, otherwise set to source with gray scale.
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
+    num_bl = 0;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
+      const int is_skin = cpi->skin_map[offset++];
+      for (i = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++) {
+          y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
+        }
+      }
+      num_bl++;
+      y += 16;
+      src_y += 16;
+    }
+    y += (src_ystride << 4) - (num_bl << 4);
+    src_y += (src_ystride << 4) - (num_bl << 4);
+  }
+  vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
+  vpx_free_frame_buffer(&skinmap);
+}
+#endif  // OUTPUT_YUV_SKINMAP
diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h
new file mode 100644
index 0000000000..ef0e4ae4fe
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_
+#define VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_
+
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+typedef enum {
+  // Skin detection based on 8x8 block. If two of them are identified as skin,
+  // the macroblock is marked as skin.
+  SKIN_8X8,
+  // Skin detection based on 16x16 block.
+  SKIN_16X16
+} SKIN_DETECTION_BLOCK_SIZE;
+
+int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv,
+                           SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv,
+                           int curr_motion_magn);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_
diff --git a/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c
new file mode 100644
index 0000000000..ff6cbbd68c
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c
@@ -0,0 +1,336 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <xmmintrin.h>
+
+#include "./vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "vp8/common/filter.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void horizontal_16x16(uint8_t *src, const int stride,
+                                    uint16_t *dst, const int xoffset) {
+  int h;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (xoffset == 0) {
+    for (h = 0; h < 17; ++h) {
+      const __m128i a = _mm_loadu_si128((__m128i *)src);
+      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
+      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
+      _mm_store_si128((__m128i *)dst, a_lo);
+      _mm_store_si128((__m128i *)(dst + 8), a_hi);
+      src += stride;
+      dst += 16;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    for (h = 0; h < 17; ++h) {
+      const __m128i a = _mm_loadu_si128((__m128i *)src);
+      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
+      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
+      const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0);
+      const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0);
+
+      const __m128i b = _mm_loadu_si128((__m128i *)(src + 1));
+      const __m128i b_lo = _mm_unpacklo_epi8(b, zero);
+      const __m128i b_hi = _mm_unpackhi_epi8(b, zero);
+      const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1);
+      const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1);
+
+      const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered);
+      const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered);
+
+      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
+      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);
+
+      const __m128i shifted_lo =
+          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
+      const __m128i shifted_hi =
+          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);
+
+      _mm_store_si128((__m128i *)dst, shifted_lo);
+      _mm_store_si128((__m128i *)(dst + 8), shifted_hi);
+      src += stride;
+      dst += 16;
+    }
+  }
+}
+
+static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
+                                  const int yoffset) {
+  int h;
+
+  if (yoffset == 0) {
+    for (h = 0; h < 16; ++h) {
+      const __m128i row_lo = _mm_load_si128((__m128i *)src);
+      const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8));
+      const __m128i packed = _mm_packus_epi16(row_lo, row_hi);
+      _mm_store_si128((__m128i *)dst, packed);
+      src += 16;
+      dst += stride;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    __m128i row_0_lo = _mm_load_si128((__m128i *)src);
+    __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8));
+    src += 16;
+    for (h = 0; h < 16; ++h) {
+      const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0);
+      const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0);
+
+      const __m128i row_1_lo = _mm_load_si128((__m128i *)src);
+      const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8));
+      const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1);
+      const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1);
+
+      const __m128i sum_lo =
+          _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered);
+      const __m128i sum_hi =
+          _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered);
+
+      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
+      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);
+
+      const __m128i shifted_lo =
+          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
+      const __m128i shifted_hi =
+          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);
+
+      const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi);
+      _mm_store_si128((__m128i *)dst, packed);
+      row_0_lo = row_1_lo;
+      row_0_hi = row_1_hi;
+      src += 16;
+      dst += stride;
+    }
+  }
+}
+
+void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                    int xoffset, int yoffset, uint8_t *dst_ptr,
+                                    int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);
+
+  vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
+}
+
+static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset, const int height) {
+  int h;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (xoffset == 0) {
+    for (h = 0; h < height; ++h) {
+      const __m128i a = _mm_loadl_epi64((__m128i *)src);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      _mm_store_si128((__m128i *)dst, a_u16);
+      src += stride;
+      dst += 8;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    // Filter horizontally. Rather than load the whole array and transpose, load
+    // 16 values (overreading) and shift to set up the second value. Do an
+    // "extra" 9th line so the vertical pass has the necessary context.
+    for (h = 0; h < height; ++h) {
+      const __m128i a = _mm_loadu_si128((__m128i *)src);
+      const __m128i b = _mm_srli_si128(a, 1);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
+      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
+      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
+      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      _mm_store_si128((__m128i *)dst, shifted);
+      src += stride;
+      dst += 8;
+    }
+  }
+}
+
+static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset, const int height) {
+  int h;
+
+  if (yoffset == 0) {
+    for (h = 0; h < height; ++h) {
+      const __m128i row = _mm_load_si128((__m128i *)src);
+      const __m128i packed = _mm_packus_epi16(row, row);
+      _mm_storel_epi64((__m128i *)dst, packed);
+      src += 8;
+      dst += stride;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    __m128i row_0 = _mm_load_si128((__m128i *)src);
+    src += 8;
+    for (h = 0; h < height; ++h) {
+      const __m128i row_1 = _mm_load_si128((__m128i *)src);
+      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
+      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
+      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      const __m128i packed = _mm_packus_epi16(shifted, shifted);
+      _mm_storel_epi64((__m128i *)dst, packed);
+      row_0 = row_1;
+      src += 8;
+      dst += stride;
+    }
+  }
+}
+
+void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);
+
+  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
+}
+
+void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);
+
+  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
+}
+
+static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset) {
+  int h;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (xoffset == 0) {
+    for (h = 0; h < 5; ++h) {
+      const __m128i a = load_unaligned_u32(src);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      _mm_storel_epi64((__m128i *)dst, a_u16);
+      src += stride;
+      dst += 4;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    for (h = 0; h < 5; ++h) {
+      const __m128i a = load_unaligned_u32(src);
+      const __m128i b = load_unaligned_u32(src + 1);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
+      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
+      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
+      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      _mm_storel_epi64((__m128i *)dst, shifted);
+      src += stride;
+      dst += 4;
+    }
+  }
+}
+
+static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset) {
+  int h;
+
+  if (yoffset == 0) {
+    for (h = 0; h < 4; h += 2) {
+      const __m128i row = _mm_load_si128((__m128i *)src);
+      __m128i packed = _mm_packus_epi16(row, row);
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      packed = _mm_srli_si128(packed, 4);
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      src += 8;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    for (h = 0; h < 4; h += 2) {
+      const __m128i row_0 = _mm_load_si128((__m128i *)src);
+      const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
+      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
+      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
+      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      __m128i packed = _mm_packus_epi16(shifted, shifted);
+      storeu_int32(dst, _mm_cvtsi128_si32(packed));
+      packed = _mm_srli_si128(packed, 4);
+      dst += stride;
+      storeu_int32(dst, _mm_cvtsi128_si32(packed));
+      dst += stride;
+      src += 8;
+    }
+  }
+}
+
+void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]);
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);
+
+  vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
+}
diff --git a/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm
index 4e551f00aa..0a269e15f7 100644
--- a/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -11,9 +11,10 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
 
 ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+globalsym(vp8_dequantize_b_impl_mmx)
 sym(vp8_dequantize_b_impl_mmx):
     push        rbp
     mov         rbp, rsp
@@ -55,7 +56,7 @@ sym(vp8_dequantize_b_impl_mmx):
 ;short *dq,               1
 ;unsigned char *dest,     2
 ;int stride)              3
-global sym(vp8_dequant_idct_add_mmx) PRIVATE
+globalsym(vp8_dequant_idct_add_mmx)
 sym(vp8_dequant_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/filter_x86.c b/media/libvpx/libvpx/vp8/common/x86/filter_x86.c
deleted file mode 100644
index 2405342f02..0000000000
--- a/media/libvpx/libvpx/vp8/common/x86/filter_x86.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/x86/filter_x86.h"
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
-  { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 },
-  { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 },
-  { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 },
-  { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
-  { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
-  { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
-  { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
-  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-  { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
-  { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
-  { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
diff --git a/media/libvpx/libvpx/vp8/common/x86/filter_x86.h b/media/libvpx/libvpx/vp8/common/x86/filter_x86.h
deleted file mode 100644
index d282841bee..0000000000
--- a/media/libvpx/libvpx/vp8/common/x86/filter_x86.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP8_COMMON_X86_FILTER_X86_H_
-#define VP8_COMMON_X86_FILTER_X86_H_
-
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
- * duplicated values */
-
-/* duplicated 4x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
-
-/* duplicated 8x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP8_COMMON_X86_FILTER_X86_H_
diff --git a/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c
index 8aefb27997..897ed5b652 100644
--- a/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c
+++ b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -42,43 +42,43 @@ void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst,
 }
 
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq,
-                                        unsigned char *dstu,
-                                        unsigned char *dstv, int stride,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v, int stride,
                                         char *eobs) {
   if (((short *)(eobs))[0]) {
     if (((short *)(eobs))[0] & 0xfefe) {
-      vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride);
+      vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride);
     } else {
-      vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride);
+      vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride);
     }
   }
   q += 32;
-  dstu += stride * 4;
+  dst_u += stride * 4;
 
   if (((short *)(eobs))[1]) {
     if (((short *)(eobs))[1] & 0xfefe) {
-      vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride);
+      vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride);
     } else {
-      vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride);
+      vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride);
     }
   }
   q += 32;
 
   if (((short *)(eobs))[2]) {
     if (((short *)(eobs))[2] & 0xfefe) {
-      vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride);
+      vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride);
     } else {
-      vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride);
+      vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride);
     }
   }
   q += 32;
-  dstv += stride * 4;
+  dst_v += stride * 4;
 
   if (((short *)(eobs))[3]) {
     if (((short *)(eobs))[3] & 0xfefe) {
-      vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride);
+      vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride);
     } else {
-      vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride);
+      vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride);
     }
   }
 }
diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm
index 96fa2c60d0..6cea86fe03 100644
--- a/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -31,10 +31,11 @@
 ; *
 ; **************************************************************************/
 
+SECTION .text
 
 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
 ;int pitch, unsigned char *dest,int stride)
-global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+globalsym(vp8_short_idct4x4llm_mmx)
 sym(vp8_short_idct4x4llm_mmx):
     push        rbp
     mov         rbp, rsp
@@ -224,7 +225,7 @@ sym(vp8_short_idct4x4llm_mmx):
 ;int pred_stride,
 ;unsigned char *dst_ptr,
 ;int stride)
-global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+globalsym(vp8_dc_only_idct_add_mmx)
 sym(vp8_dc_only_idct_add_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm
index bf8e2c4021..bb79d2da3b 100644
--- a/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -19,7 +19,9 @@
 ;   int dst_stride      - 3
 ; )
 
-global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+SECTION .text
+
+globalsym(vp8_idct_dequant_0_2x_sse2)
 sym(vp8_idct_dequant_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -101,7 +103,7 @@ sym(vp8_idct_dequant_0_2x_sse2):
 ;   unsigned char *dst  - 2
 ;   int dst_stride      - 3
 ; )
-global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+globalsym(vp8_idct_dequant_full_2x_sse2)
 sym(vp8_idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -358,7 +360,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
 ;   int dst_stride      - 3
 ;   short *dc           - 4
 ; )
-global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+globalsym(vp8_idct_dequant_dc_0_2x_sse2)
 sym(vp8_idct_dequant_dc_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
@@ -434,7 +436,7 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
 ;   int dst_stride      - 3
 ;   short *dc           - 4
 ; )
-global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+globalsym(vp8_idct_dequant_dc_full_2x_sse2)
 sym(vp8_idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm
index 06e86a80b6..56f37c3e0f 100644
--- a/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -11,8 +11,10 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+SECTION .text
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff)
+globalsym(vp8_short_inv_walsh4x4_sse2)
 sym(vp8_short_inv_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
index 6d5aaa19db..8d12f5385d 100644
--- a/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
@@ -125,6 +125,8 @@
         pxor        %1, [GLOBAL(t80)]
 %endmacro
 
+SECTION .text
+
 ;void vp8_loop_filter_bh_y_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -133,7 +135,7 @@
 ;    const char    *limit,
 ;    const char    *thresh
 ;)
-global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+globalsym(vp8_loop_filter_bh_y_sse2)
 sym(vp8_loop_filter_bh_y_sse2):
 
 %if LIBVPX_YASM_WIN64
@@ -275,7 +277,7 @@ LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
 ;    const char    *thresh
 ;)
 
-global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+globalsym(vp8_loop_filter_bv_y_sse2)
 sym(vp8_loop_filter_bv_y_sse2):
 
 %if LIBVPX_YASM_WIN64
diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm
index 1913abc69b..ce5c313138 100644
--- a/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -276,6 +276,8 @@
 
 %endmacro
 
+SECTION .text
+
 %if ABI_IS_32BIT
 
 ;void vp8_loop_filter_horizontal_edge_sse2
@@ -286,7 +288,7 @@
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+globalsym(vp8_loop_filter_horizontal_edge_sse2)
 sym(vp8_loop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -334,7 +336,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
 ;    const char    *thresh,
 ;    int            count
 ;)
-global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+globalsym(vp8_loop_filter_horizontal_edge_uv_sse2)
 sym(vp8_loop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -561,7 +563,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+globalsym(vp8_mbloop_filter_horizontal_edge_sse2)
 sym(vp8_mbloop_filter_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -607,7 +609,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+globalsym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
 sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -928,7 +930,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+globalsym(vp8_loop_filter_vertical_edge_sse2)
 sym(vp8_loop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -993,7 +995,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+globalsym(vp8_loop_filter_vertical_edge_uv_sse2)
 sym(vp8_loop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1142,7 +1144,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
 ;    const char    *limit,
 ;    const char    *thresh,
 ;)
-global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+globalsym(vp8_mbloop_filter_vertical_edge_sse2)
 sym(vp8_mbloop_filter_vertical_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1209,7 +1211,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
 ;    const char    *thresh,
 ;    unsigned char *v
 ;)
-global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+globalsym(vp8_mbloop_filter_vertical_edge_uv_sse2)
 sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1269,7 +1271,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+globalsym(vp8_loop_filter_simple_horizontal_edge_sse2)
 sym(vp8_loop_filter_simple_horizontal_edge_sse2):
     push        rbp
     mov         rbp, rsp
@@ -1374,7 +1376,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
 ;    int  src_pixel_step,
 ;    const char *blimit,
 ;)
-global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+globalsym(vp8_loop_filter_simple_vertical_edge_sse2)
 sym(vp8_loop_filter_simple_vertical_edge_sse2):
     push        rbp         ; save old base pointer value.
     mov         rbp, rsp    ; set new base pointer value.
diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c
index a187d51fbe..cfa13a2ddb 100644
--- a/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c
+++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -22,7 +22,7 @@
 #define prototype_simple_loopfilter(sym) \
   void sym(unsigned char *y, int ystride, const unsigned char *blimit)
 
-#if HAVE_SSE2 && ARCH_X86_64
+#if HAVE_SSE2 && VPX_ARCH_X86_64
 prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
 prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
 #else
@@ -68,7 +68,7 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              loop_filter_info *lfi) {
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
   vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr,
                             2);
 #else
@@ -101,7 +101,7 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
 void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              loop_filter_info *lfi) {
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
   vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr,
                             2);
 #else
diff --git a/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm
index 8177b79226..3ec2a99ec2 100644
--- a/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -11,6 +11,8 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void vp8_filter_by_weight16x16_sse2
 ;(
 ;    unsigned char *src,
@@ -19,7 +21,7 @@
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
-global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
+globalsym(vp8_filter_by_weight16x16_sse2)
 sym(vp8_filter_by_weight16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -97,7 +99,7 @@ sym(vp8_filter_by_weight16x16_sse2):
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
-global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
+globalsym(vp8_filter_by_weight8x8_sse2)
 sym(vp8_filter_by_weight8x8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -165,7 +167,7 @@ sym(vp8_filter_by_weight8x8_sse2):
 ;    unsigned int  *variance,      4
 ;    unsigned int  *sad,           5
 ;)
-global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
+globalsym(vp8_variance_and_sad_16x16_sse2)
 sym(vp8_variance_and_sad_16x16_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm
index 43f2dc6c62..01cf066837 100644
--- a/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm
@@ -11,6 +11,7 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
 
 ;void copy_mem8x8_mmx(
 ;    unsigned char *src,
@@ -18,7 +19,7 @@
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem8x8_mmx) PRIVATE
+globalsym(vp8_copy_mem8x8_mmx)
 sym(vp8_copy_mem8x8_mmx):
     push        rbp
     mov         rbp, rsp
@@ -81,7 +82,7 @@ sym(vp8_copy_mem8x8_mmx):
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem8x4_mmx) PRIVATE
+globalsym(vp8_copy_mem8x4_mmx)
 sym(vp8_copy_mem8x4_mmx):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm
index cb89537f76..17baf094ef 100644
--- a/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm
@@ -11,13 +11,15 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void copy_mem16x16_sse2(
 ;    unsigned char *src,
 ;    int src_stride,
 ;    unsigned char *dst,
 ;    int dst_stride
 ;    )
-global sym(vp8_copy_mem16x16_sse2) PRIVATE
+globalsym(vp8_copy_mem16x16_sse2)
 sym(vp8_copy_mem16x16_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm
index 6ab7f1fdc7..8f0f6fcc89 100644
--- a/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -10,13 +10,12 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
-
 
 %define BLOCK_HEIGHT_WIDTH 4
 %define vp8_filter_weight 128
 %define VP8_FILTER_SHIFT  7
 
+SECTION .text
 
 ;void vp8_filter_block1d_h6_mmx
 ;(
@@ -28,7 +27,7 @@ extern sym(vp8_bilinear_filters_x86_8)
 ;    unsigned int    output_width,
 ;    short           * vp8_filter
 ;)
-global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+globalsym(vp8_filter_block1d_h6_mmx)
 sym(vp8_filter_block1d_h6_mmx):
     push        rbp
     mov         rbp, rsp
@@ -125,7 +124,7 @@ sym(vp8_filter_block1d_h6_mmx):
 ;   unsigned int output_width,
 ;   short * vp8_filter
 ;)
-global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+globalsym(vp8_filter_block1dc_v6_mmx)
 sym(vp8_filter_block1dc_v6_mmx):
     push        rbp
     mov         rbp, rsp
@@ -204,280 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx):
     ret
 
 
-;void bilinear_predict8x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
-sym(vp8_bilinear_predict8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        shl         rax,        5
-
-        mov         rsi,        arg(0) ;src_ptr              ;
-        add         rax,        rcx
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP8_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x4:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP8_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP8_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void bilinear_predict4x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
-sym(vp8_bilinear_predict4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        shl         rax,        5
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;ldst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm0                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_4x4:
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-
-        movq        mm5,        mm7                 ;
-        punpcklbw   mm5,        mm0                 ;
-
-        pmullw      mm5,        [rax]               ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-        movq        mm7,        mm3                 ;
-
-        packuswb    mm7,        mm0                 ;
-
-        pmullw      mm3,        [rax+16]            ;
-        paddw       mm3,        mm5                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    mm3,        mm0
-        movd        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_4x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
 SECTION_RODATA
 align 16
 rd:
diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm
index ca00583ca2..94e14aed6c 100644
--- a/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -10,12 +10,12 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
 
 %define BLOCK_HEIGHT_WIDTH 4
 %define VP8_FILTER_WEIGHT 128
 %define VP8_FILTER_SHIFT  7
 
+SECTION .text
 
 ;/************************************************************************************
 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
@@ -33,7 +33,7 @@ extern sym(vp8_bilinear_filters_x86_8)
 ;    unsigned int    output_width,
 ;    short           *vp8_filter
 ;)
-global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+globalsym(vp8_filter_block1d8_h6_sse2)
 sym(vp8_filter_block1d8_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -153,7 +153,7 @@ sym(vp8_filter_block1d8_h6_sse2):
 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
 ; rows each iteration to take advantage of the 128 bits operations.
 ;*************************************************************************************/
-global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+globalsym(vp8_filter_block1d16_h6_sse2)
 sym(vp8_filter_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -333,7 +333,7 @@ sym(vp8_filter_block1d16_h6_sse2):
 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+globalsym(vp8_filter_block1d8_v6_sse2)
 sym(vp8_filter_block1d8_v6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -428,7 +428,7 @@ sym(vp8_filter_block1d8_v6_sse2):
 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
 ; input pixel array has output_height rows.
 ;*************************************************************************************/
-global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+globalsym(vp8_filter_block1d16_v6_sse2)
 sym(vp8_filter_block1d16_v6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -538,7 +538,7 @@ sym(vp8_filter_block1d16_v6_sse2):
 ;    const short    *vp8_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+globalsym(vp8_filter_block1d8_h6_only_sse2)
 sym(vp8_filter_block1d8_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -651,7 +651,7 @@ sym(vp8_filter_block1d8_h6_only_sse2):
 ;    const short    *vp8_filter
 ;)
 ; First-pass filter only when yoffset==0
-global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+globalsym(vp8_filter_block1d16_h6_only_sse2)
 sym(vp8_filter_block1d16_h6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -816,7 +816,7 @@ sym(vp8_filter_block1d16_h6_only_sse2):
 ;    const short    *vp8_filter
 ;)
 ; Second-pass filter only when xoffset==0
-global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+globalsym(vp8_filter_block1d8_v6_only_sse2)
 sym(vp8_filter_block1d8_v6_only_sse2):
     push        rbp
     mov         rbp, rsp
@@ -908,7 +908,7 @@ sym(vp8_filter_block1d8_v6_only_sse2):
 ;    unsigned int    output_height,
 ;    unsigned int    output_width
 ;)
-global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+globalsym(vp8_unpack_block1d16_h6_sse2)
 sym(vp8_unpack_block1d16_h6_sse2):
     push        rbp
     mov         rbp, rsp
@@ -957,419 +957,6 @@ sym(vp8_unpack_block1d16_h6_sse2):
     ret
 
 
-;void vp8_bilinear_predict16x16_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp8_bilinear_filters_x86_8)
-global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
-sym(vp8_bilinear_predict16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-
-        cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-%endif
-        ; get the first horizontal line done
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        add         rsi,        rdx                 ; next line
-.next_row:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       [rax]
-        pmullw      xmm6,       [rax]
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        pmullw      xmm3,       [rax+16]
-        pmullw      xmm4,       [rax+16]
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rdx                 ; next line
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        add         rsi,        rax                 ; next line
-.next_row_spo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        movdqa      xmm7,       xmm3
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm5,       xmm1
-        pmullw      xmm6,       xmm1
-        pmullw      xmm3,       xmm2
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ;dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_spo
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-        pxor        xmm0,       xmm0
-
-.next_row_fpo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ; dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_fpo
-
-.done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_bilinear_predict8x8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
-sym(vp8_bilinear_predict8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9-line unaligned data in and put them on stack. This gives a big
-    ;performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm5,       [rax]
-        movdqa      xmm6,       [rax+16]
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       XMMWORD PTR [rsp]
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        add         rsp,        16                 ; next line
-.next_row8x8:
-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-        pmullw      xmm7,       xmm5
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm4,       xmm3
-
-        pmullw      xmm3,       xmm6
-        paddw       xmm3,       xmm7
-
-        movdqa      xmm7,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    xmm3,       xmm0
-        movq        [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsp,        16                 ; next line
-        add         rdi,        rdx
-
-        cmp         rdi,        rcx
-        jne         .next_row8x8
-
-    ;add rsp, 144
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd:
diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm
index 1f6cbd1d1e..17247227db 100644
--- a/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm
+++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -15,6 +15,7 @@
 %define VP8_FILTER_WEIGHT 128
 %define VP8_FILTER_SHIFT  7
 
+SECTION .text
 
 ;/************************************************************************************
 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
@@ -34,7 +35,7 @@
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
-global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+globalsym(vp8_filter_block1d8_h6_ssse3)
 sym(vp8_filter_block1d8_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -177,7 +178,7 @@ vp8_filter_block1d8_h4_ssse3:
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
-global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+globalsym(vp8_filter_block1d16_h6_ssse3)
 sym(vp8_filter_block1d16_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -284,7 +285,7 @@ sym(vp8_filter_block1d16_h6_ssse3):
 ;    unsigned int    output_height,
 ;    unsigned int    vp8_filter_index
 ;)
-global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+globalsym(vp8_filter_block1d4_h6_ssse3)
 sym(vp8_filter_block1d4_h6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -414,7 +415,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
-global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+globalsym(vp8_filter_block1d16_v6_ssse3)
 sym(vp8_filter_block1d16_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -602,7 +603,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
-global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+globalsym(vp8_filter_block1d8_v6_ssse3)
 sym(vp8_filter_block1d8_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -742,7 +743,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
 ;    unsigned int   output_height,
 ;    unsigned int   vp8_filter_index
 ;)
-global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+globalsym(vp8_filter_block1d4_v6_ssse3)
 sym(vp8_filter_block1d4_v6_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -881,7 +882,7 @@ sym(vp8_filter_block1d4_v6_ssse3):
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+globalsym(vp8_bilinear_predict16x16_ssse3)
 sym(vp8_bilinear_predict16x16_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -1144,7 +1145,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+globalsym(vp8_bilinear_predict8x8_ssse3)
 sym(vp8_bilinear_predict8x8_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c
index b9d087e20d..7fb83c2d5e 100644
--- a/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c
+++ b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -11,7 +11,6 @@
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "filter_x86.h"
 
 extern const short vp8_six_tap_x86[8][6 * 8];
 
@@ -95,9 +94,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset, unsigned char *dst_ptr,
-                                  int dst_pitch
-
-                                  ) {
+                                  int dst_pitch) {
   DECLARE_ALIGNED(16, unsigned short,
                   FData2[24 * 24]); /* Temp data bufffer used in filtering */
 
@@ -236,9 +233,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
                                    int src_pixels_per_line, int xoffset,
                                    int yoffset, unsigned char *dst_ptr,
-                                   int dst_pitch
-
-                                   ) {
+                                   int dst_pitch) {
   DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
 
   if (xoffset) {
@@ -351,8 +346,8 @@ void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
                                    yoffset);
     } else {
       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
-        * yoffset==0) case correctly. Add copy function here to guarantee
-        * six-tap function handles all possible offsets. */
+       * yoffset==0) case correctly. Add copy function here to guarantee
+       * six-tap function handles all possible offsets. */
       int r;
 
       for (r = 0; r < 4; ++r) {
diff --git a/media/libvpx/libvpx/vp8/decoder/dboolhuff.c b/media/libvpx/libvpx/vp8/decoder/dboolhuff.c
index 9cf74bf856..11099c453c 100644
--- a/media/libvpx/libvpx/vp8/decoder/dboolhuff.c
+++ b/media/libvpx/libvpx/vp8/decoder/dboolhuff.c
@@ -15,7 +15,13 @@
 int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source,
                        unsigned int source_sz, vpx_decrypt_cb decrypt_cb,
                        void *decrypt_state) {
-  br->user_buffer_end = source + source_sz;
+  if (source_sz && !source) return 1;
+
+  // To simplify calling code this fuction can be called with |source| == null
+  // and |source_sz| == 0. This and vp8dx_bool_decoder_fill() are essentially
+  // no-ops in this case.
+  // Work around a ubsan warning with a ternary to avoid adding 0 to null.
+  br->user_buffer_end = source ? source + source_sz : source;
   br->user_buffer = source;
   br->value = 0;
   br->count = -8;
@@ -23,8 +29,6 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source,
   br->decrypt_cb = decrypt_cb;
   br->decrypt_state = decrypt_state;
 
-  if (source_sz && !source) return 1;
-
   /* Populate the buffer */
   vp8dx_bool_decoder_fill(br);
 
diff --git a/media/libvpx/libvpx/vp8/decoder/dboolhuff.h b/media/libvpx/libvpx/vp8/decoder/dboolhuff.h
index 04c027cd78..673b2fbd5d 100644
--- a/media/libvpx/libvpx/vp8/decoder/dboolhuff.h
+++ b/media/libvpx/libvpx/vp8/decoder/dboolhuff.h
@@ -8,13 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_DBOOLHUFF_H_
-#define VP8_DECODER_DBOOLHUFF_H_
+#ifndef VPX_VP8_DECODER_DBOOLHUFF_H_
+#define VPX_VP8_DECODER_DBOOLHUFF_H_
 
 #include <stddef.h>
 #include <limits.h>
 
 #include "./vpx_config.h"
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
@@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source,
 
 void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
 
-static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br,
+                                                         int probability) {
   unsigned int bit = 0;
   VP8_BD_VALUE value;
   unsigned int split;
@@ -76,7 +78,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
   }
 
   {
-    register int shift = vp8_norm[range];
+    const unsigned char shift = vp8_norm[(unsigned char)range];
     range <<= shift;
     value <<= shift;
     count -= shift;
@@ -127,4 +129,4 @@ static INLINE int vp8dx_bool_error(BOOL_DECODER *br) {
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_DBOOLHUFF_H_
+#endif  // VPX_VP8_DECODER_DBOOLHUFF_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/decodeframe.c b/media/libvpx/libvpx/vp8/decoder/decodeframe.c
index 0aec2a01b0..a20f33dd8c 100644
--- a/media/libvpx/libvpx/vp8/decoder/decodeframe.c
+++ b/media/libvpx/libvpx/vp8/decoder/decodeframe.c
@@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   /* Decide whether to use the default or alternate baseline Q value. */
   if (xd->segmentation_enabled) {
     /* Abs Value */
-    if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
       QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
 
       /* Delta Value */
@@ -211,7 +211,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
           vp8_short_inv_walsh4x4(&b->dqcoeff[0], xd->qcoeff);
           memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
         } else {
-          b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+          b->dqcoeff[0] = (short)(b->qcoeff[0] * xd->dequant_y2[0]);
           vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff);
           memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
         }
@@ -610,8 +610,7 @@ static void decode_mb_rows(VP8D_COMP *pbi) {
                                      lf_dst[2]);
         } else {
           vp8_loop_filter_row_simple(pc, lf_mic, mb_row - 1, recon_y_stride,
-                                     recon_uv_stride, lf_dst[0], lf_dst[1],
-                                     lf_dst[2]);
+                                     lf_dst[0]);
         }
         if (mb_row > 1) {
           yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1],
@@ -647,8 +646,7 @@ static void decode_mb_rows(VP8D_COMP *pbi) {
                                  lf_dst[2]);
     } else {
       vp8_loop_filter_row_simple(pc, lf_mic, mb_row - 1, recon_y_stride,
-                                 recon_uv_stride, lf_dst[0], lf_dst[1],
-                                 lf_dst[2]);
+                                 lf_dst[0]);
     }
 
     yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1],
@@ -674,7 +672,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi,
 
 static int read_is_valid(const unsigned char *start, size_t len,
                          const unsigned char *end) {
-  return (start + len > start && start + len <= end);
+  return len != 0 && end > start && len <= (size_t)(end - start);
 }
 
 static unsigned int read_available_partition_size(
@@ -686,6 +684,12 @@ static unsigned int read_available_partition_size(
   const unsigned char *partition_size_ptr = token_part_sizes + i * 3;
   unsigned int partition_size = 0;
   ptrdiff_t bytes_left = fragment_end - fragment_start;
+  if (bytes_left < 0) {
+    vpx_internal_error(
+        &pc->error, VPX_CODEC_CORRUPT_FRAME,
+        "Truncated packet or corrupt partition. No bytes left %d.",
+        (int)bytes_left);
+  }
   /* Calculate the length of this partition. The last partition
    * size is implicit. If the partition size can't be read, then
    * either use the remaining data in the buffer (for EC mode)
@@ -750,6 +754,9 @@ static void setup_token_decoder(VP8D_COMP *pbi,
       ptrdiff_t ext_first_part_size = token_part_sizes -
                                       pbi->fragments.ptrs[0] +
                                       3 * (num_token_partitions - 1);
+      if (fragment_size < (unsigned int)ext_first_part_size)
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                           "Corrupted fragment size %d", fragment_size);
       fragment_size -= (unsigned int)ext_first_part_size;
       if (fragment_size > 0) {
         pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size;
@@ -767,6 +774,9 @@ static void setup_token_decoder(VP8D_COMP *pbi,
           first_fragment_end, fragment_end, fragment_idx - 1,
           num_token_partitions);
       pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size;
+      if (fragment_size < (unsigned int)partition_size)
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                           "Corrupted fragment size %d", fragment_size);
       fragment_size -= (unsigned int)partition_size;
       assert(fragment_idx <= num_token_partitions);
       if (fragment_size > 0) {
@@ -819,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) {
 
     /* reset the segment feature data to 0 with delta coding (Default state). */
     memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
-    xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
 
     /* reset the mode ref deltasa for loop filter */
     memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
@@ -862,8 +872,8 @@ static void init_frame(VP8D_COMP *pbi) {
   xd->mode_info_stride = pc->mode_info_stride;
   xd->corrupted = 0; /* init without corruption */
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (pc->full_pixel) xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = ~0;
+  if (pc->full_pixel) xd->fullpixel_mask = ~7;
 }
 
 int vp8_decode_frame(VP8D_COMP *pbi) {
@@ -930,7 +940,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
       /* When error concealment is enabled we should only check the sync
        * code if we have enough bits available
        */
-      if (!pbi->ec_active || data + 3 < data_end) {
+      if (data + 3 < data_end) {
         if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) {
           vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
                              "Invalid frame sync code");
@@ -941,16 +951,22 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
        * if we have enough data. Otherwise we will end up with the wrong
        * size.
        */
-      if (!pbi->ec_active || data + 6 < data_end) {
+      if (data + 6 < data_end) {
         pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff;
         pc->horiz_scale = clear[4] >> 6;
         pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff;
         pc->vert_scale = clear[6] >> 6;
+        data += 7;
+      } else if (!pbi->ec_active) {
+        vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Truncated key frame header");
+      } else {
+        /* Error concealment is active, clear the frame. */
+        data = data_end;
       }
-      data += 7;
     } else {
-      memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
-      memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+      xd->pre = *yv12_fb_new;
+      xd->dst = *yv12_fb_new;
     }
   }
   if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME)) {
@@ -979,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
     xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
 
     if (xd->update_mb_segmentation_data) {
-      xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+      xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc);
 
       memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
 
@@ -1140,7 +1156,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
   if (pbi->ec_active && xd->corrupted) pc->refresh_entropy_probs = 0;
 #endif
   if (pc->refresh_entropy_probs == 0) {
-    memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+    pc->lfc = pc->fc;
   }
 
   pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
@@ -1151,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
   if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1;
 #endif
 
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame,
-            pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame,
-            pc->refresh_last_frame, pc->base_qindex);
-    fclose(z);
-  }
-
   {
     pbi->independent_partitions = 1;
 
@@ -1199,9 +1207,14 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
   pbi->frame_corrupt_residual = 0;
 
 #if CONFIG_MULTITHREAD
-  if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) {
+  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) &&
+      pc->multi_token_partition != ONE_PARTITION) {
     unsigned int thread;
-    vp8mt_decode_mb_rows(pbi, xd);
+    if (vp8mt_decode_mb_rows(pbi, xd)) {
+      vp8_decoder_remove_threads(pbi);
+      pbi->restart_threads = 1;
+      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, NULL);
+    }
     vp8_yv12_extend_frame_borders(yv12_fb_new);
     for (thread = 0; thread < pbi->decoding_thread_count; ++thread) {
       corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted;
@@ -1232,7 +1245,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
    * \n",bc->pos+pbi->bc2.pos); */
 
   if (pc->refresh_entropy_probs == 0) {
-    memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+    pc->fc = pc->lfc;
     pbi->independent_partitions = prev_independent_partitions;
   }
 
diff --git a/media/libvpx/libvpx/vp8/decoder/decodemv.c b/media/libvpx/libvpx/vp8/decoder/decodemv.c
index b946ab73d0..3f459d623f 100644
--- a/media/libvpx/libvpx/vp8/decoder/decodemv.c
+++ b/media/libvpx/libvpx/vp8/decoder/decodemv.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "decodemv.h"
 #include "treereader.h"
 #include "vp8/common/entropymv.h"
 #include "vp8/common/entropymode.h"
@@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) {
   const vp8_prob *const p = (const vp8_prob *)mvc;
   int x = 0;
 
-  if (vp8_read(r, p[mvpis_short])) /* Large */
-  {
+  if (vp8_read(r, p[mvpis_short])) { /* Large */
     int i = 0;
 
     do {
@@ -173,7 +173,8 @@ const vp8_prob vp8_sub_mv_ref_prob3[8][VP8_SUBMVREFS - 1] = {
   { 208, 1, 1 }     /* SUBMVREF_LEFT_ABOVE_ZED  */
 };
 
-static const vp8_prob *get_sub_mv_ref_prob(const int left, const int above) {
+static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left,
+                                           const uint32_t above) {
   int lez = (left == 0);
   int aez = (above == 0);
   int lea = (left == above);
@@ -284,8 +285,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi,
                              MB_MODE_INFO *mbmi) {
   vp8_reader *const bc = &pbi->mbc[8];
   mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra);
-  if (mbmi->ref_frame) /* inter MB */
-  {
+  if (mbmi->ref_frame) { /* inter MB */
     enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
     int cnt[4];
     int *cntx = cnt;
@@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi,
         tmp = cnt[CNT_NEAREST];
         cnt[CNT_NEAREST] = cnt[CNT_NEAR];
         cnt[CNT_NEAR] = tmp;
-        tmp = near_mvs[CNT_NEAREST].as_int;
+        tmp = (int)near_mvs[CNT_NEAREST].as_int;
         near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
-        near_mvs[CNT_NEAR].as_int = tmp;
+        near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
       }
 
       if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
@@ -486,10 +486,7 @@ static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x) {
   }
 }
 
-static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
-                               MB_MODE_INFO *mbmi) {
-  (void)mbmi;
-
+static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi) {
   /* Read the Macroblock segmentation map if it is being updated explicitly
    * this frame (reset to 0 above by default)
    * By default on a key frame reset all MBs to segment 0
@@ -538,7 +535,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) {
       int mb_num = mb_row * pbi->common.mb_cols + mb_col;
 #endif
 
-      decode_mb_mode_mvs(pbi, mi, &mi->mbmi);
+      decode_mb_mode_mvs(pbi, mi);
 
 #if CONFIG_ERROR_CONCEALMENT
       /* look for corruption. set mvs_corrupt_from_mb to the current
diff --git a/media/libvpx/libvpx/vp8/decoder/decodemv.h b/media/libvpx/libvpx/vp8/decoder/decodemv.h
index f33b07351d..504e943d85 100644
--- a/media/libvpx/libvpx/vp8/decoder/decodemv.h
+++ b/media/libvpx/libvpx/vp8/decoder/decodemv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_DECODEMV_H_
-#define VP8_DECODER_DECODEMV_H_
+#ifndef VPX_VP8_DECODER_DECODEMV_H_
+#define VPX_VP8_DECODER_DECODEMV_H_
 
 #include "onyxd_int.h"
 
@@ -23,4 +23,4 @@ void vp8_decode_mode_mvs(VP8D_COMP *);
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_DECODEMV_H_
+#endif  // VPX_VP8_DECODER_DECODEMV_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/decoderthreading.h b/media/libvpx/libvpx/vp8/decoder/decoderthreading.h
index c563cf6e93..3d49bc8317 100644
--- a/media/libvpx/libvpx/vp8/decoder/decoderthreading.h
+++ b/media/libvpx/libvpx/vp8/decoder/decoderthreading.h
@@ -8,15 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_DECODERTHREADING_H_
-#define VP8_DECODER_DECODERTHREADING_H_
+#ifndef VPX_VP8_DECODER_DECODERTHREADING_H_
+#define VPX_VP8_DECODER_DECODERTHREADING_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #if CONFIG_MULTITHREAD
-void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
+int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
 void vp8_decoder_remove_threads(VP8D_COMP *pbi);
 void vp8_decoder_create_threads(VP8D_COMP *pbi);
 void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
@@ -27,4 +27,4 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_DECODERTHREADING_H_
+#endif  // VPX_VP8_DECODER_DECODERTHREADING_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/detokenize.c b/media/libvpx/libvpx/vp8/decoder/detokenize.c
index b350bafbc5..1c77873f0b 100644
--- a/media/libvpx/libvpx/vp8/decoder/detokenize.c
+++ b/media/libvpx/libvpx/vp8/decoder/detokenize.c
@@ -11,6 +11,7 @@
 #include "vp8/common/blockd.h"
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/mem.h"
 #include "detokenize.h"
 
@@ -52,7 +53,10 @@ static const uint8_t kZigzag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
 /* for const-casting */
 typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];
 
-static int GetSigned(BOOL_DECODER *br, int value_to_sign) {
+// With corrupt / fuzzed streams the calculation of br->value may overflow. See
+// b/148271109.
+static VPX_NO_UNSIGNED_OVERFLOW_CHECK int GetSigned(BOOL_DECODER *br,
+                                                    int value_to_sign) {
   int split = (br->range + 1) >> 1;
   VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
   int v;
diff --git a/media/libvpx/libvpx/vp8/decoder/detokenize.h b/media/libvpx/libvpx/vp8/decoder/detokenize.h
index f0b125444f..410a431ba0 100644
--- a/media/libvpx/libvpx/vp8/decoder/detokenize.h
+++ b/media/libvpx/libvpx/vp8/decoder/detokenize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_DETOKENIZE_H_
-#define VP8_DECODER_DETOKENIZE_H_
+#ifndef VPX_VP8_DECODER_DETOKENIZE_H_
+#define VPX_VP8_DECODER_DETOKENIZE_H_
 
 #include "onyxd_int.h"
 
@@ -24,4 +24,4 @@ int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_DETOKENIZE_H_
+#endif  // VPX_VP8_DECODER_DETOKENIZE_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/ec_types.h b/media/libvpx/libvpx/vp8/decoder/ec_types.h
index 0ab08b649a..84feb269df 100644
--- a/media/libvpx/libvpx/vp8/decoder/ec_types.h
+++ b/media/libvpx/libvpx/vp8/decoder/ec_types.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_EC_TYPES_H_
-#define VP8_DECODER_EC_TYPES_H_
+#ifndef VPX_VP8_DECODER_EC_TYPES_H_
+#define VPX_VP8_DECODER_EC_TYPES_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -34,7 +34,9 @@ typedef struct {
 /* Structure used to hold all the overlaps of a macroblock. The overlaps of a
  * macroblock is further divided into block overlaps.
  */
-typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP;
+typedef struct {
+  B_OVERLAP overlaps[16];
+} MB_OVERLAP;
 
 /* Structure for keeping track of motion vectors and which reference frame they
  * refer to. Used for motion vector interpolation.
@@ -48,4 +50,4 @@ typedef struct {
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_EC_TYPES_H_
+#endif  // VPX_VP8_DECODER_EC_TYPES_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/error_concealment.c b/media/libvpx/libvpx/vp8/decoder/error_concealment.c
index e22141492c..85982e4de3 100644
--- a/media/libvpx/libvpx/vp8/decoder/error_concealment.c
+++ b/media/libvpx/libvpx/vp8/decoder/error_concealment.c
@@ -147,8 +147,8 @@ static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi,
   }
 }
 
-void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols,
-                            union b_mode_info *bmi, int b_row, int b_col) {
+static void calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols,
+                               union b_mode_info *bmi, int b_row, int b_col) {
   MB_OVERLAP *mb_overlap;
   int row, col, rel_row, rel_col;
   int new_row, new_col;
@@ -280,9 +280,9 @@ static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi,
   int sub_col;
   for (sub_row = 0; sub_row < 4; ++sub_row) {
     for (sub_col = 0; sub_col < 4; ++sub_col) {
-      vp8_calculate_overlaps(overlaps, mb_rows, mb_cols,
-                             &(prev_mi->bmi[sub_row * 4 + sub_col]),
-                             4 * mb_row + sub_row, 4 * mb_col + sub_col);
+      calculate_overlaps(overlaps, mb_rows, mb_cols,
+                         &(prev_mi->bmi[sub_row * 4 + sub_col]),
+                         4 * mb_row + sub_row, 4 * mb_col + sub_col);
     }
   }
 }
diff --git a/media/libvpx/libvpx/vp8/decoder/error_concealment.h b/media/libvpx/libvpx/vp8/decoder/error_concealment.h
index 89c78c1442..608a79f189 100644
--- a/media/libvpx/libvpx/vp8/decoder/error_concealment.h
+++ b/media/libvpx/libvpx/vp8/decoder/error_concealment.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_
-#define VP8_DECODER_ERROR_CONCEALMENT_H_
+#ifndef VPX_VP8_DECODER_ERROR_CONCEALMENT_H_
+#define VPX_VP8_DECODER_ERROR_CONCEALMENT_H_
 
 #include "onyxd_int.h"
 #include "ec_types.h"
@@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, int mb_row, int mb_col,
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_ERROR_CONCEALMENT_H_
+#endif  // VPX_VP8_DECODER_ERROR_CONCEALMENT_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
index a1050c478d..88f2de024b 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c
@@ -16,6 +16,7 @@
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/alloccommon.h"
+#include "vp8/common/common.h"
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
@@ -36,12 +37,11 @@
 #if CONFIG_ERROR_CONCEALMENT
 #include "error_concealment.h"
 #endif
-#if ARCH_ARM
+#if VPX_ARCH_ARM
 #include "vpx_ports/arm.h"
 #endif
 
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
-extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 static int get_free_fb(VP8_COMMON *cm);
 static void ref_cnt_fb(int *buf, int *idx, int new_idx);
 
@@ -302,12 +302,9 @@ static int check_fragments_for_errors(VP8D_COMP *pbi) {
   return 1;
 }
 
-int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
-                                  const uint8_t *source, int64_t time_stamp) {
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi) {
   VP8_COMMON *cm = &pbi->common;
   int retcode = -1;
-  (void)size;
-  (void)source;
 
   pbi->common.error.error_code = VPX_CODEC_OK;
 
@@ -322,21 +319,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
   pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx];
   pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx];
 
-  if (setjmp(pbi->common.error.jmp)) {
-    /* We do not know if the missing frame(s) was supposed to update
-     * any of the reference buffers, but we act conservative and
-     * mark only the last buffer as corrupted.
-     */
-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
-    if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) {
-      cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
-    }
-    goto decode_exit;
-  }
-
-  pbi->common.error.setjmp = 1;
-
   retcode = vp8_decode_frame(pbi);
 
   if (retcode < 0) {
@@ -345,6 +327,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
     }
 
     pbi->common.error.error_code = VPX_CODEC_ERROR;
+    // Propagate the error info.
+    if (pbi->mb.error_info.error_code != 0) {
+      pbi->common.error.error_code = pbi->mb.error_info.error_code;
+      memcpy(pbi->common.error.detail, pbi->mb.error_info.detail,
+             sizeof(pbi->mb.error_info.detail));
+    }
     goto decode_exit;
   }
 
@@ -380,15 +368,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
 #endif
 
   pbi->ready_for_new_data = 0;
-  pbi->last_time_stamp = time_stamp;
 
 decode_exit:
-  pbi->common.error.setjmp = 0;
   vpx_clear_system_state();
   return retcode;
 }
 int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
                         vp8_ppflags_t *flags) {
   int ret = -1;
 
@@ -398,8 +383,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
   if (pbi->common.show_frame == 0) return ret;
 
   pbi->ready_for_new_data = 1;
-  *time_stamp = pbi->last_time_stamp;
-  *time_end_stamp = 0;
 
 #if CONFIG_POSTPROC
   ret = vp8_post_proc_frame(&pbi->common, sd, flags);
@@ -445,8 +428,9 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) {
 
 #if CONFIG_MULTITHREAD
   if (setjmp(fb->pbi[0]->common.error.jmp)) {
+    fb->pbi[0]->common.error.setjmp = 0;
     vp8_remove_decoder_instances(fb);
-    memset(fb->pbi, 0, sizeof(fb->pbi) / sizeof(fb->pbi[0]));
+    vp8_zero(fb->pbi);
     vpx_clear_system_state();
     return VPX_CODEC_ERROR;
   }
@@ -469,5 +453,10 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) {
 
   /* decoder instance for single thread mode */
   remove_decompressor(pbi);
+  fb->pbi[0] = NULL;
   return VPX_CODEC_OK;
 }
+
+int vp8dx_get_quantizer(const VP8D_COMP *pbi) {
+  return pbi->common.base_qindex;
+}
diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
index 88b1ff16bc..08a60b31b9 100644
--- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
+++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h
@@ -8,10 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_ONYXD_INT_H_
-#define VP8_DECODER_ONYXD_INT_H_
+#ifndef VPX_VP8_DECODER_ONYXD_INT_H_
+#define VPX_VP8_DECODER_ONYXD_INT_H_
+
+#include <assert.h>
 
 #include "vpx_config.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp8/common/onyxd.h"
 #include "treereader.h"
 #include "vp8/common/onyxc_int.h"
@@ -31,7 +34,9 @@ typedef struct {
   void *ptr2;
 } DECODETHREAD_DATA;
 
-typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC;
+typedef struct {
+  MACROBLOCKD mbd;
+} MB_ROW_DEC;
 
 typedef struct {
   int enabled;
@@ -68,7 +73,7 @@ typedef struct VP8D_COMP {
 #if CONFIG_MULTITHREAD
   /* variable for threading */
 
-  int b_multithreaded_rd;
+  vpx_atomic_int b_multithreaded_rd;
   int max_threads;
   int current_mb_col_main;
   unsigned int decoding_thread_count;
@@ -76,9 +81,8 @@ typedef struct VP8D_COMP {
 
   int mt_baseline_filter_level[MAX_MB_SEGMENTS];
   int sync_range;
-  int *mt_current_mb_col; /* Each row remembers its already decoded column. */
-  pthread_mutex_t *pmutex;
-  pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */
+  /* Each row remembers its already decoded column. */
+  vpx_atomic_int *mt_current_mb_col;
 
   unsigned char **mt_yabove_row; /* mb_rows x width */
   unsigned char **mt_uabove_row;
@@ -91,12 +95,11 @@ typedef struct VP8D_COMP {
   DECODETHREAD_DATA *de_thread_data;
 
   pthread_t *h_decoding_thread;
-  sem_t *h_event_start_decoding;
-  sem_t h_event_end_decoding;
+  vp8_sem_t *h_event_start_decoding;
+  vp8_sem_t h_event_end_decoding;
 /* end of threading data */
 #endif
 
-  int64_t last_time_stamp;
   int ready_for_new_data;
 
   vp8_prob prob_intra;
@@ -117,34 +120,23 @@ typedef struct VP8D_COMP {
 
   vpx_decrypt_cb decrypt_cb;
   void *decrypt_state;
+#if CONFIG_MULTITHREAD
+  // Restart threads on next frame if set to 1.
+  // This is set when error happens in multithreaded decoding and all threads
+  // are shut down.
+  int restart_threads;
+#endif
 } VP8D_COMP;
 
-int vp8_decode_frame(VP8D_COMP *cpi);
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+int vp8_decode_frame(VP8D_COMP *pbi);
 
 int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
 int vp8_remove_decoder_instances(struct frame_buffers *fb);
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr)                                         \
-  do {                                                                      \
-    lval = (expr);                                                          \
-    if (!lval)                                                              \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,           \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr)                               \
-  do {                                                            \
-    lval = (expr);                                                \
-    if (!lval)                                                    \
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);            \
-  } while (0)
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_ONYXD_INT_H_
+#endif  // VPX_VP8_DECODER_ONYXD_INT_H_
diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c
index 9f77519882..d16284d134 100644
--- a/media/libvpx/libvpx/vp8/decoder/threading.c
+++ b/media/libvpx/libvpx/vp8/decoder/threading.c
@@ -10,16 +10,18 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
-#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1
 #include <unistd.h>
 #endif
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
+#include "vp8/common/common.h"
 #include "vp8/common/threading.h"
-
 #include "vp8/common/loopfilter.h"
 #include "vp8/common/extend.h"
 #include "vpx_ports/vpx_timer.h"
+#include "decoderthreading.h"
 #include "detokenize.h"
 #include "vp8/common/reconintra4x4.h"
 #include "vp8/common/reconinter.h"
@@ -29,15 +31,15 @@
 #include "error_concealment.h"
 #endif
 
-#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
-#define CALLOC_ARRAY_ALIGNED(p, n, algn)                            \
-  do {                                                              \
-    CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
-    memset((p), 0, (n) * sizeof(*(p)));                             \
+#define CALLOC_ARRAY(p, n) \
+  CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn)                       \
+  do {                                                         \
+    CHECK_MEM_ERROR(&pbi->common.error, (p),                   \
+                    vpx_memalign((algn), sizeof(*(p)) * (n))); \
+    memset((p), 0, (n) * sizeof(*(p)));                        \
   } while (0)
 
-void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
-
 static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
                                        MB_ROW_DEC *mbrd, int count) {
   VP8_COMMON *const pc = &pbi->common;
@@ -55,7 +57,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
     mbd->dst = xd->dst;
 
     mbd->segmentation_enabled = xd->segmentation_enabled;
-    mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+    mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta;
     memcpy(mbd->segment_feature_data, xd->segment_feature_data,
            sizeof(xd->segment_feature_data));
 
@@ -75,12 +77,13 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
     memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
     memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
 
-    mbd->fullpixel_mask = 0xffffffff;
+    mbd->fullpixel_mask = ~0;
 
-    if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
+    if (pc->full_pixel) mbd->fullpixel_mask = ~7;
   }
 
-  for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1;
+  for (i = 0; i < pc->mb_rows; ++i)
+    vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1);
 }
 
 static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
@@ -248,12 +251,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
 static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
                               int start_mb_row) {
-  const int *last_row_current_mb_col;
-  int *current_mb_col;
+  const vpx_atomic_int *last_row_current_mb_col;
+  vpx_atomic_int *current_mb_col;
   int mb_row;
   VP8_COMMON *pc = &pbi->common;
   const int nsync = pbi->sync_range;
-  const int first_row_no_sync_above = pc->mb_cols + nsync;
+  const vpx_atomic_int first_row_no_sync_above =
+      VPX_ATOMIC_INIT(pc->mb_cols + nsync);
   int num_part = 1 << pbi->common.multi_token_partition;
   int last_mb_row = start_mb_row;
 
@@ -357,13 +361,11 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
     for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) {
       if (((mb_col - 1) % nsync) == 0) {
-        pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
-        protected_write(mutex, current_mb_col, mb_col - 1);
+        vpx_atomic_store_release(current_mb_col, mb_col - 1);
       }
 
       if (mb_row && !(mb_col & (nsync - 1))) {
-        pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1];
-        sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+        vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
       }
 
       /* Distance of MB to the various image edges.
@@ -401,16 +403,32 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
       xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
       xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
 
-      xd->pre.y_buffer =
-          ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
-      xd->pre.u_buffer =
-          ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
-      xd->pre.v_buffer =
-          ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
-
       /* propagate errors from reference frames */
       xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
 
+      if (xd->corrupted) {
+        // Move current decoding marcoblock to the end of row for all rows
+        // assigned to this thread, such that other threads won't be waiting.
+        for (; mb_row < pc->mb_rows;
+             mb_row += (pbi->decoding_thread_count + 1)) {
+          current_mb_col = &pbi->mt_current_mb_col[mb_row];
+          vpx_atomic_store_release(current_mb_col, pc->mb_cols + nsync);
+        }
+        vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME,
+                           "Corrupted reference frame");
+      }
+
+      if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
+        const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
+        xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
+        xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
+        xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
+      } else {
+        // ref_frame is INTRA_FRAME, pre buffer should not be used.
+        xd->pre.y_buffer = 0;
+        xd->pre.u_buffer = 0;
+        xd->pre.v_buffer = 0;
+      }
       mt_decode_macroblock(pbi, xd, 0);
 
       xd->left_available = 1;
@@ -549,7 +567,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
     }
 
     /* last MB of row is ready just after extension is done */
-    protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+    vpx_atomic_store_release(current_mb_col, mb_col + nsync);
 
     ++xd->mode_info_context; /* skip prediction column */
     xd->up_available = 1;
@@ -558,41 +576,48 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
     xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
   }
 
-  /* signal end of frame decoding if this thread processed the last mb_row */
-  if (last_mb_row == (pc->mb_rows - 1)) sem_post(&pbi->h_event_end_decoding);
+  /* signal end of decoding of current thread for current frame */
+  if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows)
+    vp8_sem_post(&pbi->h_event_end_decoding);
 }
 
-static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
+static THREADFN thread_decoding_proc(void *p_data) {
   int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
   VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
   MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
   ENTROPY_CONTEXT_PLANES mb_row_left_context;
 
   while (1) {
-    if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break;
+    if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
 
-    if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
-      if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) {
+    if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
+      if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
         break;
       } else {
         MACROBLOCKD *xd = &mbrd->mbd;
         xd->left_context = &mb_row_left_context;
-
+        if (setjmp(xd->error_info.jmp)) {
+          xd->error_info.setjmp = 0;
+          // Signal the end of decoding for current thread.
+          vp8_sem_post(&pbi->h_event_end_decoding);
+          continue;
+        }
+        xd->error_info.setjmp = 1;
         mt_decode_mb_rows(pbi, xd, ithread + 1);
+        xd->error_info.setjmp = 0;
       }
     }
   }
 
-  return 0;
+  return THREAD_EXIT_SUCCESS;
 }
 
 void vp8_decoder_create_threads(VP8D_COMP *pbi) {
   int core_count = 0;
   unsigned int ithread;
 
-  pbi->b_multithreaded_rd = 0;
+  vpx_atomic_init(&pbi->b_multithreaded_rd, 0);
   pbi->allocated_decoding_thread_count = 0;
-  pthread_mutex_init(&pbi->mt_mutex, NULL);
 
   /* limit decoding threads to the max number of token partitions */
   core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
@@ -603,7 +628,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
   }
 
   if (core_count > 1) {
-    pbi->b_multithreaded_rd = 1;
+    vpx_atomic_init(&pbi->b_multithreaded_rd, 1);
     pbi->decoding_thread_count = core_count - 1;
 
     CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
@@ -611,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
     CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
     CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
 
-    if (sem_init(&pbi->h_event_end_decoding, 0, 0)) {
+    if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) {
       vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                          "Failed to initialize semaphore");
     }
 
     for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) {
-      if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
+      if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break;
 
       vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
 
@@ -627,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
 
       if (pthread_create(&pbi->h_decoding_thread[ithread], 0,
                          thread_decoding_proc, &pbi->de_thread_data[ithread])) {
-        sem_destroy(&pbi->h_event_start_decoding[ithread]);
+        vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]);
         break;
       }
     }
@@ -638,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
       /* the remainder of cleanup cases will be handled in
        * vp8_decoder_remove_threads(). */
       if (pbi->allocated_decoding_thread_count == 0) {
-        sem_destroy(&pbi->h_event_end_decoding);
+        vp8_sem_destroy(&pbi->h_event_end_decoding);
       }
       vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                          "Failed to create threads");
@@ -649,16 +674,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) {
   int i;
 
-  /* De-allocate mutex */
-  if (pbi->pmutex != NULL) {
-    for (i = 0; i < mb_rows; ++i) {
-      pthread_mutex_destroy(&pbi->pmutex[i]);
-    }
-
-    vpx_free(pbi->pmutex);
-    pbi->pmutex = NULL;
-  }
-
   vpx_free(pbi->mt_current_mb_col);
   pbi->mt_current_mb_col = NULL;
 
@@ -724,7 +739,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
   int i;
   int uv_width;
 
-  if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
+  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
     vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
 
     /* our internal buffers are always multiples of 16 */
@@ -742,73 +757,73 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
 
     uv_width = width >> 1;
 
-    /* Allocate mutex */
-    CHECK_MEM_ERROR(pbi->pmutex,
-                    vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows));
-    if (pbi->pmutex) {
-      for (i = 0; i < pc->mb_rows; ++i) {
-        pthread_mutex_init(&pbi->pmutex[i], NULL);
-      }
-    }
-
-    /* Allocate an int for each mb row. */
-    CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+    /* Allocate a vpx_atomic_int for each mb row. */
+    CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col,
+                    vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
+    for (i = 0; i < pc->mb_rows; ++i)
+      vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
 
     /* Allocate memory for above_row buffers. */
     CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
-    for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_yabove_row[i],
+    for (i = 0; i < pc->mb_rows; ++i) {
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (width + (VP8BORDERINPIXELS << 1))));
+      vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1));
+    }
 
     CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
-    for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_uabove_row[i],
+    for (i = 0; i < pc->mb_rows; ++i) {
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (uv_width + VP8BORDERINPIXELS)));
+      vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS);
+    }
 
     CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
-    for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_vabove_row[i],
+    for (i = 0; i < pc->mb_rows; ++i) {
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i],
                       vpx_memalign(16, sizeof(unsigned char) *
                                            (uv_width + VP8BORDERINPIXELS)));
+      vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS);
+    }
 
     /* Allocate memory for left_col buffers. */
     CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_yleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 16, 1));
 
     CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_uleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 8, 1));
 
     CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
     for (i = 0; i < pc->mb_rows; ++i)
-      CHECK_MEM_ERROR(pbi->mt_vleft_col[i],
+      CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i],
                       vpx_calloc(sizeof(unsigned char) * 8, 1));
   }
 }
 
 void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
   /* shutdown MB Decoding thread; */
-  if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
+  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
     int i;
-    protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
+    vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0);
 
     /* allow all threads to exit */
     for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
-      sem_post(&pbi->h_event_start_decoding[i]);
+      vp8_sem_post(&pbi->h_event_start_decoding[i]);
       pthread_join(pbi->h_decoding_thread[i], NULL);
     }
 
     for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
-      sem_destroy(&pbi->h_event_start_decoding[i]);
+      vp8_sem_destroy(&pbi->h_event_start_decoding[i]);
     }
 
     if (pbi->allocated_decoding_thread_count) {
-      sem_destroy(&pbi->h_event_end_decoding);
+      vp8_sem_destroy(&pbi->h_event_end_decoding);
     }
 
     vpx_free(pbi->h_decoding_thread);
@@ -825,10 +840,9 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
 
     vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
   }
-  pthread_mutex_destroy(&pbi->mt_mutex);
 }
 
-void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
+int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   VP8_COMMON *pc = &pbi->common;
   unsigned int i;
   int j;
@@ -871,10 +885,26 @@ void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
                              pbi->decoding_thread_count);
 
   for (i = 0; i < pbi->decoding_thread_count; ++i) {
-    sem_post(&pbi->h_event_start_decoding[i]);
+    vp8_sem_post(&pbi->h_event_start_decoding[i]);
   }
 
-  mt_decode_mb_rows(pbi, xd, 0);
+  if (setjmp(xd->error_info.jmp)) {
+    xd->error_info.setjmp = 0;
+    xd->corrupted = 1;
+    // Wait for other threads to finish. This prevents other threads decoding
+    // the current frame while the main thread starts decoding the next frame,
+    // which causes a data race.
+    for (i = 0; i < pbi->decoding_thread_count; ++i)
+      vp8_sem_wait(&pbi->h_event_end_decoding);
+    return -1;
+  }
 
-  sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+  xd->error_info.setjmp = 1;
+  mt_decode_mb_rows(pbi, xd, 0);
+  xd->error_info.setjmp = 0;
+
+  for (i = 0; i < pbi->decoding_thread_count + 1; ++i)
+    vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */
+
+  return 0;
 }
diff --git a/media/libvpx/libvpx/vp8/decoder/treereader.h b/media/libvpx/libvpx/vp8/decoder/treereader.h
index dd0f0986e9..4bf938a741 100644
--- a/media/libvpx/libvpx/vp8/decoder/treereader.h
+++ b/media/libvpx/libvpx/vp8/decoder/treereader.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_DECODER_TREEREADER_H_
-#define VP8_DECODER_TREEREADER_H_
+#ifndef VPX_VP8_DECODER_TREEREADER_H_
+#define VPX_VP8_DECODER_TREEREADER_H_
 
 #include "./vpx_config.h"
 #include "vp8/common/treecoder.h"
@@ -30,7 +30,7 @@ typedef BOOL_DECODER vp8_reader;
 static INLINE int vp8_treed_read(
     vp8_reader *const r, /* !!! must return a 0 or 1 !!! */
     vp8_tree t, const vp8_prob *const p) {
-  register vp8_tree_index i = 0;
+  vp8_tree_index i = 0;
 
   while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0) {
   }
@@ -42,4 +42,4 @@ static INLINE int vp8_treed_read(
 }  // extern "C"
 #endif
 
-#endif  // VP8_DECODER_TREEREADER_H_
+#endif  // VPX_VP8_DECODER_TREEREADER_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
index c42005df6c..950c943343 100644
--- a/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -9,6 +9,8 @@
  */
 
 #include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
 
 static const uint16_t inv_zig_zag[16] = { 1, 2, 6,  7,  3,  5,  8,  13,
@@ -26,9 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
+#if !VPX_ARCH_AARCH64
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
+#endif  // !VPX_ARCH_AARCH64
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +70,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
 
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
+#if VPX_ARCH_AARCH64
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
   eob_q32 = vmovl_u16(eob_d16);
   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // VPX_ARCH_AARCH64
+
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
   vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +88,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* dqcoeff = x * dequant */
   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
index 76853e6524..99dff6b520 100644
--- a/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
+++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -10,6 +10,8 @@
 
 #include <arm_neon.h>
 
+#include "./vp8_rtcd.h"
+
 void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) {
   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
   int16x4_t d16s16, d17s16, d26s16, dEmptys16;
diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
index 8d6ea4ccbe..02056f2f90 100644
--- a/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -9,6 +9,8 @@
  */
 
 #include <arm_neon.h>
+
+#include "./vp8_rtcd.h"
 #include "vpx_ports/arm.h"
 
 #ifdef VPX_INCOMPATIBLE_GCC
diff --git a/media/libvpx/libvpx/vp8/encoder/bitstream.c b/media/libvpx/libvpx/vp8/encoder/bitstream.c
index 7086faae98..7bcdf77708 100644
--- a/media/libvpx/libvpx/vp8/encoder/bitstream.c
+++ b/media/libvpx/libvpx/vp8/encoder/bitstream.c
@@ -19,6 +19,7 @@
 #include <limits.h>
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/system_state.h"
 #include "bitstream.h"
 
@@ -41,13 +42,6 @@ const int vp8cx_base_skip_false_prob[128] = {
 unsigned __int64 Sectionbits[500];
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-int intra_mode_stats[10][10][10];
-static unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int active_section;
-#endif
-
 #ifdef MODE_STATS
 int count_mb_seg[4] = { 0, 0, 0, 0 };
 #endif
@@ -124,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) {
                   vp8_mbsplit_encodings + x);
 }
 
-void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
+void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w,
+                                                 const TOKENEXTRA *p,
+                                                 int xcount) {
   const TOKENEXTRA *stop = p + xcount;
   unsigned int split;
   int shift;
@@ -178,10 +174,9 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
 
         validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error);
 
-        w->buffer[w->pos++] = (lowvalue >> (24 - offset));
-        lowvalue <<= offset;
+        w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff;
         shift = count;
-        lowvalue &= 0xffffff;
+        lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
         count -= 8;
       }
 
@@ -229,10 +224,9 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
 
             validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error);
 
-            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
-            lowvalue <<= offset;
+            w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff;
             shift = count;
-            lowvalue &= 0xffffff;
+            lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
             count -= 8;
           }
 
@@ -428,10 +422,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
 
   vp8_convert_rfct_to_prob(cpi);
 
-#ifdef VP8_ENTROPY_STATS
-  active_section = 1;
-#endif
-
   if (pc->mb_no_coeff_skip) {
     int total_mbs = pc->mb_rows * pc->mb_cols;
 
@@ -472,10 +462,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
       xd->mb_to_top_edge = -((mb_row * 16) << 3);
       xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
 
-#ifdef VP8_ENTROPY_STATS
-      active_section = 9;
-#endif
-
       if (cpi->mb.e_mbd.update_mb_segmentation_map) {
         write_mb_features(w, mi, &cpi->mb.e_mbd);
       }
@@ -486,9 +472,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
 
       if (rf == INTRA_FRAME) {
         vp8_write(w, 0, cpi->prob_intra_coded);
-#ifdef VP8_ENTROPY_STATS
-        active_section = 6;
-#endif
         write_ymode(w, mode, pc->fc.ymode_prob);
 
         if (mode == B_PRED) {
@@ -500,8 +483,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
         }
 
         write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
-      } else /* inter coded */
-      {
+      } else { /* inter coded */
         int_mv best_mv;
         vp8_prob mv_ref_p[VP8_MVREFS - 1];
 
@@ -519,32 +501,17 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
           int ct[4];
 
           vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf,
-                            cpi->common.ref_frame_sign_bias);
+                            pc->ref_frame_sign_bias);
           vp8_clamp_mv2(&best_mv, xd);
 
           vp8_mv_ref_probs(mv_ref_p, ct);
-
-#ifdef VP8_ENTROPY_STATS
-          accum_mv_refs(mode, ct);
-#endif
         }
 
-#ifdef VP8_ENTROPY_STATS
-        active_section = 3;
-#endif
-
         write_mv_ref(w, mode, mv_ref_p);
 
         switch (mode) /* new, split require MVs */
         {
-          case NEWMV:
-
-#ifdef VP8_ENTROPY_STATS
-            active_section = 5;
-#endif
-
-            write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
-            break;
+          case NEWMV: write_mv(w, &mi->mv.as_mv, &best_mv, mvc); break;
 
           case SPLITMV: {
             int j = 0;
@@ -575,9 +542,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
               write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2[mv_contz]);
 
               if (blockmode == NEW4X4) {
-#ifdef VP8_ENTROPY_STATS
-                active_section = 11;
-#endif
                 write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *)mvc);
               }
             } while (++j < cpi->mb.partition_info->count);
@@ -643,10 +607,6 @@ static void write_kfmodes(VP8_COMP *cpi) {
           const B_PREDICTION_MODE L = left_block_mode(m, i);
           const int bm = m->bmi[i].as_mode;
 
-#ifdef VP8_ENTROPY_STATS
-          ++intra_mode_stats[A][L][bm];
-#endif
-
           write_bmode(bc, bm, vp8_kf_bmode_prob[A][L]);
         } while (++i < 16);
       }
@@ -907,7 +867,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) {
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
   vp8_writer *const w = cpi->bc;
 #endif
-  int savings = 0;
 
   vpx_clear_system_state();
 
@@ -974,10 +933,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) {
           vp8_write(w, u, upd);
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-          ++tree_update_hist[i][j][k][t][u];
-#endif
-
           if (u) {
             /* send/use new probability */
 
@@ -985,22 +940,10 @@ void vp8_update_coef_probs(VP8_COMP *cpi) {
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
             vp8_write_literal(w, newp, 8);
 #endif
-
-            savings += s;
           }
 
         } while (++t < ENTROPY_NODES);
 
-/* Accum token counts for generation of default statistics */
-#ifdef VP8_ENTROPY_STATS
-        t = 0;
-
-        do {
-          context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-#endif
-
       } while (++k < PREV_COEF_CONTEXTS);
     } while (++j < COEF_BANDS);
   } while (++i < BLOCK_TYPES);
@@ -1078,7 +1021,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 
   bc[0].error = &pc->error;
 
-  validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error);
+  validate_buffer(cx_data, 3, cx_data_end, &pc->error);
   cx_data += 3;
 
 #if defined(SECTIONBITS_OUTPUT)
@@ -1091,19 +1034,25 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
   if (oh.type == KEY_FRAME) {
     int v;
 
-    validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error);
+    validate_buffer(cx_data, 7, cx_data_end, &pc->error);
 
     /* Start / synch code */
     cx_data[0] = 0x9D;
     cx_data[1] = 0x01;
     cx_data[2] = 0x2a;
 
+    /* Pack scale and frame size into 16 bits. Store it 8 bits at a time.
+     * https://tools.ietf.org/html/rfc6386
+     * 9.1. Uncompressed Data Chunk
+     * 16 bits      :     (2 bits Horizontal Scale << 14) | Width (14 bits)
+     * 16 bits      :     (2 bits Vertical Scale << 14) | Height (14 bits)
+     */
     v = (pc->horiz_scale << 14) | pc->Width;
-    cx_data[3] = v;
+    cx_data[3] = v & 0xff;
     cx_data[4] = v >> 8;
 
     v = (pc->vert_scale << 14) | pc->Height;
-    cx_data[5] = v;
+    cx_data[5] = v & 0xff;
     cx_data[6] = v >> 8;
 
     extra_bytes_packed = 7;
@@ -1131,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
     if (xd->update_mb_segmentation_data) {
       signed char Data;
 
-      vp8_write_bit(bc, xd->mb_segement_abs_delta);
+      vp8_write_bit(bc, xd->mb_segment_abs_delta);
 
       /* For each segmentation feature (Quant and loop filter level) */
       for (i = 0; i < MB_LVL_MAX; ++i) {
@@ -1287,15 +1236,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 
   if (pc->frame_type != KEY_FRAME) vp8_write_bit(bc, pc->refresh_last_frame);
 
-#ifdef VP8_ENTROPY_STATS
-
-  if (pc->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-
-#endif
-
   vpx_clear_system_state();
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -1303,31 +1243,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 #else
   if (pc->refresh_entropy_probs == 0) {
     /* save a copy for later refresh */
-    memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+    pc->lfc = pc->fc;
   }
 
   vp8_update_coef_probs(cpi);
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-  active_section = 2;
-#endif
-
   /* Write out the mb_no_coeff_skip flag */
   vp8_write_bit(bc, pc->mb_no_coeff_skip);
 
   if (pc->frame_type == KEY_FRAME) {
     write_kfmodes(cpi);
-
-#ifdef VP8_ENTROPY_STATS
-    active_section = 8;
-#endif
   } else {
     pack_inter_mode_mvs(cpi);
-
-#ifdef VP8_ENTROPY_STATS
-    active_section = 1;
-#endif
   }
 
   vp8_stop_encode(bc);
@@ -1338,11 +1266,30 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
 
   /* update frame tag */
   {
+    /* Pack partition size, show frame, version and frame type into to 24 bits.
+     * Store it 8 bits at a time.
+     * https://tools.ietf.org/html/rfc6386
+     * 9.1. Uncompressed Data Chunk
+     *    The uncompressed data chunk comprises a common (for key frames and
+     *    interframes) 3-byte frame tag that contains four fields, as follows:
+     *
+     *    1.  A 1-bit frame type (0 for key frames, 1 for interframes).
+     *
+     *    2.  A 3-bit version number (0 - 3 are defined as four different
+     *        profiles with different decoding complexity; other values may be
+     *        defined for future variants of the VP8 data format).
+     *
+     *    3.  A 1-bit show_frame flag (0 when current frame is not for display,
+     *        1 when current frame is for display).
+     *
+     *    4.  A 19-bit field containing the size of the first data partition in
+     *        bytes
+     */
     int v = (oh.first_partition_length_in_bytes << 5) | (oh.show_frame << 4) |
             (oh.version << 1) | oh.type;
 
-    dest[0] = v;
-    dest[1] = v >> 8;
+    dest[0] = v & 0xff;
+    dest[1] = (v >> 8) & 0xff;
     dest[2] = v >> 16;
   }
 
@@ -1416,7 +1363,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
     vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
       pack_mb_row_tokens(cpi, &cpi->bc[1]);
     } else {
       vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
@@ -1432,50 +1379,3 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
   }
 #endif
 }
-
-#ifdef VP8_ENTROPY_STATS
-void print_tree_update_probs() {
-  int i, j, k, l;
-  FILE *f = fopen("context.c", "a");
-  int Sum;
-  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
-  fprintf(f,
-          "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] "
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n");
-
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    fprintf(f, "  { \n");
-
-    for (j = 0; j < COEF_BANDS; ++j) {
-      fprintf(f, "    {\n");
-
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        fprintf(f, "      {");
-
-        for (l = 0; l < ENTROPY_NODES; ++l) {
-          Sum =
-              tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];
-
-          if (Sum > 0) {
-            if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)
-              fprintf(f, "%3ld, ",
-                      (tree_update_hist[i][j][k][l][0] * 255) / Sum);
-            else
-              fprintf(f, "%3ld, ", 1);
-          } else
-            fprintf(f, "%3ld, ", 128);
-        }
-
-        fprintf(f, "},\n");
-      }
-
-      fprintf(f, "    },\n");
-    }
-
-    fprintf(f, "  },\n");
-  }
-
-  fprintf(f, "};\n");
-  fclose(f);
-}
-#endif
diff --git a/media/libvpx/libvpx/vp8/encoder/bitstream.h b/media/libvpx/libvpx/vp8/encoder/bitstream.h
index 2b196dcd27..ee3f3e4aab 100644
--- a/media/libvpx/libvpx/vp8/encoder/bitstream.h
+++ b/media/libvpx/libvpx/vp8/encoder/bitstream.h
@@ -8,17 +8,25 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_BITSTREAM_H_
-#define VP8_ENCODER_BITSTREAM_H_
+#ifndef VPX_VP8_ENCODER_BITSTREAM_H_
+#define VPX_VP8_ENCODER_BITSTREAM_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#include "vp8/encoder/treewriter.h"
+#include "vp8/encoder/tokenize.h"
+
 void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi);
+void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
+                              int prob_last, int prob_garf);
+int vp8_estimate_entropy_savings(struct VP8_COMP *cpi);
+void vp8_update_coef_probs(struct VP8_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_BITSTREAM_H_
+#endif  // VPX_VP8_ENCODER_BITSTREAM_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/block.h b/media/libvpx/libvpx/vp8/encoder/block.h
index f9a273bd27..f0efd3e1e2 100644
--- a/media/libvpx/libvpx/vp8/encoder/block.h
+++ b/media/libvpx/libvpx/vp8/encoder/block.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_BLOCK_H_
-#define VP8_ENCODER_BLOCK_H_
+#ifndef VPX_VP8_ENCODER_BLOCK_H_
+#define VPX_VP8_ENCODER_BLOCK_H_
 
 #include "vp8/common/onyx.h"
 #include "vp8/common/blockd.h"
@@ -166,4 +166,4 @@ typedef struct macroblock {
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_BLOCK_H_
+#endif  // VPX_VP8_ENCODER_BLOCK_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/boolhuff.c b/media/libvpx/libvpx/vp8/encoder/boolhuff.c
index 04f8db9331..819c2f22a0 100644
--- a/media/libvpx/libvpx/vp8/encoder/boolhuff.c
+++ b/media/libvpx/libvpx/vp8/encoder/boolhuff.c
@@ -15,10 +15,6 @@ unsigned __int64 Sectionbits[500];
 
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-unsigned int active_section = 0;
-#endif
-
 const unsigned int vp8_prob_cost[256] = {
   2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129,
   1099, 1072, 1046, 1023, 1000, 979,  959,  940,  922,  905,  889,  873,  858,
@@ -42,26 +38,26 @@ const unsigned int vp8_prob_cost[256] = {
   12,   10,   9,    7,    6,    4,    3,    1,    1
 };
 
-void vp8_start_encode(BOOL_CODER *br, unsigned char *source,
+void vp8_start_encode(BOOL_CODER *bc, unsigned char *source,
                       unsigned char *source_end) {
-  br->lowvalue = 0;
-  br->range = 255;
-  br->count = -24;
-  br->buffer = source;
-  br->buffer_end = source_end;
-  br->pos = 0;
+  bc->lowvalue = 0;
+  bc->range = 255;
+  bc->count = -24;
+  bc->buffer = source;
+  bc->buffer_end = source_end;
+  bc->pos = 0;
 }
 
-void vp8_stop_encode(BOOL_CODER *br) {
+void vp8_stop_encode(BOOL_CODER *bc) {
   int i;
 
-  for (i = 0; i < 32; ++i) vp8_encode_bool(br, 0, 128);
+  for (i = 0; i < 32; ++i) vp8_encode_bool(bc, 0, 128);
 }
 
-void vp8_encode_value(BOOL_CODER *br, int data, int bits) {
+void vp8_encode_value(BOOL_CODER *bc, int data, int bits) {
   int bit;
 
   for (bit = bits - 1; bit >= 0; bit--) {
-    vp8_encode_bool(br, (1 & (data >> bit)), 0x80);
+    vp8_encode_bool(bc, (1 & (data >> bit)), 0x80);
   }
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/boolhuff.h b/media/libvpx/libvpx/vp8/encoder/boolhuff.h
index d001eea9cd..a8c536b99c 100644
--- a/media/libvpx/libvpx/vp8/encoder/boolhuff.h
+++ b/media/libvpx/libvpx/vp8/encoder/boolhuff.h
@@ -9,14 +9,14 @@
  */
 
 /****************************************************************************
-*
-*   Module Title :     boolhuff.h
-*
-*   Description  :     Bool Coder header file.
-*
-****************************************************************************/
-#ifndef VP8_ENCODER_BOOLHUFF_H_
-#define VP8_ENCODER_BOOLHUFF_H_
+ *
+ *   Module Title :     boolhuff.h
+ *
+ *   Description  :     Bool Coder header file.
+ *
+ ****************************************************************************/
+#ifndef VPX_VP8_ENCODER_BOOLHUFF_H_
+#define VPX_VP8_ENCODER_BOOLHUFF_H_
 
 #include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
@@ -35,11 +35,11 @@ typedef struct {
   struct vpx_internal_error_info *error;
 } BOOL_CODER;
 
-extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer,
-                             unsigned char *buffer_end);
+void vp8_start_encode(BOOL_CODER *bc, unsigned char *source,
+                      unsigned char *source_end);
 
-extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp8_stop_encode(BOOL_CODER *bc);
+void vp8_encode_value(BOOL_CODER *bc, int data, int bits);
+void vp8_stop_encode(BOOL_CODER *bc);
 extern const unsigned int vp8_prob_cost[256];
 
 DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
@@ -56,23 +56,12 @@ static int validate_buffer(const unsigned char *start, size_t len,
 
   return 0;
 }
-static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) {
+static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) {
   unsigned int split;
-  int count = br->count;
-  unsigned int range = br->range;
-  unsigned int lowvalue = br->lowvalue;
-  register int shift;
-
-#ifdef VP8_ENTROPY_STATS
-#if defined(SECTIONBITS_OUTPUT)
-
-  if (bit)
-    Sectionbits[active_section] += vp8_prob_cost[255 - probability];
-  else
-    Sectionbits[active_section] += vp8_prob_cost[probability];
-
-#endif
-#endif
+  int count = bc->count;
+  unsigned int range = bc->range;
+  unsigned int lowvalue = bc->lowvalue;
+  int shift;
 
   split = 1 + (((range - 1) * probability) >> 8);
 
@@ -80,7 +69,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) {
 
   if (bit) {
     lowvalue += split;
-    range = br->range - split;
+    range = bc->range - split;
   }
 
   shift = vp8_norm[range];
@@ -92,33 +81,32 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) {
     int offset = shift - count;
 
     if ((lowvalue << (offset - 1)) & 0x80000000) {
-      int x = br->pos - 1;
+      int x = bc->pos - 1;
 
-      while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = (unsigned char)0;
+      while (x >= 0 && bc->buffer[x] == 0xff) {
+        bc->buffer[x] = (unsigned char)0;
         x--;
       }
 
-      br->buffer[x] += 1;
+      bc->buffer[x] += 1;
     }
 
-    validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error);
-    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+    validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error);
+    bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff);
 
-    lowvalue <<= offset;
     shift = count;
-    lowvalue &= 0xffffff;
+    lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
     count -= 8;
   }
 
   lowvalue <<= shift;
-  br->count = count;
-  br->lowvalue = lowvalue;
-  br->range = range;
+  bc->count = count;
+  bc->lowvalue = lowvalue;
+  bc->range = range;
 }
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_BOOLHUFF_H_
+#endif  // VPX_VP8_ENCODER_BOOLHUFF_H_
diff --git a/media/libvpx/libvpx/vp8/common/copy_c.c b/media/libvpx/libvpx/vp8/encoder/copy_c.c
similarity index 100%
rename from media/libvpx/libvpx/vp8/common/copy_c.c
rename to media/libvpx/libvpx/vp8/encoder/copy_c.c
diff --git a/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h b/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h
index 278dce73f4..0cd6cb4e65 100644
--- a/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h
+++ b/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_DCT_VALUE_COST_H_
-#define VP8_ENCODER_DCT_VALUE_COST_H_
+#ifndef VPX_VP8_ENCODER_DCT_VALUE_COST_H_
+#define VPX_VP8_ENCODER_DCT_VALUE_COST_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -341,4 +341,4 @@ static const short dct_value_cost[2048 * 2] = {
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_DCT_VALUE_COST_H_
+#endif  // VPX_VP8_ENCODER_DCT_VALUE_COST_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h b/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h
index 0597deab2d..5cc4505f09 100644
--- a/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h
+++ b/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_
-#define VP8_ENCODER_DCT_VALUE_TOKENS_H_
+#ifndef VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_
+#define VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -845,4 +845,4 @@ static const TOKENVALUE dct_value_tokens[2048 * 2] = {
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_DCT_VALUE_TOKENS_H_
+#endif  // VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h b/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h
index 2976325dc5..a3ab34c8a0 100644
--- a/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h
+++ b/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
-#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+#ifndef VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+#define VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -232,4 +232,4 @@ static const unsigned int default_coef_counts
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+#endif  // VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/denoising.c b/media/libvpx/libvpx/vp8/encoder/denoising.c
index eb963b97e3..a666bca4d2 100644
--- a/media/libvpx/libvpx/vp8/encoder/denoising.c
+++ b/media/libvpx/libvpx/vp8/encoder/denoising.c
@@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
     // When adopting aggressive denoiser, the adj_val for each pixel
     // could be at most 8 (this is current max adjustment of the map).
     // In SSE code, we calculate the sum of adj_val for
-    // the columns, so the sum could be upto 128(16 rows). However,
+    // the columns, so the sum could be up to 128(16 rows). However,
     // the range of the value is -128 ~ 127 in SSE code, that's why
     // we do this change in C code.
     // We don't do this for UV denoiser, since there are only 8 rows,
@@ -213,13 +213,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
   return FILTER_BLOCK;
 }
 
-int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
-                             int mc_avg_uv_stride,
-                             unsigned char *running_avg_uv, int avg_uv_stride,
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride,
+                             unsigned char *running_avg, int avg_stride,
                              unsigned char *sig, int sig_stride,
                              unsigned int motion_magnitude,
                              int increase_denoising) {
-  unsigned char *running_avg_uv_start = running_avg_uv;
+  unsigned char *running_avg_start = running_avg;
   unsigned char *sig_start = sig;
   int sum_diff_thresh;
   int r, c;
@@ -259,13 +258,13 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
       int adjustment = 0;
       int absdiff = 0;
 
-      diff = mc_running_avg_uv[c] - sig[c];
+      diff = mc_running_avg[c] - sig[c];
       absdiff = abs(diff);
 
       // When |diff| <= |3 + shift_inc1|, use pixel value from
       // last denoised raw.
       if (absdiff <= 3 + shift_inc1) {
-        running_avg_uv[c] = mc_running_avg_uv[c];
+        running_avg[c] = mc_running_avg[c];
         sum_diff += diff;
       } else {
         if (absdiff >= 4 && absdiff <= 7) {
@@ -277,16 +276,16 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
         }
         if (diff > 0) {
           if ((sig[c] + adjustment) > 255) {
-            running_avg_uv[c] = 255;
+            running_avg[c] = 255;
           } else {
-            running_avg_uv[c] = sig[c] + adjustment;
+            running_avg[c] = sig[c] + adjustment;
           }
           sum_diff += adjustment;
         } else {
           if ((sig[c] - adjustment) < 0) {
-            running_avg_uv[c] = 0;
+            running_avg[c] = 0;
           } else {
-            running_avg_uv[c] = sig[c] - adjustment;
+            running_avg[c] = sig[c] - adjustment;
           }
           sum_diff -= adjustment;
         }
@@ -294,8 +293,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     }
     /* Update pointers for next iteration. */
     sig += sig_stride;
-    mc_running_avg_uv += mc_avg_uv_stride;
-    running_avg_uv += avg_uv_stride;
+    mc_running_avg += mc_avg_stride;
+    running_avg += avg_stride;
   }
 
   sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
@@ -314,27 +313,27 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     // Only apply the adjustment for max delta up to 3.
     if (delta < 4) {
       sig -= sig_stride * 8;
-      mc_running_avg_uv -= mc_avg_uv_stride * 8;
-      running_avg_uv -= avg_uv_stride * 8;
+      mc_running_avg -= mc_avg_stride * 8;
+      running_avg -= avg_stride * 8;
       for (r = 0; r < 8; ++r) {
         for (c = 0; c < 8; ++c) {
-          int diff = mc_running_avg_uv[c] - sig[c];
+          int diff = mc_running_avg[c] - sig[c];
           int adjustment = abs(diff);
           if (adjustment > delta) adjustment = delta;
           if (diff > 0) {
             // Bring denoised signal down.
-            if (running_avg_uv[c] - adjustment < 0) {
-              running_avg_uv[c] = 0;
+            if (running_avg[c] - adjustment < 0) {
+              running_avg[c] = 0;
             } else {
-              running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              running_avg[c] = running_avg[c] - adjustment;
             }
             sum_diff -= adjustment;
           } else if (diff < 0) {
             // Bring denoised signal up.
-            if (running_avg_uv[c] + adjustment > 255) {
-              running_avg_uv[c] = 255;
+            if (running_avg[c] + adjustment > 255) {
+              running_avg[c] = 255;
             } else {
-              running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              running_avg[c] = running_avg[c] + adjustment;
             }
             sum_diff += adjustment;
           }
@@ -342,8 +341,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
         // TODO(marpan): Check here if abs(sum_diff) has gone below the
         // threshold sum_diff_thresh, and if so, we can exit the row loop.
         sig += sig_stride;
-        mc_running_avg_uv += mc_avg_uv_stride;
-        running_avg_uv += avg_uv_stride;
+        mc_running_avg += mc_avg_stride;
+        running_avg += avg_stride;
       }
       if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK;
     } else {
@@ -351,7 +350,7 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     }
   }
 
-  vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, sig_stride);
+  vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
   return FILTER_BLOCK;
 }
 
diff --git a/media/libvpx/libvpx/vp8/encoder/denoising.h b/media/libvpx/libvpx/vp8/encoder/denoising.h
index 91d87b3a1c..51ae3b0ab3 100644
--- a/media/libvpx/libvpx/vp8/encoder/denoising.h
+++ b/media/libvpx/libvpx/vp8/encoder/denoising.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_DENOISING_H_
-#define VP8_ENCODER_DENOISING_H_
+#ifndef VPX_VP8_ENCODER_DENOISING_H_
+#define VPX_VP8_ENCODER_DENOISING_H_
 
 #include "block.h"
 #include "vp8/common/loopfilter.h"
@@ -100,4 +100,4 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x,
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_DENOISING_H_
+#endif  // VPX_VP8_ENCODER_DENOISING_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
index c7ad3bfe2c..97855ae003 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c
@@ -7,41 +7,37 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <limits.h>
+#include <stdio.h>
 
 #include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "encodemb.h"
-#include "encodemv.h"
+
 #include "vp8/common/common.h"
-#include "onyx_int.h"
-#include "vp8/common/extend.h"
 #include "vp8/common/entropymode.h"
-#include "vp8/common/quant_common.h"
-#include "segmentation.h"
-#include "vp8/common/setupintrarecon.h"
-#include "encodeintra.h"
-#include "vp8/common/reconinter.h"
-#include "rdopt.h"
-#include "pickinter.h"
-#include "vp8/common/findnearmv.h"
-#include <stdio.h>
-#include <limits.h>
+#include "vp8/common/extend.h"
 #include "vp8/common/invtrans.h"
+#include "vp8/common/quant_common.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/threading.h"
+#include "vp8/encoder/bitstream.h"
+#include "vp8/encoder/encodeframe.h"
+#include "vp8/encoder/encodeintra.h"
+#include "vp8/encoder/encodemb.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/pickinter.h"
+#include "vp8/encoder/rdopt.h"
+#include "vp8_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_dsp_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_timer.h"
-#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-#include "bitstream.h"
+
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
 #endif
-#include "encodeframe.h"
 
 extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
-extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
-                                     int prob_last, int prob_garf);
-extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
-extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
-extern void vp8_auto_select_speed(VP8_COMP *cpi);
-extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
-                                      MB_ROW_COMP *mbr_ei, int count);
 static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
 
 #ifdef MODE_STATS
@@ -67,15 +63,14 @@ unsigned int b_modes[14] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  * Eventually this should be replaced by custom no-reference routines,
  *  which will be faster.
  */
-static const unsigned char VP8_VAR_OFFS[16] = {
-  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-};
+static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128,
+                                                128, 128, 128, 128, 128, 128,
+                                                128, 128, 128, 128 };
 
 /* Original activity measure from Tim T's code. */
-static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
+static unsigned int tt_activity_measure(MACROBLOCK *x) {
   unsigned int act;
   unsigned int sse;
-  (void)cpi;
   /* TODO: This could also be done over smaller areas (8x8), but that would
    *  require extensive changes elsewhere, as lambda is assumed to be fixed
    *  over an entire MB in most of the code.
@@ -93,28 +88,21 @@ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
   return act;
 }
 
-/* Stub for alternative experimental activity measures. */
-static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                         int use_dc_pred) {
-  return vp8_encode_intra(cpi, x, use_dc_pred);
-}
-
 /* Measure the activity of the current macroblock
  * What we measure here is TBD so abstracted to this function
  */
 #define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                        int mb_row, int mb_col) {
+static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) {
   unsigned int mb_activity;
 
   if (ALT_ACT_MEASURE) {
     int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-    /* Or use and alternative. */
-    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+    /* Or use an alternative. */
+    mb_activity = vp8_encode_intra(x, use_dc_pred);
   } else {
     /* Original activity measure from Tim T's code. */
-    mb_activity = tt_activity_measure(cpi, x);
+    mb_activity = tt_activity_measure(x);
   }
 
   if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN;
@@ -134,7 +122,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) {
     unsigned int tmp;
 
     /* Create a list to sort to */
-    CHECK_MEM_ERROR(sortlist,
+    CHECK_MEM_ERROR(&cpi->common.error, sortlist,
                     vpx_calloc(sizeof(unsigned int), cpi->common.MBs));
 
     /* Copy map to sort list */
@@ -267,7 +255,7 @@ static void build_activity_map(VP8_COMP *cpi) {
       vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
       /* measure activity */
-      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+      mb_activity = mb_activity_measure(x, mb_row, mb_col);
 
       /* Keep frame sum */
       activity_sum += mb_activity;
@@ -344,11 +332,14 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
 
 #if CONFIG_MULTITHREAD
   const int nsync = cpi->mt_sync_range;
-  const int rightmost_col = cm->mb_cols + nsync;
-  const int *last_row_current_mb_col;
-  int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+  vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync);
+  const vpx_atomic_int *last_row_current_mb_col;
+  vpx_atomic_int *current_mb_col = NULL;
 
-  if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
+    current_mb_col = &cpi->mt_current_mb_col[mb_row];
+  }
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) {
     last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
   } else {
     last_row_current_mb_col = &rightmost_col;
@@ -418,15 +409,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
     vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded != 0) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
       if (((mb_col - 1) % nsync) == 0) {
-        pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
-        protected_write(mutex, current_mb_col, mb_col - 1);
+        vpx_atomic_store_release(current_mb_col, mb_col - 1);
       }
 
       if (mb_row && !(mb_col & (nsync - 1))) {
-        pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
-        sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+        vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
       }
     }
 #endif
@@ -455,13 +444,21 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
     x->active_ptr = cpi->active_map + map_index + mb_col;
 
     if (cm->frame_type == KEY_FRAME) {
-      *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp);
+      const int intra_rate_cost = vp8cx_encode_intra_macroblock(cpi, x, tp);
+      if (INT_MAX - *totalrate > intra_rate_cost)
+        *totalrate += intra_rate_cost;
+      else
+        *totalrate = INT_MAX;
 #ifdef MODE_STATS
       y_modes[xd->mbmi.mode]++;
 #endif
     } else {
-      *totalrate += vp8cx_encode_inter_macroblock(
+      const int inter_rate_cost = vp8cx_encode_inter_macroblock(
           cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+      if (INT_MAX - *totalrate > inter_rate_cost)
+        *totalrate += inter_rate_cost;
+      else
+        *totalrate = INT_MAX;
 
 #ifdef MODE_STATS
       inter_y_modes[xd->mbmi.mode]++;
@@ -566,8 +563,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
                     xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
 #if CONFIG_MULTITHREAD
-  if (cpi->b_multi_threaded != 0) {
-    protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col);
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
+    vpx_atomic_store_release(current_mb_col,
+                             vpx_atomic_load_acquire(&rightmost_col));
   }
 #endif
 
@@ -635,12 +633,13 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) {
                              cpi->prob_last_coded, cpi->prob_gf_coded);
   }
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (cm->full_pixel) xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = ~0;
+  if (cm->full_pixel) xd->fullpixel_mask = ~7;
 
   vp8_zero(x->coef_counts);
   vp8_zero(x->ymode_count);
-  vp8_zero(x->uv_mode_count) x->prediction_error = 0;
+  vp8_zero(x->uv_mode_count);
+  x->prediction_error = 0;
   x->intra_error = 0;
   vp8_zero(x->count_mb_ref_frame_usage);
 }
@@ -748,30 +747,42 @@ void vp8_encode_frame(VP8_COMP *cpi) {
 #endif
 
   {
+#if CONFIG_INTERNAL_STATS
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
+#endif
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
       int i;
 
       vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
                                 cpi->encoding_thread_count);
 
-      for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1;
+      if (cpi->mt_current_mb_col_size != cm->mb_rows) {
+        vpx_free(cpi->mt_current_mb_col);
+        cpi->mt_current_mb_col = NULL;
+        cpi->mt_current_mb_col_size = 0;
+        CHECK_MEM_ERROR(
+            &cpi->common.error, cpi->mt_current_mb_col,
+            vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+        cpi->mt_current_mb_col_size = cm->mb_rows;
+      }
+      for (i = 0; i < cm->mb_rows; ++i)
+        vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
 
       for (i = 0; i < cpi->encoding_thread_count; ++i) {
-        sem_post(&cpi->h_event_start_encoding[i]);
+        vp8_sem_post(&cpi->h_event_start_encoding[i]);
       }
 
       for (mb_row = 0; mb_row < cm->mb_rows;
            mb_row += (cpi->encoding_thread_count + 1)) {
-        vp8_zero(cm->left_context)
+        vp8_zero(cm->left_context);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-            tp = cpi->tok;
+        tp = cpi->tok;
 #else
-            tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+        tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
 #endif
 
         encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
@@ -794,7 +805,7 @@ void vp8_encode_frame(VP8_COMP *cpi) {
       }
       /* Wait for all the threads to finish. */
       for (i = 0; i < cpi->encoding_thread_count; ++i) {
-        sem_wait(&cpi->h_event_end_encoding[i]);
+        vp8_sem_wait(&cpi->h_event_end_encoding[i]);
       }
 
       for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
@@ -858,10 +869,10 @@ void vp8_encode_frame(VP8_COMP *cpi) {
 
       /* for each macroblock row in image */
       for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-        vp8_zero(cm->left_context)
+        vp8_zero(cm->left_context);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-            tp = cpi->tok;
+        tp = cpi->tok;
 #endif
 
         encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
@@ -885,8 +896,10 @@ void vp8_encode_frame(VP8_COMP *cpi) {
     }
 #endif
 
+#if CONFIG_INTERNAL_STATS
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+#endif
   }
 
   // Work out the segment probabilities if segmentation is enabled
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.h b/media/libvpx/libvpx/vp8/encoder/encodeframe.h
index c1d8634927..cc8cf4d713 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeframe.h
+++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.h
@@ -7,29 +7,34 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VP8_ENCODER_ENCODEFRAME_H_
-#define VP8_ENCODER_ENCODEFRAME_H_
+#ifndef VPX_VP8_ENCODER_ENCODEFRAME_H_
+#define VPX_VP8_ENCODER_ENCODEFRAME_H_
+
+#include "vp8/encoder/tokenize.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
 
-extern void vp8_build_block_offsets(MACROBLOCK *x);
+struct VP8_COMP;
+struct macroblock;
 
-extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x);
 
-extern void vp8_encode_frame(VP8_COMP *cpi);
+void vp8_build_block_offsets(struct macroblock *x);
 
-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t, int recon_yoffset,
-                                         int recon_uvoffset, int mb_row,
-                                         int mb_col);
+void vp8_setup_block_ptrs(struct macroblock *x);
 
-extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                         TOKENEXTRA **t);
+void vp8_encode_frame(struct VP8_COMP *cpi);
+
+int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+                                  TOKENEXTRA **t, int recon_yoffset,
+                                  int recon_uvoffset, int mb_row, int mb_col);
+
+int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+                                  TOKENEXTRA **t);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_ENCODEFRAME_H_
+#endif  // VPX_VP8_ENCODER_ENCODEFRAME_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeintra.c b/media/libvpx/libvpx/vp8/encoder/encodeintra.c
index f89e7cb1fa..7d448c0ea0 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeintra.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodeintra.c
@@ -18,10 +18,9 @@
 #include "vp8/common/invtrans.h"
 #include "encodeintra.h"
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) {
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) {
   int i;
   int intra_pred_var = 0;
-  (void)cpi;
 
   if (use_dc_pred) {
     x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
diff --git a/media/libvpx/libvpx/vp8/encoder/encodeintra.h b/media/libvpx/libvpx/vp8/encoder/encodeintra.h
index 3956cf5fb1..9a378abf49 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodeintra.h
+++ b/media/libvpx/libvpx/vp8/encoder/encodeintra.h
@@ -8,15 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_ENCODEINTRA_H_
-#define VP8_ENCODER_ENCODEINTRA_H_
+#ifndef VPX_VP8_ENCODER_ENCODEINTRA_H_
+#define VPX_VP8_ENCODER_ENCODEINTRA_H_
 #include "onyx_int.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred);
 void vp8_encode_intra16x16mby(MACROBLOCK *x);
 void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
 void vp8_encode_intra4x4mby(MACROBLOCK *mb);
@@ -25,4 +25,4 @@ void vp8_encode_intra4x4block(MACROBLOCK *x, int ib);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_ENCODEINTRA_H_
+#endif  // VPX_VP8_ENCODER_ENCODEINTRA_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/encodemb.c b/media/libvpx/libvpx/vp8/encoder/encodemb.c
index 3fd8d5fabe..052d09ba3b 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodemb.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodemb.c
@@ -396,8 +396,8 @@ static void optimize_mb(MACROBLOCK *x) {
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *x->e_mbd.above_context;
+  t_left = *x->e_mbd.left_context;
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
@@ -437,8 +437,8 @@ void vp8_optimize_mby(MACROBLOCK *x) {
 
   if (!x->e_mbd.left_context) return;
 
-  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *x->e_mbd.above_context;
+  t_left = *x->e_mbd.left_context;
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
@@ -470,8 +470,8 @@ void vp8_optimize_mbuv(MACROBLOCK *x) {
 
   if (!x->e_mbd.left_context) return;
 
-  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *x->e_mbd.above_context;
+  t_left = *x->e_mbd.left_context;
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
diff --git a/media/libvpx/libvpx/vp8/encoder/encodemb.h b/media/libvpx/libvpx/vp8/encoder/encodemb.h
index b55ba3ac3f..db577ddc10 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodemb.h
+++ b/media/libvpx/libvpx/vp8/encoder/encodemb.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_ENCODEMB_H_
-#define VP8_ENCODER_ENCODEMB_H_
+#ifndef VPX_VP8_ENCODER_ENCODEMB_H_
+#define VPX_VP8_ENCODER_ENCODEMB_H_
 
 #include "onyx_int.h"
 
@@ -37,4 +37,4 @@ void vp8_encode_inter16x16y(MACROBLOCK *x);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_ENCODEMB_H_
+#endif  // VPX_VP8_ENCODER_ENCODEMB_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/encodemv.c b/media/libvpx/libvpx/vp8/encoder/encodemv.c
index 36e9a9078c..384bb29389 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodemv.c
+++ b/media/libvpx/libvpx/vp8/encoder/encodemv.c
@@ -16,38 +16,30 @@
 
 #include <math.h>
 
-#ifdef VP8_ENTROPY_STATS
-extern unsigned int active_section;
-#endif
-
 static void encode_mvcomponent(vp8_writer *const w, const int v,
                                const struct mv_context *mvc) {
   const vp8_prob *p = mvc->prob;
   const int x = v < 0 ? -v : v;
 
-  if (x < mvnum_short) /* Small */
-  {
+  if (x < mvnum_short) { /* Small */
     vp8_write(w, 0, p[mvpis_short]);
     vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
 
     if (!x) return; /* no sign bit */
-  } else            /* Large */
-  {
+  } else {          /* Large */
     int i = 0;
 
     vp8_write(w, 1, p[mvpis_short]);
 
-    do
+    do {
       vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
-    while (++i < 3);
+    } while (++i < 3);
 
     i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
 
-    do
+    do {
       vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
-    while (--i > 3);
+    } while (--i > 3);
 
     if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]);
   }
@@ -166,7 +158,7 @@ static void calc_prob(vp8_prob *p, const unsigned int ct[2]) {
   const unsigned int tot = ct[0] + ct[1];
 
   if (tot) {
-    const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+    const vp8_prob x = ((ct[0] * 255) / tot) & ~1u;
     *p = x ? x : 1;
   }
 }
@@ -211,8 +203,11 @@ static void write_component_probs(vp8_writer *const w,
   (void)rc;
   vp8_copy_array(Pnew, default_mvc, MVPcount);
 
-  vp8_zero(is_short_ct) vp8_zero(sign_ct) vp8_zero(bit_ct) vp8_zero(short_ct)
-      vp8_zero(short_bct)
+  vp8_zero(is_short_ct);
+  vp8_zero(sign_ct);
+  vp8_zero(bit_ct);
+  vp8_zero(short_ct);
+  vp8_zero(short_bct);
 
   /* j=0 */
   {
@@ -311,9 +306,6 @@ void vp8_write_mvprobs(VP8_COMP *cpi) {
   vp8_writer *const w = cpi->bc;
   MV_CONTEXT *mvc = cpi->common.fc.mvc;
   int flags[2] = { 0, 0 };
-#ifdef VP8_ENTROPY_STATS
-  active_section = 4;
-#endif
   write_component_probs(w, &mvc[0], &vp8_default_mv_context[0],
                         &vp8_mv_update_probs[0], cpi->mb.MVcount[0], 0,
                         &flags[0]);
@@ -325,8 +317,4 @@ void vp8_write_mvprobs(VP8_COMP *cpi) {
     vp8_build_component_cost_table(
         cpi->mb.mvcost, (const MV_CONTEXT *)cpi->common.fc.mvc, flags);
   }
-
-#ifdef VP8_ENTROPY_STATS
-  active_section = 5;
-#endif
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/encodemv.h b/media/libvpx/libvpx/vp8/encoder/encodemv.h
index 87db30f310..347b9feffe 100644
--- a/media/libvpx/libvpx/vp8/encoder/encodemv.h
+++ b/media/libvpx/libvpx/vp8/encoder/encodemv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_ENCODEMV_H_
-#define VP8_ENCODER_ENCODEMV_H_
+#ifndef VPX_VP8_ENCODER_ENCODEMV_H_
+#define VPX_VP8_ENCODER_ENCODEMV_H_
 
 #include "onyx_int.h"
 
@@ -26,4 +26,4 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc,
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_ENCODEMV_H_
+#endif  // VPX_VP8_ENCODER_ENCODEMV_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c
index df34997acc..98c87d3cbc 100644
--- a/media/libvpx/libvpx/vp8/encoder/ethreading.c
+++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c
@@ -7,49 +7,52 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stddef.h>
 
 #include "onyx_int.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp8/common/threading.h"
 #include "vp8/common/common.h"
 #include "vp8/common/extend.h"
 #include "bitstream.h"
 #include "encodeframe.h"
+#include "ethreading.h"
 
 #if CONFIG_MULTITHREAD
 
 extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x,
                                     int ok_to_skip);
 
-static THREAD_FUNCTION thread_loopfilter(void *p_data) {
+static THREADFN thread_loopfilter(void *p_data) {
   VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
   VP8_COMMON *cm = &cpi->common;
 
   while (1) {
-    if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
-    if (sem_wait(&cpi->h_event_start_lpf) == 0) {
+    if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) {
       /* we're shutting down */
-      if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+      if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
       vp8_loopfilter_frame(cpi, cm);
 
-      sem_post(&cpi->h_event_end_lpf);
+      vp8_sem_post(&cpi->h_event_end_lpf);
     }
   }
 
-  return 0;
+  return THREAD_EXIT_SUCCESS;
 }
 
-static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
+static THREADFN thread_encoding_proc(void *p_data) {
   int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
   VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
   MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
   ENTROPY_CONTEXT_PLANES mb_row_left_context;
 
   while (1) {
-    if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
-    if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
+    if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
       const int nsync = cpi->mt_sync_range;
       VP8_COMMON *cm = &cpi->common;
       int mb_row;
@@ -65,7 +68,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
       int *totalrate = &mbri->totalrate;
 
       /* we're shutting down */
-      if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+      if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
       xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1);
       xd->mode_info_stride = cm->mode_info_stride;
@@ -79,8 +82,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
         int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
         int map_index = (mb_row * cm->mb_cols);
-        const int *last_row_current_mb_col;
-        int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+        const vpx_atomic_int *last_row_current_mb_col;
+        vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
 #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
         vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
@@ -107,13 +110,11 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         /* for each macroblock col in image */
         for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
           if (((mb_col - 1) % nsync) == 0) {
-            pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
-            protected_write(mutex, current_mb_col, mb_col - 1);
+            vpx_atomic_store_release(current_mb_col, mb_col - 1);
           }
 
           if (mb_row && !(mb_col & (nsync - 1))) {
-            pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
-            sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+            vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
           }
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -285,7 +286,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16,
                           xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
-        protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+        vpx_atomic_store_release(current_mb_col, mb_col + nsync);
 
         /* this is to account for the border */
         xd->mode_info_context++;
@@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
       }
       /* Signal that this thread has completed processing its rows. */
-      sem_post(&cpi->h_event_end_encoding[ithread]);
+      vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
     }
   }
 
   /* printf("exit thread %d\n", ithread); */
-  return 0;
+  return THREAD_EXIT_SUCCESS;
 }
 
 static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
@@ -403,7 +404,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) {
     zd->subpixel_predict8x8 = xd->subpixel_predict8x8;
     zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
     zd->segmentation_enabled = xd->segmentation_enabled;
-    zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+    zd->mb_segment_abs_delta = xd->mb_segment_abs_delta;
     memcpy(zd->segment_feature_data, xd->segment_feature_data,
            sizeof(xd->segment_feature_data));
 
@@ -471,8 +472,8 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
 
     setup_mbby_copy(&mbr_ei[i].mb, x);
 
-    mbd->fullpixel_mask = 0xffffffff;
-    if (cm->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
+    mbd->fullpixel_mask = ~0;
+    if (cm->full_pixel) mbd->fullpixel_mask = ~7;
 
     vp8_zero(mb->coef_counts);
     vp8_zero(x->ymode_count);
@@ -488,17 +489,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
 
 int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
   const VP8_COMMON *cm = &cpi->common;
-
-  cpi->b_multi_threaded = 0;
-  cpi->encoding_thread_count = 0;
-  cpi->b_lpf_running = 0;
-
-  pthread_mutex_init(&cpi->mt_mutex, NULL);
+  int th_count = 0;
 
   if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) {
-    int ithread;
-    int th_count = cpi->oxcf.multi_threaded - 1;
-    int rc = 0;
+    th_count = cpi->oxcf.multi_threaded - 1;
 
     /* don't allocate more threads than cores available */
     if (cpi->oxcf.multi_threaded > cm->processor_core_count) {
@@ -510,22 +504,27 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
     if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) {
       th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
     }
+  }
+  if (th_count == cpi->encoding_thread_count) return 0;
 
-    if (th_count == 0) return 0;
+  vp8cx_remove_encoder_threads(cpi);
+  if (th_count != 0) {
+    int ithread;
+    int rc = 0;
 
-    CHECK_MEM_ERROR(cpi->h_encoding_thread,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread,
                     vpx_malloc(sizeof(pthread_t) * th_count));
-    CHECK_MEM_ERROR(cpi->h_event_start_encoding,
-                    vpx_malloc(sizeof(sem_t) * th_count));
-    CHECK_MEM_ERROR(cpi->h_event_end_encoding,
-                    vpx_malloc(sizeof(sem_t) * th_count));
-    CHECK_MEM_ERROR(cpi->mb_row_ei,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding,
+                    vpx_malloc(sizeof(vp8_sem_t) * th_count));
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding,
+                    vpx_malloc(sizeof(vp8_sem_t) * th_count));
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei,
                     vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
     memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
-    CHECK_MEM_ERROR(cpi->en_thread_data,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data,
                     vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
 
-    cpi->b_multi_threaded = 1;
+    vpx_atomic_store_release(&cpi->b_multi_threaded, 1);
     cpi->encoding_thread_count = th_count;
 
     /*
@@ -540,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
       vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
       vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
 
-      sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
-      sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
+      vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+      vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0);
 
       ethd->ithread = ithread;
       ethd->ptr1 = (void *)cpi;
@@ -554,21 +553,27 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 
     if (rc) {
       /* shutdown other threads */
-      protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+      vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
       for (--ithread; ithread >= 0; ithread--) {
+        vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+        vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
         pthread_join(cpi->h_encoding_thread[ithread], 0);
-        sem_destroy(&cpi->h_event_start_encoding[ithread]);
-        sem_destroy(&cpi->h_event_end_encoding[ithread]);
+        vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+        vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
       }
 
       /* free thread related resources */
       vpx_free(cpi->h_event_start_encoding);
+      cpi->h_event_start_encoding = NULL;
       vpx_free(cpi->h_event_end_encoding);
+      cpi->h_event_end_encoding = NULL;
       vpx_free(cpi->h_encoding_thread);
+      cpi->h_encoding_thread = NULL;
       vpx_free(cpi->mb_row_ei);
+      cpi->mb_row_ei = NULL;
       vpx_free(cpi->en_thread_data);
-
-      pthread_mutex_destroy(&cpi->mt_mutex);
+      cpi->en_thread_data = NULL;
+      cpi->encoding_thread_count = 0;
 
       return -1;
     }
@@ -576,33 +581,37 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
     {
       LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data;
 
-      sem_init(&cpi->h_event_start_lpf, 0, 0);
-      sem_init(&cpi->h_event_end_lpf, 0, 0);
+      vp8_sem_init(&cpi->h_event_start_lpf, 0, 0);
+      vp8_sem_init(&cpi->h_event_end_lpf, 0, 0);
 
       lpfthd->ptr1 = (void *)cpi;
       rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd);
 
       if (rc) {
         /* shutdown other threads */
-        protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+        vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
         for (--ithread; ithread >= 0; ithread--) {
-          sem_post(&cpi->h_event_start_encoding[ithread]);
-          sem_post(&cpi->h_event_end_encoding[ithread]);
+          vp8_sem_post(&cpi->h_event_start_encoding[ithread]);
+          vp8_sem_post(&cpi->h_event_end_encoding[ithread]);
           pthread_join(cpi->h_encoding_thread[ithread], 0);
-          sem_destroy(&cpi->h_event_start_encoding[ithread]);
-          sem_destroy(&cpi->h_event_end_encoding[ithread]);
+          vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]);
+          vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]);
         }
-        sem_destroy(&cpi->h_event_end_lpf);
-        sem_destroy(&cpi->h_event_start_lpf);
+        vp8_sem_destroy(&cpi->h_event_end_lpf);
+        vp8_sem_destroy(&cpi->h_event_start_lpf);
 
         /* free thread related resources */
         vpx_free(cpi->h_event_start_encoding);
+        cpi->h_event_start_encoding = NULL;
         vpx_free(cpi->h_event_end_encoding);
+        cpi->h_event_end_encoding = NULL;
         vpx_free(cpi->h_encoding_thread);
+        cpi->h_encoding_thread = NULL;
         vpx_free(cpi->mb_row_ei);
+        cpi->mb_row_ei = NULL;
         vpx_free(cpi->en_thread_data);
-
-        pthread_mutex_destroy(&cpi->mt_mutex);
+        cpi->en_thread_data = NULL;
+        cpi->encoding_thread_count = 0;
 
         return -2;
       }
@@ -612,36 +621,45 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 }
 
 void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
-  if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
     /* shutdown other threads */
-    protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+    vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
     {
       int i;
 
       for (i = 0; i < cpi->encoding_thread_count; ++i) {
-        sem_post(&cpi->h_event_start_encoding[i]);
-        sem_post(&cpi->h_event_end_encoding[i]);
+        vp8_sem_post(&cpi->h_event_start_encoding[i]);
+        vp8_sem_post(&cpi->h_event_end_encoding[i]);
 
         pthread_join(cpi->h_encoding_thread[i], 0);
 
-        sem_destroy(&cpi->h_event_start_encoding[i]);
-        sem_destroy(&cpi->h_event_end_encoding[i]);
+        vp8_sem_destroy(&cpi->h_event_start_encoding[i]);
+        vp8_sem_destroy(&cpi->h_event_end_encoding[i]);
       }
 
-      sem_post(&cpi->h_event_start_lpf);
+      vp8_sem_post(&cpi->h_event_start_lpf);
       pthread_join(cpi->h_filter_thread, 0);
     }
 
-    sem_destroy(&cpi->h_event_end_lpf);
-    sem_destroy(&cpi->h_event_start_lpf);
+    vp8_sem_destroy(&cpi->h_event_end_lpf);
+    vp8_sem_destroy(&cpi->h_event_start_lpf);
+    cpi->b_lpf_running = 0;
 
     /* free thread related resources */
+    vpx_free(cpi->mt_current_mb_col);
+    cpi->mt_current_mb_col = NULL;
+    cpi->mt_current_mb_col_size = 0;
     vpx_free(cpi->h_event_start_encoding);
+    cpi->h_event_start_encoding = NULL;
     vpx_free(cpi->h_event_end_encoding);
+    cpi->h_event_end_encoding = NULL;
     vpx_free(cpi->h_encoding_thread);
+    cpi->h_encoding_thread = NULL;
     vpx_free(cpi->mb_row_ei);
+    cpi->mb_row_ei = NULL;
     vpx_free(cpi->en_thread_data);
+    cpi->en_thread_data = NULL;
+    cpi->encoding_thread_count = 0;
   }
-  pthread_mutex_destroy(&cpi->mt_mutex);
 }
 #endif
diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.h b/media/libvpx/libvpx/vp8/encoder/ethreading.h
new file mode 100644
index 0000000000..598fe60559
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/ethreading.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_ENCODER_ETHREADING_H_
+#define VPX_VP8_ENCODER_ETHREADING_H_
+
+#include "vp8/encoder/onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct macroblock;
+
+void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x,
+                               MB_ROW_COMP *mbr_ei, int count);
+int vp8cx_create_encoder_threads(struct VP8_COMP *cpi);
+void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VPX_VP8_ENCODER_ETHREADING_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.c b/media/libvpx/libvpx/vp8/encoder/firstpass.c
index 884d6e18b1..097dc0ed78 100644
--- a/media/libvpx/libvpx/vp8/encoder/firstpass.c
+++ b/media/libvpx/libvpx/vp8/encoder/firstpass.c
@@ -10,6 +10,7 @@
 
 #include <math.h>
 #include <limits.h>
+#include <stdint.h>
 #include <stdio.h>
 
 #include "./vpx_dsp_rtcd.h"
@@ -17,6 +18,7 @@
 #include "block.h"
 #include "onyx_int.h"
 #include "vpx_dsp/variance.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "encodeintra.h"
 #include "vp8/common/common.h"
 #include "vp8/common/setupintrarecon.h"
@@ -52,7 +54,7 @@ extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
 #define KF_MB_INTRA_MIN 300
 #define GF_MB_INTRA_MIN 200
 
-#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X)-.000001 : (X) + .000001)
+#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X) - .000001 : (X) + .000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias / 100.0
 #define POW2 (double)cpi->oxcf.two_pass_vbrbias / 100.0
@@ -113,11 +115,9 @@ static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps) {
   return 1;
 }
 
-static void output_stats(const VP8_COMP *cpi,
-                         struct vpx_codec_pkt_list *pktlist,
+static void output_stats(struct vpx_codec_pkt_list *pktlist,
                          FIRSTPASS_STATS *stats) {
   struct vpx_codec_cx_pkt pkt;
-  (void)cpi;
   pkt.kind = VPX_CODEC_STATS_PKT;
   pkt.data.twopass_stats.buf = stats;
   pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
@@ -354,10 +354,11 @@ static int frame_max_bits(VP8_COMP *cpi) {
     /* For VBR base this on the bits and frames left plus the
      * two_pass_vbrmax_section rate passed in by the user
      */
-    max_bits = (int)(((double)cpi->twopass.bits_left /
-                      (cpi->twopass.total_stats.count -
-                       (double)cpi->common.current_video_frame)) *
-                     ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+    max_bits = saturate_cast_double_to_int(
+        ((double)cpi->twopass.bits_left /
+         (cpi->twopass.total_stats.count -
+          (double)cpi->common.current_video_frame)) *
+        ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
   }
 
   /* Trap case where we are out of bits */
@@ -371,11 +372,10 @@ void vp8_init_first_pass(VP8_COMP *cpi) {
 }
 
 void vp8_end_first_pass(VP8_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+  output_stats(cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
-static void zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
-                             YV12_BUFFER_CONFIG *raw_buffer,
+static void zz_motion_search(MACROBLOCK *x, YV12_BUFFER_CONFIG *raw_buffer,
                              int *raw_motion_err,
                              YV12_BUFFER_CONFIG *recon_buffer,
                              int *best_motion_err, int recon_yoffset) {
@@ -389,7 +389,6 @@ static void zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
   int raw_stride = raw_buffer->y_stride;
   unsigned char *ref_ptr;
   int ref_stride = x->e_mbd.pre.y_stride;
-  (void)cpi;
 
   /* Set up pointers for this macro block raw buffer */
   raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset);
@@ -416,7 +415,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
   int_mv ref_mv_full;
 
   int tmp_err;
-  int step_param = 3; /* Dont search over full range for first pass */
+  int step_param = 3; /* Don't search over full range for first pass */
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   int n;
   vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
@@ -571,7 +570,7 @@ void vp8_first_pass(VP8_COMP *cpi) {
       vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
       /* do intra 16x16 prediction */
-      this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+      this_error = vp8_encode_intra(x, use_dc_pred);
 
       /* "intrapenalty" below deals with situations where the intra
        * and inter error scores are very low (eg a plain black frame)
@@ -603,9 +602,8 @@ void vp8_first_pass(VP8_COMP *cpi) {
         int raw_motion_error = INT_MAX;
 
         /* Simple 0,0 motion with no mv overhead */
-        zz_motion_search(cpi, x, cpi->last_frame_unscaled_source,
-                         &raw_motion_error, lst_yv12, &motion_error,
-                         recon_yoffset);
+        zz_motion_search(x, cpi->last_frame_unscaled_source, &raw_motion_error,
+                         lst_yv12, &motion_error, recon_yoffset);
         d->bmi.mv.as_mv.row = 0;
         d->bmi.mv.as_mv.col = 0;
 
@@ -797,8 +795,8 @@ void vp8_first_pass(VP8_COMP *cpi) {
     fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
     /* don't want to do output stats with a stack variable! */
-    memcpy(&cpi->twopass.this_frame_stats, &fps, sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    cpi->twopass.this_frame_stats = fps;
+    output_stats(cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
     accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
@@ -826,22 +824,6 @@ void vp8_first_pass(VP8_COMP *cpi) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
   }
 
-  /* use this to see what the first pass reconstruction looks like */
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame);
-
-    if (cm->current_video_frame == 0) {
-      recon_file = fopen(filename, "wb");
-    } else {
-      recon_file = fopen(filename, "ab");
-    }
-
-    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
-
   cm->current_video_frame++;
 }
 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
@@ -908,9 +890,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor,
   correction_factor = pow(error_term, power_term);
 
   /* Clip range */
-  correction_factor = (correction_factor < 0.05)
-                          ? 0.05
-                          : (correction_factor > 5.0) ? 5.0 : correction_factor;
+  correction_factor = (correction_factor < 0.05)  ? 0.05
+                      : (correction_factor > 5.0) ? 5.0
+                                                  : correction_factor;
 
   return correction_factor;
 }
@@ -952,11 +934,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
     }
 
     cpi->twopass.est_max_qcorrection_factor =
-        (cpi->twopass.est_max_qcorrection_factor < 0.1)
-            ? 0.1
-            : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-                  ? 10.0
-                  : cpi->twopass.est_max_qcorrection_factor;
+        (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1
+        : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+            ? 10.0
+            : cpi->twopass.est_max_qcorrection_factor;
   }
 
   /* Corrections for higher compression speed settings
@@ -989,11 +970,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
     bits_per_mb_at_this_q =
         vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
 
-    bits_per_mb_at_this_q = (int)(.5 +
-                                  err_correction_factor * speed_correction *
-                                      cpi->twopass.est_max_qcorrection_factor *
-                                      cpi->twopass.section_max_qfactor *
-                                      (double)bits_per_mb_at_this_q);
+    bits_per_mb_at_this_q =
+        (int)(.5 + err_correction_factor * speed_correction *
+                       cpi->twopass.est_max_qcorrection_factor *
+                       cpi->twopass.section_max_qfactor *
+                       (double)bits_per_mb_at_this_q);
 
     /* Mode and motion overhead */
     /* As Q rises in real encode loop rd code will force overhead down
@@ -1044,12 +1025,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
   double clip_iifactor;
   int overhead_bits_per_mb;
 
-  if (0) {
-    FILE *f = fopen("epmp.stt", "a");
-    fprintf(f, "%10.2f\n", err_per_mb);
-    fclose(f);
-  }
-
   target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
                                 ? (512 * section_target_bandwitdh) / num_mbs
                                 : 512 * (section_target_bandwitdh / num_mbs);
@@ -1086,9 +1061,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
         vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
 
     bits_per_mb_at_this_q =
-        (int)(.5 +
-              err_correction_factor * speed_correction * clip_iifactor *
-                  (double)bits_per_mb_at_this_q);
+        (int)(.5 + err_correction_factor * speed_correction * clip_iifactor *
+                       (double)bits_per_mb_at_this_q);
 
     /* Mode and motion overhead */
     /* As Q rises in real encode loop rd code will force overhead down
@@ -1184,10 +1158,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
   } else {
     current_spend_ratio = (double)cpi->long_rolling_actual_bits /
                           (double)cpi->long_rolling_target_bits;
-    current_spend_ratio =
-        (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1)
-                                                  ? 0.1
-                                                  : current_spend_ratio;
+    current_spend_ratio = (current_spend_ratio > 10.0)  ? 10.0
+                          : (current_spend_ratio < 0.1) ? 0.1
+                                                        : current_spend_ratio;
   }
 
   /* Calculate a correction factor based on the quality of prediction in
@@ -1238,17 +1211,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
     Q++;
   }
 
-  if (0) {
-    FILE *f = fopen("estkf_q.stt", "a");
-    fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n",
-            cpi->common.current_video_frame, bits_per_mb_at_this_q,
-            target_norm_bits_per_mb, err_per_mb, err_correction_factor,
-            current_spend_ratio, group_iiratio, iiratio_correction_factor,
-            (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level,
-            Q);
-    fclose(f);
-  }
-
   return Q;
 }
 
@@ -1276,7 +1238,6 @@ void vp8_init_second_pass(VP8_COMP *cpi) {
   vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
                              cpi->twopass.total_stats.duration);
 
-  cpi->output_framerate = cpi->framerate;
   cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
   cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
@@ -1338,12 +1299,10 @@ void vp8_end_second_pass(VP8_COMP *cpi) { (void)cpi; }
 /* This function gives and estimate of how badly we believe the prediction
  * quality is decaying from frame to frame.
  */
-static double get_prediction_decay_rate(VP8_COMP *cpi,
-                                        FIRSTPASS_STATS *next_frame) {
+static double get_prediction_decay_rate(FIRSTPASS_STATS *next_frame) {
   double prediction_decay_rate;
   double motion_decay;
   double motion_pct = next_frame->pcnt_motion;
-  (void)cpi;
 
   /* Initial basis is the % mbs inter coded */
   prediction_decay_rate = next_frame->pcnt_inter;
@@ -1400,7 +1359,7 @@ static int detect_transition_to_still(VP8_COMP *cpi, int frame_interval,
     for (j = 0; j < still_interval; ++j) {
       if (EOF == input_stats(cpi, &tmp_next_frame)) break;
 
-      decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+      decay_rate = get_prediction_decay_rate(&tmp_next_frame);
       if (decay_rate < 0.999) break;
     }
     /* Reset file position */
@@ -1451,8 +1410,7 @@ static int detect_flash(VP8_COMP *cpi, int offset) {
 }
 
 /* Update the motion related elements to the GF arf boost calculation */
-static void accumulate_frame_motion_stats(VP8_COMP *cpi,
-                                          FIRSTPASS_STATS *this_frame,
+static void accumulate_frame_motion_stats(FIRSTPASS_STATS *this_frame,
                                           double *this_frame_mv_in_out,
                                           double *mv_in_out_accumulator,
                                           double *abs_mv_in_out_accumulator,
@@ -1460,7 +1418,6 @@ static void accumulate_frame_motion_stats(VP8_COMP *cpi,
   double this_frame_mvr_ratio;
   double this_frame_mvc_ratio;
   double motion_pct;
-  (void)cpi;
 
   /* Accumulate motion stats. */
   motion_pct = this_frame->pcnt_motion;
@@ -1543,13 +1500,13 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames,
 
     /* Update the motion related elements to the boost calculation */
     accumulate_frame_motion_stats(
-        cpi, &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
     /* Calculate the baseline boost number for this frame */
     r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out);
 
-    /* We want to discount the the flash frame itself and the recovery
+    /* We want to discount the flash frame itself and the recovery
      * frame that follows as both will have poor scores.
      */
     flash_detected =
@@ -1558,7 +1515,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames,
     /* Cumulative effect of prediction quality decay */
     if (!flash_detected) {
       decay_accumulator =
-          decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+          decay_accumulator * get_prediction_decay_rate(&this_frame);
       decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
     }
     boost_score += (decay_accumulator * r);
@@ -1587,13 +1544,13 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames,
 
     /* Update the motion related elements to the boost calculation */
     accumulate_frame_motion_stats(
-        cpi, &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
     /* Calculate the baseline boost number for this frame */
     r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out);
 
-    /* We want to discount the the flash frame itself and the recovery
+    /* We want to discount the flash frame itself and the recovery
      * frame that follows as both will have poor scores.
      */
     flash_detected =
@@ -1602,7 +1559,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames,
     /* Cumulative effect of prediction quality decay */
     if (!flash_detected) {
       decay_accumulator =
-          decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+          decay_accumulator * get_prediction_decay_rate(&this_frame);
       decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
     }
 
@@ -1641,7 +1598,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
-  double mod_err_per_mb_accumulator = 0.0;
 
   int max_bits = frame_max_bits(cpi); /* Max for a single frame */
 
@@ -1692,9 +1648,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     gf_group_err += mod_frame_err;
 
-    mod_err_per_mb_accumulator +=
-        mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
-
     if (EOF == input_stats(cpi, &next_frame)) break;
 
     /* Test for the case where there is a brief flash but the prediction
@@ -1704,7 +1657,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     /* Update the motion related elements to the boost calculation */
     accumulate_frame_motion_stats(
-        cpi, &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
     /* Calculate a baseline boost number for this frame */
@@ -1712,7 +1665,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     /* Cumulative effect of prediction quality decay */
     if (!flash_detected) {
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&next_frame);
       decay_accumulator = decay_accumulator * loop_decay_rate;
       decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
     }
@@ -1733,20 +1686,21 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         /* Break at cpi->max_gf_interval unless almost totally static */
         (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
         (
-            /* Dont break out with a very short interval */
+            /* Don't break out with a very short interval */
             (i > MIN_GF_INTERVAL) &&
-            /* Dont break out very close to a key frame */
+            /* Don't break out very close to a key frame */
             ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
             ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
-            (!flash_detected) && ((mv_ratio_accumulator > 100.0) ||
-                                  (abs_mv_in_out_accumulator > 3.0) ||
-                                  (mv_in_out_accumulator < -2.0) ||
-                                  ((boost_score - old_boost_score) < 2.0)))) {
+            (!flash_detected) &&
+            ((mv_ratio_accumulator > 100.0) ||
+             (abs_mv_in_out_accumulator > 3.0) ||
+             (mv_in_out_accumulator < -2.0) ||
+             ((boost_score - old_boost_score) < 2.0)))) {
       boost_score = old_boost_score;
       break;
     }
 
-    memcpy(this_frame, &next_frame, sizeof(*this_frame));
+    *this_frame = next_frame;
 
     old_boost_score = boost_score;
   }
@@ -1780,7 +1734,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (boost_score > max_boost) boost_score = max_boost;
   }
 
-  /* Dont allow conventional gf too near the next kf */
+  /* Don't allow conventional gf too near the next kf */
   if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < cpi->twopass.frames_to_key) {
       i++;
@@ -1801,9 +1755,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
 #endif
 
-  /* Should we use the alternate refernce frame */
+  /* Should we use the alternate reference frame */
   if (allow_alt_ref && (i >= MIN_GF_INTERVAL) &&
-      /* dont use ARF very near next kf */
+      /* don't use ARF very near next kf */
       (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
 #if NEW_BOOST
       ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) &&
@@ -1814,8 +1768,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       (next_frame.pcnt_inter > 0.75) &&
       ((mv_in_out_accumulator / (double)i > -0.2) ||
        (mv_in_out_accumulator > -2.0)) &&
-      (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <=
-                                 (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
+      (cpi->gfu_boost > 100) &&
+      (cpi->twopass.gf_decay_rate <=
+       (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))))
 #endif
   {
     int Boost;
@@ -1980,11 +1935,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 
   cpi->twopass.gf_group_bits =
-      (cpi->twopass.gf_group_bits < 0)
-          ? 0
-          : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-                ? cpi->twopass.kf_group_bits
-                : cpi->twopass.gf_group_bits;
+      (cpi->twopass.gf_group_bits < 0) ? 0
+      : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+          ? cpi->twopass.kf_group_bits
+          : cpi->twopass.gf_group_bits;
 
   /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
    * variability limit (cpi->oxcf.two_pass_vbrmax_section)
@@ -2052,8 +2006,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     /* Calculate the number of bits to be spent on the gf or arf based on
      * the boost number
      */
-    gf_bits = (int)((double)Boost *
-                    (cpi->twopass.gf_group_bits / (double)allocation_chunks));
+    gf_bits = saturate_cast_double_to_int(
+        (double)Boost *
+        (cpi->twopass.gf_group_bits / (double)allocation_chunks));
 
     /* If the frame that is to be boosted is simpler than the average for
      * the gf/arf group then use an alternative calculation
@@ -2080,9 +2035,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
      * score, otherwise it may be worse off than an "un-boosted" frame
      */
     else {
-      int alt_gf_bits =
-          (int)((double)cpi->twopass.kf_group_bits * mod_frame_err /
-                DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+      // Avoid division by 0 by clamping cpi->twopass.kf_group_error_left to 1
+      int alt_gf_bits = saturate_cast_double_to_int(
+          (double)cpi->twopass.kf_group_bits * mod_frame_err /
+          (double)VPXMAX(cpi->twopass.kf_group_error_left, 1));
 
       if (alt_gf_bits > gf_bits) {
         gf_bits = alt_gf_bits;
@@ -2096,7 +2052,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       }
     }
 
-    /* Dont allow a negative value for gf_bits */
+    /* Don't allow a negative value for gf_bits */
     if (gf_bits < 0) gf_bits = 0;
 
     /* Add in minimum for a frame */
@@ -2137,7 +2093,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0;
 
     /* This condition could fail if there are two kfs very close together
-     * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the
+     * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
      * calculation of cpi->twopass.alt_extra_bits.
      */
     if (cpi->baseline_gf_interval >= 3) {
@@ -2216,7 +2172,8 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 
   /* How many of those bits available for allocation should we give it? */
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
+  target_frame_size = saturate_cast_double_to_int(
+      (double)cpi->twopass.gf_group_bits * err_fraction);
 
   /* Clip to target size to 0 - max_bits (or cpi->twopass.gf_group_bits)
    * at the top end.
@@ -2279,7 +2236,7 @@ void vp8_second_pass(VP8_COMP *cpi) {
   /* keyframe and section processing ! */
   if (cpi->twopass.frames_to_key == 0) {
     /* Define next KF group and assign bits to it */
-    memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
 
     /* Special case: Error error_resilient_mode mode does not make much
@@ -2301,7 +2258,7 @@ void vp8_second_pass(VP8_COMP *cpi) {
   /* Is this a GF / ARF (Note that a KF is always also a GF) */
   if (cpi->frames_till_gf_update_due == 0) {
     /* Define next gf group and assign bits to it */
-    memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    this_frame_copy = this_frame;
     define_gf_group(cpi, &this_frame_copy);
 
     /* If we are going to code an altref frame at the end of the group
@@ -2316,7 +2273,7 @@ void vp8_second_pass(VP8_COMP *cpi) {
        * to the GF group
        */
       int bak = cpi->per_frame_bandwidth;
-      memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+      this_frame_copy = this_frame;
       assign_std_frame_bits(cpi, &this_frame_copy);
       cpi->per_frame_bandwidth = bak;
     }
@@ -2336,12 +2293,12 @@ void vp8_second_pass(VP8_COMP *cpi) {
 
       if (cpi->common.frame_type != KEY_FRAME) {
         /* Assign bits from those allocated to the GF group */
-        memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        this_frame_copy = this_frame;
         assign_std_frame_bits(cpi, &this_frame_copy);
       }
     } else {
       /* Assign bits from those allocated to the GF group */
-      memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+      this_frame_copy = this_frame;
       assign_std_frame_bits(cpi, &this_frame_copy);
     }
   }
@@ -2371,13 +2328,15 @@ void vp8_second_pass(VP8_COMP *cpi) {
   if (cpi->common.current_video_frame == 0) {
     cpi->twopass.est_max_qcorrection_factor = 1.0;
 
+    int64_t section_target_bandwidth = cpi->twopass.bits_left / frames_left;
+    section_target_bandwidth = VPXMIN(section_target_bandwidth, INT_MAX);
+
     /* Set a cq_level in constrained quality mode. */
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
       int est_cq;
 
       est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
-                           (int)(cpi->twopass.bits_left / frames_left),
-                           overhead_bits);
+                           (int)section_target_bandwidth, overhead_bits);
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality) cpi->cq_target_quality = est_cq;
@@ -2388,8 +2347,7 @@ void vp8_second_pass(VP8_COMP *cpi) {
     cpi->twopass.maxq_min_limit = cpi->best_quality;
 
     tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
-                           (int)(cpi->twopass.bits_left / frames_left),
-                           overhead_bits);
+                           (int)section_target_bandwidth, overhead_bits);
 
     /* Limit the maxq value returned subsequently.
      * This increases the risk of overspend or underspend if the initial
@@ -2407,7 +2365,7 @@ void vp8_second_pass(VP8_COMP *cpi) {
   }
 
   /* The last few frames of a clip almost always have to few or too many
-   * bits and for the sake of over exact rate control we dont want to make
+   * bits and for the sake of over exact rate control we don't want to make
    * radical adjustments to the allowed quantizer range just to use up a
    * few surplus bits or get beneath the target rate.
    */
@@ -2417,9 +2375,11 @@ void vp8_second_pass(VP8_COMP *cpi) {
             (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1) frames_left = 1;
 
+    int64_t section_target_bandwidth = cpi->twopass.bits_left / frames_left;
+    section_target_bandwidth = VPXMIN(section_target_bandwidth, INT_MAX);
+
     tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
-                           (int)(cpi->twopass.bits_left / frames_left),
-                           overhead_bits);
+                           (int)section_target_bandwidth, overhead_bits);
 
     /* Move active_worst_quality but in a damped way */
     if (tmp_q > cpi->active_worst_quality) {
@@ -2470,7 +2430,7 @@ static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame,
     double decay_accumulator = 1.0;
     double next_iiratio;
 
-    memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+    local_next_frame = *next_frame;
 
     /* Note the starting file position so we can reset to it */
     start_pos = cpi->twopass.stats_in;
@@ -2563,7 +2523,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   cpi->twopass.frames_to_key = 1;
 
   /* Take a copy of the initial frame details */
-  memcpy(&first_frame, this_frame, sizeof(*this_frame));
+  first_frame = *this_frame;
 
   cpi->twopass.kf_group_bits = 0;
   cpi->twopass.kf_group_error_left = 0;
@@ -2585,7 +2545,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     kf_group_coded_err += this_frame->coded_error;
 
     /* Load the next frame's stats. */
-    memcpy(&last_frame, this_frame, sizeof(*this_frame));
+    last_frame = *this_frame;
     input_stats(cpi, this_frame);
 
     /* Provided that we are not at the end of the file... */
@@ -2598,7 +2558,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       }
 
       /* How fast is prediction quality decaying */
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+      loop_decay_rate = get_prediction_decay_rate(&next_frame);
 
       /* We want to know something about the recent past... rather than
        * as used elsewhere where we are concened with decay in prediction
@@ -2648,7 +2608,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     cpi->twopass.frames_to_key /= 2;
 
     /* Copy first frame details */
-    memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+    tmp_frame = first_frame;
 
     /* Reset to the start of the group */
     reset_fpf_position(cpi, start_position);
@@ -2745,9 +2705,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         else if (cpi->twopass.kf_group_bits < av_group_bits) {
           int64_t bits_below_av = av_group_bits - cpi->twopass.kf_group_bits;
 
-          cpi->twopass.kf_group_bits += (int64_t)(
-              (double)bits_below_av * (double)(buffer_lvl - opt_buffer_lvl) /
-              (double)(high_water_mark - opt_buffer_lvl));
+          cpi->twopass.kf_group_bits +=
+              (int64_t)((double)bits_below_av *
+                        (double)(buffer_lvl - opt_buffer_lvl) /
+                        (double)(high_water_mark - opt_buffer_lvl));
         }
       }
     }
@@ -2780,7 +2741,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     if (r > RMAX) r = RMAX;
 
     /* How fast is prediction quality decaying */
-    loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+    loop_decay_rate = get_prediction_decay_rate(&next_frame);
 
     decay_accumulator = decay_accumulator * loop_decay_rate;
     decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
@@ -3004,12 +2965,12 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     }
 
     /* Set back to unscaled by defaults */
-    cpi->common.horiz_scale = NORMAL;
-    cpi->common.vert_scale = NORMAL;
+    cpi->common.horiz_scale = VP8E_NORMAL;
+    cpi->common.vert_scale = VP8E_NORMAL;
 
     /* Calculate Average bits per frame. */
-    av_bits_per_frame = cpi->oxcf.target_bandwidth /
-                        DOUBLE_DIVIDE_CHECK((double)cpi->framerate);
+    av_bits_per_frame =
+        cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK(cpi->framerate);
 
     /* CBR... Use the clip average as the target for deciding resample */
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
@@ -3025,7 +2986,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       bits_per_frame =
           (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key);
 
-      /* Dont turn to resampling in easy sections just because they
+      /* Don't turn to resampling in easy sections just because they
        * have been assigned a small number of bits
        */
       if (bits_per_frame < av_bits_per_frame) {
@@ -3061,16 +3022,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
               (int)((projected_bits_perframe - av_bits_per_frame) *
                     cpi->twopass.frames_to_key));
 
-    if (0) {
-      FILE *f = fopen("Subsamle.stt", "a");
-      fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n",
-              cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale,
-              cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key,
-              (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key),
-              new_height, new_width);
-      fclose(f);
-    }
-
     /* The trigger for spatial resampling depends on the various
      * parameters such as whether we are streaming (CBR) or VBR.
      */
@@ -3089,9 +3040,9 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
         resample_trigger = 0;
       }
     } else {
-      int64_t clip_bits = (int64_t)(
-          cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth /
-          DOUBLE_DIVIDE_CHECK((double)cpi->framerate));
+      int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count *
+                                    cpi->oxcf.target_bandwidth /
+                                    DOUBLE_DIVIDE_CHECK(cpi->framerate));
       int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
 
       /* If triggered last time the threshold for triggering again is
@@ -3134,17 +3085,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
          */
         kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio,
                                    (int)bits_per_frame, group_iiratio);
-
-        if (0) {
-          FILE *f = fopen("Subsamle.stt", "a");
-          fprintf(
-              f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q,
-              cpi->common.horiz_scale, cpi->common.vert_scale,
-              kf_group_err / cpi->twopass.frames_to_key,
-              (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key),
-              new_height, new_width);
-          fclose(f);
-        }
       }
     }
 
diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.h b/media/libvpx/libvpx/vp8/encoder/firstpass.h
index ac8a7b1bfb..f5490f1eff 100644
--- a/media/libvpx/libvpx/vp8/encoder/firstpass.h
+++ b/media/libvpx/libvpx/vp8/encoder/firstpass.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_FIRSTPASS_H_
-#define VP8_ENCODER_FIRSTPASS_H_
+#ifndef VPX_VP8_ENCODER_FIRSTPASS_H_
+#define VPX_VP8_ENCODER_FIRSTPASS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,4 +28,4 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_FIRSTPASS_H_
+#endif  // VPX_VP8_ENCODER_FIRSTPASS_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/lookahead.c b/media/libvpx/libvpx/vp8/encoder/lookahead.c
index 37aa9eee84..49f851d019 100644
--- a/media/libvpx/libvpx/vp8/encoder/lookahead.c
+++ b/media/libvpx/libvpx/vp8/encoder/lookahead.c
@@ -66,8 +66,8 @@ struct lookahead_ctx *vp8_lookahead_init(unsigned int width,
   depth += 1;
 
   /* Align the buffer dimensions */
-  width = (width + 15) & ~15;
-  height = (height + 15) & ~15;
+  width = (width + 15) & ~15u;
+  height = (height + 15) & ~15u;
 
   /* Allocate the lookahead structures */
   ctx = calloc(1, sizeof(*ctx));
diff --git a/media/libvpx/libvpx/vp8/encoder/lookahead.h b/media/libvpx/libvpx/vp8/encoder/lookahead.h
index a67f226946..bf0401190b 100644
--- a/media/libvpx/libvpx/vp8/encoder/lookahead.h
+++ b/media/libvpx/libvpx/vp8/encoder/lookahead.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VP8_ENCODER_LOOKAHEAD_H_
-#define VP8_ENCODER_LOOKAHEAD_H_
+#ifndef VPX_VP8_ENCODER_LOOKAHEAD_H_
+#define VPX_VP8_ENCODER_LOOKAHEAD_H_
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
 
@@ -74,7 +74,7 @@ int vp8_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
 struct lookahead_entry *vp8_lookahead_pop(struct lookahead_ctx *ctx, int drain);
 
 #define PEEK_FORWARD 1
-#define PEEK_BACKWARD -1
+#define PEEK_BACKWARD (-1)
 /**\brief Get a future source buffer to encode
  *
  * \param[in] ctx       Pointer to the lookahead context
@@ -96,4 +96,4 @@ unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_LOOKAHEAD_H_
+#endif  // VPX_VP8_ENCODER_LOOKAHEAD_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c
new file mode 100644
index 0000000000..a08d4d3f63
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  }
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)           \
+  {                                                                        \
+    __m128i tmp0_m, tmp1_m, tmp2_m;                                        \
+                                                                           \
+    tmp0_m = __lsx_vreplvei_h(coeff, val0);                                \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \
+    DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1,     \
+              const2);                                                     \
+  }
+
+#define RET_1_IF_NZERO_H(_in)           \
+  ({                                    \
+    __m128i tmp_m;                      \
+    __m128i one_m = __lsx_vldi(0x401);  \
+    __m128i max_m = __lsx_vldi(0xFF);   \
+                                        \
+    tmp_m = __lsx_vseqi_h(_in, 0);      \
+    tmp_m = __lsx_vxor_v(tmp_m, max_m); \
+    tmp_m = __lsx_vand_v(tmp_m, one_m); \
+                                        \
+    tmp_m;                              \
+  })
+
+void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3;
+  __m128i tmp0, tmp1, tmp2, tmp3, const0, const1;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i out0, out1, out2, out3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1,
+            in3);
+  in0 = __lsx_vadd_h(tmp0, tmp1);
+  in2 = __lsx_vsub_h(tmp0, tmp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+  tmp0 = __lsx_vilvl_h(in3, in1);
+  in1 = __lsx_vreplvei_h(coeff, 3);
+  out0 = __lsx_vpackev_h(zero, in1);
+  coeff = __lsx_vilvl_h(zero, coeff);
+  out1 = __lsx_vreplvei_w(coeff, 0);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0,
+            out1);
+  DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  tmp2 = __lsx_vadd_h(tmp0, tmp1);
+  tmp3 = __lsx_vsub_h(tmp0, tmp1);
+  DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2);
+  DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2);
+  DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2);
+  tmp1 = RET_1_IF_NZERO_H(in3);
+  DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1);
+  out3 = __lsx_vadd_w(out3, out1);
+  out1 = __lsx_vreplvei_w(coeff, 1);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1,
+            out3);
+  DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3);
+  out1 = __lsx_vadd_w(out1, tmp1);
+  DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
+void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1;
+  __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1,
+            in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+  temp0 = __lsx_vreplvei_h(coeff, 3);
+  vec1_w = __lsx_vpackev_h(zero, temp0);
+  coeff = __lsx_vilvh_h(zero, coeff);
+  vec3_w = __lsx_vreplvei_w(coeff, 0);
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in0 = __lsx_vaddi_hu(in0, 7);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  in2 = __lsx_vaddi_hu(in2, 7);
+  in0 = __lsx_vsrai_h(in0, 4);
+  in2 = __lsx_vsrai_h(in2, 4);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w);
+  vec3_w = __lsx_vadd_w(vec3_w, vec1_w);
+  vec1_w = __lsx_vreplvei_w(coeff, 1);
+  const0 = RET_1_IF_NZERO_H(in3);
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3);
+  in1 = __lsx_vadd_h(in1, const0);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1);
+  __lsx_vst(temp0, output, 0);
+  __lsx_vst(temp1, output, 16);
+
+  DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in2, output, 48);
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c
new file mode 100644
index 0000000000..4ad4caba60
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) {
+  int32_t err = 0;
+  __m128i dq_coeff0, dq_coeff1, coeff0, coeff1;
+  __m128i reg0, reg1, reg2, reg3, error;
+
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0,
+            dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1);
+  DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0,
+            reg2);
+  DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1,
+            reg3);
+  error = __lsx_vmul_w(reg0, reg0);
+  DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error);
+  error = __lsx_vmadd_w(error, reg3, reg3);
+  error = __lsx_vhaddw_d_w(error, error);
+  err = __lsx_vpickve2gr_w(error, 0);
+  err += __lsx_vpickve2gr_w(error, 2);
+  return err;
+}
+
+int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) {
+  BLOCK *be;
+  BLOCKD *bd;
+  int16_t *coeff, *dq_coeff;
+  int32_t err = 0;
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error;
+  __m128i mask0 = __lsx_vldi(0xFF);
+  __m128i zero = __lsx_vldi(0);
+
+  if (dc == 1) {
+    mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0);
+  }
+
+  for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) {
+    int32_t loop_tmp = loop_cnt << 1;
+    be = &mb->block[loop_tmp];
+    bd = &mb->e_mbd.block[loop_tmp];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0,
+              src1, tmp0, tmp1);
+    be = &mb->block[loop_tmp + 1];
+    bd = &mb->e_mbd.block[loop_tmp + 1];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2,
+              src3, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg1, reg3, reg5, reg7);
+    DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0,
+              reg4);
+    error = __lsx_vmul_w(reg0, reg0);
+    DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3,
+              reg3, error, reg4, reg4, error, error, error, error);
+    DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error,
+              error);
+    error = __lsx_vmadd_w(error, reg7, reg7);
+    error = __lsx_vhaddw_d_w(error, error);
+    error = __lsx_vhaddw_q_d(error, error);
+    err += __lsx_vpickve2gr_w(error, 0);
+  }
+  return err;
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c
new file mode 100644
index 0000000000..75889192a7
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+#define BOOST_QUANT1(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui;                                        \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+#define BOOST_QUANT2(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui + 8;                                    \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+static int8_t exact_regular_quantize_b_lsx(
+    int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
+    int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
+    int16_t *q_coeff, int16_t *dq_coeff) {
+  int32_t eob;
+  int16_t *boost_temp = zbin_boost;
+  __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 };
+  __m128i sign_z0, sign_z1, q_coeff0, q_coeff1;
+  __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0,
+      de_quant1;
+  __m128i z0, z1, round0, round1, quant0, quant2;
+  __m128i inv_zig_zag0, inv_zig_zag1;
+  __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 };
+  __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 };
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i zero = __lsx_vldi(0);
+
+  zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in);
+  inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag);
+  inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag);
+  eob = -1;
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0,
+            round1);
+  DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2,
+            tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2,
+            z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1);
+  DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1);
+  DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1);
+
+  DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1,
+            quant0, quant2);
+  DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1);
+  DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1);
+  DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
+
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 0);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 1);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 2);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 3);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 4);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 5);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 6);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 7);
+
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 0);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 1);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 2);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 3);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 4);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 5);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 6);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 7);
+
+  DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1);
+  DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1,
+            sign_x1, sign_x0, q_coeff0, q_coeff1);
+  DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0,
+            de_quant1);
+  __lsx_vst(q_coeff0, q_coeff, 0);
+  __lsx_vst(q_coeff1, q_coeff, 16);
+  __lsx_vst(de_quant0, dq_coeff, 0);
+  __lsx_vst(de_quant1, dq_coeff, 16);
+
+  return (int8_t)(eob + 1);
+}
+
+void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) {
+  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  int16_t *coeff_ptr = b->coeff;
+  int16_t *zbin_ptr = b->zbin;
+  int16_t *round_ptr = b->round;
+  int16_t *quant_ptr = b->quant;
+  int16_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *dequant_ptr = d->dequant;
+  int16_t zbin_oq_value = b->zbin_extra;
+
+  *d->eob = exact_regular_quantize_b_lsx(
+      zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
+      quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/mcomp.c b/media/libvpx/libvpx/vp8/encoder/mcomp.c
index 970120f3b2..6861adaa86 100644
--- a/media/libvpx/libvpx/vp8/encoder/mcomp.c
+++ b/media/libvpx/libvpx/vp8/encoder/mcomp.c
@@ -21,11 +21,6 @@
 #include "vp8/common/common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-#ifdef VP8_ENTROPY_STATS
-static int mv_ref_ct[31][4][2];
-static int mv_mode_cts[4][2];
-#endif
-
 int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) {
   /* MV costing is based on the distribution of vectors in the previous
    * frame and as such will tend to over state the cost of vectors. In
@@ -34,19 +29,22 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) {
    * NEAREST for subsequent blocks. The "Weight" parameter allows, to a
    * limited extent, for some account to be taken of these factors.
    */
-  return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
-           mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) *
-          Weight) >>
-         7;
+  const int mv_idx_row =
+      clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals);
+  const int mv_idx_col =
+      clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals);
+  return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7;
 }
 
 static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2],
                        int error_per_bit) {
   /* Ignore mv costing if mvcost is NULL */
   if (mvcost) {
-    return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
-             mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) *
-                error_per_bit +
+    const int mv_idx_row =
+        clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals);
+    const int mv_idx_col =
+        clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals);
+    return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit +
             128) >>
            8;
   }
@@ -190,14 +188,15 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) {
  */
 
 /* estimated cost of a motion vector (r,c) */
-#define MVC(r, c)                                                             \
-  (mvcost                                                                     \
-       ? ((mvcost[0][(r)-rr] + mvcost[1][(c)-rc]) * error_per_bit + 128) >> 8 \
-       : 0)
+#define MVC(r, c)                                                          \
+  (mvcost ? ((mvcost[0][(r) - rr] + mvcost[1][(c) - rc]) * error_per_bit + \
+             128) >>                                                       \
+                8                                                          \
+          : 0)
 /* pointer to predictor base of a motionvector */
 #define PRE(r, c) (y + (((r) >> 2) * y_stride + ((c) >> 2) - (offset)))
 /* convert motion vector component to offset for svf calc */
-#define SP(x) (((x)&3) << 1)
+#define SP(x) (((x) & 3) << 1)
 /* returns subpixel variance error function. */
 #define DIST(r, c) \
   vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
@@ -206,19 +205,22 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) {
 /* returns distortion + motion vector cost */
 #define ERR(r, c) (MVC(r, c) + DIST(r, c))
 /* checks if (r,c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                           \
-  IFMVCV(r, c,                                          \
-         {                                              \
-           thismse = DIST(r, c);                        \
-           if ((v = (MVC(r, c) + thismse)) < besterr) { \
-             besterr = v;                               \
-             br = r;                                    \
-             bc = c;                                    \
-             *distortion = thismse;                     \
-             *sse1 = sse;                               \
-           }                                            \
-         },                                             \
-         v = UINT_MAX;)
+#define CHECK_BETTER(v, r, c)                          \
+  do {                                                 \
+    IFMVCV(                                            \
+        r, c,                                          \
+        {                                              \
+          thismse = DIST(r, c);                        \
+          if ((v = (MVC(r, c) + thismse)) < besterr) { \
+            besterr = v;                               \
+            br = r;                                    \
+            bc = c;                                    \
+            *distortion = thismse;                     \
+            *sse1 = sse;                               \
+          }                                            \
+        },                                             \
+        v = UINT_MAX;)                                 \
+  } while (0)
 
 int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
@@ -253,7 +255,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   int pre_stride = x->e_mbd.pre.y_stride;
   unsigned char *base_pre = x->e_mbd.pre.y_buffer;
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
   MACROBLOCKD *xd = &x->e_mbd;
   unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride +
                        bestmv->as_mv.col;
@@ -285,8 +287,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
 
   /* central mv */
-  bestmv->as_mv.row *= 8;
-  bestmv->as_mv.col *= 8;
+  bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
 
   /* calculate central point error */
   besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
@@ -346,8 +348,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     tc = bc;
   }
 
-  bestmv->as_mv.row = br * 2;
-  bestmv->as_mv.col = bc * 2;
+  bestmv->as_mv.row = clamp(br * 2, SHRT_MIN, SHRT_MAX);
+  bestmv->as_mv.col = clamp(bc * 2, SHRT_MIN, SHRT_MAX);
 
   if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
       (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) {
@@ -382,7 +384,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   int pre_stride = x->e_mbd.pre.y_stride;
   unsigned char *base_pre = x->e_mbd.pre.y_buffer;
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
   MACROBLOCKD *xd = &x->e_mbd;
   unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride +
                        bestmv->as_mv.col;
@@ -399,8 +401,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #endif
 
   /* central mv */
-  bestmv->as_mv.row *= 8;
-  bestmv->as_mv.col *= 8;
+  bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
   startmv = *bestmv;
 
   /* calculate central point error */
@@ -678,7 +680,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   int pre_stride = x->e_mbd.pre.y_stride;
   unsigned char *base_pre = x->e_mbd.pre.y_buffer;
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
   MACROBLOCKD *xd = &x->e_mbd;
   unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride +
                        bestmv->as_mv.col;
@@ -695,8 +697,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 #endif
 
   /* central mv */
-  bestmv->as_mv.row *= 8;
-  bestmv->as_mv.col *= 8;
+  bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
   startmv = *bestmv;
 
   /* calculate central point error */
@@ -802,13 +804,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 }
 
 #define CHECK_BOUNDS(range)                    \
-  {                                            \
+  do {                                         \
     all_in = 1;                                \
     all_in &= ((br - range) >= x->mv_row_min); \
     all_in &= ((br + range) <= x->mv_row_max); \
     all_in &= ((bc - range) >= x->mv_col_min); \
     all_in &= ((bc + range) <= x->mv_col_max); \
-  }
+  } while (0)
 
 #define CHECK_POINT                                  \
   {                                                  \
@@ -819,7 +821,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
   }
 
 #define CHECK_BETTER                                                     \
-  {                                                                      \
+  do {                                                                   \
     if (thissad < bestsad) {                                             \
       thissad +=                                                         \
           mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \
@@ -828,7 +830,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
         best_site = i;                                                   \
       }                                                                  \
     }                                                                    \
-  }
+  } while (0)
 
 static const MV next_chkpts[6][3] = {
   { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } },
@@ -839,7 +841,7 @@ static const MV next_chkpts[6][3] = {
 int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                    int_mv *best_mv, int search_param, int sad_per_bit,
                    const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2],
-                   int *mvcost[2], int_mv *center_mv) {
+                   int_mv *center_mv) {
   MV hex[6] = {
     { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 }
   };
@@ -868,8 +870,6 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-  (void)mvcost;
-
   /* adjust ref_mv to make sure it is within MV range */
   vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min,
                x->mv_row_max);
@@ -905,7 +905,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 #endif
 
   /* hex search */
-  CHECK_BOUNDS(2)
+  CHECK_BOUNDS(2);
 
   if (all_in) {
     for (i = 0; i < 6; ++i) {
@@ -914,7 +914,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
       this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
                     this_mv.as_mv.col;
       thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-      CHECK_BETTER
+      CHECK_BETTER;
     }
   } else {
     for (i = 0; i < 6; ++i) {
@@ -924,7 +924,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
       this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
                     this_mv.as_mv.col;
       thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-      CHECK_BETTER
+      CHECK_BETTER;
     }
   }
 
@@ -938,7 +938,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 
   for (j = 1; j < hex_range; ++j) {
     best_site = -1;
-    CHECK_BOUNDS(2)
+    CHECK_BOUNDS(2);
 
     if (all_in) {
       for (i = 0; i < 3; ++i) {
@@ -947,7 +947,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     } else {
       for (i = 0; i < 3; ++i) {
@@ -957,7 +957,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     }
 
@@ -979,7 +979,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
 cal_neighbors:
   for (j = 0; j < dia_range; ++j) {
     best_site = -1;
-    CHECK_BOUNDS(1)
+    CHECK_BOUNDS(1);
 
     if (all_in) {
       for (i = 0; i < 4; ++i) {
@@ -988,7 +988,7 @@ cal_neighbors:
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     } else {
       for (i = 0; i < 4; ++i) {
@@ -998,7 +998,7 @@ cal_neighbors:
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     }
 
@@ -1124,13 +1124,14 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
     }
   }
 
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
+  this_mv.as_mv.row = clamp(best_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  this_mv.as_mv.col = clamp(best_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
 
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
+#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX
 int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                              int_mv *best_mv, int search_param, int sad_per_bit,
                              int *num00, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1273,17 +1274,18 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
     }
   }
 
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
+  this_mv.as_mv.row = clamp(best_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  this_mv.as_mv.col = clamp(best_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
 
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
+#endif  // HAVE_SSE2 || HAVE_MSA || HAVE_LSX
 
-int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv) {
   unsigned char *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
   unsigned char *in_what;
@@ -1327,8 +1329,8 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
   bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
             mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-  /* Apply further limits to prevent us looking using vectors that
-   * stretch beyiond the UMV border
+  /* Apply further limits to prevent us looking using vectors that stretch
+   * beyond the UMV border
    */
   if (col_min < x->mv_col_min) col_min = x->mv_col_min;
 
@@ -1345,121 +1347,6 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
     for (c = col_min; c < col_max; ++c) {
       thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
-      this_mv.as_mv.col = c;
-      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-      if (thissad < bestsad) {
-        bestsad = thissad;
-        best_mv->as_mv.row = r;
-        best_mv->as_mv.col = c;
-        bestaddress = check_here;
-      }
-
-      check_here++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
-         mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int pre_stride = x->e_mbd.pre.y_stride;
-  unsigned char *base_pre = x->e_mbd.pre.y_buffer;
-  int in_what_stride = pre_stride;
-  int mv_stride = pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.mv;
-  int_mv this_mv;
-  unsigned int bestsad;
-  unsigned int thissad;
-  int r, c;
-
-  unsigned char *check_here;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  unsigned int sad_array[3];
-
-  int *mvsadcost[2];
-  int_mv fcenter_mv;
-
-  mvsadcost[0] = x->mvsadcost[0];
-  mvsadcost[1] = x->mvsadcost[1];
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Work out the mid point for the search */
-  in_what = base_pre + d->offset;
-  bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  /* Baseline value at the centre */
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
-            mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-  /* Apply further limits to prevent us looking using vectors that stretch
-   * beyond the UMV border
-   */
-  if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; ++r) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; ++i) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
       if (thissad < bestsad) {
         this_mv.as_mv.col = c;
         thissad +=
@@ -1474,158 +1361,11 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
       }
 
       check_here++;
-      c++;
     }
   }
 
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
-         mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-
-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int pre_stride = x->e_mbd.pre.y_stride;
-  unsigned char *base_pre = x->e_mbd.pre.y_buffer;
-  unsigned char *in_what;
-  int in_what_stride = pre_stride;
-  int mv_stride = pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.mv;
-  int_mv this_mv;
-  unsigned int bestsad;
-  unsigned int thissad;
-  int r, c;
-
-  unsigned char *check_here;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
-  unsigned int sad_array[3];
-
-  int *mvsadcost[2];
-  int_mv fcenter_mv;
-
-  mvsadcost[0] = x->mvsadcost[0];
-  mvsadcost[1] = x->mvsadcost[1];
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Work out the mid point for the search */
-  in_what = base_pre + d->offset;
-  bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  /* Baseline value at the centre */
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
-            mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-  /* Apply further limits to prevent us looking using vectors that stretch
-   * beyond the UMV border
-   */
-  if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; ++r) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; ++i) {
-        thissad = sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; ++i) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad +=
-            mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row * 8;
-  this_mv.as_mv.col = best_mv->as_mv.col * 8;
+  this_mv.as_mv.row = clamp(best_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  this_mv.as_mv.col = clamp(best_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
 
   return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
@@ -1702,13 +1442,14 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     }
   }
 
-  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
-  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+  this_mv.as_mv.row = clamp(ref_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  this_mv.as_mv.col = clamp(ref_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
 
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
+#if HAVE_SSE2 || HAVE_MSA
 int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                               int_mv *ref_mv, int error_per_bit,
                               int search_range, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1812,102 +1553,10 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     }
   }
 
-  this_mv.as_mv.row = ref_mv->as_mv.row * 8;
-  this_mv.as_mv.col = ref_mv->as_mv.col * 8;
+  this_mv.as_mv.row = clamp(ref_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  this_mv.as_mv.col = clamp(ref_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
 
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
-
-#ifdef VP8_ENTROPY_STATS
-void print_mode_context(void) {
-  FILE *f = fopen("modecont.c", "w");
-  int i, j;
-
-  fprintf(f, "#include \"entropy.h\"\n");
-  fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
-  fprintf(f, "{\n");
-
-  for (j = 0; j < 6; ++j) {
-    fprintf(f, "  { /* %d */\n", j);
-    fprintf(f, "    ");
-
-    for (i = 0; i < 4; ++i) {
-      int overal_prob;
-      int this_prob;
-      int count;
-
-      /* Overall probs */
-      count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
-
-      if (count)
-        overal_prob = 256 * mv_mode_cts[i][0] / count;
-      else
-        overal_prob = 128;
-
-      if (overal_prob == 0) overal_prob = 1;
-
-      /* context probs */
-      count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-
-      if (count)
-        this_prob = 256 * mv_ref_ct[j][i][0] / count;
-      else
-        this_prob = 128;
-
-      if (this_prob == 0) this_prob = 1;
-
-      fprintf(f, "%5d, ", this_prob);
-    }
-
-    fprintf(f, "  },\n");
-  }
-
-  fprintf(f, "};\n");
-  fclose(f);
-}
-
-/* MV ref count VP8_ENTROPY_STATS stats code */
-#ifdef VP8_ENTROPY_STATS
-void init_mv_ref_counts() {
-  memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
-  memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {
-  if (m == ZEROMV) {
-    ++mv_ref_ct[ct[0]][0][0];
-    ++mv_mode_cts[0][0];
-  } else {
-    ++mv_ref_ct[ct[0]][0][1];
-    ++mv_mode_cts[0][1];
-
-    if (m == NEARESTMV) {
-      ++mv_ref_ct[ct[1]][1][0];
-      ++mv_mode_cts[1][0];
-    } else {
-      ++mv_ref_ct[ct[1]][1][1];
-      ++mv_mode_cts[1][1];
-
-      if (m == NEARMV) {
-        ++mv_ref_ct[ct[2]][2][0];
-        ++mv_mode_cts[2][0];
-      } else {
-        ++mv_ref_ct[ct[2]][2][1];
-        ++mv_mode_cts[2][1];
-
-        if (m == NEWMV) {
-          ++mv_ref_ct[ct[3]][3][0];
-          ++mv_mode_cts[3][0];
-        } else {
-          ++mv_ref_ct[ct[3]][3][1];
-          ++mv_mode_cts[3][1];
-        }
-      }
-    }
-  }
-}
-
-#endif /* END MV ref count VP8_ENTROPY_STATS stats code */
-
-#endif
+#endif  // HAVE_SSE2 || HAVE_MSA
diff --git a/media/libvpx/libvpx/vp8/encoder/mcomp.h b/media/libvpx/libvpx/vp8/encoder/mcomp.h
index b6228798ff..1ee6fe5dd6 100644
--- a/media/libvpx/libvpx/vp8/encoder/mcomp.h
+++ b/media/libvpx/libvpx/vp8/encoder/mcomp.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_MCOMP_H_
-#define VP8_ENCODER_MCOMP_H_
+#ifndef VPX_VP8_ENCODER_MCOMP_H_
+#define VPX_VP8_ENCODER_MCOMP_H_
 
 #include "block.h"
 #include "vpx_dsp/variance.h"
@@ -18,11 +18,6 @@
 extern "C" {
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-extern void init_mv_ref_counts();
-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
-#endif
-
 /* The maximum number of steps in a step search given the largest allowed
  * initial step
  */
@@ -34,15 +29,14 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 /* Maximum size of the first step in full pel units */
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
 
-extern void print_mode_context(void);
-extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
-extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
-extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
+int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
 
-extern int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int_mv *best_mv, int search_param, int error_per_bit,
-                          const vp8_variance_fn_ptr_t *vf, int *mvsadcost[2],
-                          int *mvcost[2], int_mv *center_mv);
+int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                   int_mv *best_mv, int search_param, int sad_per_bit,
+                   const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2],
+                   int_mv *center_mv);
 
 typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                    int_mv *bestmv, int_mv *ref_mv,
@@ -51,15 +45,15 @@ typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                    int *mvcost[2], int *distortion,
                                    unsigned int *sse);
 
-extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
-extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
-extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
-extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
+fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+fractional_mv_step_fp vp8_find_best_half_pixel_step;
+fractional_mv_step_fp vp8_skip_fractional_mv_step;
 
-typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                    int_mv *ref_mv, int sad_per_bit,
-                                    int distance, vp8_variance_fn_ptr_t *fn_ptr,
-                                    int *mvcost[2], int_mv *center_mv);
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv);
 
 typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                         int_mv *ref_mv, int sad_per_bit,
@@ -78,4 +72,4 @@ typedef int (*vp8_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_MCOMP_H_
+#endif  // VPX_VP8_ENCODER_MCOMP_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
new file mode 100644
index 0000000000..0fd25fcda5
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c
@@ -0,0 +1,434 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+/* clang-format off */
+/* TRANSPOSE_4H: transpose 4x4 matrix.
+   Input: ftmp1,ftmp2,ftmp3,ftmp4
+   Output: ftmp1,ftmp2,ftmp3,ftmp4
+   Note: ftmp0 always be 0, ftmp5~9 used for temporary value.
+ */
+#define TRANSPOSE_4H                                         \
+  MMI_LI(%[tmp0], 0x93)                                      \
+  "mtc1       %[tmp0],    %[ftmp10]                    \n\t" \
+  "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp0]         \n\t" \
+  "punpcklhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "por        %[ftmp5],   %[ftmp5],   %[ftmp9]         \n\t" \
+  "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp0]         \n\t" \
+  "punpckhhw  %[ftmp9],   %[ftmp2],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "por        %[ftmp6],   %[ftmp6],   %[ftmp9]         \n\t" \
+  "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp0]         \n\t" \
+  "punpcklhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "por        %[ftmp7],   %[ftmp7],   %[ftmp9]         \n\t" \
+  "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp0]         \n\t" \
+  "punpckhhw  %[ftmp9],   %[ftmp4],   %[ftmp0]         \n\t" \
+  "pshufh     %[ftmp9],   %[ftmp9],   %[ftmp10]        \n\t" \
+  "por        %[ftmp8],   %[ftmp8],   %[ftmp9]         \n\t" \
+  "punpcklwd  %[ftmp1],   %[ftmp5],   %[ftmp7]         \n\t" \
+  "punpckhwd  %[ftmp2],   %[ftmp5],   %[ftmp7]         \n\t" \
+  "punpcklwd  %[ftmp3],   %[ftmp6],   %[ftmp8]         \n\t" \
+  "punpckhwd  %[ftmp4],   %[ftmp6],   %[ftmp8]         \n\t"
+/* clang-format on */
+
+void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  uint64_t tmp[1];
+  int16_t *ip = input;
+  double ff_ph_op1, ff_ph_op3;
+
+#if _MIPS_SIM == _ABIO32
+  register double ftmp0 asm("$f0");
+  register double ftmp1 asm("$f2");
+  register double ftmp2 asm("$f4");
+  register double ftmp3 asm("$f6");
+  register double ftmp4 asm("$f8");
+  register double ftmp5 asm("$f10");
+  register double ftmp6 asm("$f12");
+  register double ftmp7 asm("$f14");
+  register double ftmp8 asm("$f16");
+  register double ftmp9 asm("$f18");
+  register double ftmp10 asm("$f20");
+  register double ftmp11 asm("$f22");
+  register double ftmp12 asm("$f24");
+#else
+  register double ftmp0 asm("$f0");
+  register double ftmp1 asm("$f1");
+  register double ftmp2 asm("$f2");
+  register double ftmp3 asm("$f3");
+  register double ftmp4 asm("$f4");
+  register double ftmp5 asm("$f5");
+  register double ftmp6 asm("$f6");
+  register double ftmp7 asm("$f7");
+  register double ftmp8 asm("$f8");
+  register double ftmp9 asm("$f9");
+  register double ftmp10 asm("$f10");
+  register double ftmp11 asm("$f11");
+  register double ftmp12 asm("$f12");
+#endif  // _MIPS_SIM == _ABIO32
+
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0x14e808a914e808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op1]                    \n\t"
+    "dli        %[tmp0],    0xeb1808a9eb1808a9              \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_op3]                    \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],      %[ftmp0]         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[ip])                     \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[ip])                     \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp2],   0x07(%[ip])                     \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ip])                     \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp3],   0x07(%[ip])                     \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[ip])                     \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp4],   0x07(%[ip])                     \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[ip])                     \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    TRANSPOSE_4H
+
+    "ldc1       %[ftmp11],  %[ff_ph_8]                      \n\t"
+    // f1 + f4
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
+    // a1
+    "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
+    // f2 + f3
+    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
+    // b1
+    "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
+    // f2 - f3
+    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
+    // c1
+    "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp11]       \n\t"
+    // f1 - f4
+    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
+    // d1
+    "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp11]       \n\t"
+    // op[0] = a1 + b1
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
+    // op[2] = a1 - b1
+    "psubh      %[ftmp3],   %[ftmp5],       %[ftmp6]        \n\t"
+
+    // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
+    MMI_LI(%[tmp0], 0x0c)
+    "dmtc1      %[tmp0],    %[ftmp11]                       \n\t"
+    "ldc1       %[ftmp12],  %[ff_pw_14500]                  \n\t"
+    "punpcklhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op1]    \n\t"
+    "punpckhhw  %[ftmp9],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op1]    \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
+    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
+    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
+    "packsswh   %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
+
+    // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
+    "ldc1       %[ftmp12],  %[ff_pw_7500]                   \n\t"
+    "punpcklhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp5],   %[ftmp9],       %[ff_ph_op3]    \n\t"
+    "punpckhhw  %[ftmp9],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp6],   %[ftmp9],       %[ff_ph_op3]    \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp12]       \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp12]       \n\t"
+    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]       \n\t"
+    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]       \n\t"
+    "packsswh   %[ftmp4],   %[ftmp5],       %[ftmp6]        \n\t"
+    TRANSPOSE_4H
+
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp4]        \n\t"
+    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp3]        \n\t"
+    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp3]        \n\t"
+    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp4]        \n\t"
+
+    "pcmpeqh    %[ftmp0],   %[ftmp8],       %[ftmp0]        \n\t"
+    "ldc1       %[ftmp9],   %[ff_ph_01]                     \n\t"
+    "paddh      %[ftmp0],   %[ftmp0],       %[ftmp9]        \n\t"
+
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]        \n\t"
+    "psubh      %[ftmp2],   %[ftmp5],       %[ftmp6]        \n\t"
+    "ldc1       %[ftmp9],   %[ff_ph_07]                     \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
+    "paddh      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
+    MMI_LI(%[tmp0], 0x04)
+    "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
+    "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]        \n\t"
+    "psrah      %[ftmp2],   %[ftmp2],       %[ftmp9]        \n\t"
+
+    MMI_LI(%[tmp0], 0x10)
+    "mtc1       %[tmp0],    %[ftmp9]                        \n\t"
+    "ldc1       %[ftmp12],  %[ff_pw_12000]                  \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op1]    \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp7],       %[ftmp8]        \n\t"
+    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op1]    \n\t"
+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
+    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
+    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
+    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
+    "packsswh   %[ftmp3],   %[ftmp10],      %[ftmp11]       \n\t"
+    "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]        \n\t"
+
+    "ldc1       %[ftmp12],  %[ff_pw_51000]                  \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp10],  %[ftmp5],       %[ff_ph_op3]    \n\t"
+    "punpckhhw  %[ftmp5],   %[ftmp8],       %[ftmp7]        \n\t"
+    "pmaddhw    %[ftmp11],  %[ftmp5],       %[ff_ph_op3]    \n\t"
+    "paddw      %[ftmp10],  %[ftmp10],      %[ftmp12]       \n\t"
+    "paddw      %[ftmp11],  %[ftmp11],      %[ftmp12]       \n\t"
+    "psraw      %[ftmp10],  %[ftmp10],      %[ftmp9]        \n\t"
+    "psraw      %[ftmp11],  %[ftmp11],      %[ftmp9]        \n\t"
+    "packsswh   %[ftmp4],   %[ftmp10],      %[ftmp11]       \n\t"
+
+    "gssdlc1    %[ftmp1],   0x07(%[output])                 \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[output])                 \n\t"
+    "gssdlc1    %[ftmp3],   0x0f(%[output])                 \n\t"
+    "gssdrc1    %[ftmp3],   0x08(%[output])                 \n\t"
+    "gssdlc1    %[ftmp2],   0x17(%[output])                 \n\t"
+    "gssdrc1    %[ftmp2],   0x10(%[output])                 \n\t"
+    "gssdlc1    %[ftmp4],   0x1f(%[output])                 \n\t"
+    "gssdrc1    %[ftmp4],   0x18(%[output])                 \n\t"
+
+    : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
+      [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
+      [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
+      [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
+      [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
+      [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
+    : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
+      [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
+      [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
+      [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
+      [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  vp8_short_fdct4x4_mmi(input, output, pitch);
+  vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask;
+  uint64_t tmp[1];
+
+  /* clang-format off */
+  __asm__ volatile (
+    "dli        %[tmp0],    0x0001000100010001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_ph_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000100000001                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_01]                         \n\t"
+    "dli        %[tmp0],    0x0000000300000003                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_03]                         \n\t"
+    "dli        %[tmp0],    0x0001000000010000                  \n\t"
+    "dmtc1      %[tmp0],    %[ff_pw_mask]                       \n\t"
+    MMI_LI(%[tmp0], 0x02)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
+
+    "gsldlc1    %[ftmp1],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[ip])                         \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp2],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ip])                         \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp3],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp3],   0x00(%[ip])                         \n\t"
+    MMI_ADDU(%[ip], %[ip], %[pitch])
+    "gsldlc1    %[ftmp4],   0x07(%[ip])                         \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[ip])                         \n\t"
+    TRANSPOSE_4H
+
+    "psllh      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+    "psllh      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+    "psllh      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+    "psllh      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+    // a
+    "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
+    // d
+    "paddh      %[ftmp6],   %[ftmp2],       %[ftmp4]            \n\t"
+    // c
+    "psubh      %[ftmp7],   %[ftmp2],       %[ftmp4]            \n\t"
+    // b
+    "psubh      %[ftmp8],   %[ftmp1],       %[ftmp3]            \n\t"
+
+    // a + d
+    "paddh      %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"
+    // b + c
+    "paddh      %[ftmp2],   %[ftmp8],       %[ftmp7]            \n\t"
+    // b - c
+    "psubh      %[ftmp3],   %[ftmp8],       %[ftmp7]            \n\t"
+    // a - d
+    "psubh      %[ftmp4],   %[ftmp5],       %[ftmp6]            \n\t"
+
+    "pcmpeqh    %[ftmp6],   %[ftmp5],       %[ftmp0]            \n\t"
+    "paddh      %[ftmp6],   %[ftmp6],       %[ff_ph_01]         \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+    TRANSPOSE_4H
+
+    // op[2], op[0]
+    "pmaddhw    %[ftmp5],   %[ftmp1],       %[ff_pw_01]         \n\t"
+    // op[3], op[1]
+    "pmaddhw    %[ftmp1],   %[ftmp1],       %[ff_pw_mask]       \n\t"
+
+    // op[6], op[4]
+    "pmaddhw    %[ftmp6],   %[ftmp2],       %[ff_pw_01]         \n\t"
+    // op[7], op[5]
+    "pmaddhw    %[ftmp2],   %[ftmp2],       %[ff_pw_mask]       \n\t"
+
+    // op[10], op[8]
+    "pmaddhw    %[ftmp7],   %[ftmp3],       %[ff_pw_01]         \n\t"
+    // op[11], op[9]
+    "pmaddhw    %[ftmp3],   %[ftmp3],       %[ff_pw_mask]       \n\t"
+
+    // op[14], op[12]
+    "pmaddhw    %[ftmp8],   %[ftmp4],       %[ff_pw_01]         \n\t"
+    // op[15], op[13]
+    "pmaddhw    %[ftmp4],   %[ftmp4],       %[ff_pw_mask]       \n\t"
+
+    // a1, a3
+    "paddw      %[ftmp9],   %[ftmp5],       %[ftmp7]            \n\t"
+    // d1, d3
+    "paddw      %[ftmp10],  %[ftmp6],       %[ftmp8]            \n\t"
+    // c1, c3
+    "psubw      %[ftmp11],  %[ftmp6],       %[ftmp8]            \n\t"
+    // b1, b3
+    "psubw      %[ftmp12],  %[ftmp5],       %[ftmp7]            \n\t"
+
+    // a1 + d1, a3 + d3
+    "paddw      %[ftmp5],   %[ftmp9],       %[ftmp10]           \n\t"
+    // b1 + c1, b3 + c3
+    "paddw      %[ftmp6],   %[ftmp12],      %[ftmp11]           \n\t"
+    // b1 - c1, b3 - c3
+    "psubw      %[ftmp7],   %[ftmp12],      %[ftmp11]           \n\t"
+    // a1 - d1, a3 - d3
+    "psubw      %[ftmp8],   %[ftmp9],       %[ftmp10]           \n\t"
+
+    // a2, a4
+    "paddw      %[ftmp9],   %[ftmp1],       %[ftmp3]            \n\t"
+    // d2, d4
+    "paddw      %[ftmp10],  %[ftmp2],       %[ftmp4]            \n\t"
+    // c2, c4
+    "psubw      %[ftmp11],  %[ftmp2],       %[ftmp4]            \n\t"
+    // b2, b4
+    "psubw      %[ftmp12],  %[ftmp1],       %[ftmp3]            \n\t"
+
+    // a2 + d2, a4 + d4
+    "paddw      %[ftmp1],   %[ftmp9],       %[ftmp10]           \n\t"
+    // b2 + c2, b4 + c4
+    "paddw      %[ftmp2],   %[ftmp12],      %[ftmp11]           \n\t"
+    // b2 - c2, b4 - c4
+    "psubw      %[ftmp3],   %[ftmp12],      %[ftmp11]           \n\t"
+    // a2 - d2, a4 - d4
+    "psubw      %[ftmp4],   %[ftmp9],       %[ftmp10]           \n\t"
+
+    MMI_LI(%[tmp0], 0x03)
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp1]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp1],   %[ftmp1],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp2]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp2],   %[ftmp2],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp3]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp4]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp4],   %[ftmp4],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp5]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp5],   %[ftmp5],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp5],   %[ftmp5],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp6]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp6],   %[ftmp6],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp6],   %[ftmp6],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp7]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp7],   %[ftmp7],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp7],   %[ftmp7],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp7],   %[ftmp7],       %[ftmp11]           \n\t"
+
+    "pcmpgtw    %[ftmp9],   %[ftmp0],       %[ftmp8]            \n\t"
+    "pand       %[ftmp9],   %[ftmp9],       %[ff_pw_01]         \n\t"
+    "paddw      %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t"
+    "paddw      %[ftmp8],   %[ftmp8],       %[ff_pw_03]         \n\t"
+    "psraw      %[ftmp8],   %[ftmp8],       %[ftmp11]           \n\t"
+
+    "packsswh   %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+    "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+    "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
+    "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"
+
+    MMI_LI(%[tmp0], 0x72)
+    "dmtc1      %[tmp0],    %[ftmp11]                           \n\t"
+    "pshufh     %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
+    "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
+    "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
+    "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"
+
+    "gssdlc1    %[ftmp1],   0x07(%[op])                         \n\t"
+    "gssdrc1    %[ftmp1],   0x00(%[op])                         \n\t"
+    "gssdlc1    %[ftmp2],   0x0f(%[op])                         \n\t"
+    "gssdrc1    %[ftmp2],   0x08(%[op])                         \n\t"
+    "gssdlc1    %[ftmp3],   0x17(%[op])                         \n\t"
+    "gssdrc1    %[ftmp3],   0x10(%[op])                         \n\t"
+    "gssdlc1    %[ftmp4],   0x1f(%[op])                         \n\t"
+    "gssdrc1    %[ftmp4],   0x18(%[op])                         \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),          [ff_pw_mask]"=&f"(ff_pw_mask),
+      [tmp0]"=&r"(tmp[0]),              [ff_pw_01]"=&f"(ff_pw_01),
+      [ip]"+&r"(input),                 [ff_pw_03]"=&f"(ff_pw_03),
+      [ff_ph_01]"=&f"(ff_ph_01)
+    : [op]"r"(output),                  [pitch]"r"((mips_reg)pitch)
+    : "memory"
+  );
+  /* clang-format on */
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
new file mode 100644
index 0000000000..1986444aa3
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c
@@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/quant_common.h"
+
+#define REGULAR_SELECT_EOB(i, rc)                                        \
+  z = coeff_ptr[rc];                                                     \
+  sz = (z >> 31);                                                        \
+  x = (z ^ sz) - sz;                                                     \
+  zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value;             \
+  if (x >= zbin) {                                                       \
+    x += round_ptr[rc];                                                  \
+    y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \
+    if (y) {                                                             \
+      x = (y ^ sz) - sz;                                                 \
+      qcoeff_ptr[rc] = x;                                                \
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc];                             \
+      eob = i;                                                           \
+      zbin_boost_ptr = b->zrun_zbin_boost;                               \
+    }                                                                    \
+  }
+
+void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+  const int16_t *coeff_ptr = b->coeff;
+  const int16_t *round_ptr = b->round;
+  const int16_t *quant_ptr = b->quant_fast;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  const int16_t *dequant_ptr = d->dequant;
+  const int16_t *inv_zig_zag = vp8_default_inv_zig_zag;
+
+  double ftmp[13];
+  uint64_t tmp[1];
+  int64_t eob = 0;
+  double ones;
+
+  __asm__ volatile(
+      // loop 0 ~ 7
+      "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ones],    %[ones],        %[ones]         \n\t"
+      "gsldlc1    %[ftmp1],   0x07(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp1],   0x00(%[coeff_ptr])              \n\t"
+      "dli        %[tmp0],    0x0f                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
+      "gsldlc1    %[ftmp2],   0x0f(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp2],   0x08(%[coeff_ptr])              \n\t"
+
+      "psrah      %[ftmp3],   %[ftmp1],       %[ftmp9]        \n\t"
+      "pxor       %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
+      "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
+      "psrah      %[ftmp4],   %[ftmp2],       %[ftmp9]        \n\t"
+      "pxor       %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
+      "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x07(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp5],   0x00(%[round_ptr])              \n\t"
+      "gsldlc1    %[ftmp6],   0x0f(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp6],   0x08(%[round_ptr])              \n\t"
+      "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "gsldlc1    %[ftmp7],   0x07(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp7],   0x00(%[quant_ptr])              \n\t"
+      "gsldlc1    %[ftmp8],   0x0f(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp8],   0x08(%[quant_ptr])              \n\t"
+      "pmulhuh    %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmulhuh    %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+
+      "pxor       %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
+      "pxor       %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
+      "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]        \n\t"
+      "psubh      %[ftmp8],   %[ftmp8],       %[ftmp4]        \n\t"
+      "gssdlc1    %[ftmp7],   0x07(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp7],   0x00(%[qcoeff_ptr])             \n\t"
+      "gssdlc1    %[ftmp8],   0x0f(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp8],   0x08(%[qcoeff_ptr])             \n\t"
+
+      "gsldlc1    %[ftmp1],   0x07(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp1],   0x00(%[inv_zig_zag])            \n\t"
+      "gsldlc1    %[ftmp2],   0x0f(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp2],   0x08(%[inv_zig_zag])            \n\t"
+      "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp0]        \n\t"
+      "pxor       %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
+      "pxor       %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
+      "pand       %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "pand       %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "pmaxsh     %[ftmp10],  %[ftmp5],       %[ftmp6]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x07(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp5],   0x00(%[dequant_ptr])            \n\t"
+      "gsldlc1    %[ftmp6],   0x0f(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp6],   0x08(%[dequant_ptr])            \n\t"
+      "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+      "gssdlc1    %[ftmp5],   0x07(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp5],   0x00(%[dqcoeff_ptr])            \n\t"
+      "gssdlc1    %[ftmp6],   0x0f(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp6],   0x08(%[dqcoeff_ptr])            \n\t"
+
+      // loop 8 ~ 15
+      "gsldlc1    %[ftmp1],   0x17(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp1],   0x10(%[coeff_ptr])              \n\t"
+      "gsldlc1    %[ftmp2],   0x1f(%[coeff_ptr])              \n\t"
+      "gsldrc1    %[ftmp2],   0x18(%[coeff_ptr])              \n\t"
+
+      "psrah      %[ftmp3],   %[ftmp1],       %[ftmp9]        \n\t"
+      "pxor       %[ftmp1],   %[ftmp3],       %[ftmp1]        \n\t"
+      "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]        \n\t"
+      "psrah      %[ftmp4],   %[ftmp2],       %[ftmp9]        \n\t"
+      "pxor       %[ftmp2],   %[ftmp4],       %[ftmp2]        \n\t"
+      "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x17(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp5],   0x10(%[round_ptr])              \n\t"
+      "gsldlc1    %[ftmp6],   0x1f(%[round_ptr])              \n\t"
+      "gsldrc1    %[ftmp6],   0x18(%[round_ptr])              \n\t"
+      "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "gsldlc1    %[ftmp7],   0x17(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp7],   0x10(%[quant_ptr])              \n\t"
+      "gsldlc1    %[ftmp8],   0x1f(%[quant_ptr])              \n\t"
+      "gsldrc1    %[ftmp8],   0x18(%[quant_ptr])              \n\t"
+      "pmulhuh    %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmulhuh    %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+
+      "pxor       %[ftmp7],   %[ftmp5],       %[ftmp3]        \n\t"
+      "pxor       %[ftmp8],   %[ftmp6],       %[ftmp4]        \n\t"
+      "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]        \n\t"
+      "psubh      %[ftmp8],   %[ftmp8],       %[ftmp4]        \n\t"
+      "gssdlc1    %[ftmp7],   0x17(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp7],   0x10(%[qcoeff_ptr])             \n\t"
+      "gssdlc1    %[ftmp8],   0x1f(%[qcoeff_ptr])             \n\t"
+      "gssdrc1    %[ftmp8],   0x18(%[qcoeff_ptr])             \n\t"
+
+      "gsldlc1    %[ftmp1],   0x17(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp1],   0x10(%[inv_zig_zag])            \n\t"
+      "gsldlc1    %[ftmp2],   0x1f(%[inv_zig_zag])            \n\t"
+      "gsldrc1    %[ftmp2],   0x18(%[inv_zig_zag])            \n\t"
+      "pcmpeqh    %[ftmp5],   %[ftmp5],       %[ftmp0]        \n\t"
+      "pcmpeqh    %[ftmp6],   %[ftmp6],       %[ftmp0]        \n\t"
+      "pxor       %[ftmp5],   %[ftmp5],       %[ones]         \n\t"
+      "pxor       %[ftmp6],   %[ftmp6],       %[ones]         \n\t"
+      "pand       %[ftmp5],   %[ftmp5],       %[ftmp1]        \n\t"
+      "pand       %[ftmp6],   %[ftmp6],       %[ftmp2]        \n\t"
+      "pmaxsh     %[ftmp11],  %[ftmp5],       %[ftmp6]        \n\t"
+
+      "gsldlc1    %[ftmp5],   0x17(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp5],   0x10(%[dequant_ptr])            \n\t"
+      "gsldlc1    %[ftmp6],   0x1f(%[dequant_ptr])            \n\t"
+      "gsldrc1    %[ftmp6],   0x18(%[dequant_ptr])            \n\t"
+      "pmullh     %[ftmp5],   %[ftmp5],       %[ftmp7]        \n\t"
+      "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp8]        \n\t"
+      "gssdlc1    %[ftmp5],   0x17(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp5],   0x10(%[dqcoeff_ptr])            \n\t"
+      "gssdlc1    %[ftmp6],   0x1f(%[dqcoeff_ptr])            \n\t"
+      "gssdrc1    %[ftmp6],   0x18(%[dqcoeff_ptr])            \n\t"
+
+      "dli        %[tmp0],    0x10                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
+
+      "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
+      "psrlw      %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
+      "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
+      "dli        %[tmp0],    0xaa                            \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
+      "pshufh     %[ftmp11],  %[ftmp10],       %[ftmp9]       \n\t"
+      "pmaxsh     %[ftmp10],  %[ftmp10],       %[ftmp11]      \n\t"
+      "dli        %[tmp0],    0xffff                          \n\t"
+      "dmtc1      %[tmp0],    %[ftmp9]                        \n\t"
+      "pand       %[ftmp10],  %[ftmp10],       %[ftmp9]       \n\t"
+      "gssdlc1    %[ftmp10],  0x07(%[eob])                    \n\t"
+      "gssdrc1    %[ftmp10],  0x00(%[eob])                    \n\t"
+      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+        [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+        [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+        [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
+      : [coeff_ptr] "r"((mips_reg)coeff_ptr),
+        [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
+        [dequant_ptr] "r"((mips_reg)dequant_ptr),
+        [round_ptr] "r"((mips_reg)round_ptr),
+        [quant_ptr] "r"((mips_reg)quant_ptr),
+        [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
+        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
+      : "memory");
+
+  *d->eob = eob;
+}
+
+void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
+  int eob = 0;
+  int x, y, z, sz, zbin;
+  const int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  const int16_t *coeff_ptr = b->coeff;
+  const int16_t *zbin_ptr = b->zbin;
+  const int16_t *round_ptr = b->round;
+  const int16_t *quant_ptr = b->quant;
+  const int16_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  const int16_t *dequant_ptr = d->dequant;
+  const int16_t zbin_oq_value = b->zbin_extra;
+  register double ftmp0 asm("$f0");
+
+  //  memset(qcoeff_ptr, 0, 32);
+  //  memset(dqcoeff_ptr, 0, 32);
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]        \n\t"
+    "gssdlc1    %[ftmp0],   0x07(%[qcoeff_ptr])             \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[qcoeff_ptr])             \n\t"
+    "gssdlc1    %[ftmp0],   0x0f(%[qcoeff_ptr])             \n\t"
+    "gssdrc1    %[ftmp0],   0x08(%[qcoeff_ptr])             \n\t"
+    "gssdlc1    %[ftmp0],   0x17(%[qcoeff_ptr])             \n\t"
+    "gssdrc1    %[ftmp0],   0x10(%[qcoeff_ptr])             \n\t"
+    "gssdlc1    %[ftmp0],   0x1f(%[qcoeff_ptr])             \n\t"
+    "gssdrc1    %[ftmp0],   0x18(%[qcoeff_ptr])             \n\t"
+
+    "gssdlc1    %[ftmp0],   0x07(%[dqcoeff_ptr])            \n\t"
+    "gssdrc1    %[ftmp0],   0x00(%[dqcoeff_ptr])            \n\t"
+    "gssdlc1    %[ftmp0],   0x0f(%[dqcoeff_ptr])            \n\t"
+    "gssdrc1    %[ftmp0],   0x08(%[dqcoeff_ptr])            \n\t"
+    "gssdlc1    %[ftmp0],   0x17(%[dqcoeff_ptr])            \n\t"
+    "gssdrc1    %[ftmp0],   0x10(%[dqcoeff_ptr])            \n\t"
+    "gssdlc1    %[ftmp0],   0x1f(%[dqcoeff_ptr])            \n\t"
+    "gssdrc1    %[ftmp0],   0x18(%[dqcoeff_ptr])            \n\t"
+    : [ftmp0]"=&f"(ftmp0)
+    : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr)
+    : "memory"
+  );
+  /* clang-format on */
+
+  REGULAR_SELECT_EOB(1, 0);
+  REGULAR_SELECT_EOB(2, 1);
+  REGULAR_SELECT_EOB(3, 4);
+  REGULAR_SELECT_EOB(4, 8);
+  REGULAR_SELECT_EOB(5, 5);
+  REGULAR_SELECT_EOB(6, 2);
+  REGULAR_SELECT_EOB(7, 3);
+  REGULAR_SELECT_EOB(8, 6);
+  REGULAR_SELECT_EOB(9, 9);
+  REGULAR_SELECT_EOB(10, 12);
+  REGULAR_SELECT_EOB(11, 13);
+  REGULAR_SELECT_EOB(12, 10);
+  REGULAR_SELECT_EOB(13, 7);
+  REGULAR_SELECT_EOB(14, 11);
+  REGULAR_SELECT_EOB(15, 14);
+  REGULAR_SELECT_EOB(16, 15);
+
+  *d->eob = (char)eob;
+}
diff --git a/media/libvpx/libvpx/vp8/encoder/modecosts.h b/media/libvpx/libvpx/vp8/encoder/modecosts.h
index dfb8989f7f..09ee2b5520 100644
--- a/media/libvpx/libvpx/vp8/encoder/modecosts.h
+++ b/media/libvpx/libvpx/vp8/encoder/modecosts.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_MODECOSTS_H_
-#define VP8_ENCODER_MODECOSTS_H_
+#ifndef VPX_VP8_ENCODER_MODECOSTS_H_
+#define VPX_VP8_ENCODER_MODECOSTS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -17,10 +17,10 @@ extern "C" {
 
 struct VP8_COMP;
 
-void vp8_init_mode_costs(struct VP8_COMP *x);
+void vp8_init_mode_costs(struct VP8_COMP *c);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_MODECOSTS_H_
+#endif  // VPX_VP8_ENCODER_MODECOSTS_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/mr_dissim.c b/media/libvpx/libvpx/vp8/encoder/mr_dissim.c
index 011b62a08f..b1bfb4b54a 100644
--- a/media/libvpx/libvpx/vp8/encoder/mr_dissim.c
+++ b/media/libvpx/libvpx/vp8/encoder/mr_dissim.c
@@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) {
 
 void vp8_cal_dissimilarity(VP8_COMP *cpi) {
   VP8_COMMON *cm = &cpi->common;
-  int i;
 
   /* Note: The first row & first column in mip are outside the frame, which
    * were initialized to all 0.(ref_frame, mode, mv...)
@@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) {
     store_info->frame_type = cm->frame_type;
 
     if (cm->frame_type != KEY_FRAME) {
+      int i;
       store_info->is_frame_dropped = 0;
       for (i = 1; i < MAX_REF_FRAMES; ++i)
         store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i];
diff --git a/media/libvpx/libvpx/vp8/encoder/mr_dissim.h b/media/libvpx/libvpx/vp8/encoder/mr_dissim.h
index da36628afa..58f5a97623 100644
--- a/media/libvpx/libvpx/vp8/encoder/mr_dissim.h
+++ b/media/libvpx/libvpx/vp8/encoder/mr_dissim.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_MR_DISSIM_H_
-#define VP8_ENCODER_MR_DISSIM_H_
+#ifndef VPX_VP8_ENCODER_MR_DISSIM_H_
+#define VPX_VP8_ENCODER_MR_DISSIM_H_
 #include "vpx_config.h"
 
 #ifdef __cplusplus
@@ -24,4 +24,4 @@ extern void vp8_store_drop_frame_info(VP8_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_MR_DISSIM_H_
+#endif  // VPX_VP8_ENCODER_MR_DISSIM_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
index 648a616c9e..e9cc2365b4 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c
@@ -12,15 +12,18 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vp8_rtcd.h"
+#include "bitstream.h"
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/blockd.h"
 #include "onyx_int.h"
 #include "vp8/common/systemdependent.h"
+#include "vp8/common/vp8_skin_detection.h"
 #include "vp8/encoder/quantize.h"
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
 #include "vpx_dsp/psnr.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
@@ -34,43 +37,37 @@
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
-#if ARCH_ARM
+#include "vpx_util/vpx_write_yuv_frame.h"
+#if VPX_ARCH_ARM
 #include "vpx_ports/arm.h"
 #endif
 #if CONFIG_MULTI_RES_ENCODING
 #include "mr_dissim.h"
 #endif
 #include "encodeframe.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
+#include "picklpf.h"
+#if !CONFIG_REALTIME_ONLY
+#include "temporal_filter.h"
+#endif
 
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <limits.h>
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
 extern int vp8_update_coef_context(VP8_COMP *cpi);
-extern void vp8_update_coef_probs(VP8_COMP *cpi);
 #endif
 
-extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
-extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-
-extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source,
-                              YV12_BUFFER_CONFIG *post, int filt_lvl,
-                              int low_var_thresh, int flag);
-extern void print_parms(VP8_CONFIG *ocf, char *filenam);
-extern unsigned int vp8_get_processor_freq();
-extern void print_tree_update_probs();
-extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
-extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
-
-int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+extern unsigned int vp8_get_processor_freq(void);
 
 int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
 
-extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
-
 static void set_default_lf_deltas(VP8_COMP *cpi);
 
 extern const int vp8_gf_interval_table[101];
@@ -86,6 +83,9 @@ FILE *yuv_file;
 #ifdef OUTPUT_YUV_DENOISED
 FILE *yuv_denoised_file;
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+static FILE *yuv_skinmap_file = NULL;
+#endif
 
 #if 0
 FILE *framepsnr;
@@ -98,10 +98,6 @@ extern int skip_true_count;
 extern int skip_false_count;
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-extern int intra_mode_stats[10][10][10];
-#endif
-
 #ifdef SPEEDSTATS
 unsigned int frames_at_speed[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
                                      0, 0, 0, 0, 0, 0, 0, 0 };
@@ -189,7 +185,7 @@ static const unsigned char inter_minq[QINDEX_RANGE] = {
 extern FILE *vpxlogc;
 #endif
 
-static void save_layer_context(VP8_COMP *cpi) {
+void vp8_save_layer_context(VP8_COMP *cpi) {
   LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
 
   /* Save layer dependent coding state */
@@ -218,14 +214,17 @@ static void save_layer_context(VP8_COMP *cpi) {
   lc->inter_frame_target = cpi->inter_frame_target;
   lc->total_byte_count = cpi->total_byte_count;
   lc->filter_level = cpi->common.filter_level;
-
+  lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot;
+  lc->force_maxqp = cpi->force_maxqp;
   lc->last_frame_percent_intra = cpi->last_frame_percent_intra;
+  lc->last_q[0] = cpi->last_q[0];
+  lc->last_q[1] = cpi->last_q[1];
 
   memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage,
          sizeof(cpi->mb.count_mb_ref_frame_usage));
 }
 
-static void restore_layer_context(VP8_COMP *cpi, const int layer) {
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   /* Restore layer dependent coding state */
@@ -254,8 +253,11 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) {
   cpi->inter_frame_target = lc->inter_frame_target;
   cpi->total_byte_count = lc->total_byte_count;
   cpi->common.filter_level = lc->filter_level;
-
+  cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot;
+  cpi->force_maxqp = lc->force_maxqp;
   cpi->last_frame_percent_intra = lc->last_frame_percent_intra;
+  cpi->last_q[0] = lc->last_q[0];
+  cpi->last_q[1] = lc->last_q[1];
 
   memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage,
          sizeof(cpi->mb.count_mb_ref_frame_usage));
@@ -266,16 +268,23 @@ static int rescale(int val, int num, int denom) {
   int64_t llden = denom;
   int64_t llval = val;
 
-  return (int)(llval * llnum / llden);
+  int64_t result = (llval * llnum / llden);
+  if (result <= INT_MAX)
+    return (int)result;
+  else
+    return INT_MAX;
 }
 
-static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int layer,
-                                        double prev_layer_framerate) {
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
-  lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
+  if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000)
+    lc->target_bandwidth = INT_MAX;
+  else
+    lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000;
 
   lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
   lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
@@ -301,9 +310,9 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
   /* Work out the average size of a frame within this layer */
   if (layer > 0) {
     lc->avg_frame_size_for_layer =
-        (int)((cpi->oxcf.target_bitrate[layer] -
-               cpi->oxcf.target_bitrate[layer - 1]) *
-              1000 / (lc->framerate - prev_layer_framerate));
+        (int)round((cpi->oxcf.target_bitrate[layer] -
+                    cpi->oxcf.target_bitrate[layer - 1]) *
+                   1000 / (lc->framerate - prev_layer_framerate));
   }
 
   lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
@@ -327,8 +336,8 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
 // for any "new" layers. For "existing" layers, let them inherit the parameters
 // from the previous layer state (at the same layer #). In future we may want
 // to better map the previous layer state(s) to the "new" ones.
-static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int prev_num_layers) {
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int prev_num_layers) {
   int i;
   double prev_layer_framerate = 0;
   const int curr_num_layers = cpi->oxcf.number_of_layers;
@@ -336,12 +345,12 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
   // We need this to set the layer context for the new layers below.
   if (prev_num_layers == 1) {
     cpi->current_layer = 0;
-    save_layer_context(cpi);
+    vp8_save_layer_context(cpi);
   }
   for (i = 0; i < curr_num_layers; ++i) {
     LAYER_CONTEXT *lc = &cpi->layer_context[i];
     if (i >= prev_num_layers) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
     }
     // The initial buffer levels are set based on their starting levels.
     // We could set the buffer levels based on the previous state (normalized
@@ -356,7 +365,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
     // state (to smooth-out quality dips/rate fluctuation at transition)?
 
     // We need to treat the 1 layer case separately: oxcf.target_bitrate[i]
-    // is not set for 1 layer, and the restore_layer_context/save_context()
+    // is not set for 1 layer, and the vp8_restore_layer_context/save_context()
     // are not called in the encoding loop, so we need to call it here to
     // pass the layer context state to |cpi|.
     if (curr_num_layers == 1) {
@@ -364,7 +373,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
       lc->buffer_level =
           cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000;
       lc->bits_off_target = lc->buffer_level;
-      restore_layer_context(cpi, 0);
+      vp8_restore_layer_context(cpi, 0);
     }
     prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i];
   }
@@ -394,16 +403,13 @@ static void setup_features(VP8_COMP *cpi) {
 
 static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
 
-void vp8_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    vpx_dsp_rtcd();
-    vp8_init_intra_predictors();
-    init_done = 1;
-  }
+static void initialize_enc(void) {
+  vpx_dsp_rtcd();
+  vp8_init_intra_predictors();
 }
 
+void vp8_initialize_enc(void) { once(initialize_enc); }
+
 static void dealloc_compressor_data(VP8_COMP *cpi) {
   vpx_free(cpi->tplist);
   cpi->tplist = NULL;
@@ -444,23 +450,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) {
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-#if CONFIG_MULTITHREAD
-  /* De-allocate mutex */
-  if (cpi->pmutex != NULL) {
-    VP8_COMMON *const pc = &cpi->common;
-    int i;
-
-    for (i = 0; i < pc->mb_rows; ++i) {
-      pthread_mutex_destroy(&cpi->pmutex[i]);
-    }
-    vpx_free(cpi->pmutex);
-    cpi->pmutex = NULL;
-  }
-
-  vpx_free(cpi->mt_current_mb_col);
-  cpi->mt_current_mb_col = NULL;
-#endif
 }
 
 static void enable_segmentation(VP8_COMP *cpi) {
@@ -502,7 +491,7 @@ static void set_segmentation_map(VP8_COMP *cpi,
  */
 static void set_segment_data(VP8_COMP *cpi, signed char *feature_data,
                              unsigned char abs_delta) {
-  cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+  cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta;
   memcpy(cpi->segment_feature_data, feature_data,
          sizeof(cpi->segment_feature_data));
 }
@@ -615,6 +604,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) {
   set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 }
 
+static void compute_skin_map(VP8_COMP *cpi) {
+  int mb_row, mb_col, num_bl;
+  VP8_COMMON *cm = &cpi->common;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const uint8_t *src_u = cpi->Source->u_buffer;
+  const uint8_t *src_v = cpi->Source->v_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int src_uvstride = cpi->Source->uv_stride;
+
+  const SKIN_DETECTION_BLOCK_SIZE bsize =
+      (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    num_bl = 0;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      const int bl_index = mb_row * cm->mb_cols + mb_col;
+      cpi->skin_map[bl_index] =
+          vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+                                 bsize, cpi->consec_zero_last[bl_index], 0);
+      num_bl++;
+      src_y += 16;
+      src_u += 8;
+      src_v += 8;
+    }
+    src_y += (src_ystride << 4) - (num_bl << 4);
+    src_u += (src_uvstride << 3) - (num_bl << 3);
+    src_v += (src_uvstride << 3) - (num_bl << 3);
+  }
+
+  // Remove isolated skin blocks (none of its neighbors are skin) and isolated
+  // non-skin blocks (all of its neighbors are skin). Skip the boundary.
+  for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) {
+    for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) {
+      const int bl_index = mb_row * cm->mb_cols + mb_col;
+      int num_neighbor = 0;
+      int mi, mj;
+      int non_skin_threshold = 8;
+
+      for (mi = -1; mi <= 1; mi += 1) {
+        for (mj = -1; mj <= 1; mj += 1) {
+          int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj;
+          if (cpi->skin_map[bl_neighbor_index]) num_neighbor++;
+        }
+      }
+
+      if (cpi->skin_map[bl_index] && num_neighbor < 2)
+        cpi->skin_map[bl_index] = 0;
+      if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold)
+        cpi->skin_map[bl_index] = 1;
+    }
+  }
+}
+
 static void set_default_lf_deltas(VP8_COMP *cpi) {
   cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
   cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -643,8 +685,8 @@ static void set_default_lf_deltas(VP8_COMP *cpi) {
 /* Convenience macros for mapping speed and mode into a continuous
  * range
  */
-#define GOOD(x) (x + 1)
-#define RT(x) (x + 7)
+#define GOOD(x) ((x) + 1)
+#define RT(x) ((x) + 7)
 
 static int speed_map(int speed, const int *map) {
   int res;
@@ -697,9 +739,9 @@ static const int mode_check_freq_map_zn2[] = {
   0, RT(10), 1 << 1, RT(11), 1 << 2, RT(12), 1 << 3, INT_MAX
 };
 
-static const int mode_check_freq_map_vhbpred[] = {
-  0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX
-};
+static const int mode_check_freq_map_vhbpred[] = { 0, GOOD(5), 2, RT(0),
+                                                   0, RT(3),   2, RT(5),
+                                                   4, INT_MAX };
 
 static const int mode_check_freq_map_near2[] = {
   0,      GOOD(5), 2,      RT(0),  0,      RT(3),  2,
@@ -715,18 +757,19 @@ static const int mode_check_freq_map_new2[] = { 0,      GOOD(5), 4,      RT(0),
                                                 1 << 3, RT(11),  1 << 4, RT(12),
                                                 1 << 5, INT_MAX };
 
-static const int mode_check_freq_map_split1[] = {
-  0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX
-};
+static const int mode_check_freq_map_split1[] = { 0, GOOD(2), 2, GOOD(3),
+                                                  7, RT(1),   2, RT(2),
+                                                  7, INT_MAX };
 
-static const int mode_check_freq_map_split2[] = {
-  0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX
-};
+static const int mode_check_freq_map_split2[] = { 0, GOOD(1), 2,  GOOD(2),
+                                                  4, GOOD(3), 15, RT(1),
+                                                  4, RT(2),   15, INT_MAX };
 
 void vp8_set_speed_features(VP8_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   int Mode = cpi->compressor_speed;
   int Speed = cpi->Speed;
+  int Speed2;
   int i;
   VP8_COMMON *cm = &cpi->common;
   int last_improved_quant = sf->improved_quant;
@@ -828,9 +871,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
   cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] =
       cpi->mode_check_freq[THR_B_PRED] =
           speed_map(Speed, mode_check_freq_map_vhbpred);
-  cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1);
+
+  // For real-time mode at speed 10 keep the mode_check_freq threshold
+  // for NEW1 similar to that of speed 9.
+  Speed2 = Speed;
+  if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9);
+  cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1);
+
   cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] =
       speed_map(Speed, mode_check_freq_map_new2);
+
   cpi->mode_check_freq[THR_SPLIT1] =
       speed_map(Speed, mode_check_freq_map_split1);
   cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] =
@@ -974,7 +1024,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
 
       memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
 
-  }; /* switch */
+  } /* switch */
 
   /* Slow quant, dct and trellis not worthwhile for first pass
    * so make sure they are always turned off.
@@ -1087,9 +1137,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
 
   int width = cm->Width;
   int height = cm->Height;
-#if CONFIG_MULTITHREAD
-  int prev_mb_rows = cm->mb_rows;
-#endif
 
   if (vp8_alloc_frame_buffers(cm, width, height)) {
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1125,7 +1172,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
 #else
     unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
 #endif
-    CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->tok,
+                    vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
 
   /* Data used for real time vc mode to see if gf needs refreshing */
@@ -1134,37 +1182,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
   /* Structures used to monitor GF usage */
   vpx_free(cpi->gf_active_flags);
   CHECK_MEM_ERROR(
-      cpi->gf_active_flags,
+      &cpi->common.error, cpi->gf_active_flags,
       vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols));
   cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
   vpx_free(cpi->mb_activity_map);
   CHECK_MEM_ERROR(
-      cpi->mb_activity_map,
+      &cpi->common.error, cpi->mb_activity_map,
       vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols));
 
   /* allocate memory for storing last frame's MVs for MV prediction. */
   vpx_free(cpi->lfmv);
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
-                                        sizeof(*cpi->lfmv)));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->lfmv,
+      vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv)));
   vpx_free(cpi->lf_ref_frame_sign_bias);
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias,
                   vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
                              sizeof(*cpi->lf_ref_frame_sign_bias)));
   vpx_free(cpi->lf_ref_frame);
-  CHECK_MEM_ERROR(cpi->lf_ref_frame,
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame,
                   vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2),
                              sizeof(*cpi->lf_ref_frame)));
 
   /* Create the encoder segmentation map and set all entries to 0 */
   vpx_free(cpi->segmentation_map);
   CHECK_MEM_ERROR(
-      cpi->segmentation_map,
+      &cpi->common.error, cpi->segmentation_map,
       vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map)));
   cpi->cyclic_refresh_mode_index = 0;
   vpx_free(cpi->active_map);
-  CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
-                                              sizeof(*cpi->active_map)));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->active_map,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map)));
   memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols));
 
 #if CONFIG_MULTITHREAD
@@ -1177,36 +1227,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
   } else {
     cpi->mt_sync_range = 16;
   }
-
-  if (cpi->oxcf.multi_threaded > 1) {
-    int i;
-
-    /* De-allocate and re-allocate mutex */
-    if (cpi->pmutex != NULL) {
-      for (i = 0; i < prev_mb_rows; ++i) {
-        pthread_mutex_destroy(&cpi->pmutex[i]);
-      }
-      vpx_free(cpi->pmutex);
-      cpi->pmutex = NULL;
-    }
-
-    CHECK_MEM_ERROR(cpi->pmutex,
-                    vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows));
-    if (cpi->pmutex) {
-      for (i = 0; i < cm->mb_rows; ++i) {
-        pthread_mutex_init(&cpi->pmutex[i], NULL);
-      }
-    }
-
-    vpx_free(cpi->mt_current_mb_col);
-    CHECK_MEM_ERROR(cpi->mt_current_mb_col,
-                    vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
-  }
-
 #endif
 
   vpx_free(cpi->tplist);
-  CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist,
+                  vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
 
 #if CONFIG_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1237,16 +1262,25 @@ int vp8_reverse_trans(int x) {
 
   return 63;
 }
-void vp8_new_framerate(VP8_COMP *cpi, double framerate) {
-  if (framerate < .1) framerate = 30;
 
+static double clamp_framerate(double framerate) {
+  if (framerate < .1)
+    return 30.0;
+  else
+    return framerate;
+}
+
+void vp8_new_framerate(VP8_COMP *cpi, double framerate) {
+  framerate = clamp_framerate(framerate);
   cpi->framerate = framerate;
   cpi->output_framerate = framerate;
-  cpi->per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
+  const double per_frame_bandwidth =
+      round(cpi->oxcf.target_bandwidth / cpi->output_framerate);
+  cpi->per_frame_bandwidth = (int)VPXMIN(per_frame_bandwidth, INT_MAX);
   cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
-  cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
-                                   cpi->oxcf.two_pass_vbrmin_section / 100);
+  const int64_t vbr_min_bits = (int64_t)cpi->av_per_frame_bandwidth *
+                               cpi->oxcf.two_pass_vbrmin_section / 100;
+  cpi->min_frame_bandwidth = (int)VPXMIN(vbr_min_bits, INT_MAX);
 
   /* Set Maximum gf/arf interval */
   cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2);
@@ -1273,7 +1307,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) {
   }
 }
 
-static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
+static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
   VP8_COMMON *cm = &cpi->common;
 
   cpi->oxcf = *oxcf;
@@ -1334,7 +1368,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
     double prev_layer_framerate = 0;
 
     for (i = 0; i < cpi->oxcf.number_of_layers; ++i) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
       prev_layer_framerate =
           cpi->output_framerate / cpi->oxcf.rate_decimator[i];
     }
@@ -1351,7 +1385,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
 #endif
 }
 
-static void update_layer_contexts(VP8_COMP *cpi) {
+void vp8_update_layer_contexts(VP8_COMP *cpi) {
   VP8_CONFIG *oxcf = &cpi->oxcf;
 
   /* Update snapshots of the layer contexts to reflect new parameters */
@@ -1364,7 +1398,10 @@ static void update_layer_contexts(VP8_COMP *cpi) {
       LAYER_CONTEXT *lc = &cpi->layer_context[i];
 
       lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i];
-      lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+      if (oxcf->target_bitrate[i] > INT_MAX / 1000)
+        lc->target_bandwidth = INT_MAX;
+      else
+        lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
 
       lc->starting_buffer_level = rescale(
           (int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000);
@@ -1386,8 +1423,8 @@ static void update_layer_contexts(VP8_COMP *cpi) {
       /* Work out the average size of a frame within this layer */
       if (i > 0) {
         lc->avg_frame_size_for_layer =
-            (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
-                  1000 / (lc->framerate - prev_layer_framerate));
+            (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
+                       1000 / (lc->framerate - prev_layer_framerate));
       }
 
       prev_layer_framerate = lc->framerate;
@@ -1395,10 +1432,11 @@ static void update_layer_contexts(VP8_COMP *cpi) {
   }
 }
 
-void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
+void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) {
   VP8_COMMON *cm = &cpi->common;
   int last_w, last_h;
   unsigned int prev_number_of_layers;
+  double raw_target_rate;
 
   if (!cpi) return;
 
@@ -1413,10 +1451,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
   last_h = cpi->oxcf.Height;
   prev_number_of_layers = cpi->oxcf.number_of_layers;
 
-  if (cpi->initial_width) {
-      oxcf->multi_threaded = cpi->oxcf.multi_threaded;
-  }
-
   cpi->oxcf = *oxcf;
 
   switch (cpi->oxcf.Mode) {
@@ -1502,6 +1536,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
     }
   }
 
+  cpi->ext_refresh_frame_flags_pending = 0;
+
   cpi->baseline_gf_interval =
       cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
 
@@ -1521,9 +1557,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
 
   setup_features(cpi);
 
-  {
+  if (!cpi->use_roi_static_threshold) {
     int i;
-
     for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
       cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
     }
@@ -1542,6 +1577,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
     cpi->oxcf.maximum_buffer_size_in_ms = 240000;
   }
 
+  raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 *
+                     cpi->framerate / 1000.0);
+  if (cpi->oxcf.target_bandwidth > raw_target_rate)
+    cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate;
   /* Convert target bandwidth from Kbit/s to Bit/s */
   cpi->oxcf.target_bandwidth *= 1000;
 
@@ -1612,7 +1651,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
       cpi->temporal_layer_id = 0;
     }
     cpi->temporal_pattern_counter = 0;
-    reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
+    vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
   }
 
   if (!cpi->initial_width) {
@@ -1636,9 +1675,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
 
   cm->sharpness_level = cpi->oxcf.Sharpness;
 
-  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
-    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+  if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) {
+    int hr, hs, vr, vs;
 
     Scale2Ratio(cm->horiz_scale, &hr, &hs);
     Scale2Ratio(cm->vert_scale, &vr, &vs);
@@ -1721,7 +1759,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) {
   } while (++i <= mvfp_max);
 }
 
-struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
+struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) {
   int i;
 
   VP8_COMP *cpi;
@@ -1743,8 +1781,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
 
   cpi->common.error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site),
-                                         (MAX_MVSEARCH_STEPS * 8) + 1));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->mb.ss,
+      vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
 
   vp8_create_common(&cpi->common);
 
@@ -1784,6 +1823,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
 
   cpi->active_map_enabled = 0;
 
+  cpi->use_roi_static_threshold = 0;
+
 #if 0
     /* Experimental code for lagged and one pass */
     /* Initialise one_pass GF frames stats */
@@ -1841,26 +1882,25 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
             ? (2 * (cpi->common.mb_rows * cpi->common.mb_cols) /
                cpi->cyclic_refresh_mode_max_mbs_perframe)
             : 10;
-    cpi->gf_interval_onepass_cbr =
-        VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr));
+    cpi->gf_interval_onepass_cbr = clamp(cpi->gf_interval_onepass_cbr, 6, 40);
     cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr;
   }
 
   if (cpi->cyclic_refresh_mode_enabled) {
-    CHECK_MEM_ERROR(cpi->cyclic_refresh_map,
+    CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map,
                     vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
   } else {
     cpi->cyclic_refresh_map = (signed char *)NULL;
   }
 
-  CHECK_MEM_ERROR(cpi->consec_zero_last,
-                  vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
-  CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+  CHECK_MEM_ERROR(
+      &cpi->common.error, cpi->skin_map,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0])));
 
-#ifdef VP8_ENTROPY_STATS
-  init_context_counters();
-#endif
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last,
+                  vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
+  CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias,
+                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
 
   /*Initialize the feed-forward activity masking.*/
   cpi->activity_avg = 90 << 12;
@@ -1876,6 +1916,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   cpi->common.refresh_alt_ref_frame = 0;
 
   cpi->force_maxqp = 0;
+  cpi->frames_since_last_drop_overshoot = 0;
+  cpi->rt_always_update_correction_factor = 0;
+  cpi->rt_drop_recode_on_overshoot = 1;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -1929,6 +1972,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
 #ifdef OUTPUT_YUV_DENOISED
   yuv_denoised_file = fopen("denoised.yuv", "ab");
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  yuv_skinmap_file = fopen("skinmap.yuv", "wb");
+#endif
 
 #if 0
     framepsnr = fopen("framepsnr.stt", "a");
@@ -1966,12 +2012,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
     cpi->mb.rd_thresh_mult[i] = 128;
   }
 
-#ifdef VP8_ENTROPY_STATS
-  init_mv_ref_counts();
-#endif
-
 #if CONFIG_MULTITHREAD
   if (vp8cx_create_encoder_threads(cpi)) {
+    cpi->common.error.setjmp = 0;
     vp8_remove_compressor(&cpi);
     return 0;
   }
@@ -1980,39 +2023,29 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
   cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
   cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
-  cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
-  cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
   cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
 
   cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
   cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
   cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
-  cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
-  cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
   cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
 
   cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
   cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
   cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
-  cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
-  cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
   cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
 
   cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
   cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
   cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
-  cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
-  cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
   cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
 
   cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
   cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
   cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
-  cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
-  cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
   cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
   cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn;
   cpi->fn_ptr[BLOCK_16X8].copymem = vp8_copy32xn;
   cpi->fn_ptr[BLOCK_8X16].copymem = vp8_copy32xn;
@@ -2020,7 +2053,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn;
 #endif
 
-  cpi->full_search_sad = vp8_full_search_sad;
   cpi->diamond_search_sad = vp8_diamond_search_sad;
   cpi->refining_search_sad = vp8_refining_search_sad;
 
@@ -2036,8 +2068,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
 
   vp8_loop_filter_init(cm);
 
-  cpi->common.error.setjmp = 0;
-
 #if CONFIG_MULTI_RES_ENCODING
 
   /* Calculate # of MBs in a row in lower-resolution level image. */
@@ -2064,11 +2094,13 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) {
   vp8_setup_block_ptrs(&cpi->mb);
   vp8_setup_block_dptrs(&cpi->mb.e_mbd);
 
+  cpi->common.error.setjmp = 0;
+
   return cpi;
 }
 
-void vp8_remove_compressor(VP8_COMP **ptr) {
-  VP8_COMP *cpi = *ptr;
+void vp8_remove_compressor(VP8_COMP **comp) {
+  VP8_COMP *cpi = *comp;
 
   if (!cpi) return;
 
@@ -2081,12 +2113,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
 
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-    print_context_counters();
-    print_tree_update_probs();
-    print_mode_context();
-#endif
-
 #if CONFIG_INTERNAL_STATS
 
     if (cpi->pass != 1) {
@@ -2094,11 +2120,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
       double time_encoded =
           (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
           10000000.000;
-      double total_encode_time =
-          (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
-      double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
-      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
-      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
       if (cpi->b_calculate_psnr) {
         if (cpi->oxcf.number_of_layers > 1) {
@@ -2127,6 +2148,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
                     total_psnr2, total_ssim);
           }
         } else {
+          double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
           double samples =
               3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height;
           double total_psnr =
@@ -2175,8 +2197,8 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
     {
       extern int count_mb_seg[4];
       FILE *f = fopen("modes.stt", "a");
-      double dr = (double)cpi->framerate * (double)bytes * (double)8 /
-                  (double)count / (double)1000;
+      double dr = cpi->framerate * (double)bytes * (double)8 / (double)count /
+                  (double)1000;
       fprintf(f, "intra_mode in Intra Frames:\n");
       fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1],
               y_modes[2], y_modes[3], y_modes[4]);
@@ -2217,40 +2239,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
     }
 #endif
 
-#ifdef VP8_ENTROPY_STATS
-    {
-      int i, j, k;
-      FILE *fmode = fopen("modecontext.c", "w");
-
-      fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
-      fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts ");
-      fprintf(fmode,
-              "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n");
-
-      for (i = 0; i < 10; ++i) {
-        fprintf(fmode, "    { /* Above Mode :  %d */\n", i);
-
-        for (j = 0; j < 10; ++j) {
-          fprintf(fmode, "        {");
-
-          for (k = 0; k < 10; ++k) {
-            if (!intra_mode_stats[i][j][k])
-              fprintf(fmode, " %5d, ", 1);
-            else
-              fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
-          }
-
-          fprintf(fmode, "}, /* left_mode %d */\n", j);
-        }
-
-        fprintf(fmode, "    },\n");
-      }
-
-      fprintf(fmode, "};\n");
-      fclose(fmode);
-    }
-#endif
-
 #if defined(SECTIONBITS_OUTPUT)
 
     if (0) {
@@ -2268,7 +2256,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
 #if 0
         {
             printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-            printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+            printf("\n_frames receive_data encod_mb_row compress_frame  Total\n");
             printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
         }
 #endif
@@ -2284,13 +2272,14 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
   dealloc_compressor_data(cpi);
   vpx_free(cpi->mb.ss);
   vpx_free(cpi->tok);
+  vpx_free(cpi->skin_map);
   vpx_free(cpi->cyclic_refresh_map);
   vpx_free(cpi->consec_zero_last);
   vpx_free(cpi->consec_zero_last_mvbias);
 
   vp8_remove_common(&cpi->common);
   vpx_free(cpi);
-  *ptr = 0;
+  *comp = 0;
 
 #ifdef OUTPUT_YUV_SRC
   fclose(yuv_file);
@@ -2298,6 +2287,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) {
 #ifdef OUTPUT_YUV_DENOISED
   fclose(yuv_denoised_file);
 #endif
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
 
 #if 0
 
@@ -2425,6 +2417,7 @@ int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) {
 
   if (ref_frame_flags & VP8_ALTR_FRAME) cpi->common.refresh_alt_ref_frame = 1;
 
+  cpi->ext_refresh_frame_flags_pending = 1;
   return 0;
 }
 
@@ -2474,42 +2467,13 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) {
   return 0;
 }
 
-#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
-void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
-  unsigned char *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1, yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
 static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) {
   VP8_COMMON *cm = &cpi->common;
 
   /* are we resizing the image */
   if (cm->horiz_scale != 0 || cm->vert_scale != 0) {
 #if CONFIG_SPATIAL_RESAMPLING
-    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+    int hr, hs, vr, vs;
     int tmp_height;
 
     if (cm->vert_scale == 3) {
@@ -2542,8 +2506,7 @@ static int resize_key_frame(VP8_COMP *cpi) {
    */
   if (cpi->oxcf.allow_spatial_resampling &&
       (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) {
-    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+    int hr, hs, vr, vs;
     int new_width, new_height;
 
     /* If we are below the resample DOWN watermark then scale down a
@@ -2552,15 +2515,17 @@ static int resize_key_frame(VP8_COMP *cpi) {
     if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark *
                              cpi->oxcf.optimal_buffer_level / 100)) {
       cm->horiz_scale =
-          (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
-      cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
+          (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO;
+      cm->vert_scale =
+          (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO;
     }
     /* Should we now start scaling back up */
     else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark *
                                   cpi->oxcf.optimal_buffer_level / 100)) {
       cm->horiz_scale =
-          (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
-      cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
+          (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL;
+      cm->vert_scale =
+          (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL;
     }
 
     /* Get the new height and width */
@@ -2791,7 +2756,7 @@ static int decide_key_frame(VP8_COMP *cpi) {
   }
   /* in addition if the following are true and this is not a golden frame
    * then code a key frame Note that on golden frames there often seems
-   * to be a pop in intra useage anyway hence this restriction is
+   * to be a pop in intra usage anyway hence this restriction is
    * designed to prevent spurious key frames. The Intra pop needs to be
    * investigated.
    */
@@ -2810,13 +2775,8 @@ static int decide_key_frame(VP8_COMP *cpi) {
   return code_key_frame;
 }
 
-static void Pass1Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
-                        unsigned int *frame_flags) {
-  (void)size;
-  (void)dest;
-  (void)frame_flags;
+static void Pass1Encode(VP8_COMP *cpi) {
   vp8_set_quantizer(cpi, 26);
-
   vp8_first_pass(cpi);
 }
 #endif
@@ -2853,7 +2813,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
     fclose(yframe);
 }
 #endif
-/* return of 0 means drop frame */
 
 #if !CONFIG_REALTIME_ONLY
 /* Function to test for conditions that indeicate we should loop
@@ -2916,8 +2875,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
 
     cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
     cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-  } else /* For non key frames */
-  {
+  } else {
     if (cm->refresh_alt_ref_frame) {
       assert(!cm->copy_buffer_to_arf);
 
@@ -2938,8 +2896,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
           cpi->current_ref_frames[ALTREF_FRAME] =
               cpi->current_ref_frames[LAST_FRAME];
         }
-      } else /* if (cm->copy_buffer_to_arf == 2) */
-      {
+      } else {
         if (cm->alt_fb_idx != cm->gld_fb_idx) {
           yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME;
           yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
@@ -2971,8 +2928,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
           cpi->current_ref_frames[GOLDEN_FRAME] =
               cpi->current_ref_frames[LAST_FRAME];
         }
-      } else /* if (cm->copy_buffer_to_gf == 2) */
-      {
+      } else {
         if (cm->alt_fb_idx != cm->gld_fb_idx) {
           yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME;
           yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
@@ -3003,8 +2959,7 @@ static void update_reference_frames(VP8_COMP *cpi) {
       int i;
       for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
         vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]);
-    } else /* For non key frames */
-    {
+    } else {
       vp8_yv12_extend_frame_borders(
           &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
 
@@ -3059,6 +3014,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
   }
   // Only return non-zero if we have at least ~1/16 samples for estimate.
   if (num_blocks > (tot_num_blocks >> 4)) {
+    assert(num_blocks != 0);
     return (Total / num_blocks);
   } else {
     return 0;
@@ -3191,11 +3147,15 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
   if (cm->no_lpf) {
     cm->filter_level = 0;
   } else {
+#if CONFIG_INTERNAL_STATS
     struct vpx_usec_timer timer;
+#endif
 
     vpx_clear_system_state();
 
+#if CONFIG_INTERNAL_STATS
     vpx_usec_timer_start(&timer);
+#endif
     if (cpi->sf.auto_filter == 0) {
 #if CONFIG_TEMPORAL_DENOISING
       if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
@@ -3230,13 +3190,16 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
       vp8cx_set_alt_lf_level(cpi, cm->filter_level);
     }
 
+#if CONFIG_INTERNAL_STATS
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+#endif
   }
 
 #if CONFIG_MULTITHREAD
-  if (cpi->b_multi_threaded) {
-    sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
+    /* signal that we have set filter_level */
+    vp8_sem_post(&cpi->h_event_end_lpf);
   }
 #endif
 
@@ -3248,6 +3211,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
 
   vp8_yv12_extend_frame_borders(cm->frame_to_show);
 }
+// Return 1 if frame is to be dropped. Update frame drop decimation
+// counters.
+int vp8_check_drop_buffer(VP8_COMP *cpi) {
+  VP8_COMMON *cm = &cpi->common;
+  int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
+                        cpi->oxcf.optimal_buffer_level / 100);
+  int drop_mark75 = drop_mark * 2 / 3;
+  int drop_mark50 = drop_mark / 4;
+  int drop_mark25 = drop_mark / 8;
+  if (cpi->drop_frames_allowed) {
+    /* The reset to decimation 0 is only done here for one pass.
+     * Once it is set two pass leaves decimation on till the next kf.
+     */
+    if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) {
+      cpi->decimation_factor--;
+    }
+
+    if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) {
+      cpi->decimation_factor = 1;
+
+    } else if (cpi->buffer_level < drop_mark25 &&
+               (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) {
+      cpi->decimation_factor = 3;
+    } else if (cpi->buffer_level < drop_mark50 &&
+               (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) {
+      cpi->decimation_factor = 2;
+    } else if (cpi->buffer_level < drop_mark75 &&
+               (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) {
+      cpi->decimation_factor = 1;
+    }
+  }
+
+  /* The following decimates the frame rate according to a regular
+   * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help
+   * prevent buffer under-run in CBR mode. Alternatively it might be
+   * desirable in some situations to drop frame rate but throw more bits
+   * at each frame.
+   *
+   * Note that dropping a key frame can be problematic if spatial
+   * resampling is also active
+   */
+  if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) {
+    switch (cpi->decimation_factor) {
+      case 1:
+        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
+        break;
+      case 2:
+        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+        break;
+      case 3:
+        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+        break;
+    }
+
+    /* Note that we should not throw out a key frame (especially when
+     * spatial resampling is enabled).
+     */
+    if (cm->frame_type == KEY_FRAME) {
+      cpi->decimation_count = cpi->decimation_factor;
+    } else if (cpi->decimation_count > 0) {
+      cpi->decimation_count--;
+
+      cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+      if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) {
+        cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+      }
+
+#if CONFIG_MULTI_RES_ENCODING
+      vp8_store_drop_frame_info(cpi);
+#endif
+
+      cm->current_video_frame++;
+      cpi->frames_since_key++;
+      cpi->ext_refresh_frame_flags_pending = 0;
+      // We advance the temporal pattern for dropped frames.
+      cpi->temporal_pattern_counter++;
+
+#if CONFIG_INTERNAL_STATS
+      cpi->count++;
+#endif
+
+      cpi->buffer_level = cpi->bits_off_target;
+
+      if (cpi->oxcf.number_of_layers > 1) {
+        unsigned int i;
+
+        /* Propagate bits saved by dropping the frame to higher
+         * layers
+         */
+        for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
+          LAYER_CONTEXT *lc = &cpi->layer_context[i];
+          lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
+          if (lc->bits_off_target > lc->maximum_buffer_size) {
+            lc->bits_off_target = lc->maximum_buffer_size;
+          }
+          lc->buffer_level = lc->bits_off_target;
+        }
+      }
+      return 1;
+    } else {
+      cpi->decimation_count = cpi->decimation_factor;
+    }
+  } else {
+    cpi->decimation_count = 0;
+  }
+  return 0;
+}
 
 static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
                                       unsigned char *dest,
@@ -3258,7 +3328,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   int frame_under_shoot_limit;
 
   int Loop = 0;
-  int loop_count;
 
   VP8_COMMON *cm = &cpi->common;
   int active_worst_qchanged = 0;
@@ -3274,12 +3343,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   int undershoot_seen = 0;
 #endif
 
-  int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
-                        cpi->oxcf.optimal_buffer_level / 100);
-  int drop_mark75 = drop_mark * 2 / 3;
-  int drop_mark50 = drop_mark / 4;
-  int drop_mark25 = drop_mark / 8;
-
   /* Clear down mmx registers to allow floating point in what follows */
   vpx_clear_system_state();
 
@@ -3303,10 +3366,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
       }
       break;
 #endif  // !CONFIG_REALTIME_ONLY
-    default:
-      cpi->per_frame_bandwidth =
-          (int)(cpi->target_bandwidth / cpi->output_framerate);
+    default: {
+      const double per_frame_bandwidth =
+          round(cpi->target_bandwidth / cpi->output_framerate);
+      cpi->per_frame_bandwidth = (int)VPXMIN(per_frame_bandwidth, INT_MAX);
       break;
+    }
   }
 
   /* Default turn off buffer to buffer copying */
@@ -3358,11 +3423,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
         (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info;
 
     if (cpi->oxcf.mr_encoder_id) {
-      // TODO(marpan): This constraint shouldn't be needed, as we would like
-      // to allow for key frame setting (forced or periodic) defined per
-      // spatial layer. For now, keep this in.
-      cm->frame_type = low_res_frame_info->frame_type;
-
       // Check if lower resolution is available for motion vector reuse.
       if (cm->frame_type != KEY_FRAME) {
         cpi->mr_low_res_mv_avail = 1;
@@ -3387,7 +3447,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
                      == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
         */
       }
+      // Disable motion vector reuse (i.e., disable any usage of the low_res)
+      // if the previous lower stream is skipped/disabled.
+      if (low_res_frame_info->skip_encoding_prev_stream) {
+        cpi->mr_low_res_mv_avail = 0;
+      }
     }
+    // This stream is not skipped (i.e., it's being encoded), so set this skip
+    // flag to 0. This is needed for the next stream (i.e., which is the next
+    // frame to be encoded).
+    low_res_frame_info->skip_encoding_prev_stream = 0;
 
     // On a key frame: For the lowest resolution, keep track of the key frame
     // counter value. For the higher resolutions, reset the current video
@@ -3489,101 +3558,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
   update_rd_ref_frame_probs(cpi);
 
-  if (cpi->drop_frames_allowed) {
-    /* The reset to decimation 0 is only done here for one pass.
-     * Once it is set two pass leaves decimation on till the next kf.
-     */
-    if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) {
-      cpi->decimation_factor--;
-    }
-
-    if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) {
-      cpi->decimation_factor = 1;
-
-    } else if (cpi->buffer_level < drop_mark25 &&
-               (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) {
-      cpi->decimation_factor = 3;
-    } else if (cpi->buffer_level < drop_mark50 &&
-               (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) {
-      cpi->decimation_factor = 2;
-    } else if (cpi->buffer_level < drop_mark75 &&
-               (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) {
-      cpi->decimation_factor = 1;
-    }
-  }
-
-  /* The following decimates the frame rate according to a regular
-   * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help
-   * prevent buffer under-run in CBR mode. Alternatively it might be
-   * desirable in some situations to drop frame rate but throw more bits
-   * at each frame.
-   *
-   * Note that dropping a key frame can be problematic if spatial
-   * resampling is also active
-   */
-  if (cpi->decimation_factor > 0) {
-    switch (cpi->decimation_factor) {
-      case 1:
-        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
-        break;
-      case 2:
-        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
-        break;
-      case 3:
-        cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
-        break;
-    }
-
-    /* Note that we should not throw out a key frame (especially when
-     * spatial resampling is enabled).
-     */
-    if (cm->frame_type == KEY_FRAME) {
-      cpi->decimation_count = cpi->decimation_factor;
-    } else if (cpi->decimation_count > 0) {
-      cpi->decimation_count--;
-
-      cpi->bits_off_target += cpi->av_per_frame_bandwidth;
-      if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) {
-        cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-      }
-
-#if CONFIG_MULTI_RES_ENCODING
-      vp8_store_drop_frame_info(cpi);
-#endif
-
-      cm->current_video_frame++;
-      cpi->frames_since_key++;
-      // We advance the temporal pattern for dropped frames.
-      cpi->temporal_pattern_counter++;
-
-#if CONFIG_INTERNAL_STATS
-      cpi->count++;
-#endif
-
-      cpi->buffer_level = cpi->bits_off_target;
-
-      if (cpi->oxcf.number_of_layers > 1) {
-        unsigned int i;
-
-        /* Propagate bits saved by dropping the frame to higher
-         * layers
-         */
-        for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
-          LAYER_CONTEXT *lc = &cpi->layer_context[i];
-          lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
-          if (lc->bits_off_target > lc->maximum_buffer_size) {
-            lc->bits_off_target = lc->maximum_buffer_size;
-          }
-          lc->buffer_level = lc->bits_off_target;
-        }
-      }
-
-      return;
-    } else {
-      cpi->decimation_count = cpi->decimation_factor;
-    }
-  } else {
-    cpi->decimation_count = 0;
+  if (vp8_check_drop_buffer(cpi)) {
+    return;
   }
 
   /* Decide how big to make the frame */
@@ -3594,6 +3570,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #endif
     cm->current_video_frame++;
     cpi->frames_since_key++;
+    cpi->ext_refresh_frame_flags_pending = 0;
     // We advance the temporal pattern for dropped frames.
     cpi->temporal_pattern_counter++;
     return;
@@ -3659,7 +3636,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
         if (cpi->this_key_frame_forced) {
           if (cpi->active_best_quality > cpi->avg_frame_qindex * 7 / 8) {
             cpi->active_best_quality = cpi->avg_frame_qindex * 7 / 8;
-          } else if (cpi->active_best_quality<cpi->avg_frame_qindex>> 2) {
+          } else if (cpi->active_best_quality < (cpi->avg_frame_qindex >> 2)) {
             cpi->active_best_quality = cpi->avg_frame_qindex >> 2;
           }
         }
@@ -3681,7 +3658,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
         Q = cpi->avg_frame_qindex;
       }
 
-      /* For constrained quality dont allow Q less than the cq level */
+      /* For constrained quality don't allow Q less than the cq level */
       if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
           (Q < cpi->cq_target_quality)) {
         Q = cpi->cq_target_quality;
@@ -3708,7 +3685,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     } else {
       cpi->active_best_quality = inter_minq[Q];
 
-      /* For the constant/constrained quality mode we dont want
+      /* For the constant/constrained quality mode we don't want
        * q to fall below the cq level.
        */
       if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
@@ -3729,7 +3706,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
      * higher quality on the frames to prevent bits just going to waste.
      */
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
-      /* Note that the use of >= here elliminates the risk of a devide
+      /* Note that the use of >= here elliminates the risk of a divide
        * by 0 error in the else if clause
        */
       if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) {
@@ -3789,9 +3766,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   }
 #endif
 
+  compute_skin_map(cpi);
+
   /* Setup background Q adjustment for error resilient mode.
    * For multi-layer encodes only enable this for the base layer.
-  */
+   */
   if (cpi->cyclic_refresh_mode_enabled) {
     // Special case for screen_content_mode with golden frame updates.
     int disable_cr_gf =
@@ -3816,8 +3795,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
   vp8_save_coding_context(cpi);
 
-  loop_count = 0;
-
   scale_and_extend_source(cpi->un_scaled_source, cpi);
 
 #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC
@@ -3825,8 +3802,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   // (temporal denoising) mode.
   if (cpi->oxcf.noise_sensitivity >= 3) {
     if (cpi->denoiser.denoise_pars.spatial_blur != 0) {
-      vp8_de_noise(cm, cpi->Source, cpi->Source,
-                   cpi->denoiser.denoise_pars.spatial_blur, 1, 0, 0);
+      vp8_de_noise(cm, cpi->Source, cpi->denoiser.denoise_pars.spatial_blur, 1);
     }
   }
 #endif
@@ -3847,9 +3823,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     }
 
     if (cm->frame_type == KEY_FRAME) {
-      vp8_de_noise(cm, cpi->Source, cpi->Source, l, 1, 0, 1);
+      vp8_de_noise(cm, cpi->Source, l, 1);
     } else {
-      vp8_de_noise(cm, cpi->Source, cpi->Source, l, 1, 0, 1);
+      vp8_de_noise(cm, cpi->Source, l, 1);
 
       src = cpi->Source->y_buffer;
 
@@ -3862,7 +3838,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #endif
 
 #ifdef OUTPUT_YUV_SRC
-  vp8_write_yuv_frame(yuv_file, cpi->Source);
+  vpx_write_yuv_frame(yuv_file, cpi->Source);
 #endif
 
   do {
@@ -3973,7 +3949,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
       if (cm->refresh_entropy_probs == 0) {
         /* save a copy for later refresh */
-        memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+        cm->lfc = cm->fc;
       }
 
       vp8_update_coef_context(cpi);
@@ -3990,8 +3966,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #else
     /* transform / motion compensation build reconstruction frame */
     vp8_encode_frame(cpi);
-    if (cpi->oxcf.screen_content_mode == 2) {
-      if (vp8_drop_encodedframe_overshoot(cpi, Q)) return;
+
+    if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+        cpi->rt_drop_recode_on_overshoot == 1) {
+      if (vp8_drop_encodedframe_overshoot(cpi, Q)) {
+        vpx_clear_system_state();
+        return;
+      }
+      if (cm->frame_type != KEY_FRAME)
+        cpi->last_pred_err_mb =
+            (int)(cpi->mb.prediction_error / cpi->common.MBs);
     }
 
     cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
@@ -4034,7 +4018,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
         q_low = cpi->active_best_quality;
         q_high = cpi->active_worst_quality;
 
-        loop_count++;
         Loop = 1;
 
         continue;
@@ -4047,7 +4030,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
     /* Are we are overshooting and up against the limit of active max Q. */
-    if (((cpi->pass != 2) ||
+    if (!cpi->rt_always_update_correction_factor &&
+        ((cpi->pass != 2) ||
          (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
         (Q == cpi->active_worst_quality) &&
         (cpi->active_worst_quality < cpi->worst_quality) &&
@@ -4066,9 +4050,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #if !CONFIG_REALTIME_ONLY
       top_index = cpi->active_worst_quality;
 #endif  // !CONFIG_REALTIME_ONLY
-        /* If we have updated the active max Q do not call
-         * vp8_update_rate_correction_factors() this loop.
-         */
+      /* If we have updated the active max Q do not call
+       * vp8_update_rate_correction_factors() this loop.
+       */
       active_worst_qchanged = 1;
     } else {
       active_worst_qchanged = 0;
@@ -4244,11 +4228,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
       }
 
       /* Clamp cpi->zbin_over_quant */
-      cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low)
-                                    ? zbin_oq_low
-                                    : (cpi->mb.zbin_over_quant > zbin_oq_high)
-                                          ? zbin_oq_high
-                                          : cpi->mb.zbin_over_quant;
+      cpi->mb.zbin_over_quant =
+          (cpi->mb.zbin_over_quant < zbin_oq_low)    ? zbin_oq_low
+          : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high
+                                                     : cpi->mb.zbin_over_quant;
 
       Loop = Q != last_q;
     } else {
@@ -4260,7 +4243,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
     if (Loop == 1) {
       vp8_restore_coding_context(cpi);
-      loop_count++;
 #if CONFIG_INTERNAL_STATS
       cpi->tot_recode_hits++;
 #endif
@@ -4274,6 +4256,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     cpi->common.current_video_frame++;
     cpi->frames_since_key++;
     cpi->drop_frame_count++;
+    cpi->ext_refresh_frame_flags_pending = 0;
     // We advance the temporal pattern for dropped frames.
     cpi->temporal_pattern_counter++;
     return;
@@ -4361,12 +4344,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   vp8_cal_dissimilarity(cpi);
 #endif
 
-  /* Update the GF useage maps.
+  /* Update the GF usage maps.
    * This is done after completing the compression of a frame when all
    * modes etc. are finalized but before loop filter
    */
   if (cpi->oxcf.number_of_layers == 1) {
-    vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+    vp8_update_gf_usage_maps(cpi, cm, &cpi->mb);
   }
 
   if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1;
@@ -4382,8 +4365,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   /* For inter frames the current default behavior is that when
    * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
    * This is purely an encoder decision at present.
+   * Avoid this behavior when refresh flags are set by the user.
    */
-  if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame) {
+  if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame &&
+      !cpi->ext_refresh_frame_flags_pending) {
     cm->copy_buffer_to_arf = 2;
   } else {
     cm->copy_buffer_to_arf = 0;
@@ -4422,11 +4407,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   }
 #endif
 
+#ifdef OUTPUT_YUV_SKINMAP
+  if (cpi->common.current_video_frame > 1) {
+    vp8_compute_skin_map(cpi, yuv_skinmap_file);
+  }
+#endif
+
 #if CONFIG_MULTITHREAD
-  if (cpi->b_multi_threaded) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
     /* start loopfilter in separate thread */
-    sem_post(&cpi->h_event_start_lpf);
+    vp8_sem_post(&cpi->h_event_start_lpf);
     cpi->b_lpf_running = 1;
+    /* wait for the filter_level to be picked so that we can continue with
+     * stream packing */
+    vp8_sem_wait(&cpi->h_event_end_lpf);
   } else
 #endif
   {
@@ -4436,7 +4430,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   update_reference_frames(cpi);
 
 #ifdef OUTPUT_YUV_DENOISED
-  vp8_write_yuv_frame(yuv_denoised_file,
+  vpx_write_yuv_frame(yuv_denoised_file,
                       &cpi->denoiser.yv12_running_avg[INTRA_FRAME]);
 #endif
 
@@ -4446,12 +4440,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   }
 #endif
 
-#if CONFIG_MULTITHREAD
-  /* wait that filter_level is picked so that we can continue with stream
-   * packing */
-  if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf);
-#endif
-
   /* build the bitstream */
   vp8_pack_bitstream(cpi, dest, dest_end, size);
 
@@ -4518,7 +4506,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
        * size within range) then use the last frame value - 1. The -1
        * is designed to stop Q and hence the data rate, from
        * progressively falling away during difficult sections, but at
-       * the same time reduce the number of itterations around the
+       * the same time reduce the number of iterations around the
        * recode loop.
        */
       if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1;
@@ -4539,10 +4527,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
 
-  // If the frame dropper is not enabled, don't let the buffer level go below
-  // some threshold, given here by -|maximum_buffer_size|. For now we only do
-  // this for screen content input.
-  if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+  // Don't let the buffer level go below some threshold, given here
+  // by -|maximum_buffer_size|. For now we only do this for
+  // screen content input.
+  if (cpi->oxcf.screen_content_mode &&
       cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) {
     cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
   }
@@ -4550,22 +4538,24 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   /* Rolling monitors of whether we are over or underspending used to
    * help regulate min and Max Q in two pass.
    */
-  cpi->rolling_target_bits =
-      ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
-  cpi->rolling_actual_bits =
-      ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
-  cpi->long_rolling_target_bits =
-      ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
-  cpi->long_rolling_actual_bits =
-      ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) /
-      32;
+  cpi->rolling_target_bits = (int)ROUND64_POWER_OF_TWO(
+      (int64_t)cpi->rolling_target_bits * 3 + cpi->this_frame_target, 2);
+  cpi->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO(
+      (int64_t)cpi->rolling_actual_bits * 3 + cpi->projected_frame_size, 2);
+  cpi->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO(
+      (int64_t)cpi->long_rolling_target_bits * 31 + cpi->this_frame_target, 5);
+  cpi->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO(
+      (int64_t)cpi->long_rolling_actual_bits * 31 + cpi->projected_frame_size,
+      5);
 
   /* Actual bits spent */
   cpi->total_actual_bits += cpi->projected_frame_size;
 
+#if 0 && CONFIG_INTERNAL_STATS
   /* Debug stats */
   cpi->total_target_vs_actual +=
       (cpi->this_frame_target - cpi->projected_frame_size);
+#endif
 
   cpi->buffer_level = cpi->bits_off_target;
 
@@ -4575,8 +4565,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
     for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
       LAYER_CONTEXT *lc = &cpi->layer_context[i];
-      int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
-                                          cpi->projected_frame_size);
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi->projected_frame_size);
 
       lc->bits_off_target += bits_off_for_this_layer;
 
@@ -4687,6 +4677,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 
 #endif
 
+  cpi->ext_refresh_frame_flags_pending = 0;
+
   if (cm->refresh_golden_frame == 1) {
     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
   } else {
@@ -4761,7 +4753,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
   cpi->mb.e_mbd.update_mb_segmentation_data = 0;
   cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
 
-  /* Dont increment frame counters if this was an altref buffer update
+  /* Don't increment frame counters if this was an altref buffer update
    * not a real frame
    */
   if (cm->show_frame) {
@@ -4770,8 +4762,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
     cpi->temporal_pattern_counter++;
   }
 
-/* reset to normal state now that we are done. */
-
 #if 0
     {
         char filename[512];
@@ -4785,7 +4775,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #endif
 
   /* DEBUG */
-  /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
+  /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
 }
 #if !CONFIG_REALTIME_ONLY
 static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
@@ -4807,10 +4797,14 @@ static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest,
 int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
+#if CONFIG_INTERNAL_STATS
   struct vpx_usec_timer timer;
+#endif
   int res = 0;
 
+#if CONFIG_INTERNAL_STATS
   vpx_usec_timer_start(&timer);
+#endif
 
   /* Reinit the lookahead buffer if the frame size changes */
   if (sd->y_width != cpi->oxcf.Width || sd->y_height != cpi->oxcf.Height) {
@@ -4823,8 +4817,10 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL)) {
     res = -1;
   }
+#if CONFIG_INTERNAL_STATS
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+#endif
 
   return res;
 }
@@ -4845,24 +4841,19 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
                             unsigned char *dest_end, int64_t *time_stamp,
                             int64_t *time_end, int flush) {
   VP8_COMMON *cm;
-  struct vpx_usec_timer tsctimer;
   struct vpx_usec_timer ticktimer;
+#if CONFIG_INTERNAL_STATS
   struct vpx_usec_timer cmptimer;
+#endif
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
 
   if (!cpi) return -1;
 
   cm = &cpi->common;
 
-  if (setjmp(cpi->common.error.jmp)) {
-    cpi->common.error.setjmp = 0;
-    vpx_clear_system_state();
-    return VPX_CODEC_CORRUPT_FRAME;
-  }
-
-  cpi->common.error.setjmp = 1;
-
+#if CONFIG_INTERNAL_STATS
   vpx_usec_timer_start(&cmptimer);
+#endif
 
   cpi->source = NULL;
 
@@ -4950,6 +4941,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 
       this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
       last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+      // Cap this to avoid overflow of (this_duration - last_duration) * 10
+      this_duration = VPXMIN(this_duration, INT64_MAX / 10);
       /* do a step update if the duration changes by 10% */
       if (last_duration) {
         step = (int)(((this_duration - last_duration) * 10 / last_duration));
@@ -4987,13 +4980,17 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
         // be received for that high layer, which will yield an incorrect
         // frame rate (from time-stamp adjustment in above calculation).
         if (cpi->oxcf.mr_encoder_id) {
-          cpi->ref_framerate = low_res_frame_info->low_res_framerate;
+          if (!low_res_frame_info->skip_encoding_base_stream)
+            cpi->ref_framerate = low_res_frame_info->low_res_framerate;
         } else {
           // Keep track of frame rate for lowest resolution.
           low_res_frame_info->low_res_framerate = cpi->ref_framerate;
+          // The base stream is being encoded so set skip flag to 0.
+          low_res_frame_info->skip_encoding_base_stream = 0;
         }
       }
 #endif
+      cpi->ref_framerate = clamp_framerate(cpi->ref_framerate);
       if (cpi->oxcf.number_of_layers > 1) {
         unsigned int i;
 
@@ -5016,7 +5013,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   if (cpi->oxcf.number_of_layers > 1) {
     int layer;
 
-    update_layer_contexts(cpi);
+    vp8_update_layer_contexts(cpi);
 
     /* Restore layer specific context & set frame rate */
     if (cpi->temporal_layer_id >= 0) {
@@ -5026,12 +5023,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
           cpi->oxcf
               .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
     }
-    restore_layer_context(cpi, layer);
+    vp8_restore_layer_context(cpi, layer);
     vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
   }
 
   if (cpi->compressor_speed == 2) {
-    vpx_usec_timer_start(&tsctimer);
     vpx_usec_timer_start(&ticktimer);
   }
 
@@ -5096,7 +5092,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   }
   switch (cpi->pass) {
 #if !CONFIG_REALTIME_ONLY
-    case 1: Pass1Encode(cpi, size, dest, frame_flags); break;
+    case 1: Pass1Encode(cpi); break;
     case 2: Pass2Encode(cpi, size, dest, dest_end, frame_flags); break;
 #endif  // !CONFIG_REALTIME_ONLY
     default:
@@ -5106,7 +5102,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 
   if (cpi->compressor_speed == 2) {
     unsigned int duration, duration2;
-    vpx_usec_timer_mark(&tsctimer);
     vpx_usec_timer_mark(&ticktimer);
 
     duration = (int)(vpx_usec_timer_elapsed(&ticktimer));
@@ -5133,18 +5128,18 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   }
 
   if (cm->refresh_entropy_probs == 0) {
-    memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+    cm->fc = cm->lfc;
   }
 
   /* Save the contexts separately for alt ref, gold and last. */
   /* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */
-  if (cm->refresh_alt_ref_frame) memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+  if (cm->refresh_alt_ref_frame) cpi->lfc_a = cm->fc;
 
-  if (cm->refresh_golden_frame) memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+  if (cm->refresh_golden_frame) cpi->lfc_g = cm->fc;
 
-  if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+  if (cm->refresh_last_frame) cpi->lfc_n = cm->fc;
 
-  /* if its a dropped frame honor the requests on subsequent frames */
+  /* if it's a dropped frame honor the requests on subsequent frames */
   if (*size > 0) {
     cpi->droppable = !frame_is_reference(cpi);
 
@@ -5157,10 +5152,20 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
   }
 
   /* Save layer specific state */
-  if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi);
+  if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi);
 
+#if CONFIG_INTERNAL_STATS
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+#endif
+
+#if CONFIG_MULTITHREAD
+  /* wait for the lpf thread done */
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
+    vp8_sem_wait(&cpi->h_event_end_lpf);
+    cpi->b_lpf_running = 0;
+  }
+#endif
 
   if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
     generate_psnr_packet(cpi);
@@ -5215,7 +5220,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
           double weight = 0;
 
           vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer,
-                      cm->filter_level * 10 / 6, 1, 0);
+                      cm->filter_level * 10 / 6);
           vpx_clear_system_state();
 
           ye = calc_plane_error(orig->y_buffer, orig->y_stride, pp->y_buffer,
@@ -5289,16 +5294,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 #endif
 #endif
 
-  cpi->common.error.setjmp = 0;
-
-#if CONFIG_MULTITHREAD
-  /* wait for the lpf thread done */
-  if (cpi->b_multi_threaded && cpi->b_lpf_running) {
-    sem_wait(&cpi->h_event_end_lpf);
-    cpi->b_lpf_running = 0;
-  }
-#endif
-
   return 0;
 }
 
@@ -5339,28 +5334,25 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
   const int range = 63;
   int i;
 
-  // This method is currently incompatible with the cyclic refresh method
-  if (cpi->cyclic_refresh_mode_enabled) return -1;
-
   // Check number of rows and columns match
   if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) {
     return -1;
   }
 
-  // Range check the delta Q values and convert the external Q range values
-  // to internal ones.
-  if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
-      (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) {
-    return -1;
+  for (i = 0; i < MAX_MB_SEGMENTS; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range ||
+        delta_lf[i] < -range) {
+      return -1;
+    }
   }
 
-  // Range check the delta lf values
-  if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
-      (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) {
-    return -1;
-  }
-
-  if (!map) {
+  // Also disable segmentation if no deltas are specified.
+  if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 &&
+               delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 &&
+               delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 &&
+               threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) {
     disable_segmentation(cpi);
     return 0;
   }
@@ -5397,6 +5389,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
   /* Initialise the feature data structure */
   set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 
+  if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 ||
+      threshold[3] != 0)
+    cpi->use_roi_static_threshold = 1;
+  cpi->cyclic_refresh_mode_enabled = 0;
+
   return 0;
 }
 
@@ -5416,15 +5413,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
   }
 }
 
-int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode) {
-  if (horiz_mode <= ONETWO) {
+int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode) {
+  if (horiz_mode <= VP8E_ONETWO) {
     cpi->common.horiz_scale = horiz_mode;
   } else {
     return -1;
   }
 
-  if (vert_mode <= ONETWO) {
+  if (vert_mode <= VP8E_ONETWO) {
     cpi->common.vert_scale = vert_mode;
   } else {
     return -1;
diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
index fe775064a4..c2a137398e 100644
--- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h
+++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h
@@ -8,16 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_ONYX_INT_H_
-#define VP8_ENCODER_ONYX_INT_H_
+#ifndef VPX_VP8_ENCODER_ONYX_INT_H_
+#define VPX_VP8_ENCODER_ONYX_INT_H_
 
+#include <assert.h>
 #include <stdio.h>
+
 #include "vpx_config.h"
 #include "vp8/common/onyx.h"
 #include "treewriter.h"
 #include "tokenize.h"
 #include "vp8/common/onyxc_int.h"
 #include "vpx_dsp/variance.h"
+#include "vpx_util/vpx_pthread.h"
 #include "encodemb.h"
 #include "vp8/encoder/quantize.h"
 #include "vp8/common/entropy.h"
@@ -57,6 +60,9 @@ extern "C" {
 
 #define VP8_TEMPORAL_ALT_REF !CONFIG_REALTIME_ONLY
 
+/* vp8 uses 10,000,000 ticks/second as time stamp */
+#define TICKS_PER_SEC 10000000
+
 typedef struct {
   int kf_indicated;
   unsigned int frames_since_key;
@@ -210,7 +216,7 @@ enum {
 typedef struct {
   /* Layer configuration */
   double framerate;
-  int target_bandwidth;
+  int target_bandwidth; /* bits per second */
 
   /* Layer specific coding parameters */
   int64_t starting_buffer_level;
@@ -226,7 +232,7 @@ typedef struct {
   int64_t bits_off_target;
 
   int64_t total_actual_bits;
-  int total_target_vs_actual;
+  int64_t total_target_vs_actual;
 
   int worst_quality;
   int active_worst_quality;
@@ -249,10 +255,15 @@ typedef struct {
 
   int filter_level;
 
+  int frames_since_last_drop_overshoot;
+
+  int force_maxqp;
+
   int last_frame_percent_intra;
 
   int count_mb_ref_frame_usage[MAX_REF_FRAMES];
 
+  int last_q[2];
 } LAYER_CONTEXT;
 
 typedef struct VP8_COMP {
@@ -331,7 +342,7 @@ typedef struct VP8_COMP {
 
   CODING_CONTEXT coding_context;
 
-  /* Rate targetting variables */
+  /* Rate targeting variables */
   int64_t last_prediction_error;
   int64_t last_intra_error;
 
@@ -350,7 +361,7 @@ typedef struct VP8_COMP {
   /* GF interval chosen when we coded the last GF */
   int current_gf_interval;
 
-  /* Total bits overspent becasue of GF boost (cumulative) */
+  /* Total bits overspent because of GF boost (cumulative) */
   int gf_overspend_bits;
 
   /* Used in the few frames following a GF to recover the extra bits
@@ -402,7 +413,7 @@ typedef struct VP8_COMP {
   int long_rolling_actual_bits;
 
   int64_t total_actual_bits;
-  int total_target_vs_actual; /* debug stats */
+  int64_t total_target_vs_actual; /* debug stats */
 
   int worst_quality;
   int active_worst_quality;
@@ -428,7 +439,7 @@ typedef struct VP8_COMP {
   int kf_boost;
   int last_boost;
 
-  int target_bandwidth;
+  int target_bandwidth; /* bits per second */
   struct vpx_codec_pkt_list *output_pkt_list;
 
 #if 0
@@ -471,9 +482,11 @@ typedef struct VP8_COMP {
   int zeromv_count;
   int lf_zeromv_pct;
 
+  unsigned char *skin_map;
+
   unsigned char *segmentation_map;
   signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
-  int segment_encode_breakout[MAX_MB_SEGMENTS];
+  unsigned int segment_encode_breakout[MAX_MB_SEGMENTS];
 
   unsigned char *active_map;
   unsigned int active_map_enabled;
@@ -503,6 +516,8 @@ typedef struct VP8_COMP {
   int mse_source_denoised;
 
   int force_maxqp;
+  int frames_since_last_drop_overshoot;
+  int last_pred_err_mb;
 
   // GF update for 1 pass cbr.
   int gf_update_onepass_cbr;
@@ -511,11 +526,10 @@ typedef struct VP8_COMP {
 
 #if CONFIG_MULTITHREAD
   /* multithread data */
-  pthread_mutex_t *pmutex;
-  pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */
-  int *mt_current_mb_col;
+  vpx_atomic_int *mt_current_mb_col;
+  int mt_current_mb_col_size;
   int mt_sync_range;
-  int b_multi_threaded;
+  vpx_atomic_int b_multi_threaded;
   int encoding_thread_count;
   int b_lpf_running;
 
@@ -527,10 +541,10 @@ typedef struct VP8_COMP {
   LPFTHREAD_DATA lpf_thread_data;
 
   /* events */
-  sem_t *h_event_start_encoding;
-  sem_t *h_event_end_encoding;
-  sem_t h_event_start_lpf;
-  sem_t h_event_end_lpf;
+  vp8_sem_t *h_event_start_encoding;
+  vp8_sem_t *h_event_end_encoding;
+  vp8_sem_t h_event_start_lpf;
+  vp8_sem_t h_event_end_lpf;
 #endif
 
   TOKENLIST *tplist;
@@ -539,14 +553,15 @@ typedef struct VP8_COMP {
   unsigned char *partition_d_end[MAX_PARTITIONS];
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  vp8_full_search_fn_t full_search_sad;
   vp8_refining_search_fn_t refining_search_sad;
   vp8_diamond_search_fn_t diamond_search_sad;
   vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+#if CONFIG_INTERNAL_STATS
   uint64_t time_receive_data;
   uint64_t time_compress_data;
   uint64_t time_pick_lpf;
   uint64_t time_encode_mb_row;
+#endif
 
   int base_skip_false_prob[128];
 
@@ -610,7 +625,7 @@ typedef struct VP8_COMP {
   double totalp_v;
   double totalp;
   double total_sq_error2;
-  int bytes;
+  uint64_t bytes;
   double summed_quality;
   double summed_weights;
   unsigned int tot_recode_hits;
@@ -687,12 +702,33 @@ typedef struct VP8_COMP {
     int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                    [MAX_ENTROPY_TOKENS];
   } rd_costs;
+
+  // Use the static threshold from ROI settings.
+  int use_roi_static_threshold;
+
+  int ext_refresh_frame_flags_pending;
+
+  // Always update correction factor used for rate control after each frame for
+  // realtime encoding.
+  int rt_always_update_correction_factor;
+
+  // Flag to indicate frame may be dropped due to large expected overshoot,
+  // and re-encoded on next frame at max_qp.
+  int rt_drop_recode_on_overshoot;
 } VP8_COMP;
 
 void vp8_initialize_enc(void);
 
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int prev_num_layers);
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate);
+void vp8_update_layer_contexts(VP8_COMP *cpi);
+void vp8_save_layer_context(VP8_COMP *cpi);
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer);
 void vp8_new_framerate(VP8_COMP *cpi, double framerate);
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
@@ -703,26 +739,10 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
 
 void vp8_set_speed_features(VP8_COMP *cpi);
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(lval, expr)                                         \
-  do {                                                                      \
-    lval = (expr);                                                          \
-    if (!lval)                                                              \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,           \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(lval, expr)                               \
-  do {                                                            \
-    lval = (expr);                                                \
-    if (!lval)                                                    \
-      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);            \
-  } while (0)
-#endif
+int vp8_check_drop_buffer(VP8_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_ONYX_INT_H_
+#endif  // VPX_VP8_ENCODER_ONYX_INT_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/pickinter.c b/media/libvpx/libvpx/vp8/encoder/pickinter.c
index 7b68d35f50..ca6c18f48e 100644
--- a/media/libvpx/libvpx/vp8/encoder/pickinter.c
+++ b/media/libvpx/libvpx/vp8/encoder/pickinter.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <limits.h>
 #include "vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -24,6 +25,7 @@
 #include "vp8/common/reconintra4x4.h"
 #include "vpx_dsp/variance.h"
 #include "mcomp.h"
+#include "vp8/common/vp8_skin_detection.h"
 #include "rdopt.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
@@ -35,82 +37,9 @@
 extern unsigned int cnt_pm;
 #endif
 
-#define MODEL_MODE 1
-
 extern const int vp8_ref_frame_order[MAX_MODES];
 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
-// Fixed point implementation of a skin color classifier. Skin color
-// is model by a Gaussian distribution in the CbCr color space.
-// See ../../test/skin_color_detector_test.cc where the reference
-// skin color classifier is defined.
-
-// Fixed-point skin color model parameters.
-static const int skin_mean[5][2] = { { 7463, 9614 },
-                                     { 6400, 10240 },
-                                     { 7040, 10240 },
-                                     { 8320, 9280 },
-                                     { 6800, 9614 } };
-static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 };  // q16
-static const int skin_threshold[6] = { 1570636, 1400000, 800000,
-                                       800000,  800000,  800000 };  // q18
-
-// Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr, int idx) {
-  const int cb_q6 = cb << 6;
-  const int cr_q6 = cr << 6;
-  const int cb_diff_q12 =
-      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
-  const int cbcr_diff_q12 =
-      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
-  const int cr_diff_q12 =
-      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
-  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
-  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
-  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
-  const int skin_diff =
-      skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
-      skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
-  return skin_diff;
-}
-
-// Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr, int consec_zeromv) {
-  if (y < 40 || y > 220) {
-    return 0;
-  } else {
-    if (MODEL_MODE == 0) {
-      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
-    } else {
-      int i = 0;
-      // No skin if block has been zero motion for long consecutive time.
-      if (consec_zeromv > 60) return 0;
-      // Exit on grey.
-      if (cb == 128 && cr == 128) return 0;
-      // Exit on very strong cb.
-      if (cb > 150 && cr < 110) return 0;
-      for (; i < 5; ++i) {
-        int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
-        if (skin_color_diff < skin_threshold[i + 1]) {
-          if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
-            return 0;
-          } else if (consec_zeromv > 25 &&
-                     skin_color_diff > (skin_threshold[i + 1] >> 1)) {
-            return 0;
-          } else {
-            return 1;
-          }
-        }
-        // Exit if difference is much large than the threshold.
-        if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
-          return 0;
-        }
-      }
-      return 0;
-    }
-  }
-}
-
 static int macroblock_corner_grad(unsigned char *signal, int stride,
                                   int offsetx, int offsety, int sgnx,
                                   int sgny) {
@@ -206,8 +135,8 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
   (void)mvcost;
   (void)distortion;
   (void)sse;
-  bestmv->as_mv.row <<= 3;
-  bestmv->as_mv.col <<= 3;
+  bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX);
+  bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX);
   return 0;
 }
 
@@ -244,9 +173,8 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) {
 
 static int pick_intra4x4block(MACROBLOCK *x, int ib,
                               B_PREDICTION_MODE *best_mode,
-                              const int *mode_costs,
-
-                              int *bestrate, int *bestdistortion) {
+                              const int *mode_costs, int *bestrate,
+                              int *bestdistortion) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
   int dst_stride = x->e_mbd.dst.y_stride;
@@ -299,8 +227,8 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) {
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
 
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);
+    B_PREDICTION_MODE best_mode = B_MODE_COUNT;
+    int r = 0, d = 0;
 
     if (mb->e_mbd.frame_type == KEY_FRAME) {
       const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
@@ -313,6 +241,7 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) {
 
     cost += r;
     distortion += d;
+    assert(best_mode != B_MODE_COUNT);
     mic->bmi[i].as_mode = best_mode;
 
     /* Break out case where we have already exceeded best so far value
@@ -353,7 +282,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) {
   int Vaverage = 0;
   int diff;
   int pred_error[4] = { 0, 0, 0, 0 }, best_error = INT_MAX;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+  MB_PREDICTION_MODE best_mode = MB_MODE_COUNT;
 
   for (i = 0; i < 8; ++i) {
     uleft_col[i] = x->dst.u_buffer[i * x->dst.uv_stride - 1];
@@ -442,6 +371,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) {
     }
   }
 
+  assert(best_mode != MB_MODE_COUNT);
   mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode;
 }
 
@@ -450,12 +380,18 @@ static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) {
   /* Split MV modes currently not supported when RD is nopt enabled,
    * therefore, only need to modify MVcount in NEWMV mode. */
   if (xd->mode_info_context->mbmi.mode == NEWMV) {
-    x->MVcount[0][mv_max + ((xd->mode_info_context->mbmi.mv.as_mv.row -
-                             best_ref_mv->as_mv.row) >>
-                            1)]++;
-    x->MVcount[1][mv_max + ((xd->mode_info_context->mbmi.mv.as_mv.col -
-                             best_ref_mv->as_mv.col) >>
-                            1)]++;
+    const int row_val =
+        ((xd->mode_info_context->mbmi.mv.as_mv.row - best_ref_mv->as_mv.row) >>
+         1);
+    const int row_idx = mv_max + row_val;
+    const int col_val =
+        ((xd->mode_info_context->mbmi.mv.as_mv.col - best_ref_mv->as_mv.col) >>
+         1);
+    const int col_idx = mv_max + col_val;
+    if (row_idx >= 0 && row_idx < MVvals && col_idx >= 0 && col_idx < MVvals) {
+      x->MVcount[0][row_idx]++;
+      x->MVcount[1][col_idx]++;
+    }
   }
 }
 
@@ -633,7 +569,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO best_mbmode;
 
-  int_mv best_ref_mv_sb[2];
+  int_mv best_ref_mv_sb[2] = { { 0 }, { 0 } };
   int_mv mode_mv_sb[2][MB_MODE_COUNT];
   int_mv best_ref_mv;
   int_mv *mode_mv;
@@ -671,7 +607,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   /* search range got from mv_pred(). It uses step_param levels. (0-7) */
   int sr = 0;
 
-  unsigned char *plane[4][3];
+  unsigned char *plane[4][3] = { { 0, 0 } };
   int ref_frame_map[4];
   int sign_bias = 0;
   int dot_artifact_candidate = 0;
@@ -700,13 +636,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
       }
     }
 #endif
+    assert(plane[LAST_FRAME][0] != NULL);
     dot_artifact_candidate = check_dot_artifact_candidate(
         cpi, x, target_y, stride, plane[LAST_FRAME][0], mb_row, mb_col, 0);
     // If not found in Y channel, check UV channel.
     if (!dot_artifact_candidate) {
+      assert(plane[LAST_FRAME][1] != NULL);
       dot_artifact_candidate = check_dot_artifact_candidate(
           cpi, x, target_u, stride_uv, plane[LAST_FRAME][1], mb_row, mb_col, 1);
       if (!dot_artifact_candidate) {
+        assert(plane[LAST_FRAME][2] != NULL);
         dot_artifact_candidate = check_dot_artifact_candidate(
             cpi, x, target_v, stride_uv, plane[LAST_FRAME][2], mb_row, mb_col,
             2);
@@ -757,27 +696,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #endif
 
   // Check if current macroblock is in skin area.
-  {
-    const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] +
-                   x->src.y_buffer[7 * x->src.y_stride + 8] +
-                   x->src.y_buffer[8 * x->src.y_stride + 7] +
-                   x->src.y_buffer[8 * x->src.y_stride + 8]) >>
-                  2;
-    const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] +
-                    x->src.u_buffer[3 * x->src.uv_stride + 4] +
-                    x->src.u_buffer[4 * x->src.uv_stride + 3] +
-                    x->src.u_buffer[4 * x->src.uv_stride + 4]) >>
-                   2;
-    const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] +
-                    x->src.v_buffer[3 * x->src.uv_stride + 4] +
-                    x->src.v_buffer[4 * x->src.uv_stride + 3] +
-                    x->src.v_buffer[4 * x->src.uv_stride + 4]) >>
-                   2;
-    x->is_skin = 0;
-    if (!cpi->oxcf.screen_content_mode) {
-      int block_index = mb_row * cpi->common.mb_cols + mb_col;
-      x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
-    }
+  x->is_skin = 0;
+  if (!cpi->oxcf.screen_content_mode) {
+    int block_index = mb_row * cpi->common.mb_cols + mb_col;
+    x->is_skin = cpi->skin_map[block_index];
   }
 #if CONFIG_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity) {
@@ -827,10 +749,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
   /* If the frame has big static background and current MB is in low
-  *  motion area, its mode decision is biased to ZEROMV mode.
-  *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
-  *  At such speed settings, ZEROMV is already heavily favored.
-  */
+   *  motion area, its mode decision is biased to ZEROMV mode.
+   *  No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
+   *  At such speed settings, ZEROMV is already heavily favored.
+   */
   if (cpi->Speed < 12) {
     calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
   }
@@ -1102,7 +1024,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #endif
             bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param,
                                      sadpb, &cpi->fn_ptr[BLOCK_16X16],
-                                     x->mvsadcost, x->mvcost, &best_ref_mv);
+                                     x->mvsadcost, &best_ref_mv);
             mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
           } else {
             bestsme = cpi->diamond_search_sad(
@@ -1154,10 +1076,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         rate2 +=
             vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
       }
+        // fall through
 
       case NEARESTMV:
       case NEARMV:
         if (mode_mv[this_mode].as_int == 0) continue;
+        // fall through
 
       case ZEROMV:
 
@@ -1185,7 +1109,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity) {
       /* Store for later use by denoiser. */
-      // Dont' denoise with GOLDEN OR ALTREF is they are old reference
+      // Don't denoise with GOLDEN OR ALTREF is they are old reference
       // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past).
       int skip_old_reference = ((this_ref_frame != LAST_FRAME) &&
                                 (cpi->common.current_video_frame -
@@ -1387,9 +1311,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   update_mvcount(x, &best_ref_mv);
 }
 
-void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) {
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) {
   int error4x4, error16x16 = INT_MAX;
-  int rate, best_rate = 0, distortion, best_sse;
+  int rate_, best_rate = 0, distortion, best_sse;
   MB_PREDICTION_MODE mode, best_mode = DC_PRED;
   int this_rd;
   unsigned int sse;
@@ -1407,23 +1331,23 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) {
                                      xd->predictor, 16);
     distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor,
                                    16, &sse);
-    rate = x->mbmode_cost[xd->frame_type][mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+    rate_ = x->mbmode_cost[xd->frame_type][mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion);
 
     if (error16x16 > this_rd) {
       error16x16 = this_rd;
       best_mode = mode;
       best_sse = sse;
-      best_rate = rate;
+      best_rate = rate_;
     }
   }
   xd->mode_info_context->mbmi.mode = best_mode;
 
-  error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse);
+  error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse);
   if (error4x4 < error16x16) {
     xd->mode_info_context->mbmi.mode = B_PRED;
-    best_rate = rate;
+    best_rate = rate_;
   }
 
-  *rate_ = best_rate;
+  *rate = best_rate;
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/pickinter.h b/media/libvpx/libvpx/vp8/encoder/pickinter.h
index bf1d0c9749..392fb41593 100644
--- a/media/libvpx/libvpx/vp8/encoder/pickinter.h
+++ b/media/libvpx/libvpx/vp8/encoder/pickinter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_PICKINTER_H_
-#define VP8_ENCODER_PICKINTER_H_
+#ifndef VPX_VP8_ENCODER_PICKINTER_H_
+#define VPX_VP8_ENCODER_PICKINTER_H_
 #include "vpx_config.h"
 #include "vp8/common/onyxc_int.h"
 
@@ -30,4 +30,4 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_PICKINTER_H_
+#endif  // VPX_VP8_ENCODER_PICKINTER_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/picklpf.c b/media/libvpx/libvpx/vp8/encoder/picklpf.c
index 6f287322ec..498738fd6d 100644
--- a/media/libvpx/libvpx/vp8/encoder/picklpf.c
+++ b/media/libvpx/libvpx/vp8/encoder/picklpf.c
@@ -12,12 +12,13 @@
 #include "./vpx_scale_rtcd.h"
 #include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
+#include "vp8/encoder/picklpf.h"
 #include "vp8/encoder/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/alloccommon.h"
 #include "vp8/common/loopfilter.h"
-#if ARCH_ARM
+#if VPX_ARCH_ARM
 #include "vpx_ports/arm.h"
 #endif
 
@@ -49,6 +50,14 @@ static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
   src_y = src_ybc->y_buffer + yoffset;
   dst_y = dst_ybc->y_buffer + yoffset;
 
+  // The border will be used in vp8_loop_filter_partial_frame so it needs to be
+  // extended to avoid a valgrind warning.
+  const unsigned char *const top_row = src_ybc->y_buffer;
+  for (int i = yoffset; i < 0; i += ystride, --linestocopy) {
+    memcpy(dst_y, top_row, ystride);
+    dst_y += ystride;
+    src_y += ystride;
+  }
   memcpy(dst_y, src_y, ystride * linestocopy);
 }
 
diff --git a/media/libvpx/libvpx/vp8/encoder/picklpf.h b/media/libvpx/libvpx/vp8/encoder/picklpf.h
new file mode 100644
index 0000000000..03597e5427
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/picklpf.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_ENCODER_PICKLPF_H_
+#define VPX_VP8_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct yv12_buffer_config;
+
+void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd,
+                                  struct VP8_COMP *cpi);
+void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val);
+void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VPX_VP8_ENCODER_PICKLPF_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/quantize.h b/media/libvpx/libvpx/vp8/encoder/quantize.h
index 267150f99f..78746c0c20 100644
--- a/media/libvpx/libvpx/vp8/encoder/quantize.h
+++ b/media/libvpx/libvpx/vp8/encoder/quantize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_QUANTIZE_H_
-#define VP8_ENCODER_QUANTIZE_H_
+#ifndef VPX_VP8_ENCODER_QUANTIZE_H_
+#define VPX_VP8_ENCODER_QUANTIZE_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -31,4 +31,4 @@ extern void vp8cx_init_quantizer(struct VP8_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_QUANTIZE_H_
+#endif  // VPX_VP8_ENCODER_QUANTIZE_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
index e89247ae4a..5313f4ad1b 100644
--- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c
+++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c
@@ -259,9 +259,9 @@ void vp8_setup_key_frame(VP8_COMP *cpi) {
   /* Make sure we initialize separate contexts for altref,gold, and normal.
    * TODO shouldn't need 3 different copies of structure to do this!
    */
-  memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
-  memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
-  memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+  cpi->lfc_a = cpi->common.fc;
+  cpi->lfc_g = cpi->common.fc;
+  cpi->lfc_n = cpi->common.fc;
 
   cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
 
@@ -314,7 +314,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
      * bandwidth per second * fraction of the initial buffer
      * level
      */
-    target = cpi->oxcf.starting_buffer_level / 2;
+    target = (uint64_t)cpi->oxcf.starting_buffer_level / 2;
 
     if (target > cpi->oxcf.target_bandwidth * 3 / 2) {
       target = cpi->oxcf.target_bandwidth * 3 / 2;
@@ -327,7 +327,13 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
     int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
     /* Boost depends somewhat on frame rate: only used for 1 layer case. */
     if (cpi->oxcf.number_of_layers == 1) {
-      kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
+      kf_boost =
+          VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16));
+      // cpi->output_framerate may be as large as 10M. Keep kf_boost small
+      // enough to allow for integer math when multiplying by values in
+      // kf_boost_qadjustment[].
+      const int kMaxKfBoost = 2000;
+      if (kf_boost > kMaxKfBoost) kf_boost = kMaxKfBoost;
     } else {
       /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
       kf_boost = initial_boost;
@@ -345,19 +351,24 @@ static void calc_iframe_target_size(VP8_COMP *cpi) {
     /* Minimal target size is |2* per_frame_bandwidth|. */
     if (kf_boost < 16) kf_boost = 16;
 
-    target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+    target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+    target = VPXMIN(INT_MAX, target);
   }
 
   if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    unsigned int max_rate =
-        cpi->per_frame_bandwidth * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+    unsigned int max_rate;
+    // This product may overflow unsigned int
+    uint64_t product = cpi->per_frame_bandwidth;
+    product *= cpi->oxcf.rc_max_intra_bitrate_pct;
+    product /= 100;
+    max_rate = (unsigned int)VPXMIN(INT_MAX, product);
 
     if (target > max_rate) target = max_rate;
   }
 
   cpi->this_frame_target = (int)target;
 
-  /* TODO: if we separate rate targeting from Q targetting, move this.
+  /* TODO: if we separate rate targeting from Q targeting, move this.
    * Reset the active worst quality to the baseline value for key frames.
    */
   if (cpi->pass != 2) cpi->active_worst_quality = cpi->worst_quality;
@@ -383,7 +394,7 @@ static void calc_gf_params(VP8_COMP *cpi) {
       (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
   int Boost = 0;
 
-  int gf_frame_useage = 0; /* Golden frame useage since last GF */
+  int gf_frame_usage = 0; /* Golden frame usage since last GF */
   int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
                 cpi->recent_ref_frame_usage[LAST_FRAME] +
                 cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
@@ -393,12 +404,12 @@ static void calc_gf_params(VP8_COMP *cpi) {
                       (cpi->common.mb_rows * cpi->common.mb_cols);
 
   if (tot_mbs) {
-    gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
-                       cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
-                      100 / tot_mbs;
+    gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                      cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
+                     100 / tot_mbs;
   }
 
-  if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active;
+  if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active;
 
   /* Not two pass */
   if (cpi->pass != 2) {
@@ -462,7 +473,7 @@ static void calc_gf_params(VP8_COMP *cpi) {
       /* Adjust boost based upon ambient Q */
       Boost = GFQ_ADJUSTMENT;
 
-      /* Adjust based upon most recently measure intra useage */
+      /* Adjust based upon most recently measure intra usage */
       Boost = Boost *
               gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15)
                                             ? cpi->this_frame_percent_intra
@@ -470,7 +481,7 @@ static void calc_gf_params(VP8_COMP *cpi) {
               100;
 
       /* Adjust gf boost based upon GF usage since last GF */
-      Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
+      Boost = Boost * gf_adjust_table[gf_frame_usage] / 100;
 #endif
     }
 
@@ -498,11 +509,9 @@ static void calc_gf_params(VP8_COMP *cpi) {
    * This is updated once the real frame size/boost is known.
    */
   if (cpi->oxcf.fixed_q == -1) {
-    if (cpi->pass == 2) /* 2 Pass */
-    {
+    if (cpi->pass == 2) { /* 2 Pass */
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
-    } else /* 1 Pass */
-    {
+    } else { /* 1 Pass */
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
 
       if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++;
@@ -513,8 +522,8 @@ static void calc_gf_params(VP8_COMP *cpi) {
 
       if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++;
 
-      if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) {
-        cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
+      if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) {
+        cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage];
       }
 
       if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) {
@@ -634,11 +643,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
         /* % Adjustment limited to the range 1% to 10% */
         Adjustment = (cpi->last_boost - 100) >> 5;
 
-        if (Adjustment < 1) {
-          Adjustment = 1;
-        } else if (Adjustment > 10) {
+        if (Adjustment > 10) {
           Adjustment = 10;
         }
+        assert(Adjustment >= 1);
 
         /* Convert to bits */
         Adjustment = (cpi->this_frame_target * Adjustment) / 100;
@@ -715,7 +723,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
         }
 
         /* lower the target bandwidth for this frame. */
-        cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200;
+        cpi->this_frame_target -=
+            (int)(((int64_t)cpi->this_frame_target * percent_low) / 200);
 
         /* Are we using allowing control of active_worst_allowed_q
          * according to buffer level.
@@ -778,6 +787,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
         }
       } else {
         int percent_high = 0;
+        int64_t target = cpi->this_frame_target;
 
         if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
             (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) {
@@ -785,8 +795,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
               (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
                     one_percent_bits);
         } else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) {
-          percent_high =
-              (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8));
+          if (cpi->total_byte_count > 0) {
+            percent_high = (int)((100 * cpi->bits_off_target) /
+                                 (cpi->total_byte_count * 8));
+          } else {
+            percent_high = cpi->oxcf.over_shoot_pct;
+          }
         }
 
         if (percent_high > cpi->oxcf.over_shoot_pct) {
@@ -795,7 +809,9 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
           percent_high = 0;
         }
 
-        cpi->this_frame_target += (cpi->this_frame_target * percent_high) / 200;
+        target += (target * percent_high) / 200;
+        target = VPXMIN(target, INT_MAX);
+        cpi->this_frame_target = (int)target;
 
         /* Are we allowing control of active_worst_allowed_q according
          * to buffer level.
@@ -889,7 +905,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
       int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]
                                       : cpi->oxcf.fixed_q;
 
-      int gf_frame_useage = 0; /* Golden frame useage since last GF */
+      int gf_frame_usage = 0; /* Golden frame usage since last GF */
       int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
                     cpi->recent_ref_frame_usage[LAST_FRAME] +
                     cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
@@ -899,20 +915,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
                           (cpi->common.mb_rows * cpi->common.mb_cols);
 
       if (tot_mbs) {
-        gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
-                           cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
-                          100 / tot_mbs;
+        gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                          cpi->recent_ref_frame_usage[ALTREF_FRAME]) *
+                         100 / tot_mbs;
       }
 
-      if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active;
+      if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active;
 
       /* Is a fixed manual GF frequency being used */
       if (cpi->auto_gold) {
-        /* For one pass throw a GF if recent frame intra useage is
-         * low or the GF useage is high
+        /* For one pass throw a GF if recent frame intra usage is
+         * low or the GF usage is high
          */
         if ((cpi->pass == 0) &&
-            (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) {
+            (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) {
           cpi->common.refresh_golden_frame = 1;
 
           /* Two pass GF descision */
@@ -927,10 +943,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
           if (0) {
               FILE *f;
 
-              f = fopen("gf_useaget.stt", "a");
+              f = fopen("gf_usaget.stt", "a");
               fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
                       cpi->common.current_video_frame,  cpi->gfu_boost,
-                      GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+                      GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage);
               fclose(f);
           }
 
@@ -998,7 +1014,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) {
            * bits on this frame even if it is a contructed arf.
            * The active maximum quantizer insures that an appropriate
            * number of bits will be spent if needed for contstructed ARFs.
-          */
+           */
           cpi->this_frame_target = 0;
         }
 
@@ -1054,9 +1070,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
    * overflow when values are large
    */
   projected_size_based_on_q =
-      (int)(((.5 +
-              rate_correction_factor *
-                  vp8_bits_per_mb[cpi->common.frame_type][Q]) *
+      (int)(((.5 + rate_correction_factor *
+                       vp8_bits_per_mb[cpi->common.frame_type][Q]) *
              cpi->common.MBs) /
             (1 << BPER_MB_NORMBITS));
 
@@ -1077,8 +1092,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
 
   /* Work out a size correction factor. */
   if (projected_size_based_on_q > 0) {
-    correction_factor =
-        (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+    correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) /
+                              projected_size_based_on_q);
   }
 
   /* More heavily damped adjustment used if we have been oscillating
@@ -1128,6 +1143,14 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) {
   }
 }
 
+static int limit_q_cbr_inter(int last_q, int current_q) {
+  int limit_down = 12;
+  if (last_q - current_q > limit_down)
+    return (last_q - limit_down);
+  else
+    return current_q;
+}
+
 int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
   int Q = cpi->active_worst_quality;
 
@@ -1175,10 +1198,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
     /* Calculate required scaling factor based on target frame size and
      * size of frame produced using previous Q
      */
-    if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) {
-      /* Case where we would overflow int */
-      target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs)
-                           << BPER_MB_NORMBITS;
+    if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) {
+      int temp = target_bits_per_frame / cpi->common.MBs;
+      if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+        target_bits_per_mb = INT_MAX;
+      } else {
+        target_bits_per_mb = temp << BPER_MB_NORMBITS;
+      }
     } else {
       target_bits_per_mb =
           (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
@@ -1267,6 +1293,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) {
     }
   }
 
+  // Limit decrease in Q for 1 pass CBR screen content mode.
+  if (cpi->common.frame_type != KEY_FRAME && cpi->pass == 0 &&
+      cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
+      cpi->oxcf.screen_content_mode)
+    Q = limit_q_cbr_inter(cpi->last_q[1], Q);
+
   return Q;
 }
 
@@ -1364,14 +1396,17 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
     *frame_under_shoot_limit = 0;
     *frame_over_shoot_limit = INT_MAX;
   } else {
+    const int64_t this_frame_target = cpi->this_frame_target;
+    int64_t over_shoot_limit, under_shoot_limit;
+
     if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+      over_shoot_limit = this_frame_target * 9 / 8;
+      under_shoot_limit = this_frame_target * 7 / 8;
     } else {
       if (cpi->oxcf.number_of_layers > 1 || cpi->common.refresh_alt_ref_frame ||
           cpi->common.refresh_golden_frame) {
-        *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+        over_shoot_limit = this_frame_target * 9 / 8;
+        under_shoot_limit = this_frame_target * 7 / 8;
       } else {
         /* For CBR take buffer fullness into account */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
@@ -1381,18 +1416,18 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
             /* Buffer is too full so relax overshoot and tighten
              * undershoot
              */
-            *frame_over_shoot_limit = cpi->this_frame_target * 12 / 8;
-            *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
+            over_shoot_limit = this_frame_target * 12 / 8;
+            under_shoot_limit = this_frame_target * 6 / 8;
           } else if (cpi->buffer_level <=
                      (cpi->oxcf.optimal_buffer_level >> 1)) {
             /* Buffer is too low so relax undershoot and tighten
              * overshoot
              */
-            *frame_over_shoot_limit = cpi->this_frame_target * 10 / 8;
-            *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
+            over_shoot_limit = this_frame_target * 10 / 8;
+            under_shoot_limit = this_frame_target * 4 / 8;
           } else {
-            *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
-            *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+            over_shoot_limit = this_frame_target * 11 / 8;
+            under_shoot_limit = this_frame_target * 5 / 8;
           }
         }
         /* VBR and CQ mode */
@@ -1402,11 +1437,11 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
         else {
           /* Stron overshoot limit for constrained quality */
           if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-            *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
-            *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+            over_shoot_limit = this_frame_target * 11 / 8;
+            under_shoot_limit = this_frame_target * 2 / 8;
           } else {
-            *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
-            *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+            over_shoot_limit = this_frame_target * 11 / 8;
+            under_shoot_limit = this_frame_target * 5 / 8;
           }
         }
       }
@@ -1416,9 +1451,13 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
      * (eg * 7/8) may be tiny make sure there is at least a minimum
      * range.
      */
-    *frame_over_shoot_limit += 200;
-    *frame_under_shoot_limit -= 200;
-    if (*frame_under_shoot_limit < 0) *frame_under_shoot_limit = 0;
+    over_shoot_limit += 200;
+    under_shoot_limit -= 200;
+    if (under_shoot_limit < 0) under_shoot_limit = 0;
+    if (under_shoot_limit > INT_MAX) under_shoot_limit = INT_MAX;
+    if (over_shoot_limit > INT_MAX) over_shoot_limit = INT_MAX;
+    *frame_under_shoot_limit = (int)under_shoot_limit;
+    *frame_over_shoot_limit = (int)over_shoot_limit;
   }
 }
 
@@ -1442,12 +1481,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) {
 // If this just encoded frame (mcomp/transform/quant, but before loopfilter and
 // pack_bitstream) has large overshoot, and was not being encoded close to the
 // max QP, then drop this frame and force next frame to be encoded at max QP.
-// Condition this on 1 pass CBR with screen content mode and frame dropper off.
+// Allow this for screen_content_mode = 2, or if drop frames is allowed.
 // TODO(marpan): Should do this exit condition during the encode_frame
 // (i.e., halfway during the encoding of the frame) to save cycles.
 int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
-  if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER &&
-      cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) {
+  int force_drop_overshoot = 0;
+#if CONFIG_MULTI_RES_ENCODING
+  // Only check for dropping due to overshoot on the lowest stream.
+  // If the lowest stream of the multi-res encoding was dropped due to
+  // overshoot, then force dropping on all upper layer streams
+  // (mr_encoder_id > 0).
+  LOWER_RES_FRAME_INFO *low_res_frame_info =
+      (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info;
+  if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) {
+    force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp;
+    if (!force_drop_overshoot) {
+      cpi->force_maxqp = 0;
+      cpi->frames_since_last_drop_overshoot++;
+      return 0;
+    }
+  }
+#endif
+  if (cpi->common.frame_type != KEY_FRAME &&
+      (cpi->oxcf.screen_content_mode == 2 ||
+       (cpi->drop_frames_allowed &&
+        (force_drop_overshoot ||
+         (cpi->rate_correction_factor < (8.0f * MIN_BPB_FACTOR) &&
+          cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) {
     // Note: the "projected_frame_size" from encode_frame() only gives estimate
     // of mode/motion vector rate (in non-rd mode): so below we only require
     // that projected_frame_size is somewhat greater than per-frame-bandwidth,
@@ -1458,17 +1518,21 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
     // Rate threshold, in bytes.
     int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3);
     // Threshold for the average (over all macroblocks) of the pixel-sum
-    // residual error over 16x16 block. Should add QP dependence on threshold?
-    int thresh_pred_err_mb = (256 << 4);
+    // residual error over 16x16 block.
+    int thresh_pred_err_mb = (200 << 4);
     int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
-    if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate &&
-        pred_err_mb > thresh_pred_err_mb) {
+    // Reduce/ignore thresh_rate if pred_err_mb much larger than its threshold,
+    // give more weight to pred_err metric for overshoot detection.
+    if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4))
+      thresh_rate = thresh_rate >> 3;
+    if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate &&
+         pred_err_mb > thresh_pred_err_mb &&
+         pred_err_mb > 2 * cpi->last_pred_err_mb) ||
+        force_drop_overshoot) {
+      unsigned int i;
       double new_correction_factor;
-      const int target_size = cpi->av_per_frame_bandwidth;
       int target_bits_per_mb;
-      // Drop this frame: advance frame counters, and set force_maxqp flag.
-      cpi->common.current_video_frame++;
-      cpi->frames_since_key++;
+      const int target_size = cpi->av_per_frame_bandwidth;
       // Flag to indicate we will force next frame to be encoded at max QP.
       cpi->force_maxqp = 1;
       // Reset the buffer levels.
@@ -1481,9 +1545,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
       // undershoots significantly, and then we end up dropping every other
       // frame because the QP/rate_correction_factor may have been too low
       // before the drop and then takes too long to come up.
-      if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) {
-        target_bits_per_mb = (target_size / cpi->common.MBs)
-                             << BPER_MB_NORMBITS;
+      if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) {
+        int temp = target_size / cpi->common.MBs;
+        if (temp > (INT_MAX >> BPER_MB_NORMBITS)) {
+          target_bits_per_mb = INT_MAX;
+        } else {
+          target_bits_per_mb = temp << BPER_MB_NORMBITS;
+        }
       } else {
         target_bits_per_mb =
             (target_size << BPER_MB_NORMBITS) / cpi->common.MBs;
@@ -1499,14 +1567,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) {
       if (cpi->rate_correction_factor > MAX_BPB_FACTOR) {
         cpi->rate_correction_factor = MAX_BPB_FACTOR;
       }
+      // Drop this frame: update frame counters.
+      cpi->common.current_video_frame++;
+      cpi->frames_since_key++;
+      cpi->temporal_pattern_counter++;
+      cpi->frames_since_last_drop_overshoot = 0;
+      if (cpi->oxcf.number_of_layers > 1) {
+        // Set max_qp and rate correction for all temporal layers if overshoot
+        // is detected.
+        for (i = 0; i < cpi->oxcf.number_of_layers; ++i) {
+          LAYER_CONTEXT *lc = &cpi->layer_context[i];
+          lc->force_maxqp = 1;
+          lc->frames_since_last_drop_overshoot = 0;
+          lc->rate_correction_factor = cpi->rate_correction_factor;
+        }
+      }
+#if CONFIG_MULTI_RES_ENCODING
+      if (cpi->oxcf.mr_total_resolutions > 1)
+        low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1;
+#endif
       return 1;
-    } else {
-      cpi->force_maxqp = 0;
-      return 0;
     }
     cpi->force_maxqp = 0;
+    cpi->frames_since_last_drop_overshoot++;
+#if CONFIG_MULTI_RES_ENCODING
+    if (cpi->oxcf.mr_total_resolutions > 1)
+      low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0;
+#endif
     return 0;
   }
   cpi->force_maxqp = 0;
+  cpi->frames_since_last_drop_overshoot++;
+#if CONFIG_MULTI_RES_ENCODING
+  if (cpi->oxcf.mr_total_resolutions > 1)
+    low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0;
+#endif
   return 0;
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.h b/media/libvpx/libvpx/vp8/encoder/ratectrl.h
index 249de4e706..844c72cb86 100644
--- a/media/libvpx/libvpx/vp8/encoder/ratectrl.h
+++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_RATECTRL_H_
-#define VP8_ENCODER_RATECTRL_H_
+#ifndef VPX_VP8_ENCODER_RATECTRL_H_
+#define VPX_VP8_ENCODER_RATECTRL_H_
 
 #include "onyx_int.h"
 
@@ -37,4 +37,4 @@ extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_RATECTRL_H_
+#endif  // VPX_VP8_ENCODER_RATECTRL_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/rdopt.c b/media/libvpx/libvpx/vp8/encoder/rdopt.c
index 7bbeb28574..ad3866c770 100644
--- a/media/libvpx/libvpx/vp8/encoder/rdopt.c
+++ b/media/libvpx/libvpx/vp8/encoder/rdopt.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <stdio.h>
 #include <math.h>
 #include <limits.h>
@@ -15,12 +16,14 @@
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
+#include "encodeframe.h"
 #include "tokenize.h"
 #include "treewriter.h"
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
 #include "pickinter.h"
+#include "vp8/common/common.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/reconinter.h"
 #include "vp8/common/reconintra.h"
@@ -106,11 +109,10 @@ const int vp8_ref_frame_order[MAX_MODES] = {
   0,
 };
 
-static void fill_token_costs(int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                                  [MAX_ENTROPY_TOKENS],
-                             const vp8_prob p[BLOCK_TYPES][COEF_BANDS]
-                                             [PREV_COEF_CONTEXTS]
-                                             [ENTROPY_NODES]) {
+static void fill_token_costs(
+    int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+    const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
+                    [ENTROPY_NODES]) {
   int i, j, k;
 
   for (i = 0; i < BLOCK_TYPES; ++i) {
@@ -449,8 +451,8 @@ static int vp8_rdcost_mby(MACROBLOCK *mb) {
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *mb->e_mbd.above_context;
+  t_left = *mb->e_mbd.left_context;
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
@@ -595,8 +597,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y,
   ENTROPY_CONTEXT *tl;
   const int *bmode_costs;
 
-  memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *mb->e_mbd.above_context;
+  t_left = *mb->e_mbd.left_context;
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
@@ -608,9 +610,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y,
   for (i = 0; i < 16; ++i) {
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry),
-        UNINITIALIZED_IS_SAFE(d);
+    B_PREDICTION_MODE best_mode = B_MODE_COUNT;
+    int r = 0, ry = 0, d = 0;
 
     if (mb->e_mbd.frame_type == KEY_FRAME) {
       const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
@@ -627,6 +628,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y,
     distortion += d;
     tot_rate_y += ry;
 
+    assert(best_mode != B_MODE_COUNT);
     mic->bmi[i].as_mode = best_mode;
 
     if (total_rd >= (int64_t)best_rd) break;
@@ -644,7 +646,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y,
 static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y,
                                       int *Distortion) {
   MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT;
   int rate, ratey;
   int distortion;
   int best_rd = INT_MAX;
@@ -674,6 +676,7 @@ static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y,
     }
   }
 
+  assert(mode_selected != MB_MODE_COUNT);
   xd->mode_info_context->mbmi.mode = mode_selected;
   return best_rd;
 }
@@ -686,8 +689,8 @@ static int rd_cost_mbuv(MACROBLOCK *mb) {
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *mb->e_mbd.above_context;
+  t_left = *mb->e_mbd.left_context;
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
@@ -741,9 +744,9 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
 static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
                                     int *rate_tokenonly, int *distortion) {
   MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT;
   int best_rd = INT_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+  int d = 0, r = 0;
   int rate_to;
   MACROBLOCKD *xd = &x->e_mbd;
 
@@ -767,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
     vp8_quantize_mbuv(x);
 
     rate_to = rd_cost_mbuv(x);
-    this_rate = rate_to +
-                x->intra_uv_mode_cost[xd->frame_type]
-                                     [xd->mode_info_context->mbmi.uv_mode];
+    this_rate =
+        rate_to + x->intra_uv_mode_cost[xd->frame_type]
+                                       [xd->mode_info_context->mbmi.uv_mode];
 
     this_distortion = vp8_mbuverror(x) / 4;
 
@@ -787,6 +790,7 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
   *rate = r;
   *distortion = d;
 
+  assert(mode_selected != MB_MODE_COUNT);
   xd->mode_info_context->mbmi.uv_mode = mode_selected;
 }
 
@@ -850,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label,
         default: break;
       }
 
-      if (m == ABOVE4X4) /* replace above with left if same */
-      {
+      if (m == ABOVE4X4) { /* replace above with left if same */
         int_mv left_mv;
 
         left_mv.as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i);
@@ -957,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
   vp8_variance_fn_ptr_t *v_fn_ptr;
 
   ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
   ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
-  ENTROPY_CONTEXT *ta_b;
-  ENTROPY_CONTEXT *tl_b;
 
-  memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  t_above = *x->e_mbd.above_context;
+  t_left = *x->e_mbd.left_context;
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
-  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+  vp8_zero(t_above_b);
+  vp8_zero(t_left_b);
 
   br = 0;
   bd = 0;
@@ -992,7 +989,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
   br += rate;
 
   for (i = 0; i < label_count; ++i) {
-    int_mv mode_mv[B_MODE_COUNT];
+    int_mv mode_mv[B_MODE_COUNT] = { { 0 }, { 0 } };
     int best_label_rd = INT_MAX;
     B_PREDICTION_MODE mode_selected = ZERO4X4;
     int bestlabelyrate = 0;
@@ -1006,8 +1003,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
       ENTROPY_CONTEXT *ta_s;
       ENTROPY_CONTEXT *tl_s;
 
-      memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
-      memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+      t_above_s = t_above;
+      t_left_s = t_left;
 
       ta_s = (ENTROPY_CONTEXT *)&t_above_s;
       tl_s = (ENTROPY_CONTEXT *)&t_left_s;
@@ -1024,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
         BLOCK *c;
         BLOCKD *e;
 
-        /* Is the best so far sufficiently good that we cant justify
+        /* Is the best so far sufficiently good that we can't justify
          * doing a new motion search.
          */
         if (best_label_rd < label_mv_thresh) break;
@@ -1100,8 +1097,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
             vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min,
                          x->mv_row_max);
 
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16,
-                                           v_fn_ptr, x->mvcost, bsi->ref_mv);
+            thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16,
+                                          v_fn_ptr, x->mvcost, bsi->ref_mv);
 
             if (thissme < bestsme) {
               bestsme = thissme;
@@ -1149,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
         mode_selected = this_mode;
         best_label_rd = this_rd;
 
-        memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-        memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+        t_above_b = t_above_s;
+        t_left_b = t_left_s;
       }
     } /*for each 4x4 mode*/
 
-    memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-    memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+    t_above = t_above_b;
+    t_left = t_left_b;
 
     labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
                 bsi->ref_mv, x->mvcost);
@@ -1567,21 +1564,34 @@ static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) {
 
     for (i = 0; i < x->partition_info->count; ++i) {
       if (x->partition_info->bmi[i].mode == NEW4X4) {
-        x->MVcount[0][mv_max + ((x->partition_info->bmi[i].mv.as_mv.row -
-                                 best_ref_mv->as_mv.row) >>
-                                1)]++;
-        x->MVcount[1][mv_max + ((x->partition_info->bmi[i].mv.as_mv.col -
-                                 best_ref_mv->as_mv.col) >>
-                                1)]++;
+        const int row_val = ((x->partition_info->bmi[i].mv.as_mv.row -
+                              best_ref_mv->as_mv.row) >>
+                             1);
+        const int row_idx = mv_max + row_val;
+        const int col_val = ((x->partition_info->bmi[i].mv.as_mv.col -
+                              best_ref_mv->as_mv.col) >>
+                             1);
+        const int col_idx = mv_max + col_val;
+        if (row_idx >= 0 && row_idx < MVvals && col_idx >= 0 &&
+            col_idx < MVvals) {
+          x->MVcount[0][row_idx]++;
+          x->MVcount[1][col_idx]++;
+        }
       }
     }
   } else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV) {
-    x->MVcount[0][mv_max + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row -
-                             best_ref_mv->as_mv.row) >>
-                            1)]++;
-    x->MVcount[1][mv_max + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col -
-                             best_ref_mv->as_mv.col) >>
-                            1)]++;
+    const int row_val = ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row -
+                          best_ref_mv->as_mv.row) >>
+                         1);
+    const int row_idx = mv_max + row_val;
+    const int col_val = ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col -
+                          best_ref_mv->as_mv.col) >>
+                         1);
+    const int col_idx = mv_max + col_val;
+    if (row_idx >= 0 && row_idx < MVvals && col_idx >= 0 && col_idx < MVvals) {
+      x->MVcount[0][row_idx]++;
+      x->MVcount[1][col_idx]++;
+    }
   }
 }
 
@@ -1611,7 +1621,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd,
       unsigned int q2dc = xd->block[24].dequant[0];
       /* If theres is no codeable 2nd order dc
          or a very small uniform pixel change change */
-      if ((sse - var<q2dc * q2dc>> 4) || (sse / 2 > var && sse - var < 64)) {
+      if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) {
         /* Check u and v to make sure skip is ok */
         unsigned int sse2 = VP8_UVSSE(x);
         if (sse2 * 2 < threshold) {
@@ -1726,9 +1736,8 @@ static void update_best_mode(BEST_MODE *best_mode, int this_rd,
              (rd->distortion2 - rd->distortion_uv));
 
   best_mode->rd = this_rd;
-  memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi,
-         sizeof(MB_MODE_INFO));
-  memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+  best_mode->mbmode = x->e_mbd.mode_info_context->mbmi;
+  best_mode->partition = *x->partition_info;
 
   if ((this_mode == B_PRED) || (this_mode == SPLITMV)) {
     int i;
@@ -1770,7 +1779,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   /* search range got from mv_pred(). It uses step_param levels. (0-7) */
   int sr = 0;
 
-  unsigned char *plane[4][3];
+  unsigned char *plane[4][3] = { { 0, 0 } };
   int ref_frame_map[4];
   int sign_bias = 0;
 
@@ -1782,6 +1791,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                best_rd_sse = UINT_MAX;
 #endif
 
+  // _uv variables are not set consistantly before calling update_best_mode.
+  rd.rate_uv = 0;
+  rd.distortion_uv = 0;
+
   mode_mv = mode_mv_sb[sign_bias];
   best_ref_mv.as_int = 0;
   best_mode.rd = INT_MAX;
@@ -1849,6 +1862,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 
     /* everything but intra */
     if (x->e_mbd.mode_info_context->mbmi.ref_frame) {
+      assert(plane[this_ref_frame][0] != NULL &&
+             plane[this_ref_frame][1] != NULL &&
+             plane[this_ref_frame][2] != NULL);
       x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
       x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
       x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
@@ -1943,6 +1959,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         rd.distortion2 += distortion;
 
         if (tmp_rd < best_mode.yrd) {
+          assert(uv_intra_done);
           rd.rate2 += uv_intra_rate;
           rd.rate_uv = uv_intra_rate_tokenonly;
           rd.distortion2 += uv_intra_distortion;
@@ -1974,7 +1991,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         rd.distortion2 += distortion;
 
         /* If even the 'Y' rd value of split is higher than best so far
-         * then dont bother looking at UV
+         * then don't bother looking at UV
          */
         if (tmp_rd < best_mode.yrd) {
           /* Now work out UV cost and add it in */
@@ -2003,6 +2020,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         rd.distortion2 += distortion;
         rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type]
                                   [x->e_mbd.mode_info_context->mbmi.mode];
+        assert(uv_intra_done);
         rd.rate2 += uv_intra_rate;
         rd.rate_uv = uv_intra_rate_tokenonly;
         rd.distortion2 += uv_intra_distortion;
@@ -2134,6 +2152,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         rd.rate2 +=
             vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
       }
+        // fall through
 
       case NEARESTMV:
       case NEARMV:
@@ -2150,6 +2169,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             (mode_mv[this_mode].as_int == 0)) {
           continue;
         }
+        // fall through
 
       case ZEROMV:
 
@@ -2327,8 +2347,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   }
 
   /* macroblock modes */
-  memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode,
-         sizeof(MB_MODE_INFO));
+  x->e_mbd.mode_info_context->mbmi = best_mode.mbmode;
 
   if (best_mode.mbmode.mode == B_PRED) {
     for (i = 0; i < 16; ++i) {
@@ -2341,7 +2360,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
       xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
     }
 
-    memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+    *x->partition_info = best_mode.partition;
 
     x->e_mbd.mode_info_context->mbmi.mv.as_int =
         x->partition_info->bmi[15].mv.as_int;
@@ -2355,11 +2374,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
   rd_update_mvcount(x, &best_ref_mv);
 }
 
-void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) {
   int error4x4, error16x16;
   int rate4x4, rate16x16 = 0, rateuv;
   int dist4x4, dist16x16, distuv;
-  int rate;
+  int rate_;
   int rate4x4_tokenonly = 0;
   int rate16x16_tokenonly = 0;
   int rateuv_tokenonly = 0;
@@ -2367,7 +2386,7 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
   x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
   rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
-  rate = rateuv;
+  rate_ = rateuv;
 
   error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
                                           &dist16x16);
@@ -2377,10 +2396,10 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) {
 
   if (error4x4 < error16x16) {
     x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
-    rate += rate4x4;
+    rate_ += rate4x4;
   } else {
-    rate += rate16x16;
+    rate_ += rate16x16;
   }
 
-  *rate_ = rate;
+  *rate = rate_;
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/rdopt.h b/media/libvpx/libvpx/vp8/encoder/rdopt.h
index 8186ff1051..cc3db8197c 100644
--- a/media/libvpx/libvpx/vp8/encoder/rdopt.h
+++ b/media/libvpx/libvpx/vp8/encoder/rdopt.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_RDOPT_H_
-#define VP8_ENCODER_RDOPT_H_
+#ifndef VPX_VP8_ENCODER_RDOPT_H_
+#define VPX_VP8_ENCODER_RDOPT_H_
 
 #include "./vpx_config.h"
 
@@ -19,6 +19,9 @@ extern "C" {
 
 #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D))
 
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+void vp8_auto_select_speed(VP8_COMP *cpi);
+
 static INLINE void insertsortmv(int arr[], int len) {
   int i, j, k;
 
@@ -60,12 +63,12 @@ static INLINE void insertsortsad(int arr[], int idx[], int len) {
   }
 }
 
-extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
-extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x,
-                                   int recon_yoffset, int recon_uvoffset,
-                                   int *returnrate, int *returndistortion,
-                                   int *returnintra, int mb_row, int mb_col);
-extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
+void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue);
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+                            int recon_uvoffset, int *returnrate,
+                            int *returndistortion, int *returnintra, int mb_row,
+                            int mb_col);
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
 
 static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
                                       unsigned char *plane[3],
@@ -107,9 +110,9 @@ static INLINE void get_reference_search_order(const VP8_COMP *cpi,
   for (; i < 4; ++i) ref_frame_map[i] = -1;
 }
 
-extern void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
-                        int_mv *mvp, int refframe, int *ref_frame_sign_bias,
-                        int *sr, int near_sadidx[]);
+void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here,
+                 int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr,
+                 int near_sadidx[]);
 void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x,
                  int recon_yoffset, int near_sadidx[]);
 int VP8_UVSSE(MACROBLOCK *x);
@@ -120,4 +123,4 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_RDOPT_H_
+#endif  // VPX_VP8_ENCODER_RDOPT_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/segmentation.c b/media/libvpx/libvpx/vp8/encoder/segmentation.c
index dcb68119e1..2127258111 100644
--- a/media/libvpx/libvpx/vp8/encoder/segmentation.c
+++ b/media/libvpx/libvpx/vp8/encoder/segmentation.c
@@ -11,7 +11,7 @@
 #include "segmentation.h"
 #include "vpx_mem/vpx_mem.h"
 
-void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) {
+void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) {
   int mb_row, mb_col;
 
   MODE_INFO *this_mb_mode_info = cm->mi;
@@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) {
   x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
 
   if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {
-    /* Reset Gf useage monitors */
+    /* Reset Gf usage monitors */
     memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
   } else {
diff --git a/media/libvpx/libvpx/vp8/encoder/segmentation.h b/media/libvpx/libvpx/vp8/encoder/segmentation.h
index 1395a34118..0fecfc2212 100644
--- a/media/libvpx/libvpx/vp8/encoder/segmentation.h
+++ b/media/libvpx/libvpx/vp8/encoder/segmentation.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_SEGMENTATION_H_
-#define VP8_ENCODER_SEGMENTATION_H_
+#ifndef VPX_VP8_ENCODER_SEGMENTATION_H_
+#define VPX_VP8_ENCODER_SEGMENTATION_H_
 
 #include "string.h"
 #include "vp8/common/blockd.h"
@@ -19,11 +19,11 @@
 extern "C" {
 #endif
 
-extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm,
-                                      MACROBLOCK *x);
+extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm,
+                                     MACROBLOCK *x);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_SEGMENTATION_H_
+#endif  // VPX_VP8_ENCODER_SEGMENTATION_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/temporal_filter.c b/media/libvpx/libvpx/vp8/encoder/temporal_filter.c
index 1b2f46bb69..1c1a55fde6 100644
--- a/media/libvpx/libvpx/vp8/encoder/temporal_filter.c
+++ b/media/libvpx/libvpx/vp8/encoder/temporal_filter.c
@@ -20,6 +20,7 @@
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
 #include "segmentation.h"
+#include "temporal_filter.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
@@ -157,7 +158,8 @@ static int vp8_temporal_filter_find_matching_mb_c(VP8_COMP *cpi,
   /* Ignore mv costing by sending NULL cost arrays */
   bestsme =
       vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv, step_param, sadpb,
-                     &cpi->fn_ptr[BLOCK_16X16], NULL, NULL, &best_ref_mv1);
+                     &cpi->fn_ptr[BLOCK_16X16], NULL, &best_ref_mv1);
+  (void)bestsme;  // Ignore unused return value.
 
 #if ALT_REF_SUBPEL_ENABLED
   /* Try sub-pixel MC? */
diff --git a/media/libvpx/libvpx/vp8/encoder/temporal_filter.h b/media/libvpx/libvpx/vp8/encoder/temporal_filter.h
new file mode 100644
index 0000000000..fd39f5cb87
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/encoder/temporal_filter.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_ENCODER_TEMPORAL_FILTER_H_
+#define VPX_VP8_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VPX_VP8_ENCODER_TEMPORAL_FILTER_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.c b/media/libvpx/libvpx/vp8/encoder/tokenize.c
index ca5f0e3d89..c3d7026607 100644
--- a/media/libvpx/libvpx/vp8/encoder/tokenize.c
+++ b/media/libvpx/libvpx/vp8/encoder/tokenize.c
@@ -19,10 +19,6 @@
 /* Global event counters used for accumulating statistics across several
    compressions, then generating context.c = initial stats. */
 
-#ifdef VP8_ENTROPY_STATS
-_int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                       [MAX_ENTROPY_TOKENS];
-#endif
 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
 void vp8_fix_contexts(MACROBLOCKD *x);
 
@@ -383,72 +379,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) {
   tokenize1st_order_b(x, t, plane_type, cpi);
 }
 
-#ifdef VP8_ENTROPY_STATS
-
-void init_context_counters(void) {
-  memset(context_counters, 0, sizeof(context_counters));
-}
-
-void print_context_counters() {
-  int type, band, pt, t;
-
-  FILE *const f = fopen("context.c", "w");
-
-  fprintf(f, "#include \"entropy.h\"\n");
-
-  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-
-  fprintf(f,
-          "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] "
-          "[MAX_ENTROPY_TOKENS];\n\n");
-
-  fprintf(f,
-          "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] "
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-
-#define Comma(X) (X ? "," : "")
-
-  type = 0;
-
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-
-    band = 0;
-
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-
-      pt = 0;
-
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-
-        t = 0;
-
-        do {
-          const _int64 x = context_counters[type][band][pt][t];
-          const int y = (int)x;
-
-          assert(x == (_int64)y); /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-
-      fprintf(f, "\n    }");
-
-    } while (++band < COEF_BANDS);
-
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
-
-  fprintf(f, "\n};\n");
-  fclose(f);
-}
-#endif
-
 static void stuff2nd_order_b(TOKENEXTRA **tp, ENTROPY_CONTEXT *a,
                              ENTROPY_CONTEXT *l, VP8_COMP *cpi, MACROBLOCK *x) {
   int pt;              /* near block/prev token context index */
diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h
index e5dbdfc5af..5223aa2d86 100644
--- a/media/libvpx/libvpx/vp8/encoder/tokenize.h
+++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_TOKENIZE_H_
-#define VP8_ENCODER_TOKENIZE_H_
+#ifndef VPX_VP8_ENCODER_TOKENIZE_H_
+#define VPX_VP8_ENCODER_TOKENIZE_H_
 
 #include "vp8/common/entropy.h"
 #include "block.h"
@@ -18,8 +18,6 @@
 extern "C" {
 #endif
 
-void vp8_tokenize_initialize();
-
 typedef struct {
   short Token;
   short Extra;
@@ -34,14 +32,6 @@ typedef struct {
 
 int rd_cost_mby(MACROBLOCKD *);
 
-#ifdef VP8_ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern _int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                              [MAX_ENTROPY_TOKENS];
-#endif
-
 extern const short *const vp8_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
@@ -53,4 +43,4 @@ extern const TOKENVALUE *const vp8_dct_value_tokens_ptr;
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_TOKENIZE_H_
+#endif  // VPX_VP8_ENCODER_TOKENIZE_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/treewriter.h b/media/libvpx/libvpx/vp8/encoder/treewriter.h
index dadbbe3f80..4e9ed6af17 100644
--- a/media/libvpx/libvpx/vp8/encoder/treewriter.h
+++ b/media/libvpx/libvpx/vp8/encoder/treewriter.h
@@ -8,12 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP8_ENCODER_TREEWRITER_H_
-#define VP8_ENCODER_TREEWRITER_H_
+#ifndef VPX_VP8_ENCODER_TREEWRITER_H_
+#define VPX_VP8_ENCODER_TREEWRITER_H_
 
 /* Trees map alphabets into huffman-like codes suitable for an arithmetic
    bit coder.  Timothy S Murphy  11 October 2004 */
 
+#include <stdint.h>
+
 #include "./vpx_config.h"
 #include "vp8/common/treecoder.h"
 
@@ -48,7 +50,9 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2],
                                            vp8_prob p) {
   /* Imitate existing calculation */
 
-  return ((ct[0] * vp8_cost_zero(p)) + (ct[1] * vp8_cost_one(p))) >> 8;
+  return (unsigned int)(((((uint64_t)ct[0]) * vp8_cost_zero(p)) +
+                         (((uint64_t)ct[1]) * vp8_cost_one(p))) >>
+                        8);
 }
 
 /* Small functions to write explicit values and tokens, as well as
@@ -56,8 +60,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2],
 
 static void vp8_treed_write(vp8_writer *const w, vp8_tree t,
                             const vp8_prob *const p, int v,
-                            int n /* number of bits in v, assumed nonzero */
-                            ) {
+                            int n) { /* number of bits in v, assumed nonzero */
   vp8_tree_index i = 0;
 
   do {
@@ -73,8 +76,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t,
 }
 
 static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v,
-                          int n /* number of bits in v, assumed nonzero */
-                          ) {
+                          int n) { /* number of bits in v, assumed nonzero */
   int c = 0;
   vp8_tree_index i = 0;
 
@@ -93,12 +95,12 @@ static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p,
 
 /* Fill array of costs for all possible token values. */
 
-void vp8_cost_tokens(int *Costs, const vp8_prob *, vp8_tree);
+void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree);
 
-void vp8_cost_tokens2(int *Costs, const vp8_prob *, vp8_tree, int);
+void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP8_ENCODER_TREEWRITER_H_
+#endif  // VPX_VP8_ENCODER_TREEWRITER_H_
diff --git a/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c b/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c
index ff6e04eaad..8b9b22babe 100644
--- a/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c
+++ b/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c
@@ -9,6 +9,9 @@
  */
 
 #include <math.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/bitops.h"
 #include "vpx_mem/vpx_mem.h"
 
 #include "onyx_int.h"
@@ -162,10 +165,10 @@ static const int qzbin_factors_y2[129] = {
 static void invert_quant(int improved_quant, short *quant, short *shift,
                          short d) {
   if (improved_quant) {
-    unsigned t;
+    unsigned int t;
     int l, m;
-    t = d;
-    for (l = 0; t > 1; ++l) t >>= 1;
+    t = (unsigned int)d;
+    l = get_msb(t);
     m = 1 + (1 << (16 + l)) / d;
     *quant = (short)(m - (1 << 16));
     *shift = l;
@@ -174,8 +177,6 @@ static void invert_quant(int improved_quant, short *quant, short *shift,
   } else {
     *quant = (1 << 16) / d;
     *shift = 0;
-    /* use multiplication and constant shift by 16 */
-    *shift = 1 << (16 - *shift);
   }
 }
 
@@ -296,7 +297,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) {
   /* Select the baseline MB Q index. */
   if (xd->segmentation_enabled) {
     /* Abs Value */
-    if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) {
+    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
       QIndex = xd->segment_feature_data[MB_LVL_ALT_Q]
                                        [xd->mode_info_context->mbmi.segment_id];
       /* Delta Value */
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/encodeopt.asm b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm
similarity index 97%
rename from media/libvpx/libvpx/vp8/encoder/x86/encodeopt.asm
rename to media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm
index 0297220ee1..200b4ccfe6 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/encodeopt.asm
+++ b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm
@@ -11,8 +11,10 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;int vp8_block_error_sse2(short *coeff_ptr,  short *dcoef_ptr)
-global sym(vp8_block_error_sse2) PRIVATE
+globalsym(vp8_block_error_sse2)
 sym(vp8_block_error_sse2):
     push        rbp
     mov         rbp, rsp
@@ -60,7 +62,7 @@ sym(vp8_block_error_sse2):
     ret
 
 ;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-global sym(vp8_mbblock_error_sse2_impl) PRIVATE
+globalsym(vp8_mbblock_error_sse2_impl)
 sym(vp8_mbblock_error_sse2_impl):
     push        rbp
     mov         rbp, rsp
@@ -130,7 +132,7 @@ sym(vp8_mbblock_error_sse2_impl):
 
 
 ;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr);
-global sym(vp8_mbuverror_sse2_impl) PRIVATE
+globalsym(vp8_mbuverror_sse2_impl)
 sym(vp8_mbuverror_sse2_impl):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/copy_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm
similarity index 98%
rename from media/libvpx/libvpx/vp8/common/x86/copy_sse2.asm
rename to media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm
index 86fae26956..fe78da398e 100644
--- a/media/libvpx/libvpx/vp8/common/x86/copy_sse2.asm
+++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm
@@ -11,6 +11,7 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
 
 ;void vp8_copy32xn_sse2(
 ;    unsigned char *src_ptr,
@@ -18,7 +19,7 @@
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    int height);
-global sym(vp8_copy32xn_sse2) PRIVATE
+globalsym(vp8_copy32xn_sse2)
 sym(vp8_copy32xn_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/common/x86/copy_sse3.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm
similarity index 99%
rename from media/libvpx/libvpx/vp8/common/x86/copy_sse3.asm
rename to media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm
index d789a40ccf..c40b2d8bf6 100644
--- a/media/libvpx/libvpx/vp8/common/x86/copy_sse3.asm
+++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm
@@ -83,6 +83,7 @@
     ret
 %endmacro
 
+SECTION .text
 
 ;void vp8_copy32xn_sse3(
 ;    unsigned char *src_ptr,
@@ -90,7 +91,7 @@
 ;    unsigned char *dst_ptr,
 ;    int  dst_stride,
 ;    int height);
-global sym(vp8_copy32xn_sse3) PRIVATE
+globalsym(vp8_copy32xn_sse3)
 sym(vp8_copy32xn_sse3):
 
     STACK_FRAME_CREATE_X3
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm
index d06bca5927..3c28cb902e 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm
+++ b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm
@@ -60,8 +60,10 @@
     ret
 %endmacro
 
+SECTION .text
+
 ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2) PRIVATE
+globalsym(vp8_short_fdct4x4_sse2)
 sym(vp8_short_fdct4x4_sse2):
 
     STACK_FRAME_CREATE
@@ -166,7 +168,7 @@ sym(vp8_short_fdct4x4_sse2):
     STACK_FRAME_DESTROY
 
 ;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct8x4_sse2) PRIVATE
+globalsym(vp8_short_fdct8x4_sse2)
 sym(vp8_short_fdct8x4_sse2):
 
     STACK_FRAME_CREATE
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c
index 89cad53356..f35b930169 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c
+++ b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
       _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
   const __m128i hgfedcba =
       _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
-  unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba));
+  unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba));
 
   return sum_diff;
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
index f4989279f4..938fc173ff 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
+++ b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
@@ -11,8 +11,10 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_walsh4x4_sse2) PRIVATE
+globalsym(vp8_short_walsh4x4_sse2)
 sym(vp8_short_walsh4x4_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm b/media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm
deleted file mode 100644
index 2864ce16d9..0000000000
--- a/media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm
+++ /dev/null
@@ -1,286 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp8_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0     ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
-        ; following is kept as reference
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c
index 6f2c163492..4c2d24cc27 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c
+++ b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c
@@ -12,27 +12,17 @@
 
 #include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
-#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+#include "vpx_ports/bitops.h" /* get_lsb */
+#include "vpx_ports/compiler_attributes.h"
 
-#define SELECT_EOB(i, z, x, y, q)         \
-  do {                                    \
-    short boost = *zbin_boost_ptr;        \
-    short x_z = _mm_extract_epi16(x, z);  \
-    short y_z = _mm_extract_epi16(y, z);  \
-    int cmp = (x_z < boost) | (y_z == 0); \
-    zbin_boost_ptr++;                     \
-    if (cmp) break;                       \
-    q = _mm_insert_epi16(q, y_z, z);      \
-    eob = i;                              \
-    zbin_boost_ptr = b->zrun_zbin_boost;  \
-  } while (0)
-
-void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
-  char eob = 0;
+// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask.
+VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b,
+                                                               BLOCKD *d) {
+  int eob = -1;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
-
-  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0,
-      dqcoeff1;
+  __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
+  __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8));
+  __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
@@ -46,22 +36,20 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
-  __m128i qcoeff0 = _mm_setzero_si128();
-  __m128i qcoeff1 = _mm_setzero_si128();
+  __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1;
+  uint32_t mask, ymask;
+  DECLARE_ALIGNED(16, static const uint8_t,
+                  zig_zag_mask[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                        9, 12, 13, 10, 7, 11, 14, 15 };
+  DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 };
 
   /* Duplicate to all lanes. */
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
   zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
 
-  /* Sign of z: z >> 15 */
-  sz0 = _mm_srai_epi16(z0, 15);
-  sz1 = _mm_srai_epi16(z1, 15);
-
-  /* x = abs(z): (z ^ sz) - sz */
-  x0 = _mm_xor_si128(z0, sz0);
-  x1 = _mm_xor_si128(z1, sz1);
-  x0 = _mm_sub_epi16(x0, sz0);
-  x1 = _mm_sub_epi16(x1, sz1);
+  /* x = abs(z) */
+  x0 = _mm_abs_epi16(z0);
+  x1 = _mm_abs_epi16(z1);
 
   /* zbin[] + zbin_extra */
   zbin0 = _mm_add_epi16(zbin0, zbin_extra);
@@ -89,29 +77,56 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   y0 = _mm_mulhi_epi16(y0, quant_shift0);
   y1 = _mm_mulhi_epi16(y1, quant_shift1);
 
-  /* Return the sign: (y ^ sz) - sz */
-  y0 = _mm_xor_si128(y0, sz0);
-  y1 = _mm_xor_si128(y1, sz1);
-  y0 = _mm_sub_epi16(y0, sz0);
-  y1 = _mm_sub_epi16(y1, sz1);
+  /* Restore the sign. */
+  y0 = _mm_sign_epi16(y0, z0);
+  y1 = _mm_sign_epi16(y1, z1);
 
-  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
-  SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+  {
+    const __m128i zig_zag_i16_0 =
+        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
+    const __m128i zig_zag_i16_1 =
+        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
+
+    /* The first part of the zig zag needs a value
+     * from x_minus_zbin1 and vice versa. */
+    t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2);
+    t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80);
+    t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80);
+    x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0);
+    x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1);
+  }
+
+  /* Check if y is nonzero and put it in zig zag order. */
+  t0 = _mm_packs_epi16(y0, y1);
+  t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128());
+  t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask));
+  ymask = _mm_movemask_epi8(t0) ^ 0xffff;
+
+  for (;;) {
+    t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0);
+    t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1);
+    t0 = _mm_packs_epi16(t0, t1);
+    mask = _mm_movemask_epi8(t0);
+    mask = ~mask & ymask;
+    if (!mask) break;
+    /* |eob| will contain the index of the next found element where:
+     * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */
+    eob = get_lsb(mask);
+    /* Need to clear the mask from processed elements so that
+     * they are no longer counted in the next iteration. */
+    ymask &= ~1U << eob;
+    /* It's safe to read ahead of this buffer if struct VP8_COMP has at
+     * least 32 bytes before the zrun_zbin_boost_* fields (it has 384).
+     * Any data read outside of the buffer is masked by the updated |ymask|. */
+    zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1));
+    zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7));
+    qcoeff[zig_zag_mask[eob]] = 0xffff;
+  }
+
+  qcoeff0 = _mm_load_si128((__m128i *)(qcoeff));
+  qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8));
+  qcoeff0 = _mm_and_si128(qcoeff0, y0);
+  qcoeff1 = _mm_and_si128(qcoeff1, y1);
 
   _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
   _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
@@ -122,5 +137,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
   _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
 
-  *d->eob = eob;
+  *d->eob = eob + 1;
 }
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index bd92b398a0..67102064a1 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -11,6 +11,8 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ; void vp8_temporal_filter_apply_sse2 | arg
 ;  (unsigned char  *frame1,           |  0
 ;   unsigned int    stride,           |  1
@@ -20,7 +22,7 @@
 ;   int             filter_weight,    |  5
 ;   unsigned int   *accumulator,      |  6
 ;   unsigned short *count)            |  7
-global sym(vp8_temporal_filter_apply_sse2) PRIVATE
+globalsym(vp8_temporal_filter_apply_sse2)
 sym(vp8_temporal_filter_apply_sse2):
 
     push        rbp
@@ -203,5 +205,5 @@ align 16
 _const_top_bit:
     times 8 dw 1<<15
 align 16
-_const_16w
+_const_16w:
     times 8 dw 16
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
deleted file mode 100644
index 4406dd0cc4..0000000000
--- a/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/block.h"
-
-int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-                                 short *qcoeff_ptr, short *dequant_ptr,
-                                 const short *scan_mask, short *round_ptr,
-                                 short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) {
-  const short *scan_mask = vp8_default_zig_zag_mask;
-  short *coeff_ptr = b->coeff;
-  short *zbin_ptr = b->zbin;
-  short *round_ptr = b->round;
-  short *quant_ptr = b->quant_fast;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-
-  *d->eob = (char)vp8_fast_quantize_b_impl_mmx(
-      coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask,
-
-      round_ptr, quant_ptr, dqcoeff_ptr);
-}
diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_ssse3.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
similarity index 76%
rename from media/libvpx/libvpx/vp8/encoder/x86/quantize_ssse3.c
rename to media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
index 322f0a151f..f6df146f08 100644
--- a/media/libvpx/libvpx/vp8/encoder/x86/quantize_ssse3.c
+++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
@@ -10,32 +10,9 @@
 
 #include <tmmintrin.h> /* SSSE3 */
 
+#include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
-
-/* bitscan reverse (bsr) */
-#if defined(_MSC_VER)
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-static int bsr(int mask) {
-  unsigned long eob;
-  _BitScanReverse(&eob, mask);
-  eob++;
-  if (mask == 0) eob = 0;
-  return eob;
-}
-#else
-static int bsr(int mask) {
-  int eob;
-#if defined(__GNUC__) && __GNUC__
-  __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
-#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-  asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
-#endif
-  eob++;
-  if (mask == 0) eob = 0;
-  return eob;
-}
-#endif
+#include "vpx_ports/bitops.h" /* get_msb */
 
 void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
   int eob, mask;
@@ -51,9 +28,9 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
 
   __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;
 
-  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = {
-    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
-  };
+  DECLARE_ALIGNED(16, const uint8_t,
+                  pshufb_zig_zag_mask[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                               9, 12, 13, 10, 7, 11, 14, 15 };
   __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
 
   /* sign of z: z >> 15 */
@@ -107,7 +84,10 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
 
   mask = _mm_movemask_epi8(x);
 
-  eob = bsr(mask);
+  /* x2 is needed to increase the result from non-zero masks by 1,
+   * +1 is needed to mask undefined behavior for a null argument,
+   * the result of get_msb(1) is 0 */
+  eob = get_msb(mask * 2 + 1);
 
-  *d->eob = 0xFF & eob;
+  *d->eob = eob;
 }
diff --git a/media/libvpx/libvpx/vp8/vp8_common.mk b/media/libvpx/libvpx/vp8/vp8_common.mk
index 137f5bb627..d485965d3d 100644
--- a/media/libvpx/libvpx/vp8/vp8_common.mk
+++ b/media/libvpx/libvpx/vp8/vp8_common.mk
@@ -15,7 +15,6 @@ VP8_COMMON_SRCS-yes += common/onyxd.h
 VP8_COMMON_SRCS-yes += common/alloccommon.c
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
-VP8_COMMON_SRCS-yes += common/copy_c.c
 # VP8_COMMON_SRCS-yes += common/debugmodes.c
 VP8_COMMON_SRCS-yes += common/default_coef_probs.h
 VP8_COMMON_SRCS-yes += common/dequantize.c
@@ -70,10 +69,8 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
 
 VP8_COMMON_SRCS-yes += common/treecoder.c
 
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP8_COMMON_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += common/x86/loopfilter_x86.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
@@ -82,21 +79,20 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
 VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
 
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
 endif
 
-ifeq ($(ARCH_X86_64),yes)
+ifeq ($(VPX_ARCH_X86_64),yes)
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2_x86_64.asm
 endif
 
@@ -116,20 +112,32 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
 
+# common (c)
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c
+
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c
+
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_loopfilter_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
index 4a3e8f2feb..d2b7184271 100644
--- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
@@ -8,22 +8,31 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/static_assert.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_timestamp.h"
+#if CONFIG_MULTITHREAD
+#include "vp8/encoder/ethreading.h"
+#endif
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
 #include "vp8/common/onyx.h"
 #include "vp8/common/common.h"
-#include <stdlib.h>
-#include <string.h>
 
 struct vp8_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
@@ -74,6 +83,9 @@ struct vpx_codec_alg_priv {
   vpx_codec_priv_t base;
   vpx_codec_enc_cfg_t cfg;
   struct vp8_extracfg vp8_cfg;
+  vpx_rational64_t timestamp_ratio;
+  vpx_codec_pts_t pts_offset;
+  unsigned char pts_offset_initialized;
   VP8_CONFIG oxcf;
   struct VP8_COMP *cpi;
   unsigned char *cx_data;
@@ -87,13 +99,16 @@ struct vpx_codec_alg_priv {
   vpx_enc_frame_flags_t control_frame_flags;
 };
 
+// Called by vp8e_set_config() and vp8e_encode() only. Must not be called
+// by vp8e_init() because the `error` paramerer (cpi->common.error) will be
+// destroyed by vpx_codec_enc_init_ver() after vp8e_init() returns an error.
+// See the "IMPORTANT" comment in vpx_codec_enc_init_ver().
 static vpx_codec_err_t update_error_state(
     vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
-  vpx_codec_err_t res;
+  const vpx_codec_err_t res = error->error_code;
 
-  if ((res = error->error_code)) {
+  if (res != VPX_CODEC_OK)
     ctx->base.err_detail = error->has_detail ? error->detail : NULL;
-  }
 
   return res;
 }
@@ -105,10 +120,10 @@ static vpx_codec_err_t update_error_state(
     return VPX_CODEC_INVALID_PARAM; \
   } while (0)
 
-#define RANGE_CHECK(p, memb, lo, hi)                                 \
-  do {                                                               \
-    if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
-      ERROR(#memb " out of range [" #lo ".." #hi "]");               \
+#define RANGE_CHECK(p, memb, lo, hi)                                     \
+  do {                                                                   \
+    if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range [" #lo ".." #hi "]");                   \
   } while (0)
 
 #define RANGE_CHECK_HI(p, memb, hi)                                     \
@@ -146,8 +161,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
 #endif
   RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
-  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
-  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+  RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+  RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
 
@@ -251,6 +266,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
     ERROR("g_threads cannot be bigger than number of token partitions");
 #endif
 
+  // The range below shall be further tuned.
+  RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1);
+  RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, zm_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000);
+
   return VPX_CODEC_OK;
 }
 
@@ -259,10 +291,11 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12: break;
+    case VPX_IMG_FMT_NV12: break;
     default:
-      ERROR("Invalid image format. Only YV12 and I420 images are supported");
+      ERROR(
+          "Invalid image format. Only YV12, I420 and NV12 images are "
+          "supported");
   }
 
   if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
@@ -315,7 +348,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
     oxcf->end_usage = USAGE_CONSTANT_QUALITY;
   }
 
-  oxcf->target_bandwidth = cfg.rc_target_bitrate;
+  // Cap the target rate to 1000 Mbps to avoid some integer overflows in
+  // target bandwidth calculations.
+  oxcf->target_bandwidth = VPXMIN(cfg.rc_target_bitrate, 1000000);
   oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
   oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct;
 
@@ -362,8 +397,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
   if (mr_cfg) {
     oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions;
     oxcf->mr_encoder_id = mr_cfg->mr_encoder_id;
-    oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num;
-    oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
+    oxcf->mr_down_sampling_factor = mr_cfg->mr_down_sampling_factor;
     oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info;
   }
 #else
@@ -371,6 +405,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
 #endif
 
   oxcf->cpu_used = vp8_cfg.cpu_used;
+  if (cfg.g_pass == VPX_RC_FIRST_PASS) {
+    oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used);
+  }
   oxcf->encode_breakout = vp8_cfg.static_thresh;
   oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
   oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
@@ -445,14 +482,29 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
     ERROR("Cannot increase lag_in_frames");
 
   res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
+  if (res != VPX_CODEC_OK) return res;
 
-  if (!res) {
-    ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
-    vp8_change_config(ctx->cpi, &ctx->oxcf);
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    const vpx_codec_err_t codec_err =
+        update_error_state(ctx, &ctx->cpi->common.error);
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    assert(codec_err != VPX_CODEC_OK);
+    return codec_err;
   }
 
-  return res;
+  ctx->cpi->common.error.setjmp = 1;
+  ctx->cfg = *cfg;
+  set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+  vp8_change_config(ctx->cpi, &ctx->oxcf);
+#if CONFIG_MULTITHREAD
+  if (vp8cx_create_encoder_threads(ctx->cpi)) {
+    ctx->cpi->common.error.setjmp = 0;
+    return VPX_CODEC_ERROR;
+  }
+#endif
+  ctx->cpi->common.error.setjmp = 0;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) {
@@ -484,6 +536,9 @@ static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) {
   struct vp8_extracfg extra_cfg = ctx->vp8_cfg;
   extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args);
+  // Use fastest speed setting (speed 16 or -16) if it's set beyond the range.
+  extra_cfg.cpu_used = VPXMIN(16, extra_cfg.cpu_used);
+  extra_cfg.cpu_used = VPXMAX(-16, extra_cfg.cpu_used);
   return update_extracfg(ctx, &extra_cfg);
 }
 
@@ -575,9 +630,21 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
   return update_extracfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  VP8_COMP *cpi = ctx->cpi;
+  const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args);
+  if (data) {
+    cpi->cyclic_refresh_mode_enabled = 0;
+    cpi->rt_always_update_correction_factor = 1;
+    cpi->rt_drop_recode_on_overshoot = 0;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
                                          void **mem_loc) {
-  vpx_codec_err_t res = 0;
+  vpx_codec_err_t res = VPX_CODEC_OK;
 
 #if CONFIG_MULTI_RES_ENCODING
   LOWER_RES_FRAME_INFO *shared_mem_loc;
@@ -586,12 +653,13 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
 
   shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO));
   if (!shared_mem_loc) {
-    res = VPX_CODEC_MEM_ERROR;
+    return VPX_CODEC_MEM_ERROR;
   }
 
   shared_mem_loc->mb_info =
       calloc(mb_rows * mb_cols, sizeof(LOWER_RES_MB_INFO));
   if (!(shared_mem_loc->mb_info)) {
+    free(shared_mem_loc);
     res = VPX_CODEC_MEM_ERROR;
   } else {
     *mem_loc = (void *)shared_mem_loc;
@@ -599,11 +667,22 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
   }
 #else
   (void)cfg;
-  (void)mem_loc;
+  *mem_loc = NULL;
 #endif
   return res;
 }
 
+static void vp8e_mr_free_mem(void *mem_loc) {
+#if CONFIG_MULTI_RES_ENCODING
+  LOWER_RES_FRAME_INFO *shared_mem_loc = (LOWER_RES_FRAME_INFO *)mem_loc;
+  free(shared_mem_loc->mb_info);
+  free(mem_loc);
+#else
+  (void)mem_loc;
+  assert(!mem_loc);
+#endif
+}
+
 static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
                                  vpx_codec_priv_enc_mr_cfg_t *mr_cfg) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -641,6 +720,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
     priv->cx_data = malloc(priv->cx_data_sz);
 
     if (!priv->cx_data) {
+      priv->cx_data_sz = 0;
       return VPX_CODEC_MEM_ERROR;
     }
 
@@ -650,14 +730,30 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
       ctx->priv->enc.total_encoders = 1;
     }
 
-    once(vp8_initialize_enc);
+    vp8_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
 
     if (!res) {
+      priv->pts_offset_initialized = 0;
+      priv->timestamp_ratio.den = priv->cfg.g_timebase.den;
+      priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num;
+      priv->timestamp_ratio.num *= TICKS_PER_SEC;
+      reduce_ratio(&priv->timestamp_ratio);
+
       set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg);
       priv->cpi = vp8_create_compressor(&priv->oxcf);
-      if (!priv->cpi) res = VPX_CODEC_MEM_ERROR;
+      if (!priv->cpi) {
+#if CONFIG_MULTI_RES_ENCODING
+        // Release ownership of mr_cfg->mr_low_res_mode_info on failure. This
+        // prevents ownership confusion with the caller and avoids a double
+        // free when vpx_codec_destroy() is called on this instance.
+        priv->oxcf.mr_total_resolutions = 0;
+        priv->oxcf.mr_encoder_id = 0;
+        priv->oxcf.mr_low_res_mode_info = NULL;
+#endif
+        res = VPX_CODEC_MEM_ERROR;
+      }
     }
   }
 
@@ -669,10 +765,7 @@ static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
   /* Free multi-encoder shared memory */
   if (ctx->oxcf.mr_total_resolutions > 0 &&
       (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions - 1)) {
-    LOWER_RES_FRAME_INFO *shared_mem_loc =
-        (LOWER_RES_FRAME_INFO *)ctx->oxcf.mr_low_res_mode_info;
-    free(shared_mem_loc->mb_info);
-    free(ctx->oxcf.mr_low_res_mode_info);
+    vp8e_mr_free_mem(ctx->oxcf.mr_low_res_mode_info);
   }
 #endif
 
@@ -709,9 +802,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   return res;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
-                                    unsigned long duration,
-                                    unsigned long deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+                                               unsigned long duration,
+                                               vpx_enc_deadline_t deadline) {
   int new_qc;
 
 #if !(CONFIG_REALTIME_ONLY)
@@ -719,12 +812,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
   new_qc = MODE_BESTQUALITY;
 
   if (deadline) {
-    uint64_t duration_us;
-
     /* Convert duration parameter from stream timebase to microseconds */
-    duration_us = (uint64_t)duration * 1000000 *
-                  (uint64_t)ctx->cfg.g_timebase.num /
-                  (uint64_t)ctx->cfg.g_timebase.den;
+    VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
+                      (TICKS_PER_SEC % 1000000) == 0);
+
+    if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) {
+      ERROR("duration is too big");
+    }
+    uint64_t duration_us =
+        duration * (uint64_t)ctx->timestamp_ratio.num /
+        ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000));
 
     /* If the deadline is more that the duration this frame is to be shown,
      * use good quality mode. Otherwise use realtime mode.
@@ -750,6 +847,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
     ctx->oxcf.Mode = new_qc;
     vp8_change_config(ctx->cpi, &ctx->oxcf);
   }
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
@@ -798,17 +896,33 @@ static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
                                    const vpx_image_t *img, vpx_codec_pts_t pts,
                                    unsigned long duration,
-                                   vpx_enc_frame_flags_t flags,
-                                   unsigned long deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
+                                   vpx_enc_frame_flags_t enc_flags,
+                                   vpx_enc_deadline_t deadline) {
+  volatile vpx_codec_err_t res = VPX_CODEC_OK;
+  // Make a copy as volatile to avoid -Wclobbered with longjmp.
+  volatile vpx_enc_frame_flags_t flags = enc_flags;
+  volatile vpx_codec_pts_t pts_val = pts;
 
-  if (!ctx->cfg.rc_target_bitrate) return res;
+  if (!ctx->cfg.rc_target_bitrate) {
+#if CONFIG_MULTI_RES_ENCODING
+    if (!ctx->cpi) return VPX_CODEC_ERROR;
+    if (ctx->cpi->oxcf.mr_total_resolutions > 1) {
+      LOWER_RES_FRAME_INFO *low_res_frame_info =
+          (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info;
+      if (!low_res_frame_info) return VPX_CODEC_ERROR;
+      low_res_frame_info->skip_encoding_prev_stream = 1;
+      if (ctx->cpi->oxcf.mr_encoder_id == 0)
+        low_res_frame_info->skip_encoding_base_stream = 1;
+    }
+#endif
+    return res;
+  }
 
   if (img) res = validate_img(ctx, img);
 
   if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
 
-  pick_quickcompress_mode(ctx, duration, deadline);
+  if (!res) res = pick_quickcompress_mode(ctx, duration, deadline);
   vpx_codec_pkt_list_init(&ctx->pkt_list);
 
   // If no flags are set in the encode call, then use the frame flags as
@@ -829,20 +943,37 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     }
   }
 
-  /* Initialize the encoder instance on the first frame*/
+  /* Initialize the encoder instance on the first frame */
   if (!res && ctx->cpi) {
     unsigned int lib_flags;
-    YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp, dst_end_time_stamp;
     size_t size, cx_data_sz;
     unsigned char *cx_data;
     unsigned char *cx_data_end;
     int comp_data_state = 0;
 
-    /* Set up internal flags */
-    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) {
-      ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+    if (setjmp(ctx->cpi->common.error.jmp)) {
+      ctx->cpi->common.error.setjmp = 0;
+      res = update_error_state(ctx, &ctx->cpi->common.error);
+      vpx_clear_system_state();
+      return res;
     }
+    ctx->cpi->common.error.setjmp = 1;
+
+    // Per-frame PSNR is not supported when g_lag_in_frames is greater than 0.
+    if ((flags & VPX_EFLAG_CALCULATE_PSNR) && ctx->cfg.g_lag_in_frames != 0) {
+      vpx_internal_error(
+          &ctx->cpi->common.error, VPX_CODEC_INCAPABLE,
+          "Cannot calculate per-frame PSNR when g_lag_in_frames is nonzero");
+    }
+    /* Set up internal flags */
+#if CONFIG_INTERNAL_STATS
+    assert(((VP8_COMP *)ctx->cpi)->b_calculate_psnr == 1);
+#else
+    ((VP8_COMP *)ctx->cpi)->b_calculate_psnr =
+        (ctx->base.init_flags & VPX_CODEC_USE_PSNR) ||
+        (flags & VPX_EFLAG_CALCULATE_PSNR);
+#endif
 
     if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION) {
       ((VP8_COMP *)ctx->cpi)->output_partition = 1;
@@ -851,28 +982,50 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
     /* Convert API flags to internal codec lib flags */
     lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
-    /* vp8 use 10,000,000 ticks/second as time stamp */
-    dst_time_stamp =
-        pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
-                         ctx->cfg.g_timebase.den;
-
     if (img != NULL) {
+      YV12_BUFFER_CONFIG sd;
+
+      if (!ctx->pts_offset_initialized) {
+        ctx->pts_offset = pts_val;
+        ctx->pts_offset_initialized = 1;
+      }
+      if (pts_val < ctx->pts_offset) {
+        vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "pts is smaller than initial pts");
+      }
+      pts_val -= ctx->pts_offset;
+      if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) {
+        vpx_internal_error(
+            &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts to ticks would overflow");
+      }
+      dst_time_stamp =
+          pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+#if ULONG_MAX > INT64_MAX
+      if (duration > INT64_MAX) {
+        vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "duration is too big");
+      }
+#endif
+      if (pts_val > INT64_MAX - (int64_t)duration) {
+        vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "relative pts + duration is too big");
+      }
+      vpx_codec_pts_t pts_end = pts_val + (int64_t)duration;
+      if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) {
+        vpx_internal_error(
+            &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts + duration to ticks would overflow");
+      }
+      dst_end_time_stamp =
+          pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den;
+
       res = image2yuvconfig(img, &sd);
 
-      if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
-        /* from vpx_encoder.h for g_w/g_h:
-           "Note that the frames passed as input to the encoder must have this
-           resolution"
-        */
-        ctx->base.err_detail = "Invalid input frame resolution";
-        res = VPX_CODEC_INVALID_PARAM;
-      } else {
-        if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
-                                  &sd, dst_time_stamp, dst_end_time_stamp)) {
-          VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
-          res = update_error_state(ctx, &cpi->common.error);
-        }
+      if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
+                                dst_time_stamp, dst_end_time_stamp)) {
+        VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+        res = update_error_state(ctx, &cpi->common.error);
       }
 
       /* reset for next frame */
@@ -890,6 +1043,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
           &dst_end_time_stamp, !img);
 
       if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) {
+        ctx->cpi->common.error.setjmp = 0;
         return VPX_CODEC_CORRUPT_FRAME;
       } else if (comp_data_state == -1) {
         break;
@@ -901,16 +1055,21 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
         VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
 
         /* Add the frame packet to the list of returned packets. */
-        round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1;
+        round = (vpx_codec_pts_t)ctx->timestamp_ratio.num / 2;
+        if (round > 0) --round;
         delta = (dst_end_time_stamp - dst_time_stamp);
         pkt.kind = VPX_CODEC_CX_FRAME_PKT;
         pkt.data.frame.pts =
-            (dst_time_stamp * ctx->cfg.g_timebase.den + round) /
-            ctx->cfg.g_timebase.num / 10000000;
+            (dst_time_stamp * ctx->timestamp_ratio.den + round) /
+                ctx->timestamp_ratio.num +
+            ctx->pts_offset;
         pkt.data.frame.duration =
-            (unsigned long)((delta * ctx->cfg.g_timebase.den + round) /
-                            ctx->cfg.g_timebase.num / 10000000);
+            (unsigned long)((delta * ctx->timestamp_ratio.den + round) /
+                            ctx->timestamp_ratio.num);
         pkt.data.frame.flags = lib_flags << 16;
+        pkt.data.frame.width[0] = cpi->common.Width;
+        pkt.data.frame.height[0] = cpi->common.Height;
+        pkt.data.frame.spatial_layer_encoded[0] = 1;
 
         if (lib_flags & FRAMEFLAGS_KEY) {
           pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
@@ -925,9 +1084,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
            * Invisible frames have no duration.
            */
           pkt.data.frame.pts =
-              ((cpi->last_time_stamp_seen * ctx->cfg.g_timebase.den + round) /
-               ctx->cfg.g_timebase.num / 10000000) +
-              1;
+              ((cpi->last_time_stamp_seen * ctx->timestamp_ratio.den + round) /
+               ctx->timestamp_ratio.num) +
+              ctx->pts_offset + 1;
           pkt.data.frame.duration = 0;
         }
 
@@ -974,6 +1133,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
         }
       }
     }
+    ctx->cpi->common.error.setjmp = 0;
   }
 
   return res;
@@ -1139,8 +1299,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
   if (data) {
     int res;
     vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode,
-                                (VPX_SCALING)scalemode.v_scaling_mode);
+    res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
+                                scalemode.v_scaling_mode);
 
     if (!res) {
       /*force next frame a key frame to effect scaling mode */
@@ -1179,13 +1339,14 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct },
   { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode },
   { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+  { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
   { -1, NULL },
 };
 
 static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
   { 0,
     {
-        0, /* g_usage */
+        0, /* g_usage (unused) */
         0, /* g_threads */
         0, /* g_profile */
 
@@ -1206,13 +1367,13 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
         0,  /* rc_resize_allowed */
         1,  /* rc_scaled_width */
         1,  /* rc_scaled_height */
-        60, /* rc_resize_down_thresold */
-        30, /* rc_resize_up_thresold */
+        60, /* rc_resize_down_thresh */
+        30, /* rc_resize_up_thresh */
 
         VPX_VBR,     /* rc_end_usage */
         { NULL, 0 }, /* rc_twopass_stats_in */
         { NULL, 0 }, /* rc_firstpass_mb_stats_in */
-        256,         /* rc_target_bandwidth */
+        256,         /* rc_target_bitrate */
         4,           /* rc_min_quantizer */
         63,          /* rc_max_quantizer */
         100,         /* rc_undershoot_pct */
@@ -1225,6 +1386,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
         50,  /* rc_two_pass_vbrbias  */
         0,   /* rc_two_pass_vbrmin_section */
         400, /* rc_two_pass_vbrmax_section */
+        0,   // rc_2pass_vbr_corpus_complexity (only has meaningfull for VP9)
 
         /* keyframing settings (kf) */
         VPX_KF_AUTO, /* g_kfmode*/
@@ -1233,14 +1395,30 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
 
         VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
         { 0 },
-        { 0 }, /* ss_target_bitrate */
-        1,     /* ts_number_layers */
-        { 0 }, /* ts_target_bitrate */
-        { 0 }, /* ts_rate_decimator */
-        0,     /* ts_periodicity */
-        { 0 }, /* ts_layer_id */
-        { 0 }, /* layer_target_bitrate */
-        0      /* temporal_layering_mode */
+        { 0 },    /* ss_target_bitrate */
+        1,        /* ts_number_layers */
+        { 0 },    /* ts_target_bitrate */
+        { 0 },    /* ts_rate_decimator */
+        0,        /* ts_periodicity */
+        { 0 },    /* ts_layer_id */
+        { 0 },    /* layer_target_bitrate */
+        0,        /* temporal_layering_mode */
+        0,        /* use_vizier_rc_params */
+        { 1, 1 }, /* active_wq_factor */
+        { 1, 1 }, /* err_per_mb_factor */
+        { 1, 1 }, /* sr_default_decay_limit */
+        { 1, 1 }, /* sr_diff_factor */
+        { 1, 1 }, /* kf_err_per_mb_factor */
+        { 1, 1 }, /* kf_frame_min_boost_factor */
+        { 1, 1 }, /* kf_frame_max_boost_first_factor */
+        { 1, 1 }, /* kf_frame_max_boost_subs_factor */
+        { 1, 1 }, /* kf_max_total_boost_factor */
+        { 1, 1 }, /* gf_max_total_boost_factor */
+        { 1, 1 }, /* gf_frame_max_boost_factor */
+        { 1, 1 }, /* zm_factor */
+        { 1, 1 }, /* rd_mult_inter_qp_fac */
+        { 1, 1 }, /* rd_mult_arf_qp_fac */
+        { 1, 1 }, /* rd_mult_key_qp_fac */
     } },
 };
 
@@ -1267,6 +1445,10 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = {
       vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    cfg_maps; */
       vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
       vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   get_cx_data; */
-      vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem,
+      vp8e_set_config,
+      NULL,
+      vp8e_get_preview,
+      vp8e_mr_alloc_mem,
+      vp8e_mr_free_mem,
   } /* encoder functions */
 };
diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
index 3cc3f92f90..65a86e2dc7 100644
--- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c
@@ -20,6 +20,7 @@
 #include "vpx_version.h"
 #include "common/alloccommon.h"
 #include "common/common.h"
+#include "common/onyxc_int.h"
 #include "common/onyxd.h"
 #include "decoder/onyxd_int.h"
 #include "vpx_dsp/vpx_dsp_common.h"
@@ -38,13 +39,19 @@ typedef vpx_codec_stream_info_t vp8_stream_info_t;
 
 /* Structures for handling memory allocations */
 typedef enum { VP8_SEG_ALG_PRIV = 256, VP8_SEG_MAX } mem_seg_id_t;
-#define NELEMENTS(x) ((int)(sizeof(x) / sizeof(x[0])))
+#define NELEMENTS(x) ((int)(sizeof(x) / sizeof((x)[0])))
 
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t base;
   vpx_codec_dec_cfg_t cfg;
   vp8_stream_info_t si;
   int decoder_init;
+#if CONFIG_MULTITHREAD
+  // Restart threads on next frame if set to 1.
+  // This is set when error happens in multithreaded decoding and all threads
+  // are shut down.
+  int restart_threads;
+#endif
   int postproc_cfg_set;
   vp8_postproc_cfg_t postproc_cfg;
   vpx_decrypt_cb decrypt_cb;
@@ -80,7 +87,6 @@ static int vp8_init_ctx(vpx_codec_ctx_t *ctx) {
 static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
                                 vpx_codec_priv_enc_mr_cfg_t *data) {
   vpx_codec_err_t res = VPX_CODEC_OK;
-  vpx_codec_alg_priv_t *priv = NULL;
   (void)data;
 
   vp8_rtcd();
@@ -92,7 +98,10 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
    * information becomes known.
    */
   if (!ctx->priv) {
+    vpx_codec_alg_priv_t *priv;
+
     if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR;
+
     priv = (vpx_codec_alg_priv_t *)ctx->priv;
 
     /* initialize number of fragments to zero */
@@ -102,8 +111,6 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
         (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
 
     /*post processing level initialized to do nothing */
-  } else {
-    priv = (vpx_codec_alg_priv_t *)ctx->priv;
   }
 
   return res;
@@ -144,8 +151,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
     }
     si->is_kf = 0;
 
-    if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */
-    {
+    if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */
       si->is_kf = 1;
 
       /* vet via sync code */
@@ -157,7 +163,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
       si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
 
       /*printf("w=%d, h=%d\n", si->w, si->h);*/
-      if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME;
+      if (!(si->h && si->w)) {
+        si->w = si->h = 0;
+        res = VPX_CODEC_CORRUPT_FRAME;
+      }
     } else {
       res = VPX_CODEC_UNSUP_BITSTREAM;
     }
@@ -201,9 +210,9 @@ static vpx_codec_err_t update_error_state(
 static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
                             void *user_priv) {
   /** vpx_img_wrap() doesn't allow specifying independent strides for
-    * the Y, U, and V planes, nor other alignment adjustments that
-    * might be representable by a YV12_BUFFER_CONFIG, so we just
-    * initialize all the fields.*/
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.*/
   img->fmt = VPX_IMG_FMT_I420;
   img->w = yv12->y_stride;
   img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
@@ -228,7 +237,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
 }
 
 static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
-                            unsigned int data_sz, vpx_codec_err_t *res) {
+                            unsigned int data_sz,
+                            volatile vpx_codec_err_t *res) {
   *res = VPX_CODEC_OK;
 
   if (ctx->fragments.count == 0) {
@@ -240,14 +250,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
     /* Store a pointer to this fragment and return. We haven't
      * received the complete frame yet, so we will wait with decoding.
      */
-    ctx->fragments.ptrs[ctx->fragments.count] = data;
-    ctx->fragments.sizes[ctx->fragments.count] = data_sz;
-    ctx->fragments.count++;
-    if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) {
+    if (ctx->fragments.count >= MAX_PARTITIONS) {
       ctx->fragments.count = 0;
       *res = VPX_CODEC_INVALID_PARAM;
       return -1;
     }
+    ctx->fragments.ptrs[ctx->fragments.count] = data;
+    ctx->fragments.sizes[ctx->fragments.count] = data_sz;
+    ctx->fragments.count++;
     return 0;
   }
 
@@ -266,10 +276,10 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
 
 static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t *data, unsigned int data_sz,
-                                  void *user_priv, long deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
-  unsigned int resolution_change = 0;
-  unsigned int w, h;
+                                  void *user_priv) {
+  volatile vpx_codec_err_t res;
+  volatile unsigned int resolution_change = 0;
+  volatile unsigned int w, h;
 
   if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) {
     return 0;
@@ -295,9 +305,39 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
   }
 
   if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM;
+  if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 &&
+      ctx->si.w == 0) {
+    VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+    assert(pbi != NULL);
+    assert(!pbi->common.error.setjmp);
+    res = VPX_CODEC_CORRUPT_FRAME;
+    vpx_internal_error(&pbi->common.error, res,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
 
   if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1;
 
+#if CONFIG_MULTITHREAD
+  if (!res && ctx->restart_threads) {
+    VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+    VP8_COMMON *const pc = &pbi->common;
+    if (setjmp(pbi->common.error.jmp)) {
+      pbi->common.error.setjmp = 0;
+      vp8_decoder_remove_threads(pbi);
+      vpx_clear_system_state();
+      return VPX_CODEC_ERROR;
+    }
+    pbi->common.error.setjmp = 1;
+    pbi->max_threads = ctx->cfg.threads;
+    vp8_decoder_create_threads(pbi);
+    if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
+      vp8mt_alloc_temp_buffers(pbi, pc->Width, pc->mb_rows);
+    }
+    ctx->restart_threads = 0;
+    pbi->common.error.setjmp = 0;
+  }
+#endif
   /* Initialize the decoder instance on the first frame*/
   if (!res && !ctx->decoder_init) {
     VP8D_CONFIG oxcf;
@@ -322,7 +362,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
     }
 
     res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf);
-    if (res == VPX_CODEC_OK) ctx->decoder_init = 1;
+    if (res == VPX_CODEC_OK) {
+      ctx->decoder_init = 1;
+    } else {
+      /* on failure clear the cached resolution to ensure a full
+       * reallocation is attempted on resync. */
+      ctx->si.w = 0;
+      ctx->si.h = 0;
+    }
   }
 
   /* Set these even if already initialized.  The caller may have changed the
@@ -335,8 +382,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
 
   if (!res) {
     VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+    VP8_COMMON *const pc = &pbi->common;
     if (resolution_change) {
-      VP8_COMMON *const pc = &pbi->common;
       MACROBLOCKD *const xd = &pbi->mb;
 #if CONFIG_MULTITHREAD
       int i;
@@ -344,8 +391,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
       pc->Width = ctx->si.w;
       pc->Height = ctx->si.h;
       {
-        int prev_mb_rows = pc->mb_rows;
-
         if (setjmp(pbi->common.error.jmp)) {
           pbi->common.error.setjmp = 0;
           /* on failure clear the cached resolution to ensure a full
@@ -371,6 +416,12 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
                              "Invalid frame height");
         }
 
+#if CONFIG_MULTITHREAD
+        if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
+          vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows);
+        }
+#endif
+
         if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) {
           vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                              "Failed to allocate frame buffers");
@@ -414,11 +465,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
 #endif
 
 #if CONFIG_MULTITHREAD
-        if (pbi->b_multithreaded_rd) {
-          vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+        if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
+          vp8mt_alloc_temp_buffers(pbi, pc->Width, 0);
         }
-#else
-        (void)prev_mb_rows;
 #endif
       }
 
@@ -428,16 +477,44 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
       pbi->common.fb_idx_ref_cnt[0] = 0;
     }
 
+    if (setjmp(pbi->common.error.jmp)) {
+      vpx_clear_system_state();
+      /* We do not know if the missing frame(s) was supposed to update
+       * any of the reference buffers, but we act conservative and
+       * mark only the last buffer as corrupted.
+       */
+      pc->yv12_fb[pc->lst_fb_idx].corrupted = 1;
+
+      if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
+        pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
+      }
+      pbi->common.error.setjmp = 0;
+#if CONFIG_MULTITHREAD
+      if (pbi->restart_threads) {
+        ctx->si.w = 0;
+        ctx->si.h = 0;
+        ctx->restart_threads = 1;
+      }
+#endif
+      res = update_error_state(ctx, &pbi->common.error);
+      return res;
+    }
+
+    pbi->common.error.setjmp = 1;
+
     /* update the pbi fragment data */
     pbi->fragments = ctx->fragments;
-
+#if CONFIG_MULTITHREAD
+    pbi->restart_threads = 0;
+#endif
     ctx->user_priv = user_priv;
-    if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline)) {
+    if (vp8dx_receive_compressed_data(pbi)) {
       res = update_error_state(ctx, &pbi->common.error);
     }
 
     /* get ready for the next series of fragments */
     ctx->fragments.count = 0;
+    pbi->common.error.setjmp = 0;
   }
 
   return res;
@@ -452,7 +529,6 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
    */
   if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) {
     YV12_BUFFER_CONFIG sd;
-    int64_t time_stamp = 0, time_end_stamp = 0;
     vp8_ppflags_t flags;
     vp8_zero(flags);
 
@@ -462,8 +538,7 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
       flags.noise_level = ctx->postproc_cfg.noise_level;
     }
 
-    if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd,
-                                 &time_stamp, &time_end_stamp, &flags)) {
+    if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, &flags)) {
       yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
 
       img = &ctx->img;
@@ -535,6 +610,16 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx,
   }
 }
 
+static vpx_codec_err_t vp8_get_quantizer(vpx_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  int *const arg = va_arg(args, int *);
+  VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  if (pbi == NULL) return VPX_CODEC_CORRUPT_FRAME;
+  *arg = vp8dx_get_quantizer(pbi);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
 #if CONFIG_POSTPROC
@@ -561,6 +646,7 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
 
   if (update_info) {
     VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
+    if (pbi == NULL) return VPX_CODEC_CORRUPT_FRAME;
 
     *update_info = pbi->common.refresh_alt_ref_frame * (int)VP8_ALTR_FRAME +
                    pbi->common.refresh_golden_frame * (int)VP8_GOLD_FRAME +
@@ -572,20 +658,22 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
   }
 }
 
-extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame);
 static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
                                               va_list args) {
   int *ref_info = va_arg(args, int *);
 
   if (ref_info) {
     VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0];
-    VP8_COMMON *oci = &pbi->common;
-    *ref_info =
-        (vp8dx_references_buffer(oci, ALTREF_FRAME) ? VP8_ALTR_FRAME : 0) |
-        (vp8dx_references_buffer(oci, GOLDEN_FRAME) ? VP8_GOLD_FRAME : 0) |
-        (vp8dx_references_buffer(oci, LAST_FRAME) ? VP8_LAST_FRAME : 0);
-
-    return VPX_CODEC_OK;
+    if (pbi) {
+      VP8_COMMON *oci = &pbi->common;
+      *ref_info =
+          (vp8dx_references_buffer(oci, ALTREF_FRAME) ? VP8_ALTR_FRAME : 0) |
+          (vp8dx_references_buffer(oci, GOLDEN_FRAME) ? VP8_GOLD_FRAME : 0) |
+          (vp8dx_references_buffer(oci, LAST_FRAME) ? VP8_LAST_FRAME : 0);
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_CORRUPT_FRAME;
+    }
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
@@ -620,13 +708,14 @@ static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
-vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = {
+static vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = {
   { VP8_SET_REFERENCE, vp8_set_reference },
   { VP8_COPY_REFERENCE, vp8_get_reference },
   { VP8_SET_POSTPROC, vp8_set_postproc },
   { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates },
   { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted },
   { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame },
+  { VPXD_GET_LAST_QUANTIZER, vp8_get_quantizer },
   { VPXD_SET_DECRYPTOR, vp8_set_decryptor },
   { -1, NULL },
 };
@@ -658,6 +747,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = {
       NULL,    /* vpx_codec_enc_config_set_fn_t */
       NULL,    /* vpx_codec_get_global_headers_fn_t */
       NULL,    /* vpx_codec_get_preview_frame_fn_t */
-      NULL     /* vpx_codec_enc_mr_get_mem_loc_fn_t */
+      NULL,    /* vpx_codec_enc_mr_get_mem_loc_fn_t */
+      NULL     /* vpx_codec_enc_mr_free_mem_loc_fn_t */
   }
 };
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
new file mode 100644
index 0000000000..312092f190
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -0,0 +1,440 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/vp8_ratectrl_rtc.h"
+
+#include <math.h>
+
+#include <new>
+
+#include "vp8/common/common.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/ratectrl.h"
+#include "vpx_ports/system_state.h"
+
+namespace libvpx {
+/* Quant MOD */
+static const int kQTrans[] = {
+  0,  1,  2,  3,  4,  5,  7,   8,   9,   10,  12,  13,  15,  17,  18,  19,
+  20, 21, 23, 24, 25, 26, 27,  28,  29,  30,  31,  33,  35,  37,  39,  41,
+  43, 45, 47, 49, 51, 53, 55,  57,  59,  61,  64,  67,  70,  73,  76,  79,
+  82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
+  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,  8,  9,  9,  10, 10,
+  10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16,
+  16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+  22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30
+};
+
+static const unsigned char inter_minq[QINDEX_RANGE] = {
+  0,  0,  1,  1,  2,  3,  3,  4,  4,  5,  6,  6,  7,  8,  8,  9,  9,  10, 11,
+  11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24,
+  24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+  39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53,
+  54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+  70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86,
+  87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
+};
+
+static int rescale(int val, int num, int denom) {
+  int64_t llnum = num;
+  int64_t llden = denom;
+  int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
+
+std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
+    const VP8RateControlRtcConfig &cfg) {
+  std::unique_ptr<VP8RateControlRTC> rc_api(new (std::nothrow)
+                                                VP8RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<VP8_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  vp8_zero(*rc_api->cpi_);
+
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
+
+  return rc_api;
+}
+
+VP8RateControlRTC::~VP8RateControlRTC() {
+  if (cpi_) {
+    vpx_free(cpi_->gf_active_flags);
+    vpx_free(cpi_);
+  }
+}
+
+bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+  cpi_->pass = 0;
+  cm->show_frame = 1;
+  oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  cpi_->auto_gold = 1;
+  cpi_->key_frame_count = 1;
+  cpi_->rate_correction_factor = 1.0;
+  cpi_->key_frame_rate_correction_factor = 1.0;
+  cpi_->cyclic_refresh_mode_enabled = 0;
+  cpi_->auto_worst_q = 1;
+  cpi_->kf_overspend_bits = 0;
+  cpi_->kf_bitrate_adjustment = 0;
+  cpi_->gf_overspend_bits = 0;
+  cpi_->non_gf_bitrate_adjustment = 0;
+  if (!UpdateRateControl(rc_cfg)) return false;
+  cpi_->buffer_level = oxcf->starting_buffer_level;
+  cpi_->bits_off_target = oxcf->starting_buffer_level;
+  return true;
+}
+
+bool VP8RateControlRTC::UpdateRateControl(
+    const VP8RateControlRtcConfig &rc_cfg) {
+  if (rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) {
+    return false;
+  }
+
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  const unsigned int prev_number_of_layers = oxcf->number_of_layers;
+  vpx_clear_system_state();
+  cm->Width = rc_cfg.width;
+  cm->Height = rc_cfg.height;
+  oxcf->Width = rc_cfg.width;
+  oxcf->Height = rc_cfg.height;
+  oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer];
+  oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer];
+  cpi_->worst_quality = oxcf->worst_allowed_q;
+  cpi_->best_quality = oxcf->best_allowed_q;
+  cpi_->output_framerate = rc_cfg.framerate;
+  oxcf->target_bandwidth =
+      static_cast<unsigned int>(1000 * rc_cfg.target_bandwidth);
+  cpi_->ref_framerate = cpi_->output_framerate;
+  oxcf->fixed_q = -1;
+  oxcf->error_resilient_mode = 1;
+  oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz;
+  oxcf->starting_buffer_level = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size = rc_cfg.buf_sz;
+  oxcf->number_of_layers = rc_cfg.ts_number_layers;
+  cpi_->buffered_mode = oxcf->optimal_buffer_level > 0;
+  oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1;
+  cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) {
+    cpi_->prior_key_frame_distance[i] =
+        static_cast<int>(cpi_->output_framerate);
+  }
+  oxcf->screen_content_mode = rc_cfg.is_screen;
+  if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) {
+    memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
+           sizeof(rc_cfg.layer_target_bitrate));
+    memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
+           sizeof(rc_cfg.ts_rate_decimator));
+    if (cm->current_video_frame == 0) {
+      double prev_layer_framerate = 0;
+      for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+        vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+        prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+      }
+    } else if (oxcf->number_of_layers != prev_number_of_layers) {
+      // The number of temporal layers has changed, so reset/initialize the
+      // temporal layer context for the new layer configuration: this means
+      // calling vp8_reset_temporal_layer_change() below.
+
+      // Start at the base of the pattern cycle, so set the layer id to 0 and
+      // reset the temporal pattern counter.
+      // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls
+      // the layer_id) so remove.
+      if (cpi_->temporal_layer_id > 0) {
+        cpi_->temporal_layer_id = 0;
+      }
+      cpi_->temporal_pattern_counter = 0;
+
+      vp8_reset_temporal_layer_change(cpi_, oxcf,
+                                      static_cast<int>(prev_number_of_layers));
+    }
+  }
+
+  cpi_->total_actual_bits = 0;
+  cpi_->total_target_vs_actual = 0;
+
+  cm->mb_rows = cm->Height >> 4;
+  cm->mb_cols = cm->Width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;
+
+  // For temporal layers: starting/maximum/optimal_buffer_level is already set
+  // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change().
+  if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) {
+    oxcf->starting_buffer_level =
+        rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+    /* Set or reset optimal and maximum buffer levels. */
+    if (oxcf->optimal_buffer_level == 0) {
+      oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+    } else {
+      oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level,
+                                           oxcf->target_bandwidth, 1000);
+    }
+    if (oxcf->maximum_buffer_size == 0) {
+      oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+    } else {
+      oxcf->maximum_buffer_size =
+          rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+    }
+  }
+
+  if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
+    cpi_->bits_off_target = oxcf->maximum_buffer_size;
+    cpi_->buffer_level = cpi_->bits_off_target;
+  }
+
+  vp8_new_framerate(cpi_, cpi_->framerate);
+  vpx_clear_system_state();
+  return true;
+}
+
+FrameDropDecision VP8RateControlRTC::ComputeQP(
+    const VP8FrameParamsQpRTC &frame_params) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  if (cpi_->oxcf.number_of_layers > 1) {
+    cpi_->temporal_layer_id = frame_params.temporal_layer_id;
+    const int layer = frame_params.temporal_layer_id;
+    vp8_update_layer_contexts(cpi_);
+    /* Restore layer specific context & set frame rate */
+    vp8_restore_layer_context(cpi_, layer);
+    vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
+  }
+  cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+  cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {
+    cpi_->common.frame_flags |= FRAMEFLAGS_KEY;
+  }
+
+  cpi_->per_frame_bandwidth = static_cast<int>(
+      round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate));
+  if (vp8_check_drop_buffer(cpi_)) {
+    if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+    return FrameDropDecision::kDrop;
+  }
+
+  if (!vp8_pick_frame_size(cpi_)) {
+    cm->current_video_frame++;
+    cpi_->frames_since_key++;
+    cpi_->ext_refresh_frame_flags_pending = 0;
+    if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+    return FrameDropDecision::kDrop;
+  }
+
+  if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level &&
+      cpi_->buffered_mode) {
+    /* Max adjustment is 1/4 */
+    int Adjustment = cpi_->active_worst_quality / 4;
+    if (Adjustment) {
+      int buff_lvl_step;
+      if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) {
+        buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size -
+                               cpi_->oxcf.optimal_buffer_level) /
+                              Adjustment);
+        if (buff_lvl_step) {
+          Adjustment =
+              (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) /
+                    buff_lvl_step);
+        } else {
+          Adjustment = 0;
+        }
+      }
+      cpi_->active_worst_quality -= Adjustment;
+      if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+        cpi_->active_worst_quality = cpi_->active_best_quality;
+      }
+    }
+  }
+
+  if (cpi_->ni_frames > 150) {
+    int q = cpi_->active_worst_quality;
+    if (cm->frame_type == KEY_FRAME) {
+      cpi_->active_best_quality = kf_high_motion_minq[q];
+    } else {
+      cpi_->active_best_quality = inter_minq[q];
+    }
+
+    if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) {
+      cpi_->active_best_quality = cpi_->best_quality;
+
+    } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) {
+      int Fraction =
+          (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) /
+                (cpi_->oxcf.maximum_buffer_size -
+                 cpi_->oxcf.optimal_buffer_level));
+      int min_qadjustment =
+          ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128;
+
+      cpi_->active_best_quality -= min_qadjustment;
+    }
+  }
+
+  /* Clip the active best and worst quality values to limits */
+  if (cpi_->active_worst_quality > cpi_->worst_quality) {
+    cpi_->active_worst_quality = cpi_->worst_quality;
+  }
+  if (cpi_->active_best_quality < cpi_->best_quality) {
+    cpi_->active_best_quality = cpi_->best_quality;
+  }
+  if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+    cpi_->active_worst_quality = cpi_->active_best_quality;
+  }
+
+  q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target);
+  vp8_set_quantizer(cpi_, q_);
+  vpx_clear_system_state();
+  return FrameDropDecision::kOk;
+}
+
+int VP8RateControlRTC::GetQP() const { return q_; }
+
+UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const {
+  VP8_COMMON *cm = &cpi_->common;
+  UVDeltaQP uv_delta_q;
+  uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q;
+  uv_delta_q.uvac_delta_q = cm->uvac_delta_q;
+  return uv_delta_q;
+}
+
+int VP8RateControlRTC::GetLoopfilterLevel() const {
+  VP8_COMMON *cm = &cpi_->common;
+  const double qp = q_;
+
+  // This model is from linear regression
+  if (cm->Width * cm->Height <= 320 * 240) {
+    cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774);
+  } else if (cm->Width * cm->Height <= 640 * 480) {
+    cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462);
+  } else {
+    cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003);
+  }
+
+  int min_filter_level = 0;
+  // This logic is from get_min_filter_level() in picklpf.c
+  if (q_ > 6 && q_ <= 16) {
+    min_filter_level = 1;
+  } else {
+    min_filter_level = (q_ / 8);
+  }
+
+  const int max_filter_level = 63;
+  if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level;
+  if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level;
+
+  return cm->filter_level;
+}
+
+void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  cpi_->total_byte_count += encoded_frame_size;
+  cpi_->projected_frame_size = static_cast<int>(encoded_frame_size << 3);
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      cpi_->layer_context[i].total_byte_count += encoded_frame_size;
+    }
+  }
+
+  vp8_update_rate_correction_factors(cpi_, 2);
+
+  cpi_->last_q[cm->frame_type] = cm->base_qindex;
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp8_adjust_key_frame_context(cpi_);
+  }
+
+  /* Keep a record of ambient average Q. */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->avg_frame_qindex =
+        (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2;
+  }
+  /* Keep a record from which we can calculate the average Q excluding
+   * key frames.
+   */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->ni_frames++;
+    /* Damp value for first few frames */
+    if (cpi_->ni_frames > 150) {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames);
+    } else {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi =
+          ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2;
+    }
+
+    /* If the average Q is higher than what was used in the last
+     * frame (after going through the recode loop to keep the frame
+     * size within range) then use the last frame value - 1. The -1
+     * is designed to stop Q and hence the data rate, from
+     * progressively falling away during difficult sections, but at
+     * the same time reduce the number of itterations around the
+     * recode loop.
+     */
+    if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1;
+  }
+
+  cpi_->bits_off_target +=
+      cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size;
+  if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) {
+    cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size;
+  }
+
+  cpi_->total_actual_bits += cpi_->projected_frame_size;
+  cpi_->buffer_level = cpi_->bits_off_target;
+
+  /* Propagate values to higher temporal layers */
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      LAYER_CONTEXT *lc = &cpi_->layer_context[i];
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size);
+
+      lc->bits_off_target += bits_off_for_this_layer;
+
+      /* Clip buffer level to maximum buffer size for the layer */
+      if (lc->bits_off_target > lc->maximum_buffer_size) {
+        lc->bits_off_target = lc->maximum_buffer_size;
+      }
+
+      lc->total_actual_bits += cpi_->projected_frame_size;
+      lc->total_target_vs_actual += bits_off_for_this_layer;
+      lc->buffer_level = lc->bits_off_target;
+    }
+  }
+
+  cpi_->common.current_video_frame++;
+  cpi_->frames_since_key++;
+
+  if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+  vpx_clear_system_state();
+}
+}  // namespace libvpx
diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
new file mode 100644
index 0000000000..b458b5ce65
--- /dev/null
+++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h
@@ -0,0 +1,66 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_RATECTRL_RTC_H_
+#define VPX_VP8_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "vpx/internal/vpx_ratectrl_rtc.h"
+
+struct VP8_COMP;
+
+namespace libvpx {
+struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
+  VP8RateControlRtcConfig() {
+    memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+    memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
+  }
+};
+
+struct VP8FrameParamsQpRTC {
+  RcFrameType frame_type;
+  int temporal_layer_id;
+};
+
+class VP8RateControlRTC {
+ public:
+  static std::unique_ptr<VP8RateControlRTC> Create(
+      const VP8RateControlRtcConfig &cfg);
+  ~VP8RateControlRTC();
+
+  bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest
+  // delta QP for UV.
+  UVDeltaQP GetUVDeltaQP() const;
+  // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter
+  // level is calculated from frame qp.
+  int GetLoopfilterLevel() const;
+  // ComputeQP computes the QP if the frame is not dropped (kOk return),
+  // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate
+  // are not to be called.
+  FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  VP8RateControlRTC() = default;
+  bool InitRateControl(const VP8RateControlRtcConfig &cfg);
+  struct VP8_COMP *cpi_ = nullptr;
+  int q_ = -1;
+};
+
+}  // namespace libvpx
+
+#endif  // VPX_VP8_RATECTRL_RTC_H_
diff --git a/media/libvpx/libvpx/vp8/vp8cx.mk b/media/libvpx/libvpx/vp8/vp8cx.mk
index 7bd41a3fb7..b4b3fda9ea 100644
--- a/media/libvpx/libvpx/vp8/vp8cx.mk
+++ b/media/libvpx/libvpx/vp8/vp8cx.mk
@@ -23,6 +23,7 @@ VP8_CX_SRCS-yes += vp8_cx_iface.c
 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
 VP8_CX_SRCS-yes += encoder/bitstream.c
 VP8_CX_SRCS-yes += encoder/boolhuff.c
+VP8_CX_SRCS-yes += encoder/copy_c.c
 VP8_CX_SRCS-yes += encoder/dct.c
 VP8_CX_SRCS-yes += encoder/encodeframe.c
 VP8_CX_SRCS-yes += encoder/encodeframe.h
@@ -30,6 +31,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c
 VP8_CX_SRCS-yes += encoder/encodemb.c
 VP8_CX_SRCS-yes += encoder/encodemv.c
 VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h
 VP8_CX_SRCS-yes += encoder/firstpass.c
 VP8_CX_SRCS-yes += encoder/block.h
 VP8_CX_SRCS-yes += encoder/boolhuff.h
@@ -56,11 +58,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c
 VP8_CX_SRCS-yes += encoder/onyx_if.c
 VP8_CX_SRCS-yes += encoder/pickinter.c
 VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/picklpf.h
 VP8_CX_SRCS-yes += encoder/vp8_quantize.c
 VP8_CX_SRCS-yes += encoder/ratectrl.c
 VP8_CX_SRCS-yes += encoder/rdopt.c
 VP8_CX_SRCS-yes += encoder/segmentation.c
 VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-yes += common/vp8_skin_detection.c
+VP8_CX_SRCS-yes += common/vp8_skin_detection.h
 VP8_CX_SRCS-yes += encoder/tokenize.c
 VP8_CX_SRCS-yes += encoder/dct_value_cost.h
 VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
@@ -68,29 +73,31 @@ VP8_CX_SRCS-yes += encoder/treewriter.c
 VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
 VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
 VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.h
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
 VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
 VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h
 endif
 
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
-VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
 endif
 
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/block_error_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
@@ -106,6 +113,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c
+
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
 endif
@@ -114,4 +124,9 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c
+
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
new file mode 100644
index 0000000000..b43d7fa4f9
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
@@ -0,0 +1,446 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Use macros to make sure argument lane is passed in as a constant integer.
+
+#define vmull_lane_s32_dual(in, c, lane, out)                          \
+  do {                                                                 \
+    out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane);  \
+    out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane);  \
+    out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \
+    out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \
+  } while (0)
+
+#define vmlal_lane_s32_dual(in, c, lane, out)                             \
+  do {                                                                    \
+    out[0].val[0] =                                                       \
+        vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane);  \
+    out[0].val[1] =                                                       \
+        vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane);  \
+    out[1].val[0] =                                                       \
+        vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \
+    out[1].val[1] =                                                       \
+        vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \
+  } while (0)
+
+#define vmlsl_lane_s32_dual(in, c, lane, out)                             \
+  do {                                                                    \
+    out[0].val[0] =                                                       \
+        vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane);  \
+    out[0].val[1] =                                                       \
+        vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane);  \
+    out[1].val[0] =                                                       \
+        vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \
+    out[1].val[1] =                                                       \
+        vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \
+  } while (0)
+
+static INLINE int32x4x2_t
+highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) {
+  int32x4x2_t out;
+  out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS),
+                            vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS));
+  out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS),
+                            vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS));
+  return out;
+}
+
+#define highbd_iadst_half_butterfly(in, c, lane, out) \
+  do {                                                \
+    int64x2x2_t _t[2];                                \
+    vmull_lane_s32_dual(in, c, lane, _t);             \
+    out = highbd_dct_const_round_shift_low_8(_t);     \
+  } while (0)
+
+#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \
+  do {                                                            \
+    vmull_lane_s32_dual(in0, c, lane0, s0);                       \
+    vmull_lane_s32_dual(in0, c, lane1, s1);                       \
+    vmlal_lane_s32_dual(in1, c, lane1, s0);                       \
+    vmlsl_lane_s32_dual(in1, c, lane0, s1);                       \
+  } while (0)
+
+static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0,
+                                         const int32x4x2_t in1) {
+  int32x4x2_t out;
+  out.val[0] = vaddq_s32(in0.val[0], in1.val[0]);
+  out.val[1] = vaddq_s32(in0.val[1], in1.val[1]);
+  return out;
+}
+
+static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0,
+                                         const int64x2x2_t in1) {
+  int64x2x2_t out;
+  out.val[0] = vaddq_s64(in0.val[0], in1.val[0]);
+  out.val[1] = vaddq_s64(in0.val[1], in1.val[1]);
+  return out;
+}
+
+static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0,
+                                         const int32x4x2_t in1) {
+  int32x4x2_t out;
+  out.val[0] = vsubq_s32(in0.val[0], in1.val[0]);
+  out.val[1] = vsubq_s32(in0.val[1], in1.val[1]);
+  return out;
+}
+
+static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0,
+                                         const int64x2x2_t in1) {
+  int64x2x2_t out;
+  out.val[0] = vsubq_s64(in0.val[0], in1.val[0]);
+  out.val[1] = vsubq_s64(in0.val[1], in1.val[1]);
+  return out;
+}
+
+static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0,
+                                            const int32x2x2_t in1) {
+  int32x4x2_t out;
+  out.val[0] = vcombine_s32(in0.val[0], in1.val[0]);
+  out.val[1] = vcombine_s32(in0.val[1], in1.val[1]);
+  return out;
+}
+
+static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8(
+    const int64x2x2_t *const in0, const int64x2x2_t *const in1) {
+  const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]);
+  const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]);
+  int32x2x2_t out_lo, out_hi;
+
+  out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS);
+  out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS);
+  out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS);
+  out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS);
+  return vcombine_s32_dual(out_lo, out_hi);
+}
+
+static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8(
+    const int64x2x2_t *const in0, const int64x2x2_t *const in1) {
+  const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]);
+  const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]);
+  int32x2x2_t out_lo, out_hi;
+
+  out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS);
+  out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS);
+  out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS);
+  out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS);
+  return vcombine_s32_dual(out_lo, out_hi);
+}
+
+static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) {
+  int32x4x2_t out;
+  out.val[0] = vnegq_s32(in.val[0]);
+  out.val[1] = vnegq_s32(in.val[1]);
+  return out;
+}
+
+static void highbd_iadst16_neon(const int32_t *input, int32_t *output,
+                                uint16_t *dest, const int stride,
+                                const int bd) {
+  const int32x4_t c_1_31_5_27 =
+      create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64);
+  const int32x4_t c_9_23_13_19 =
+      create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64);
+  const int32x4_t c_17_15_21_11 =
+      create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64);
+  const int32x4_t c_25_7_29_3 =
+      create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64);
+  const int32x4_t c_4_28_20_12 =
+      create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64);
+  const int32x4_t c_16_n16_8_24 =
+      create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64);
+  int32x4x2_t in[16], out[16];
+  int32x4x2_t x[16], t[12];
+  int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+  int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+
+  // Load input (16x8)
+  in[0].val[0] = vld1q_s32(input);
+  in[0].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[8].val[0] = vld1q_s32(input);
+  in[8].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[1].val[0] = vld1q_s32(input);
+  in[1].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[9].val[0] = vld1q_s32(input);
+  in[9].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[2].val[0] = vld1q_s32(input);
+  in[2].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[10].val[0] = vld1q_s32(input);
+  in[10].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[3].val[0] = vld1q_s32(input);
+  in[3].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[11].val[0] = vld1q_s32(input);
+  in[11].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[4].val[0] = vld1q_s32(input);
+  in[4].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[12].val[0] = vld1q_s32(input);
+  in[12].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[5].val[0] = vld1q_s32(input);
+  in[5].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[13].val[0] = vld1q_s32(input);
+  in[13].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[6].val[0] = vld1q_s32(input);
+  in[6].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[14].val[0] = vld1q_s32(input);
+  in[14].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[7].val[0] = vld1q_s32(input);
+  in[7].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[15].val[0] = vld1q_s32(input);
+  in[15].val[1] = vld1q_s32(input + 4);
+
+  // Transpose
+  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // stage 1
+  highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1);
+  highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3);
+  highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5);
+  highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7);
+  highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9);
+  highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10,
+                         s11);
+  highbd_iadst_butterfly(x[12], x[13], vget_low_s32(c_25_7_29_3), 0, 1, s12,
+                         s13);
+  highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14,
+                         s15);
+
+  x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8);
+  x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9);
+  x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10);
+  x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11);
+  x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12);
+  x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13);
+  x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14);
+  x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15);
+  x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8);
+  x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9);
+  x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10);
+  x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11);
+  x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12);
+  x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13);
+  x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14);
+  x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15);
+
+  // stage 2
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9);
+  highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10,
+                         s11);
+  highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13,
+                         s12);
+  highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15,
+                         s14);
+
+  x[0] = vaddq_s32_dual(t[0], t[4]);
+  x[1] = vaddq_s32_dual(t[1], t[5]);
+  x[2] = vaddq_s32_dual(t[2], t[6]);
+  x[3] = vaddq_s32_dual(t[3], t[7]);
+  x[4] = vsubq_s32_dual(t[0], t[4]);
+  x[5] = vsubq_s32_dual(t[1], t[5]);
+  x[6] = vsubq_s32_dual(t[2], t[6]);
+  x[7] = vsubq_s32_dual(t[3], t[7]);
+  x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12);
+  x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13);
+  x[10] = highbd_add_dct_const_round_shift_low_8(s10, s14);
+  x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15);
+  x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12);
+  x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13);
+  x[14] = highbd_sub_dct_const_round_shift_low_8(s10, s14);
+  x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15);
+
+  // stage 3
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4,
+                         s5);
+  highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7,
+                         s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  highbd_iadst_butterfly(x[12], x[13], vget_high_s32(c_16_n16_8_24), 0, 1, s12,
+                         s13);
+  highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15,
+                         s14);
+
+  x[0] = vaddq_s32_dual(t[0], t[2]);
+  x[1] = vaddq_s32_dual(t[1], t[3]);
+  x[2] = vsubq_s32_dual(t[0], t[2]);
+  x[3] = vsubq_s32_dual(t[1], t[3]);
+  x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6);
+  x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7);
+  x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6);
+  x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7);
+  x[8] = vaddq_s32_dual(t[8], t[10]);
+  x[9] = vaddq_s32_dual(t[9], t[11]);
+  x[10] = vsubq_s32_dual(t[8], t[10]);
+  x[11] = vsubq_s32_dual(t[9], t[11]);
+  x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14);
+  x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15);
+  x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14);
+  x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15);
+
+  // stage 4
+  {
+    const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]);
+    const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]);
+    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]);
+    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]);
+  }
+  {
+    const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]);
+    const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]);
+    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]);
+    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]);
+  }
+  {
+    const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]);
+    const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]);
+    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]);
+    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]);
+  }
+  {
+    const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]);
+    const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]);
+    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]);
+    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]);
+  }
+
+  out[0] = x[0];
+  out[1] = vnegq_s32_dual(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s32_dual(x[4]);
+  out[4] = x[6];
+  out[5] = x[14];
+  out[6] = x[10];
+  out[7] = x[2];
+  out[8] = x[3];
+  out[9] = x[11];
+  out[10] = x[15];
+  out[11] = x[7];
+  out[12] = x[5];
+  out[13] = vnegq_s32_dual(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s32_dual(x[1]);
+
+  if (output) {
+    highbd_idct16x16_store_pass1(out, output);
+  } else {
+    highbd_idct16x16_add_store(out, dest, stride, bd);
+  }
+}
+
+typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output,
+                              uint16_t *dest, const int stride, const int bd);
+
+typedef struct {
+  highbd_iht_1d cols, rows;  // vertical and horizontal
+} highbd_iht_2d;
+
+void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int tx_type, int bd) {
+  if (bd == 8) {
+    static const iht_2d IHT_16[] = {
+      { vpx_idct16x16_256_add_half1d,
+        vpx_idct16x16_256_add_half1d },  // DCT_DCT  = 0
+      { vpx_iadst16x16_256_add_half1d,
+        vpx_idct16x16_256_add_half1d },  // ADST_DCT = 1
+      { vpx_idct16x16_256_add_half1d,
+        vpx_iadst16x16_256_add_half1d },  // DCT_ADST = 2
+      { vpx_iadst16x16_256_add_half1d,
+        vpx_iadst16x16_256_add_half1d }  // ADST_ADST = 3
+    };
+    const iht_2d ht = IHT_16[tx_type];
+    int16_t row_output[16 * 16];
+
+    // pass 1
+    ht.rows(input, row_output, dest, stride, 1);               // upper 8 rows
+    ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1);  // lower 8 rows
+
+    // pass 2
+    ht.cols(row_output, NULL, dest, stride, 1);               // left 8 columns
+    ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 1);  // right 8 columns
+  } else {
+    static const highbd_iht_2d IHT_16[] = {
+      { vpx_highbd_idct16x16_256_add_half1d,
+        vpx_highbd_idct16x16_256_add_half1d },  // DCT_DCT  = 0
+      { highbd_iadst16_neon,
+        vpx_highbd_idct16x16_256_add_half1d },  // ADST_DCT = 1
+      { vpx_highbd_idct16x16_256_add_half1d,
+        highbd_iadst16_neon },                      // DCT_ADST = 2
+      { highbd_iadst16_neon, highbd_iadst16_neon }  // ADST_ADST = 3
+    };
+    const highbd_iht_2d ht = IHT_16[tx_type];
+    int32_t row_output[16 * 16];
+
+    // pass 1
+    ht.rows(input, row_output, dest, stride, bd);               // upper 8 rows
+    ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd);  // lower 8 rows
+
+    // pass 2
+    ht.cols(row_output, NULL, dest, stride, bd);  // left 8 columns
+    ht.cols(row_output + 8 * 16, NULL, dest + 8, stride,
+            bd);  // right 8 columns
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
new file mode 100644
index 0000000000..52c4f1937d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void highbd_iadst4(int32x4_t *const io) {
+  const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
+  const int32x4_t sinpi = vld1q_s32(sinpis);
+  int64x2x2_t s[7], t[4];
+  int32x4_t s7;
+
+  s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+  s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+  s7 = vsubq_s32(io[0], io[2]);
+  s7 = vaddq_s32(s7, io[3]);
+
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
+  s[3] = s[2];
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
+
+  t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+  t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+  t[2] = s[2];
+  t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+  t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+  t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+  t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+  io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+  io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+  io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+  io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
+}
+
+void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int tx_type, int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int16x8_t a[2];
+  int32x4_t c[4];
+
+  c[0] = vld1q_s32(input);
+  c[1] = vld1q_s32(input + 4);
+  c[2] = vld1q_s32(input + 8);
+  c[3] = vld1q_s32(input + 12);
+
+  if (bd == 8) {
+    a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
+    a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
+    transpose_s16_4x4q(&a[0], &a[1]);
+
+    switch (tx_type) {
+      case DCT_DCT:
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        transpose_s16_4x4q(&a[0], &a[1]);
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        break;
+
+      case ADST_DCT:
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        transpose_s16_4x4q(&a[0], &a[1]);
+        iadst4(a);
+        break;
+
+      case DCT_ADST:
+        iadst4(a);
+        transpose_s16_4x4q(&a[0], &a[1]);
+        idct4x4_16_kernel_bd8(a);
+        a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+        break;
+
+      default:
+        assert(tx_type == ADST_ADST);
+        iadst4(a);
+        transpose_s16_4x4q(&a[0], &a[1]);
+        iadst4(a);
+        break;
+    }
+    a[0] = vrshrq_n_s16(a[0], 4);
+    a[1] = vrshrq_n_s16(a[1], 4);
+  } else {
+    switch (tx_type) {
+      case DCT_DCT: {
+        const int32x4_t cospis = vld1q_s32(kCospi32);
+
+        if (bd == 10) {
+          idct4x4_16_kernel_bd10(cospis, c);
+          idct4x4_16_kernel_bd10(cospis, c);
+        } else {
+          idct4x4_16_kernel_bd12(cospis, c);
+          idct4x4_16_kernel_bd12(cospis, c);
+        }
+        break;
+      }
+
+      case ADST_DCT: {
+        const int32x4_t cospis = vld1q_s32(kCospi32);
+
+        if (bd == 10) {
+          idct4x4_16_kernel_bd10(cospis, c);
+        } else {
+          idct4x4_16_kernel_bd12(cospis, c);
+        }
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        break;
+      }
+
+      case DCT_ADST: {
+        const int32x4_t cospis = vld1q_s32(kCospi32);
+
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        if (bd == 10) {
+          idct4x4_16_kernel_bd10(cospis, c);
+        } else {
+          idct4x4_16_kernel_bd12(cospis, c);
+        }
+        break;
+      }
+
+      default: {
+        assert(tx_type == ADST_ADST);
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+        highbd_iadst4(c);
+        break;
+      }
+    }
+    a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
+    a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4));
+  }
+
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max);
+}
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
new file mode 100644
index 0000000000..2232c6841c
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
@@ -0,0 +1,345 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x,
+                                                    const int32x2_t c) {
+  const int32x4_t sum = vaddq_s32(x[0], x[1]);
+  const int32x4_t sub = vsubq_s32(x[0], x[1]);
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
+  const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS);
+  const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS);
+  const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS);
+  const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS);
+
+  x[0] = vcombine_s32(out0_lo, out0_hi);
+  x[1] = vcombine_s32(out1_lo, out1_hi);
+}
+
+static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0,
+                                                        const int32x4_t in1,
+                                                        const int32x2_t c,
+                                                        int64x2_t *const s0,
+                                                        int64x2_t *const s1) {
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
+
+  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1);
+  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0);
+  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1);
+  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0);
+}
+
+static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0,
+                                                        const int32x4_t in1,
+                                                        const int32x2_t c,
+                                                        int64x2_t *const s0,
+                                                        int64x2_t *const s1) {
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
+
+  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0);
+  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1);
+  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0);
+  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1);
+}
+
+static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8(
+    const int64x2_t *const in0, const int64x2_t *const in1) {
+  const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]);
+  const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]);
+  const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS);
+  const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS);
+  return vcombine_s32(out_lo, out_hi);
+}
+
+static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8(
+    const int64x2_t *const in0, const int64x2_t *const in1) {
+  const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]);
+  const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]);
+  const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS);
+  const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS);
+  return vcombine_s32(out_lo, out_hi);
+}
+
+static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1,
+                                 int32x4_t *const io2, int32x4_t *const io3,
+                                 int32x4_t *const io4, int32x4_t *const io5,
+                                 int32x4_t *const io6, int32x4_t *const io7) {
+  const int32x4_t c0 =
+      create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
+  const int32x4_t c1 =
+      create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
+  const int32x4_t c2 =
+      create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
+  int32x4_t x[8], t[4];
+  int64x2_t s[8][2];
+
+  x[0] = *io7;
+  x[1] = *io0;
+  x[2] = *io5;
+  x[3] = *io2;
+  x[4] = *io3;
+  x[5] = *io4;
+  x[6] = *io1;
+  x[7] = *io6;
+
+  // stage 1
+  highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0],
+                                       s[1]);
+  highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2],
+                                       s[3]);
+  highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4],
+                                       s[5]);
+  highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6],
+                                       s[7]);
+
+  x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]);
+  x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]);
+  x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]);
+  x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]);
+  x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]);
+  x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]);
+  x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]);
+  x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]);
+
+  // stage 2
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4],
+                                       s[5]);
+  highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7],
+                                       s[6]);
+
+  x[0] = vaddq_s32(t[0], t[2]);
+  x[1] = vaddq_s32(t[1], t[3]);
+  x[2] = vsubq_s32(t[0], t[2]);
+  x[3] = vsubq_s32(t[1], t[3]);
+  x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]);
+  x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]);
+  x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]);
+  x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]);
+
+  // stage 3
+  highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2));
+  highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2));
+
+  *io0 = x[0];
+  *io1 = vnegq_s32(x[4]);
+  *io2 = x[6];
+  *io3 = vnegq_s32(x[2]);
+  *io4 = x[3];
+  *io5 = vnegq_s32(x[7]);
+  *io6 = x[5];
+  *io7 = vnegq_s32(x[1]);
+}
+
+void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int tx_type, int bd) {
+  int32x4_t a[16];
+  int16x8_t c[8];
+
+  a[0] = vld1q_s32(input);
+  a[1] = vld1q_s32(input + 4);
+  a[2] = vld1q_s32(input + 8);
+  a[3] = vld1q_s32(input + 12);
+  a[4] = vld1q_s32(input + 16);
+  a[5] = vld1q_s32(input + 20);
+  a[6] = vld1q_s32(input + 24);
+  a[7] = vld1q_s32(input + 28);
+  a[8] = vld1q_s32(input + 32);
+  a[9] = vld1q_s32(input + 36);
+  a[10] = vld1q_s32(input + 40);
+  a[11] = vld1q_s32(input + 44);
+  a[12] = vld1q_s32(input + 48);
+  a[13] = vld1q_s32(input + 52);
+  a[14] = vld1q_s32(input + 56);
+  a[15] = vld1q_s32(input + 60);
+
+  if (bd == 8) {
+    c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
+    c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
+    c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
+    c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
+    c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
+    c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
+    c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
+    c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));
+
+    switch (tx_type) {
+      case DCT_DCT: {
+        const int16x8_t cospis = vld1q_s16(kCospi);
+        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
+        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
+
+        idct8x8_64_1d_bd8(cospis0, cospis1, c);
+        idct8x8_64_1d_bd8(cospis0, cospis1, c);
+        break;
+      }
+
+      case ADST_DCT: {
+        const int16x8_t cospis = vld1q_s16(kCospi);
+        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
+        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
+
+        idct8x8_64_1d_bd8(cospis0, cospis1, c);
+        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
+                          &c[7]);
+        iadst8(c);
+        break;
+      }
+
+      case DCT_ADST: {
+        const int16x8_t cospis = vld1q_s16(kCospi);
+        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
+        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
+
+        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
+                          &c[7]);
+        iadst8(c);
+        idct8x8_64_1d_bd8(cospis0, cospis1, c);
+        break;
+      }
+
+      default: {
+        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
+                          &c[7]);
+        iadst8(c);
+        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
+                          &c[7]);
+        iadst8(c);
+        break;
+      }
+    }
+
+    c[0] = vrshrq_n_s16(c[0], 5);
+    c[1] = vrshrq_n_s16(c[1], 5);
+    c[2] = vrshrq_n_s16(c[2], 5);
+    c[3] = vrshrq_n_s16(c[3], 5);
+    c[4] = vrshrq_n_s16(c[4], 5);
+    c[5] = vrshrq_n_s16(c[5], 5);
+    c[6] = vrshrq_n_s16(c[6], 5);
+    c[7] = vrshrq_n_s16(c[7], 5);
+  } else {
+    switch (tx_type) {
+      case DCT_DCT: {
+        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
+        const int32x4_t cospis1 =
+            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
+
+        if (bd == 10) {
+          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                                 &a[4], &a[5], &a[6], &a[7]);
+          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                                 &a[12], &a[13], &a[14], &a[15]);
+          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                                 &a[2], &a[10], &a[3], &a[11]);
+          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                                 &a[6], &a[14], &a[7], &a[15]);
+        } else {
+          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                                 &a[4], &a[5], &a[6], &a[7]);
+          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                                 &a[12], &a[13], &a[14], &a[15]);
+          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                                 &a[2], &a[10], &a[3], &a[11]);
+          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                                 &a[6], &a[14], &a[7], &a[15]);
+        }
+        break;
+      }
+
+      case ADST_DCT: {
+        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
+        const int32x4_t cospis1 =
+            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
+
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                               &a[4], &a[5], &a[6], &a[7]);
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                               &a[12], &a[13], &a[14], &a[15]);
+        transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
+                          &a[11]);
+        highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
+        transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+                          &a[15]);
+        highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+                      &a[15]);
+        break;
+      }
+
+      case DCT_ADST: {
+        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
+        const int32x4_t cospis1 =
+            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
+
+        transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+                          &a[7]);
+        highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+        transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+                          &a[15]);
+        highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+                      &a[15]);
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                               &a[2], &a[10], &a[3], &a[11]);
+        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                               &a[6], &a[14], &a[7], &a[15]);
+        break;
+      }
+
+      default: {
+        assert(tx_type == ADST_ADST);
+        transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
+                          &a[7]);
+        highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+        transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+                          &a[15]);
+        highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
+                      &a[15]);
+        transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
+                          &a[11]);
+        highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
+        transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+                          &a[15]);
+        highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
+                      &a[15]);
+        break;
+      }
+    }
+
+    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
+  }
+  highbd_add8x8(c, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
new file mode 100644
index 0000000000..db72ff1161
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c
@@ -0,0 +1,279 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output,
+                                   void *const dest, const int stride,
+                                   const int highbd_flag) {
+  int16x8_t in[16], out[16];
+  const int16x4_t c_1_31_5_27 =
+      create_s16x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64);
+  const int16x4_t c_9_23_13_19 =
+      create_s16x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64);
+  const int16x4_t c_17_15_21_11 =
+      create_s16x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64);
+  const int16x4_t c_25_7_29_3 =
+      create_s16x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64);
+  const int16x4_t c_4_28_20_12 =
+      create_s16x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64);
+  const int16x4_t c_16_n16_8_24 =
+      create_s16x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64);
+  int16x8_t x[16], t[12];
+  int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+  int32x4_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+
+  // Load input (16x8)
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[8] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[9] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[10] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[11] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[12] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[13] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[14] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[7] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[15] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 8;
+    in[8] = vld1q_s16(inputT);
+    inputT += 8;
+    in[1] = vld1q_s16(inputT);
+    inputT += 8;
+    in[9] = vld1q_s16(inputT);
+    inputT += 8;
+    in[2] = vld1q_s16(inputT);
+    inputT += 8;
+    in[10] = vld1q_s16(inputT);
+    inputT += 8;
+    in[3] = vld1q_s16(inputT);
+    inputT += 8;
+    in[11] = vld1q_s16(inputT);
+    inputT += 8;
+    in[4] = vld1q_s16(inputT);
+    inputT += 8;
+    in[12] = vld1q_s16(inputT);
+    inputT += 8;
+    in[5] = vld1q_s16(inputT);
+    inputT += 8;
+    in[13] = vld1q_s16(inputT);
+    inputT += 8;
+    in[6] = vld1q_s16(inputT);
+    inputT += 8;
+    in[14] = vld1q_s16(inputT);
+    inputT += 8;
+    in[7] = vld1q_s16(inputT);
+    inputT += 8;
+    in[15] = vld1q_s16(inputT);
+  }
+
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // stage 1
+  iadst_butterfly_lane_0_1_neon(x[0], x[1], c_1_31_5_27, s0, s1);
+  iadst_butterfly_lane_2_3_neon(x[2], x[3], c_1_31_5_27, s2, s3);
+  iadst_butterfly_lane_0_1_neon(x[4], x[5], c_9_23_13_19, s4, s5);
+  iadst_butterfly_lane_2_3_neon(x[6], x[7], c_9_23_13_19, s6, s7);
+  iadst_butterfly_lane_0_1_neon(x[8], x[9], c_17_15_21_11, s8, s9);
+  iadst_butterfly_lane_2_3_neon(x[10], x[11], c_17_15_21_11, s10, s11);
+  iadst_butterfly_lane_0_1_neon(x[12], x[13], c_25_7_29_3, s12, s13);
+  iadst_butterfly_lane_2_3_neon(x[14], x[15], c_25_7_29_3, s14, s15);
+
+  x[0] = add_dct_const_round_shift_low_8(s0, s8);
+  x[1] = add_dct_const_round_shift_low_8(s1, s9);
+  x[2] = add_dct_const_round_shift_low_8(s2, s10);
+  x[3] = add_dct_const_round_shift_low_8(s3, s11);
+  x[4] = add_dct_const_round_shift_low_8(s4, s12);
+  x[5] = add_dct_const_round_shift_low_8(s5, s13);
+  x[6] = add_dct_const_round_shift_low_8(s6, s14);
+  x[7] = add_dct_const_round_shift_low_8(s7, s15);
+  x[8] = sub_dct_const_round_shift_low_8(s0, s8);
+  x[9] = sub_dct_const_round_shift_low_8(s1, s9);
+  x[10] = sub_dct_const_round_shift_low_8(s2, s10);
+  x[11] = sub_dct_const_round_shift_low_8(s3, s11);
+  x[12] = sub_dct_const_round_shift_low_8(s4, s12);
+  x[13] = sub_dct_const_round_shift_low_8(s5, s13);
+  x[14] = sub_dct_const_round_shift_low_8(s6, s14);
+  x[15] = sub_dct_const_round_shift_low_8(s7, s15);
+
+  // stage 2
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  iadst_butterfly_lane_0_1_neon(x[8], x[9], c_4_28_20_12, s8, s9);
+  iadst_butterfly_lane_2_3_neon(x[10], x[11], c_4_28_20_12, s10, s11);
+  iadst_butterfly_lane_1_0_neon(x[13], x[12], c_4_28_20_12, s13, s12);
+  iadst_butterfly_lane_3_2_neon(x[15], x[14], c_4_28_20_12, s15, s14);
+
+  x[0] = vaddq_s16(t[0], t[4]);
+  x[1] = vaddq_s16(t[1], t[5]);
+  x[2] = vaddq_s16(t[2], t[6]);
+  x[3] = vaddq_s16(t[3], t[7]);
+  x[4] = vsubq_s16(t[0], t[4]);
+  x[5] = vsubq_s16(t[1], t[5]);
+  x[6] = vsubq_s16(t[2], t[6]);
+  x[7] = vsubq_s16(t[3], t[7]);
+  x[8] = add_dct_const_round_shift_low_8(s8, s12);
+  x[9] = add_dct_const_round_shift_low_8(s9, s13);
+  x[10] = add_dct_const_round_shift_low_8(s10, s14);
+  x[11] = add_dct_const_round_shift_low_8(s11, s15);
+  x[12] = sub_dct_const_round_shift_low_8(s8, s12);
+  x[13] = sub_dct_const_round_shift_low_8(s9, s13);
+  x[14] = sub_dct_const_round_shift_low_8(s10, s14);
+  x[15] = sub_dct_const_round_shift_low_8(s11, s15);
+
+  // stage 3
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  iadst_butterfly_lane_2_3_neon(x[4], x[5], c_16_n16_8_24, s4, s5);
+  iadst_butterfly_lane_3_2_neon(x[7], x[6], c_16_n16_8_24, s7, s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  iadst_butterfly_lane_2_3_neon(x[12], x[13], c_16_n16_8_24, s12, s13);
+  iadst_butterfly_lane_3_2_neon(x[15], x[14], c_16_n16_8_24, s15, s14);
+
+  x[0] = vaddq_s16(t[0], t[2]);
+  x[1] = vaddq_s16(t[1], t[3]);
+  x[2] = vsubq_s16(t[0], t[2]);
+  x[3] = vsubq_s16(t[1], t[3]);
+  x[4] = add_dct_const_round_shift_low_8(s4, s6);
+  x[5] = add_dct_const_round_shift_low_8(s5, s7);
+  x[6] = sub_dct_const_round_shift_low_8(s4, s6);
+  x[7] = sub_dct_const_round_shift_low_8(s5, s7);
+  x[8] = vaddq_s16(t[8], t[10]);
+  x[9] = vaddq_s16(t[9], t[11]);
+  x[10] = vsubq_s16(t[8], t[10]);
+  x[11] = vsubq_s16(t[9], t[11]);
+  x[12] = add_dct_const_round_shift_low_8(s12, s14);
+  x[13] = add_dct_const_round_shift_low_8(s13, s15);
+  x[14] = sub_dct_const_round_shift_low_8(s12, s14);
+  x[15] = sub_dct_const_round_shift_low_8(s13, s15);
+
+  // stage 4
+  iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24);
+  iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24);
+  iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24);
+  iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24);
+
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = x[14];
+  out[6] = x[10];
+  out[7] = x[2];
+  out[8] = x[3];
+  out[9] = x[11];
+  out[10] = x[15];
+  out[11] = x[7];
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
+  }
+}
+
+void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  static const iht_2d IHT_16[] = {
+    { vpx_idct16x16_256_add_half1d,
+      vpx_idct16x16_256_add_half1d },  // DCT_DCT  = 0
+    { vpx_iadst16x16_256_add_half1d,
+      vpx_idct16x16_256_add_half1d },  // ADST_DCT = 1
+    { vpx_idct16x16_256_add_half1d,
+      vpx_iadst16x16_256_add_half1d },  // DCT_ADST = 2
+    { vpx_iadst16x16_256_add_half1d,
+      vpx_iadst16x16_256_add_half1d }  // ADST_ADST = 3
+  };
+  const iht_2d ht = IHT_16[tx_type];
+  int16_t row_output[16 * 16];
+
+  // pass 1
+  ht.rows(input, row_output, dest, stride, 0);               // upper 8 rows
+  ht.rows(input + 8 * 16, row_output + 8, dest, stride, 0);  // lower 8 rows
+
+  // pass 2
+  ht.cols(row_output, NULL, dest, stride, 0);               // left 8 columns
+  ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 0);  // right 8 columns
+}
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
index dd1ea03b6b..4f0a90f215 100644
--- a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -14,213 +14,63 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
-
-static int16_t sinpi_1_9 = 0x14a3;
-static int16_t sinpi_2_9 = 0x26c9;
-static int16_t sinpi_3_9 = 0x3441;
-static int16_t sinpi_4_9 = 0x3b6c;
-static int16_t cospi_8_64 = 0x3b21;
-static int16_t cospi_16_64 = 0x2d41;
-static int16_t cospi_24_64 = 0x187e;
-
-static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
-  int32x4_t q8s32, q9s32;
-  int16x4x2_t d0x2s16, d1x2s16;
-  int32x4x2_t q0x2s32;
-
-  d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
-  d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
-
-  q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
-  q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
-  q0x2s32 = vtrnq_s32(q8s32, q9s32);
-
-  *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
-  *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
-}
-
-static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
-                                             int16x4_t *d2s16) {
-  *d0s16 = vdup_n_s16(cospi_8_64);
-  *d1s16 = vdup_n_s16(cospi_16_64);
-  *d2s16 = vdup_n_s16(cospi_24_64);
-}
-
-static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
-                                           int16x4_t *d5s16, int16x8_t *q3s16) {
-  *d3s16 = vdup_n_s16(sinpi_1_9);
-  *d4s16 = vdup_n_s16(sinpi_2_9);
-  *q3s16 = vdupq_n_s16(sinpi_3_9);
-  *d5s16 = vdup_n_s16(sinpi_4_9);
-}
-
-static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
-                              int16x4_t *d2s16, int16x8_t *q8s16,
-                              int16x8_t *q9s16) {
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
-  int16x4_t d26s16, d27s16, d28s16, d29s16;
-  int32x4_t q10s32, q13s32, q14s32, q15s32;
-  int16x8_t q13s16, q14s16;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-
-  d23s16 = vadd_s16(d16s16, d18s16);
-  d24s16 = vsub_s16(d16s16, d18s16);
-
-  q15s32 = vmull_s16(d17s16, *d2s16);
-  q10s32 = vmull_s16(d17s16, *d0s16);
-  q13s32 = vmull_s16(d23s16, *d1s16);
-  q14s32 = vmull_s16(d24s16, *d1s16);
-  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
-  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
-
-  d26s16 = vrshrn_n_s32(q13s32, 14);
-  d27s16 = vrshrn_n_s32(q14s32, 14);
-  d29s16 = vrshrn_n_s32(q15s32, 14);
-  d28s16 = vrshrn_n_s32(q10s32, 14);
-
-  q13s16 = vcombine_s16(d26s16, d27s16);
-  q14s16 = vcombine_s16(d28s16, d29s16);
-  *q8s16 = vaddq_s16(q13s16, q14s16);
-  *q9s16 = vsubq_s16(q13s16, q14s16);
-  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
-}
-
-static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
-                               int16x4_t *d5s16, int16x8_t *q3s16,
-                               int16x8_t *q8s16, int16x8_t *q9s16) {
-  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
-  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
-  d6s16 = vget_low_s16(*q3s16);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-
-  q10s32 = vmull_s16(*d3s16, d16s16);
-  q11s32 = vmull_s16(*d4s16, d16s16);
-  q12s32 = vmull_s16(d6s16, d17s16);
-  q13s32 = vmull_s16(*d5s16, d18s16);
-  q14s32 = vmull_s16(*d3s16, d18s16);
-  q15s32 = vmovl_s16(d16s16);
-  q15s32 = vaddw_s16(q15s32, d19s16);
-  q8s32 = vmull_s16(*d4s16, d19s16);
-  q15s32 = vsubw_s16(q15s32, d18s16);
-  q9s32 = vmull_s16(*d5s16, d19s16);
-
-  q10s32 = vaddq_s32(q10s32, q13s32);
-  q10s32 = vaddq_s32(q10s32, q8s32);
-  q11s32 = vsubq_s32(q11s32, q14s32);
-  q8s32 = vdupq_n_s32(sinpi_3_9);
-  q11s32 = vsubq_s32(q11s32, q9s32);
-  q15s32 = vmulq_s32(q15s32, q8s32);
-
-  q13s32 = vaddq_s32(q10s32, q12s32);
-  q10s32 = vaddq_s32(q10s32, q11s32);
-  q14s32 = vaddq_s32(q11s32, q12s32);
-  q10s32 = vsubq_s32(q10s32, q12s32);
-
-  d16s16 = vrshrn_n_s32(q13s32, 14);
-  d17s16 = vrshrn_n_s32(q14s32, 14);
-  d18s16 = vrshrn_n_s32(q15s32, 14);
-  d19s16 = vrshrn_n_s32(q10s32, 14);
-
-  *q8s16 = vcombine_s16(d16s16, d17s16);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-}
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
 
 void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
-  uint8x8_t d26u8, d27u8;
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
-  uint32x2_t d26u32, d27u32;
-  int16x8_t q3s16, q8s16, q9s16;
-  uint16x8_t q8u16, q9u16;
+  int16x8_t a[2];
+  uint8x8_t s[2], d[2];
+  uint16x8_t sum[2];
 
-  d26u32 = d27u32 = vdup_n_u32(0);
+  assert(!((intptr_t)dest % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
 
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-
-  TRANSPOSE4X4(&q8s16, &q9s16);
+  a[0] = load_tran_low_to_s16q(input);
+  a[1] = load_tran_low_to_s16q(input + 8);
+  transpose_s16_4x4q(&a[0], &a[1]);
 
   switch (tx_type) {
-    case 0:  // idct_idct is not supported. Fall back to C
-      vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
-      return;
-    case 1:  // iadst_idct
-      // generate constants
-      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
-      // first transform rows
-      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+    case DCT_DCT:
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+      transpose_s16_4x4q(&a[0], &a[1]);
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
       break;
-    case 2:  // idct_iadst
-      // generate constantsyy
-      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
 
-      // first transform rows
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+    case ADST_DCT:
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+      transpose_s16_4x4q(&a[0], &a[1]);
+      iadst4(a);
       break;
-    case 3:  // iadst_iadst
-      // generate constants
-      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
 
-      // first transform rows
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
-      // transpose the matrix
-      TRANSPOSE4X4(&q8s16, &q9s16);
-
-      // then transform columns
-      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+    case DCT_ADST:
+      iadst4(a);
+      transpose_s16_4x4q(&a[0], &a[1]);
+      idct4x4_16_kernel_bd8(a);
+      a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
       break;
-    default:  // iadst_idct
-      assert(0);
+
+    default:
+      assert(tx_type == ADST_ADST);
+      iadst4(a);
+      transpose_s16_4x4q(&a[0], &a[1]);
+      iadst4(a);
       break;
   }
 
-  q8s16 = vrshrq_n_s16(q8s16, 4);
-  q9s16 = vrshrq_n_s16(q9s16, 4);
-
-  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
-  dest += stride;
-  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
-  dest += stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
-  dest += stride;
-  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
-
-  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
-  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
-  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
-  dest -= stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
-  dest -= stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
-  dest -= stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+  a[0] = vrshrq_n_s16(a[0], 4);
+  a[1] = vrshrq_n_s16(a[1], 4);
+  s[0] = load_u8(dest, stride);
+  s[1] = load_u8(dest + 2 * stride, stride);
+  sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]);
+  sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]);
+  d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
+  d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
+  store_u8(dest, stride, d[0]);
+  store_u8(dest + 2 * stride, stride, d[1]);
 }
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
index 1c739861c3..46ee632e01 100644
--- a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -14,527 +14,55 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 
-static int16_t cospi_2_64 = 16305;
-static int16_t cospi_4_64 = 16069;
-static int16_t cospi_6_64 = 15679;
-static int16_t cospi_8_64 = 15137;
-static int16_t cospi_10_64 = 14449;
-static int16_t cospi_12_64 = 13623;
-static int16_t cospi_14_64 = 12665;
-static int16_t cospi_16_64 = 11585;
-static int16_t cospi_18_64 = 10394;
-static int16_t cospi_20_64 = 9102;
-static int16_t cospi_22_64 = 7723;
-static int16_t cospi_24_64 = 6270;
-static int16_t cospi_26_64 = 4756;
-static int16_t cospi_28_64 = 3196;
-static int16_t cospi_30_64 = 1606;
-
-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                              int16x8_t *q10s16, int16x8_t *q11s16,
-                              int16x8_t *q12s16, int16x8_t *q13s16,
-                              int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
-  d0s16 = vdup_n_s16(cospi_28_64);
-  d1s16 = vdup_n_s16(cospi_4_64);
-  d2s16 = vdup_n_s16(cospi_12_64);
-  d3s16 = vdup_n_s16(cospi_20_64);
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d26s16, d2s16);
-  q6s32 = vmull_s16(d27s16, d2s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
-  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
-  d8s16 = vrshrn_n_s32(q2s32, 14);
-  d9s16 = vrshrn_n_s32(q3s32, 14);
-  d10s16 = vrshrn_n_s32(q5s32, 14);
-  d11s16 = vrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q2s32 = vmull_s16(d18s16, d1s16);
-  q3s32 = vmull_s16(d19s16, d1s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q13s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
-  d14s16 = vrshrn_n_s32(q2s32, 14);
-  d15s16 = vrshrn_n_s32(q3s32, 14);
-  d12s16 = vrshrn_n_s32(q9s32, 14);
-  d13s16 = vrshrn_n_s32(q13s32, 14);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d0s16 = vdup_n_s16(cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d0s16);
-  q3s32 = vmull_s16(d17s16, d0s16);
-  q13s32 = vmull_s16(d16s16, d0s16);
-  q15s32 = vmull_s16(d17s16, d0s16);
-
-  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
-  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
-  d0s16 = vdup_n_s16(cospi_24_64);
-  d1s16 = vdup_n_s16(cospi_8_64);
-
-  d18s16 = vrshrn_n_s32(q2s32, 14);
-  d19s16 = vrshrn_n_s32(q3s32, 14);
-  d22s16 = vrshrn_n_s32(q13s32, 14);
-  d23s16 = vrshrn_n_s32(q15s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q2s32 = vmull_s16(d20s16, d0s16);
-  q3s32 = vmull_s16(d21s16, d0s16);
-  q8s32 = vmull_s16(d20s16, d1s16);
-  q12s32 = vmull_s16(d21s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
-  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
-  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
-  d26s16 = vrshrn_n_s32(q2s32, 14);
-  d27s16 = vrshrn_n_s32(q3s32, 14);
-  d30s16 = vrshrn_n_s32(q8s32, 14);
-  d31s16 = vrshrn_n_s32(q12s32, 14);
-  *q13s16 = vcombine_s16(d26s16, d27s16);
-  *q15s16 = vcombine_s16(d30s16, d31s16);
-
-  q0s16 = vaddq_s16(*q9s16, *q15s16);
-  q1s16 = vaddq_s16(*q11s16, *q13s16);
-  q2s16 = vsubq_s16(*q11s16, *q13s16);
-  q3s16 = vsubq_s16(*q9s16, *q15s16);
-
-  *q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  *q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q7s16, q6s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-
-  d16s16 = vdup_n_s16(cospi_16_64);
-
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-  q11s32 = vmull_s16(d28s16, d16s16);
-  q12s32 = vmull_s16(d29s16, d16s16);
-
-  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
-  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
-  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
-  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
-  d10s16 = vrshrn_n_s32(q9s32, 14);
-  d11s16 = vrshrn_n_s32(q10s32, 14);
-  d12s16 = vrshrn_n_s32(q11s32, 14);
-  d13s16 = vrshrn_n_s32(q12s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  *q8s16 = vaddq_s16(q0s16, q7s16);
-  *q9s16 = vaddq_s16(q1s16, q6s16);
-  *q10s16 = vaddq_s16(q2s16, q5s16);
-  *q11s16 = vaddq_s16(q3s16, q4s16);
-  *q12s16 = vsubq_s16(q3s16, q4s16);
-  *q13s16 = vsubq_s16(q2s16, q5s16);
-  *q14s16 = vsubq_s16(q1s16, q6s16);
-  *q15s16 = vsubq_s16(q0s16, q7s16);
-}
-
-static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
-                               int16x8_t *q10s16, int16x8_t *q11s16,
-                               int16x8_t *q12s16, int16x8_t *q13s16,
-                               int16x8_t *q14s16, int16x8_t *q15s16) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q2s16, q4s16, q5s16, q6s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
-  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
-  d16s16 = vget_low_s16(*q8s16);
-  d17s16 = vget_high_s16(*q8s16);
-  d18s16 = vget_low_s16(*q9s16);
-  d19s16 = vget_high_s16(*q9s16);
-  d20s16 = vget_low_s16(*q10s16);
-  d21s16 = vget_high_s16(*q10s16);
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  d26s16 = vget_low_s16(*q13s16);
-  d27s16 = vget_high_s16(*q13s16);
-  d28s16 = vget_low_s16(*q14s16);
-  d29s16 = vget_high_s16(*q14s16);
-  d30s16 = vget_low_s16(*q15s16);
-  d31s16 = vget_high_s16(*q15s16);
-
-  d14s16 = vdup_n_s16(cospi_2_64);
-  d15s16 = vdup_n_s16(cospi_30_64);
-
-  q1s32 = vmull_s16(d30s16, d14s16);
-  q2s32 = vmull_s16(d31s16, d14s16);
-  q3s32 = vmull_s16(d30s16, d15s16);
-  q4s32 = vmull_s16(d31s16, d15s16);
-
-  d30s16 = vdup_n_s16(cospi_18_64);
-  d31s16 = vdup_n_s16(cospi_14_64);
-
-  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
-  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
-  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
-  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
-
-  q5s32 = vmull_s16(d22s16, d30s16);
-  q6s32 = vmull_s16(d23s16, d30s16);
-  q7s32 = vmull_s16(d22s16, d31s16);
-  q8s32 = vmull_s16(d23s16, d31s16);
-
-  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
-  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
-  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
-  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
-
-  q11s32 = vaddq_s32(q1s32, q5s32);
-  q12s32 = vaddq_s32(q2s32, q6s32);
-  q1s32 = vsubq_s32(q1s32, q5s32);
-  q2s32 = vsubq_s32(q2s32, q6s32);
-
-  d22s16 = vrshrn_n_s32(q11s32, 14);
-  d23s16 = vrshrn_n_s32(q12s32, 14);
-  *q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q12s32 = vaddq_s32(q3s32, q7s32);
-  q15s32 = vaddq_s32(q4s32, q8s32);
-  q3s32 = vsubq_s32(q3s32, q7s32);
-  q4s32 = vsubq_s32(q4s32, q8s32);
-
-  d2s16 = vrshrn_n_s32(q1s32, 14);
-  d3s16 = vrshrn_n_s32(q2s32, 14);
-  d24s16 = vrshrn_n_s32(q12s32, 14);
-  d25s16 = vrshrn_n_s32(q15s32, 14);
-  d6s16 = vrshrn_n_s32(q3s32, 14);
-  d7s16 = vrshrn_n_s32(q4s32, 14);
-  *q12s16 = vcombine_s16(d24s16, d25s16);
-
-  d0s16 = vdup_n_s16(cospi_10_64);
-  d1s16 = vdup_n_s16(cospi_22_64);
-  q4s32 = vmull_s16(d26s16, d0s16);
-  q5s32 = vmull_s16(d27s16, d0s16);
-  q2s32 = vmull_s16(d26s16, d1s16);
-  q6s32 = vmull_s16(d27s16, d1s16);
-
-  d30s16 = vdup_n_s16(cospi_26_64);
-  d31s16 = vdup_n_s16(cospi_6_64);
-
-  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
-  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
-  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
-  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
-
-  q0s32 = vmull_s16(d18s16, d30s16);
-  q13s32 = vmull_s16(d19s16, d30s16);
-
-  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
-  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
-
-  q10s32 = vmull_s16(d18s16, d31s16);
-  q9s32 = vmull_s16(d19s16, d31s16);
-
-  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
-  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
-
-  q14s32 = vaddq_s32(q2s32, q10s32);
-  q15s32 = vaddq_s32(q6s32, q9s32);
-  q2s32 = vsubq_s32(q2s32, q10s32);
-  q6s32 = vsubq_s32(q6s32, q9s32);
-
-  d28s16 = vrshrn_n_s32(q14s32, 14);
-  d29s16 = vrshrn_n_s32(q15s32, 14);
-  d4s16 = vrshrn_n_s32(q2s32, 14);
-  d5s16 = vrshrn_n_s32(q6s32, 14);
-  *q14s16 = vcombine_s16(d28s16, d29s16);
-
-  q9s32 = vaddq_s32(q4s32, q0s32);
-  q10s32 = vaddq_s32(q5s32, q13s32);
-  q4s32 = vsubq_s32(q4s32, q0s32);
-  q5s32 = vsubq_s32(q5s32, q13s32);
-
-  d30s16 = vdup_n_s16(cospi_8_64);
-  d31s16 = vdup_n_s16(cospi_24_64);
-
-  d18s16 = vrshrn_n_s32(q9s32, 14);
-  d19s16 = vrshrn_n_s32(q10s32, 14);
-  d8s16 = vrshrn_n_s32(q4s32, 14);
-  d9s16 = vrshrn_n_s32(q5s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q5s32 = vmull_s16(d2s16, d30s16);
-  q6s32 = vmull_s16(d3s16, d30s16);
-  q7s32 = vmull_s16(d2s16, d31s16);
-  q0s32 = vmull_s16(d3s16, d31s16);
-
-  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
-  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
-  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
-  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
-
-  q1s32 = vmull_s16(d4s16, d30s16);
-  q3s32 = vmull_s16(d5s16, d30s16);
-  q10s32 = vmull_s16(d4s16, d31s16);
-  q2s32 = vmull_s16(d5s16, d31s16);
-
-  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
-  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
-  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
-  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
-
-  *q8s16 = vaddq_s16(*q11s16, *q9s16);
-  *q11s16 = vsubq_s16(*q11s16, *q9s16);
-  q4s16 = vaddq_s16(*q12s16, *q14s16);
-  *q12s16 = vsubq_s16(*q12s16, *q14s16);
-
-  q14s32 = vaddq_s32(q5s32, q1s32);
-  q15s32 = vaddq_s32(q6s32, q3s32);
-  q5s32 = vsubq_s32(q5s32, q1s32);
-  q6s32 = vsubq_s32(q6s32, q3s32);
-
-  d18s16 = vrshrn_n_s32(q14s32, 14);
-  d19s16 = vrshrn_n_s32(q15s32, 14);
-  d10s16 = vrshrn_n_s32(q5s32, 14);
-  d11s16 = vrshrn_n_s32(q6s32, 14);
-  *q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q1s32 = vaddq_s32(q7s32, q10s32);
-  q3s32 = vaddq_s32(q0s32, q2s32);
-  q7s32 = vsubq_s32(q7s32, q10s32);
-  q0s32 = vsubq_s32(q0s32, q2s32);
-
-  d28s16 = vrshrn_n_s32(q1s32, 14);
-  d29s16 = vrshrn_n_s32(q3s32, 14);
-  d14s16 = vrshrn_n_s32(q7s32, 14);
-  d15s16 = vrshrn_n_s32(q0s32, 14);
-  *q14s16 = vcombine_s16(d28s16, d29s16);
-
-  d30s16 = vdup_n_s16(cospi_16_64);
-
-  d22s16 = vget_low_s16(*q11s16);
-  d23s16 = vget_high_s16(*q11s16);
-  q2s32 = vmull_s16(d22s16, d30s16);
-  q3s32 = vmull_s16(d23s16, d30s16);
-  q13s32 = vmull_s16(d22s16, d30s16);
-  q1s32 = vmull_s16(d23s16, d30s16);
-
-  d24s16 = vget_low_s16(*q12s16);
-  d25s16 = vget_high_s16(*q12s16);
-  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
-  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
-  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
-  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
-
-  d4s16 = vrshrn_n_s32(q2s32, 14);
-  d5s16 = vrshrn_n_s32(q3s32, 14);
-  d24s16 = vrshrn_n_s32(q13s32, 14);
-  d25s16 = vrshrn_n_s32(q1s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  *q12s16 = vcombine_s16(d24s16, d25s16);
-
-  q13s32 = vmull_s16(d10s16, d30s16);
-  q1s32 = vmull_s16(d11s16, d30s16);
-  q11s32 = vmull_s16(d10s16, d30s16);
-  q0s32 = vmull_s16(d11s16, d30s16);
-
-  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
-  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
-  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
-  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
-
-  d20s16 = vrshrn_n_s32(q13s32, 14);
-  d21s16 = vrshrn_n_s32(q1s32, 14);
-  d12s16 = vrshrn_n_s32(q11s32, 14);
-  d13s16 = vrshrn_n_s32(q0s32, 14);
-  *q10s16 = vcombine_s16(d20s16, d21s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  q5s16 = vdupq_n_s16(0);
-
-  *q9s16 = vsubq_s16(q5s16, *q9s16);
-  *q11s16 = vsubq_s16(q5s16, q2s16);
-  *q13s16 = vsubq_s16(q5s16, q6s16);
-  *q15s16 = vsubq_s16(q5s16, q4s16);
-}
-
 void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
-  int i;
-  uint8_t *d1, *d2;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  uint64x1_t d0u64, d1u64, d2u64, d3u64;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  uint16x8_t q8u16, q9u16, q10u16, q11u16;
+  const int16x8_t cospis = vld1q_s16(kCospi);
+  const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
+  const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
+  int16x8_t a[8];
 
-  q8s16 = vld1q_s16(input);
-  q9s16 = vld1q_s16(input + 8);
-  q10s16 = vld1q_s16(input + 8 * 2);
-  q11s16 = vld1q_s16(input + 8 * 3);
-  q12s16 = vld1q_s16(input + 8 * 4);
-  q13s16 = vld1q_s16(input + 8 * 5);
-  q14s16 = vld1q_s16(input + 8 * 6);
-  q15s16 = vld1q_s16(input + 8 * 7);
+  a[0] = load_tran_low_to_s16q(input + 0 * 8);
+  a[1] = load_tran_low_to_s16q(input + 1 * 8);
+  a[2] = load_tran_low_to_s16q(input + 2 * 8);
+  a[3] = load_tran_low_to_s16q(input + 3 * 8);
+  a[4] = load_tran_low_to_s16q(input + 4 * 8);
+  a[5] = load_tran_low_to_s16q(input + 5 * 8);
+  a[6] = load_tran_low_to_s16q(input + 6 * 8);
+  a[7] = load_tran_low_to_s16q(input + 7 * 8);
 
-  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                    &q15s16);
+  transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
 
   switch (tx_type) {
-    case 0:  // idct_idct is not supported. Fall back to C
-      vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
-      return;
-    case 1:  // iadst_idct
-      // generate IDCT constants
-      // GENERATE_IDCT_CONSTANTS
-
-      // first transform rows
-      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                 &q15s16);
-
-      // transpose the matrix
-      transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                        &q14s16, &q15s16);
-
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
-
-      // then transform columns
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
+    case DCT_DCT:
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
       break;
-    case 2:  // idct_iadst
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
 
-      // first transform rows
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-
-      // transpose the matrix
-      transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                        &q14s16, &q15s16);
-
-      // generate IDCT constants
-      // GENERATE_IDCT_CONSTANTS
-
-      // then transform columns
-      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                 &q15s16);
+    case ADST_DCT:
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      iadst8(a);
       break;
-    case 3:  // iadst_iadst
-      // generate IADST constants
-      // GENERATE_IADST_CONSTANTS
 
-      // first transform rows
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
-
-      // transpose the matrix
-      transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
-                        &q14s16, &q15s16);
-
-      // then transform columns
-      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                  &q15s16);
+    case DCT_ADST:
+      iadst8(a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
       break;
-    default:  // iadst_idct
-      assert(0);
+
+    default:
+      assert(tx_type == ADST_ADST);
+      iadst8(a);
+      transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+      iadst8(a);
       break;
   }
 
-  q8s16 = vrshrq_n_s16(q8s16, 5);
-  q9s16 = vrshrq_n_s16(q9s16, 5);
-  q10s16 = vrshrq_n_s16(q10s16, 5);
-  q11s16 = vrshrq_n_s16(q11s16, 5);
-  q12s16 = vrshrq_n_s16(q12s16, 5);
-  q13s16 = vrshrq_n_s16(q13s16, 5);
-  q14s16 = vrshrq_n_s16(q14s16, 5);
-  q15s16 = vrshrq_n_s16(q15s16, 5);
-
-  for (d1 = d2 = dest, i = 0; i < 2; i++) {
-    if (i != 0) {
-      q8s16 = q12s16;
-      q9s16 = q13s16;
-      q10s16 = q14s16;
-      q11s16 = q15s16;
-    }
-
-    d0u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-    d1u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-    d2u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-    d3u64 = vld1_u64((uint64_t *)d1);
-    d1 += stride;
-
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
-    q10u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
-    q11u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
-    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
-    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
-    d2 += stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
-    d2 += stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
-    d2 += stride;
-    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
-    d2 += stride;
-  }
+  idct8x8_add8x8_neon(a, dest, stride);
 }
diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h
new file mode 100644
index 0000000000..c64822e27c
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h
@@ -0,0 +1,272 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
+#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void iadst4(int16x8_t *const io) {
+  const int32x4_t c3 = vdupq_n_s32(sinpi_3_9);
+  int16x4_t x[4];
+  int32x4_t s[8], output[4];
+  const int16x4_t c =
+      create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9);
+
+  x[0] = vget_low_s16(io[0]);
+  x[1] = vget_low_s16(io[1]);
+  x[2] = vget_high_s16(io[0]);
+  x[3] = vget_high_s16(io[1]);
+
+  s[0] = vmull_lane_s16(x[0], c, 0);
+  s[1] = vmull_lane_s16(x[0], c, 1);
+  s[2] = vmull_lane_s16(x[1], c, 2);
+  s[3] = vmull_lane_s16(x[2], c, 3);
+  s[4] = vmull_lane_s16(x[2], c, 0);
+  s[5] = vmull_lane_s16(x[3], c, 1);
+  s[6] = vmull_lane_s16(x[3], c, 3);
+  s[7] = vaddl_s16(x[0], x[3]);
+  s[7] = vsubw_s16(s[7], x[2]);
+
+  s[0] = vaddq_s32(s[0], s[3]);
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[4]);
+  s[1] = vsubq_s32(s[1], s[6]);
+  s[3] = s[2];
+  s[2] = vmulq_s32(c3, s[7]);
+
+  output[0] = vaddq_s32(s[0], s[3]);
+  output[1] = vaddq_s32(s[1], s[3]);
+  output[2] = s[2];
+  output[3] = vaddq_s32(s[0], s[1]);
+  output[3] = vsubq_s32(output[3], s[3]);
+  dct_const_round_shift_low_8_dual(output, &io[0], &io[1]);
+}
+
+static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
+                                             const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8.
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+  int32x4_t t0[2], t1[2];
+
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
+  x[0] = dct_const_round_shift_low_8(t0);
+  x[1] = dct_const_round_shift_low_8(t1);
+}
+
+static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0,
+                                                 int16x8_t *const x1,
+                                                 const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8.
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1);
+  int32x4_t t0[2], t1[2];
+
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
+  *x1 = dct_const_round_shift_low_8(t0);
+  *x0 = dct_const_round_shift_low_8(t1);
+}
+
+static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0,
+                                                 int16x8_t *const x1,
+                                                 const int16x4_t c) {
+  // Don't add/sub before multiply, which will overflow in iadst8.
+  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0);
+  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0);
+  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0);
+  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0);
+  int32x4_t t0[2], t1[2];
+
+  t0[0] = vaddq_s32(x0_lo, x1_lo);
+  t0[1] = vaddq_s32(x0_hi, x1_hi);
+  t1[0] = vsubq_s32(x0_lo, x1_lo);
+  t1[1] = vsubq_s32(x0_hi, x1_hi);
+  *x1 = dct_const_round_shift_low_8(t0);
+  *x0 = dct_const_round_shift_low_8(t1);
+}
+
+static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
+}
+
+static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
+}
+
+static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);
+}
+
+static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0,
+                                                 const int16x8_t in1,
+                                                 const int16x4_t c,
+                                                 int32x4_t *const s0,
+                                                 int32x4_t *const s1) {
+  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+
+  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
+  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
+  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
+  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
+}
+
+static INLINE int16x8_t add_dct_const_round_shift_low_8(
+    const int32x4_t *const in0, const int32x4_t *const in1) {
+  int32x4_t sum[2];
+
+  sum[0] = vaddq_s32(in0[0], in1[0]);
+  sum[1] = vaddq_s32(in0[1], in1[1]);
+  return dct_const_round_shift_low_8(sum);
+}
+
+static INLINE int16x8_t sub_dct_const_round_shift_low_8(
+    const int32x4_t *const in0, const int32x4_t *const in1) {
+  int32x4_t sum[2];
+
+  sum[0] = vsubq_s32(in0[0], in1[0]);
+  sum[1] = vsubq_s32(in0[1], in1[1]);
+  return dct_const_round_shift_low_8(sum);
+}
+
+static INLINE void iadst8(int16x8_t *const io) {
+  const int16x4_t c0 =
+      create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
+  const int16x4_t c1 =
+      create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
+  const int16x4_t c2 =
+      create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
+  int16x8_t x[8], t[4];
+  int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+
+  x[0] = io[7];
+  x[1] = io[0];
+  x[2] = io[5];
+  x[3] = io[2];
+  x[4] = io[3];
+  x[5] = io[4];
+  x[6] = io[1];
+  x[7] = io[6];
+
+  // stage 1
+  iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1);
+  iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3);
+  iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5);
+  iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7);
+
+  x[0] = add_dct_const_round_shift_low_8(s0, s4);
+  x[1] = add_dct_const_round_shift_low_8(s1, s5);
+  x[2] = add_dct_const_round_shift_low_8(s2, s6);
+  x[3] = add_dct_const_round_shift_low_8(s3, s7);
+  x[4] = sub_dct_const_round_shift_low_8(s0, s4);
+  x[5] = sub_dct_const_round_shift_low_8(s1, s5);
+  x[6] = sub_dct_const_round_shift_low_8(s2, s6);
+  x[7] = sub_dct_const_round_shift_low_8(s3, s7);
+
+  // stage 2
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5);
+  iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6);
+
+  x[0] = vaddq_s16(t[0], t[2]);
+  x[1] = vaddq_s16(t[1], t[3]);
+  x[2] = vsubq_s16(t[0], t[2]);
+  x[3] = vsubq_s16(t[1], t[3]);
+  x[4] = add_dct_const_round_shift_low_8(s4, s6);
+  x[5] = add_dct_const_round_shift_low_8(s5, s7);
+  x[6] = sub_dct_const_round_shift_low_8(s4, s6);
+  x[7] = sub_dct_const_round_shift_low_8(s5, s7);
+
+  // stage 3
+  iadst_half_butterfly_neon(x + 2, c2);
+  iadst_half_butterfly_neon(x + 6, c2);
+
+  io[0] = x[0];
+  io[1] = vnegq_s16(x[4]);
+  io[2] = x[6];
+  io[3] = vnegq_s16(x[2]);
+  io[4] = x[3];
+  io[5] = vnegq_s16(x[7]);
+  io[6] = x[5];
+  io[7] = vnegq_s16(x[1]);
+}
+
+void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output,
+                                   void *const dest, const int stride,
+                                   const int highbd_flag);
+
+typedef void (*iht_1d)(const void *const input, int16_t *output,
+                       void *const dest, const int stride,
+                       const int highbd_flag);
+
+typedef struct {
+  iht_1d cols, rows;  // vertical and horizontal
+} iht_2d;
+
+#endif  // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
index 3e3530116d..c031322806 100644
--- a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
+++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
index 786fbdb794..aaccd5ca7b 100644
--- a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
+++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
index e4166775da..76d15ff8c0 100644
--- a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
+++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
diff --git a/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c b/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c
new file mode 100644
index 0000000000..e861596ad4
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/ppc/inv_txfm_vsx.h"
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+
+#include "vp9/common/vp9_enums.h"
+
+void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
+                           int tx_type) {
+  int16x8_t in[2], out[2];
+
+  in[0] = load_tran_low(0, input);
+  in[1] = load_tran_low(8 * sizeof(*input), input);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct4_vsx(in, out);
+      vpx_idct4_vsx(out, in);
+      break;
+    case ADST_DCT:
+      vpx_idct4_vsx(in, out);
+      vp9_iadst4_vsx(out, in);
+      break;
+    case DCT_ADST:
+      vp9_iadst4_vsx(in, out);
+      vpx_idct4_vsx(out, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vp9_iadst4_vsx(in, out);
+      vp9_iadst4_vsx(out, in);
+      break;
+  }
+
+  vpx_round_store4x4_vsx(in, out, dest, stride);
+}
+
+void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
+                           int tx_type) {
+  int16x8_t in[8], out[8];
+
+  // load input data
+  in[0] = load_tran_low(0, input);
+  in[1] = load_tran_low(8 * sizeof(*input), input);
+  in[2] = load_tran_low(2 * 8 * sizeof(*input), input);
+  in[3] = load_tran_low(3 * 8 * sizeof(*input), input);
+  in[4] = load_tran_low(4 * 8 * sizeof(*input), input);
+  in[5] = load_tran_low(5 * 8 * sizeof(*input), input);
+  in[6] = load_tran_low(6 * 8 * sizeof(*input), input);
+  in[7] = load_tran_low(7 * 8 * sizeof(*input), input);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct8_vsx(in, out);
+      vpx_idct8_vsx(out, in);
+      break;
+    case ADST_DCT:
+      vpx_idct8_vsx(in, out);
+      vp9_iadst8_vsx(out, in);
+      break;
+    case DCT_ADST:
+      vp9_iadst8_vsx(in, out);
+      vpx_idct8_vsx(out, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vp9_iadst8_vsx(in, out);
+      vp9_iadst8_vsx(out, in);
+      break;
+  }
+
+  vpx_round_store8x8_vsx(in, dest, stride);
+}
+
+void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
+                              int stride, int tx_type) {
+  int16x8_t in0[16], in1[16];
+
+  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0);
+  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
+               8 * sizeof(*input), in1);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct16_vsx(in0, in1);
+      vpx_idct16_vsx(in0, in1);
+      break;
+    case ADST_DCT:
+      vpx_idct16_vsx(in0, in1);
+      vpx_iadst16_vsx(in0, in1);
+      break;
+    case DCT_ADST:
+      vpx_iadst16_vsx(in0, in1);
+      vpx_idct16_vsx(in0, in1);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vpx_iadst16_vsx(in0, in1);
+      vpx_iadst16_vsx(in0, in1);
+      break;
+  }
+
+  vpx_round_store16x16_vsx(in0, in1, dest, stride);
+}
diff --git a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c
index 66aa733b9e..9e73e40ea0 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c
@@ -17,35 +17,26 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-void lock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_lock(&pool->pool_mutex);
-#else
-  (void)pool;
-#endif
+void vp9_set_mi_size(int *mi_rows, int *mi_cols, int *mi_stride, int width,
+                     int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+  *mi_cols = aligned_width >> MI_SIZE_LOG2;
+  *mi_rows = aligned_height >> MI_SIZE_LOG2;
+  *mi_stride = calc_mi_size(*mi_cols);
 }
 
-void unlock_buffer_pool(BufferPool *const pool) {
-#if CONFIG_MULTITHREAD
-  pthread_mutex_unlock(&pool->pool_mutex);
-#else
-  (void)pool;
-#endif
+void vp9_set_mb_size(int *mb_rows, int *mb_cols, int *mb_num, int mi_rows,
+                     int mi_cols) {
+  *mb_cols = (mi_cols + 1) >> 1;
+  *mb_rows = (mi_rows + 1) >> 1;
+  *mb_num = (*mb_rows) * (*mb_cols);
 }
 
 void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
-  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
-  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
-
-  cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
-  cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
-  cm->mi_stride = calc_mi_size(cm->mi_cols);
-
-  cm->mb_cols = (cm->mi_cols + 1) >> 1;
-  cm->mb_rows = (cm->mi_rows + 1) >> 1;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
+  vp9_set_mi_size(&cm->mi_rows, &cm->mi_cols, &cm->mi_stride, width, height);
+  vp9_set_mb_size(&cm->mb_rows, &cm->mb_cols, &cm->MBs, cm->mi_rows,
+                  cm->mi_cols);
 }
 
 static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
@@ -62,8 +53,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
   cm->prev_seg_map_idx = 1;
 
   cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
-  if (!cm->frame_parallel_decode)
-    cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
+  cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
 
   return 0;
 }
@@ -75,22 +65,23 @@ static void free_seg_map(VP9_COMMON *cm) {
     vpx_free(cm->seg_map_array[i]);
     cm->seg_map_array[i] = NULL;
   }
+  cm->seg_map_alloc_size = 0;
 
   cm->current_frame_seg_map = NULL;
-
-  if (!cm->frame_parallel_decode) {
-    cm->last_frame_seg_map = NULL;
-  }
+  cm->last_frame_seg_map = NULL;
 }
 
 void vp9_free_ref_frame_buffers(BufferPool *pool) {
   int i;
 
+  if (!pool) return;
+
   for (i = 0; i < FRAME_BUFFERS; ++i) {
-    if (pool->frame_bufs[i].ref_count > 0 &&
+    if (!pool->frame_bufs[i].released &&
         pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
       pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
       pool->frame_bufs[i].ref_count = 0;
+      pool->frame_bufs[i].released = 1;
     }
     vpx_free(pool->frame_bufs[i].mvs);
     pool->frame_bufs[i].mvs = NULL;
@@ -112,12 +103,13 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) {
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
-  cm->free_mi(cm);
+  if (cm->free_mi) cm->free_mi(cm);
   free_seg_map(cm);
   vpx_free(cm->above_context);
   cm->above_context = NULL;
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
+  cm->above_context_alloc_cols = 0;
   vpx_free(cm->lf.lfm);
   cm->lf.lfm = NULL;
 }
@@ -143,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->free_mi(cm);
     if (cm->alloc_mi(cm, new_mi_size)) goto fail;
   }
-
-  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
-    // Create the segmentation map structure and set to 0.
-    free_seg_map(cm);
-    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
-  }
-
   if (cm->above_context_alloc_cols < cm->mi_cols) {
     vpx_free(cm->above_context);
     cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
@@ -164,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->above_context_alloc_cols = cm->mi_cols;
   }
 
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and set to 0.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail;
+  }
+
   if (vp9_alloc_loop_filter(cm)) goto fail;
 
   return 0;
@@ -176,6 +167,9 @@ fail:
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
+  vp9_free_postproc_buffers(cm);
+#endif
   vp9_free_context_buffers(cm);
 
   vpx_free(cm->fc);
@@ -186,7 +180,7 @@ void vp9_remove_common(VP9_COMMON *cm) {
 
 void vp9_init_context_buffers(VP9_COMMON *cm) {
   cm->setup_mi(cm);
-  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+  if (cm->last_frame_seg_map)
     memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
 }
 
diff --git a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h
index a3a1638572..90cbb093d7 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h
@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_
-#define VP9_COMMON_VP9_ALLOCCOMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_
+#define VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_
 
-#define INVALID_IDX -1  // Invalid buffer index.
+#define INVALID_IDX (-1)  // Invalid buffer index.
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,6 +33,11 @@ void vp9_free_postproc_buffers(struct VP9Common *cm);
 int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
 void vp9_free_state_buffers(struct VP9Common *cm);
 
+void vp9_set_mi_size(int *mi_rows, int *mi_cols, int *mi_stride, int width,
+                     int height);
+void vp9_set_mb_size(int *mb_rows, int *mb_cols, int *mb_num, int mi_rows,
+                     int mi_cols);
+
 void vp9_set_mb_mi(struct VP9Common *cm, int width, int height);
 
 void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
@@ -41,4 +46,4 @@ void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_ALLOCCOMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_blockd.c b/media/libvpx/libvpx/vp9/common/vp9_blockd.c
index b0249687fd..4327599510 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_blockd.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_blockd.c
@@ -53,8 +53,9 @@ void vp9_foreach_transformed_block_in_plane(
   // the current block size extends into the UMV and we won't
   // visit the sub blocks that are wholly within the UMV.
   const int max_blocks_wide =
-      num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
-                                                       (5 + pd->subsampling_x));
+      num_4x4_w + (xd->mb_to_right_edge >= 0
+                       ? 0
+                       : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
   const int max_blocks_high =
       num_4x4_h + (xd->mb_to_bottom_edge >= 0
                        ? 0
diff --git a/media/libvpx/libvpx/vp9/common/vp9_blockd.h b/media/libvpx/libvpx/vp9/common/vp9_blockd.h
index 780b29208b..514d6b7764 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_blockd.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_blockd.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_BLOCKD_H_
-#define VP9_COMMON_VP9_BLOCKD_H_
+#ifndef VPX_VP9_COMMON_VP9_BLOCKD_H_
+#define VPX_VP9_COMMON_VP9_BLOCKD_H_
 
 #include "./vpx_config.h"
 
@@ -54,14 +54,22 @@ typedef struct {
 // decoder implementation modules critically rely on the defined entry values
 // specified herein. They should be refactored concurrently.
 
-#define NONE -1
+#define NO_REF_FRAME (-1)
 #define INTRA_FRAME 0
 #define LAST_FRAME 1
 #define GOLDEN_FRAME 2
 #define ALTREF_FRAME 3
 #define MAX_REF_FRAMES 4
+#define MAX_INTER_REF_FRAMES 3
+
 typedef int8_t MV_REFERENCE_FRAME;
 
+static INLINE int mv_ref_frame_to_inter_ref_idx(
+    MV_REFERENCE_FRAME mv_ref_frame) {
+  assert(mv_ref_frame >= LAST_FRAME && mv_ref_frame < MAX_REF_FRAMES);
+  return mv_ref_frame - 1;
+}
+
 // This structure now relates to 8x8 block regions.
 typedef struct MODE_INFO {
   // Common for both INTER and INTRA blocks
@@ -130,9 +138,11 @@ struct macroblockd_plane {
 
   // encoder
   const int16_t *dequant;
+
+  int *eob;
 };
 
-#define BLOCK_OFFSET(x, i) ((x) + (i)*16)
+#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
 
 typedef struct RefBuffer {
   // TODO(dkovalev): idx is not really required and should be removed, now it
@@ -173,7 +183,7 @@ typedef struct macroblockd {
   FRAME_CONTEXT *fc;
 
   /* pointers to reference frames */
-  RefBuffer *block_refs[2];
+  const RefBuffer *block_refs[2];
 
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
@@ -193,6 +203,8 @@ typedef struct macroblockd {
   int corrupted;
 
   struct vpx_internal_error_info *error_info;
+
+  PARTITION_TYPE *partition;
 } MACROBLOCKD;
 
 static INLINE PLANE_TYPE get_plane_type(int plane) {
@@ -281,8 +293,30 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
 
+#if CONFIG_MISMATCH_DEBUG
+#define TX_UNIT_SIZE_LOG2 2
+static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
+                                   int mi_row, int tx_blk_col, int tx_blk_row,
+                                   int subsampling_x, int subsampling_y) {
+  *pixel_c = ((mi_col << MI_SIZE_LOG2) >> subsampling_x) +
+             (tx_blk_col << TX_UNIT_SIZE_LOG2);
+  *pixel_r = ((mi_row << MI_SIZE_LOG2) >> subsampling_y) +
+             (tx_blk_row << TX_UNIT_SIZE_LOG2);
+}
+
+static INLINE int get_block_width(BLOCK_SIZE bsize) {
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+  return 4 * num_4x4_w;
+}
+
+static INLINE int get_block_height(BLOCK_SIZE bsize) {
+  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+  return 4 * num_4x4_h;
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_BLOCKD_H_
+#endif  // VPX_VP9_COMMON_VP9_BLOCKD_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_common.h b/media/libvpx/libvpx/vp9/common/vp9_common.h
index 666c3beaf0..d63bad93d1 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_COMMON_H_
-#define VP9_COMMON_VP9_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_COMMON_H_
+#define VPX_VP9_COMMON_VP9_COMMON_H_
 
 /* Interface header for common constant data structures and lookup tables */
 
@@ -27,44 +27,25 @@ extern "C" {
 
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp9_copy(dest, src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(dest) == sizeof(src)); \
     memcpy(dest, src, sizeof(src));      \
-  }
+  } while (0)
 
 // Use this for variably-sized arrays.
-#define vp9_copy_array(dest, src, n)       \
-  {                                        \
-    assert(sizeof(*dest) == sizeof(*src)); \
-    memcpy(dest, src, n * sizeof(*src));   \
+#define vp9_copy_array(dest, src, n)           \
+  {                                            \
+    assert(sizeof(*(dest)) == sizeof(*(src))); \
+    memcpy(dest, src, (n) * sizeof(*(src)));   \
   }
 
 #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
-#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest)))
 
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
 }
 
-#if CONFIG_DEBUG
-#define CHECK_MEM_ERROR(cm, lval, expr)                                     \
-  do {                                                                      \
-    lval = (expr);                                                          \
-    if (!lval)                                                              \
-      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,                 \
-                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
-                         __LINE__);                                         \
-  } while (0)
-#else
-#define CHECK_MEM_ERROR(cm, lval, expr)                     \
-  do {                                                      \
-    lval = (expr);                                          \
-    if (!lval)                                              \
-      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
-                         "Failed to allocate " #lval);      \
-  } while (0)
-#endif
-
 #define VP9_SYNC_CODE_0 0x49
 #define VP9_SYNC_CODE_1 0x83
 #define VP9_SYNC_CODE_2 0x42
@@ -75,4 +56,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_common_data.c b/media/libvpx/libvpx/vp9/common/vp9_common_data.c
index 4a10833229..809d7317ce 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_common_data.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_common_data.c
@@ -28,7 +28,7 @@ const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 2, 2,
 const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2,
                                                           4, 2, 4, 8, 4, 8 };
 
-// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
+// VPXMIN(3, VPXMIN(b_width_log2_lookup(bsize), b_height_log2_lookup(bsize)))
 const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2,
                                                  2, 2, 3, 3, 3, 3 };
 
diff --git a/media/libvpx/libvpx/vp9/common/vp9_common_data.h b/media/libvpx/libvpx/vp9/common/vp9_common_data.h
index 5c6a7e8ff3..a533c5f058 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_common_data.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_common_data.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_COMMON_DATA_H_
-#define VP9_COMMON_VP9_COMMON_DATA_H_
+#ifndef VPX_VP9_COMMON_VP9_COMMON_DATA_H_
+#define VPX_VP9_COMMON_VP9_COMMON_DATA_H_
 
 #include "vp9/common/vp9_enums.h"
 #include "vpx/vpx_integer.h"
@@ -42,4 +42,4 @@ extern const uint8_t need_top_left[INTRA_MODES];
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_COMMON_DATA_H_
+#endif  // VPX_VP9_COMMON_VP9_COMMON_DATA_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c b/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c
index 7d128c9f7f..28cd4a1924 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c
@@ -34,7 +34,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(file, "%2d ", *((int *)((char *)(mi[0]) + member_offset)));
+      fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
       mi++;
     }
     fprintf(file, "\n");
diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropy.c b/media/libvpx/libvpx/vp9/common/vp9_entropy.c
index a575bda729..430b917b8f 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_entropy.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_entropy.c
@@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254,
                                           177, 153, 140, 133, 130, 129 };
 #endif
 
+/* clang-format off */
 const uint8_t vp9_coefband_trans_8x8plus[1024] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
   // beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 };
+/* clang-format on */
 
 const uint8_t vp9_coefband_trans_4x4[16] = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropy.h b/media/libvpx/libvpx/vp9/common/vp9_entropy.h
index 1da4911668..d026651df7 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_entropy.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_entropy.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ENTROPY_H_
-#define VP9_COMMON_VP9_ENTROPY_H_
+#ifndef VPX_VP9_COMMON_VP9_ENTROPY_H_
+#define VPX_VP9_COMMON_VP9_ENTROPY_H_
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/prob.h"
@@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
-
 #define COEFF_PROB_MODELS 255
 
 #define UNCONSTRAINED_NODES 3
@@ -195,4 +194,4 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_ENTROPY_H_
+#endif  // VPX_VP9_COMMON_VP9_ENTROPY_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymode.c b/media/libvpx/libvpx/vp9/common/vp9_entropymode.c
index 22365efc50..9289fc9e1f 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_entropymode.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_entropymode.c
@@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                        { 93, 24, 99 },   // a split, l not split
                                        { 85, 119, 44 },  // l split, a not split
                                        { 62, 59, 67 },   // a/l both split
+
                                        // 16x16 -> 8x8
                                        { 149, 53, 53 },  // a/l both not split
                                        { 94, 20, 48 },   // a split, l not split
                                        { 83, 53, 24 },   // l split, a not split
                                        { 52, 18, 18 },   // a/l both split
+
                                        // 32x32 -> 16x16
                                        { 150, 40, 39 },  // a/l both not split
                                        { 78, 12, 26 },   // a split, l not split
                                        { 67, 33, 11 },   // l split, a not split
                                        { 24, 7, 5 },     // a/l both split
+
                                        // 64x64 -> 32x32
                                        { 174, 35, 49 },  // a/l both not split
                                        { 68, 11, 27 },   // a split, l not split
@@ -260,13 +263,13 @@ const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT
 };
 
-static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
-  9, 102, 187, 225
-};
+static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102,
+                                                                      187,
+                                                                      225 };
 
-static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
-  239, 183, 119, 96, 41
-};
+static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { 239, 183,
+                                                                    119, 96,
+                                                                    41 };
 
 static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = { 50, 126, 123, 221,
                                                            226 };
@@ -331,8 +334,8 @@ static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
 }
 
-const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] =
-    { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP };
+const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE(
+    SWITCHABLE_FILTERS)] = { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP };
 
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   int i, j;
@@ -378,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
   }
 
   if (cm->tx_mode == TX_MODE_SELECT) {
-    int j;
     unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
     unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
     unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
@@ -428,7 +430,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
   vp9_clearall_segfeatures(&cm->seg);
   cm->seg.abs_delta = SEGMENT_DELTADATA;
 
-  if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+  if (cm->last_frame_seg_map)
     memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
   if (cm->current_frame_seg_map)
@@ -457,7 +459,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
   }
 
   // prev_mip will only be allocated in encoder.
-  if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+  if (frame_is_intra_only(cm) && cm->prev_mip)
     memset(cm->prev_mip, 0,
            cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
 
diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymode.h b/media/libvpx/libvpx/vp9/common/vp9_entropymode.h
index 0ee663fe88..e616aeac5c 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_entropymode.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_entropymode.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
-#define VP9_COMMON_VP9_ENTROPYMODE_H_
+#ifndef VPX_VP9_COMMON_VP9_ENTROPYMODE_H_
+#define VPX_VP9_COMMON_VP9_ENTROPYMODE_H_
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymv.h"
@@ -24,7 +24,7 @@ extern "C" {
 
 #define TX_SIZE_CONTEXTS 2
 
-#define INTER_OFFSET(mode) ((mode)-NEARESTMV)
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
 
 struct VP9Common;
 
@@ -104,4 +104,4 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
+#endif  // VPX_VP9_COMMON_VP9_ENTROPYMODE_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymv.c b/media/libvpx/libvpx/vp9/common/vp9_entropymv.c
index a18a290cfd..b6f052d088 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_entropymv.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_entropymv.c
@@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   18,          -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10,
 };
 
-const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-  -0, -1,
-};
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 };
 
 const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2,  -1,
                                                                4,  -2, -3 };
diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymv.h b/media/libvpx/libvpx/vp9/common/vp9_entropymv.h
index e2fe37a327..ee9d37973f 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_entropymv.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_entropymv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ENTROPYMV_H_
-#define VP9_COMMON_VP9_ENTROPYMV_H_
+#ifndef VPX_VP9_COMMON_VP9_ENTROPYMV_H_
+#define VPX_VP9_COMMON_VP9_ENTROPYMV_H_
 
 #include "./vpx_config.h"
 
@@ -25,7 +25,7 @@ struct VP9Common;
 
 void vp9_init_mv_probs(struct VP9Common *cm);
 
-void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
+void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp);
 
 static INLINE int use_mv_hp(const MV *ref) {
   const int kMvRefThresh = 64;  // threshold for use of high-precision 1/8 mv
@@ -127,10 +127,10 @@ typedef struct {
   nmv_component_counts comps[2];
 } nmv_context_counts;
 
-void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx);
+void vp9_inc_mv(const MV *mv, nmv_context_counts *counts);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_ENTROPYMV_H_
+#endif  // VPX_VP9_COMMON_VP9_ENTROPYMV_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_enums.h b/media/libvpx/libvpx/vp9/common/vp9_enums.h
index 056b298b3d..b33a3a2978 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_enums.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_enums.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ENUMS_H_
-#define VP9_COMMON_VP9_ENUMS_H_
+#ifndef VPX_VP9_COMMON_VP9_ENUMS_H_
+#define VPX_VP9_COMMON_VP9_ENUMS_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE {
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
+typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG;
+
 #define BLOCK_4X4 0
 #define BLOCK_4X8 1
 #define BLOCK_8X4 2
@@ -140,4 +142,4 @@ typedef uint8_t PREDICTION_MODE;
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_ENUMS_H_
+#endif  // VPX_VP9_COMMON_VP9_ENUMS_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_filter.c b/media/libvpx/libvpx/vp9/common/vp9_filter.c
index 6c43af8ce8..adbda6c825 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_filter.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_filter.c
@@ -63,6 +63,20 @@ DECLARE_ALIGNED(256, static const InterpKernel,
   { 0, -3, 2, 41, 63, 29, -2, -2 },   { 0, -3, 1, 38, 64, 32, -1, -3 }
 };
 
-const InterpKernel *vp9_filter_kernels[4] = {
-  sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters
+// 4-tap filter
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 128, 0, 0, 0, 0 },     { 0, 0, -4, 126, 8, -2, 0, 0 },
+  { 0, 0, -6, 120, 18, -4, 0, 0 },  { 0, 0, -8, 114, 28, -6, 0, 0 },
+  { 0, 0, -10, 108, 36, -6, 0, 0 }, { 0, 0, -12, 102, 46, -8, 0, 0 },
+  { 0, 0, -12, 94, 56, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+  { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+  { 0, 0, -10, 56, 94, -12, 0, 0 }, { 0, 0, -8, 46, 102, -12, 0, 0 },
+  { 0, 0, -6, 36, 108, -10, 0, 0 }, { 0, 0, -6, 28, 114, -8, 0, 0 },
+  { 0, 0, -4, 18, 120, -6, 0, 0 },  { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+
+const InterpKernel *vp9_filter_kernels[5] = {
+  sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters,
+  sub_pel_filters_4
 };
diff --git a/media/libvpx/libvpx/vp9/common/vp9_filter.h b/media/libvpx/libvpx/vp9/common/vp9_filter.h
index 9d2b8e1dbf..0382c88e7c 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_filter.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_filter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_FILTER_H_
-#define VP9_COMMON_VP9_FILTER_H_
+#ifndef VPX_VP9_COMMON_VP9_FILTER_H_
+#define VPX_VP9_COMMON_VP9_FILTER_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -25,6 +25,7 @@ extern "C" {
 #define EIGHTTAP_SHARP 2
 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */
 #define BILINEAR 3
+#define FOURTAP 4
 // The codec can operate in four possible inter prediction filter mode:
 // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
 #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
@@ -32,10 +33,10 @@ extern "C" {
 
 typedef uint8_t INTERP_FILTER;
 
-extern const InterpKernel *vp9_filter_kernels[4];
+extern const InterpKernel *vp9_filter_kernels[5];
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_FILTER_H_
+#endif  // VPX_VP9_COMMON_VP9_FILTER_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c
index a254e79d20..889b809e50 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c
@@ -14,14 +14,17 @@
 #include "vpx_mem/vpx_mem.h"
 
 int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
   assert(list != NULL);
   vp9_free_internal_frame_buffers(list);
 
-  list->num_internal_frame_buffers =
-      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
-  list->int_fb = (InternalFrameBuffer *)vpx_calloc(
-      list->num_internal_frame_buffers, sizeof(*list->int_fb));
-  return (list->int_fb == NULL);
+  list->int_fb =
+      (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb));
+  if (list->int_fb) {
+    list->num_internal_frame_buffers = num_buffers;
+    return 0;
+  }
+  return -1;
 }
 
 void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
@@ -35,6 +38,7 @@ void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
   }
   vpx_free(list->int_fb);
   list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
 }
 
 int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
diff --git a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h
index e2cfe61b66..11be838c02 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_
-#define VP9_COMMON_VP9_FRAME_BUFFERS_H_
+#ifndef VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_
+#define VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_
 
 #include "vpx/vpx_frame_buffer.h"
 #include "vpx/vpx_integer.h"
@@ -50,4 +50,4 @@ int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb);
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_FRAME_BUFFERS_H_
+#endif  // VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_idct.c b/media/libvpx/libvpx/vp9/common/vp9_idct.c
index e3a088e287..71be0f310d 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_idct.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_idct.c
@@ -150,18 +150,22 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
 
 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
+  assert(((intptr_t)input) % 32 == 0);
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob == 1) /* DC only DCT coefficient. */
     vpx_idct16x16_1_add(input, dest, stride);
   else if (eob <= 10)
     vpx_idct16x16_10_add(input, dest, stride);
+  else if (eob <= 38)
+    vpx_idct16x16_38_add(input, dest, stride);
   else
     vpx_idct16x16_256_add(input, dest, stride);
 }
 
 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                        int eob) {
+  assert(((intptr_t)input) % 32 == 0);
   if (eob == 1)
     vpx_idct32x32_1_add(input, dest, stride);
   else if (eob <= 34)
@@ -203,7 +207,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 
-void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int tx_type, int bd) {
   const highbd_transform_2d IHT_4[] = {
     { vpx_highbd_idct4_c, vpx_highbd_idct4_c },   // DCT_DCT  = 0
@@ -211,7 +215,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
     { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },  // DCT_ADST = 2
     { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }  // ADST_ADST = 3
   };
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   int i, j;
   tran_low_t out[4 * 4];
@@ -243,14 +246,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
   { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }  // ADST_ADST = 3
 };
 
-void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int tx_type, int bd) {
   int i, j;
   tran_low_t out[8 * 8];
   tran_low_t *outptr = out;
   tran_low_t temp_in[8], temp_out[8];
   const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Inverse transform row vectors.
   for (i = 0; i < 8; ++i) {
@@ -277,14 +279,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
   { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }  // ADST_ADST = 3
 };
 
-void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int tx_type, int bd) {
   int i, j;
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
   const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Rows
   for (i = 0; i < 16; ++i) {
@@ -305,7 +306,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
 }
 
 // idct
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd) {
   if (eob > 1)
     vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
@@ -313,7 +314,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
     vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
 }
 
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd) {
   if (eob > 1)
     vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
@@ -321,7 +322,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
     vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
 }
 
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd) {
   // If dc is 1, then input[0] is the reconstructed value, do not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -338,7 +339,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
   }
 }
 
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd) {
   // The calculation can be simplified if there are not many non-zero dct
   // coefficients. Use eobs to separate different cases.
@@ -347,18 +348,22 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
     vpx_highbd_idct16x16_1_add(input, dest, stride, bd);
   } else if (eob <= 10) {
     vpx_highbd_idct16x16_10_add(input, dest, stride, bd);
+  } else if (eob <= 38) {
+    vpx_highbd_idct16x16_38_add(input, dest, stride, bd);
   } else {
     vpx_highbd_idct16x16_256_add(input, dest, stride, bd);
   }
 }
 
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd) {
   // Non-zero coeff only in upper-left 8x8
   if (eob == 1) {
     vpx_highbd_idct32x32_1_add(input, dest, stride, bd);
   } else if (eob <= 34) {
     vpx_highbd_idct32x32_34_add(input, dest, stride, bd);
+  } else if (eob <= 135) {
+    vpx_highbd_idct32x32_135_add(input, dest, stride, bd);
   } else {
     vpx_highbd_idct32x32_1024_add(input, dest, stride, bd);
   }
@@ -366,7 +371,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
 
 // iht
 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd) {
+                           uint16_t *dest, int stride, int eob, int bd) {
   if (tx_type == DCT_DCT)
     vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
   else
@@ -374,7 +379,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
 }
 
 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd) {
+                           uint16_t *dest, int stride, int eob, int bd) {
   if (tx_type == DCT_DCT) {
     vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
   } else {
@@ -383,7 +388,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
 }
 
 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
-                             uint8_t *dest, int stride, int eob, int bd) {
+                             uint16_t *dest, int stride, int eob, int bd) {
   if (tx_type == DCT_DCT) {
     vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
   } else {
diff --git a/media/libvpx/libvpx/vp9/common/vp9_idct.h b/media/libvpx/libvpx/vp9/common/vp9_idct.h
index ea958a38c0..94eeaf599e 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_idct.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_idct.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_IDCT_H_
-#define VP9_COMMON_VP9_IDCT_H_
+#ifndef VPX_VP9_COMMON_VP9_IDCT_H_
+#define VPX_VP9_COMMON_VP9_IDCT_H_
 
 #include <assert.h>
 
@@ -57,25 +57,25 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                       int stride, int eob);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd);
-void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd);
-void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
                             int eob, int bd);
-void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd);
-void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
+void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd);
 void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd);
+                           uint16_t *dest, int stride, int eob, int bd);
 void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
-                           uint8_t *dest, int stride, int eob, int bd);
+                           uint16_t *dest, int stride, int eob, int bd);
 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
-                             uint8_t *dest, int stride, int eob, int bd);
+                             uint16_t *dest, int stride, int eob, int bd);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_IDCT_H_
+#endif  // VPX_VP9_COMMON_VP9_IDCT_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c
index ef0297dd5e..1a9d45ae77 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c
@@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
-                    MODE_INFO **mi, const int mode_info_stride,
+                    MODE_INFO **mi8x8, const int mode_info_stride,
                     LOOP_FILTER_MASK *lfm) {
   int idx_32, idx_16, idx_8;
   const loop_filter_info_n *const lfi_n = &cm->lf_info;
-  MODE_INFO **mip = mi;
-  MODE_INFO **mip2 = mi;
+  MODE_INFO **mip = mi8x8;
+  MODE_INFO **mip2 = mi8x8;
 
   // These are offsets to the next mi in the 64x64 block. It is what gets
   // added to the mi ptr as we go through each loop. It helps us to avoid
@@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
       break;
     default:
       for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
-        const int shift_y = shift_32_y[idx_32];
-        const int shift_uv = shift_32_uv[idx_32];
+        const int shift_y_32 = shift_32_y[idx_32];
+        const int shift_uv_32 = shift_32_uv[idx_32];
         const int mi_32_col_offset = ((idx_32 & 1) << 2);
         const int mi_32_row_offset = ((idx_32 >> 1) << 2);
         if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
           continue;
         switch (mip[0]->sb_type) {
           case BLOCK_32X32:
-            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
             break;
           case BLOCK_32X16:
-            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
             if (mi_32_row_offset + 2 >= max_rows) continue;
             mip2 = mip + mode_info_stride * 2;
-            build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
+            build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm);
             break;
           case BLOCK_16X32:
-            build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+            build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
             if (mi_32_col_offset + 2 >= max_cols) continue;
             mip2 = mip + 2;
-            build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
+            build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm);
             break;
           default:
             for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
-              const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
-              const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+              const int shift_y_16 = shift_y_32 + shift_16_y[idx_16];
+              const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16];
               const int mi_16_col_offset =
                   mi_32_col_offset + ((idx_16 & 1) << 1);
               const int mi_16_row_offset =
@@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
 
               switch (mip[0]->sb_type) {
                 case BLOCK_16X16:
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
                   break;
                 case BLOCK_16X8:
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
                   if (mi_16_row_offset + 1 >= max_rows) continue;
                   mip2 = mip + mode_info_stride;
-                  build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm);
                   break;
                 case BLOCK_8X16:
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm);
                   if (mi_16_col_offset + 1 >= max_cols) continue;
                   mip2 = mip + 1;
-                  build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm);
                   break;
                 default: {
-                  const int shift_y =
-                      shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0];
-                  build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                  const int shift_y_8_0 = shift_y_16 + shift_8_y[0];
+                  build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm);
                   mip += offset[0];
                   for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
-                    const int shift_y = shift_32_y[idx_32] +
-                                        shift_16_y[idx_16] + shift_8_y[idx_8];
+                    const int shift_y_8 = shift_y_16 + shift_8_y[idx_8];
                     const int mi_8_col_offset =
                         mi_16_col_offset + ((idx_8 & 1));
                     const int mi_8_row_offset =
@@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
                     if (mi_8_col_offset >= max_cols ||
                         mi_8_row_offset >= max_rows)
                       continue;
-                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
+                    build_y_mask(lfi_n, mip[0], shift_y_8, lfm);
                   }
                   break;
                 }
@@ -1087,13 +1085,19 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
   const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
-  unsigned int mask_16x16[MI_BLOCK_SIZE] = { 0 };
-  unsigned int mask_8x8[MI_BLOCK_SIZE] = { 0 };
-  unsigned int mask_4x4[MI_BLOCK_SIZE] = { 0 };
-  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = { 0 };
+  unsigned int mask_16x16[MI_BLOCK_SIZE];
+  unsigned int mask_8x8[MI_BLOCK_SIZE];
+  unsigned int mask_4x4[MI_BLOCK_SIZE];
+  unsigned int mask_4x4_int[MI_BLOCK_SIZE];
   uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
   int r, c;
 
+  vp9_zero(mask_16x16);
+  vp9_zero(mask_8x8);
+  vp9_zero(mask_4x4);
+  vp9_zero(mask_4x4_int);
+  vp9_zero(lfl);
+
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
     unsigned int mask_16x16_c = 0;
     unsigned int mask_8x8_c = 0;
@@ -1174,7 +1178,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
     }
 
     // Disable filtering on the leftmost column
-    border_mask = ~(mi_col == 0);
+    border_mask = ~(mi_col == 0 ? 1u : 0u);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
       highbd_filter_selectively_vert(
@@ -1330,6 +1334,8 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
   uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
   uint16_t mask_4x4_int = lfm->int_4x4_uv;
 
+  vp9_zero(lfl_uv);
+
   assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
 
   // Vertical pass: do 2 rows at one time
@@ -1612,12 +1618,14 @@ void vp9_loop_filter_data_reset(
 
 void vp9_reset_lfm(VP9_COMMON *const cm) {
   if (cm->lf.filter_level) {
-    memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) *
-                              cm->lf.lfm_stride * sizeof(*cm->lf.lfm));
+    memset(cm->lf.lfm, 0,
+           ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride *
+               sizeof(*cm->lf.lfm));
   }
 }
 
-int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+int vp9_loop_filter_worker(void *arg1, void *unused) {
+  LFWorkerData *const lf_data = (LFWorkerData *)arg1;
   (void)unused;
   loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                    lf_data->start, lf_data->stop, lf_data->y_only);
diff --git a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h
index da37a6ebde..39648a72c3 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_LOOPFILTER_H_
-#define VP9_COMMON_VP9_LOOPFILTER_H_
+#ifndef VPX_VP9_COMMON_VP9_LOOPFILTER_H_
+#define VPX_VP9_COMMON_VP9_LOOPFILTER_H_
 
 #include "vpx_ports/mem.h"
 #include "./vpx_config.h"
@@ -97,7 +97,7 @@ struct VP9LfSyncData;
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
 void vp9_setup_mask(struct VP9Common *const cm, const int mi_row,
-                    const int mi_col, MODE_INFO **mi_8x8,
+                    const int mi_col, MODE_INFO **mi8x8,
                     const int mode_info_stride, LOOP_FILTER_MASK *lfm);
 
 void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
@@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm);
 void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl);
 
 void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm,
-                           struct macroblockd *mbd, int filter_level,
+                           struct macroblockd *xd, int frame_filter_level,
                            int y_only, int partial_frame);
 
 // Get the superblock lfm for a given mi_row, mi_col.
@@ -151,10 +151,10 @@ void vp9_loop_filter_data_reset(
     LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
     struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
 
-// Operates on the rows described by 'lf_data'.
-int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
+// Operates on the rows described by 'arg1' (cast to LFWorkerData *).
+int vp9_loop_filter_worker(void *arg1, void *unused);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_LOOPFILTER_H_
+#endif  // VPX_VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_mfqe.c b/media/libvpx/libvpx/vp9/common/vp9_mfqe.c
index e76d771b8d..cf60fa40fd 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_mfqe.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_mfqe.c
@@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
   const int bsl = b_width_log2_lookup[bs];
   PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
   const BLOCK_SIZE subsize = get_subsize(bs, partition);
+  BLOCK_SIZE mfqe_bs, bs_tmp;
 
   if (cur_bs < BLOCK_8X8) {
     // If there are blocks smaller than 8x8, it must be on the boundary.
@@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
     uv_offset = 8;
   }
   switch (partition) {
-    BLOCK_SIZE mfqe_bs, bs_tmp;
     case PARTITION_HORZ:
       if (bs == BLOCK_64X64) {
         mfqe_bs = BLOCK_64X32;
diff --git a/media/libvpx/libvpx/vp9/common/vp9_mfqe.h b/media/libvpx/libvpx/vp9/common/vp9_mfqe.h
index dfff8c23d6..f53e1c2f9d 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_mfqe.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_mfqe.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_MFQE_H_
-#define VP9_COMMON_VP9_MFQE_H_
+#ifndef VPX_VP9_COMMON_VP9_MFQE_H_
+#define VPX_VP9_COMMON_VP9_MFQE_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,4 +28,4 @@ void vp9_mfqe(struct VP9Common *cm);
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_MFQE_H_
+#endif  // VPX_VP9_COMMON_VP9_MFQE_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_mv.h b/media/libvpx/libvpx/vp9/common/vp9_mv.h
index 4c8eac7213..76f93cf0ba 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_mv.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_mv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_MV_H_
-#define VP9_COMMON_VP9_MV_H_
+#ifndef VPX_VP9_COMMON_VP9_MV_H_
+#define VPX_VP9_COMMON_VP9_MV_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -19,6 +19,8 @@
 extern "C" {
 #endif
 
+#define INVALID_MV 0x80008000
+
 typedef struct mv {
   int16_t row;
   int16_t col;
@@ -52,4 +54,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_MV_H_
+#endif  // VPX_VP9_COMMON_VP9_MV_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h
index 2b2c1ba9ee..5db6772dca 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
-#define VP9_COMMON_VP9_MVREF_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_MVREF_COMMON_H_
+#define VPX_VP9_COMMON_VP9_MVREF_COMMON_H_
 
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_blockd.h"
@@ -263,10 +263,10 @@ static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref,
                                  mv_ref_list, Done)                           \
   do {                                                                        \
     if (is_inter_block(mbmi)) {                                               \
-      if ((mbmi)->ref_frame[0] != ref_frame)                                  \
+      if ((mbmi)->ref_frame[0] != (ref_frame))                                \
         ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias),        \
                         refmv_count, mv_ref_list, Done);                      \
-      if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame &&        \
+      if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != (ref_frame) &&      \
           (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int)                       \
         ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias),        \
                         refmv_count, mv_ref_list, Done);                      \
@@ -320,4 +320,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_MVREF_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
index 32db7b7aa7..4c8fcf6989 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h
@@ -8,12 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_ONYXC_INT_H_
-#define VP9_COMMON_VP9_ONYXC_INT_H_
+#ifndef VPX_VP9_COMMON_VP9_ONYXC_INT_H_
+#define VPX_VP9_COMMON_VP9_ONYXC_INT_H_
 
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_util/vpx_thread.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
@@ -37,13 +36,9 @@ extern "C" {
 #define REF_FRAMES_LOG2 3
 #define REF_FRAMES (1 << REF_FRAMES_LOG2)
 
-// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
-// in parallel, 3 for scaled references on the encoder.
-// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
-// of framebuffers.
-// TODO(jkoleszar): These 3 extra references could probably come from the
-// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 7)
+// 1 scratch frame for the new frame, REFS_PER_FRAME for scaled references on
+// the encoder.
+#define FRAME_BUFFERS (REF_FRAMES + 1 + REFS_PER_FRAME)
 
 #define FRAME_CONTEXTS_LOG2 2
 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
@@ -72,30 +67,22 @@ typedef struct {
   MV_REF *mvs;
   int mi_rows;
   int mi_cols;
+  uint8_t released;
+
+  // Note that frame_index/frame_coding_index are only set by set_frame_index()
+  // on the encoder side.
+
+  // TODO(angiebird): Set frame_index/frame_coding_index on the decoder side
+  // properly.
+  int frame_index;         // Display order in the video, it's equivalent to the
+                           // show_idx defined in EncodeFrameInfo.
+  int frame_coding_index;  // The coding order (starting from zero) of this
+                           // frame.
   vpx_codec_frame_buffer_t raw_frame_buffer;
   YV12_BUFFER_CONFIG buf;
-
-  // The Following variables will only be used in frame parallel decode.
-
-  // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
-  // that no FrameWorker owns, or is decoding, this buffer.
-  VPxWorker *frame_worker_owner;
-
-  // row and col indicate which position frame has been decoded to in real
-  // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
-  // when the frame is fully decoded.
-  int row;
-  int col;
 } RefCntBuffer;
 
 typedef struct BufferPool {
-// Protect BufferPool from being accessed by several FrameWorkers at
-// the same time during frame parallel decode.
-// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t pool_mutex;
-#endif
-
   // Private data associated with the frame buffer callbacks.
   void *cb_priv;
 
@@ -149,6 +136,8 @@ typedef struct VP9Common {
 
   int new_fb_idx;
 
+  int cur_show_frame_fb_idx;
+
 #if CONFIG_VP9_POSTPROC
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG post_proc_buffer_int;
@@ -235,10 +224,6 @@ typedef struct VP9Common {
   struct loopfilter lf;
   struct segmentation seg;
 
-  // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
-  // in pbi.
-  int frame_parallel_decode;  // frame-based threading.
-
   // Context probabilities for reference frame prediction
   MV_REFERENCE_FRAME comp_fixed_ref;
   MV_REFERENCE_FRAME comp_var_ref[2];
@@ -249,7 +234,14 @@ typedef struct VP9Common {
   unsigned int frame_context_idx; /* Context to use/update */
   FRAME_COUNTS counts;
 
+  // TODO(angiebird): current_video_frame/current_frame_coding_index into a
+  // structure
   unsigned int current_video_frame;
+  // Each show or no show frame is assigned with a coding index based on its
+  // coding order (starting from zero).
+
+  // Current frame's coding index.
+  int current_frame_coding_index;
   BITSTREAM_PROFILE profile;
 
   // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
@@ -267,26 +259,63 @@ typedef struct VP9Common {
   int byte_alignment;
   int skip_loop_filter;
 
-  // Private data associated with the frame buffer callbacks.
-  void *cb_priv;
-  vpx_get_frame_buffer_cb_fn_t get_fb_cb;
-  vpx_release_frame_buffer_cb_fn_t release_fb_cb;
-
-  // Handles memory for the codec.
-  InternalFrameBufferList int_frame_buffers;
-
   // External BufferPool passed from outside.
   BufferPool *buffer_pool;
 
   PARTITION_CONTEXT *above_seg_context;
   ENTROPY_CONTEXT *above_context;
   int above_context_alloc_cols;
+
+  int lf_row;
 } VP9_COMMON;
 
-// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
-// frame reference count.
-void lock_buffer_pool(BufferPool *const pool);
-void unlock_buffer_pool(BufferPool *const pool);
+static INLINE void init_frame_indexes(VP9_COMMON *cm) {
+  cm->current_video_frame = 0;
+  cm->current_frame_coding_index = 0;
+}
+
+static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) {
+  if (show_frame) {
+    // Don't increment frame counters if this was an altref buffer
+    // update not a real frame
+    ++cm->current_video_frame;
+  }
+  ++cm->current_frame_coding_index;
+}
+
+typedef struct {
+  int frame_width;
+  int frame_height;
+  int render_frame_width;
+  int render_frame_height;
+  int mi_rows;
+  int mi_cols;
+  int mb_rows;
+  int mb_cols;
+  int num_mbs;
+  vpx_bit_depth_t bit_depth;
+} FRAME_INFO;
+
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+                                   const VP9_COMMON *cm) {
+  frame_info->frame_width = cm->width;
+  frame_info->frame_height = cm->height;
+  frame_info->render_frame_width = cm->render_width;
+  frame_info->render_frame_height = cm->render_height;
+  frame_info->mi_cols = cm->mi_cols;
+  frame_info->mi_rows = cm->mi_rows;
+  frame_info->mb_cols = cm->mb_cols;
+  frame_info->mb_rows = cm->mb_rows;
+  frame_info->num_mbs = cm->MBs;
+  frame_info->bit_depth = cm->bit_depth;
+  // TODO(angiebird): Figure out how to get subsampling_x/y here
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) {
+  if (index < 0 || index >= FRAME_BUFFERS) return NULL;
+  if (cm->error.error_code != VPX_CODEC_OK) return NULL;
+  return &cm->buffer_pool->frame_bufs[index].buf;
+}
 
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
   if (index < 0 || index >= REF_FRAMES) return NULL;
@@ -303,7 +332,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
   int i;
 
-  lock_buffer_pool(cm->buffer_pool);
   for (i = 0; i < FRAME_BUFFERS; ++i)
     if (frame_bufs[i].ref_count == 0) break;
 
@@ -314,7 +342,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) {
     i = INVALID_IDX;
   }
 
-  unlock_buffer_pool(cm->buffer_pool);
   return i;
 }
 
@@ -342,7 +369,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm,
   xd->partition_probs =
       frame_is_intra_only(cm)
           ? &vp9_kf_partition_probs[0]
-          : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+          : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob;
 }
 
 static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -437,4 +464,4 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_ONYXC_INT_H_
+#endif  // VPX_VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_postproc.c b/media/libvpx/libvpx/vp9/common/vp9_postproc.c
index b105e5d45a..c777556050 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_postproc.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_postproc.c
@@ -119,8 +119,8 @@ void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows,
   uint16_t d[16];
 
   for (r = 0; r < rows; r++) {
-    int sumsq = 0;
-    int sum = 0;
+    int64_t sumsq = 0;
+    int64_t sum = 0;
 
     for (i = -8; i <= 6; i++) {
       sumsq += s[i] * s[i];
@@ -157,8 +157,8 @@ void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch, int rows, int cols,
 
   for (c = 0; c < cols; c++) {
     uint16_t *s = &dst[c];
-    int sumsq = 0;
-    int sum = 0;
+    int64_t sumsq = 0;
+    int64_t sum = 0;
     uint16_t d[16];
     const int16_t *rv2 = rv3 + ((c * 17) & 127);
 
@@ -183,7 +183,8 @@ void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch, int rows, int cols,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
+static void deblock_and_de_macro_block(VP9_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *source,
                                        YV12_BUFFER_CONFIG *post, int q,
                                        int low_var_thresh, int flag,
                                        uint8_t *limits) {
@@ -216,7 +217,7 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
         source->uv_height, source->uv_width, ppl);
   } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    vp9_deblock(source, post, q, limits);
+    vp9_deblock(cm, source, post, q, limits);
     vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                               post->y_width, q2mbl(q));
     vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
@@ -226,8 +227,8 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
-                 uint8_t *limits) {
+void vp9_deblock(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src,
+                 YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits) {
   const int ppl =
       (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q + 0.0065 + 0.5);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -252,10 +253,8 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
   } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     int mbr;
-    const int mb_rows = src->y_height / 16;
-    const int mb_cols = src->y_width / 16;
-
-    memset(limits, (unsigned char)ppl, 16 * mb_cols);
+    const int mb_rows = cm->mb_rows;
+    memset(limits, (unsigned char)ppl, cm->postproc_state.limits_size);
 
     for (mbr = 0; mbr < mb_rows; mbr++) {
       vpx_post_proc_down_and_across_mb_row(
@@ -276,9 +275,9 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
-                 uint8_t *limits) {
-  vp9_deblock(src, dst, q, limits);
+void vp9_denoise(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src,
+                 YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits) {
+  vp9_deblock(cm, src, dst, q, limits);
 }
 
 static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
@@ -293,11 +292,12 @@ static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
 }
 
 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *ppflags) {
+                        vp9_ppflags_t *ppflags, int unscaled_width) {
   const int q = VPXMIN(105, cm->lf.filter_level * 2);
   const int flags = ppflags->post_proc_flag;
   YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
   struct postproc_state *const ppstate = &cm->postproc_state;
+  ppstate->limits_size = unscaled_width;
 
   if (!cm->frame_to_show) return -1;
 
@@ -339,10 +339,10 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
                            "Failed to allocate MFQE framebuffer");
       }
 
-      // Ensure that postproc is set to all 0s so that post proc
+      // Ensure that postproc is set to flat image so that post proc
       // doesn't pull random data in from edge.
       memset(cm->post_proc_buffer_int.buffer_alloc, 128,
-             cm->post_proc_buffer.frame_size);
+             cm->post_proc_buffer_int.frame_size);
     }
   }
 
@@ -352,14 +352,18 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
                                cm->use_highbitdepth,
 #endif
                                VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
-                               NULL, NULL, NULL) < 0)
+                               NULL, NULL, NULL) < 0) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate post-processing buffer");
+  }
+  memset(cm->post_proc_buffer.buffer_alloc, 128,
+         cm->post_proc_buffer.frame_size);
 
   if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) {
     if (!cm->postproc_state.limits) {
       cm->postproc_state.limits =
-          vpx_calloc(cm->width, sizeof(*cm->postproc_state.limits));
+          vpx_calloc(ppstate->limits_size, sizeof(*cm->postproc_state.limits));
+      if (!cm->postproc_state.limits) return 1;
     }
   }
 
@@ -380,26 +384,26 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
     // if mfqe is enabled. Need to take both the quality and the speed
     // into consideration.
     if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
-      vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+      vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
     }
     if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
-      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+      deblock_and_de_macro_block(cm, &cm->post_proc_buffer_int, ppbuf,
                                  q + (ppflags->deblocking_level - 5) * 10, 1, 0,
                                  cm->postproc_state.limits);
     } else if (flags & VP9D_DEBLOCK) {
-      vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
+      vp9_deblock(cm, &cm->post_proc_buffer_int, ppbuf, q,
                   cm->postproc_state.limits);
     } else {
-      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+      vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
     }
   } else if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
+    deblock_and_de_macro_block(cm, cm->frame_to_show, ppbuf,
                                q + (ppflags->deblocking_level - 5) * 10, 1, 0,
                                cm->postproc_state.limits);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
+    vp9_deblock(cm, cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
   } else {
-    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+    vpx_yv12_copy_frame(cm->frame_to_show, ppbuf);
   }
 
   ppstate->last_base_qindex = cm->base_qindex;
diff --git a/media/libvpx/libvpx/vp9/common/vp9_postproc.h b/media/libvpx/libvpx/vp9/common/vp9_postproc.h
index 6059094114..ef6f4ea5f4 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_postproc.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_postproc.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_POSTPROC_H_
-#define VP9_COMMON_VP9_POSTPROC_H_
+#ifndef VPX_VP9_COMMON_VP9_POSTPROC_H_
+#define VPX_VP9_COMMON_VP9_POSTPROC_H_
 
 #include "vpx_ports/mem.h"
 #include "vpx_scale/yv12config.h"
@@ -30,6 +30,7 @@ struct postproc_state {
   MODE_INFO *prev_mi;
   int clamp;
   uint8_t *limits;
+  int limits_size;
   int8_t *generated_noise;
 };
 
@@ -38,16 +39,16 @@ struct VP9Common;
 #define MFQE_PRECISION 4
 
 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *flags);
+                        vp9_ppflags_t *ppflags, int unscaled_width);
 
-void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
-                 uint8_t *limits);
+void vp9_denoise(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src,
+                 YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits);
 
-void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
-                 uint8_t *limits);
+void vp9_deblock(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src,
+                 YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_POSTPROC_H_
+#endif  // VPX_VP9_COMMON_VP9_POSTPROC_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_ppflags.h b/media/libvpx/libvpx/vp9/common/vp9_ppflags.h
index b8b647bf18..a0e3017626 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_ppflags.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_ppflags.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_PPFLAGS_H_
-#define VP9_COMMON_VP9_PPFLAGS_H_
+#ifndef VPX_VP9_COMMON_VP9_PPFLAGS_H_
+#define VPX_VP9_COMMON_VP9_PPFLAGS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,4 +33,4 @@ typedef struct {
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_PPFLAGS_H_
+#endif  // VPX_VP9_COMMON_VP9_PPFLAGS_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_pred_common.c b/media/libvpx/libvpx/vp9/common/vp9_pred_common.c
index a7ddc0b951..375cb4d76c 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_pred_common.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_pred_common.c
@@ -13,6 +13,32 @@
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
+int vp9_compound_reference_allowed(const VP9_COMMON *cm) {
+  int i;
+  for (i = 1; i < REFS_PER_FRAME; ++i)
+    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1;
+
+  return 0;
+}
+
+void vp9_setup_compound_reference_mode(VP9_COMMON *cm) {
+  if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+      cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+    cm->comp_fixed_ref = ALTREF_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = GOLDEN_FRAME;
+  } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+             cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+    cm->comp_fixed_ref = GOLDEN_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = ALTREF_FRAME;
+  } else {
+    cm->comp_fixed_ref = LAST_FRAME;
+    cm->comp_var_ref[0] = GOLDEN_FRAME;
+    cm->comp_var_ref[1] = ALTREF_FRAME;
+  }
+}
+
 int vp9_get_reference_mode_context(const VP9_COMMON *cm,
                                    const MACROBLOCKD *xd) {
   int ctx;
@@ -229,9 +255,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
         else
           pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
       } else {
-        pred_context = 1 +
-                       2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
-                            edge_mi->ref_frame[1] == GOLDEN_FRAME);
+        pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+                                edge_mi->ref_frame[1] == GOLDEN_FRAME);
       }
     } else {  // inter/inter
       const int above_has_second = has_second_ref(above_mi);
diff --git a/media/libvpx/libvpx/vp9/common/vp9_pred_common.h b/media/libvpx/libvpx/vp9/common/vp9_pred_common.h
index 8400bd70f1..ee59669359 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_pred_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_pred_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_PRED_COMMON_H_
-#define VP9_COMMON_VP9_PRED_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_PRED_COMMON_H_
+#define VPX_VP9_COMMON_VP9_PRED_COMMON_H_
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -145,6 +145,10 @@ static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
   return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
 }
 
+int vp9_compound_reference_allowed(const VP9_COMMON *cm);
+
+void vp9_setup_compound_reference_mode(VP9_COMMON *cm);
+
 // Returns a context number for the given MB prediction signal
 // The mode info data structure has a one element border above and to the
 // left of the entries corresponding to real blocks.
@@ -176,12 +180,6 @@ static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
   }
 }
 
-static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
-                                            const MACROBLOCKD *xd,
-                                            const struct tx_probs *tx_probs) {
-  return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
-}
-
 static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
                                           struct tx_counts *tx_counts) {
   switch (max_tx_size) {
@@ -196,4 +194,4 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_PRED_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_quant_common.h b/media/libvpx/libvpx/vp9/common/vp9_quant_common.h
index 4bae4a8967..ec8b9f4c6a 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_quant_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_quant_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
-#define VP9_COMMON_VP9_QUANT_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_QUANT_COMMON_H_
+#define VPX_VP9_COMMON_VP9_QUANT_COMMON_H_
 
 #include "vpx/vpx_codec.h"
 #include "vp9/common/vp9_seg_common.h"
@@ -33,4 +33,4 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_QUANT_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconinter.c b/media/libvpx/libvpx/vp9/common/vp9_reconinter.c
index 8eb7126898..0a60b853d8 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_reconinter.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_reconinter.c
@@ -13,15 +13,16 @@
 #include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
 
-#include "vpx/vpx_integer.h"
-
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+#include "vpx/vpx_integer.h"
+#include "vpx_scale/yv12config.h"
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_build_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
     const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
     const InterpKernel *kernel, enum mv_precision precision, int x, int y,
     int bd) {
@@ -63,14 +64,14 @@ static INLINE int round_mv_comp_q4(int value) {
 }
 
 static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
-  MV res = {
-    round_mv_comp_q4(
-        mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row +
-        mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row),
-    round_mv_comp_q4(
-        mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col +
-        mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col)
-  };
+  MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row +
+                              mi->bmi[1].as_mv[idx].as_mv.row +
+                              mi->bmi[2].as_mv[idx].as_mv.row +
+                              mi->bmi[3].as_mv[idx].as_mv.row),
+             round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col +
+                              mi->bmi[1].as_mv[idx].as_mv.col +
+                              mi->bmi[2].as_mv[idx].as_mv.col +
+                              mi->bmi[3].as_mv[idx].as_mv.col) };
   return res;
 }
 
@@ -96,8 +97,8 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw,
   const int spel_right = spel_left - SUBPEL_SHIFTS;
   const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
   const int spel_bottom = spel_top - SUBPEL_SHIFTS;
-  MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
-                    src_mv->col * (1 << (1 - ss_x)) };
+  MV clamped_mv = { (short)(src_mv->row * (1 << (1 - ss_y))),
+                    (short)(src_mv->col * (1 << (1 - ss_x))) };
   assert(ss_x <= 1);
   assert(ss_y <= 1);
 
@@ -136,7 +137,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
     struct buf_2d *const pre_buf = &pd->pre[ref];
     struct buf_2d *const dst_buf = &pd->dst;
-    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+    uint8_t *const dst = dst_buf->buf + (int64_t)dst_buf->stride * y + x;
     const MV mv = mi->sb_type < BLOCK_8X8
                       ? average_split_mvs(pd, mi, ref, block)
                       : mi->mv[ref].as_mv;
@@ -158,18 +159,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       // Co-ordinate of containing block to pixel precision.
       const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+      const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
+      uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer,
+                               ref_buf->v_buffer };
+      const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride,
+                                   ref_buf->uv_stride };
 #if 0  // CONFIG_BETTER_HW_COMPATIBILITY
       assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
              xd->mi[0]->sb_type != BLOCK_8X4);
       assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
              mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x)));
 #endif
-      if (plane == 0)
-        pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
-      else if (plane == 1)
-        pre_buf->buf = xd->block_refs[ref]->buf->u_buffer;
-      else
-        pre_buf->buf = xd->block_refs[ref]->buf->v_buffer;
+      pre_buf->buf = buf_array[plane];
+      pre_buf->stride = stride_array[plane];
 
       pre_buf->buf +=
           scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf);
@@ -178,7 +180,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       xs = sf->x_step_q4;
       ys = sf->y_step_q4;
     } else {
-      pre = pre_buf->buf + (y * pre_buf->stride + x);
+      pre = pre_buf->buf + ((int64_t)y * pre_buf->stride + x);
       scaled_mv.row = mv_q4.row;
       scaled_mv.col = mv_q4.col;
       xs = ys = 16;
@@ -190,7 +192,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+      highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride,
+                             CONVERT_TO_SHORTPTR(dst), dst_buf->stride,
                              subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
                              xd->bd);
     } else {
diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconinter.h b/media/libvpx/libvpx/vp9/common/vp9_reconinter.h
index 4fed4f7f6e..12b545831a 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_reconinter.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_reconinter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_RECONINTER_H_
-#define VP9_COMMON_VP9_RECONINTER_H_
+#ifndef VPX_VP9_COMMON_VP9_RECONINTER_H_
+#define VPX_VP9_COMMON_VP9_RECONINTER_H_
 
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -26,19 +26,19 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
                                    const struct scale_factors *sf, int w, int h,
                                    int ref, const InterpKernel *kernel, int xs,
                                    int ys) {
-  sf->predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
-      ys, w, h);
+  sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst,
+                                                 dst_stride, kernel, subpel_x,
+                                                 xs, subpel_y, ys, w, h);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static INLINE void highbd_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
     const int subpel_x, const int subpel_y, const struct scale_factors *sf,
     int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
   sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
-      ys, w, h, bd);
+      src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w,
+      h, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -61,24 +61,25 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
-                               int dst_stride, const MV *mv_q3,
+                               int dst_stride, const MV *src_mv,
                                const struct scale_factors *sf, int w, int h,
-                               int do_avg, const InterpKernel *kernel,
+                               int ref, const InterpKernel *kernel,
                                enum mv_precision precision, int x, int y);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_build_inter_predictor(
-    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
-    const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
     const InterpKernel *kernel, enum mv_precision precision, int x, int y,
     int bd);
 #endif
 
-static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
-                                       const struct scale_factors *sf) {
+static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset,
+                                           int stride,
+                                           const struct scale_factors *sf) {
   const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset;
   const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset;
-  return y * stride + x;
+  return (int64_t)y * stride + x;
 }
 
 static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src,
@@ -103,4 +104,4 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_RECONINTER_H_
+#endif  // VPX_VP9_COMMON_VP9_RECONINTER_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconintra.h b/media/libvpx/libvpx/vp9/common/vp9_reconintra.h
index 78e41c8811..426a35ebfa 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_reconintra.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_reconintra.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_RECONINTRA_H_
-#define VP9_COMMON_VP9_RECONINTRA_H_
+#ifndef VPX_VP9_COMMON_VP9_RECONINTRA_H_
+#define VPX_VP9_COMMON_VP9_RECONINTRA_H_
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
@@ -28,4 +28,4 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_RECONINTRA_H_
+#endif  // VPX_VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
index d8c870aa3f..1a93b97e56 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c
@@ -12,8 +12,4 @@
 #include "./vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vp9_rtcd() {
-  // TODO(JBB): Remove this once, by insuring that both the encoder and
-  // decoder setup functions are protected by once();
-  once(setup_rtcd_internal);
-}
+void vp9_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
index 088b004f52..dac3d89e2a 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
 sub vp9_common_forward_decls() {
 print <<EOF
 /*
@@ -7,12 +17,18 @@ print <<EOF
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_filter.h"
+#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER
+#include "vp9/encoder/vp9_temporal_filter.h"
+#endif
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -29,6 +45,7 @@ if ($opts{arch} eq "x86_64") {
   $ssse3_x86_64 = 'ssse3';
   $avx_x86_64 = 'avx';
   $avx2_x86_64 = 'avx2';
+  $avx512_x86_64 = 'avx512';
 }
 
 #
@@ -45,41 +62,24 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
 #
 # dct
 #
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
+# Force C versions if CONFIG_EMULATE_HARDWARE is 1
+add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
 
-    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
+add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
 
-    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-  } else {
-    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-    specialize qw/vp9_iht4x4_16_add sse2/;
+add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
 
-    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-    specialize qw/vp9_iht8x8_64_add sse2/;
-
-    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/vp9_iht16x16_256_add sse2/;
-  }
-} else {
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
-    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-
-    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-  } else {
-    add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-    specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type";
-    specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
+if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+  # Note that there are more specializations appended when
+  # CONFIG_VP9_HIGHBITDEPTH is off.
+  specialize qw/vp9_iht4x4_16_add neon sse2 vsx/;
+  specialize qw/vp9_iht8x8_64_add neon sse2 vsx/;
+  specialize qw/vp9_iht16x16_256_add neon sse2 vsx/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+    # Note that these specializations are appended to the above ones.
+    specialize qw/vp9_iht4x4_16_add dspr2 msa/;
+    specialize qw/vp9_iht8x8_64_add dspr2 msa/;
+    specialize qw/vp9_iht16x16_256_add dspr2 msa/;
   }
 }
 
@@ -101,18 +101,23 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
 
-  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
 
-  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+  add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
+
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+    specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
+    specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/;
+    specialize qw/vp9_highbd_iht16x16_256_add neon sse4_1/;
+  }
 }
 
 #
 # Encoder functions below this point.
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
-
 # ENCODEMB INVOKE
 
 #
@@ -120,101 +125,120 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 #
 if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") {
   add_proto qw/int vp9_denoiser_filter/, "const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude";
-  specialize qw/vp9_denoiser_filter sse2/;
+  specialize qw/vp9_denoiser_filter neon sse2/;
 }
 
+add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+
+add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
+specialize qw/vp9_block_error_fp neon sve avx2 sse2/;
+
+add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
+
+add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error neon sve avx2 sse2/;
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
-  specialize qw/vp9_highbd_block_error sse2/;
-
-  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
-  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
-  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
-  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_fdct8x8_quant ssse3/;
+  specialize qw/vp9_highbd_block_error neon sse2/;
 } else {
-  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_block_error avx2 msa sse2/;
-
-  add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
-  specialize qw/vp9_block_error_fp neon sse2/;
-
-  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
-
-  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
-
-  add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
+  specialize qw/vp9_block_error neon sve avx2 msa sse2/;
 }
 
 # fdct functions
 
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht4x4 sse2/;
+add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 
-  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht8x8 sse2/;
+add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 
-  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht16x16 sse2/;
+add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 
-  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 sse2/;
-} else {
-  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht4x4 sse2 msa/;
+add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
-  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht8x8 sse2 msa/;
-
-  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht16x16 sse2 msa/;
-
-  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 msa sse2/;
+# Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH
+# is off.
+specialize qw/vp9_fht4x4 sse2 neon/;
+specialize qw/vp9_fht8x8 sse2 neon/;
+specialize qw/vp9_fht16x16 sse2 neon/;
+specialize qw/vp9_fwht4x4 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+  # Note that these specializations are appended to the above ones.
+  specialize qw/vp9_fht4x4 msa/;
+  specialize qw/vp9_fht8x8 msa/;
+  specialize qw/vp9_fht16x16 msa/;
+  specialize qw/vp9_fwht4x4 msa/;
 }
 
 #
 # Motion search
 #
-add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
-specialize qw/vp9_full_search_sad sse3 sse4_1/;
-$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
-$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv";
+specialize qw/vp9_diamond_search_sad neon/;
 
-add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx/;
+#
+# Apply temporal filter
+#
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
+add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
+specialize qw/vp9_apply_temporal_filter sse4_1 neon/;
 
-add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
-specialize qw/vp9_temporal_filter_apply sse2 msa/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count";
+    specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/;
+  }
+}
+
+#
+# 12-tap filter used in prediction data generation during temporal filtering
+#
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
+  add_proto qw/void vpx_convolve12_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+  specialize qw/vpx_convolve12_vert ssse3 avx2 neon neon_dotprod neon_i8mm/;
+
+  add_proto qw/void vpx_convolve12_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+  specialize qw/vpx_convolve12_horiz ssse3 avx2 neon neon_dotprod neon_i8mm/;
+
+  add_proto qw/void vpx_convolve12/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+  specialize qw/vpx_convolve12 ssse3 avx2 neon neon_dotprod neon_i8mm/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_convolve12_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+    specialize qw/vpx_highbd_convolve12_vert ssse3 avx2 neon sve2/;
+
+    add_proto qw/void vpx_highbd_convolve12_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+    specialize qw/vpx_highbd_convolve12_horiz ssse3 avx2 neon sve2/;
+
+    add_proto qw/void vpx_highbd_convolve12/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+    specialize qw/vpx_highbd_convolve12 ssse3 avx2 neon sve2/;
+  }
+}
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   # ENCODEMB INVOKE
 
-  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vp9_highbd_quantize_fp avx2 neon/;
 
-  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/;
 
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht4x4 neon/;
 
   add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht8x8 neon/;
 
   add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_highbd_fht16x16 neon/;
 
   add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
 
-  add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count";
 
 }
 # End vp9_high encoder functions
@@ -222,11 +246,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 # frame based scale
 #
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-} else {
-  add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst";
-  specialize qw/vp9_scale_and_extend_frame ssse3/;
-}
+add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler";
+specialize qw/vp9_scale_and_extend_frame neon ssse3/;
 
 }
 # end encoder functions
diff --git a/media/libvpx/libvpx/vp9/common/vp9_scale.h b/media/libvpx/libvpx/vp9/common/vp9_scale.h
index ada8dbaad5..2f3b609483 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_scale.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_scale.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_SCALE_H_
-#define VP9_COMMON_VP9_SCALE_H_
+#ifndef VPX_VP9_COMMON_VP9_SCALE_H_
+#define VPX_VP9_COMMON_VP9_SCALE_H_
 
 #include "vp9/common/vp9_mv.h"
 #include "vpx_dsp/vpx_convolve.h"
@@ -20,7 +20,7 @@ extern "C" {
 
 #define REF_SCALE_SHIFT 14
 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
-#define REF_INVALID_SCALE -1
+#define REF_INVALID_SCALE (-1)
 
 struct scale_factors {
   int x_scale_fp;  // horizontal fixed point scale factor
@@ -42,7 +42,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
                                        int other_h, int this_w, int this_h,
-                                       int use_high);
+                                       int use_highbd);
 #else
 void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
                                        int other_h, int this_w, int this_h);
@@ -68,4 +68,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_SCALE_H_
+#endif  // VPX_VP9_COMMON_VP9_SCALE_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_scan.c b/media/libvpx/libvpx/vp9/common/vp9_scan.c
index 0fef263510..adacb7ef96 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_scan.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_scan.c
@@ -511,190 +511,191 @@ DECLARE_ALIGNED(16, static const int16_t,
   959, 990,  991, 1022, 0,   0,
 };
 
+// Add 1 to iscan values. This represents the EOB position instead of the index.
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
-  0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+  1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
-  0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+  1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
-  0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+  1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
-  0,  3,  8,  15, 22, 32, 40, 47, 1,  5,  11, 18, 26, 34, 44, 51,
-  2,  7,  13, 20, 28, 38, 46, 54, 4,  10, 16, 24, 31, 41, 50, 56,
-  6,  12, 21, 27, 35, 43, 52, 58, 9,  17, 25, 33, 39, 48, 55, 60,
-  14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+  1,  4,  9,  16, 23, 33, 41, 48, 2,  6,  12, 19, 27, 35, 45, 52,
+  3,  8,  14, 21, 29, 39, 47, 55, 5,  11, 17, 25, 32, 42, 51, 57,
+  7,  13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61,
+  15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
-  0,  1,  2,  5,  8,  12, 19, 24, 3,  4,  7,  10, 15, 20, 30, 39,
-  6,  9,  13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
-  18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
-  32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+  1,  2,  3,  6,  9,  13, 20, 25, 4,  5,  8,  11, 16, 21, 31, 40,
+  7,  10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53,
+  19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60,
+  33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
-  0,  2,  5,  9,  14, 22, 31, 37, 1,  4,  8,  13, 19, 26, 38, 44,
-  3,  6,  10, 17, 24, 30, 42, 49, 7,  11, 15, 21, 29, 36, 47, 53,
-  12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
-  25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+  1,  3,  6,  10, 15, 23, 32, 38, 2,  5,  9,  14, 20, 27, 39, 45,
+  4,  7,  11, 18, 25, 31, 43, 50, 8,  12, 16, 22, 30, 37, 48, 54,
+  13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61,
+  26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
-  0,  4,  11,  20,  31,  43,  59,  75,  85,  109, 130, 150, 165, 181, 195, 198,
-  1,  6,  14,  23,  34,  47,  64,  81,  95,  114, 135, 153, 171, 188, 201, 212,
-  2,  8,  16,  25,  38,  52,  67,  83,  101, 116, 136, 157, 172, 190, 205, 216,
-  3,  10, 18,  29,  41,  55,  71,  89,  103, 119, 141, 159, 176, 194, 208, 218,
-  5,  12, 21,  32,  45,  58,  74,  93,  104, 123, 144, 164, 179, 196, 210, 223,
-  7,  15, 26,  37,  49,  63,  78,  96,  112, 129, 146, 166, 182, 200, 215, 228,
-  9,  19, 28,  39,  54,  69,  86,  102, 117, 132, 151, 170, 187, 206, 220, 230,
-  13, 24, 35,  46,  60,  73,  91,  108, 122, 137, 154, 174, 189, 207, 224, 235,
-  17, 30, 40,  53,  66,  82,  98,  115, 126, 142, 161, 180, 197, 213, 227, 237,
-  22, 36, 48,  62,  76,  92,  105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
-  27, 44, 56,  70,  84,  99,  113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
-  33, 51, 68,  79,  94,  110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
-  42, 61, 77,  90,  106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
-  50, 72, 87,  100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
-  57, 80, 97,  111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
-  65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+  1,  5,  12,  21,  32,  44,  60,  76,  86,  110, 131, 151, 166, 182, 196, 199,
+  2,  7,  15,  24,  35,  48,  65,  82,  96,  115, 136, 154, 172, 189, 202, 213,
+  3,  9,  17,  26,  39,  53,  68,  84,  102, 117, 137, 158, 173, 191, 206, 217,
+  4,  11, 19,  30,  42,  56,  72,  90,  104, 120, 142, 160, 177, 195, 209, 219,
+  6,  13, 22,  33,  46,  59,  75,  94,  105, 124, 145, 165, 180, 197, 211, 224,
+  8,  16, 27,  38,  50,  64,  79,  97,  113, 130, 147, 167, 183, 201, 216, 229,
+  10, 20, 29,  40,  55,  70,  87,  103, 118, 133, 152, 171, 188, 207, 221, 231,
+  14, 25, 36,  47,  61,  74,  92,  109, 123, 138, 155, 175, 190, 208, 225, 236,
+  18, 31, 41,  54,  67,  83,  99,  116, 127, 143, 162, 181, 198, 214, 228, 238,
+  23, 37, 49,  63,  77,  93,  106, 121, 134, 148, 168, 187, 204, 220, 233, 241,
+  28, 45, 57,  71,  85,  100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245,
+  34, 52, 69,  80,  95,  111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248,
+  43, 62, 78,  91,  107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252,
+  51, 73, 88,  101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254,
+  58, 81, 98,  112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255,
+  66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
-  0,   1,   2,   4,   6,   9,   12,  17,  22,  29,  36,  43,  54,  64,  76,
-  86,  3,   5,   7,   11,  15,  19,  25,  32,  38,  48,  59,  68,  84,  99,
-  115, 130, 8,   10,  13,  18,  23,  27,  33,  42,  51,  60,  72,  88,  103,
-  119, 142, 167, 14,  16,  20,  26,  31,  37,  44,  53,  61,  73,  85,  100,
-  116, 135, 161, 185, 21,  24,  30,  35,  40,  47,  55,  65,  74,  81,  94,
-  112, 133, 154, 179, 205, 28,  34,  39,  45,  50,  58,  67,  77,  87,  96,
-  106, 121, 146, 169, 196, 212, 41,  46,  49,  56,  63,  70,  79,  90,  98,
-  107, 122, 138, 159, 182, 207, 222, 52,  57,  62,  69,  75,  83,  93,  102,
-  110, 120, 134, 150, 176, 195, 215, 226, 66,  71,  78,  82,  91,  97,  108,
-  113, 127, 136, 148, 168, 188, 202, 221, 232, 80,  89,  92,  101, 105, 114,
-  125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95,  104, 109, 117, 123,
-  128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
-  140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
-  145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
-  156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
-  163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
-  158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
-  255,
+  1,   2,   3,   5,   7,   10,  13,  18,  23,  30,  37,  44,  55,  65,  77,
+  87,  4,   6,   8,   12,  16,  20,  26,  33,  39,  49,  60,  69,  85,  100,
+  116, 131, 9,   11,  14,  19,  24,  28,  34,  43,  52,  61,  73,  89,  104,
+  120, 143, 168, 15,  17,  21,  27,  32,  38,  45,  54,  62,  74,  86,  101,
+  117, 136, 162, 186, 22,  25,  31,  36,  41,  48,  56,  66,  75,  82,  95,
+  113, 134, 155, 180, 206, 29,  35,  40,  46,  51,  59,  68,  78,  88,  97,
+  107, 122, 147, 170, 197, 213, 42,  47,  50,  57,  64,  71,  80,  91,  99,
+  108, 123, 139, 160, 183, 208, 223, 53,  58,  63,  70,  76,  84,  94,  103,
+  111, 121, 135, 151, 177, 196, 216, 227, 67,  72,  79,  83,  92,  98,  109,
+  114, 128, 137, 149, 169, 189, 203, 222, 233, 81,  90,  93,  102, 106, 115,
+  126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96,  105, 110, 118, 124,
+  129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130,
+  141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138,
+  146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150,
+  157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153,
+  164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254,
+  159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255,
+  256,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
-  0,   2,   5,   9,   17,  24,  36,  44,  55,  72,  88,  104, 128, 143, 166,
-  179, 1,   4,   8,   13,  20,  30,  40,  54,  66,  79,  96,  113, 141, 154,
-  178, 196, 3,   7,   11,  18,  25,  33,  46,  57,  71,  86,  101, 119, 148,
-  164, 186, 201, 6,   12,  16,  23,  31,  39,  53,  64,  78,  92,  110, 127,
-  153, 169, 193, 208, 10,  14,  19,  28,  37,  47,  58,  67,  84,  98,  114,
-  133, 161, 176, 198, 214, 15,  21,  26,  34,  43,  52,  65,  77,  91,  106,
-  120, 140, 165, 185, 205, 221, 22,  27,  32,  41,  48,  60,  73,  85,  99,
-  116, 130, 151, 175, 190, 211, 225, 29,  35,  42,  49,  59,  69,  81,  95,
-  108, 125, 139, 155, 182, 197, 217, 229, 38,  45,  51,  61,  68,  80,  93,
-  105, 118, 134, 150, 168, 191, 207, 223, 234, 50,  56,  63,  74,  83,  94,
-  109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62,  70,  76,  87,  97,
-  107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75,  82,  90,  102,
-  112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89,  100, 111,
-  123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
-  126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
-  135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
-  137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
-  255,
+  1,   3,   6,   10,  18,  25,  37,  45,  56,  73,  89,  105, 129, 144, 167,
+  180, 2,   5,   9,   14,  21,  31,  41,  55,  67,  80,  97,  114, 142, 155,
+  179, 197, 4,   8,   12,  19,  26,  34,  47,  58,  72,  87,  102, 120, 149,
+  165, 187, 202, 7,   13,  17,  24,  32,  40,  54,  65,  79,  93,  111, 128,
+  154, 170, 194, 209, 11,  15,  20,  29,  38,  48,  59,  68,  85,  99,  115,
+  134, 162, 177, 199, 215, 16,  22,  27,  35,  44,  53,  66,  78,  92,  107,
+  121, 141, 166, 186, 206, 222, 23,  28,  33,  42,  49,  61,  74,  86,  100,
+  117, 131, 152, 176, 191, 212, 226, 30,  36,  43,  50,  60,  70,  82,  96,
+  109, 126, 140, 156, 183, 198, 218, 230, 39,  46,  52,  62,  69,  81,  94,
+  106, 119, 135, 151, 169, 192, 208, 224, 235, 51,  57,  64,  75,  84,  95,
+  110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63,  71,  77,  88,  98,
+  108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76,  83,  91,  103,
+  113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90,  101, 112,
+  124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116,
+  127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122,
+  136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255,
+  138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254,
+  256,
 };
 
 DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
-  0,    2,    5,    10,   17,   25,   38,   47,   62,   83,   101,  121,  145,
-  170,  193,  204,  210,  219,  229,  233,  245,  257,  275,  299,  342,  356,
-  377,  405,  455,  471,  495,  527,  1,    4,    8,    15,   22,   30,   45,
-  58,   74,   92,   112,  133,  158,  184,  203,  215,  222,  228,  234,  237,
-  256,  274,  298,  317,  355,  376,  404,  426,  470,  494,  526,  551,  3,
-  7,    12,   18,   28,   36,   52,   64,   82,   102,  118,  142,  164,  189,
-  208,  217,  224,  231,  235,  238,  273,  297,  316,  329,  375,  403,  425,
-  440,  493,  525,  550,  567,  6,    11,   16,   23,   31,   43,   60,   73,
-  90,   109,  126,  150,  173,  196,  211,  220,  226,  232,  236,  239,  296,
-  315,  328,  335,  402,  424,  439,  447,  524,  549,  566,  575,  9,    14,
-  19,   29,   37,   50,   65,   78,   95,   116,  134,  157,  179,  201,  214,
-  223,  244,  255,  272,  295,  341,  354,  374,  401,  454,  469,  492,  523,
-  582,  596,  617,  645,  13,   20,   26,   35,   44,   54,   72,   85,   105,
-  123,  140,  163,  182,  205,  216,  225,  254,  271,  294,  314,  353,  373,
-  400,  423,  468,  491,  522,  548,  595,  616,  644,  666,  21,   27,   33,
-  42,   53,   63,   80,   94,   113,  132,  151,  172,  190,  209,  218,  227,
-  270,  293,  313,  327,  372,  399,  422,  438,  490,  521,  547,  565,  615,
-  643,  665,  680,  24,   32,   39,   48,   57,   71,   88,   104,  120,  139,
-  159,  178,  197,  212,  221,  230,  292,  312,  326,  334,  398,  421,  437,
-  446,  520,  546,  564,  574,  642,  664,  679,  687,  34,   40,   46,   56,
-  68,   81,   96,   111,  130,  147,  167,  186,  243,  253,  269,  291,  340,
-  352,  371,  397,  453,  467,  489,  519,  581,  594,  614,  641,  693,  705,
-  723,  747,  41,   49,   55,   67,   77,   91,   107,  124,  138,  161,  177,
-  194,  252,  268,  290,  311,  351,  370,  396,  420,  466,  488,  518,  545,
-  593,  613,  640,  663,  704,  722,  746,  765,  51,   59,   66,   76,   89,
-  99,   119,  131,  149,  168,  181,  200,  267,  289,  310,  325,  369,  395,
-  419,  436,  487,  517,  544,  563,  612,  639,  662,  678,  721,  745,  764,
-  777,  61,   69,   75,   87,   100,  114,  129,  144,  162,  180,  191,  207,
-  288,  309,  324,  333,  394,  418,  435,  445,  516,  543,  562,  573,  638,
-  661,  677,  686,  744,  763,  776,  783,  70,   79,   86,   97,   108,  122,
-  137,  155,  242,  251,  266,  287,  339,  350,  368,  393,  452,  465,  486,
-  515,  580,  592,  611,  637,  692,  703,  720,  743,  788,  798,  813,  833,
-  84,   93,   103,  110,  125,  141,  154,  171,  250,  265,  286,  308,  349,
-  367,  392,  417,  464,  485,  514,  542,  591,  610,  636,  660,  702,  719,
-  742,  762,  797,  812,  832,  848,  98,   106,  115,  127,  143,  156,  169,
-  185,  264,  285,  307,  323,  366,  391,  416,  434,  484,  513,  541,  561,
-  609,  635,  659,  676,  718,  741,  761,  775,  811,  831,  847,  858,  117,
-  128,  136,  148,  160,  175,  188,  198,  284,  306,  322,  332,  390,  415,
-  433,  444,  512,  540,  560,  572,  634,  658,  675,  685,  740,  760,  774,
-  782,  830,  846,  857,  863,  135,  146,  152,  165,  241,  249,  263,  283,
-  338,  348,  365,  389,  451,  463,  483,  511,  579,  590,  608,  633,  691,
-  701,  717,  739,  787,  796,  810,  829,  867,  875,  887,  903,  153,  166,
-  174,  183,  248,  262,  282,  305,  347,  364,  388,  414,  462,  482,  510,
-  539,  589,  607,  632,  657,  700,  716,  738,  759,  795,  809,  828,  845,
-  874,  886,  902,  915,  176,  187,  195,  202,  261,  281,  304,  321,  363,
-  387,  413,  432,  481,  509,  538,  559,  606,  631,  656,  674,  715,  737,
-  758,  773,  808,  827,  844,  856,  885,  901,  914,  923,  192,  199,  206,
-  213,  280,  303,  320,  331,  386,  412,  431,  443,  508,  537,  558,  571,
-  630,  655,  673,  684,  736,  757,  772,  781,  826,  843,  855,  862,  900,
-  913,  922,  927,  240,  247,  260,  279,  337,  346,  362,  385,  450,  461,
-  480,  507,  578,  588,  605,  629,  690,  699,  714,  735,  786,  794,  807,
-  825,  866,  873,  884,  899,  930,  936,  945,  957,  246,  259,  278,  302,
-  345,  361,  384,  411,  460,  479,  506,  536,  587,  604,  628,  654,  698,
-  713,  734,  756,  793,  806,  824,  842,  872,  883,  898,  912,  935,  944,
-  956,  966,  258,  277,  301,  319,  360,  383,  410,  430,  478,  505,  535,
-  557,  603,  627,  653,  672,  712,  733,  755,  771,  805,  823,  841,  854,
-  882,  897,  911,  921,  943,  955,  965,  972,  276,  300,  318,  330,  382,
-  409,  429,  442,  504,  534,  556,  570,  626,  652,  671,  683,  732,  754,
-  770,  780,  822,  840,  853,  861,  896,  910,  920,  926,  954,  964,  971,
-  975,  336,  344,  359,  381,  449,  459,  477,  503,  577,  586,  602,  625,
-  689,  697,  711,  731,  785,  792,  804,  821,  865,  871,  881,  895,  929,
-  934,  942,  953,  977,  981,  987,  995,  343,  358,  380,  408,  458,  476,
-  502,  533,  585,  601,  624,  651,  696,  710,  730,  753,  791,  803,  820,
-  839,  870,  880,  894,  909,  933,  941,  952,  963,  980,  986,  994,  1001,
-  357,  379,  407,  428,  475,  501,  532,  555,  600,  623,  650,  670,  709,
-  729,  752,  769,  802,  819,  838,  852,  879,  893,  908,  919,  940,  951,
-  962,  970,  985,  993,  1000, 1005, 378,  406,  427,  441,  500,  531,  554,
-  569,  622,  649,  669,  682,  728,  751,  768,  779,  818,  837,  851,  860,
-  892,  907,  918,  925,  950,  961,  969,  974,  992,  999,  1004, 1007, 448,
-  457,  474,  499,  576,  584,  599,  621,  688,  695,  708,  727,  784,  790,
-  801,  817,  864,  869,  878,  891,  928,  932,  939,  949,  976,  979,  984,
-  991,  1008, 1010, 1013, 1017, 456,  473,  498,  530,  583,  598,  620,  648,
-  694,  707,  726,  750,  789,  800,  816,  836,  868,  877,  890,  906,  931,
-  938,  948,  960,  978,  983,  990,  998,  1009, 1012, 1016, 1020, 472,  497,
-  529,  553,  597,  619,  647,  668,  706,  725,  749,  767,  799,  815,  835,
-  850,  876,  889,  905,  917,  937,  947,  959,  968,  982,  989,  997,  1003,
-  1011, 1015, 1019, 1022, 496,  528,  552,  568,  618,  646,  667,  681,  724,
-  748,  766,  778,  814,  834,  849,  859,  888,  904,  916,  924,  946,  958,
-  967,  973,  988,  996,  1002, 1006, 1014, 1018, 1021, 1023,
+  1,    3,    6,    11,   18,   26,   39,   48,   63,   84,   102,  122,  146,
+  171,  194,  205,  211,  220,  230,  234,  246,  258,  276,  300,  343,  357,
+  378,  406,  456,  472,  496,  528,  2,    5,    9,    16,   23,   31,   46,
+  59,   75,   93,   113,  134,  159,  185,  204,  216,  223,  229,  235,  238,
+  257,  275,  299,  318,  356,  377,  405,  427,  471,  495,  527,  552,  4,
+  8,    13,   19,   29,   37,   53,   65,   83,   103,  119,  143,  165,  190,
+  209,  218,  225,  232,  236,  239,  274,  298,  317,  330,  376,  404,  426,
+  441,  494,  526,  551,  568,  7,    12,   17,   24,   32,   44,   61,   74,
+  91,   110,  127,  151,  174,  197,  212,  221,  227,  233,  237,  240,  297,
+  316,  329,  336,  403,  425,  440,  448,  525,  550,  567,  576,  10,   15,
+  20,   30,   38,   51,   66,   79,   96,   117,  135,  158,  180,  202,  215,
+  224,  245,  256,  273,  296,  342,  355,  375,  402,  455,  470,  493,  524,
+  583,  597,  618,  646,  14,   21,   27,   36,   45,   55,   73,   86,   106,
+  124,  141,  164,  183,  206,  217,  226,  255,  272,  295,  315,  354,  374,
+  401,  424,  469,  492,  523,  549,  596,  617,  645,  667,  22,   28,   34,
+  43,   54,   64,   81,   95,   114,  133,  152,  173,  191,  210,  219,  228,
+  271,  294,  314,  328,  373,  400,  423,  439,  491,  522,  548,  566,  616,
+  644,  666,  681,  25,   33,   40,   49,   58,   72,   89,   105,  121,  140,
+  160,  179,  198,  213,  222,  231,  293,  313,  327,  335,  399,  422,  438,
+  447,  521,  547,  565,  575,  643,  665,  680,  688,  35,   41,   47,   57,
+  69,   82,   97,   112,  131,  148,  168,  187,  244,  254,  270,  292,  341,
+  353,  372,  398,  454,  468,  490,  520,  582,  595,  615,  642,  694,  706,
+  724,  748,  42,   50,   56,   68,   78,   92,   108,  125,  139,  162,  178,
+  195,  253,  269,  291,  312,  352,  371,  397,  421,  467,  489,  519,  546,
+  594,  614,  641,  664,  705,  723,  747,  766,  52,   60,   67,   77,   90,
+  100,  120,  132,  150,  169,  182,  201,  268,  290,  311,  326,  370,  396,
+  420,  437,  488,  518,  545,  564,  613,  640,  663,  679,  722,  746,  765,
+  778,  62,   70,   76,   88,   101,  115,  130,  145,  163,  181,  192,  208,
+  289,  310,  325,  334,  395,  419,  436,  446,  517,  544,  563,  574,  639,
+  662,  678,  687,  745,  764,  777,  784,  71,   80,   87,   98,   109,  123,
+  138,  156,  243,  252,  267,  288,  340,  351,  369,  394,  453,  466,  487,
+  516,  581,  593,  612,  638,  693,  704,  721,  744,  789,  799,  814,  834,
+  85,   94,   104,  111,  126,  142,  155,  172,  251,  266,  287,  309,  350,
+  368,  393,  418,  465,  486,  515,  543,  592,  611,  637,  661,  703,  720,
+  743,  763,  798,  813,  833,  849,  99,   107,  116,  128,  144,  157,  170,
+  186,  265,  286,  308,  324,  367,  392,  417,  435,  485,  514,  542,  562,
+  610,  636,  660,  677,  719,  742,  762,  776,  812,  832,  848,  859,  118,
+  129,  137,  149,  161,  176,  189,  199,  285,  307,  323,  333,  391,  416,
+  434,  445,  513,  541,  561,  573,  635,  659,  676,  686,  741,  761,  775,
+  783,  831,  847,  858,  864,  136,  147,  153,  166,  242,  250,  264,  284,
+  339,  349,  366,  390,  452,  464,  484,  512,  580,  591,  609,  634,  692,
+  702,  718,  740,  788,  797,  811,  830,  868,  876,  888,  904,  154,  167,
+  175,  184,  249,  263,  283,  306,  348,  365,  389,  415,  463,  483,  511,
+  540,  590,  608,  633,  658,  701,  717,  739,  760,  796,  810,  829,  846,
+  875,  887,  903,  916,  177,  188,  196,  203,  262,  282,  305,  322,  364,
+  388,  414,  433,  482,  510,  539,  560,  607,  632,  657,  675,  716,  738,
+  759,  774,  809,  828,  845,  857,  886,  902,  915,  924,  193,  200,  207,
+  214,  281,  304,  321,  332,  387,  413,  432,  444,  509,  538,  559,  572,
+  631,  656,  674,  685,  737,  758,  773,  782,  827,  844,  856,  863,  901,
+  914,  923,  928,  241,  248,  261,  280,  338,  347,  363,  386,  451,  462,
+  481,  508,  579,  589,  606,  630,  691,  700,  715,  736,  787,  795,  808,
+  826,  867,  874,  885,  900,  931,  937,  946,  958,  247,  260,  279,  303,
+  346,  362,  385,  412,  461,  480,  507,  537,  588,  605,  629,  655,  699,
+  714,  735,  757,  794,  807,  825,  843,  873,  884,  899,  913,  936,  945,
+  957,  967,  259,  278,  302,  320,  361,  384,  411,  431,  479,  506,  536,
+  558,  604,  628,  654,  673,  713,  734,  756,  772,  806,  824,  842,  855,
+  883,  898,  912,  922,  944,  956,  966,  973,  277,  301,  319,  331,  383,
+  410,  430,  443,  505,  535,  557,  571,  627,  653,  672,  684,  733,  755,
+  771,  781,  823,  841,  854,  862,  897,  911,  921,  927,  955,  965,  972,
+  976,  337,  345,  360,  382,  450,  460,  478,  504,  578,  587,  603,  626,
+  690,  698,  712,  732,  786,  793,  805,  822,  866,  872,  882,  896,  930,
+  935,  943,  954,  978,  982,  988,  996,  344,  359,  381,  409,  459,  477,
+  503,  534,  586,  602,  625,  652,  697,  711,  731,  754,  792,  804,  821,
+  840,  871,  881,  895,  910,  934,  942,  953,  964,  981,  987,  995,  1002,
+  358,  380,  408,  429,  476,  502,  533,  556,  601,  624,  651,  671,  710,
+  730,  753,  770,  803,  820,  839,  853,  880,  894,  909,  920,  941,  952,
+  963,  971,  986,  994,  1001, 1006, 379,  407,  428,  442,  501,  532,  555,
+  570,  623,  650,  670,  683,  729,  752,  769,  780,  819,  838,  852,  861,
+  893,  908,  919,  926,  951,  962,  970,  975,  993,  1000, 1005, 1008, 449,
+  458,  475,  500,  577,  585,  600,  622,  689,  696,  709,  728,  785,  791,
+  802,  818,  865,  870,  879,  892,  929,  933,  940,  950,  977,  980,  985,
+  992,  1009, 1011, 1014, 1018, 457,  474,  499,  531,  584,  599,  621,  649,
+  695,  708,  727,  751,  790,  801,  817,  837,  869,  878,  891,  907,  932,
+  939,  949,  961,  979,  984,  991,  999,  1010, 1013, 1017, 1021, 473,  498,
+  530,  554,  598,  620,  648,  669,  707,  726,  750,  768,  800,  816,  836,
+  851,  877,  890,  906,  918,  938,  948,  960,  969,  983,  990,  998,  1004,
+  1012, 1016, 1020, 1023, 497,  529,  553,  569,  619,  647,  668,  682,  725,
+  749,  767,  779,  815,  835,  850,  860,  889,  905,  917,  925,  947,  959,
+  968,  974,  989,  997,  1003, 1007, 1015, 1019, 1022, 1024,
 };
 
-const scan_order vp9_default_scan_orders[TX_SIZES] = {
+const ScanOrder vp9_default_scan_orders[TX_SIZES] = {
   { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors },
   { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors },
   { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors },
   { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors },
 };
 
-const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = {
+const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = {
   { // TX_4X4
     { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors },
     { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors },
diff --git a/media/libvpx/libvpx/vp9/common/vp9_scan.h b/media/libvpx/libvpx/vp9/common/vp9_scan.h
index b3520e7dcc..3d1dcc66da 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_scan.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_scan.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_SCAN_H_
-#define VP9_COMMON_VP9_SCAN_H_
+#ifndef VPX_VP9_COMMON_VP9_SCAN_H_
+#define VPX_VP9_COMMON_VP9_SCAN_H_
 
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -23,14 +23,14 @@ extern "C" {
 
 #define MAX_NEIGHBORS 2
 
-typedef struct {
+typedef struct ScanOrder {
   const int16_t *scan;
   const int16_t *iscan;
   const int16_t *neighbors;
-} scan_order;
+} ScanOrder;
 
-extern const scan_order vp9_default_scan_orders[TX_SIZES];
-extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
+extern const ScanOrder vp9_default_scan_orders[TX_SIZES];
+extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES];
 
 static INLINE int get_coef_context(const int16_t *neighbors,
                                    const uint8_t *token_cache, int c) {
@@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors,
          1;
 }
 
-static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
-                                         PLANE_TYPE type, int block_idx) {
+static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                        PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi[0];
 
   if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) {
@@ -55,4 +55,4 @@ static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_SCAN_H_
+#endif  // VPX_VP9_COMMON_VP9_SCAN_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_seg_common.h b/media/libvpx/libvpx/vp9/common/vp9_seg_common.h
index b9bf75d580..5e71c2fca5 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_seg_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_seg_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_SEG_COMMON_H_
-#define VP9_COMMON_VP9_SEG_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_SEG_COMMON_H_
+#define VPX_VP9_COMMON_VP9_SEG_COMMON_H_
 
 #include "vpx_dsp/prob.h"
 
@@ -25,6 +25,11 @@ extern "C" {
 
 #define PREDICTION_PROBS 3
 
+// Segment ID used to skip background encoding
+#define BACKGROUND_SEG_SKIP_ID 3
+// Number of frames that don't skip after a key frame
+#define FRAMES_NO_SKIPPING_AFTER_KEY 20
+
 // Segment level features.
 typedef enum {
   SEG_LVL_ALT_Q = 0,      // Use alternate Quantizer ....
@@ -78,4 +83,4 @@ extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_SEG_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_SEG_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
index 07e659d235..7f5ac36669 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c
@@ -8,41 +8,27 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+#include <limits.h>
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_loopfilter.h"
 
-#if CONFIG_MULTITHREAD
-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
-  const int kMaxTryLocks = 4000;
-  int locked = 0;
-  int i;
-
-  for (i = 0; i < kMaxTryLocks; ++i) {
-    if (!pthread_mutex_trylock(mutex)) {
-      locked = 1;
-      break;
-    }
-  }
-
-  if (!locked) pthread_mutex_lock(mutex);
-}
-#endif  // CONFIG_MULTITHREAD
-
 static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
 #if CONFIG_MULTITHREAD
   const int nsync = lf_sync->sync_range;
 
   if (r && !(c & (nsync - 1))) {
-    pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
-    mutex_lock(mutex);
+    pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1];
+    pthread_mutex_lock(mutex);
 
     while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
-      pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+      pthread_cond_wait(&lf_sync->cond[r - 1], mutex);
     }
     pthread_mutex_unlock(mutex);
   }
@@ -69,12 +55,12 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
   }
 
   if (sig) {
-    mutex_lock(&lf_sync->mutex_[r]);
+    pthread_mutex_lock(&lf_sync->mutex[r]);
 
     lf_sync->cur_sb_col[r] = cur;
 
-    pthread_cond_signal(&lf_sync->cond_[r]);
-    pthread_mutex_unlock(&lf_sync->mutex_[r]);
+    pthread_cond_signal(&lf_sync->cond[r]);
+    pthread_mutex_unlock(&lf_sync->mutex[r]);
   }
 #else
   (void)lf_sync;
@@ -91,6 +77,7 @@ static INLINE void thread_loop_filter_rows(
     int y_only, VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  const int num_active_workers = lf_sync->num_active_workers;
   int mi_row, mi_col;
   enum lf_path path;
   if (y_only)
@@ -102,8 +89,10 @@ static INLINE void thread_loop_filter_rows(
   else
     path = LF_PATH_SLOW;
 
+  assert(num_active_workers > 0);
+
   for (mi_row = start; mi_row < stop;
-       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+       mi_row += num_active_workers * MI_BLOCK_SIZE) {
     MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0);
 
@@ -140,8 +129,9 @@ static INLINE void thread_loop_filter_rows(
 }
 
 // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(VP9LfSync *const lf_sync,
-                                  LFWorkerData *const lf_data) {
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+  VP9LfSync *const lf_sync = (VP9LfSync *)arg1;
+  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
   thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                           lf_data->start, lf_data->stop, lf_data->y_only,
                           lf_sync);
@@ -156,10 +146,12 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   // Number of superblock rows and cols
   const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
-  // Decoder may allocate more threads than number of tiles based on user's
-  // input.
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int num_workers = VPXMIN(nworkers, tile_cols);
+  const int num_tile_cols = 1 << cm->log2_tile_cols;
+  // Limit the number of workers to prevent changes in frame dimensions from
+  // causing incorrect sync calculations when sb_rows < threads/tile_cols.
+  // Further restrict them by the number of tile columns should the user
+  // request more as this implementation doesn't scale well beyond that.
+  const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows));
   int i;
 
   if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
@@ -167,6 +159,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
     vp9_loop_filter_dealloc(lf_sync);
     vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
+  lf_sync->num_active_workers = num_workers;
 
   // Initialize cur_sb_col to -1 for all SB rows.
   memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
@@ -183,7 +176,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
     VPxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
-    worker->hook = (VPxWorkerHook)loop_filter_row_worker;
+    worker->hook = loop_filter_row_worker;
     worker->data1 = lf_sync;
     worker->data2 = lf_data;
 
@@ -230,6 +223,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
                       workers, num_workers, lf_sync);
 }
 
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level,
+                     int num_workers) {
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+
+  if (!frame_filter_level) return;
+
+  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+      num_workers > lf_sync->num_workers) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+  lf_sync->corrupted = 0;
+
+  memset(lf_sync->num_tiles_done, 0,
+         sizeof(*lf_sync->num_tiles_done) * sb_rows);
+  cm->lf_row = 0;
+}
+
 // Set up nsync by width.
 static INLINE int get_sync_range(int width) {
   // nsync numbers are picked by testing. For example, for 4k
@@ -252,60 +267,205 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
   {
     int i;
 
-    CHECK_MEM_ERROR(cm, lf_sync->mutex_,
-                    vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
-    if (lf_sync->mutex_) {
+    CHECK_MEM_ERROR(&cm->error, lf_sync->mutex,
+                    vpx_malloc(sizeof(*lf_sync->mutex) * rows));
+    if (lf_sync->mutex) {
       for (i = 0; i < rows; ++i) {
-        pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+        pthread_mutex_init(&lf_sync->mutex[i], NULL);
       }
     }
 
-    CHECK_MEM_ERROR(cm, lf_sync->cond_,
-                    vpx_malloc(sizeof(*lf_sync->cond_) * rows));
-    if (lf_sync->cond_) {
+    CHECK_MEM_ERROR(&cm->error, lf_sync->cond,
+                    vpx_malloc(sizeof(*lf_sync->cond) * rows));
+    if (lf_sync->cond) {
       for (i = 0; i < rows; ++i) {
-        pthread_cond_init(&lf_sync->cond_[i], NULL);
+        pthread_cond_init(&lf_sync->cond[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex,
+                    vpx_malloc(sizeof(*lf_sync->lf_mutex)));
+    pthread_mutex_init(lf_sync->lf_mutex, NULL);
+
+    CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex,
+                    vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
+    if (lf_sync->recon_done_mutex) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond,
+                    vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
+    if (lf_sync->recon_done_cond) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
       }
     }
   }
 #endif  // CONFIG_MULTITHREAD
 
-  CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata,
                   vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
   lf_sync->num_workers = num_workers;
+  lf_sync->num_active_workers = lf_sync->num_workers;
 
-  CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+  CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col,
                   vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
 
+  CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done,
+                  vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
+                                 mi_cols_aligned_to_sb(cm->mi_rows) >>
+                             MI_BLOCK_SIZE_LOG2));
+
   // Set up nsync.
   lf_sync->sync_range = get_sync_range(width);
 }
 
 // Deallocate lf synchronization related mutex and data
 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
-  if (lf_sync != NULL) {
-#if CONFIG_MULTITHREAD
-    int i;
+  assert(lf_sync != NULL);
 
-    if (lf_sync->mutex_ != NULL) {
-      for (i = 0; i < lf_sync->rows; ++i) {
-        pthread_mutex_destroy(&lf_sync->mutex_[i]);
-      }
-      vpx_free(lf_sync->mutex_);
+#if CONFIG_MULTITHREAD
+  if (lf_sync->mutex != NULL) {
+    int i;
+    for (i = 0; i < lf_sync->rows; ++i) {
+      pthread_mutex_destroy(&lf_sync->mutex[i]);
     }
-    if (lf_sync->cond_ != NULL) {
-      for (i = 0; i < lf_sync->rows; ++i) {
-        pthread_cond_destroy(&lf_sync->cond_[i]);
-      }
-      vpx_free(lf_sync->cond_);
-    }
-#endif  // CONFIG_MULTITHREAD
-    vpx_free(lf_sync->lfdata);
-    vpx_free(lf_sync->cur_sb_col);
-    // clear the structure as the source of this call may be a resize in which
-    // case this call will be followed by an _alloc() which may fail.
-    vp9_zero(*lf_sync);
+    vpx_free(lf_sync->mutex);
   }
+  if (lf_sync->cond != NULL) {
+    int i;
+    for (i = 0; i < lf_sync->rows; ++i) {
+      pthread_cond_destroy(&lf_sync->cond[i]);
+    }
+    vpx_free(lf_sync->cond);
+  }
+  if (lf_sync->recon_done_mutex != NULL) {
+    int i;
+    for (i = 0; i < lf_sync->rows; ++i) {
+      pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]);
+    }
+    vpx_free(lf_sync->recon_done_mutex);
+  }
+
+  if (lf_sync->lf_mutex != NULL) {
+    pthread_mutex_destroy(lf_sync->lf_mutex);
+    vpx_free(lf_sync->lf_mutex);
+  }
+  if (lf_sync->recon_done_cond != NULL) {
+    int i;
+    for (i = 0; i < lf_sync->rows; ++i) {
+      pthread_cond_destroy(&lf_sync->recon_done_cond[i]);
+    }
+    vpx_free(lf_sync->recon_done_cond);
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  vpx_free(lf_sync->lfdata);
+  vpx_free(lf_sync->cur_sb_col);
+  vpx_free(lf_sync->num_tiles_done);
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  vp9_zero(*lf_sync);
+}
+
+static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
+  int return_val = -1;
+  const int max_rows = cm->mi_rows;
+
+#if CONFIG_MULTITHREAD
+  int cur_row;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+
+  pthread_mutex_lock(lf_sync->lf_mutex);
+  if (cm->lf_row < max_rows) {
+    cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
+    return_val = cm->lf_row;
+    cm->lf_row += MI_BLOCK_SIZE;
+    if (cm->lf_row < max_rows) {
+      /* If this is not the last row, make sure the next row is also decoded.
+       * This is because the intra predict has to happen before loop filter */
+      cur_row += 1;
+    }
+  }
+  pthread_mutex_unlock(lf_sync->lf_mutex);
+
+  if (return_val == -1) return return_val;
+
+  pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]);
+  if (lf_sync->num_tiles_done[cur_row] < tile_cols) {
+    pthread_cond_wait(&lf_sync->recon_done_cond[cur_row],
+                      &lf_sync->recon_done_mutex[cur_row]);
+  }
+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]);
+  pthread_mutex_lock(lf_sync->lf_mutex);
+  if (lf_sync->corrupted) {
+    int row = return_val >> MI_BLOCK_SIZE_LOG2;
+    pthread_mutex_lock(&lf_sync->mutex[row]);
+    lf_sync->cur_sb_col[row] = INT_MAX;
+    pthread_cond_signal(&lf_sync->cond[row]);
+    pthread_mutex_unlock(&lf_sync->mutex[row]);
+    return_val = -1;
+  }
+  pthread_mutex_unlock(lf_sync->lf_mutex);
+#else
+  (void)lf_sync;
+  if (cm->lf_row < max_rows) {
+    return_val = cm->lf_row;
+    cm->lf_row += MI_BLOCK_SIZE;
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  return return_val;
+}
+
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) {
+  int mi_row;
+  VP9_COMMON *cm = lf_data->cm;
+
+  while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) {
+    lf_data->start = mi_row;
+    lf_data->stop = mi_row + MI_BLOCK_SIZE;
+
+    thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                            lf_data->start, lf_data->stop, lf_data->y_only,
+                            lf_sync);
+  }
+}
+
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
+                 int corrupted) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(lf_sync->lf_mutex);
+  lf_sync->corrupted |= corrupted;
+  pthread_mutex_unlock(lf_sync->lf_mutex);
+  pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
+  lf_sync->num_tiles_done[row] += 1;
+  if (num_tiles == lf_sync->num_tiles_done[row]) {
+    if (is_last_row) {
+      /* The last 2 rows wait on the last row to be done.
+       * So, we have to broadcast the signal in this case.
+       */
+      pthread_cond_broadcast(&lf_sync->recon_done_cond[row]);
+    } else {
+      pthread_cond_signal(&lf_sync->recon_done_cond[row]);
+    }
+  }
+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);
+#else
+  (void)lf_sync;
+  (void)num_tiles;
+  (void)row;
+  (void)is_last_row;
+  (void)corrupted;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) {
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);
 }
 
 // Accumulate frame counts.
diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
index 0f7c3ff748..96c705d0d5 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h
@@ -8,10 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_
-#define VP9_COMMON_VP9_THREAD_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
+#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
 #include "./vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 
 #ifdef __cplusplus
@@ -24,8 +25,8 @@ struct FRAME_COUNTS;
 // Loopfilter row synchronization
 typedef struct VP9LfSyncData {
 #if CONFIG_MULTITHREAD
-  pthread_mutex_t *mutex_;
-  pthread_cond_t *cond_;
+  pthread_mutex_t *mutex;
+  pthread_cond_t *cond;
 #endif
   // Allocate memory to store the loop-filtered superblock index in each row.
   int *cur_sb_col;
@@ -36,7 +37,16 @@ typedef struct VP9LfSyncData {
 
   // Row-based parallel loopfilter data
   LFWorkerData *lfdata;
-  int num_workers;
+  int num_workers;         // number of allocated workers.
+  int num_active_workers;  // number of scheduled workers.
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *lf_mutex;
+  pthread_mutex_t *recon_done_mutex;
+  pthread_cond_t *recon_done_cond;
+#endif
+  int *num_tiles_done;
+  int corrupted;
 } VP9LfSync;
 
 // Allocate memory for loopfilter row synchronization.
@@ -53,6 +63,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm,
                               int partial_frame, VPxWorker *workers,
                               int num_workers, VP9LfSync *lf_sync);
 
+// Multi-threaded loopfilter initialisations
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm,
+                     int frame_filter_level, int num_workers);
+
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync);
+
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
+                 int corrupted);
+
+void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync);
+
 void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
                                  const struct FRAME_COUNTS *counts, int is_dec);
 
@@ -60,4 +81,4 @@ void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_THREAD_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_THREAD_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/vp9_tile_common.h b/media/libvpx/libvpx/vp9/common/vp9_tile_common.h
index 1b11c2680d..4ccf0a3d5f 100644
--- a/media/libvpx/libvpx/vp9/common/vp9_tile_common.h
+++ b/media/libvpx/libvpx/vp9/common/vp9_tile_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
-#define VP9_COMMON_VP9_TILE_COMMON_H_
+#ifndef VPX_VP9_COMMON_VP9_TILE_COMMON_H_
+#define VPX_VP9_COMMON_VP9_TILE_COMMON_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,4 +37,4 @@ void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_VP9_TILE_COMMON_H_
+#endif  // VPX_VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
new file mode 100644
index 0000000000..57b79a732d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -0,0 +1,419 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+                                                      const int c,
+                                                      __m128i *const s) {
+  const __m128i pair_c = pair_set_epi32(4 * c, 0);
+  __m128i x[2];
+
+  extend_64bit(in, x);
+  s[0] = _mm_mul_epi32(pair_c, x[0]);
+  s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+                                                 const __m128i in1,
+                                                 const int c0, const int c1,
+                                                 __m128i *const s0,
+                                                 __m128i *const s1) {
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+  __m128i t00[2], t01[2], t10[2], t11[2];
+  __m128i x0[2], x1[2];
+
+  extend_64bit(in0, x0);
+  extend_64bit(in1, x1);
+  t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+  t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+  t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+  t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+  t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+  t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+  t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+  t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+  s0[0] = _mm_add_epi64(t00[0], t11[0]);
+  s0[1] = _mm_add_epi64(t00[1], t11[1]);
+  s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+  s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+      s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+      x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+
+  // stage 1
+  highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+  highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+  highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+  highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+  highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+                                s11);
+  highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+                                s13);
+  highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+                                s15);
+
+  x0[0] = _mm_add_epi64(s0[0], s8[0]);
+  x0[1] = _mm_add_epi64(s0[1], s8[1]);
+  x1[0] = _mm_add_epi64(s1[0], s9[0]);
+  x1[1] = _mm_add_epi64(s1[1], s9[1]);
+  x2[0] = _mm_add_epi64(s2[0], s10[0]);
+  x2[1] = _mm_add_epi64(s2[1], s10[1]);
+  x3[0] = _mm_add_epi64(s3[0], s11[0]);
+  x3[1] = _mm_add_epi64(s3[1], s11[1]);
+  x4[0] = _mm_add_epi64(s4[0], s12[0]);
+  x4[1] = _mm_add_epi64(s4[1], s12[1]);
+  x5[0] = _mm_add_epi64(s5[0], s13[0]);
+  x5[1] = _mm_add_epi64(s5[1], s13[1]);
+  x6[0] = _mm_add_epi64(s6[0], s14[0]);
+  x6[1] = _mm_add_epi64(s6[1], s14[1]);
+  x7[0] = _mm_add_epi64(s7[0], s15[0]);
+  x7[1] = _mm_add_epi64(s7[1], s15[1]);
+  x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+  x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+  x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+  x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+  x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+  x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+  x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+  x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+  x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+  x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+  x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+  x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+  x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+
+  x0[0] = dct_const_round_shift_64bit(x0[0]);
+  x0[1] = dct_const_round_shift_64bit(x0[1]);
+  x1[0] = dct_const_round_shift_64bit(x1[0]);
+  x1[1] = dct_const_round_shift_64bit(x1[1]);
+  x2[0] = dct_const_round_shift_64bit(x2[0]);
+  x2[1] = dct_const_round_shift_64bit(x2[1]);
+  x3[0] = dct_const_round_shift_64bit(x3[0]);
+  x3[1] = dct_const_round_shift_64bit(x3[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x8[0] = dct_const_round_shift_64bit(x8[0]);
+  x8[1] = dct_const_round_shift_64bit(x8[1]);
+  x9[0] = dct_const_round_shift_64bit(x9[0]);
+  x9[1] = dct_const_round_shift_64bit(x9[1]);
+  x10[0] = dct_const_round_shift_64bit(x10[0]);
+  x10[1] = dct_const_round_shift_64bit(x10[1]);
+  x11[0] = dct_const_round_shift_64bit(x11[0]);
+  x11[1] = dct_const_round_shift_64bit(x11[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x0[0] = pack_4(x0[0], x0[1]);
+  x1[0] = pack_4(x1[0], x1[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x8[0] = pack_4(x8[0], x8[1]);
+  x9[0] = pack_4(x9[0], x9[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 2
+  s0[0] = x0[0];
+  s1[0] = x1[0];
+  s2[0] = x2[0];
+  s3[0] = x3[0];
+  s4[0] = x4[0];
+  s5[0] = x5[0];
+  s6[0] = x6[0];
+  s7[0] = x7[0];
+  x0[0] = _mm_add_epi32(s0[0], s4[0]);
+  x1[0] = _mm_add_epi32(s1[0], s5[0]);
+  x2[0] = _mm_add_epi32(s2[0], s6[0]);
+  x3[0] = _mm_add_epi32(s3[0], s7[0]);
+  x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+  x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+  x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+  x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+
+  highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+  highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+                                s11);
+  highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+                                s12);
+  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+                                s14);
+
+  x8[0] = _mm_add_epi64(s8[0], s12[0]);
+  x8[1] = _mm_add_epi64(s8[1], s12[1]);
+  x9[0] = _mm_add_epi64(s9[0], s13[0]);
+  x9[1] = _mm_add_epi64(s9[1], s13[1]);
+  x10[0] = _mm_add_epi64(s10[0], s14[0]);
+  x10[1] = _mm_add_epi64(s10[1], s14[1]);
+  x11[0] = _mm_add_epi64(s11[0], s15[0]);
+  x11[1] = _mm_add_epi64(s11[1], s15[1]);
+  x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+  x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+  x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+  x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+  x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+  x8[0] = dct_const_round_shift_64bit(x8[0]);
+  x8[1] = dct_const_round_shift_64bit(x8[1]);
+  x9[0] = dct_const_round_shift_64bit(x9[0]);
+  x9[1] = dct_const_round_shift_64bit(x9[1]);
+  x10[0] = dct_const_round_shift_64bit(x10[0]);
+  x10[1] = dct_const_round_shift_64bit(x10[1]);
+  x11[0] = dct_const_round_shift_64bit(x11[0]);
+  x11[1] = dct_const_round_shift_64bit(x11[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x8[0] = pack_4(x8[0], x8[1]);
+  x9[0] = pack_4(x9[0], x9[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 3
+  s0[0] = x0[0];
+  s1[0] = x1[0];
+  s2[0] = x2[0];
+  s3[0] = x3[0];
+  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+  s8[0] = x8[0];
+  s9[0] = x9[0];
+  s10[0] = x10[0];
+  s11[0] = x11[0];
+  highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+                                s13);
+  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+                                s14);
+
+  x0[0] = _mm_add_epi32(s0[0], s2[0]);
+  x1[0] = _mm_add_epi32(s1[0], s3[0]);
+  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+  x4[0] = _mm_add_epi64(s4[0], s6[0]);
+  x4[1] = _mm_add_epi64(s4[1], s6[1]);
+  x5[0] = _mm_add_epi64(s5[0], s7[0]);
+  x5[1] = _mm_add_epi64(s5[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x8[0] = _mm_add_epi32(s8[0], s10[0]);
+  x9[0] = _mm_add_epi32(s9[0], s11[0]);
+  x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+  x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+  x12[0] = _mm_add_epi64(s12[0], s14[0]);
+  x12[1] = _mm_add_epi64(s12[1], s14[1]);
+  x13[0] = _mm_add_epi64(s13[0], s15[0]);
+  x13[1] = _mm_add_epi64(s13[1], s15[1]);
+  x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 4
+  s2[0] = _mm_add_epi32(x2[0], x3[0]);
+  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+  s6[0] = _mm_add_epi32(x7[0], x6[0]);
+  s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+  s10[0] = _mm_add_epi32(x11[0], x10[0]);
+  s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+  s14[0] = _mm_add_epi32(x14[0], x15[0]);
+  s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+  highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+  highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+  highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+  highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+  highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+
+  x2[0] = dct_const_round_shift_64bit(s2[0]);
+  x2[1] = dct_const_round_shift_64bit(s2[1]);
+  x3[0] = dct_const_round_shift_64bit(s3[0]);
+  x3[1] = dct_const_round_shift_64bit(s3[1]);
+  x6[0] = dct_const_round_shift_64bit(s6[0]);
+  x6[1] = dct_const_round_shift_64bit(s6[1]);
+  x7[0] = dct_const_round_shift_64bit(s7[0]);
+  x7[1] = dct_const_round_shift_64bit(s7[1]);
+  x10[0] = dct_const_round_shift_64bit(s10[0]);
+  x10[1] = dct_const_round_shift_64bit(s10[1]);
+  x11[0] = dct_const_round_shift_64bit(s11[0]);
+  x11[1] = dct_const_round_shift_64bit(s11[1]);
+  x14[0] = dct_const_round_shift_64bit(s14[0]);
+  x14[1] = dct_const_round_shift_64bit(s14[1]);
+  x15[0] = dct_const_round_shift_64bit(s15[0]);
+  x15[1] = dct_const_round_shift_64bit(s15[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  io[0] = x0[0];
+  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+  io[2] = x12[0];
+  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+  io[4] = x6[0];
+  io[5] = x14[0];
+  io[6] = x10[0];
+  io[7] = x2[0];
+  io[8] = x3[0];
+  io[9] = x11[0];
+  io[10] = x15[0];
+  io[11] = x7[0];
+  io[12] = x5[0];
+  io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+  io[14] = x9[0];
+  io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int tx_type, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {
+    __m128i l[16], r[16];
+
+    in = l;
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+        idct16_8col(in, in);
+      } else {
+        vpx_iadst16_8col_sse2(in);
+      }
+      in = r;
+      input += 128;
+    }
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+        idct16_8col(out, out);
+      } else {
+        vpx_iadst16_8col_sse2(out);
+      }
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+        vpx_highbd_idct16_4col_sse4_1(in);
+      } else {
+        highbd_iadst16_4col_sse4_1(in);
+      }
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+        vpx_highbd_idct16_4col_sse4_1(out);
+      } else {
+        highbd_iadst16_4col_sse4_1(out);
+      }
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
new file mode 100644
index 0000000000..af158536f9
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c
@@ -0,0 +1,131 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
+  const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
+  const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
+  const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
+  const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
+  __m128i temp[2];
+
+  transpose_32bit_4x4(io, io);
+
+  extend_64bit(io[0], temp);
+  s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
+  s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
+  s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
+  s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
+
+  extend_64bit(io[1], temp);
+  s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
+  s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
+
+  extend_64bit(io[2], temp);
+  s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
+  s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
+  s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
+  s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
+
+  extend_64bit(io[3], temp);
+  s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
+  s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
+  s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
+  s6[1] = _mm_mul_epi32(pair_c4, temp[1]);
+
+  t0[0] = _mm_add_epi64(s0[0], s3[0]);
+  t0[1] = _mm_add_epi64(s0[1], s3[1]);
+  t0[0] = _mm_add_epi64(t0[0], s5[0]);
+  t0[1] = _mm_add_epi64(t0[1], s5[1]);
+  t1[0] = _mm_sub_epi64(s1[0], s4[0]);
+  t1[1] = _mm_sub_epi64(s1[1], s4[1]);
+  t1[0] = _mm_sub_epi64(t1[0], s6[0]);
+  t1[1] = _mm_sub_epi64(t1[1], s6[1]);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);
+  temp[0] = _mm_add_epi32(temp[0], io[3]);
+  extend_64bit(temp[0], temp);
+  t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
+  t2[1] = _mm_mul_epi32(pair_c3, temp[1]);
+
+  s0[0] = _mm_add_epi64(t0[0], s2[0]);
+  s0[1] = _mm_add_epi64(t0[1], s2[1]);
+  s1[0] = _mm_add_epi64(t1[0], s2[0]);
+  s1[1] = _mm_add_epi64(t1[1], s2[1]);
+  s3[0] = _mm_add_epi64(t0[0], t1[0]);
+  s3[1] = _mm_add_epi64(t0[1], t1[1]);
+  s3[0] = _mm_sub_epi64(s3[0], s2[0]);
+  s3[1] = _mm_sub_epi64(s3[1], s2[1]);
+
+  s0[0] = dct_const_round_shift_64bit(s0[0]);
+  s0[1] = dct_const_round_shift_64bit(s0[1]);
+  s1[0] = dct_const_round_shift_64bit(s1[0]);
+  s1[1] = dct_const_round_shift_64bit(s1[1]);
+  s2[0] = dct_const_round_shift_64bit(t2[0]);
+  s2[1] = dct_const_round_shift_64bit(t2[1]);
+  s3[0] = dct_const_round_shift_64bit(s3[0]);
+  s3[1] = dct_const_round_shift_64bit(s3[1]);
+  io[0] = pack_4(s0[0], s0[1]);
+  io[1] = pack_4(s1[0], s1[1]);
+  io[2] = pack_4(s2[0], s2[1]);
+  io[3] = pack_4(s3[0], s3[1]);
+}
+
+void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int tx_type, int bd) {
+  __m128i io[4];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 8));
+  io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+  if (bd == 8) {
+    __m128i io_short[2];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[1]);
+    io_short[1] = _mm_packs_epi32(io[2], io[3]);
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      idct4_sse2(io_short);
+    } else {
+      iadst4_sse2(io_short);
+    }
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      idct4_sse2(io_short);
+    } else {
+      iadst4_sse2(io_short);
+    }
+    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+    io[0] = _mm_srai_epi16(io_short[0], 4);
+    io[1] = _mm_srai_epi16(io_short[1], 4);
+  } else {
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      highbd_idct4_sse4_1(io);
+    } else {
+      highbd_iadst4_sse4_1(io);
+    }
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      highbd_idct4_sse4_1(io);
+    } else {
+      highbd_iadst4_sse4_1(io);
+    }
+    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+  }
+
+  recon_and_store_4x4(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
new file mode 100644
index 0000000000..7d949b6dbc
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
@@ -0,0 +1,255 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+                                                      const int c,
+                                                      __m128i *const s) {
+  const __m128i pair_c = pair_set_epi32(4 * c, 0);
+  __m128i x[2];
+
+  extend_64bit(in, x);
+  s[0] = _mm_mul_epi32(pair_c, x[0]);
+  s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+                                                 const __m128i in1,
+                                                 const int c0, const int c1,
+                                                 __m128i *const s0,
+                                                 __m128i *const s1) {
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+  __m128i t00[2], t01[2], t10[2], t11[2];
+  __m128i x0[2], x1[2];
+
+  extend_64bit(in0, x0);
+  extend_64bit(in1, x1);
+  t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+  t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+  t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+  t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+  t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+  t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+  t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+  t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+  s0[0] = _mm_add_epi64(t00[0], t11[0]);
+  s0[1] = _mm_add_epi64(t00[1], t11[1]);
+  s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+  s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst8_sse4_1(__m128i *const io) {
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+
+  transpose_32bit_4x4x2(io, io);
+
+  // stage 1
+  highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
+  highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
+  x0[0] = _mm_add_epi64(s0[0], s4[0]);
+  x0[1] = _mm_add_epi64(s0[1], s4[1]);
+  x1[0] = _mm_add_epi64(s1[0], s5[0]);
+  x1[1] = _mm_add_epi64(s1[1], s5[1]);
+  x4[0] = _mm_sub_epi64(s0[0], s4[0]);
+  x4[1] = _mm_sub_epi64(s0[1], s4[1]);
+  x5[0] = _mm_sub_epi64(s1[0], s5[0]);
+  x5[1] = _mm_sub_epi64(s1[1], s5[1]);
+
+  highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
+  highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
+  x2[0] = _mm_add_epi64(s2[0], s6[0]);
+  x2[1] = _mm_add_epi64(s2[1], s6[1]);
+  x3[0] = _mm_add_epi64(s3[0], s7[0]);
+  x3[1] = _mm_add_epi64(s3[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s2[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s2[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s3[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s3[1], s7[1]);
+
+  x0[0] = dct_const_round_shift_64bit(x0[0]);
+  x0[1] = dct_const_round_shift_64bit(x0[1]);
+  x1[0] = dct_const_round_shift_64bit(x1[0]);
+  x1[1] = dct_const_round_shift_64bit(x1[1]);
+  x2[0] = dct_const_round_shift_64bit(x2[0]);
+  x2[1] = dct_const_round_shift_64bit(x2[1]);
+  x3[0] = dct_const_round_shift_64bit(x3[0]);
+  x3[1] = dct_const_round_shift_64bit(x3[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  s0[0] = pack_4(x0[0], x0[1]);  // s0 = x0;
+  s1[0] = pack_4(x1[0], x1[1]);  // s1 = x1;
+  s2[0] = pack_4(x2[0], x2[1]);  // s2 = x2;
+  s3[0] = pack_4(x3[0], x3[1]);  // s3 = x3;
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+
+  // stage 2
+  x0[0] = _mm_add_epi32(s0[0], s2[0]);
+  x1[0] = _mm_add_epi32(s1[0], s3[0]);
+  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+
+  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+
+  x4[0] = _mm_add_epi64(s4[0], s6[0]);
+  x4[1] = _mm_add_epi64(s4[1], s6[1]);
+  x5[0] = _mm_add_epi64(s5[0], s7[0]);
+  x5[1] = _mm_add_epi64(s5[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+
+  // stage 3
+  s2[0] = _mm_add_epi32(x2[0], x3[0]);
+  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+  s6[0] = _mm_add_epi32(x6[0], x7[0]);
+  s7[0] = _mm_sub_epi32(x6[0], x7[0]);
+  highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
+  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+
+  x2[0] = dct_const_round_shift_64bit(s2[0]);
+  x2[1] = dct_const_round_shift_64bit(s2[1]);
+  x3[0] = dct_const_round_shift_64bit(s3[0]);
+  x3[1] = dct_const_round_shift_64bit(s3[1]);
+  x6[0] = dct_const_round_shift_64bit(s6[0]);
+  x6[1] = dct_const_round_shift_64bit(s6[1]);
+  x7[0] = dct_const_round_shift_64bit(s7[0]);
+  x7[1] = dct_const_round_shift_64bit(s7[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+
+  io[0] = x0[0];
+  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+  io[2] = x6[0];
+  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
+  io[4] = x3[0];
+  io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
+  io[6] = x5[0];
+  io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int tx_type, int bd) {
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+  io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+  io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+  io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+  io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+  io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+  io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+  io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+  io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[4]);
+    io_short[1] = _mm_packs_epi32(io[1], io[5]);
+    io_short[2] = _mm_packs_epi32(io[2], io[6]);
+    io_short[3] = _mm_packs_epi32(io[3], io[7]);
+    io_short[4] = _mm_packs_epi32(io[8], io[12]);
+    io_short[5] = _mm_packs_epi32(io[9], io[13]);
+    io_short[6] = _mm_packs_epi32(io[10], io[14]);
+    io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      vpx_idct8_sse2(io_short);
+    } else {
+      iadst8_sse2(io_short);
+    }
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      vpx_idct8_sse2(io_short);
+    } else {
+      iadst8_sse2(io_short);
+    }
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+      vpx_highbd_idct8x8_half1d_sse4_1(io);
+      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+    } else {
+      highbd_iadst8_sse4_1(io);
+      highbd_iadst8_sse4_1(&io[8]);
+    }
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    io[4] = io[8];
+    io[5] = io[9];
+    io[6] = io[10];
+    io[7] = io[11];
+
+    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+      vpx_highbd_idct8x8_half1d_sse4_1(io);
+      io[8] = temp[0];
+      io[9] = temp[1];
+      io[10] = temp[2];
+      io[11] = temp[3];
+      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+    } else {
+      highbd_iadst8_sse4_1(io);
+      io[8] = temp[0];
+      io[9] = temp[1];
+      io[10] = temp[2];
+      io[11] = temp[3];
+      highbd_iadst8_sse4_1(&io[8]);
+    }
+    highbd_idct8x8_final_round(io);
+  }
+  recon_and_store_8x8(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index dcfc454aa0..ad693718c0 100644
--- a/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -10,36 +10,33 @@
 
 #include "./vp9_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-#include "vpx_ports/mem.h"
 
 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[2];
-  const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct4_sse2(in);
       idct4_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct4_sse2(in);
       iadst4_sse2(in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst4_sse2(in);
       idct4_sse2(in);
       break;
-    case 3:  // ADST_ADST
+    default:
+      assert(tx_type == ADST_ADST);
       iadst4_sse2(in);
       iadst4_sse2(in);
       break;
-    default: assert(0); break;
   }
 
   // Final round and shift
@@ -49,67 +46,42 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[0] = _mm_srai_epi16(in[0], 4);
   in[1] = _mm_srai_epi16(in[1], 4);
 
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, in[0]);
-    d2 = _mm_add_epi16(d2, in[1]);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store result[0]
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store result[1]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store result[2]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-    // store result[3]
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-  }
+  recon_and_store4x4_sse2(in, dest, stride);
 }
 
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   __m128i in[8];
-  const __m128i zero = _mm_setzero_si128();
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
 
   // load input data
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
-  in[4] = load_input_data(input + 8 * 4);
-  in[5] = load_input_data(input + 8 * 5);
-  in[6] = load_input_data(input + 8 * 6);
-  in[7] = load_input_data(input + 8 * 7);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8 * 1);
+  in[2] = load_input_data8(input + 8 * 2);
+  in[3] = load_input_data8(input + 8 * 3);
+  in[4] = load_input_data8(input + 8 * 4);
+  in[5] = load_input_data8(input + 8 * 5);
+  in[6] = load_input_data8(input + 8 * 6);
+  in[7] = load_input_data8(input + 8 * 7);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
-      idct8_sse2(in);
-      idct8_sse2(in);
+    case DCT_DCT:
+      vpx_idct8_sse2(in);
+      vpx_idct8_sse2(in);
       break;
-    case 1:  // ADST_DCT
-      idct8_sse2(in);
+    case ADST_DCT:
+      vpx_idct8_sse2(in);
       iadst8_sse2(in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst8_sse2(in);
-      idct8_sse2(in);
+      vpx_idct8_sse2(in);
       break;
-    case 3:  // ADST_ADST
+    default:
+      assert(tx_type == ADST_ADST);
       iadst8_sse2(in);
       iadst8_sse2(in);
       break;
-    default: assert(0); break;
   }
 
   // Final rounding and shift
@@ -131,14 +103,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[6] = _mm_srai_epi16(in[6], 5);
   in[7] = _mm_srai_epi16(in[7], 5);
 
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
+  recon_and_store(dest + 0 * stride, in[0]);
+  recon_and_store(dest + 1 * stride, in[1]);
+  recon_and_store(dest + 2 * stride, in[2]);
+  recon_and_store(dest + 3 * stride, in[3]);
+  recon_and_store(dest + 4 * stride, in[4]);
+  recon_and_store(dest + 5 * stride, in[5]);
+  recon_and_store(dest + 6 * stride, in[6]);
+  recon_and_store(dest + 7 * stride, in[7]);
+}
+
+static INLINE void load_buffer_8x16(const tran_low_t *const input,
+                                    __m128i *const in) {
+  in[0] = load_input_data8(input + 0 * 16);
+  in[1] = load_input_data8(input + 1 * 16);
+  in[2] = load_input_data8(input + 2 * 16);
+  in[3] = load_input_data8(input + 3 * 16);
+  in[4] = load_input_data8(input + 4 * 16);
+  in[5] = load_input_data8(input + 5 * 16);
+  in[6] = load_input_data8(input + 6 * 16);
+  in[7] = load_input_data8(input + 7 * 16);
+
+  in[8] = load_input_data8(input + 8 * 16);
+  in[9] = load_input_data8(input + 9 * 16);
+  in[10] = load_input_data8(input + 10 * 16);
+  in[11] = load_input_data8(input + 11 * 16);
+  in[12] = load_input_data8(input + 12 * 16);
+  in[13] = load_input_data8(input + 13 * 16);
+  in[14] = load_input_data8(input + 14 * 16);
+  in[15] = load_input_data8(input + 15 * 16);
+}
+
+static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
+                                     const int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  // Final rounding and shift
+  in[0] = _mm_adds_epi16(in[0], final_rounding);
+  in[1] = _mm_adds_epi16(in[1], final_rounding);
+  in[2] = _mm_adds_epi16(in[2], final_rounding);
+  in[3] = _mm_adds_epi16(in[3], final_rounding);
+  in[4] = _mm_adds_epi16(in[4], final_rounding);
+  in[5] = _mm_adds_epi16(in[5], final_rounding);
+  in[6] = _mm_adds_epi16(in[6], final_rounding);
+  in[7] = _mm_adds_epi16(in[7], final_rounding);
+  in[8] = _mm_adds_epi16(in[8], final_rounding);
+  in[9] = _mm_adds_epi16(in[9], final_rounding);
+  in[10] = _mm_adds_epi16(in[10], final_rounding);
+  in[11] = _mm_adds_epi16(in[11], final_rounding);
+  in[12] = _mm_adds_epi16(in[12], final_rounding);
+  in[13] = _mm_adds_epi16(in[13], final_rounding);
+  in[14] = _mm_adds_epi16(in[14], final_rounding);
+  in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+  in[0] = _mm_srai_epi16(in[0], 6);
+  in[1] = _mm_srai_epi16(in[1], 6);
+  in[2] = _mm_srai_epi16(in[2], 6);
+  in[3] = _mm_srai_epi16(in[3], 6);
+  in[4] = _mm_srai_epi16(in[4], 6);
+  in[5] = _mm_srai_epi16(in[5], 6);
+  in[6] = _mm_srai_epi16(in[6], 6);
+  in[7] = _mm_srai_epi16(in[7], 6);
+  in[8] = _mm_srai_epi16(in[8], 6);
+  in[9] = _mm_srai_epi16(in[9], 6);
+  in[10] = _mm_srai_epi16(in[10], 6);
+  in[11] = _mm_srai_epi16(in[11], 6);
+  in[12] = _mm_srai_epi16(in[12], 6);
+  in[13] = _mm_srai_epi16(in[13], 6);
+  in[14] = _mm_srai_epi16(in[14], 6);
+  in[15] = _mm_srai_epi16(in[15], 6);
+
+  recon_and_store(dest + 0 * stride, in[0]);
+  recon_and_store(dest + 1 * stride, in[1]);
+  recon_and_store(dest + 2 * stride, in[2]);
+  recon_and_store(dest + 3 * stride, in[3]);
+  recon_and_store(dest + 4 * stride, in[4]);
+  recon_and_store(dest + 5 * stride, in[5]);
+  recon_and_store(dest + 6 * stride, in[6]);
+  recon_and_store(dest + 7 * stride, in[7]);
+  recon_and_store(dest + 8 * stride, in[8]);
+  recon_and_store(dest + 9 * stride, in[9]);
+  recon_and_store(dest + 10 * stride, in[10]);
+  recon_and_store(dest + 11 * stride, in[11]);
+  recon_and_store(dest + 12 * stride, in[12]);
+  recon_and_store(dest + 13 * stride, in[13]);
+  recon_and_store(dest + 14 * stride, in[14]);
+  recon_and_store(dest + 15 * stride, in[15]);
 }
 
 void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -150,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 3:  // ADST_ADST
+    default:
+      assert(tx_type == ADST_ADST);
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    default: assert(0); break;
   }
 
   write_buffer_8x16(dest, in0, stride);
diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
index 30852049b4..ae7c94ea3f 100644
--- a/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
+++ b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -12,6 +12,8 @@
 ;  TODO(jackychen): Find a way to fix the duplicate.
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void vp9_filter_by_weight16x16_sse2
 ;(
 ;    unsigned char *src,
@@ -20,7 +22,7 @@
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
-global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+globalsym(vp9_filter_by_weight16x16_sse2)
 sym(vp9_filter_by_weight16x16_sse2):
     push        rbp
     mov         rbp, rsp
@@ -98,7 +100,7 @@ sym(vp9_filter_by_weight16x16_sse2):
 ;    int            dst_stride,
 ;    int            src_weight
 ;)
-global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+globalsym(vp9_filter_by_weight8x8_sse2)
 sym(vp9_filter_by_weight8x8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -166,7 +168,7 @@ sym(vp9_filter_by_weight8x8_sse2):
 ;    unsigned int  *variance,      4
 ;    unsigned int  *sad,           5
 ;)
-global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+globalsym(vp9_variance_and_sad_16x16_sse2)
 sym(vp9_variance_and_sad_16x16_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
index 628d1c8d2b..45ef99adf9 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -22,7 +22,11 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
@@ -42,34 +46,15 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
+#include "vp9/decoder/vp9_job_queue.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
-static int is_compound_reference_allowed(const VP9_COMMON *cm) {
-  int i;
-  for (i = 1; i < REFS_PER_FRAME; ++i)
-    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1;
+typedef int (*predict_recon_func)(TileWorkerData *twd, MODE_INFO *const mi,
+                                  int plane, int row, int col, TX_SIZE tx_size);
 
-  return 0;
-}
-
-static void setup_compound_reference_mode(VP9_COMMON *cm) {
-  if (cm->ref_frame_sign_bias[LAST_FRAME] ==
-      cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
-    cm->comp_fixed_ref = ALTREF_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = GOLDEN_FRAME;
-  } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
-             cm->ref_frame_sign_bias[ALTREF_FRAME]) {
-    cm->comp_fixed_ref = GOLDEN_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = ALTREF_FRAME;
-  } else {
-    cm->comp_fixed_ref = LAST_FRAME;
-    cm->comp_var_ref[0] = GOLDEN_FRAME;
-    cm->comp_var_ref[1] = ALTREF_FRAME;
-  }
-}
+typedef void (*intra_recon_func)(TileWorkerData *twd, MODE_INFO *const mi,
+                                 int plane, int row, int col, TX_SIZE tx_size);
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
   return len != 0 && len <= (size_t)(end - start);
@@ -83,6 +68,7 @@ static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) {
 static TX_MODE read_tx_mode(vpx_reader *r) {
   TX_MODE tx_mode = vpx_read_literal(r, 2);
   if (tx_mode == ALLOW_32X32) tx_mode += vpx_read_bit(r);
+  assert(tx_mode < TX_MODES);
   return tx_mode;
 }
 
@@ -118,7 +104,7 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
 
 static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
                                                 vpx_reader *r) {
-  if (is_compound_reference_allowed(cm)) {
+  if (vp9_compound_reference_allowed(cm)) {
     return vpx_read_bit(r)
                ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE)
                : SINGLE_REFERENCE;
@@ -189,21 +175,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane,
   assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
     if (xd->lossless) {
-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
     } else {
       switch (tx_size) {
         case TX_4X4:
-          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_8X8:
-          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_16X16:
-          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_32X32:
-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         default: assert(0 && "Invalid transform size");
       }
@@ -256,21 +243,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane,
   assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
     if (xd->lossless) {
-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
     } else {
       switch (tx_size) {
         case TX_4X4:
-          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_8X8:
-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_16X16:
-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_32X32:
-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         default: assert(0 && "Invalid transform size");
       }
@@ -337,9 +325,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd,
   if (!mi->skip) {
     const TX_TYPE tx_type =
         (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
-    const scan_order *sc = (plane || xd->lossless)
-                               ? &vp9_default_scan_orders[tx_size]
-                               : &vp9_scan_orders[tx_size][tx_type];
+    const ScanOrder *sc = (plane || xd->lossless)
+                              ? &vp9_default_scan_orders[tx_size]
+                              : &vp9_scan_orders[tx_size][tx_type];
     const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                             mi->segment_id);
     if (eob > 0) {
@@ -349,20 +337,121 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd,
   }
 }
 
-static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi,
-                                   int plane, int row, int col,
-                                   TX_SIZE tx_size) {
+static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi,
+                                     int plane, int row, int col,
+                                     TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &twd->xd;
+  PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode;
+
+  if (mi->sb_type < BLOCK_8X8)
+    if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
+
+  if (!mi->skip) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const TX_TYPE tx_type =
+        (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
+    const ScanOrder *sc = (plane || xd->lossless)
+                              ? &vp9_default_scan_orders[tx_size]
+                              : &vp9_scan_orders[tx_size][tx_type];
+    *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
+                                       mi->segment_id);
+    /* Keep the alignment to 16 */
+    pd->dqcoeff += (16 << (tx_size << 1));
+    pd->eob++;
+  }
+}
+
+static void predict_and_reconstruct_intra_block_row_mt(TileWorkerData *twd,
+                                                       MODE_INFO *const mi,
+                                                       int plane, int row,
+                                                       int col,
+                                                       TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *sc = &vp9_default_scan_orders[tx_size];
+  PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode;
+  uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+  if (mi->sb_type < BLOCK_8X8)
+    if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
+
+  vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride,
+                          dst, pd->dst.stride, col, row, plane);
+
+  if (!mi->skip) {
+    const TX_TYPE tx_type =
+        (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode];
+    if (*pd->eob > 0) {
+      inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst,
+                                    pd->dst.stride, *pd->eob);
+    }
+    /* Keep the alignment to 16 */
+    pd->dqcoeff += (16 << (tx_size << 1));
+    pd->eob++;
+  }
+}
+
+static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi,
+                                   int plane, int row, int col, TX_SIZE tx_size,
+                                   int mi_row, int mi_col) {
+  MACROBLOCKD *const xd = &twd->xd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const ScanOrder *sc = &vp9_default_scan_orders[tx_size];
+  const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
+                                          mi->segment_id);
+  uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+  if (eob > 0) {
+    inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob);
+  }
+#if CONFIG_MISMATCH_DEBUG
+  {
+    int pixel_c, pixel_r;
+    int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2);
+    int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2);
+    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row,
+                    pd->subsampling_x, pd->subsampling_y);
+    mismatch_check_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, blk_w,
+                            blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#else
+  (void)mi_row;
+  (void)mi_col;
+#endif
+  return eob;
+}
+
+static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi,
+                                    int plane, int row, int col,
+                                    TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &twd->xd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const ScanOrder *sc = &vp9_default_scan_orders[tx_size];
   const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size,
                                           mi->segment_id);
 
+  *pd->eob = eob;
+  pd->dqcoeff += (16 << (tx_size << 1));
+  pd->eob++;
+
+  return eob;
+}
+
+static int reconstruct_inter_block_row_mt(TileWorkerData *twd,
+                                          MODE_INFO *const mi, int plane,
+                                          int row, int col, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &twd->xd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int eob = *pd->eob;
+
+  (void)mi;
   if (eob > 0) {
     inverse_transform_block_inter(
         xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
         pd->dst.stride, eob);
   }
+  pd->dqcoeff += (16 << (tx_size << 1));
+  pd->eob++;
+
   return eob;
 }
 
@@ -442,45 +531,39 @@ static void high_build_mc_border(const uint8_t *src8, int src_stride,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
+static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1,
+                               int pre_buf_stride, int x0, int y0, int b_w,
+                               int b_h, int frame_width, int frame_height,
                                int border_offset, uint8_t *const dst,
                                int dst_buf_stride, int subpel_x, int subpel_y,
                                const InterpKernel *kernel,
                                const struct scale_factors *sf, MACROBLOCKD *xd,
                                int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
-  const uint8_t *buf_ptr;
-
+  uint16_t *mc_buf_high = twd->extend_and_predict_buf;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0,
                          b_w, b_h, frame_width, frame_height);
-    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
+    highbd_inter_predictor(mc_buf_high + border_offset, b_w,
+                           CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
   } else {
     build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0,
                     y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
-  }
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
-  } else {
-    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf,
-                    w, h, ref, kernel, xs, ys);
+    inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst,
+                    dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel,
+                    xs, ys);
   }
 }
 #else
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
+static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1,
+                               int pre_buf_stride, int x0, int y0, int b_w,
+                               int b_h, int frame_width, int frame_height,
                                int border_offset, uint8_t *const dst,
                                int dst_buf_stride, int subpel_x, int subpel_y,
                                const InterpKernel *kernel,
                                const struct scale_factors *sf, int w, int h,
                                int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+  uint8_t *mc_buf = (uint8_t *)twd->extend_and_predict_buf;
   const uint8_t *buf_ptr;
 
   build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, x0, y0, b_w, b_h,
@@ -493,7 +576,7 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void dec_build_inter_predictors(
-    VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x,
+    TileWorkerData *twd, MACROBLOCKD *xd, int plane, int bw, int bh, int x,
     int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel,
     const struct scale_factors *sf, struct buf_2d *pre_buf,
     struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf,
@@ -596,12 +679,6 @@ static void dec_build_inter_predictors(
       y_pad = 1;
     }
 
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-    if (worker != NULL)
-      vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7))
-                                                      << (plane == 0 ? 0 : 1));
-
     // Skip border extension if block is inside the frame.
     if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
         y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
@@ -611,27 +688,20 @@ static void dec_build_inter_predictors(
       const int b_h = y1 - y0 + 1;
       const int border_offset = y_pad * 3 * b_w + x_pad * 3;
 
-      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h, frame_width,
-                         frame_height, border_offset, dst, dst_buf->stride,
-                         subpel_x, subpel_y, kernel, sf,
+      extend_and_predict(twd, buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+                         frame_width, frame_height, border_offset, dst,
+                         dst_buf->stride, subpel_x, subpel_y, kernel, sf,
 #if CONFIG_VP9_HIGHBITDEPTH
                          xd,
 #endif
                          w, h, ref, xs, ys);
       return;
     }
-  } else {
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-    if (worker != NULL) {
-      const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-      vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7))
-                                                      << (plane == 0 ? 0 : 1));
-    }
   }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+    highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride,
+                           CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x,
                            subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
   } else {
     inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
@@ -643,7 +713,8 @@ static void dec_build_inter_predictors(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
+static void dec_build_inter_predictors_sb(TileWorkerData *twd,
+                                          VP9Decoder *const pbi,
                                           MACROBLOCKD *xd, int mi_row,
                                           int mi_col) {
   int plane;
@@ -655,8 +726,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
   const int is_compound = has_second_ref(mi);
   int ref;
   int is_scaled;
-  VPxWorker *const fwo =
-      pbi->frame_parallel_decode ? pbi->frame_worker_owner : NULL;
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
@@ -688,7 +757,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
         for (y = 0; y < num_4x4_h; ++y) {
           for (x = 0; x < num_4x4_w; ++x) {
             const MV mv = average_split_mvs(pd, mi, ref, i++);
-            dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x,
+            dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 4 * x,
                                        4 * y, 4, 4, mi_x, mi_y, kernel, sf,
                                        pre_buf, dst_buf, &mv, ref_frame_buf,
                                        is_scaled, ref);
@@ -705,7 +774,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
         const int n4w_x4 = 4 * num_4x4_w;
         const int n4h_x4 = 4 * num_4x4_h;
         struct buf_2d *const pre_buf = &pd->pre[ref];
-        dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4,
+        dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4,
                                    n4h_x4, mi_x, mi_y, kernel, sf, pre_buf,
                                    dst_buf, &mv, ref_frame_buf, is_scaled, ref);
       }
@@ -733,6 +802,25 @@ static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
   }
 }
 
+static MODE_INFO *set_offsets_recon(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                                    int mi_row, int mi_col, int bw, int bh,
+                                    int bwl, int bhl) {
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  const TileInfo *const tile = &xd->tile;
+  xd->mi = cm->mi_grid_visible + offset;
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
+  set_skip_context(xd, mi_row, mi_col);
+
+  // Distance of Mb to the various image edges. These are specified to 8th pel
+  // as they are always compared to values that are in 1/8th pel units
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+  return xd->mi[0];
+}
+
 static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                               BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
                               int bh, int x_mis, int y_mis, int bwl, int bhl) {
@@ -762,6 +850,66 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   return xd->mi[0];
 }
 
+static INLINE int predict_recon_inter(MACROBLOCKD *xd, MODE_INFO *mi,
+                                      TileWorkerData *twd,
+                                      predict_recon_func func) {
+  int eobtotal = 0;
+  int plane;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
+    const int num_4x4_w = pd->n4_w;
+    const int num_4x4_h = pd->n4_h;
+    const int step = (1 << tx_size);
+    int row, col;
+    const int max_blocks_wide =
+        num_4x4_w + (xd->mb_to_right_edge >= 0
+                         ? 0
+                         : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    const int max_blocks_high =
+        num_4x4_h + (xd->mb_to_bottom_edge >= 0
+                         ? 0
+                         : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+    xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
+    xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
+
+    for (row = 0; row < max_blocks_high; row += step)
+      for (col = 0; col < max_blocks_wide; col += step)
+        eobtotal += func(twd, mi, plane, row, col, tx_size);
+  }
+  return eobtotal;
+}
+
+static INLINE void predict_recon_intra(MACROBLOCKD *xd, MODE_INFO *mi,
+                                       TileWorkerData *twd,
+                                       intra_recon_func func) {
+  int plane;
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
+    const int num_4x4_w = pd->n4_w;
+    const int num_4x4_h = pd->n4_h;
+    const int step = (1 << tx_size);
+    int row, col;
+    const int max_blocks_wide =
+        num_4x4_w + (xd->mb_to_right_edge >= 0
+                         ? 0
+                         : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    const int max_blocks_high =
+        num_4x4_h + (xd->mb_to_bottom_edge >= 0
+                         ? 0
+                         : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+    xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
+    xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
+
+    for (row = 0; row < max_blocks_high; row += step)
+      for (col = 0; col < max_blocks_wide; col += step)
+        func(twd, mi, plane, row, col, tx_size);
+  }
+}
+
 static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
                          int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
   VP9_COMMON *const cm = &pbi->common;
@@ -818,7 +966,25 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
     }
   } else {
     // Prediction
-    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
+    dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col);
+#if CONFIG_MISMATCH_DEBUG
+    {
+      int plane;
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *pd = &xd->plane[plane];
+        int pixel_c, pixel_r;
+        const BLOCK_SIZE plane_bsize =
+            get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]);
+        const int bw = get_block_width(plane_bsize);
+        const int bh = get_block_height(plane_bsize);
+        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                        pd->subsampling_x, pd->subsampling_y);
+        mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c,
+                                 pixel_r, bw, bh,
+                                 xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+      }
+    }
+#endif
 
     // Reconstruction
     if (!mi->skip) {
@@ -838,16 +1004,17 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
                              : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
         const int max_blocks_high =
             num_4x4_h +
-            (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
-                                                  (5 + pd->subsampling_y));
+            (xd->mb_to_bottom_edge >= 0
+                 ? 0
+                 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
         xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
         xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
 
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
-            eobtotal +=
-                reconstruct_inter_block(twd, mi, plane, row, col, tx_size);
+            eobtotal += reconstruct_inter_block(twd, mi, plane, row, col,
+                                                tx_size, mi_row, mi_col);
       }
 
       if (!less8x8 && eobtotal == 0) mi->skip = 1;  // skip loopfilter
@@ -861,6 +1028,98 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
   }
 }
 
+static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+                        int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int bw = 1 << (bwl - 1);
+  const int bh = 1 << (bhl - 1);
+  MACROBLOCKD *const xd = &twd->xd;
+
+  MODE_INFO *mi = set_offsets_recon(cm, xd, mi_row, mi_col, bw, bh, bwl, bhl);
+
+  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+    const BLOCK_SIZE uv_subsize =
+        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+    if (uv_subsize == BLOCK_INVALID)
+      vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME,
+                         "Invalid block size.");
+  }
+
+  if (!is_inter_block(mi)) {
+    predict_recon_intra(xd, mi, twd,
+                        predict_and_reconstruct_intra_block_row_mt);
+  } else {
+    // Prediction
+    dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col);
+
+    // Reconstruction
+    if (!mi->skip) {
+      predict_recon_inter(xd, mi, twd, reconstruct_inter_block_row_mt);
+    }
+  }
+
+  vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh);
+}
+
+static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
+                        int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int bw = 1 << (bwl - 1);
+  const int bh = 1 << (bhl - 1);
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+  vpx_reader *r = &twd->bit_reader;
+  MACROBLOCKD *const xd = &twd->xd;
+
+  MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis,
+                              y_mis, bwl, bhl);
+
+  if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
+    const BLOCK_SIZE uv_subsize =
+        ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y];
+    if (uv_subsize == BLOCK_INVALID)
+      vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME,
+                         "Invalid block size.");
+  }
+
+  vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis);
+
+  if (mi->skip) {
+    dec_reset_skip_context(xd);
+  }
+
+  if (!is_inter_block(mi)) {
+    predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt);
+  } else {
+    if (!mi->skip) {
+      tran_low_t *dqcoeff[MAX_MB_PLANE];
+      int *eob[MAX_MB_PLANE];
+      int plane;
+      int eobtotal;
+      // Based on eobtotal and bsize, this may be mi->skip may be set to true
+      // In that case dqcoeff and eob need to be backed up and restored as
+      // recon_block will not increment these pointers for skip cases
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *const pd = &xd->plane[plane];
+        dqcoeff[plane] = pd->dqcoeff;
+        eob[plane] = pd->eob;
+      }
+      eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt);
+
+      if (bsize >= BLOCK_8X8 && eobtotal == 0) {
+        mi->skip = 1;  // skip loopfilter
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          struct macroblockd_plane *pd = &xd->plane[plane];
+          pd->dqcoeff = dqcoeff[plane];
+          pd->eob = eob[plane];
+        }
+      }
+    }
+  }
+
+  xd->corrupted |= vpx_reader_has_error(r);
+}
+
 static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row,
                                               int mi_col, int bsl) {
   const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col;
@@ -967,14 +1226,82 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi,
     dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
 }
 
+static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              int n4x4_l2, int parse_recon_flag,
+                              process_block_fn_t process_block) {
+  VP9_COMMON *const cm = &pbi->common;
+  const int n8x8_l2 = n4x4_l2 - 1;
+  const int num_8x8_wh = 1 << n8x8_l2;
+  const int hbs = num_8x8_wh >> 1;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+  const int has_rows = (mi_row + hbs) < cm->mi_rows;
+  const int has_cols = (mi_col + hbs) < cm->mi_cols;
+  MACROBLOCKD *const xd = &twd->xd;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  if (parse_recon_flag & PARSE) {
+    *xd->partition =
+        read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
+  }
+
+  partition = *xd->partition;
+  xd->partition++;
+
+  subsize = get_subsize(bsize, partition);
+  if (!hbs) {
+    // calculate bmode block dimensions (log 2)
+    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
+    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
+    process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
+        break;
+      case PARTITION_HORZ:
+        process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
+        if (has_rows)
+          process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
+                        n8x8_l2);
+        break;
+      case PARTITION_VERT:
+        process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
+        if (has_cols)
+          process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
+                        n4x4_l2);
+        break;
+      case PARTITION_SPLIT:
+        process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2,
+                          parse_recon_flag, process_block);
+        process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
+                          parse_recon_flag, process_block);
+        process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2,
+                          parse_recon_flag, process_block);
+        process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize,
+                          n8x8_l2, parse_recon_flag, process_block);
+        break;
+      default: assert(0 && "Invalid partition type");
+    }
+  }
+
+  if (parse_recon_flag & PARSE) {
+    // update partition context
+    if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) &&
+        bsize >= BLOCK_8X8)
+      dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
+  }
+}
+
 static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
                                 size_t read_size,
                                 struct vpx_internal_error_info *error_info,
                                 vpx_reader *r, vpx_decrypt_cb decrypt_cb,
                                 void *decrypt_state) {
-  // Validate the calculated partition length. If the buffer
-  // described by the partition can't be fully read, then restrict
-  // it to the portion that can be (for EC mode) or throw an error.
+  // Validate the calculated partition length. If the buffer described by the
+  // partition can't be fully read then throw an error.
   if (!read_is_valid(data, read_size, data_end))
     vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt tile length");
@@ -1144,7 +1471,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) {
   vpx_free(cm->cur_frame->mvs);
   cm->cur_frame->mi_rows = cm->mi_rows;
   cm->cur_frame->mi_cols = cm->mi_cols;
-  CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+  CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs,
                   (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
                                        sizeof(*cm->cur_frame->mvs)));
 }
@@ -1165,9 +1492,15 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
     // Allocations in vp9_alloc_context_buffers() depend on individual
     // dimensions as well as the overall size.
     if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
-      if (vp9_alloc_context_buffers(cm, width, height))
+      if (vp9_alloc_context_buffers(cm, width, height)) {
+        // The cm->mi_* values have been cleared and any existing context
+        // buffers have been freed. Clear cm->width and cm->height to be
+        // consistent and to force a realloc next time.
+        cm->width = 0;
+        cm->height = 0;
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate context buffers");
+      }
     } else {
       vp9_set_mb_mi(cm, width, height);
     }
@@ -1188,7 +1521,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
   resize_context_buffers(cm, width, height);
   setup_render_size(cm, rb);
 
-  lock_buffer_pool(pool);
   if (vpx_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
           cm->subsampling_y,
@@ -1198,12 +1530,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) {
           VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
           &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
           pool->cb_priv)) {
-    unlock_buffer_pool(pool);
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
-  unlock_buffer_pool(pool);
 
+  pool->frame_bufs[cm->new_fb_idx].released = 0;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
@@ -1274,7 +1605,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   resize_context_buffers(cm, width, height);
   setup_render_size(cm, rb);
 
-  lock_buffer_pool(pool);
   if (vpx_realloc_frame_buffer(
           get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
           cm->subsampling_y,
@@ -1284,12 +1614,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
           VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
           &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
           pool->cb_priv)) {
-    unlock_buffer_pool(pool);
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
-  unlock_buffer_pool(pool);
 
+  pool->frame_bufs[cm->new_fb_idx].released = 0;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
   pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
   pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
@@ -1369,6 +1698,322 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data,
   }
 }
 
+static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx,
+                      int sync_idx) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&row_mt_worker_data->recon_sync_mutex[sync_idx]);
+  row_mt_worker_data->recon_map[map_idx] = 1;
+  pthread_cond_signal(&row_mt_worker_data->recon_sync_cond[sync_idx]);
+  pthread_mutex_unlock(&row_mt_worker_data->recon_sync_mutex[sync_idx]);
+#else
+  (void)row_mt_worker_data;
+  (void)map_idx;
+  (void)sync_idx;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx,
+                     int sync_idx) {
+#if CONFIG_MULTITHREAD
+  volatile int8_t *map = row_mt_worker_data->recon_map + map_idx;
+  pthread_mutex_t *const mutex =
+      &row_mt_worker_data->recon_sync_mutex[sync_idx];
+  pthread_mutex_lock(mutex);
+  while (!(*map)) {
+    pthread_cond_wait(&row_mt_worker_data->recon_sync_cond[sync_idx], mutex);
+  }
+  pthread_mutex_unlock(mutex);
+#else
+  (void)row_mt_worker_data;
+  (void)map_idx;
+  (void)sync_idx;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) {
+  int return_val = 0;
+#if CONFIG_MULTITHREAD
+  int corrupted;
+  pthread_mutex_lock(lf_sync->lf_mutex);
+  corrupted = lf_sync->corrupted;
+  pthread_mutex_unlock(lf_sync->lf_mutex);
+  if (!corrupted) {
+    pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
+    lf_sync->num_tiles_done[row] += 1;
+    if (num_tile_cols == lf_sync->num_tiles_done[row]) return_val = 1;
+    pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);
+  }
+#else
+  (void)lf_sync;
+  (void)row;
+  (void)num_tile_cols;
+#endif
+  return return_val;
+}
+
+static void vp9_tile_done(VP9Decoder *pbi) {
+#if CONFIG_MULTITHREAD
+  int terminate;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int all_parse_done = 1 << pbi->common.log2_tile_cols;
+  pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex);
+  row_mt_worker_data->num_tiles_done++;
+  terminate = all_parse_done == row_mt_worker_data->num_tiles_done;
+  pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex);
+  if (terminate) {
+    vp9_jobq_terminate(&row_mt_worker_data->jobq);
+  }
+#else
+  (void)pbi;
+#endif
+}
+
+static void vp9_jobq_alloc(VP9Decoder *pbi) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+  const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job);
+
+  if (jobq_size > row_mt_worker_data->jobq_size) {
+    vpx_free(row_mt_worker_data->jobq_buf);
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf,
+                    vpx_calloc(1, jobq_size));
+    vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
+                  jobq_size);
+    row_mt_worker_data->jobq_size = jobq_size;
+  }
+}
+
+static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi,
+                           int mi_row, int is_last_row, VP9LfSync *lf_sync,
+                           int cur_tile_col) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
+  const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  int mi_col_start = tile_data->xd.tile.mi_col_start;
+  int mi_col_end = tile_data->xd.tile.mi_col_end;
+  int mi_col;
+
+  vp9_zero(tile_data->xd.left_context);
+  vp9_zero(tile_data->xd.left_seg_context);
+  for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
+    const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+    int plane;
+    const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c);
+
+    // Top Dependency
+    if (cur_sb_row) {
+      map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c,
+               ((cur_sb_row - 1) * tile_cols) + cur_tile_col);
+    }
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      tile_data->xd.plane[plane].eob =
+          row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2);
+      tile_data->xd.plane[plane].dqcoeff =
+          row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2);
+    }
+    tile_data->xd.partition =
+        row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB);
+    process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON,
+                      recon_block);
+    if (cm->lf.filter_level && !cm->skip_loop_filter) {
+      // Queue LPF_JOB
+      int is_lpf_job_ready = 0;
+
+      if (mi_col + MI_BLOCK_SIZE >= mi_col_end) {
+        // Checks if this row has been decoded in all tiles
+        is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols);
+
+        if (is_lpf_job_ready) {
+          Job lpf_job;
+          lpf_job.job_type = LPF_JOB;
+          if (cur_sb_row > 0) {
+            lpf_job.row_num = mi_row - MI_BLOCK_SIZE;
+            vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job,
+                           sizeof(lpf_job));
+          }
+          if (is_last_row) {
+            lpf_job.row_num = mi_row;
+            vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job,
+                           sizeof(lpf_job));
+          }
+        }
+      }
+    }
+    map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c,
+              (cur_sb_row * tile_cols) + cur_tile_col);
+  }
+}
+
+static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi,
+                           int mi_row, int cur_tile_col, uint8_t **data_end) {
+  int mi_col;
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  TileInfo *tile = &tile_data->xd.tile;
+  TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col];
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+
+  vp9_zero(tile_data->dqcoeff);
+  vp9_tile_init(tile, cm, 0, cur_tile_col);
+
+  /* Update reader only at the beginning of each row in a tile */
+  if (mi_row == 0) {
+    setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info,
+                        &tile_data->bit_reader, pbi->decrypt_cb,
+                        pbi->decrypt_state);
+  }
+  vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+  tile_data->xd.error_info = &tile_data->error_info;
+
+  vp9_zero(tile_data->xd.left_context);
+  vp9_zero(tile_data->xd.left_seg_context);
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+    const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+    int plane;
+    const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c);
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      tile_data->xd.plane[plane].eob =
+          row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2);
+      tile_data->xd.plane[plane].dqcoeff =
+          row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2);
+    }
+    tile_data->xd.partition =
+        row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB;
+    process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE,
+                      parse_block);
+  }
+}
+
+static int row_decode_worker_hook(void *arg1, void *arg2) {
+  ThreadData *const thread_data = (ThreadData *)arg1;
+  uint8_t **data_end = (uint8_t **)arg2;
+  VP9Decoder *const pbi = thread_data->pbi;
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+  const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  Job job;
+  LFWorkerData *lf_data = thread_data->lf_data;
+  VP9LfSync *lf_sync = thread_data->lf_sync;
+  volatile int corrupted = 0;
+  TileWorkerData *volatile tile_data_recon = NULL;
+
+  while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) {
+    int mi_col;
+    const int mi_row = job.row_num;
+
+    if (job.job_type == LPF_JOB) {
+      lf_data->start = mi_row;
+      lf_data->stop = lf_data->start + MI_BLOCK_SIZE;
+
+      if (cm->lf.filter_level && !cm->skip_loop_filter &&
+          mi_row < cm->mi_rows) {
+        vp9_loopfilter_job(lf_data, lf_sync);
+      }
+    } else if (job.job_type == RECON_JOB) {
+      const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int is_last_row = sb_rows - 1 == cur_sb_row;
+      int mi_col_start, mi_col_end;
+      if (!tile_data_recon)
+        CHECK_MEM_ERROR(&cm->error, tile_data_recon,
+                        vpx_memalign(32, sizeof(TileWorkerData)));
+
+      tile_data_recon->xd = pbi->mb;
+      vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col);
+      vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff);
+      mi_col_start = tile_data_recon->xd.tile.mi_col_start;
+      mi_col_end = tile_data_recon->xd.tile.mi_col_end;
+
+      if (setjmp(tile_data_recon->error_info.jmp)) {
+        const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
+        tile_data_recon->error_info.setjmp = 0;
+        corrupted = 1;
+        for (mi_col = mi_col_start; mi_col < mi_col_end;
+             mi_col += MI_BLOCK_SIZE) {
+          const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+          map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c,
+                    (cur_sb_row * tile_cols) + job.tile_col);
+        }
+        if (is_last_row) {
+          vp9_tile_done(pbi);
+        }
+        continue;
+      }
+
+      tile_data_recon->error_info.setjmp = 1;
+      tile_data_recon->xd.error_info = &tile_data_recon->error_info;
+
+      recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync,
+                     job.tile_col);
+
+      if (corrupted)
+        vpx_internal_error(&tile_data_recon->error_info,
+                           VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+
+      if (is_last_row) {
+        vp9_tile_done(pbi);
+      }
+    } else if (job.job_type == PARSE_JOB) {
+      TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col];
+
+      if (setjmp(tile_data->error_info.jmp)) {
+        tile_data->error_info.setjmp = 0;
+        corrupted = 1;
+        vp9_tile_done(pbi);
+        continue;
+      }
+
+      tile_data->xd = pbi->mb;
+      tile_data->xd.counts =
+          cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts;
+
+      tile_data->error_info.setjmp = 1;
+
+      parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end);
+
+      corrupted |= tile_data->xd.corrupted;
+      if (corrupted)
+        vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+
+      /* Queue in the recon_job for this row */
+      {
+        Job recon_job;
+        recon_job.row_num = mi_row;
+        recon_job.tile_col = job.tile_col;
+        recon_job.job_type = RECON_JOB;
+        vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job,
+                       sizeof(recon_job));
+      }
+
+      /* Queue next parse job */
+      if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) {
+        Job parse_job;
+        parse_job.row_num = mi_row + MI_BLOCK_SIZE;
+        parse_job.tile_col = job.tile_col;
+        parse_job.job_type = PARSE_JOB;
+        vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job,
+                       sizeof(parse_job));
+      }
+    }
+  }
+
+  vpx_free(tile_data_recon);
+  return !corrupted;
+}
+
 static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
                                    const uint8_t *data_end) {
   VP9_COMMON *const cm = &pbi->common;
@@ -1383,9 +2028,9 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
 
   if (cm->lf.filter_level && !cm->skip_loop_filter &&
       pbi->lf_worker.data1 == NULL) {
-    CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+    CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1,
                     vpx_memalign(32, sizeof(LFWorkerData)));
-    pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker;
+    pbi->lf_worker.hook = vp9_loop_filter_worker;
     if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) {
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Loop filter thread creation failed");
@@ -1447,7 +2092,29 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
-          decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
+          if (pbi->row_mt == 1) {
+            int plane;
+            RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+            for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+              tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane];
+              tile_data->xd.plane[plane].dqcoeff =
+                  row_mt_worker_data->dqcoeff[plane];
+            }
+            tile_data->xd.partition = row_mt_worker_data->partition;
+            process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4,
+                              PARSE, parse_block);
+
+            for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+              tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane];
+              tile_data->xd.plane[plane].dqcoeff =
+                  row_mt_worker_data->dqcoeff[plane];
+            }
+            tile_data->xd.partition = row_mt_worker_data->partition;
+            process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4,
+                              RECON, recon_block);
+          } else {
+            decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
+          }
         }
         pbi->mb.corrupted |= tile_data->xd.corrupted;
         if (pbi->mb.corrupted)
@@ -1474,11 +2141,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
           winterface->execute(&pbi->lf_worker);
         }
       }
-      // After loopfiltering, the last 7 row pixels in each superblock row may
-      // still be changed by the longest loopfilter of the next superblock
-      // row.
-      if (pbi->frame_parallel_decode)
-        vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2);
     }
   }
 
@@ -1494,34 +2156,70 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
   // Get last tile data.
   tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1;
 
-  if (pbi->frame_parallel_decode)
-    vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
   return vpx_reader_find_end(&tile_data->bit_reader);
 }
 
+static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows,
+                                 int num_tiles_left, int total_num_tiles) {
+  do {
+    int mi_row;
+    const int aligned_rows = mi_cols_aligned_to_sb(mi_rows);
+    const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2);
+    const int corrupted = 1;
+    for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
+      const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2);
+      vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2,
+                  is_last_row, corrupted);
+    }
+    /* If there are multiple tiles, the second tile should start marking row
+     * progress from row 0.
+     */
+    start_row = 0;
+  } while (num_tiles_left--);
+}
+
 // On entry 'tile_data->data_end' points to the end of the input frame, on exit
 // it is updated to reflect the bitreader position of the final tile column if
 // present in the tile buffer group or NULL otherwise.
-static int tile_worker_hook(TileWorkerData *const tile_data,
-                            VP9Decoder *const pbi) {
+static int tile_worker_hook(void *arg1, void *arg2) {
+  TileWorkerData *const tile_data = (TileWorkerData *)arg1;
+  VP9Decoder *const pbi = (VP9Decoder *)arg2;
+
   TileInfo *volatile tile = &tile_data->xd.tile;
   const int final_col = (1 << pbi->common.log2_tile_cols) - 1;
   const uint8_t *volatile bit_reader_end = NULL;
-  volatile int n = tile_data->buf_start;
-  tile_data->error_info.setjmp = 1;
+  VP9_COMMON *cm = &pbi->common;
 
+  LFWorkerData *lf_data = tile_data->lf_data;
+  VP9LfSync *lf_sync = tile_data->lf_sync;
+
+  volatile int mi_row = 0;
+  volatile int n = tile_data->buf_start;
   if (setjmp(tile_data->error_info.jmp)) {
     tile_data->error_info.setjmp = 0;
     tile_data->xd.corrupted = 1;
     tile_data->data_end = NULL;
+    if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+      const int num_tiles_left = tile_data->buf_end - n;
+      const int mi_row_start = mi_row;
+      set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left,
+                           1 << cm->log2_tile_cols);
+    }
     return 0;
   }
+  tile_data->error_info.setjmp = 1;
 
   tile_data->xd.corrupted = 0;
 
   do {
-    int mi_row, mi_col;
+    int mi_col;
     const TileBuffer *const buf = pbi->tile_buffers + n;
+
+    /* Initialize to 0 is safe since we do not deal with streams that have
+     * more than one row of tiles. (So tile->mi_row_start will be 0)
+     */
+    assert(cm->log2_tile_rows == 0);
+    mi_row = 0;
     vp9_zero(tile_data->dqcoeff);
     vp9_tile_init(tile, &pbi->common, 0, buf->col);
     setup_token_decoder(buf->data, tile_data->data_end, buf->size,
@@ -1539,6 +2237,14 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
            mi_col += MI_BLOCK_SIZE) {
         decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
       }
+      if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+        const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+        const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2);
+        const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2);
+        vp9_set_row(lf_sync, 1 << cm->log2_tile_cols,
+                    mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row,
+                    tile_data->xd.corrupted);
+      }
     }
 
     if (buf->col == final_col) {
@@ -1546,15 +2252,194 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
     }
   } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end);
 
+  if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level &&
+      !cm->skip_loop_filter) {
+    /* This was not incremented in the tile loop, so increment before tiles left
+     * calculation
+     */
+    ++n;
+    set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n,
+                         1 << cm->log2_tile_cols);
+  }
+
+  if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level &&
+      !cm->skip_loop_filter) {
+    vp9_loopfilter_rows(lf_data, lf_sync);
+  }
+
   tile_data->data_end = bit_reader_end;
   return !tile_data->xd.corrupted;
 }
 
 // sorts in descending order
 static int compare_tile_buffers(const void *a, const void *b) {
-  const TileBuffer *const buf1 = (const TileBuffer *)a;
-  const TileBuffer *const buf2 = (const TileBuffer *)b;
-  return (int)(buf2->size - buf1->size);
+  const TileBuffer *const buf_a = (const TileBuffer *)a;
+  const TileBuffer *const buf_b = (const TileBuffer *)b;
+  return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size);
+}
+
+static INLINE void init_mt(VP9Decoder *pbi) {
+  int n;
+  VP9_COMMON *const cm = &pbi->common;
+  VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+
+  if (pbi->num_tile_workers == 0) {
+    const int num_threads = pbi->max_threads;
+    CHECK_MEM_ERROR(&cm->error, pbi->tile_workers,
+                    vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+    for (n = 0; n < num_threads; ++n) {
+      VPxWorker *const worker = &pbi->tile_workers[n];
+      ++pbi->num_tile_workers;
+
+      winterface->init(worker);
+      worker->thread_name = "vpx tile worker";
+      if (n < num_threads - 1 && !winterface->reset(worker)) {
+        do {
+          winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]);
+        } while (--pbi->num_tile_workers != 0);
+        vpx_free(pbi->tile_workers);
+        pbi->tile_workers = NULL;
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile decoder thread creation failed");
+      }
+    }
+  }
+
+  // Initialize LPF
+  if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level &&
+      !cm->skip_loop_filter) {
+    vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level,
+                    pbi->num_tile_workers);
+  }
+
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_mi_cols);
+
+  vp9_reset_lfm(cm);
+}
+
+static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi,
+                                               const uint8_t *data,
+                                               const uint8_t *data_end) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_workers = pbi->max_threads;
+  int i, n;
+  int col;
+  int corrupted = 0;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+
+  assert(tile_cols <= (1 << 6));
+  assert(tile_rows == 1);
+  (void)tile_rows;
+
+  memset(row_mt_worker_data->recon_map, 0,
+         sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map));
+
+  init_mt(pbi);
+
+  // Reset tile decoding hook
+  for (n = 0; n < num_workers; ++n) {
+    VPxWorker *const worker = &pbi->tile_workers[n];
+    ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n];
+    winterface->sync(worker);
+
+    if (cm->lf.filter_level && !cm->skip_loop_filter) {
+      thread_data->lf_sync = lf_row_sync;
+      thread_data->lf_data = &thread_data->lf_sync->lfdata[n];
+      vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm,
+                                 pbi->mb.plane);
+    }
+
+    thread_data->pbi = pbi;
+
+    worker->hook = row_decode_worker_hook;
+    worker->data1 = thread_data;
+    worker->data2 = (void *)&row_mt_worker_data->data_end;
+  }
+
+  for (col = 0; col < tile_cols; ++col) {
+    TileWorkerData *const tile_data = &pbi->tile_worker_data[col];
+    tile_data->xd = pbi->mb;
+    tile_data->xd.counts =
+        cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
+  }
+
+  /* Reset the jobq to start of the jobq buffer */
+  vp9_jobq_reset(&row_mt_worker_data->jobq);
+  row_mt_worker_data->num_tiles_done = 0;
+  row_mt_worker_data->data_end = NULL;
+
+  // Load tile data into tile_buffers
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
+                   &pbi->tile_buffers);
+
+  // Initialize thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (col = 0; col < tile_cols; ++col) {
+      TileWorkerData *const tile_data = &pbi->tile_worker_data[col];
+      vp9_zero(tile_data->counts);
+    }
+  }
+
+  // queue parse jobs for 0th row of every tile
+  for (col = 0; col < tile_cols; ++col) {
+    Job parse_job;
+    parse_job.row_num = 0;
+    parse_job.tile_col = col;
+    parse_job.job_type = PARSE_JOB;
+    vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job));
+  }
+
+  for (i = 0; i < num_workers; ++i) {
+    VPxWorker *const worker = &pbi->tile_workers[i];
+    worker->had_error = 0;
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  for (; n > 0; --n) {
+    VPxWorker *const worker = &pbi->tile_workers[n - 1];
+    // TODO(jzern): The tile may have specific error data associated with
+    // its vpx_internal_error_info which could be propagated to the main info
+    // in cm. Additionally once the threads have been synced and an error is
+    // detected, there's no point in continuing to decode tiles.
+    corrupted |= !winterface->sync(worker);
+  }
+
+  pbi->mb.corrupted = corrupted;
+
+  {
+    /* Set data end */
+    TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1];
+    row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader);
+  }
+
+  // Accumulate thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (i = 0; i < tile_cols; ++i) {
+      TileWorkerData *const tile_data = &pbi->tile_worker_data[i];
+      vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1);
+    }
+  }
+
+  return row_mt_worker_data->data_end;
 }
 
 static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
@@ -1562,7 +2447,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
   VP9_COMMON *const cm = &pbi->common;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   const uint8_t *bit_reader_end = NULL;
-  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
   const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
@@ -1572,21 +2458,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
   assert(tile_rows == 1);
   (void)tile_rows;
 
-  if (pbi->num_tile_workers == 0) {
-    const int num_threads = pbi->max_threads;
-    CHECK_MEM_ERROR(cm, pbi->tile_workers,
-                    vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
-    for (n = 0; n < num_threads; ++n) {
-      VPxWorker *const worker = &pbi->tile_workers[n];
-      ++pbi->num_tile_workers;
-
-      winterface->init(worker);
-      if (n < num_threads - 1 && !winterface->reset(worker)) {
-        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
-                           "Tile decoder thread creation failed");
-      }
-    }
-  }
+  init_mt(pbi);
 
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
@@ -1594,23 +2466,22 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
     TileWorkerData *const tile_data =
         &pbi->tile_worker_data[n + pbi->total_tiles];
     winterface->sync(worker);
+
+    if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+      tile_data->lf_sync = lf_row_sync;
+      tile_data->lf_data = &tile_data->lf_sync->lfdata[n];
+      vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane);
+      tile_data->lf_data->y_only = 0;
+    }
+
     tile_data->xd = pbi->mb;
     tile_data->xd.counts =
         cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
-    worker->hook = (VPxWorkerHook)tile_worker_hook;
+    worker->hook = tile_worker_hook;
     worker->data1 = tile_data;
     worker->data2 = pbi;
   }
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_mi_cols);
-
-  vp9_reset_lfm(cm);
-
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
                    &pbi->tile_buffers);
@@ -1750,6 +2621,22 @@ static void read_bitdepth_colorspace_sampling(VP9_COMMON *cm,
   }
 }
 
+static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) {
+  if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) {
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    BufferPool *const pool = cm->buffer_pool;
+    int i;
+    for (i = 0; i < FRAME_BUFFERS; ++i) {
+      if (i == cm->new_fb_idx) continue;
+      frame_bufs[i].ref_count = 0;
+      if (!frame_bufs[i].released) {
+        pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer);
+        frame_bufs[i].released = 1;
+      }
+    }
+  }
+}
+
 static size_t read_uncompressed_header(VP9Decoder *pbi,
                                        struct vpx_read_bit_buffer *rb) {
   VP9_COMMON *const cm = &pbi->common;
@@ -1780,24 +2667,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
-    lock_buffer_pool(pool);
     if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
-      unlock_buffer_pool(pool);
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Buffer %d does not contain a decoded frame",
                          frame_to_show);
     }
 
     ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
-    unlock_buffer_pool(pool);
     pbi->refresh_frame_flags = 0;
     cm->lf.filter_level = 0;
     cm->show_frame = 1;
 
-    if (pbi->frame_parallel_decode) {
-      for (i = 0; i < REF_FRAMES; ++i)
-        cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
-    }
     return 0;
   }
 
@@ -1821,6 +2701,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
     setup_frame_size(cm, rb);
     if (pbi->need_resync) {
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+      flush_all_fb_on_key(cm);
       pbi->need_resync = 0;
     }
   } else {
@@ -1914,7 +2795,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
   cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
 
   // Generate next_ref_frame_map.
-  lock_buffer_pool(pool);
   for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
     if (mask & 1) {
       cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
@@ -1934,7 +2814,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
     if (cm->ref_frame_map[ref_index] >= 0)
       ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
   }
-  unlock_buffer_pool(pool);
   pbi->hold_ref_buf = 1;
 
   if (frame_is_intra_only(cm) || cm->error_resilient_mode)
@@ -1946,6 +2825,35 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
   setup_segmentation_dequant(cm);
 
   setup_tile_info(cm, rb);
+  if (pbi->row_mt == 1) {
+    int num_sbs = 1;
+    const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+    const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+    const int num_jobs = sb_rows << cm->log2_tile_cols;
+
+    if (pbi->row_mt_worker_data == NULL) {
+      CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data,
+                      vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
+#if CONFIG_MULTITHREAD
+      pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL);
+#endif
+    }
+
+    if (pbi->max_threads > 1) {
+      const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+      const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
+
+      num_sbs = sb_cols * sb_rows;
+    }
+
+    if (num_sbs > pbi->row_mt_worker_data->num_sbs ||
+        num_jobs > pbi->row_mt_worker_data->num_jobs) {
+      vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
+      vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs,
+                               pbi->max_threads, num_jobs);
+    }
+    vp9_jobq_alloc(pbi);
+  }
   sz = vpx_rb_read_literal(rb, 16);
 
   if (sz == 0)
@@ -1988,7 +2896,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
 
     cm->reference_mode = read_frame_reference_mode(cm, &r);
     if (cm->reference_mode != SINGLE_REFERENCE)
-      setup_compound_reference_mode(cm);
+      vp9_setup_compound_reference_mode(cm);
     read_frame_reference_mode_probs(cm, &r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
@@ -2056,6 +2964,12 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
   const int tile_rows = 1 << cm->log2_tile_rows;
   const int tile_cols = 1 << cm->log2_tile_cols;
   YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+  bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_move_frame_idx_r();
+#endif
   xd->cur_buf = new_fb;
 
   if (!first_partition_size) {
@@ -2091,24 +3005,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
     vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
   }
 
-  // If encoded in frame parallel mode, frame context is ready after decoding
-  // the frame header.
-  if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
-    VPxWorker *const worker = pbi->frame_worker_owner;
-    FrameWorkerData *const frame_worker_data = worker->data1;
-    if (cm->refresh_frame_context) {
-      context_updated = 1;
-      cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
-    }
-    vp9_frameworker_lock_stats(worker);
-    pbi->cur_buf->row = -1;
-    pbi->cur_buf->col = -1;
-    frame_worker_data->frame_context_ready = 1;
-    // Signal the main thread that context is ready.
-    vp9_frameworker_signal_stats(worker);
-    vp9_frameworker_unlock_stats(worker);
-  }
-
   if (pbi->tile_worker_data == NULL ||
       (tile_cols * tile_rows) != pbi->total_tiles) {
     const int num_tile_workers =
@@ -2118,24 +3014,33 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
     // platforms without DECLARE_ALIGNED().
     assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
     vpx_free(pbi->tile_worker_data);
-    CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size));
+    CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data,
+                    vpx_memalign(32, twd_size));
     pbi->total_tiles = tile_rows * tile_cols;
   }
 
-  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
-    // Multi-threaded tile decoder
-    *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
-    if (!xd->corrupted) {
-      if (!cm->skip_loop_filter) {
-        // If multiple threads are used to decode tiles, then we use those
-        // threads to do parallel loopfiltering.
-        vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
-                                 0, 0, pbi->tile_workers, pbi->num_tile_workers,
-                                 &pbi->lf_row_sync);
-      }
+  if (pbi->max_threads > 1 && tile_rows == 1 &&
+      (tile_cols > 1 || pbi->row_mt == 1)) {
+    if (pbi->row_mt == 1) {
+      *p_data_end =
+          decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end);
     } else {
-      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Decode failed. Frame data is corrupted.");
+      // Multi-threaded tile decoder
+      *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+      if (!pbi->lpf_mt_opt) {
+        if (!xd->corrupted) {
+          if (!cm->skip_loop_filter) {
+            // If multiple threads are used to decode tiles, then we use those
+            // threads to do parallel loopfiltering.
+            vp9_loop_filter_frame_mt(
+                new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0,
+                pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync);
+          }
+        } else {
+          vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Decode failed. Frame data is corrupted.");
+        }
+      }
     }
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h
index 44717f546a..ba95e72344 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DECODEFRAME_H_
-#define VP9_DECODER_VP9_DECODEFRAME_H_
+#ifndef VPX_VP9_DECODER_VP9_DECODEFRAME_H_
+#define VPX_VP9_DECODER_VP9_DECODEFRAME_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,4 +32,4 @@ void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data,
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_DECODEFRAME_H_
+#endif  // VPX_VP9_DECODER_VP9_DECODEFRAME_H_
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c
index 1a4152436a..0989cde58d 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c
@@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
   mi->skip = read_skip(cm, xd, mi->segment_id, r);
   mi->tx_size = read_tx_size(cm, xd, 1, r);
   mi->ref_frame[0] = INTRA_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
 
   switch (bsize) {
     case BLOCK_4X4:
@@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
   }
 }
 
-// Read the referncence frame
+// Read the reference frame
 static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             vpx_reader *r, int segment_id,
                             MV_REFERENCE_FRAME ref_frame[2]) {
@@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
                                                    SEG_LVL_REF_FRAME);
-    ref_frame[1] = NONE;
+    ref_frame[1] = NO_REF_FRAME;
   } else {
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
@@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
         ref_frame[0] = LAST_FRAME;
       }
 
-      ref_frame[1] = NONE;
+      ref_frame[1] = NO_REF_FRAME;
     } else {
       assert(0 && "Invalid prediction mode.");
     }
@@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm,
   mi->interp_filter = SWITCHABLE_FILTERS;
 
   mi->ref_frame[0] = INTRA_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
       zero_mv_pair(mv);
       break;
     }
-    default: { return 0; }
+    default: {
+      return 0;
+    }
   }
   return ret;
 }
@@ -444,23 +446,6 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
-static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv,
-                                  int refmv_count) {
-  int i;
-
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < refmv_count; ++i) {
-    lower_mv_precision(&mvlist[i].as_mv, allow_hp);
-    *best_mv = mvlist[i];
-  }
-}
-
-static void fpm_sync(void *const data, int mi_row) {
-  VP9Decoder *const pbi = (VP9Decoder *)data;
-  vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
-                       mi_row << MI_BLOCK_SIZE_LOG2);
-}
-
 // This macro is used to add a motion vector mv_ref list if it isn't
 // already in the list.  If it's the second motion vector or early_break
 // it will also skip all additional processing and jump to Done!
@@ -500,8 +485,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                             PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
                             const POSITION *const mv_ref_search,
                             int_mv *mv_ref_list, int mi_row, int mi_col,
-                            int block, int is_sub8x8, find_mv_refs_sync sync,
-                            void *const data) {
+                            int block) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   int different_ref_found = 0;
@@ -518,7 +502,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
 
   i = 0;
-  if (is_sub8x8) {
+  if (block >= 0) {
     // If the size < 8x8 we get the mv from the bmi substructure for the
     // nearest two blocks.
     for (i = 0; i < 2; ++i) {
@@ -557,23 +541,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     }
   }
 
-// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
-// on windows platform. The sync here is unnecessary if use_prev_frame_mvs
-// is 0. But after removing it, there will be hang in the unit test on windows
-// due to several threads waiting for a thread's signal.
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-  if (cm->frame_parallel_decode && sync != NULL) {
-    sync(data, mi_row);
-  }
-#endif
-
   // Check the last frame's mode and mv info.
   if (prev_frame_mvs) {
-    // Synchronize here for frame parallel decode if sync function is provided.
-    if (cm->frame_parallel_decode && sync != NULL) {
-      sync(data, mi_row);
-    }
-
     if (prev_frame_mvs->ref_frame[0] == ref_frame) {
       ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
     } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
@@ -650,19 +619,22 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
-  refmv_count =
-      dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search,
-                       mv_list, mi_row, mi_col, block, 1, NULL, NULL);
-
   switch (block) {
-    case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break;
+    case 0:
+      refmv_count =
+          dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search,
+                           mv_list, mi_row, mi_col, block);
+      best_sub8x8->as_int = mv_list[refmv_count - 1].as_int;
+      break;
     case 1:
     case 2:
       if (b_mode == NEARESTMV) {
         best_sub8x8->as_int = bmi[0].as_mv[ref].as_int;
       } else {
+        dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search,
+                         mv_list, mi_row, mi_col, block);
         best_sub8x8->as_int = 0;
-        for (n = 0; n < refmv_count; ++n)
+        for (n = 0; n < 2; ++n)
           if (bmi[0].as_mv[ref].as_int != mv_list[n].as_int) {
             best_sub8x8->as_int = mv_list[n].as_int;
             break;
@@ -673,15 +645,20 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
       if (b_mode == NEARESTMV) {
         best_sub8x8->as_int = bmi[2].as_mv[ref].as_int;
       } else {
-        int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
-        candidates[0] = bmi[1].as_mv[ref];
-        candidates[1] = bmi[0].as_mv[ref];
-        candidates[2] = mv_list[0];
-        candidates[3] = mv_list[1];
         best_sub8x8->as_int = 0;
-        for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
-          if (bmi[2].as_mv[ref].as_int != candidates[n].as_int) {
-            best_sub8x8->as_int = candidates[n].as_int;
+        if (bmi[2].as_mv[ref].as_int != bmi[1].as_mv[ref].as_int) {
+          best_sub8x8->as_int = bmi[1].as_mv[ref].as_int;
+          break;
+        }
+        if (bmi[2].as_mv[ref].as_int != bmi[0].as_mv[ref].as_int) {
+          best_sub8x8->as_int = bmi[0].as_mv[ref].as_int;
+          break;
+        }
+        dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search,
+                         mv_list, mi_row, mi_col, block);
+        for (n = 0; n < 2; ++n)
+          if (bmi[2].as_mv[ref].as_int != mv_list[n].as_int) {
+            best_sub8x8->as_int = mv_list[n].as_int;
             break;
           }
       }
@@ -718,7 +695,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
   VP9_COMMON *const cm = &pbi->common;
   const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-  int_mv best_ref_mvs[2];
+  int_mv best_ref_mvs[2] = { { 0 }, { 0 } };
   int ref, is_compound;
   uint8_t inter_mode_ctx;
   const POSITION *const mv_ref_search = mv_ref_blocks[bsize];
@@ -731,33 +708,12 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
     mi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
       vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Invalid usage of segement feature on small blocks");
+                         "Invalid usage of segment feature on small blocks");
       return;
     }
   } else {
     if (bsize >= BLOCK_8X8)
       mi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
-    else
-      // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV.
-      // Setting mode to NEARESTMV forces the search to stop after the nearestmv
-      // has been found. After b_modes have been read, mode will be overwritten
-      // by the last b_mode.
-      mi->mode = NEARESTMV;
-
-    if (mi->mode != ZEROMV) {
-      for (ref = 0; ref < 1 + is_compound; ++ref) {
-        int_mv tmp_mvs[MAX_MV_REF_CANDIDATES];
-        const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
-        int refmv_count;
-
-        refmv_count =
-            dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs,
-                             mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi);
-
-        dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref],
-                              refmv_count);
-      }
-    }
   }
 
   mi->interp_filter = (cm->interp_filter == SWITCHABLE)
@@ -769,6 +725,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
     const int num_4x4_h = 1 << xd->bmode_blocks_hl;
     int idx, idy;
     PREDICTION_MODE b_mode;
+    int got_mv_refs_for_new = 0;
     int_mv best_sub8x8[2];
     const uint32_t invalid_mv = 0x80008000;
     // Initialize the 2nd element as even though it won't be used meaningfully
@@ -783,12 +740,24 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
           for (ref = 0; ref < 1 + is_compound; ++ref)
             append_sub8x8_mvs_for_idx(cm, xd, mv_ref_search, b_mode, j, ref,
                                       mi_row, mi_col, &best_sub8x8[ref]);
+        } else if (b_mode == NEWMV && !got_mv_refs_for_new) {
+          for (ref = 0; ref < 1 + is_compound; ++ref) {
+            int_mv tmp_mvs[MAX_MV_REF_CANDIDATES];
+            const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+
+            dec_find_mv_refs(cm, xd, NEWMV, frame, mv_ref_search, tmp_mvs,
+                             mi_row, mi_col, -1);
+
+            lower_mv_precision(&tmp_mvs[0].as_mv, allow_hp);
+            best_ref_mvs[ref] = tmp_mvs[0];
+            got_mv_refs_for_new = 1;
+          }
         }
 
         if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs,
                        best_sub8x8, is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
-          break;
+          return;
         }
 
         if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
@@ -800,6 +769,17 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
 
     copy_mv_pair(mi->mv, mi->bmi[3].as_mv);
   } else {
+    if (mi->mode != ZEROMV) {
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        int_mv tmp_mvs[MAX_MV_REF_CANDIDATES];
+        const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+        int refmv_count =
+            dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs,
+                             mi_row, mi_col, -1);
+        lower_mv_precision(&tmp_mvs[refmv_count - 1].as_mv, allow_hp);
+        best_ref_mvs[ref] = tmp_mvs[refmv_count - 1];
+      }
+    }
     xd->corrupted |= !assign_mv(cm, xd, mi->mode, mi->mv, best_ref_mvs,
                                 best_ref_mvs, is_compound, allow_hp, r);
   }
@@ -842,13 +822,21 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
   if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis);
   } else {
+    // Cache mi->ref_frame and mi->mv so that the compiler can prove that they
+    // are constant for the duration of the loop and avoids reloading them.
+    MV_REFERENCE_FRAME mi_ref_frame[2];
+    int_mv mi_mv[2];
+
     read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
+    copy_ref_frame_pair(mi_ref_frame, mi->ref_frame);
+    copy_mv_pair(mi_mv, mi->mv);
+
     for (h = 0; h < y_mis; ++h) {
       for (w = 0; w < x_mis; ++w) {
         MV_REF *const mv = frame_mvs + w;
-        copy_ref_frame_pair(mv->ref_frame, mi->ref_frame);
-        copy_mv_pair(mv->mv, mi->mv);
+        copy_ref_frame_pair(mv->ref_frame, mi_ref_frame);
+        copy_mv_pair(mv->mv, mi_mv);
       }
       frame_mvs += cm->mi_cols;
     }
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h
index b460cb8fb1..11b45ace06 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DECODEMV_H_
-#define VP9_DECODER_VP9_DECODEMV_H_
+#ifndef VPX_VP9_DECODER_VP9_DECODEMV_H_
+#define VPX_VP9_DECODER_VP9_DECODEMV_H_
 
 #include "vpx_dsp/bitreader.h"
 
@@ -26,4 +26,4 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_DECODEMV_H_
+#endif  // VPX_VP9_DECODER_VP9_DECODEMV_H_
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
index 37693f0944..5c77df5002 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c
@@ -21,6 +21,7 @@
 #include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -55,6 +56,94 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) {
          cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
 }
 
+void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
+                              VP9_COMMON *cm, int num_sbs, int max_threads,
+                              int num_jobs) {
+  int plane;
+  const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) *
+                              sizeof(*row_mt_worker_data->dqcoeff[0]);
+  row_mt_worker_data->num_jobs = num_jobs;
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+    CHECK_MEM_ERROR(
+        &cm->error, row_mt_worker_data->recon_sync_mutex,
+        vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs));
+    if (row_mt_worker_data->recon_sync_mutex) {
+      for (i = 0; i < num_jobs; ++i) {
+        pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(
+        &cm->error, row_mt_worker_data->recon_sync_cond,
+        vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs));
+    if (row_mt_worker_data->recon_sync_cond) {
+      for (i = 0; i < num_jobs; ++i) {
+        pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL);
+      }
+    }
+  }
+#endif
+  row_mt_worker_data->num_sbs = num_sbs;
+  for (plane = 0; plane < 3; ++plane) {
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane],
+                    vpx_memalign(32, dqcoeff_size));
+    memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size);
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane],
+                    vpx_calloc(num_sbs << EOBS_PER_SB_LOG2,
+                               sizeof(*row_mt_worker_data->eob[plane])));
+  }
+  CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition,
+                  vpx_calloc(num_sbs * PARTITIONS_PER_SB,
+                             sizeof(*row_mt_worker_data->partition)));
+  CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map,
+                  vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
+
+  // allocate memory for thread_data
+  if (row_mt_worker_data->thread_data == NULL) {
+    const size_t thread_size =
+        max_threads * sizeof(*row_mt_worker_data->thread_data);
+    CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data,
+                    vpx_memalign(32, thread_size));
+  }
+}
+
+void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) {
+  if (row_mt_worker_data != NULL) {
+    int plane;
+#if CONFIG_MULTITHREAD
+    int i;
+    if (row_mt_worker_data->recon_sync_mutex != NULL) {
+      for (i = 0; i < row_mt_worker_data->num_jobs; ++i) {
+        pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]);
+      }
+      vpx_free(row_mt_worker_data->recon_sync_mutex);
+      row_mt_worker_data->recon_sync_mutex = NULL;
+    }
+    if (row_mt_worker_data->recon_sync_cond != NULL) {
+      for (i = 0; i < row_mt_worker_data->num_jobs; ++i) {
+        pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]);
+      }
+      vpx_free(row_mt_worker_data->recon_sync_cond);
+      row_mt_worker_data->recon_sync_cond = NULL;
+    }
+#endif
+    for (plane = 0; plane < 3; ++plane) {
+      vpx_free(row_mt_worker_data->eob[plane]);
+      row_mt_worker_data->eob[plane] = NULL;
+      vpx_free(row_mt_worker_data->dqcoeff[plane]);
+      row_mt_worker_data->dqcoeff[plane] = NULL;
+    }
+    vpx_free(row_mt_worker_data->partition);
+    row_mt_worker_data->partition = NULL;
+    vpx_free(row_mt_worker_data->recon_map);
+    row_mt_worker_data->recon_map = NULL;
+    vpx_free(row_mt_worker_data->thread_data);
+    row_mt_worker_data->thread_data = NULL;
+  }
+}
+
 static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
   cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
   if (!cm->mip) return 1;
@@ -65,10 +154,16 @@ static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
 }
 
 static void vp9_dec_free_mi(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
+  // MFQE allocates an additional mip and swaps it with cm->mip.
+  vpx_free(cm->postproc_state.prev_mip);
+  cm->postproc_state.prev_mip = NULL;
+#endif
   vpx_free(cm->mip);
   cm->mip = NULL;
   vpx_free(cm->mi_grid_base);
   cm->mi_grid_base = NULL;
+  cm->mi_alloc_size = 0;
 }
 
 VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
@@ -87,9 +182,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
 
   cm->error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(&cm->error, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
+      &cm->error, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
   pbi->need_resync = 1;
@@ -99,7 +195,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
   memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
   memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
 
-  cm->current_video_frame = 0;
+  init_frame_indexes(cm);
   pbi->ready_for_new_data = 1;
   pbi->common.buffer_pool = pool;
 
@@ -115,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
   cm->error.setjmp = 0;
 
   vpx_get_worker_interface()->init(&pbi->lf_worker);
+  pbi->lf_worker.thread_name = "vpx lf worker";
 
   return pbi;
 }
@@ -139,6 +236,19 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
     vp9_loop_filter_dealloc(&pbi->lf_row_sync);
   }
 
+  if (pbi->row_mt == 1) {
+    vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
+    if (pbi->row_mt_worker_data != NULL) {
+      vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq);
+      vpx_free(pbi->row_mt_worker_data->jobq_buf);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex);
+#endif
+    }
+    vpx_free(pbi->row_mt_worker_data);
+  }
+
+  vp9_remove_common(&pbi->common);
   vpx_free(pbi);
 }
 
@@ -169,7 +279,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi,
       vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                          "Incorrect buffer dimensions");
     else
-      vp8_yv12_copy_frame(cfg, sd);
+      vpx_yv12_copy_frame(cfg, sd);
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame");
   }
@@ -217,7 +327,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
                        "Incorrect buffer dimensions");
   } else {
     // Overwrite the reference frame buffer.
-    vp8_yv12_copy_frame(sd, ref_buf);
+    vpx_yv12_copy_frame(sd, ref_buf);
   }
 
   return cm->error.error_code;
@@ -230,7 +340,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
   BufferPool *const pool = cm->buffer_pool;
   RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
 
-  lock_buffer_pool(pool);
   for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
     const int old_idx = cm->ref_frame_map[ref_index];
     // Current thread releases the holding of reference frame.
@@ -250,21 +359,54 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
     decrease_ref_count(old_idx, frame_bufs, pool);
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
   }
-  unlock_buffer_pool(pool);
   pbi->hold_ref_buf = 0;
   cm->frame_to_show = get_frame_new_buffer(cm);
 
-  if (!pbi->frame_parallel_decode || !cm->show_frame) {
-    lock_buffer_pool(pool);
-    --frame_bufs[cm->new_fb_idx].ref_count;
-    unlock_buffer_pool(pool);
-  }
+  --frame_bufs[cm->new_fb_idx].ref_count;
 
   // Invalidate these references until the next frame starts.
   for (ref_index = 0; ref_index < 3; ref_index++)
     cm->frame_refs[ref_index].idx = -1;
 }
 
+static void release_fb_on_decoder_exit(VP9Decoder *pbi) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VP9_COMMON *volatile const cm = &pbi->common;
+  BufferPool *volatile const pool = cm->buffer_pool;
+  RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
+  int i;
+
+  // Synchronize all threads immediately as a subsequent decode call may
+  // cause a resize invalidating some allocations.
+  winterface->sync(&pbi->lf_worker);
+  for (i = 0; i < pbi->num_tile_workers; ++i) {
+    winterface->sync(&pbi->tile_workers[i]);
+  }
+
+  // Release all the reference buffers if worker thread is holding them.
+  if (pbi->hold_ref_buf == 1) {
+    int ref_index = 0, mask;
+    for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+      const int old_idx = cm->ref_frame_map[ref_index];
+      // Current thread releases the holding of reference frame.
+      decrease_ref_count(old_idx, frame_bufs, pool);
+
+      // Release the reference frame in reference map.
+      if (mask & 1) {
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+      ++ref_index;
+    }
+
+    // Current thread releases the holding of reference frame.
+    for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+      const int old_idx = cm->ref_frame_map[ref_index];
+      decrease_ref_count(old_idx, frame_bufs, pool);
+    }
+    pbi->hold_ref_buf = 0;
+  }
+}
+
 int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
                                 const uint8_t **psource) {
   VP9_COMMON *volatile const cm = &pbi->common;
@@ -292,14 +434,19 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
   pbi->ready_for_new_data = 0;
 
   // Check if the previous frame was a frame without any references to it.
-  // Release frame buffer if not decoding in frame parallel mode.
-  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 &&
-      frame_bufs[cm->new_fb_idx].ref_count == 0)
+  if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 &&
+      !frame_bufs[cm->new_fb_idx].released) {
     pool->release_fb_cb(pool->cb_priv,
                         &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+    frame_bufs[cm->new_fb_idx].released = 1;
+  }
+
   // Find a free frame buffer. Return error if can not find any.
   cm->new_fb_idx = get_free_fb(cm);
   if (cm->new_fb_idx == INVALID_IDX) {
+    pbi->ready_for_new_data = 1;
+    release_fb_on_decoder_exit(pbi);
+    vpx_clear_system_state();
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Unable to find free frame buffer");
     return cm->error.error_code;
@@ -309,60 +456,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
 
   pbi->hold_ref_buf = 0;
-  if (pbi->frame_parallel_decode) {
-    VPxWorker *const worker = pbi->frame_worker_owner;
-    vp9_frameworker_lock_stats(worker);
-    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
-    // Reset decoding progress.
-    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
-    pbi->cur_buf->row = -1;
-    pbi->cur_buf->col = -1;
-    vp9_frameworker_unlock_stats(worker);
-  } else {
-    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
-  }
+  pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
 
   if (setjmp(cm->error.jmp)) {
-    const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-    int i;
-
     cm->error.setjmp = 0;
     pbi->ready_for_new_data = 1;
-
-    // Synchronize all threads immediately as a subsequent decode call may
-    // cause a resize invalidating some allocations.
-    winterface->sync(&pbi->lf_worker);
-    for (i = 0; i < pbi->num_tile_workers; ++i) {
-      winterface->sync(&pbi->tile_workers[i]);
-    }
-
-    lock_buffer_pool(pool);
-    // Release all the reference buffers if worker thread is holding them.
-    if (pbi->hold_ref_buf == 1) {
-      int ref_index = 0, mask;
-      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
-        const int old_idx = cm->ref_frame_map[ref_index];
-        // Current thread releases the holding of reference frame.
-        decrease_ref_count(old_idx, frame_bufs, pool);
-
-        // Release the reference frame in reference map.
-        if (mask & 1) {
-          decrease_ref_count(old_idx, frame_bufs, pool);
-        }
-        ++ref_index;
-      }
-
-      // Current thread releases the holding of reference frame.
-      for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
-        const int old_idx = cm->ref_frame_map[ref_index];
-        decrease_ref_count(old_idx, frame_bufs, pool);
-      }
-      pbi->hold_ref_buf = 0;
-    }
+    release_fb_on_decoder_exit(pbi);
     // Release current frame.
     decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
-    unlock_buffer_pool(pool);
-
     vpx_clear_system_state();
     return -1;
   }
@@ -377,31 +478,16 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size,
   if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
     cm->prev_frame = cm->cur_frame;
-    if (cm->seg.enabled && !pbi->frame_parallel_decode)
-      vp9_swap_current_and_last_seg_map(cm);
+    if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm);
   }
 
-  // Update progress in frame parallel decode.
-  if (pbi->frame_parallel_decode) {
-    // Need to lock the mutex here as another thread may
-    // be accessing this buffer.
-    VPxWorker *const worker = pbi->frame_worker_owner;
-    FrameWorkerData *const frame_worker_data = worker->data1;
-    vp9_frameworker_lock_stats(worker);
+  if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx;
 
-    if (cm->show_frame) {
-      cm->current_video_frame++;
-    }
-    frame_worker_data->frame_decoded = 1;
-    frame_worker_data->frame_context_ready = 1;
-    vp9_frameworker_signal_stats(worker);
-    vp9_frameworker_unlock_stats(worker);
-  } else {
-    cm->last_width = cm->width;
-    cm->last_height = cm->height;
-    if (cm->show_frame) {
-      cm->current_video_frame++;
-    }
+  // Update progress in frame parallel decode.
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+  if (cm->show_frame) {
+    cm->current_video_frame++;
   }
 
   cm->error.setjmp = 0;
@@ -427,7 +513,7 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
 
 #if CONFIG_VP9_POSTPROC
   if (!cm->show_existing_frame) {
-    ret = vp9_post_proc_frame(cm, sd, flags);
+    ret = vp9_post_proc_frame(cm, sd, flags, cm->width);
   } else {
     *sd = *cm->frame_to_show;
     ret = 0;
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
index 427baf1e0b..b3ee4eab5f 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h
@@ -8,25 +8,38 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DECODER_H_
-#define VP9_DECODER_VP9_DECODER_H_
+#ifndef VPX_VP9_DECODER_VP9_DECODER_H_
+#define VPX_VP9_DECODER_VP9_DECODER_H_
 
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
 #include "vpx_dsp/bitreader.h"
 #include "vpx_scale/yv12config.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
-#include "vp9/decoder/vp9_dthread.h"
+#include "./vp9_job_queue.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define EOBS_PER_SB_LOG2 8
+#define DQCOEFFS_PER_SB_LOG2 12
+#define PARTITIONS_PER_SB 85
+
+typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType;
+
+typedef struct ThreadData {
+  struct VP9Decoder *pbi;
+  LFWorkerData *lf_data;
+  VP9LfSync *lf_sync;
+} ThreadData;
+
 typedef struct TileBuffer {
   const uint8_t *data;
   size_t size;
@@ -38,12 +51,47 @@ typedef struct TileWorkerData {
   int buf_start, buf_end;  // pbi->tile_buffers to decode, inclusive
   vpx_reader bit_reader;
   FRAME_COUNTS counts;
+  LFWorkerData *lf_data;
+  VP9LfSync *lf_sync;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+  DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
 
+typedef void (*process_block_fn_t)(TileWorkerData *twd,
+                                   struct VP9Decoder *const pbi, int mi_row,
+                                   int mi_col, BLOCK_SIZE bsize, int bwl,
+                                   int bhl);
+
+typedef struct RowMTWorkerData {
+  int num_sbs;
+  int *eob[MAX_MB_PLANE];
+  PARTITION_TYPE *partition;
+  tran_low_t *dqcoeff[MAX_MB_PLANE];
+  int8_t *recon_map;
+  const uint8_t *data_end;
+  uint8_t *jobq_buf;
+  JobQueueRowMt jobq;
+  size_t jobq_size;
+  int num_tiles_done;
+  int num_jobs;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t recon_done_mutex;
+  pthread_mutex_t *recon_sync_mutex;
+  pthread_cond_t *recon_sync_cond;
+#endif
+  ThreadData *thread_data;
+} RowMTWorkerData;
+
+/* Structure to queue and dequeue row decode jobs */
+typedef struct Job {
+  int row_num;
+  int tile_col;
+  JobType job_type;
+} Job;
+
 typedef struct VP9Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
@@ -53,13 +101,10 @@ typedef struct VP9Decoder {
 
   int refresh_frame_flags;
 
-  int frame_parallel_decode;  // frame-based threading.
-
   // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
   // the same.
   RefCntBuffer *cur_buf;  //  Current decoding frame buffer.
 
-  VPxWorker *frame_worker_owner;  // frame_worker that owns this pbi.
   VPxWorker lf_worker;
   VPxWorker *tile_workers;
   TileWorkerData *tile_worker_data;
@@ -76,10 +121,14 @@ typedef struct VP9Decoder {
   int inv_tile_order;
   int need_resync;   // wait for key/intra-only frame.
   int hold_ref_buf;  // hold the reference buffer.
+
+  int row_mt;
+  int lpf_mt_opt;
+  RowMTWorkerData *row_mt_worker_data;
 } VP9Decoder;
 
 int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size,
-                                const uint8_t **dest);
+                                const uint8_t **psource);
 
 int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
                       vp9_ppflags_t *flags);
@@ -113,6 +162,11 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
 
 void vp9_decoder_remove(struct VP9Decoder *pbi);
 
+void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
+                              VP9_COMMON *cm, int num_sbs, int max_threads,
+                              int num_jobs);
+void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data);
+
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                       BufferPool *const pool) {
   if (idx >= 0 && frame_bufs[idx].ref_count > 0) {
@@ -121,9 +175,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
     // But the private buffer is not set up until finish decoding header.
     // So any error happens during decoding header, the frame_bufs will not
     // have valid priv buffer.
-    if (frame_bufs[idx].ref_count == 0 &&
+    if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 &&
         frame_bufs[idx].raw_frame_buffer.priv) {
       pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+      frame_bufs[idx].released = 1;
     }
   }
 }
@@ -132,4 +187,4 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_DECODER_H_
+#endif  // VPX_VP9_DECODER_VP9_DECODER_H_
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c
index a441f3addc..d957dc34e3 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c
@@ -33,6 +33,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value,
                             int *count, unsigned int *range) {
   const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT;
   const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
+#if CONFIG_BITSTREAM_DEBUG
+  const int queue_r = bitstream_queue_get_read();
+  const int frame_idx = bitstream_queue_get_frame_read();
+  int ref_result, ref_prob;
+  bitstream_queue_pop(&ref_result, &ref_prob);
+  if (prob != ref_prob) {
+    fprintf(stderr,
+            "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d "
+            "queue_r %d\n",
+            frame_idx, prob, ref_prob, queue_r);
+
+    assert(0);
+  }
+#endif
 
   if (*count < 0) {
     r->value = *value;
@@ -51,6 +65,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value,
       *value <<= shift;
       *count -= shift;
     }
+#if CONFIG_BITSTREAM_DEBUG
+    {
+      const int bit = 1;
+      if (bit != ref_result) {
+        fprintf(
+            stderr,
+            "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d "
+            "queue_r %d\n",
+            frame_idx, bit, ref_result, queue_r);
+
+        assert(0);
+      }
+    }
+#endif
     return 1;
   }
   *range = split;
@@ -60,6 +88,19 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value,
     *value <<= shift;
     *count -= shift;
   }
+#if CONFIG_BITSTREAM_DEBUG
+  {
+    const int bit = 0;
+    if (bit != ref_result) {
+      fprintf(stderr,
+              "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d "
+              "queue_r %d\n",
+              frame_idx, bit, ref_result, queue_r);
+
+      assert(0);
+    }
+  }
+#endif
   return 0;
 }
 
@@ -92,16 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
   int16_t dqv = dq[0];
   const uint8_t *const cat6_prob =
 #if CONFIG_VP9_HIGHBITDEPTH
-      (xd->bd == VPX_BITS_12)
-          ? vp9_cat6_prob_high12
-          : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 :
+      (xd->bd == VPX_BITS_12)   ? vp9_cat6_prob_high12
+      : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2
+                                :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-                                    vp9_cat6_prob;
+                                vp9_cat6_prob;
   const int cat6_bits =
 #if CONFIG_VP9_HIGHBITDEPTH
-      (xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 :
+      (xd->bd == VPX_BITS_12)   ? 18
+      : (xd->bd == VPX_BITS_10) ? 16
+                                :
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-                                                             14;
+                                14;
   // Keep value, range, and count as locals.  The compiler produces better
   // results with the locals than using r directly.
   BD_VALUE value = r->value;
@@ -201,9 +244,9 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
     if (read_bool(r, 128, &value, &count, &range)) {
-      dqcoeff[scan[c]] = -v;
+      dqcoeff[scan[c]] = (tran_low_t)-v;
     } else {
-      dqcoeff[scan[c]] = v;
+      dqcoeff[scan[c]] = (tran_low_t)v;
     }
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     ++c;
@@ -229,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
   }
 }
 
-int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
-                            const scan_order *sc, int x, int y, TX_SIZE tx_size,
-                            int seg_id) {
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc,
+                            int x, int y, TX_SIZE tx_size, int seg_id) {
   vpx_reader *r = &twd->bit_reader;
   MACROBLOCKD *xd = &twd->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h
index 7b0d876016..a8e47021b8 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DETOKENIZE_H_
-#define VP9_DECODER_VP9_DETOKENIZE_H_
+#ifndef VPX_VP9_DECODER_VP9_DETOKENIZE_H_
+#define VPX_VP9_DECODER_VP9_DETOKENIZE_H_
 
 #include "vpx_dsp/bitreader.h"
 #include "vp9/decoder/vp9_decoder.h"
@@ -19,12 +19,11 @@
 extern "C" {
 #endif
 
-int vp9_decode_block_tokens(TileWorkerData *twd, int plane,
-                            const scan_order *sc, int x, int y, TX_SIZE tx_size,
-                            int seg_id);
+int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc,
+                            int x, int y, TX_SIZE tx_size, int seg_id);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_DETOKENIZE_H_
+#endif  // VPX_VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h
index 5a8ec8300c..b0c7750736 100644
--- a/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_DECODER_VP9_DSUBEXP_H_
-#define VP9_DECODER_VP9_DSUBEXP_H_
+#ifndef VPX_VP9_DECODER_VP9_DSUBEXP_H_
+#define VPX_VP9_DECODER_VP9_DSUBEXP_H_
 
 #include "vpx_dsp/bitreader.h"
 
@@ -23,4 +23,4 @@ void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p);
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_DSUBEXP_H_
+#endif  // VPX_VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.c b/media/libvpx/libvpx/vp9/decoder/vp9_dthread.c
deleted file mode 100644
index 52bc2a0f60..0000000000
--- a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/decoder/vp9_dthread.h"
-#include "vp9/decoder/vp9_decoder.h"
-
-// #define DEBUG_THREAD
-
-// TODO(hkuang): Clean up all the #ifdef in this file.
-void vp9_frameworker_lock_stats(VPxWorker *const worker) {
-#if CONFIG_MULTITHREAD
-  FrameWorkerData *const worker_data = worker->data1;
-  pthread_mutex_lock(&worker_data->stats_mutex);
-#else
-  (void)worker;
-#endif
-}
-
-void vp9_frameworker_unlock_stats(VPxWorker *const worker) {
-#if CONFIG_MULTITHREAD
-  FrameWorkerData *const worker_data = worker->data1;
-  pthread_mutex_unlock(&worker_data->stats_mutex);
-#else
-  (void)worker;
-#endif
-}
-
-void vp9_frameworker_signal_stats(VPxWorker *const worker) {
-#if CONFIG_MULTITHREAD
-  FrameWorkerData *const worker_data = worker->data1;
-
-// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-  pthread_cond_signal(&worker_data->stats_cond);
-#else
-  pthread_cond_broadcast(&worker_data->stats_cond);
-#endif
-
-#else
-  (void)worker;
-#endif
-}
-
-// This macro prevents thread_sanitizer from reporting known concurrent writes.
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define BUILDING_WITH_TSAN
-#endif
-#endif
-
-// TODO(hkuang): Remove worker parameter as it is only used in debug code.
-void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
-                          int row) {
-#if CONFIG_MULTITHREAD
-  if (!ref_buf) return;
-
-#ifndef BUILDING_WITH_TSAN
-  // The following line of code will get harmless tsan error but it is the key
-  // to get best performance.
-  if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
-#endif
-
-  {
-    // Find the worker thread that owns the reference frame. If the reference
-    // frame has been fully decoded, it may not have owner.
-    VPxWorker *const ref_worker = ref_buf->frame_worker_owner;
-    FrameWorkerData *const ref_worker_data =
-        (FrameWorkerData *)ref_worker->data1;
-    const VP9Decoder *const pbi = ref_worker_data->pbi;
-
-#ifdef DEBUG_THREAD
-    {
-      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-      printf("%d %p worker is waiting for %d %p worker (%d)  ref %d \r\n",
-             worker_data->worker_id, worker, ref_worker_data->worker_id,
-             ref_buf->frame_worker_owner, row, ref_buf->row);
-    }
-#endif
-
-    vp9_frameworker_lock_stats(ref_worker);
-    while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
-           ref_buf->buf.corrupted != 1) {
-      pthread_cond_wait(&ref_worker_data->stats_cond,
-                        &ref_worker_data->stats_mutex);
-    }
-
-    if (ref_buf->buf.corrupted == 1) {
-      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-      vp9_frameworker_unlock_stats(ref_worker);
-      vpx_internal_error(&worker_data->pbi->common.error,
-                         VPX_CODEC_CORRUPT_FRAME,
-                         "Worker %p failed to decode frame", worker);
-    }
-    vp9_frameworker_unlock_stats(ref_worker);
-  }
-#else
-  (void)worker;
-  (void)ref_buf;
-  (void)row;
-  (void)ref_buf;
-#endif  // CONFIG_MULTITHREAD
-}
-
-void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
-#if CONFIG_MULTITHREAD
-  VPxWorker *worker = buf->frame_worker_owner;
-
-#ifdef DEBUG_THREAD
-  {
-    FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
-    printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
-           buf->frame_worker_owner, row);
-  }
-#endif
-
-  vp9_frameworker_lock_stats(worker);
-  buf->row = row;
-  vp9_frameworker_signal_stats(worker);
-  vp9_frameworker_unlock_stats(worker);
-#else
-  (void)buf;
-  (void)row;
-#endif  // CONFIG_MULTITHREAD
-}
-
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
-                                  VPxWorker *const src_worker) {
-#if CONFIG_MULTITHREAD
-  FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
-  FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
-  VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
-  VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
-  int i;
-
-  // Wait until source frame's context is ready.
-  vp9_frameworker_lock_stats(src_worker);
-  while (!src_worker_data->frame_context_ready) {
-    pthread_cond_wait(&src_worker_data->stats_cond,
-                      &src_worker_data->stats_mutex);
-  }
-
-  dst_cm->last_frame_seg_map = src_cm->seg.enabled
-                                   ? src_cm->current_frame_seg_map
-                                   : src_cm->last_frame_seg_map;
-  dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
-  vp9_frameworker_unlock_stats(src_worker);
-
-  dst_cm->bit_depth = src_cm->bit_depth;
-#if CONFIG_VP9_HIGHBITDEPTH
-  dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
-#endif
-  dst_cm->prev_frame =
-      src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
-  dst_cm->last_width =
-      !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
-  dst_cm->last_height =
-      !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
-  dst_cm->subsampling_x = src_cm->subsampling_x;
-  dst_cm->subsampling_y = src_cm->subsampling_y;
-  dst_cm->frame_type = src_cm->frame_type;
-  dst_cm->last_show_frame = !src_cm->show_existing_frame
-                                ? src_cm->show_frame
-                                : src_cm->last_show_frame;
-  for (i = 0; i < REF_FRAMES; ++i)
-    dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
-
-  memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
-         (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
-  dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
-  dst_cm->lf.filter_level = src_cm->lf.filter_level;
-  memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
-  memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
-  dst_cm->seg = src_cm->seg;
-  memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
-         FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
-#else
-  (void)dst_worker;
-  (void)src_worker;
-#endif  // CONFIG_MULTITHREAD
-}
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.h b/media/libvpx/libvpx/vp9/decoder/vp9_dthread.h
deleted file mode 100644
index fce0fe7fe3..0000000000
--- a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_DTHREAD_H_
-#define VP9_DECODER_VP9_DTHREAD_H_
-
-#include "./vpx_config.h"
-#include "vpx_util/vpx_thread.h"
-#include "vpx/internal/vpx_codec_internal.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct VP9Common;
-struct VP9Decoder;
-
-// WorkerData for the FrameWorker thread. It contains all the information of
-// the worker and decode structures for decoding a frame.
-typedef struct FrameWorkerData {
-  struct VP9Decoder *pbi;
-  const uint8_t *data;
-  const uint8_t *data_end;
-  size_t data_size;
-  void *user_priv;
-  int result;
-  int worker_id;
-  int received_frame;
-
-  // scratch_buffer is used in frame parallel mode only.
-  // It is used to make a copy of the compressed data.
-  uint8_t *scratch_buffer;
-  size_t scratch_buffer_size;
-
-#if CONFIG_MULTITHREAD
-  pthread_mutex_t stats_mutex;
-  pthread_cond_t stats_cond;
-#endif
-
-  int frame_context_ready;  // Current frame's context is ready to read.
-  int frame_decoded;        // Finished decoding current frame.
-} FrameWorkerData;
-
-void vp9_frameworker_lock_stats(VPxWorker *const worker);
-void vp9_frameworker_unlock_stats(VPxWorker *const worker);
-void vp9_frameworker_signal_stats(VPxWorker *const worker);
-
-// Wait until ref_buf has been decoded to row in real pixel unit.
-// Note: worker may already finish decoding ref_buf and release it in order to
-// start decoding next frame. So need to check whether worker is still decoding
-// ref_buf.
-void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf,
-                          int row);
-
-// FrameWorker broadcasts its decoding progress so other workers that are
-// waiting on it can resume decoding.
-void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
-
-// Copy necessary decoding context from src worker to dst worker.
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker,
-                                  VPxWorker *const src_worker);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
new file mode 100644
index 0000000000..926ae87739
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c
@@ -0,0 +1,125 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_util/vpx_pthread.h"
+
+#include "vp9/decoder/vp9_job_queue.h"
+
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_init(&jobq->mutex, NULL);
+  pthread_cond_init(&jobq->cond, NULL);
+#endif
+  jobq->buf_base = buf;
+  jobq->buf_wr = buf;
+  jobq->buf_rd = buf;
+  jobq->buf_end = buf + buf_size;
+  jobq->terminate = 0;
+}
+
+void vp9_jobq_reset(JobQueueRowMt *jobq) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  jobq->buf_wr = jobq->buf_base;
+  jobq->buf_rd = jobq->buf_base;
+  jobq->terminate = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+}
+
+void vp9_jobq_deinit(JobQueueRowMt *jobq) {
+  vp9_jobq_reset(jobq);
+#if CONFIG_MULTITHREAD
+  pthread_mutex_destroy(&jobq->mutex);
+  pthread_cond_destroy(&jobq->cond);
+#endif
+}
+
+void vp9_jobq_terminate(JobQueueRowMt *jobq) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  jobq->terminate = 1;
+#if CONFIG_MULTITHREAD
+  pthread_cond_broadcast(&jobq->cond);
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+}
+
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) {
+  int ret = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  if (jobq->buf_end >= jobq->buf_wr + job_size) {
+    memcpy(jobq->buf_wr, job, job_size);
+    jobq->buf_wr = jobq->buf_wr + job_size;
+#if CONFIG_MULTITHREAD
+    pthread_cond_signal(&jobq->cond);
+#endif
+    ret = 0;
+  } else {
+    /* Wrap around case is not supported */
+    assert(0);
+    ret = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+  return ret;
+}
+
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+                     int blocking) {
+  int ret = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  if (jobq->buf_end >= jobq->buf_rd + job_size) {
+    while (1) {
+      if (jobq->buf_wr >= jobq->buf_rd + job_size) {
+        memcpy(job, jobq->buf_rd, job_size);
+        jobq->buf_rd = jobq->buf_rd + job_size;
+        ret = 0;
+        break;
+      } else {
+        /* If all the entries have been dequeued, then break and return */
+        if (jobq->terminate == 1) {
+          ret = 1;
+          break;
+        }
+        if (blocking == 1) {
+#if CONFIG_MULTITHREAD
+          pthread_cond_wait(&jobq->cond, &jobq->mutex);
+#endif
+        } else {
+          /* If there is no job available,
+           * and this is non blocking call then return fail */
+          ret = 1;
+          break;
+        }
+      }
+    }
+  } else {
+    /* Wrap around case is not supported */
+    ret = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+
+  return ret;
+}
diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
new file mode 100644
index 0000000000..59f71fb9ba
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+
+#include "vpx_util/vpx_pthread.h"
+
+typedef struct {
+  // Pointer to buffer base which contains the jobs
+  uint8_t *buf_base;
+
+  // Pointer to current address where new job can be added
+  uint8_t *volatile buf_wr;
+
+  // Pointer to current address from where next job can be obtained
+  uint8_t *volatile buf_rd;
+
+  // Pointer to end of job buffer
+  uint8_t *buf_end;
+
+  int terminate;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+#endif
+} JobQueueRowMt;
+
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size);
+void vp9_jobq_reset(JobQueueRowMt *jobq);
+void vp9_jobq_deinit(JobQueueRowMt *jobq);
+void vp9_jobq_terminate(JobQueueRowMt *jobq);
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size);
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+                     int blocking);
+
+#endif  // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
index afffc77178..997b5477e1 100644
--- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -10,26 +10,2164 @@
 
 #include <arm_neon.h>
 
-#include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 
-#include "vp9/common/vp9_blockd.h"
 #include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
 
-void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
-                            int16_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                            int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
-  int16_t temp_buffer[64];
-  (void)coeff_ptr;
+static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
+                                   int stride) {
+  // { 0, 1, 1, 1 };
+  const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3);
+  // { 1, 0, 0, 0 };
+  const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3);
+  int16x4_t mask;
 
-  vpx_fdct8x8_neon(input, temp_buffer, stride);
-  vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
-                       quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
-                       dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by
+  // one non-zero first elements
+  mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a));
+  input_0 = vadd_s16(input_0, mask);
+  input_0 = vadd_s16(input_0, nonzero_bias_b);
+
+  in[0] = vcombine_s16(input_0, input_1);
+  in[1] = vcombine_s16(input_2, input_3);
 }
+
+static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
+  const int16x8_t one_s16 = vdupq_n_s16(1);
+  res[0] = vaddq_s16(res[0], one_s16);
+  res[1] = vaddq_s16(res[1], one_s16);
+  res[0] = vshrq_n_s16(res[0], 2);
+  res[1] = vshrq_n_s16(res[1], 2);
+  store_s16q_to_tran_low(output + 0 * 8, res[0]);
+  store_s16q_to_tran_low(output + 1 * 8, res[1]);
+}
+
+static INLINE void fadst4x4_neon(int16x8_t *in) {
+  int32x4_t u[4], t[4];
+  int16x4_t s[4], out[4];
+
+  s[0] = vget_low_s16(in[0]);   // | x_00 | x_01 | x_02 | x_03 |
+  s[1] = vget_high_s16(in[0]);  // | x_10 | x_11 | x_12 | x_13 |
+  s[2] = vget_low_s16(in[1]);   // | x_20 | x_21 | x_22 | x_23 |
+  s[3] = vget_high_s16(in[1]);  // | x_30 | x_31 | x_32 | x_33 |
+
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+  t[0] = vmull_n_s16(s[0], sinpi_1_9);
+  t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9);
+  t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9);
+
+  // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+  t[1] = vmull_n_s16(s[0], sinpi_3_9);
+  t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9);
+  t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9);
+
+  // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9
+  t[2] = vmull_n_s16(s[0], sinpi_4_9);
+  t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9);
+  t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9);
+
+  // t3 = s2 * sinpi_3_9
+  t[3] = vmull_n_s16(s[2], sinpi_3_9);
+
+  /*
+   * u0 = t0 + t3
+   * u1 = t1
+   * u2 = t2 - t3
+   * u3 = t2 - t0 + t3
+   */
+  u[0] = vaddq_s32(t[0], t[3]);
+  u[1] = t[1];
+  u[2] = vsubq_s32(t[2], t[3]);
+  u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]);
+
+  // fdct_round_shift
+  out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS);
+  out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS);
+  out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS);
+  out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = vcombine_s16(out[0], out[1]);
+  in[1] = vcombine_s16(out[2], out[3]);
+}
+
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  int16x8_t in[2];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, stride);
+      fadst4x4_neon(in);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in, stride);
+      // pass1 variant is not accurate enough
+      vpx_fdct4x4_pass2_neon((int16x4_t *)in);
+      fadst4x4_neon(in);
+      write_buffer_4x4(output, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_4x4(input, in, stride);
+      fadst4x4_neon(in);
+      fadst4x4_neon(in);
+      write_buffer_4x4(output, in);
+      break;
+  }
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in,
+                                   int stride) {
+  in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 15).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
+static INLINE void right_shift_8x8(int16x8_t *res, const int bit) {
+  int16x8_t sign0 = vshrq_n_s16(res[0], 15);
+  int16x8_t sign1 = vshrq_n_s16(res[1], 15);
+  int16x8_t sign2 = vshrq_n_s16(res[2], 15);
+  int16x8_t sign3 = vshrq_n_s16(res[3], 15);
+  int16x8_t sign4 = vshrq_n_s16(res[4], 15);
+  int16x8_t sign5 = vshrq_n_s16(res[5], 15);
+  int16x8_t sign6 = vshrq_n_s16(res[6], 15);
+  int16x8_t sign7 = vshrq_n_s16(res[7], 15);
+
+  if (bit == 2) {
+    const int16x8_t const_rounding = vdupq_n_s16(1);
+    res[0] = vaddq_s16(res[0], const_rounding);
+    res[1] = vaddq_s16(res[1], const_rounding);
+    res[2] = vaddq_s16(res[2], const_rounding);
+    res[3] = vaddq_s16(res[3], const_rounding);
+    res[4] = vaddq_s16(res[4], const_rounding);
+    res[5] = vaddq_s16(res[5], const_rounding);
+    res[6] = vaddq_s16(res[6], const_rounding);
+    res[7] = vaddq_s16(res[7], const_rounding);
+  }
+
+  res[0] = vsubq_s16(res[0], sign0);
+  res[1] = vsubq_s16(res[1], sign1);
+  res[2] = vsubq_s16(res[2], sign2);
+  res[3] = vsubq_s16(res[3], sign3);
+  res[4] = vsubq_s16(res[4], sign4);
+  res[5] = vsubq_s16(res[5], sign5);
+  res[6] = vsubq_s16(res[6], sign6);
+  res[7] = vsubq_s16(res[7], sign7);
+
+  if (bit == 1) {
+    res[0] = vshrq_n_s16(res[0], 1);
+    res[1] = vshrq_n_s16(res[1], 1);
+    res[2] = vshrq_n_s16(res[2], 1);
+    res[3] = vshrq_n_s16(res[3], 1);
+    res[4] = vshrq_n_s16(res[4], 1);
+    res[5] = vshrq_n_s16(res[5], 1);
+    res[6] = vshrq_n_s16(res[6], 1);
+    res[7] = vshrq_n_s16(res[7], 1);
+  } else {
+    res[0] = vshrq_n_s16(res[0], 2);
+    res[1] = vshrq_n_s16(res[1], 2);
+    res[2] = vshrq_n_s16(res[2], 2);
+    res[3] = vshrq_n_s16(res[3], 2);
+    res[4] = vshrq_n_s16(res[4], 2);
+    res[5] = vshrq_n_s16(res[5], 2);
+    res[6] = vshrq_n_s16(res[6], 2);
+    res[7] = vshrq_n_s16(res[7], 2);
+  }
+}
+
+static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
+                                    int stride) {
+  store_s16q_to_tran_low(output + 0 * stride, res[0]);
+  store_s16q_to_tran_low(output + 1 * stride, res[1]);
+  store_s16q_to_tran_low(output + 2 * stride, res[2]);
+  store_s16q_to_tran_low(output + 3 * stride, res[3]);
+  store_s16q_to_tran_low(output + 4 * stride, res[4]);
+  store_s16q_to_tran_low(output + 5 * stride, res[5]);
+  store_s16q_to_tran_low(output + 6 * stride, res[6]);
+  store_s16q_to_tran_low(output + 7 * stride, res[7]);
+}
+
+static INLINE void fadst8x8_neon(int16x8_t *in) {
+  int16x4_t x_lo[8], x_hi[8];
+  int32x4_t s_lo[8], s_hi[8];
+  int32x4_t t_lo[8], t_hi[8];
+
+  x_lo[0] = vget_low_s16(in[7]);
+  x_hi[0] = vget_high_s16(in[7]);
+  x_lo[1] = vget_low_s16(in[0]);
+  x_hi[1] = vget_high_s16(in[0]);
+  x_lo[2] = vget_low_s16(in[5]);
+  x_hi[2] = vget_high_s16(in[5]);
+  x_lo[3] = vget_low_s16(in[2]);
+  x_hi[3] = vget_high_s16(in[2]);
+  x_lo[4] = vget_low_s16(in[3]);
+  x_hi[4] = vget_high_s16(in[3]);
+  x_lo[5] = vget_low_s16(in[4]);
+  x_hi[5] = vget_high_s16(in[4]);
+  x_lo[6] = vget_low_s16(in[1]);
+  x_hi[6] = vget_high_s16(in[1]);
+  x_lo[7] = vget_low_s16(in[6]);
+  x_hi[7] = vget_high_s16(in[6]);
+
+  // stage 1
+  // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+                                      cospi_2_64, cospi_30_64, &s_lo[0],
+                                      &s_hi[0], &s_lo[1], &s_hi[1]);
+
+  // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+                                      cospi_10_64, cospi_22_64, &s_lo[2],
+                                      &s_hi[2], &s_lo[3], &s_hi[3]);
+
+  // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+                                      cospi_18_64, cospi_14_64, &s_lo[4],
+                                      &s_hi[4], &s_lo[5], &s_hi[5]);
+
+  // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+  butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+                                      cospi_26_64, cospi_6_64, &s_lo[6],
+                                      &s_hi[6], &s_lo[7], &s_hi[7]);
+
+  // fdct_round_shift
+  t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+  t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+  t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+  t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+  t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+  t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+  t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+  t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+  t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+  t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+  t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+  t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+                                  cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+                                  &s_lo[5], &s_hi[5]);
+
+  // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+  butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+                                  -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6],
+                                  &s_lo[7], &s_hi[7]);
+
+  // fdct_round_shift
+  // s0 + s2
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+  // s0 - s2
+  t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+  t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+  t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+  // s4 + s6
+  t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
+  // s5 + s7
+  t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
+  // s4 - s6
+  t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
+  // s5 - s7
+  t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
+
+  // stage 3
+  // cospi_16_64 * (x2 + x3)
+  // cospi_16_64 * (x2 - x3)
+  butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3],
+                                  cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+                                  &s_hi[3]);
+
+  // cospi_16_64 * (x6 + x7)
+  // cospi_16_64 * (x2 - x3)
+  butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+                                  cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+                                  &s_hi[7]);
+
+  // final fdct_round_shift
+  x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+  x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+  x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+  x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+  x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+  x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+  x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+  x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+
+  // x0, x1, x4, x5 narrow down to 16-bits directly
+  x_lo[0] = vmovn_s32(t_lo[0]);
+  x_hi[0] = vmovn_s32(t_hi[0]);
+  x_lo[1] = vmovn_s32(t_lo[1]);
+  x_hi[1] = vmovn_s32(t_hi[1]);
+  x_lo[4] = vmovn_s32(t_lo[4]);
+  x_hi[4] = vmovn_s32(t_hi[4]);
+  x_lo[5] = vmovn_s32(t_lo[5]);
+  x_hi[5] = vmovn_s32(t_hi[5]);
+
+  in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+  in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+  in[2] = vcombine_s16(x_lo[6], x_hi[6]);
+  in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2]));
+  in[4] = vcombine_s16(x_lo[3], x_hi[3]);
+  in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7]));
+  in[6] = vcombine_s16(x_lo[5], x_hi[5]);
+  in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+}
+
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  int16x8_t in[8];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride);
+      fadst8x8_neon(in);
+      // pass1 variant is not accurate enough
+      vpx_fdct8x8_pass2_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride);
+      // pass1 variant is not accurate enough
+      vpx_fdct8x8_pass2_neon(in);
+      fadst8x8_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_8x8(input, in, stride);
+      fadst8x8_neon(in);
+      fadst8x8_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+  }
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0,
+                                     int16x8_t *in1, int stride) {
+  // load first 8 columns
+  load_buffer_8x8(input, in0, stride);
+  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  load_buffer_8x8(input, in1, stride);
+  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0,
+                                      int16x8_t *in1, int stride) {
+  // write first 8 columns
+  write_buffer_8x8(output, in0, stride);
+  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+
+  // write second 8 columns
+  output += 8;
+  write_buffer_8x8(output, in1, stride);
+  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) {
+  // perform rounding operations
+  right_shift_8x8(res0, 2);
+  right_shift_8x8(res0 + 8, 2);
+  right_shift_8x8(res1, 2);
+  right_shift_8x8(res1 + 8, 2);
+}
+
+static void fdct16_8col(int16x8_t *in) {
+  // perform 16x16 1-D DCT for 8 columns
+  int16x8_t i[8], s1[8], s2[8], s3[8], t[8];
+  int16x4_t t_lo[8], t_hi[8];
+  int32x4_t u_lo[8], u_hi[8];
+
+  // stage 1
+  i[0] = vaddq_s16(in[0], in[15]);
+  i[1] = vaddq_s16(in[1], in[14]);
+  i[2] = vaddq_s16(in[2], in[13]);
+  i[3] = vaddq_s16(in[3], in[12]);
+  i[4] = vaddq_s16(in[4], in[11]);
+  i[5] = vaddq_s16(in[5], in[10]);
+  i[6] = vaddq_s16(in[6], in[9]);
+  i[7] = vaddq_s16(in[7], in[8]);
+
+  // pass1 variant is not accurate enough
+  vpx_fdct8x8_pass2_neon(i);
+  transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
+
+  // step 2
+  s1[0] = vsubq_s16(in[7], in[8]);
+  s1[1] = vsubq_s16(in[6], in[9]);
+  s1[2] = vsubq_s16(in[5], in[10]);
+  s1[3] = vsubq_s16(in[4], in[11]);
+  s1[4] = vsubq_s16(in[3], in[12]);
+  s1[5] = vsubq_s16(in[2], in[13]);
+  s1[6] = vsubq_s16(in[1], in[14]);
+  s1[7] = vsubq_s16(in[0], in[15]);
+
+  t[2] = vsubq_s16(s1[5], s1[2]);
+  t[3] = vsubq_s16(s1[4], s1[3]);
+  t[4] = vaddq_s16(s1[4], s1[3]);
+  t[5] = vaddq_s16(s1[5], s1[2]);
+
+  t_lo[2] = vget_low_s16(t[2]);
+  t_hi[2] = vget_high_s16(t[2]);
+  t_lo[3] = vget_low_s16(t[3]);
+  t_hi[3] = vget_high_s16(t[3]);
+  t_lo[4] = vget_low_s16(t[4]);
+  t_hi[4] = vget_high_s16(t[4]);
+  t_lo[5] = vget_low_s16(t[5]);
+  t_hi[5] = vget_high_s16(t[5]);
+
+  u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64);
+  u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64);
+  u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64);
+  u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64);
+  u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64);
+  u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64);
+  u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64);
+  u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64);
+
+  t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+
+  s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+  s2[3] = vcombine_s16(t_lo[3], t_hi[3]);
+  s2[4] = vcombine_s16(t_lo[4], t_hi[4]);
+  s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+
+  // step 3
+  s3[0] = vaddq_s16(s1[0], s2[3]);
+  s3[1] = vaddq_s16(s1[1], s2[2]);
+  s3[2] = vsubq_s16(s1[1], s2[2]);
+  s3[3] = vsubq_s16(s1[0], s2[3]);
+  s3[4] = vsubq_s16(s1[7], s2[4]);
+  s3[5] = vsubq_s16(s1[6], s2[5]);
+  s3[6] = vaddq_s16(s1[6], s2[5]);
+  s3[7] = vaddq_s16(s1[7], s2[4]);
+
+  // step 4
+  t_lo[0] = vget_low_s16(s3[0]);
+  t_hi[0] = vget_high_s16(s3[0]);
+  t_lo[1] = vget_low_s16(s3[1]);
+  t_hi[1] = vget_high_s16(s3[1]);
+  t_lo[2] = vget_low_s16(s3[2]);
+  t_hi[2] = vget_high_s16(s3[2]);
+  t_lo[3] = vget_low_s16(s3[3]);
+  t_hi[3] = vget_high_s16(s3[3]);
+  t_lo[4] = vget_low_s16(s3[4]);
+  t_hi[4] = vget_high_s16(s3[4]);
+  t_lo[5] = vget_low_s16(s3[5]);
+  t_hi[5] = vget_high_s16(s3[5]);
+  t_lo[6] = vget_low_s16(s3[6]);
+  t_hi[6] = vget_high_s16(s3[6]);
+  t_lo[7] = vget_low_s16(s3[7]);
+  t_hi[7] = vget_high_s16(s3[7]);
+
+  // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6]
+  // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6]
+  butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6],
+                                      -cospi_8_64, cospi_24_64, &u_lo[1],
+                                      &u_hi[1], &u_lo[6], &u_hi[6]);
+
+  // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2]
+  // u[2] = cospi_8_64 * t[5]   + cospi_24_64 * t[2]
+  butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+                                      -cospi_24_64, cospi_8_64, &u_lo[5],
+                                      &u_hi[5], &u_lo[2], &u_hi[2]);
+
+  t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+
+  s2[1] = vcombine_s16(t_lo[1], t_hi[1]);
+  s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+  s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+  s2[6] = vcombine_s16(t_lo[6], t_hi[6]);
+
+  // step 5
+  s1[0] = vaddq_s16(s3[0], s2[1]);
+  s1[1] = vsubq_s16(s3[0], s2[1]);
+  s1[2] = vaddq_s16(s3[3], s2[2]);
+  s1[3] = vsubq_s16(s3[3], s2[2]);
+  s1[4] = vsubq_s16(s3[4], s2[5]);
+  s1[5] = vaddq_s16(s3[4], s2[5]);
+  s1[6] = vsubq_s16(s3[7], s2[6]);
+  s1[7] = vaddq_s16(s3[7], s2[6]);
+
+  // step 6
+  t_lo[0] = vget_low_s16(s1[0]);
+  t_hi[0] = vget_high_s16(s1[0]);
+  t_lo[1] = vget_low_s16(s1[1]);
+  t_hi[1] = vget_high_s16(s1[1]);
+  t_lo[2] = vget_low_s16(s1[2]);
+  t_hi[2] = vget_high_s16(s1[2]);
+  t_lo[3] = vget_low_s16(s1[3]);
+  t_hi[3] = vget_high_s16(s1[3]);
+  t_lo[4] = vget_low_s16(s1[4]);
+  t_hi[4] = vget_high_s16(s1[4]);
+  t_lo[5] = vget_low_s16(s1[5]);
+  t_hi[5] = vget_high_s16(s1[5]);
+  t_lo[6] = vget_low_s16(s1[6]);
+  t_hi[6] = vget_high_s16(s1[6]);
+  t_lo[7] = vget_low_s16(s1[7]);
+  t_hi[7] = vget_high_s16(s1[7]);
+
+  // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+  // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0],
+                                      cospi_2_64, cospi_30_64, &u_lo[0],
+                                      &u_hi[0], &u_lo[7], &u_hi[7]);
+
+  // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+  // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1],
+                                      cospi_18_64, cospi_14_64, &u_lo[1],
+                                      &u_hi[1], &u_lo[6], &u_hi[6]);
+
+  // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+  // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+                                      cospi_10_64, cospi_22_64, &u_lo[2],
+                                      &u_hi[2], &u_lo[5], &u_hi[5]);
+
+  // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+  // u[4] = step1[4] * cospi_6_64  - step1[3] * cospi_26_64
+  butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3],
+                                      cospi_26_64, cospi_6_64, &u_lo[3],
+                                      &u_hi[3], &u_lo[4], &u_hi[4]);
+
+  // final fdct_round_shift
+  t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS);
+  t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS);
+  t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS);
+
+  in[0] = i[0];
+  in[2] = i[1];
+  in[4] = i[2];
+  in[6] = i[3];
+  in[8] = i[4];
+  in[10] = i[5];
+  in[12] = i[6];
+  in[14] = i[7];
+  in[1] = vcombine_s16(t_lo[0], t_hi[0]);
+  in[3] = vcombine_s16(t_lo[4], t_hi[4]);
+  in[5] = vcombine_s16(t_lo[2], t_hi[2]);
+  in[7] = vcombine_s16(t_lo[6], t_hi[6]);
+  in[9] = vcombine_s16(t_lo[1], t_hi[1]);
+  in[11] = vcombine_s16(t_lo[5], t_hi[5]);
+  in[13] = vcombine_s16(t_lo[3], t_hi[3]);
+  in[15] = vcombine_s16(t_lo[7], t_hi[7]);
+}
+
+static void fadst16_8col(int16x8_t *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  int16x4_t x_lo[16], x_hi[16];
+  int32x4_t s_lo[16], s_hi[16];
+  int32x4_t t_lo[16], t_hi[16];
+
+  x_lo[0] = vget_low_s16(in[15]);
+  x_hi[0] = vget_high_s16(in[15]);
+  x_lo[1] = vget_low_s16(in[0]);
+  x_hi[1] = vget_high_s16(in[0]);
+  x_lo[2] = vget_low_s16(in[13]);
+  x_hi[2] = vget_high_s16(in[13]);
+  x_lo[3] = vget_low_s16(in[2]);
+  x_hi[3] = vget_high_s16(in[2]);
+  x_lo[4] = vget_low_s16(in[11]);
+  x_hi[4] = vget_high_s16(in[11]);
+  x_lo[5] = vget_low_s16(in[4]);
+  x_hi[5] = vget_high_s16(in[4]);
+  x_lo[6] = vget_low_s16(in[9]);
+  x_hi[6] = vget_high_s16(in[9]);
+  x_lo[7] = vget_low_s16(in[6]);
+  x_hi[7] = vget_high_s16(in[6]);
+  x_lo[8] = vget_low_s16(in[7]);
+  x_hi[8] = vget_high_s16(in[7]);
+  x_lo[9] = vget_low_s16(in[8]);
+  x_hi[9] = vget_high_s16(in[8]);
+  x_lo[10] = vget_low_s16(in[5]);
+  x_hi[10] = vget_high_s16(in[5]);
+  x_lo[11] = vget_low_s16(in[10]);
+  x_hi[11] = vget_high_s16(in[10]);
+  x_lo[12] = vget_low_s16(in[3]);
+  x_hi[12] = vget_high_s16(in[3]);
+  x_lo[13] = vget_low_s16(in[12]);
+  x_hi[13] = vget_high_s16(in[12]);
+  x_lo[14] = vget_low_s16(in[1]);
+  x_hi[14] = vget_high_s16(in[1]);
+  x_lo[15] = vget_low_s16(in[14]);
+  x_hi[15] = vget_high_s16(in[14]);
+
+  // stage 1
+  // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+  // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+  butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+                                      cospi_1_64, cospi_31_64, &s_lo[0],
+                                      &s_hi[0], &s_lo[1], &s_hi[1]);
+  // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+  // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+  butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+                                      cospi_5_64, cospi_27_64, &s_lo[2],
+                                      &s_hi[2], &s_lo[3], &s_hi[3]);
+  // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+  // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+  butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+                                      cospi_9_64, cospi_23_64, &s_lo[4],
+                                      &s_hi[4], &s_lo[5], &s_hi[5]);
+  // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+  // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+  butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+                                      cospi_13_64, cospi_19_64, &s_lo[6],
+                                      &s_hi[6], &s_lo[7], &s_hi[7]);
+  // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+  // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+  butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9],
+                                      cospi_17_64, cospi_15_64, &s_lo[8],
+                                      &s_hi[8], &s_lo[9], &s_hi[9]);
+  // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+  // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+  butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11],
+                                      cospi_21_64, cospi_11_64, &s_lo[10],
+                                      &s_hi[10], &s_lo[11], &s_hi[11]);
+  // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+  // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+  butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13],
+                                      cospi_25_64, cospi_7_64, &s_lo[12],
+                                      &s_hi[12], &s_lo[13], &s_hi[13]);
+  // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+  // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+  butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15],
+                                      cospi_29_64, cospi_3_64, &s_lo[14],
+                                      &s_hi[14], &s_lo[15], &s_hi[15]);
+
+  // fdct_round_shift
+  t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+  t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+  t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+  t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+  t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+  t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+  t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+  t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+  t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+  t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+  t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+  t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+  t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+  t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+  t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+  t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+  t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+  t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+  t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+  t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+  t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+  t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+  t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+  t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+  t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+  t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+  t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+  t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  s_lo[4] = t_lo[4];
+  s_hi[4] = t_hi[4];
+  s_lo[5] = t_lo[5];
+  s_hi[5] = t_hi[5];
+  s_lo[6] = t_lo[6];
+  s_hi[6] = t_hi[6];
+  s_lo[7] = t_lo[7];
+  s_hi[7] = t_hi[7];
+  // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9],
+                                  cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8],
+                                  &s_lo[9], &s_hi[9]);
+  // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11],
+                                  cospi_20_64, cospi_12_64, &s_lo[10],
+                                  &s_hi[10], &s_lo[11], &s_hi[11]);
+  // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12],
+                                  cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13],
+                                  &s_lo[12], &s_hi[12]);
+  // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+  butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                  cospi_12_64, cospi_20_64, &s_lo[15],
+                                  &s_hi[15], &s_lo[14], &s_hi[14]);
+
+  // s0 + s4
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]);
+  // s1 + s5
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]);
+  // s2 + s6
+  t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]);
+  t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]);
+  // s3 + s7
+  t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]);
+  t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]);
+  // s0 - s4
+  t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]);
+  t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]);
+  // s1 - s7
+  t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]);
+  t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]);
+  // s2 - s6
+  t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]);
+  // s3 - s7
+  t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]);
+  // s8 + s12
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]);
+  // s9 + s13
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]);
+  // s10 + s14
+  t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]);
+  t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]);
+  // s11 + s15
+  t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]);
+  t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]);
+  // s8 + s12
+  t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]);
+  t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]);
+  // s9 + s13
+  t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]);
+  t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]);
+  // s10 + s14
+  t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]);
+  // s11 + s15
+  t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
+
+  t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+  t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+  t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+  t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+  t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+  t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+  t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+  t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+  t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 3
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+                                  cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+                                  &s_lo[5], &s_hi[5]);
+  // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+                                  cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7],
+                                  &s_lo[6], &s_hi[6]);
+  s_lo[8] = t_lo[8];
+  s_hi[8] = t_hi[8];
+  s_lo[9] = t_lo[9];
+  s_hi[9] = t_hi[9];
+  s_lo[10] = t_lo[10];
+  s_hi[10] = t_hi[10];
+  s_lo[11] = t_lo[11];
+  s_hi[11] = t_hi[11];
+  // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13],
+                                  cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12],
+                                  &s_lo[13], &s_hi[13]);
+  // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+  butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                  cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15],
+                                  &s_lo[14], &s_hi[14]);
+
+  // s0 + s4
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+  // s0 - s4
+  t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+  t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+  t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+  // s4 + s6
+  t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]);
+  t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]);
+  // s5 + s7
+  t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]);
+  t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]);
+  // s4 - s6
+  t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]);
+  // s5 - s7
+  t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]);
+  // s8 + s10
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]);
+  // s9 + s11
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]);
+  // s8 - s10
+  t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]);
+  t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]);
+  // s9 - s11
+  t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]);
+  t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]);
+  // s12 + s14
+  t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]);
+  t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]);
+  // s13 + s15
+  t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]);
+  t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]);
+  // s12 - s14
+  t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]);
+  // s13 - s15
+  t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]);
+
+  t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+  t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 4
+  // s2 = (-cospi_16_64) * (x2 + x3);
+  // s3 = cospi_16_64 * (x2 - x3);
+  butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+                                  -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+                                  &s_hi[3]);
+  // s6 = cospi_16_64 * (x6 + x7);
+  // s7 = cospi_16_64 * (-x6 + x7);
+  butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+                                  cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+                                  &s_hi[7]);
+  // s10 = cospi_16_64 * (x10 + x11);
+  // s11 = cospi_16_64 * (-x10 + x11);
+  butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+                                  cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11],
+                                  &s_hi[11]);
+  // s14 = (-cospi_16_64) * (x14 + x15);
+  // s15 = cospi_16_64 * (x14 - x15);
+  butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                  -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15],
+                                  &s_hi[15]);
+
+  // final fdct_round_shift
+  x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+  x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+  x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+  x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+  x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+  x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+  x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+  x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+  x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS);
+  x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS);
+  x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS);
+  x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS);
+  x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS);
+  x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS);
+  x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS);
+  x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS);
+
+  // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly
+  x_lo[0] = vmovn_s32(t_lo[0]);
+  x_hi[0] = vmovn_s32(t_hi[0]);
+  x_lo[1] = vmovn_s32(t_lo[1]);
+  x_hi[1] = vmovn_s32(t_hi[1]);
+  x_lo[4] = vmovn_s32(t_lo[4]);
+  x_hi[4] = vmovn_s32(t_hi[4]);
+  x_lo[5] = vmovn_s32(t_lo[5]);
+  x_hi[5] = vmovn_s32(t_hi[5]);
+  x_lo[8] = vmovn_s32(t_lo[8]);
+  x_hi[8] = vmovn_s32(t_hi[8]);
+  x_lo[9] = vmovn_s32(t_lo[9]);
+  x_hi[9] = vmovn_s32(t_hi[9]);
+  x_lo[12] = vmovn_s32(t_lo[12]);
+  x_hi[12] = vmovn_s32(t_hi[12]);
+  x_lo[13] = vmovn_s32(t_lo[13]);
+  x_hi[13] = vmovn_s32(t_hi[13]);
+
+  in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+  in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8]));
+  in[2] = vcombine_s16(x_lo[12], x_hi[12]);
+  in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+  in[4] = vcombine_s16(x_lo[6], x_hi[6]);
+  in[5] = vcombine_s16(x_lo[14], x_hi[14]);
+  in[6] = vcombine_s16(x_lo[10], x_hi[10]);
+  in[7] = vcombine_s16(x_lo[2], x_hi[2]);
+  in[8] = vcombine_s16(x_lo[3], x_hi[3]);
+  in[9] = vcombine_s16(x_lo[11], x_hi[11]);
+  in[10] = vcombine_s16(x_lo[15], x_hi[15]);
+  in[11] = vcombine_s16(x_lo[7], x_hi[7]);
+  in[12] = vcombine_s16(x_lo[5], x_hi[5]);
+  in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13]));
+  in[14] = vcombine_s16(x_lo[9], x_hi[9]);
+  in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+}
+
+static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  // Left half.
+  fdct16_8col(in0);
+  // Right half.
+  fdct16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  int16x8_t in0[16], in1[16];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
+      fdct16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_load_buffer_4x4(const int16_t *input,
+                                          int32x4_t *in /*[4]*/, int stride) {
+  // { 0, 1, 1, 1 };
+  const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3);
+  // { 1, 0, 0, 0 };
+  const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3);
+  int32x4_t mask;
+
+  in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by
+  // one non-zero first elements
+  mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a));
+  in[0] = vaddq_s32(in[0], mask);
+  in[0] = vaddq_s32(in[0], nonzero_bias_b);
+}
+
+static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) {
+  const int32x4_t one = vdupq_n_s32(1);
+  res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2);
+  res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2);
+  res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2);
+  res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2);
+  vst1q_s32(output + 0 * 4, res[0]);
+  vst1q_s32(output + 1 * 4, res[1]);
+  vst1q_s32(output + 2 * 4, res[2]);
+  vst1q_s32(output + 3 * 4, res[3]);
+}
+
+static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) {
+  int32x2_t s_lo[4], s_hi[4];
+  int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4];
+
+  s_lo[0] = vget_low_s32(in[0]);
+  s_hi[0] = vget_high_s32(in[0]);
+  s_lo[1] = vget_low_s32(in[1]);
+  s_hi[1] = vget_high_s32(in[1]);
+  s_lo[2] = vget_low_s32(in[2]);
+  s_hi[2] = vget_high_s32(in[2]);
+  s_lo[3] = vget_low_s32(in[3]);
+  s_hi[3] = vget_high_s32(in[3]);
+
+  // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+  t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9);
+  t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9);
+  t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9);
+  t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9);
+  t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9);
+  t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9);
+
+  // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+  t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9);
+  t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9);
+  t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9);
+  t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9);
+  t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9);
+  t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9);
+
+  // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9
+  t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9);
+  t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9);
+  t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9);
+  t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9);
+  t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9);
+  t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9);
+
+  // t3 = s2 * sinpi_3_9
+  t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9);
+  t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9);
+
+  /*
+   * u0 = t0 + t3
+   * u1 = t1
+   * u2 = t2 - t3
+   * u3 = t2 - t0 + t3
+   */
+  u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]);
+  u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]);
+  u_lo[1] = t_lo[1];
+  u_hi[1] = t_hi[1];
+  u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]);
+  u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]);
+  u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]);
+  u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]);
+
+  // fdct_round_shift
+  in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[0], DCT_CONST_BITS));
+  in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[1], DCT_CONST_BITS));
+  in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[2], DCT_CONST_BITS));
+  in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS),
+                       vrshrn_n_s64(u_hi[3], DCT_CONST_BITS));
+
+  transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+}
+
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  int32x4_t in[4];
+  // int i;
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break;
+    case ADST_DCT:
+      highbd_load_buffer_4x4(input, in, stride);
+      highbd_fadst4x4_neon(in);
+      vpx_highbd_fdct4x4_pass1_neon(in);
+      highbd_write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:
+      highbd_load_buffer_4x4(input, in, stride);
+      vpx_highbd_fdct4x4_pass1_neon(in);
+      highbd_fadst4x4_neon(in);
+      highbd_write_buffer_4x4(output, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      highbd_load_buffer_4x4(input, in, stride);
+      highbd_fadst4x4_neon(in);
+      highbd_fadst4x4_neon(in);
+      highbd_write_buffer_4x4(output, in);
+      break;
+  }
+}
+
+static INLINE void highbd_load_buffer_8x8(const int16_t *input,
+                                          int32x4_t *lo /*[8]*/,
+                                          int32x4_t *hi /*[8]*/, int stride) {
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+  lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+  lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+  lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+  lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 15).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
+static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi,
+                                          const int bit) {
+  int32x4_t sign_lo[8], sign_hi[8];
+  sign_lo[0] = vshrq_n_s32(lo[0], 31);
+  sign_hi[0] = vshrq_n_s32(hi[0], 31);
+  sign_lo[1] = vshrq_n_s32(lo[1], 31);
+  sign_hi[1] = vshrq_n_s32(hi[1], 31);
+  sign_lo[2] = vshrq_n_s32(lo[2], 31);
+  sign_hi[2] = vshrq_n_s32(hi[2], 31);
+  sign_lo[3] = vshrq_n_s32(lo[3], 31);
+  sign_hi[3] = vshrq_n_s32(hi[3], 31);
+  sign_lo[4] = vshrq_n_s32(lo[4], 31);
+  sign_hi[4] = vshrq_n_s32(hi[4], 31);
+  sign_lo[5] = vshrq_n_s32(lo[5], 31);
+  sign_hi[5] = vshrq_n_s32(hi[5], 31);
+  sign_lo[6] = vshrq_n_s32(lo[6], 31);
+  sign_hi[6] = vshrq_n_s32(hi[6], 31);
+  sign_lo[7] = vshrq_n_s32(lo[7], 31);
+  sign_hi[7] = vshrq_n_s32(hi[7], 31);
+
+  if (bit == 2) {
+    const int32x4_t const_rounding = vdupq_n_s32(1);
+    lo[0] = vaddq_s32(lo[0], const_rounding);
+    hi[0] = vaddq_s32(hi[0], const_rounding);
+    lo[1] = vaddq_s32(lo[1], const_rounding);
+    hi[1] = vaddq_s32(hi[1], const_rounding);
+    lo[2] = vaddq_s32(lo[2], const_rounding);
+    hi[2] = vaddq_s32(hi[2], const_rounding);
+    lo[3] = vaddq_s32(lo[3], const_rounding);
+    hi[3] = vaddq_s32(hi[3], const_rounding);
+    lo[4] = vaddq_s32(lo[4], const_rounding);
+    hi[4] = vaddq_s32(hi[4], const_rounding);
+    lo[5] = vaddq_s32(lo[5], const_rounding);
+    hi[5] = vaddq_s32(hi[5], const_rounding);
+    lo[6] = vaddq_s32(lo[6], const_rounding);
+    hi[6] = vaddq_s32(hi[6], const_rounding);
+    lo[7] = vaddq_s32(lo[7], const_rounding);
+    hi[7] = vaddq_s32(hi[7], const_rounding);
+  }
+
+  lo[0] = vsubq_s32(lo[0], sign_lo[0]);
+  hi[0] = vsubq_s32(hi[0], sign_hi[0]);
+  lo[1] = vsubq_s32(lo[1], sign_lo[1]);
+  hi[1] = vsubq_s32(hi[1], sign_hi[1]);
+  lo[2] = vsubq_s32(lo[2], sign_lo[2]);
+  hi[2] = vsubq_s32(hi[2], sign_hi[2]);
+  lo[3] = vsubq_s32(lo[3], sign_lo[3]);
+  hi[3] = vsubq_s32(hi[3], sign_hi[3]);
+  lo[4] = vsubq_s32(lo[4], sign_lo[4]);
+  hi[4] = vsubq_s32(hi[4], sign_hi[4]);
+  lo[5] = vsubq_s32(lo[5], sign_lo[5]);
+  hi[5] = vsubq_s32(hi[5], sign_hi[5]);
+  lo[6] = vsubq_s32(lo[6], sign_lo[6]);
+  hi[6] = vsubq_s32(hi[6], sign_hi[6]);
+  lo[7] = vsubq_s32(lo[7], sign_lo[7]);
+  hi[7] = vsubq_s32(hi[7], sign_hi[7]);
+
+  if (bit == 1) {
+    lo[0] = vshrq_n_s32(lo[0], 1);
+    hi[0] = vshrq_n_s32(hi[0], 1);
+    lo[1] = vshrq_n_s32(lo[1], 1);
+    hi[1] = vshrq_n_s32(hi[1], 1);
+    lo[2] = vshrq_n_s32(lo[2], 1);
+    hi[2] = vshrq_n_s32(hi[2], 1);
+    lo[3] = vshrq_n_s32(lo[3], 1);
+    hi[3] = vshrq_n_s32(hi[3], 1);
+    lo[4] = vshrq_n_s32(lo[4], 1);
+    hi[4] = vshrq_n_s32(hi[4], 1);
+    lo[5] = vshrq_n_s32(lo[5], 1);
+    hi[5] = vshrq_n_s32(hi[5], 1);
+    lo[6] = vshrq_n_s32(lo[6], 1);
+    hi[6] = vshrq_n_s32(hi[6], 1);
+    lo[7] = vshrq_n_s32(lo[7], 1);
+    hi[7] = vshrq_n_s32(hi[7], 1);
+  } else {
+    lo[0] = vshrq_n_s32(lo[0], 2);
+    hi[0] = vshrq_n_s32(hi[0], 2);
+    lo[1] = vshrq_n_s32(lo[1], 2);
+    hi[1] = vshrq_n_s32(hi[1], 2);
+    lo[2] = vshrq_n_s32(lo[2], 2);
+    hi[2] = vshrq_n_s32(hi[2], 2);
+    lo[3] = vshrq_n_s32(lo[3], 2);
+    hi[3] = vshrq_n_s32(hi[3], 2);
+    lo[4] = vshrq_n_s32(lo[4], 2);
+    hi[4] = vshrq_n_s32(hi[4], 2);
+    lo[5] = vshrq_n_s32(lo[5], 2);
+    hi[5] = vshrq_n_s32(hi[5], 2);
+    lo[6] = vshrq_n_s32(lo[6], 2);
+    hi[6] = vshrq_n_s32(hi[6], 2);
+    lo[7] = vshrq_n_s32(lo[7], 2);
+    hi[7] = vshrq_n_s32(hi[7], 2);
+  }
+}
+
+static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo,
+                                           int32x4_t *hi, int stride) {
+  vst1q_s32(output + 0 * stride, lo[0]);
+  vst1q_s32(output + 0 * stride + 4, hi[0]);
+  vst1q_s32(output + 1 * stride, lo[1]);
+  vst1q_s32(output + 1 * stride + 4, hi[1]);
+  vst1q_s32(output + 2 * stride, lo[2]);
+  vst1q_s32(output + 2 * stride + 4, hi[2]);
+  vst1q_s32(output + 3 * stride, lo[3]);
+  vst1q_s32(output + 3 * stride + 4, hi[3]);
+  vst1q_s32(output + 4 * stride, lo[4]);
+  vst1q_s32(output + 4 * stride + 4, hi[4]);
+  vst1q_s32(output + 5 * stride, lo[5]);
+  vst1q_s32(output + 5 * stride + 4, hi[5]);
+  vst1q_s32(output + 6 * stride, lo[6]);
+  vst1q_s32(output + 6 * stride + 4, hi[6]);
+  vst1q_s32(output + 7 * stride, lo[7]);
+  vst1q_s32(output + 7 * stride + 4, hi[7]);
+}
+
+static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/,
+                                        int32x4_t *hi /*[8]*/) {
+  int32x4_t s_lo[8], s_hi[8];
+  int32x4_t t_lo[8], t_hi[8];
+  int32x4_t x_lo[8], x_hi[8];
+  int64x2_t s64_lo[16], s64_hi[16];
+
+  x_lo[0] = lo[7];
+  x_hi[0] = hi[7];
+  x_lo[1] = lo[0];
+  x_hi[1] = hi[0];
+  x_lo[2] = lo[5];
+  x_hi[2] = hi[5];
+  x_lo[3] = lo[2];
+  x_hi[3] = hi[2];
+  x_lo[4] = lo[3];
+  x_hi[4] = hi[3];
+  x_lo[5] = lo[4];
+  x_hi[5] = hi[4];
+  x_lo[6] = lo[1];
+  x_hi[6] = hi[1];
+  x_lo[7] = lo[6];
+  x_hi[7] = hi[6];
+
+  // stage 1
+  // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64,
+      &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+  // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64,
+      &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+
+  // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+  // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64,
+      &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+  // fdct_round_shift, indices are doubled
+  t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+  t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+  t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+  t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+  t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+  t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+  t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+  t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+  t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+  t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+  t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+  t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+  t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+  t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+  t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+  t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+  // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+  // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+  // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64,
+      &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+  // fdct_round_shift
+  // s0 + s2
+  t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+  // s0 - s2
+  t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+
+  // s1 + s3
+  t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+  // s1 - s3
+  t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+
+  // s4 + s6
+  t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+  // s4 - s6
+  t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+
+  // s5 + s7
+  t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+  // s5 - s7
+  t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+
+  // stage 3
+  // s2 = cospi_16_64 * (x2 + x3)
+  // s3 = cospi_16_64 * (x2 - x3)
+  butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64,
+                               &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]);
+
+  // s6 = cospi_16_64 * (x6 + x7)
+  // s7 = cospi_16_64 * (x6 - x7)
+  butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64,
+                               &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]);
+
+  // x0, x2, x4, x6 pass through
+  lo[0] = t_lo[0];
+  hi[0] = t_hi[0];
+  lo[2] = s_lo[6];
+  hi[2] = s_hi[6];
+  lo[4] = s_lo[3];
+  hi[4] = s_hi[3];
+  lo[6] = t_lo[5];
+  hi[6] = t_hi[5];
+
+  lo[1] = vnegq_s32(t_lo[4]);
+  hi[1] = vnegq_s32(t_hi[4]);
+  lo[3] = vnegq_s32(s_lo[2]);
+  hi[3] = vnegq_s32(s_hi[2]);
+  lo[5] = vnegq_s32(s_lo[7]);
+  hi[5] = vnegq_s32(s_hi[7]);
+  lo[7] = vnegq_s32(t_lo[1]);
+  hi[7] = vnegq_s32(t_hi[1]);
+
+  transpose_s32_8x8_2(lo, hi, lo, hi);
+}
+
+void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  int32x4_t lo[8], hi[8];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break;
+    case ADST_DCT:
+      highbd_load_buffer_8x8(input, lo, hi, stride);
+      highbd_fadst8x8_neon(lo, hi);
+      // pass1 variant is not precise enough
+      vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+      highbd_right_shift_8x8(lo, hi, 1);
+      highbd_write_buffer_8x8(output, lo, hi, 8);
+      break;
+    case DCT_ADST:
+      highbd_load_buffer_8x8(input, lo, hi, stride);
+      // pass1 variant is not precise enough
+      vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+      highbd_fadst8x8_neon(lo, hi);
+      highbd_right_shift_8x8(lo, hi, 1);
+      highbd_write_buffer_8x8(output, lo, hi, 8);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      highbd_load_buffer_8x8(input, lo, hi, stride);
+      highbd_fadst8x8_neon(lo, hi);
+      highbd_fadst8x8_neon(lo, hi);
+      highbd_right_shift_8x8(lo, hi, 1);
+      highbd_write_buffer_8x8(output, lo, hi, 8);
+      break;
+  }
+}
+
+static INLINE void highbd_load_buffer_16x16(
+    const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+    int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+  // load first 8 columns
+  highbd_load_buffer_8x8(input, left1, right1, stride);
+  highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  highbd_load_buffer_8x8(input, left2, right2, stride);
+  highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_write_buffer_16x16(
+    tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+    int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+  // write first 8 columns
+  highbd_write_buffer_8x8(output, left1, right1, stride);
+  highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride);
+
+  // write second 8 columns
+  output += 8;
+  highbd_write_buffer_8x8(output, left2, right2, stride);
+  highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/,
+                                            int32x4_t *right1 /*[16]*/,
+                                            int32x4_t *left2 /*[16]*/,
+                                            int32x4_t *right2 /*[16]*/,
+                                            const int bit) {
+  // perform rounding operations
+  highbd_right_shift_8x8(left1, right1, bit);
+  highbd_right_shift_8x8(left1 + 8, right1 + 8, bit);
+  highbd_right_shift_8x8(left2, right2, bit);
+  highbd_right_shift_8x8(left2 + 8, right2 + 8, bit);
+}
+
+static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) {
+  // perform 16x16 1-D DCT for 8 columns
+  int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8];
+  int32x4_t left8[8], right8[8];
+
+  // stage 1
+  left8[0] = vaddq_s32(left[0], left[15]);
+  right8[0] = vaddq_s32(right[0], right[15]);
+  left8[1] = vaddq_s32(left[1], left[14]);
+  right8[1] = vaddq_s32(right[1], right[14]);
+  left8[2] = vaddq_s32(left[2], left[13]);
+  right8[2] = vaddq_s32(right[2], right[13]);
+  left8[3] = vaddq_s32(left[3], left[12]);
+  right8[3] = vaddq_s32(right[3], right[12]);
+  left8[4] = vaddq_s32(left[4], left[11]);
+  right8[4] = vaddq_s32(right[4], right[11]);
+  left8[5] = vaddq_s32(left[5], left[10]);
+  right8[5] = vaddq_s32(right[5], right[10]);
+  left8[6] = vaddq_s32(left[6], left[9]);
+  right8[6] = vaddq_s32(right[6], right[9]);
+  left8[7] = vaddq_s32(left[7], left[8]);
+  right8[7] = vaddq_s32(right[7], right[8]);
+
+  // step 1
+  s1_lo[0] = vsubq_s32(left[7], left[8]);
+  s1_hi[0] = vsubq_s32(right[7], right[8]);
+  s1_lo[1] = vsubq_s32(left[6], left[9]);
+  s1_hi[1] = vsubq_s32(right[6], right[9]);
+  s1_lo[2] = vsubq_s32(left[5], left[10]);
+  s1_hi[2] = vsubq_s32(right[5], right[10]);
+  s1_lo[3] = vsubq_s32(left[4], left[11]);
+  s1_hi[3] = vsubq_s32(right[4], right[11]);
+  s1_lo[4] = vsubq_s32(left[3], left[12]);
+  s1_hi[4] = vsubq_s32(right[3], right[12]);
+  s1_lo[5] = vsubq_s32(left[2], left[13]);
+  s1_hi[5] = vsubq_s32(right[2], right[13]);
+  s1_lo[6] = vsubq_s32(left[1], left[14]);
+  s1_hi[6] = vsubq_s32(right[1], right[14]);
+  s1_lo[7] = vsubq_s32(left[0], left[15]);
+  s1_hi[7] = vsubq_s32(right[0], right[15]);
+
+  // pass1 variant is not accurate enough
+  vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8);
+
+  // step 2
+  // step2[2] = (step1[5] - step1[2]) * cospi_16_64;
+  // step2[5] = (step1[5] + step1[2]) * cospi_16_64;
+  butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+                                     cospi_16_64, &s2_lo[5], &s2_hi[5],
+                                     &s2_lo[2], &s2_hi[2]);
+  // step2[3] = (step1[4] - step1[3]) * cospi_16_64;
+  // step2[4] = (step1[4] + step1[3]) * cospi_16_64;
+  butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+                                     cospi_16_64, &s2_lo[4], &s2_hi[4],
+                                     &s2_lo[3], &s2_hi[3]);
+
+  // step 3
+  s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]);
+  s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]);
+  s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]);
+  s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]);
+  s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]);
+  s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]);
+  s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]);
+  s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]);
+  s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]);
+  s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]);
+  s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]);
+  s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]);
+  s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]);
+  s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]);
+  s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]);
+  s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]);
+
+  // step 4
+  // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1]
+  // s2[6] = cospi_8_64 * s3[6]  + cospi_24_64 * s3[1]
+  butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1],
+                                     cospi_8_64, cospi_24_64, &s2_lo[6],
+                                     &s2_hi[6], &s2_lo[1], &s2_hi[1]);
+
+  // s2[5] =  cospi_8_64 * s3[2] - cospi_24_64 * s3[5]
+  // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5]
+  butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5],
+                                     cospi_24_64, cospi_8_64, &s2_lo[2],
+                                     &s2_hi[2], &s2_lo[5], &s2_hi[5]);
+
+  // step 5
+  s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]);
+  s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]);
+  s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]);
+  s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]);
+  s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]);
+  s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]);
+  s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]);
+  s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]);
+  s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]);
+  s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]);
+  s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]);
+  s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]);
+  s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]);
+  s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]);
+  s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]);
+  s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]);
+
+  // step 6
+  // out[1]  = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+  // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0],
+                                     cospi_2_64, cospi_30_64, &left[1],
+                                     &right[1], &left[15], &right[15]);
+
+  // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+  // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1],
+                                     cospi_18_64, cospi_14_64, &left[9],
+                                     &right[9], &left[7], &right[7]);
+
+  // out[5]  = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+  // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+                                     cospi_10_64, cospi_22_64, &left[5],
+                                     &right[5], &left[11], &right[11]);
+
+  // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+  // out[3]  = step1[4] * cospi_6_64  - step1[3] * cospi_26_64
+  butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+                                     cospi_26_64, cospi_6_64, &left[13],
+                                     &right[13], &left[3], &right[3]);
+
+  left[0] = left8[0];
+  right[0] = right8[0];
+  left[2] = left8[1];
+  right[2] = right8[1];
+  left[4] = left8[2];
+  right[4] = right8[2];
+  left[6] = left8[3];
+  right[6] = right8[3];
+  left[8] = left8[4];
+  right[8] = right8[4];
+  left[10] = left8[5];
+  right[10] = right8[5];
+  left[12] = left8[6];
+  right[12] = right8[6];
+  left[14] = left8[7];
+  right[14] = right8[7];
+}
+
+static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) {
+  // perform 16x16 1-D ADST for 8 columns
+  int32x4_t x_lo[16], x_hi[16];
+  int32x4_t s_lo[16], s_hi[16];
+  int32x4_t t_lo[16], t_hi[16];
+  int64x2_t s64_lo[32], s64_hi[32];
+
+  x_lo[0] = left[15];
+  x_hi[0] = right[15];
+  x_lo[1] = left[0];
+  x_hi[1] = right[0];
+  x_lo[2] = left[13];
+  x_hi[2] = right[13];
+  x_lo[3] = left[2];
+  x_hi[3] = right[2];
+  x_lo[4] = left[11];
+  x_hi[4] = right[11];
+  x_lo[5] = left[4];
+  x_hi[5] = right[4];
+  x_lo[6] = left[9];
+  x_hi[6] = right[9];
+  x_lo[7] = left[6];
+  x_hi[7] = right[6];
+  x_lo[8] = left[7];
+  x_hi[8] = right[7];
+  x_lo[9] = left[8];
+  x_hi[9] = right[8];
+  x_lo[10] = left[5];
+  x_hi[10] = right[5];
+  x_lo[11] = left[10];
+  x_hi[11] = right[10];
+  x_lo[12] = left[3];
+  x_hi[12] = right[3];
+  x_lo[13] = left[12];
+  x_hi[13] = right[12];
+  x_lo[14] = left[1];
+  x_hi[14] = right[1];
+  x_lo[15] = left[14];
+  x_hi[15] = right[14];
+
+  // stage 1, indices are doubled
+  // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+  // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64,
+      &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+  // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+  // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64,
+      &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+  // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+  // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+  // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+  // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64,
+      &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+  // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+  // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64,
+      &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+  // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+  // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64,
+      &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+  // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+  // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64,
+      &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+  // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+  // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+  butterfly_two_coeff_s32_s64_noround(
+      x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64,
+      &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]);
+
+  // fdct_round_shift, indices are doubled
+  t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+  t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+  t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+  t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+  t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+  t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+  t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+  t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+  t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+  t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+  t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+  t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+  t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+  t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+  t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+  t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+  t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+  t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+  t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+  t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+  t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+  t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+  t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+  t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+  t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+  t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+  t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+  t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+  t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+  t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+  t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+  t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  s_lo[4] = t_lo[4];
+  s_hi[4] = t_hi[4];
+  s_lo[5] = t_lo[5];
+  s_hi[5] = t_hi[5];
+  s_lo[6] = t_lo[6];
+  s_hi[6] = t_hi[6];
+  s_lo[7] = t_lo[7];
+  s_hi[7] = t_hi[7];
+  // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64,
+      &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+  // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64,
+      &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+  // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64,
+      &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]);
+  // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64,
+      &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+  // s0 + s4
+  t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]);
+  t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]);
+  // s1 + s5
+  t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]);
+  t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]);
+  // s2 + s6
+  t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]);
+  t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]);
+  // s3 + s7
+  t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]);
+  t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+  // s0 - s4
+  t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]);
+  t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]);
+  // s1 - s5
+  t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]);
+  t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]);
+  // s2 - s6
+  t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]);
+  t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]);
+  // s3 - s7
+  t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]);
+  t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+  // fdct_round_shift()
+  // s8 + s12
+  t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+  t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+  // s9 + s13
+  t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+  t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+  // s10 + s14
+  t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+  t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+  // s11 + s15
+  t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+  t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+  // s8 - s12
+  t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+  t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+  // s9 - s13
+  t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+  t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+  // s10 - s14
+  t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+  t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+  // s11 - s15
+  t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+  t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+  // stage 3
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+      &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+  // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64,
+      &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]);
+  s_lo[8] = t_lo[8];
+  s_hi[8] = t_hi[8];
+  s_lo[9] = t_lo[9];
+  s_hi[9] = t_hi[9];
+  s_lo[10] = t_lo[10];
+  s_hi[10] = t_hi[10];
+  s_lo[11] = t_lo[11];
+  s_hi[11] = t_hi[11];
+  // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64,
+      &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+  // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+  butterfly_two_coeff_s32_s64_noround(
+      t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64,
+      &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+  // s0 + s2
+  t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+  // s0 - s2
+  t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+  t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+  t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+  // fdct_round_shift()
+  // s4 + s6
+  t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+  // s5 + s7
+  t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+  // s4 - s6
+  t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+  t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+  // s5 - s7
+  t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+  t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+  // s8 + s10
+  t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]);
+  t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]);
+  // s9 + s11
+  t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]);
+  t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]);
+  // s8 - s10
+  t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]);
+  t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]);
+  // s9 - s11
+  t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]);
+  t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]);
+  // fdct_round_shift()
+  // s12 + s14
+  t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+  t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+  // s13 + s15
+  t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+  t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+  // s12 - s14
+  t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+  t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+  // s13 - s15
+  t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+  t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+
+  // stage 4, with fdct_round_shift
+  // s2 = (-cospi_16_64) * (x2 + x3);
+  // s3 = cospi_16_64 * (x2 - x3);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+                                     -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3],
+                                     &x_hi[3]);
+  // s6 = cospi_16_64 * (x6 + x7);
+  // s7 = cospi_16_64 * (-x6 + x7);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+                                     cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7],
+                                     &x_hi[7]);
+  // s10 = cospi_16_64 * (x10 + x11);
+  // s11 = cospi_16_64 * (-x10 + x11);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+                                     cospi_16_64, &x_lo[10], &x_hi[10],
+                                     &x_lo[11], &x_hi[11]);
+  // s14 = (-cospi_16_64) * (x14 + x15);
+  // s15 = cospi_16_64 * (x14 - x15);
+  butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+                                     -cospi_16_64, &x_lo[14], &x_hi[14],
+                                     &x_lo[15], &x_hi[15]);
+
+  // Just copy x0, x1, x4, x5, x8, x9, x12, x13
+  x_lo[0] = t_lo[0];
+  x_hi[0] = t_hi[0];
+  x_lo[1] = t_lo[1];
+  x_hi[1] = t_hi[1];
+  x_lo[4] = t_lo[4];
+  x_hi[4] = t_hi[4];
+  x_lo[5] = t_lo[5];
+  x_hi[5] = t_hi[5];
+  x_lo[8] = t_lo[8];
+  x_hi[8] = t_hi[8];
+  x_lo[9] = t_lo[9];
+  x_hi[9] = t_hi[9];
+  x_lo[12] = t_lo[12];
+  x_hi[12] = t_hi[12];
+  x_lo[13] = t_lo[13];
+  x_hi[13] = t_hi[13];
+
+  left[0] = x_lo[0];
+  right[0] = x_hi[0];
+  left[1] = vnegq_s32(x_lo[8]);
+  right[1] = vnegq_s32(x_hi[8]);
+  left[2] = x_lo[12];
+  right[2] = x_hi[12];
+  left[3] = vnegq_s32(x_lo[4]);
+  right[3] = vnegq_s32(x_hi[4]);
+  left[4] = x_lo[6];
+  right[4] = x_hi[6];
+  left[5] = x_lo[14];
+  right[5] = x_hi[14];
+  left[6] = x_lo[10];
+  right[6] = x_hi[10];
+  left[7] = x_lo[2];
+  right[7] = x_hi[2];
+  left[8] = x_lo[3];
+  right[8] = x_hi[3];
+  left[9] = x_lo[11];
+  right[9] = x_hi[11];
+  left[10] = x_lo[15];
+  right[10] = x_hi[15];
+  left[11] = x_lo[7];
+  right[11] = x_hi[7];
+  left[12] = x_lo[5];
+  right[12] = x_hi[5];
+  left[13] = vnegq_s32(x_lo[13]);
+  right[13] = vnegq_s32(x_hi[13]);
+  left[14] = x_lo[9];
+  right[14] = x_hi[9];
+  left[15] = vnegq_s32(x_lo[1]);
+  right[15] = vnegq_s32(x_hi[1]);
+}
+
+static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1,
+                                  int32x4_t *left2, int32x4_t *right2) {
+  // Left half.
+  highbd_fdct16_8col(left1, right1);
+  // Right half.
+  highbd_fdct16_8col(left2, right2);
+  transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1,
+                                   int32x4_t *left2, int32x4_t *right2) {
+  // Left half.
+  highbd_fadst16_8col(left1, right1);
+  // Right half.
+  highbd_fadst16_8col(left2, right2);
+  transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output,
+                              int stride, int tx_type) {
+  int32x4_t left1[16], right1[16], left2[16], right2[16];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break;
+    case ADST_DCT:
+      highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+      highbd_fdct16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      break;
+    case DCT_ADST:
+      highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+      highbd_fdct16x16_neon(left1, right1, left2, right2);
+      highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+      highbd_fadst16x16_neon(left1, right1, left2, right2);
+      highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+      break;
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
new file mode 100644
index 0000000000..d631cd437d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -0,0 +1,356 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_s8(v_sum_diff_total);
+#else
+  const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+  const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+  const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+  const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+                                vget_low_s64(fedcba98_76543210));
+  const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+  return sum_diff;
+#endif
+}
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+    const uint8x16_t v_delta_level_1_and_2,
+    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+  const uint8x16_t v_sig = vld1q_u8(sig);
+  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+  /* Calculate absolute difference and sign masks. */
+  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level that put us in. */
+  const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+  const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+  const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+  /* Calculate absolute adjustments for level 1, 2 and 3. */
+  const uint8x16_t v_level2_adjustment =
+      vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+  const uint8x16_t v_level3_adjustment =
+      vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+  const uint8x16_t v_level1and2_adjustment =
+      vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+  const uint8x16_t v_level1and2and3_adjustment =
+      vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Figure adjustment absolute value by selecting between the absolute
+   * difference if in level0 or the value for level 1, 2 and 3.
+   */
+  const uint8x16_t v_abs_adjustment =
+      vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+  /* Calculate positive and negative adjustments. Apply them to the signal
+   * and accumulate them. Adjustments are less than eight and the maximum
+   * sum of them (7 * 16) can fit in a signed char.
+   */
+  const uint8x16_t v_pos_adjustment =
+      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+  const uint8x16_t v_neg_adjustment =
+      vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+  uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+  /* Store results. */
+  vst1q_u8(running_avg_y, v_running_avg_y);
+
+  /* Sum all the accumulators to have the sum of all pixel differences
+   * for this macroblock.
+   */
+  {
+    const int8x16_t v_sum_diff =
+        vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+                  vreinterpretq_s8_u8(v_neg_adjustment));
+    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+  }
+  return v_sum_diff_total;
+}
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+  uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+  const uint8x16_t v_sig = vld1q_u8(sig);
+  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+  /* Calculate absolute difference and sign masks. */
+  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+  // Clamp absolute difference to delta to get the adjustment.
+  const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+  const uint8x16_t v_pos_adjustment =
+      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+  const uint8x16_t v_neg_adjustment =
+      vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+  v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+  /* Store results. */
+  vst1q_u8(running_avg_y, v_running_avg_y);
+
+  {
+    const int8x16_t v_sum_diff =
+        vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+                  vreinterpretq_s8_u8(v_pos_adjustment));
+    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+  }
+  return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride, uint8_t *running_avg_y,
+                                 int avg_y_stride, int increase_denoising,
+                                 BLOCK_SIZE bs, int motion_magnitude,
+                                 int width) {
+  int sum_diff_thresh, r, sum_diff = 0;
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
+
+  int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+  for (r = 0; r < b_height; ++r) {
+    memcpy(sig_buffer[r], sig, width);
+    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+           width);
+    memcpy(running_buffer[r], running_avg_y, width);
+    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+    v_sum_diff_total = denoiser_16x1_neon(
+        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+        v_level1_threshold, v_level2_threshold, v_level3_threshold,
+        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+        v_sum_diff_total);
+    {
+      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+      const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+      const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+      vst1_u8(running_avg_y, v_running_buffer_low);
+      vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+    }
+    // Update pointers for next iteration.
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
+  }
+
+  {
+    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, that would otherwise not be denoised at all. Simplest
+      // is to apply an additional adjustment to running_avg_y to bring it
+      // closer to sig. The adjustment is capped by a maximum delta, and
+      // chosen such that in most cases the resulting sum_diff will be
+      // within the acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over the
+      // threshold.
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vmovq_n_u8(delta);
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
+          v_sum_diff_total = denoiser_adjust_16x1_neon(
+              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+              v_sum_diff_total);
+          {
+            const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+            const uint8x8_t v_running_buffer_high =
+                vget_high_u8(v_running_buffer);
+            const uint8x8_t v_running_buffer_low =
+                vget_low_u8(v_running_buffer);
+            vst1_u8(running_avg_y, v_running_buffer_low);
+            vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+          }
+          // Update pointers for next iteration.
+          running_avg_y += (avg_y_stride << 1);
+        }
+        sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+
+  return FILTER_BLOCK;
+}
+
+// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
+static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+                                 const uint8_t *mc_running_avg_y,
+                                 int mc_avg_y_stride, uint8_t *running_avg_y,
+                                 int avg_y_stride, int increase_denoising,
+                                 BLOCK_SIZE bs, int motion_magnitude) {
+  const int shift_inc =
+      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+          ? 1
+          : 0;
+  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+  const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
+
+  int8x16_t v_sum_diff_total[4][4];
+  int r, c, sum_diff = 0;
+
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      v_sum_diff_total[c][r] = vdupq_n_s8(0);
+    }
+  }
+
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+          sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+          v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+          v_delta_level_1_and_2, v_delta_level_2_and_3,
+          v_sum_diff_total[c][r >> 4]);
+
+      // Update pointers for next iteration.
+      sig += 16;
+      mc_running_avg_y += 16;
+      running_avg_y += 16;
+    }
+
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+      }
+    }
+
+    // Update pointers for next iteration.
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
+  }
+
+  {
+    const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+    if (abs(sum_diff) > sum_diff_thresh) {
+      const int delta =
+          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        const uint8x16_t k_delta = vdupq_n_u8(delta);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
+        sum_diff = 0;
+
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            v_sum_diff_total[c][r >> 4] =
+                denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+                                          k_delta, v_sum_diff_total[c][r >> 4]);
+
+            // Update pointers for next iteration.
+            sig += 16;
+            mc_running_avg_y += 16;
+            running_avg_y += 16;
+          }
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+            }
+          }
+
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
+        }
+
+        if (abs(sum_diff) > sum_diff_thresh) {
+          return COPY_BLOCK;
+        }
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+  }
+  return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+                             const uint8_t *mc_avg, int mc_avg_stride,
+                             uint8_t *avg, int avg_stride,
+                             int increase_denoising, BLOCK_SIZE bs,
+                             int motion_magnitude) {
+  // Rank by frequency of the block type to have an early termination.
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+    return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+                                 avg_stride, increase_denoising, bs,
+                                 motion_magnitude, 8);
+  }
+  return COPY_BLOCK;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
new file mode 100644
index 0000000000..b82b3f9db5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -0,0 +1,296 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv result;
+  result.as_mv.row = row;
+  result.as_mv.col = col;
+  return result;
+}
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables,   *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
+ * vp9_encoder.c.                                                            *
+ * For the joint cost:                                                       *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
+ * For the component costs:                                                  *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
+ *         (Equal costs for both components)                                 *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
+ *         (Cost function is even)                                           *
+ * If these do not hold, then this function cannot be used without           *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties.                                  *
+ *****************************************************************************/
+int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
+                                const search_site_config *cfg, MV *ref_mv,
+                                uint32_t start_mv_sad, MV *best_mv,
+                                int search_param, int sad_per_bit, int *num00,
+                                const vp9_sad_fn_ptr_t *sad_fn_ptr,
+                                const MV *center_mv) {
+  static const uint32_t data[4] = { 0, 1, 2, 3 };
+  const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
+
+  const int32x4_t zero_s32 = vdupq_n_s32(0);
+  const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+  const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int));
+  const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+  const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int));
+
+  const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit);
+
+  const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]);
+  const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const int_mv fcenter_mv =
+      pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+  const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
+
+  const int ref_row = ref_mv->row;
+  const int ref_col = ref_mv->col;
+
+  int_mv bmv = pack_int_mv(ref_row, ref_col);
+  int_mv new_bmv = bmv;
+  int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const uint8_t *const in_what =
+      x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+  // Work out the start point for the search
+  const uint8_t *best_address = in_what;
+  const uint8_t *new_best_address = best_address;
+#if VPX_ARCH_AARCH64
+  int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+  int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+  // Starting position
+  unsigned int best_sad = start_mv_sad;
+  int i, j, step;
+
+  // Check the prerequisite cost function properties that are easy to check
+  // in an assert. See the function-level documentation for details on all
+  // prerequisites.
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+  *num00 = 0;
+
+  for (i = 0, step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+      int16x8_t v_diff_mv_w;
+      int8x16_t v_inside_d;
+      uint32x4_t v_outside_d;
+      int32x4_t v_cost_d, v_sad_d;
+#if VPX_ARCH_AARCH64
+      int64x2_t v_blocka[2];
+#else
+      int32x4_t v_blocka[1];
+      uint32x2_t horiz_max_0, horiz_max_1;
+#endif
+
+      uint32_t horiz_max;
+      // Compute the candidate motion vectors
+      const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]);
+      const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w);
+      // Clamp them to the search bounds
+      int16x8_t v_these_mv_clamp_w = v_these_mv_w;
+      v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w);
+      v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w);
+      // The ones that did not change are inside the search area
+      v_inside_d = vreinterpretq_s8_u32(
+          vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w),
+                    vreinterpretq_s32_s16(v_these_mv_w)));
+
+      // If none of them are inside, then move on
+#if VPX_ARCH_AARCH64
+      horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
+#else
+      horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
+                             vget_high_u32(vreinterpretq_u32_s8(v_inside_d)));
+      horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0);
+      vst1_lane_u32(&horiz_max, horiz_max_1, 0);
+#endif
+      if (LIKELY(horiz_max == 0)) {
+        continue;
+      }
+
+      // The inverse mask indicates which of the MVs are outside
+      v_outside_d =
+          vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff)));
+      // Shift right to keep the sign bit clear, we will use this later
+      // to set the cost to the maximum value.
+      v_outside_d = vshrq_n_u32(v_outside_d, 1);
+
+      // Compute the difference MV
+      v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only a half of the table
+      // is ever referenced.
+      v_diff_mv_w = vabsq_s16(v_diff_mv_w);
+
+      // Compute the SIMD pointer offsets.
+      {
+#if VPX_ARCH_AARCH64  //  sizeof(intptr_t) == 8
+        // Load the offsets
+        int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
+        int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
+        // Set the ones falling outside to zero
+        v_bo10_q = vandq_s64(
+            v_bo10_q,
+            vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d))));
+        v_bo32_q = vandq_s64(
+            v_bo32_q,
+            vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d))));
+        // Compute the candidate addresses
+        v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q);
+        v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q);
+#else  // sizeof(intptr_t) == 4
+        int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]);
+        v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d));
+        v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d);
+#endif
+      }
+
+      sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+                         in_what_stride, (uint32_t *)&v_sad_d);
+
+      // Look up the component cost of the residual motion vector
+      {
+        uint32_t cost[4];
+        DECLARE_ALIGNED(16, int16_t, rowcol[8]);
+        vst1q_s16(rowcol, v_diff_mv_w);
+
+        // Note: This is a use case for gather instruction
+        cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]];
+        cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]];
+        cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]];
+        cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]];
+
+        v_cost_d = vld1q_s32((int32_t *)cost);
+      }
+
+      // Now add in the joint cost
+      {
+        const uint32x4_t v_sel_d =
+            vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32);
+        const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8(
+            vbslq_u8(vreinterpretq_u8_u32(v_sel_d),
+                     vreinterpretq_u8_s32(v_joint_cost_0_d),
+                     vreinterpretq_u8_s32(v_joint_cost_1_d)));
+        v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d);
+      }
+
+      // Multiply by sad_per_bit
+      v_cost_d = vmulq_s32(v_cost_d, v_spb_d);
+      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+      v_cost_d =
+          vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1)));
+      v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT);
+      // Add the cost to the sad
+      v_sad_d = vaddq_s32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask, this way the minimum search won't
+      // pick them.
+      v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d));
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        uint32_t local_best_sad;
+#if VPX_ARCH_AARCH64
+        local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
+#else
+        uint32x2_t horiz_min_0 =
+            vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)),
+                     vget_high_u32(vreinterpretq_u32_s32(v_sad_d)));
+        uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+        vst1_lane_u32(&local_best_sad, horiz_min_1, 0);
+#endif
+
+        // Update the global minimum if the local minimum is smaller
+        if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+          uint32_t local_best_idx;
+          const uint32x4_t v_sel_d =
+              vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad));
+          uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
+          v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
+
+#if VPX_ARCH_AARCH64
+          local_best_idx = vminvq_u32(v_mask_d);
+#else
+          horiz_min_0 =
+              vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d));
+          horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+          vst1_lane_u32(&local_best_idx, horiz_min_1, 0);
+#endif
+
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;
+    best_address = new_best_address;
+
+    v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+#if VPX_ARCH_AARCH64
+    v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+    v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+
+    if (UNLIKELY(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;
+  return best_sad;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
index 1c7503139e..0cf0bf250e 100644
--- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -12,30 +12,91 @@
 #include <assert.h>
 
 #include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
 
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
-                                int block_size) {
-  int64x2_t error = vdupq_n_s64(0);
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
 
-  assert(block_size >= 8);
-  assert((block_size % 8) == 0);
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
 
   do {
-    const int16x8_t c = vld1q_s16(coeff);
-    const int16x8_t d = vld1q_s16(dqcoeff);
-    const int16x8_t diff = vsubq_s16(c, d);
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    uint32x4_t err;
+    int32x4_t ssz0, ssz1;
+
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+    const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+    // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before
     // accumulating them in 64-bits.
-    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
-    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
-    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
-    error = vaddq_s64(error, err2);
-    coeff += 8;
-    dqcoeff += 8;
-    block_size -= 8;
+    err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+    err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+    err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+    err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+    err_u64 = vpadalq_u32(err_u64, err);
+
+    // We can't do the same here as we're operating on signed integers, so we
+    // can store 2 15-bit diff before accumulating into 64-bits.
+    ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+    ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+    ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+    ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+    ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+    ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
   } while (block_size != 0);
 
-  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+  *ssz = horizontal_add_int64x2(ssz_s64);
+  return (int64_t)horizontal_add_uint64x2(err_u64);
+}
+
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff,
+                                const tran_low_t *dqcoeff, int block_size) {
+  uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    uint32x4_t err0, err1;
+
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+    const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+    // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits
+    // before accumulating them in 64-bits. However splitting into 2 mull, mlal
+    // pairs is beneficial since it allows us to use both Neon
+    // multiply-accumulate pipes - on CPUs that have them - rather than having
+    // a single chain of 4 instructions executing serially.
+    err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+    err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0));
+    err_u64[0] = vpadalq_u32(err_u64[0], err0);
+
+    err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1));
+    err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1));
+    err_u64[1] = vpadalq_u32(err_u64[1], err1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1]));
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
new file mode 100644
index 0000000000..78e7361d85
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c
@@ -0,0 +1,78 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz) {
+  int64x2_t err_v = vdupq_n_s64(0);
+  int64x2_t ssz_v = vdupq_n_s64(0);
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const int16x8_t diff0 = vabdq_s16(c0, d0);
+    const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+    err_v = vpx_dotq_s16(err_v, diff0, diff0);
+    err_v = vpx_dotq_s16(err_v, diff1, diff1);
+
+    ssz_v = vpx_dotq_s16(ssz_v, c0, c0);
+    ssz_v = vpx_dotq_s16(ssz_v, c1, c1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  *ssz = horizontal_add_int64x2(ssz_v);
+  return horizontal_add_int64x2(err_v);
+}
+
+int64_t vp9_block_error_fp_sve(const tran_low_t *coeff,
+                               const tran_low_t *dqcoeff, int block_size) {
+  int64x2_t err = vdupq_n_s64(0);
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+
+    const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+    const int16x8_t diff0 = vabdq_s16(c0, d0);
+    const int16x8_t diff1 = vabdq_s16(c1, d1);
+
+    err = vpx_dotq_s16(err, diff0, diff0);
+    err = vpx_dotq_s16(err, diff1, diff1);
+
+    coeff += 16;
+    dqcoeff += 16;
+    block_size -= 16;
+  } while (block_size != 0);
+
+  return horizontal_add_int64x2(err);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
new file mode 100644
index 0000000000..bc8dd4a341
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -0,0 +1,844 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_scale/yv12config.h"
+
+// Note: The scaling functions could write extra rows and columns in dst, which
+// exceed the right and bottom boundaries of the destination frame. We rely on
+// the following frame extension function to fix these rows and columns.
+
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int max_width = (w + 15) & ~15;
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      const uint8x16x2_t s = vld2q_u8(src);
+      vst1q_u8(dst, s.val[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                              const int src_stride,
+                                              uint8_t *dst,
+                                              const int dst_stride, const int w,
+                                              const int h) {
+  const int max_width = (w + 15) & ~15;
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      const uint8x16x4_t s = vld4q_u8(src);
+      vst1q_u8(dst, s.val[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_bilinear_kernel(
+    const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
+    const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
+    uint8_t *const dst) {
+  const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
+  const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
+  const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
+  const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
+  const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
+  const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
+  const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
+  const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);
+
+  const uint8x8_t hor0 = vrshrn_n_u16(h4, 7);  // temp: 00 01 02 03 04 05 06 07
+  const uint8x8_t hor1 = vrshrn_n_u16(h5, 7);  // temp: 08 09 0A 0B 0C 0D 0E 0F
+  const uint8x8_t hor2 = vrshrn_n_u16(h6, 7);  // temp: 10 11 12 13 14 15 16 17
+  const uint8x8_t hor3 = vrshrn_n_u16(h7, 7);  // temp: 18 19 1A 1B 1C 1D 1E 1F
+  const uint16x8_t v0 = vmull_u8(hor0, coef0);
+  const uint16x8_t v1 = vmull_u8(hor1, coef0);
+  const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
+  const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
+  // dst: 0 1 2 3 4 5 6 7  8 9 A B C D E F
+  const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
+  vst1q_u8(dst, d);
+}
+
+static INLINE void scale_plane_2_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int max_width = (w + 15) & ~15;
+  const uint8_t *src0 = src;
+  const uint8_t *src1 = src + src_stride;
+  const uint8x8_t coef0 = vdup_n_u8(c0);
+  const uint8x8_t coef1 = vdup_n_u8(c1);
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      // 000 002 004 006 008 00A 00C 00E  010 012 014 016 018 01A 01C 01E
+      // 001 003 005 007 009 00B 00D 00F  011 013 015 017 019 01B 01D 01F
+      const uint8x16x2_t s0 = vld2q_u8(src0);
+      // 100 102 104 106 108 10A 10C 10E  110 112 114 116 118 11A 11C 11E
+      // 101 103 105 107 109 10B 10D 10F  111 113 115 117 119 11B 11D 11F
+      const uint8x16x2_t s1 = vld2q_u8(src1);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
+      src0 += 32;
+      src1 += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src0 += 2 * (src_stride - max_width);
+    src1 += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_bilinear(
+    const uint8_t *const src, const int src_stride, uint8_t *dst,
+    const int dst_stride, const int w, const int h, const int16_t c0,
+    const int16_t c1) {
+  const int max_width = (w + 15) & ~15;
+  const uint8_t *src0 = src;
+  const uint8_t *src1 = src + src_stride;
+  const uint8x8_t coef0 = vdup_n_u8(c0);
+  const uint8x8_t coef1 = vdup_n_u8(c1);
+  int y = h;
+
+  assert(w && h);
+
+  do {
+    int x = max_width;
+    do {
+      // (*) -- useless
+      // 000 004 008 00C 010 014 018 01C  020 024 028 02C 030 034 038 03C
+      // 001 005 009 00D 011 015 019 01D  021 025 029 02D 031 035 039 03D
+      // 002 006 00A 00E 012 016 01A 01E  022 026 02A 02E 032 036 03A 03E (*)
+      // 003 007 00B 00F 013 017 01B 01F  023 027 02B 02F 033 037 03B 03F (*)
+      const uint8x16x4_t s0 = vld4q_u8(src0);
+      // 100 104 108 10C 110 114 118 11C  120 124 128 12C 130 134 138 13C
+      // 101 105 109 10D 111 115 119 11D  121 125 129 12D 131 135 139 13D
+      // 102 106 10A 10E 112 116 11A 11E  122 126 12A 12E 132 136 13A 13E (*)
+      // 103 107 10B 10F 113 117 11B 11F  123 127 12B 12F 133 137 13B 13F (*)
+      const uint8x16x4_t s1 = vld4q_u8(src1);
+      scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+                                  coef0, coef1, dst);
+      src0 += 64;
+      src1 += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src0 += 4 * (src_stride - max_width);
+    src1 += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
+                                              const uint8x8_t *const coef) {
+  const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
+  const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);
+
+  return vrshrn_n_u16(h1, 7);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  const int16x8_t filters = vld1q_s16(coef);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[14], d[4];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  // Note: processing 4x8 is about 20% faster than processing row by row using
+  // vld4_u8().
+  do {
+    load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                  &s[12], &s[13]);
+      transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                       &s[13]);
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
+      d[1] = scale_filter_8(&s[2], filters);  // 01 11 21 31 41 51 61 71
+      d[2] = scale_filter_8(&s[4], filters);  // 02 12 22 32 42 52 62 72
+      d[3] = scale_filter_8(&s[6], filters);  // 03 13 23 33 43 53 63 73
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+      vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
+                    0);
+      vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
+                    1);
+      vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
+                    1);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+
+      t += 4;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 7 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+                  &s[12], &s[13]);
+      t += 8 * width_hor;
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[2], filters);  // 10 11 12 13 14 15 16 17
+      d[2] = scale_filter_8(&s[4], filters);  // 20 21 22 23 24 25 26 27
+      d[3] = scale_filter_8(&s[6], filters);  // 30 31 32 33 34 35 36 37
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  const int16x8_t filters = vld1q_s16(coef);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[12], d[2];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  // Note: processing 2x8 is about 20% faster than processing row by row using
+  // vld4_u8().
+  do {
+    load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+    x = width_hor;
+
+    do {
+      uint8x8x2_t dd;
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                  &s[10], &s[11]);
+      transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
+                       &s[11]);
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 10 20 30 40 50 60 70
+      d[1] = scale_filter_8(&s[4], filters);  // 01 11 21 31 41 51 61 71
+      // dd.val[0]: 00 01 20 21 40 41 60 61
+      // dd.val[1]: 10 11 30 31 50 51 70 71
+      dd = vtrn_u8(d[0], d[1]);
+      vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 0);
+      vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 0);
+      vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 1);
+      vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 1);
+      vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 2);
+      vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 2);
+      vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
+                    vreinterpret_u16_u8(dd.val[0]), 3);
+      vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
+                    vreinterpret_u16_u8(dd.val[1]), 3);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+
+      t += 2;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 7 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+                  &s[10], &s[11]);
+      t += 8 * width_hor;
+
+      d[0] = scale_filter_8(&s[0], filters);  // 00 01 02 03 04 05 06 07
+      d[1] = scale_filter_8(&s[4], filters);  // 10 11 12 13 14 15 16 17
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+// Notes for 4 to 3 scaling:
+//
+// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
+// multiple of 6, and no less than w.
+//
+// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
+// multiple of 8, and no less than w.
+//
+// 3. 8 columns are calculated in each horizontal inner loop for further
+// vertical scaling, so height_hor must be multiple of 8, and no less than
+// 4 * h / 3.
+//
+// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
+// be multiple of 6, and no less than h.
+//
+// 5. The physical location of the last row of the 4 to 3 scaled frame is
+// decided by phase_scaler, and are always less than 1 pixel below the last row
+// of the original image.
+
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+                                        const int src_stride, uint8_t *dst,
+                                        const int dst_stride, const int w,
+                                        const int h, const int phase_scaler,
+                                        uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = width_hor + 2;  // store 2 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We only need 1 extra row below because there are only 2 bilinear
+  // coefficients.
+  const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[9], d[8], c[6];
+
+  assert(w && h);
+
+  c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]);
+  c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]);
+  c[2] = vdup_n_u8(
+      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
+                                            SUBPEL_MASK][3]);
+  c[3] = vdup_n_u8(
+      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
+                                            SUBPEL_MASK][4]);
+  c[4] = vdup_n_u8(
+      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
+                                            SUBPEL_MASK][3]);
+  c[5] = vdup_n_u8(
+      (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
+                                            SUBPEL_MASK][4]);
+
+  d[6] = vdup_n_u8(0);
+  d[7] = vdup_n_u8(0);
+
+  // horizontal 6x8
+  do {
+    load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    src += 1;
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                  &s[7], &s[8]);
+      src += 8;
+      transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = scale_filter_bilinear(&s[0], &c[0]);
+      d[1] =
+          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+      d[2] =
+          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+      d[3] = scale_filter_bilinear(&s[4], &c[0]);
+      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+                                   &c[2]);
+      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+                                   &c[4]);
+
+      // 00 01 02 03 04 05 xx xx
+      // 10 11 12 13 14 15 xx xx
+      // 20 21 22 23 24 25 xx xx
+      // 30 31 32 33 34 35 xx xx
+      // 40 41 42 43 44 45 xx xx
+      // 50 51 52 53 54 55 xx xx
+      // 60 61 62 63 64 65 xx xx
+      // 70 71 72 73 74 75 xx xx
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      // store 2 extra pixels
+      vst1_u8(t + 0 * stride_hor, d[0]);
+      vst1_u8(t + 1 * stride_hor, d[1]);
+      vst1_u8(t + 2 * stride_hor, d[2]);
+      vst1_u8(t + 3 * stride_hor, d[3]);
+      vst1_u8(t + 4 * stride_hor, d[4]);
+      vst1_u8(t + 5 * stride_hor, d[5]);
+      vst1_u8(t + 6 * stride_hor, d[6]);
+      vst1_u8(t + 7 * stride_hor, d[7]);
+
+      s[0] = s[8];
+
+      t += 6;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3 - 1;
+    t += 7 * stride_hor + 2;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += stride_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                  &s[7], &s[8]);
+      t += 8 * stride_hor;
+
+      d[0] = scale_filter_bilinear(&s[0], &c[0]);
+      d[1] =
+          scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+      d[2] =
+          scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+      d[3] = scale_filter_bilinear(&s[4], &c[0]);
+      d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+                                   &c[2]);
+      d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+                                   &c[4]);
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+      vst1_u8(dst + 4 * dst_stride, d[4]);
+      vst1_u8(dst + 5 * dst_stride, d[5]);
+
+      s[0] = s[8];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * (4 * height_ver / 3 + 1);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = width_hor + 2;  // store 2 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  const int16x8_t filters0 =
+      vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
+  const int16x8_t filters1 =
+      vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
+  const int16x8_t filters2 =
+      vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  uint8x8_t s[15], d[8];
+
+  assert(w && h);
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
+  d[6] = vdup_n_u8(0);
+  d[7] = vdup_n_u8(0);
+
+  // horizontal 6x8
+  do {
+    load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                &s[6], &s[7]);
+    transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                  &s[13], &s[14]);
+      transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
+                       &s[14]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = scale_filter_8(&s[0], filters0);
+      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+      d[3] = scale_filter_8(&s[4], filters0);
+      d[4] =
+          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+      d[5] =
+          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+
+      // 00 01 02 03 04 05 xx xx
+      // 10 11 12 13 14 15 xx xx
+      // 20 21 22 23 24 25 xx xx
+      // 30 31 32 33 34 35 xx xx
+      // 40 41 42 43 44 45 xx xx
+      // 50 51 52 53 54 55 xx xx
+      // 60 61 62 63 64 65 xx xx
+      // 70 71 72 73 74 75 xx xx
+      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+      // store 2 extra pixels
+      vst1_u8(t + 0 * stride_hor, d[0]);
+      vst1_u8(t + 1 * stride_hor, d[1]);
+      vst1_u8(t + 2 * stride_hor, d[2]);
+      vst1_u8(t + 3 * stride_hor, d[3]);
+      vst1_u8(t + 4 * stride_hor, d[4]);
+      vst1_u8(t + 5 * stride_hor, d[5]);
+      vst1_u8(t + 6 * stride_hor, d[6]);
+      vst1_u8(t + 7 * stride_hor, d[7]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+      s[6] = s[14];
+
+      t += 6;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 7 * stride_hor + 2;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+                &s[7]);
+    t += 7 * stride_hor;
+    y = height_ver;
+
+    do {
+      load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+                  &s[13], &s[14]);
+      t += 8 * stride_hor;
+
+      d[0] = scale_filter_8(&s[0], filters0);
+      d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+      d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+      d[3] = scale_filter_8(&s[4], filters0);
+      d[4] =
+          scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+      d[5] =
+          scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+      vst1_u8(dst + 0 * dst_stride, d[0]);
+      vst1_u8(dst + 1 * dst_stride, d[1]);
+      vst1_u8(dst + 2 * dst_stride, d[2]);
+      vst1_u8(dst + 3 * dst_stride, d[3]);
+      vst1_u8(dst + 4 * dst_stride, d[4]);
+      vst1_u8(dst + 5 * dst_stride, d[5]);
+
+      s[0] = s[8];
+      s[1] = s[9];
+      s[2] = s[10];
+      s[3] = s[11];
+      s[4] = s[12];
+      s[5] = s[13];
+      s[6] = s[14];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * (4 * height_ver / 3 + 7);
+    t += 8;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
+                                     YV12_BUFFER_CONFIG *dst,
+                                     INTERP_FILTER filter_type,
+                                     int phase_scaler) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const int dst_uv_w = dst->uv_crop_width;
+  const int dst_uv_h = dst->uv_crop_height;
+  int scaled = 0;
+
+  // phase_scaler is usually 0 or 8.
+  assert(phase_scaler >= 0 && phase_scaler < 16);
+
+  if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+    // 2 to 1
+    scaled = 1;
+    if (phase_scaler == 0) {
+      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0, c1);
+      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+    } else {
+      const int buffer_stride = (dst_w + 3) & ~3;
+      const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scale_plane_2_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_2_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_2_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+    // 4 to 1
+    scaled = 1;
+    if (phase_scaler == 0) {
+      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0, c1);
+      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+    } else {
+      const int buffer_stride = (dst_w + 1) & ~1;
+      const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scale_plane_4_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_4_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_4_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+    // 4 to 3
+    const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+    const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+    uint8_t *const temp_buffer =
+        (uint8_t *)malloc(buffer_stride * buffer_height);
+    if (temp_buffer) {
+      scaled = 1;
+      if (filter_type == BILINEAR) {
+        scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                    dst->y_stride, dst_w, dst_h, phase_scaler,
+                                    temp_buffer);
+        scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride,
+                                    dst->u_buffer, dst->uv_stride, dst_uv_w,
+                                    dst_uv_h, phase_scaler, temp_buffer);
+        scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride,
+                                    dst->v_buffer, dst->uv_stride, dst_uv_w,
+                                    dst_uv_h, phase_scaler, temp_buffer);
+      } else {
+        scale_plane_4_to_3_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+        scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                   dst->uv_stride, dst_uv_w, dst_uv_h,
+                                   vp9_filter_kernels[filter_type],
+                                   phase_scaler, temp_buffer);
+        scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                   dst->uv_stride, dst_uv_w, dst_uv_h,
+                                   vp9_filter_kernels[filter_type],
+                                   phase_scaler, temp_buffer);
+      }
+      free(temp_buffer);
+    }
+  }
+
+  if (scaled) {
+    vpx_extend_frame_borders(dst);
+  } else {
+    // Call c version for all other scaling ratios.
+    vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
new file mode 100644
index 0000000000..d9b183472d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
+  uint64x2_t err_u64 = vdupq_n_u64(0);
+  int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+  const int shift = 2 * (bd - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  assert(block_size >= 16);
+  assert((block_size % 16) == 0);
+
+  do {
+    const int32x4_t c = load_tran_low_to_s32q(coeff);
+    const int32x4_t d = load_tran_low_to_s32q(dqcoeff);
+
+    const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+    err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+    err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+    ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+    ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+    coeff += 4;
+    dqcoeff += 4;
+    block_size -= 4;
+  } while (block_size != 0);
+
+  *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift;
+  return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..f990a747ed
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
@@ -0,0 +1,1076 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const uint16x8_t a_reg = vld1q_u16(a);
+  const uint16x8_t b_reg = vld1q_u16(b);
+
+  uint16x8_t dist = vabdq_u16(a_reg, b_reg);
+  uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist));
+  uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist));
+
+  vst1q_u32(dst, dist_first);
+  vst1q_u32(dst + 4, dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) {
+  uint32x4_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u32(dist);
+  dist_left = vld1q_u32(dist - 1);
+  dist_right = vld1q_u32(dist + 1);
+
+  *sum = vaddq_u32(dist_reg, dist_left);
+  *sum = vaddq_u32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first,
+                                    uint32x4_t *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from y/uv plane are).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum,
+                                    const uint32x4_t *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32);
+  const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32);
+  const uint32x4_t weight_u32 = vdupq_n_u32(weight);
+  const uint32x4_t sixteen = vdupq_n_u32(16);
+  uint32x4_t sum2;
+
+  // modifier * 3 / index;
+  uint64x2_t sum_lo =
+      vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants));
+  uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum),
+                                vget_high_u32(*mul_constants));
+
+  // we cannot use vshrn_n_u64 as strength is not known at compile time.
+  sum_lo = vshlq_u64(sum_lo, strength_s64);
+  sum_hi = vshlq_u64(sum_hi, strength_s64);
+
+  sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi));
+
+  // Multiply with the weight
+  sum2 = vminq_u32(sum2, sixteen);
+  sum2 = vsubq_u32(sixteen, sum2);
+  *output = vmulq_u32(sum2, weight_u32);
+}
+
+static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1,
+                                    const uint32x4_t sum_0_u32,
+                                    const uint32x4_t sum_1_u32,
+                                    const uint32x4_t *mul_constants_0,
+                                    const uint32x4_t *mul_constants_1,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+                   weight);
+  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+                   weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(
+    const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32,
+    const uint16_t *pred, uint16_t *count, uint32_t *accumulator) {
+  const uint16x8_t sum_u16 =
+      vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32));
+  uint16x8_t pred_u16 = vld1q_u16(pred);
+  uint16x8_t count_u16 = vld1q_u16(count);
+  uint32x4_t pred_0_u32, pred_1_u32;
+  uint32x4_t accum_0_u32, accum_1_u32;
+
+  count_u16 = vqaddq_u16(count_u16, sum_u16);
+  vst1q_u16(count, count_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+
+  pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16));
+  pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16));
+
+  // Don't use sum_u16 as that produces different results to the C version
+  accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32);
+  accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32);
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist,
+                                      uint32x4_t *dist_reg) {
+  *dist_reg = vld1q_u32(dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist,
+                                      uint32x4_t *reg_first,
+                                      uint32x4_t *reg_second) {
+  highbd_read_dist_4(dist, reg_first);
+  highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist,
+    uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first,
+    uint32x4_t *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 8 entries from chroma.
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    uint32x4_t u_reg, v_reg;
+    uint32x4x2_t pair;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    pair = vzipq_u32(u_reg, u_reg);
+    *u_first = pair.val[0];
+    *u_second = pair.val[1];
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    pair = vzipq_u32(v_reg, v_reg);
+    *v_first = pair.val[0];
+    *v_second = pair.val[1];
+  }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  uint32x4_t mul_first, mul_second;
+
+  uint32x4_t sum_row_1_first, sum_row_1_second;
+  uint32x4_t sum_row_2_first, sum_row_2_second;
+  uint32x4_t sum_row_3_first, sum_row_3_second;
+
+  uint32x4_t u_first, u_second;
+  uint32x4_t v_first, v_second;
+
+  uint32x4_t sum_row_first;
+  uint32x4_t sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = vld1q_u32(neighbors_first[0]);
+  mul_second = vld1q_u32(neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = vaddq_u32(sum_row_first, u_first);
+  sum_row_second = vaddq_u32(sum_row_second, u_second);
+
+  sum_row_first = vaddq_u32(sum_row_first, v_first);
+  sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                   sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = vld1q_u32(neighbors_first[1]);
+  mul_second = vld1q_u32(neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first);
+    sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                    &v_first, &v_second);
+
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = vaddq_u32(sum_row_first, u_first);
+    sum_row_second = vaddq_u32(sum_row_second, u_second);
+    sum_row_first = vaddq_u32(sum_row_first, v_first);
+    sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+    // Get modifier and store result
+    highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                     sum_row_second, &mul_first, &mul_second, strength,
+                     rounding, weight);
+    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                  y_accum);
+
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = vld1q_u32(neighbors_first[0]);
+  mul_second = vld1q_u32(neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+  sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                  &v_first, &v_second);
+  }
+
+  sum_row_first = vaddq_u32(sum_row_first, u_first);
+  sum_row_second = vaddq_u32(sum_row_second, u_second);
+  sum_row_first = vaddq_u32(sum_row_first, v_first);
+  sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+                   sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void highbd_apply_temporal_filter_luma(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_first;
+  const uint32_t *const *neighbors_second;
+
+  // Left
+  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  // Right
+  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+  highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in x direction, then we have 16 lumas, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+    const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst,
+    uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) {
+  uint32x4_t y_reg_fst, y_reg_snd;
+  if (!ss_x) {
+    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+    if (ss_y == 1) {
+      uint32x4_t y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+      y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst);
+      y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd);
+    }
+  } else {
+    // Temporary
+    uint32x4_t y_fst, y_snd;
+    uint64x2_t y_fst64, y_snd64;
+
+    // First 8
+    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      uint32x4_t y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = vaddq_u32(y_fst, y_tmp_fst);
+      y_snd = vaddq_u32(y_snd, y_tmp_snd);
+    }
+
+    y_fst64 = vpaddlq_u32(y_fst);
+    y_snd64 = vpaddlq_u32(y_snd);
+    y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+
+    // Second 8
+    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      uint32x4_t y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = vaddq_u32(y_fst, y_tmp_fst);
+      y_snd = vaddq_u32(y_snd, y_tmp_snd);
+    }
+
+    y_fst64 = vpaddlq_u32(y_fst);
+    y_snd64 = vpaddlq_u32(y_snd);
+    y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+  }
+
+  *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst);
+  *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd);
+  *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst);
+  *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void highbd_apply_temporal_filter_chroma_8(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+    int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+    int top_weight, int bottom_weight, const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  uint32x4_t mul_fst, mul_snd;
+
+  uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+  uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+  uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+  uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+  uint32x4_t u_sum_row_fst, v_sum_row_fst;
+  uint32x4_t u_sum_row_snd, v_sum_row_snd;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row
+  mul_fst = vld1q_u32(neighbors_fst[0]);
+  mul_snd = vld1q_u32(neighbors_snd[0]);
+
+  // Add chroma values
+  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+  u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst);
+  u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+  v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst);
+  v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+                     u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+                     v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul_fst = vld1q_u32(neighbors_fst[1]);
+  mul_snd = vld1q_u32(neighbors_snd[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1_fst = u_sum_row_2_fst;
+    u_sum_row_2_fst = u_sum_row_3_fst;
+    u_sum_row_1_snd = u_sum_row_2_snd;
+    u_sum_row_2_snd = u_sum_row_3_snd;
+
+    v_sum_row_1_fst = v_sum_row_2_fst;
+    v_sum_row_2_fst = v_sum_row_3_fst;
+    v_sum_row_1_snd = v_sum_row_2_snd;
+    v_sum_row_2_snd = v_sum_row_3_snd;
+
+    // Add chroma values
+    u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+    u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+    u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst);
+    u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd);
+
+    v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+    v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+    v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst);
+    v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd);
+
+    // Add luma values
+    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                         &u_sum_row_snd, &v_sum_row_fst,
+                                         &v_sum_row_snd);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+      highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+    } else {
+      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+                       u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+                       v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+    }
+
+    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                  u_accum);
+    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                  v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul_fst = vld1q_u32(neighbors_fst[0]);
+  mul_snd = vld1q_u32(neighbors_snd[0]);
+
+  // Shift the rows up
+  u_sum_row_1_fst = u_sum_row_2_fst;
+  u_sum_row_2_fst = u_sum_row_3_fst;
+  u_sum_row_1_snd = u_sum_row_2_snd;
+  u_sum_row_2_snd = u_sum_row_3_snd;
+
+  v_sum_row_1_fst = v_sum_row_2_fst;
+  v_sum_row_2_fst = v_sum_row_3_fst;
+  v_sum_row_1_snd = v_sum_row_2_snd;
+  v_sum_row_2_snd = v_sum_row_3_snd;
+
+  // Add chroma values
+  u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+  v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+  u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+  v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+                     u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+                     v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void highbd_apply_temporal_filter_chroma(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_fst;
+  const uint32_t *const *neighbors_snd;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    } else {
+      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+    } else {
+      highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
+void vp9_highbd_apply_temporal_filter_neon(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                          y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                          u_dist_ptr + blk_col);
+      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                          v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+                                    block_height, ss_x, ss_y, strength, blk_fw,
+                                    use_whole_blk, y_accum, y_count, y_dist_ptr,
+                                    u_dist_ptr, v_dist_ptr);
+
+  highbd_apply_temporal_filter_chroma(
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
+
+static INLINE uint16x8_t highbd_convolve12_8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
+    const int16x8_t s9, const int16x8_t sA, const int16x8_t sB,
+    const int16x8_t filter_0_7, const int16x4_t filter_8_11, uint16x8_t max) {
+  const int16x4_t filter_0_3 = vget_low_s16(filter_0_7);
+  const int16x4_t filter_4_7 = vget_high_s16(filter_0_7);
+
+  int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), filter_0_3, 0);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), filter_0_3, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), filter_0_3, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), filter_0_3, 3);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s4), filter_4_7, 0);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s5), filter_4_7, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s6), filter_4_7, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s7), filter_4_7, 3);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s8), filter_8_11, 0);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s9), filter_8_11, 1);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(sA), filter_8_11, 2);
+  sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(sB), filter_8_11, 3);
+
+  int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), filter_0_3, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), filter_0_3, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), filter_0_3, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), filter_0_3, 3);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s4), filter_4_7, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s5), filter_4_7, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s6), filter_4_7, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s7), filter_4_7, 3);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s8), filter_8_11, 0);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s9), filter_8_11, 1);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(sA), filter_8_11, 2);
+  sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(sB), filter_8_11, 3);
+
+  uint16x4_t sum_lo_s16 = vqrshrun_n_s32(sum_lo, FILTER_BITS);
+  uint16x4_t sum_hi_s16 = vqrshrun_n_s32(sum_hi, FILTER_BITS);
+
+  uint16x8_t sum = vcombine_u16(sum_lo_s16, sum_hi_s16);
+  return vminq_u16(sum, max);
+}
+
+void vpx_highbd_convolve12_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint16_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel12 *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h, int bd) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 0);
+
+  const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]);
+  const int16x4_t filter_8_11 = vld1_s16(filter[x0_q4] + 8);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int width = w;
+
+    do {
+      int16x8_t s0[12], s1[12];
+      load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                    &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10],
+                    &s0[11]);
+      load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                    &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10],
+                    &s1[11]);
+
+      uint16x8_t d0 = highbd_convolve12_8(
+          s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], s0[8], s0[9],
+          s0[10], s0[11], filter_0_7, filter_8_11, max);
+      uint16x8_t d1 = highbd_convolve12_8(
+          s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], s1[8], s1[9],
+          s1[10], s1[11], filter_0_7, filter_8_11, max);
+
+      vst1q_u16(d + 0 * dst_stride, d0);
+      vst1q_u16(d + 1 * dst_stride, d1);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 2 * src_stride;
+    dst += 2 * dst_stride;
+    h -= 2;
+  } while (h != 0);
+}
+
+void vpx_highbd_convolve12_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel12 *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
+                                     int w, int h, int bd) {
+  // Scaling not supported by Neon implementation.
+  if (y_step_q4 != 16) {
+    vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]);
+  const int16x4_t filter_8_11 = vld1_s16(filter[y0_q4] + 8);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
+    load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                  &s9, &sA);
+    s += 11 * src_stride;
+
+    do {
+      int16x8_t sB, sC, sD, sE;
+      load_s16_8x4(s, src_stride, &sB, &sC, &sD, &sE);
+
+      uint16x8_t d0 =
+          highbd_convolve12_8(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB,
+                              filter_0_7, filter_8_11, max);
+      uint16x8_t d1 =
+          highbd_convolve12_8(s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC,
+                              filter_0_7, filter_8_11, max);
+      uint16x8_t d2 =
+          highbd_convolve12_8(s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD,
+                              filter_0_7, filter_8_11, max);
+      uint16x8_t d3 =
+          highbd_convolve12_8(s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE,
+                              filter_0_7, filter_8_11, max);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = sA;
+      s7 = sB;
+      s8 = sC;
+      s9 = sD;
+      sA = sE;
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_highbd_convolve12_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel12 *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h, int bd) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  DECLARE_ALIGNED(32, uint16_t, im_block[BW * (BH + MAX_FILTER_TAP)]);
+
+  const int im_stride = BW;
+  // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
+  // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 2.)
+  const int im_height = h + MAX_FILTER_TAP;
+  const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;
+
+  // Filter starting border_offset rows up.
+  vpx_highbd_convolve12_horiz_neon(
+      src - src_stride * border_offset, src_stride, im_block, im_stride, filter,
+      x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd);
+
+  vpx_highbd_convolve12_vert_neon(im_block + im_stride * border_offset,
+                                  im_stride, dst, dst_stride, filter, x0_q4,
+                                  x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c
new file mode 100644
index 0000000000..73660eb4ac
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c
@@ -0,0 +1,285 @@
+/*
+ *  Copyright (c) 2025 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <arm_neon_sve_bridge.h>
+#include <arm_sve.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, kDotProdPermuteTbl[32]) = {
+  // clang-format off
+  0,  1,  2,  3,  1,  2,  3,  4,
+  2,  3,  4,  5,  3,  4,  5,  6,
+  4,  5,  6,  7,  5,  6,  7,  0,
+  6,  7,  0,  1,  7,  0,  1,  2,
+  // clang-format on
+};
+
+static INLINE uint16x8_t highbd_convolve12_8_h(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t filter_0_7, const int16x8_t filter_4_11,
+    const uint16x8x4_t perm_tbl, const uint16x8_t max) {
+  int16x8_t perm_samples[8];
+
+  perm_samples[0] = vpx_tbl_s16(s0, perm_tbl.val[0]);
+  perm_samples[1] = vpx_tbl_s16(s0, perm_tbl.val[1]);
+  perm_samples[2] = vpx_tbl2_s16(s0, s1, perm_tbl.val[2]);
+  perm_samples[3] = vpx_tbl2_s16(s0, s1, perm_tbl.val[3]);
+  perm_samples[4] = vpx_tbl_s16(s1, perm_tbl.val[0]);
+  perm_samples[5] = vpx_tbl_s16(s1, perm_tbl.val[1]);
+  perm_samples[6] = vpx_tbl2_s16(s1, s2, perm_tbl.val[2]);
+  perm_samples[7] = vpx_tbl2_s16(s1, s2, perm_tbl.val[3]);
+
+  int64x2_t sum01 =
+      vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[0], filter_0_7, 0);
+  sum01 = vpx_dotq_lane_s16(sum01, perm_samples[2], filter_0_7, 1);
+  sum01 = vpx_dotq_lane_s16(sum01, perm_samples[4], filter_4_11, 1);
+
+  int64x2_t sum23 =
+      vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[1], filter_0_7, 0);
+  sum23 = vpx_dotq_lane_s16(sum23, perm_samples[3], filter_0_7, 1);
+  sum23 = vpx_dotq_lane_s16(sum23, perm_samples[5], filter_4_11, 1);
+
+  int64x2_t sum45 =
+      vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[2], filter_0_7, 0);
+  sum45 = vpx_dotq_lane_s16(sum45, perm_samples[4], filter_0_7, 1);
+  sum45 = vpx_dotq_lane_s16(sum45, perm_samples[6], filter_4_11, 1);
+
+  int64x2_t sum67 =
+      vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[3], filter_0_7, 0);
+  sum67 = vpx_dotq_lane_s16(sum67, perm_samples[5], filter_0_7, 1);
+  sum67 = vpx_dotq_lane_s16(sum67, perm_samples[7], filter_4_11, 1);
+
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
+                                vqrshrun_n_s32(sum4567, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+void vpx_highbd_convolve12_horiz_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint16_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel12 *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h, int bd) {
+  // Scaling not supported by SVE2 implementation.
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 0);
+
+  const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]);
+  const int16x8_t filter_4_11 = vld1q_s16(filter[x0_q4] + 4);
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+  uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdPermuteTbl);
+
+  // Scale indices by size of the true vector length to avoid reading from an
+  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
+  permute_tbl.val[2] = vsetq_lane_u16(svcnth(), permute_tbl.val[2], 7);
+  permute_tbl.val[3] = vsetq_lane_u16(svcnth(), permute_tbl.val[3], 5);
+  uint16x8_t permute_tbl_3_offsets =
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
+  permute_tbl.val[3] =
+      vaddq_u16(permute_tbl.val[3], permute_tbl_3_offsets);  // 2, 3, 6, 7
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int width = w;
+
+    do {
+      int16x8_t s0[3], s1[3];
+
+      load_s16_8x3(s + 0 * src_stride, 8, &s0[0], &s0[1], &s0[2]);
+      load_s16_8x3(s + 1 * src_stride, 8, &s1[0], &s1[1], &s1[2]);
+
+      uint16x8_t d0 = highbd_convolve12_8_h(s0[0], s0[1], s0[2], filter_0_7,
+                                            filter_4_11, permute_tbl, max);
+      uint16x8_t d1 = highbd_convolve12_8_h(s1[0], s1[1], s1[2], filter_0_7,
+                                            filter_4_11, permute_tbl, max);
+
+      vst1q_u16(d + 0 * dst_stride, d0);
+      vst1q_u16(d + 1 * dst_stride, d1);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 2 * src_stride;
+    dst += 2 * dst_stride;
+    h -= 2;
+  } while (h != 0);
+}
+
+static INLINE uint16x4_t highbd_convolve12_4_v(const int16x8_t s0[2],
+                                               const int16x8_t s1[2],
+                                               const int16x8_t s2[2],
+                                               const int16x8_t filter_0_7,
+                                               const int16x8_t filter_4_11,
+                                               const uint16x4_t max) {
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
+  sum01 = vpx_dotq_lane_s16(sum01, s1[0], filter_0_7, 1);
+  sum01 = vpx_dotq_lane_s16(sum01, s2[0], filter_4_11, 1);
+
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
+  sum23 = vpx_dotq_lane_s16(sum23, s1[1], filter_0_7, 1);
+  sum23 = vpx_dotq_lane_s16(sum23, s2[1], filter_4_11, 1);
+
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
+
+  return vmin_u16(res, max);
+}
+
+void vpx_highbd_convolve12_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel12 *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
+                                     int w, int h, int bd) {
+  // Scaling not supported by SVE2 implementation.
+  if (y_step_q4 != 16) {
+    vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 0);
+
+  const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]);
+  const int16x8_t filter_4_11 = vld1q_s16(filter[y0_q4] + 4);
+
+  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
+    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                  &s9, &sA);
+    s += 11 * src_stride;
+
+    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
+        s6789[2], s789A[2];
+    transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]);
+    transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]);
+    transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]);
+    transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]);
+    transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]);
+    transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]);
+    transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]);
+    transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]);
+
+    do {
+      int16x4_t sB, sC, sD, sE;
+      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);
+
+      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
+      transpose_concat_s16_4x4(s8, s9, sA, sB, &s89AB[0], &s89AB[1]);
+      transpose_concat_s16_4x4(s9, sA, sB, sC, &s9ABC[0], &s9ABC[1]);
+      transpose_concat_s16_4x4(sA, sB, sC, sD, &sABCD[0], &sABCD[1]);
+      transpose_concat_s16_4x4(sB, sC, sD, sE, &sBCDE[0], &sBCDE[1]);
+
+      uint16x4_t d0 = highbd_convolve12_4_v(s0123, s4567, s89AB, filter_0_7,
+                                            filter_4_11, max);
+      uint16x4_t d1 = highbd_convolve12_4_v(s1234, s5678, s9ABC, filter_0_7,
+                                            filter_4_11, max);
+      uint16x4_t d2 = highbd_convolve12_4_v(s2345, s6789, sABCD, filter_0_7,
+                                            filter_4_11, max);
+      uint16x4_t d3 = highbd_convolve12_4_v(s3456, s789A, sBCDE, filter_0_7,
+                                            filter_4_11, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      // Prepare block for next iteration - reusing as much as possible.
+      // Shuffle everything up four rows.
+      s0123[0] = s4567[0];
+      s0123[1] = s4567[1];
+      s1234[0] = s5678[0];
+      s1234[1] = s5678[1];
+      s2345[0] = s6789[0];
+      s2345[1] = s6789[1];
+      s3456[0] = s789A[0];
+      s3456[1] = s789A[1];
+      s4567[0] = s89AB[0];
+      s4567[1] = s89AB[1];
+      s5678[0] = s9ABC[0];
+      s5678[1] = s9ABC[1];
+      s6789[0] = sABCD[0];
+      s6789[1] = sABCD[1];
+      s789A[0] = sBCDE[0];
+      s789A[1] = sBCDE[1];
+
+      s8 = sC;
+      s9 = sD;
+      sA = sE;
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 4;
+    dst += 4;
+    w -= 4;
+  } while (w != 0);
+}
+
+void vpx_highbd_convolve12_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel12 *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h, int bd) {
+  // Scaling not supported by SVE2 implementation.
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  DECLARE_ALIGNED(32, uint16_t, im_block[BW * (BH + MAX_FILTER_TAP)]);
+
+  const int im_stride = BW;
+  // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
+  // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 4.)
+  const int im_height = h + MAX_FILTER_TAP;
+  const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;
+
+  // Filter starting border_offset rows up.
+  vpx_highbd_convolve12_horiz_sve2(
+      src - src_stride * border_offset, src_stride, im_block, im_stride, filter,
+      x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd);
+
+  vpx_highbd_convolve12_vert_sve2(im_block + im_stride * border_offset,
+                                  im_stride, dst, dst_stride, filter, x0_q4,
+                                  x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 33c2fc7feb..96d0614367 100644
--- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -9,109 +9,395 @@
  */
 
 #include <arm_neon.h>
-
+#include <assert.h>
 #include <math.h>
+#include <stdint.h>
 
+#include "./vpx_config.h"
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)scan;
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    int i;
-    const int16x8_t v_zero = vdupq_n_s16(0);
-    const int16x8_t v_one = vdupq_n_s16(1);
-    int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
-    int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
-    int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
-    int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-    // adjust for dc
-    v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
-    v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
-    v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
-    // process dc and the first seven ac coeffs
-    {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
-      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
-      const int32x4_t v_tmp_lo =
-          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-      const int32x4_t v_tmp_hi =
-          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-      const int16x8_t v_tmp2 =
-          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-      vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
-      v_round = vmovq_n_s16(round_ptr[1]);
-      v_quant = vmovq_n_s16(quant_ptr[1]);
-      v_dequant = vmovq_n_s16(dequant_ptr[1]);
-    }
-    // now process the rest of the ac coeffs
-    for (i = 8; i < count; i += 8) {
-      const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
-      const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
-      const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
-      const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
-      const int32x4_t v_tmp_lo =
-          vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
-      const int32x4_t v_tmp_hi =
-          vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
-      const int16x8_t v_tmp2 =
-          vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
-      const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
-      const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
-      const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
-      const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
-      const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
-      const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-      v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-      vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
-      vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
-    }
-    {
-      const int16x4_t v_eobmax_3210 = vmax_s16(
-          vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
-      const int64x1_t v_eobmax_xx32 =
-          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
-      const int16x4_t v_eobmax_tmp =
-          vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
-      const int64x1_t v_eobmax_xxx3 =
-          vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
-      const int16x4_t v_eobmax_final =
-          vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+static VPX_FORCE_INLINE void calculate_dqcoeff_and_store(
+    const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) {
+  const int32x4_t dqcoeff_0 =
+      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  const int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
 
-      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-    }
-  } else {
-    memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-    memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
-    *eob_ptr = 0;
-  }
+#if CONFIG_VP9_HIGHBITDEPTH
+  vst1q_s32(dqcoeff, dqcoeff_0);
+  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
+
+static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
+                                                   int16x8_t v_eobmax,
+                                                   uint16x8_t v_nz_mask) {
+  const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]);
+  const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan);
+  return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if VPX_ARCH_AARCH64
+  return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+  const int16x4_t v_eobmax_3210 =
+      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+  const int64x1_t v_eobmax_xx32 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+  const int16x4_t v_eobmax_tmp =
+      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+  const int64x1_t v_eobmax_xxx3 =
+      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+  const int16x4_t v_eobmax_final =
+      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif  // VPX_ARCH_AARCH64
+}
+
+static VPX_FORCE_INLINE void load_fp_values(
+    const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr,
+    int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) {
+  *round = vld1q_s16(mb_plane->round_fp);
+  *quant = vld1q_s16(mb_plane->quant_fp);
+  *dequant = vld1q_s16(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
+                                              int16x8_t *v_quant,
+                                              int16x8_t *v_dequant) {
+#if VPX_ARCH_AARCH64
+  *v_round = vdupq_laneq_s16(*v_round, 1);
+  *v_quant = vdupq_laneq_s16(*v_quant, 1);
+  *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
+#else
+  *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1);
+  *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1);
+  *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_8(
+    const int16x8_t *v_round, const int16x8_t *v_quant,
+    const int16x8_t *v_dequant, const tran_low_t *coeff_ptr,
+    const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    int16x8_t *v_eobmax) {
+  const int16x8_t v_zero = vdupq_n_s16(0);
+  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round);
+  const int32x4_t v_tmp_lo =
+      vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant));
+  const int32x4_t v_tmp_hi =
+      vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant));
+  const int16x8_t v_tmp2 =
+      vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+  const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+  calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr);
+  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+  *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const struct macroblock_plane *mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  int i;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  int16x8_t v_round, v_quant, v_dequant;
+  const int16_t *iscan = scan_order->iscan;
+
+  load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant);
+  // process dc and the first seven ac coeffs
+  quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
+                dqcoeff_ptr, &v_eobmax);
+
+  // now process the rest of the ac coeffs
+  update_fp_values(&v_round, &v_quant, &v_dequant);
+  for (i = 8; i < n_coeffs; i += 8) {
+    quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
+                  qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
+  }
+
+  *eob_ptr = get_max_eob(v_eobmax);
+}
+
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void quantize_fp_32x32_8(
+    const int16x8_t *v_round, const int16x8_t *v_quant,
+    const int16x8_t *v_dequant, const int16x8_t *dequant_thresh,
+    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) {
+  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+  const int16x8_t v_coeff_abs = vabsq_s16(v_coeff);
+  const int16x8_t v_thr_mask =
+      vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh));
+  const int16x8_t v_tmp_rnd =
+      vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask);
+  const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant);
+  const int16x8_t v_qcoeff =
+      vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0));
+
+  int32x4_t dqcoeff_0, dqcoeff_1;
+  dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant));
+  dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant));
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
+  vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
+#else
+  store_s16q_to_tran_low(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1),
+                                                   vshrn_n_s32(dqcoeff_1, 1)));
+#endif
+
+  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+  *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  int16x8_t eob_max = vdupq_n_s16(-1);
+  // ROUND_POWER_OF_TWO(round_ptr[], 1)
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant_fp);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
+  // dequant >> 2 is used similar to zbin as a threshold.
+  int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+  int i;
+  const int16_t *iscan = scan_order->iscan;
+
+  (void)n_coeffs;
+
+  // Process dc and the first seven ac coeffs.
+  quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+                      iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
+
+  update_fp_values(&round, &quant, &dequant);
+  dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1);
+
+  iscan += 8;
+  coeff_ptr += 8;
+  qcoeff_ptr += 8;
+  dqcoeff_ptr += 8;
+
+  // Process the rest of the ac coeffs.
+  for (i = 8; i < 32 * 32; i += 8) {
+    quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+                        iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
+
+    iscan += 8;
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                     tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+                     int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+  //  const int abs_qcoeff = (int)((tmp * quant) >> 16);
+  const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32);
+  //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32);
+  //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Packed nz_qcoeff_mask. Used to find eob.
+  return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
+
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const struct macroblock_plane *mb_plane,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const struct ScanOrder *const scan_order) {
+  const int16x4_t v_zero = vdup_n_s16(0);
+  const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_round = vld1_s16(mb_plane->round_fp);
+  int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+  int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  const int16_t *iscan = scan_order->iscan;
+
+  // DC and first 3 AC
+  v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                   v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+  // 4 more AC
+  v_mask_hi =
+      highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                           v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // Find the max lane eob for the first 8 coeffs.
+  v_eobmax =
+      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+  n_coeffs -= 8;
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                     v_quant_s32, v_dequant_s32, v_round_s32);
+    v_mask_hi =
+        highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                             v_quant_s32, v_dequant_s32, v_round_s32);
+    // Find the max lane eob for 8 coeffs.
+    v_eobmax =
+        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+    n_coeffs -= 8;
+  } while (n_coeffs);
+
+  *eob_ptr = get_max_eob(v_eobmax);
+}
+
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+                           int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+  const int32x4_t v_coeff_sign =
+      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+  // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+  const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2);
+  const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+  const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+                                    vreinterpretq_s32_u32(v_mask));
+  //  const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+  const int32x4_t v_abs_qcoeff =
+      vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32);
+  //  qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_qcoeff =
+      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  // vshlq_s32 will shift right if shift value is negative.
+  const int32x4_t v_abs_dqcoeff =
+      vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1);
+  //  dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+  const int32x4_t v_dqcoeff =
+      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+  vst1q_s32(qcoeff_ptr, v_qcoeff);
+  vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+  // Packed nz_qcoeff_mask. Used to find eob.
+  return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
+
+void vp9_highbd_quantize_fp_32x32_neon(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const struct ScanOrder *const scan_order) {
+  const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp);
+  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+  const int16x4_t v_zero = vdup_n_s16(0);
+  const int16x4_t v_round =
+      vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14));
+  int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+  int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+  int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+  uint16x4_t v_mask_lo, v_mask_hi;
+  int16x8_t v_eobmax = vdupq_n_s16(-1);
+  const int16_t *iscan = scan_order->iscan;
+
+  // DC and first 3 AC
+  v_mask_lo =
+      highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                 v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // overwrite the DC constants with AC constants
+  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+  // 4 more AC
+  v_mask_hi =
+      highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+                                 v_quant_s32, v_dequant_s32, v_round_s32);
+
+  // Find the max lane eob for the first 8 coeffs.
+  v_eobmax =
+      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+  n_coeffs -= 8;
+  do {
+    coeff_ptr += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+    iscan += 8;
+    v_mask_lo =
+        highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+                                   v_quant_s32, v_dequant_s32, v_round_s32);
+    v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4,
+                                           dqcoeff_ptr + 4, v_quant_s32,
+                                           v_dequant_s32, v_round_s32);
+    // Find the max lane eob for 8 coeffs.
+    v_eobmax =
+        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+    n_coeffs -= 8;
+  } while (n_coeffs);
+
+  *eob_ptr = get_max_eob(v_eobmax);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
new file mode 100644
index 0000000000..b6cce39bfe
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
@@ -0,0 +1,1103 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const uint8x8_t a_reg = vld1_u8(a);
+  const uint8x8_t b_reg = vld1_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+  dist_first = vmulq_u16(dist_first, dist_first);
+
+  vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const uint8x16_t a_reg = vld1q_u8(a);
+  const uint8x16_t b_reg = vld1q_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+  uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+  dist_first = vmulq_u16(dist_first, dist_first);
+  dist_second = vmulq_u16(dist_second, dist_second);
+
+  vst1q_u16(dst, dist_first);
+  vst1q_u16(dst + 8, dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+  *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+                                uint16x8_t *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+                                   const uint16x8_t *mul_constants,
+                                   const int strength, const int rounding,
+                                   const uint16x8_t *weight) {
+  const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+  const uint16x8_t weight_u16 = *weight;
+  const uint16x8_t sixteen = vdupq_n_u16(16);
+  const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16);
+
+  // modifier * 3 / index;
+  uint32x4_t sum_hi =
+      vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+  uint32x4_t sum_lo =
+      vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+  sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+  sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+  // we cannot use vshrn_n_u32 as strength is not known at compile time.
+  sum_lo = vshlq_u32(sum_lo, strength_u32);
+  sum_hi = vshlq_u32(sum_hi, strength_u32);
+
+  sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo));
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+  // So this needs to use the epu16 version which did not come until SSE4.
+  sum = vminq_u16(sum, sixteen);
+  sum = vsubq_u16(sixteen, sum);
+  return vmulq_u16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const uint16x8_t sum_u16,
+                                   const uint8_t *pred, uint16_t *count,
+                                   uint32_t *accumulator) {
+  uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred));
+  uint16x8_t count_u16 = vld1q_u16(count);
+  uint32x4_t accum_0_u32, accum_1_u32;
+
+  count_u16 = vqaddq_u16(count_u16, sum_u16);
+  vst1q_u16(count, count_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16));
+  accum_1_u32 =
+      vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16,
+                                           const uint16x8_t sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  uint8x16_t pred_u8 = vld1q_u8(pred);
+  uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8));
+  uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8));
+  uint16x8_t count_0_u16 = vld1q_u16(count);
+  uint16x8_t count_1_u16 = vld1q_u16(count + 8);
+  uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16);
+  vst1q_u16(count, count_0_u16);
+  count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16);
+  vst1q_u16(count + 8, count_1_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+  accum_2_u32 = vld1q_u32(accumulator + 8);
+  accum_3_u32 = vld1q_u32(accumulator + 12);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16));
+  accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16),
+                          vget_high_u16(pred_0_u16));
+  accum_2_u32 =
+      vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16));
+  accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16),
+                          vget_high_u16(pred_1_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+  vst1q_u32(accumulator + 8, accum_2_u32);
+  vst1q_u32(accumulator + 12, accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) {
+  uint16x8_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u16(y_dist);
+  dist_left = vld1q_u16(y_dist - 1);
+  dist_right = vld1q_u16(y_dist + 1);
+
+  *sum = vqaddq_u16(dist_reg, dist_left);
+  *sum = vqaddq_u16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first,
+                              uint16x8_t *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           uint16x8_t *u_first,
+                                           uint16x8_t *u_second,
+                                           uint16x8_t *v_first,
+                                           uint16x8_t *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+    read_dist_16(u_dist, u_first, u_second);
+    read_dist_16(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    uint16x8_t u_reg, v_reg;
+    uint16x8x2_t pair;
+
+    read_dist_8(u_dist, &u_reg);
+
+    pair = vzipq_u16(u_reg, u_reg);
+    *u_first = pair.val[0];
+    *u_second = pair.val[1];
+
+    read_dist_8(v_dist, &v_reg);
+
+    pair = vzipq_u16(v_reg, v_reg);
+    *v_first = pair.val[0];
+    *v_second = pair.val[1];
+  }
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+                                                 int ss_x, int ss_y,
+                                                 uint16x8_t *u_mod,
+                                                 uint16x8_t *v_mod) {
+  uint16x8_t y_reg;
+  if (!ss_x) {
+    read_dist_8(y_dist, &y_reg);
+    if (ss_y == 1) {
+      uint16x8_t y_tmp;
+      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+      y_reg = vqaddq_u16(y_reg, y_tmp);
+    }
+  } else {
+    uint16x8_t y_first, y_second;
+    uint32x4_t y_first32, y_second32;
+
+    read_dist_16(y_dist, &y_first, &y_second);
+    if (ss_y == 1) {
+      uint16x8_t y_tmp_0, y_tmp_1;
+      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+      y_first = vqaddq_u16(y_first, y_tmp_0);
+      y_second = vqaddq_u16(y_second, y_tmp_1);
+    }
+
+    y_first32 = vpaddlq_u16(y_first);
+    y_second32 = vpaddlq_u16(y_second);
+
+    y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32));
+  }
+
+  *u_mod = vqaddq_u16(*u_mod, y_reg);
+  *v_mod = vqaddq_u16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void apply_temporal_filter_luma_16(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors_first,
+    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  uint16x8_t weight_first, weight_second;
+
+  uint16x8_t mul_first, mul_second;
+
+  uint16x8_t sum_row_1_first, sum_row_1_second;
+  uint16x8_t sum_row_2_first, sum_row_2_second;
+  uint16x8_t sum_row_3_first, sum_row_3_second;
+
+  uint16x8_t u_first, u_second;
+  uint16x8_t v_first, v_second;
+
+  uint16x8_t sum_row_first;
+  uint16x8_t sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+
+  assert(block_width == 16);
+  (void)block_width;
+
+  // Initialize the weights
+  if (blk_fw) {
+    weight_first = vdupq_n_u16(blk_fw[0]);
+    weight_second = vdupq_n_u16(blk_fw[1]);
+  } else {
+    weight_first = vdupq_n_u16(top_weight);
+    weight_second = weight_first;
+  }
+
+  // First row
+  mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+  mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+  // Add luma values
+  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first);
+  sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                          &v_second);
+
+  sum_row_first = vqaddq_u16(sum_row_first, u_first);
+  sum_row_second = vqaddq_u16(sum_row_second, u_second);
+
+  sum_row_first = vqaddq_u16(sum_row_first, v_first);
+  sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]);
+  mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      if (blk_fw) {
+        weight_first = vdupq_n_u16(blk_fw[2]);
+        weight_second = vdupq_n_u16(blk_fw[3]);
+      } else {
+        weight_first = vdupq_n_u16(bottom_weight);
+        weight_second = weight_first;
+      }
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+    sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first);
+    sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+                              &v_first, &v_second);
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = vqaddq_u16(sum_row_first, u_first);
+    sum_row_second = vqaddq_u16(sum_row_second, u_second);
+    sum_row_first = vqaddq_u16(sum_row_first, v_first);
+    sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+    // Get modifier and store result
+    sum_row_first =
+        average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+    sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                               &weight_second);
+    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                            y_accum);
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+  mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+  sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                            &v_second);
+  }
+
+  sum_row_first = vqaddq_u16(sum_row_first, u_first);
+  sum_row_second = vqaddq_u16(sum_row_second, u_second);
+  sum_row_first = vqaddq_u16(sum_row_first, v_first);
+  sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void apply_temporal_filter_luma(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The block width is 16 and we are operating on a row of 16
+    // chroma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, top_weight, bottom_weight, NULL);
+    } else {
+      apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+  apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void apply_temporal_filter_chroma_8(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+
+  uint16x8_t weight;
+
+  uint16x8_t mul;
+
+  uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3;
+  uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+  uint16x8_t u_sum_row, v_sum_row;
+
+  // Loop variable
+  unsigned int h;
+
+  // Initialize weight
+  if (blk_fw) {
+    weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1]));
+  } else {
+    weight = vdupq_n_u16(top_weight);
+  }
+
+  // First row
+  mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+  // Add chroma values
+  get_sum_8(u_dist, &u_sum_row_2);
+  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+  u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3);
+
+  get_sum_8(v_dist, &v_sum_row_2);
+  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+  v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul = vld1q_u16((const uint16_t *)neighbors[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3]));
+      } else {
+        weight = vdupq_n_u16(bottom_weight);
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1 = u_sum_row_2;
+    u_sum_row_2 = u_sum_row_3;
+
+    v_sum_row_1 = v_sum_row_2;
+    v_sum_row_2 = v_sum_row_3;
+
+    // Add chroma values
+    u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+    u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3);
+
+    v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+    v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3);
+
+    // Add luma values
+    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+    // Get modifier and store result
+    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+  // Shift the rows up
+  u_sum_row_1 = u_sum_row_2;
+  u_sum_row_2 = u_sum_row_3;
+
+  v_sum_row_1 = v_sum_row_2;
+  v_sum_row_2 = v_sum_row_3;
+
+  // Add chroma values
+  u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+  v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void apply_temporal_filter_chroma(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    } else {
+      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+          bottom_weight, NULL);
+    } else {
+      apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+  }
+
+  apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_neon(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+                             ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk,
+                             y_accum, y_count, y_dist_ptr, u_dist_ptr,
+                             v_dist_ptr);
+
+  apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width,
+                               block_height, ss_x, ss_y, strength, blk_fw_ptr,
+                               use_whole_blk, u_accum, u_count, v_accum,
+                               v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
+
+static INLINE uint8x8_t convolve12_8(const int16x8_t s0, const int16x8_t s1,
+                                     const int16x8_t s2, const int16x8_t s3,
+                                     const int16x8_t s4, const int16x8_t s5,
+                                     const int16x8_t s6, const int16x8_t s7,
+                                     const int16x8_t s8, const int16x8_t s9,
+                                     const int16x8_t sA, const int16x8_t sB,
+                                     const int16x8_t filter_0_7,
+                                     const int16x4_t filter_8_11) {
+  const int16x4_t filter_0_3 = vget_low_s16(filter_0_7);
+  const int16x4_t filter_4_7 = vget_high_s16(filter_0_7);
+
+  int16x8_t sum = vmulq_lane_s16(s0, filter_0_3, 0);
+  sum = vmlaq_lane_s16(sum, s1, filter_0_3, 1);
+  sum = vmlaq_lane_s16(sum, s2, filter_0_3, 2);
+  sum = vmlaq_lane_s16(sum, s3, filter_0_3, 3);
+  sum = vmlaq_lane_s16(sum, s4, filter_4_7, 0);
+
+  sum = vmlaq_lane_s16(sum, s7, filter_4_7, 3);
+  sum = vmlaq_lane_s16(sum, s8, filter_8_11, 0);
+  sum = vmlaq_lane_s16(sum, s9, filter_8_11, 1);
+  sum = vmlaq_lane_s16(sum, sA, filter_8_11, 2);
+  sum = vmlaq_lane_s16(sum, sB, filter_8_11, 3);
+
+  // Saturating addition is required for the largest filter taps to avoid
+  // overflow (while staying in 16-bit elements.)
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s5, filter_4_7, 1));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s6, filter_4_7, 2));
+
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void vpx_convolve12_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel12 *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16) {
+    vpx_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 0);
+
+  const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]);
+  const int16x4_t filter_8_11 = vld1_s16(filter[x0_q4] + 8);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    uint8x8_t t0, t1, t2, t3;
+    load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+    int16x8_t s0s1 = vcombine_s16(s0, s1);
+    int16x8_t s1s2 = vcombine_s16(s1, s2);
+    int16x8_t s2s3 = vcombine_s16(s2, s3);
+    int16x8_t s3s4 = vcombine_s16(s3, s4);
+    int16x8_t s4s5 = vcombine_s16(s4, s5);
+    int16x8_t s5s6 = vcombine_s16(s5, s6);
+    int16x8_t s6s7 = vcombine_s16(s6, s7);
+
+    load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3);
+    transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+    int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t sA = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+    int16x8_t s7s8 = vcombine_s16(s7, s8);
+    int16x8_t s8s9 = vcombine_s16(s8, s9);
+    int16x8_t s9sA = vcombine_s16(s9, sA);
+
+    s += 11;
+
+    do {
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+      int16x4_t sB = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+      int16x4_t sC = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+      int16x4_t sD = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+      int16x4_t sE = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+      int16x8_t sAsB = vcombine_s16(sA, sB);
+      int16x8_t sBsC = vcombine_s16(sB, sC);
+      int16x8_t sCsD = vcombine_s16(sC, sD);
+      int16x8_t sDsE = vcombine_s16(sD, sE);
+
+      uint8x8_t d01 =
+          convolve12_8(s0s1, s1s2, s2s3, s3s4, s4s5, s5s6, s6s7, s7s8, s8s9,
+                       s9sA, sAsB, sBsC, filter_0_7, filter_8_11);
+      uint8x8_t d23 =
+          convolve12_8(s2s3, s3s4, s4s5, s5s6, s6s7, s7s8, s8s9, s9sA, sAsB,
+                       sBsC, sCsD, sDsE, filter_0_7, filter_8_11);
+
+      transpose_u8_4x4(&d01, &d23);
+
+      store_u8(d + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(d + 1 * dst_stride, 2 * dst_stride, d23);
+
+      s0s1 = s4s5;
+      s1s2 = s5s6;
+      s2s3 = s6s7;
+      s3s4 = s7s8;
+      s4s5 = s8s9;
+      s5s6 = s9sA;
+      s6s7 = sAsB;
+      s7s8 = sBsC;
+      s8s9 = sCsD;
+      s9sA = sDsE;
+      sA = sE;
+      s += 4;
+      d += 4;
+      width -= 4;
+    } while (width != 0);
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    h -= 4;
+  } while (h != 0);
+}
+
+void vpx_convolve12_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel12 *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  // Scaling not supported by Neon implementation.
+  if (y_step_q4 != 16) {
+    vpx_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]);
+  const int16x4_t filter_8_11 = vld1_s16(filter[y0_q4] + 8);
+
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int height = h;
+
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
+    load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
+                 &t9, &tA);
+    int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+    int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+    int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+    int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+    int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+    int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+    int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+    int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+    int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+    int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+    int16x8_t sA = vreinterpretq_s16_u16(vmovl_u8(tA));
+
+    s += 11 * src_stride;
+
+    do {
+      uint8x8_t tB, tC, tD, tE;
+      load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE);
+
+      int16x8_t sB = vreinterpretq_s16_u16(vmovl_u8(tB));
+      int16x8_t sC = vreinterpretq_s16_u16(vmovl_u8(tC));
+      int16x8_t sD = vreinterpretq_s16_u16(vmovl_u8(tD));
+      int16x8_t sE = vreinterpretq_s16_u16(vmovl_u8(tE));
+
+      uint8x8_t d0 = convolve12_8(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA,
+                                  sB, filter_0_7, filter_8_11);
+      uint8x8_t d1 = convolve12_8(s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB,
+                                  sC, filter_0_7, filter_8_11);
+      uint8x8_t d2 = convolve12_8(s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC,
+                                  sD, filter_0_7, filter_8_11);
+      uint8x8_t d3 = convolve12_8(s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD,
+                                  sE, filter_0_7, filter_8_11);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = sA;
+      s7 = sB;
+      s8 = sC;
+      s9 = sD;
+      sA = sE;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_convolve12_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const InterpKernel12 *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]);
+
+  const int im_stride = BW;
+  // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
+  // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 4.)
+  const int im_height = h + MAX_FILTER_TAP;
+  const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;
+
+  // Filter starting border_offset rows up.
+  vpx_convolve12_horiz_neon(src - src_stride * border_offset, src_stride,
+                            im_block, im_stride, filter, x0_q4, x_step_q4,
+                            y0_q4, y_step_q4, w, im_height);
+
+  vpx_convolve12_vert_neon(im_block + im_stride * border_offset, im_stride, dst,
+                           dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+                           y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c
new file mode 100644
index 0000000000..1fa2d8732f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c
@@ -0,0 +1,367 @@
+/*
+ *  Copyright (c) 2025 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
+  // clang-format off
+  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
+  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+  // clang-format on
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
+  // clang-format off
+  // Shift left and insert new last column in transposed 4x4 block.
+  1,  2,  3, 16,  5,  6,  7, 20,  9, 10, 11, 24, 13, 14, 15, 28,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2,  3, 16, 17,  6,  7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  // Shift left and insert three new columns in transposed 4x4 block.
+  3, 16, 17, 18,  7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+  // clang-format on
+};
+
+static INLINE uint8x8_t convolve12_8_h(uint8x16_t samples[2],
+                                       const int8x16_t filter,
+                                       const uint8x16x3_t perm_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128[2] = {
+    vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
+    vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
+  };
+
+  // Permute samples ready for dot product.
+  // {  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // {  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // {  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  // { 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
+  int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], perm_tbl.val[0]),
+                                vqtbl1q_s8(samples_128[0], perm_tbl.val[1]),
+                                vqtbl1q_s8(samples_128[0], perm_tbl.val[2]),
+                                vqtbl1q_s8(samples_128[1], perm_tbl.val[2]) };
+
+  // Accumulate into 128 << FILTER_BITS to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
+
+  int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0);
+  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1);
+  sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2);
+
+  int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0);
+  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1);
+  sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2);
+
+  // Narrow and re-pack.
+  int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS),
+                                   vqrshrn_n_s32(sum4567, FILTER_BITS));
+  return vqmovun_s16(sum_s16);
+}
+
+void vpx_convolve12_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const InterpKernel12 *filter, int x0_q4,
+                                       int x_step_q4, int y0_q4, int y_step_q4,
+                                       int w, int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16) {
+    vpx_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  const int16x8_t x_filter_0_7 = vld1q_s16(filter[x0_q4]);
+  const int16x4_t x_filter_8_11 = vld1_s16(filter[x0_q4] + 8);
+  const int16x8_t x_filter_8_15 = vcombine_s16(x_filter_8_11, vdup_n_s16(0));
+  const int8x16_t x_filter =
+      vcombine_s8(vmovn_s16(x_filter_0_7), vmovn_s16(x_filter_8_15));
+
+  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0[2], s1[2], s2[2], s3[2];
+      load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+      load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+      uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+      uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+      uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+      uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    h -= 4;
+  } while (h != 0);
+}
+
+static INLINE uint8x8_t convolve12_8_v(
+    const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo,
+    const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi,
+    const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
+  // The sample range transform and permutation are performed by the caller.
+  // Accumulate into 128 << FILTER_BITS to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
+
+  int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0);
+  sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1);
+  sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1);
+
+  int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0);
+  sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1);
+  sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void vpx_convolve12_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel12 *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h) {
+  // Scaling not supported by Neon implementation.
+  if (y_step_q4 != 16) {
+    vpx_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(filter[y0_q4] + 4));
+
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
+
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  do {
+    int height = h;
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
+    load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
+                 &t9, &tA);
+    s += 11 * src_stride;
+
+    // Transform sample range to [-128, 127] for 8-bit signed dot product.
+    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+    int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+    int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+    int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+    int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s789A_lo, s789A_hi;
+    transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+    transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+    transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+    transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+    transpose_concat_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
+    transpose_concat_s8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
+    transpose_concat_s8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
+    transpose_concat_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
+
+    do {
+      uint8x8_t tB, tC, tD, tE;
+      load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE);
+
+      int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
+      int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
+      int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
+      int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
+
+      int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
+          sBCDE_lo, sBCDE_hi;
+      transpose_concat_s8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
+
+      // Merge new data into block from previous iteration.
+      int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
+      s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
+      s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
+      sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
+
+      int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
+      s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
+      s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
+      sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
+
+      uint8x8_t d0 =
+          convolve12_8_v(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
+                         s89AB_hi, filter_0_7, filter_4_11);
+      uint8x8_t d1 =
+          convolve12_8_v(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
+                         s9ABC_hi, filter_0_7, filter_4_11);
+      uint8x8_t d2 =
+          convolve12_8_v(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
+                         sABCD_hi, filter_0_7, filter_4_11);
+      uint8x8_t d3 =
+          convolve12_8_v(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
+                         sBCDE_hi, filter_0_7, filter_4_11);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123_lo = s4567_lo;
+      s0123_hi = s4567_hi;
+      s1234_lo = s5678_lo;
+      s1234_hi = s5678_hi;
+      s2345_lo = s6789_lo;
+      s2345_hi = s6789_hi;
+      s3456_lo = s789A_lo;
+      s3456_hi = s789A_hi;
+      s4567_lo = s89AB_lo;
+      s4567_hi = s89AB_hi;
+      s5678_lo = s9ABC_lo;
+      s5678_hi = s9ABC_hi;
+      s6789_lo = sABCD_lo;
+      s6789_hi = sABCD_hi;
+      s789A_lo = sBCDE_lo;
+      s789A_hi = sBCDE_hi;
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+static INLINE void vpx_convolve12_2d_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int w,
+    int h) {
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 3);
+
+  const int16x8_t x_filter_0_7 = vld1q_s16(filter[x0_q4]);
+  const int16x4_t x_filter_8_11 = vld1_s16(filter[x0_q4] + 8);
+  const int16x8_t x_filter_8_15 = vcombine_s16(x_filter_8_11, vdup_n_s16(0));
+  const int8x16_t x_filter =
+      vcombine_s8(vmovn_s16(x_filter_0_7), vmovn_s16(x_filter_8_15));
+
+  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0[2], s1[2], s2[2], s3[2];
+      load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+      load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+      uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+      uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+      uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+      uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    h -= 4;
+  } while (h != 3);
+
+  do {
+    uint8x16_t s0[2], s1[2], s2[2];
+    load_u8_16x3(src, src_stride, &s0[0], &s1[0], &s2[0]);
+    load_u8_16x3(src + 4, src_stride, &s0[1], &s1[1], &s2[1]);
+
+    uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+    uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+    uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+
+    store_u8_8x3(dst, dst_stride, d0, d1, d2);
+
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_convolve12_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel12 *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]);
+
+  const int im_stride = BW;
+  // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
+  // and MAX_FILTER_TAP / 2 lines post.
+  const int im_height = h + MAX_FILTER_TAP - 1;
+  const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;
+
+  // Filter starting border_offset rows up.
+  vpx_convolve12_2d_horiz_neon_dotprod(src - src_stride * border_offset,
+                                       src_stride, im_block, im_stride, filter,
+                                       x0_q4, w, im_height);
+
+  vpx_convolve12_vert_neon_dotprod(im_block + im_stride * border_offset,
+                                   im_stride, dst, dst_stride, filter, x0_q4,
+                                   x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
new file mode 100644
index 0000000000..3803873a44
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (c) 2025 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = {
+  // clang-format off
+  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9,
+  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13
+  // clang-format on
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
+  // clang-format off
+  // Shift left and insert new last column in transposed 4x4 block.
+  1,  2,  3, 16,  5,  6,  7, 20,  9, 10, 11, 24, 13, 14, 15, 28,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2,  3, 16, 17,  6,  7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  // Shift left and insert three new columns in transposed 4x4 block.
+  3, 16, 17, 18,  7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+  // clang-format on
+};
+
+static INLINE uint8x8_t convolve12_8_h(uint8x16_t samples[2],
+                                       const int8x16_t filter[2],
+                                       const uint8x16x2_t perm_tbl) {
+  // Permute samples ready for matrix multiply.
+  // {  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9 }
+  // {  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 }
+  // {  6,  7,  8,  9, 10, 11, 12, 13,  8,  9, 10, 11, 12, 13, 14, 15 }
+  // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 }
+  uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], perm_tbl.val[0]),
+                                 vqtbl1q_u8(samples[0], perm_tbl.val[1]),
+                                 vqtbl1q_u8(samples[1], perm_tbl.val[0]),
+                                 vqtbl1q_u8(samples[1], perm_tbl.val[1]) };
+
+  // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix
+  // (filter), destructively accumulating into the destination register.
+  int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter[0]);
+  int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter[0]);
+  sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]);
+  sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]);
+
+  // Narrow and re-pack.
+  int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS),
+                                   vqrshrn_n_s32(sum4567, FILTER_BITS));
+  return vqmovun_s16(sum_s16);
+}
+
+void vpx_convolve12_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel12 *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16) {
+    vpx_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  // Split 12-tap filter into two 6-tap filters, masking the top two elements.
+  // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
+  const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
+  const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(filter[x0_q4])), mask);
+  const int8x8_t filter_1 =
+      vext_s8(vmovn_s16(vld1q_s16(filter[x0_q4] + 4)), vdup_n_s8(0), 2);
+
+  // Stagger each 6-tap filter to enable use of matrix multiply instructions.
+  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
+  const int8x16_t x_filter[2] = {
+    vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
+    vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
+  };
+
+  const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0[2], s1[2], s2[2], s3[2];
+      load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+      load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+      uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+      uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+      uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+      uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    h -= 4;
+  } while (h != 0);
+}
+
+static INLINE uint8x8_t convolve12_8_v(
+    const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo,
+    const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi,
+    const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
+  // The sample range transform and permutation are performed by the caller.
+  int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0);
+  sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1);
+  sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1);
+
+  int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0);
+  sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1);
+  sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel12 *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h) {
+  // Scaling not supported by Neon implementation.
+  if (y_step_q4 != 16) {
+    vpx_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(filter[y0_q4] + 4));
+
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
+
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  do {
+    int height = h;
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
+    load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                 &s9, &sA);
+    s += 11 * src_stride;
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s789A_lo, s789A_hi;
+    transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+    transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+    transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+    transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+    transpose_concat_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
+    transpose_concat_u8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
+    transpose_concat_u8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
+    transpose_concat_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
+
+    do {
+      uint8x8_t sB, sC, sD, sE;
+      load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE);
+
+      uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
+          sBCDE_lo, sBCDE_hi;
+      transpose_concat_u8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
+
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
+      s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
+      s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
+      sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
+
+      uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
+      s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
+      s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
+      sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
+
+      uint8x8_t d0 =
+          convolve12_8_v(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
+                         s89AB_hi, filter_0_7, filter_4_11);
+      uint8x8_t d1 =
+          convolve12_8_v(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
+                         s9ABC_hi, filter_0_7, filter_4_11);
+      uint8x8_t d2 =
+          convolve12_8_v(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
+                         sABCD_hi, filter_0_7, filter_4_11);
+      uint8x8_t d3 =
+          convolve12_8_v(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
+                         sBCDE_hi, filter_0_7, filter_4_11);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123_lo = s4567_lo;
+      s0123_hi = s4567_hi;
+      s1234_lo = s5678_lo;
+      s1234_hi = s5678_hi;
+      s2345_lo = s6789_lo;
+      s2345_hi = s6789_hi;
+      s3456_lo = s789A_lo;
+      s3456_hi = s789A_hi;
+      s4567_lo = s89AB_lo;
+      s4567_hi = s89AB_hi;
+      s5678_lo = s9ABC_lo;
+      s5678_hi = s9ABC_hi;
+      s6789_lo = sABCD_lo;
+      s6789_hi = sABCD_hi;
+      s789A_lo = sBCDE_lo;
+      s789A_hi = sBCDE_hi;
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+static INLINE void vpx_convolve12_2d_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int w,
+    int h) {
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h % 4 == 3);
+
+  // Split 12-tap filter into two 6-tap filters, masking the top two elements.
+  // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }
+  const int8x8_t mask = vcreate_s8(0x0000ffffffffffff);
+  const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(filter[x0_q4])), mask);
+  const int8x8_t filter_1 =
+      vext_s8(vmovn_s16(vld1q_s16(filter[x0_q4] + 4)), vdup_n_s8(0), 2);
+
+  // Stagger each 6-tap filter to enable use of matrix multiply instructions.
+  // { f0, f1, f2, f3, f4, f5,  0,  0,  0, f0, f1, f2, f3, f4, f5,  0 }
+  const int8x16_t x_filter[2] = {
+    vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)),
+    vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7))
+  };
+
+  const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl);
+
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  do {
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0[2], s1[2], s2[2], s3[2];
+      load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+      load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+      uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+      uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+      uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+      uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl);
+
+      store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+    src += 4 * src_stride;
+    dst += 4 * dst_stride;
+    h -= 4;
+  } while (h != 3);
+
+  do {
+    uint8x16_t s0[2], s1[2], s2[2];
+    load_u8_16x3(src, src_stride, &s0[0], &s1[0], &s2[0]);
+    load_u8_16x3(src + 6, src_stride, &s0[1], &s1[1], &s2[1]);
+
+    uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl);
+    uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl);
+    uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl);
+
+    store_u8_8x3(dst, dst_stride, d0, d1, d2);
+
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_convolve12_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel12 *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  // Scaling not supported by Neon implementation.
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     y0_q4, y_step_q4, w, h);
+    return;
+  }
+
+  assert(w == 32 || w == 16 || w == 8);
+  assert(h == 32 || h == 16 || h == 8);
+
+  DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]);
+
+  const int im_stride = BW;
+  // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior
+  // and MAX_FILTER_TAP / 2 lines post.
+  const int im_height = h + MAX_FILTER_TAP - 1;
+  const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1;
+
+  // Filter starting border_offset rows up.
+  vpx_convolve12_2d_horiz_neon_i8mm(src - src_stride * border_offset,
+                                    src_stride, im_block, im_stride, filter,
+                                    x0_q4, w, im_height);
+
+  vpx_convolve12_vert_neon_i8mm(im_block + im_stride * border_offset, im_stride,
+                                dst, dst_stride, filter, x0_q4, x_step_q4,
+                                y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
index 188d04d8f6..61786d8f66 100644
--- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -79,6 +80,7 @@
     return err;                                                              \
   }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 BLOCK_ERROR_BLOCKSIZE_MSA(16);
 BLOCK_ERROR_BLOCKSIZE_MSA(64);
 BLOCK_ERROR_BLOCKSIZE_MSA(256);
@@ -103,3 +105,4 @@ int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,
 
   return err;
 }
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
index 0831e59148..efbbe830db 100644
--- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
index fa36f09ab8..9c5cc12ef0 100644
--- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
index 604db853c4..26d81aa9ef 100644
--- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
 
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
index 794bec70b6..fa1af2fc57 100644
--- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
-#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
 
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
 #include "vpx_dsp/mips/txfm_macros_msa.h"
@@ -113,4 +113,4 @@
     PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \
                 out0, out1, out2, out3);                                    \
   }
-#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
+#endif  // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
deleted file mode 100644
index 23f7ebace4..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
-                                            uint8_t *frm2_ptr, int32_t filt_sth,
-                                            int32_t filt_wgt, uint32_t *acc,
-                                            uint16_t *cnt) {
-  uint32_t row;
-  uint64_t f0, f1, f2, f3;
-  v16i8 frm2, frm1 = { 0 };
-  v16i8 frm4, frm3 = { 0 };
-  v16u8 frm_r, frm_l;
-  v8i16 frm2_r, frm2_l;
-  v8i16 diff0, diff1, mod0_h, mod1_h;
-  v4i32 cnst3, cnst16, filt_wt, strength;
-  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
-  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
-  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
-  v4i32 acc0, acc1, acc2, acc3;
-  v8i16 cnt0, cnt1;
-
-  filt_wt = __msa_fill_w(filt_wgt);
-  strength = __msa_fill_w(filt_sth);
-  cnst3 = __msa_ldi_w(3);
-  cnst16 = __msa_ldi_w(16);
-
-  for (row = 2; row--;) {
-    LD4(frm1_ptr, stride, f0, f1, f2, f3);
-    frm1_ptr += (4 * stride);
-
-    LD_SB2(frm2_ptr, 16, frm2, frm4);
-    frm2_ptr += 32;
-
-    LD_SW2(acc, 4, acc0, acc1);
-    LD_SW2(acc + 8, 4, acc2, acc3);
-    LD_SH2(cnt, 8, cnt0, cnt1);
-
-    INSERT_D2_SB(f0, f1, frm1);
-    INSERT_D2_SB(f2, f3, frm3);
-    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
-    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
-    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
-    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
-    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
-    diff0_r = (mod0_w < cnst16);
-    diff0_l = (mod1_w < cnst16);
-    diff1_r = (mod2_w < cnst16);
-    diff1_l = (mod3_w < cnst16);
-
-    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-
-    mod0_w = diff0_r & mod0_w;
-    mod1_w = diff0_l & mod1_w;
-    mod2_w = diff1_r & mod2_w;
-    mod3_w = diff1_l & mod3_w;
-
-    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
-    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
-    ST_SH2(mod0_h, mod1_h, cnt, 8);
-    cnt += 16;
-
-    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
-    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
-    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
-    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
-         mod2_w, mod3_w);
-
-    ST_SW2(mod0_w, mod1_w, acc, 4);
-    acc += 8;
-    ST_SW2(mod2_w, mod3_w, acc, 4);
-    acc += 8;
-
-    LD_SW2(acc, 4, acc0, acc1);
-    LD_SW2(acc + 8, 4, acc2, acc3);
-    LD_SH2(cnt, 8, cnt0, cnt1);
-
-    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
-    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
-    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
-    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
-    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
-    diff0_r = (mod0_w < cnst16);
-    diff0_l = (mod1_w < cnst16);
-    diff1_r = (mod2_w < cnst16);
-    diff1_l = (mod3_w < cnst16);
-
-    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-
-    mod0_w = diff0_r & mod0_w;
-    mod1_w = diff0_l & mod1_w;
-    mod2_w = diff1_r & mod2_w;
-    mod3_w = diff1_l & mod3_w;
-
-    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
-    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
-    ST_SH2(mod0_h, mod1_h, cnt, 8);
-    cnt += 16;
-    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
-    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
-    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
-    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
-         mod2_w, mod3_w);
-
-    ST_SW2(mod0_w, mod1_w, acc, 4);
-    acc += 8;
-    ST_SW2(mod2_w, mod3_w, acc, 4);
-    acc += 8;
-  }
-}
-
-static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
-                                             uint8_t *frm2_ptr,
-                                             int32_t filt_sth, int32_t filt_wgt,
-                                             uint32_t *acc, uint16_t *cnt) {
-  uint32_t row;
-  v16i8 frm1, frm2, frm3, frm4;
-  v16u8 frm_r, frm_l;
-  v16i8 zero = { 0 };
-  v8u16 frm2_r, frm2_l;
-  v8i16 diff0, diff1, mod0_h, mod1_h;
-  v4i32 cnst3, cnst16, filt_wt, strength;
-  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
-  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
-  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
-  v4i32 acc0, acc1, acc2, acc3;
-  v8i16 cnt0, cnt1;
-
-  filt_wt = __msa_fill_w(filt_wgt);
-  strength = __msa_fill_w(filt_sth);
-  cnst3 = __msa_ldi_w(3);
-  cnst16 = __msa_ldi_w(16);
-
-  for (row = 8; row--;) {
-    LD_SB2(frm1_ptr, stride, frm1, frm3);
-    frm1_ptr += stride;
-
-    LD_SB2(frm2_ptr, 16, frm2, frm4);
-    frm2_ptr += 16;
-
-    LD_SW2(acc, 4, acc0, acc1);
-    LD_SW2(acc, 4, acc2, acc3);
-    LD_SH2(cnt, 8, cnt0, cnt1);
-
-    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
-    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
-    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
-    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
-    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
-    diff0_r = (mod0_w < cnst16);
-    diff0_l = (mod1_w < cnst16);
-    diff1_r = (mod2_w < cnst16);
-    diff1_l = (mod3_w < cnst16);
-
-    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-
-    mod0_w = diff0_r & mod0_w;
-    mod1_w = diff0_l & mod1_w;
-    mod2_w = diff1_r & mod2_w;
-    mod3_w = diff1_l & mod3_w;
-
-    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
-    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
-    ST_SH2(mod0_h, mod1_h, cnt, 8);
-    cnt += 16;
-
-    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
-    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
-    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
-    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
-         mod2_w, mod3_w);
-
-    ST_SW2(mod0_w, mod1_w, acc, 4);
-    acc += 8;
-    ST_SW2(mod2_w, mod3_w, acc, 4);
-    acc += 8;
-
-    LD_SW2(acc, 4, acc0, acc1);
-    LD_SW2(acc + 8, 4, acc2, acc3);
-    LD_SH2(cnt, 8, cnt0, cnt1);
-
-    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
-    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
-    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
-    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
-    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
-
-    diff0_r = (mod0_w < cnst16);
-    diff0_l = (mod1_w < cnst16);
-    diff1_r = (mod2_w < cnst16);
-    diff1_l = (mod3_w < cnst16);
-
-    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
-         mod1_w, mod2_w, mod3_w);
-
-    mod0_w = diff0_r & mod0_w;
-    mod1_w = diff0_l & mod1_w;
-    mod2_w = diff1_r & mod2_w;
-    mod3_w = diff1_l & mod3_w;
-
-    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
-    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
-    ST_SH2(mod0_h, mod1_h, cnt, 8);
-    cnt += 16;
-
-    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
-    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
-    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
-    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
-         mod0_w, mod1_w, mod2_w, mod3_w);
-    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
-         mod2_w, mod3_w);
-    ST_SW2(mod0_w, mod1_w, acc, 4);
-    acc += 8;
-    ST_SW2(mod2_w, mod3_w, acc, 4);
-    acc += 8;
-
-    frm1_ptr += stride;
-    frm2_ptr += 16;
-  }
-}
-
-void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
-                                   uint8_t *frame2_ptr, uint32_t blk_w,
-                                   uint32_t blk_h, int32_t strength,
-                                   int32_t filt_wgt, uint32_t *accu,
-                                   uint16_t *cnt) {
-  if (8 == (blk_w * blk_h)) {
-    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
-                                    filt_wgt, accu, cnt);
-  } else if (16 == (blk_w * blk_h)) {
-    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
-                                     filt_wgt, accu, cnt);
-  } else {
-    vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
-                                strength, filt_wgt, accu, cnt);
-  }
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
new file mode 100644
index 0000000000..4d31558471
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -0,0 +1,287 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and return the high 16 bits of the intermediate integers.
+// (a * b) >> 16
+// Note: Because this is done in 2 operations, a and b cannot both be UINT16_MIN
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra right
+  // shift.
+  return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+  return vec_xor(vec_add(a, mask), mask);
+}
+
+// Compare packed 16-bit integers across a, and return the maximum value in
+// every element. Returns a vector containing the biggest value across vector a.
+static INLINE int16x8_t vec_max_across(int16x8_t a) {
+  a = vec_max(a, vec_perm(a, a, vec_perm64));
+  a = vec_max(a, vec_perm(a, a, vec_perm32));
+  return vec_max(a, vec_perm(a, a, vec_perm16));
+}
+
+void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+  bool16x8_t zero_coeff0, zero_coeff1;
+
+  int16x8_t round = vec_vsx_ld(0, round_ptr);
+  int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+  int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+  int16x8_t scan0 = vec_vsx_ld(0, iscan);
+  int16x8_t scan1 = vec_vsx_ld(16, iscan);
+
+  (void)scan;
+
+  // First set of 8 coeff starts with DC + 7 AC
+  qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
+  zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+  qcoeff0 = vec_sign(qcoeff0, coeff0);
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+
+  dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+
+  // Remove DC value from round and quant
+  round = vec_splat(round, 1);
+  quant = vec_splat(quant, 1);
+
+  // Remove DC value from dequant
+  dequant = vec_splat(dequant, 1);
+
+  // Second set of 8 coeff starts with (all AC)
+  qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant);
+  zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+  qcoeff1 = vec_sign(qcoeff1, coeff1);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+  dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+  eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1));
+
+  // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per
+  // loop iteration.
+  // for 8x8: 16 + 2 x 24 = 64
+  // for 16x16: 16 + 10 x 24 = 256
+  if (n_coeffs > 16) {
+    int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2;
+    bool16x8_t zero_coeff2;
+
+    int index = 16;
+    int off0 = 32;
+    int off1 = 48;
+    int off2 = 64;
+
+    do {
+      coeff0 = vec_vsx_ld(off0, coeff_ptr);
+      coeff1 = vec_vsx_ld(off1, coeff_ptr);
+      coeff2 = vec_vsx_ld(off2, coeff_ptr);
+      scan0 = vec_vsx_ld(off0, iscan);
+      scan1 = vec_vsx_ld(off1, iscan);
+      scan2 = vec_vsx_ld(off2, iscan);
+
+      qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
+      zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+      qcoeff0 = vec_sign(qcoeff0, coeff0);
+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+
+      qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant);
+      zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+      qcoeff1 = vec_sign(qcoeff1, coeff1);
+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+
+      qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant);
+      zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16);
+      qcoeff2 = vec_sign(qcoeff2, coeff2);
+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+      eob = vec_max(eob, vec_or(scan0, zero_coeff0));
+      eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2));
+      eob = vec_max(eob, eob2);
+
+      index += 24;
+      off0 += 48;
+      off1 += 48;
+      off2 += 48;
+    } while (index < n_coeffs);
+  }
+
+  eob = vec_max_across(eob);
+  *eob_ptr = eob[0] + 1;
+}
+
+// Sets the value of a 32-bit integers to 1 when the corresponding value in a is
+// negative.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+  return vec_sr(a, vec_shift_sign_s32);
+}
+
+// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32
+// blocks are twice as big as for other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+                                            int16x8_t dequant) {
+  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe));
+  dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
+  dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
+  dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
+  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
+}
+
+void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                               tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
+  // In stage 1, we quantize 16 coeffs (DC + 15 AC)
+  // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
+  // (32 * 32 - 16) / 24 = 42
+  int num_itr = 42;
+  // Offsets are in bytes, 16 coeffs = 32 bytes
+  int off0 = 32;
+  int off1 = 48;
+  int off2 = 64;
+
+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+  bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1;
+
+  int16x8_t round = vec_vsx_ld(0, round_ptr);
+  int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+  int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+  int16x8_t scan0 = vec_vsx_ld(0, iscan);
+  int16x8_t scan1 = vec_vsx_ld(16, iscan);
+  int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2));
+  int16x8_t abs_coeff0 = vec_abs(coeff0);
+  int16x8_t abs_coeff1 = vec_abs(coeff1);
+
+  (void)scan;
+  (void)n_coeffs;
+
+  mask0 = vec_cmpge(abs_coeff0, thres);
+  round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16);
+  // First set of 8 coeff starts with DC + 7 AC
+  qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16);
+  qcoeff0 = vec_and(qcoeff0, mask0);
+  zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+  qcoeff0 = vec_sign(qcoeff0, coeff0);
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+
+  dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant);
+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+
+  // Remove DC value from thres, round, quant and dequant
+  thres = vec_splat(thres, 1);
+  round = vec_splat(round, 1);
+  quant = vec_splat(quant, 1);
+  dequant = vec_splat(dequant, 1);
+
+  mask1 = vec_cmpge(abs_coeff1, thres);
+
+  // Second set of 8 coeff starts with (all AC)
+  qcoeff1 =
+      vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16);
+  qcoeff1 = vec_and(qcoeff1, mask1);
+  zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+  qcoeff1 = vec_sign(qcoeff1, coeff1);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+  dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant);
+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+  eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1));
+
+  do {
+    int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2;
+    bool16x8_t zero_coeff2, mask2;
+    coeff0 = vec_vsx_ld(off0, coeff_ptr);
+    coeff1 = vec_vsx_ld(off1, coeff_ptr);
+    coeff2 = vec_vsx_ld(off2, coeff_ptr);
+    scan0 = vec_vsx_ld(off0, iscan);
+    scan1 = vec_vsx_ld(off1, iscan);
+    scan2 = vec_vsx_ld(off2, iscan);
+
+    abs_coeff0 = vec_abs(coeff0);
+    abs_coeff1 = vec_abs(coeff1);
+    abs_coeff2 = vec_abs(coeff2);
+
+    qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16);
+    qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16);
+    qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16);
+
+    mask0 = vec_cmpge(abs_coeff0, thres);
+    mask1 = vec_cmpge(abs_coeff1, thres);
+    mask2 = vec_cmpge(abs_coeff2, thres);
+
+    qcoeff0 = vec_and(qcoeff0, mask0);
+    qcoeff1 = vec_and(qcoeff1, mask1);
+    qcoeff2 = vec_and(qcoeff2, mask2);
+
+    zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+    zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+    zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16);
+
+    qcoeff0 = vec_sign(qcoeff0, coeff0);
+    qcoeff1 = vec_sign(qcoeff1, coeff1);
+    qcoeff2 = vec_sign(qcoeff2, coeff2);
+
+    vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+    vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+    vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+    dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant);
+    dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant);
+    dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant);
+
+    vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+    vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+    vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+    eob = vec_max(eob, vec_or(scan0, zero_coeff0));
+    eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2));
+    eob = vec_max(eob, eob2);
+
+    off0 += 48;
+    off1 += 48;
+    off2 += 48;
+    num_itr--;
+  } while (num_itr != 0);
+
+  eob = vec_max_across(eob);
+  *eob_ptr = eob[0] + 1;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c
index 3aeefb5845..acc3764c7a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c
@@ -15,7 +15,7 @@ struct ALT_REF_AQ {
   int dummy;
 };
 
-struct ALT_REF_AQ *vp9_alt_ref_aq_create() {
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) {
   return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ));
 }
 
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h
index 18acd8a85b..22a657e035 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h
@@ -15,8 +15,8 @@
  *  for altref frames.  Go to alt_ref_aq_private.h for implmentation details.
  */
 
-#ifndef VP9_ENCODER_VP9_ALT_REF_AQ_H_
-#define VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -54,7 +54,7 @@ struct ALT_REF_AQ;
  *
  * \return Instance of the class
  */
-struct ALT_REF_AQ *vp9_alt_ref_aq_create();
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void);
 
 /*!\brief Upload segmentation_map to self object
  *
@@ -124,4 +124,4 @@ void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#endif  // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h
index b1b56561d8..749d3c198a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_AQ_360_H_
-#define VP9_ENCODER_VP9_AQ_360_H_
+#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_
+#define VPX_VP9_ENCODER_VP9_AQ_360_H_
 
 #include "vp9/encoder/vp9_encoder.h"
 
@@ -24,4 +24,4 @@ void vp9_360aq_frame_setup(VP9_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_360_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c
index bd3812036c..ef3423f8eb 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
           &cpi->rc, cm->frame_type, cm->base_qindex,
           aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
 
-      // For AQ complexity mode, we dont allow Q0 in a segment if the base
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
       // Q delta is sometimes applied without going back around the rd loop.
       // This could lead to an illegal combination of partition size and q.
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h
index a00d34e702..d3cb34c013 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
-#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,4 +33,4 @@ void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index 3dc88b1914..92a31ebf63 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -21,6 +21,14 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
 
+static const uint8_t VP9_VAR_OFFS[64] = {
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+  128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
@@ -39,13 +47,17 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   }
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
+  cr->counter_encode_maxq_scene_change = 0;
+  cr->content_mode = 1;
   return cr;
 }
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
-  vpx_free(cr->map);
-  vpx_free(cr->last_coded_q_map);
-  vpx_free(cr);
+  if (cr != NULL) {
+    vpx_free(cr->map);
+    vpx_free(cr->last_coded_q_map);
+    vpx_free(cr);
+  }
 }
 
 // Check if this coding block, of size bsize, should be considered for refresh
@@ -102,18 +114,18 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
   double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
   double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
   // Take segment weighted average for estimated bits.
-  estimated_bits =
-      (int)((1.0 - weight_segment1 - weight_segment2) *
-                vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
-                                       correction_factor, cm->bit_depth) +
-            weight_segment1 *
-                vp9_estimate_bits_at_q(cm->frame_type,
-                                       cm->base_qindex + cr->qindex_delta[1],
-                                       mbs, correction_factor, cm->bit_depth) +
-            weight_segment2 *
-                vp9_estimate_bits_at_q(cm->frame_type,
-                                       cm->base_qindex + cr->qindex_delta[2],
-                                       mbs, correction_factor, cm->bit_depth));
+  estimated_bits = (int)round(
+      (1.0 - weight_segment1 - weight_segment2) *
+          vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+                                 correction_factor, cm->bit_depth) +
+      weight_segment1 *
+          vp9_estimate_bits_at_q(cm->frame_type,
+                                 cm->base_qindex + cr->qindex_delta[1], mbs,
+                                 correction_factor, cm->bit_depth) +
+      weight_segment2 *
+          vp9_estimate_bits_at_q(cm->frame_type,
+                                 cm->base_qindex + cr->qindex_delta[2], mbs,
+                                 correction_factor, cm->bit_depth));
   return estimated_bits;
 }
 
@@ -127,28 +139,19 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int bits_per_mb;
-  int num8x8bl = cm->MBs << 2;
-  // Compute delta-q corresponding to qindex i.
-  int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
-  // Weight for segment prior to encoding: take the average of the target
-  // number for the frame to be encoded and the actual from the previous frame.
-  // Use the target if its less.
-  int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
-  double weight_segment_target = (double)(target_refresh) / num8x8bl;
-  double weight_segment =
-      (double)((target_refresh + cr->actual_num_seg1_blocks +
-                cr->actual_num_seg2_blocks) >>
-               1) /
-      num8x8bl;
-  if (weight_segment_target < 7 * weight_segment / 8)
-    weight_segment = weight_segment_target;
+  int deltaq = 0;
+  if (cpi->oxcf.speed < 8)
+    deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+  else
+    deltaq = -(cr->max_qdelta_perc * i) / 200;
   // Take segment weighted average for bits per mb.
-  bits_per_mb = (int)((1.0 - weight_segment) *
-                          vp9_rc_bits_per_mb(cm->frame_type, i,
-                                             correction_factor, cm->bit_depth) +
-                      weight_segment *
-                          vp9_rc_bits_per_mb(cm->frame_type, i + deltaq,
-                                             correction_factor, cm->bit_depth));
+  bits_per_mb =
+      (int)round((1.0 - cr->weight_segment) *
+                     vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor,
+                                        cm->bit_depth) +
+                 cr->weight_segment *
+                     vp9_rc_bits_per_mb(cm->frame_type, i + deltaq,
+                                        correction_factor, cm->bit_depth));
   return bits_per_mb;
 }
 
@@ -186,7 +189,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, MODE_INFO *const mi,
 
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
-  if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
+  if (cpi->sf.use_nonrd_pick_mode &&
+      cyclic_refresh_segment_id_boosted(mi->segment_id)) {
     mi->segment_id = refresh_this_block;
     // Reset segment_id if it will be skipped.
     if (skip) mi->segment_id = CR_SEGMENT_ID_BASE;
@@ -250,24 +254,66 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
     }
 }
 
-// Update the actual number of blocks that were applied the segment delta q.
+// From the just encoded frame: update the actual number of blocks that were
+// applied the segment delta q, and the amount of low motion in the frame.
+// Also check conditions for forcing golden update, or preventing golden
+// update if the period is up.
 void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi = cm->mi_grid_visible;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  RATE_CONTROL *const rc = &cpi->rc;
   unsigned char *const seg_map = cpi->segmentation_map;
+  double fraction_low = 0.0;
+  int force_gf_refresh = 0;
+  int low_content_frame = 0;
   int mi_row, mi_col;
   cr->actual_num_seg1_blocks = 0;
   cr->actual_num_seg2_blocks = 0;
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
-      if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) ==
-          CR_SEGMENT_ID_BOOST1)
+      MV mv = mi[0]->mv[0].as_mv;
+      int map_index = mi_row * cm->mi_cols + mi_col;
+      if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST1)
         cr->actual_num_seg1_blocks++;
-      else if (cyclic_refresh_segment_id(
-                   seg_map[mi_row * cm->mi_cols + mi_col]) ==
+      else if (cyclic_refresh_segment_id(seg_map[map_index]) ==
                CR_SEGMENT_ID_BOOST2)
         cr->actual_num_seg2_blocks++;
+      // Accumulate low_content_frame.
+      if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16)
+        low_content_frame++;
+      mi++;
     }
+    mi += 8;
+  }
+  // Check for golden frame update: only for non-SVC and non-golden boost.
+  if (!cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 &&
+      !cpi->oxcf.gf_cbr_boost_pct) {
+    // Force this frame as a golden update frame if this frame changes the
+    // resolution (resize_pending != 0).
+    if (cpi->resize_pending != 0) {
+      vp9_cyclic_refresh_set_golden_update(cpi);
+      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+      if (rc->frames_till_gf_update_due > rc->frames_to_key)
+        rc->frames_till_gf_update_due = rc->frames_to_key;
+      cpi->refresh_golden_frame = 1;
+      force_gf_refresh = 1;
+    }
+    // Update average of low content/motion in the frame.
+    fraction_low = (double)low_content_frame / (cm->mi_rows * cm->mi_cols);
+    cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+    if (!force_gf_refresh && cpi->refresh_golden_frame == 1 &&
+        rc->frames_since_key > rc->frames_since_golden + 1) {
+      // Don't update golden reference if the amount of low_content for the
+      // current encoded frame is small, or if the recursive average of the
+      // low_content over the update interval window falls below threshold.
+      if (fraction_low < 0.65 || cr->low_content_avg < 0.6) {
+        cpi->refresh_golden_frame = 0;
+      }
+      // Reset for next internal.
+      cr->low_content_avg = fraction_low;
+    }
+  }
 }
 
 // Set golden frame update interval, for non-svc 1 pass CBR mode.
@@ -282,72 +328,31 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
   else
     rc->baseline_gf_interval = 40;
   if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
+  if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 &&
+      cr->content_mode)
+    rc->baseline_gf_interval = 10;
 }
 
-// Update some encoding stats (from the just encoded frame). If this frame's
-// background has high motion, refresh the golden frame. Otherwise, if the
-// golden reference is to be updated check if we should NOT update the golden
-// ref.
-void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  int mi_row, mi_col;
-  double fraction_low = 0.0;
-  int low_content_frame = 0;
-  MODE_INFO **mi = cm->mi_grid_visible;
-  RATE_CONTROL *const rc = &cpi->rc;
-  const int rows = cm->mi_rows, cols = cm->mi_cols;
-  int cnt1 = 0, cnt2 = 0;
-  int force_gf_refresh = 0;
-  int flag_force_gf_high_motion = 0;
-  for (mi_row = 0; mi_row < rows; mi_row++) {
-    for (mi_col = 0; mi_col < cols; mi_col++) {
-      if (flag_force_gf_high_motion == 1) {
-        int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0
-                              ? mi[0]->mv[0].as_mv.row
-                              : -1 * mi[0]->mv[0].as_mv.row;
-        int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0
-                              ? mi[0]->mv[0].as_mv.col
-                              : -1 * mi[0]->mv[0].as_mv.col;
-        // Calculate the motion of the background.
-        if (abs_mvr <= 16 && abs_mvc <= 16) {
-          cnt1++;
-          if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
-        }
-      }
-      mi++;
-      // Accumulate low_content_frame.
-      if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
-    }
-    mi += 8;
-  }
-  // For video conference clips, if the background has high motion in current
-  // frame because of the camera movement, set this frame as the golden frame.
-  // Use 70% and 5% as the thresholds for golden frame refreshing.
-  // Also, force this frame as a golden update frame if this frame will change
-  // the resolution (resize_pending != 0).
-  if (cpi->resize_pending != 0 ||
-      (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
-    vp9_cyclic_refresh_set_golden_update(cpi);
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
-    if (rc->frames_till_gf_update_due > rc->frames_to_key)
-      rc->frames_till_gf_update_due = rc->frames_to_key;
-    cpi->refresh_golden_frame = 1;
-    force_gf_refresh = 1;
-  }
-  fraction_low = (double)low_content_frame / (rows * cols);
-  // Update average.
-  cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
-  if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
-    // Don't update golden reference if the amount of low_content for the
-    // current encoded frame is small, or if the recursive average of the
-    // low_content over the update interval window falls below threshold.
-    if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
-      cpi->refresh_golden_frame = 0;
-    // Reset for next internal.
-    cr->low_content_avg = fraction_low;
+static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index,
+                                     int sb_col_index) {
+  unsigned int source_variance;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const int ystride = cpi->Source->y_stride;
+  unsigned int sse;
+  const BLOCK_SIZE bsize = BLOCK_64X64;
+  src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6);
+  source_variance =
+      cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse);
+  if (source_variance == 0) {
+    uint64_t block_sad;
+    const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
+    const int last_ystride = cpi->Last_Source->y_stride;
+    last_src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6);
+    block_sad =
+        cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride);
+    if (block_sad == 0) return 1;
   }
+  return 0;
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -386,7 +391,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
           ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
           : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
   // More aggressive settings for noisy content.
-  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+      cr->content_mode) {
     consec_zero_mv_thresh = 60;
     qindex_thresh =
         VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
@@ -400,8 +406,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * MI_BLOCK_SIZE;
     int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int flat_static_blocks = 0;
+    int compute_content = 1;
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) compute_content = 0;
+#endif
+    if (cr->content_mode == 0 || cpi->Last_Source == NULL ||
+        cpi->Last_Source->y_width != cpi->Source->y_width ||
+        cpi->Last_Source->y_height != cpi->Source->y_height)
+      compute_content = 0;
     bl_index = mi_row * cm->mi_cols + mi_col;
     // Loop through all 8x8 blocks in superblock and update map.
     xmis =
@@ -410,16 +425,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
     if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
         (xmis <= 2 || ymis <= 2))
-      consec_zero_mv_thresh_block = 10;
+      consec_zero_mv_thresh_block = 4;
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
         const int bl_index2 = bl_index + y * cm->mi_cols + x;
         // If the block is as a candidate for clean up then mark it
         // for possible boost/refresh (segment 1). The segment id may get
-        // reset to 0 later if block gets coded anything other than ZEROMV.
+        // reset to 0 later depending on the coding mode.
         if (cr->map[bl_index2] == 0) {
           count_tot++;
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
+          if (cr->content_mode == 0 ||
+              cr->last_coded_q_map[bl_index2] > qindex_thresh ||
               cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) {
             sum_map++;
             count_sel++;
@@ -432,11 +448,21 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
     // Enforce constant segment over superblock.
     // If segment is at least half of superblock, set to 1.
     if (sum_map >= xmis * ymis / 2) {
-      for (y = 0; y < ymis; y++)
-        for (x = 0; x < xmis; x++) {
-          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
-        }
-      cr->target_num_seg_blocks += xmis * ymis;
+      // This superblock is a candidate for refresh:
+      // compute spatial variance and exclude blocks that are spatially flat
+      // and stationary. Note: this is currently only done for screne content
+      // mode.
+      if (compute_content && cr->skip_flat_static_blocks)
+        flat_static_blocks =
+            is_superblock_flat_static(cpi, sb_row_index, sb_col_index);
+      if (!flat_static_blocks) {
+        // Label this superblock as segment 1.
+        for (y = 0; y < ymis; y++)
+          for (x = 0; x < xmis; x++) {
+            seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+          }
+        cr->target_num_seg_blocks += xmis * ymis;
+      }
     }
     i++;
     if (i == sbs_in_frame) {
@@ -445,7 +471,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
   } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
   cr->sb_index = i;
   cr->reduce_refresh = 0;
-  if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1;
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+    if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1;
 }
 
 // Set cyclic refresh parameters.
@@ -453,9 +480,33 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int num8x8bl = cm->MBs << 2;
+  int target_refresh = 0;
+  double weight_segment_target = 0;
+  double weight_segment = 0;
+  int thresh_low_motion = 20;
+  int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20,
+                         rc->best_quality << 1);
+  int qp_max_thresh = 117 * MAXQ >> 7;
+  cr->apply_cyclic_refresh = 1;
+  if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 ||
+      is_lossless_requested(&cpi->oxcf) ||
+      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (cpi->use_svc &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+      (!cpi->use_svc && cr->content_mode &&
+       rc->avg_frame_low_motion < thresh_low_motion &&
+       rc->frames_since_key > 40) ||
+      (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh &&
+       rc->frames_since_key > 20) ||
+      (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+       rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) {
+    cr->apply_cyclic_refresh = 0;
+    return;
+  }
   cr->percent_refresh = 10;
   if (cr->reduce_refresh) cr->percent_refresh = 5;
-  cr->max_qdelta_perc = 50;
+  cr->max_qdelta_perc = 60;
   cr->time_for_refresh = 0;
   cr->motion_thresh = 32;
   cr->rate_boost_fac = 15;
@@ -468,20 +519,38 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
-    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+    if (cr->content_mode && cpi->noise_estimate.enabled &&
+        cpi->noise_estimate.level >= kMedium) {
       // Reduce the delta-qp if the estimated source noise is above threshold.
       cr->rate_ratio_qdelta = 1.7;
       cr->rate_boost_fac = 13;
     }
   }
-  // Adjust some parameters for low resolutions at low bitrates.
-  if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) {
-    cr->motion_thresh = 16;
-    cr->rate_boost_fac = 13;
+  // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and
+  // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2
+  // (rate_boost_fac = 10 disables segment#2).
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+    // Only enable feature of skipping flat_static blocks for top layer
+    // under screen content mode.
+    if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+      cr->skip_flat_static_blocks = 1;
+    cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10;
+    // Increase the amount of refresh on scene change that is encoded at max Q,
+    // increase for a few cycles of the refresh period (~100 / percent_refresh).
+    if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30)
+      cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15;
+    cr->rate_ratio_qdelta = 2.0;
+    cr->rate_boost_fac = 10;
   }
-  if (cpi->svc.spatial_layer_id > 0) {
-    cr->motion_thresh = 4;
-    cr->rate_boost_fac = 12;
+  // Adjust some parameters for low resolutions.
+  if (cm->width * cm->height <= 352 * 288) {
+    if (rc->avg_frame_bandwidth < 3000) {
+      cr->motion_thresh = 64;
+      cr->rate_boost_fac = 13;
+    } else {
+      cr->max_qdelta_perc = 70;
+      cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
+    }
   }
   if (cpi->oxcf.rc_mode == VPX_VBR) {
     // To be adjusted for VBR mode, e.g., based on gf period and boost.
@@ -490,11 +559,37 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->percent_refresh = 10;
     cr->rate_ratio_qdelta = 1.5;
     cr->rate_boost_fac = 10;
-    if (cpi->refresh_golden_frame == 1) {
+    if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) {
       cr->percent_refresh = 0;
       cr->rate_ratio_qdelta = 1.0;
     }
   }
+  // Weight for segment prior to encoding: take the average of the target
+  // number for the frame to be encoded and the actual from the previous frame.
+  // Use the target if its less. To be used for setting the base qp for the
+  // frame in vp9_rc_regulate_q.
+  target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  weight_segment_target = (double)(target_refresh) / num8x8bl;
+  weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks +
+                             cr->actual_num_seg2_blocks) >>
+                            1) /
+                   num8x8bl;
+  if (weight_segment_target < 7 * weight_segment / 8)
+    weight_segment = weight_segment_target;
+  // For screen-content: don't include target for the weight segment,
+  // since for all flat areas the segment is reset, so its more accurate
+  // to just use the previous actual number of seg blocks for the weight.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+    weight_segment =
+        (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) /
+        num8x8bl;
+  cr->weight_segment = weight_segment;
+  if (cr->content_mode == 0) {
+    cr->actual_num_seg1_blocks =
+        cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+    cr->actual_num_seg2_blocks = 0;
+    cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl;
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -503,28 +598,33 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  // TODO(marpan): Look into whether we should reduce the amount/delta-qp
-  // instead of completely shutting off at low bitrates. For now keep it on.
-  // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
-  const int apply_cyclic_refresh = 1;
+  int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->use_svc && cpi->svc.high_source_sad_superframe);
   if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
-  // Don't apply refresh on key frame or temporal enhancement layer frames.
-  if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) ||
-      (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) {
+  // Reset if resoluton change has occurred.
+  if (cpi->resize_pending != 0 && cpi->svc.temporal_layer_id == 0)
+    vp9_cyclic_refresh_reset_resize(cpi);
+  if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) ||
+      scene_change_detected) {
     // Set segmentation map to 0 and disable.
     unsigned char *const seg_map = cpi->segmentation_map;
     memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
     vp9_disable_segmentation(&cm->seg);
-    if (cm->frame_type == KEY_FRAME) {
+    if ((cm->frame_type == KEY_FRAME || scene_change_detected) &&
+        cpi->svc.temporal_layer_id == 0) {
       memset(cr->last_coded_q_map, MAXQ,
              cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
+      cr->reduce_refresh = 0;
+      cr->counter_encode_maxq_scene_change = 0;
     }
     return;
   } else {
     int qindex_delta = 0;
     int qindex2;
     const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+    cr->counter_encode_maxq_scene_change++;
     vpx_clear_system_state();
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
@@ -574,9 +674,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     cr->qindex_delta[2] = qindex_delta;
     vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
 
-    // Reset if resoluton change has occurred.
-    if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi);
-
     // Update the segmentation and refresh map.
     cyclic_refresh_update_map(cpi);
   }
@@ -590,8 +687,19 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
-  memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
+  memset(cr->last_coded_q_map, MAXQ,
+         cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
   cr->sb_index = 0;
   cpi->refresh_golden_frame = 1;
   cpi->refresh_alt_ref_frame = 1;
+  cr->counter_encode_maxq_scene_change = 0;
+}
+
+void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) {
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // For now apply hard limit to frame-level decrease in q, if the cyclic
+  // refresh is active (percent_refresh > 0).
+  if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) {
+    *q = cpi->rc.q_1_frame - 8;
+  }
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
index a4be031295..c74cee4743 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
-#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
@@ -66,6 +66,11 @@ struct CYCLIC_REFRESH {
   double low_content_avg;
   int qindex_delta[3];
   int reduce_refresh;
+  double weight_segment;
+  int apply_cyclic_refresh;
+  int counter_encode_maxq_scene_change;
+  int skip_flat_static_blocks;
+  int content_mode;
 };
 
 struct VP9_COMP;
@@ -100,19 +105,15 @@ void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
                                              int mi_row, int mi_col,
                                              BLOCK_SIZE bsize);
 
-// Update the segmentation map, and related quantities: cyclic refresh map,
-// refresh sb_index, and target number of blocks to be refreshed.
-void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi);
-
-// Update the actual number of blocks that were applied the segment delta q.
+// From the just encoded frame: update the actual number of blocks that were
+// applied the segment delta q, and the amount of low motion in the frame.
+// Also check conditions for forcing golden update, or preventing golden
+// update if the period is up.
 void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
 
 // Set golden frame update interval, for non-svc 1 pass CBR mode.
 void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
 
-// Check if we should not update golden reference, based on past refresh stats.
-void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
-
 // Set/update global/frame level refresh parameters.
 void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
 
@@ -137,8 +138,10 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) {
     return CR_SEGMENT_ID_BASE;
 }
 
+void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c
index 477f62ba5a..2d57615259 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -19,6 +19,7 @@
 
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_segmentation.h"
 
 #define ENERGY_MIN (-4)
@@ -31,7 +32,7 @@ static const double rate_ratio[MAX_SEGMENTS] = { 2.5,  2.0, 1.5, 1.0,
                                                  0.75, 1.0, 1.0, 1.0 };
 static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
 
-#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
 DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 };
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -108,7 +109,7 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
 #if CONFIG_VP9_HIGHBITDEPTH
 static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
                                  const uint8_t *b8, int b_stride, int w, int h,
-                                 uint64_t *sse, uint64_t *sum) {
+                                 uint64_t *sse, int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -127,15 +128,6 @@ static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
   }
 }
 
-static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
-                                 const uint8_t *b8, int b_stride, int w, int h,
-                                 unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
@@ -153,11 +145,13 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
     int avg;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+      uint64_t sse64 = 0;
+      int64_t sum64 = 0;
+      aq_highbd_variance64(x->plane[0].src.buf, x->plane[0].src.stride,
                            CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
-                           &sse, &avg);
-      sse >>= 2 * (xd->bd - 8);
-      avg >>= (xd->bd - 8);
+                           &sse64, &sum64);
+      sse = (unsigned int)(sse64 >> (2 * (xd->bd - 8)));
+      avg = (int)(sum64 >> (xd->bd - 8));
     } else {
       aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0,
                   bw, bh, &sse, &avg);
@@ -193,12 +187,61 @@ double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
 }
 
 #define DEFAULT_E_MIDPOINT 10.0
-int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+static int scale_block_energy(VP9_COMP *cpi, unsigned int block_var) {
   double energy;
   double energy_midpoint;
-  vpx_clear_system_state();
   energy_midpoint =
       (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
-  energy = vp9_log_block_var(cpi, x, bs) - energy_midpoint;
+  energy = log(block_var + 1.0) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
 }
+#undef DEFAULT_E_MIDPOINT
+
+// Get the range of sub block energy values;
+void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int *min_e,
+                              int *max_e) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  int x, y;
+
+  if (xmis < bw || ymis < bh) {
+    vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+    *min_e = vp9_block_energy(cpi, mb, bsize);
+    *max_e = *min_e;
+  } else {
+    unsigned int var;
+    // Because scale_block_energy is non-decreasing, we can find the min/max
+    // block variance and scale afterwards. This avoids a costly scaling at
+    // every iteration.
+    unsigned int min_var = UINT_MAX;
+    unsigned int max_var = 0;
+
+    for (y = 0; y < ymis; ++y) {
+      for (x = 0; x < xmis; ++x) {
+        vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x);
+        vpx_clear_system_state();
+        var = block_variance(cpi, mb, BLOCK_8X8);
+        vpx_clear_system_state();
+        min_var = VPXMIN(min_var, var);
+        max_var = VPXMAX(max_var, var);
+      }
+    }
+    *min_e = scale_block_energy(cpi, min_var);
+    *max_e = scale_block_energy(cpi, max_var);
+  }
+
+  // Re-instate source pointers back to what they should have been on entry.
+  vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+}
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int var;
+  vpx_clear_system_state();
+  var = block_variance(cpi, x, bs);
+  vpx_clear_system_state();
+  return scale_block_energy(cpi, var);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h
index 211a69f392..a4f872879d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_
-#define VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
 
 #include "vp9/encoder/vp9_encoder.h"
 
@@ -20,11 +20,15 @@ extern "C" {
 unsigned int vp9_vaq_segment_id(int energy);
 void vp9_vaq_frame_setup(VP9_COMP *cpi);
 
+void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int *min_e,
+                              int *max_e);
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
 double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c
index 49aea69ebd..8df16691f5 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c
@@ -9,6 +9,7 @@
  */
 
 #include <assert.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <limits.h>
 
@@ -18,6 +19,9 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -39,8 +43,10 @@ static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
   { 0, 1 },  { 6, 3 },   { 28, 5 },  { 30, 5 }, { 58, 6 },
   { 59, 6 }, { 126, 7 }, { 127, 7 }, { 62, 6 }, { 2, 2 }
 };
-static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
-    { { 0, 1 }, { 2, 2 }, { 3, 2 } };
+static const struct vp9_token
+    switchable_interp_encodings[SWITCHABLE_FILTERS] = { { 0, 1 },
+                                                        { 2, 2 },
+                                                        { 3, 2 } };
 static const struct vp9_token partition_encodings[PARTITION_TYPES] = {
   { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 }
 };
@@ -50,6 +56,7 @@ static const struct vp9_token inter_mode_encodings[INTER_MODES] = {
 
 static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
                              const vpx_prob *probs) {
+  assert(!is_inter_mode(mode));
   vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
@@ -86,7 +93,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm,
   BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   const vpx_prob *const tx_probs =
-      get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+      get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs);
   vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
@@ -129,9 +136,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
   const TOKENEXTRA *p;
   const vp9_extra_bit *const extra_bits =
 #if CONFIG_VP9_HIGHBITDEPTH
-      (bit_depth == VPX_BITS_12)
-          ? vp9_extra_bits_high12
-          : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits;
+      (bit_depth == VPX_BITS_12)   ? vp9_extra_bits_high12
+      : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10
+                                   : vp9_extra_bits;
 #else
       vp9_extra_bits;
   (void)bit_depth;
@@ -164,8 +171,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
         vpx_write_bit(w, p->extra & 1);
       } else {  // t >= TWO_TOKEN && t < EOB_TOKEN
         const struct vp9_token *const a = &vp9_coef_encodings[t];
-        const int v = a->value;
-        const int n = a->len;
+        int v = a->value;
+        int n = a->len;
         const int e = p->extra;
         vpx_write(w, 1, context_tree[2]);
         vp9_write_tree(w, vp9_coef_con_tree,
@@ -174,8 +181,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
         if (t >= CATEGORY1_TOKEN) {
           const vp9_extra_bit *const b = &extra_bits[t];
           const unsigned char *pb = b->prob;
-          int v = e >> 1;
-          int n = b->len;  // number of bits in v, assumed nonzero
+          v = e >> 1;
+          n = b->len;  // number of bits in v, assumed nonzero
           do {
             const int bb = (v >> --n) & 1;
             vpx_write(w, bb, *pb++);
@@ -217,7 +224,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
     }
 
     if (is_compound) {
-      vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME,
+      const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1],
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
       const int bit0 = mi->ref_frame[0] != LAST_FRAME;
@@ -234,8 +242,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd,
                                 const MB_MODE_INFO_EXT *const mbmi_ext,
                                 vpx_writer *w,
                                 unsigned int *const max_mv_magnitude,
-                                int interp_filter_selected[MAX_REF_FRAMES]
-                                                          [SWITCHABLE]) {
+                                int interp_filter_selected[][SWITCHABLE]) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
   const struct segmentation *const seg = &cm->seg;
@@ -373,8 +380,7 @@ static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd,
                           TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
                           int mi_row, int mi_col,
                           unsigned int *const max_mv_magnitude,
-                          int interp_filter_selected[MAX_REF_FRAMES]
-                                                    [SWITCHABLE]) {
+                          int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   const MB_MODE_INFO_EXT *const mbmi_ext =
       cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
@@ -424,8 +430,7 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
                            int mi_row, int mi_col, BLOCK_SIZE bsize,
                            unsigned int *const max_mv_magnitude,
-                           int interp_filter_selected[MAX_REF_FRAMES]
-                                                     [SWITCHABLE]) {
+                           int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   const int bsl = b_width_log2_lookup[bsize];
   const int bs = (1 << bsl) / 4;
@@ -463,7 +468,8 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
           write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
                         max_mv_magnitude, interp_filter_selected);
         break;
-      case PARTITION_SPLIT:
+      default:
+        assert(partition == PARTITION_SPLIT);
         write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize,
                        max_mv_magnitude, interp_filter_selected);
         write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
@@ -473,7 +479,6 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
         write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
                        subsize, max_mv_magnitude, interp_filter_selected);
         break;
-      default: assert(0);
     }
   }
 
@@ -484,23 +489,30 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
 }
 
 static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd,
-                        const TileInfo *const tile, vpx_writer *w,
-                        TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
-                        unsigned int *const max_mv_magnitude,
-                        int interp_filter_selected[MAX_REF_FRAMES]
-                                                  [SWITCHABLE]) {
+                        const TileInfo *const tile, vpx_writer *w, int tile_row,
+                        int tile_col, unsigned int *const max_mv_magnitude,
+                        int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
-  int mi_row, mi_col;
+  int mi_row, mi_col, tile_sb_row;
+  TOKENEXTRA *tok = NULL;
+  TOKENEXTRA *tok_end = NULL;
 
   set_partition_probs(cm, xd);
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
+    tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >>
+                  MI_BLOCK_SIZE_LOG2;
+    tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start;
+    tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count;
+
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+      write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col,
                      BLOCK_64X64, max_mv_magnitude, interp_filter_selected);
+
+    assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop);
   }
 }
 
@@ -544,7 +556,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
   switch (cpi->sf.use_fast_coef_updates) {
     case TWO_LOOP: {
       /* dry run to see if there is any update at all needed */
-      int savings = 0;
+      int64_t savings = 0;
       int update[2] = { 0, 0 };
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
@@ -553,7 +565,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
-                int s;
+                int64_t s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
@@ -589,8 +601,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                const vpx_prob upd = DIFF_UPDATE_PROB;
-                int s;
+                int64_t s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
@@ -614,9 +625,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
       return;
     }
 
-    case ONE_LOOP_REDUCED: {
+    default: {
       int updates = 0;
       int noupdates_before_first = 0;
+      assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -625,7 +637,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                int s;
+                int64_t s;
                 int u = 0;
 
                 if (t == PIVOT_NODE) {
@@ -666,7 +678,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
       }
       return;
     }
-    default: assert(0);
   }
 }
 
@@ -890,6 +901,19 @@ static void write_tile_info(const VP9_COMMON *const cm,
 }
 
 int vp9_get_refresh_mask(VP9_COMP *cpi) {
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_gop_decision != NULL) {
+    GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const int this_gf_index = gf_group->index;
+    const int update_ref_idx = gf_group->update_ref_idx[this_gf_index];
+
+    if (update_ref_idx != INVALID_IDX) {
+      return (1 << update_ref_idx);
+    } else {
+      return 0;
+    }
+  }
   if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term we leave it in the GF slot and,
@@ -905,25 +929,40 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
            (cpi->refresh_golden_frame << cpi->alt_fb_idx);
   } else {
     int arf_idx = cpi->alt_fb_idx;
-    if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      arf_idx = gf_group->arf_update_idx[gf_group->index];
+    GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+    if (cpi->multi_layer_arf) {
+      for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) {
+        if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx &&
+            arf_idx != cpi->gld_fb_idx) {
+          int idx;
+          for (idx = 0; idx < gf_group->stack_size; ++idx)
+            if (arf_idx == gf_group->arf_index_stack[idx]) break;
+          if (idx == gf_group->stack_size) break;
+        }
+      }
     }
+    cpi->twopass.gf_group.top_arf_idx = arf_idx;
+
+    if (cpi->use_svc && cpi->svc.use_set_ref_frame_config &&
+        cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS)
+      return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id];
     return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
            (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
            (cpi->refresh_alt_ref_frame << arf_idx);
   }
 }
 
-static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
+static int encode_tile_worker(void *arg1, void *arg2) {
+  VP9_COMP *cpi = (VP9_COMP *)arg1;
+  VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2;
   MACROBLOCKD *const xd = &data->xd;
-  vpx_start_encode(&data->bit_writer, data->dest);
+  const int tile_row = 0;
+  vpx_start_encode(&data->bit_writer, data->dest, data->dest_size);
   write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
-              &data->bit_writer, &data->tok, data->tok_end,
+              &data->bit_writer, tile_row, data->tile_idx,
               &data->max_mv_magnitude, data->interp_filter_selected);
-  assert(data->tok == data->tok_end);
-  vpx_stop_encode(&data->bit_writer);
-  return 1;
+  return vpx_stop_encode(&data->bit_writer) == 0;
 }
 
 void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
@@ -937,36 +976,47 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
   }
 }
 
-static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+static size_t encode_tiles_buffer_alloc_size(const VP9_COMP *cpi) {
+  const VP9_COMMON *cm = &cpi->common;
+  const int image_bps =
+      (8 + 2 * (8 >> (cm->subsampling_x + cm->subsampling_y))) *
+      (1 + (cm->bit_depth > 8));
+  const int64_t size =
+      (int64_t)cpi->oxcf.width * cpi->oxcf.height * image_bps / 8;
+  return (size_t)size;
+}
+
+static void encode_tiles_buffer_alloc(VP9_COMP *const cpi,
+                                      size_t buffer_alloc_size) {
+  VP9_COMMON *const cm = &cpi->common;
   int i;
   const size_t worker_data_size =
       cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
-  cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
+  CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data,
+                  vpx_memalign(16, worker_data_size));
   memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
-  if (!cpi->vp9_bitstream_worker_data) return 1;
   for (i = 1; i < cpi->num_workers; ++i) {
-    cpi->vp9_bitstream_worker_data[i].dest_size =
-        cpi->oxcf.width * cpi->oxcf.height;
-    cpi->vp9_bitstream_worker_data[i].dest =
-        vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size);
-    if (!cpi->vp9_bitstream_worker_data[i].dest) return 1;
+    CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest,
+                    vpx_malloc(buffer_alloc_size));
+    cpi->vp9_bitstream_worker_data[i].dest_size = buffer_alloc_size;
   }
-  return 0;
 }
 
-static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
+static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr,
+                              size_t data_size) {
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int num_workers = cpi->num_workers;
   size_t total_size = 0;
   int tile_col = 0;
+  int error = 0;
 
+  const size_t buffer_alloc_size = encode_tiles_buffer_alloc_size(cpi);
   if (!cpi->vp9_bitstream_worker_data ||
-      cpi->vp9_bitstream_worker_data[1].dest_size >
-          (cpi->oxcf.width * cpi->oxcf.height)) {
+      cpi->vp9_bitstream_worker_data[1].dest_size != buffer_alloc_size) {
     vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
-    if (encode_tiles_buffer_alloc(cpi)) return 0;
+    encode_tiles_buffer_alloc(cpi, buffer_alloc_size);
   }
 
   while (tile_col < tile_cols) {
@@ -978,8 +1028,6 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
       // Populate the worker data.
       data->xd = cpi->td.mb.e_mbd;
       data->tile_idx = tile_col;
-      data->tok = cpi->tile_tok[0][tile_col];
-      data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
       data->max_mv_magnitude = cpi->max_mv_magnitude;
       memset(data->interp_filter_selected, 0,
              sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);
@@ -988,12 +1036,17 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
       if (i == 0) {
         // If this worker happens to be for the last tile, then do not offset it
         // by 4 for the tile size.
-        data->dest =
-            data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4);
+        const size_t offset = total_size + (tile_col == tile_cols - 1 ? 0 : 4);
+        if (data_size < offset) {
+          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                             "encode_tiles_mt: output buffer full");
+        }
+        data->dest = data_ptr + offset;
+        data->dest_size = data_size - offset;
       }
       worker->data1 = cpi;
       worker->data2 = data;
-      worker->hook = (VPxWorkerHook)encode_tile_worker;
+      worker->hook = encode_tile_worker;
       worker->had_error = 0;
 
       if (i < num_workers - 1) {
@@ -1010,7 +1063,11 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
       uint32_t tile_size;
       int k;
 
-      if (!winterface->sync(worker)) return 0;
+      if (!winterface->sync(worker)) {
+        error = 1;
+        continue;
+      }
+
       tile_size = data->bit_writer.pos;
 
       // Aggregate per-thread bitstream stats.
@@ -1022,24 +1079,35 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
 
       // Prefix the size of the tile on all but the last.
       if (tile_col != tile_cols || j < i - 1) {
+        if (data_size - total_size < 4) {
+          error = 1;
+          continue;
+        }
         mem_put_be32(data_ptr + total_size, tile_size);
         total_size += 4;
       }
       if (j > 0) {
+        if (data_size - total_size < tile_size) {
+          error = 1;
+          continue;
+        }
         memcpy(data_ptr + total_size, data->dest, tile_size);
       }
       total_size += tile_size;
     }
+    if (error) {
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "encode_tiles_mt: output buffer full");
+    }
   }
   return total_size;
 }
 
-static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
+static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr, size_t data_size) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   vpx_writer residual_bc;
   int tile_row, tile_col;
-  TOKENEXTRA *tok_end;
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
@@ -1052,27 +1120,32 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   // that it does not make the overall process worse in any case.
   if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
       tile_cols > 1) {
-    return encode_tiles_mt(cpi, data_ptr);
+    return encode_tiles_mt(cpi, data_ptr, data_size);
   }
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       int tile_idx = tile_row * tile_cols + tile_col;
-      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
-
-      tok_end = cpi->tile_tok[tile_row][tile_col] +
-                cpi->tok_count[tile_row][tile_col];
 
+      size_t offset;
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
-        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
+        offset = total_size + 4;
       else
-        vpx_start_encode(&residual_bc, data_ptr + total_size);
+        offset = total_size;
+      if (data_size < offset) {
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "encode_tiles: output buffer full");
+      }
+      vpx_start_encode(&residual_bc, data_ptr + offset, data_size - offset);
 
       write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc,
-                  &tok, tok_end, &cpi->max_mv_magnitude,
+                  tile_row, tile_col, &cpi->max_mv_magnitude,
                   cpi->interp_filter_selected);
-      assert(tok == tok_end);
-      vpx_stop_encode(&residual_bc);
+
+      if (vpx_stop_encode(&residual_bc)) {
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "encode_tiles: output buffer full");
+      }
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
         mem_put_be32(data_ptr + total_size, residual_bc.pos);
@@ -1118,11 +1191,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
         ((cpi->svc.number_temporal_layers > 1 &&
           cpi->oxcf.rc_mode == VPX_CBR) ||
          (cpi->svc.number_spatial_layers > 1 &&
-          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
-         (is_two_pass_svc(cpi) &&
-          cpi->svc.encode_empty_frame_state == ENCODING &&
-          cpi->svc.layer_context[0].frames_from_key_frame <
-              cpi->svc.number_temporal_layers + 1))) {
+          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
       found = 0;
     } else if (cfg != NULL) {
       found =
@@ -1154,8 +1223,10 @@ static void write_profile(BITSTREAM_PROFILE profile,
     case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break;
     case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break;
     case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break;
-    case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break;
-    default: assert(0);
+    default:
+      assert(profile == PROFILE_3);
+      vpx_wb_write_literal(wb, 6, 3);
+      break;
   }
 }
 
@@ -1179,6 +1250,7 @@ static void write_bitdepth_colorspace_sampling(
     }
   } else {
     assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
+    assert(cm->subsampling_x == 0 && cm->subsampling_y == 0);
     vpx_wb_write_bit(wb, 0);  // unused
   }
 }
@@ -1192,7 +1264,13 @@ static void write_uncompressed_header(VP9_COMP *cpi,
 
   write_profile(cm->profile, wb);
 
-  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  // If to use show existing frame.
+  vpx_wb_write_bit(wb, cm->show_existing_frame);
+  if (cm->show_existing_frame) {
+    vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3);
+    return;
+  }
+
   vpx_wb_write_bit(wb, cm->frame_type);
   vpx_wb_write_bit(wb, cm->show_frame);
   vpx_wb_write_bit(wb, cm->error_resilient_mode);
@@ -1202,14 +1280,6 @@ static void write_uncompressed_header(VP9_COMP *cpi,
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
   } else {
-    // In spatial svc if it's not error_resilient_mode then we need to code all
-    // visible frames as invisible. But we need to keep the show_frame flag so
-    // that the publisher could know whether it is supposed to be visible.
-    // So we will code the show_frame flag as it is. Then code the intra_only
-    // bit here. This will make the bitstream incompatible. In the player we
-    // will change to show_frame flag to 0, then add an one byte frame with
-    // show_existing_frame flag which tells the decoder which frame we want to
-    // show.
     if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
@@ -1258,14 +1328,15 @@ static void write_uncompressed_header(VP9_COMP *cpi,
   write_tile_info(cm, wb);
 }
 
-static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
+static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data,
+                                      size_t data_size) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = cpi->td.counts;
   vpx_writer header_bc;
 
-  vpx_start_encode(&header_bc, data);
+  vpx_start_encode(&header_bc, data, data_size);
 
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
@@ -1329,33 +1400,67 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
                         &counts->mv);
   }
 
-  vpx_stop_encode(&header_bc);
-  assert(header_bc.pos <= 0xffff);
+  if (vpx_stop_encode(&header_bc)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "write_compressed_header: output buffer full");
+  }
 
   return header_bc.pos;
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t dest_size,
+                        size_t *size) {
+  VP9_COMMON *const cm = &cpi->common;
   uint8_t *data = dest;
-  size_t first_part_size, uncompressed_hdr_size;
-  struct vpx_write_bit_buffer wb = { data, 0 };
+  size_t data_size = dest_size;
+  size_t uncompressed_hdr_size, compressed_hdr_size;
+  struct vpx_write_bit_buffer wb;
   struct vpx_write_bit_buffer saved_wb;
 
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_reset_write();
+#endif
+
+  vpx_wb_init(&wb, data, data_size);
   write_uncompressed_header(cpi, &wb);
+  if (vpx_wb_has_error(&wb)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "vp9_pack_bitstream: output buffer full");
+  }
+
+  // Skip the rest coding process if use show existing frame.
+  if (cm->show_existing_frame) {
+    uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+    data += uncompressed_hdr_size;
+    *size = data - dest;
+    return;
+  }
+
   saved_wb = wb;
-  vpx_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
+  // don't know in advance compressed header size
+  vpx_wb_write_literal(&wb, 0, 16);
+  if (vpx_wb_has_error(&wb)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "vp9_pack_bitstream: output buffer full");
+  }
 
   uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
   data += uncompressed_hdr_size;
+  data_size -= uncompressed_hdr_size;
 
   vpx_clear_system_state();
 
-  first_part_size = write_compressed_header(cpi, data);
-  data += first_part_size;
-  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
-  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+  compressed_hdr_size = write_compressed_header(cpi, data, data_size);
+  data += compressed_hdr_size;
+  data_size -= compressed_hdr_size;
+  if (compressed_hdr_size > UINT16_MAX) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "compressed_hdr_size > 16 bits");
+  }
+  vpx_wb_write_literal(&saved_wb, (int)compressed_hdr_size, 16);
+  assert(!vpx_wb_has_error(&saved_wb));
 
-  data += encode_tiles(cpi, data);
+  data += encode_tiles(cpi, data, data_size);
 
   *size = data - dest;
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h
index 044a3bbc7b..1120841ecb 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_BITSTREAM_H_
-#define VP9_ENCODER_VP9_BITSTREAM_H_
+#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_
+#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -19,9 +19,7 @@ extern "C" {
 
 typedef struct VP9BitstreamWorkerData {
   uint8_t *dest;
-  int dest_size;
-  TOKENEXTRA *tok;
-  TOKENEXTRA *tok_end;
+  size_t dest_size;
   vpx_writer bit_writer;
   int tile_idx;
   unsigned int max_mv_magnitude;
@@ -37,19 +35,16 @@ int vp9_get_refresh_mask(VP9_COMP *cpi);
 
 void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t dest_size,
+                        size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
-  return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
-         cpi->rc.is_src_frame_alt_ref &&
-         (!cpi->use_svc ||  // Add spatial svc base layer case here
-          (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id == 0 &&
-           cpi->svc.layer_context[0].gold_ref_idx >= 0 &&
-           cpi->oxcf.ss_enable_auto_arf[0]));
+  return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref &&
+         !cpi->use_svc;
 }
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_BITSTREAM_H_
+#endif  // VPX_VP9_ENCODER_VP9_BITSTREAM_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
index 1ea5fdf1ff..abb462c46a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
@@ -8,9 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_BLOCK_H_
-#define VP9_ENCODER_VP9_BLOCK_H_
+#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
+#define VPX_VP9_ENCODER_VP9_BLOCK_H_
 
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 
@@ -18,12 +19,6 @@
 extern "C" {
 #endif
 
-typedef struct {
-  unsigned int sse;
-  int sum;
-  unsigned int var;
-} diff;
-
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
   tran_low_t *qcoeff;
@@ -31,9 +26,9 @@ struct macroblock_plane {
   uint16_t *eobs;
   struct buf_2d src;
 
-  // Quantizer setings
-  int16_t *quant_fp;
+  // Quantizer settings
   int16_t *round_fp;
+  int16_t *quant_fp;
   int16_t *quant;
   int16_t *quant_shift;
   int16_t *zbin;
@@ -61,6 +56,11 @@ typedef struct {
 
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
+// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  int64_t bsse[MAX_MB_PLANE << 2];
+#endif
+
   struct macroblock_plane plane[MAX_MB_PLANE];
 
   MACROBLOCKD e_mbd;
@@ -71,23 +71,23 @@ struct macroblock {
   int skip_recode;
   int skip_optimize;
   int q_index;
-  int block_qcoeff_opt;
+  double log_block_src_var;
   int block_tx_domain;
 
   // The equivalent error at the current rdmult of one whole bit (not one
   // bitcost unit).
   int errorperbit;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // The equivalent SAD error of one (whole) bit at the current quantizer
   // for large blocks.
   int sadperbit16;
-  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // The equivalent SAD error of one (whole) bit at the current quantizer
   // for sub-8x8 blocks.
   int sadperbit4;
   int rddiv;
   int rdmult;
+  int cb_rdmult;
+  int segment_id;
   int mb_energy;
-  int *m_search_count_ptr;
-  int *ex_search_count_ptr;
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
@@ -110,14 +110,23 @@ struct macroblock {
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
+  // sharpness is used to disable skip mode and change rd_mult
+  int sharpness;
+
+  // aq mode is used to adjust rd based on segment.
+  int adjust_rdmult_by_segment;
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   MvLimits mv_limits;
 
-  // Notes transform blocks where no coefficents are coded.
+  // Notes transform blocks where no coefficients are coded.
   // Set during mode selection. Read during block encoding.
   uint8_t zcoeff_blk[TX_SIZES][256];
 
+  // Accumulate the tx block eobs in a partition block.
+  int32_t sum_y_eobs[TX_SIZES];
+
   int skip;
 
   int encode_breakout;
@@ -131,16 +140,26 @@ struct macroblock {
   int use_lp32x32fdct;
   int skip_encode;
 
+  // In first pass, intra prediction is done based on source pixels
+  // at tile boundaries
+  int fp_src_pred;
+
   // use fast quantization process
   int quant_fp;
 
   // skip forward transform and quantization
   uint8_t skip_txfm[MAX_MB_PLANE << 2];
 #define SKIP_TXFM_NONE 0
+// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely
+// since it increases risks of bad perceptual quality.
+// https://crbug.com/webm/1729
 #define SKIP_TXFM_AC_DC 1
 #define SKIP_TXFM_AC_ONLY 2
 
+// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054
+#if !defined(_MSC_VER) || _MSC_VER >= 1900
   int64_t bsse[MAX_MB_PLANE << 2];
+#endif
 
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
@@ -151,21 +170,48 @@ struct macroblock {
 
   uint8_t sb_is_skin;
 
+  uint8_t skip_low_source_sad;
+
+  uint8_t lowvar_highsumdiff;
+
+  uint8_t last_sb_high_content;
+
+  int sb_use_mv_part;
+
+  int sb_mvcol_part;
+
+  int sb_mvrow_part;
+
+  int sb_pickmode_part;
+
+  int zero_temp_sad_source;
+
+  // For each superblock: saves the content value (e.g., low/high sad/sumdiff)
+  // based on source sad, prior to encoding the frame.
+  uint8_t content_state_sb;
+
   // Used to save the status of whether a block has a low variance in
   // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
   // 32x32, 9~24 for 16x16.
   uint8_t variance_low[25];
 
-  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
-  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+  uint8_t arf_frame_usage;
+  uint8_t lastgolden_frame_usage;
+
+  void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride);
+  void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
-  void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
-                          int eob, int bd);
+  void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest,
+                              int stride, int eob, int bd);
 #endif
+  DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]);
+
+  struct scale_factors *me_sf;
 };
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_BLOCK_H_
+#endif  // VPX_VP9_ENCODER_VP9_BLOCK_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c
index 9ab57b57c7..da68a3c3c3 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c
@@ -11,6 +11,7 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_blockiness.h"
 
 static int horizontal_filter(const uint8_t *s) {
   return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h
new file mode 100644
index 0000000000..e840cb2518
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                          const uint8_t *img2, int img2_pitch, int width,
+                          int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
index 2f7e544332..ee0fcd8729 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
@@ -12,7 +12,10 @@
 #include "vp9/encoder/vp9_encoder.h"
 
 static const BLOCK_SIZE square[] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+  BLOCK_8X8,
+  BLOCK_16X16,
+  BLOCK_32X32,
+  BLOCK_64X64,
 };
 
 static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
@@ -22,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
   int i, k;
   ctx->num_4x4_blk = num_blk;
 
-  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t)));
+  CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk,
+                  vpx_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     for (k = 0; k < 3; ++k) {
-      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
-      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+      CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k],
                       vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
       ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
       ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
@@ -97,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
   int nodes;
 
   vpx_free(td->leaf_tree);
-  CHECK_MEM_ERROR(cm, td->leaf_tree,
+  CHECK_MEM_ERROR(&cm->error, td->leaf_tree,
                   vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
   vpx_free(td->pc_tree);
-  CHECK_MEM_ERROR(cm, td->pc_tree,
+  CHECK_MEM_ERROR(&cm->error, td->pc_tree,
                   vpx_calloc(tree_nodes, sizeof(*td->pc_tree)));
 
   this_pc = &td->pc_tree[0];
@@ -115,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
     PC_TREE *const tree = &td->pc_tree[pc_tree_index];
     tree->block_size = square[0];
     alloc_tree_contexts(cm, tree, 4);
-    tree->leaf_split[0] = this_leaf++;
-    for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+    tree->u.leaf_split[0] = this_leaf++;
+    for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0];
   }
 
   // Each node has 4 leaf nodes, fill each block_size level of the tree
@@ -126,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
       alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
       tree->block_size = square[square_index];
-      for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+      for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++;
       ++pc_tree_index;
     }
     ++square_index;
@@ -136,17 +140,22 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
 }
 
 void vp9_free_pc_tree(ThreadData *td) {
-  const int tree_nodes = 64 + 16 + 4 + 1;
   int i;
 
-  // Set up all 4x4 mode contexts
-  for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+  if (td == NULL) return;
 
-  // Sets up all the leaf nodes in the tree.
-  for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+  if (td->leaf_tree != NULL) {
+    // Set up all 4x4 mode contexts
+    for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+    vpx_free(td->leaf_tree);
+    td->leaf_tree = NULL;
+  }
 
-  vpx_free(td->pc_tree);
-  td->pc_tree = NULL;
-  vpx_free(td->leaf_tree);
-  td->leaf_tree = NULL;
+  if (td->pc_tree != NULL) {
+    const int tree_nodes = 64 + 16 + 4 + 1;
+    // Sets up all the leaf nodes in the tree.
+    for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+    vpx_free(td->pc_tree);
+    td->pc_tree = NULL;
+  }
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
index 86ba03d69f..51e13ba654 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
-#define VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/encoder/vp9_block.h"
@@ -56,6 +56,7 @@ typedef struct {
   // scope of refactoring.
   int rate;
   int64_t dist;
+  int64_t rdcost;
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   unsigned int newmv_sse;
@@ -65,12 +66,18 @@ typedef struct {
   int_mv best_sse_mv;
   MV_REFERENCE_FRAME best_reference_frame;
   MV_REFERENCE_FRAME best_zeromv_reference_frame;
+  int sb_skip_denoising;
 #endif
 
   // motion vector cache for adaptive motion search control in partition
   // search loop
   MV pred_mv[MAX_REF_FRAMES];
   INTERP_FILTER pred_interp_filter;
+
+  // Used for the machine learning-based early termination
+  int32_t sum_y_eobs;
+  // Skip certain ref frames during RD search of rectangular partitions.
+  uint8_t skip_ref_frame_mask;
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
@@ -83,7 +90,10 @@ typedef struct PC_TREE {
   union {
     struct PC_TREE *split[4];
     PICK_MODE_CONTEXT *leaf_split[4];
-  };
+  } u;
+  // Obtained from a simple motion search. Used by the ML based partition search
+  // speed feature.
+  MV mv;
 } PC_TREE;
 
 void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
@@ -93,4 +103,4 @@ void vp9_free_pc_tree(struct ThreadData *td);
 }  // extern "C"
 #endif
 
-#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
+#endif  // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_cost.h b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h
index 70a1a2e0e9..ee0033fa31 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_cost.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_COST_H_
-#define VP9_ENCODER_VP9_COST_H_
+#ifndef VPX_VP9_ENCODER_VP9_COST_H_
+#define VPX_VP9_ENCODER_VP9_COST_H_
 
 #include "vpx_dsp/prob.h"
 #include "vpx/vpx_integer.h"
@@ -29,9 +29,8 @@ extern const uint16_t vp9_prob_cost[256];
 
 #define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob))
 
-static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vpx_prob p) {
-  return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
+static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) {
+  return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p);
 }
 
 static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits,
@@ -55,4 +54,4 @@ void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_COST_H_
+#endif  // VPX_VP9_ENCODER_VP9_COST_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_dct.c b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c
index bb8c23fdb9..2f42c6afc2 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_dct.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c
@@ -554,114 +554,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
   }
 }
 
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
-                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan) {
-  int eob = -1;
-
-  int i, j;
-  tran_low_t intermediate[64];
-
-  // Transform columns
-  {
-    tran_low_t *output = intermediate;
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    int i;
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      s0 = (input[0 * stride] + input[7 * stride]) * 4;
-      s1 = (input[1 * stride] + input[6 * stride]) * 4;
-      s2 = (input[2 * stride] + input[5 * stride]) * 4;
-      s3 = (input[3 * stride] + input[4 * stride]) * 4;
-      s4 = (input[3 * stride] - input[4 * stride]) * 4;
-      s5 = (input[2 * stride] - input[5 * stride]) * 4;
-      s6 = (input[1 * stride] - input[6 * stride]) * 4;
-      s7 = (input[0 * stride] - input[7 * stride]) * 4;
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
-      input++;
-      output++;
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
-    for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
-  }
-
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-      if (tmp) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                   int tx_type) {
   if (tx_type == DCT_DCT) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c
index 1d9a6702df..e5dffa90a8 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c
@@ -185,33 +185,41 @@ static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row,
 }
 
 static VP9_DENOISER_DECISION perform_motion_compensation(
-    VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
+    VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
     int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
-    int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv) {
-  int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
-  MV_REFERENCE_FRAME frame;
+    int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
+    int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
+    int use_svc, int spatial_layer, int use_gf_temporal_ref) {
+  const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+                           ? 0
+                           : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
+  int frame;
+  int denoise_layer_idx = 0;
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MODE_INFO *mi = filter_mbd->mi[0];
   MODE_INFO saved_mi;
   int i;
   struct buf_2d saved_dst[MAX_MB_PLANE];
   struct buf_2d saved_pre[MAX_MB_PLANE];
+  const RefBuffer *saved_block_refs[2];
+  MV_REFERENCE_FRAME saved_frame;
 
   frame = ctx->best_reference_frame;
+
   saved_mi = *mi;
 
   if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK;
 
-  // Avoid denoising for small block (unless motion is small).
-  // Small blocks are selected in variance partition (before encoding) and
-  // will typically lie on moving areas.
-  if (denoiser->denoising_level < kDenHigh && motion_magnitude > 16 &&
-      bs <= BLOCK_8X8)
+  // Avoid denoising small blocks. When noise > kDenLow or frame width > 480,
+  // denoise 16x16 blocks.
+  if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 ||
+      (bs == BLOCK_16X16 && width > 480 &&
+       denoiser->denoising_level <= kDenLow))
     return COPY_BLOCK;
 
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
-  if (frame != INTRA_FRAME && ctx->newmv_sse != UINT_MAX &&
+  if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
       sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
     mi->ref_frame[0] = ctx->best_reference_frame;
     mi->mode = ctx->best_sse_inter_mode;
@@ -221,9 +229,12 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
     frame = ctx->best_zeromv_reference_frame;
     ctx->newmv_sse = ctx->zeromv_sse;
     // Bias to last reference.
-    if (frame != LAST_FRAME &&
-        ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
-         denoiser->denoising_level >= kDenHigh)) {
+    if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+        frame == ALTREF_FRAME ||
+        (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
+        (frame != LAST_FRAME &&
+         ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+          denoiser->denoising_level >= kDenHigh))) {
       frame = LAST_FRAME;
       ctx->newmv_sse = ctx->zeromv_lastref_sse;
     }
@@ -238,6 +249,27 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
     }
   }
 
+  saved_frame = frame;
+  // When using SVC, we need to map REF_FRAME to the frame buffer index.
+  if (use_svc) {
+    if (frame == LAST_FRAME)
+      frame = lst_fb_idx + 1;
+    else if (frame == GOLDEN_FRAME)
+      frame = gld_fb_idx + 1;
+    // Shift for the second spatial layer.
+    if (num_spatial_layers - spatial_layer == 2)
+      frame = frame + denoiser->num_ref_frames;
+    denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+  }
+
+  // Force copy (no denoise, copy source in denoised buffer) if
+  // running_avg_y[frame] is NULL.
+  if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+
   if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
     // Restore everything to its original state
     *mi = saved_mi;
@@ -254,6 +286,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
     saved_pre[i] = filter_mbd->plane[i].pre[0];
     saved_dst[i] = filter_mbd->plane[i].dst;
   }
+  saved_block_refs[0] = filter_mbd->block_refs[0];
 
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
@@ -270,23 +303,28 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
                   denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
   filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
 
-  filter_mbd->plane[0].dst.buf =
-      block_start(denoiser->mc_running_avg_y.y_buffer,
-                  denoiser->mc_running_avg_y.y_stride, mi_row, mi_col);
-  filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride;
-  filter_mbd->plane[1].dst.buf =
-      block_start(denoiser->mc_running_avg_y.u_buffer,
-                  denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
-  filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride;
-  filter_mbd->plane[2].dst.buf =
-      block_start(denoiser->mc_running_avg_y.v_buffer,
-                  denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col);
-  filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
+  filter_mbd->plane[0].dst.buf = block_start(
+      denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer,
+      denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col);
+  filter_mbd->plane[0].dst.stride =
+      denoiser->mc_running_avg_y[denoise_layer_idx].y_stride;
+  filter_mbd->plane[1].dst.buf = block_start(
+      denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer,
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+  filter_mbd->plane[1].dst.stride =
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
+  filter_mbd->plane[2].dst.buf = block_start(
+      denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer,
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+  filter_mbd->plane[2].dst.stride =
+      denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
 
+  set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME);
   vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
 
   // Restore everything to its original state
   *mi = saved_mi;
+  filter_mbd->block_refs[0] = saved_block_refs[0];
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     filter_mbd->plane[i].pre[0] = saved_pre[i];
     filter_mbd->plane[i].dst = saved_dst[i];
@@ -297,20 +335,31 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
 
 void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
                           BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
-                          VP9_DENOISER_DECISION *denoiser_decision) {
+                          VP9_DENOISER_DECISION *denoiser_decision,
+                          int use_gf_temporal_ref) {
   int mv_col, mv_row;
   int motion_magnitude = 0;
   int zeromv_filter = 0;
   VP9_DENOISER *denoiser = &cpi->denoiser;
   VP9_DENOISER_DECISION decision = COPY_BLOCK;
-  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
-  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
+
+  const int shift =
+      cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+          ? denoiser->num_ref_frames
+          : 0;
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+  const int denoise_layer_index =
+      cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+  YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
   uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
   uint8_t *mc_avg_start =
       block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
+  int increase_denoising = 0;
   int consec_zeromv = 0;
+  int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG;
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
   motion_magnitude = mv_row * mv_row + mv_col * mv_col;
@@ -326,8 +375,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
       VP9_COMMON *const cm = &cpi->common;
       int j, i;
       // Loop through the 8x8 sub-blocks.
-      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      const int bw = num_8x8_blocks_wide_lookup[bs];
+      const int bh = num_8x8_blocks_high_lookup[bs];
       const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
       const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
       const int block_index = mi_row * cm->mi_cols + mi_col;
@@ -338,10 +387,10 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
           consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv);
           // No need to keep checking 8x8 blocks if any of the sub-blocks
           // has small consec_zeromv (since threshold for no_skin based on
-          // zero/small motion in skin detection is high, i.e, > 4).
+          // zero/small motion in skin detection is high, i.e., > 4).
           if (consec_zeromv < 4) {
             i = ymis;
-            j = xmis;
+            break;
           }
         }
       }
@@ -352,30 +401,34 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
         mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv,
         motion_level);
   }
-  if (!is_skin && denoiser->denoising_level == kDenHigh) {
-    denoiser->increase_denoising = 1;
-  } else {
-    denoiser->increase_denoising = 0;
-  }
+  if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1;
 
-  if (denoiser->denoising_level >= kDenLow)
+  // Copy block if LAST_FRAME is not a reference.
+  // Last doesn't always exist when SVC layers are dynamically changed, e.g. top
+  // spatial layer doesn't have last reference when it's brought up for the
+  // first time on the fly.
+  if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+      !ctx->sb_skip_denoising)
     decision = perform_motion_compensation(
-        denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx,
-        motion_magnitude, is_skin, &zeromv_filter, consec_zeromv);
+        &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
+        motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
+        cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
+        cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id,
+        use_gf_temporal_ref);
 
   if (decision == FILTER_BLOCK) {
-    decision = vp9_denoiser_filter(
-        src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start,
-        avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude);
+    decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
+                                   mc_avg.y_stride, avg_start, avg.y_stride,
+                                   increase_denoising, bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
-    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0,
-                      NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0,
+                      0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
                       num_4x4_blocks_high_lookup[bs] << 2);
   } else {  // COPY_BLOCK
-    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0,
-                      NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0,
+                      0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
                       num_4x4_blocks_high_lookup[bs] << 2);
   }
   *denoiser_decision = decision;
@@ -408,49 +461,64 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
   src->y_buffer = tmp_buf;
 }
 
-void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
-                                    YV12_BUFFER_CONFIG src,
-                                    FRAME_TYPE frame_type,
-                                    int refresh_alt_ref_frame,
-                                    int refresh_golden_frame,
-                                    int refresh_last_frame, int resized) {
+void vp9_denoiser_update_frame_info(
+    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+    FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+    int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+    int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+  const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
   // Copy source into denoised reference buffers on KEY_FRAME or
-  // if the just encoded frame was resized.
-  if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset) {
+  // if the just encoded frame was resized. For SVC, copy source if the base
+  // spatial layer was key frame.
+  if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
+      svc_refresh_denoiser_buffers) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
-    for (i = 1; i < MAX_REF_FRAMES; ++i)
-      copy_frame(&denoiser->running_avg_y[i], &src);
+    for (i = 1; i < denoiser->num_ref_frames; ++i) {
+      if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+        copy_frame(&denoiser->running_avg_y[i + shift], &src);
+    }
     denoiser->reset = 0;
     return;
   }
 
-  // If more than one refresh occurs, must copy frame buffer.
-  if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) {
-    if (refresh_alt_ref_frame) {
-      copy_frame(&denoiser->running_avg_y[ALTREF_FRAME],
-                 &denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_golden_frame) {
-      copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME],
-                 &denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_last_frame) {
-      copy_frame(&denoiser->running_avg_y[LAST_FRAME],
-                 &denoiser->running_avg_y[INTRA_FRAME]);
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->use_set_ref_frame_config) {
+    int i;
+    for (i = 0; i < REF_FRAMES; i++) {
+      if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i))
+        copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
     }
   } else {
-    if (refresh_alt_ref_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
-                        &denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_golden_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
-                        &denoiser->running_avg_y[INTRA_FRAME]);
-    }
-    if (refresh_last_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
-                        &denoiser->running_avg_y[INTRA_FRAME]);
+    // If more than one refresh occurs, must copy frame buffer.
+    if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+        1) {
+      if (refresh_alt_ref_frame) {
+        copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_golden_frame) {
+        copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_last_frame) {
+        copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+    } else {
+      if (refresh_alt_ref_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_golden_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_last_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
     }
   }
 }
@@ -479,19 +547,122 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
   }
 }
 
-int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
-                       int ssy,
+static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
+                                           VP9_DENOISER *denoiser, int fb_idx) {
+  int fail = 0;
+  if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+    fail =
+        vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width,
+                               cm->height, cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, 0);
+    if (fail) {
+      vp9_denoiser_free(denoiser);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
+                             struct SVC *svc, int svc_buf_shift,
+                             int refresh_alt, int refresh_gld, int refresh_lst,
+                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
+  int fail = 0;
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->use_set_ref_frame_config) {
+    int i;
+    for (i = 0; i < REF_FRAMES; i++) {
+      if (cm->frame_type == KEY_FRAME ||
+          svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) {
+        fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                               i + 1 + svc_buf_shift);
+      }
+    }
+  } else {
+    if (refresh_alt) {
+      // Increase the frame buffer index by 1 to map it to the buffer index in
+      // the denoiser.
+      fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                             alt_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+    if (refresh_gld) {
+      fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                             gld_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+    if (refresh_lst) {
+      fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                             lst_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+  }
+  return 0;
+}
+
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                        int use_highbitdepth,
 #endif
                        int border) {
-  int i, fail;
+  int i, layer, fail, init_num_ref_frames;
   const int legacy_byte_alignment = 0;
+  int num_layers = 1;
+  int scaled_width = width;
+  int scaled_height = height;
+  if (use_svc) {
+    LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+                                                svc->number_temporal_layers +
+                                            svc->temporal_layer_id];
+    get_layer_resolution(width, height, lc->scaling_factor_num,
+                         lc->scaling_factor_den, &scaled_width, &scaled_height);
+    // For SVC: only denoise at most 2 spatial (highest) layers.
+    if (noise_sen >= 2)
+      // Denoise from one spatial layer below the top.
+      svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0);
+    else
+      // Only denoise the top spatial layer.
+      svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0);
+    num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+  }
   assert(denoiser != NULL);
+  denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+  init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
+  denoiser->num_layers = num_layers;
+  CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y,
+                  vpx_calloc(denoiser->num_ref_frames * num_layers,
+                             sizeof(denoiser->running_avg_y[0])));
+  CHECK_MEM_ERROR(
+      &cm->error, denoiser->mc_running_avg_y,
+      vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
 
-  for (i = 0; i < MAX_REF_FRAMES; ++i) {
-    fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
-                                  ssx, ssy,
+  for (layer = 0; layer < num_layers; ++layer) {
+    const int denoise_width = (layer == 0) ? width : scaled_width;
+    const int denoise_height = (layer == 0) ? height : scaled_height;
+    for (i = 0; i < init_num_ref_frames; ++i) {
+      fail = vpx_alloc_frame_buffer(
+          &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+          denoise_width, denoise_height, ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+          use_highbitdepth,
+#endif
+          border, legacy_byte_alignment);
+      if (fail) {
+        vp9_denoiser_free(denoiser);
+        return 1;
+      }
+#ifdef OUTPUT_YUV_DENOISED
+      make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+    }
+
+    fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer],
+                                  denoise_width, denoise_height, ssx, ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                                   use_highbitdepth,
 #endif
@@ -500,22 +671,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
       vp9_denoiser_free(denoiser);
       return 1;
     }
-#ifdef OUTPUT_YUV_DENOISED
-    make_grayscale(&denoiser->running_avg_y[i]);
-#endif
-  }
-
-  fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx,
-                                ssy,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                use_highbitdepth,
-#endif
-                                border, legacy_byte_alignment);
-  if (fail) {
-    vp9_denoiser_free(denoiser);
-    return 1;
   }
 
+  // denoiser->last_source only used for noise_estimation, so only for top
+  // layer.
   fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                                 use_highbitdepth,
@@ -528,11 +687,11 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
 #ifdef OUTPUT_YUV_DENOISED
   make_grayscale(&denoiser->running_avg_y[i]);
 #endif
-  denoiser->increase_denoising = 0;
   denoiser->frame_buffer_initialized = 1;
-  denoiser->denoising_level = kDenLow;
-  denoiser->prev_denoising_level = kDenLow;
+  denoiser->denoising_level = kDenMedium;
+  denoiser->prev_denoising_level = kDenMedium;
   denoiser->reset = 0;
+  denoiser->current_denoiser_frame = 0;
   return 0;
 }
 
@@ -542,23 +701,126 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
     return;
   }
   denoiser->frame_buffer_initialized = 0;
-  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+  for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
     vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
   }
-  vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+  vpx_free(denoiser->running_avg_y);
+  denoiser->running_avg_y = NULL;
+
+  for (i = 0; i < denoiser->num_layers; ++i) {
+    vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+  }
+
+  vpx_free(denoiser->mc_running_avg_y);
+  denoiser->mc_running_avg_y = NULL;
   vpx_free_frame_buffer(&denoiser->last_source);
 }
 
-void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) {
+static void force_refresh_longterm_ref(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // If long term reference is used, force refresh of that slot, so
+  // denoiser buffer for long term reference stays in sync.
+  if (svc->use_gf_temporal_ref_current_layer) {
+    int index = svc->spatial_layer_id;
+    if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+    assert(index >= 0);
+    cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+    cpi->refresh_alt_ref_frame = 1;
+  }
+}
+
+void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) {
+  VP9_DENOISER *const denoiser = &cpi->denoiser;
   denoiser->denoising_level = noise_level;
   if (denoiser->denoising_level > kDenLowLow &&
-      denoiser->prev_denoising_level == kDenLowLow)
+      denoiser->prev_denoising_level == kDenLowLow) {
     denoiser->reset = 1;
-  else
+    force_refresh_longterm_ref(cpi);
+  } else {
     denoiser->reset = 0;
+  }
   denoiser->prev_denoising_level = denoiser->denoising_level;
 }
 
+// Scale/increase the partition threshold
+// for denoiser speed-up.
+int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
+                              int content_state, int temporal_layer_id) {
+  if ((content_state == kLowSadLowSumdiff) ||
+      (content_state == kHighSadLowSumdiff) ||
+      (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) ||
+      (temporal_layer_id != 0)) {
+    int64_t scaled_thr =
+        (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+    return scaled_thr;
+  } else {
+    return (5 * threshold) >> 2;
+  }
+}
+
+//  Scale/increase the ac skip threshold for
+//  denoiser speed-up.
+int64_t vp9_scale_acskip_thresh(int64_t threshold,
+                                VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
+                                int temporal_layer_id) {
+  if (noise_level >= kDenLow && abs_sumdiff < 5)
+    return threshold *= (noise_level == kDenLow)   ? 2
+                        : (temporal_layer_id == 2) ? 10
+                                                   : 6;
+  else
+    return threshold;
+}
+
+void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) {
+  if (vp9_denoise_svc_non_key(cpi) &&
+      cpi->denoiser.current_denoiser_frame == 0) {
+    cpi->denoiser.reset = 1;
+    force_refresh_longterm_ref(cpi);
+  }
+}
+
+void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->denoiser.denoising_level > kDenLowLow) {
+    int svc_refresh_denoiser_buffers = 0;
+    int denoise_svc_second_layer = 0;
+    FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
+    cpi->denoiser.current_denoiser_frame++;
+    if (cpi->use_svc) {
+      const int svc_buf_shift =
+          svc->number_spatial_layers - svc->spatial_layer_id == 2
+              ? cpi->denoiser.num_ref_frames
+              : 0;
+      int layer =
+          LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                           svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      svc_refresh_denoiser_buffers =
+          lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+      denoise_svc_second_layer =
+          svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+      // Check if we need to allocate extra buffers in the denoiser
+      // for refreshed frames.
+      if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift,
+                                   cpi->refresh_alt_ref_frame,
+                                   cpi->refresh_golden_frame,
+                                   cpi->refresh_last_frame, cpi->alt_fb_idx,
+                                   cpi->gld_fb_idx, cpi->lst_fb_idx))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to re-allocate denoiser for SVC");
+    }
+    vp9_denoiser_update_frame_info(
+        &cpi->denoiser, *cpi->Source, svc, frame_type,
+        cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
+        cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
+        cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers,
+        denoise_svc_second_layer);
+  }
+}
+
 #ifdef OUTPUT_YUV_DENOISED
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
   int r, c;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h
index fcfaa5051a..1973e98988 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_DENOISER_H_
-#define VP9_ENCODER_DENOISER_H_
+#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_
+#define VPX_VP9_ENCODER_VP9_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_skin_detection.h"
@@ -21,6 +21,14 @@ extern "C" {
 
 #define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
 
+// Denoiser is used in non svc real-time mode which does not use alt-ref, so no
+// need to allocate for it, and hence we need MAX_REF_FRAME - 1
+#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1
+
+// Number of frame buffers when SVC is used. [0] for current denoised buffer and
+// [1..8] for REF_FRAMES
+#define SVC_REF_FRAMES 9
+
 typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
   FILTER_BLOCK,
@@ -35,12 +43,14 @@ typedef enum vp9_denoiser_level {
 } VP9_DENOISER_LEVEL;
 
 typedef struct vp9_denoiser {
-  YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
-  YV12_BUFFER_CONFIG mc_running_avg_y;
+  YV12_BUFFER_CONFIG *running_avg_y;
+  YV12_BUFFER_CONFIG *mc_running_avg_y;
   YV12_BUFFER_CONFIG last_source;
-  int increase_denoising;
   int frame_buffer_initialized;
   int reset;
+  int num_ref_frames;
+  int num_layers;
+  unsigned int current_denoiser_frame;
   VP9_DENOISER_LEVEL denoising_level;
   VP9_DENOISER_LEVEL prev_denoising_level;
 } VP9_DENOISER;
@@ -58,17 +68,18 @@ typedef struct {
 } VP9_PICKMODE_CTX_DEN;
 
 struct VP9_COMP;
+struct SVC;
 
-void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
-                                    YV12_BUFFER_CONFIG src,
-                                    FRAME_TYPE frame_type,
-                                    int refresh_alt_ref_frame,
-                                    int refresh_golden_frame,
-                                    int refresh_last_frame, int resized);
+void vp9_denoiser_update_frame_info(
+    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+    FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+    int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+    int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer);
 
 void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
                           int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
-                          VP9_DENOISER_DECISION *denoiser_decision);
+                          VP9_DENOISER_DECISION *denoiser_decision,
+                          int use_gf_temporal_ref);
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
@@ -76,8 +87,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
                                      PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx);
 
-int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
-                       int ssy,
+int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
+                             struct SVC *svc, int svc_buf_shift,
+                             int refresh_alt, int refresh_gld, int refresh_lst,
+                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
+
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+                       int use_svc, int noise_sen, int width, int height,
+                       int ssx, int ssy,
 #if CONFIG_VP9_HIGHBITDEPTH
                        int use_highbitdepth,
 #endif
@@ -95,10 +112,21 @@ static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
-void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level);
+void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level);
+
+void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi);
+
+int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
+                              int content_state, int temporal_layer_id);
+
+int64_t vp9_scale_acskip_thresh(int64_t threshold,
+                                VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
+                                int temporal_layer_id);
+
+void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_DENOISER_H_
+#endif  // VPX_VP9_ENCODER_VP9_DENOISER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
index 323c053edf..53b8136b3e 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <float.h>
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
@@ -20,6 +21,10 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_pthread.h"
+#if CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -32,16 +37,22 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_tile_common.h"
-
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_complexity.h"
+#endif
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_aq_variance.h"
+#endif
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_partition_models.h"
 #include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_rdopt.h"
@@ -98,19 +109,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
 };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
-                                           const struct buf_2d *ref,
-                                           BLOCK_SIZE bs) {
+unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref,
+                                  BLOCK_SIZE bs) {
   unsigned int sse;
   const unsigned int var =
       cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+  return var;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
-                                                const struct buf_2d *ref,
-                                                BLOCK_SIZE bs, int bd) {
+unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref,
+                                       BLOCK_SIZE bs, int bd) {
   unsigned int var, sse;
   switch (bd) {
     case 10:
@@ -130,37 +139,90 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
                              CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse);
       break;
   }
-  return ROUND64_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]);
+  return var;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
-                                                   const struct buf_2d *ref,
-                                                   int mi_row, int mi_col,
-                                                   BLOCK_SIZE bs) {
-  unsigned int sse, var;
-  uint8_t *last_y;
-  const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
-
-  assert(last != NULL);
-  last_y =
-      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
-  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs) {
+  return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs),
+                            num_pels_log2_lookup[bs]);
 }
 
-static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
-                                                   int mi_row, int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(
-      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
-  if (var < 8)
-    return BLOCK_64X64;
-  else if (var < 128)
-    return BLOCK_32X32;
-  else if (var < 2048)
-    return BLOCK_16X16;
-  else
-    return BLOCK_8X8;
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd) {
+  return (unsigned int)ROUND64_POWER_OF_TWO(
+      (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd),
+      num_pels_log2_lookup[bs]);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int segment_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+
+  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+  const uint8_t *const map =
+      seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+
+  // Initialize the segmentation index as 0.
+  mi->segment_id = 0;
+
+  // Skip the rest if AQ mode is disabled.
+  if (!seg->enabled) return;
+
+  switch (aq_mode) {
+    case CYCLIC_REFRESH_AQ:
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      break;
+#if !CONFIG_REALTIME_ONLY
+    case VARIANCE_AQ:
+      if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+          cpi->force_update_segmentation ||
+          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+        int min_energy;
+        int max_energy;
+        // Get sub block energy range
+        if (bsize >= BLOCK_32X32) {
+          vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+                                   &max_energy);
+        } else {
+          min_energy = bsize <= BLOCK_16X16 ? x->mb_energy
+                                            : vp9_block_energy(cpi, x, bsize);
+        }
+        mi->segment_id = vp9_vaq_segment_id(min_energy);
+      } else {
+        mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      }
+      break;
+    case EQUATOR360_AQ:
+      if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation)
+        mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows);
+      else
+        mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      break;
+#endif
+    case LOOKAHEAD_AQ:
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      break;
+    case PSNR_AQ: mi->segment_id = segment_index; break;
+    case PERCEPTUAL_AQ: mi->segment_id = x->segment_id; break;
+    default:
+      // NO_AQ or PSNR_AQ
+      break;
+  }
+
+  // Set segment index if ROI map or active_map is enabled.
+  if (cpi->roi.enabled || cpi->active_map.enabled)
+    mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+
+  vp9_init_plane_quantizers(cpi, x);
 }
 
 // Lighter version of set_offsets that only sets the mode info
@@ -175,23 +237,57 @@ static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
   x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 }
 
+static void set_ssim_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int *const rdmult) {
+  const VP9_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_8x8_w = num_8x8_blocks_wide_lookup[bsize_base];
+  const int num_8x8_h = num_8x8_blocks_high_lookup[bsize_base];
+  const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w;
+  const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h;
+  const int num_bcols =
+      (num_8x8_blocks_wide_lookup[bsize] + num_8x8_w - 1) / num_8x8_w;
+  const int num_brows =
+      (num_8x8_blocks_high_lookup[bsize] + num_8x8_h - 1) / num_8x8_h;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  assert(cpi->oxcf.tuning == VP8_TUNE_SSIM);
+
+  for (row = mi_row / num_8x8_w;
+       row < num_rows && row < mi_row / num_8x8_w + num_brows; ++row) {
+    for (col = mi_col / num_8x8_h;
+         col < num_cols && col < mi_col / num_8x8_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->mi_ssim_rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale);
+  *rdmult = VPXMAX(*rdmult, 0);
+  set_error_per_bit(x, *rdmult);
+  vpx_clear_system_state();
+}
+
 static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                         MACROBLOCK *const x, int mi_row, int mi_col,
                         BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const struct segmentation *const seg = &cm->seg;
   MvLimits *const mv_limits = &x->mv_limits;
 
   set_skip_context(xd, mi_row, mi_col);
 
   set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
-  mi = xd->mi[0];
-
   // Set up destination pointers.
   vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
 
@@ -213,21 +309,8 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
   // R/D setup.
   x->rddiv = cpi->rd.RDDIV;
   x->rdmult = cpi->rd.RDMULT;
-
-  // Setup segment ID.
-  if (seg->enabled) {
-    if (cpi->oxcf.aq_mode != VARIANCE_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ &&
-        cpi->oxcf.aq_mode != EQUATOR360_AQ) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-    }
-    vp9_init_plane_quantizers(cpi, x);
-
-    x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id];
-  } else {
-    mi->segment_id = 0;
-    x->encode_breakout = cpi->encode_breakout;
+  if (oxcf->tuning == VP8_TUNE_SSIM) {
+    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
   }
 
   // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
@@ -259,21 +342,25 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x,
 }
 
 typedef struct {
-  int64_t sum_square_error;
-  int64_t sum_error;
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even
+  // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16
+  // * 16 = 2^32).
+  uint32_t sum_square_error;
+  int32_t sum_error;
   int log2_count;
   int variance;
-} var;
+} Var;
 
 typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
+  Var none;
+  Var horz[2];
+  Var vert[2];
 } partition_variance;
 
 typedef struct {
   partition_variance part_variances;
-  var split[4];
+  Var split[4];
 } v4x4;
 
 typedef struct {
@@ -298,7 +385,7 @@ typedef struct {
 
 typedef struct {
   partition_variance *part_variances;
-  var *split[4];
+  Var *split[4];
 } variance_node;
 
 typedef enum {
@@ -339,34 +426,32 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
-    case BLOCK_4X4: {
+    default: {
       v4x4 *vt = (v4x4 *)data;
+      assert(bsize == BLOCK_4X4);
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
       break;
     }
-    default: {
-      assert(0);
-      break;
-    }
   }
 }
 
 // Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) {
   v->sum_square_error = s2;
   v->sum_error = s;
   v->log2_count = c;
 }
 
-static void get_variance(var *v) {
+static void get_variance(Var *v) {
   v->variance =
       (int)(256 * (v->sum_square_error -
-                   ((v->sum_error * v->sum_error) >> v->log2_count)) >>
+                   (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+                              v->log2_count)) >>
             v->log2_count);
 }
 
-static void sum_2_variances(const var *a, const var *b, var *r) {
+static void sum_2_variances(const Var *a, const Var *b, Var *r) {
   assert(a->log2_count == b->log2_count);
   fill_variance(a->sum_square_error + b->sum_square_error,
                 a->sum_error + b->sum_error, a->log2_count + 1, r);
@@ -404,7 +489,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
-    if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none);
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
         vt.part_variances->none.variance < threshold) {
@@ -414,9 +499,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
     return 0;
   } else if (bsize > bsize_min) {
     // Variance already computed to set the force_split.
-    if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none);
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
     // For key frame: take split for bsize above 32X32 or very high variance.
-    if (cm->frame_type == KEY_FRAME &&
+    if (frame_is_intra_only(cm) &&
         (bsize > BLOCK_32X32 ||
          vt.part_variances->none.variance > (threshold << 4))) {
       return 0;
@@ -461,16 +546,39 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
   return 0;
 }
 
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+                                         int width, int height,
+                                         int content_state) {
+  if (speed >= 8) {
+    if (width <= 640 && height <= 480)
+      return (5 * threshold_base) >> 2;
+    else if ((content_state == kLowSadLowSumdiff) ||
+             (content_state == kHighSadLowSumdiff) ||
+             (content_state == kLowVarHighSumdiff))
+      return (5 * threshold_base) >> 2;
+  } else if (speed == 7) {
+    if ((content_state == kLowSadLowSumdiff) ||
+        (content_state == kHighSadLowSumdiff) ||
+        (content_state == kLowVarHighSumdiff)) {
+      return (5 * threshold_base) >> 2;
+    }
+  }
+  return threshold_base;
+}
+
 // Set the variance split thresholds for following the block sizes:
 // 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16,
 // 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is
 // currently only used on key frame.
-static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q,
+                               int content_state) {
   VP9_COMMON *const cm = &cpi->common;
-  const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int threshold_multiplier = is_key_frame ? 20 : 1;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int threshold_multiplier =
+      is_key_frame ? 20 : cpi->sf.variance_part_thresh_mult;
   int64_t threshold_base =
       (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]);
+
   if (is_key_frame) {
     thresholds[0] = threshold_base;
     thresholds[1] = threshold_base >> 2;
@@ -488,12 +596,33 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
       else if (noise_level < kLow)
         threshold_base = (7 * threshold_base) >> 3;
     }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+        cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+      threshold_base =
+          vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level,
+                                content_state, cpi->svc.temporal_layer_id);
+    else
+      threshold_base =
+          scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width,
+                                    cm->height, content_state);
+#else
+    // Increase base variance threshold based on content_state/sum_diff level.
+    threshold_base = scale_part_thresh_sumdiff(
+        threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+#endif
     thresholds[0] = threshold_base;
     thresholds[2] = threshold_base << cpi->oxcf.speed;
+    if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7)
+      thresholds[2] = thresholds[2] << 1;
     if (cm->width <= 352 && cm->height <= 288) {
       thresholds[0] = threshold_base >> 3;
       thresholds[1] = threshold_base >> 1;
       thresholds[2] = threshold_base << 3;
+      if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 220)
+        thresholds[2] = thresholds[2] << 2;
+      else if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 200)
+        thresholds[2] = thresholds[2] << 1;
     } else if (cm->width < 1280 && cm->height < 720) {
       thresholds[1] = (5 * threshold_base) >> 2;
     } else if (cm->width < 1920 && cm->height < 1080) {
@@ -501,21 +630,24 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
     } else {
       thresholds[1] = (5 * threshold_base) >> 1;
     }
+    if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX;
   }
 }
 
-void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
+void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q,
+                                           int content_state) {
   VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int is_key_frame = frame_is_intra_only(cm);
   if (sf->partition_search_type != VAR_BASED_PARTITION &&
       sf->partition_search_type != REFERENCE_PARTITION) {
     return;
   } else {
-    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state);
     // The thresholds below are not changed locally.
     if (is_key_frame) {
       cpi->vbp_threshold_sad = 0;
+      cpi->vbp_threshold_copy = 0;
       cpi->vbp_bsize_min = BLOCK_8X8;
     } else {
       if (cm->width <= 352 && cm->height <= 288)
@@ -525,8 +657,20 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
                                      ? (cpi->y_dequant[q][1] << 1)
                                      : 1000;
       cpi->vbp_bsize_min = BLOCK_16X16;
+      if (cm->width <= 352 && cm->height <= 288)
+        cpi->vbp_threshold_copy = 4000;
+      else if (cm->width <= 640 && cm->height <= 360)
+        cpi->vbp_threshold_copy = 8000;
+      else
+        cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000
+                                      ? (cpi->y_dequant[q][1] << 3)
+                                      : 8000;
+      if (cpi->rc.high_source_sad ||
+          (cpi->use_svc && cpi->svc.high_source_sad_superframe)) {
+        cpi->vbp_threshold_sad = 0;
+        cpi->vbp_threshold_copy = 0;
+      }
     }
-    cpi->vbp_threshold_copy = cpi->vbp_thresholds[0] << 16;
     cpi->vbp_threshold_minmax = 15 + (q >> 3);
   }
 }
@@ -639,12 +783,14 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
   }
 }
 
-#if !CONFIG_VP9_HIGHBITDEPTH
 // Check if most of the superblock is skin content, and if so, force split to
 // 32x32, and set x->sb_is_skin for use in mode selection.
-static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
-                         int mi_row, int mi_col, int *force_split) {
+static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row,
+                         int mi_col, int *force_split) {
   VP9_COMMON *const cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) return 0;
+#endif
   // Avoid checking superblocks on/near boundary and avoid low resolutions.
   // Note superblock may still pick 64X64 if y_sad is very small
   // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
@@ -652,11 +798,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
                    mi_row + 8 < cm->mi_rows)) {
     int num_16x16_skin = 0;
     int num_16x16_nonskin = 0;
-    uint8_t *ysignal = x->plane[0].src.buf;
-    uint8_t *usignal = x->plane[1].src.buf;
-    uint8_t *vsignal = x->plane[2].src.buf;
-    int sp = x->plane[0].src.stride;
-    int spuv = x->plane[1].src.stride;
     const int block_index = mi_row * cm->mi_cols + mi_col;
     const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
     const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
@@ -667,16 +808,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
     for (i = 0; i < ymis; i += 2) {
       for (j = 0; j < xmis; j += 2) {
         int bl_index = block_index + i * cm->mi_cols + j;
-        int bl_index1 = bl_index + 1;
-        int bl_index2 = bl_index + cm->mi_cols;
-        int bl_index3 = bl_index2 + 1;
-        int consec_zeromv =
-            VPXMIN(cpi->consec_zero_mv[bl_index],
-                   VPXMIN(cpi->consec_zero_mv[bl_index1],
-                          VPXMIN(cpi->consec_zero_mv[bl_index2],
-                                 cpi->consec_zero_mv[bl_index3])));
-        int is_skin = vp9_compute_skin_block(
-            ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0);
+        int is_skin = cpi->skin_map[bl_index];
         num_16x16_skin += is_skin;
         num_16x16_nonskin += (1 - is_skin);
         if (num_16x16_nonskin > 3) {
@@ -684,13 +816,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
           i = ymis;
           break;
         }
-        ysignal += 16;
-        usignal += 8;
-        vsignal += 8;
       }
-      ysignal += (sp << 4) - 64;
-      usignal += (spuv << 3) - 32;
-      vsignal += (spuv << 3) - 32;
     }
     if (num_16x16_skin > 12) {
       *force_split = 1;
@@ -699,7 +825,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
   }
   return 0;
 }
-#endif
 
 static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
                                   v64x64 *vt, int64_t thresholds[],
@@ -767,59 +892,232 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
   }
 }
 
-static void copy_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
-                                int mi_col) {
+static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
+                                     MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                     int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   BLOCK_SIZE *prev_part = cpi->prev_partition;
   int start_pos = mi_row * cm->mi_stride + mi_col;
 
   const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
+  const int bs = (1 << bsl) >> 2;
   BLOCK_SIZE subsize;
   PARTITION_TYPE partition;
-  MODE_INFO *mi = NULL;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
   partition = partition_lookup[bsl][prev_part[start_pos]];
   subsize = get_subsize(bsize, partition);
-  mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
 
   if (subsize < BLOCK_8X8) {
-    mi->sb_type = bsize;
+    set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
   } else {
     switch (partition) {
-      case PARTITION_NONE: mi->sb_type = bsize; break;
+      case PARTITION_NONE:
+        set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+        break;
       case PARTITION_HORZ:
-        mi->sb_type = subsize;
-        if (mi_row + bs < cm->mi_rows)
-          cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type =
-              subsize;
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize);
         break;
       case PARTITION_VERT:
-        mi->sb_type = subsize;
-        if (mi_col + bs < cm->mi_cols)
-          cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type =
-              subsize;
+        set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
         break;
-      case PARTITION_SPLIT:
-        copy_prev_partition(cpi, subsize, mi_row, mi_col);
-        copy_prev_partition(cpi, subsize, mi_row + bs, mi_col);
-        copy_prev_partition(cpi, subsize, mi_row, mi_col + bs);
-        copy_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs);
+      default:
+        assert(partition == PARTITION_SPLIT);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
+        copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
         break;
-      default: assert(0);
     }
   }
 }
 
-static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
-                                  int mi_col) {
+static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                             int mi_row, int mi_col, int segment_id,
+                             int sb_offset) {
+  int svc_copy_allowed = 1;
+  int frames_since_key_thresh = 1;
+  if (cpi->use_svc) {
+    // For SVC, don't allow copy if base spatial layer is key frame, or if
+    // frame is not a temporal enhancement layer frame.
+    int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+                                 cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0;
+    frames_since_key_thresh = cpi->svc.number_spatial_layers << 1;
+  }
+  if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed &&
+      !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE &&
+      cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE &&
+      cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) {
+    if (cpi->prev_partition != NULL) {
+      copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col);
+      cpi->copied_frame_cnt[sb_offset] += 1;
+      memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]),
+             sizeof(x->variance_low));
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Set the partition for mi_col/row_high (current resolution) based on
+// the previous spatial layer (mi_col/row). Returns 0 if partition is set,
+// returns 1 if no scale partitioning is done. Return 1 means the variance
+// partitioning will be used.
+static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int mi_row_high, int mi_col_high) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  BLOCK_SIZE *prev_part = svc->prev_partition_svc;
+  // Variables with _high are for higher resolution.
+  int bsize_high = 0;
+  int subsize_high = 0;
+  const int bsl = b_width_log2_lookup[bsize];
+  const int bs = (1 << bsl) >> 2;
+  const int has_rows = (mi_row_high + bs) < cm->mi_rows;
+  const int has_cols = (mi_col_high + bs) < cm->mi_cols;
+
+  int start_pos;
+  BLOCK_SIZE bsize_low;
+  PARTITION_TYPE partition_high;
+
+  // If the lower layer frame is outside the boundary (this can happen for
+  // odd size resolutions) then do not scale partitioning from the lower
+  // layer. Do variance based partitioning instead (return 1).
+  if (mi_row >= svc->mi_rows[svc->spatial_layer_id - 1] ||
+      mi_col >= svc->mi_cols[svc->spatial_layer_id - 1])
+    return 1;
+
+  // Do not scale partitioning from lower layers on the boundary. Do
+  // variance based partitioning instead (return 1).
+  if (!has_rows || !has_cols) return 1;
+
+  // Find corresponding (mi_col/mi_row) block down-scaled by 2x2.
+  start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col;
+  bsize_low = prev_part[start_pos];
+
+  // For reference frames: return 1 (do variance-based partitioning) if the
+  // superblock is not low source sad and lower-resoln bsize is below 32x32.
+  if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad &&
+      bsize_low < BLOCK_32X32)
+    return 1;
+
+  // Scale up block size by 2x2. Force 64x64 for size larger than 32x32.
+  if (bsize_low < BLOCK_32X32) {
+    bsize_high = bsize_low + 3;
+  } else if (bsize_low >= BLOCK_32X32) {
+    bsize_high = BLOCK_64X64;
+  }
+
+  partition_high = partition_lookup[bsl][bsize_high];
+  subsize_high = get_subsize(bsize, partition_high);
+
+  if (subsize_high < BLOCK_8X8) {
+    set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high);
+  } else {
+    switch (partition_high) {
+      case PARTITION_NONE:
+        set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high);
+        break;
+      case PARTITION_HORZ:
+        set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high);
+        if (subsize_high < BLOCK_64X64)
+          set_block_size(cpi, x, xd, mi_row_high + bs, mi_col_high,
+                         subsize_high);
+        break;
+      case PARTITION_VERT:
+        set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high);
+        if (subsize_high < BLOCK_64X64)
+          set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs,
+                         subsize_high);
+        break;
+      default:
+        assert(partition_high == PARTITION_SPLIT);
+        if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col,
+                                   mi_row_high, mi_col_high))
+          return 1;
+        if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1),
+                                   mi_col, mi_row_high + bs, mi_col_high))
+          return 1;
+        if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row,
+                                   mi_col + (bs >> 1), mi_row_high,
+                                   mi_col_high + bs))
+          return 1;
+        if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1),
+                                   mi_col + (bs >> 1), mi_row_high + bs,
+                                   mi_col_high + bs))
+          return 1;
+        break;
+    }
+  }
+
+  return 0;
+}
+
+static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                                 int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc;
+  int start_pos = mi_row * cm->mi_stride + mi_col;
+  const int bsl = b_width_log2_lookup[bsize];
+  const int bs = (1 << bsl) >> 2;
+  BLOCK_SIZE subsize;
+  PARTITION_TYPE partition;
+  const MODE_INFO *mi = NULL;
+  int xx, yy;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  mi = cm->mi_grid_visible[start_pos];
+  partition = partition_lookup[bsl][mi->sb_type];
+  subsize = get_subsize(bsize, partition);
+  if (subsize < BLOCK_8X8) {
+    prev_part[start_pos] = bsize;
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        prev_part[start_pos] = bsize;
+        if (bsize == BLOCK_64X64) {
+          for (xx = 0; xx < 8; xx += 4)
+            for (yy = 0; yy < 8; yy += 4) {
+              if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols))
+                prev_part[start_pos + xx * cm->mi_stride + yy] = bsize;
+            }
+        }
+        break;
+      case PARTITION_HORZ:
+        prev_part[start_pos] = subsize;
+        if (mi_row + bs < cm->mi_rows)
+          prev_part[start_pos + bs * cm->mi_stride] = subsize;
+        break;
+      case PARTITION_VERT:
+        prev_part[start_pos] = subsize;
+        if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
+        break;
+      default:
+        assert(partition == PARTITION_SPLIT);
+        update_partition_svc(cpi, subsize, mi_row, mi_col);
+        update_partition_svc(cpi, subsize, mi_row + bs, mi_col);
+        update_partition_svc(cpi, subsize, mi_row, mi_col + bs);
+        update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs);
+        break;
+    }
+  }
+}
+
+static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
+                                         int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   BLOCK_SIZE *prev_part = cpi->prev_partition;
   int start_pos = mi_row * cm->mi_stride + mi_col;
   const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
+  const int bs = (1 << bsl) >> 2;
   BLOCK_SIZE subsize;
   PARTITION_TYPE partition;
   const MODE_INFO *mi = NULL;
@@ -843,23 +1141,47 @@ static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
         prev_part[start_pos] = subsize;
         if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
         break;
-      case PARTITION_SPLIT:
-        update_prev_partition(cpi, subsize, mi_row, mi_col);
-        update_prev_partition(cpi, subsize, mi_row + bs, mi_col);
-        update_prev_partition(cpi, subsize, mi_row, mi_col + bs);
-        update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs);
+      default:
+        assert(partition == PARTITION_SPLIT);
+        update_prev_partition_helper(cpi, subsize, mi_row, mi_col);
+        update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col);
+        update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs);
+        update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs);
         break;
-      default: assert(0);
     }
   }
 }
 
+static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id,
+                                  int mi_row, int mi_col, int sb_offset) {
+  update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col);
+  cpi->prev_segment_id[sb_offset] = segment_id;
+  memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low,
+         sizeof(x->variance_low));
+  // Reset the counter for copy partitioning
+  cpi->copied_frame_cnt[sb_offset] = 0;
+}
+
 static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
-                         unsigned int y_sad, int is_key_frame) {
+                         unsigned int y_sad, int is_key_frame,
+                         int scene_change_detected) {
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
+  int shift = 2;
+
   if (is_key_frame) return;
 
+  // For speed > 8, avoid the chroma check if y_sad is above threshold.
+  if (cpi->oxcf.speed > 8) {
+    if (y_sad > cpi->vbp_thresholds[1] &&
+        (!cpi->noise_estimate.enabled ||
+         vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium))
+      return;
+  }
+
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && scene_change_detected)
+    shift = 5;
+
   for (i = 1; i <= 2; ++i) {
     unsigned int uv_sad = UINT_MAX;
     struct macroblock_plane *p = &x->plane[i];
@@ -872,10 +1194,60 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
 
     // TODO(marpan): Investigate if we should lower this threshold if
     // superblock is detected as skin.
-    x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+    x->color_sensitivity[i - 1] = uv_sad > (y_sad >> shift);
   }
 }
 
+static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
+                               int sb_offset) {
+  unsigned int tmp_sse;
+  uint64_t tmp_sad;
+  unsigned int tmp_variance;
+  const BLOCK_SIZE bsize = BLOCK_64X64;
+  uint8_t *src_y = cpi->Source->y_buffer;
+  int src_ystride = cpi->Source->y_stride;
+  uint8_t *last_src_y = cpi->Last_Source->y_buffer;
+  int last_src_ystride = cpi->Last_Source->y_stride;
+  uint64_t avg_source_sad_threshold = 10000;
+  uint64_t avg_source_sad_threshold2 = 12000;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth) return 0;
+#endif
+  src_y += shift;
+  last_src_y += shift;
+  tmp_sad =
+      cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride);
+  tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y,
+                                   last_src_ystride, &tmp_sse);
+  // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
+  if (tmp_sad < avg_source_sad_threshold)
+    x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff
+                                                          : kLowSadHighSumdiff;
+  else
+    x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff
+                                                          : kHighSadHighSumdiff;
+
+  // Detect large lighting change.
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) &&
+      (tmp_sse - tmp_variance) > 10000)
+    x->content_state_sb = kLowVarHighSumdiff;
+  else if (tmp_sad > (avg_source_sad_threshold << 1))
+    x->content_state_sb = kVeryHighSad;
+
+  if (cpi->content_state_sb_fd != NULL) {
+    if (tmp_sad < avg_source_sad_threshold2) {
+      // Cap the increment to 255.
+      if (cpi->content_state_sb_fd[sb_offset] < 255)
+        cpi->content_state_sb_fd[sb_offset]++;
+    } else {
+      cpi->content_state_sb_fd[sb_offset] = 0;
+    }
+  }
+  if (tmp_sad == 0) x->zero_temp_sad_source = 1;
+  return tmp_sad;
+}
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
@@ -884,19 +1256,23 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   MACROBLOCKD *xd = &x->e_mbd;
   int i, j, k, m;
   v64x64 vt;
-  v16x16 vt2[16];
+  v16x16 *vt2 = NULL;
   int force_split[21];
   int avg_32x32;
   int max_var_32x32 = 0;
   int min_var_32x32 = INT_MAX;
   int var_32x32;
   int avg_16x16[4];
+  int maxvar_16x16[4];
+  int minvar_16x16[4];
   int64_t threshold_4x4avg;
   NOISE_LEVEL noise_level = kLow;
+  int content_state = 0;
   uint8_t *s;
   const uint8_t *d;
   int sp;
   int dp;
+  int compute_minmax_variance = 1;
   unsigned int y_sad = UINT_MAX;
   BLOCK_SIZE bsize = BLOCK_64X64;
   // Ref frame used in partitioning.
@@ -904,34 +1280,113 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   int pixels_wide = 64, pixels_high = 64;
   int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
                             cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] };
+  int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->use_svc && cpi->svc.high_source_sad_superframe);
+  int force_64_split = scene_change_detected ||
+                       (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+                        cpi->compute_source_sad_onepass &&
+                        cpi->sf.use_source_sad && !x->zero_temp_sad_source);
 
   // For the variance computation under SVC mode, we treat the frame as key if
   // the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
-  const int is_key_frame =
-      (cm->frame_type == KEY_FRAME ||
-       (is_one_pass_cbr_svc(cpi) &&
+  int is_key_frame =
+      (frame_is_intra_only(cm) ||
+       (is_one_pass_svc(cpi) &&
         cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
-  // Always use 4x4 partition for key frame.
-  const int use_4x4_partition = cm->frame_type == KEY_FRAME;
+
+  if (!is_key_frame) {
+    if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE ||
+        cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE)
+      is_key_frame = 1;
+  }
+
+  // Allow for sub8x8 (4x4) partition on key frames, but only for hybrid mode
+  // (i.e., sf->nonrd_keyframe = 0), where for small blocks rd intra pickmode
+  // (vp9_rd_pick_intra_mode_sb) is used. The nonrd intra pickmode
+  // (vp9_pick_intra_mode) does not currently support sub8x8 blocks. This causes
+  // the issue: 44166813. Assert is added in vp9_pick_intra_mode to check this.
+  const int use_4x4_partition =
+      frame_is_intra_only(cm) && !cpi->sf.nonrd_keyframe;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
   int segment_id;
-  int offset = cm->mi_stride * mi_row + mi_col;
+  int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3);
+
+  // For SVC: check if LAST frame is NULL or if the resolution of LAST is
+  // different than the current frame resolution, and if so, treat this frame
+  // as a key frame, for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in some cases where enhancement spatial layers are
+  // enabled dyanmically in the stream and the only reference is the spatial
+  // reference (GOLDEN).
+  if (cpi->use_svc) {
+    const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME);
+    if (ref == NULL || ref->y_crop_height != cm->height ||
+        ref->y_crop_width != cm->width)
+      is_key_frame = 1;
+  }
 
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+  set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
   segment_id = xd->mi[0]->segment_id;
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-    if (cyclic_refresh_segment_id_boosted(segment_id)) {
-      int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-      set_vbp_thresholds(cpi, thresholds, q);
+
+  if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame))
+    compute_minmax_variance = 0;
+
+  memset(x->variance_low, 0, sizeof(x->variance_low));
+
+  if (cpi->sf.use_source_sad && !is_key_frame) {
+    int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+    content_state = x->content_state_sb;
+    x->skip_low_source_sad = (content_state == kLowSadLowSumdiff ||
+                              content_state == kLowSadHighSumdiff)
+                                 ? 1
+                                 : 0;
+    x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0;
+    if (cpi->content_state_sb_fd != NULL)
+      x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2];
+
+    // For SVC on top spatial layer: use/scale the partition from
+    // the lower spatial resolution if svc_use_lowres_part is enabled.
+    if (cpi->sf.svc_use_lowres_part &&
+        cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+        cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) {
+      if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1,
+                                  mi_col >> 1, mi_row, mi_col)) {
+        if (cpi->sf.copy_partition_flag) {
+          update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+        }
+        return 0;
+      }
+    }
+    // If source_sad is low copy the partition without computing the y_sad.
+    if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
+        !force_64_split &&
+        copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
+      x->sb_use_mv_part = 1;
+      if (cpi->sf.svc_use_lowres_part &&
+          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+        update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+      return 0;
     }
   }
 
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      cyclic_refresh_segment_id_boosted(segment_id)) {
+    int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+    set_vbp_thresholds(cpi, thresholds, q, content_state);
+  } else {
+    set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
+  }
+  // Decrease 32x32 split threshold for screen on base layer, for scene
+  // change/high motion frames.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->svc.spatial_layer_id == 0 && force_64_split)
+    thresholds[1] = 3 * thresholds[1] >> 2;
+
   // For non keyframes, disable 4x4 average for low resolution when speed = 8
   threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX;
 
-  memset(x->variance_low, 0, sizeof(x->variance_low));
-
   if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
 
@@ -940,7 +1395,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
 
   // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
   // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
+  force_split[0] = force_64_split;
 
   if (!is_key_frame) {
     // In the case of spatial/temporal scalable coding, the assumption here is
@@ -956,13 +1411,16 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
 
     assert(yv12 != NULL);
 
-    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) {
+    if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
+        cpi->svc.use_gf_temporal_ref_current_layer) {
       // For now, GOLDEN will not be used for non-zero spatial layers, since
       // it may not be a temporal reference.
       yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     }
 
-    if (yv12_g && yv12_g != yv12 && (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+    // Only compute y_sad_g (sad for golden reference) for speed < 8.
+    if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 &&
+        (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
       y_sad_g = cpi->fn_ptr[bsize].sdf(
@@ -984,12 +1442,41 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
                            &cm->frame_refs[LAST_FRAME - 1].sf);
       mi->ref_frame[0] = LAST_FRAME;
     }
-    mi->ref_frame[1] = NONE;
+    mi->ref_frame[1] = NO_REF_FRAME;
     mi->sb_type = BLOCK_64X64;
     mi->mv[0].as_int = 0;
     mi->interp_filter = BILINEAR;
 
-    y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+    if (cpi->oxcf.speed >= 8 && !low_res &&
+        x->content_state_sb != kVeryHighSad) {
+      y_sad = cpi->fn_ptr[bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+          xd->plane[0].pre[0].stride);
+    } else {
+      const MV dummy_mv = { 0, 0 };
+      y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col,
+                                            &dummy_mv);
+      x->sb_use_mv_part = 1;
+      x->sb_mvcol_part = mi->mv[0].as_mv.col;
+      x->sb_mvrow_part = mi->mv[0].as_mv.row;
+      if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+          cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+          cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source &&
+          cm->width > 640 && cm->height > 480) {
+        // Disable split below 16x16 block size when scroll motion (horz or
+        // vert) is detected.
+        // TODO(marpan/jianj): Improve this condition: issue is that search
+        // range is hard-coded/limited in vp9_int_pro_motion_estimation() so
+        // scroll motion may not be detected here.
+        if (((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) ||
+             (abs(x->sb_mvcol_part) >= 48 && abs(x->sb_mvrow_part) <= 8)) &&
+            y_sad < 100000) {
+          compute_minmax_variance = 0;
+          thresholds[2] = INT64_MAX;
+        }
+      }
+    }
+
     y_sad_last = y_sad;
     // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
     // are close if short_circuit_low_temp_var is on.
@@ -1009,12 +1496,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
-    x->sb_is_skin = 0;
-#if !CONFIG_VP9_HIGHBITDEPTH
     if (cpi->use_skin_detection)
-      x->sb_is_skin =
-          skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
-#endif
+      x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split);
 
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
@@ -1027,23 +1510,29 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
       if (mi_col + block_width / 2 < cm->mi_cols &&
           mi_row + block_height / 2 < cm->mi_rows) {
         set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
-        chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+        x->variance_low[0] = 1;
+        chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected);
+        if (cpi->sf.svc_use_lowres_part &&
+            cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+          update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+        if (cpi->sf.copy_partition_flag) {
+          update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+        }
         return 0;
       }
     }
 
     // If the y_sad is small enough, copy the partition of the superblock in the
     // last frame to current frame only if the last frame is not a keyframe.
+    // Stop the copy every cpi->max_copied_frame to refresh the partition.
     // TODO(jianj) : tune the threshold.
-    if (cpi->sf.copy_partition_flag && cpi->rc.frames_since_key > 1 &&
-        segment_id == CR_SEGMENT_ID_BASE &&
-        cpi->prev_segment_id[offset] == CR_SEGMENT_ID_BASE &&
-        y_sad_last < cpi->vbp_threshold_copy) {
-      if (cpi->prev_partition != NULL) {
-        copy_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col);
-        chroma_check(cpi, x, bsize, y_sad, is_key_frame);
-        return 0;
-      }
+    if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy &&
+        copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
+      chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected);
+      if (cpi->sf.svc_use_lowres_part &&
+          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+        update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+      return 0;
     }
   } else {
     d = VP9_VAR_OFFS;
@@ -1060,6 +1549,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
+  if (low_res && threshold_4x4avg < INT64_MAX)
+    CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2)));
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   for (i = 0; i < 4; i++) {
@@ -1068,6 +1559,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     const int i2 = i << 2;
     force_split[i + 1] = 0;
     avg_16x16[i] = 0;
+    maxvar_16x16[i] = 0;
+    minvar_16x16[i] = INT_MAX;
     for (j = 0; j < 4; j++) {
       const int x16_idx = x32_idx + ((j & 1) << 4);
       const int y16_idx = y32_idx + ((j >> 1) << 4);
@@ -1084,13 +1577,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
         fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
         get_variance(&vt.split[i].split[j].part_variances.none);
         avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance;
+        if (vt.split[i].split[j].part_variances.none.variance < minvar_16x16[i])
+          minvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance;
+        if (vt.split[i].split[j].part_variances.none.variance > maxvar_16x16[i])
+          maxvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance;
         if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) {
           // 16X16 variance is above threshold for split, so force split to 8x8
           // for this 16x16 block (this also forces splits for upper levels).
           force_split[split_index] = 1;
           force_split[i + 1] = 1;
           force_split[0] = 1;
-        } else if (cpi->oxcf.speed < 8 &&
+        } else if (compute_minmax_variance &&
                    vt.split[i].split[j].part_variances.none.variance >
                        thresholds[1] &&
                    !cyclic_refresh_segment_id_boosted(segment_id)) {
@@ -1102,16 +1599,19 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
                                           xd->cur_buf->flags,
 #endif
                                           pixels_wide, pixels_high);
-          if (minmax > cpi->vbp_threshold_minmax) {
+          int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+          if (x->content_state_sb == kVeryHighSad)
+            thresh_minmax = thresh_minmax << 1;
+          if (minmax > thresh_minmax) {
             force_split[split_index] = 1;
             force_split[i + 1] = 1;
             force_split[0] = 1;
           }
         }
       }
-      if (is_key_frame || (low_res &&
-                           vt.split[i].split[j].part_variances.none.variance >
-                               threshold_4x4avg)) {
+      if (is_key_frame ||
+          (low_res && vt.split[i].split[j].part_variances.none.variance >
+                          threshold_4x4avg)) {
         force_split[split_index] = 0;
         // Go down to 4x4 down-sampling for variance.
         variance4x4downsample[i2 + j] = 1;
@@ -1128,6 +1628,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
       }
     }
   }
+  if (cpi->noise_estimate.enabled)
+    noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate);
   // Fill the rest of the variance tree by summing split partition values.
   avg_32x32 = 0;
   for (i = 0; i < 4; i++) {
@@ -1163,6 +1665,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
            vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) {
         force_split[i + 1] = 1;
         force_split[0] = 1;
+      } else if (!is_key_frame && noise_level < kLow && cm->height <= 360 &&
+                 (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[1] >> 1) &&
+                 maxvar_16x16[i] > thresholds[1]) {
+        force_split[i + 1] = 1;
+        force_split[0] = 1;
       }
       avg_32x32 += var_32x32;
     }
@@ -1170,13 +1677,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   if (!force_split[0]) {
     fill_variance_tree(&vt, BLOCK_64X64);
     get_variance(&vt.part_variances.none);
-    if (cpi->noise_estimate.enabled)
-      noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate);
     // If variance of this 64x64 block is above (some threshold of) the average
     // variance over the sub-32x32 blocks, then force this block to split.
     // Only checking this for noise level >= medium for now.
     if (!is_key_frame && noise_level >= kMedium &&
-        vt.part_variances.none.variance > (5 * avg_32x32) >> 4)
+        vt.part_variances.none.variance > (9 * avg_32x32) >> 5)
       force_split[0] = 1;
     // Else if the maximum 32x32 variance minus the miniumum 32x32 variance in
     // a 64x64 block is greater than threshold and the maximum 32x32 variance is
@@ -1206,7 +1711,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
           const int y16_idx = ((j >> 1) << 1);
           // For inter frames: if variance4x4downsample[] == 1 for this 16x16
           // block, then the variance is based on 4x4 down-sampling, so use vt2
-          // in set_vt_partioning(), otherwise use vt.
+          // in set_vt_partitioning(), otherwise use vt.
           v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1)
                               ? &vt2[i2 + j]
                               : &vt.split[i].split[j];
@@ -1239,20 +1744,25 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     }
   }
 
-  if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) {
-    update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col);
-    cpi->prev_segment_id[offset] = segment_id;
+  if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) {
+    update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
   }
 
+  if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+    update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+
   if (cpi->sf.short_circuit_low_temp_var) {
     set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition,
                           mi_col, mi_row);
   }
 
-  chroma_check(cpi, x, bsize, y_sad, is_key_frame);
+  chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected);
+  if (vt2) vpx_free(vt2);
   return 0;
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
                          int mi_row, int mi_col, BLOCK_SIZE bsize,
                          int output_enabled) {
@@ -1294,7 +1804,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
       vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize,
                                         ctx->rate, ctx->dist, x->skip, p);
     }
@@ -1357,8 +1868,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
       vp9_update_mv_count(td);
 
       if (cm->interp_filter == SWITCHABLE) {
-        const int ctx = get_pred_context_switchable_interp(xd);
-        ++td->counts->switchable_interp[ctx][xdmi->interp_filter];
+        const int ctx_interp = get_pred_context_switchable_interp(xd);
+        ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter];
       }
     }
 
@@ -1381,6 +1892,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
     }
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col) {
@@ -1398,13 +1910,17 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
 }
 
 static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   INTERP_FILTER interp_filter,
                                    RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   INTERP_FILTER filter_ref;
 
   filter_ref = get_pred_context_switchable_interp(xd);
-  if (filter_ref == SWITCHABLE_FILTERS) filter_ref = EIGHTTAP;
+  if (interp_filter == BILINEAR)
+    filter_ref = BILINEAR;
+  else if (filter_ref == SWITCHABLE_FILTERS)
+    filter_ref = EIGHTTAP;
 
   mi->sb_type = bsize;
   mi->mode = ZEROMV;
@@ -1413,7 +1929,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
   mi->skip = 1;
   mi->uv_mode = DC_PRED;
   mi->ref_frame[0] = LAST_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->mv[0].as_int = 0;
   mi->interp_filter = filter_ref;
 
@@ -1423,20 +1939,41 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
   vp9_rd_cost_init(rd_cost);
 }
 
-static int set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x,
-                              int8_t segment_id) {
-  int segment_qindex;
+#if !CONFIG_REALTIME_ONLY
+static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               AQ_MODE aq_mode) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const uint8_t *const map =
+      cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+
   vp9_init_plane_quantizers(cpi, x);
   vpx_clear_system_state();
-  segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-  return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+
+  if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) {
+    if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult;
+  } else if (aq_mode == PERCEPTUAL_AQ) {
+    x->rdmult = x->cb_rdmult;
+  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+    // If segment is boosted, use rdmult for that segment.
+    if (cyclic_refresh_segment_id_boosted(
+            get_segment_id(cm, map, bsize, mi_row, mi_col)))
+      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+  } else {
+    x->rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+  }
+
+  if (oxcf->tuning == VP8_TUNE_SSIM) {
+    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+  }
 }
 
 static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
                              MACROBLOCK *const x, int mi_row, int mi_col,
                              RD_COST *rd_cost, BLOCK_SIZE bsize,
-                             PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+                             PICK_MODE_CONTEXT *ctx, int rate_in_best_rd,
+                             int64_t dist_in_best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1445,8 +1982,12 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
   struct macroblockd_plane *const pd = xd->plane;
   const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
   int i, orig_rdmult;
+  int64_t best_rd = INT64_MAX;
 
   vpx_clear_system_state();
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, rd_pick_sb_modes_time);
+#endif
 
   // Use the lower precision, but faster, 32x32 fdct for mode selection.
   x->use_lp32x32fdct = 1;
@@ -1485,59 +2026,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
   // Save rdmult before it might be changed, so it can be restored later.
   orig_rdmult = x->rdmult;
 
-  if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) {
+  if ((cpi->sf.tx_domain_thresh > 0.0) ||
+      (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) {
     double logvar = vp9_log_block_var(cpi, x, bsize);
-    // Check block complexity as part of descision on using pixel or transform
+    // Check block complexity as part of decision on using pixel or transform
     // domain distortion in rd tests.
     x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion &&
                          (logvar >= cpi->sf.tx_domain_thresh);
 
-    // Check block complexity as part of descision on using quantized
-    // coefficient optimisation inside the rd loop.
-    x->block_qcoeff_opt =
-        cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh);
+    // Store block complexity to decide on using quantized coefficient
+    // optimization inside the rd loop.
+    x->log_block_src_var = logvar;
   } else {
     x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion;
-    x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt;
+    x->log_block_src_var = 0.0;
   }
 
-  if (aq_mode == VARIANCE_AQ) {
-    const int energy =
-        bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize);
-
-    if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
-        cpi->force_update_segmentation ||
-        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
-      mi->segment_id = vp9_vaq_segment_id(energy);
-    } else {
-      const uint8_t *const map =
-          cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-    }
-    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
-  } else if (aq_mode == LOOKAHEAD_AQ) {
-    const uint8_t *const map = cpi->segmentation_map;
-
-    // I do not change rdmult here consciously.
-    mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-  } else if (aq_mode == EQUATOR360_AQ) {
-    if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) {
-      mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows);
-    } else {
-      const uint8_t *const map =
-          cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-    }
-    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
-  } else if (aq_mode == COMPLEXITY_AQ) {
-    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
-  } else if (aq_mode == CYCLIC_REFRESH_AQ) {
-    const uint8_t *const map =
-        cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-    // If segment is boosted, use rdmult for that segment.
-    if (cyclic_refresh_segment_id_boosted(
-            get_segment_id(cm, map, bsize, mi_row, mi_col)))
-      x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+  set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
+  set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode);
+  if (rate_in_best_rd < INT_MAX && dist_in_best_rd < INT64_MAX) {
+    best_rd = vp9_calculate_rd_cost(x->rdmult, x->rddiv, rate_in_best_rd,
+                                    dist_in_best_rd);
   }
 
   // Find best coding mode & reconstruct the MB so it is available
@@ -1546,15 +2055,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
       if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
         vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
         vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                                   bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
     } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
       vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                                     bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
     }
   }
 
@@ -1566,15 +2087,22 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
   }
 
-  x->rdmult = orig_rdmult;
-
   // TODO(jingning) The rate-distortion optimization flow needs to be
   // refactored to provide proper exit/return handle.
-  if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+  if (rd_cost->rate == INT_MAX || rd_cost->dist == INT64_MAX)
+    rd_cost->rdcost = INT64_MAX;
+  else
+    rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+
+  x->rdmult = orig_rdmult;
 
   ctx->rate = rd_cost->rate;
   ctx->dist = rd_cost->dist;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, rd_pick_sb_modes_time);
+#endif
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void update_stats(VP9_COMMON *cm, ThreadData *td) {
   const MACROBLOCK *x = &td->mb;
@@ -1600,8 +2128,10 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
                             [has_second_ref(mi)]++;
 
         if (has_second_ref(mi)) {
-          counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)]
-                          [ref0 == GOLDEN_FRAME]++;
+          const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+          const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+          const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1];
+          counts->comp_ref[ctx][bit]++;
         } else {
           counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0]
                             [ref0 != LAST_FRAME]++;
@@ -1633,6 +2163,7 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
                             ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
                             ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
@@ -1697,6 +2228,16 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td,
                      PICK_MODE_CONTEXT *ctx) {
   MACROBLOCK *const x = &td->mb;
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+
+  if (cpi->sf.enable_tpl_model &&
+      (cpi->oxcf.aq_mode == NO_AQ || cpi->oxcf.aq_mode == PERCEPTUAL_AQ)) {
+    const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+    x->rdmult = x->cb_rdmult;
+    if (oxcf->tuning == VP8_TUNE_SSIM) {
+      set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+    }
+  }
+
   update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
   encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
 
@@ -1755,27 +2296,28 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
                  subsize, &pc_tree->horizontal[1]);
       }
       break;
-    case PARTITION_SPLIT:
+    default:
+      assert(partition == PARTITION_SPLIT);
       if (bsize == BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
-                 pc_tree->leaf_split[0]);
+                 pc_tree->u.leaf_split[0]);
       } else {
         encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
-                  pc_tree->split[0]);
+                  pc_tree->u.split[0]);
         encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[1]);
+                  subsize, pc_tree->u.split[1]);
         encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
-                  subsize, pc_tree->split[2]);
+                  subsize, pc_tree->u.split[2]);
         encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[3]);
+                  subsize, pc_tree->u.split[3]);
       }
       break;
-    default: assert(0 && "Invalid partition type."); break;
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 // Check to see if the given partition size is allowed for a specified number
 // of 8x8 block rows and columns remaining in the image.
@@ -1849,120 +2391,6 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
-static const struct {
-  int row;
-  int col;
-} coord_lookup[16] = {
-  // 32x32 index = 0
-  { 0, 0 },
-  { 0, 2 },
-  { 2, 0 },
-  { 2, 2 },
-  // 32x32 index = 1
-  { 0, 4 },
-  { 0, 6 },
-  { 2, 4 },
-  { 2, 6 },
-  // 32x32 index = 2
-  { 4, 0 },
-  { 4, 2 },
-  { 6, 0 },
-  { 6, 2 },
-  // 32x32 index = 3
-  { 4, 4 },
-  { 4, 6 },
-  { 6, 4 },
-  { 6, 6 },
-};
-
-static void set_source_var_based_partition(VP9_COMP *cpi,
-                                           const TileInfo *const tile,
-                                           MACROBLOCK *const x,
-                                           MODE_INFO **mi_8x8, int mi_row,
-                                           int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mi_stride;
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
-  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
-
-  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
-
-  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
-
-  // In-image SB64
-  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
-      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    int i, j;
-    int index;
-    diff d32[4];
-    const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
-    int is_larger_better = 0;
-    int use32x32 = 0;
-    unsigned int thr = cpi->source_var_thresh;
-
-    memset(d32, 0, 4 * sizeof(diff));
-
-    for (i = 0; i < 4; i++) {
-      diff *d16[4];
-
-      for (j = 0; j < 4; j++) {
-        int b_mi_row = coord_lookup[i * 4 + j].row;
-        int b_mi_col = coord_lookup[i * 4 + j].col;
-        int boffset = b_mi_row / 2 * cm->mb_cols + b_mi_col / 2;
-
-        d16[j] = cpi->source_diff_var + offset + boffset;
-
-        index = b_mi_row * mis + b_mi_col;
-        mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->sb_type = BLOCK_16X16;
-
-        // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
-        // size to further improve quality.
-      }
-
-      is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) &&
-                         (d16[2]->var < thr) && (d16[3]->var < thr);
-
-      // Use 32x32 partition
-      if (is_larger_better) {
-        use32x32 += 1;
-
-        for (j = 0; j < 4; j++) {
-          d32[i].sse += d16[j]->sse;
-          d32[i].sum += d16[j]->sum;
-        }
-
-        d32[i].var =
-            (unsigned int)(d32[i].sse -
-                           (unsigned int)(((int64_t)d32[i].sum * d32[i].sum) >>
-                                          10));
-
-        index = coord_lookup[i * 4].row * mis + coord_lookup[i * 4].col;
-        mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->sb_type = BLOCK_32X32;
-      }
-    }
-
-    if (use32x32 == 4) {
-      thr <<= 1;
-      is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) &&
-                         (d32[2].var < thr) && (d32[3].var < thr);
-
-      // Use 64x64 partition
-      if (is_larger_better) {
-        mi_8x8[0] = mi_upper_left;
-        mi_8x8[0]->sb_type = BLOCK_64X64;
-      }
-    }
-  } else {  // partial in-image SB64
-    int bh = num_8x8_blocks_high_lookup[BLOCK_16X16];
-    int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16];
-    set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
-                                 col8x8_remaining, BLOCK_16X16, mi_8x8);
-  }
-}
-
 static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
                             PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
                             int bsize) {
@@ -1980,17 +2408,17 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
   *(xd->mi[0]) = ctx->mic;
   *(x->mbmi_ext) = ctx->mbmi_ext;
 
-  if (seg->enabled && cpi->oxcf.aq_mode != NO_AQ) {
-    // For in frame complexity AQ or variance AQ, copy segment_id from
-    // segmentation_map.
-    if (cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) {
+  if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled ||
+                       cpi->active_map.enabled)) {
+    // Setting segmentation map for cyclic_refresh.
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode) {
+      vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
+                                        ctx->rate, ctx->dist, x->skip, p);
+    } else {
       const uint8_t *const map =
           seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
       mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-    } else {
-      // Setting segmentation map for cyclic_refresh.
-      vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
-                                        ctx->rate, ctx->dist, x->skip, p);
     }
     vp9_init_plane_quantizers(cpi, x);
   }
@@ -2028,7 +2456,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
   }
 
   x->skip = ctx->skip;
-  x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0];
+  x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0];
 }
 
 static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
@@ -2096,24 +2524,25 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
                     subsize, &pc_tree->horizontal[1]);
       }
       break;
-    case PARTITION_SPLIT:
+    default:
+      assert(partition == PARTITION_SPLIT);
       subsize = get_subsize(bsize, PARTITION_SPLIT);
       encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
-                   pc_tree->split[0]);
+                   pc_tree->u.split[0]);
       encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
-                   subsize, pc_tree->split[1]);
+                   subsize, pc_tree->u.split[1]);
       encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
-                   subsize, pc_tree->split[2]);
+                   subsize, pc_tree->u.split[2]);
       encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
-                   output_enabled, subsize, pc_tree->split[3]);
+                   output_enabled, subsize, pc_tree->u.split[3]);
       break;
-    default: assert(0 && "Invalid partition type."); break;
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
                              TileDataEnc *tile_data, MODE_INFO **mi_8x8,
                              TOKENEXTRA **tp, int mi_row, int mi_col,
@@ -2182,7 +2611,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
         mi_col + (mi_step >> 1) < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, ctx,
-                       INT64_MAX);
+                       INT_MAX, INT64_MAX);
 
       pl = partition_plane_context(xd, mi_row, mi_col, bsize);
 
@@ -2201,20 +2630,23 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
   switch (partition) {
     case PARTITION_NONE:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize,
-                       ctx, INT64_MAX);
+                       ctx, INT_MAX, INT64_MAX);
       break;
     case PARTITION_HORZ:
+      pc_tree->horizontal[0].skip_ref_frame_mask = 0;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       subsize, &pc_tree->horizontal[0], INT64_MAX);
+                       subsize, &pc_tree->horizontal[0], INT_MAX, INT64_MAX);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_row + (mi_step >> 1) < cm->mi_rows) {
         RD_COST tmp_rdc;
-        PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
+        PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
         vp9_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
+        pc_tree->horizontal[1].skip_ref_frame_mask = 0;
         rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col,
-                         &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX);
+                         &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX,
+                         INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp9_rd_cost_reset(&last_part_rdc);
           break;
@@ -2225,18 +2657,20 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
       }
       break;
     case PARTITION_VERT:
+      pc_tree->vertical[0].skip_ref_frame_mask = 0;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                       subsize, &pc_tree->vertical[0], INT64_MAX);
+                       subsize, &pc_tree->vertical[0], INT_MAX, INT64_MAX);
       if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
           mi_col + (mi_step >> 1) < cm->mi_cols) {
         RD_COST tmp_rdc;
-        PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
+        PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0];
         vp9_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
-        rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1),
-                         &tmp_rdc, subsize,
-                         &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+        update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0);
+        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx);
+        pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0;
+        rd_pick_sb_modes(
+            cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+            subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT_MAX, INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp9_rd_cost_reset(&last_part_rdc);
           break;
@@ -2246,10 +2680,11 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
         last_part_rdc.rdcost += tmp_rdc.rdcost;
       }
       break;
-    case PARTITION_SPLIT:
+    default:
+      assert(partition == PARTITION_SPLIT);
       if (bsize == BLOCK_8X8) {
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
-                         subsize, pc_tree->leaf_split[0], INT64_MAX);
+                         subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX);
         break;
       }
       last_part_rdc.rate = 0;
@@ -2267,7 +2702,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
         rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
                          tp, mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist, i != 3,
-                         pc_tree->split[i]);
+                         pc_tree->u.split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp9_rd_cost_reset(&last_part_rdc);
           break;
@@ -2276,7 +2711,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
         last_part_rdc.dist += tmp_rdc.dist;
       }
       break;
-    default: assert(0); break;
   }
 
   pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2304,17 +2738,15 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
       int x_idx = (i & 1) * (mi_step >> 1);
       int y_idx = (i >> 1) * (mi_step >> 1);
       RD_COST tmp_rdc;
-      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-      PARTITION_CONTEXT sl[8], sa[8];
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
       save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-      pc_tree->split[i]->partitioning = PARTITION_NONE;
+      pc_tree->u.split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
-                       &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
-                       INT64_MAX);
+                       &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none,
+                       INT_MAX, INT64_MAX);
 
       restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
@@ -2328,7 +2760,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
 
       if (i != 3)
         encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                  split_subsize, pc_tree->split[i]);
+                  split_subsize, pc_tree->u.split[i]);
 
       pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
@@ -2511,14 +2943,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row,
   min_size = BLOCK_64X64;
   max_size = BLOCK_4X4;
 
-  if (prev_mi) {
-    for (idy = 0; idy < mi_height; ++idy) {
-      for (idx = 0; idx < mi_width; ++idx) {
-        mi = prev_mi[idy * cm->mi_stride + idx];
-        bs = mi ? mi->sb_type : bsize;
-        min_size = VPXMIN(min_size, bs);
-        max_size = VPXMAX(max_size, bs);
-      }
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      mi = prev_mi[idy * cm->mi_stride + idx];
+      bs = mi ? mi->sb_type : bsize;
+      min_size = VPXMIN(min_size, bs);
+      max_size = VPXMAX(max_size, bs);
     }
   }
 
@@ -2548,6 +2978,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row,
   *min_bs = min_size;
   *max_bs = max_size;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
@@ -2557,63 +2988,689 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
   memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
 }
 
-#if CONFIG_FP_MB_STATS
-const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
-                                                        1, 2, 2, 2, 4, 4 };
-const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
-                                                        2, 1, 2, 4, 2, 4 };
-const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
-  0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120
-};
-const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
-  0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120
-};
-const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6
-};
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+static void nn_predict(const float *features, const NN_CONFIG *nn_config,
+                       float *output) {
+  int num_input_nodes = nn_config->num_inputs;
+  int buf_index = 0;
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  const float *input_nodes = features;
 
-typedef enum {
-  MV_ZERO = 0,
-  MV_LEFT = 1,
-  MV_UP = 2,
-  MV_RIGHT = 3,
-  MV_DOWN = 4,
-  MV_INVALID
-} MOTION_DIRECTION;
+  // Propagate hidden layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  int layer, node, i;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (layer = 0; layer < num_layers; ++layer) {
+    const float *weights = nn_config->weights[layer];
+    const float *bias = nn_config->bias[layer];
+    float *output_nodes = buf[buf_index];
+    const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+    assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+    for (node = 0; node < num_output_nodes; ++node) {
+      float val = 0.0f;
+      for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i];
+      val += bias[node];
+      // ReLU as activation function.
+      val = VPXMAX(val, 0.0f);
+      output_nodes[node] = val;
+      weights += num_input_nodes;
+    }
+    num_input_nodes = num_output_nodes;
+    input_nodes = output_nodes;
+    buf_index = 1 - buf_index;
+  }
 
-static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
-  if (fp_byte & FPMB_MOTION_ZERO_MASK) {
-    return MV_ZERO;
-  } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
-    return MV_LEFT;
-  } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
-    return MV_RIGHT;
-  } else if (fp_byte & FPMB_MOTION_UP_MASK) {
-    return MV_UP;
-  } else {
-    return MV_DOWN;
+  // Final output layer.
+  {
+    const float *weights = nn_config->weights[num_layers];
+    for (node = 0; node < nn_config->num_outputs; ++node) {
+      const float *bias = nn_config->bias[num_layers];
+      float val = 0.0f;
+      for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i];
+      output[node] = val + bias[node];
+      weights += num_input_nodes;
+    }
   }
 }
 
-static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
-                                           MOTION_DIRECTION that_mv) {
-  if (this_mv == that_mv) {
-    return 0;
-  } else {
-    return abs(this_mv - that_mv) == 2 ? 2 : 1;
+#if !CONFIG_REALTIME_ONLY
+#define FEATURES 7
+// Machine-learning based partition search early termination.
+// Return 1 to skip split and rect partitions.
+static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                                PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize) {
+  const int mag_mv =
+      abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
+  const int left_in_image = !!xd->left_mi;
+  const int above_in_image = !!xd->above_mi;
+  MODE_INFO **prev_mi =
+      &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
+  int above_par = 0;  // above_partitioning
+  int left_par = 0;   // left_partitioning
+  int last_par = 0;   // last_partitioning
+  int offset = 0;
+  int i;
+  BLOCK_SIZE context_size;
+  const NN_CONFIG *nn_config = NULL;
+  const float *mean, *sd, *linear_weights;
+  float nn_score, linear_score;
+  float features[FEATURES];
+
+  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+  vpx_clear_system_state();
+
+  switch (bsize) {
+    case BLOCK_64X64:
+      offset = 0;
+      nn_config = &vp9_partition_nnconfig_64x64;
+      break;
+    case BLOCK_32X32:
+      offset = 8;
+      nn_config = &vp9_partition_nnconfig_32x32;
+      break;
+    case BLOCK_16X16:
+      offset = 16;
+      nn_config = &vp9_partition_nnconfig_16x16;
+      break;
+    default: assert(0 && "Unexpected block size."); return 0;
   }
+
+  if (above_in_image) {
+    context_size = xd->above_mi->sb_type;
+    if (context_size < bsize)
+      above_par = 2;
+    else if (context_size == bsize)
+      above_par = 1;
+  }
+
+  if (left_in_image) {
+    context_size = xd->left_mi->sb_type;
+    if (context_size < bsize)
+      left_par = 2;
+    else if (context_size == bsize)
+      left_par = 1;
+  }
+
+  if (prev_mi[0]) {
+    context_size = prev_mi[0]->sb_type;
+    if (context_size < bsize)
+      last_par = 2;
+    else if (context_size == bsize)
+      last_par = 1;
+  }
+
+  mean = &vp9_partition_feature_mean[offset];
+  sd = &vp9_partition_feature_std[offset];
+  features[0] = ((float)ctx->rate - mean[0]) / sd[0];
+  features[1] = ((float)ctx->dist - mean[1]) / sd[1];
+  features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2];
+  features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3];
+  features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4];
+  features[5] = ((float)cm->base_qindex - mean[5]) * sd[5];
+  features[6] = ((float)last_par - mean[6]) * sd[6];
+
+  // Predict using linear model.
+  linear_weights = &vp9_partition_linear_weights[offset];
+  linear_score = linear_weights[FEATURES];
+  for (i = 0; i < FEATURES; ++i)
+    linear_score += linear_weights[i] * features[i];
+  if (linear_score > 0.1f) return 0;
+
+  // Predict using neural net model.
+  nn_predict(features, nn_config, &nn_score);
+
+  if (linear_score < -0.0f && nn_score < 0.1f) return 1;
+  if (nn_score < -0.0f && linear_score < 0.1f) return 1;
+  return 0;
 }
+#undef FEATURES
+
+#define FEATURES 4
+// ML-based partition search breakout.
+static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize,
+                               const MACROBLOCK *const x,
+                               const RD_COST *const rd_cost) {
+  DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 };
+  const VP9_COMMON *const cm = &cpi->common;
+  float features[FEATURES];
+  const float *linear_weights = NULL;  // Linear model weights.
+  float linear_score = 0.0f;
+  const int qindex = cm->base_qindex;
+  const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2);
+  const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720;
+  const int resolution_ctx = is_720p_or_larger ? 1 : 0;
+
+  switch (bsize) {
+    case BLOCK_64X64:
+      linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx];
+      break;
+    case BLOCK_32X32:
+      linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx];
+      break;
+    case BLOCK_16X16:
+      linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx];
+      break;
+    case BLOCK_8X8:
+      linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx];
+      break;
+    default: assert(0 && "Unexpected block size."); return 0;
+  }
+  if (!linear_weights) return 0;
+
+  {  // Generate feature values.
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int ac_q =
+        vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8);
+#else
+    const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    const int num_pels_log2 = num_pels_log2_lookup[bsize];
+    int feature_index = 0;
+    unsigned int var, sse;
+    float rate_f, dist_f;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      var =
+          vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd);
+    } else {
+      var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                  vp9_64_zeros, 0, &sse);
+    }
+#else
+    var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                vp9_64_zeros, 0, &sse);
 #endif
+    var = var >> num_pels_log2;
 
+    vpx_clear_system_state();
+
+    rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX);
+    dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2);
+    rate_f =
+        ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+        rate_f;
+
+    features[feature_index++] = rate_f;
+    features[feature_index++] = dist_f;
+    features[feature_index++] = (float)var;
+    features[feature_index++] = (float)ac_q;
+    assert(feature_index == FEATURES);
+  }
+
+  {  // Calculate the output score.
+    int i;
+    linear_score = linear_weights[FEATURES];
+    for (i = 0; i < FEATURES; ++i)
+      linear_score += linear_weights[i] * features[i];
+  }
+
+  return linear_score >= cpi->sf.rd_ml_partition.search_breakout_thresh[q_ctx];
+}
+#undef FEATURES
+
+#define FEATURES 8
+#define LABELS 4
+static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
+                                    BLOCK_SIZE bsize,
+                                    const PC_TREE *const pc_tree,
+                                    int *allow_horz, int *allow_vert,
+                                    int64_t ref_rd) {
+  const NN_CONFIG *nn_config = NULL;
+  float score[LABELS] = {
+    0.0f,
+  };
+  int thresh = -1;
+  int i;
+  (void)x;
+
+  if (ref_rd <= 0 || ref_rd > 1000000000) return;
+
+  switch (bsize) {
+    case BLOCK_8X8: break;
+    case BLOCK_16X16:
+      nn_config = &vp9_rect_part_nnconfig_16;
+      thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[1];
+      break;
+    case BLOCK_32X32:
+      nn_config = &vp9_rect_part_nnconfig_32;
+      thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[2];
+      break;
+    case BLOCK_64X64:
+      nn_config = &vp9_rect_part_nnconfig_64;
+      thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[3];
+      break;
+    default: assert(0 && "Unexpected block size."); return;
+  }
+  if (!nn_config || thresh < 0) return;
+
+  // Feature extraction and model score calculation.
+  {
+    const VP9_COMMON *const cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int dc_q =
+        vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8);
+#else
+    const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+    int feature_index = 0;
+    float features[FEATURES];
+
+    features[feature_index++] = logf((float)dc_q + 1.0f);
+    features[feature_index++] =
+        (float)(pc_tree->partitioning == PARTITION_NONE);
+    features[feature_index++] = logf((float)ref_rd / bs / bs + 1.0f);
+
+    {
+      const float norm_factor = 1.0f / ((float)ref_rd + 1.0f);
+      const int64_t none_rdcost = pc_tree->none.rdcost;
+      float rd_ratio = 2.0f;
+      if (none_rdcost > 0 && none_rdcost < 1000000000)
+        rd_ratio = (float)none_rdcost * norm_factor;
+      features[feature_index++] = VPXMIN(rd_ratio, 2.0f);
+
+      for (i = 0; i < 4; ++i) {
+        const int64_t this_rd = pc_tree->u.split[i]->none.rdcost;
+        const int rd_valid = this_rd > 0 && this_rd < 1000000000;
+        // Ratio between sub-block RD and whole block RD.
+        features[feature_index++] =
+            rd_valid ? (float)this_rd * norm_factor : 1.0f;
+      }
+    }
+
+    assert(feature_index == FEATURES);
+    nn_predict(features, nn_config, score);
+  }
+
+  // Make decisions based on the model score.
+  {
+    int max_score = -1000;
+    int horz = 0, vert = 0;
+    int int_score[LABELS];
+    for (i = 0; i < LABELS; ++i) {
+      int_score[i] = (int)(100 * score[i]);
+      max_score = VPXMAX(int_score[i], max_score);
+    }
+    thresh = max_score - thresh;
+    for (i = 0; i < LABELS; ++i) {
+      if (int_score[i] >= thresh) {
+        if ((i >> 0) & 1) horz = 1;
+        if ((i >> 1) & 1) vert = 1;
+      }
+    }
+    *allow_horz = *allow_horz && horz;
+    *allow_vert = *allow_vert && vert;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+// Perform fast and coarse motion search for the given block. This is a
+// pre-processing step for the ML based partition search speedup.
+static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 MV ref_mv, MV_REFERENCE_FRAME ref,
+                                 uint8_t *const pred_buf) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  YV12_BUFFER_CONFIG *yv12;
+  YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
+  const int step_param = 1;
+  const MvLimits tmp_mv_limits = x->mv_limits;
+  const SEARCH_METHODS search_method = NSTEP;
+  const int sadpb = x->sadperbit16;
+  MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 };
+  MV best_mv = { 0, 0 };
+  int cost_list[5];
+  struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } };
+
+  if (scaled_ref_frame) {
+    yv12 = scaled_ref_frame;
+    // As reported in b/311294795, the reference buffer pointer needs to be
+    // saved and restored after the search. Otherwise, it causes problems while
+    // the reference frame scaling happens.
+    for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0];
+  } else {
+    yv12 = get_ref_frame_buffer(cpi, ref);
+  }
+
+  assert(yv12 != NULL);
+  if (!yv12) return;
+  vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL);
+  mi->ref_frame[0] = ref;
+  mi->ref_frame[1] = NO_REF_FRAME;
+  mi->sb_type = bsize;
+  vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+  vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method,
+                        sadpb, cond_cost_list(cpi, cost_list), &ref_mv,
+                        &best_mv, 0, 0);
+  best_mv.row *= 8;
+  best_mv.col *= 8;
+  x->mv_limits = tmp_mv_limits;
+  mi->mv[0].as_mv = best_mv;
+
+  // Restore reference buffer pointer.
+  if (scaled_ref_frame) {
+    for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i];
+  }
+
+  set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+  xd->plane[0].dst.buf = pred_buf;
+  xd->plane[0].dst.stride = 64;
+  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+}
+
+// Use a neural net model to prune partition-none and partition-split search.
+// Features used: QP; spatial block size contexts; variance of prediction
+// residue after simple_motion_search.
+#define FEATURES 12
+static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi,
+                                           MACROBLOCK *const x,
+                                           PC_TREE *const pc_tree,
+                                           BLOCK_SIZE bsize, int mi_row,
+                                           int mi_col, int *none, int *split) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const NN_CONFIG *nn_config = NULL;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]);
+  uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+                                ? (CONVERT_TO_BYTEPTR(pred_buffer))
+                                : pred_buffer;
+#else
+  DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]);
+  uint8_t *const pred_buf = pred_buffer;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  const int speed = cpi->oxcf.speed;
+  float thresh = 0.0f;
+
+  switch (bsize) {
+    case BLOCK_64X64:
+      nn_config = &vp9_part_split_nnconfig_64;
+      thresh = speed > 0 ? 2.8f : 3.0f;
+      break;
+    case BLOCK_32X32:
+      nn_config = &vp9_part_split_nnconfig_32;
+      thresh = speed > 0 ? 3.5f : 3.0f;
+      break;
+    case BLOCK_16X16:
+      nn_config = &vp9_part_split_nnconfig_16;
+      thresh = speed > 0 ? 3.8f : 4.0f;
+      break;
+    case BLOCK_8X8:
+      nn_config = &vp9_part_split_nnconfig_8;
+      if (cm->width >= 720 && cm->height >= 720)
+        thresh = speed > 0 ? 2.5f : 2.0f;
+      else
+        thresh = speed > 0 ? 3.8f : 2.0f;
+      break;
+    default: assert(0 && "Unexpected block size."); return;
+  }
+
+  if (!nn_config) return;
+
+  // Do a simple single motion search to find a prediction for current block.
+  // The variance of the residue will be used as input features.
+  {
+    MV ref_mv;
+    const MV_REFERENCE_FRAME ref =
+        cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+    // If bsize is 64x64, use zero MV as reference; otherwise, use MV result
+    // of previous(larger) block as reference.
+    if (bsize == BLOCK_64X64)
+      ref_mv.row = ref_mv.col = 0;
+    else
+      ref_mv = pc_tree->mv;
+    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+    simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf);
+    pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv;
+  }
+
+  vpx_clear_system_state();
+
+  {
+    float features[FEATURES] = { 0.0f };
+#if CONFIG_VP9_HIGHBITDEPTH
+    const int dc_q =
+        vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8);
+#else
+    const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    int feature_idx = 0;
+    float score;
+
+    // Generate model input features.
+    features[feature_idx++] = logf((float)dc_q + 1.0f);
+
+    // Get the variance of the residue as input features.
+    {
+      const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+      const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+      const uint8_t *pred = pred_buf;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      // Variance of whole block.
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+      const int has_above = !!xd->above_mi;
+      const int has_left = !!xd->left_mi;
+      const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize;
+      const BLOCK_SIZE left_bsize = has_left ? xd->left_mi->sb_type : bsize;
+      int i;
+
+      features[feature_idx++] = (float)has_above;
+      features[feature_idx++] = (float)b_width_log2_lookup[above_bsize];
+      features[feature_idx++] = (float)b_height_log2_lookup[above_bsize];
+      features[feature_idx++] = (float)has_left;
+      features[feature_idx++] = (float)b_width_log2_lookup[left_bsize];
+      features[feature_idx++] = (float)b_height_log2_lookup[left_bsize];
+      features[feature_idx++] = logf((float)var + 1.0f);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+      }
+    }
+    assert(feature_idx == FEATURES);
+
+    // Feed the features into the model to get the confidence score.
+    nn_predict(features, nn_config, &score);
+
+    // Higher score means that the model has higher confidence that the split
+    // partition is better than the non-split partition. So if the score is
+    // high enough, we skip the none-split partition search; if the score is
+    // low enough, we skip the split partition search.
+    if (score > thresh) *none = 0;
+    if (score < -thresh) *split = 0;
+  }
+}
+#undef FEATURES
+#endif  // !CONFIG_REALTIME_ONLY
+
+static double log_wiener_var(int64_t wiener_variance) {
+  return log(1.0 + wiener_variance) / log(2.0);
+}
+
+static void build_kmeans_segmentation(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  BLOCK_SIZE bsize = BLOCK_64X64;
+  KMEANS_DATA *kmeans_data;
+
+  vp9_disable_segmentation(&cm->seg);
+  if (cm->show_frame) {
+    int mi_row, mi_col;
+    cpi->kmeans_data_size = 0;
+    cpi->kmeans_ctr_num = 8;
+
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+        int mb_row_start = mi_row >> 1;
+        int mb_col_start = mi_col >> 1;
+        int mb_row_end = VPXMIN(
+            (mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows);
+        int mb_col_end = VPXMIN(
+            (mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols);
+        int row, col;
+        int64_t wiener_variance = 0;
+
+        for (row = mb_row_start; row < mb_row_end; ++row)
+          for (col = mb_col_start; col < mb_col_end; ++col)
+            wiener_variance += cpi->mb_wiener_variance[row * cm->mb_cols + col];
+
+        wiener_variance /=
+            (mb_row_end - mb_row_start) * (mb_col_end - mb_col_start);
+
+#if CONFIG_MULTITHREAD
+        pthread_mutex_lock(&cpi->kmeans_mutex);
+#endif  // CONFIG_MULTITHREAD
+
+        kmeans_data = &cpi->kmeans_data_arr[cpi->kmeans_data_size++];
+        kmeans_data->value = log_wiener_var(wiener_variance);
+        kmeans_data->pos = mi_row * cpi->kmeans_data_stride + mi_col;
+#if CONFIG_MULTITHREAD
+        pthread_mutex_unlock(&cpi->kmeans_mutex);
+#endif  // CONFIG_MULTITHREAD
+      }
+    }
+
+    vp9_kmeans(cpi->kmeans_ctr_ls, cpi->kmeans_boundary_ls,
+               cpi->kmeans_count_ls, cpi->kmeans_ctr_num, cpi->kmeans_data_arr,
+               cpi->kmeans_data_size);
+
+    vp9_perceptual_aq_mode_setup(cpi, &cm->seg);
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int wiener_var_segment(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                              int mi_col) {
+  VP9_COMMON *cm = &cpi->common;
+  int mb_row_start = mi_row >> 1;
+  int mb_col_start = mi_col >> 1;
+  int mb_row_end =
+      VPXMIN((mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows);
+  int mb_col_end =
+      VPXMIN((mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols);
+  int row, col, idx;
+  int64_t wiener_variance = 0;
+  int segment_id;
+  int8_t seg_hist[MAX_SEGMENTS] = { 0 };
+  int8_t max_count = 0, max_index = -1;
+
+  vpx_clear_system_state();
+
+  assert(cpi->norm_wiener_variance > 0);
+
+  for (row = mb_row_start; row < mb_row_end; ++row) {
+    for (col = mb_col_start; col < mb_col_end; ++col) {
+      wiener_variance = cpi->mb_wiener_variance[row * cm->mb_cols + col];
+      segment_id =
+          vp9_get_group_idx(log_wiener_var(wiener_variance),
+                            cpi->kmeans_boundary_ls, cpi->kmeans_ctr_num);
+      ++seg_hist[segment_id];
+    }
+  }
+
+  for (idx = 0; idx < cpi->kmeans_ctr_num; ++idx) {
+    if (seg_hist[idx] > max_count) {
+      max_count = seg_hist[idx];
+      max_index = idx;
+    }
+  }
+
+  assert(max_index >= 0);
+  segment_id = max_index;
+
+  return segment_id;
+}
+
+static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+                            int mi_col, int orig_rdmult) {
+  const int gf_group_index = cpi->twopass.gf_group.index;
+  int64_t intra_cost = 0;
+  int64_t mc_dep_cost = 0;
+  int mi_wide = num_8x8_blocks_wide_lookup[bsize];
+  int mi_high = num_8x8_blocks_high_lookup[bsize];
+  int row, col;
+
+  int dr = 0;
+  double r0, rk, beta;
+
+  TplDepFrame *tpl_frame;
+  TplDepStats *tpl_stats;
+  int tpl_stride;
+
+  if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult;
+  tpl_frame = &cpi->tpl_stats[gf_group_index];
+
+  if (tpl_frame->is_valid == 0) return orig_rdmult;
+  tpl_stats = tpl_frame->tpl_stats_ptr;
+  tpl_stride = tpl_frame->stride;
+
+  if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult;
+
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+    int sb_size = num_8x8_blocks_wide_lookup[BLOCK_64X64] * MI_SIZE;
+    int sb_stride = (cpi->common.width + sb_size - 1) / sb_size;
+    int sby = mi_row / 8;
+    int sbx = mi_col / 8;
+    return (int)((cpi->sb_mul_scale[sby * sb_stride + sbx] * orig_rdmult) /
+                 256);
+  }
+
+  for (row = mi_row; row < mi_row + mi_high; ++row) {
+    for (col = mi_col; col < mi_col + mi_wide; ++col) {
+      TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+
+      if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue;
+
+      intra_cost += this_stats->intra_cost;
+      mc_dep_cost += this_stats->mc_dep_cost;
+    }
+  }
+
+  vpx_clear_system_state();
+
+  r0 = cpi->rd.r0;
+  rk = (double)intra_cost / mc_dep_cost;
+  beta = r0 / rk;
+  dr = vp9_get_adaptive_rdmult(cpi, beta);
+
+  dr = clamp(dr, orig_rdmult * 1 / 2, orig_rdmult * 3 / 2);
+  dr = VPXMAX(1, dr);
+
+  return dr;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
-                              TileDataEnc *tile_data, TOKENEXTRA **tp,
-                              int mi_row, int mi_col, BLOCK_SIZE bsize,
-                              RD_COST *rd_cost, int64_t best_rd,
-                              PC_TREE *pc_tree) {
+static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+                             TileDataEnc *tile_data, TOKENEXTRA **tp,
+                             int mi_row, int mi_col, BLOCK_SIZE bsize,
+                             RD_COST *rd_cost, RD_COST best_rdc,
+                             PC_TREE *pc_tree) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2621,11 +3678,11 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
-  PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+  PICK_MODE_CONTEXT *const ctx = &pc_tree->none;
   int i;
   const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   BLOCK_SIZE subsize;
-  RD_COST this_rdc, sum_rdc, best_rdc;
+  RD_COST this_rdc, sum_rdc;
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
   INTERP_FILTER pred_interp_filter;
@@ -2639,37 +3696,43 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   BLOCK_SIZE min_size = x->min_partition_size;
   BLOCK_SIZE max_size = x->max_partition_size;
 
-#if CONFIG_FP_MB_STATS
-  unsigned int src_diff_var = UINT_MAX;
-  int none_complexity = 0;
-#endif
-
   int partition_none_allowed = !force_horz_split && !force_vert_split;
   int partition_horz_allowed =
       !force_vert_split && yss <= xss && bsize >= BLOCK_8X8;
   int partition_vert_allowed =
       !force_horz_split && xss <= yss && bsize >= BLOCK_8X8;
 
-  int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
-  int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
+  int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist;
+  int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate;
+  int must_split = 0;
+  int should_encode_sb = 0;
+
+  // Ref frames picked in the [i_th] quarter subblock during square partition
+  // RD search. It may be used to prune ref frame selection of rect partitions.
+  uint8_t ref_frames_used[4] = { 0, 0, 0, 0 };
+
+  int partition_mul = x->cb_rdmult;
 
   (void)*tp_orig;
 
   assert(num_8x8_blocks_wide_lookup[bsize] ==
          num_8x8_blocks_high_lookup[bsize]);
 
-  // Adjust dist breakout threshold according to the partition size.
   dist_breakout_thr >>=
       8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
   rate_breakout_thr *= num_pels_log2_lookup[bsize];
 
   vp9_rd_cost_init(&this_rdc);
   vp9_rd_cost_init(&sum_rdc);
-  vp9_rd_cost_reset(&best_rdc);
-  best_rdc.rdcost = best_rd;
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
+  if (oxcf->tuning == VP8_TUNE_SSIM) {
+    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &partition_mul);
+  }
+  vp9_rd_cost_update(partition_mul, x->rddiv, &best_rdc);
+
   if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ &&
       cpi->oxcf.aq_mode != LOOKAHEAD_AQ)
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
@@ -2684,10 +3747,18 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
   }
 
+  // Get sub block energy range
+  if (bsize >= BLOCK_16X16) {
+    int min_energy, max_energy;
+    vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+                             &max_energy);
+    must_split = (min_energy < -3) && (max_energy - min_energy > 2);
+  }
+
   // Determine partition types in search according to the speed features.
   // The threshold set here has to be of square block size.
   if (cpi->sf.auto_min_max_partition_size) {
-    partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+    partition_none_allowed &= (bsize <= max_size);
     partition_horz_allowed &=
         ((bsize <= max_size && bsize > min_size) || force_horz_split);
     partition_vert_allowed &=
@@ -2696,7 +3767,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   }
 
   if (cpi->sf.use_square_partition_only &&
-      bsize > cpi->sf.use_square_only_threshold) {
+      (bsize > cpi->sf.use_square_only_thresh_high ||
+       bsize < cpi->sf.use_square_only_thresh_low)) {
     if (cpi->use_svc) {
       if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
         partition_horz_allowed &= force_horz_split;
@@ -2710,144 +3782,101 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-    src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
-                                                  mi_col, bsize);
-  }
-#endif
+  pc_tree->partitioning = PARTITION_NONE;
 
-#if CONFIG_FP_MB_STATS
-  // Decide whether we shall split directly and skip searching NONE by using
-  // the first pass block statistics
-  if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split &&
-      partition_none_allowed && src_diff_var > 4 &&
-      cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
-    int mb_row = mi_row >> 1;
-    int mb_col = mi_col >> 1;
-    int mb_row_end =
-        VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-    int mb_col_end =
-        VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-    int r, c;
-
-    // compute a complexity measure, basically measure inconsistency of motion
-    // vectors obtained from the first pass in the current block
-    for (r = mb_row; r < mb_row_end; r++) {
-      for (c = mb_col; c < mb_col_end; c++) {
-        const int mb_index = r * cm->mb_cols + c;
-
-        MOTION_DIRECTION this_mv;
-        MOTION_DIRECTION right_mv;
-        MOTION_DIRECTION bottom_mv;
-
-        this_mv =
-            get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
-
-        // to its right
-        if (c != mb_col_end - 1) {
-          right_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + 1]);
-          none_complexity += get_motion_inconsistency(this_mv, right_mv);
-        }
-
-        // to its bottom
-        if (r != mb_row_end - 1) {
-          bottom_mv = get_motion_direction_fp(
-              cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
-          none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
-        }
-
-        // do not count its left and top neighbors to avoid double counting
+  if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) {
+    const int do_rd_ml_partition_var_pruning =
+        partition_none_allowed && do_split &&
+        mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
+        mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
+    if (do_rd_ml_partition_var_pruning) {
+      ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
+                                     &partition_none_allowed, &do_split);
+      // ml_predict_var_rd_partitioning() may pruune out either
+      // partition_none_allowed or do_split, but we should keep the
+      // partition_none_allowed for 8x8 blocks unless disable_split_mask is
+      // off (0).
+      if (bsize == BLOCK_8X8 && cpi->sf.disable_split_mask &&
+          partition_none_allowed == 0) {
+        partition_none_allowed = 1;
       }
+    } else {
+      vp9_zero(pc_tree->mv);
     }
-
-    if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
-      partition_none_allowed = 0;
+    if (bsize > BLOCK_8X8) {  // Store MV result as reference for subblocks.
+      for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv;
     }
   }
-#endif
 
   // PARTITION_NONE
   if (partition_none_allowed) {
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx,
-                     best_rdc.rdcost);
+                     best_rdc.rate, best_rdc.dist);
+    ctx->rdcost = this_rdc.rdcost;
     if (this_rdc.rate != INT_MAX) {
+      if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+        const int ref1 = ctx->mic.ref_frame[0];
+        const int ref2 = ctx->mic.ref_frame[1];
+        for (i = 0; i < 4; ++i) {
+          ref_frames_used[i] |= (1 << ref1);
+          if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+        }
+      }
       if (bsize >= BLOCK_8X8) {
         this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
-        this_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+        vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc);
       }
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
+        MODE_INFO *mi = xd->mi[0];
+
         best_rdc = this_rdc;
+        should_encode_sb = 1;
         if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
 
-        // If all y, u, v transform blocks in this partition are skippable, and
-        // the dist & rate are within the thresholds, the partition search is
-        // terminated for current branch of the partition search tree.
-        if (!x->e_mbd.lossless && ctx->skippable &&
-            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
-             (best_rdc.dist < dist_breakout_thr &&
-              best_rdc.rate < rate_breakout_thr))) {
-          do_split = 0;
-          do_rect = 0;
-        }
-
-#if CONFIG_FP_MB_STATS
-        // Check if every 16x16 first pass block statistics has zero
-        // motion and the corresponding first pass residue is small enough.
-        // If that is the case, check the difference variance between the
-        // current frame and the last frame. If the variance is small enough,
-        // stop further splitting in RD optimization
-        if (cpi->use_fp_mb_stats && do_split != 0 &&
-            cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
-          int mb_row = mi_row >> 1;
-          int mb_col = mi_col >> 1;
-          int mb_row_end =
-              VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
-          int mb_col_end =
-              VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
-          int r, c;
-
-          int skip = 1;
-          for (r = mb_row; r < mb_row_end; r++) {
-            for (c = mb_col; c < mb_col_end; c++) {
-              const int mb_index = r * cm->mb_cols + c;
-              if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_MOTION_ZERO_MASK) ||
-                  !(cpi->twopass.this_frame_mb_stats[mb_index] &
-                    FPMB_ERROR_SMALL_MASK)) {
-                skip = 0;
-                break;
-              }
-            }
-            if (skip == 0) {
-              break;
-            }
-          }
-
-          if (skip) {
-            if (src_diff_var == UINT_MAX) {
-              set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-              src_diff_var = get_sby_perpixel_diff_variance(
-                  cpi, &x->plane[0].src, mi_row, mi_col, bsize);
-            }
-            if (src_diff_var < 8) {
+        if (cpi->sf.rd_ml_partition.search_early_termination) {
+          // Currently, the machine-learning based partition search early
+          // termination is only used while bsize is 16x16, 32x32 or 64x64,
+          // VPXMIN(cm->width, cm->height) >= 480, and speed = 0.
+          if (!x->e_mbd.lossless &&
+              !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) &&
+              ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
+            if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) {
               do_split = 0;
               do_rect = 0;
             }
           }
         }
-#endif
+
+        if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) {
+          const int use_ml_based_breakout =
+              cpi->sf.rd_ml_partition.search_breakout && cm->base_qindex >= 100;
+          if (use_ml_based_breakout) {
+            if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) {
+              do_split = 0;
+              do_rect = 0;
+            }
+          } else {
+            if (!cpi->sf.rd_ml_partition.search_early_termination) {
+              if ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+                  (best_rdc.dist < dist_breakout_thr &&
+                   best_rdc.rate < rate_breakout_thr)) {
+                do_split = 0;
+                do_rect = 0;
+              }
+            }
+          }
+        }
       }
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  } else {
+    vp9_zero(ctx->pred_mv);
+    ctx->mic.interp_filter = EIGHTTAP;
   }
 
   // store estimated motion vector
-  if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx);
+  store_pred_mv(x, ctx);
 
   // If the interp_filter is marked as SWITCHABLE_FILTERS, it was for an
   // intra block and used for context purposes.
@@ -2860,112 +3889,192 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_SPLIT
   // TODO(jingning): use the motion vectors given by the above search as
   // the starting point of motion search in the following partition type check.
-  if (do_split) {
+  pc_tree->u.split[0]->none.rdcost = 0;
+  pc_tree->u.split[1]->none.rdcost = 0;
+  pc_tree->u.split[2]->none.rdcost = 0;
+  pc_tree->u.split[3]->none.rdcost = 0;
+  if (do_split || must_split) {
     subsize = get_subsize(bsize, PARTITION_SPLIT);
+    load_pred_mv(x, ctx);
     if (bsize == BLOCK_8X8) {
       i = 4;
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
-        pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
+        pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                       pc_tree->leaf_split[0], best_rdc.rdcost);
-
-      if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX;
+                       pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist);
+      if (sum_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+          const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0];
+          const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1];
+          for (i = 0; i < 4; ++i) {
+            ref_frames_used[i] |= (1 << ref1);
+            if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+          }
+        }
+      }
     } else {
-      for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+      for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split);
+           ++i) {
         const int x_idx = (i & 1) * mi_step;
         const int y_idx = (i >> 1) * mi_step;
+        int found_best_rd = 0;
+        RD_COST best_rdc_split;
+        vp9_rd_cost_reset(&best_rdc_split);
+
+        if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+          // A must split test here increases the number of sub
+          // partitions but hurts metrics results quite a bit,
+          // so this extra test is commented out pending
+          // further tests on whether it adds much in terms of
+          // visual quality.
+          // (must_split) ? best_rdc.rate
+          //              : best_rdc.rate - sum_rdc.rate,
+          // (must_split) ? best_rdc.dist
+          //              : best_rdc.dist - sum_rdc.dist,
+          best_rdc_split.rate = best_rdc.rate - sum_rdc.rate;
+          best_rdc_split.dist = best_rdc.dist - sum_rdc.dist;
+        }
 
         if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
           continue;
 
-        if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+        pc_tree->u.split[i]->index = i;
+        if (cpi->sf.prune_ref_frame_for_rect_partitions)
+          pc_tree->u.split[i]->none.rate = INT_MAX;
+        found_best_rd = rd_pick_partition(
+            cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+            &this_rdc, best_rdc_split, pc_tree->u.split[i]);
 
-        pc_tree->split[i]->index = i;
-        rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                          mi_col + x_idx, subsize, &this_rdc,
-                          best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
-
-        if (this_rdc.rate == INT_MAX) {
+        if (found_best_rd == 0) {
           sum_rdc.rdcost = INT64_MAX;
           break;
         } else {
+          if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+              pc_tree->u.split[i]->none.rate != INT_MAX) {
+            const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0];
+            const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1];
+            ref_frames_used[i] |= (1 << ref1);
+            if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+          }
           sum_rdc.rate += this_rdc.rate;
           sum_rdc.dist += this_rdc.dist;
-          sum_rdc.rdcost += this_rdc.rdcost;
+          vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
         }
       }
     }
 
-    if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
+    if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) {
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+      vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
 
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
+      if ((sum_rdc.rdcost < best_rdc.rdcost) ||
+          (must_split && (sum_rdc.dist < best_rdc.dist))) {
         best_rdc = sum_rdc;
+        should_encode_sb = 1;
         pc_tree->partitioning = PARTITION_SPLIT;
 
         // Rate and distortion based partition search termination clause.
-        if (!x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
-                                   (best_rdc.dist < dist_breakout_thr &&
-                                    best_rdc.rate < rate_breakout_thr))) {
+        if (!cpi->sf.rd_ml_partition.search_early_termination &&
+            !x->e_mbd.lossless &&
+            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+             (best_rdc.dist < dist_breakout_thr &&
+              best_rdc.rate < rate_breakout_thr))) {
           do_rect = 0;
         }
       }
     } else {
       // skip rectangular partition test when larger block size
       // gives better rd cost
-      if ((cpi->sf.less_rectangular_check) &&
-          ((bsize > cpi->sf.use_square_only_threshold) ||
-           (best_rdc.dist < dist_breakout_thr)))
+      if (cpi->sf.less_rectangular_check &&
+          (bsize > cpi->sf.use_square_only_thresh_high ||
+           best_rdc.dist < dist_breakout_thr))
         do_rect &= !partition_none_allowed;
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
+  pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+  pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+  pc_tree->vertical[0].skip_ref_frame_mask = 0;
+  pc_tree->vertical[1].skip_ref_frame_mask = 0;
+  if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+    uint8_t used_frames;
+    used_frames = ref_frames_used[0] | ref_frames_used[1];
+    if (used_frames) {
+      pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames & 0xff;
+    }
+    used_frames = ref_frames_used[2] | ref_frames_used[3];
+    if (used_frames) {
+      pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames & 0xff;
+    }
+    used_frames = ref_frames_used[0] | ref_frames_used[2];
+    if (used_frames) {
+      pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames & 0xff;
+    }
+    used_frames = ref_frames_used[1] | ref_frames_used[3];
+    if (used_frames) {
+      pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames & 0xff;
+    }
+  }
+
+  {
+    const int do_ml_rect_partition_pruning =
+        !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split &&
+        (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8;
+    if (do_ml_rect_partition_pruning) {
+      ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed,
+                              &partition_vert_allowed, best_rdc.rdcost);
+    }
+  }
+
   // PARTITION_HORZ
   if (partition_horz_allowed &&
       (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+    const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ];
     subsize = get_subsize(bsize, PARTITION_HORZ);
-    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+    load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter;
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->horizontal[0], best_rdc.rdcost);
+                     &pc_tree->horizontal[0], best_rdc.rate - part_mode_rate,
+                     best_rdc.dist);
+    if (sum_rdc.rdcost < INT64_MAX) {
+      sum_rdc.rate += part_mode_rate;
+      vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+    }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
         bsize > BLOCK_8X8) {
-      PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
-      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
-
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+      PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
+      update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
                        subsize, &pc_tree->horizontal[1],
-                       best_rdc.rdcost - sum_rdc.rdcost);
+                       best_rdc.rate - sum_rdc.rate,
+                       best_rdc.dist - sum_rdc.dist);
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
+        vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
       }
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_HORZ;
+      best_rdc = sum_rdc;
+      should_encode_sb = 1;
+      pc_tree->partitioning = PARTITION_HORZ;
 
-        if ((cpi->sf.less_rectangular_check) &&
-            (bsize > cpi->sf.use_square_only_threshold))
-          do_rect = 0;
-      }
+      if (cpi->sf.less_rectangular_check &&
+          bsize > cpi->sf.use_square_only_thresh_high)
+        do_rect = 0;
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
@@ -2973,59 +4082,74 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_VERT
   if (partition_vert_allowed &&
       (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+    const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT];
     subsize = get_subsize(bsize, PARTITION_VERT);
-
-    if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+    load_pred_mv(x, ctx);
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter = pred_interp_filter;
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->vertical[0], best_rdc.rdcost);
+                     &pc_tree->vertical[0], best_rdc.rate - part_mode_rate,
+                     best_rdc.dist);
+    if (sum_rdc.rdcost < INT64_MAX) {
+      sum_rdc.rate += part_mode_rate;
+      vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+    }
+
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
         bsize > BLOCK_8X8) {
       update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
                         &pc_tree->vertical[0]);
-
-      if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter = pred_interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
                        subsize, &pc_tree->vertical[1],
-                       best_rdc.rdcost - sum_rdc.rdcost);
+                       best_rdc.rate - sum_rdc.rate,
+                       best_rdc.dist - sum_rdc.dist);
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
-        sum_rdc.rdcost += this_rdc.rdcost;
+        vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
       }
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
-      if (sum_rdc.rdcost < best_rdc.rdcost) {
-        best_rdc = sum_rdc;
-        pc_tree->partitioning = PARTITION_VERT;
-      }
+      best_rdc = sum_rdc;
+      should_encode_sb = 1;
+      pc_tree->partitioning = PARTITION_VERT;
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
 
-  // TODO(jbb): This code added so that we avoid static analysis
-  // warning related to the fact that best_rd isn't used after this
-  // point.  This code should be refactored so that the duplicate
-  // checks occur in some sub function and thus are used...
-  (void)best_rd;
+  if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) {
+    vp9_rd_cost_reset(&this_rdc);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64,
+                     ctx, INT_MAX, INT64_MAX);
+    ctx->rdcost = this_rdc.rdcost;
+    vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc);
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      should_encode_sb = 1;
+      pc_tree->partitioning = PARTITION_NONE;
+    }
+  }
+
   *rd_cost = best_rdc;
 
-  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
-      pc_tree->index != 3) {
+  if (should_encode_sb && pc_tree->index != 3) {
     int output_enabled = (bsize == BLOCK_64X64);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, encode_sb_time);
+#endif
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, encode_sb_time);
+#endif
   }
 
   if (bsize == BLOCK_64X64) {
@@ -3035,6 +4159,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   } else {
     assert(tp_orig == *tp);
   }
+
+  return should_encode_sb;
 }
 
 static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
@@ -3048,23 +4174,33 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
   const int mi_col_start = tile_info->mi_col_start;
   const int mi_col_end = tile_info->mi_col_end;
   int mi_col;
+  const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  const int num_sb_cols =
+      get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2);
+  int sb_col_in_tile;
 
   // Initialize the left context for the new SB row
   memset(&xd->left_context, 0, sizeof(xd->left_context));
   memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
+  for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end;
+       mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) {
     const struct segmentation *const seg = &cm->seg;
     int dummy_rate;
     int64_t dummy_dist;
     RD_COST dummy_rdc;
     int i;
     int seg_skip = 0;
+    int orig_rdmult = cpi->rd.RDMULT;
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
 
+    vp9_rd_cost_reset(&dummy_rdc);
+    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
+                                   sb_col_in_tile);
+
     if (sf->adaptive_pred_interp_filter) {
       for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
 
@@ -3076,7 +4212,10 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
       }
     }
 
-    vp9_zero(x->pred_mv);
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      x->pred_mv[i].row = INT16_MAX;
+      x->pred_mv[i].col = INT16_MAX;
+    }
     td->pc_root->index = 0;
 
     if (seg->enabled) {
@@ -3087,6 +4226,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
     }
 
     x->source_variance = UINT_MAX;
+
+    x->cb_rdmult = orig_rdmult;
+
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
       const BLOCK_SIZE bsize =
           seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
@@ -3094,30 +4236,46 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, td->pc_root);
-    } else if (cpi->partition_search_skippable_frame) {
-      BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
-      bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-      set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-      rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
-                       &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
                cm->frame_type != KEY_FRAME) {
       choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
                        &dummy_rate, &dummy_dist, 1, td->pc_root);
     } else {
+      if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) {
+        int dr =
+            get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult);
+        x->cb_rdmult = dr;
+      }
+
+      if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ && cm->show_frame) {
+        x->segment_id = wiener_var_segment(cpi, BLOCK_64X64, mi_row, mi_col);
+        x->cb_rdmult = vp9_compute_rd_mult(
+            cpi, vp9_get_qindex(&cm->seg, x->segment_id, cm->base_qindex));
+      }
+
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
         set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
         rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
                                 &x->min_partition_size, &x->max_partition_size);
       }
+      td->pc_root->none.rdcost = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, rd_pick_partition_time);
+#endif
       rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rdc, INT64_MAX, td->pc_root);
+                        &dummy_rdc, dummy_rdc, td->pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, rd_pick_partition_time);
+#endif
     }
+    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
+                                    sb_col_in_tile, num_sb_cols);
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   MACROBLOCK *const x = &cpi->td.mb;
@@ -3189,12 +4347,42 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
 static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
                                      RD_COST *rd_cost, BLOCK_SIZE bsize,
                                      PICK_MODE_CONTEXT *ctx) {
-  if (bsize < BLOCK_16X16)
+  if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16)
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
   else
     vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
 }
 
+static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x,
+                                        RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        TileDataEnc *tile_data, int mi_row,
+                                        int mi_col) {
+  if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) {
+    vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+  } else {
+    if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF)
+      vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+    else if (bsize >= BLOCK_8X8)
+      vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize,
+                          ctx);
+    else
+      vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx);
+  }
+}
+
+static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x,
+                                       RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                       PICK_MODE_CONTEXT *ctx,
+                                       TileDataEnc *tile_data, int mi_row,
+                                       int mi_col) {
+  if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) {
+    vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+  } else {
+    vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx);
+  }
+}
+
 static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
                                 MACROBLOCK *const x, int mi_row, int mi_col,
                                 RD_COST *rd_cost, BLOCK_SIZE bsize,
@@ -3210,6 +4398,11 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
   int plane;
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
+
+  x->skip_recode = 0;
+
   mi = xd->mi[0];
   mi->sb_type = bsize;
 
@@ -3225,14 +4418,23 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     if (cyclic_refresh_segment_id_boosted(mi->segment_id))
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
 
-  if (cm->frame_type == KEY_FRAME)
+  if (frame_is_intra_only(cm))
     hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+  else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+    hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
+                                mi_col);
   else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
-  else if (bsize >= BLOCK_8X8)
-    vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx);
-  else
+    set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize);
+  else if (bsize >= BLOCK_8X8) {
+    if (cpi->rc.hybrid_intra_scene_change)
+      hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
+                                 mi_col);
+    else
+      vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize,
+                          ctx);
+  } else {
     vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx);
+  }
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
 
@@ -3294,13 +4496,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
       }
       break;
     case PARTITION_SPLIT: {
-      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+      fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]);
       fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
-                        pc_tree->split[1]);
+                        pc_tree->u.split[1]);
       fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
-                        pc_tree->split[2]);
+                        pc_tree->u.split[2]);
       fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
-                        pc_tree->split[3]);
+                        pc_tree->u.split[3]);
       break;
     }
     default: break;
@@ -3318,10 +4520,81 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
   if (bsize > BLOCK_8X8) {
     BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
     int i;
-    for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize);
+    for (i = 0; i < 4; ++i)
+      pred_pixel_ready_reset(pc_tree->u.split[i], subsize);
   }
 }
 
+#define FEATURES 6
+#define LABELS 2
+static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, int mi_row,
+                                       int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  const NN_CONFIG *nn_config = NULL;
+
+  switch (bsize) {
+    case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break;
+    case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break;
+    case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break;
+    case BLOCK_8X8: break;
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  if (!nn_config) return -1;
+
+  vpx_clear_system_state();
+
+  {
+    const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+    float features[FEATURES] = { 0.0f };
+    const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+    int feature_idx = 0;
+    float score[LABELS];
+
+    features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
+    vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+    {
+      const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+      const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 8 * (mi_row & 7);
+      const int sb_offset_col = 8 * (mi_col & 7);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = logf((float)var + 1.0f);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+      }
+    }
+
+    assert(feature_idx == FEATURES);
+    nn_predict(features, nn_config, score);
+    if (score[0] > thresh) return PARTITION_SPLIT;
+    if (score[0] < -thresh) return PARTITION_NONE;
+    return -1;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
 static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                                  TileDataEnc *tile_data, TOKENEXTRA **tp,
                                  int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -3351,8 +4624,14 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       !force_vert_split && yss <= xss && bsize >= BLOCK_8X8;
   int partition_vert_allowed =
       !force_horz_split && xss <= yss && bsize >= BLOCK_8X8;
+  const int use_ml_based_partitioning =
+      sf->partition_search_type == ML_BASED_PARTITION;
+
   (void)*tp_orig;
 
+  // Avoid checking for rectangular partitions for speed >= 5.
+  if (cpi->oxcf.speed >= 5) do_rect = 0;
+
   assert(num_8x8_blocks_wide_lookup[bsize] ==
          num_8x8_blocks_high_lookup[bsize]);
 
@@ -3378,6 +4657,18 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     partition_vert_allowed &= force_vert_split;
   }
 
+  if (use_ml_based_partitioning) {
+    if (partition_none_allowed || do_split) do_rect = 0;
+    if (partition_none_allowed && do_split) {
+      const int ml_predicted_partition =
+          ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
+      if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+      if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+    }
+  }
+
+  if (!partition_none_allowed && !do_split) do_rect = 1;
+
   ctx->pred_pixel_ready =
       !(partition_vert_allowed || partition_horz_allowed || do_split);
 
@@ -3391,26 +4682,25 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     ctx->skip = x->skip;
 
     if (this_rdc.rate != INT_MAX) {
-      int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
       this_rdc.rdcost =
           RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
       if (this_rdc.rdcost < best_rdc.rdcost) {
-        int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
-        int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
-
-        dist_breakout_thr >>=
-            8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
-
-        rate_breakout_thr *= num_pels_log2_lookup[bsize];
-
         best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
 
-        if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr &&
-            this_rdc.dist < dist_breakout_thr) {
-          do_split = 0;
-          do_rect = 0;
+        if (!use_ml_based_partitioning) {
+          int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist;
+          int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate;
+          dist_breakout_thr >>=
+              8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+          rate_breakout_thr *= num_pels_log2_lookup[bsize];
+          if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr &&
+              this_rdc.dist < dist_breakout_thr) {
+            do_split = 0;
+            do_rect = 0;
+          }
         }
       }
     }
@@ -3432,9 +4722,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
       load_pred_mv(x, ctx);
-      nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
-                           mi_col + x_idx, subsize, &this_rdc, 0,
-                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+      nonrd_pick_partition(
+          cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+          &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]);
 
       if (this_rdc.rate == INT_MAX) {
         vp9_rd_cost_reset(&sum_rdc);
@@ -3458,7 +4748,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_HORZ
   if (partition_horz_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_HORZ);
-    if (sf->adaptive_motion_search) load_pred_mv(x, ctx);
+    load_pred_mv(x, ctx);
     pc_tree->horizontal[0].pred_pixel_ready = 1;
     nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->horizontal[0]);
@@ -3502,7 +4792,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   // PARTITION_VERT
   if (partition_vert_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_VERT);
-    if (sf->adaptive_motion_search) load_pred_mv(x, ctx);
+    load_pred_mv(x, ctx);
     pc_tree->vertical[0].pred_pixel_ready = 1;
     nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->vertical[0]);
@@ -3580,6 +4870,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
   RD_COST this_rdc;
+  BLOCK_SIZE subsize_ref =
+      (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16;
 
   vp9_rd_cost_reset(&this_rdc);
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
@@ -3593,7 +4885,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
     nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
                          0, INT64_MAX, pc_tree);
   } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
-             subsize >= BLOCK_16X16) {
+             subsize >= subsize_ref) {
     x->max_partition_size = BLOCK_32X32;
     x->min_partition_size = BLOCK_8X8;
     nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
@@ -3660,14 +4952,15 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
           }
         }
         break;
-      case PARTITION_SPLIT:
+      default:
+        assert(partition == PARTITION_SPLIT);
         subsize = get_subsize(bsize, PARTITION_SPLIT);
         nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                                subsize, output_enabled, rd_cost,
-                               pc_tree->split[0]);
+                               pc_tree->u.split[0]);
         nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
                                mi_col + hbs, subsize, output_enabled, &this_rdc,
-                               pc_tree->split[1]);
+                               pc_tree->u.split[1]);
         if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
             rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
           rd_cost->rate += this_rdc.rate;
@@ -3675,7 +4968,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
         }
         nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
                                mi_row + hbs, mi_col, subsize, output_enabled,
-                               &this_rdc, pc_tree->split[2]);
+                               &this_rdc, pc_tree->u.split[2]);
         if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
             rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
           rd_cost->rate += this_rdc.rate;
@@ -3683,14 +4976,13 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
         }
         nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
                                mi_row + hbs, mi_col + hbs, subsize,
-                               output_enabled, &this_rdc, pc_tree->split[3]);
+                               output_enabled, &this_rdc, pc_tree->u.split[3]);
         if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
             rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
           rd_cost->rate += this_rdc.rate;
           rd_cost->dist += this_rdc.dist;
         }
         break;
-      default: assert(0 && "Invalid partition type."); break;
     }
   }
 
@@ -3779,34 +5071,132 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
                     output_enabled, subsize, &pc_tree->horizontal[1]);
       }
       break;
-    case PARTITION_SPLIT:
+    default:
+      assert(partition == PARTITION_SPLIT);
       subsize = get_subsize(bsize, PARTITION_SPLIT);
       if (bsize == BLOCK_8X8) {
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
-                            subsize, pc_tree->leaf_split[0]);
+                            subsize, pc_tree->u.leaf_split[0]);
         encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
-                    subsize, pc_tree->leaf_split[0]);
+                    subsize, pc_tree->u.leaf_split[0]);
       } else {
         nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize,
-                            output_enabled, dummy_cost, pc_tree->split[0]);
+                            output_enabled, dummy_cost, pc_tree->u.split[0]);
         nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
                             mi_col + hbs, subsize, output_enabled, dummy_cost,
-                            pc_tree->split[1]);
+                            pc_tree->u.split[1]);
         nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
                             mi_row + hbs, mi_col, subsize, output_enabled,
-                            dummy_cost, pc_tree->split[2]);
+                            dummy_cost, pc_tree->u.split[2]);
         nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
                             mi_row + hbs, mi_col + hbs, subsize, output_enabled,
-                            dummy_cost, pc_tree->split[3]);
+                            dummy_cost, pc_tree->u.split[3]);
       }
       break;
-    default: assert(0 && "Invalid partition type."); break;
   }
 
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
+// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock.
+static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
+                               MACROBLOCK *x, int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int is_key_frame = frame_is_intra_only(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+  if (!is_key_frame) {
+    MODE_INFO *mi = xd->mi[0];
+    YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 +
+                             (mi_row + 4 < cm->mi_rows);
+    unsigned int y_sad_g, y_sad_thr;
+    unsigned int y_sad = UINT_MAX;
+
+    assert(yv12 != NULL);
+
+    if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
+        cpi->svc.use_gf_temporal_ref_current_layer) {
+      // For now, GOLDEN will not be used for non-zero spatial layers, since
+      // it may not be a temporal reference.
+      yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    }
+
+    // Only compute y_sad_g (sad for golden reference) for speed < 8.
+    if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 &&
+        (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+      vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      y_sad_g = cpi->fn_ptr[bsize].sdf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+          xd->plane[0].pre[0].stride);
+    } else {
+      y_sad_g = UINT_MAX;
+    }
+
+    if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+        cpi->rc.is_src_frame_alt_ref) {
+      yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME);
+      vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                           &cm->frame_refs[ALTREF_FRAME - 1].sf);
+      mi->ref_frame[0] = ALTREF_FRAME;
+      y_sad_g = UINT_MAX;
+    } else {
+      vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+                           &cm->frame_refs[LAST_FRAME - 1].sf);
+      mi->ref_frame[0] = LAST_FRAME;
+    }
+    mi->ref_frame[1] = NO_REF_FRAME;
+    mi->sb_type = BLOCK_64X64;
+    mi->mv[0].as_int = 0;
+    mi->interp_filter = BILINEAR;
+
+    {
+      const MV dummy_mv = { 0, 0 };
+      y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col,
+                                            &dummy_mv);
+      x->sb_use_mv_part = 1;
+      x->sb_mvcol_part = mi->mv[0].as_mv.col;
+      x->sb_mvrow_part = mi->mv[0].as_mv.row;
+    }
+
+    // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
+    // are close if short_circuit_low_temp_var is on.
+    y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
+    if (y_sad_g < y_sad_thr) {
+      vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+                           &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+      mi->ref_frame[0] = GOLDEN_FRAME;
+      mi->mv[0].as_int = 0;
+    } else {
+      x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+    }
+
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+    xd->plane[0].dst.buf = x->est_pred;
+    xd->plane[0].dst.stride = 64;
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+  } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (xd->bd) {
+      case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+      case 10:
+        memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+      case 12:
+        memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+        break;
+    }
+#else
+    memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+
 static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
                                 TileDataEnc *tile_data, int mi_row,
                                 TOKENEXTRA **tp) {
@@ -3818,13 +5208,18 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
   const int mi_col_start = tile_info->mi_col_start;
   const int mi_col_end = tile_info->mi_col_end;
   int mi_col;
+  const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  const int num_sb_cols =
+      get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2);
+  int sb_col_in_tile;
 
   // Initialize the left context for the new SB row
   memset(&xd->left_context, 0, sizeof(xd->left_context));
   memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
+  for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end;
+       mi_col += MI_BLOCK_SIZE, ++sb_col_in_tile) {
     const struct segmentation *const seg = &cm->seg;
     RD_COST dummy_rdc;
     const int idx_str = cm->mi_stride * mi_row + mi_col;
@@ -3832,18 +5227,73 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
     PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type;
     BLOCK_SIZE bsize = BLOCK_64X64;
     int seg_skip = 0;
+    int i;
+
+    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
+                                   sb_col_in_tile);
+
+    if (cpi->use_skin_detection) {
+      vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col);
+    }
+
     x->source_variance = UINT_MAX;
-    vp9_zero(x->pred_mv);
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      x->pred_mv[i].row = INT16_MAX;
+      x->pred_mv[i].col = INT16_MAX;
+    }
     vp9_rd_cost_init(&dummy_rdc);
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
     x->sb_is_skin = 0;
+    x->skip_low_source_sad = 0;
+    x->lowvar_highsumdiff = 0;
+    x->content_state_sb = 0;
+    x->zero_temp_sad_source = 0;
+    x->sb_use_mv_part = 0;
+    x->sb_mvcol_part = 0;
+    x->sb_mvrow_part = 0;
+    x->sb_pickmode_part = 0;
+    x->arf_frame_usage = 0;
+    x->lastgolden_frame_usage = 0;
+
+    if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
+      int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3);
+      int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+      int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2);
+      if (sf->adapt_partition_source_sad &&
+          (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref &&
+           source_sad > sf->adapt_partition_thresh &&
+           (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)))
+        partition_search_type = REFERENCE_PARTITION;
+    }
 
     if (seg->enabled) {
       const uint8_t *const map =
           seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
       int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+
+      if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+          cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY &&
+          x->content_state_sb > kLowSadLowSumdiff) {
+        // For ROI with skip, force segment = 0 (no skip) over whole
+        // superblock to avoid artifacts if temporal change in source_sad is
+        // not 0.
+        int xi, yi;
+        const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+        const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+        const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+        const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+        const int block_index = mi_row * cm->mi_cols + mi_col;
+        set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+        for (yi = 0; yi < ymis; yi++)
+          for (xi = 0; xi < xmis; xi++) {
+            int map_offset = block_index + yi * cm->mi_cols + xi;
+            cpi->segmentation_map[map_offset] = 0;
+          }
+        set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
+        seg_skip = 0;
+      }
       if (seg_skip) {
         partition_search_type = FIXED_PARTITION;
       }
@@ -3860,10 +5310,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
         nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                             BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         break;
-      case SOURCE_VAR_BASED_PARTITION:
-        set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col);
-        nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                            BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+      case ML_BASED_PARTITION:
+        get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+        x->max_partition_size = BLOCK_64X64;
+        x->min_partition_size = BLOCK_8X8;
+        x->sb_pickmode_part = 1;
+        nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+                             BLOCK_64X64, &dummy_rdc, 1, INT64_MAX,
+                             td->pc_root);
         break;
       case FIXED_PARTITION:
         if (!seg_skip) bsize = sf->always_this_block_size;
@@ -3871,17 +5325,17 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
         nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                             BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
         break;
-      case REFERENCE_PARTITION:
+      default:
+        assert(partition_search_type == REFERENCE_PARTITION);
+        x->sb_pickmode_part = 1;
         set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
-        // Use nonrd_pick_partition on scene-cut for VBR, or on qp-segment
-        // if cyclic_refresh is enabled.
+        // Use nonrd_pick_partition on scene-cut for VBR mode.
         // nonrd_pick_partition does not support 4x4 partition, so avoid it
         // on key frame for now.
         if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
-             cm->frame_type != KEY_FRAME) ||
-            (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-             xd->mi[0]->segment_id)) {
-          // Use lower max_partition_size for low resoultions.
+             cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+          // Use lower max_partition_size for low resolutions.
           if (cm->width <= 352 && cm->height <= 288)
             x->max_partition_size = BLOCK_32X32;
           else
@@ -3895,7 +5349,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
           // TODO(marpan): Seems like nonrd_select_partition does not support
           // 4x4 partition. Since 4x4 is used on key frame, use this switch
           // for now.
-          if (cm->frame_type == KEY_FRAME)
+          if (frame_is_intra_only(cm))
             nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
                                 BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
           else
@@ -3904,123 +5358,25 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
         }
 
         break;
-      default: assert(0); break;
     }
+
+    // Update ref_frame usage for inter frame if this group is ARF group.
+    if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame &&
+        !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group &&
+        cpi->sf.use_altref_onepass) {
+      int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+      if (cpi->count_arf_frame_usage != NULL)
+        cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage;
+      if (cpi->count_lastgolden_frame_usage != NULL)
+        cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage;
+    }
+
+    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
+                                    sb_col_in_tile, num_sb_cols);
   }
 }
 // end RTC play code
 
-static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const VP9_COMMON *const cm = &cpi->common;
-
-  const uint8_t *src = cpi->Source->y_buffer;
-  const uint8_t *last_src = cpi->Last_Source->y_buffer;
-  const int src_stride = cpi->Source->y_stride;
-  const int last_stride = cpi->Last_Source->y_stride;
-
-  // Pick cutoff threshold
-  const int cutoff = (VPXMIN(cm->width, cm->height) >= 720)
-                         ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100)
-                         : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
-  DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
-  diff *var16 = cpi->source_diff_var;
-
-  int sum = 0;
-  int i, j;
-
-  memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
-
-  for (i = 0; i < cm->mb_rows; i++) {
-    for (j = 0; j < cm->mb_cols; j++) {
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        switch (cm->bit_depth) {
-          case VPX_BITS_8:
-            vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
-                                     &var16->sse, &var16->sum);
-            break;
-          case VPX_BITS_10:
-            vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
-                                      &var16->sse, &var16->sum);
-            break;
-          case VPX_BITS_12:
-            vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
-                                      &var16->sse, &var16->sum);
-            break;
-          default:
-            assert(0 &&
-                   "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
-                   " or VPX_BITS_12");
-            return -1;
-        }
-      } else {
-        vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
-                        &var16->sum);
-      }
-#else
-      vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
-                      &var16->sum);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8);
-
-      if (var16->var >= VAR_HIST_MAX_BG_VAR)
-        hist[VAR_HIST_BINS - 1]++;
-      else
-        hist[var16->var / VAR_HIST_FACTOR]++;
-
-      src += 16;
-      last_src += 16;
-      var16++;
-    }
-
-    src = src - cm->mb_cols * 16 + 16 * src_stride;
-    last_src = last_src - cm->mb_cols * 16 + 16 * last_stride;
-  }
-
-  cpi->source_var_thresh = 0;
-
-  if (hist[VAR_HIST_BINS - 1] < cutoff) {
-    for (i = 0; i < VAR_HIST_BINS - 1; i++) {
-      sum += hist[i];
-
-      if (sum > cutoff) {
-        cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR;
-        return 0;
-      }
-    }
-  }
-
-  return sf->search_type_check_frequency;
-}
-
-static void source_var_based_partition_search_method(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  SPEED_FEATURES *const sf = &cpi->sf;
-
-  if (cm->frame_type == KEY_FRAME) {
-    // For key frame, use SEARCH_PARTITION.
-    sf->partition_search_type = SEARCH_PARTITION;
-  } else if (cm->intra_only) {
-    sf->partition_search_type = FIXED_PARTITION;
-  } else {
-    if (cm->last_width != cm->width || cm->last_height != cm->height) {
-      if (cpi->source_diff_var) vpx_free(cpi->source_diff_var);
-
-      CHECK_MEM_ERROR(cm, cpi->source_diff_var,
-                      vpx_calloc(cm->MBs, sizeof(diff)));
-    }
-
-    if (!cpi->frames_till_next_var_check)
-      cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
-
-    if (cpi->frames_till_next_var_check > 0) {
-      sf->partition_search_type = FIXED_PARTITION;
-      cpi->frames_till_next_var_check--;
-    }
-  }
-}
-
 static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
   unsigned int intra_count = 0, inter_count = 0;
   int j;
@@ -4040,12 +5396,20 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
   const int tile_rows = 1 << cm->log2_tile_rows;
   int tile_col, tile_row;
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+  TOKENLIST *tplist = cpi->tplist[0][0];
   int tile_tok = 0;
+  int tplist_count = 0;
 
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
-    if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
-    CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows *
-                                                   sizeof(*cpi->tile_data)));
+    if (cpi->tile_data != NULL) {
+      // Free the row mt memory in cpi->tile_data first.
+      vp9_row_mt_mem_dealloc(cpi);
+      vpx_free(cpi->tile_data);
+    }
+    cpi->allocated_tiles = 0;
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->tile_data,
+        vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
     cpi->allocated_tiles = tile_cols * tile_rows;
 
     for (tile_row = 0; tile_row < tile_rows; ++tile_row)
@@ -4053,55 +5417,88 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
         TileDataEnc *tile_data =
             &cpi->tile_data[tile_row * tile_cols + tile_col];
         int i, j;
+        const MV zero_mv = { 0, 0 };
         for (i = 0; i < BLOCK_SIZES; ++i) {
           for (j = 0; j < MAX_MODES; ++j) {
-            tile_data->thresh_freq_fact[i][j] = 32;
+            tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
+            tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
             tile_data->mode_map[i][j] = j;
           }
         }
+        tile_data->firstpass_top_mv = zero_mv;
+#if CONFIG_MULTITHREAD
+        tile_data->row_base_thresh_freq_fact = NULL;
+#endif
       }
   }
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo *tile_info =
-          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileInfo *tile_info = &this_tile->tile_info;
+      if (cpi->sf.adaptive_rd_thresh_row_mt) {
+        vp9_row_mt_alloc_rd_thresh(cpi, this_tile);
+      }
       vp9_tile_init(tile_info, cm, tile_row, tile_col);
 
       cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
       pre_tok = cpi->tile_tok[tile_row][tile_col];
       tile_tok = allocated_tokens(*tile_info);
+
+      cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+      tplist = cpi->tplist[tile_row][tile_col];
+      tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
     }
   }
 }
 
+void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row,
+                       int tile_col, int mi_row) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+  TOKENEXTRA *tok = NULL;
+  int tile_sb_row;
+  int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1;
+
+  tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >>
+                MI_BLOCK_SIZE_LOG2;
+  get_start_tok(cpi, tile_row, tile_col, mi_row, &tok);
+  cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok;
+
+#if CONFIG_REALTIME_ONLY
+  assert(cpi->sf.use_nonrd_pick_mode);
+  encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+#else
+  if (cpi->sf.use_nonrd_pick_mode)
+    encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+  else
+    encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+#endif
+
+  cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok;
+  cpi->tplist[tile_row][tile_col][tile_sb_row].count =
+      (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop -
+                     cpi->tplist[tile_row][tile_col][tile_sb_row].start);
+  assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <=
+         get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols));
+
+  (void)tile_mb_cols;
+}
+
 void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row,
                      int tile_col) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
-  TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   const int mi_row_start = tile_info->mi_row_start;
   const int mi_row_end = tile_info->mi_row_end;
   int mi_row;
 
-  // Set up pointers to per thread motion search counters.
-  this_tile->m_search_count = 0;   // Count of motion search hits.
-  this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
-  td->mb.m_search_count_ptr = &this_tile->m_search_count;
-  td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
-
-  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) {
-    if (cpi->sf.use_nonrd_pick_mode)
-      encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
-    else
-      encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
-  }
-  cpi->tok_count[tile_row][tile_col] =
-      (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
-  assert(tok - cpi->tile_tok[tile_row][tile_col] <=
-         allocated_tokens(*tile_info));
+  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE)
+    vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
 }
 
 static void encode_tiles(VP9_COMP *cpi) {
@@ -4117,19 +5514,106 @@ static void encode_tiles(VP9_COMP *cpi) {
       vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
 }
 
-#if CONFIG_FP_MB_STATS
-static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
-                            VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
-  uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start +
-                         cm->current_video_frame * cm->MBs * sizeof(uint8_t);
-
-  if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
-
-  *this_frame_mb_stats = mb_stats_in;
-
-  return 1;
+static int compare_kmeans_data(const void *a, const void *b) {
+  if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) {
+    return 1;
+  } else if (((const KMEANS_DATA *)a)->value <
+             ((const KMEANS_DATA *)b)->value) {
+    return -1;
+  } else {
+    return 0;
+  }
+}
+
+static void compute_boundary_ls(const double *ctr_ls, int k,
+                                double *boundary_ls) {
+  // boundary_ls[j] is the upper bound of data centered at ctr_ls[j]
+  int j;
+  for (j = 0; j < k - 1; ++j) {
+    boundary_ls[j] = (ctr_ls[j] + ctr_ls[j + 1]) / 2.;
+  }
+  boundary_ls[k - 1] = DBL_MAX;
+}
+
+int vp9_get_group_idx(double value, double *boundary_ls, int k) {
+  int group_idx = 0;
+  while (value >= boundary_ls[group_idx]) {
+    ++group_idx;
+    if (group_idx == k - 1) {
+      break;
+    }
+  }
+  return group_idx;
+}
+
+void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k,
+                KMEANS_DATA *arr, int size) {
+  int i, j;
+  int itr;
+  int group_idx;
+  double sum[MAX_KMEANS_GROUPS];
+  int count[MAX_KMEANS_GROUPS];
+
+  vpx_clear_system_state();
+
+  assert(k >= 2 && k <= MAX_KMEANS_GROUPS);
+
+  qsort(arr, size, sizeof(*arr), compare_kmeans_data);
+
+  // initialize the center points
+  for (j = 0; j < k; ++j) {
+    ctr_ls[j] = arr[(size * (2 * j + 1)) / (2 * k)].value;
+  }
+
+  for (itr = 0; itr < 10; ++itr) {
+    compute_boundary_ls(ctr_ls, k, boundary_ls);
+    for (i = 0; i < MAX_KMEANS_GROUPS; ++i) {
+      sum[i] = 0;
+      count[i] = 0;
+    }
+
+    // Both the data and centers are sorted in ascending order.
+    // As each data point is processed in order, its corresponding group index
+    // can only increase. So we only need to reset the group index to zero here.
+    group_idx = 0;
+    for (i = 0; i < size; ++i) {
+      while (arr[i].value >= boundary_ls[group_idx]) {
+        // place samples into clusters
+        ++group_idx;
+        if (group_idx == k - 1) {
+          break;
+        }
+      }
+      sum[group_idx] += arr[i].value;
+      ++count[group_idx];
+    }
+
+    for (group_idx = 0; group_idx < k; ++group_idx) {
+      if (count[group_idx] > 0)
+        ctr_ls[group_idx] = sum[group_idx] / count[group_idx];
+
+      sum[group_idx] = 0;
+      count[group_idx] = 0;
+    }
+  }
+
+  // compute group_idx, boundary_ls and count_ls
+  for (j = 0; j < k; ++j) {
+    count_ls[j] = 0;
+  }
+  compute_boundary_ls(ctr_ls, k, boundary_ls);
+  group_idx = 0;
+  for (i = 0; i < size; ++i) {
+    while (arr[i].value >= boundary_ls[group_idx]) {
+      ++group_idx;
+      if (group_idx == k - 1) {
+        break;
+      }
+    }
+    arr[i].group_idx = group_idx;
+    ++count_ls[group_idx];
+  }
 }
-#endif
 
 static void encode_frame_internal(VP9_COMP *cpi) {
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -4137,10 +5621,10 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   MACROBLOCK *const x = &td->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int gf_group_index = cpi->twopass.gf_group.index;
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
-
   vp9_zero(*td->counts);
   vp9_zero(cpi->td.rd_counts);
 
@@ -4149,17 +5633,19 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth)
-    x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+    x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
   else
-    x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
-  x->highbd_itxm_add =
+    x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+  x->highbd_inv_txfm_add =
       xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
 #else
-  x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+  x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
-
+  x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+  x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
   if (xd->lossless) x->optimize = 0;
+  x->sharpness = cpi->oxcf.sharpness;
+  x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
 
   cm->tx_mode = select_tx_mode(cpi, xd);
 
@@ -4198,30 +5684,67 @@ static void encode_frame_internal(VP9_COMP *cpi) {
         !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) &&
         !cpi->use_svc)
       cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+  } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE &&
+             cpi->sf.enable_tpl_model) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index];
+    TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
 
-    if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
-      source_var_based_partition_search_method(cpi);
+    int tpl_stride = tpl_frame->stride;
+    int64_t intra_cost_base = 0;
+    int64_t mc_dep_cost_base = 0;
+    int row, col;
+
+    for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
+      for (col = 0; col < cm->mi_cols; ++col) {
+        TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+        intra_cost_base += this_stats->intra_cost;
+        mc_dep_cost_base += this_stats->mc_dep_cost;
+      }
+    }
+
+    vpx_clear_system_state();
+
+    if (tpl_frame->is_valid)
+      cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
   }
 
+  for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+       ++ref_frame) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
+      if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE ||
+          cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE)
+        cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame);
+    }
+  }
+
+  // Frame segmentation
+  if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi);
+
   {
+#if CONFIG_INTERNAL_STATS
     struct vpx_usec_timer emr_timer;
     vpx_usec_timer_start(&emr_timer);
-
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
-                       &cpi->twopass.this_frame_mb_stats);
-    }
 #endif
 
-    // If allowed, encoding tiles in parallel with one thread handling one tile.
-    if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
-      vp9_encode_tiles_mt(cpi);
-    else
-      encode_tiles(cpi);
+    if (!cpi->row_mt) {
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+      // If allowed, encoding tiles in parallel with one thread handling one
+      // tile when row based multi-threading is disabled.
+      if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+        vp9_encode_tiles_mt(cpi);
+      else
+        encode_tiles(cpi);
+    } else {
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+      vp9_encode_tiles_row_mt(cpi);
+    }
 
+#if CONFIG_INTERNAL_STATS
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+#endif
   }
 
   sf->skip_encode_frame =
@@ -4256,7 +5779,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
 
   int mi_row, mi_col;
   int sum_delta = 0;
-  int map_index = 0;
   int qdelta_index;
   int segment_id;
 
@@ -4266,7 +5788,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
       segment_id = mi_8x8[0]->segment_id;
       qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
       sum_delta += qdelta_index;
-      map_index++;
     }
     mi_8x8_ptr += cm->mi_stride;
   }
@@ -4274,9 +5795,39 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
   return sum_delta / (cm->mi_rows * cm->mi_cols);
 }
 
+static void restore_encode_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int tile_idx;
+  int i, j;
+  TileDataEnc *tile_data;
+  RD_OPT *rd_opt = &cpi->rd;
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    for (j = 0; j < REFERENCE_MODES; j++)
+      rd_opt->prediction_type_threshes[i][j] =
+          rd_opt->prediction_type_threshes_prev[i][j];
+
+    for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++)
+      rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
+  }
+
+  for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+    assert(cpi->tile_data);
+    tile_data = &cpi->tile_data[tile_idx];
+    vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
+  }
+
+  cm->interp_filter = cpi->sf.default_interp_filter;
+}
+
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
+  restore_encode_params(cpi);
+
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_reset_frame(MAX_MB_PLANE);
+#endif
+
   // In the longer term the encoder should be generalized to match the
   // decoder such that we allow compound where one of the 3 buffers has a
   // different sign bias and that buffer is then the fixed ref. However, this
@@ -4284,16 +5835,11 @@ void vp9_encode_frame(VP9_COMP *cpi) {
   // side behavior is where the ALT ref buffer has opposite sign bias to
   // the other two.
   if (!frame_is_intra_only(cm)) {
-    if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
-         cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
-        (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
-         cm->ref_frame_sign_bias[LAST_FRAME])) {
-      cpi->allow_comp_inter_inter = 0;
-    } else {
+    if (vp9_compound_reference_allowed(cm)) {
       cpi->allow_comp_inter_inter = 1;
-      cm->comp_fixed_ref = ALTREF_FRAME;
-      cm->comp_var_ref[0] = LAST_FRAME;
-      cm->comp_var_ref[1] = GOLDEN_FRAME;
+      vp9_setup_compound_reference_mode(cm);
+    } else {
+      cpi->allow_comp_inter_inter = 0;
     }
   }
 
@@ -4330,7 +5876,13 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     if (cm->interp_filter == SWITCHABLE)
       cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, encode_frame_internal_time);
+#endif
     encode_frame_internal(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, encode_frame_internal_time);
+#endif
 
     for (i = 0; i < REFERENCE_MODES; ++i)
       mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
@@ -4391,8 +5943,31 @@ void vp9_encode_frame(VP9_COMP *cpi) {
       }
     }
   } else {
+    FRAME_COUNTS *counts = cpi->td.counts;
     cm->reference_mode = SINGLE_REFERENCE;
+    if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode &&
+        cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref &&
+        cm->frame_type != KEY_FRAME)
+      cm->reference_mode = REFERENCE_MODE_SELECT;
+
     encode_frame_internal(cpi);
+
+    if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+      int single_count_zero = 0;
+      int comp_count_zero = 0;
+      int i;
+      for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+        single_count_zero += counts->comp_inter[i][0];
+        comp_count_zero += counts->comp_inter[i][1];
+      }
+      if (comp_count_zero == 0) {
+        cm->reference_mode = SINGLE_REFERENCE;
+        vp9_zero(counts->comp_inter);
+      } else if (single_count_zero == 0) {
+        cm->reference_mode = COMPOUND_REFERENCE;
+        vp9_zero(counts->comp_inter);
+      }
+    }
   }
 
   // If segmented AQ is enabled compute the average AQ weighting.
@@ -4434,7 +6009,8 @@ static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi,
   for (y = 0; y < ymis; y++)
     for (x = 0; x < xmis; x++) {
       int map_offset = block_index + y * cm->mi_cols + x;
-      if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+      if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) &&
+          mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
         if (abs(mv.row) < 8 && abs(mv.col) < 8) {
           if (cpi->consec_zero_mv[map_offset] < 255)
             cpi->consec_zero_mv[map_offset]++;
@@ -4501,7 +6077,27 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
     vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col,
                                     VPXMAX(bsize, BLOCK_8X8));
 
-    vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8));
+#if CONFIG_MISMATCH_DEBUG
+    if (output_enabled) {
+      int plane;
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const struct macroblockd_plane *pd = &xd->plane[plane];
+        int pixel_c, pixel_r;
+        const BLOCK_SIZE plane_bsize =
+            get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]);
+        const int bw = get_block_width(plane_bsize);
+        const int bh = get_block_height(plane_bsize);
+        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                        pd->subsampling_x, pd->subsampling_y);
+
+        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c,
+                                  pixel_r, bw, bh,
+                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+      }
+    }
+#endif
+
+    vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8), mi_row, mi_col, output_enabled);
     vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip,
                     VPXMAX(bsize, BLOCK_8X8));
   }
@@ -4527,9 +6123,14 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
 
     ++td->counts->tx.tx_totals[mi->tx_size];
     ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
-    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->cyclic_refresh->content_mode)
       vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
-    if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0)
+    if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
+        (!cpi->use_svc ||
+         (cpi->use_svc &&
+          !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
       update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize);
   }
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h
index aa54947858..97fe52484a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
-#define VP9_ENCODER_VP9_ENCODEFRAME_H_
+#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_
+#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -22,13 +22,6 @@ struct yv12_buffer_config;
 struct VP9_COMP;
 struct ThreadData;
 
-// Constants used in SOURCE_VAR_BASED_PARTITION
-#define VAR_HIST_MAX_BG_VAR 1000
-#define VAR_HIST_FACTOR 10
-#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)
-#define VAR_HIST_LARGE_CUT_OFF 75
-#define VAR_HIST_SMALL_CUT_OFF 45
-
 void vp9_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src, int mi_row,
                           int mi_col);
@@ -39,10 +32,19 @@ void vp9_init_tile_data(struct VP9_COMP *cpi);
 void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row,
                      int tile_col);
 
-void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q);
+void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td,
+                       int tile_row, int tile_col, int mi_row);
+
+void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q,
+                                           int content_state);
+
+struct KMEANS_DATA;
+void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k,
+                struct KMEANS_DATA *arr, int size);
+int vp9_get_group_idx(double value, double *boundary_ls, int k);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
+#endif  // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c
index 2cb137d8b9..eded9f5c42 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c
@@ -16,12 +16,17 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
+#if CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif
+
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_scan.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
@@ -49,36 +54,14 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                      pd->dst.buf, pd->dst.stride);
 }
 
-typedef struct vp9_token_state {
-  int64_t error;
-  int rate;
-  int16_t next;
-  int16_t token;
-  tran_low_t qc;
-  tran_low_t dqc;
-  uint8_t best_index;
-} vp9_token_state;
-
 static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 10, 6 }, { 8, 5 },
+  { 10, 6 },
+  { 8, 5 },
 };
 
-#define UPDATE_RD_COST()                             \
-  {                                                  \
-    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
-    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
-  }
-
-// This function is a place holder for now but may ultimately need
-// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
-                                     int idx, int token, uint8_t *token_cache) {
-  int bak = token_cache[scan[idx]], pt;
-  token_cache[scan[idx]] = vp9_pt_energy_class[token];
-  pt = get_coef_context(nb, token_cache, idx + 1);
-  token_cache[scan[idx]] = bak;
-  return pt;
-}
+// 'num' can be negative, but 'shift' must be non-negative.
+#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
+  (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)))
 
 int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                    int ctx) {
@@ -86,221 +69,264 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ref = is_inter_block(xd->mi[0]);
-  vp9_token_state tokens[1025][2];
   uint8_t token_cache[1024];
-  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const int eob = p->eobs[block];
-  const PLANE_TYPE type = get_plane_type(plane);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
   const int default_eob = 16 << (tx_size << 1);
   const int shift = (tx_size == TX_32X32);
   const int16_t *const dequant_ptr = pd->dequant;
   const uint8_t *const band_translate = get_band_translate(tx_size);
-  const scan_order *const so = get_scan(xd, tx_size, type, block);
+  const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block);
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
-  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
-  int next = eob, sz = 0;
-  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1;
+  const MODE_INFO *mbmi = xd->mi[0];
+  const int sharpness = mb->sharpness;
+  const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type];
+  const int64_t rdmult =
+      (sharpness == 0 ? rdadj >> 1
+                      : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4);
+
   const int64_t rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1;
-  int64_t error0, error1;
+  int64_t rate0, rate1;
   int16_t t0, t1;
-  EXTRABIT e0;
-  unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      mb->token_costs[tx_size][type][ref];
-  int best, band, pt, i, final_eob;
+  int i, final_eob;
+  int count_high_values_after_eob = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
 #else
-  const int *cat6_high_cost = vp9_get_high_cost_table(8);
+  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
 #endif
+  unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      mb->token_costs[tx_size][plane_type][ref];
+  unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS];
+  int64_t eob_cost0, eob_cost1;
+  const int ctx0 = ctx;
+  int64_t accu_rate = 0;
+  // Initialized to the worst possible error for the largest transform size.
+  // This ensures that it never goes negative.
+  int64_t accu_error = ((int64_t)1) << 50;
+  int64_t best_block_rd_cost = INT64_MAX;
+  int x_prev = 1;
+  tran_low_t before_best_eob_qc = 0;
+  tran_low_t before_best_eob_dqc = 0;
 
-  assert((!type && !plane) || (type && plane));
+  assert((!plane_type && !plane) || (plane_type && plane));
   assert(eob <= default_eob);
 
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = default_eob;
-  tokens[eob][0].token = EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  tokens[eob][1] = tokens[eob][0];
-
-  for (i = 0; i < eob; i++)
-    token_cache[scan[i]] = vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
-
-  for (i = eob; i-- > 0;) {
-    int base_bits, d2, dx;
+  for (i = 0; i < eob; i++) {
     const int rc = scan[i];
-    int x = qcoeff[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      vp9_get_token_extra(x, &t0, &e0);
-      /* Consider both possible successor states. */
-      if (next < default_eob) {
-        band = band_translate[i + 1];
-        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-        rate0 += token_costs[band][0][pt][tokens[next][0].token];
-        rate1 += token_costs[band][0][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
-      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx >>= xd->bd - 8;
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      tokens[i][0].dqc = dqcoeff[rc];
-      tokens[i][0].best_index = best;
+    token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];
+  }
+  final_eob = 0;
 
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
+  // Initial RD cost.
+  token_costs_cur = token_costs + band_translate[0];
+  rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN];
+  best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
 
-      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
-          (abs(x) * dequant_ptr[rc != 0] <
-           (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      } else {
-        tokens[i][1] = tokens[i][0];
-        next = i;
-        continue;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        e0 = 0;
-      } else {
-        vp9_get_token_extra(x, &t0, &e0);
-        t1 = t0;
-      }
-      if (next < default_eob) {
-        band = band_translate[i + 1];
-        if (t0 != EOB_TOKEN) {
-          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-          rate0 += token_costs[band][!x][pt][tokens[next][0].token];
-        }
-        if (t1 != EOB_TOKEN) {
-          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
-          rate1 += token_costs[band][!x][pt][tokens[next][1].token];
-        }
-      }
-
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
-      } else {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-      }
-#else
-      dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      d2 = dx * dx;
-
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-
-      if (x) {
-        tran_low_t offset = dq_step[rc != 0];
-        // The 32x32 transform coefficient uses half quantization step size.
-        // Account for the rounding difference in the dequantized coefficeint
-        // value when the quantization index is dropped from an even number
-        // to an odd number.
-        if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01);
-
-        if (sz == 0)
-          tokens[i][1].dqc = dqcoeff[rc] - offset;
-        else
-          tokens[i][1].dqc = dqcoeff[rc] + offset;
-      } else {
-        tokens[i][1].dqc = 0;
-      }
-
-      tokens[i][1].best_index = best;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
+  // For each token, pick one of two choices greedily:
+  // (i) First candidate: Keep current quantized value, OR
+  // (ii) Second candidate: Reduce quantized value by 1.
+  for (i = 0; i < eob; i++) {
+    const int rc = scan[i];
+    const int x = qcoeff[rc];
+    const int band_cur = band_translate[i];
+    const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+    const int token_tree_sel_cur = (x_prev == 0);
+    token_costs_cur = token_costs + band_cur;
+    if (x == 0) {  // No need to search
+      const int token = vp9_get_token(x);
+      rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token];
+      accu_rate += rate0;
+      x_prev = 0;
+      // Note: accu_error does not change.
     } else {
-      /* There's no choice to make for a zero coefficient, so we don't
-       *  add a new trellis node, but we do need to update the costs.
-       */
-      band = band_translate[i + 1];
-      pt = get_coef_context(nb, token_cache, i + 1);
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != EOB_TOKEN) {
-        tokens[next][0].rate += token_costs[band][1][pt][t0];
-        tokens[next][0].token = ZERO_TOKEN;
+      const int dqv = dequant_ptr[rc != 0];
+      // Compute the distortion for quantizing to 0.
+      const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift);
+      const int diff_for_zero =
+#if CONFIG_VP9_HIGHBITDEPTH
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+              ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8)
+              :
+#endif
+              diff_for_zero_raw;
+      const int64_t distortion_for_zero =
+          (int64_t)diff_for_zero * diff_for_zero;
+
+      // Compute the distortion for the first candidate
+      const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+      const int diff0 =
+#if CONFIG_VP9_HIGHBITDEPTH
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+              ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8)
+              :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+              diff0_raw;
+      const int64_t distortion0 = (int64_t)diff0 * diff0;
+
+      // Compute the distortion for the second candidate
+      const int sign = -(x < 0);        // -1 if x is negative and 0 otherwise.
+      const int x1 = x - 2 * sign - 1;  // abs(x1) = abs(x) - 1.
+      int64_t distortion1;
+      if (x1 != 0) {
+        const int dqv_step =
+#if CONFIG_VP9_HIGHBITDEPTH
+            (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8)
+                                                          :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                                                          dqv;
+        const int diff_step = (dqv_step + sign) ^ sign;
+        const int diff1 = diff0 - diff_step;
+        assert(dqv > 0);  // We aren't right shifting a negative number above.
+        distortion1 = (int64_t)diff1 * diff1;
+      } else {
+        distortion1 = distortion_for_zero;
       }
-      if (t1 != EOB_TOKEN) {
-        tokens[next][1].rate += token_costs[band][1][pt][t1];
-        tokens[next][1].token = ZERO_TOKEN;
+      {
+        // Calculate RDCost for current coeff for the two candidates.
+        const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost);
+        const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost);
+        rate0 =
+            base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0];
+        rate1 =
+            base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1];
+      }
+      {
+        int rdcost_better_for_x1, eob_rdcost_better_for_x1;
+        int dqc0, dqc1;
+        int64_t best_eob_cost_cur;
+        int use_x1;
+
+        // Calculate RD Cost effect on the next coeff for the two candidates.
+        int64_t next_bits0 = 0;
+        int64_t next_bits1 = 0;
+        int64_t next_eob_bits0 = 0;
+        int64_t next_eob_bits1 = 0;
+        if (i < default_eob - 1) {
+          int ctx_next, token_tree_sel_next;
+          const int band_next = band_translate[i + 1];
+          const int token_next =
+              (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
+          unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS]
+                                               [ENTROPY_TOKENS] =
+                                                   token_costs + band_next;
+          token_cache[rc] = vp9_pt_energy_class[t0];
+          ctx_next = get_coef_context(nb, token_cache, i + 1);
+          token_tree_sel_next = (x == 0);
+          next_bits0 =
+              (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
+          next_eob_bits0 =
+              (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+          token_cache[rc] = vp9_pt_energy_class[t1];
+          ctx_next = get_coef_context(nb, token_cache, i + 1);
+          token_tree_sel_next = (x1 == 0);
+          next_bits1 =
+              (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
+          if (x1 != 0) {
+            next_eob_bits1 =
+                (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+          }
+        }
+
+        // Compare the total RD costs for two candidates.
+        rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0);
+        rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1);
+        rdcost_better_for_x1 = (rd_cost1 < rd_cost0);
+        eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+                           (accu_error + distortion0 - distortion_for_zero));
+        eob_cost1 = eob_cost0;
+        if (x1 != 0) {
+          eob_cost1 =
+              RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+                     (accu_error + distortion1 - distortion_for_zero));
+          eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0);
+        } else {
+          eob_rdcost_better_for_x1 = 0;
+        }
+
+        // Calculate the two candidate de-quantized values.
+        dqc0 = dqcoeff[rc];
+        dqc1 = 0;
+        if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) {
+          if (x1 != 0) {
+            dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift);
+          } else {
+            dqc1 = 0;
+          }
+        }
+
+        // Pick and record the better quantized and de-quantized values.
+        if (rdcost_better_for_x1) {
+          qcoeff[rc] = x1;
+          dqcoeff[rc] = dqc1;
+          accu_rate += rate1;
+          accu_error += distortion1 - distortion_for_zero;
+          assert(distortion1 <= distortion_for_zero);
+          token_cache[rc] = vp9_pt_energy_class[t1];
+        } else {
+          accu_rate += rate0;
+          accu_error += distortion0 - distortion_for_zero;
+          assert(distortion0 <= distortion_for_zero);
+          token_cache[rc] = vp9_pt_energy_class[t0];
+        }
+        if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++;
+        assert(accu_error >= 0);
+        x_prev = qcoeff[rc];  // Update based on selected quantized value.
+
+        use_x1 = (x1 != 0) && eob_rdcost_better_for_x1;
+        best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0;
+
+        // Determine whether to move the eob position to i+1
+        if (best_eob_cost_cur < best_block_rd_cost) {
+          best_block_rd_cost = best_eob_cost_cur;
+          final_eob = i + 1;
+          count_high_values_after_eob = 0;
+          if (use_x1) {
+            before_best_eob_qc = x1;
+            before_best_eob_dqc = dqc1;
+          } else {
+            before_best_eob_qc = x;
+            before_best_eob_dqc = dqc0;
+          }
+        }
       }
-      tokens[i][0].best_index = tokens[i][1].best_index = 0;
-      /* Don't update next, because we didn't add a new node. */
     }
   }
-
-  /* Now pick the best path through the whole trellis. */
-  band = band_translate[i + 1];
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += token_costs[band][0][ctx][t0];
-  rate1 += token_costs[band][0][ctx][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = -1;
-
-  for (i = next; i < eob; i = next) {
-    const int x = tokens[i][best].qc;
-    const int rc = scan[i];
-    if (x) final_eob = i;
-    qcoeff[rc] = x;
-    dqcoeff[rc] = tokens[i][best].dqc;
-    next = tokens[i][best].next;
-    best = tokens[i][best].best_index;
+  if (count_high_values_after_eob > 0) {
+    final_eob = eob - 1;
+    for (; final_eob >= 0; final_eob--) {
+      const int rc = scan[final_eob];
+      const int x = qcoeff[rc];
+      if (x) {
+        break;
+      }
+    }
+    final_eob++;
+  } else {
+    assert(final_eob <= eob);
+    if (final_eob > 0) {
+      int rc;
+      assert(before_best_eob_qc != 0);
+      i = final_eob - 1;
+      rc = scan[i];
+      qcoeff[rc] = before_best_eob_qc;
+      dqcoeff[rc] = before_best_eob_dqc;
+    }
+    for (i = final_eob; i < eob; i++) {
+      int rc = scan[i];
+      qcoeff[rc] = 0;
+      dqcoeff[rc] = 0;
+    }
   }
-  final_eob++;
-
   mb->plane[plane].eobs[block] = final_eob;
   return final_eob;
 }
+#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
 
 static INLINE void fdct32x32(int rd_transform, const int16_t *src,
                              tran_low_t *dst, int src_stride) {
@@ -325,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -333,39 +359,33 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
   src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+  // skip block condition should be handled before this is called.
+  assert(!x->skip_block);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                     p->round_fp, p->quant_fp, p->quant_shift,
-                                     qcoeff, dqcoeff, pd->dequant, eob,
-                                     scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff,
+                                     pd->dequant, eob, scan_order);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob, scan_order->scan,
-                               scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                               scan_order);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob, scan_order->scan,
-                               scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                               scan_order);
         break;
-      case TX_4X4:
-        x->fwd_txm4x4(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob, scan_order->scan,
-                               scan_order->iscan);
+      default:
+        assert(tx_size == TX_4X4);
+        x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+        vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                               scan_order);
         break;
-      default: assert(0);
     }
     return;
   }
@@ -374,30 +394,26 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+      vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                      eob, scan_order->scan, scan_order->iscan);
+      vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                      scan_order);
       break;
     case TX_8X8:
-      vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block,
-                        p->zbin, p->round_fp, p->quant_fp, p->quant_shift,
-                        qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                        scan_order->iscan);
+      vpx_fdct8x8(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                      scan_order);
+
       break;
-    case TX_4X4:
-      x->fwd_txm4x4(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                      eob, scan_order->scan, scan_order->iscan);
+    default:
+      assert(tx_size == TX_4X4);
+      x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+      vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                      scan_order);
       break;
-    default: assert(0); break;
   }
 }
 
@@ -413,34 +429,33 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
   src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+  // skip block condition should be handled before this is called.
+  assert(!x->skip_block);
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     switch (tx_size) {
       case TX_32X32:
         vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                                     p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
+        vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff,
+                                     dqcoeff, pd->dequant[0], eob);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
-      case TX_4X4:
-        x->fwd_txm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+      default:
+        assert(tx_size == TX_4X4);
+        x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
-      default: assert(0);
     }
     return;
   }
@@ -449,25 +464,25 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       vpx_fdct32x32_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0],
-                            qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                            pd->dequant[0], eob);
       break;
     case TX_16X16:
       vpx_fdct16x16_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
     case TX_8X8:
       vpx_fdct8x8_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
-    case TX_4X4:
-      x->fwd_txm4x4(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+    default:
+      assert(tx_size == TX_4X4);
+      x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+      vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
-    default: assert(0); break;
   }
 }
 
@@ -476,7 +491,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -484,39 +499,33 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
   src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+  // skip block condition should be handled before this is called.
+  assert(!x->skip_block);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift, qcoeff,
-                                    dqcoeff, pd->dequant, eob, scan_order->scan,
-                                    scan_order->iscan);
+        vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                    scan_order);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order);
         break;
-      case TX_4X4:
-        x->fwd_txm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+      default:
+        assert(tx_size == TX_4X4);
+        x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+        vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order);
         break;
-      default: assert(0);
     }
     return;
   }
@@ -525,36 +534,36 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, eob, scan_order->scan,
-                           scan_order->iscan);
+      vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                           scan_order);
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                     scan_order);
       break;
     case TX_8X8:
       vpx_fdct8x8(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                     scan_order);
       break;
-    case TX_4X4:
-      x->fwd_txm4x4(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+    default:
+      assert(tx_size == TX_4X4);
+      x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+      vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                     scan_order);
       break;
-    default: assert(0); break;
   }
 }
 
 static void encode_block(int plane, int block, int row, int col,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
+#if CONFIG_MISMATCH_DEBUG
+  int mi_row = args->mi_row;
+  int mi_col = args->mi_col;
+  int output_enabled = args->output_enabled;
+#endif
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -571,7 +580,11 @@ static void encode_block(int plane, int block, int row, int col,
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
     p->eobs[block] = 0;
     *a = *l = 0;
+#if CONFIG_MISMATCH_DEBUG
+    goto encode_block_end;
+#else
     return;
+#endif
   }
 
   if (!x->skip_recode) {
@@ -581,7 +594,11 @@ static void encode_block(int plane, int block, int row, int col,
         // skip forward transform
         p->eobs[block] = 0;
         *a = *l = 0;
+#if CONFIG_MISMATCH_DEBUG
+        goto encode_block_end;
+#else
         return;
+#endif
       } else {
         vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size);
       }
@@ -598,7 +615,11 @@ static void encode_block(int plane, int block, int row, int col,
           // skip forward transform
           p->eobs[block] = 0;
           *a = *l = 0;
+#if CONFIG_MISMATCH_DEBUG
+          goto encode_block_end;
+#else
           return;
+#endif
         }
       } else {
         vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
@@ -615,32 +636,43 @@ static void encode_block(int plane, int block, int row, int col,
 
   if (p->eobs[block]) *(args->skip) = 0;
 
-  if (x->skip_encode || p->eobs[block] == 0) return;
+  if (x->skip_encode || p->eobs[block] == 0) {
+#if CONFIG_MISMATCH_DEBUG
+    goto encode_block_end;
+#else
+    return;
+#endif
+  }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
     switch (tx_size) {
       case TX_32X32:
-        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                  xd->bd);
         break;
       case TX_16X16:
-        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                  xd->bd);
         break;
       case TX_8X8:
-        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                xd->bd);
         break;
-      case TX_4X4:
+      default:
+        assert(tx_size == TX_4X4);
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
         // case.
-        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                           xd->bd);
+        x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
+                               xd->bd);
         break;
-      default: assert(0 && "Invalid transform size");
     }
+#if CONFIG_MISMATCH_DEBUG
+    goto encode_block_end;
+#else
     return;
+#endif
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -654,14 +686,27 @@ static void encode_block(int plane, int block, int row, int col,
     case TX_8X8:
       vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
-    case TX_4X4:
+    default:
+      assert(tx_size == TX_4X4);
       // this is like vp9_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
       // case.
-      x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+      x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
       break;
-    default: assert(0 && "Invalid transform size"); break;
   }
+#if CONFIG_MISMATCH_DEBUG
+encode_block_end:
+  if (output_enabled) {
+    int pixel_c, pixel_r;
+    int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2);
+    int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2);
+    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row,
+                    pd->subsampling_x, pd->subsampling_y);
+    mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r,
+                             blk_w, blk_h,
+                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#endif
 }
 
 static void encode_block_pass1(int plane, int block, int row, int col,
@@ -680,11 +725,12 @@ static void encode_block_pass1(int plane, int block, int row, int col,
   if (p->eobs[block] > 0) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+      x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
+                             p->eobs[block], xd->bd);
       return;
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+    x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
   }
 }
 
@@ -694,12 +740,34 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
                                          encode_block_pass1, x);
 }
 
-void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+                   int output_enabled) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MODE_INFO *mi = xd->mi[0];
-  struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip };
   int plane;
+#if CONFIG_MISMATCH_DEBUG
+  struct encode_b_args arg = { x,
+                               1,     // enable_trellis_opt
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
+                               NULL,  // above entropy context
+                               NULL,  // left entropy context
+                               &mi->skip, mi_row, mi_col, output_enabled };
+#else
+  struct encode_b_args arg = { x,
+                               1,     // enable_trellis_opt
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
+                               NULL,  // above entropy context
+                               NULL,  // left entropy context
+                               &mi->skip };
+  (void)mi_row;
+  (void)mi_col;
+  (void)output_enabled;
+#endif
 
   mi->skip = 1;
 
@@ -713,9 +781,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
       const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
       vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane],
                                ctx.tl[plane]);
-      arg.enable_coeff_opt = 1;
+      arg.enable_trellis_opt = 1;
     } else {
-      arg.enable_coeff_opt = 0;
+      arg.enable_trellis_opt = 0;
     }
     arg.ta = ctx.ta[plane];
     arg.tl = ctx.tl[plane];
@@ -737,7 +805,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const scan_order *scan_order;
+  const ScanOrder *scan_order;
   TX_TYPE tx_type = DCT_DCT;
   PREDICTION_MODE mode;
   const int bwl = b_width_log2_lookup[plane_bsize];
@@ -747,17 +815,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
+  int enable_trellis_opt = !x->skip_recode;
   ENTROPY_CONTEXT *a = NULL;
   ENTROPY_CONTEXT *l = NULL;
   int entropy_ctx = 0;
   dst = &pd->dst.buf[4 * (row * dst_stride + col)];
   src = &p->src.buf[4 * (row * src_stride + col)];
   src_diff = &p->src_diff[4 * (row * diff_stride + col)];
-  if (args->enable_coeff_opt) {
-    a = &args->ta[col];
-    l = &args->tl[row];
-    entropy_ctx = combine_entropy_contexts(*a, *l);
-  }
 
   if (tx_size == TX_4X4) {
     tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
@@ -773,89 +837,115 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
     }
   }
 
-  vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
-                          x->skip_encode ? src_stride : dst_stride, dst,
-                          dst_stride, col, row, plane);
+  vp9_predict_intra_block(
+      xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst,
+      (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
+      dst_stride, col, row, plane);
+
+  // skip block condition should be handled before this is called.
+  assert(!x->skip_block);
+
+  if (!x->skip_recode) {
+    const int tx_size_in_pixels = (1 << tx_size) << 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+                                diff_stride, src, src_stride, dst, dst_stride,
+                                xd->bd);
+    } else {
+      vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+                         diff_stride, src, src_stride, dst, dst_stride);
+    }
+#else
+    vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+                       diff_stride, src, src_stride, dst, dst_stride);
+#endif
+    enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col,
+                                        plane_bsize, tx_size, args);
+  }
+
+  if (enable_trellis_opt) {
+    a = &args->ta[col];
+    l = &args->tl[row];
+    entropy_ctx = combine_entropy_contexts(*a, *l);
+  }
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
     switch (tx_size) {
       case TX_32X32:
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                      p->round, p->quant, p->quant_shift,
-                                      qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
+          vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant,
+                                      eob, scan_order);
+        }
+        if (enable_trellis_opt) {
+          *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
-          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
         }
         break;
       case TX_16X16:
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           if (tx_type == DCT_DCT)
             vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant,
+                                eob, scan_order);
+        }
+        if (enable_trellis_opt) {
+          *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
                                   xd->bd);
         }
         break;
       case TX_8X8:
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           if (tx_type == DCT_DCT)
             vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                scan_order);
+        }
+        if (enable_trellis_opt) {
+          *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
                                 xd->bd);
         }
         break;
-      case TX_4X4:
+      default:
+        assert(tx_size == TX_4X4);
         if (!x->skip_recode) {
-          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src,
-                                    src_stride, dst, dst_stride, xd->bd);
           if (tx_type != DCT_DCT)
             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
           else
-            x->fwd_txm4x4(src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+            x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+          vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                                scan_order);
+        }
+        if (enable_trellis_opt) {
+          *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
-
         if (!x->skip_encode && *eob) {
           if (tx_type == DCT_DCT) {
             // this is like vp9_short_idct4x4 but has a special case around
             // eob<=1 which is significant (not just an optimization) for the
             // lossless case.
-            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+            x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
           } else {
-            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+            vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,
+                                     xd->bd);
           }
         }
         break;
-      default: assert(0); return;
     }
     if (*eob) *(args->skip) = 0;
     return;
@@ -865,15 +955,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
   switch (tx_size) {
     case TX_32X32:
       if (!x->skip_recode) {
-        vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, eob, scan_order->scan,
-                             scan_order->iscan);
+        vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+                             scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob)
@@ -881,14 +967,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
       break;
     case TX_16X16:
       if (!x->skip_recode) {
-        vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                       scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob)
@@ -896,32 +979,27 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
       break;
     case TX_8X8:
       if (!x->skip_recode) {
-        vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                       scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob)
         vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
-    case TX_4X4:
+    default:
+      assert(tx_size == TX_4X4);
       if (!x->skip_recode) {
-        vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst,
-                           dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
-          x->fwd_txm4x4(src_diff, coeff, diff_stride);
-        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+          x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+        vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                       scan_order);
       }
-      if (args->enable_coeff_opt && !x->skip_recode) {
+      if (enable_trellis_opt) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
       }
       if (!x->skip_encode && *eob) {
@@ -929,31 +1007,53 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
           // which is significant (not just an optimization) for the lossless
           // case.
-          x->itxm_add(dqcoeff, dst, dst_stride, *eob);
+          x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob);
         else
           vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
       }
       break;
-    default: assert(0); break;
   }
   if (*eob) *(args->skip) = 0;
 }
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
-                                  int enable_optimize_b) {
+                                  int enable_trellis_opt) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane],
-                               ctx.tl[plane], &xd->mi[0]->skip };
+#if CONFIG_MISMATCH_DEBUG
+  // TODO(angiebird): make mismatch_debug support intra mode
+  struct encode_b_args arg = {
+    x,
+    enable_trellis_opt,
+    0.0,   // trellis_opt_thresh
+    NULL,  // &sse_calc_done
+    NULL,  // &sse
+    ctx.ta[plane],
+    ctx.tl[plane],
+    &xd->mi[0]->skip,
+    0,  // mi_row
+    0,  // mi_col
+    0   // output_enabled
+  };
+#else
+  struct encode_b_args arg = { x,
+                               enable_trellis_opt,
+                               0.0,   // trellis_opt_thresh
+                               NULL,  // &sse_calc_done
+                               NULL,  // &sse
+                               ctx.ta[plane],
+                               ctx.tl[plane],
+                               &xd->mi[0]->skip };
+#endif
 
-  if (enable_optimize_b && x->optimize &&
+  if (enable_trellis_opt && x->optimize &&
       (!x->skip_recode || !x->skip_optimize)) {
     const struct macroblockd_plane *const pd = &xd->plane[plane];
     const TX_SIZE tx_size =
         plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size;
     vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
   } else {
-    arg.enable_coeff_opt = 0;
+    arg.enable_trellis_opt = 0;
   }
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h
index cf943bedfd..1391446bed 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_ENCODEMB_H_
-#define VP9_ENCODER_VP9_ENCODEMB_H_
+#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_
+#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
@@ -20,14 +20,23 @@ extern "C" {
 
 struct encode_b_args {
   MACROBLOCK *x;
-  int enable_coeff_opt;
+  int enable_trellis_opt;
+  double trellis_opt_thresh;
+  int *sse_calc_done;
+  int64_t *sse;
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
   int8_t *skip;
+#if CONFIG_MISMATCH_DEBUG
+  int mi_row;
+  int mi_col;
+  int output_enabled;
+#endif
 };
 int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                    int ctx);
-void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+                   int output_enabled);
 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
@@ -42,10 +51,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
 
 void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
-                                  int enable_optimize_b);
+                                  int enable_trellis_opt);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_ENCODEMB_H_
+#endif  // VPX_VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h
index 9fc7ab8dc4..2f1be4b233 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_ENCODEMV_H_
-#define VP9_ENCODER_VP9_ENCODEMV_H_
+#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_
+#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_
 
 #include "vp9/encoder/vp9_encoder.h"
 
@@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
                    unsigned int *const max_mv_magnitude);
 
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
-                              const nmv_context *mvctx, int usehp);
+                              const nmv_context *ctx, int usehp);
 
 void vp9_update_mv_count(ThreadData *td);
 
@@ -35,4 +35,4 @@ void vp9_update_mv_count(ThreadData *td);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_ENCODEMV_H_
+#endif  // VPX_VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
index 432eac8da0..3f42655527 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -8,25 +8,38 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <math.h>
-#include <stdio.h>
 #include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
+#include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_util/vpx_pthread.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 
 #include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_idct.h"
 #if CONFIG_VP9_POSTPROC
@@ -34,24 +47,38 @@
 #endif
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_alt_ref_aq.h"
 #include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_complexity.h"
+#endif
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_aq_variance.h"
+#endif
 #include "vp9/encoder/vp9_bitstream.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_blockiness.h"
+#endif
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_encoder.h"
-#include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/encoder/vp9_mcomp.h"
+#endif
+#include "vp9/encoder/vp9_multi_thread.h"
 #include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_resize.h"
@@ -60,27 +87,34 @@
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_tpl_model.h"
+#include "vp9/vp9_cx_iface.h"
 
 #define AM_SEGMENT_ID_INACTIVE 7
 #define AM_SEGMENT_ID_ACTIVE 0
 
-#define ALTREF_HIGH_PRECISION_MV 1     // Whether to use high precision mv
-                                       //  for altref computation.
-#define HIGH_PRECISION_MV_QTHRESH 200  // Q threshold for high precision
-                                       // mv. Choose a very high value for
-                                       // now so that HIGH_PRECISION is always
-                                       // chosen.
-// #define OUTPUT_YUV_REC
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
+
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200
+
+#define FRAME_SIZE_FACTOR 128  // empirical params for context model threshold
+#define FRAME_RATE_FACTOR 8
 
 #ifdef OUTPUT_YUV_DENOISED
 FILE *yuv_denoised_file = NULL;
 #endif
 #ifdef OUTPUT_YUV_SKINMAP
-FILE *yuv_skinmap_file = NULL;
+static FILE *yuv_skinmap_file = NULL;
 #endif
 #ifdef OUTPUT_YUV_REC
 FILE *yuv_rec_file;
 #endif
+#ifdef OUTPUT_YUV_SVC_SRC
+FILE *yuv_svc_src[3] = { NULL, NULL, NULL };
+#endif
 
 #if 0
 FILE *framepsnr;
@@ -99,9 +133,336 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) {
 }
 #endif
 
+#if !CONFIG_REALTIME_ONLY
+// compute adaptive threshold for skip recoding
+static int compute_context_model_thresh(const VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frame_size = (cm->width * cm->height) >> 10;
+  const int bitrate = (int)(oxcf->target_bandwidth >> 10);
+  const int qindex_factor = cm->base_qindex + (MAXQ >> 1);
+
+  // This equation makes the threshold adaptive to frame size.
+  // Coding gain obtained by recoding comes from alternate frames of large
+  // content change. We skip recoding if the difference of previous and current
+  // frame context probability model is less than a certain threshold.
+  // The first component is the most critical part to guarantee adaptivity.
+  // Other parameters are estimated based on normal setting of hd resolution
+  // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50
+  const int thresh =
+      ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) *
+       qindex_factor) >>
+      9;
+
+  return thresh;
+}
+
+// compute the total cost difference between current
+// and previous frame context prob model.
+static int compute_context_model_diff(const VP9_COMMON *const cm) {
+  const FRAME_CONTEXT *const pre_fc =
+      &cm->frame_contexts[cm->frame_context_idx];
+  const FRAME_CONTEXT *const cur_fc = cm->fc;
+  const FRAME_COUNTS *counts = &cm->counts;
+  vpx_prob pre_last_prob, cur_last_prob;
+  int diff = 0;
+  int i, j, k, l, m, n;
+
+  // y_mode_prob
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+    for (j = 0; j < INTRA_MODES - 1; ++j) {
+      diff += (int)counts->y_mode[i][j] *
+              (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2];
+    cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2];
+
+    diff += (int)counts->y_mode[i][INTRA_MODES - 1] *
+            (pre_last_prob - cur_last_prob);
+  }
+
+  // uv_mode_prob
+  for (i = 0; i < INTRA_MODES; ++i) {
+    for (j = 0; j < INTRA_MODES - 1; ++j) {
+      diff += (int)counts->uv_mode[i][j] *
+              (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2];
+    cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2];
+
+    diff += (int)counts->uv_mode[i][INTRA_MODES - 1] *
+            (pre_last_prob - cur_last_prob);
+  }
+
+  // partition_prob
+  for (i = 0; i < PARTITION_CONTEXTS; ++i) {
+    for (j = 0; j < PARTITION_TYPES - 1; ++j) {
+      diff += (int)counts->partition[i][j] *
+              (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2];
+    cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2];
+
+    diff += (int)counts->partition[i][PARTITION_TYPES - 1] *
+            (pre_last_prob - cur_last_prob);
+  }
+
+  // coef_probs
+  for (i = 0; i < TX_SIZES; ++i) {
+    for (j = 0; j < PLANE_TYPES; ++j) {
+      for (k = 0; k < REF_TYPES; ++k) {
+        for (l = 0; l < COEF_BANDS; ++l) {
+          for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) {
+            for (n = 0; n < UNCONSTRAINED_NODES; ++n) {
+              diff += (int)counts->coef[i][j][k][l][m][n] *
+                      (pre_fc->coef_probs[i][j][k][l][m][n] -
+                       cur_fc->coef_probs[i][j][k][l][m][n]);
+            }
+
+            pre_last_prob =
+                MAX_PROB -
+                pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1];
+            cur_last_prob =
+                MAX_PROB -
+                cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1];
+
+            diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] *
+                    (pre_last_prob - cur_last_prob);
+          }
+        }
+      }
+    }
+  }
+
+  // switchable_interp_prob
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
+    for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) {
+      diff += (int)counts->switchable_interp[i][j] *
+              (pre_fc->switchable_interp_prob[i][j] -
+               cur_fc->switchable_interp_prob[i][j]);
+    }
+    pre_last_prob =
+        MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2];
+    cur_last_prob =
+        MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2];
+
+    diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] *
+            (pre_last_prob - cur_last_prob);
+  }
+
+  // inter_mode_probs
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+    for (j = 0; j < INTER_MODES - 1; ++j) {
+      diff += (int)counts->inter_mode[i][j] *
+              (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2];
+    cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2];
+
+    diff += (int)counts->inter_mode[i][INTER_MODES - 1] *
+            (pre_last_prob - cur_last_prob);
+  }
+
+  // intra_inter_prob
+  for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+    diff += (int)counts->intra_inter[i][0] *
+            (pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]);
+
+    pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i];
+    cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i];
+
+    diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob);
+  }
+
+  // comp_inter_prob
+  for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+    diff += (int)counts->comp_inter[i][0] *
+            (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]);
+
+    pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i];
+    cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i];
+
+    diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob);
+  }
+
+  // single_ref_prob
+  for (i = 0; i < REF_CONTEXTS; ++i) {
+    for (j = 0; j < 2; ++j) {
+      diff += (int)counts->single_ref[i][j][0] *
+              (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]);
+
+      pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j];
+      cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j];
+
+      diff +=
+          (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob);
+    }
+  }
+
+  // comp_ref_prob
+  for (i = 0; i < REF_CONTEXTS; ++i) {
+    diff += (int)counts->comp_ref[i][0] *
+            (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]);
+
+    pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i];
+    cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i];
+
+    diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob);
+  }
+
+  // tx_probs
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+    // p32x32
+    for (j = 0; j < TX_SIZES - 1; ++j) {
+      diff += (int)counts->tx.p32x32[i][j] *
+              (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2];
+    cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2];
+
+    diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] *
+            (pre_last_prob - cur_last_prob);
+
+    // p16x16
+    for (j = 0; j < TX_SIZES - 2; ++j) {
+      diff += (int)counts->tx.p16x16[i][j] *
+              (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3];
+    cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3];
+
+    diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] *
+            (pre_last_prob - cur_last_prob);
+
+    // p8x8
+    for (j = 0; j < TX_SIZES - 3; ++j) {
+      diff += (int)counts->tx.p8x8[i][j] *
+              (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]);
+    }
+    pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4];
+    cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4];
+
+    diff +=
+        (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob);
+  }
+
+  // skip_probs
+  for (i = 0; i < SKIP_CONTEXTS; ++i) {
+    diff += (int)counts->skip[i][0] *
+            (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]);
+
+    pre_last_prob = MAX_PROB - pre_fc->skip_probs[i];
+    cur_last_prob = MAX_PROB - cur_fc->skip_probs[i];
+
+    diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob);
+  }
+
+  // mv
+  for (i = 0; i < MV_JOINTS - 1; ++i) {
+    diff += (int)counts->mv.joints[i] *
+            (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]);
+  }
+  pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2];
+  cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2];
+
+  diff +=
+      (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob);
+
+  for (i = 0; i < 2; ++i) {
+    const nmv_component_counts *nmv_count = &counts->mv.comps[i];
+    const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i];
+    const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i];
+
+    // sign
+    diff += (int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign);
+
+    pre_last_prob = MAX_PROB - pre_nmv_prob->sign;
+    cur_last_prob = MAX_PROB - cur_nmv_prob->sign;
+
+    diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob);
+
+    // classes
+    for (j = 0; j < MV_CLASSES - 1; ++j) {
+      diff += (int)nmv_count->classes[j] *
+              (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]);
+    }
+    pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2];
+    cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2];
+
+    diff += (int)nmv_count->classes[MV_CLASSES - 1] *
+            (pre_last_prob - cur_last_prob);
+
+    // class0
+    for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+      diff += (int)nmv_count->class0[j] *
+              (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]);
+    }
+    pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2];
+    cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2];
+
+    diff += (int)nmv_count->class0[CLASS0_SIZE - 1] *
+            (pre_last_prob - cur_last_prob);
+
+    // bits
+    for (j = 0; j < MV_OFFSET_BITS; ++j) {
+      diff += (int)nmv_count->bits[j][0] *
+              (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]);
+
+      pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j];
+      cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j];
+
+      diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob);
+    }
+
+    // class0_fp
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      for (k = 0; k < MV_FP_SIZE - 1; ++k) {
+        diff += (int)nmv_count->class0_fp[j][k] *
+                (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]);
+      }
+      pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2];
+      cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2];
+
+      diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] *
+              (pre_last_prob - cur_last_prob);
+    }
+
+    // fp
+    for (j = 0; j < MV_FP_SIZE - 1; ++j) {
+      diff +=
+          (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]);
+    }
+    pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2];
+    cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2];
+
+    diff +=
+        (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob);
+
+    // class0_hp
+    diff += (int)nmv_count->class0_hp[0] *
+            (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp);
+
+    pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp;
+    cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp;
+
+    diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob);
+
+    // hp
+    diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp);
+
+    pre_last_prob = MAX_PROB - pre_nmv_prob->hp;
+    cur_last_prob = MAX_PROB - cur_nmv_prob->hp;
+
+    diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob);
+  }
+
+  return -diff;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
 // Test for whether to calculate metrics for the frame.
-static int is_psnr_calc_enabled(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
+static int is_psnr_calc_enabled(const VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
   return cpi->b_calculate_psnr && (oxcf->pass != 1) && cm->show_frame;
@@ -109,58 +470,57 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) {
 
 /* clang-format off */
 const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
-  { LEVEL_1,   829440,      36864,    200,    400,   2, 1,  4,  8 },
-  { LEVEL_1_1, 2764800,     73728,    800,    1000,  2, 1,  4,  8 },
-  { LEVEL_2,   4608000,     122880,   1800,   1500,  2, 1,  4,  8 },
-  { LEVEL_2_1, 9216000,     245760,   3600,   2800,  2, 2,  4,  8 },
-  { LEVEL_3,   20736000,    552960,   7200,   6000,  2, 4,  4,  8 },
-  { LEVEL_3_1, 36864000,    983040,   12000,  10000, 2, 4,  4,  8 },
-  { LEVEL_4,   83558400,    2228224,  18000,  16000, 4, 4,  4,  8 },
-  { LEVEL_4_1, 160432128,   2228224,  30000,  18000, 4, 4,  5,  6 },
-  { LEVEL_5,   311951360,   8912896,  60000,  36000, 6, 8,  6,  4 },
-  { LEVEL_5_1, 588251136,   8912896,  120000, 46000, 8, 8,  10, 4 },
+  //         sample rate    size   breadth  bitrate  cpb
+  { LEVEL_1,   829440,      36864,    512,   200,    400,    2, 1,  4,  8 },
+  { LEVEL_1_1, 2764800,     73728,    768,   800,    1000,   2, 1,  4,  8 },
+  { LEVEL_2,   4608000,     122880,   960,   1800,   1500,   2, 1,  4,  8 },
+  { LEVEL_2_1, 9216000,     245760,   1344,  3600,   2800,   2, 2,  4,  8 },
+  { LEVEL_3,   20736000,    552960,   2048,  7200,   6000,   2, 4,  4,  8 },
+  { LEVEL_3_1, 36864000,    983040,   2752,  12000,  10000,  2, 4,  4,  8 },
+  { LEVEL_4,   83558400,    2228224,  4160,  18000,  16000,  4, 4,  4,  8 },
+  { LEVEL_4_1, 160432128,   2228224,  4160,  30000,  18000,  4, 4,  5,  6 },
+  { LEVEL_5,   311951360,   8912896,  8384,  60000,  36000,  6, 8,  6,  4 },
+  { LEVEL_5_1, 588251136,   8912896,  8384,  120000, 46000,  8, 8,  10, 4 },
   // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
-  // they are finalized (currently TBD).
-  { LEVEL_5_2, 1176502272,  8912896,  180000, 0,     8, 8,  10, 4 },
-  { LEVEL_6,   1176502272,  35651584, 180000, 0,     8, 16, 10, 4 },
-  { LEVEL_6_1, 2353004544u, 35651584, 240000, 0,     8, 16, 10, 4 },
-  { LEVEL_6_2, 4706009088u, 35651584, 480000, 0,     8, 16, 10, 4 },
+  // they are finalized (currently tentative).
+  { LEVEL_5_2, 1176502272,  8912896,  8384,  180000, 90000,  8, 8,  10, 4 },
+  { LEVEL_6,   1176502272,  35651584, 16832, 180000, 90000,  8, 16, 10, 4 },
+  { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 },
+  { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 },
 };
 /* clang-format on */
 
-static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] =
-    { "The average bit-rate is too high.",
-      "The picture size is too large.",
-      "The luma sample rate is too large.",
-      "The CPB size is too large.",
-      "The compression ratio is too small",
-      "Too many column tiles are used.",
-      "The alt-ref distance is too small.",
-      "Too many reference buffers are used." };
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+  "The average bit-rate is too high.",
+  "The picture size is too large.",
+  "The picture width/height is too large.",
+  "The luma sample rate is too large.",
+  "The CPB size is too large.",
+  "The compression ratio is too small",
+  "Too many column tiles are used.",
+  "The alt-ref distance is too small.",
+  "Too many reference buffers are used."
+};
 
-static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
+static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) {
   switch (mode) {
-    case NORMAL:
+    case VP8E_NORMAL:
       *hr = 1;
       *hs = 1;
       break;
-    case FOURFIVE:
+    case VP8E_FOURFIVE:
       *hr = 4;
       *hs = 5;
       break;
-    case THREEFIVE:
+    case VP8E_THREEFIVE:
       *hr = 3;
       *hs = 5;
       break;
-    case ONETWO:
+    default:
+      assert(mode == VP8E_ONETWO);
       *hr = 1;
       *hs = 2;
       break;
-    default:
-      *hr = 1;
-      *hs = 1;
-      assert(0);
-      break;
   }
 }
 
@@ -216,6 +576,72 @@ static void apply_active_map(VP9_COMP *cpi) {
   }
 }
 
+static void apply_roi_map(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+  vpx_roi_map_t *roi = &cpi->roi;
+  const int *delta_q = roi->delta_q;
+  const int *delta_lf = roi->delta_lf;
+  const int *skip = roi->skip;
+  int ref_frame[8];
+  int internal_delta_q[MAX_SEGMENTS];
+  int i;
+
+  // TODO(jianj): Investigate why ROI not working in speed < 5 or in non
+  // realtime mode.
+  if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return;
+  if (!roi->enabled) return;
+
+  memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame));
+
+  vp9_enable_segmentation(seg);
+  vp9_clearall_segfeatures(seg);
+  // Select delta coding method;
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols));
+
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    // Translate the external delta q values to internal values.
+    internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i]));
+    if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i];
+    vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
+    vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
+    if (internal_delta_q[i] != 0) {
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]);
+    }
+    if (delta_lf[i] != 0) {
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]);
+    }
+    if (skip[i] != 0) {
+      vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
+      vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0);
+    }
+    if (ref_frame[i] >= 0) {
+      int valid_ref = 1;
+      // ALTREF is not used as reference for nonrd_pickmode with 0 lag.
+      if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode)
+        valid_ref = 0;
+      // If GOLDEN is selected, make sure it's set as reference.
+      if (ref_frame[i] == GOLDEN_FRAME &&
+          !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) {
+        valid_ref = 0;
+      }
+      // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are
+      // same reference.
+      if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0)
+        ref_frame[i] = LAST_FRAME;
+      if (valid_ref) {
+        vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME);
+        vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]);
+      }
+    }
+  }
+  roi->enabled = 1;
+}
+
 static void init_level_info(Vp9LevelInfo *level_info) {
   Vp9LevelStats *const level_stats = &level_info->level_stats;
   Vp9LevelSpec *const level_spec = &level_info->level_spec;
@@ -226,6 +652,18 @@ static void init_level_info(Vp9LevelInfo *level_info) {
   level_spec->min_altref_distance = INT_MAX;
 }
 
+static int check_seg_range(int seg_data[8], int range) {
+  int i;
+  for (i = 0; i < 8; ++i) {
+    // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+    // undefined.
+    if (seg_data[i] > range || seg_data[i] < -range) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
   int i;
   const Vp9LevelSpec *this_level;
@@ -238,6 +676,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
             (double)this_level->max_luma_sample_rate *
                 (1 + SAMPLE_RATE_GRACE_P) ||
         level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+        level_spec->max_luma_picture_breadth >
+            this_level->max_luma_picture_breadth ||
         level_spec->average_bitrate > this_level->average_bitrate ||
         level_spec->max_cpb_size > this_level->max_cpb_size ||
         level_spec->compression_ratio < this_level->compression_ratio ||
@@ -250,6 +690,63 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
   return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
 }
 
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+                                unsigned int rows, unsigned int cols,
+                                int delta_q[8], int delta_lf[8], int skip[8],
+                                int ref_frame[8]) {
+  VP9_COMMON *cm = &cpi->common;
+  vpx_roi_map_t *roi = &cpi->roi;
+  const int range = 63;
+  const int ref_frame_range = 3;  // Alt-ref
+  const int skip_range = 1;
+  const int frame_rows = cpi->common.mi_rows;
+  const int frame_cols = cpi->common.mi_cols;
+
+  // Check number of rows and columns match
+  if (frame_rows != (int)rows || frame_cols != (int)cols) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
+      !check_seg_range(ref_frame, ref_frame_range) ||
+      !check_seg_range(skip, skip_range))
+    return VPX_CODEC_INVALID_PARAM;
+
+  // Also disable segmentation if no deltas are specified.
+  if (!map ||
+      (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] |
+         delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] |
+         delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] |
+         delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] |
+         skip[5] | skip[6] | skip[7]) &&
+       (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 &&
+        ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 &&
+        ref_frame[6] == -1 && ref_frame[7] == -1))) {
+    vp9_disable_segmentation(&cm->seg);
+    cpi->roi.enabled = 0;
+    return VPX_CODEC_OK;
+  }
+
+  if (roi->roi_map) {
+    vpx_free(roi->roi_map);
+    roi->roi_map = NULL;
+  }
+  roi->roi_map = vpx_malloc(rows * cols);
+  if (!roi->roi_map) return VPX_CODEC_MEM_ERROR;
+
+  // Copy to ROI structure in the compressor.
+  memcpy(roi->roi_map, map, rows * cols);
+  memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0]));
+  memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0]));
+  memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0]));
+  memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0]));
+  roi->enabled = 1;
+  roi->rows = rows;
+  roi->cols = cols;
+
+  return VPX_CODEC_OK;
+}
+
 int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
                        int cols) {
   if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
@@ -327,8 +824,37 @@ static void setup_frame(VP9_COMP *cpi) {
     if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame;
   }
 
+  // TODO(jingning): Overwrite the frame_context_idx index in multi-layer ARF
+  // case. Need some further investigation on if we could apply this to single
+  // layer ARF case as well.
+  if (cpi->multi_layer_arf && !cpi->use_svc) {
+    GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const int gf_group_index = gf_group->index;
+    const int boost_frame =
+        !cpi->rc.is_src_frame_alt_ref &&
+        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+
+    // frame_context_idx           Frame Type
+    //        0              Intra only frame, base layer ARF
+    //        1              ARFs with layer depth = 2,3
+    //        2              ARFs with layer depth > 3
+    //        3              Non-boosted frames
+    if (frame_is_intra_only(cm)) {
+      cm->frame_context_idx = 0;
+    } else if (boost_frame) {
+      if (gf_group->rf_level[gf_group_index] == GF_ARF_STD)
+        cm->frame_context_idx = 0;
+      else if (gf_group->layer_depth[gf_group_index] <= 3)
+        cm->frame_context_idx = 1;
+      else
+        cm->frame_context_idx = 2;
+    } else {
+      cm->frame_context_idx = 3;
+    }
+  }
+
   if (cm->frame_type == KEY_FRAME) {
-    if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1;
+    cpi->refresh_golden_frame = 1;
     cpi->refresh_alt_ref_frame = 1;
     vp9_zero(cpi->interp_filter_selected);
   } else {
@@ -362,10 +888,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
   if (!cm->prev_mip) return 1;
   cm->mi_alloc_size = mi_size;
 
-  cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+  cm->mi_grid_base =
+      (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
   if (!cm->mi_grid_base) return 1;
   cm->prev_mi_grid_base =
-      (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+      (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
   if (!cm->prev_mi_grid_base) return 1;
 
   return 0;
@@ -380,12 +907,17 @@ static void vp9_enc_free_mi(VP9_COMMON *cm) {
   cm->mi_grid_base = NULL;
   vpx_free(cm->prev_mi_grid_base);
   cm->prev_mi_grid_base = NULL;
+  cm->mi_alloc_size = 0;
 }
 
 static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO **temp_base = cm->prev_mi_grid_base;
   MODE_INFO *temp = cm->prev_mip;
+
+  // Skip update prev_mi frame in show_existing_frame mode.
+  if (cm->show_existing_frame) return;
+
   cm->prev_mip = cm->mip;
   cm->mip = temp;
 
@@ -399,22 +931,21 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
 }
 
-void vp9_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    vp9_rtcd();
-    vpx_dsp_rtcd();
-    vpx_scale_rtcd();
-    vp9_init_intra_predictors();
-    vp9_init_me_luts();
-    vp9_rc_init_minq_luts();
-    vp9_entropy_mv_init();
-    vp9_temporal_filter_init();
-    init_done = 1;
-  }
+static void initialize_enc(void) {
+  vp9_rtcd();
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+  vp9_init_intra_predictors();
+  vp9_init_me_luts();
+  vp9_rc_init_minq_luts();
+  vp9_entropy_mv_init();
+#if !CONFIG_REALTIME_ONLY
+  vp9_temporal_filter_init();
+#endif
 }
 
+void vp9_initialize_enc(void) { once(initialize_enc); }
+
 static void dealloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int i;
@@ -450,21 +981,53 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   cpi->nmvsadcosts_hp[0] = NULL;
   cpi->nmvsadcosts_hp[1] = NULL;
 
+  vpx_free(cpi->skin_map);
+  cpi->skin_map = NULL;
+
   vpx_free(cpi->prev_partition);
   cpi->prev_partition = NULL;
 
+  vpx_free(cpi->svc.prev_partition_svc);
+  cpi->svc.prev_partition_svc = NULL;
+
   vpx_free(cpi->prev_segment_id);
   cpi->prev_segment_id = NULL;
 
+  vpx_free(cpi->prev_variance_low);
+  cpi->prev_variance_low = NULL;
+
+  vpx_free(cpi->copied_frame_cnt);
+  cpi->copied_frame_cnt = NULL;
+
+  vpx_free(cpi->content_state_sb_fd);
+  cpi->content_state_sb_fd = NULL;
+
+  vpx_free(cpi->count_arf_frame_usage);
+  cpi->count_arf_frame_usage = NULL;
+  vpx_free(cpi->count_lastgolden_frame_usage);
+  cpi->count_lastgolden_frame_usage = NULL;
+
   vp9_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
   vpx_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+  vpx_free(cpi->roi.roi_map);
+  cpi->roi.roi_map = NULL;
+
   vpx_free(cpi->consec_zero_mv);
   cpi->consec_zero_mv = NULL;
 
+  vpx_free(cpi->mb_wiener_variance);
+  cpi->mb_wiener_variance = NULL;
+
+  vpx_free(cpi->sb_mul_scale);
+  cpi->sb_mul_scale = NULL;
+
+  vpx_free(cpi->mi_ssim_rdmult_scaling_factors);
+  cpi->mi_ssim_rdmult_scaling_factors = NULL;
+
   vp9_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
   vp9_free_postproc_buffers(cm);
@@ -474,7 +1037,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free_frame_buffer(&cpi->last_frame_uf);
   vpx_free_frame_buffer(&cpi->scaled_source);
   vpx_free_frame_buffer(&cpi->scaled_last_source);
-  vpx_free_frame_buffer(&cpi->alt_ref_buffer);
+  vpx_free_frame_buffer(&cpi->tf_buffer);
 #ifdef ENABLE_KF_DENOISE
   vpx_free_frame_buffer(&cpi->raw_unscaled_source);
   vpx_free_frame_buffer(&cpi->raw_scaled_source);
@@ -485,6 +1048,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free(cpi->tile_tok[0][0]);
   cpi->tile_tok[0][0] = 0;
 
+  vpx_free(cpi->tplist[0][0]);
+  cpi->tplist[0][0] = NULL;
+
   vp9_free_pc_tree(&cpi->td);
 
   for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
@@ -494,11 +1060,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
     lc->rc_twopass_stats_in.sz = 0;
   }
 
-  if (cpi->source_diff_var != NULL) {
-    vpx_free(cpi->source_diff_var);
-    cpi->source_diff_var = NULL;
-  }
-
   for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
     vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]);
   }
@@ -570,6 +1131,7 @@ static void restore_coding_context(VP9_COMP *cpi) {
   *cm->fc = cc->fc;
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void configure_static_seg_features(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
@@ -693,6 +1255,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     }
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void update_reference_segmentation_map(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -726,7 +1289,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
                        "Failed to allocate lag buffers");
 
   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
-  if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+  if (vpx_realloc_frame_buffer(&cpi->tf_buffer, oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
                                cm->use_highbitdepth,
@@ -734,7 +1297,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
                                VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
                                NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate altref buffer");
+                       "Failed to allocate temporal filter buffer");
 }
 
 static void alloc_util_frame_buffers(VP9_COMP *cpi) {
@@ -761,8 +1324,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
 
   // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
   // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
-  // target of 1/4x1/4.
-  if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) {
+  // target of 1/4x1/4. number_spatial_layers must be greater than 2.
+  if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+      cpi->svc.number_spatial_layers > 2) {
     cpi->svc.scaled_temp_is_alloc = 1;
     if (vpx_realloc_frame_buffer(
             &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1,
@@ -807,20 +1371,22 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
 #endif
 }
 
-static int alloc_context_buffers_ext(VP9_COMP *cpi) {
+static void alloc_context_buffers_ext(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int mi_size = cm->mi_cols * cm->mi_rows;
 
-  cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base));
-  if (!cpi->mbmi_ext_base) return 1;
-
-  return 0;
+  CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base,
+                  vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
 static void alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
+  int sb_rows;
 
-  vp9_alloc_context_buffers(cm, cm->width, cm->height);
+  if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
 
   alloc_context_buffers_ext(cpi);
 
@@ -828,10 +1394,16 @@ static void alloc_compressor_data(VP9_COMP *cpi) {
 
   {
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
-    CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+    CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0],
                     vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
 
+  sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  vpx_free(cpi->tplist[0][0]);
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->tplist[0][0],
+      vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
+
   vp9_setup_pc_tree(&cpi->common, &cpi->td);
 }
 
@@ -846,14 +1418,23 @@ static void set_tile_limits(VP9_COMP *cpi) {
   int min_log2_tile_cols, max_log2_tile_cols;
   vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
-  if (is_two_pass_svc(cpi) && (cpi->svc.encode_empty_frame_state == ENCODING ||
-                               cpi->svc.number_spatial_layers > 1)) {
-    cm->log2_tile_cols = 0;
-    cm->log2_tile_rows = 0;
-  } else {
-    cm->log2_tile_cols =
-        clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
-    cm->log2_tile_rows = cpi->oxcf.tile_rows;
+  cm->log2_tile_cols =
+      clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+
+  // Max allowed number of tile_rows is 4 (so log2_tile_rows = 2), and each
+  // tile_row contains a multiple of superblocks.
+  const int sb64_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> 3;
+  const int max_log2_tile_rows = (sb64_rows >= 4)   ? 2
+                                 : (sb64_rows >= 2) ? 1
+                                                    : 0;
+  cm->log2_tile_rows = VPXMIN(cpi->oxcf.tile_rows, max_log2_tile_rows);
+
+  if (cpi->oxcf.target_level == LEVEL_AUTO) {
+    const int level_tile_cols =
+        log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
+    if (cm->log2_tile_cols > level_tile_cols) {
+      cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
+    }
   }
 }
 
@@ -869,31 +1450,23 @@ static void update_frame_size(VP9_COMP *cpi) {
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
 
   set_tile_limits(cpi);
-
-  if (is_two_pass_svc(cpi)) {
-    if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height,
-                                 cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                 cm->use_highbitdepth,
-#endif
-                                 VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
-                                 NULL, NULL, NULL))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to reallocate alt_ref_buffer");
-  }
 }
 
 static void init_buffer_indices(VP9_COMP *cpi) {
-  cpi->lst_fb_idx = 0;
-  cpi->gld_fb_idx = 1;
-  cpi->alt_fb_idx = 2;
+  int ref_frame;
+
+  for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+    cpi->ref_fb_idx[ref_frame] = ref_frame;
+
+  cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1];
+  cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+  cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
 }
 
 static void init_level_constraint(LevelConstraint *lc) {
   lc->level_index = -1;
   lc->max_cpb_size = INT_MAX;
   lc->max_frame_size = INT_MAX;
-  lc->rc_config_updated = 0;
   lc->fail_flag = 0;
 }
 
@@ -905,7 +1478,7 @@ static void set_level_constraint(LevelConstraint *ls, int8_t level_index) {
   }
 }
 
-static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
+static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
 
   cpi->oxcf = *oxcf;
@@ -937,7 +1510,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   // Temporal scalability.
   cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
 
-  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+  if ((cpi->svc.number_temporal_layers > 1) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
        cpi->oxcf.pass != 1)) {
@@ -953,10 +1526,32 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   init_buffer_indices(cpi);
 
   vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+  cpi->fixed_qp_onepass = 0;
 }
 
-static void set_rc_buffer_sizes(RATE_CONTROL *rc,
-                                const VP9EncoderConfig *oxcf) {
+void vp9_check_reset_rc_flag(VP9_COMP *cpi) {
+  RATE_CONTROL *rc = &cpi->rc;
+
+  if (cpi->common.current_video_frame >
+      (unsigned int)cpi->svc.number_spatial_layers) {
+    if (cpi->use_svc) {
+      vp9_svc_check_reset_layer_rc_flag(cpi);
+    } else {
+      if (rc->avg_frame_bandwidth / 3 > (rc->last_avg_frame_bandwidth >> 1) ||
+          rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) {
+        rc->rc_1_frame = 0;
+        rc->rc_2_frame = 0;
+        rc->bits_off_target = rc->optimal_buffer_level;
+        rc->buffer_level = rc->optimal_buffer_level;
+      }
+    }
+  }
+}
+
+void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
+  RATE_CONTROL *rc = &cpi->rc;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+
   const int64_t bandwidth = oxcf->target_bandwidth;
   const int64_t starting = oxcf->starting_buffer_level_ms;
   const int64_t optimal = oxcf->optimal_buffer_level_ms;
@@ -967,18 +1562,23 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
       (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
   rc->maximum_buffer_size =
       (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+
+  // Under a configuration change, where maximum_buffer_size may change,
+  // keep buffer level clipped to the maximum allowed buffer size.
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                           \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                         \
-  cpi->fn_ptr[BT].vf = VF;                                             \
-  cpi->fn_ptr[BT].svf = SVF;                                           \
-  cpi->fn_ptr[BT].svaf = SVAF;                                         \
-  cpi->fn_ptr[BT].sdx3f = SDX3F;                                       \
-  cpi->fn_ptr[BT].sdx8f = SDX8F;                                       \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdf = SDF;                                            \
+  cpi->fn_ptr[BT].sdsf = SDSF;                                          \
+  cpi->fn_ptr[BT].sdaf = SDAF;                                          \
+  cpi->fn_ptr[BT].vf = VF;                                              \
+  cpi->fn_ptr[BT].svf = SVF;                                            \
+  cpi->fn_ptr[BT].svaf = SVAF;                                          \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                      \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
 
 #define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
   static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
@@ -1016,47 +1616,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
            4;                                                                  \
   }
 
-#define MAKE_BFP_SAD3_WRAPPER(fnname)                                    \
-  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,  \
-                             const uint8_t *ref_ptr, int ref_stride,     \
-                             unsigned int *sad_array) {                  \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-  }                                                                      \
-  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 3; i++) sad_array[i] >>= 2;                          \
-  }                                                                      \
-  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 3; i++) sad_array[i] >>= 4;                          \
-  }
-
-#define MAKE_BFP_SAD8_WRAPPER(fnname)                                    \
-  static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,  \
-                             const uint8_t *ref_ptr, int ref_stride,     \
-                             unsigned int *sad_array) {                  \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-  }                                                                      \
-  static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 8; i++) sad_array[i] >>= 2;                          \
-  }                                                                      \
-  static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
-                              const uint8_t *ref_ptr, int ref_stride,    \
-                              unsigned int *sad_array) {                 \
-    int i;                                                               \
-    fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array);      \
-    for (i = 0; i < 8; i++) sad_array[i] >>= 4;                          \
-  }
 #define MAKE_BFP_SAD4D_WRAPPER(fnname)                                        \
   static void fnname##_bits8(const uint8_t *src_ptr, int source_stride,       \
                              const uint8_t *const ref_ptr[], int ref_stride,  \
@@ -1079,322 +1638,362 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
   }
 
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d)
+
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
-MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d)
 
 static void highbd_set_var_fns(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (cm->use_highbitdepth) {
     switch (cm->bit_depth) {
       case VPX_BITS_8:
-        HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8,
-                   vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16,
-                   vpx_highbd_8_sub_pixel_variance32x16,
-                   vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL,
-                   vpx_highbd_sad32x16x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8,
-                   vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32,
-                   vpx_highbd_8_sub_pixel_variance16x32,
-                   vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL,
-                   vpx_highbd_sad16x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8,
-                   vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32,
-                   vpx_highbd_8_sub_pixel_variance64x32,
-                   vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL,
-                   vpx_highbd_sad64x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8,
-                   vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64,
-                   vpx_highbd_8_sub_pixel_variance32x64,
-                   vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL,
-                   vpx_highbd_sad32x64x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8,
-                   vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32,
-                   vpx_highbd_8_sub_pixel_variance32x32,
-                   vpx_highbd_8_sub_pixel_avg_variance32x32,
-                   vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8,
-                   vpx_highbd_sad32x32x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8,
-                   vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64,
-                   vpx_highbd_8_sub_pixel_variance64x64,
-                   vpx_highbd_8_sub_pixel_avg_variance64x64,
-                   vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8,
-                   vpx_highbd_sad64x64x4d_bits8)
-
-        HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8,
-                   vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16,
-                   vpx_highbd_8_sub_pixel_variance16x16,
-                   vpx_highbd_8_sub_pixel_avg_variance16x16,
-                   vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8,
-                   vpx_highbd_sad16x16x4d_bits8)
+        HIGHBD_BFP(
+            BLOCK_32X16, vpx_highbd_sad32x16_bits8,
+            vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8,
+            vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16,
+            vpx_highbd_8_sub_pixel_avg_variance32x16,
+            vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8)
 
         HIGHBD_BFP(
-            BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
+            BLOCK_16X32, vpx_highbd_sad16x32_bits8,
+            vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8,
+            vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32,
+            vpx_highbd_8_sub_pixel_avg_variance16x32,
+            vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_64X32, vpx_highbd_sad64x32_bits8,
+            vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8,
+            vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32,
+            vpx_highbd_8_sub_pixel_avg_variance64x32,
+            vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_32X64, vpx_highbd_sad32x64_bits8,
+            vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8,
+            vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64,
+            vpx_highbd_8_sub_pixel_avg_variance32x64,
+            vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_32X32, vpx_highbd_sad32x32_bits8,
+            vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8,
+            vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32,
+            vpx_highbd_8_sub_pixel_avg_variance32x32,
+            vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_64X64, vpx_highbd_sad64x64_bits8,
+            vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8,
+            vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64,
+            vpx_highbd_8_sub_pixel_avg_variance64x64,
+            vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_16X16, vpx_highbd_sad16x16_bits8,
+            vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8,
+            vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16,
+            vpx_highbd_8_sub_pixel_avg_variance16x16,
+            vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8)
+
+        HIGHBD_BFP(
+            BLOCK_16X8, vpx_highbd_sad16x8_bits8,
+            vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
             vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8,
-            vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8,
-            vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8)
+            vpx_highbd_8_sub_pixel_avg_variance16x8,
+            vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8)
 
         HIGHBD_BFP(
-            BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
+            BLOCK_8X16, vpx_highbd_sad8x16_bits8,
+            vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
             vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16,
-            vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8,
-            vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8)
+            vpx_highbd_8_sub_pixel_avg_variance8x16,
+            vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8)
 
-        HIGHBD_BFP(
-            BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
-            vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
-            vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8,
-            vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8)
+        HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8,
+                   vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
+                   vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
+                   vpx_highbd_8_sub_pixel_avg_variance8x8,
+                   vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8)
 
         HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8,
-                   vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4,
-                   vpx_highbd_8_sub_pixel_variance8x4,
-                   vpx_highbd_8_sub_pixel_avg_variance8x4, NULL,
-                   vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8)
+                   vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
+                   vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
+                   vpx_highbd_8_sub_pixel_avg_variance8x4,
+                   vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8)
 
         HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8,
-                   vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8,
-                   vpx_highbd_8_sub_pixel_variance4x8,
-                   vpx_highbd_8_sub_pixel_avg_variance4x8, NULL,
-                   vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8)
+                   vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
+                   vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
+                   vpx_highbd_8_sub_pixel_avg_variance4x8,
+                   vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8)
 
-        HIGHBD_BFP(
-            BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
-            vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
-            vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8,
-            vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8)
+        HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8,
+                   vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
+                   vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
+                   vpx_highbd_8_sub_pixel_avg_variance4x4,
+                   vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8)
         break;
 
       case VPX_BITS_10:
-        HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10,
-                   vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16,
-                   vpx_highbd_10_sub_pixel_variance32x16,
-                   vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL,
-                   vpx_highbd_sad32x16x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10,
-                   vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32,
-                   vpx_highbd_10_sub_pixel_variance16x32,
-                   vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL,
-                   vpx_highbd_sad16x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10,
-                   vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32,
-                   vpx_highbd_10_sub_pixel_variance64x32,
-                   vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL,
-                   vpx_highbd_sad64x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10,
-                   vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64,
-                   vpx_highbd_10_sub_pixel_variance32x64,
-                   vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL,
-                   vpx_highbd_sad32x64x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10,
-                   vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32,
-                   vpx_highbd_10_sub_pixel_variance32x32,
-                   vpx_highbd_10_sub_pixel_avg_variance32x32,
-                   vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10,
-                   vpx_highbd_sad32x32x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10,
-                   vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64,
-                   vpx_highbd_10_sub_pixel_variance64x64,
-                   vpx_highbd_10_sub_pixel_avg_variance64x64,
-                   vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10,
-                   vpx_highbd_sad64x64x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10,
-                   vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16,
-                   vpx_highbd_10_sub_pixel_variance16x16,
-                   vpx_highbd_10_sub_pixel_avg_variance16x16,
-                   vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10,
-                   vpx_highbd_sad16x16x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10,
-                   vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8,
-                   vpx_highbd_10_sub_pixel_variance16x8,
-                   vpx_highbd_10_sub_pixel_avg_variance16x8,
-                   vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10,
-                   vpx_highbd_sad16x8x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10,
-                   vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16,
-                   vpx_highbd_10_sub_pixel_variance8x16,
-                   vpx_highbd_10_sub_pixel_avg_variance8x16,
-                   vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10,
-                   vpx_highbd_sad8x16x4d_bits10)
+        HIGHBD_BFP(
+            BLOCK_32X16, vpx_highbd_sad32x16_bits10,
+            vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10,
+            vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16,
+            vpx_highbd_10_sub_pixel_avg_variance32x16,
+            vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10)
 
         HIGHBD_BFP(
-            BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10,
-            vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8,
-            vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10,
-            vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10,
-                   vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
-                   vpx_highbd_10_sub_pixel_variance8x4,
-                   vpx_highbd_10_sub_pixel_avg_variance8x4, NULL,
-                   vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10)
-
-        HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10,
-                   vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
-                   vpx_highbd_10_sub_pixel_variance4x8,
-                   vpx_highbd_10_sub_pixel_avg_variance4x8, NULL,
-                   vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10)
+            BLOCK_16X32, vpx_highbd_sad16x32_bits10,
+            vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10,
+            vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32,
+            vpx_highbd_10_sub_pixel_avg_variance16x32,
+            vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10)
 
         HIGHBD_BFP(
-            BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10,
-            vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4,
-            vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10,
-            vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10)
-        break;
-
-      case VPX_BITS_12:
-        HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12,
-                   vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16,
-                   vpx_highbd_12_sub_pixel_variance32x16,
-                   vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL,
-                   vpx_highbd_sad32x16x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12,
-                   vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32,
-                   vpx_highbd_12_sub_pixel_variance16x32,
-                   vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL,
-                   vpx_highbd_sad16x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12,
-                   vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32,
-                   vpx_highbd_12_sub_pixel_variance64x32,
-                   vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL,
-                   vpx_highbd_sad64x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12,
-                   vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64,
-                   vpx_highbd_12_sub_pixel_variance32x64,
-                   vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL,
-                   vpx_highbd_sad32x64x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12,
-                   vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32,
-                   vpx_highbd_12_sub_pixel_variance32x32,
-                   vpx_highbd_12_sub_pixel_avg_variance32x32,
-                   vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12,
-                   vpx_highbd_sad32x32x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12,
-                   vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64,
-                   vpx_highbd_12_sub_pixel_variance64x64,
-                   vpx_highbd_12_sub_pixel_avg_variance64x64,
-                   vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12,
-                   vpx_highbd_sad64x64x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12,
-                   vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16,
-                   vpx_highbd_12_sub_pixel_variance16x16,
-                   vpx_highbd_12_sub_pixel_avg_variance16x16,
-                   vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12,
-                   vpx_highbd_sad16x16x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12,
-                   vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8,
-                   vpx_highbd_12_sub_pixel_variance16x8,
-                   vpx_highbd_12_sub_pixel_avg_variance16x8,
-                   vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12,
-                   vpx_highbd_sad16x8x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12,
-                   vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16,
-                   vpx_highbd_12_sub_pixel_variance8x16,
-                   vpx_highbd_12_sub_pixel_avg_variance8x16,
-                   vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12,
-                   vpx_highbd_sad8x16x4d_bits12)
+            BLOCK_64X32, vpx_highbd_sad64x32_bits10,
+            vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10,
+            vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32,
+            vpx_highbd_10_sub_pixel_avg_variance64x32,
+            vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10)
 
         HIGHBD_BFP(
-            BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12,
-            vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8,
-            vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12,
-            vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12,
-                   vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
-                   vpx_highbd_12_sub_pixel_variance8x4,
-                   vpx_highbd_12_sub_pixel_avg_variance8x4, NULL,
-                   vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12)
-
-        HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12,
-                   vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
-                   vpx_highbd_12_sub_pixel_variance4x8,
-                   vpx_highbd_12_sub_pixel_avg_variance4x8, NULL,
-                   vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12)
+            BLOCK_32X64, vpx_highbd_sad32x64_bits10,
+            vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10,
+            vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64,
+            vpx_highbd_10_sub_pixel_avg_variance32x64,
+            vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10)
 
         HIGHBD_BFP(
-            BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12,
-            vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4,
-            vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12,
-            vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12)
+            BLOCK_32X32, vpx_highbd_sad32x32_bits10,
+            vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10,
+            vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32,
+            vpx_highbd_10_sub_pixel_avg_variance32x32,
+            vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_64X64, vpx_highbd_sad64x64_bits10,
+            vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10,
+            vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64,
+            vpx_highbd_10_sub_pixel_avg_variance64x64,
+            vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_16X16, vpx_highbd_sad16x16_bits10,
+            vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10,
+            vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16,
+            vpx_highbd_10_sub_pixel_avg_variance16x16,
+            vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_16X8, vpx_highbd_sad16x8_bits10,
+            vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10,
+            vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8,
+            vpx_highbd_10_sub_pixel_avg_variance16x8,
+            vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_8X16, vpx_highbd_sad8x16_bits10,
+            vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10,
+            vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16,
+            vpx_highbd_10_sub_pixel_avg_variance8x16,
+            vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10,
+            vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
+            vpx_highbd_10_sub_pixel_variance8x8,
+            vpx_highbd_10_sub_pixel_avg_variance8x8,
+            vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10,
+            vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
+            vpx_highbd_10_sub_pixel_variance8x4,
+            vpx_highbd_10_sub_pixel_avg_variance8x4,
+            vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10,
+            vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
+            vpx_highbd_10_sub_pixel_variance4x8,
+            vpx_highbd_10_sub_pixel_avg_variance4x8,
+            vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10)
+
+        HIGHBD_BFP(
+            BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10,
+            vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
+            vpx_highbd_10_sub_pixel_variance4x4,
+            vpx_highbd_10_sub_pixel_avg_variance4x4,
+            vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10)
         break;
 
       default:
-        assert(0 &&
-               "cm->bit_depth should be VPX_BITS_8, "
-               "VPX_BITS_10 or VPX_BITS_12");
+        assert(cm->bit_depth == VPX_BITS_12);
+        HIGHBD_BFP(
+            BLOCK_32X16, vpx_highbd_sad32x16_bits12,
+            vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12,
+            vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16,
+            vpx_highbd_12_sub_pixel_avg_variance32x16,
+            vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_16X32, vpx_highbd_sad16x32_bits12,
+            vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12,
+            vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32,
+            vpx_highbd_12_sub_pixel_avg_variance16x32,
+            vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_64X32, vpx_highbd_sad64x32_bits12,
+            vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12,
+            vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32,
+            vpx_highbd_12_sub_pixel_avg_variance64x32,
+            vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_32X64, vpx_highbd_sad32x64_bits12,
+            vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12,
+            vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64,
+            vpx_highbd_12_sub_pixel_avg_variance32x64,
+            vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_32X32, vpx_highbd_sad32x32_bits12,
+            vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12,
+            vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32,
+            vpx_highbd_12_sub_pixel_avg_variance32x32,
+            vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_64X64, vpx_highbd_sad64x64_bits12,
+            vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12,
+            vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64,
+            vpx_highbd_12_sub_pixel_avg_variance64x64,
+            vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_16X16, vpx_highbd_sad16x16_bits12,
+            vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12,
+            vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16,
+            vpx_highbd_12_sub_pixel_avg_variance16x16,
+            vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_16X8, vpx_highbd_sad16x8_bits12,
+            vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12,
+            vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8,
+            vpx_highbd_12_sub_pixel_avg_variance16x8,
+            vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_8X16, vpx_highbd_sad8x16_bits12,
+            vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12,
+            vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16,
+            vpx_highbd_12_sub_pixel_avg_variance8x16,
+            vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12,
+            vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
+            vpx_highbd_12_sub_pixel_variance8x8,
+            vpx_highbd_12_sub_pixel_avg_variance8x8,
+            vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12,
+            vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
+            vpx_highbd_12_sub_pixel_variance8x4,
+            vpx_highbd_12_sub_pixel_avg_variance8x4,
+            vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12,
+            vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
+            vpx_highbd_12_sub_pixel_variance4x8,
+            vpx_highbd_12_sub_pixel_avg_variance4x8,
+            vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12)
+
+        HIGHBD_BFP(
+            BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12,
+            vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
+            vpx_highbd_12_sub_pixel_variance4x4,
+            vpx_highbd_12_sub_pixel_avg_variance4x4,
+            vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12)
+        break;
     }
   }
 }
@@ -1405,32 +2004,89 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) {
 
   // Create the encoder segmentation map and set all entries to 0
   vpx_free(cpi->segmentation_map);
-  CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+  CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // Create a map used for cyclic background refresh.
   if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh);
-  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+  CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh,
                   vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
 
   // Create a map used to mark inactive areas.
   vpx_free(cpi->active_map.map);
-  CHECK_MEM_ERROR(cm, cpi->active_map.map,
+  CHECK_MEM_ERROR(&cm->error, cpi->active_map.map,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
-  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+  CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy,
                   vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
+static void alloc_copy_partition_data(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cpi->prev_partition == NULL) {
+    CHECK_MEM_ERROR(&cm->error, cpi->prev_partition,
+                    (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
+                                             sizeof(*cpi->prev_partition)));
+  }
+  if (cpi->prev_segment_id == NULL) {
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->prev_segment_id,
+        (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                             sizeof(*cpi->prev_segment_id)));
+  }
+  if (cpi->prev_variance_low == NULL) {
+    CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low,
+                    (uint8_t *)vpx_calloc(
+                        (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25,
+                        sizeof(*cpi->prev_variance_low)));
+  }
+  if (cpi->copied_frame_cnt == NULL) {
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->copied_frame_cnt,
+        (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                              sizeof(*cpi->copied_frame_cnt)));
+  }
+}
+
+static void free_copy_partition_data(VP9_COMP *cpi) {
+  vpx_free(cpi->prev_partition);
+  cpi->prev_partition = NULL;
+  vpx_free(cpi->prev_segment_id);
+  cpi->prev_segment_id = NULL;
+  vpx_free(cpi->prev_variance_low);
+  cpi->prev_variance_low = NULL;
+  vpx_free(cpi->copied_frame_cnt);
+  cpi->copied_frame_cnt = NULL;
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      !cpi->denoiser.frame_buffer_initialized) {
+    if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc,
+                           cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                           cm->use_highbitdepth,
+#endif
+                           VP9_ENC_BORDER_IN_PIXELS))
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate denoiser");
+  }
+}
+#endif
+
 void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int last_w = cpi->oxcf.width;
   int last_h = cpi->oxcf.height;
 
+  vp9_init_quantizer(cpi);
   if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
   cm->color_space = oxcf->color_space;
@@ -1473,12 +2129,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   }
   cpi->encode_breakout = cpi->oxcf.encode_breakout;
 
-  set_rc_buffer_sizes(rc, &cpi->oxcf);
-
-  // Under a configuration change, where maximum_buffer_size may change,
-  // keep buffer level clipped to the maximum allowed buffer size.
-  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
-  rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
+  vp9_set_rc_buffer_sizes(cpi);
 
   // Set up frame rate and related parameters rate control values.
   vp9_new_framerate(cpi, cpi->framerate);
@@ -1502,19 +2153,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     cpi->external_resize = 1;
   }
 
-  if (cpi->initial_width) {
-    int new_mi_size = 0;
-    vp9_set_mb_mi(cm, cm->width, cm->height);
-    new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
-    if (cm->mi_alloc_size < new_mi_size) {
-      vp9_free_context_buffers(cm);
-      alloc_compressor_data(cpi);
-      realloc_segmentation_maps(cpi);
-      cpi->initial_width = cpi->initial_height = 0;
-      cpi->external_resize = 0;
-    } else if (cm->mi_alloc_size == new_mi_size &&
-               (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
-      vp9_alloc_loop_filter(cm);
+  int new_mi_size = 0;
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    vp9_free_context_buffers(cm);
+    vp9_free_pc_tree(&cpi->td);
+    vpx_free(cpi->mbmi_ext_base);
+    alloc_compressor_data(cpi);
+    realloc_segmentation_maps(cpi);
+    cpi->initial_width = cpi->initial_height = 0;
+    cpi->external_resize = 0;
+  } else if (cm->mi_alloc_size == new_mi_size &&
+             (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+    if (vp9_alloc_loop_filter(cm)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate loop filter data");
     }
   }
 
@@ -1523,13 +2177,62 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
     update_frame_size(cpi);
 
   if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
-    memset(cpi->consec_zero_mv, 0,
-           cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vpx_free(cpi->consec_zero_mv);
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->consec_zero_mv,
+        vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
+
+    vpx_free(cpi->skin_map);
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->skin_map,
+        vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map)));
+
+    if (cpi->svc.number_spatial_layers > 1) {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+      // Reset the denoiser for svc on the resize change.
+      if (cpi->oxcf.noise_sensitivity > 0) {
+        vp9_denoiser_free(&cpi->denoiser);
+        setup_denoiser_buffer(cpi);
+      }
+#endif
+      if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+        for (int sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, 0, cpi->svc.number_temporal_layers);
+          LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer];
+          lc->sb_index = 0;
+          lc->actual_num_seg1_blocks = 0;
+          lc->actual_num_seg2_blocks = 0;
+          lc->counter_encode_maxq_scene_change = 0;
+          vpx_free(lc->map);
+          CHECK_MEM_ERROR(
+              &cm->error, lc->map,
+              vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*lc->map)));
+          vpx_free(lc->last_coded_q_map);
+          CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
+                          vpx_malloc(cm->mi_rows * cm->mi_cols *
+                                     sizeof(*lc->last_coded_q_map)));
+          memset(lc->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
+          vpx_free(lc->consec_zero_mv);
+          CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
+                          vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                     sizeof(*lc->consec_zero_mv)));
+        }
+        cpi->refresh_golden_frame = 1;
+        cpi->refresh_alt_ref_frame = 1;
+      }
+    }
+
+    free_copy_partition_data(cpi);
+    alloc_copy_partition_data(cpi);
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi->svc.number_spatial_layers == 1)
       vp9_cyclic_refresh_reset_resize(cpi);
+    rc->rc_1_frame = 0;
+    rc->rc_2_frame = 0;
   }
 
-  if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+  if ((cpi->svc.number_temporal_layers > 1) ||
       ((cpi->svc.number_temporal_layers > 1 ||
         cpi->svc.number_spatial_layers > 1) &&
        cpi->oxcf.pass != 1)) {
@@ -1537,6 +2240,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
                                            (int)cpi->oxcf.target_bandwidth);
   }
 
+  vp9_check_reset_rc_flag(cpi);
+
   cpi->alt_ref_source = NULL;
   rc->is_src_frame_alt_ref = 0;
 
@@ -1554,12 +2259,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
-}
 
-#ifndef M_LOG2_E
-#define M_LOG2_E 0.693147180559945309417
-#endif
-#define log2f(x) (log(x) / (float)M_LOG2_E)
+  vp9_set_row_mt(cpi);
+}
 
 /***********************************************************************
  * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts'    *
@@ -1567,7 +2269,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
  * The following 2 functions ('cal_nmvjointsadcost' and                *
  * 'cal_nmvsadcosts') are used to calculate cost lookup tables         *
  * used by 'vp9_diamond_search_sad'. The C implementation of the       *
- * function is generic, but the AVX intrinsics optimised version       *
+ * function is generic, but the NEON intrinsics optimised version      *
  * relies on the following properties of the computed tables:          *
  * For cal_nmvjointsadcost:                                            *
  *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]     *
@@ -1576,7 +2278,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
  *         (Equal costs for both components)                           *
  *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                  *
  *         (Cost function is even)                                     *
- * If these do not hold, then the AVX optimised version of the         *
+ * If these do not hold, then the NEON optimised version of the        *
  * 'vp9_diamond_search_sad' function cannot be used as it is, in which *
  * case you can revert to using the C function instead.                *
  ***********************************************************************/
@@ -1624,10 +2326,57 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
   } while (++i <= MV_MAX);
 }
 
-VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+  int i;
+  BufferPool *const pool = cm->buffer_pool;
+  cm->new_fb_idx = INVALID_IDX;
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = INVALID_IDX;
+  }
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    pool->frame_bufs[i].ref_count = 0;
+  }
+}
+
+static void update_initial_width(VP9_COMP *cpi, int use_highbitdepth,
+                                 int subsampling_x, int subsampling_y) {
+  VP9_COMMON *const cm = &cpi->common;
+#if !CONFIG_VP9_HIGHBITDEPTH
+  (void)use_highbitdepth;
+  assert(use_highbitdepth == 0);
+#endif
+
+  if (!cpi->initial_width ||
+#if CONFIG_VP9_HIGHBITDEPTH
+      cm->use_highbitdepth != use_highbitdepth ||
+#endif
+      cm->subsampling_x != subsampling_x ||
+      cm->subsampling_y != subsampling_y) {
+    cm->subsampling_x = subsampling_x;
+    cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+    cm->use_highbitdepth = use_highbitdepth;
+#endif
+    alloc_util_frame_buffers(cpi);
+    // The initial_width/height is used to clamp the encoding width/height in
+    // vp9_set_size_literal(). The check below is added to avoid setting the
+    // initial_width/height to a smaller resolution than the one configured.
+    // This can happen when the user passes in a lower resolution on the very
+    // first frame (after creating the encoder with a larger resolution). For
+    // spatial layers this will prevent user from going back up in resolution
+    // (i.e., the top layer will get stuck at the lower resolution).
+    if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+      cpi->initial_width = cm->width;
+      cpi->initial_height = cm->height;
+    }
+    cpi->initial_mbs = cm->MBs;
+  }
+}
+
+VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
-  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi));
   VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
 
   if (!cm) return NULL;
@@ -1645,74 +2394,74 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   cm->free_mi = vp9_enc_free_mi;
   cm->setup_mi = vp9_enc_setup_mi;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(&cm->error, cm->fc,
+                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
   CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
+      &cm->error, cm->frame_contexts,
       (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
 
+  cpi->compute_frame_low_motion_onepass = 1;
   cpi->use_svc = 0;
-  cpi->resize_state = 0;
+  cpi->resize_state = ORIG;
   cpi->external_resize = 0;
   cpi->resize_avg_qp = 0;
   cpi->resize_buffer_underflow = 0;
   cpi->use_skin_detection = 0;
   cpi->common.buffer_pool = pool;
+  init_ref_frame_bufs(cm);
 
   cpi->force_update_segmentation = 0;
 
   init_config(cpi, oxcf);
-  vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+  cpi->frame_info = vp9_get_frame_info(oxcf);
 
-  cm->current_video_frame = 0;
-  cpi->partition_search_skippable_frame = 0;
+  vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+  vp9_init_rd_parameters(cpi);
+
+  init_frame_indexes(cm);
+  cpi->initial_width = cpi->oxcf.width;
+  cpi->initial_height = cpi->oxcf.height;
   cpi->tile_data = NULL;
 
   realloc_segmentation_maps(cpi);
 
-  CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->skin_map,
+      vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map)));
+
+#if !CONFIG_REALTIME_ONLY
+  CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+#endif
 
   CHECK_MEM_ERROR(
-      cm, cpi->consec_zero_mv,
+      &cm->error, cpi->consec_zero_mv,
       vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
 
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+  CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {
     CHECK_MEM_ERROR(
-        cm, cpi->mbgraph_stats[i].mb_stats,
+        &cm->error, cpi->mbgraph_stats[i].mb_stats,
         vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
   }
 
-#if CONFIG_FP_MB_STATS
-  cpi->use_fp_mb_stats = 0;
-  if (cpi->use_fp_mb_stats) {
-    // a place holder used to store the first pass mb stats in the first pass
-    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
-                    vpx_calloc(cm->MBs * sizeof(uint8_t), 1));
-  } else {
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
-
   cpi->refresh_alt_ref_frame = 0;
-  cpi->multi_arf_last_grp_enabled = 0;
-
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 
   init_level_info(&cpi->level_info);
@@ -1752,10 +2501,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   }
 
   if (cpi->b_calculate_consistency) {
-    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
-                    vpx_malloc(sizeof(*cpi->ssim_vars) * 4 *
-                               cpi->common.mi_rows * cpi->common.mi_cols));
+    CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars,
+                    vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
+                               sizeof(*cpi->ssim_vars) * 4));
     cpi->worst_consistency = 100.0;
+  } else {
+    cpi->ssim_vars = NULL;
   }
 
 #endif
@@ -1785,11 +2536,16 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 #endif
 #endif
 #ifdef OUTPUT_YUV_SKINMAP
-  yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+  yuv_skinmap_file = fopen("skinmap.yuv", "wb");
 #endif
 #ifdef OUTPUT_YUV_REC
   yuv_rec_file = fopen("rec.yuv", "wb");
 #endif
+#ifdef OUTPUT_YUV_SVC_SRC
+  yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb");
+  yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb");
+  yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb");
+#endif
 
 #if 0
   framepsnr = fopen("framepsnr.stt", "a");
@@ -1798,6 +2554,14 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
 
+  {
+    vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed");
+    }
+  }
+
+#if !CONFIG_REALTIME_ONLY
   if (oxcf->pass == 1) {
     vp9_init_first_pass(cpi);
   } else if (oxcf->pass == 2) {
@@ -1808,130 +2572,149 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
         cpi->svc.number_temporal_layers > 1) {
       FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
       FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 };
-      int i;
+      int n;
 
-      for (i = 0; i < oxcf->ss_number_layers; ++i) {
+      for (n = 0; n < oxcf->ss_number_layers; ++n) {
         FIRSTPASS_STATS *const last_packet_for_layer =
-            &stats[packets - oxcf->ss_number_layers + i];
+            &stats[packets - oxcf->ss_number_layers + n];
         const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
         const int packets_in_layer = (int)last_packet_for_layer->count + 1;
         if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
+          int num_frames;
           LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id];
 
           vpx_free(lc->rc_twopass_stats_in.buf);
 
           lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
-          CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf,
+          CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf,
                           vpx_malloc(lc->rc_twopass_stats_in.sz));
           lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
           lc->twopass.stats_in = lc->twopass.stats_in_start;
           lc->twopass.stats_in_end =
               lc->twopass.stats_in_start + packets_in_layer - 1;
+          // Note the last packet is cumulative first pass stats.
+          // So the number of frames is packet number minus one
+          num_frames = packets_in_layer - 1;
+          fps_init_first_pass_info(&lc->twopass.first_pass_info,
+                                   lc->rc_twopass_stats_in.buf, num_frames);
           stats_copy[layer_id] = lc->rc_twopass_stats_in.buf;
         }
       }
 
-      for (i = 0; i < packets; ++i) {
-        const int layer_id = (int)stats[i].spatial_layer_id;
+      for (n = 0; n < packets; ++n) {
+        const int layer_id = (int)stats[n].spatial_layer_id;
         if (layer_id >= 0 && layer_id < oxcf->ss_number_layers &&
             stats_copy[layer_id] != NULL) {
-          *stats_copy[layer_id] = stats[i];
+          *stats_copy[layer_id] = stats[n];
           ++stats_copy[layer_id];
         }
       }
 
       vp9_init_second_pass_spatial_svc(cpi);
     } else {
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        const size_t psz = cpi->common.MBs * sizeof(uint8_t);
-        const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
-
-        cpi->twopass.firstpass_mb_stats.mb_stats_start =
-            oxcf->firstpass_mb_stats_in.buf;
-        cpi->twopass.firstpass_mb_stats.mb_stats_end =
-            cpi->twopass.firstpass_mb_stats.mb_stats_start +
-            (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
-      }
-#endif
+      int num_frames;
 
       cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
       cpi->twopass.stats_in = cpi->twopass.stats_in_start;
       cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+      // Note the last packet is cumulative first pass stats.
+      // So the number of frames is packet number minus one
+      num_frames = packets - 1;
+      fps_init_first_pass_info(&cpi->twopass.first_pass_info,
+                               oxcf->two_pass_stats_in.buf, num_frames);
 
       vp9_init_second_pass(cpi);
     }
   }
+#endif  // !CONFIG_REALTIME_ONLY
 
-  vp9_set_speed_features_framesize_independent(cpi);
-  vp9_set_speed_features_framesize_dependent(cpi);
+  cpi->mb_wiener_var_cols = 0;
+  cpi->mb_wiener_var_rows = 0;
+  cpi->mb_wiener_variance = NULL;
 
-  // Allocate memory to store variances for a frame.
-  CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
-  cpi->source_var_thresh = 0;
-  cpi->frames_till_next_var_check = 0;
+  vp9_set_speed_features_framesize_independent(cpi, oxcf->speed);
+  vp9_set_speed_features_framesize_dependent(cpi, oxcf->speed);
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                    \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                  \
-  cpi->fn_ptr[BT].vf = VF;                                      \
-  cpi->fn_ptr[BT].svf = SVF;                                    \
-  cpi->fn_ptr[BT].svaf = SVAF;                                  \
-  cpi->fn_ptr[BT].sdx3f = SDX3F;                                \
-  cpi->fn_ptr[BT].sdx8f = SDX8F;                                \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+  {
+    const int bsize = BLOCK_16X16;
+    const int w = num_8x8_blocks_wide_lookup[bsize];
+    const int h = num_8x8_blocks_high_lookup[bsize];
+    const int num_cols = (cm->mi_cols + w - 1) / w;
+    const int num_rows = (cm->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors,
+                    vpx_calloc(num_rows * num_cols,
+                               sizeof(*cpi->mi_ssim_rdmult_scaling_factors)));
+  }
 
-  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
-      vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL,
-      vpx_sad32x16x4d)
+  cpi->kmeans_data_arr_alloc = 0;
+#if CONFIG_NON_GREEDY_MV
+  cpi->tpl_ready = 0;
+#endif  // CONFIG_NON_GREEDY_MV
+  for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) {
+    cpi->tpl_stats[i].tpl_stats_ptr = NULL;
+  }
 
-  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
-      vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL,
-      vpx_sad16x32x4d)
+#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+  cpi->fn_ptr[BT].sdf = SDF;                                     \
+  cpi->fn_ptr[BT].sdsf = SDSF;                                   \
+  cpi->fn_ptr[BT].sdaf = SDAF;                                   \
+  cpi->fn_ptr[BT].vf = VF;                                       \
+  cpi->fn_ptr[BT].svf = SVF;                                     \
+  cpi->fn_ptr[BT].svaf = SVAF;                                   \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                               \
+  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
 
-  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
-      vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL,
-      vpx_sad64x32x4d)
+  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg,
+      vpx_variance32x16, vpx_sub_pixel_variance32x16,
+      vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d)
 
-  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
-      vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL,
-      vpx_sad32x64x4d)
+  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg,
+      vpx_variance16x32, vpx_sub_pixel_variance16x32,
+      vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d)
 
-  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
-      vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
-      vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d)
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg,
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d)
 
-  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
-      vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
-      vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d)
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg,
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d)
 
-  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
-      vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
-      vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d)
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d)
 
-  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
-      vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3,
-      vpx_sad16x8x8, vpx_sad16x8x4d)
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d)
 
-  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
-      vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3,
-      vpx_sad8x16x8, vpx_sad8x16x4d)
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d)
 
-  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
-      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3,
-      vpx_sad8x8x8, vpx_sad8x8x4d)
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d)
 
-  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
-      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL,
-      vpx_sad8x4x8, vpx_sad8x4x4d)
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d)
 
-  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
-      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL,
-      vpx_sad4x8x8, vpx_sad4x8x4d)
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8,
+      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+      vpx_sad_skip_8x8x4d)
 
-  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
-      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3,
-      vpx_sad4x4x8, vpx_sad4x4x4d)
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4,
+      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+      vpx_sad_skip_8x4x4d)
+
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8,
+      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+      vpx_sad_skip_4x8x4d)
+
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4,
+      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+      vpx_sad_skip_4x4x4d)
 
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
@@ -1946,6 +2729,17 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 
   vp9_loop_filter_init(cm);
 
+  // Set up the unit scaling factor used during motion search.
+#if CONFIG_VP9_HIGHBITDEPTH
+  vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height,
+                                    cm->width, cm->height,
+                                    cm->use_highbitdepth);
+#else
+  vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height,
+                                    cm->width, cm->height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  cpi->td.mb.me_sf = &cpi->me_sf;
+
   cm->error.setjmp = 0;
 
   return cpi;
@@ -1961,10 +2755,13 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 void vp9_remove_compressor(VP9_COMP *cpi) {
   VP9_COMMON *cm;
   unsigned int i;
-  int t;
 
   if (!cpi) return;
 
+#if CONFIG_INTERNAL_STATS
+  vpx_free(cpi->ssim_vars);
+#endif
+
   cm = &cpi->common;
   if (cm->current_video_frame > 0) {
 #if CONFIG_INTERNAL_STATS
@@ -1998,16 +2795,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
         snprintf(headings, sizeof(headings),
                  "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
                  "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
-                 "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+                 "AVPsnrY\tAPsnrCb\tAPsnrCr");
         snprintf(results, sizeof(results),
                  "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
                  "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f",
                  dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
                  cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim,
                  totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count,
                  cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst,
-                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst);
+                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+                 cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count,
+                 cpi->psnr.stat[V] / cpi->count);
 
         if (cpi->b_calculate_blockiness) {
           SNPRINT(headings, "\t  Block\tWstBlck");
@@ -2025,20 +2826,26 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
 
-        fprintf(f, "%s\t    Time\tRcErr\tAbsErr\n", headings);
-        fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
-                rate_err, fabs(rate_err));
+        SNPRINT(headings, "\t    Time\tRcErr\tAbsErr");
+        SNPRINT2(results, "\t%8.0f", total_encode_time);
+        SNPRINT2(results, "\t%7.2f", rate_err);
+        SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+
+        fprintf(f, "%s\tAPsnr611\n", headings);
+        fprintf(
+            f, "%s\t%7.3f\n", results,
+            (6 * cpi->psnr.stat[Y] + cpi->psnr.stat[U] + cpi->psnr.stat[V]) /
+                (cpi->count * 8));
       }
 
       fclose(f);
     }
-
 #endif
 
 #if 0
     {
       printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
-      printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+      printf("\n_frames receive_data encod_mb_row compress_frame  Total\n");
       printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
              cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
              cpi->time_compress_data / 1000,
@@ -2051,29 +2858,23 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
   vp9_denoiser_free(&(cpi->denoiser));
 #endif
 
-  for (t = 0; t < cpi->num_workers; ++t) {
-    VPxWorker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
-
-    // Deallocate allocated threads.
-    vpx_get_worker_interface()->end(worker);
-
-    // Deallocate allocated thread data.
-    if (t < cpi->num_workers - 1) {
-      vpx_free(thread_data->td->counts);
-      vp9_free_pc_tree(thread_data->td);
-      vpx_free(thread_data->td);
-    }
-  }
-  vpx_free(cpi->tile_thr_data);
-  vpx_free(cpi->workers);
-
-  if (cpi->num_workers > 1) {
-    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
-    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  if (cpi->kmeans_data_arr_alloc) {
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&cpi->kmeans_mutex);
+#endif
+    vpx_free(cpi->kmeans_data_arr);
   }
 
+  vp9_free_tpl_buffer(cpi);
+
+  vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  vp9_row_mt_mem_dealloc(cpi);
+  vp9_encode_free_mt_data(cpi);
+
+#if !CONFIG_REALTIME_ONLY
   vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
+#endif
 
   dealloc_compressor_data(cpi);
 
@@ -2082,12 +2883,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
     vpx_free(cpi->mbgraph_stats[i].mb_stats);
   }
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vpx_free(cpi->twopass.frame_mb_stats_buf);
-    cpi->twopass.frame_mb_stats_buf = NULL;
-  }
-#endif
+  vp9_extrc_delete(&cpi->ext_ratectrl);
+
+  // Help detect use after free of the error detail string.
+  memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1);
+  cm->error.detail[sizeof(cm->error.detail) - 1] = '\0';
 
   vp9_remove_common(cm);
   vp9_free_ref_frame_buffers(cm->buffer_pool);
@@ -2107,6 +2907,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
 #ifdef OUTPUT_YUV_REC
   fclose(yuv_rec_file);
 #endif
+#ifdef OUTPUT_YUV_SVC_SRC
+  fclose(yuv_svc_src[0]);
+  fclose(yuv_svc_src[1]);
+  fclose(yuv_svc_src[2]);
+#endif
 
 #if 0
 
@@ -2122,30 +2927,21 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
 #endif
 }
 
-static void generate_psnr_packet(VP9_COMP *cpi) {
-  struct vpx_codec_cx_pkt pkt;
-  int i;
-  PSNR_STATS psnr;
+int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr) {
+  if (is_psnr_calc_enabled(cpi)) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  vpx_calc_highbd_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, &psnr,
-                       cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+    vpx_calc_highbd_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr,
+                         cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth,
+                         cpi->svc.spatial_layer_id);
 #else
-  vpx_calc_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, &psnr);
+    vpx_calc_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr,
+                  cpi->svc.spatial_layer_id);
 #endif
-
-  for (i = 0; i < 4; ++i) {
-    pkt.data.psnr.samples[i] = psnr.samples[i];
-    pkt.data.psnr.sse[i] = psnr.sse[i];
-    pkt.data.psnr.psnr[i] = psnr.psnr[i];
+    return 1;
+  } else {
+    vp9_zero(*psnr);
+    return 0;
   }
-  pkt.kind = VPX_CODEC_PSNR_PKT;
-  if (cpi->use_svc)
-    cpi->svc
-        .layer_context[cpi->svc.spatial_layer_id *
-                       cpi->svc.number_temporal_layers]
-        .psnr_pkt = pkt.data.psnr;
-  else
-    vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
 
 int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
@@ -2164,7 +2960,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
 
 static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
     VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) {
-  MV_REFERENCE_FRAME ref_frame = NONE;
+  MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME;
   if (ref_frame_flag == VP9_LAST_FLAG)
     ref_frame = LAST_FRAME;
   else if (ref_frame_flag == VP9_GOLD_FLAG)
@@ -2172,14 +2968,15 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
   else if (ref_frame_flag == VP9_ALT_FLAG)
     ref_frame = ALTREF_FRAME;
 
-  return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+  return ref_frame == NO_REF_FRAME ? NULL
+                                   : get_ref_frame_buffer(cpi, ref_frame);
 }
 
 int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
                            YV12_BUFFER_CONFIG *sd) {
   YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
   if (cfg) {
-    vp8_yv12_copy_frame(cfg, sd);
+    vpx_yv12_copy_frame(cfg, sd);
     return 0;
   } else {
     return -1;
@@ -2190,7 +2987,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
                           YV12_BUFFER_CONFIG *sd) {
   YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
   if (cfg) {
-    vp8_yv12_copy_frame(sd, cfg);
+    vpx_yv12_copy_frame(sd, cfg);
     return 0;
   } else {
     return -1;
@@ -2203,38 +3000,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) {
   return 0;
 }
 
-#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
-// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
-// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
-// not denoise the UV channels at this time. If ever we implement UV channel
-// denoising we will have to modify this.
-void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
-  uint8_t *src = s->y_buffer;
-  int h = s->y_height;
-
-  do {
-    fwrite(src, s->y_width, 1, f);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, f);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = s->uv_height;
-
-  do {
-    fwrite(src, s->uv_width, 1, f);
-    src += s->uv_stride;
-  } while (--h);
-}
-#endif
-
 #ifdef OUTPUT_YUV_REC
 void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
   YV12_BUFFER_CONFIG *s = cm->frame_to_show;
@@ -2297,12 +3062,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
 #endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
-                                                YV12_BUFFER_CONFIG *dst,
-                                                int bd) {
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst, int bd) {
 #else
-static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
-                                                YV12_BUFFER_CONFIG *dst) {
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
@@ -2340,17 +3104,36 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst, int bd) {
+                                   YV12_BUFFER_CONFIG *dst, int bd,
+                                   INTERP_FILTER filter_type,
+                                   int phase_scaler) {
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
   const int dst_h = dst->y_crop_height;
+
+  // The issue b/311394513 reveals a corner case bug.
+  // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less
+  // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both
+  // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition
+  // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that
+  // supports arbitrary scaling.
+  const int x_step_q4 = 16 * src_w / dst_w;
+  const int y_step_q4 = 16 * src_h / dst_h;
+  const int is_arbitrary_scaling =
+      (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) ||
+      (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32));
+  if (is_arbitrary_scaling) {
+    vp9_scale_and_extend_frame_nonnormative(src, dst, bd);
+    return;
+  }
+
   const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
                                    src->v_buffer };
   const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
   uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
   const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
   int x, y, i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -2358,24 +3141,24 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
     const int src_stride = src_strides[i];
     const int dst_stride = dst_strides[i];
     for (y = 0; y < dst_h; y += 16) {
-      const int y_q4 = y * (16 / factor) * src_h / dst_h;
+      const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
       for (x = 0; x < dst_w; x += 16) {
-        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
         const uint8_t *src_ptr = srcs[i] +
                                  (y / factor) * src_h / dst_h * src_stride +
                                  (x / factor) * src_w / dst_w;
         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
         if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
-          vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
-                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
-                               16 / factor, 16 / factor, bd);
+          vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                               CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel,
+                               x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                               16 * src_h / dst_h, 16 / factor, 16 / factor,
+                               bd);
         } else {
-          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
-                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                        kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
-                        16 / factor);
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                        x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                        16 * src_h / dst_h, 16 / factor, 16 / factor);
         }
       }
     }
@@ -2383,46 +3166,9 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
 
   vpx_extend_frame_borders(dst);
 }
-#else
-void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
-                                  YV12_BUFFER_CONFIG *dst) {
-  const int src_w = src->y_crop_width;
-  const int src_h = src->y_crop_height;
-  const int dst_w = dst->y_crop_width;
-  const int dst_h = dst->y_crop_height;
-  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
-                                   src->v_buffer };
-  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
-  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
-  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
-  int x, y, i;
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    const int factor = (i == 0 || i == 3 ? 1 : 2);
-    const int src_stride = src_strides[i];
-    const int dst_stride = dst_strides[i];
-    for (y = 0; y < dst_h; y += 16) {
-      const int y_q4 = y * (16 / factor) * src_h / dst_h;
-      for (x = 0; x < dst_w; x += 16) {
-        const int x_q4 = x * (16 / factor) * src_w / dst_w;
-        const uint8_t *src_ptr = srcs[i] +
-                                 (y / factor) * src_h / dst_h * src_stride +
-                                 (x / factor) * src_w / dst_w;
-        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
-
-        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
-                      kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                      kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
-                      16 / factor);
-      }
-    }
-  }
-
-  vpx_extend_frame_borders(dst);
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if !CONFIG_REALTIME_ONLY
 static int scale_down(VP9_COMP *cpi, int q) {
   RATE_CONTROL *const rc = &cpi->rc;
   GF_GROUP *const gf_group = &cpi->twopass.gf_group;
@@ -2439,20 +3185,45 @@ static int scale_down(VP9_COMP *cpi, int q) {
   return scale;
 }
 
-static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) {
+static int big_rate_miss_high_threshold(VP9_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  int big_miss_high;
 
-  return (rc->projected_frame_size > ((high_limit * 3) / 2)) ||
-         (rc->projected_frame_size < (low_limit / 2));
+  if (frame_is_kf_gf_arf(cpi))
+    big_miss_high = rc->this_frame_target * 3 / 2;
+  else
+    big_miss_high = rc->this_frame_target * 2;
+
+  return big_miss_high;
+}
+
+static int big_rate_miss(VP9_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int big_miss_high;
+  int big_miss_low;
+
+  // Ignore for overlay frames
+  if (rc->is_src_frame_alt_ref) {
+    return 0;
+  } else {
+    big_miss_low = (rc->this_frame_target / 2);
+    big_miss_high = big_rate_miss_high_threshold(cpi);
+
+    return (rc->projected_frame_size > big_miss_high) ||
+           (rc->projected_frame_size < big_miss_low);
+  }
 }
 
 // test in two pass for the first
 static int two_pass_first_group_inter(VP9_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &twopass->gf_group;
-  if ((cpi->oxcf.pass == 2) &&
-      (gf_group->index == gf_group->first_inter_index)) {
-    return 1;
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *const twopass = &cpi->twopass;
+    GF_GROUP *const gf_group = &twopass->gf_group;
+    const int gfg_index = gf_group->index;
+
+    if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE;
+    return gf_group->update_type[gfg_index - 1] != LF_UPDATE &&
+           gf_group->update_type[gfg_index] == LF_UPDATE;
   } else {
     return 0;
   }
@@ -2468,8 +3239,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
   int force_recode = 0;
 
   if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
-      big_rate_miss(cpi, high_limit, low_limit) ||
-      (cpi->sf.recode_loop == ALLOW_RECODE) ||
+      big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) ||
       (two_pass_first_group_inter(cpi) &&
        (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) ||
       (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) {
@@ -2479,8 +3249,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
       cpi->resize_pending = 1;
       return 1;
     }
-    // Force recode if projected_frame_size > max_frame_bandwidth
-    if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1;
+
+    // Force recode for extreme overshoot.
+    if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+        (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+         rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) {
+      return 1;
+    }
 
     // TODO(agrange) high_limit could be greater than the scale-down threshold.
     if ((rc->projected_frame_size > high_limit && q < maxq) ||
@@ -2497,10 +3272,52 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
   }
   return force_recode;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
-void vp9_update_reference_frames(VP9_COMP *cpi) {
+static void update_ref_frames(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
+  GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_gop_decision != NULL) {
+    const int this_gf_index = gf_group->index;
+    const int update_ref_idx = gf_group->update_ref_idx[this_gf_index];
+    if (gf_group->update_type[this_gf_index] == KF_UPDATE) {
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[0], cm->new_fb_idx);
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[1], cm->new_fb_idx);
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[2], cm->new_fb_idx);
+    } else if (update_ref_idx != INVALID_IDX) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[gf_group->update_ref_idx[this_gf_index]],
+                 cm->new_fb_idx);
+    }
+
+    const int next_gf_index = gf_group->index + 1;
+
+    // Overlay frame should ideally look at the colocated ref frame from rc lib.
+    // Here temporarily just don't update the indices.
+    if (next_gf_index < gf_group->gf_group_size) {
+      cpi->lst_fb_idx = gf_group->ext_rc_ref[next_gf_index].last_index;
+      cpi->gld_fb_idx = gf_group->ext_rc_ref[next_gf_index].golden_index;
+      cpi->alt_fb_idx = gf_group->ext_rc_ref[next_gf_index].altref_index;
+    }
+
+    return;
+  }
+
+  if (cpi->rc.show_arf_as_gld) {
+    int tmp = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = cpi->gld_fb_idx;
+    cpi->gld_fb_idx = tmp;
+  } else if (cm->show_existing_frame) {
+    // Pop ARF.
+    cpi->lst_fb_idx = cpi->alt_fb_idx;
+    cpi->alt_fb_idx =
+        stack_pop(gf_group->arf_index_stack, gf_group->stack_size);
+    --gf_group->stack_size;
+  }
 
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
@@ -2526,23 +3343,23 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
-
-    if (is_two_pass_svc(cpi)) {
-      cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
-      cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
-    }
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
-      int arf_idx = cpi->alt_fb_idx;
-      if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-        const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-        arf_idx = gf_group->arf_update_idx[gf_group->index];
-      }
+      int arf_idx = gf_group->top_arf_idx;
+
+      // Push new ARF into stack.
+      stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx,
+                 gf_group->stack_size);
+      ++gf_group->stack_size;
+
+      assert(arf_idx < REF_FRAMES);
 
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
+
+      cpi->alt_fb_idx = arf_idx;
     }
 
     if (cpi->refresh_golden_frame) {
@@ -2567,46 +3384,60 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
   }
+
+  if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) {
+    cpi->alt_fb_idx =
+        stack_pop(gf_group->arf_index_stack, gf_group->stack_size);
+    --gf_group->stack_size;
+  }
+}
+
+void vp9_update_reference_frames(VP9_COMP *cpi) {
+  update_ref_frames(cpi);
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      cpi->denoiser.denoising_level > kDenLowLow) {
-    vp9_denoiser_update_frame_info(
-        &cpi->denoiser, *cpi->Source, cpi->common.frame_type,
-        cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
-        cpi->refresh_last_frame, cpi->resize_pending);
-  }
+  vp9_denoiser_update_ref_frame(cpi);
 #endif
-  if (is_one_pass_cbr_svc(cpi)) {
-    // Keep track of frame index for each reference frame.
-    SVC *const svc = &cpi->svc;
-    if (cm->frame_type == KEY_FRAME) {
-      svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
-      svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
-      svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
-    } else {
-      if (cpi->refresh_last_frame)
-        svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
-      if (cpi->refresh_golden_frame)
-        svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
-      if (cpi->refresh_alt_ref_frame)
-        svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
-    }
-  }
+
+  if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi);
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   struct loopfilter *lf = &cm->lf;
+  int is_reference_frame =
+      (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+       cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+  if (cpi->use_svc &&
+      cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS)
+    is_reference_frame = !cpi->svc.non_reference_frame;
+
+  // Skip loop filter in show_existing_frame mode.
+  if (cm->show_existing_frame) {
+    lf->filter_level = 0;
+    return;
+  }
+
+  if (cpi->loopfilter_ctrl == NO_LOOPFILTER ||
+      (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) {
+    lf->filter_level = 0;
+    vpx_extend_frame_inner_borders(cm->frame_to_show);
+    return;
+  }
 
   if (xd->lossless) {
     lf->filter_level = 0;
     lf->last_filt_level = 0;
   } else {
+#if CONFIG_INTERNAL_STATS
     struct vpx_usec_timer timer;
+#endif
 
     vpx_clear_system_state();
 
+#if CONFIG_INTERNAL_STATS
     vpx_usec_timer_start(&timer);
+#endif
 
     if (!cpi->rc.is_src_frame_alt_ref) {
       if ((cpi->common.frame_type == KEY_FRAME) &&
@@ -2619,11 +3450,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
       lf->filter_level = 0;
     }
 
+#if CONFIG_INTERNAL_STATS
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+#endif
   }
 
-  if (lf->filter_level > 0) {
+  if (lf->filter_level > 0 && is_reference_frame) {
     vp9_build_mask_frame(cm, lf->filter_level, 0);
 
     if (cpi->num_workers > 1)
@@ -2637,19 +3470,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   vpx_extend_frame_inner_borders(cm->frame_to_show);
 }
 
-static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
-  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
-  if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
-      new_fb_ptr->mi_cols < cm->mi_cols) {
-    vpx_free(new_fb_ptr->mvs);
-    CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
-                    (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
-                                         sizeof(*new_fb_ptr->mvs)));
-    new_fb_ptr->mi_rows = cm->mi_rows;
-    new_fb_ptr->mi_cols = cm->mi_cols;
-  }
-}
-
 void vp9_scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
@@ -2668,7 +3488,6 @@ void vp9_scale_references(VP9_COMP *cpi) {
         continue;
       }
 
-#if CONFIG_VP9_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         RefCntBuffer *new_fb_ptr = NULL;
         int force_scaling = 0;
@@ -2681,6 +3500,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
         new_fb_ptr = &pool->frame_bufs[new_fb];
         if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
             new_fb_ptr->buf.y_crop_height != cm->height) {
+#if CONFIG_VP9_HIGHBITDEPTH
           if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
                                        cm->subsampling_x, cm->subsampling_y,
                                        cm->use_highbitdepth,
@@ -2688,42 +3508,28 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                        cm->byte_alignment, NULL, NULL, NULL))
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
-          cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
-          alloc_frame_mvs(cm, new_fb);
-        }
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
+                                 EIGHTTAP, 0);
 #else
-      if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
-        RefCntBuffer *new_fb_ptr = NULL;
-        int force_scaling = 0;
-        int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
-        if (new_fb == INVALID_IDX) {
-          new_fb = get_free_fb(cm);
-          force_scaling = 1;
-        }
-        if (new_fb == INVALID_IDX) return;
-        new_fb_ptr = &pool->frame_bufs[new_fb];
-        if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
-            new_fb_ptr->buf.y_crop_height != cm->height) {
           if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
                                        cm->subsampling_x, cm->subsampling_y,
                                        VP9_ENC_BORDER_IN_PIXELS,
                                        cm->byte_alignment, NULL, NULL, NULL))
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
       } else {
         int buf_idx;
         RefCntBuffer *buf = NULL;
         if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
           // Check for release of scaled reference.
           buf_idx = cpi->scaled_ref_idx[ref_frame - 1];
-          buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL;
-          if (buf != NULL) {
+          if (buf_idx != INVALID_IDX) {
+            buf = &pool->frame_bufs[buf_idx];
             --buf->ref_count;
             cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
           }
@@ -2754,22 +3560,21 @@ static void release_scaled_references(VP9_COMP *cpi) {
     refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       const int idx = cpi->scaled_ref_idx[i - 1];
-      RefCntBuffer *const buf =
-          idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
-      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
-      if (buf != NULL &&
-          (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
-                              buf->buf.y_crop_height == ref->y_crop_height))) {
-        --buf->ref_count;
-        cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+      if (idx != INVALID_IDX) {
+        RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
+        const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+        if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
+                               buf->buf.y_crop_height == ref->y_crop_height)) {
+          --buf->ref_count;
+          cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+        }
       }
     }
   } else {
-    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    for (i = 0; i < REFS_PER_FRAME; ++i) {
       const int idx = cpi->scaled_ref_idx[i];
-      RefCntBuffer *const buf =
-          idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
-      if (buf != NULL) {
+      if (idx != INVALID_IDX) {
+        RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
         --buf->ref_count;
         cpi->scaled_ref_idx[i] = INVALID_IDX;
       }
@@ -2828,26 +3633,46 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
       case VPX_BITS_10:
         dc_quant_devisor = 16.0;
         break;
-      case VPX_BITS_12:
-        dc_quant_devisor = 64.0;
-        break;
       default:
-        assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+        assert(cm->bit_depth == VPX_BITS_12);
+        dc_quant_devisor = 64.0;
         break;
     }
 #else
     dc_quant_devisor = 4.0;
 #endif
 
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
-       "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
-       "%10"PRId64" %10"PRId64" %10d "
-       "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
-        "%6d %6d %5d %5d %5d "
-        "%10"PRId64" %10.3lf"
-        "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
+    if (!cm->current_video_frame) {
+      fprintf(f, "frame, width, height, last ts, last end ts, "
+          "source_alt_ref_pending, source_alt_ref_active, "
+          "this_frame_target, projected_frame_size, "
+          "projected_frame_size / MBs, "
+          "projected_frame_size - this_frame_target, "
+          "vbr_bits_off_target, vbr_bits_off_target_fast, "
+          "twopass.extend_minq, twopass.extend_minq_fast, "
+          "total_target_vs_actual, "
+          "starting_buffer_level - bits_off_target, "
+          "total_actual_bits, base_qindex, q for base_qindex, "
+          "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, "
+          "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, "
+          "frame_type, gfu_boost, "
+          "twopass.bits_left, "
+          "twopass.total_left_stats.coded_error, "
+          "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), "
+          "tot_recode_hits, recon_err, kf_boost, "
+          "twopass.kf_zeromotion_pct, twopass.fr_content_type, "
+          "filter_level, seg.aq_av_offset\n");
+    }
+
+    fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, "
+        "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", "
+        "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, "
+        "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, "
+        "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
+        cpi->last_time_stamp_seen,
+        cpi->last_end_time_stamp_seen,
         cpi->rc.source_alt_ref_pending,
         cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
@@ -2926,7 +3751,7 @@ static void set_mv_search_params(VP9_COMP *cpi) {
 }
 
 static void set_size_independent_vars(VP9_COMP *cpi) {
-  vp9_set_speed_features_framesize_independent(cpi);
+  vp9_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
   vp9_set_rd_speed_thresholds(cpi);
   vp9_set_rd_speed_thresholds_sub8x8(cpi);
   cpi->common.interp_filter = cpi->sf.default_interp_filter;
@@ -2935,29 +3760,39 @@ static void set_size_independent_vars(VP9_COMP *cpi) {
 static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
                                     int *top_index) {
   VP9_COMMON *const cm = &cpi->common;
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
   // Setup variables that depend on the dimensions of the frame.
-  vp9_set_speed_features_framesize_dependent(cpi);
+  vp9_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
 
   // Decide q and q bounds.
   *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
 
+  if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) {
+    *q = cpi->rc.worst_quality;
+    cpi->rc.force_max_q = 0;
+  }
+
+  if (cpi->use_svc) {
+    cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q;
+  }
+
   if (!frame_is_intra_only(cm)) {
     vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
   }
 
+#if !CONFIG_REALTIME_ONLY
   // Configure experimental use of segmentation for enhanced coding of
   // static regions if indicated.
   // Only allowed in the second pass of a two pass encode, as it requires
   // lagged coding, and if the relevant speed feature flag is set.
-  if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+  if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
     configure_static_seg_features(cpi);
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING)
-  if (oxcf->noise_sensitivity > 0) {
+  if (cpi->oxcf.noise_sensitivity > 0) {
     int l = 0;
-    switch (oxcf->noise_sensitivity) {
+    switch (cpi->oxcf.noise_sensitivity) {
       case 1: l = 20; break;
       case 2: l = 40; break;
       case 3: l = 60; break;
@@ -2966,31 +3801,16 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
       case 6: l = 150; break;
     }
     if (!cpi->common.postproc_state.limits) {
-      cpi->common.postproc_state.limits = vpx_calloc(
-          cpi->common.width, sizeof(*cpi->common.postproc_state.limits));
+      CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits,
+                      vpx_calloc(cpi->un_scaled_source->y_width,
+                                 sizeof(*cpi->common.postproc_state.limits)));
     }
-    vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits);
+    vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l,
+                cpi->common.postproc_state.limits);
   }
 #endif  // CONFIG_VP9_POSTPROC
 }
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-static void setup_denoiser_buffer(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      !cpi->denoiser.frame_buffer_initialized) {
-    if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height,
-                           cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                           cm->use_highbitdepth,
-#endif
-                           VP9_ENC_BORDER_IN_PIXELS))
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Failed to allocate denoiser");
-  }
-}
-#endif
-
 static void init_motion_estimation(VP9_COMP *cpi) {
   int y_stride = cpi->scaled_source.y_stride;
 
@@ -3007,6 +3827,7 @@ static void set_frame_size(VP9_COMP *cpi) {
   VP9EncoderConfig *const oxcf = &cpi->oxcf;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
+#if !CONFIG_REALTIME_ONLY
   if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR &&
       ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
        (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
@@ -3017,16 +3838,21 @@ static void set_frame_size(VP9_COMP *cpi) {
     vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
                          oxcf->scaled_frame_height);
   }
+#endif  // !CONFIG_REALTIME_ONLY
 
-  if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc &&
+  if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR &&
       oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) {
-    oxcf->scaled_frame_width =
-        (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den;
-    oxcf->scaled_frame_height =
-        (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den;
-    // There has been a change in frame size.
-    vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
-                         oxcf->scaled_frame_height);
+    // For SVC scaled width/height will have been set (svc->resize_set=1)
+    // in get_svc_params based on the layer width/height.
+    if (!cpi->use_svc || !cpi->svc.resize_set) {
+      oxcf->scaled_frame_width =
+          (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+      oxcf->scaled_frame_height =
+          (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+      // There has been a change in frame size.
+      vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+                           oxcf->scaled_frame_height);
+    }
 
     // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
     set_mv_search_params(cpi);
@@ -3044,9 +3870,7 @@ static void set_frame_size(VP9_COMP *cpi) {
 #endif
   }
 
-  if ((oxcf->pass == 2) &&
-      (!cpi->use_svc || (is_two_pass_svc(cpi) &&
-                         cpi->svc.encode_empty_frame_state != ENCODING))) {
+  if ((oxcf->pass == 2) && !cpi->use_svc) {
     vp9_set_target_rate(cpi);
   }
 
@@ -3066,6 +3890,7 @@ static void set_frame_size(VP9_COMP *cpi) {
   alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
 
+  int has_valid_ref_frame = 0;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
     const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
@@ -3084,55 +3909,158 @@ static void set_frame_size(VP9_COMP *cpi) {
                                         buf->y_crop_height, cm->width,
                                         cm->height);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+      has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf);
       if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf);
     } else {
       ref_buf->buf = NULL;
     }
   }
+  if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+    vpx_internal_error(
+        &cm->error, VPX_CODEC_ERROR,
+        "Can't find at least one reference frame with valid size");
+  }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
-static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
-                                       uint8_t *dest) {
+static void save_encode_params(VP9_COMP *cpi) {
+  int tile_idx;
+  int i, j;
+  TileDataEnc *tile_data;
+  RD_OPT *rd_opt = &cpi->rd;
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    for (j = 0; j < REFERENCE_MODES; j++)
+      rd_opt->prediction_type_threshes_prev[i][j] =
+          rd_opt->prediction_type_threshes[i][j];
+
+    for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++)
+      rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j];
+  }
+
+  for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+    assert(cpi->tile_data);
+    tile_data = &cpi->tile_data[tile_idx];
+    vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact);
+  }
+}
+
+static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
+#ifdef ENABLE_KF_DENOISE
+  if (is_spatial_denoise_enabled(cpi)) {
+    cpi->raw_source_frame = vp9_scale_if_required(
+        cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
+        (oxcf->pass == 0), EIGHTTAP, 0);
+  } else {
+    cpi->raw_source_frame = cpi->Source;
+  }
+#else
+  cpi->raw_source_frame = cpi->Source;
+#endif
+}
+
+static YV12_BUFFER_CONFIG *svc_twostage_scale(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+    int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->bit_depth == VPX_BITS_8) {
+      vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+                                 phase_scaler2);
+      vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+                                 phase_scaler);
+    } else {
+      scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+                             filter_type2, phase_scaler2);
+      scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+                             filter_type, phase_scaler);
+    }
+#else
+    vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+                               phase_scaler2);
+    vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
+static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
+                                      uint8_t *dest, size_t dest_size) {
   VP9_COMMON *const cm = &cpi->common;
-  int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+  SVC *const svc = &cpi->svc;
+  int q = 0, bottom_index = 0, top_index = 0;
+  int no_drop_scene_change = 0;
+  const INTERP_FILTER filter_scaler =
+      (is_one_pass_svc(cpi))
+          ? svc->downsample_filter_type[svc->spatial_layer_id]
+          : EIGHTTAP;
+  const int phase_scaler =
+      (is_one_pass_svc(cpi))
+          ? svc->downsample_filter_phase[svc->spatial_layer_id]
+          : 0;
+
+  if (cm->show_existing_frame) {
+    cpi->rc.this_frame_target = 0;
+    if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
+    return 1;
+  }
+
+  svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe;
+
+  // Flag to check if its valid to compute the source sad (used for
+  // scene detection and for superblock content state in CBR mode).
+  // The flag may get reset below based on SVC or resizing state.
+  cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME;
 
   vpx_clear_system_state();
 
   set_frame_size(cpi);
 
-  if (is_one_pass_cbr_svc(cpi) &&
+  if (is_one_pass_svc(cpi) &&
       cpi->un_scaled_source->y_width == cm->width << 2 &&
       cpi->un_scaled_source->y_height == cm->height << 2 &&
-      cpi->svc.scaled_temp.y_width == cm->width << 1 &&
-      cpi->svc.scaled_temp.y_height == cm->height << 1) {
+      svc->scaled_temp.y_width == cm->width << 1 &&
+      svc->scaled_temp.y_height == cm->height << 1) {
     // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take
     // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2
     // result will be saved in scaled_temp and might be used later.
-    cpi->Source = vp9_svc_twostage_scale(
-        cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp);
-    cpi->svc.scaled_one_half = 1;
-  } else if (is_one_pass_cbr_svc(cpi) &&
+    const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
+    const int phase_scaler2 = svc->downsample_filter_phase[1];
+    cpi->Source = svc_twostage_scale(
+        cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
+        filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
+    svc->scaled_one_half = 1;
+  } else if (is_one_pass_svc(cpi) &&
              cpi->un_scaled_source->y_width == cm->width << 1 &&
              cpi->un_scaled_source->y_height == cm->height << 1 &&
-             cpi->svc.scaled_one_half) {
+             svc->scaled_one_half) {
     // If the spatial layer is 1/2x1/2 and the scaling is already done in the
     // two-stage scaling, use the result directly.
-    cpi->Source = &cpi->svc.scaled_temp;
-    cpi->svc.scaled_one_half = 0;
+    cpi->Source = &svc->scaled_temp;
+    svc->scaled_one_half = 0;
   } else {
     cpi->Source = vp9_scale_if_required(
-        cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0));
+        cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0),
+        filter_scaler, phase_scaler);
   }
+#ifdef OUTPUT_YUV_SVC_SRC
+  // Write out at most 3 spatial layers.
+  if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) {
+    vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source);
+  }
+#endif
   // Unfiltered raw source used in metrics calculation if the source
   // has been filtered.
   if (is_psnr_calc_enabled(cpi)) {
 #ifdef ENABLE_KF_DENOISE
     if (is_spatial_denoise_enabled(cpi)) {
-      cpi->raw_source_frame =
-          vp9_scale_if_required(cm, &cpi->raw_unscaled_source,
-                                &cpi->raw_scaled_source, (cpi->oxcf.pass == 0));
+      cpi->raw_source_frame = vp9_scale_if_required(
+          cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
+          (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler);
     } else {
       cpi->raw_source_frame = cpi->Source;
     }
@@ -3141,58 +4069,170 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
 #endif
   }
 
+  if ((cpi->use_svc &&
+       (svc->spatial_layer_id < svc->number_spatial_layers - 1 ||
+        svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+        svc->current_superframe < 1)) ||
+      cpi->resize_pending || cpi->resize_state || cpi->external_resize ||
+      cpi->resize_state != ORIG) {
+    cpi->compute_source_sad_onepass = 0;
+    if (cpi->content_state_sb_fd != NULL)
+      memset(cpi->content_state_sb_fd, 0,
+             (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) *
+                 sizeof(*cpi->content_state_sb_fd));
+  }
+
   // Avoid scaling last_source unless its needed.
-  // Last source is needed if vp9_avg_source_sad() is used, or if
-  // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
-  // estimation is enabled.
+  // Last source is needed if avg_source_sad() is used, or if noise estimation
+  // is enabled.
   if (cpi->unscaled_last_source != NULL &&
       (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
        (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
         cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) ||
-       cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
-       cpi->noise_estimate.enabled))
-    cpi->Last_Source =
-        vp9_scale_if_required(cm, cpi->unscaled_last_source,
-                              &cpi->scaled_last_source, (cpi->oxcf.pass == 0));
+       (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) ||
+       cpi->compute_source_sad_onepass))
+    cpi->Last_Source = vp9_scale_if_required(
+        cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+        (cpi->oxcf.pass == 0), EIGHTTAP, 0);
 
-  if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) {
+  if (cpi->Last_Source == NULL ||
+      cpi->Last_Source->y_width != cpi->Source->y_width ||
+      cpi->Last_Source->y_height != cpi->Source->y_height)
+    cpi->compute_source_sad_onepass = 0;
+
+  if (frame_is_intra_only(cm) || cpi->resize_pending != 0) {
     memset(cpi->consec_zero_mv, 0,
            cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
   }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc)
+    vp9_denoiser_reset_on_first_frame(cpi);
+#endif
+
+  // Scene detection is always used for VBR mode or screen-content case.
+  // For other cases (e.g., CBR mode) use it for 5 <= speed.
+  cpi->rc.high_source_sad = 0;
+  cpi->rc.hybrid_intra_scene_change = 0;
+  cpi->rc.re_encode_maxq_scene_change = 0;
+  if (cm->show_frame && cpi->oxcf.mode == REALTIME &&
+      !cpi->disable_scene_detection_rtc_ratectrl &&
+      (cpi->oxcf.rc_mode == VPX_VBR ||
+       cpi->oxcf.content == VP9E_CONTENT_SCREEN || cpi->oxcf.speed >= 5))
+    vp9_scene_detection_onepass(cpi);
+
+  if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) {
+    svc->high_source_sad_superframe = cpi->rc.high_source_sad;
+    svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion;
+    // On scene change reset temporal layer pattern to TL0.
+    // Note that if the base/lower spatial layers are skipped: instead of
+    // inserting base layer here, we force max-q for the next superframe
+    // with lower spatial layers: this is done in vp9_encodedframe_overshoot()
+    // when max-q is decided for the current layer.
+    // Only do this reset for bypass/flexible mode.
+    if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 &&
+        svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      // rc->high_source_sad will get reset so copy it to restore it.
+      int tmp_high_source_sad = cpi->rc.high_source_sad;
+      vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME);
+      cpi->rc.high_source_sad = tmp_high_source_sad;
+    }
+  }
+
   vp9_update_noise_estimate(cpi);
 
-  if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME &&
-      cpi->oxcf.speed >= 5 && cpi->resize_state == 0 &&
-      (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
-       cpi->oxcf.rc_mode == VPX_VBR || cpi->sf.copy_partition_flag) &&
-      cm->show_frame)
-    vp9_avg_source_sad(cpi);
+  // For 1 pass CBR, check if we are dropping this frame.
+  // Never drop on key frame, if base layer is key for svc,
+  // on scene change, or if superframe has layer sync.
+  if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) &&
+      !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0]))
+    no_drop_scene_change = 1;
+  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+      !frame_is_intra_only(cm) && !no_drop_scene_change &&
+      !svc->superframe_has_layer_sync &&
+      (!cpi->use_svc ||
+       !svc->layer_context[svc->temporal_layer_id].is_key_frame)) {
+    if (vp9_rc_drop_frame(cpi)) return 0;
+  }
 
-  // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
-  // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this
-  // frame-level upsampling.
-  if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) {
+  // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame
+  // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can
+  // avoid this frame-level upsampling (for non intra_only frames).
+  // For SVC single_layer mode, dynamic resize is allowed and we need to
+  // scale references for this case.
+  if (frame_is_intra_only(cm) == 0 &&
+      ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) ||
+       !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
     vp9_scale_references(cpi);
   }
 
   set_size_independent_vars(cpi);
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
-  if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 &&
+  // search method and step parameter might be changed in speed settings.
+  init_motion_estimation(cpi);
+
+  if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi);
+
+  if (cpi->sf.svc_use_lowres_part &&
+      svc->spatial_layer_id == svc->number_spatial_layers - 2) {
+    if (svc->prev_partition_svc == NULL) {
+      CHECK_MEM_ERROR(
+          &cm->error, svc->prev_partition_svc,
+          (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
+                                   sizeof(*svc->prev_partition_svc)));
+    }
+  }
+
+  // TODO(jianj): Look into issue of skin detection with high bitdepth.
+  if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 &&
       cpi->oxcf.rc_mode == VPX_CBR &&
       cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
       cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
     cpi->use_skin_detection = 1;
   }
 
-  vp9_set_quantizer(cm, q);
-  vp9_set_variance_partition_thresholds(cpi, q);
+  // Enable post encode frame dropping for CBR on non key frame, when
+  // ext_use_post_encode_drop is specified by user.
+  cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop &&
+                                 cpi->oxcf.rc_mode == VPX_CBR &&
+                                 cm->frame_type != KEY_FRAME;
+
+  vp9_set_quantizer(cpi, q, 0);
+  vp9_set_variance_partition_thresholds(cpi, q, 0);
 
   setup_frame(cpi);
 
   suppress_active_map(cpi);
 
+  if (cpi->use_svc) {
+    // On non-zero spatial layer, check for disabling inter-layer
+    // prediction.
+    if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi);
+    vp9_svc_assert_constraints_pattern(cpi);
+  }
+
+  if (cpi->rc.last_post_encode_dropped_scene_change) {
+    cpi->rc.high_source_sad = 1;
+    svc->high_source_sad_superframe = 1;
+    // For now disable use_source_sad since Last_Source will not be the previous
+    // encoded but the dropped one.
+    cpi->sf.use_source_sad = 0;
+    cpi->rc.last_post_encode_dropped_scene_change = 0;
+  }
+  // Check if this high_source_sad (scene/slide change) frame should be
+  // encoded at high/max QP, and if so, set the q and adjust some rate
+  // control parameters.
+  if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ &&
+      (cpi->rc.high_source_sad ||
+       (cpi->use_svc && svc->high_source_sad_superframe))) {
+    if (vp9_encodedframe_overshoot(cpi, -1, &q)) {
+      vp9_set_quantizer(cpi, q, 0);
+      vp9_set_variance_partition_thresholds(cpi, q, 0);
+    }
+  }
+
+#if !CONFIG_REALTIME_ONLY
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -3201,41 +4241,64 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     vp9_360aq_frame_setup(cpi);
   } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
     vp9_setup_in_frame_q_adj(cpi);
-  } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-    vp9_cyclic_refresh_setup(cpi);
   } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) {
     // it may be pretty bad for rate-control,
     // and I should handle it somehow
     vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
+  } else {
+#endif
+    // If ROI is enabled and skip feature is used for segmentation, apply cyclic
+    // refresh but not apply ROI for skip for the first 20 frames (defined by
+    // FRAMES_NO_SKIPPING_AFTER_KEY) after key frame to improve quality.
+    if (cpi->roi.enabled && !frame_is_intra_only(cm)) {
+      if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) {
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+          vp9_cyclic_refresh_setup(cpi);
+        if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)
+          apply_roi_map(cpi);
+      } else {
+        apply_roi_map(cpi);
+      }
+    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vp9_cyclic_refresh_setup(cpi);
+    }
+
+#if !CONFIG_REALTIME_ONLY
   }
+#endif
 
   apply_active_map(cpi);
 
   vp9_encode_frame(cpi);
 
-  // Check if we should drop this frame because of high overshoot.
-  // Only for frames where high temporal-source SAD is detected.
-  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
-      cpi->resize_state == 0 && cm->frame_type != KEY_FRAME &&
-      cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
-      cpi->rc.high_source_sad == 1) {
+  // Check if we should re-encode this frame at high Q because of high
+  // overshoot based on the encoded frame size. Only for frames where
+  // high temporal-source SAD is detected.
+  // For SVC: all spatial layers are checked for re-encoding.
+  if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ &&
+      (cpi->rc.high_source_sad ||
+       (cpi->use_svc && svc->high_source_sad_superframe))) {
     int frame_size = 0;
     // Get an estimate of the encoded frame size.
     save_coding_context(cpi);
-    vp9_pack_bitstream(cpi, dest, size);
+    vp9_pack_bitstream(cpi, dest, dest_size, size);
     restore_coding_context(cpi);
     frame_size = (int)(*size) << 3;
     // Check if encoded frame will overshoot too much, and if so, set the q and
     // adjust some rate control parameters, and return to re-encode the frame.
     if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) {
       vpx_clear_system_state();
-      vp9_set_quantizer(cm, q);
-      vp9_set_variance_partition_thresholds(cpi, q);
+      vp9_set_quantizer(cpi, q, 0);
+      vp9_set_variance_partition_thresholds(cpi, q, 0);
       suppress_active_map(cpi);
       // Turn-off cyclic refresh for re-encoded frame.
       if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+        CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
         unsigned char *const seg_map = cpi->segmentation_map;
         memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+        memset(cr->last_coded_q_map, MAXQ,
+               cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+        cr->sb_index = 0;
         vp9_disable_segmentation(&cm->seg);
       }
       apply_active_map(cpi);
@@ -3243,19 +4306,40 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     }
   }
 
-  // Update some stats from cyclic refresh, and check if we should not update
-  // golden reference, for non-SVC 1 pass CBR.
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME &&
-      !cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 &&
-      (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
-    vp9_cyclic_refresh_check_golden_update(cpi);
+  // Update some stats from cyclic refresh, and check for golden frame update.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+      !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode)
+    vp9_cyclic_refresh_postencode(cpi);
 
   // Update the skip mb flag probabilities based on the distribution
   // seen in the last encoder iteration.
   // update_base_skip_probs(cpi);
   vpx_clear_system_state();
+  return 1;
 }
 
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
+  if (gold_is_last) flags &= ~VP9_GOLD_FLAG;
+
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+      (cpi->svc.number_temporal_layers == 1 &&
+       cpi->svc.number_spatial_layers == 1))
+    flags &= ~VP9_GOLD_FLAG;
+
+  if (alt_is_last) flags &= ~VP9_ALT_FLAG;
+
+  if (gold_is_alt) flags &= ~VP9_ALT_FLAG;
+
+  return flags;
+}
+
+#if !CONFIG_REALTIME_ONLY
 #define MAX_QSTEP_ADJ 4
 static int get_qstep_adj(int rate_excess, int rate_limit) {
   int qstep =
@@ -3263,8 +4347,9 @@ static int get_qstep_adj(int rate_excess, int rate_limit) {
   return VPXMIN(qstep, MAX_QSTEP_ADJ);
 }
 
-static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
-                                    uint8_t *dest) {
+static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+                                    size_t dest_size) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int bottom_index, top_index;
@@ -3277,13 +4362,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
   int enable_acl;
+#ifdef AGGRESSIVE_VBR
+  int qrange_adj = 1;
+#endif
+
+  const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
+
+  if (cm->show_existing_frame) {
+    rc->this_frame_target = 0;
+    if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
+    return;
+  }
 
   set_size_independent_vars(cpi);
 
-  enable_acl = cpi->sf.allow_acl
-                   ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0)
-                   : 0;
+  enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) ||
+                                       (cpi->twopass.gf_group.index == 1)
+                                 : 0;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  printf("\n Encoding a frame: \n");
+#endif
   do {
     vpx_clear_system_state();
 
@@ -3292,6 +4391,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
     if (loop_count == 0 || cpi->resize_pending != 0) {
       set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
+#ifdef AGGRESSIVE_VBR
+      if (two_pass_first_group_inter(cpi)) {
+        // Adjustment limits for min and max q
+        qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2);
+
+        bottom_index =
+            VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q);
+        top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2);
+      }
+#endif
       // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
       set_mv_search_params(cpi);
 
@@ -3315,8 +4424,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
                                        &frame_over_shoot_limit);
     }
 
-    cpi->Source = vp9_scale_if_required(
-        cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0));
+    cpi->Source =
+        vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source,
+                              (oxcf->pass == 0), EIGHTTAP, 0);
 
     // Unfiltered raw source used in metrics calculation if the source
     // has been filtered.
@@ -3325,7 +4435,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
       if (is_spatial_denoise_enabled(cpi)) {
         cpi->raw_source_frame = vp9_scale_if_required(
             cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
-            (cpi->oxcf.pass == 0));
+            (oxcf->pass == 0), EIGHTTAP, 0);
       } else {
         cpi->raw_source_frame = cpi->Source;
       }
@@ -3337,7 +4447,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
     if (cpi->unscaled_last_source != NULL)
       cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
                                                &cpi->scaled_last_source,
-                                               (cpi->oxcf.pass == 0));
+                                               (oxcf->pass == 0), EIGHTTAP, 0);
 
     if (frame_is_intra_only(cm) == 0) {
       if (loop_count > 0) {
@@ -3346,20 +4456,62 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
       vp9_scale_references(cpi);
     }
 
-    vp9_set_quantizer(cm, q);
+    const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+    int ext_rc_delta_q_uv = 0;
+    if (cpi->ext_ratectrl.ready &&
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+        cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+      vpx_codec_err_t codec_status;
+      vpx_rc_encodeframe_decision_t encode_frame_decision;
+      int sb_size = num_8x8_blocks_wide_lookup[BLOCK_64X64] * MI_SIZE;
+      int frame_height_sb = (cm->height + sb_size - 1) / sb_size;
+      int frame_width_sb = (cm->width + sb_size - 1) / sb_size;
+      CHECK_MEM_ERROR(&cm->error, encode_frame_decision.sb_params_list,
+                      (sb_params *)vpx_calloc(
+                          frame_height_sb * frame_width_sb,
+                          sizeof(*encode_frame_decision.sb_params_list)));
+      codec_status = vp9_extrc_get_encodeframe_decision(
+          &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision);
+      if (codec_status != VPX_CODEC_OK) {
+        vpx_internal_error(&cm->error, codec_status,
+                           "vp9_extrc_get_encodeframe_decision() failed");
+      }
+      for (int idx = 0; idx < frame_height_sb * frame_width_sb; ++idx) {
+        cpi->sb_mul_scale[idx] =
+            (((int64_t)encode_frame_decision.sb_params_list[idx].rdmult * 256) /
+             (encode_frame_decision.rdmult + 1));
+      }
+      vpx_free(encode_frame_decision.sb_params_list);
+      // If the external model recommends a reserved value, we use
+      // libvpx's default q.
+      if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
+        q = encode_frame_decision.q_index;
+      }
+      ext_rc_delta_q_uv = encode_frame_decision.delta_q_uv;
+    }
+
+    if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.log_file) {
+      fprintf(cpi->ext_ratectrl.log_file,
+              "ENCODE_FRAME_INFO gop_index %d update_type %d q %d\n",
+              gf_group->index, gf_group->update_type[gf_group->index], q);
+    }
+
+    vp9_set_quantizer(cpi, q, ext_rc_delta_q_uv);
 
     if (loop_count == 0) setup_frame(cpi);
 
     // Variance adaptive and in frame q adjustment experiments are mutually
     // exclusive.
-    if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+    if (oxcf->aq_mode == VARIANCE_AQ) {
       vp9_vaq_frame_setup(cpi);
-    } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+    } else if (oxcf->aq_mode == EQUATOR360_AQ) {
       vp9_360aq_frame_setup(cpi);
-    } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+    } else if (oxcf->aq_mode == COMPLEXITY_AQ) {
       vp9_setup_in_frame_q_adj(cpi);
-    } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) {
+    } else if (oxcf->aq_mode == LOOKAHEAD_AQ) {
       vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
+    } else if (oxcf->aq_mode == PSNR_AQ) {
+      vp9_psnr_aq_mode_setup(&cm->seg);
     }
 
     vp9_encode_frame(cpi);
@@ -3375,14 +4527,20 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       save_coding_context(cpi);
-      if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size);
+      if (!cpi->sf.use_nonrd_pick_mode)
+        vp9_pack_bitstream(cpi, dest, dest_size, size);
 
       rc->projected_frame_size = (int)(*size) << 3;
 
       if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
     }
 
-    if (cpi->oxcf.rc_mode == VPX_Q) {
+    if (cpi->ext_ratectrl.ready &&
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
+      break;
+    }
+
+    if (oxcf->rc_mode == VPX_Q) {
       loop = 0;
     } else {
       if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
@@ -3462,14 +4620,25 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
         // Frame is too large
         if (rc->projected_frame_size > rc->this_frame_target) {
           // Special case if the projected size is > the max allowed.
-          if (rc->projected_frame_size >= rc->max_frame_bandwidth)
-            q_high = rc->worst_quality;
+          if ((q == q_high) &&
+              ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+               (!rc->is_src_frame_alt_ref &&
+                (rc->projected_frame_size >=
+                 big_rate_miss_high_threshold(cpi))))) {
+            int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth,
+                                            big_rate_miss_high_threshold(cpi)));
+            double q_val_high;
+            q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth);
+            q_val_high =
+                q_val_high * ((double)rc->projected_frame_size / max_rate);
+            q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth);
+            q_high = clamp(q_high, rc->best_quality, rc->worst_quality);
+          }
 
           // Raise Qlow as to at least the current value
           qstep =
               get_qstep_adj(rc->projected_frame_size, rc->this_frame_target);
           q_low = VPXMIN(q + qstep, q_high);
-          // q_low = q < q_high ? q + 1 : q_high;
 
           if (undershoot_seen || loop_at_this_size > 1) {
             // Update rate_correction_factor unless
@@ -3497,31 +4666,29 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
           qstep =
               get_qstep_adj(rc->this_frame_target, rc->projected_frame_size);
           q_high = VPXMAX(q - qstep, q_low);
-          // q_high = q > q_low ? q - 1 : q_low;
 
           if (overshoot_seen || loop_at_this_size > 1) {
             vp9_rc_update_rate_correction_factors(cpi);
             q = (q_high + q_low) / 2;
           } else {
             vp9_rc_update_rate_correction_factors(cpi);
-            q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                  top_index);
+            q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                  VPXMIN(q_low, bottom_index), top_index);
             // Special case reset for qlow for constrained quality.
             // This should only trigger where there is very substantial
             // undershoot on a frame and the auto cq level is above
-            // the user passsed in value.
-            if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) {
+            // the user passed in value.
+            if (oxcf->rc_mode == VPX_CQ && q < q_low) {
               q_low = q;
             }
 
             while (q > q_high && retries < 10) {
               vp9_rc_update_rate_correction_factors(cpi);
-              q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
-                                    top_index);
+              q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+                                    VPXMIN(q_low, bottom_index), top_index);
               retries++;
             }
           }
-
           undershoot_seen = 1;
         }
 
@@ -3549,42 +4716,47 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
     }
 
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF)
-      if (loop || !enable_acl) restore_coding_context(cpi);
+      if (loop) restore_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    if (loop) printf("\n Recoding:");
+#endif
   } while (loop);
 
+  rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth;
+
+#ifdef AGGRESSIVE_VBR
+  if (two_pass_first_group_inter(cpi)) {
+    cpi->twopass.active_worst_quality =
+        VPXMIN(q + qrange_adj, oxcf->worst_allowed_q);
+  } else if (!frame_is_kf_gf_arf(cpi)) {
+#else
+  if (!frame_is_kf_gf_arf(cpi)) {
+#endif
+    // Have we been forced to adapt Q outside the expected range by an extreme
+    // rate miss. If so adjust the active maxQ for the subsequent frames.
+    if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) {
+      cpi->twopass.active_worst_quality = q;
+    } else if (oxcf->vbr_corpus_complexity && q == q_low &&
+               rc->projected_frame_size < rc->this_frame_target) {
+      cpi->twopass.active_worst_quality =
+          VPXMAX(q, cpi->twopass.active_worst_quality - 1);
+    }
+  }
+
   if (enable_acl) {
-    vp9_encode_frame(cpi);
+    // Skip recoding, if model diff is below threshold
+    const int thresh = compute_context_model_thresh(cpi);
+    const int diff = compute_context_model_diff(cm);
+    if (diff >= thresh) {
+      vp9_encode_frame(cpi);
+    }
+  }
+  if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
     vpx_clear_system_state();
     restore_coding_context(cpi);
-    vp9_pack_bitstream(cpi, dest, size);
-
-    vp9_encode_frame(cpi);
-    vpx_clear_system_state();
-
-    restore_coding_context(cpi);
   }
 }
-
-static int get_ref_frame_flags(const VP9_COMP *cpi) {
-  const int *const map = cpi->common.ref_frame_map;
-  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
-  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
-  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  if (gold_is_last) flags &= ~VP9_GOLD_FLAG;
-
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
-      (cpi->svc.number_temporal_layers == 1 &&
-       cpi->svc.number_spatial_layers == 1))
-    flags &= ~VP9_GOLD_FLAG;
-
-  if (alt_is_last) flags &= ~VP9_ALT_FLAG;
-
-  if (gold_is_alt) flags &= ~VP9_ALT_FLAG;
-
-  return flags;
-}
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void set_ext_overrides(VP9_COMP *cpi) {
   // Overrides the defaults with the externally supplied values with
@@ -3602,18 +4774,28 @@ static void set_ext_overrides(VP9_COMP *cpi) {
   }
 }
 
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
-                                           YV12_BUFFER_CONFIG *unscaled,
-                                           YV12_BUFFER_CONFIG *scaled,
-                                           YV12_BUFFER_CONFIG *scaled_temp) {
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
-    scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+    if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
+      if (cm->bit_depth == VPX_BITS_8)
+        vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
+      else
+        scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+                               filter_type, phase_scaler);
+    else
+      vp9_scale_and_extend_frame_nonnormative(unscaled, scaled,
+                                              (int)cm->bit_depth);
 #else
-    vp9_scale_and_extend_frame(unscaled, scaled_temp);
-    vp9_scale_and_extend_frame(scaled_temp, scaled);
+    if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
+      vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
+    else
+      vp9_scale_and_extend_frame_nonnormative(unscaled, scaled);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     return scaled;
   } else {
@@ -3621,45 +4803,21 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
   }
 }
 
-YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled,
-                                          int use_normative_scaler) {
-  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
-      cm->mi_rows * MI_SIZE != unscaled->y_height) {
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
-        unscaled->y_height <= (scaled->y_height << 1))
-      scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
-    else
-      scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
-#else
-    if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
-        unscaled->y_height <= (scaled->y_height << 1))
-      vp9_scale_and_extend_frame(unscaled, scaled);
-    else
-      scale_and_extend_frame_nonnormative(unscaled, scaled);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    return scaled;
-  } else {
-    return unscaled;
-  }
-}
-
-static void set_arf_sign_bias(VP9_COMP *cpi) {
+static void set_ref_sign_bias(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  int arf_sign_bias;
+  RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+  const int cur_frame_index = ref_buffer->frame_index;
+  MV_REFERENCE_FRAME ref_frame;
 
-  if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    arf_sign_bias = cpi->rc.source_alt_ref_active &&
-                    (!cpi->refresh_alt_ref_frame ||
-                     (gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
-  } else {
-    arf_sign_bias =
-        (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame);
+  for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+    const RefCntBuffer *const ref_cnt_buf =
+        get_ref_cnt_buffer(&cpi->common, buf_idx);
+    if (ref_cnt_buf) {
+      cm->ref_frame_sign_bias[ref_frame] =
+          cur_frame_index < ref_cnt_buf->frame_index;
+    }
   }
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
 }
 
 static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
@@ -3688,9 +4846,9 @@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
 }
 
 #ifdef ENABLE_KF_DENOISE
-// Baseline Kernal weights for denoise
-static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
-static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
+// Baseline kernel weights for denoise
+static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
+static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
                                    2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 };
 
 static INLINE void add_denoise_point(int centre_val, int data_val, int thresh,
@@ -3707,37 +4865,37 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride,
   int sum_weight = 0;
   int sum_val = 0;
   int thresh = strength;
-  int kernal_size = 5;
+  int kernel_size = 5;
   int half_k_size = 2;
   int i, j;
   int max_diff = 0;
   uint8_t *tmp_ptr;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
 
   // Find the maximum deviation from the source point in the locale.
   tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
-  for (i = 0; i < kernal_size + 2; ++i) {
-    for (j = 0; j < kernal_size + 2; ++j) {
+  for (i = 0; i < kernel_size + 2; ++i) {
+    for (j = 0; j < kernel_size + 2; ++j) {
       max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j]));
     }
     tmp_ptr += stride;
   }
 
-  // Select the kernal size.
+  // Select the kernel size.
   if (max_diff > (strength + (strength >> 1))) {
-    kernal_size = 3;
+    kernel_size = 3;
     half_k_size = 1;
     thresh = thresh >> 1;
   }
-  kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+  kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5;
 
-  // Apply the kernal
+  // Apply the kernel
   tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
-  for (i = 0; i < kernal_size; ++i) {
-    for (j = 0; j < kernal_size; ++j) {
-      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+  for (i = 0; i < kernel_size; ++i) {
+    for (j = 0; j < kernel_size; ++j) {
+      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr,
                         &sum_val, &sum_weight);
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -3752,37 +4910,37 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
   int sum_weight = 0;
   int sum_val = 0;
   int thresh = strength;
-  int kernal_size = 5;
+  int kernel_size = 5;
   int half_k_size = 2;
   int i, j;
   int max_diff = 0;
   uint16_t *tmp_ptr;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
 
   // Find the maximum deviation from the source point in the locale.
   tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
-  for (i = 0; i < kernal_size + 2; ++i) {
-    for (j = 0; j < kernal_size + 2; ++j) {
+  for (i = 0; i < kernel_size + 2; ++i) {
+    for (j = 0; j < kernel_size + 2; ++j) {
       max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j]));
     }
     tmp_ptr += stride;
   }
 
-  // Select the kernal size.
+  // Select the kernel size.
   if (max_diff > (strength + (strength >> 1))) {
-    kernal_size = 3;
+    kernel_size = 3;
     half_k_size = 1;
     thresh = thresh >> 1;
   }
-  kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+  kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5;
 
-  // Apply the kernal
+  // Apply the kernel
   tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
-  for (i = 0; i < kernal_size; ++i) {
-    for (j = 0; j < kernal_size; ++j) {
-      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+  for (i = 0; i < kernel_size; ++i) {
+    for (j = 0; j < kernel_size; ++j) {
+      add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr,
                         &sum_val, &sum_weight);
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -3792,7 +4950,7 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-// Apply thresholded spatial noise supression to a given buffer.
+// Apply thresholded spatial noise suppression to a given buffer.
 static void spatial_denoise_buffer(VP9_COMP *cpi, uint8_t *buffer,
                                    const int stride, const int width,
                                    const int height, const int strength) {
@@ -3817,7 +4975,7 @@ static void spatial_denoise_buffer(VP9_COMP *cpi, uint8_t *buffer,
   }
 }
 
-// Apply thresholded spatial noise supression to source.
+// Apply thresholded spatial noise suppression to source.
 static void spatial_denoise_frame(VP9_COMP *cpi) {
   YV12_BUFFER_CONFIG *src = cpi->Source;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -3827,8 +4985,7 @@ static void spatial_denoise_frame(VP9_COMP *cpi) {
   // Base the filter strength on the current active max Q.
   const int q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality,
                                               cm->bit_depth));
-  int strength =
-      VPXMAX(oxcf->arnr_strength >> 2, VPXMIN(oxcf->arnr_strength, (q >> 4)));
+  int strength = clamp(q >> 4, oxcf->arnr_strength >> 2, oxcf->arnr_strength);
 
   // Denoise each of Y,U and V buffers.
   spatial_denoise_buffer(cpi, src->y_buffer, src->y_stride, src->y_width,
@@ -3843,8 +5000,9 @@ static void spatial_denoise_frame(VP9_COMP *cpi) {
 }
 #endif  // ENABLE_KF_DENOISE
 
+#if !CONFIG_REALTIME_ONLY
 static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size,
-                                         uint8_t *dest) {
+                                         uint8_t *dest, size_t dest_size) {
   if (cpi->common.seg.enabled)
     if (ALT_REF_AQ_PROTECT_GAIN) {
       size_t nsize = *size;
@@ -3855,7 +5013,7 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size,
 
       save_coding_context(cpi);
       vp9_disable_segmentation(&cpi->common.seg);
-      vp9_pack_bitstream(cpi, dest, &nsize);
+      vp9_pack_bitstream(cpi, dest, dest_size, &nsize);
       restore_coding_context(cpi);
 
       overhead = (int)*size - (int)nsize;
@@ -3866,15 +5024,263 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size,
         vp9_enable_segmentation(&cpi->common.seg);
     }
 }
+#endif
 
-static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
-                                      uint8_t *dest,
-                                      unsigned int *frame_flags) {
+static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) {
+  RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+
+  if (ref_buffer) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    ref_buffer->frame_index =
+        cm->current_video_frame + gf_group->arf_src_offset[gf_group->index];
+    ref_buffer->frame_coding_index = cm->current_frame_coding_index;
+  }
+}
+
+static void set_mb_ssim_rdmult_scaling(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  uint8_t *y_buffer = cpi->Source->y_buffer;
+  const int y_stride = cpi->Source->y_stride;
+  const int block_size = BLOCK_16X16;
+
+  const int num_8x8_w = num_8x8_blocks_wide_lookup[block_size];
+  const int num_8x8_h = num_8x8_blocks_high_lookup[block_size];
+  const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w;
+  const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h;
+  double log_sum = 0.0;
+  int row, col;
+
+  // Loop through each 64x64 block.
+  for (row = 0; row < num_rows; ++row) {
+    for (col = 0; col < num_cols; ++col) {
+      int mi_row, mi_col;
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      for (mi_row = row * num_8x8_h;
+           mi_row < cm->mi_rows && mi_row < (row + 1) * num_8x8_h; ++mi_row) {
+        for (mi_col = col * num_8x8_w;
+             mi_col < cm->mi_cols && mi_col < (col + 1) * num_8x8_w; ++mi_col) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 3;
+          const int col_offset_y = mi_col << 3;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          // In order to make SSIM_VAR_SCALE in a same scale for both 8 bit
+          // and high bit videos, the variance needs to be divided by 2.0 or
+          // 64.0 separately.
+          // TODO(sdeng): need to tune for 12bit videos.
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH)
+            var += vp9_high_get_sby_variance(cpi, &buf, BLOCK_8X8, xd->bd);
+          else
+#endif
+            var += vp9_get_sby_variance(cpi, &buf, BLOCK_8X8);
+
+          num_of_var += 1.0;
+        }
+      }
+      var = var / num_of_var / 64.0;
+
+      // Curve fitting with an exponential model on all 16x16 blocks from the
+      // Midres dataset.
+      var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+      cpi->mi_ssim_rdmult_scaling_factors[index] = var;
+      log_sum += log(var);
+    }
+  }
+  log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+  for (row = 0; row < num_rows; ++row) {
+    for (col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      cpi->mi_ssim_rdmult_scaling_factors[index] /= log_sum;
+    }
+  }
+
+  (void)xd;
+}
+
+// Process the wiener variance in 16x16 block basis.
+static int qsort_comp(const void *elem1, const void *elem2) {
+  int a = *((const int *)elem1);
+  int b = *((const int *)elem2);
+  if (a > b) return 1;
+  if (a < b) return -1;
+  return 0;
+}
+
+static void init_mb_wiener_var_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (cpi->mb_wiener_variance && cpi->mb_wiener_var_rows >= cm->mb_rows &&
+      cpi->mb_wiener_var_cols >= cm->mb_cols)
+    return;
+
+  vpx_free(cpi->mb_wiener_variance);
+  cpi->mb_wiener_variance = NULL;
+
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->mb_wiener_variance,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance)));
+  cpi->mb_wiener_var_rows = cm->mb_rows;
+  cpi->mb_wiener_var_cols = cm->mb_cols;
+}
+
+static void init_sb_mul_scale_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
+  if (cpi->mb_wiener_var_rows >= cm->mb_rows &&
+      cpi->mb_wiener_var_cols >= cm->mb_cols)
+    return;
+
+  vpx_free(cpi->sb_mul_scale);
+  cpi->sb_mul_scale = NULL;
+
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->sb_mul_scale,
+      vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->sb_mul_scale)));
+  cpi->mb_wiener_var_rows = cm->mb_rows;
+  cpi->mb_wiener_var_cols = cm->mb_cols;
+}
+
+static void set_mb_wiener_variance(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  uint8_t *buffer = cpi->Source->y_buffer;
+  int buf_stride = cpi->Source->y_stride;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]);
+  uint8_t *zero_pred;
+#else
+  DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]);
+#endif
+
+  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+
+  int mb_row, mb_col, count = 0;
+  // Hard coded operating block size
+  const int block_size = 16;
+  const int coeff_count = block_size * block_size;
+  const TX_SIZE tx_size = TX_16X16;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  xd->cur_buf = cpi->Source;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    zero_pred = CONVERT_TO_BYTEPTR(zero_pred16);
+    memset(zero_pred16, 0, sizeof(*zero_pred16) * coeff_count);
+  } else {
+    zero_pred = zero_pred8;
+    memset(zero_pred8, 0, sizeof(*zero_pred8) * coeff_count);
+  }
+#else
+  memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count);
+#endif
+
+  cpi->norm_wiener_variance = 0;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      int idx;
+      int16_t median_val = 0;
+      uint8_t *mb_buffer =
+          buffer + mb_row * block_size * buf_stride + mb_col * block_size;
+      int64_t wiener_variance = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size,
+                                  mb_buffer, buf_stride, zero_pred, block_size,
+                                  xd->bd);
+        vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+      } else {
+        vpx_subtract_block(block_size, block_size, src_diff, block_size,
+                           mb_buffer, buf_stride, zero_pred, block_size);
+        vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+      }
+#else
+      vpx_subtract_block(block_size, block_size, src_diff, block_size,
+                         mb_buffer, buf_stride, zero_pred, block_size);
+      vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      coeff[0] = 0;
+      for (idx = 1; idx < coeff_count; ++idx) coeff[idx] = abs(coeff[idx]);
+
+      qsort(coeff, coeff_count - 1, sizeof(*coeff), qsort_comp);
+
+      // Noise level estimation
+      median_val = coeff[coeff_count / 2];
+
+      // Wiener filter
+      for (idx = 1; idx < coeff_count; ++idx) {
+        int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx];
+        int64_t tmp_coeff = (int64_t)coeff[idx];
+        if (median_val) {
+          tmp_coeff = (sqr_coeff * coeff[idx]) /
+                      (sqr_coeff + (int64_t)median_val * median_val);
+        }
+        wiener_variance += tmp_coeff * tmp_coeff;
+      }
+      cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] =
+          wiener_variance / coeff_count;
+      cpi->norm_wiener_variance +=
+          cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col];
+      ++count;
+    }
+  }
+
+  if (count) cpi->norm_wiener_variance /= count;
+  cpi->norm_wiener_variance = VPXMAX(1, cpi->norm_wiener_variance);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static PSNR_STATS compute_psnr_stats(const YV12_BUFFER_CONFIG *source_frame,
+                                     const YV12_BUFFER_CONFIG *coded_frame,
+                                     uint32_t bit_depth,
+                                     uint32_t input_bit_depth,
+                                     int spatial_layer_id) {
+  PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth,
+                       input_bit_depth, spatial_layer_id);
+#else   // CONFIG_VP9_HIGHBITDEPTH
+  (void)bit_depth;
+  (void)input_bit_depth;
+  vpx_calc_psnr(source_frame, coded_frame, &psnr, spatial_layer_id);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return psnr;
+}
+
+static void update_encode_frame_result_basic(
+    FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index,
+    ENCODE_FRAME_RESULT *encode_frame_result) {
+  encode_frame_result->show_idx = show_idx;
+  encode_frame_result->update_type = update_type;
+  encode_frame_result->quantize_index = quantize_index;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static void encode_frame_to_data_rate(
+    VP9_COMP *cpi, size_t *size, uint8_t *dest, size_t dest_size,
+    unsigned int *frame_flags, ENCODE_FRAME_RESULT *encode_frame_result) {
   VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
   TX_SIZE t;
 
+  if (vp9_svc_check_skip_enhancement_layer(cpi)) return;
+
   set_ext_overrides(cpi);
   vpx_clear_system_state();
 
@@ -3883,8 +5289,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
   if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi);
 #endif
 
-  // Set the arf sign bias for this frame.
-  set_arf_sign_bias(cpi);
+  if (cm->show_existing_frame == 0) {
+    // Update frame index
+    set_frame_index(cpi, cm);
+
+    // Set the arf sign bias for this frame.
+    set_ref_sign_bias(cpi);
+  }
+
+  // On the very first frame set the deadline_mode_previous_frame to
+  // the current mode.
+  if (cpi->common.current_video_frame == 0)
+    cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
 
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
@@ -3919,65 +5335,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
       cm->reset_frame_context = 2;
     }
   }
-  if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
-    // Use context 0 for intra only empty frame, but the last frame context
-    // for other empty frames.
-    if (cpi->svc.encode_empty_frame_state == ENCODING) {
-      if (cpi->svc.encode_intra_empty_frame != 0)
-        cm->frame_context_idx = 0;
-      else
-        cm->frame_context_idx = FRAME_CONTEXTS - 1;
-    } else {
-      cm->frame_context_idx =
-          cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
-          cpi->svc.temporal_layer_id;
-    }
 
-    cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
+  if (oxcf->tuning == VP8_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi);
 
-    // The probs will be updated based on the frame type of its previous
-    // frame if frame_parallel_decoding_mode is 0. The type may vary for
-    // the frame after a key frame in base layer since we may drop enhancement
-    // layers. So set frame_parallel_decoding_mode to 1 in this case.
-    if (cm->frame_parallel_decoding_mode == 0) {
-      if (cpi->svc.number_temporal_layers == 1) {
-        if (cpi->svc.spatial_layer_id == 0 &&
-            cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
-          cm->frame_parallel_decoding_mode = 1;
-      } else if (cpi->svc.spatial_layer_id == 0) {
-        // Find the 2nd frame in temporal base layer and 1st frame in temporal
-        // enhancement layers from the key frame.
-        int i;
-        for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
-          if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
-            cm->frame_parallel_decoding_mode = 1;
-            break;
-          }
-        }
-      }
-    }
+  if (oxcf->aq_mode == PERCEPTUAL_AQ) {
+    init_mb_wiener_var_buffer(cpi);
+    set_mb_wiener_variance(cpi);
   }
 
-  // For 1 pass CBR, check if we are dropping this frame.
-  // For spatial layers, for now only check for frame-dropping on first spatial
-  // layer, and if decision is to drop, we drop whole super-frame.
-  if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR &&
-      cm->frame_type != KEY_FRAME) {
-    if (vp9_rc_drop_frame(cpi) ||
-        (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) {
-      vp9_rc_postencode_update_drop_frame(cpi);
-      ++cm->current_video_frame;
-      cpi->ext_refresh_frame_flags_pending = 0;
-      cpi->svc.rc_drop_superframe = 1;
-      // TODO(marpan): Advancing the svc counters on dropped frames can break
-      // the referencing scheme for the fixed svc patterns defined in
-      // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but
-      // for now, don't advance the svc frame counters on dropped frame.
-      // if (cpi->use_svc)
-      //   vp9_inc_frame_in_layer(cpi);
-      return;
-    }
-  }
+  init_sb_mul_scale_buffer(cpi);
 
   vpx_clear_system_state();
 
@@ -3985,28 +5351,76 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
   memset(cpi->mode_chosen_counts, 0,
          MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
-
-  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
-    encode_without_recode_loop(cpi, size, dest);
-  } else {
-    encode_with_recode_loop(cpi, size, dest);
+  // Backup to ensure consistency between recodes
+  save_encode_params(cpi);
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+      cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) {
+    vpx_codec_err_t codec_status;
+    const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+    FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+    const int ref_frame_flags = get_ref_frame_flags(cpi);
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+    const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+    // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+    // index 1 refers to the first encoding frame in a gf group.
+    // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
+    // See function define_gf_group_structure().
+    const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+    int ext_rdmult = VPX_DEFAULT_RDMULT;
+    get_ref_frame_bufs(cpi, ref_frame_bufs);
+    codec_status = vp9_extrc_get_frame_rdmult(
+        &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+        cm->current_frame_coding_index, gf_group->index, update_type,
+        gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+        &ext_rdmult);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_get_frame_rdmult() failed");
+    }
+    cpi->ext_ratectrl.ext_rdmult = ext_rdmult;
   }
 
+  if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+    if (!encode_without_recode_loop(cpi, size, dest, dest_size)) return;
+  } else {
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, encode_with_recode_loop_time);
+#endif
+    encode_with_recode_loop(cpi, size, dest, dest_size);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, encode_with_recode_loop_time);
+#endif
+#endif  // !CONFIG_REALTIME_ONLY
+  }
+
+  // TODO(jingning): When using show existing frame mode, we assume that the
+  // current ARF will be directly used as the final reconstructed frame. This is
+  // an encoder control scheme. One could in principle explore other
+  // possibilities to arrange the reference frame buffer and their coding order.
+  if (cm->show_existing_frame) {
+    ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx,
+               cm->ref_frame_map[cpi->alt_fb_idx]);
+  }
+
+#if !CONFIG_REALTIME_ONLY
   // Disable segmentation if it decrease rate/distortion ratio
   if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ)
-    vp9_try_disable_lookahead_aq(cpi, size, dest);
+    vp9_try_disable_lookahead_aq(cpi, size, dest, dest_size);
+#endif
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
-  if (oxcf->noise_sensitivity > 0) {
-    vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
-                            yuv_denoised_file);
+  if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
+    vpx_write_yuv_frame(yuv_denoised_file,
+                        &cpi->denoiser.running_avg_y[INTRA_FRAME]);
   }
 #endif
 #endif
 #ifdef OUTPUT_YUV_SKINMAP
   if (cpi->common.current_video_frame > 1) {
-    vp9_compute_skin_map(cpi, yuv_skinmap_file);
+    vp9_output_skin_map(cpi, yuv_skinmap_file);
   }
 #endif
 
@@ -4035,11 +5449,98 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
   cm->frame_to_show->render_width = cm->render_width;
   cm->frame_to_show->render_height = cm->render_height;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loopfilter_frame_time);
+#endif
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loopfilter_frame_time);
+#endif
 
+  if (cpi->rc.use_post_encode_drop) save_coding_context(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, vp9_pack_bitstream_time);
+#endif
   // build the bitstream
-  vp9_pack_bitstream(cpi, dest, size);
+  vp9_pack_bitstream(cpi, dest, dest_size, size);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, vp9_pack_bitstream_time);
+#endif
+
+  if (cpi->ext_ratectrl.ready &&
+      cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) {
+    vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
+        &cpi->ext_ratectrl, (*size) << 3, cm->base_qindex);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_update_encodeframe_result() failed");
+    }
+  }
+#if CONFIG_REALTIME_ONLY
+  (void)encode_frame_result;
+  assert(encode_frame_result == NULL);
+#else   // CONFIG_REALTIME_ONLY
+  if (encode_frame_result != NULL) {
+    const RefCntBuffer *coded_frame_buf =
+        get_ref_cnt_buffer(cm, cm->new_fb_idx);
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+    FRAME_UPDATE_TYPE update_type =
+        cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+    int quantize_index = vp9_get_quantizer(cpi);
+    get_ref_frame_bufs(cpi, ref_frame_bufs);
+    // update_encode_frame_result() depends on twopass.gf_group.index and
+    // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and
+    // cpi->alt_fb_idx are updated for current frame and have
+    // not been updated for the next frame yet.
+    // The update locations are as follows.
+    // 1) twopass.gf_group.index is initialized at define_gf_group by vp9_zero()
+    // for the first frame in the gf_group and is updated for the next frame at
+    // vp9_twopass_postencode_update().
+    // 2) cpi->Source is updated at the beginning of vp9_get_compressed_data()
+    // 3) cm->new_fb_idx is updated at the beginning of
+    // vp9_get_compressed_data() by get_free_fb(cm).
+    // 4) cpi->lst_fb_idx/gld_fb_idx/alt_fb_idx will be updated for the next
+    // frame at vp9_update_reference_frames().
+    // This function needs to be called before vp9_update_reference_frames().
+    // TODO(angiebird): Improve the codebase to make the update of frame
+    // dependent variables more robust.
+
+    update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+                                     quantize_index, encode_frame_result);
+    if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.log_file) {
+      PSNR_STATS psnr = compute_psnr_stats(
+          cpi->Source, &coded_frame_buf->buf, cm->bit_depth,
+          cpi->oxcf.input_bit_depth, cpi->svc.spatial_layer_id);
+      fprintf(cpi->ext_ratectrl.log_file,
+              "ENCODE_FRAME_RESULT gop_index %d psnr %f bits %zu\n",
+              cpi->twopass.gf_group.index, psnr.psnr[0], (*size) << 3);
+    }
+  }
+#endif  // CONFIG_REALTIME_ONLY
+
+  if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality &&
+      cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) {
+    restore_coding_context(cpi);
+    return;
+  }
+
+  cpi->last_frame_dropped = 0;
+  cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0;
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->svc.num_encoded_top_layer++;
+
+  // Keep track of the frame buffer index updated/refreshed for the
+  // current encoded TL0 superframe.
+  if (cpi->svc.temporal_layer_id == 0) {
+    if (cpi->refresh_last_frame)
+      cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx;
+    else if (cpi->refresh_golden_frame)
+      cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx;
+    else if (cpi->refresh_alt_ref_frame)
+      cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx;
+  }
 
   if (cm->seg.update_map) update_reference_segmentation_map(cpi);
 
@@ -4048,17 +5549,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
   }
   vp9_update_reference_frames(cpi);
 
-  for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cpi->td.counts->coef[t],
-                         cpi->td.rd_counts.coef_counts[t]);
+  if (!cm->show_existing_frame) {
+    for (t = TX_4X4; t <= TX_32X32; ++t) {
+      full_to_model_counts(cpi->td.counts->coef[t],
+                           cpi->td.rd_counts.coef_counts[t]);
+    }
 
-  if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
-    vp9_adapt_coef_probs(cm);
-
-  if (!frame_is_intra_only(cm)) {
     if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
-      vp9_adapt_mode_probs(cm);
-      vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+      if (!frame_is_intra_only(cm)) {
+        vp9_adapt_mode_probs(cm);
+        vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+      }
+      vp9_adapt_coef_probs(cm);
     }
   }
 
@@ -4078,8 +5580,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
 
   cm->last_frame_type = cm->frame_type;
 
-  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
-    vp9_rc_postencode_update(cpi, *size);
+  vp9_rc_postencode_update(cpi, *size);
+
+  if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 &&
+      !frame_is_intra_only(cm) &&
+      (!cpi->use_svc ||
+       (cpi->use_svc &&
+        !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+        cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) {
+    vp9_compute_frame_low_motion(cpi);
+  }
+
+  *size = VPXMAX(1, *size);
 
 #if 0
   output_frame_level_debug_stats(cpi);
@@ -4103,140 +5615,128 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
   cm->last_height = cm->height;
 
   // reset to normal state now that we are done.
-  if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame;
+  if (!cm->show_existing_frame) {
+    cm->last_show_frame = cm->show_frame;
+    cm->prev_frame = cm->cur_frame;
+  }
 
   if (cm->show_frame) {
     vp9_swap_mi_and_prev_mi(cm);
-    // Don't increment frame counters if this was an altref buffer
-    // update not a real frame
-    ++cm->current_video_frame;
     if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
   }
-  cm->prev_frame = cm->cur_frame;
+  update_frame_indexes(cm, cm->show_frame);
 
-  if (cpi->use_svc)
+  if (cpi->use_svc) {
     cpi->svc
         .layer_context[cpi->svc.spatial_layer_id *
                            cpi->svc.number_temporal_layers +
                        cpi->svc.temporal_layer_id]
         .last_frame_type = cm->frame_type;
+    // Reset layer_sync back to 0 for next frame.
+    cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0;
+  }
 
   cpi->force_update_segmentation = 0;
 
+#if !CONFIG_REALTIME_ONLY
   if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ)
     vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi);
+#endif
+
+  cpi->svc.previous_frame_is_intra_only = cm->intra_only;
+  cpi->svc.set_intra_only_frame = 0;
 }
 
 static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
-                      unsigned int *frame_flags) {
+                      size_t dest_size, unsigned int *frame_flags) {
   vp9_rc_get_svc_params(cpi);
-  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  encode_frame_to_data_rate(cpi, size, dest, dest_size, frame_flags,
+                            /*encode_frame_result = */ NULL);
 }
 
 static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
-                        unsigned int *frame_flags) {
+                        size_t dest_size, unsigned int *frame_flags) {
   if (cpi->oxcf.rc_mode == VPX_CBR) {
     vp9_rc_get_one_pass_cbr_params(cpi);
   } else {
     vp9_rc_get_one_pass_vbr_params(cpi);
   }
-  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+  encode_frame_to_data_rate(cpi, size, dest, dest_size, frame_flags,
+                            /*encode_frame_result = */ NULL);
 }
 
+#if !CONFIG_REALTIME_ONLY
 static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
-                        unsigned int *frame_flags) {
+                        size_t dest_size, unsigned int *frame_flags,
+                        ENCODE_FRAME_RESULT *encode_frame_result) {
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
-  encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-
-  if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING))
-    vp9_twopass_postencode_update(cpi);
-}
-
-static void init_ref_frame_bufs(VP9_COMMON *cm) {
-  int i;
-  BufferPool *const pool = cm->buffer_pool;
-  cm->new_fb_idx = INVALID_IDX;
-  for (i = 0; i < REF_FRAMES; ++i) {
-    cm->ref_frame_map[i] = INVALID_IDX;
-    pool->frame_bufs[i].ref_count = 0;
-  }
-}
-
-static void check_initial_width(VP9_COMP *cpi,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                int use_highbitdepth,
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_move_frame_idx_w();
 #endif
-                                int subsampling_x, int subsampling_y) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  if (!cpi->initial_width ||
-#if CONFIG_VP9_HIGHBITDEPTH
-      cm->use_highbitdepth != use_highbitdepth ||
-#endif
-      cm->subsampling_x != subsampling_x ||
-      cm->subsampling_y != subsampling_y) {
-    cm->subsampling_x = subsampling_x;
-    cm->subsampling_y = subsampling_y;
-#if CONFIG_VP9_HIGHBITDEPTH
-    cm->use_highbitdepth = use_highbitdepth;
-#endif
-
-    alloc_raw_frame_buffers(cpi);
-    init_ref_frame_bufs(cm);
-    alloc_util_frame_buffers(cpi);
-
-    init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
-
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
-    cpi->initial_mbs = cm->MBs;
-  }
+  encode_frame_to_data_rate(cpi, size, dest, dest_size, frame_flags,
+                            encode_frame_result);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
   VP9_COMMON *const cm = &cpi->common;
+#if CONFIG_INTERNAL_STATS
   struct vpx_usec_timer timer;
+#endif
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
 #if CONFIG_VP9_HIGHBITDEPTH
   const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
-#endif
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
 #else
-  check_initial_width(cpi, subsampling_x, subsampling_y);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_TEMPORAL_DENOISING
-  setup_denoiser_buffer(cpi);
+  const int use_highbitdepth = 0;
 #endif
-  vpx_usec_timer_start(&timer);
-
-  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
-#if CONFIG_VP9_HIGHBITDEPTH
-                         use_highbitdepth,
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-                         frame_flags))
-    res = -1;
-  vpx_usec_timer_mark(&timer);
-  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
   if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
       (subsampling_x != 1 || subsampling_y != 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
                        "Non-4:2:0 color format requires profile 1 or 3");
-    res = -1;
+    return -1;
   }
   if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
       (subsampling_x == 1 && subsampling_y == 1)) {
     vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
                        "4:2:0 color format requires profile 0 or 2");
-    res = -1;
+    return -1;
   }
+  if (cm->color_space == VPX_CS_SRGB) {
+    if (cm->profile == PROFILE_0 || cm->profile == PROFILE_2) {
+      vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                         "SRGB color space requires profile 1 or 3");
+      return -1;
+    }
+    if (subsampling_x != 0 || subsampling_y != 0) {
+      vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+                         "SRGB color space requires 4:4:4");
+      return -1;
+    }
+  }
+
+  update_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+  alloc_raw_frame_buffers(cpi);
+
+#if CONFIG_INTERNAL_STATS
+  vpx_usec_timer_start(&timer);
+#endif
+
+  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+                         use_highbitdepth, frame_flags))
+    res = -1;
+#if CONFIG_INTERNAL_STATS
+  vpx_usec_timer_mark(&timer);
+  cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+#endif
 
   return res;
 }
@@ -4331,10 +5831,6 @@ static void check_src_altref(VP9_COMP *cpi,
 }
 
 #if CONFIG_INTERNAL_STATS
-extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
-                                 const uint8_t *img2, int img2_pitch, int width,
-                                 int height);
-
 static void adjust_image_stat(double y, double u, double v, double all,
                               ImageStat *s) {
   s->stat[Y] += y;
@@ -4373,6 +5869,7 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
   int i, idx;
   uint64_t luma_samples, dur_end;
   const uint32_t luma_pic_size = cm->width * cm->height;
+  const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height);
   LevelConstraint *const level_constraint = &cpi->level_constraint;
   const int8_t level_index = level_constraint->level_index;
   double cpb_data_size;
@@ -4476,6 +5973,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
     level_spec->max_luma_picture_size = luma_pic_size;
   }
 
+  // update max_luma_picture_breadth
+  if (luma_pic_breadth > level_spec->max_luma_picture_breadth) {
+    level_spec->max_luma_picture_breadth = luma_pic_breadth;
+  }
+
   // update compression_ratio
   level_spec->compression_ratio = (double)level_stats->total_uncompressed_size *
                                   cm->bit_depth /
@@ -4496,6 +5998,15 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
                          level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
     }
 
+    if (level_spec->max_luma_picture_breadth >
+        vp9_level_defs[level_index].max_luma_picture_breadth) {
+      level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE);
+      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                         "Failed to encode to the target level %d. %s",
+                         vp9_level_defs[level_index].level,
+                         level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]);
+    }
+
     if ((double)level_spec->max_luma_sample_rate >
         (double)vp9_level_defs[level_index].max_luma_sample_rate *
             (1 + SAMPLE_RATE_GRACE_P)) {
@@ -4559,48 +6070,99 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
   }
 }
 
+void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
+                            RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
+                            int *ref_frame_coding_indexes,
+                            int *ref_frame_valid_list) {
+  if (update_type != KF_UPDATE) {
+    const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG,
+                                                                 VP9_GOLD_FLAG,
+                                                                 VP9_ALT_FLAG };
+    int i;
+    for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
+      assert(ref_frame_bufs[i] != NULL);
+      ref_frame_coding_indexes[i] = ref_frame_bufs[i]->frame_coding_index;
+      ref_frame_valid_list[i] = (ref_frame_flags & inter_ref_flags[i]) != 0;
+    }
+  } else {
+    // No reference frame is available when this is a key frame.
+    int i;
+    for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
+      ref_frame_coding_indexes[i] = -1;
+      ref_frame_valid_list[i] = 0;
+    }
+  }
+}
+
+void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) {
+  encode_frame_result->show_idx = -1;  // Actual encoding doesn't happen.
+}
+
+// Returns if TPL stats need to be calculated.
+static INLINE int should_run_tpl(VP9_COMP *cpi, int gf_group_index) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  if (!cpi->sf.enable_tpl_model) return 0;
+  // If there is an ARF for this GOP, TPL stats is always calculated.
+  if (gf_group_index == 1 &&
+      cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE)
+    return 1;
+  // If this GOP doesn't have an ARF, TPL stats is still calculated, only when
+  // external rate control is used.
+  if (cpi->ext_ratectrl.ready &&
+      cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL &&
+      rc->frames_till_gf_update_due == rc->baseline_gf_interval &&
+      cpi->twopass.gf_group.update_type[1] != ARF_UPDATE) {
+    return 1;
+  }
+  return 0;
+}
+
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush) {
+                            size_t *size, uint8_t *dest, size_t dest_size,
+                            int64_t *time_stamp, int64_t *time_end, int flush,
+                            ENCODE_FRAME_RESULT *encode_frame_result) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   VP9_COMMON *const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
   RATE_CONTROL *const rc = &cpi->rc;
+#if CONFIG_INTERNAL_STATS
   struct vpx_usec_timer cmptimer;
+#endif
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   struct lookahead_entry *last_source = NULL;
   struct lookahead_entry *source = NULL;
   int arf_src_index;
+  const int gf_group_index = cpi->twopass.gf_group.index;
   int i;
 
-  if (is_two_pass_svc(cpi)) {
-#if CONFIG_SPATIAL_SVC
-    vp9_svc_start_frame(cpi);
-    // Use a small empty frame instead of a real frame
-    if (cpi->svc.encode_empty_frame_state == ENCODING)
-      source = &cpi->svc.empty_frame;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time);
 #endif
-    if (oxcf->pass == 2) vp9_restore_layer_context(cpi);
-  } else if (is_one_pass_cbr_svc(cpi)) {
-    vp9_one_pass_cbr_svc_start_layer(cpi);
+
+  if (is_one_pass_svc(cpi)) {
+    vp9_one_pass_svc_start_layer(cpi);
   }
 
+#if CONFIG_INTERNAL_STATS
   vpx_usec_timer_start(&cmptimer);
+#endif
 
   vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
 
   // Is multi-arf enabled.
   // Note that at the moment multi_arf is only configured for 2 pass VBR and
   // will not work properly with svc.
-  if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf > 1))
-    cpi->multi_arf_allowed = 1;
+  // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf"
+  // is greater than or equal to 2.
+  if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2))
+    cpi->multi_layer_arf = 1;
   else
-    cpi->multi_arf_allowed = 0;
+    cpi->multi_layer_arf = 0;
 
   // Normal defaults
   cm->reset_frame_context = 0;
   cm->refresh_frame_context = 1;
-  if (!is_one_pass_cbr_svc(cpi)) {
+  if (!is_one_pass_svc(cpi)) {
     cpi->refresh_last_frame = 1;
     cpi->refresh_golden_frame = 0;
     cpi->refresh_alt_ref_frame = 0;
@@ -4609,9 +6171,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   // Should we encode an arf frame.
   arf_src_index = get_arf_src_index(cpi);
 
-  // Skip alt frame if we encode the empty frame
-  if (is_two_pass_svc(cpi) && source != NULL) arf_src_index = 0;
-
   if (arf_src_index) {
     for (i = 0; i <= arf_src_index; ++i) {
       struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
@@ -4626,26 +6185,23 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     }
   }
 
-  if (arf_src_index) {
-    assert(arf_src_index <= rc->frames_to_key);
+  // Clear arf index stack before group of pictures processing starts.
+  if (gf_group_index == 1) {
+    stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2);
+    cpi->twopass.gf_group.stack_size = 0;
+  }
 
+  if (arf_src_index) {
+    if (!(cpi->ext_ratectrl.ready &&
+          (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+          cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) {
+      // This assert only makes sense when not using external RC.
+      assert(arf_src_index <= rc->frames_to_key);
+    }
     if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
       cpi->alt_ref_source = source;
 
-#if CONFIG_SPATIAL_SVC
-      if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
-        int i;
-        // Reference a hidden frame from a lower layer
-        for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
-          if (oxcf->ss_enable_auto_arf[i]) {
-            cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
-            break;
-          }
-        }
-      }
-      cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
-#endif
-
+#if !CONFIG_REALTIME_ONLY
       if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) &&
           (oxcf->arnr_strength > 0)) {
         int bitrate = cpi->rc.avg_frame_bandwidth / 40;
@@ -4654,18 +6210,24 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
         int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1);
         not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, vp9_temporal_filter_time);
+#endif
         // Produce the filtered ARF frame.
         vp9_temporal_filter(cpi, arf_src_index);
-        vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+        vpx_extend_frame_borders(&cpi->tf_buffer);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, vp9_temporal_filter_time);
+#endif
 
         // for small bitrates segmentation overhead usually
         // eats all bitrate gain from enabling delta quantizers
         if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame)
           vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi);
 
-        force_src_buffer = &cpi->alt_ref_buffer;
+        force_src_buffer = &cpi->tf_buffer;
       }
-
+#endif
       cm->show_frame = 0;
       cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
@@ -4686,7 +6248,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     }
 
     // Read in the source frame.
-    if (cpi->use_svc)
+    if (cpi->use_svc || cpi->svc.set_intra_only_frame)
       source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
     else
       source = vp9_lookahead_pop(cpi->lookahead, flush);
@@ -4694,11 +6256,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     if (source != NULL) {
       cm->show_frame = 1;
       cm->intra_only = 0;
-      // if the flags indicate intra frame, but if the current picture is for
-      // non-zero spatial layer, it should not be an intra picture.
-      // TODO(Won Kap): this needs to change if per-layer intra frame is
-      // allowed.
-      if ((source->flags & VPX_EFLAG_FORCE_KF) &&
+      // If the flags indicate intra frame, but if the current picture is for
+      // spatial layer above first_spatial_layer_to_encode, it should not be an
+      // intra picture.
+      if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc &&
           cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
         source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
       }
@@ -4723,13 +6284,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     *time_stamp = source->ts_start;
     *time_end = source->ts_end;
     *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
   } else {
     *size = 0;
-    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
-      vp9_end_first_pass(cpi); /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
-    }
     return -1;
   }
 
@@ -4743,10 +6299,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
   // adjust frame rates based on timestamps given
   if (cm->show_frame) {
-    adjust_frame_rate(cpi, source);
+    if (cpi->use_svc && cpi->svc.use_set_ref_frame_config &&
+        cpi->svc.duration[cpi->svc.spatial_layer_id] > 0)
+      vp9_svc_adjust_frame_rate(cpi);
+    else
+      adjust_frame_rate(cpi, source);
   }
 
-  if (is_one_pass_cbr_svc(cpi)) {
+  if (is_one_pass_svc(cpi)) {
     vp9_update_temporal_layer_framerate(cpi);
     vp9_restore_layer_context(cpi);
   }
@@ -4759,62 +6319,150 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   cm->new_fb_idx = get_free_fb(cm);
 
   if (cm->new_fb_idx == INVALID_IDX) return -1;
-
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
-  if (!cpi->use_svc && cpi->multi_arf_allowed) {
-    if (cm->frame_type == KEY_FRAME) {
-      init_buffer_indices(cpi);
-    } else if (oxcf->pass == 2) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
-    }
+  // If the frame buffer for current frame is the same as previous frame, MV in
+  // the base layer shouldn't be used as it'll cause data race.
+  if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) {
+    cpi->svc.use_base_mv = 0;
   }
-
   // Start with a 0 size frame.
   *size = 0;
 
   cpi->frame_flags = *frame_flags;
 
-  if ((oxcf->pass == 2) &&
-      (!cpi->use_svc || (is_two_pass_svc(cpi) &&
-                         cpi->svc.encode_empty_frame_state != ENCODING))) {
+#if !CONFIG_REALTIME_ONLY
+  if ((oxcf->pass == 2) && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
     vp9_rc_get_second_pass_params(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
   } else if (oxcf->pass == 1) {
     set_frame_size(cpi);
   }
 
+  // Key frame temporal filtering
+  const int is_key_temporal_filter_enabled =
+      oxcf->enable_keyframe_filtering && cpi->oxcf.mode != REALTIME &&
+      (oxcf->pass != 1) && !cpi->use_svc &&
+      !is_lossless_requested(&cpi->oxcf) && cm->frame_type == KEY_FRAME &&
+      (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0) &&
+      cpi->oxcf.speed < 2;
+  // Save the pointer to the original source image.
+  YV12_BUFFER_CONFIG *source_buffer = cpi->un_scaled_source;
+
+  if (is_key_temporal_filter_enabled && source != NULL) {
+    // Produce the filtered Key frame. Set distance to -1 since the key frame
+    // is already popped out.
+    vp9_temporal_filter(cpi, -1);
+    vpx_extend_frame_borders(&cpi->tf_buffer);
+    force_src_buffer = &cpi->tf_buffer;
+    cpi->un_scaled_source = cpi->Source =
+        force_src_buffer ? force_src_buffer : &source->img;
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
   if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 &&
       cpi->level_constraint.fail_flag == 0)
     level_rc_framerate(cpi, arf_src_index);
 
   if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) {
-    for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
+    for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
   }
 
-  if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+  if (cpi->kmeans_data_arr_alloc == 0) {
+    const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+    const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_init(&cpi->kmeans_mutex, NULL);
+#endif
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->kmeans_data_arr,
+        vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr)));
+    cpi->kmeans_data_stride = mi_cols;
+    cpi->kmeans_data_arr_alloc = 1;
+  }
+
+#if CONFIG_NON_GREEDY_MV
+  {
+    const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+    const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+    Status status = vp9_alloc_motion_field_info(
+        &cpi->motion_field_info, MAX_ARF_GOP_SIZE, mi_rows, mi_cols);
+    if (status == STATUS_FAILED) {
+      vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,
+                         "vp9_alloc_motion_field_info failed");
+    }
+  }
+#endif  // CONFIG_NON_GREEDY_MV
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, setup_tpl_stats_time);
+#endif
+  if (should_run_tpl(cpi, cpi->twopass.gf_group.index)) {
+    vp9_init_tpl_buffer(cpi);
+    vp9_estimate_tpl_qp_gop(cpi);
+    vp9_setup_tpl_stats(cpi);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, setup_tpl_stats_time);
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+  assert(cpi->oxcf.max_threads == 0 &&
+         "bitstream debug tool does not support multithreading");
+  bitstream_queue_record_write();
+#endif
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+  bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+  cpi->td.mb.fp_src_pred = 0;
+#if CONFIG_REALTIME_ONLY
+  (void)encode_frame_result;
+  if (cpi->use_svc) {
+    SvcEncode(cpi, size, dest, dest_size, frame_flags);
+  } else {
+    // One pass encode
+    Pass0Encode(cpi, size, dest, dest_size, frame_flags);
+  }
+#else  // !CONFIG_REALTIME_ONLY
+  if (oxcf->pass == 1 && !cpi->use_svc) {
     const int lossless = is_lossless_requested(oxcf);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cpi->oxcf.use_highbitdepth)
-      cpi->td.mb.fwd_txm4x4 =
+      cpi->td.mb.fwd_txfm4x4 =
           lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
     else
-      cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
-    cpi->td.mb.highbd_itxm_add =
+      cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+    cpi->td.mb.highbd_inv_txfm_add =
         lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
 #else
-    cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+    cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+    cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
     vp9_first_pass(cpi, source);
-  } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
-    Pass2Encode(cpi, size, dest, frame_flags);
+  } else if (oxcf->pass == 2 && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    // Accumulate 2nd pass time in 2-pass case.
+    start_timing(cpi, Pass2Encode_time);
+#endif
+    Pass2Encode(cpi, size, dest, dest_size, frame_flags, encode_frame_result);
+    vp9_twopass_postencode_update(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, Pass2Encode_time);
+#endif
   } else if (cpi->use_svc) {
-    SvcEncode(cpi, size, dest, frame_flags);
+    SvcEncode(cpi, size, dest, dest_size, frame_flags);
   } else {
     // One pass encode
-    Pass0Encode(cpi, size, dest, frame_flags);
+    Pass0Encode(cpi, size, dest, dest_size, frame_flags);
   }
+#endif  // CONFIG_REALTIME_ONLY
+
+  if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx;
 
   if (cm->refresh_frame_context)
     cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
@@ -4829,26 +6477,35 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   // Save layer specific state.
-  if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
-                                    cpi->svc.number_spatial_layers > 1) &&
-                                   oxcf->pass == 2)) {
+  if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
+                                cpi->svc.number_spatial_layers > 1) &&
+                               oxcf->pass == 2)) {
     vp9_save_layer_context(cpi);
   }
 
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->fixed_qp_onepass = 0;
+
+#if CONFIG_INTERNAL_STATS
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
-
-  // Should we calculate metrics for the frame.
-  if (is_psnr_calc_enabled(cpi)) generate_psnr_packet(cpi);
+#endif
 
   if (cpi->keep_level_stats && oxcf->pass != 1)
     update_level_info(cpi, size, arf_src_index);
 
+#if !CONFIG_REALTIME_ONLY
+  if (is_key_temporal_filter_enabled && cpi->b_calculate_psnr) {
+    cpi->raw_source_frame = vp9_scale_if_required(
+        cm, source_buffer, &cpi->scaled_source, (oxcf->pass == 0), EIGHTTAP, 0);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
 #if CONFIG_INTERNAL_STATS
 
-  if (oxcf->pass != 1) {
+  if (oxcf->pass != 1 && !cpi->last_frame_dropped) {
     double samples = 0.0;
-    cpi->bytes += (int)(*size);
+    cpi->bytes += *size;
 
     if (cm->show_frame) {
       uint32_t bit_depth = 8;
@@ -4868,9 +6525,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
         PSNR_STATS psnr;
 #if CONFIG_VP9_HIGHBITDEPTH
         vpx_calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
-                             in_bit_depth);
+                             in_bit_depth, cpi->svc.spatial_layer_id);
 #else
-        vpx_calc_psnr(orig, recon, &psnr);
+        vpx_calc_psnr(orig, recon, &psnr, cpi->svc.spatial_layer_id);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
         adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
@@ -4898,16 +6555,18 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
             ppflags.post_proc_flag = VP9D_DEBLOCK;
             ppflags.deblocking_level = 0;  // not used in vp9_post_proc_frame()
             ppflags.noise_level = 0;       // not used in vp9_post_proc_frame()
-            vp9_post_proc_frame(cm, pp, &ppflags);
+            vp9_post_proc_frame(cm, pp, &ppflags,
+                                cpi->un_scaled_source->y_width);
           }
 #endif
           vpx_clear_system_state();
 
 #if CONFIG_VP9_HIGHBITDEPTH
           vpx_calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
-                               cpi->oxcf.input_bit_depth);
+                               cpi->oxcf.input_bit_depth,
+                               cpi->svc.spatial_layer_id);
 #else
-          vpx_calc_psnr(orig, pp, &psnr2);
+          vpx_calc_psnr(orig, pp, &psnr2, cpi->svc.spatial_layer_id);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->totalp_sq_error += psnr2.sse[0];
@@ -4944,11 +6603,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           cpi->summedp_quality += frame_ssim2 * weight;
           cpi->summedp_weights += weight;
 #if 0
-          {
+          if (cm->show_frame) {
             FILE *f = fopen("q_used.stt", "a");
             fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
-                    cpi->common.current_video_frame, y2, u2, v2,
-                    frame_psnr2, frame_ssim2);
+                    cpi->common.current_video_frame, psnr2.psnr[1],
+                    psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2);
             fclose(f);
           }
 #endif
@@ -5007,21 +6666,42 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
 #endif
 
-  if (is_two_pass_svc(cpi)) {
-    if (cpi->svc.encode_empty_frame_state == ENCODING) {
-      cpi->svc.encode_empty_frame_state = ENCODED;
-      cpi->svc.encode_intra_empty_frame = 0;
-    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time);
 
-    if (cm->show_frame) {
-      ++cpi->svc.spatial_layer_to_encode;
-      if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
-        cpi->svc.spatial_layer_to_encode = 0;
+  // Print out timing information.
+  // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+  // show_existing_frame and lag-in-frames.
+  //  if (cpi->frame_component_time[0] > 100)
+  if (oxcf->pass == 2) {
+    uint64_t frame_total = 0, total = 0;
+    int i;
 
-      // May need the empty frame after an visible frame.
-      cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE;
+    fprintf(stderr,
+            "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n",
+            cm->current_video_frame, get_frame_type_enum(cm->frame_type),
+            cm->show_frame, cm->base_qindex);
+    for (i = 0; i < kTimingComponents; i++) {
+      cpi->component_time[i] += cpi->frame_component_time[i];
+      // Use vp9_get_compressed_data_time (i = 0) as the total time.
+      if (i == 0) {
+        frame_total = cpi->frame_component_time[0];
+        total = cpi->component_time[0];
+      }
+      fprintf(stderr,
+              " %50s:  %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+              " us [%6.2f%%])\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              (float)((float)cpi->frame_component_time[i] * 100.0 /
+                      (float)frame_total),
+              cpi->component_time[i],
+              (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+      cpi->frame_component_time[i] = 0;
     }
-  } else if (is_one_pass_cbr_svc(cpi)) {
+  }
+#endif
+
+  if (is_one_pass_svc(cpi)) {
     if (cm->show_frame) {
       ++cpi->svc.spatial_layer_to_encode;
       if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
@@ -5045,7 +6725,7 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
   } else {
     int ret;
 #if CONFIG_VP9_POSTPROC
-    ret = vp9_post_proc_frame(cm, dest, flags);
+    ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width);
 #else
     if (cm->frame_to_show) {
       *dest = *cm->frame_to_show;
@@ -5063,12 +6743,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
   }
 }
 
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode) {
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode) {
   VP9_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
-  if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+  if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1;
 
   Scale2Ratio(horiz_mode, &hr, &hs);
   Scale2Ratio(vert_mode, &vr, &vs);
@@ -5090,20 +6770,21 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height) {
   VP9_COMMON *cm = &cpi->common;
 #if CONFIG_VP9_HIGHBITDEPTH
-  check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+  update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x,
+                       cpi->common.subsampling_y);
 #else
-  check_initial_width(cpi, 1, 1);
+  update_initial_width(cpi, 0, cpi->common.subsampling_x,
+                       cpi->common.subsampling_y);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   setup_denoiser_buffer(cpi);
 #endif
-
+  alloc_raw_frame_buffers(cpi);
   if (width) {
     cm->width = width;
     if (cm->width > cpi->initial_width) {
       cm->width = cpi->initial_width;
-      printf("Warning: Desired width too large, changed to %d\n", cm->width);
     }
   }
 
@@ -5111,7 +6792,6 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
     cm->height = height;
     if (cm->height > cpi->initial_height) {
       cm->height = cpi->initial_height;
-      printf("Warning: Desired height too large, changed to %d\n", cm->height);
     }
   }
   assert(cm->width <= cpi->initial_width);
@@ -5127,7 +6807,7 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
   return;
 }
 
-int vp9_get_quantizer(VP9_COMP *cpi) { return cpi->common.base_qindex; }
+int vp9_get_quantizer(const VP9_COMP *cpi) { return cpi->common.base_qindex; }
 
 void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) {
   if (flags &
@@ -5161,3 +6841,28 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) {
     vp9_update_entropy(cpi, 0);
   }
 }
+
+void vp9_set_row_mt(VP9_COMP *cpi) {
+  // Enable row based multi-threading for supported modes of encoding
+  cpi->row_mt = 0;
+  if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
+       cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
+      cpi->oxcf.row_mt && !cpi->use_svc)
+    cpi->row_mt = 1;
+
+  if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 &&
+      (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.row_mt &&
+      !cpi->use_svc)
+    cpi->row_mt = 1;
+
+  // In realtime mode, enable row based multi-threading for all the speed levels
+  // where non-rd path is used.
+  if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) {
+    cpi->row_mt = 1;
+  }
+
+  if (cpi->row_mt)
+    cpi->row_mt_bit_exact = 1;
+  else
+    cpi->row_mt_bit_exact = 0;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
index de324d3aab..f58fa2470e 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -8,20 +8,26 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_ENCODER_H_
-#define VP9_ENCODER_VP9_ENCODER_H_
+#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_
+#define VPX_VP9_ENCODER_VP9_ENCODER_H_
 
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx/vp8cx.h"
+#include "vpx/vpx_tpl.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
 #include "vpx_dsp/variance.h"
+#include "vpx_dsp/psnr.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_pthread.h"
 #include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_timestamp.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_ppflags.h"
@@ -29,11 +35,16 @@
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_alt_ref_aq.h"
+#endif
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_ext_ratectrl.h"
 #include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_job_queue.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -82,13 +93,6 @@ typedef enum {
   ENCODE_BREAKOUT_LIMITED = 2
 } ENCODE_BREAKOUT_TYPE;
 
-typedef enum {
-  NORMAL = 0,
-  FOURFIVE = 1,
-  THREEFIVE = 2,
-  ONETWO = 3
-} VPX_SCALING;
-
 typedef enum {
   // Good Quality Fast Encoding. The encoder balances quality with the amount of
   // time it takes to encode the output. Speed setting controls how fast.
@@ -117,9 +121,11 @@ typedef enum {
   COMPLEXITY_AQ = 2,
   CYCLIC_REFRESH_AQ = 3,
   EQUATOR360_AQ = 4,
+  PERCEPTUAL_AQ = 5,
+  PSNR_AQ = 6,
   // AQ based on lookahead temporal
   // variance (only valid for altref frames)
-  LOOKAHEAD_AQ = 5,
+  LOOKAHEAD_AQ = 7,
   AQ_MODE_COUNT  // This should always be the last member of the enum
 } AQ_MODE;
 
@@ -129,6 +135,22 @@ typedef enum {
   RESIZE_DYNAMIC = 2  // Coded size of each frame is determined by the codec.
 } RESIZE_TYPE;
 
+typedef enum {
+  kInvalid = 0,
+  kLowSadLowSumdiff = 1,
+  kLowSadHighSumdiff = 2,
+  kHighSadLowSumdiff = 3,
+  kHighSadHighSumdiff = 4,
+  kLowVarHighSumdiff = 5,
+  kVeryHighSad = 6,
+} CONTENT_STATE_SB;
+
+typedef enum {
+  LOOPFILTER_ALL = 0,
+  LOOPFILTER_REFERENCE = 1,  // Disable loopfilter on non reference frames.
+  NO_LOOPFILTER = 2,         // Disable loopfilter on all frames.
+} LOOPFILTER_CONTROL;
+
 typedef struct VP9EncoderConfig {
   BITSTREAM_PROFILE profile;
   vpx_bit_depth_t bit_depth;     // Codec bit-depth.
@@ -136,7 +158,10 @@ typedef struct VP9EncoderConfig {
   int height;                    // height of data passed to the compressor
   unsigned int input_bit_depth;  // Input bit depth.
   double init_framerate;         // set to passed in framerate
-  int64_t target_bandwidth;      // bandwidth to be used in bits per second
+  vpx_rational_t g_timebase;  // equivalent to g_timebase in vpx_codec_enc_cfg_t
+  vpx_rational64_t g_timebase_in_ts;  // g_timebase * TICKS_PER_SEC
+
+  int64_t target_bandwidth;  // bandwidth to be used in bits per second
 
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;          // sharpening output: recommendation 0:
@@ -197,6 +222,7 @@ typedef struct VP9EncoderConfig {
   int two_pass_vbrbias;  // two pass datarate control tweaks
   int two_pass_vbrmin_section;
   int two_pass_vbrmax_section;
+  int vbr_corpus_complexity;  // 0 indicates corpus vbr disabled
   // END DATARATE CONTROL OPTIONS
   // ----------------------------------------------------------------
 
@@ -235,16 +261,15 @@ typedef struct VP9EncoderConfig {
   int tile_columns;
   int tile_rows;
 
+  int enable_tpl_model;
+
+  int enable_keyframe_filtering;
+
   int max_threads;
 
   unsigned int target_level;
 
   vpx_fixed_buf_t two_pass_stats_in;
-  struct vpx_codec_pkt_list *output_pkt_list;
-
-#if CONFIG_FP_MB_STATS
-  vpx_fixed_buf_t firstpass_mb_stats_in;
-#endif
 
   vp8e_tuning tuning;
   vp9e_tune_content content;
@@ -256,21 +281,102 @@ typedef struct VP9EncoderConfig {
   int render_width;
   int render_height;
   VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+
+  int row_mt;
+  unsigned int motion_vector_unit_test;
+  int delta_q_uv;
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
   return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
 }
 
+typedef struct TplDepStats {
+  int64_t intra_cost;
+  int64_t inter_cost;
+  int64_t mc_flow;
+  int64_t mc_dep_cost;
+  int64_t mc_ref_cost;
+
+  int ref_frame_index;
+  int_mv mv;
+} TplDepStats;
+
+#if CONFIG_NON_GREEDY_MV
+
+#define ZERO_MV_MODE 0
+#define NEW_MV_MODE 1
+#define NEAREST_MV_MODE 2
+#define NEAR_MV_MODE 3
+#define MAX_MV_MODE 4
+#endif
+
+typedef struct TplDepFrame {
+  uint8_t is_valid;
+  TplDepStats *tpl_stats_ptr;
+  int stride;
+  int width;
+  int height;
+  int mi_rows;
+  int mi_cols;
+  int base_qindex;
+#if CONFIG_NON_GREEDY_MV
+  int lambda;
+  int *mv_mode_arr[3];
+  double *rd_diff_arr[3];
+#endif
+} TplDepFrame;
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
 // TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
 typedef struct TileDataEnc {
   TileInfo tile_info;
   int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-  int mode_map[BLOCK_SIZES][MAX_MODES];
-  int m_search_count;
-  int ex_search_count;
+  int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
+  int8_t mode_map[BLOCK_SIZES][MAX_MODES];
+  FIRSTPASS_DATA fp_data;
+  VP9RowMTSync row_mt_sync;
+
+  // Used for adaptive_rd_thresh with row multithreading
+  int *row_base_thresh_freq_fact;
+  // The value of sb_rows when row_base_thresh_freq_fact is allocated.
+  // The row_base_thresh_freq_fact array has sb_rows * BLOCK_SIZES * MAX_MODES
+  // elements.
+  int sb_rows;
+  MV firstpass_top_mv;
 } TileDataEnc;
 
+typedef struct RowMTInfo {
+  JobQueueHandle job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t job_mutex;
+#endif
+} RowMTInfo;
+
+typedef struct {
+  TOKENEXTRA *start;
+  TOKENEXTRA *stop;
+  unsigned int count;
+} TOKENLIST;
+
+typedef struct MultiThreadHandle {
+  int allocated_tile_rows;
+  int allocated_tile_cols;
+  int allocated_vert_unit_rows;
+
+  // Frame level params
+  int num_tile_vert_sbs[MAX_NUM_TILE_ROWS];
+
+  // Job Queue structure and handles
+  JobQueue *job_queue;
+
+  int jobs_per_tile_col;
+
+  RowMTInfo row_mt_info[MAX_NUM_TILE_COLS];
+  int thread_id_to_tile_id[MAX_NUM_THREADS];  // Mapping of threads to tiles
+} MultiThreadHandle;
+
 typedef struct RD_COUNTS {
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
@@ -312,6 +418,7 @@ typedef struct IMAGE_STAT {
 
 typedef enum {
   LEVEL_UNKNOWN = 0,
+  LEVEL_AUTO = 1,
   LEVEL_1 = 10,
   LEVEL_1_1 = 11,
   LEVEL_2 = 20,
@@ -333,6 +440,7 @@ typedef struct {
   VP9_LEVEL level;
   uint64_t max_luma_sample_rate;
   uint32_t max_luma_picture_size;
+  uint32_t max_luma_picture_breadth;
   double average_bitrate;  // in kilobits per second
   double max_cpb_size;     // in kilobits
   double compression_ratio;
@@ -372,25 +480,118 @@ typedef struct {
 
 typedef enum {
   BITRATE_TOO_LARGE = 0,
-  LUMA_PIC_SIZE_TOO_LARGE = 1,
-  LUMA_SAMPLE_RATE_TOO_LARGE = 2,
-  CPB_TOO_LARGE = 3,
-  COMPRESSION_RATIO_TOO_SMALL = 4,
-  TOO_MANY_COLUMN_TILE = 5,
-  ALTREF_DIST_TOO_SMALL = 6,
-  TOO_MANY_REF_BUFFER = 7,
-  TARGET_LEVEL_FAIL_IDS = 8
+  LUMA_PIC_SIZE_TOO_LARGE,
+  LUMA_PIC_BREADTH_TOO_LARGE,
+  LUMA_SAMPLE_RATE_TOO_LARGE,
+  CPB_TOO_LARGE,
+  COMPRESSION_RATIO_TOO_SMALL,
+  TOO_MANY_COLUMN_TILE,
+  ALTREF_DIST_TOO_SMALL,
+  TOO_MANY_REF_BUFFER,
+  TARGET_LEVEL_FAIL_IDS
 } TARGET_LEVEL_FAIL_ID;
 
 typedef struct {
   int8_t level_index;
-  uint8_t rc_config_updated;
   uint8_t fail_flag;
   int max_frame_size;   // in bits
   double max_cpb_size;  // in bits
 } LevelConstraint;
 
+typedef struct ARNRFilterData {
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+  int strength;
+  int frame_count;
+  int alt_ref_index;
+  struct scale_factors sf;
+  YV12_BUFFER_CONFIG *dst;
+} ARNRFilterData;
+
+typedef struct EncFrameBuf {
+  int mem_valid;
+  int released;
+  YV12_BUFFER_CONFIG frame;
+} EncFrameBuf;
+
+// Maximum operating frame buffer size needed for a GOP using ARF reference.
+// This is used to allocate the memory for TPL stats for a GOP.
+#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS)
+#define MAX_KMEANS_GROUPS 8
+
+typedef struct KMEANS_DATA {
+  double value;
+  int pos;
+  int group_idx;
+} KMEANS_DATA;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "vpx_ports/vpx_timer.h"
+// Adjust the following to add new components.
+typedef enum {
+  vp9_get_compressed_data_time,
+  vp9_temporal_filter_time,
+  vp9_rc_get_second_pass_params_time,
+  setup_tpl_stats_time,
+  Pass2Encode_time,
+
+  encode_with_recode_loop_time,
+  loopfilter_frame_time,
+  vp9_pack_bitstream_time,
+
+  encode_frame_internal_time,
+  rd_pick_partition_time,
+  rd_pick_sb_modes_time,
+  encode_sb_time,
+
+  vp9_rd_pick_inter_mode_sb_time,
+  vp9_rd_pick_inter_mode_sub8x8_time,
+
+  intra_mode_search_time,
+  handle_inter_mode_time,
+  single_motion_search_time,
+  joint_motion_search_time,
+  interp_filter_time,
+
+  kTimingComponents,
+} TIMING_COMPONENT;
+
+static INLINE char const *get_component_name(int index) {
+  switch (index) {
+    case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
+    case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
+    case vp9_rc_get_second_pass_params_time:
+      return "vp9_rc_get_second_pass_params_time";
+    case setup_tpl_stats_time: return "setup_tpl_stats_time";
+    case Pass2Encode_time: return "Pass2Encode_time";
+
+    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case loopfilter_frame_time: return "loopfilter_frame_time";
+    case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
+
+    case encode_frame_internal_time: return "encode_frame_internal_time";
+    case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+    case encode_sb_time: return "encode_sb_time";
+
+    case vp9_rd_pick_inter_mode_sb_time:
+      return "vp9_rd_pick_inter_mode_sb_time";
+    case vp9_rd_pick_inter_mode_sub8x8_time:
+      return "vp9_rd_pick_inter_mode_sub8x8_time";
+
+    case intra_mode_search_time: return "intra_mode_search_time";
+    case handle_inter_mode_time: return "handle_inter_mode_time";
+    case single_motion_search_time: return "single_motion_search_time";
+    case joint_motion_search_time: return "joint_motion_search_time";
+    case interp_filter_time: return "interp_filter_time";
+
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
 typedef struct VP9_COMP {
+  FRAME_INFO frame_info;
   QUANTS quants;
   ThreadData td;
   MB_MODE_INFO_EXT *mbmi_ext_base;
@@ -413,17 +614,39 @@ typedef struct VP9_COMP {
 #endif
   YV12_BUFFER_CONFIG *raw_source_frame;
 
+  BLOCK_SIZE tpl_bsize;
+  TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];
+  // Used to store TPL stats before propagation
+  VpxTplGopStats tpl_gop_stats;
+  YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
+  EncFrameBuf enc_frame_buf[REF_FRAMES];
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t kmeans_mutex;
+#endif
+  int kmeans_data_arr_alloc;
+  KMEANS_DATA *kmeans_data_arr;
+  int kmeans_data_size;
+  int kmeans_data_stride;
+  double kmeans_ctr_ls[MAX_KMEANS_GROUPS];
+  double kmeans_boundary_ls[MAX_KMEANS_GROUPS];
+  int kmeans_count_ls[MAX_KMEANS_GROUPS];
+  int kmeans_ctr_num;
+#if CONFIG_NON_GREEDY_MV
+  MotionFieldInfo motion_field_info;
+  int tpl_ready;
+  int_mv *select_mv_arr;
+#endif
+
   TileDataEnc *tile_data;
   int allocated_tiles;  // Keep track of memory allocated for tiles.
 
-  // For a still frame, this flag is set to 1 to skip partition search.
-  int partition_search_skippable_frame;
-
-  int scaled_ref_idx[MAX_REF_FRAMES];
+  int scaled_ref_idx[REFS_PER_FRAME];
   int lst_fb_idx;
   int gld_fb_idx;
   int alt_fb_idx;
 
+  int ref_fb_idx[REF_FRAMES];
+
   int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_alt_ref_frame;
@@ -436,14 +659,23 @@ typedef struct VP9_COMP {
   int ext_refresh_frame_context_pending;
   int ext_refresh_frame_context;
 
+  int64_t norm_wiener_variance;
+  int64_t *mb_wiener_variance;
+  int mb_wiener_var_rows;
+  int mb_wiener_var_cols;
+  double *mi_ssim_rdmult_scaling_factors;
+
+  int64_t *sb_mul_scale;
+
   YV12_BUFFER_CONFIG last_frame_uf;
 
   TOKENEXTRA *tile_tok[4][1 << 6];
-  uint32_t tok_count[4][1 << 6];
+  TOKENLIST *tplist[4][1 << 6];
 
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
 
+  RD_CONTROL rd_ctrl;
   RD_OPT rd;
 
   CODING_CONTEXT coding_context;
@@ -460,7 +692,7 @@ typedef struct VP9_COMP {
   RATE_CONTROL rc;
   double framerate;
 
-  int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
+  int interp_filter_selected[REF_FRAMES][SWITCHABLE];
 
   struct vpx_codec_pkt_list *output_pkt_list;
 
@@ -485,23 +717,23 @@ typedef struct VP9_COMP {
 
   uint8_t *segmentation_map;
 
-  // segment threashold for encode breakout
+  uint8_t *skin_map;
+
+  // segment threshold for encode breakout
   int segment_encode_breakout[MAX_SEGMENTS];
 
   CYCLIC_REFRESH *cyclic_refresh;
   ActiveMap active_map;
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  vp9_full_search_fn_t full_search_sad;
+  struct scale_factors me_sf;
   vp9_diamond_search_fn_t diamond_search_sad;
   vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+#if CONFIG_INTERNAL_STATS
   uint64_t time_receive_data;
   uint64_t time_compress_data;
   uint64_t time_pick_lpf;
   uint64_t time_encode_sb_row;
-
-#if CONFIG_FP_MB_STATS
-  int use_fp_mb_stats;
 #endif
 
   TWO_PASS twopass;
@@ -509,7 +741,7 @@ typedef struct VP9_COMP {
   // Force recalculation of segment_ids for each mode info
   uint8_t force_update_segmentation;
 
-  YV12_BUFFER_CONFIG alt_ref_buffer;
+  YV12_BUFFER_CONFIG tf_buffer;
 
   // class responsible for adaptive
   // quantization of altref frames
@@ -530,7 +762,7 @@ typedef struct VP9_COMP {
   double total_blockiness;
   double worst_blockiness;
 
-  int bytes;
+  uint64_t bytes;
   double summed_quality;
   double summed_weights;
   double summedp_quality;
@@ -563,16 +795,13 @@ typedef struct VP9_COMP {
                     // number of MBs in the current frame when the frame is
                     // scaled.
 
+  int last_coded_width;
+  int last_coded_height;
+
   int use_svc;
 
   SVC svc;
 
-  // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
-  diff *source_diff_var;
-  // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
-  unsigned int source_var_thresh;
-  int frames_till_next_var_check;
-
   int frame_flags;
 
   search_site_config ss_cfg;
@@ -583,17 +812,15 @@ typedef struct VP9_COMP {
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
-
-  int multi_arf_allowed;
-  int multi_arf_enabled;
-  int multi_arf_last_grp_enabled;
+  // Indices are:  max_tx_size-1,  tx_size_ctx,    tx_size
+  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
 
   int resize_pending;
-  int resize_state;
+  RESIZE_STATE resize_state;
   int external_resize;
   int resize_scale_num;
   int resize_scale_den;
@@ -629,17 +856,84 @@ typedef struct VP9_COMP {
 
   int keep_level_stats;
   Vp9LevelInfo level_info;
+  MultiThreadHandle multi_thread_ctxt;
+  void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
+  void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
+  ARNRFilterData arnr_filter_data;
+
+  int row_mt;
+  unsigned int row_mt_bit_exact;
 
   // Previous Partition Info
   BLOCK_SIZE *prev_partition;
   int8_t *prev_segment_id;
+  // Used to save the status of whether a block has a low variance in
+  // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
+  // 32x32, 9~24 for 16x16.
+  // This is for the last frame and is copied to the current frame
+  // when partition copy happens.
+  uint8_t *prev_variance_low;
+  uint8_t *copied_frame_cnt;
+  uint8_t max_copied_frame;
+  // If the last frame is dropped, we don't copy partition.
+  uint8_t last_frame_dropped;
+
+  // For each superblock: keeps track of the last time (in frame distance) the
+  // the superblock did not have low source sad.
+  uint8_t *content_state_sb_fd;
+
+  int compute_source_sad_onepass;
+
+  int compute_frame_low_motion_onepass;
 
   LevelConstraint level_constraint;
+
+  uint8_t *count_arf_frame_usage;
+  uint8_t *count_lastgolden_frame_usage;
+
+  int multi_layer_arf;
+  vpx_roi_map_t roi;
+
+  LOOPFILTER_CONTROL loopfilter_ctrl;
+  EXT_RATECTRL ext_ratectrl;
+
+  int fixed_qp_onepass;
+
+  // Flag to keep track of dynamic change in deadline mode
+  // (good/best/realtime).
+  MODE deadline_mode_previous_frame;
+
+  // Flag to disable scene detection when rtc rate control library is used.
+  int disable_scene_detection_rtc_ratectrl;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  /*!
+   * component_time[] are initialized to zero while encoder starts.
+   */
+  uint64_t component_time[kTimingComponents];
+  /*!
+   * Stores timing for individual components between calls of start_timing()
+   * and end_timing().
+   */
+  struct vpx_usec_timer component_timer[kTimingComponents];
+  /*!
+   * frame_component_time[] are initialized to zero at beginning of each frame.
+   */
+  uint64_t frame_component_time[kTimingComponents];
+#endif
 } VP9_COMP;
 
+typedef struct ENCODE_FRAME_RESULT {
+  int show_idx;
+  FRAME_UPDATE_TYPE update_type;
+  int quantize_index;
+} ENCODE_FRAME_RESULT;
+
+void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result);
+
 void vp9_initialize_enc(void);
 
-struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+struct VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
                                        BufferPool *const pool);
 void vp9_remove_compressor(VP9_COMP *cpi);
 
@@ -649,11 +943,12 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
 // frame is made and not just a copy of the pointer..
 int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                          int64_t end_time_stamp);
+                          int64_t end_time);
 
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush);
+                            size_t *size, uint8_t *dest, size_t dest_size,
+                            int64_t *time_stamp, int64_t *time_end, int flush,
+                            ENCODE_FRAME_RESULT *encode_frame_result);
 
 int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
                               vp9_ppflags_t *flags);
@@ -670,25 +965,63 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
 
 int vp9_update_entropy(VP9_COMP *cpi, int update);
 
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols);
 
-int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols);
 
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode);
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode);
 
 int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height);
 
 void vp9_set_svc(VP9_COMP *cpi, int use_svc);
 
-int vp9_get_quantizer(struct VP9_COMP *cpi);
+// Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the
+// configuration change has a large change in avg_frame_bandwidth.
+// For SVC check for resetting based on spatial layer average bandwidth.
+// Also reset buffer level to optimal level.
+void vp9_check_reset_rc_flag(VP9_COMP *cpi);
+
+void vp9_set_rc_buffer_sizes(VP9_COMP *cpi);
+
+static INLINE int stack_pop(int *stack, int stack_size) {
+  int idx;
+  const int r = stack[0];
+  for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx];
+
+  return r;
+}
+
+static INLINE int stack_top(const int *stack) { return stack[0]; }
+
+static INLINE void stack_push(int *stack, int new_item, int stack_size) {
+  int idx;
+  for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1];
+  stack[0] = new_item;
+}
+
+static INLINE void stack_init(int *stack, int length) {
+  int idx;
+  for (idx = 0; idx < length; ++idx) stack[idx] = -1;
+}
+
+int vp9_get_quantizer(const VP9_COMP *cpi);
 
 static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
+static INLINE int ref_frame_to_flag(int8_t ref_frame) {
+  static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                          VP9_ALT_FLAG };
+  assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+  return kVp9RefFlagList[ref_frame];
+}
+
 static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
                                         MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
@@ -707,9 +1040,25 @@ static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
 }
 
+static INLINE RefCntBuffer *get_ref_cnt_buffer(const VP9_COMMON *cm,
+                                               int fb_idx) {
+  return fb_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[fb_idx] : NULL;
+}
+
+static INLINE void get_ref_frame_bufs(
+    const VP9_COMP *cpi, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+    int ref_frame_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+    int inter_ref_idx = mv_ref_frame_to_inter_ref_idx(ref_frame);
+    ref_frame_bufs[inter_ref_idx] = get_ref_cnt_buffer(cm, ref_frame_buf_idx);
+  }
+}
+
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
-    VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
-  VP9_COMMON *const cm = &cpi->common;
+    const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
   const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
   return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
                                 : NULL;
@@ -721,7 +1070,13 @@ static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
   // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full
   // resolution. We assume up to 1 token per pixel, and then allow
   // a head room of 4.
-  return mb_rows * mb_cols * (16 * 16 * 3 + 4);
+
+  // Use aligned mb_rows and mb_cols to better align with actual token sizes.
+  const int aligned_mb_rows =
+      ALIGN_POWER_OF_TWO(mb_rows, MI_BLOCK_SIZE_LOG2 - 1);
+  const int aligned_mb_cols =
+      ALIGN_POWER_OF_TWO(mb_cols, MI_BLOCK_SIZE_LOG2 - 1);
+  return aligned_mb_rows * aligned_mb_cols * (16 * 16 * 3 + 4);
 }
 
 // Get the allocated token size for a tile. It does the same calculation as in
@@ -733,6 +1088,20 @@ static INLINE int allocated_tokens(TileInfo tile) {
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
+static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TOKENEXTRA **tok) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+
+  int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1;
+  const int mb_row = (mi_row - tile_info->mi_row_start) >> 1;
+
+  *tok =
+      cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols);
+}
+
 int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 #if CONFIG_VP9_HIGHBITDEPTH
 int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
@@ -743,37 +1112,46 @@ void vp9_scale_references(VP9_COMP *cpi);
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
 
+void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
+                            RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
+                            int *ref_frame_coding_indexes,
+                            int *ref_frame_valid_list);
+
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
-YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
-                                           YV12_BUFFER_CONFIG *unscaled,
-                                           YV12_BUFFER_CONFIG *scaled,
-                                           YV12_BUFFER_CONFIG *scaled_temp);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst, int bd);
+#else
+void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                             YV12_BUFFER_CONFIG *dst);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
-                                          YV12_BUFFER_CONFIG *unscaled,
-                                          YV12_BUFFER_CONFIG *scaled,
-                                          int use_normative_scaler);
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+    VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+    int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
 
 void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
 
-static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
-  return cpi->use_svc && cpi->oxcf.pass != 0;
-}
-
-static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) {
   return (cpi->use_svc && cpi->oxcf.pass == 0);
 }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
+  return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >=
+                                                cpi->svc.first_layer_denoise));
+}
+#endif
+
+#define MIN_LOOKAHEAD_FOR_ARFS 4
 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
   return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) &&
-         cpi->oxcf.lag_in_frames > 0 &&
-         (cpi->oxcf.enable_auto_arf &&
-          (!is_two_pass_svc(cpi) ||
-           cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
+         cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS &&
+         cpi->oxcf.enable_auto_arf;
 }
 
-static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
                                 MV_REFERENCE_FRAME ref1) {
   xd->block_refs[0] =
@@ -790,6 +1168,18 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
   return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
 }
 
+static INLINE int get_num_vert_units(TileInfo tile, int shift) {
+  int num_vert_units =
+      (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift;
+  return num_vert_units;
+}
+
+static INLINE int get_num_cols(TileInfo tile, int shift) {
+  int num_cols =
+      (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift;
+  return num_cols;
+}
+
 static INLINE int get_level_index(VP9_LEVEL level) {
   int i;
   for (i = 0; i < VP9_LEVELS; ++i) {
@@ -798,14 +1188,204 @@ static INLINE int get_level_index(VP9_LEVEL level) {
   return -1;
 }
 
+// Return the log2 value of max column tiles corresponding to the level that
+// the picture size fits into.
+static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
+                                                   uint32_t height) {
+  int i;
+  const uint32_t pic_size = width * height;
+  const uint32_t pic_breadth = VPXMAX(width, height);
+  for (i = 0; i < VP9_LEVELS; ++i) {
+    if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+        vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
+      return get_msb(vp9_level_defs[i].max_col_tiles);
+    }
+  }
+  return INT_MAX;
+}
+
 VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
 
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+                                unsigned int rows, unsigned int cols,
+                                int delta_q[8], int delta_lf[8], int skip[8],
+                                int ref_frame[8]);
+
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
+void vp9_set_row_mt(VP9_COMP *cpi);
+
+int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
+
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
 
+static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
+  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+  if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
+      new_fb_ptr->mi_cols < cm->mi_cols) {
+    vpx_free(new_fb_ptr->mvs);
+    CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs,
+                    (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                         sizeof(*new_fb_ptr->mvs)));
+    new_fb_ptr->mi_rows = cm->mi_rows;
+    new_fb_ptr->mi_cols = cm->mi_cols;
+  }
+}
+
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          int *const comp_cost[2]) {
+  assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
+  assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
+  return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+         comp_cost[1][mv->col];
+}
+
+static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv,
+                                 const MV *ref, int sad_per_bit) {
+  MV diff;
+  diff.row = mv->row - ref->row;
+  diff.col = mv->col - ref->col;
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+      VP9_PROB_COST_SHIFT);
+}
+
+static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full,
+                                        const MV *ref_mv_full,
+                                        vpx_sad_fn_t sad_fn_ptr, int sadpb) {
+  const int src_buf_stride = x->plane[0].src.stride;
+  const uint8_t *const src_buf = x->plane[0].src.buf;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pred_buf_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *const pred_buf =
+      xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+  uint32_t start_mv_sad =
+      sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+  start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb);
+
+  return start_mv_sad;
+}
+
+static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
+                                  int subsampling_dim, int blk_dim) {
+  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+}
+
+// Compute the sum of squares on all visible 4x4s in the transform block.
+static int64_t sum_squares_visible(const MACROBLOCKD *xd,
+                                   const struct macroblockd_plane *const pd,
+                                   const int16_t *diff, const int diff_stride,
+                                   int blk_row, int blk_col,
+                                   const BLOCK_SIZE plane_bsize,
+                                   const BLOCK_SIZE tx_bsize,
+                                   int *visible_width, int *visible_height) {
+  int64_t sse;
+  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+  const int b4x4s_to_right_edge = num_4x4_to_edge(
+      plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col);
+  const int b4x4s_to_bottom_edge = num_4x4_to_edge(
+      plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row);
+  if (tx_bsize == BLOCK_4X4 ||
+      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+    assert(tx_4x4_w == tx_4x4_h);
+    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
+    *visible_width = tx_4x4_w << 2;
+    *visible_height = tx_4x4_h << 2;
+  } else {
+    int r, c;
+    const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+    const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+    sse = 0;
+    // if we are in the unrestricted motion border.
+    for (r = 0; r < max_r; ++r) {
+      // Skip visiting the sub blocks that are wholly within the UMV.
+      for (c = 0; c < max_c; ++c) {
+        sse += (int64_t)vpx_sum_squares_2d_i16(
+            diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
+      }
+    }
+    *visible_width = max_c << 2;
+    *visible_height = max_r << 2;
+  }
+  return sse;
+}
+
+// Check if trellis coefficient optimization of the transform block is enabled.
+static INLINE int do_trellis_opt(const struct macroblockd_plane *pd,
+                                 const int16_t *src_diff, int diff_stride,
+                                 int blk_row, int blk_col,
+                                 BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                 void *arg) {
+  const struct encode_b_args *const args = (struct encode_b_args *)arg;
+  const MACROBLOCK *const x = args->x;
+
+  switch (args->enable_trellis_opt) {
+    case DISABLE_TRELLIS_OPT: return 0;
+    case ENABLE_TRELLIS_OPT: return 1;
+    case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: {
+      vpx_clear_system_state();
+
+      return (args->trellis_opt_thresh > 0.0)
+                 ? (x->log_block_src_var <= args->trellis_opt_thresh)
+                 : 1;
+    }
+    case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: {
+      const MACROBLOCKD *const xd = &x->e_mbd;
+      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int dequant_shift =
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+#else
+      const int dequant_shift = 3;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      const int qstep = pd->dequant[1] >> dequant_shift;
+      int *sse_calc_done = args->sse_calc_done;
+      int64_t *sse = args->sse;
+      int visible_width = 0, visible_height = 0;
+
+      // TODO: Enable the sf for high bit-depth case
+      if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse ||
+          !sse_calc_done)
+        return 1;
+
+      *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row,
+                                 blk_col, plane_bsize, tx_bsize, &visible_width,
+                                 &visible_height);
+      *sse_calc_done = 1;
+
+      vpx_clear_system_state();
+
+      return (*(sse) <= (int64_t)visible_width * visible_height * qstep *
+                            qstep * args->trellis_opt_thresh);
+    }
+    default: assert(0 && "Invalid trellis optimization method."); return 1;
+  }
+}
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(VP9_COMP *cpi, int component) {
+  vpx_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(VP9_COMP *cpi, int component) {
+  vpx_usec_timer_mark(&cpi->component_timer[component]);
+  cpi->frame_component_time[component] +=
+      vpx_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+  switch (type) {
+    case 0: return "KEY_FRAME";
+    case 1: return "INTER_FRAME";
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_ENCODER_H_
+#endif  // VPX_VP9_ENCODER_VP9_ENCODER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
index f4f7c7bacc..efe47259ef 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
@@ -8,10 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/vpx_pthread.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
   int i, j, k, l, m, n;
@@ -32,7 +38,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
                   td_t->rd_counts.coef_counts[i][j][k][l][m][n];
 }
 
-static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+static int enc_worker_hook(void *arg1, void *unused) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   VP9_COMP *const cpi = thread_data->cpi;
   const VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
@@ -49,7 +56,7 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
     vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
   }
 
-  return 0;
+  return 1;
 }
 
 static int get_max_tile_cols(VP9_COMP *cpi) {
@@ -61,81 +68,145 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
   vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
   log2_tile_cols =
       clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+  if (cpi->oxcf.target_level == LEVEL_AUTO) {
+    const int level_tile_cols =
+        log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
+    if (log2_tile_cols > level_tile_cols) {
+      log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
+    }
+  }
   return (1 << log2_tile_cols);
 }
 
+static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
+  VP9_COMMON *const cm = &cpi->common;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  int i;
+  // While using SVC, we need to allocate threads according to the highest
+  // resolution. When row based multithreading is enabled, it is OK to
+  // allocate more threads than the number of max tile columns.
+  if (cpi->use_svc && !cpi->row_mt) {
+    int max_tile_cols = get_max_tile_cols(cpi);
+    num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
+  }
+  assert(num_workers > 0);
+  if (num_workers == cpi->num_workers) return;
+  vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  vp9_encode_free_mt_data(cpi);
+
+  CHECK_MEM_ERROR(&cm->error, cpi->workers,
+                  vpx_malloc(num_workers * sizeof(*cpi->workers)));
+
+  CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data,
+                  vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+    ++cpi->num_workers;
+    winterface->init(worker);
+    worker->thread_name = "vpx enc worker";
+
+    if (i < num_workers - 1) {
+      thread_data->cpi = cpi;
+
+      // Allocate thread data.
+      CHECK_MEM_ERROR(&cm->error, thread_data->td,
+                      vpx_memalign(32, sizeof(*thread_data->td)));
+      vp9_zero(*thread_data->td);
+
+      // Set up pc_tree.
+      thread_data->td->leaf_tree = NULL;
+      thread_data->td->pc_tree = NULL;
+      vp9_setup_pc_tree(cm, thread_data->td);
+
+      // Allocate frame counters in thread data.
+      CHECK_MEM_ERROR(&cm->error, thread_data->td->counts,
+                      vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+      // Create threads
+      if (!winterface->reset(worker))
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
+    } else {
+      // Main thread acts as a worker and uses the thread data in cpi.
+      thread_data->cpi = cpi;
+      thread_data->td = &cpi->td;
+    }
+    winterface->sync(worker);
+  }
+}
+
+static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
+                               int num_workers) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  int i;
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    worker->hook = hook;
+    worker->data1 = &cpi->tile_thr_data[i];
+    worker->data2 = data2;
+  }
+
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+}
+
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi) {
+  int t;
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VPxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    vpx_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp9_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+  }
+  vpx_free(cpi->tile_thr_data);
+  cpi->tile_thr_data = NULL;
+  vpx_free(cpi->workers);
+  cpi->workers = NULL;
+  cpi->num_workers = 0;
+}
+
 void vp9_encode_tiles_mt(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
-  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
   int i;
 
   vp9_init_tile_data(cpi);
 
-  // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
-    int allocated_workers = num_workers;
-
-    // While using SVC, we need to allocate threads according to the highest
-    // resolution.
-    if (cpi->use_svc) {
-      int max_tile_cols = get_max_tile_cols(cpi);
-      allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
-    }
-
-    CHECK_MEM_ERROR(cm, cpi->workers,
-                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
-
-    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                    vpx_calloc(allocated_workers, sizeof(*cpi->tile_thr_data)));
-
-    for (i = 0; i < allocated_workers; i++) {
-      VPxWorker *const worker = &cpi->workers[i];
-      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
-
-      ++cpi->num_workers;
-      winterface->init(worker);
-
-      if (i < allocated_workers - 1) {
-        thread_data->cpi = cpi;
-
-        // Allocate thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td,
-                        vpx_memalign(32, sizeof(*thread_data->td)));
-        vp9_zero(*thread_data->td);
-
-        // Set up pc_tree.
-        thread_data->td->leaf_tree = NULL;
-        thread_data->td->pc_tree = NULL;
-        vp9_setup_pc_tree(cm, thread_data->td);
-
-        // Allocate frame counters in thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td->counts,
-                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
-
-        // Create threads
-        if (!winterface->reset(worker))
-          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
-                             "Tile encoder thread creation failed");
-      } else {
-        // Main thread acts as a worker and uses the thread data in cpi.
-        thread_data->cpi = cpi;
-        thread_data->td = &cpi->td;
-      }
-
-      winterface->sync(worker);
-    }
-  }
+  create_enc_workers(cpi, num_workers);
 
   for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *thread_data;
-
-    worker->hook = (VPxWorkerHook)enc_worker_hook;
-    worker->data1 = &cpi->tile_thr_data[i];
-    worker->data2 = NULL;
-    thread_data = (EncWorkerData *)worker->data1;
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -165,25 +236,449 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
     }
   }
 
-  // Encode a frame
-  for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
-
-    // Set the starting tile for each thread.
-    thread_data->start = i;
-
-    if (i == cpi->num_workers - 1)
-      winterface->execute(worker);
-    else
-      winterface->launch(worker);
-  }
-
-  // Encoding ends.
-  for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    winterface->sync(worker);
-  }
+  launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers);
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+    // Accumulate counters.
+    if (i < cpi->num_workers - 1) {
+      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+    }
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
+                                    TileDataEnc *tile_data_t) {
+  tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor;
+  tile_data->fp_data.brightness_factor +=
+      tile_data_t->fp_data.brightness_factor;
+  tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error;
+  tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error;
+  tile_data->fp_data.frame_noise_energy +=
+      tile_data_t->fp_data.frame_noise_energy;
+  tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error;
+  tile_data->fp_data.intercount += tile_data_t->fp_data.intercount;
+  tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count;
+  tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count;
+  tile_data->fp_data.intra_count_low += tile_data_t->fp_data.intra_count_low;
+  tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high;
+  tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
+  tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+  tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count;
+  tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
+  tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
+  tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
+  tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs;
+  tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs;
+  tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs;
+  tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors;
+  tile_data->fp_data.intra_smooth_count +=
+      tile_data_t->fp_data.intra_smooth_count;
+  const int min_start_row = VPXMIN(tile_data->fp_data.image_data_start_row,
+                                   tile_data_t->fp_data.image_data_start_row);
+  tile_data->fp_data.image_data_start_row =
+      (min_start_row == INVALID_ROW)
+          ? VPXMAX(tile_data->fp_data.image_data_start_row,
+                   tile_data_t->fp_data.image_data_start_row)
+          : min_start_row;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// Allocate memory for row synchronization
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
+                               int rows) {
+  row_mt_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+
+    CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex,
+                    vpx_malloc(sizeof(*row_mt_sync->mutex) * rows));
+    if (row_mt_sync->mutex) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&row_mt_sync->mutex[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond,
+                    vpx_malloc(sizeof(*row_mt_sync->cond) * rows));
+    if (row_mt_sync->cond) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&row_mt_sync->cond[i], NULL);
+      }
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col,
+                  vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+
+  // Set up nsync.
+  row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) {
+  if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+    int i;
+
+    if (row_mt_sync->mutex != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_mutex_destroy(&row_mt_sync->mutex[i]);
+      }
+      vpx_free(row_mt_sync->mutex);
+    }
+    if (row_mt_sync->cond != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_cond_destroy(&row_mt_sync->cond[i]);
+      }
+      vpx_free(row_mt_sync->cond);
+    }
+#endif  // CONFIG_MULTITHREAD
+    vpx_free(row_mt_sync->cur_col);
+    // clear the structure as the source of this call may be dynamic change
+    // in tiles in which case this call will be followed by an _alloc()
+    // which may fail.
+    vp9_zero(*row_mt_sync);
+  }
+}
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = row_mt_sync->sync_range;
+
+  if (r && !(c & (nsync - 1))) {
+    pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) {
+      pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) {
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  return;
+}
+
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+                           const int cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = row_mt_sync->sync_range;
+  int cur;
+  // Only signal when there are enough encoded blocks for next row to run.
+  int sig = 1;
+
+  if (c < cols - 1) {
+    cur = c;
+    if (c % nsync != nsync - 1) sig = 0;
+  } else {
+    cur = cols + nsync;
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&row_mt_sync->mutex[r]);
+
+    row_mt_sync->cur_col[r] = cur;
+
+    pthread_cond_signal(&row_mt_sync->cond[r]);
+    pthread_mutex_unlock(&row_mt_sync->mutex[r]);
+  }
+#else
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+                                 const int cols) {
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+  return;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int first_pass_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int tile_row, tile_col;
+  TileDataEnc *this_tile;
+  int end_of_frame;
+  int thread_id = thread_data->thread_id;
+  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  JobNode *proc_job = NULL;
+  FIRSTPASS_DATA fp_acc_data;
+  MV zero_mv = { 0, 0 };
+  MV best_ref_mv;
+  int mb_row;
+
+  end_of_frame = 0;
+  while (0 == end_of_frame) {
+    // Get the next job in the queue
+    proc_job =
+        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+    if (NULL == proc_job) {
+      // Query for the status of other tiles
+      end_of_frame = vp9_get_tiles_proc_status(
+          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+          tile_cols);
+    } else {
+      tile_col = proc_job->tile_col_id;
+      tile_row = proc_job->tile_row_id;
+
+      this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      mb_row = proc_job->vert_unit_row_num;
+
+      best_ref_mv = zero_mv;
+      vp9_zero(fp_acc_data);
+      fp_acc_data.image_data_start_row = INVALID_ROW;
+      vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data,
+                                        this_tile, &best_ref_mv, mb_row);
+    }
+  }
+  return 1;
+}
+
+void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  TileDataEnc *first_tile_col;
+  int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+  int i;
+
+  if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+      multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+      multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+    vp9_row_mt_mem_dealloc(cpi);
+    vp9_init_tile_data(cpi);
+    vp9_row_mt_mem_alloc(cpi);
+  } else {
+    vp9_init_tile_data(cpi);
+  }
+
+  create_enc_workers(cpi, num_workers);
+
+  vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+  vp9_prepare_job_queue(cpi, FIRST_PASS_JOB);
+
+  vp9_multi_thread_tile_init(cpi);
+
+  for (i = 0; i < num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+    }
+  }
+
+  launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt,
+                     num_workers);
+
+  first_tile_col = &cpi->tile_data[0];
+  for (i = 1; i < tile_cols; i++) {
+    TileDataEnc *this_tile = &cpi->tile_data[i];
+    accumulate_fp_tile_stat(first_tile_col, this_tile);
+  }
+}
+
+static int temporal_filter_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int tile_row, tile_col;
+  int mb_col_start, mb_col_end;
+  TileDataEnc *this_tile;
+  int end_of_frame;
+  int thread_id = thread_data->thread_id;
+  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  JobNode *proc_job = NULL;
+  int mb_row;
+
+  end_of_frame = 0;
+  while (0 == end_of_frame) {
+    // Get the next job in the queue
+    proc_job =
+        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+    if (NULL == proc_job) {
+      // Query for the status of other tiles
+      end_of_frame = vp9_get_tiles_proc_status(
+          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+          tile_cols);
+    } else {
+      tile_col = proc_job->tile_col_id;
+      tile_row = proc_job->tile_row_id;
+      this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT;
+      mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT;
+      mb_row = proc_job->vert_unit_row_num;
+
+      vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row,
+                                        mb_col_start, mb_col_end);
+    }
+  }
+  return 1;
+}
+
+void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int num_workers = cpi->num_workers ? cpi->num_workers : 1;
+  int i;
+
+  if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+      multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+      multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+    vp9_row_mt_mem_dealloc(cpi);
+    vp9_init_tile_data(cpi);
+    vp9_row_mt_mem_alloc(cpi);
+  } else {
+    vp9_init_tile_data(cpi);
+  }
+
+  create_enc_workers(cpi, num_workers);
+
+  vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+  vp9_prepare_job_queue(cpi, ARNR_JOB);
+
+  for (i = 0; i < num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+    }
+  }
+
+  launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt,
+                     num_workers);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int tile_row, tile_col;
+  int end_of_frame;
+  int thread_id = thread_data->thread_id;
+  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  JobNode *proc_job = NULL;
+  int mi_row;
+
+  end_of_frame = 0;
+  while (0 == end_of_frame) {
+    // Get the next job in the queue
+    proc_job =
+        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+    if (NULL == proc_job) {
+      // Query for the status of other tiles
+      end_of_frame = vp9_get_tiles_proc_status(
+          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+          tile_cols);
+    } else {
+      tile_col = proc_job->tile_col_id;
+      tile_row = proc_job->tile_row_id;
+      mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE;
+
+      vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
+    }
+  }
+  return 1;
+}
+
+void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+  int i;
+
+  if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+      multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+      multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+    vp9_row_mt_mem_dealloc(cpi);
+    vp9_init_tile_data(cpi);
+    vp9_row_mt_mem_alloc(cpi);
+  } else {
+    vp9_init_tile_data(cpi);
+  }
+
+  create_enc_workers(cpi, num_workers);
+
+  vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+  vp9_prepare_job_queue(cpi, ENCODE_JOB);
+
+  vp9_multi_thread_tile_init(cpi);
+
+  for (i = 0; i < num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->rd_counts = cpi->td.rd_counts;
+    }
+    if (thread_data->td->counts != &cpi->common.counts) {
+      memcpy(thread_data->td->counts, &cpi->common.counts,
+             sizeof(cpi->common.counts));
+    }
+
+    // Handle use_nonrd_pick_mode case.
+    if (cpi->sf.use_nonrd_pick_mode) {
+      MACROBLOCK *const x = &thread_data->td->mb;
+      MACROBLOCKD *const xd = &x->e_mbd;
+      struct macroblock_plane *const p = x->plane;
+      struct macroblockd_plane *const pd = xd->plane;
+      PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+      int j;
+
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        p[j].coeff = ctx->coeff_pbuf[j][0];
+        p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+        pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+        p[j].eobs = ctx->eobs_pbuf[j][0];
+      }
+    }
+  }
+
+  launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt,
+                     num_workers);
 
   for (i = 0; i < num_workers; i++) {
     VPxWorker *const worker = &cpi->workers[i];
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
index 1efa4dcde2..46478bef9f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
@@ -8,13 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_ETHREAD_H_
-#define VP9_ENCODER_VP9_ETHREAD_H_
+#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_
+#define VPX_VP9_ENCODER_VP9_ETHREAD_H_
+
+#include "vpx_util/vpx_pthread.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define MAX_NUM_TILE_COLS (1 << 6)
+#define MAX_NUM_TILE_ROWS 4
+#define MAX_NUM_THREADS 64
+
 struct VP9_COMP;
 struct ThreadData;
 
@@ -22,12 +28,52 @@ typedef struct EncWorkerData {
   struct VP9_COMP *cpi;
   struct ThreadData *td;
   int start;
+  int thread_id;
+  int tile_completion_status[MAX_NUM_TILE_COLS];
 } EncWorkerData;
 
+// Encoder row synchronization
+typedef struct VP9RowMTSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex;
+  pthread_cond_t *cond;
+#endif
+  // Allocate memory to store the sb/mb block index in each row.
+  int *cur_col;
+  int sync_range;
+  int rows;
+} VP9RowMTSync;
+
+// Frees EncWorkerData related allocations made by vp9_encode_*_mt().
+// row_mt specific data is freed with vp9_row_mt_mem_dealloc() and is not
+// called by this function.
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi);
+
 void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
 
+void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi);
+
+void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+                           const int cols);
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+                                 const int cols);
+
+// Allocate memory for row based multi-threading synchronization.
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm,
+                               int rows);
+
+// Deallocate row based multi-threading synchronization related mutex and data.
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
+
+void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_ETHREAD_H_
+#endif  // VPX_VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
new file mode 100644
index 0000000000..e8cc006b1e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -0,0 +1,257 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include "vp9/encoder/vp9_ext_ratectrl.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
+
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  vp9_zero(*ext_ratectrl);
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+                                 vpx_rc_config_t ratectrl_config,
+                                 EXT_RATECTRL *ext_ratectrl) {
+  vpx_rc_status_t rc_status;
+  vpx_rc_firstpass_stats_t *rc_firstpass_stats;
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  vp9_extrc_delete(ext_ratectrl);
+  ext_ratectrl->funcs = funcs;
+  ext_ratectrl->ratectrl_config = ratectrl_config;
+  rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv,
+                                               &ext_ratectrl->ratectrl_config,
+                                               &ext_ratectrl->model);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats;
+  rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count;
+  rc_firstpass_stats->frame_stats =
+      vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) *
+                 rc_firstpass_stats->num_frames);
+  if (rc_firstpass_stats->frame_stats == NULL) {
+    return VPX_CODEC_MEM_ERROR;
+  }
+  if (funcs.rate_ctrl_log_path != NULL) {
+    ext_ratectrl->log_file = fopen(funcs.rate_ctrl_log_path, "w");
+    if (!ext_ratectrl->log_file) {
+      return VPX_CODEC_ERROR;
+    }
+  } else {
+    ext_ratectrl->log_file = NULL;
+  }
+  ext_ratectrl->ready = 1;
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready) {
+    if (ext_ratectrl->log_file) {
+      fclose(ext_ratectrl->log_file);
+    }
+    vpx_rc_status_t rc_status =
+        ext_ratectrl->funcs.delete_model(ext_ratectrl->model);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+    vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats);
+  }
+  return vp9_extrc_init(ext_ratectrl);
+}
+
+static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats,
+                                   vpx_rc_frame_stats_t *rc_frame_stats) {
+  rc_frame_stats->frame = stats->frame;
+  rc_frame_stats->weight = stats->weight;
+  rc_frame_stats->intra_error = stats->intra_error;
+  rc_frame_stats->coded_error = stats->coded_error;
+  rc_frame_stats->sr_coded_error = stats->sr_coded_error;
+  rc_frame_stats->frame_noise_energy = stats->frame_noise_energy;
+  rc_frame_stats->pcnt_inter = stats->pcnt_inter;
+  rc_frame_stats->pcnt_motion = stats->pcnt_motion;
+  rc_frame_stats->pcnt_second_ref = stats->pcnt_second_ref;
+  rc_frame_stats->pcnt_neutral = stats->pcnt_neutral;
+  rc_frame_stats->pcnt_intra_low = stats->pcnt_intra_low;
+  rc_frame_stats->pcnt_intra_high = stats->pcnt_intra_high;
+  rc_frame_stats->intra_skip_pct = stats->intra_skip_pct;
+  rc_frame_stats->intra_smooth_pct = stats->intra_smooth_pct;
+  rc_frame_stats->inactive_zone_rows = stats->inactive_zone_rows;
+  rc_frame_stats->inactive_zone_cols = stats->inactive_zone_cols;
+  rc_frame_stats->MVr = stats->MVr;
+  rc_frame_stats->mvr_abs = stats->mvr_abs;
+  rc_frame_stats->MVc = stats->MVc;
+  rc_frame_stats->mvc_abs = stats->mvc_abs;
+  rc_frame_stats->MVrv = stats->MVrv;
+  rc_frame_stats->MVcv = stats->MVcv;
+  rc_frame_stats->mv_in_out_count = stats->mv_in_out_count;
+  rc_frame_stats->duration = stats->duration;
+  rc_frame_stats->count = stats->count;
+  rc_frame_stats->new_mv_count = stats->new_mv_count;
+}
+
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+    EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready) {
+    vpx_rc_status_t rc_status;
+    vpx_rc_firstpass_stats_t *rc_firstpass_stats =
+        &ext_ratectrl->rc_firstpass_stats;
+    int i;
+    assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames);
+    for (i = 0; i < rc_firstpass_stats->num_frames; ++i) {
+      gen_rc_firstpass_stats(&first_pass_info->stats[i],
+                             &rc_firstpass_stats->frame_stats[i]);
+    }
+    rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model,
+                                                         rc_firstpass_stats);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
+                                         const VpxTplGopStats *tpl_gop_stats) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) {
+    vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats(
+        ext_ratectrl->model, tpl_gop_stats);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
+  // TODO(angiebird): Add unit test to make sure this function behaves like
+  // get_frame_type_from_update_type()
+  // TODO(angiebird): Merge this function with get_frame_type_from_update_type()
+  switch (update_type) {
+    case KF_UPDATE: return 0;       // kFrameTypeKey;
+    case ARF_UPDATE: return 2;      // kFrameTypeAltRef;
+    case GF_UPDATE: return 4;       // kFrameTypeGolden;
+    case OVERLAY_UPDATE: return 3;  // kFrameTypeOverlay;
+    case LF_UPDATE: return 1;       // kFrameTypeInter;
+    default:
+      fprintf(stderr, "Unsupported update_type %d\n", update_type);
+      abort();
+  }
+}
+
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
+    EXT_RATECTRL *ext_ratectrl, int gop_index,
+    vpx_rc_encodeframe_decision_t *encode_frame_decision) {
+  assert(ext_ratectrl != NULL);
+  assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0);
+
+  vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+      ext_ratectrl->model, gop_index, encode_frame_decision);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+    EXT_RATECTRL *ext_ratectrl, int64_t bit_count, int actual_encoding_qindex) {
+  if (ext_ratectrl == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  if (ext_ratectrl->ready) {
+    vpx_rc_status_t rc_status;
+    vpx_rc_encodeframe_result_t encode_frame_result;
+    encode_frame_result.bit_count = bit_count;
+    encode_frame_result.actual_encoding_qindex = actual_encoding_qindex;
+    rc_status = ext_ratectrl->funcs.update_encodeframe_result(
+        ext_ratectrl->model, &encode_frame_result);
+    if (rc_status == VPX_RC_ERROR) {
+      return VPX_CODEC_ERROR;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+    EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) {
+  vpx_rc_status_t rc_status;
+  if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+      (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  rc_status =
+      ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_key_frame_decision(
+    EXT_RATECTRL *ext_ratectrl,
+    vpx_rc_key_frame_decision_t *key_frame_decision) {
+  if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+      (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_key_frame_decision(
+      ext_ratectrl->model, key_frame_decision);
+  return rc_status == VPX_RC_OK ? VPX_CODEC_OK : VPX_CODEC_ERROR;
+}
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    int *rdmult) {
+  vpx_rc_status_t rc_status;
+  vpx_rc_encodeframe_info_t encode_frame_info;
+  if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+      (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+  encode_frame_info.show_index = show_index;
+  encode_frame_info.coding_index = coding_index;
+  encode_frame_info.gop_index = gop_index;
+  encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+  encode_frame_info.gop_size = gop_size;
+  encode_frame_info.use_alt_ref = use_alt_ref;
+
+  vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+                         encode_frame_info.ref_frame_coding_indexes,
+                         encode_frame_info.ref_frame_valid_list);
+  rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model,
+                                                   &encode_frame_info, rdmult);
+  if (rc_status == VPX_RC_ERROR) {
+    return VPX_CODEC_ERROR;
+  }
+  return VPX_CODEC_OK;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
new file mode 100644
index 0000000000..4ea1b24646
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
+#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
+
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vpx_tpl.h"
+#include "vp9/encoder/vp9_firstpass.h"
+
+typedef struct EXT_RATECTRL {
+  int ready;
+  int ext_rdmult;
+  vpx_rc_model_t model;
+  vpx_rc_funcs_t funcs;
+  vpx_rc_config_t ratectrl_config;
+  vpx_rc_firstpass_stats_t rc_firstpass_stats;
+  FILE *log_file;
+} EXT_RATECTRL;
+
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+                                 vpx_rc_config_t ratectrl_config,
+                                 EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+    EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info);
+
+vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl,
+                                         const VpxTplGopStats *tpl_gop_stats);
+
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
+    EXT_RATECTRL *ext_ratectrl, int gop_index,
+    vpx_rc_encodeframe_decision_t *encode_frame_decision);
+
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl,
+                                                    int64_t bit_count,
+                                                    int actual_encoding_qindex);
+
+vpx_codec_err_t vp9_extrc_get_key_frame_decision(
+    EXT_RATECTRL *ext_ratectrl,
+    vpx_rc_key_frame_decision_t *key_frame_decision);
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl,
+                                           vpx_rc_gop_decision_t *gop_decision);
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+    EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+    FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+    RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+    int *rdmult);
+
+#endif  // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
index f8e24610ae..69261ac65f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
@@ -18,18 +18,26 @@
 static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
                                   uint8_t *dst, int dst_pitch, int w, int h,
                                   int extend_top, int extend_left,
-                                  int extend_bottom, int extend_right) {
-  int i, linesize;
+                                  int extend_bottom, int extend_right,
+                                  int interleave_step) {
+  int i, j, linesize;
+  const int step = interleave_step < 1 ? 1 : interleave_step;
 
   // copy the left and right most columns out
   const uint8_t *src_ptr1 = src;
-  const uint8_t *src_ptr2 = src + w - 1;
+  const uint8_t *src_ptr2 = src + (w - 1) * step;
   uint8_t *dst_ptr1 = dst - extend_left;
   uint8_t *dst_ptr2 = dst + w;
 
   for (i = 0; i < h; i++) {
     memset(dst_ptr1, src_ptr1[0], extend_left);
-    memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+    if (step == 1) {
+      memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+    } else {
+      for (j = 0; j < w; j++) {
+        dst_ptr1[extend_left + j] = src_ptr1[step * j];
+      }
+    }
     memset(dst_ptr2, src_ptr2[0], extend_right);
     src_ptr1 += src_pitch;
     src_ptr2 += src_pitch;
@@ -122,6 +130,8 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   const int el_uv = el_y >> uv_width_subsampling;
   const int eb_uv = eb_y >> uv_height_subsampling;
   const int er_uv = er_y >> uv_width_subsampling;
+  // detect nv12 colorspace
+  const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -142,50 +152,13 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
 
   copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
                         dst->y_stride, src->y_crop_width, src->y_crop_height,
-                        et_y, el_y, eb_y, er_y);
+                        et_y, el_y, eb_y, er_y, 1);
 
   copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
                         dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
-                        et_uv, el_uv, eb_uv, er_uv);
+                        et_uv, el_uv, eb_uv, er_uv, chroma_step);
 
   copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
                         dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
-                        et_uv, el_uv, eb_uv, er_uv);
-}
-
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst, int srcy,
-                                         int srcx, int srch, int srcw) {
-  // If the side is not touching the bounder then don't extend.
-  const int et_y = srcy ? 0 : dst->border;
-  const int el_y = srcx ? 0 : dst->border;
-  const int eb_y = srcy + srch != src->y_height
-                       ? 0
-                       : dst->border + dst->y_height - src->y_height;
-  const int er_y = srcx + srcw != src->y_width
-                       ? 0
-                       : dst->border + dst->y_width - src->y_width;
-  const int src_y_offset = srcy * src->y_stride + srcx;
-  const int dst_y_offset = srcy * dst->y_stride + srcx;
-
-  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
-  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
-  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
-  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
-  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
-  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
-  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
-
-  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
-                        dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
-                        et_y, el_y, eb_y, er_y);
-
-  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
-                        dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
-                        srch_uv, et_uv, el_uv, eb_uv, er_uv);
-
-  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
-                        dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
-                        srch_uv, et_uv, el_uv, eb_uv, er_uv);
+                        et_uv, el_uv, eb_uv, er_uv, chroma_step);
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
index c0dd757159..21d7e68b9f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_EXTEND_H_
-#define VP9_ENCODER_VP9_EXTEND_H_
+#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_
+#define VPX_VP9_ENCODER_VP9_EXTEND_H_
 
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
@@ -21,11 +21,8 @@ extern "C" {
 void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst);
 
-void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst, int srcy,
-                                         int srcx, int srch, int srcw);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_EXTEND_H_
+#endif  // VPX_VP9_ENCODER_VP9_EXTEND_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
index 72e9ac77e7..e5918a9bfe 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -10,6 +10,7 @@
 
 #include <limits.h>
 #include <math.h>
+#include <stdint.h>
 #include <stdio.h>
 
 #include "./vpx_dsp_rtcd.h"
@@ -31,38 +32,59 @@
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_ext_ratectrl.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
 #include "vpx_dsp/variance.h"
 
 #define OUTPUT_FPF 0
 #define ARF_STATS_OUTPUT 0
+#define COMPLEXITY_STATS_OUTPUT 0
 
-#define BOOST_BREAKOUT 12.5
-#define BOOST_FACTOR 12.5
-#define FACTOR_PT_LOW 0.70
-#define FACTOR_PT_HIGH 0.90
 #define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 96.0
-#define INTRA_MODE_PENALTY 1024
-#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+#define MIN_ARF_GF_BOOST 250
 #define MIN_DECAY_FACTOR 0.01
 #define NEW_MV_MODE_PENALTY 32
-#define SVC_FACTOR_PT_LOW 0.45
 #define DARK_THRESH 64
-#define DEFAULT_GRP_WEIGHT 1.0
-#define RC_FACTOR_MIN 0.75
-#define RC_FACTOR_MAX 1.75
-#define SECTION_NOISE_DEF 250.0
 #define LOW_I_THRESH 24000
 
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 
-#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define LOW_CODED_ERR_PER_MB 10.0
+#define NCOUNT_FRAME_II_THRESH 6.0
+#define BASELINE_ERR_PER_MB 12500.0
+#define GF_MAX_FRAME_BOOST 96.0
+
+#ifdef AGGRESSIVE_VBR
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 80.0
+#define MAX_KF_TOT_BOOST 4800
+#else
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 96.0
+#define MAX_KF_TOT_BOOST 5400
+#endif
+
+#define DEFAULT_ZM_FACTOR 0.5
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+#define AV_WQ_FACTOR 4.0
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #if ARF_STATS_OUTPUT
 unsigned int arf_count = 0;
@@ -92,14 +114,8 @@ static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
   return 1;
 }
 
-static void output_stats(FIRSTPASS_STATS *stats,
-                         struct vpx_codec_pkt_list *pktlist) {
-  struct vpx_codec_cx_pkt pkt;
-  pkt.kind = VPX_CODEC_STATS_PKT;
-  pkt.data.twopass_stats.buf = stats;
-  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
-  vpx_codec_pkt_list_add(pktlist, &pkt);
-
+static void output_stats(FIRSTPASS_STATS *stats) {
+  (void)stats;
 // TEMP debug code
 #if OUTPUT_FPF
   {
@@ -107,13 +123,15 @@ static void output_stats(FIRSTPASS_STATS *stats,
     fpfile = fopen("firstpass.stt", "a");
 
     fprintf(fpfile,
-            "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf"
+            "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf"
             "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
-            "%12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf"
+            "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf"
+            "%12.4lf"
             "\n",
             stats->frame, stats->weight, stats->intra_error, stats->coded_error,
             stats->sr_coded_error, stats->frame_noise_energy, stats->pcnt_inter,
             stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral,
+            stats->pcnt_intra_low, stats->pcnt_intra_high,
             stats->intra_skip_pct, stats->intra_smooth_pct,
             stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
             stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
@@ -123,17 +141,6 @@ static void output_stats(FIRSTPASS_STATS *stats,
 #endif
 }
 
-#if CONFIG_FP_MB_STATS
-static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
-                              struct vpx_codec_pkt_list *pktlist) {
-  struct vpx_codec_cx_pkt pkt;
-  pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
-  pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
-  pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
-  vpx_codec_pkt_list_add(pktlist, &pkt);
-}
-#endif
-
 static void zero_stats(FIRSTPASS_STATS *section) {
   section->frame = 0.0;
   section->weight = 0.0;
@@ -147,8 +154,11 @@ static void zero_stats(FIRSTPASS_STATS *section) {
   section->pcnt_neutral = 0.0;
   section->intra_skip_pct = 0.0;
   section->intra_smooth_pct = 0.0;
+  section->pcnt_intra_low = 0.0;
+  section->pcnt_intra_high = 0.0;
   section->inactive_zone_rows = 0.0;
   section->inactive_zone_cols = 0.0;
+  section->new_mv_count = 0.0;
   section->MVr = 0.0;
   section->mvr_abs = 0.0;
   section->MVc = 0.0;
@@ -176,8 +186,11 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
   section->pcnt_neutral += frame->pcnt_neutral;
   section->intra_skip_pct += frame->intra_skip_pct;
   section->intra_smooth_pct += frame->intra_smooth_pct;
+  section->pcnt_intra_low += frame->pcnt_intra_low;
+  section->pcnt_intra_high += frame->pcnt_intra_high;
   section->inactive_zone_rows += frame->inactive_zone_rows;
   section->inactive_zone_cols += frame->inactive_zone_cols;
+  section->new_mv_count += frame->new_mv_count;
   section->MVr += frame->MVr;
   section->mvr_abs += frame->mvr_abs;
   section->MVc += frame->MVc;
@@ -203,8 +216,11 @@ static void subtract_stats(FIRSTPASS_STATS *section,
   section->pcnt_neutral -= frame->pcnt_neutral;
   section->intra_skip_pct -= frame->intra_skip_pct;
   section->intra_smooth_pct -= frame->intra_smooth_pct;
+  section->pcnt_intra_low -= frame->pcnt_intra_low;
+  section->pcnt_intra_high -= frame->pcnt_intra_high;
   section->inactive_zone_rows -= frame->inactive_zone_rows;
   section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->new_mv_count -= frame->new_mv_count;
   section->MVr -= frame->MVr;
   section->mvr_abs -= frame->mvr_abs;
   section->MVc -= frame->MVc;
@@ -220,28 +236,37 @@ static void subtract_stats(FIRSTPASS_STATS *section,
 // bars and partially discounts other 0 energy areas.
 #define MIN_ACTIVE_AREA 0.5
 #define MAX_ACTIVE_AREA 1.0
-static double calculate_active_area(const VP9_COMP *cpi,
+static double calculate_active_area(const FRAME_INFO *frame_info,
                                     const FIRSTPASS_STATS *this_frame) {
   double active_pct;
 
   active_pct =
       1.0 -
       ((this_frame->intra_skip_pct / 2) +
-       ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+       ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows));
   return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
 }
 
+// Get the average weighted error for the clip (or corpus)
+static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) {
+  const double av_weight =
+      twopass->total_stats.weight / twopass->total_stats.count;
+
+  if (cpi->oxcf.vbr_corpus_complexity)
+    return av_weight * twopass->mean_mod_score;
+  else
+    return (twopass->total_stats.coded_error * av_weight) /
+           twopass->total_stats.count;
+}
+
+#define ACT_AREA_CORRECTION 0.5
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
-#define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const VP9_COMP *cpi,
-                                     const TWO_PASS *twopass,
-                                     const VP9EncoderConfig *oxcf,
-                                     const FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = &twopass->total_stats;
-  const double av_weight = stats->weight / stats->count;
-  const double av_err = (stats->coded_error * av_weight) / stats->count;
-  double modified_error =
+static double calculate_mod_frame_score(const VP9_COMP *cpi,
+                                        const VP9EncoderConfig *oxcf,
+                                        const FIRSTPASS_STATS *this_frame,
+                                        const double av_err) {
+  double modified_score =
       av_err * pow(this_frame->coded_error * this_frame->weight /
                        DOUBLE_DIVIDE_CHECK(av_err),
                    oxcf->two_pass_vbrbias / 100.0);
@@ -251,11 +276,44 @@ static double calculate_modified_err(const VP9_COMP *cpi,
   // remaining active MBs. The correction here assumes that coding
   // 0.5N blocks of complexity 2X is a little easier than coding N
   // blocks of complexity X.
-  modified_error *=
-      pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+  modified_score *= pow(calculate_active_area(&cpi->frame_info, this_frame),
+                        ACT_AREA_CORRECTION);
 
-  return fclamp(modified_error, twopass->modified_error_min,
-                twopass->modified_error_max);
+  return modified_score;
+}
+
+static double calc_norm_frame_score(const VP9EncoderConfig *oxcf,
+                                    const FRAME_INFO *frame_info,
+                                    const FIRSTPASS_STATS *this_frame,
+                                    double mean_mod_score, double av_err) {
+  double modified_score =
+      av_err * pow(this_frame->coded_error * this_frame->weight /
+                       DOUBLE_DIVIDE_CHECK(av_err),
+                   oxcf->two_pass_vbrbias / 100.0);
+
+  const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0;
+  const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0;
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_score *=
+      pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION);
+
+  // Normalize to a midpoint score.
+  modified_score /= DOUBLE_DIVIDE_CHECK(mean_mod_score);
+  return fclamp(modified_score, min_score, max_score);
+}
+
+static double calculate_norm_frame_score(const VP9_COMP *cpi,
+                                         const TWO_PASS *twopass,
+                                         const VP9EncoderConfig *oxcf,
+                                         const FIRSTPASS_STATS *this_frame,
+                                         const double av_err) {
+  return calc_norm_frame_score(oxcf, &cpi->frame_info, this_frame,
+                               twopass->mean_mod_score, av_err);
 }
 
 // This function returns the maximum target rate per frame.
@@ -277,15 +335,10 @@ void vp9_init_first_pass(VP9_COMP *cpi) {
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  if (is_two_pass_svc(cpi)) {
-    int i;
-    for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
-      output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
-                   cpi->output_pkt_list);
-    }
-  } else {
-    output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
-  }
+  output_stats(&cpi->twopass.total_stats);
+  cpi->twopass.first_pass_done = 1;
+  vpx_free(cpi->twopass.fp_mb_float_stats);
+  cpi->twopass.fp_mb_float_stats = NULL;
 }
 
 static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
@@ -317,7 +370,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
         case BLOCK_8X16: return vpx_highbd_8_mse8x16;
         default: return vpx_highbd_8_mse16x16;
       }
-      break;
     case 10:
       switch (bsize) {
         case BLOCK_8X8: return vpx_highbd_10_mse8x8;
@@ -325,7 +377,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
         case BLOCK_8X16: return vpx_highbd_10_mse8x16;
         default: return vpx_highbd_10_mse16x16;
       }
-      break;
     case 12:
       switch (bsize) {
         case BLOCK_8X8: return vpx_highbd_12_mse8x8;
@@ -333,7 +384,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
         case BLOCK_8X16: return vpx_highbd_12_mse8x16;
         default: return vpx_highbd_12_mse16x16;
       }
-      break;
   }
 }
 
@@ -352,12 +402,36 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
 // for first pass test.
 static int get_search_range(const VP9_COMP *cpi) {
   int sr = 0;
-  const int dim = VPXMIN(cpi->initial_width, cpi->initial_height);
+  int dim = VPXMIN(cpi->initial_width, cpi->initial_height);
+  dim = VPXMAX(dim, MI_SIZE);
 
   while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
   return sr;
 }
 
+// Reduce limits to keep the motion search within MV_MAX of ref_mv. Not doing
+// this can be problematic for big videos (8K) and may cause assert failure
+// (or memory violation) in mv_cost. Limits are only modified if they would
+// be non-empty. Returns 1 if limits are non-empty.
+static int intersect_limits_with_mv_max(MvLimits *mv_limits, const MV *ref_mv) {
+  const int row_min =
+      VPXMAX(mv_limits->row_min, (ref_mv->row + 7 - MV_MAX) >> 3);
+  const int row_max =
+      VPXMIN(mv_limits->row_max, (ref_mv->row - 1 + MV_MAX) >> 3);
+  const int col_min =
+      VPXMAX(mv_limits->col_min, (ref_mv->col + 7 - MV_MAX) >> 3);
+  const int col_max =
+      VPXMIN(mv_limits->col_max, (ref_mv->col - 1 + MV_MAX) >> 3);
+  if (row_min > row_max || col_min > col_max) {
+    return 0;
+  }
+  mv_limits->row_min = row_min;
+  mv_limits->row_max = row_max;
+  mv_limits->col_min = col_min;
+  mv_limits->col_max = col_max;
+  return 1;
+}
+
 static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                      const MV *ref_mv, MV *best_mv,
                                      int *best_motion_err) {
@@ -368,13 +442,21 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+  MV center_mv_full = ref_mv_full;
+  unsigned int start_mv_sad;
+  vp9_sad_fn_ptr_t sad_fn_ptr;
 
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   const int sr = get_search_range(cpi);
+  const MvLimits tmp_mv_limits = x->mv_limits;
   step_param += sr;
   further_steps -= sr;
 
+  if (!intersect_limits_with_mv_max(&x->mv_limits, ref_mv)) {
+    return;
+  }
+
   // Override the default variance function to use MSE.
   v_fn_ptr.vf = get_block_variance_fn(bsize);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -383,10 +465,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+  // Calculate SAD of the start mv
+  clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full,
+                                  cpi->fn_ptr[bsize].sdf, x->sadperbit16);
+  sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf;
+  sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df;
+
   // Center the initial step/diamond search on best mv.
-  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
-                                    step_param, x->sadperbit16, &num00,
-                                    &v_fn_ptr, ref_mv);
+  tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad,
+                                    &tmp_mv, step_param, x->sadperbit16, &num00,
+                                    &sad_fn_ptr, ref_mv);
   if (tmp_err < INT_MAX)
     tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
   if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
@@ -406,9 +496,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     if (num00) {
       --num00;
     } else {
-      tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
-                                        step_param + n, x->sadperbit16, &num00,
-                                        &v_fn_ptr, ref_mv);
+      tmp_err = cpi->diamond_search_sad(
+          x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n,
+          x->sadperbit16, &num00, &sad_fn_ptr, ref_mv);
       if (tmp_err < INT_MAX)
         tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
       if (tmp_err < INT_MAX - new_mv_mode_penalty)
@@ -420,6 +510,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
       }
     }
   }
+  x->mv_limits = tmp_mv_limits;
 }
 
 static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
@@ -460,12 +551,11 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) {
   if (cm->use_highbitdepth) {
     switch (cm->bit_depth) {
       case VPX_BITS_8: ret_val = thresh; break;
-      case VPX_BITS_10: ret_val = thresh >> 4; break;
-      case VPX_BITS_12: ret_val = thresh >> 8; break;
+      case VPX_BITS_10: ret_val = thresh << 4; break;
       default:
-        assert(0 &&
-               "cm->bit_depth should be VPX_BITS_8, "
-               "VPX_BITS_10 or VPX_BITS_12");
+        assert(cm->bit_depth == VPX_BITS_12);
+        ret_val = thresh << 8;
+        break;
     }
   }
 #else
@@ -487,11 +577,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) {
     switch (cm->bit_depth) {
       case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break;
       case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break;
-      case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break;
       default:
-        assert(0 &&
-               "cm->bit_depth should be VPX_BITS_8, "
-               "VPX_BITS_10 or VPX_BITS_12");
+        assert(cm->bit_depth == VPX_BITS_12);
+        ret_val = UL_INTRA_THRESH << 4;
+        break;
     }
   }
 #else
@@ -508,11 +597,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
     switch (cm->bit_depth) {
       case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break;
       case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break;
-      case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break;
       default:
-        assert(0 &&
-               "cm->bit_depth should be VPX_BITS_8, "
-               "VPX_BITS_10 or VPX_BITS_12");
+        assert(cm->bit_depth == VPX_BITS_12);
+        ret_val = SMOOTH_INTRA_THRESH << 8;
+        break;
     }
   }
 #else
@@ -522,14 +610,14 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
 }
 
 #define FP_DN_THRESH 8
-#define FP_MAX_DN_THRESH 16
+#define FP_MAX_DN_THRESH 24
 #define KERNEL_SIZE 3
 
-// Baseline Kernal weights for first pass noise metric
-static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
+// Baseline Kernel weights for first pass noise metric
+static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
                                                              2, 1, 2, 1 };
 
-// Estimate noise at a single point based on the impace of a spatial kernal
+// Estimate noise at a single point based on the impact of a spatial kernel
 // on the point value
 static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
   int sum_weight = 0;
@@ -539,23 +627,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
   int diff;
   int dn_diff;
   uint8_t *tmp_ptr;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
   uint8_t dn_val;
   uint8_t centre_val = *src_ptr;
 
-  kernal_ptr = fp_dn_kernal_3;
+  kernel_ptr = fp_dn_kernel_3;
 
-  // Apply the kernal
+  // Apply the kernel
   tmp_ptr = src_ptr - stride - 1;
   for (i = 0; i < KERNEL_SIZE; ++i) {
     for (j = 0; j < KERNEL_SIZE; ++j) {
       diff = abs((int)centre_val - (int)tmp_ptr[j]);
       max_diff = VPXMAX(max_diff, diff);
       if (diff <= FP_DN_THRESH) {
-        sum_weight += *kernal_ptr;
-        sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr;
+        sum_weight += *kernel_ptr;
+        sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr;
       }
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -581,13 +669,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
   int dn_diff;
   uint8_t *tmp_ptr;
   uint16_t *tmp_ptr16;
-  uint8_t *kernal_ptr;
+  uint8_t *kernel_ptr;
   uint16_t dn_val;
   uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr);
 
-  kernal_ptr = fp_dn_kernal_3;
+  kernel_ptr = fp_dn_kernel_3;
 
-  // Apply the kernal
+  // Apply the kernel
   tmp_ptr = src_ptr - stride - 1;
   for (i = 0; i < KERNEL_SIZE; ++i) {
     tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr);
@@ -595,10 +683,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
       diff = abs((int)centre_val - (int)tmp_ptr16[j]);
       max_diff = VPXMAX(max_diff, diff);
       if (diff <= FP_DN_THRESH) {
-        sum_weight += *kernal_ptr;
-        sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr;
+        sum_weight += *kernel_ptr;
+        sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr;
       }
-      ++kernal_ptr;
+      ++kernel_ptr;
     }
     tmp_ptr += stride;
   }
@@ -646,37 +734,166 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) {
   return block_noise << 2;  // Scale << 2 to account for sampling.
 }
 
-#define INVALID_ROW -1
-void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+// This function is called to test the functionality of row based
+// multi-threading in unit tests for bit-exactness
+static void accumulate_floating_point_stats(VP9_COMP *cpi,
+                                            TileDataEnc *first_tile_col) {
+  VP9_COMMON *const cm = &cpi->common;
   int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->td.mb;
+  first_tile_col->fp_data.intra_factor = 0;
+  first_tile_col->fp_data.brightness_factor = 0;
+  first_tile_col->fp_data.neutral_count = 0;
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+      first_tile_col->fp_data.intra_factor +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor;
+      first_tile_col->fp_data.brightness_factor +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor;
+      first_tile_col->fp_data.neutral_count +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count;
+    }
+  }
+}
+
+static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
+                                 FIRSTPASS_DATA *fp_acc_data) {
+  VP9_COMMON *const cm = &cpi->common;
+  // The minimum error here insures some bit allocation to frames even
+  // in static regions. The allocation per MB declines for larger formats
+  // where the typical "real" energy per MB also falls.
+  // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+  // number of mbs is proportional to the image area.
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  const double min_err = 200 * sqrt(num_mbs);
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) ||
+      (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+    fp_acc_data->image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (fp_acc_data->image_data_start_row > 0) {
+    fp_acc_data->intra_skip_count =
+        VPXMAX(0, fp_acc_data->intra_skip_count -
+                      (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+  }
+
+  fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
+  fp_acc_data->brightness_factor =
+      fp_acc_data->brightness_factor / (double)num_mbs;
+  fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor;
+
+  fps->frame = cm->current_video_frame;
+  fps->spatial_layer_id = cpi->svc.spatial_layer_id;
+
+  fps->coded_error =
+      ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs;
+  fps->sr_coded_error =
+      ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs;
+  fps->intra_error =
+      ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs;
+
+  fps->frame_noise_energy =
+      (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
+  fps->count = 1.0;
+  fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs;
+  fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs;
+  fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs;
+  fps->pcnt_intra_low = (double)(fp_acc_data->intra_count_low) / num_mbs;
+  fps->pcnt_intra_high = (double)(fp_acc_data->intra_count_high) / num_mbs;
+  fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs;
+  fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs;
+  fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row);
+  // Currently set to 0 as most issues relate to letter boxing.
+  fps->inactive_zone_cols = (double)0;
+
+  if (fp_acc_data->mvcount > 0) {
+    fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs;
+    fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
+    fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
+    fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
+    fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount;
+    fps->MVrv = ((double)(fp_acc_data->sum_mvrs) -
+                 ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) /
+                  fp_acc_data->mvcount)) /
+                fp_acc_data->mvcount;
+    fps->MVcv = ((double)(fp_acc_data->sum_mvcs) -
+                 ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) /
+                  fp_acc_data->mvcount)) /
+                fp_acc_data->mvcount;
+    fps->mv_in_out_count =
+        (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
+    fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
+  } else {
+    fps->new_mv_count = 0.0;
+    fps->MVr = 0.0;
+    fps->mvr_abs = 0.0;
+    fps->MVc = 0.0;
+    fps->mvc_abs = 0.0;
+    fps->MVrv = 0.0;
+    fps->MVcv = 0.0;
+    fps->mv_in_out_count = 0.0;
+    fps->pcnt_motion = 0.0;
+  }
+}
+
+static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
+                                      FIRSTPASS_DATA *fp_acc_data) {
+  this_tile->fp_data.intra_factor += fp_acc_data->intra_factor;
+  this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor;
+  this_tile->fp_data.coded_error += fp_acc_data->coded_error;
+  this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error;
+  this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy;
+  this_tile->fp_data.intra_error += fp_acc_data->intra_error;
+  this_tile->fp_data.intercount += fp_acc_data->intercount;
+  this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count;
+  this_tile->fp_data.neutral_count += fp_acc_data->neutral_count;
+  this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low;
+  this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high;
+  this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+  this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count;
+  this_tile->fp_data.mvcount += fp_acc_data->mvcount;
+  this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
+  this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
+  this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc;
+  this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs;
+  this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs;
+  this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs;
+  this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors;
+  this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count;
+  const int min_start_row = VPXMIN(this_tile->fp_data.image_data_start_row,
+                                   fp_acc_data->image_data_start_row);
+  this_tile->fp_data.image_data_start_row =
+      (min_start_row == INVALID_ROW)
+          ? VPXMAX(this_tile->fp_data.image_data_start_row,
+                   fp_acc_data->image_data_start_row)
+          : min_start_row;
+}
+
+#define NZ_MOTION_PENALTY 128
+#define INTRA_MODE_PENALTY 1024
+void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
+                                       FIRSTPASS_DATA *fp_acc_data,
+                                       TileDataEnc *tile_data, MV *best_ref_mv,
+                                       int mb_row) {
+  int mb_col;
+  MACROBLOCK *const x = &td->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  TileInfo tile;
+  TileInfo tile = tile_data->tile_info;
+  const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1);
+  const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1);
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
-  int i;
+  const PICK_MODE_CONTEXT *ctx = &td->pc_root->none;
+  int i, c;
+  int num_mb_cols = get_num_cols(tile_data->tile_info, 1);
 
   int recon_yoffset, recon_uvoffset;
-  int64_t intra_error = 0;
-  int64_t coded_error = 0;
-  int64_t sr_coded_error = 0;
-  int64_t frame_noise_energy = 0;
-
-  int sum_mvr = 0, sum_mvc = 0;
-  int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int64_t sum_mvrs = 0, sum_mvcs = 0;
-  int mvcount = 0;
-  int intercount = 0;
-  int second_ref_count = 0;
   const int intrapenalty = INTRA_MODE_PENALTY;
-  double neutral_count;
-  int intra_skip_count = 0;
-  int intra_smooth_count = 0;
-  int image_data_start_row = INVALID_ROW;
-  int sum_in_vectors = 0;
-  TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = { 0, 0 };
   int recon_y_stride, recon_uv_stride, uv_mb_height;
 
@@ -685,76 +902,480 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
 
-  LAYER_CONTEXT *const lc =
-      is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
-                           : NULL;
-  double intra_factor;
-  double brightness_factor;
-  BufferPool *const pool = cm->buffer_pool;
   MODE_INFO mi_above, mi_left;
 
+  double mb_intra_factor;
+  double mb_brightness_factor;
+  double mb_neutral_count;
+  int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH);
+
+  MV *first_top_mv = &tile_data->firstpass_top_mv;
+  MV last_nonzero_mv = { 0, 0 };
+
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
-  assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
+  assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
+  xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start;
+  xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    p[i].eobs = ctx->eobs_pbuf[i][1];
   }
-#endif
 
-  vpx_clear_system_state();
+  recon_y_stride = new_yv12->y_stride;
+  recon_uv_stride = new_yv12->uv_stride;
+  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
-  intra_factor = 0.0;
-  brightness_factor = 0.0;
-  neutral_count = 0.0;
+  // Reset above block coeffs.
+  recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16;
+  recon_uvoffset =
+      (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height;
 
-  set_first_pass_params(cpi);
-  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+  x->mv_limits.row_max =
+      ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
 
-  if (lc != NULL) {
-    twopass = &lc->twopass;
+  for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) {
+    int this_error;
+    int this_intra_error;
+    const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+    const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+    double log_intra;
+    int level_sample;
+    const int mb_index = mb_row * cm->mb_cols + mb_col;
 
-    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
-    cpi->ref_frame_flags = VP9_LAST_FLAG;
+    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c);
 
-    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
-        REF_FRAMES) {
-      cpi->gld_fb_idx =
-          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
-      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
-      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
+    if (mb_col == mb_col_start) {
+      last_nonzero_mv = *first_top_mv;
+    }
+
+    // Adjust to the next column of MBs.
+    x->plane[0].src.buf = cpi->Source->y_buffer +
+                          mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
+    x->plane[1].src.buf = cpi->Source->u_buffer +
+                          mb_row * uv_mb_height * x->plane[1].src.stride +
+                          mb_col * uv_mb_height;
+    x->plane[2].src.buf = cpi->Source->v_buffer +
+                          mb_row * uv_mb_height * x->plane[1].src.stride +
+                          mb_col * uv_mb_height;
+
+    vpx_clear_system_state();
+
+    xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+    xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+    xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+    set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                   mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows,
+                   cm->mi_cols);
+    // Are edges available for intra prediction?
+    // Since the firstpass does not populate the mi_grid_visible,
+    // above_mi/left_mi must be overwritten with a nonzero value when edges
+    // are available.  Required by vp9_predict_intra_block().
+    xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
+    xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL;
+
+    // Do intra 16x16 prediction.
+    x->skip_encode = 0;
+    x->fp_src_pred = 0;
+    // Do intra prediction based on source pixels for tile boundaries
+    if (mb_col == mb_col_start && mb_col != 0) {
+      xd->left_mi = &mi_left;
+      x->fp_src_pred = 1;
+    }
+    xd->mi[0]->mode = DC_PRED;
+    xd->mi[0]->tx_size =
+        use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+    // Fix - zero the 16x16 block first. This ensures correct this_error for
+    // block sizes smaller than 16x16.
+    vp9_zero_array(x->plane[0].src_diff, 256);
+    vp9_encode_intra_block_plane(x, bsize, 0, 0);
+    this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+    this_intra_error = this_error;
+
+    // Keep a record of blocks that have very low intra error residual
+    // (i.e. are in effect completely flat and untextured in the intra
+    // domain). In natural videos this is uncommon, but it is much more
+    // common in animations, graphics and screen content, so may be used
+    // as a signal to detect these types of content.
+    if (this_error < get_ul_intra_threshold(cm)) {
+      ++(fp_acc_data->intra_skip_count);
+    } else if ((mb_col > 0) &&
+               (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+      fp_acc_data->image_data_start_row = mb_row;
+    }
+
+    // Blocks that are mainly smooth in the intra domain.
+    // Some special accounting for CQ but also these are better for testing
+    // noise levels.
+    if (this_error < get_smooth_intra_threshold(cm)) {
+      ++(fp_acc_data->intra_smooth_count);
+    }
+
+    // Special case noise measurement for first frame.
+    if (cm->current_video_frame == 0) {
+      if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+        fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+      } else {
+        fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+      }
+    }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      switch (cm->bit_depth) {
+        case VPX_BITS_8: break;
+        case VPX_BITS_10: this_error >>= 4; break;
+        default:
+          assert(cm->bit_depth == VPX_BITS_12);
+          this_error >>= 8;
+          break;
+      }
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    vpx_clear_system_state();
+    log_intra = log(this_error + 1.0);
+    if (log_intra < 10.0) {
+      mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05);
+      fp_acc_data->intra_factor += mb_intra_factor;
+      if (cpi->row_mt_bit_exact)
+        cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor =
+            mb_intra_factor;
     } else {
-      cpi->refresh_golden_frame = 0;
+      fp_acc_data->intra_factor += 1.0;
+      if (cpi->row_mt_bit_exact)
+        cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0;
     }
 
-    if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
-
-    vp9_scale_references(cpi);
-
-    // Use either last frame or alt frame for motion search.
-    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-      first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
-      if (first_ref_buf == NULL)
-        first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth)
+      level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+    else
+      level_sample = x->plane[0].src.buf[0];
+#else
+    level_sample = x->plane[0].src.buf[0];
+#endif
+    if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+      mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample));
+      fp_acc_data->brightness_factor += mb_brightness_factor;
+      if (cpi->row_mt_bit_exact)
+        cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+            mb_brightness_factor;
+    } else {
+      fp_acc_data->brightness_factor += 1.0;
+      if (cpi->row_mt_bit_exact)
+        cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+            1.0;
     }
 
-    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-      gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
-      if (gld_yv12 == NULL) {
-        gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    // Intrapenalty below deals with situations where the intra and inter
+    // error scores are very low (e.g. a plain black frame).
+    // We do not have special cases in first pass for 0,0 and nearest etc so
+    // all inter modes carry an overhead cost estimate for the mv.
+    // When the error score is very low this causes us to pick all or lots of
+    // INTRA modes and throw lots of key frames.
+    // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+    this_error += intrapenalty;
+
+    // Accumulate the intra error.
+    fp_acc_data->intra_error += (int64_t)this_error;
+
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+    x->mv_limits.col_max =
+        ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+    // Other than for intra-only frame do a motion search.
+    if (!frame_is_intra_only(cm)) {
+      int tmp_err, motion_error, this_motion_error, raw_motion_error;
+      // Assume 0,0 motion with no mv overhead.
+      MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+      struct buf_2d unscaled_last_source_buf_2d;
+      vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+
+      xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        this_motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8);
+      } else {
+        motion_error =
+            get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+        this_motion_error = motion_error;
+      }
+#else
+      motion_error =
+          get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+      this_motion_error = motion_error;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      // Compute the motion error of the 0,0 motion using the last source
+      // frame as the reference. Skip the further motion search on
+      // reconstructed frame if this error is very small.
+      unscaled_last_source_buf_2d.buf =
+          cpi->unscaled_last_source->y_buffer + recon_yoffset;
+      unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        raw_motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+      } else {
+        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                &unscaled_last_source_buf_2d);
+      }
+#else
+      raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                              &unscaled_last_source_buf_2d);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      if (raw_motion_error > NZ_MOTION_PENALTY) {
+        // Test last reference frame using the previous best mv as the
+        // starting point (best reference) for the search.
+        first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+
+        v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        this_motion_error =
+            vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0);
+
+        // If the current best reference mv is not centered on 0,0 then do a
+        // 0,0 based search as well.
+        if (!is_zero_mv(best_ref_mv)) {
+          tmp_err = INT_MAX;
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+          if (tmp_err < motion_error) {
+            motion_error = tmp_err;
+            mv = tmp_mv;
+            this_motion_error =
+                vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0);
+          }
+        }
+
+        // Search in an older reference frame.
+        if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+          // Assume 0,0 motion with no mv overhead.
+          int gf_motion_error;
+
+          xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            gf_motion_error = highbd_get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+          } else {
+            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                   &xd->plane[0].pre[0]);
+          }
+#else
+          gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                 &xd->plane[0].pre[0]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
+
+          if (gf_motion_error < motion_error && gf_motion_error < this_error)
+            ++(fp_acc_data->second_ref_count);
+
+          // Reset to last frame as reference buffer.
+          xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for) accumulation of "coded_error" for
+          // the last frame.
+          if (gf_motion_error < this_error)
+            fp_acc_data->sr_coded_error += gf_motion_error;
+          else
+            fp_acc_data->sr_coded_error += this_error;
+        } else {
+          fp_acc_data->sr_coded_error += motion_error;
+        }
+      } else {
+        fp_acc_data->sr_coded_error += motion_error;
+      }
+
+      // Start by assuming that intra mode is best.
+      best_ref_mv->row = 0;
+      best_ref_mv->col = 0;
+
+      if (motion_error <= this_error) {
+        vpx_clear_system_state();
+
+        // Keep a count of cases where the inter and intra were very close
+        // and very low. This helps with scene cut detection for example in
+        // cropped clips with black bars at the sides or top and bottom.
+        if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+            (this_error < (2 * intrapenalty))) {
+          fp_acc_data->neutral_count += 1.0;
+          if (cpi->row_mt_bit_exact)
+            cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+                1.0;
+          // Also track cases where the intra is not much worse than the inter
+          // and use this in limiting the GF/arf group length.
+        } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                   (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+          mb_neutral_count =
+              (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+          fp_acc_data->neutral_count += mb_neutral_count;
+          if (cpi->row_mt_bit_exact)
+            cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+                mb_neutral_count;
+        }
+
+        mv.row *= 8;
+        mv.col *= 8;
+        this_error = motion_error;
+        xd->mi[0]->mode = NEWMV;
+        xd->mi[0]->mv[0].as_mv = mv;
+        xd->mi[0]->tx_size = TX_4X4;
+        xd->mi[0]->ref_frame[0] = LAST_FRAME;
+        xd->mi[0]->ref_frame[1] = NO_REF_FRAME;
+        vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+        vp9_encode_sby_pass1(x, bsize);
+        fp_acc_data->sum_mvr += mv.row;
+        fp_acc_data->sum_mvr_abs += abs(mv.row);
+        fp_acc_data->sum_mvc += mv.col;
+        fp_acc_data->sum_mvc_abs += abs(mv.col);
+        fp_acc_data->sum_mvrs += mv.row * mv.row;
+        fp_acc_data->sum_mvcs += mv.col * mv.col;
+        ++(fp_acc_data->intercount);
+
+        *best_ref_mv = mv;
+
+        if (!is_zero_mv(&mv)) {
+          ++(fp_acc_data->mvcount);
+          if (!is_equal_mv(&mv, &last_nonzero_mv)) {
+            ++(fp_acc_data->new_mv_count);
+          }
+          last_nonzero_mv = mv;
+
+          // Does the row vector point inwards or outwards?
+          if (mb_row < cm->mb_rows / 2) {
+            if (mv.row > 0)
+              --(fp_acc_data->sum_in_vectors);
+            else if (mv.row < 0)
+              ++(fp_acc_data->sum_in_vectors);
+          } else if (mb_row > cm->mb_rows / 2) {
+            if (mv.row > 0)
+              ++(fp_acc_data->sum_in_vectors);
+            else if (mv.row < 0)
+              --(fp_acc_data->sum_in_vectors);
+          }
+
+          // Does the col vector point inwards or outwards?
+          if (mb_col < cm->mb_cols / 2) {
+            if (mv.col > 0)
+              --(fp_acc_data->sum_in_vectors);
+            else if (mv.col < 0)
+              ++(fp_acc_data->sum_in_vectors);
+          } else if (mb_col > cm->mb_cols / 2) {
+            if (mv.col > 0)
+              ++(fp_acc_data->sum_in_vectors);
+            else if (mv.col < 0)
+              --(fp_acc_data->sum_in_vectors);
+          }
+        }
+        if (this_intra_error < scaled_low_intra_thresh) {
+          fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+        } else {
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+        }
+      } else {  // Intra < inter error
+        if (this_intra_error < scaled_low_intra_thresh) {
+          fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+          if (this_motion_error < scaled_low_intra_thresh) {
+            fp_acc_data->intra_count_low += 1.0;
+          } else {
+            fp_acc_data->intra_count_high += 1.0;
+          }
+        } else {
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+          fp_acc_data->intra_count_high += 1.0;
+        }
       }
     } else {
-      gld_yv12 = NULL;
+      fp_acc_data->sr_coded_error += (int64_t)this_error;
     }
+    fp_acc_data->coded_error += (int64_t)this_error;
 
-    set_ref_ptrs(cm, xd,
-                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
-                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
+    if (mb_col == mb_col_start) {
+      *first_top_mv = last_nonzero_mv;
+    }
+    recon_yoffset += 16;
+    recon_uvoffset += uv_mb_height;
 
-    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                        &cpi->scaled_source, 0);
+    // Accumulate row level stats to the corresponding tile stats
+    if (cpi->row_mt && mb_col == mb_col_end - 1)
+      accumulate_fp_mb_row_stat(tile_data, fp_acc_data);
+
+    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c,
+                                    num_mb_cols);
   }
+  vpx_clear_system_state();
+}
+
+static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row;
+  TileDataEnc tile_data;
+  TileInfo *tile = &tile_data.tile_info;
+  MV zero_mv = { 0, 0 };
+  MV best_ref_mv;
+  // Tiling is ignored in the first pass.
+  vp9_tile_init(tile, cm, 0, 0);
+  tile_data.firstpass_top_mv = zero_mv;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    best_ref_mv = zero_mv;
+    vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data,
+                                      &best_ref_mv, mb_row);
+  }
+}
+
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TWO_PASS *twopass = &cpi->twopass;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+  BufferPool *const pool = cm->buffer_pool;
+
+  FIRSTPASS_DATA fp_temp_data;
+  FIRSTPASS_DATA *fp_acc_data = &fp_temp_data;
+
+  vpx_clear_system_state();
+  vp9_zero(fp_temp_data);
+  fp_acc_data->image_data_start_row = INVALID_ROW;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+  set_first_pass_params(cpi);
+  vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth), 0);
 
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
@@ -770,518 +1391,52 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
 
   vp9_frame_init_quantizer(cpi);
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    p[i].coeff = ctx->coeff_pbuf[i][1];
-    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
-    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
-    p[i].eobs = ctx->eobs_pbuf[i][1];
-  }
   x->skip_recode = 0;
 
   vp9_init_mv_probs(cm);
   vp9_initialize_rd_consts(cpi);
 
-  // Tiling is ignored in the first pass.
-  vp9_tile_init(&tile, cm, 0, 0);
+  cm->log2_tile_rows = 0;
 
-  recon_y_stride = new_yv12->y_stride;
-  recon_uv_stride = new_yv12->uv_stride;
-  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
-
-  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    MV best_ref_mv = { 0, 0 };
-
-    // Reset above block coeffs.
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
-
-    // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders.
-    x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
-    x->mv_limits.row_max =
-        ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
-
-    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
-      int this_error;
-      int this_intra_error;
-      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
-      double log_intra;
-      int level_sample;
-
-#if CONFIG_FP_MB_STATS
-      const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
-
-      vpx_clear_system_state();
-
-      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
-      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
-      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
-      xd->mi[0]->sb_type = bsize;
-      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-      set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
-                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
-                     cm->mi_rows, cm->mi_cols);
-      // Are edges available for intra prediction?
-      // Since the firstpass does not populate the mi_grid_visible,
-      // above_mi/left_mi must be overwritten with a nonzero value when edges
-      // are available.  Required by vp9_predict_intra_block().
-      xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
-      xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL;
-
-      // Do intra 16x16 prediction.
-      x->skip_encode = 0;
-      xd->mi[0]->mode = DC_PRED;
-      xd->mi[0]->tx_size =
-          use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-
-      // Set the 16x16 src_diff block to zero, which ensures correct this_error
-      // calculation for block sizes smaller than 16x16.
-      vp9_zero_array(x->plane[0].src_diff, 256);
-      vp9_encode_intra_block_plane(x, bsize, 0, 0);
-      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
-      this_intra_error = this_error;
-
-      // Keep a record of blocks that have very low intra error residual
-      // (i.e. are in effect completely flat and untextured in the intra
-      // domain). In natural videos this is uncommon, but it is much more
-      // common in animations, graphics and screen content, so may be used
-      // as a signal to detect these types of content.
-      if (this_error < get_ul_intra_threshold(cm)) {
-        ++intra_skip_count;
-      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
-        image_data_start_row = mb_row;
-      }
-
-      // Blocks that are mainly smooth in the intra domain.
-      // Some special accounting for CQ but also these are better for testing
-      // noise levels.
-      if (this_error < get_smooth_intra_threshold(cm)) {
-        ++intra_smooth_count;
-      }
-
-      // Special case noise measurement for first frame.
-      if (cm->current_video_frame == 0) {
-        if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
-          frame_noise_energy += fp_estimate_block_noise(x, bsize);
-        } else {
-          frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-        }
-      }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        switch (cm->bit_depth) {
-          case VPX_BITS_8: break;
-          case VPX_BITS_10: this_error >>= 4; break;
-          case VPX_BITS_12: this_error >>= 8; break;
-          default:
-            assert(0 &&
-                   "cm->bit_depth should be VPX_BITS_8, "
-                   "VPX_BITS_10 or VPX_BITS_12");
-            return;
-        }
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-      vpx_clear_system_state();
-      log_intra = log(this_error + 1.0);
-      if (log_intra < 10.0)
-        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
-      else
-        intra_factor += 1.0;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth)
-        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
-      else
-        level_sample = x->plane[0].src.buf[0];
-#else
-      level_sample = x->plane[0].src.buf[0];
-#endif
-      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
-        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
-      else
-        brightness_factor += 1.0;
-
-      // Intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (e.g. a plain black frame).
-      // We do not have special cases in first pass for 0,0 and nearest etc so
-      // all inter modes carry an overhead cost estimate for the mv.
-      // When the error score is very low this causes us to pick all or lots of
-      // INTRA modes and throw lots of key frames.
-      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
-      this_error += intrapenalty;
-
-      // Accumulate the intra error.
-      intra_error += (int64_t)this_error;
-
-#if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        // initialization
-        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-      }
-#endif
-
-      // Set up limit values for motion vectors to prevent them extending
-      // outside the UMV borders.
-      x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_limits.col_max =
-          ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
-
-      // Other than for the first frame do a motion search.
-      if ((lc == NULL && cm->current_video_frame > 0) ||
-          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
-        int tmp_err, motion_error, raw_motion_error;
-        // Assume 0,0 motion with no mv overhead.
-        MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
-        struct buf_2d unscaled_last_source_buf_2d;
-
-        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-        } else {
-          motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                              &xd->plane[0].pre[0]);
-        }
-#else
-        motion_error =
-            get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-        // Compute the motion error of the 0,0 motion using the last source
-        // frame as the reference. Skip the further motion search on
-        // reconstructed frame if this error is small.
-        unscaled_last_source_buf_2d.buf =
-            cpi->unscaled_last_source->y_buffer + recon_yoffset;
-        unscaled_last_source_buf_2d.stride =
-            cpi->unscaled_last_source->y_stride;
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          raw_motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
-        } else {
-          raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                  &unscaled_last_source_buf_2d);
-        }
-#else
-        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                &unscaled_last_source_buf_2d);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-        // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25 || lc != NULL) {
-          // Test last reference frame using the previous best mv as the
-          // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
-
-          // If the current best reference mv is not centered on 0,0 then do a
-          // 0,0 based search as well.
-          if (!is_zero_mv(&best_ref_mv)) {
-            tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
-
-            if (tmp_err < motion_error) {
-              motion_error = tmp_err;
-              mv = tmp_mv;
-            }
-          }
-
-          // Search in an older reference frame.
-          if (((lc == NULL && cm->current_video_frame > 1) ||
-               (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
-              gld_yv12 != NULL) {
-            // Assume 0,0 motion with no mv overhead.
-            int gf_motion_error;
-
-            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-#if CONFIG_VP9_HIGHBITDEPTH
-            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-              gf_motion_error = highbd_get_prediction_error(
-                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-            } else {
-              gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                     &xd->plane[0].pre[0]);
-            }
-#else
-            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                   &xd->plane[0].pre[0]);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
-                                     &gf_motion_error);
-
-            if (gf_motion_error < motion_error && gf_motion_error < this_error)
-              ++second_ref_count;
-
-            // Reset to last frame as reference buffer.
-            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
-            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
-
-            // In accumulating a score for the older reference frame take the
-            // best of the motion predicted score and the intra coded error
-            // (just as will be done for) accumulation of "coded_error" for
-            // the last frame.
-            if (gf_motion_error < this_error)
-              sr_coded_error += gf_motion_error;
-            else
-              sr_coded_error += this_error;
-          } else {
-            sr_coded_error += motion_error;
-          }
-        } else {
-          sr_coded_error += motion_error;
-        }
-
-        // Start by assuming that intra mode is best.
-        best_ref_mv.row = 0;
-        best_ref_mv.col = 0;
-
-#if CONFIG_FP_MB_STATS
-        if (cpi->use_fp_mb_stats) {
-          // intra prediction statistics
-          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-          if (this_error > FPMB_ERROR_LARGE_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
-          } else if (this_error < FPMB_ERROR_SMALL_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
-          }
-        }
-#endif
-
-        if (motion_error <= this_error) {
-          vpx_clear_system_state();
-
-          // Keep a count of cases where the inter and intra were very close
-          // and very low. This helps with scene cut detection for example in
-          // cropped clips with black bars at the sides or top and bottom.
-          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count += 1.0;
-            // Also track cases where the intra is not much worse than the inter
-            // and use this in limiting the GF/arf group length.
-          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
-                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
-            neutral_count +=
-                (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
-          }
-
-          mv.row *= 8;
-          mv.col *= 8;
-          this_error = motion_error;
-          xd->mi[0]->mode = NEWMV;
-          xd->mi[0]->mv[0].as_mv = mv;
-          xd->mi[0]->tx_size = TX_4X4;
-          xd->mi[0]->ref_frame[0] = LAST_FRAME;
-          xd->mi[0]->ref_frame[1] = NONE;
-          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
-          vp9_encode_sby_pass1(x, bsize);
-          sum_mvr += mv.row;
-          sum_mvr_abs += abs(mv.row);
-          sum_mvc += mv.col;
-          sum_mvc_abs += abs(mv.col);
-          sum_mvrs += mv.row * mv.row;
-          sum_mvcs += mv.col * mv.col;
-          ++intercount;
-
-          best_ref_mv = mv;
-
-#if CONFIG_FP_MB_STATS
-          if (cpi->use_fp_mb_stats) {
-            // inter prediction statistics
-            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-            if (this_error > FPMB_ERROR_LARGE_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_LARGE_MASK;
-            } else if (this_error < FPMB_ERROR_SMALL_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_SMALL_MASK;
-            }
-          }
-#endif
-
-          if (!is_zero_mv(&mv)) {
-            ++mvcount;
-
-#if CONFIG_FP_MB_STATS
-            if (cpi->use_fp_mb_stats) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] &=
-                  ~FPMB_MOTION_ZERO_MASK;
-              // check estimated motion direction
-              if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
-                // right direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_RIGHT_MASK;
-              } else if (mv.as_mv.row < 0 &&
-                         abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
-                // up direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_UP_MASK;
-              } else if (mv.as_mv.col < 0 &&
-                         abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
-                // left direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_LEFT_MASK;
-              } else {
-                // down direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_DOWN_MASK;
-              }
-            }
-#endif
-
-            // Does the row vector point inwards or outwards?
-            if (mb_row < cm->mb_rows / 2) {
-              if (mv.row > 0)
-                --sum_in_vectors;
-              else if (mv.row < 0)
-                ++sum_in_vectors;
-            } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.row > 0)
-                ++sum_in_vectors;
-              else if (mv.row < 0)
-                --sum_in_vectors;
-            }
-
-            // Does the col vector point inwards or outwards?
-            if (mb_col < cm->mb_cols / 2) {
-              if (mv.col > 0)
-                --sum_in_vectors;
-              else if (mv.col < 0)
-                ++sum_in_vectors;
-            } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.col > 0)
-                ++sum_in_vectors;
-              else if (mv.col < 0)
-                --sum_in_vectors;
-            }
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-          } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
-            frame_noise_energy += fp_estimate_block_noise(x, bsize);
-          } else {  // 0,0 mv but high error
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-          }
-        } else {  // Intra < inter error
-          if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
-            frame_noise_energy += fp_estimate_block_noise(x, bsize);
-          else
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-        }
-      } else {
-        sr_coded_error += (int64_t)this_error;
-      }
-      coded_error += (int64_t)this_error;
-
-      // Adjust to the next column of MBs.
-      x->plane[0].src.buf += 16;
-      x->plane[1].src.buf += uv_mb_height;
-      x->plane[2].src.buf += uv_mb_height;
-
-      recon_yoffset += 16;
-      recon_uvoffset += uv_mb_height;
-    }
-
-    // Adjust to the next row of MBs.
-    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
-    x->plane[1].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
-    x->plane[2].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
-
-    vpx_clear_system_state();
-  }
-
-  // Clamp the image start to rows/2. This number of rows is discarded top
-  // and bottom as dead data so rows / 2 means the frame is blank.
-  if ((image_data_start_row > cm->mb_rows / 2) ||
-      (image_data_start_row == INVALID_ROW)) {
-    image_data_start_row = cm->mb_rows / 2;
-  }
-  // Exclude any image dead zone
-  if (image_data_start_row > 0) {
-    intra_skip_count =
-        VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
-  }
+  if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
+    CHECK_MEM_ERROR(
+        &cm->error, cpi->twopass.fp_mb_float_stats,
+        vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
 
   {
     FIRSTPASS_STATS fps;
-    // The minimum error here insures some bit allocation to frames even
-    // in static regions. The allocation per MB declines for larger formats
-    // where the typical "real" energy per MB also falls.
-    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
-    // number of mbs is proportional to the image area.
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    const double min_err = 200 * sqrt(num_mbs);
-
-    intra_factor = intra_factor / (double)num_mbs;
-    brightness_factor = brightness_factor / (double)num_mbs;
-    fps.weight = intra_factor * brightness_factor;
-
-    fps.frame = cm->current_video_frame;
-    fps.spatial_layer_id = cpi->svc.spatial_layer_id;
-    fps.coded_error = (double)(coded_error >> 8) + min_err;
-    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
-    fps.intra_error = (double)(intra_error >> 8) + min_err;
-    fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs;
-    fps.count = 1.0;
-    fps.pcnt_inter = (double)intercount / num_mbs;
-    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
-    fps.pcnt_neutral = (double)neutral_count / num_mbs;
-    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
-    fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs;
-    fps.inactive_zone_rows = (double)image_data_start_row;
-    // Currently set to 0 as most issues relate to letter boxing.
-    fps.inactive_zone_cols = (double)0;
-
-    if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
-      fps.MVc = (double)sum_mvc / mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv =
-          ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
-      fps.MVcv =
-          ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
-      fps.pcnt_motion = (double)mvcount / num_mbs;
+    TileDataEnc *first_tile_col;
+    if (!cpi->row_mt) {
+      cm->log2_tile_cols = 0;
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+      first_pass_encode(cpi, fp_acc_data);
+      first_pass_stat_calc(cpi, &fps, fp_acc_data);
     } else {
-      fps.MVr = 0.0;
-      fps.mvr_abs = 0.0;
-      fps.MVc = 0.0;
-      fps.mvc_abs = 0.0;
-      fps.MVrv = 0.0;
-      fps.MVcv = 0.0;
-      fps.mv_in_out_count = 0.0;
-      fps.pcnt_motion = 0.0;
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+      if (cpi->row_mt_bit_exact) {
+        cm->log2_tile_cols = 0;
+        vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs);
+      }
+      vp9_encode_fp_row_mt(cpi);
+      first_tile_col = &cpi->tile_data[0];
+      if (cpi->row_mt_bit_exact)
+        accumulate_floating_point_stats(cpi, first_tile_col);
+      first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
     }
 
-    // Dont allow a value of 0 for duration.
+    // Don't allow a value of 0 for duration.
     // (Section duration is also defaulted to minimum of 1.0).
     fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start));
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
-    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+    output_stats(&twopass->this_frame_stats);
     accumulate_stats(&twopass->total_stats, &fps);
-
-#if CONFIG_FP_MB_STATS
-    if (cpi->use_fp_mb_stats) {
-      output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list);
-    }
-#endif
   }
 
-  // Copy the previous Last Frame back into gf and and arf buffers if
+  // Copy the previous Last Frame back into gf and arf buffers if
   // the prediction is good enough... but also don't allow it to lag too far.
   if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
@@ -1299,50 +1454,38 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
 
   vpx_extend_frame_borders(new_yv12);
 
-  if (lc != NULL) {
-    vp9_update_reference_frames(cpi);
-  } else {
-    // The frame we just compressed now becomes the last frame.
-    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
-               cm->new_fb_idx);
-  }
+  // The frame we just compressed now becomes the last frame.
+  ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+             cm->new_fb_idx);
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
-  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
-      lc == NULL) {
+  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                cm->ref_frame_map[cpi->lst_fb_idx]);
   }
 
-  // Use this to see what the first pass reconstruction looks like.
-  if (0) {
-    char filename[512];
-    FILE *recon_file;
-    snprintf(filename, sizeof(filename), "enc%04d.yuv",
-             (int)cm->current_video_frame);
-
-    if (cm->current_video_frame == 0)
-      recon_file = fopen(filename, "wb");
-    else
-      recon_file = fopen(filename, "ab");
-
-    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
-    fclose(recon_file);
-  }
-
-  ++cm->current_video_frame;
+  // In the first pass, every frame is considered as a show frame.
+  update_frame_indexes(cm, /*show_frame=*/1);
   if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
 }
 
-static double calc_correction_factor(double err_per_mb, double err_divisor,
-                                     double pt_low, double pt_high, int q,
-                                     vpx_bit_depth_t bit_depth) {
-  const double error_term = err_per_mb / err_divisor;
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75,
+                                                            0.85, 0.90, 0.90,
+                                                            0.90, 1.00, 1.25 };
 
-  // Adjustment based on actual quantizer to power term.
-  const double power_term =
-      VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+                                     int q) {
+  const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor);
+  const int index = q >> 5;
+  double power_term;
+
+  assert((index >= 0) && (index < (QINDEX_RANGE >> 5)));
+
+  // Adjustment based on quantizer to the power term.
+  power_term =
+      q_pow_term[index] +
+      (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0);
 
   // Calculate correction factor.
   if (power_term < 1.0) assert(error_term >= 0.0);
@@ -1350,7 +1493,26 @@ static double calc_correction_factor(double err_per_mb, double err_divisor,
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-#define ERR_DIVISOR 115.0
+static double wq_err_divisor(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  unsigned int screen_area = (cm->width * cm->height);
+
+  // Use a different error per mb factor for calculating boost for
+  //  different formats.
+  if (screen_area <= 640 * 360) {
+    return 115.0;
+  } else if (screen_area < 1280 * 720) {
+    return 125.0;
+  } else if (screen_area <= 1920 * 1080) {
+    return 130.0;
+  } else if (screen_area < 3840 * 2160) {
+    return 150.0;
+  }
+
+  // Fall through to here only for 4K and above.
+  return 200.0;
+}
+
 #define NOISE_FACTOR_MIN 0.9
 #define NOISE_FACTOR_MAX 1.1
 static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
@@ -1359,6 +1521,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
+  double last_group_rate_err;
 
   // Clamp the target rate to VBR min / max limts.
   const int target_rate =
@@ -1367,44 +1530,54 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
   noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX);
   inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
 
+// TODO(jimbankoski): remove #if here or below when this has been
+// well tested.
+#if CONFIG_ALWAYS_ADJUST_BPM
+  // based on recent history adjust expectations of bits per macroblock.
+  last_group_rate_err =
+      (double)twopass->rolling_arf_group_actual_bits /
+      DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+  last_group_rate_err = fclamp(last_group_rate_err, 0.25, 4.0);
+  twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+  twopass->bpm_factor = fclamp(twopass->bpm_factor, 0.25, 4.0);
+#endif
+
   if (target_rate <= 0) {
     return rc->worst_quality;  // Highest value allowed
   } else {
     const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
                             ? cpi->initial_mbs
                             : cpi->common.MBs;
-    const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
-    const double av_err_per_mb = section_err / active_mbs;
+    const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone);
+    const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct);
+    const double av_err_per_mb = section_err / active_pct;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    double last_group_rate_err;
-    const int target_norm_bits_per_mb =
-        (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs);
+    const uint64_t target_norm_bits_per_mb =
+        ((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs;
     int q;
-    int is_svc_upper_layer = 0;
-
-    if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
-      is_svc_upper_layer = 1;
 
+// TODO(jimbankoski): remove #if here or above when this has been
+// well tested.
+#if !CONFIG_ALWAYS_ADJUST_BPM
     // based on recent history adjust expectations of bits per macroblock.
     last_group_rate_err =
         (double)twopass->rolling_arf_group_actual_bits /
         DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
-    last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+    last_group_rate_err = fclamp(last_group_rate_err, 0.25, 4.0);
     twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
-    twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
+    twopass->bpm_factor = fclamp(twopass->bpm_factor, 0.25, 4.0);
+#endif
 
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
-      const double factor = calc_correction_factor(
-          av_err_per_mb, ERR_DIVISOR,
-          is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW,
-          FACTOR_PT_HIGH, q, cpi->common.bit_depth);
+      const double factor =
+          calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q);
       const int bits_per_mb = vp9_rc_bits_per_mb(
           INTER_FRAME, q,
           factor * speed_term * cpi->twopass.bpm_factor * noise_factor,
           cpi->common.bit_depth);
-      if (bits_per_mb <= target_norm_bits_per_mb) break;
+      if ((uint64_t)bits_per_mb <= target_norm_bits_per_mb) break;
     }
 
     // Restriction on active max q for constrained quality mode.
@@ -1446,14 +1619,9 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width,
 }
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
-  SVC *const svc = &cpi->svc;
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int is_two_pass_svc =
-      (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1);
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass =
-      is_two_pass_svc ? &svc->layer_context[svc->spatial_layer_id].twopass
-                      : &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->twopass;
   double frame_rate;
   FIRSTPASS_STATS *stats;
 
@@ -1467,46 +1635,76 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   *stats = *twopass->stats_in_end;
   twopass->total_left_stats = *stats;
 
+  // Scan the first pass file and calculate a modified score for each
+  // frame that is used to distribute bits. The modified score is assumed
+  // to provide a linear basis for bit allocation. I.e., a frame A with a score
+  // that is double that of frame B will be allocated 2x as many bits.
+  {
+    double modified_score_total = 0.0;
+    const FIRSTPASS_STATS *s = twopass->stats_in;
+    double av_err;
+
+    if (oxcf->vbr_corpus_complexity) {
+      twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0;
+      av_err = get_distribution_av_err(cpi, twopass);
+    } else {
+      av_err = get_distribution_av_err(cpi, twopass);
+      // The first scan is unclamped and gives a raw average.
+      while (s < twopass->stats_in_end) {
+        modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err);
+        ++s;
+      }
+
+      // The average error from this first scan is used to define the midpoint
+      // error for the rate distribution function.
+      twopass->mean_mod_score =
+          modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+    }
+
+    // Second scan using clamps based on the previous cycle average.
+    // This may modify the total and average somewhat but we don't bother with
+    // further iterations.
+    modified_score_total = 0.0;
+    s = twopass->stats_in;
+    while (s < twopass->stats_in_end) {
+      modified_score_total +=
+          calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err);
+      ++s;
+    }
+    twopass->normalized_score_left = modified_score_total;
+
+    // If using Corpus wide VBR mode then update the clip target bandwidth to
+    // reflect how the clip compares to the rest of the corpus.
+    if (oxcf->vbr_corpus_complexity) {
+      oxcf->target_bandwidth =
+          (int64_t)((double)oxcf->target_bandwidth *
+                    (twopass->normalized_score_left / stats->count));
+    }
+
+#if COMPLEXITY_STATS_OUTPUT
+    {
+      FILE *compstats;
+      compstats = fopen("complexity_stats.stt", "a");
+      fprintf(compstats, "%10.3lf\n",
+              twopass->normalized_score_left / stats->count);
+      fclose(compstats);
+    }
+#endif
+  }
+
   frame_rate = 10000000.0 * stats->count / stats->duration;
   // Each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant. The frame rate prior to the first frame
   // encoded in the second pass is a guess. However, the sum duration is not.
   // It is calculated based on the actual durations of all frames from the
   // first pass.
-
-  if (is_two_pass_svc) {
-    vp9_update_spatial_layer_framerate(cpi, frame_rate);
-    twopass->bits_left =
-        (int64_t)(stats->duration *
-                  svc->layer_context[svc->spatial_layer_id].target_bandwidth /
-                  10000000.0);
-  } else {
-    vp9_new_framerate(cpi, frame_rate);
-    twopass->bits_left =
-        (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
-  }
+  vp9_new_framerate(cpi, frame_rate);
+  twopass->bits_left =
+      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
 
   // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
 
-  // Scan the first pass file and calculate a modified total error based upon
-  // the bias/power function used to allocate bits.
-  {
-    const double avg_error =
-        stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
-    const FIRSTPASS_STATS *s = twopass->stats_in;
-    double modified_error_total = 0.0;
-    twopass->modified_error_min =
-        (avg_error * oxcf->two_pass_vbrmin_section) / 100;
-    twopass->modified_error_max =
-        (avg_error * oxcf->two_pass_vbrmax_section) / 100;
-    while (s < twopass->stats_in_end) {
-      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
-      ++s;
-    }
-    twopass->modified_error_left = modified_error_total;
-  }
-
   // Reset the vbr bits off target counters
   rc->vbr_bits_off_target = 0;
   rc->vbr_bits_off_target_fast = 0;
@@ -1531,110 +1729,110 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   twopass->arnr_strength_adjustment = 0;
 }
 
-#define SR_DIFF_PART 0.0015
-#define INTRA_PART 0.005
-#define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
-#define LOW_CODED_ERR_PER_MB 10.0
-#define NCOUNT_FRAME_II_THRESH 6.0
-
-static double get_sr_decay_rate(const VP9_COMP *cpi,
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitrate boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const TWO_PASS *const twopass,
                                 const FIRSTPASS_STATS *frame) {
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                             : cpi->common.MBs;
-  double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
-  double modified_pct_inter;
-  double modified_pcnt_intra;
-  const double motion_amplitude_part =
-      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) /
-                            (cpi->initial_height + cpi->initial_width));
-
-  modified_pct_inter = frame->pcnt_inter;
-  if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) &&
-      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-       (double)NCOUNT_FRAME_II_THRESH)) {
-    modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
-  }
-  modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
 
+  // Do nothing if the second ref to last frame error difference is
+  // very small or even negative.
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part -
-               (INTRA_PART * modified_pcnt_intra);
+    const double sr_diff_part =
+        twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
+    double modified_pct_inter = frame->pcnt_inter;
+    double modified_pcnt_intra;
+
+    if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+        ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+         (double)NCOUNT_FRAME_II_THRESH)) {
+      modified_pct_inter =
+          frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
+    }
+    modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
-  return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT);
+  return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
 }
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMP *cpi,
-                                     const FIRSTPASS_STATS *frame) {
-  const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(cpi, frame);
+static double get_zero_motion_factor(const TWO_PASS *const twopass,
+                                     const FIRSTPASS_STATS *frame_stats) {
+  const double zero_motion_pct =
+      frame_stats->pcnt_inter - frame_stats->pcnt_motion;
+  double sr_decay = get_sr_decay_rate(twopass, frame_stats);
   return VPXMIN(sr_decay, zero_motion_pct);
 }
 
-#define ZM_POWER_FACTOR 0.75
+static double get_prediction_decay_rate(const TWO_PASS *const twopass,
+                                        const FIRSTPASS_STATS *frame_stats) {
+  const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats);
+  double zero_motion_factor =
+      twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
 
-static double get_prediction_decay_rate(const VP9_COMP *cpi,
-                                        const FIRSTPASS_STATS *next_frame) {
-  const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
-  const double zero_motion_factor =
-      (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
-                  ZM_POWER_FACTOR));
+  // Check that the zero motion factor is valid
+  assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0);
 
   return VPXMAX(zero_motion_factor,
                 (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
 }
 
+static int get_show_idx(const TWO_PASS *twopass) {
+  return (int)(twopass->stats_in - twopass->stats_in_start);
+}
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval,
-                                      int still_interval,
-                                      double loop_decay_rate,
-                                      double last_decay_rate) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  RATE_CONTROL *const rc = &cpi->rc;
-
-  // Break clause to detect very still sections after motion
-  // For example a static image after a fade or other transition
-  // instead of a clean scene cut.
-  if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
-      last_decay_rate < 0.9) {
-    int j;
-
-    // Look ahead a few frames to see if static condition persists...
-    for (j = 0; j < still_interval; ++j) {
-      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
-      if (stats >= twopass->stats_in_end) break;
-
-      if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
-    }
-
-    // Only if it does do we signal a transition to still.
-    return j == still_interval;
+static int check_transition_to_still(const FIRST_PASS_INFO *first_pass_info,
+                                     int show_idx, int still_interval) {
+  int j;
+  int num_frames = fps_get_num_frames(first_pass_info);
+  if (show_idx + still_interval > num_frames) {
+    return 0;
   }
 
-  return 0;
+  // Look ahead a few frames to see if static condition persists...
+  for (j = 0; j < still_interval; ++j) {
+    const FIRSTPASS_STATS *stats =
+        fps_get_frame_stats(first_pass_info, show_idx + j);
+    if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+  }
+
+  // Only if it does do we signal a transition to still.
+  return j == still_interval;
 }
 
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this.
-static int detect_flash(const TWO_PASS *twopass, int offset) {
-  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
-
+static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) {
   // What we are looking for here is a situation where there is a
   // brief break in prediction (such as a flash) but subsequent frames
   // are reasonably well predicted by an earlier (pre flash) frame.
   // The recovery after a flash is indicated by a high pcnt_second_ref
-  // compared to pcnt_inter.
-  return next_frame != NULL &&
-         next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
-         next_frame->pcnt_second_ref >= 0.5;
+  // usage or a second ref coded error notabley lower than the last
+  // frame coded error.
+  if (frame_stats == NULL) {
+    return 0;
+  }
+  return (frame_stats->sr_coded_error < frame_stats->coded_error) ||
+         ((frame_stats->pcnt_second_ref > frame_stats->pcnt_inter) &&
+          (frame_stats->pcnt_second_ref >= 0.5));
+}
+
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+  return detect_flash_from_frame_stats(next_frame);
 }
 
 // Update the motion related elements to the GF arf boost calculation.
@@ -1665,79 +1863,80 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
   }
 }
 
-#define BASELINE_ERR_PER_MB 1000.0
-static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame,
-                               double *sr_accumulator,
-                               double this_frame_mv_in_out, double max_boost) {
+static double calc_frame_boost(const FRAME_INFO *frame_info,
+                               const FIRSTPASS_STATS *this_frame,
+                               const TWO_PASS *const twopass,
+                               int avg_frame_qindex,
+                               double this_frame_mv_in_out) {
   double frame_boost;
-  const double lq = vp9_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+  const double lq =
+      vp9_convert_qindex_to_q(avg_frame_qindex, frame_info->bit_depth);
   const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
-  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                       : cpi->common.MBs;
+  const double active_area = calculate_active_area(frame_info, this_frame);
 
-  // Correct for any inactive region in the image
-  num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
-
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
-                DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
-
-  // Update the accumulator for second ref error difference.
-  // This is intended to give an indication of how much the coded error is
-  // increasing over time.
-  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
-  *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+  // Frame booost is based on inter error.
+  frame_boost = (twopass->err_per_mb * active_area) /
+                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
 
   // Small adjustment for cases where there is a zoom out
   if (this_frame_mv_in_out > 0.0)
     frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
 
   // Q correction and scalling
-  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+  frame_boost = frame_boost * boost_q_correction;
 
-  return VPXMIN(frame_boost, max_boost * boost_q_correction);
+  return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction);
 }
 
-#define KF_BOOST_FACTOR 12.5
 static double calc_kf_frame_boost(VP9_COMP *cpi,
                                   const FIRSTPASS_STATS *this_frame,
                                   double *sr_accumulator,
                                   double this_frame_mv_in_out,
-                                  double max_boost) {
+                                  double zm_factor) {
+  TWO_PASS *const twopass = &cpi->twopass;
   double frame_boost;
   const double lq = vp9_convert_qindex_to_q(
       cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
   const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
-  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
-                                                       : cpi->common.MBs;
+  const double active_area =
+      calculate_active_area(&cpi->frame_info, this_frame);
+  double max_boost;
 
-  // Correct for any inactive region in the image
-  num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
-
-  // Underlying boost factor is based on inter error ratio.
-  frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+  // Frame booost is based on inter error.
+  frame_boost = (twopass->kf_err_per_mb * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
 
   // Update the accumulator for second ref error difference.
   // This is intended to give an indication of how much the coded error is
   // increasing over time.
-  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
   *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
 
   // Small adjustment for cases where there is a zoom out
   if (this_frame_mv_in_out > 0.0)
     frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
 
-  // Q correction and scalling
-  frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction;
+  // Q correction and scaling
+  // The 40.0 value here is an experimentally derived baseline minimum.
+  // This value is in line with the minimum per frame boost in the alt_ref
+  // boost calculation.
+  frame_boost =
+      (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction;
 
-  return VPXMIN(frame_boost, max_boost * boost_q_correction);
+  // Maximum allowed boost this frame. May be different for first vs subsequent
+  // key frames.
+  max_boost = (cpi->common.current_video_frame == 0)
+                  ? twopass->kf_frame_max_boost_first
+                  : twopass->kf_frame_max_boost_subs;
+  max_boost *= zm_factor * boost_q_correction;
+
+  return VPXMIN(frame_boost, max_boost);
 }
 
-static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
-                          int *f_boost, int *b_boost) {
-  TWO_PASS *const twopass = &cpi->twopass;
+static int compute_arf_boost(const FRAME_INFO *frame_info,
+                             TWO_PASS *const twopass, int arf_show_idx,
+                             int f_frames, int b_frames, int avg_frame_qindex) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   int i;
   double boost_score = 0.0;
   double mv_ratio_accumulator = 0.0;
@@ -1745,13 +1944,15 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
-  double sr_accumulator = 0.0;
   int arf_boost;
   int flash_detected = 0;
 
   // Search forward from the proposed arf/next gf position.
   for (i = 0; i < f_frames; ++i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    const FIRSTPASS_STATS *this_frame =
+        fps_get_frame_stats(first_pass_info, arf_show_idx + i);
+    const FIRSTPASS_STATS *next_frame =
+        fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
@@ -1761,24 +1962,22 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
+    flash_detected = detect_flash_from_frame_stats(this_frame) ||
+                     detect_flash_from_frame_stats(next_frame);
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR
                               : decay_accumulator;
     }
-
-    sr_accumulator = 0.0;
     boost_score += decay_accumulator *
-                   calc_frame_boost(cpi, this_frame, &sr_accumulator,
-                                    this_frame_mv_in_out, GF_MAX_BOOST);
+                   calc_frame_boost(frame_info, this_frame, twopass,
+                                    avg_frame_qindex, this_frame_mv_in_out);
   }
 
-  *f_boost = (int)boost_score;
+  arf_boost = (int)boost_score;
 
   // Reset for backward looking loop.
   boost_score = 0.0;
@@ -1787,11 +1986,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
   this_frame_mv_in_out = 0.0;
   mv_in_out_accumulator = 0.0;
   abs_mv_in_out_accumulator = 0.0;
-  sr_accumulator = 0.0;
 
   // Search backward towards last gf position.
   for (i = -1; i >= -b_frames; --i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    const FIRSTPASS_STATS *this_frame =
+        fps_get_frame_stats(first_pass_info, arf_show_idx + i);
+    const FIRSTPASS_STATS *next_frame =
+        fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
@@ -1799,34 +2000,40 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames,
         this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
         &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
-    // We want to discount the the flash frame itself and the recovery
+    // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
+    flash_detected = detect_flash_from_frame_stats(this_frame) ||
+                     detect_flash_from_frame_stats(next_frame);
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                               ? MIN_DECAY_FACTOR
                               : decay_accumulator;
     }
-
-    sr_accumulator = 0.0;
     boost_score += decay_accumulator *
-                   calc_frame_boost(cpi, this_frame, &sr_accumulator,
-                                    this_frame_mv_in_out, GF_MAX_BOOST);
+                   calc_frame_boost(frame_info, this_frame, twopass,
+                                    avg_frame_qindex, this_frame_mv_in_out);
   }
-  *b_boost = (int)boost_score;
+  arf_boost += (int)boost_score;
 
-  arf_boost = (*f_boost + *b_boost);
-  if (arf_boost < ((b_frames + f_frames) * 20))
-    arf_boost = ((b_frames + f_frames) * 20);
+  if (arf_boost < ((b_frames + f_frames) * 40))
+    arf_boost = ((b_frames + f_frames) * 40);
   arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
 
   return arf_boost;
 }
 
+static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) {
+  const FRAME_INFO *frame_info = &cpi->frame_info;
+  TWO_PASS *const twopass = &cpi->twopass;
+  const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME];
+  int arf_show_idx = get_show_idx(twopass);
+  return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames,
+                           b_frames, avg_inter_frame_qindex);
+}
+
 // Calculate a section intra ratio used in setting max loop filter.
 static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
                                          const FIRSTPASS_STATS *end,
@@ -1849,28 +2056,44 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
 // Calculate the total bits to allocate in this GF/ARF group.
 static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
                                              double gf_group_err) {
+  VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const TWO_PASS *const twopass = &cpi->twopass;
   const int max_bits = frame_max_bits(rc, &cpi->oxcf);
   int64_t total_group_bits;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+  int gop_frames =
+      rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf;
 
   // Calculate the bits to be allocated to the group as a whole.
-  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) {
+    int key_frame_interval = rc->frames_since_key + rc->frames_to_key;
+    int distance_from_next_key_frame =
+        rc->frames_to_key -
+        (rc->baseline_gf_interval + rc->source_alt_ref_pending);
+    int max_gf_bits_bias = rc->avg_frame_bandwidth;
+    double gf_interval_bias_bits_normalize_factor =
+        (double)rc->baseline_gf_interval / 16;
     total_group_bits = (int64_t)(twopass->kf_group_bits *
                                  (gf_group_err / twopass->kf_group_error_left));
+    // TODO(ravi): Experiment with different values of max_gf_bits_bias
+    total_group_bits +=
+        (int64_t)((double)distance_from_next_key_frame / key_frame_interval *
+                  max_gf_bits_bias * gf_interval_bias_bits_normalize_factor);
   } else {
     total_group_bits = 0;
   }
 
   // Clamp odd edge cases.
-  total_group_bits =
-      (total_group_bits < 0) ? 0 : (total_group_bits > twopass->kf_group_bits)
-                                       ? twopass->kf_group_bits
-                                       : total_group_bits;
+  total_group_bits = (total_group_bits < 0) ? 0
+                     : (total_group_bits > twopass->kf_group_bits)
+                         ? twopass->kf_group_bits
+                         : total_group_bits;
 
   // Clip based on user supplied data rate variability limit.
-  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
-    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (total_group_bits > (int64_t)max_bits * gop_frames)
+    total_group_bits = (int64_t)max_bits * gop_frames;
 
   return total_group_bits;
 }
@@ -1881,9 +2104,9 @@ static int calculate_boost_bits(int frame_count, int boost,
   int allocation_chunks;
 
   // return 0 for invalid inputs (could arise e.g. through rounding errors)
-  if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+  if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0;
 
-  allocation_chunks = (frame_count * 100) + boost;
+  allocation_chunks = (frame_count * NORMAL_BOOST) + boost;
 
   // Prevent overflow.
   if (boost > 1023) {
@@ -1897,183 +2120,421 @@ static int calculate_boost_bits(int frame_count, int boost,
                 0);
 }
 
-// Current limit on maximum number of active arfs in a GF/ARF group.
-#define MAX_ACTIVE_ARFS 2
-#define ARF_SLOT1 2
-#define ARF_SLOT2 3
-// This function indirects the choice of buffers for arfs.
-// At the moment the values are fixed but this may change as part of
-// the integration process with other codec features that swap buffers around.
-static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
-  arf_buffer_indices[0] = ARF_SLOT1;
-  arf_buffer_indices[1] = ARF_SLOT2;
+// Used in corpus vbr: Calculates the total normalized group complexity score
+// for a given number of frames starting at the current position in the stats
+// file.
+static double calculate_group_score(VP9_COMP *cpi, double av_score,
+                                    int frame_count) {
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  const FIRSTPASS_STATS *s = twopass->stats_in;
+  double score_total = 0.0;
+  int i = 0;
+
+  // We don't ever want to return a 0 score here.
+  if (frame_count == 0) return 1.0;
+
+  while ((i < frame_count) && (s < twopass->stats_in_end)) {
+    score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score);
+    ++s;
+    ++i;
+  }
+
+  return score_total;
+}
+
+static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group,
+                           int *index_counter, int depth, int start, int end) {
+  TWO_PASS *twopass = &cpi->twopass;
+  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  FIRSTPASS_STATS fpf_frame;
+  const int mid = (start + end + 1) >> 1;
+  const int min_frame_interval = 2;
+  int idx;
+
+  // Process regular P frames
+  if ((end - start < min_frame_interval) ||
+      (depth > gf_group->allowed_max_layer_depth)) {
+    for (idx = start; idx <= end; ++idx) {
+      gf_group->update_type[*index_counter] = LF_UPDATE;
+      gf_group->arf_src_offset[*index_counter] = 0;
+      gf_group->frame_gop_index[*index_counter] = idx;
+      gf_group->rf_level[*index_counter] = INTER_NORMAL;
+      gf_group->layer_depth[*index_counter] = depth;
+      gf_group->gfu_boost[*index_counter] = NORMAL_BOOST;
+      ++(*index_counter);
+    }
+    gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth);
+    return;
+  }
+
+  assert(abs(mid - start) >= 1 && abs(mid - end) >= 1);
+
+  // Process ARF frame
+  gf_group->layer_depth[*index_counter] = depth;
+  gf_group->update_type[*index_counter] = ARF_UPDATE;
+  gf_group->arf_src_offset[*index_counter] = mid - start;
+  gf_group->frame_gop_index[*index_counter] = mid;
+  gf_group->rf_level[*index_counter] = GF_ARF_LOW;
+
+  for (idx = 0; idx <= mid; ++idx)
+    if (EOF == input_stats(twopass, &fpf_frame)) break;
+
+  gf_group->gfu_boost[*index_counter] =
+      VPXMAX(MIN_ARF_GF_BOOST,
+             calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth);
+
+  reset_fpf_position(twopass, start_pos);
+
+  ++(*index_counter);
+
+  find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1);
+
+  gf_group->update_type[*index_counter] = USE_BUF_FRAME;
+  gf_group->arf_src_offset[*index_counter] = 0;
+  gf_group->frame_gop_index[*index_counter] = mid;
+  gf_group->rf_level[*index_counter] = INTER_NORMAL;
+  gf_group->layer_depth[*index_counter] = depth;
+  ++(*index_counter);
+
+  find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end);
+}
+
+static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group,
+                                             int frame_index,
+                                             int source_alt_ref_active) {
+  if (source_alt_ref_active) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+    gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1;
+    gf_group->gfu_boost[frame_index] = NORMAL_BOOST;
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+    gf_group->layer_depth[frame_index] = 0;
+  }
+}
+
+static void define_gf_group_structure(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  int frame_index = 0;
+  int key_frame = cpi->common.frame_type == KEY_FRAME;
+  int layer_depth = 1;
+  int gop_frames =
+      rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
+
+  gf_group->frame_start = cpi->common.current_video_frame;
+  gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval;
+  gf_group->max_layer_depth = 0;
+  gf_group->allowed_max_layer_depth = 0;
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  if (!key_frame)
+    set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active);
+
+  ++frame_index;
+
+  // === [frame_index == 1] ===
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+    gf_group->layer_depth[frame_index] = layer_depth;
+    gf_group->arf_src_offset[frame_index] =
+        (unsigned char)(rc->baseline_gf_interval - 1);
+    gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval;
+    gf_group->max_layer_depth = 1;
+    ++frame_index;
+    ++layer_depth;
+    gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf;
+  }
+
+  find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames);
+
+  // TODO(b/345523905): Why do we need to set an overlay frame in the end?
+  set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending);
+  gf_group->arf_src_offset[frame_index] = 0;
+  gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval;
+
+  // Set the frame ops number.
+  gf_group->gf_group_size = frame_index;
+}
+
+static INLINE void gf_group_set_overlay_frame(GF_GROUP *gf_group,
+                                              int frame_index,
+                                              int show_frame_index) {
+  gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+  gf_group->arf_src_offset[frame_index] = 0;
+  gf_group->frame_gop_index[frame_index] = show_frame_index;
+  gf_group->rf_level[frame_index] = INTER_NORMAL;
+  gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1;
+}
+
+static INLINE void gf_group_set_key_frame(GF_GROUP *gf_group, int frame_index,
+                                          int show_frame_index) {
+  gf_group->update_type[frame_index] = KF_UPDATE;
+  gf_group->arf_src_offset[frame_index] = 0;
+  gf_group->frame_gop_index[frame_index] = show_frame_index;
+  gf_group->rf_level[frame_index] = KF_STD;
+  gf_group->layer_depth[frame_index] = 0;
+}
+
+static INLINE void gf_group_set_arf_frame(GF_GROUP *gf_group, int frame_index,
+                                          int show_frame_index) {
+  gf_group->update_type[frame_index] = ARF_UPDATE;
+  gf_group->arf_src_offset[frame_index] =
+      (unsigned char)(show_frame_index - frame_index);
+  gf_group->frame_gop_index[frame_index] = show_frame_index;
+  gf_group->rf_level[frame_index] = GF_ARF_STD;
+  gf_group->layer_depth[frame_index] = 1;
+}
+
+static INLINE void gf_group_set_inter_normal_frame(GF_GROUP *gf_group,
+                                                   int frame_index,
+                                                   int show_frame_index) {
+  gf_group->update_type[frame_index] = LF_UPDATE;
+  gf_group->arf_src_offset[frame_index] = 0;
+  gf_group->frame_gop_index[frame_index] = show_frame_index;
+  gf_group->rf_level[frame_index] = INTER_NORMAL;
+  gf_group->layer_depth[frame_index] = 2;
+}
+
+static INLINE void set_gf_frame_type(vpx_rc_frame_update_type_t update_type,
+                                     int show_frame_count, GF_GROUP *gf_group,
+                                     int *frame_index, int *show_frame_index) {
+  if (update_type == VPX_RC_KF_UPDATE) {
+    gf_group_set_key_frame(gf_group, *frame_index, *show_frame_index);
+    ++(*frame_index);
+    ++(*show_frame_index);
+  } else if (update_type == VPX_RC_OVERLAY_UPDATE) {
+    gf_group_set_overlay_frame(gf_group, *frame_index, *show_frame_index);
+    ++(*frame_index);
+    ++(*show_frame_index);
+  } else if (update_type == VPX_RC_ARF_UPDATE) {
+    gf_group_set_arf_frame(gf_group, *frame_index, show_frame_count);
+    ++(*frame_index);
+  } else if (update_type == VPX_RC_LF_UPDATE) {
+    gf_group_set_inter_normal_frame(gf_group, *frame_index, *show_frame_index);
+    ++(*frame_index);
+    ++(*show_frame_index);
+  } else {
+    assert(0);
+  }
+}
+
+static void ext_rc_define_gf_group_structure(
+    const vpx_rc_gop_decision_t *gop_decision, GF_GROUP *gf_group) {
+  const int gop_coding_frames = gop_decision->gop_coding_frames;
+
+  const int show_frame_count = gop_coding_frames - gop_decision->use_alt_ref;
+  int frame_index = 0;
+  int show_frame_index = 0;
+
+  for (int i = frame_index; i < gop_coding_frames; i++) {
+    set_gf_frame_type(gop_decision->update_type[i], show_frame_count, gf_group,
+                      &frame_index, &show_frame_index);
+
+    gf_group->update_ref_idx[i] = gop_decision->update_ref_index[i];
+
+    gf_group->ext_rc_ref[i].last_index = 0;
+    gf_group->ext_rc_ref[i].golden_index = 0;
+    gf_group->ext_rc_ref[i].altref_index = 0;
+    for (int ref_frame = 0; ref_frame < 3; ref_frame++) {
+      const vpx_rc_ref_frame_t *const ext_ref_frame =
+          &gop_decision->ref_frame_list[i];
+      const int ref_index = ext_ref_frame->index[ref_frame];
+      gf_group->ref_frame_list[i][ref_frame] = ext_ref_frame->index[ref_frame];
+      switch (ext_ref_frame->name[ref_frame]) {
+        case VPX_RC_LAST_FRAME:
+          gf_group->ext_rc_ref[i].last_index = ref_index;
+          break;
+        case VPX_RC_GOLDEN_FRAME:
+          gf_group->ext_rc_ref[i].golden_index = ref_index;
+          break;
+        case VPX_RC_ALTREF_FRAME:
+          gf_group->ext_rc_ref[i].altref_index = ref_index;
+          break;
+        default: break;
+      }
+    }
+    if (gf_group->update_type[i] == OVERLAY_UPDATE) {
+      // From ext_rc, overlay may not update any ref. But here we force it to
+      // update its arf's slot. This is probably OK since the arf and this
+      // overlay frame should be very similar.
+      gf_group->update_ref_idx[i] = gf_group->ext_rc_ref[i].altref_index;
+    }
+  }
+  // max_layer_depth is hardcoded to match the behavior of
+  // define_gf_group_structure()
+  // TODO(angiebird): Check whether max_layer_depth has performance impact.
+  gf_group->max_layer_depth = 2;
+  gf_group->allowed_max_layer_depth = 1;
+  gf_group->gf_group_size = gop_coding_frames;
+
+  // TODO(b/345523905): Why do we need to set an overlay frame in the end?
+  assert(show_frame_count == show_frame_index);
+  if (gop_decision->use_alt_ref) {
+    gf_group_set_overlay_frame(gf_group, gf_group->gf_group_size,
+                               show_frame_index);
+  } else {
+    gf_group_set_inter_normal_frame(gf_group, gf_group->gf_group_size,
+                                    show_frame_index);
+  }
+
+  gf_group->frame_start = 0;
+  gf_group->frame_end = gf_group->gf_group_size - gop_decision->use_alt_ref;
 }
 
 static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
                                    int gf_arf_bits) {
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS frame_stats;
   int i;
-  int frame_index = 1;
+  int frame_index = 0;
   int target_frame_size;
   int key_frame;
-  const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+  const int max_bits = frame_max_bits(&cpi->rc, oxcf);
   int64_t total_group_bits = gf_group_bits;
-  int mid_boost_bits = 0;
   int mid_frame_idx;
-  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
-  int alt_frame_index = frame_index;
-  int has_temporal_layers =
-      is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1;
   int normal_frames;
   int normal_frame_bits;
-  int last_frame_bits;
-  int last_frame_reduction;
+  int last_frame_reduction = 0;
+  double av_score = 1.0;
+  double tot_norm_frame_score = 1.0;
+  double this_frame_score = 1.0;
 
-  // Only encode alt reference frame in temporal base layer.
-  if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers;
+  // Define the GF structure and specify
+  int gop_frames = gf_group->gf_group_size;
 
-  key_frame =
-      cpi->common.frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi);
-
-  get_arf_buffer_indices(arf_buffer_indices);
+  key_frame = cpi->common.frame_type == KEY_FRAME;
 
   // For key frames the frame target rate is already set and it
   // is also the golden frame.
+  // === [frame_index == 0] ===
   if (!key_frame) {
-    if (rc->source_alt_ref_active) {
-      gf_group->update_type[0] = OVERLAY_UPDATE;
-      gf_group->rf_level[0] = INTER_NORMAL;
-      gf_group->bit_allocation[0] = 0;
-    } else {
-      gf_group->update_type[0] = GF_UPDATE;
-      gf_group->rf_level[0] = GF_ARF_STD;
-      gf_group->bit_allocation[0] = gf_arf_bits;
-    }
-    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
-
-    // Step over the golden frame / overlay frame
-    if (EOF == input_stats(twopass, &frame_stats)) return;
+    gf_group->bit_allocation[frame_index] =
+        rc->source_alt_ref_active ? 0 : gf_arf_bits;
   }
 
   // Deduct the boost bits for arf (or gf if it is not a key frame)
   // from the group total.
   if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
 
+  ++frame_index;
+
+  // === [frame_index == 1] ===
   // Store the bits to spend on the ARF if there is one.
   if (rc->source_alt_ref_pending) {
-    gf_group->update_type[alt_frame_index] = ARF_UPDATE;
-    gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
-    gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
 
-    if (has_temporal_layers)
-      gf_group->arf_src_offset[alt_frame_index] =
-          (unsigned char)(rc->baseline_gf_interval -
-                          cpi->svc.number_temporal_layers);
-    else
-      gf_group->arf_src_offset[alt_frame_index] =
-          (unsigned char)(rc->baseline_gf_interval - 1);
-
-    gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[alt_frame_index] =
-        arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
-                           rc->source_alt_ref_active];
-    if (!has_temporal_layers) ++frame_index;
-
-    if (cpi->multi_arf_enabled) {
-      // Set aside a slot for a level 1 arf.
-      gf_group->update_type[frame_index] = ARF_UPDATE;
-      gf_group->rf_level[frame_index] = GF_ARF_LOW;
-      gf_group->arf_src_offset[frame_index] =
-          (unsigned char)((rc->baseline_gf_interval >> 1) - 1);
-      gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
-      gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
-      ++frame_index;
-    }
+    ++frame_index;
   }
 
-  // Note index of the first normal inter frame int eh group (not gf kf arf)
-  gf_group->first_inter_index = frame_index;
-
   // Define middle frame
   mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
 
-  normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
-
-  // The last frame in the group is used less as a predictor so reduce
-  // its allocation a little.
-  if (normal_frames > 1) {
+  normal_frames = (rc->baseline_gf_interval - 1);
+  if (normal_frames > 1)
     normal_frame_bits = (int)(total_group_bits / normal_frames);
-    last_frame_reduction = normal_frame_bits / 16;
-    last_frame_bits = normal_frame_bits - last_frame_reduction;
-  } else {
+  else
     normal_frame_bits = (int)total_group_bits;
-    last_frame_bits = normal_frame_bits;
-    last_frame_reduction = 0;
+
+  gf_group->gfu_boost[1] = rc->gfu_boost;
+
+  if (cpi->multi_layer_arf) {
+    int idx;
+    int arf_depth_bits[MAX_ARF_LAYERS] = { 0 };
+    int arf_depth_count[MAX_ARF_LAYERS] = { 0 };
+    int arf_depth_boost[MAX_ARF_LAYERS] = { 0 };
+    int total_arfs = 1;  // Account for the base layer ARF.
+
+    for (idx = 0; idx < gop_frames; ++idx) {
+      if (gf_group->update_type[idx] == ARF_UPDATE) {
+        arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx];
+        ++arf_depth_count[gf_group->layer_depth[idx]];
+      }
+    }
+
+    for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) {
+      if (arf_depth_boost[idx] == 0) break;
+      arf_depth_bits[idx] = calculate_boost_bits(
+          rc->baseline_gf_interval - total_arfs - arf_depth_count[idx],
+          arf_depth_boost[idx], total_group_bits);
+
+      total_group_bits -= arf_depth_bits[idx];
+      total_arfs += arf_depth_count[idx];
+    }
+
+    // offset the base layer arf
+    normal_frames -= (total_arfs - 1);
+    if (normal_frames > 1)
+      normal_frame_bits = (int)(total_group_bits / normal_frames);
+    else
+      normal_frame_bits = (int)total_group_bits;
+
+    target_frame_size = normal_frame_bits;
+    target_frame_size =
+        clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits));
+
+    // The first layer ARF has its bit allocation assigned.
+    for (idx = frame_index; idx < gop_frames; ++idx) {
+      switch (gf_group->update_type[idx]) {
+        case ARF_UPDATE:
+          gf_group->bit_allocation[idx] =
+              (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
+                     gf_group->gfu_boost[idx]) /
+                    arf_depth_boost[gf_group->layer_depth[idx]]);
+          break;
+        case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break;
+        default: gf_group->bit_allocation[idx] = target_frame_size; break;
+      }
+    }
+    gf_group->bit_allocation[idx] = 0;
+
+    return;
+  }
+
+  if (oxcf->vbr_corpus_complexity) {
+    av_score = get_distribution_av_err(cpi, twopass);
+    tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames);
   }
 
   // Allocate bits to the other frames in the group.
   for (i = 0; i < normal_frames; ++i) {
-    int arf_idx = 0;
     if (EOF == input_stats(twopass, &frame_stats)) break;
-
-    if (has_temporal_layers && frame_index == alt_frame_index) {
-      ++frame_index;
+    if (oxcf->vbr_corpus_complexity) {
+      this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf,
+                                                    &frame_stats, av_score);
+      normal_frame_bits = (int)((double)total_group_bits *
+                                (this_frame_score / tot_norm_frame_score));
     }
 
-    target_frame_size = (i == (normal_frames - 1))
-                            ? last_frame_bits
-                            : (i == mid_frame_idx)
-                                  ? normal_frame_bits + last_frame_reduction
-                                  : normal_frame_bits;
-
-    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
-      mid_boost_bits += (target_frame_size >> 4);
-      target_frame_size -= (target_frame_size >> 4);
-
-      if (frame_index <= mid_frame_idx) arf_idx = 1;
+    target_frame_size = normal_frame_bits;
+    if ((i == (normal_frames - 1)) && (i >= 1)) {
+      last_frame_reduction = normal_frame_bits / 16;
+      target_frame_size -= last_frame_reduction;
     }
-    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
-    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
 
     target_frame_size =
         clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits));
 
-    gf_group->update_type[frame_index] = LF_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
-
     gf_group->bit_allocation[frame_index] = target_frame_size;
     ++frame_index;
   }
 
+  // Add in some extra bits for the middle frame in the group.
+  gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction;
+
   // Note:
   // We need to configure the frame at the end of the sequence + 1 that will be
   // the start frame for the next group. Otherwise prior to the call to
   // vp9_rc_get_second_pass_params() the data will be undefined.
-  gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
-  gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
-
-  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
-
-    // Final setup for second arf and its overlay.
-    if (cpi->multi_arf_enabled) {
-      gf_group->bit_allocation[2] =
-          gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
-      gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
-      gf_group->bit_allocation[mid_frame_idx] = 0;
-    }
-  } else {
-    gf_group->update_type[frame_index] = GF_UPDATE;
-    gf_group->rf_level[frame_index] = GF_ARF_STD;
-  }
-
-  // Note whether multi-arf was enabled this group for next time.
-  cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
 }
 
 // Adjusts the ARNF filter for a GF group.
@@ -2085,23 +2546,228 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,
 
   twopass->arnr_strength_adjustment = 0;
 
-  if ((section_zeromv < 0.10) || (section_noise <= (SECTION_NOISE_DEF * 0.75)))
+  if (section_noise < 150) {
     twopass->arnr_strength_adjustment -= 1;
+    if (section_noise < 75) twopass->arnr_strength_adjustment -= 1;
+  } else if (section_noise > 250)
+    twopass->arnr_strength_adjustment += 1;
+
   if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1;
 }
 
 // Analyse and define a gf/arf group.
-static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+#define ARF_ABS_ZOOM_THRESH 4.0
+
+#define MAX_GF_BOOST 5400
+
+typedef struct RANGE {
+  int min;
+  int max;
+} RANGE;
+
+/* get_gop_coding_frame_num() depends on several fields in RATE_CONTROL *rc as
+ * follows.
+ * Static fields:
+ * (The following fields will remain unchanged after initialization of encoder.)
+ *   rc->static_scene_max_gf_interval
+ *   rc->min_gf_interval
+ *   twopass->sr_diff_factor
+ *   twopass->sr_default_decay_limit
+ *   twopass->zm_factor
+ *
+ * Dynamic fields:
+ * (The following fields will be updated before or after coding each frame.)
+ *   rc->frames_to_key
+ *   rc->frames_since_key
+ *   rc->source_alt_ref_active
+ *
+ * TODO(angiebird): Separate the dynamic fields and static fields into two
+ * structs.
+ */
+static int get_gop_coding_frame_num(
+    int *use_alt_ref, const FRAME_INFO *frame_info,
+    const TWO_PASS *const twopass, const RATE_CONTROL *rc,
+    int gf_start_show_idx, const RANGE *active_gf_interval,
+    double gop_intra_factor, int lag_in_frames, int *end_of_sequence) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+  double loop_decay_rate = 1.00;
+  double mv_ratio_accumulator = 0.0;
+  double this_frame_mv_in_out = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+  double sr_accumulator = 0.0;
+  // Motion breakout threshold for loop below depends on image size.
+  double mv_ratio_accumulator_thresh =
+      (frame_info->frame_height + frame_info->frame_width) / 4.0;
+  double zero_motion_accumulator = 1.0;
+  int gop_coding_frames;
+
+  *use_alt_ref = 1;
+  gop_coding_frames = 0;
+  while (gop_coding_frames < rc->static_scene_max_gf_interval &&
+         gop_coding_frames < rc->frames_to_key) {
+    const FIRSTPASS_STATS *next_next_frame;
+    const FIRSTPASS_STATS *next_frame;
+    int flash_detected;
+    ++gop_coding_frames;
+
+    next_frame = fps_get_frame_stats(first_pass_info,
+                                     gf_start_show_idx + gop_coding_frames);
+    if (next_frame == NULL) {
+      *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active;
+      break;
+    }
+
+    // Test for the case where there is a brief flash but the prediction
+    // quality back to an earlier frame is then restored.
+    next_next_frame = fps_get_frame_stats(
+        first_pass_info, gf_start_show_idx + gop_coding_frames + 1);
+    flash_detected = detect_flash_from_frame_stats(next_next_frame);
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(
+        next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // Monitor for static sections.
+    if ((rc->frames_since_key + gop_coding_frames - 1) > 1) {
+      zero_motion_accumulator = VPXMIN(
+          zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame));
+    }
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      double last_loop_decay_rate = loop_decay_rate;
+      loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
+
+      // Break clause to detect very still sections after motion. For example,
+      // a static image after a fade or other transition.
+      if (gop_coding_frames > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+          last_loop_decay_rate < 0.9) {
+        int still_interval = 5;
+        if (check_transition_to_still(first_pass_info,
+                                      gf_start_show_idx + gop_coding_frames,
+                                      still_interval)) {
+          *use_alt_ref = 0;
+          break;
+        }
+      }
+
+      // Update the accumulator for second ref error difference.
+      // This is intended to give an indication of how much the coded error is
+      // increasing over time.
+      if (gop_coding_frames == 1) {
+        sr_accumulator += next_frame->coded_error;
+      } else {
+        sr_accumulator +=
+            (next_frame->sr_coded_error - next_frame->coded_error);
+      }
+    }
+
+    // Break out conditions.
+    // Break at maximum of active_gf_interval->max unless almost totally
+    // static.
+    //
+    // Note that the addition of a test of rc->source_alt_ref_active is
+    // deliberate. The effect of this is that after a normal altref group even
+    // if the material is static there will be one normal length GF group
+    // before allowing longer GF groups. The reason for this is that in cases
+    // such as slide shows where slides are separated by a complex transition
+    // such as a fade, the arf group spanning the transition may not be coded
+    // at a very high quality and hence this frame (with its overlay) is a
+    // poor golden frame to use for an extended group.
+    if ((gop_coding_frames >= active_gf_interval->max) &&
+        ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) {
+      break;
+    }
+    if (
+        // Don't break out with a very short interval.
+        (gop_coding_frames >= active_gf_interval->min) &&
+        // If possible don't break very close to a kf
+        ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) &&
+        (gop_coding_frames & 0x01) && (!flash_detected) &&
+        ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+         (abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH) ||
+         (sr_accumulator > gop_intra_factor * next_frame->intra_error))) {
+      break;
+    }
+  }
+  *use_alt_ref &= zero_motion_accumulator < 0.995;
+  *use_alt_ref &= gop_coding_frames < lag_in_frames;
+  *use_alt_ref &= gop_coding_frames >= rc->min_gf_interval;
+  return gop_coding_frames;
+}
+
+static RANGE get_active_gf_inverval_range(
+    const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf,
+    int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) {
+  RANGE active_gf_interval;
+  int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality,
+                                                frame_info->bit_depth));
+  int q_term = (gf_start_show_idx == 0)
+                   ? int_max_q / 32
+                   : (int)(vp9_convert_qindex_to_q(last_boosted_qindex,
+                                                   frame_info->bit_depth) /
+                           6);
+  active_gf_interval.min =
+      rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200);
+  active_gf_interval.min =
+      VPXMIN(active_gf_interval.min, rc->max_gf_interval + arf_active_or_kf);
+
+  // The value chosen depends on the active Q range. At low Q we have
+  // bits to spare and are better with a smaller interval and smaller boost.
+  // At high Q when there are few bits to spare we are better with a longer
+  // interval to spread the cost of the GF.
+  active_gf_interval.max = 11 + arf_active_or_kf + VPXMIN(5, q_term);
+
+  // Force max GF interval to be odd.
+  active_gf_interval.max = active_gf_interval.max | 0x01;
+
+  // We have: active_gf_interval.min <=
+  // rc->max_gf_interval + arf_active_or_kf.
+  if (active_gf_interval.max < active_gf_interval.min) {
+    active_gf_interval.max = active_gf_interval.min;
+  } else {
+    active_gf_interval.max =
+        VPXMIN(active_gf_interval.max, rc->max_gf_interval + arf_active_or_kf);
+  }
+
+  // Would the active max drop us out just before the near the next kf?
+  if ((active_gf_interval.max <= rc->frames_to_key) &&
+      (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) {
+    active_gf_interval.max = rc->frames_to_key / 2;
+  }
+  active_gf_interval.max =
+      VPXMAX(active_gf_interval.max, active_gf_interval.min);
+  return active_gf_interval;
+}
+
+static int get_arf_layers(int multi_layer_arf, int max_layers,
+                          int coding_frame_num) {
+  assert(max_layers <= MAX_ARF_LAYERS);
+  if (multi_layer_arf) {
+    int layers = 0;
+    int i;
+    for (i = coding_frame_num; i > 0; i >>= 1) {
+      ++layers;
+    }
+    layers = VPXMIN(max_layers, layers);
+    return layers;
+  } else {
+    return 1;
+  }
+}
+
+static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   VP9EncoderConfig *const oxcf = &cpi->oxcf;
   TWO_PASS *const twopass = &cpi->twopass;
-  FIRSTPASS_STATS next_frame;
+  const FRAME_INFO *frame_info = &cpi->frame_info;
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
   const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-  int i;
+  int gop_coding_frames;
 
-  double boost_score = 0.0;
-  double old_boost_score = 0.0;
   double gf_group_err = 0.0;
   double gf_group_raw_error = 0.0;
   double gf_group_noise = 0.0;
@@ -2109,268 +2775,263 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double gf_group_inactive_zone_rows = 0.0;
   double gf_group_inter = 0.0;
   double gf_group_motion = 0.0;
-  double gf_first_frame_err = 0.0;
-  double mod_frame_err = 0.0;
 
-  double mv_ratio_accumulator = 0.0;
-  double decay_accumulator = 1.0;
-  double zero_motion_accumulator = 1.0;
-  double loop_decay_rate = 1.00;
-  double last_loop_decay_rate = 1.00;
+  int allow_alt_ref = is_altref_enabled(cpi);
+  int use_alt_ref;
 
-  double this_frame_mv_in_out = 0.0;
-  double mv_in_out_accumulator = 0.0;
-  double abs_mv_in_out_accumulator = 0.0;
-  double mv_ratio_accumulator_thresh;
-  double mv_in_out_thresh;
-  double abs_mv_in_out_thresh;
-  double sr_accumulator = 0.0;
-  unsigned int allow_alt_ref = is_altref_enabled(cpi);
-
-  int f_boost = 0;
-  int b_boost = 0;
-  int flash_detected;
-  int active_max_gf_interval;
-  int active_min_gf_interval;
   int64_t gf_group_bits;
   int gf_arf_bits;
-  const int is_key_frame = frame_is_intra_only(cm);
+  int is_key_frame = frame_is_intra_only(cm);
+
+  vpx_rc_gop_decision_t gop_decision;
+  int gop_decision_ready = 0;
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_gop_decision != NULL) {
+    vpx_codec_err_t codec_status =
+        vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_get_gop_decision() failed");
+    }
+    is_key_frame = gop_decision.use_key_frame;
+    gop_decision_ready = 1;
+  }
+
+  // If this is a key frame or the overlay from a previous arf then
+  // the error score / cost of this frame has already been accounted for.
   const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+  int is_alt_ref_flash = 0;
+
+  double gop_intra_factor;
+  int gop_frames;
+  RANGE active_gf_interval;
+  // Whether this is at the end of last GOP of this sequence.
+  int end_of_sequence = 0;
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
   if (is_key_frame == 0) {
     vp9_zero(twopass->gf_group);
+    ++rc->gop_global_index;
+  } else {
+    rc->gop_global_index = 0;
   }
 
   vpx_clear_system_state();
-  vp9_zero(next_frame);
 
-  // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  active_gf_interval = get_active_gf_inverval_range(
+      frame_info, rc, arf_active_or_kf, gf_start_show_idx,
+      twopass->active_worst_quality, rc->last_boosted_qindex);
 
-  // Note the error of the frame at the start of the group. This will be
-  // the GF frame error if we code a normal gf.
-  gf_first_frame_err = mod_frame_err;
-
-  // If this is a key frame or the overlay from a previous arf then
-  // the error score / cost of this frame has already been accounted for.
-  if (arf_active_or_kf) {
-    gf_group_err -= gf_first_frame_err;
-    gf_group_raw_error -= this_frame->coded_error;
-    gf_group_noise -= this_frame->frame_noise_energy;
-    gf_group_skip_pct -= this_frame->intra_skip_pct;
-    gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
-    gf_group_inter -= this_frame->pcnt_inter;
-    gf_group_motion -= this_frame->pcnt_motion;
+  if (cpi->multi_layer_arf) {
+    int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf,
+                                    active_gf_interval.max);
+    gop_intra_factor = 1.0 + 0.25 * arf_layers;
+  } else {
+    gop_intra_factor = 1.0;
   }
 
-  // Motion breakout threshold for loop below depends on image size.
-  mv_ratio_accumulator_thresh =
-      (cpi->initial_height + cpi->initial_width) / 4.0;
-  mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0;
-  abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0;
+  gop_coding_frames = get_gop_coding_frame_num(
+      &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx,
+      &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames,
+      &end_of_sequence);
+  use_alt_ref &= allow_alt_ref;
 
-  // Set a maximum and minimum interval for the GF group.
-  // If the image appears almost completely static we can extend beyond this.
-  {
-    int int_max_q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality,
-                                                  cpi->common.bit_depth));
-    int int_lbq = (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
-                                                cpi->common.bit_depth));
-    active_min_gf_interval =
-        rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200);
-    if (active_min_gf_interval > rc->max_gf_interval)
-      active_min_gf_interval = rc->max_gf_interval;
-
-    if (cpi->multi_arf_allowed) {
-      active_max_gf_interval = rc->max_gf_interval;
-    } else {
-      // The value chosen depends on the active Q range. At low Q we have
-      // bits to spare and are better with a smaller interval and smaller boost.
-      // At high Q when there are few bits to spare we are better with a longer
-      // interval to spread the cost of the GF.
-      active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6));
-
-      // We have: active_min_gf_interval <= rc->max_gf_interval
-      if (active_max_gf_interval < active_min_gf_interval)
-        active_max_gf_interval = active_min_gf_interval;
-      else if (active_max_gf_interval > rc->max_gf_interval)
-        active_max_gf_interval = rc->max_gf_interval;
-
-      // Would the active max drop us out just before the near the next kf?
-      if ((active_max_gf_interval <= rc->frames_to_key) &&
-          (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval)))
-        active_max_gf_interval = rc->frames_to_key / 2;
-    }
-  }
-
-  i = 0;
-  while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
-    ++i;
-
-    // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-    gf_group_err += mod_frame_err;
-    gf_group_raw_error += this_frame->coded_error;
-    gf_group_noise += this_frame->frame_noise_energy;
-    gf_group_skip_pct += this_frame->intra_skip_pct;
-    gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
-    gf_group_inter += this_frame->pcnt_inter;
-    gf_group_motion += this_frame->pcnt_motion;
-
-    if (EOF == input_stats(twopass, &next_frame)) break;
-
-    // Test for the case where there is a brief flash but the prediction
-    // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(twopass, 0);
-
-    // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(
-        &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
-        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // Accumulate the effect of prediction quality decay.
-    if (!flash_detected) {
-      last_loop_decay_rate = loop_decay_rate;
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-
-      // Monitor for static sections.
-      zero_motion_accumulator = VPXMIN(
-          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
-
-      // Break clause to detect very still sections after motion. For example,
-      // a static image after a fade or other transition.
-      if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
-                                     last_loop_decay_rate)) {
-        allow_alt_ref = 0;
-        break;
-      }
-    }
-
-    // Calculate a boost number for this frame.
-    sr_accumulator = 0.0;
-    boost_score += decay_accumulator *
-                   calc_frame_boost(cpi, &next_frame, &sr_accumulator,
-                                    this_frame_mv_in_out, GF_MAX_BOOST);
-
-    // Break out conditions.
-    if (
-        // Break at active_max_gf_interval unless almost totally static.
-        ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
-        (
-            // Don't break out with a very short interval.
-            (i >= active_min_gf_interval) &&
-            // If possible dont break very close to a kf
-            ((rc->frames_to_key - i) >= rc->min_gf_interval) &&
-            (!flash_detected) &&
-            ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
-             (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) ||
-             (mv_in_out_accumulator < -mv_in_out_thresh) ||
-             ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
-      boost_score = old_boost_score;
-      break;
-    }
-
-    *this_frame = next_frame;
-    old_boost_score = boost_score;
+  if (gop_decision_ready) {
+    gop_coding_frames = gop_decision.gop_coding_frames;
+    use_alt_ref = gop_decision.use_alt_ref;
   }
 
   // Was the group length constrained by the requirement for a new KF?
-  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+  rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
 
   // Should we use the alternate reference frame.
-  if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
-      (i >= rc->min_gf_interval)) {
+  if (use_alt_ref) {
+    const int f_frames =
+        (rc->frames_to_key - gop_coding_frames >= gop_coding_frames - 1)
+            ? gop_coding_frames - 1
+            : VPXMAX(0, rc->frames_to_key - gop_coding_frames);
+    const int b_frames = gop_coding_frames - 1;
+    const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME];
+    // TODO(angiebird): figure out why arf's location is assigned this way
+    const int arf_show_idx = VPXMIN(gf_start_show_idx + gop_coding_frames + 1,
+                                    fps_get_num_frames(first_pass_info));
+
     // Calculate the boost for alt ref.
     rc->gfu_boost =
-        calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+        compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames,
+                          avg_inter_frame_qindex);
     rc->source_alt_ref_pending = 1;
-
-    // Test to see if multi arf is appropriate.
-    cpi->multi_arf_enabled =
-        (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
-         (zero_motion_accumulator < 0.995))
-            ? 1
-            : 0;
   } else {
-    rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST);
+    const int f_frames = gop_coding_frames - 1;
+    const int b_frames = 0;
+    const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME];
+    // TODO(angiebird): figure out why arf's location is assigned this way
+    const int gld_show_idx =
+        VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info));
+    const int arf_boost =
+        compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames,
+                          avg_inter_frame_qindex);
+    rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost);
     rc->source_alt_ref_pending = 0;
   }
 
-  // Limit maximum boost based on interval length.
-  rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200);
+#define LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR 0.2
+  rc->arf_active_best_quality_adjustment_factor = 1.0;
+  rc->arf_increase_active_best_quality = 0;
 
-  // Set the interval until the next gf.
-  rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
-
-  // Only encode alt reference frame in temporal base layer. So
-  // baseline_gf_interval should be multiple of a temporal layer group
-  // (typically the frame distance between two base layer frames)
-  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
-    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
-    int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
-    int j;
-    for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
-      if (EOF == input_stats(twopass, this_frame)) break;
-      gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-      gf_group_raw_error += this_frame->coded_error;
-      gf_group_noise += this_frame->frame_noise_energy;
-      gf_group_skip_pct += this_frame->intra_skip_pct;
-      gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
-      gf_group_inter += this_frame->pcnt_inter;
-      gf_group_motion += this_frame->pcnt_motion;
+  if (!is_lossless_requested(&cpi->oxcf)) {
+    if (rc->frames_since_key >= rc->frames_to_key) {
+      // Increase the active best quality in the second half of key frame
+      // interval.
+      rc->arf_active_best_quality_adjustment_factor =
+          LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR +
+          (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) *
+              (rc->frames_to_key - gop_coding_frames) /
+              (VPXMAX(1, ((rc->frames_to_key + rc->frames_since_key) / 2 -
+                          gop_coding_frames)));
+      rc->arf_increase_active_best_quality = 1;
+    } else if ((rc->frames_to_key - gop_coding_frames) > 0) {
+      // Reduce the active best quality in the first half of key frame interval.
+      rc->arf_active_best_quality_adjustment_factor =
+          LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR +
+          (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) *
+              (rc->frames_since_key + gop_coding_frames) /
+              (VPXMAX(1, (rc->frames_to_key + rc->frames_since_key) / 2 +
+                             gop_coding_frames));
+      rc->arf_increase_active_best_quality = -1;
     }
-    rc->baseline_gf_interval = new_gf_interval;
   }
 
-  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+#ifdef AGGRESSIVE_VBR
+  // Limit maximum boost based on interval length.
+  rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 140);
+#else
+  rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 200);
+#endif
 
-  // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
+  // Cap the ARF boost when perceptual quality AQ mode is enabled. This is
+  // designed to improve the perceptual quality of high value content and to
+  // make consistent quality across consecutive frames. It will hurt objective
+  // quality.
+  if (oxcf->aq_mode == PERCEPTUAL_AQ)
+    rc->gfu_boost = VPXMIN(rc->gfu_boost, MIN_ARF_GF_BOOST);
+
+  rc->baseline_gf_interval = gop_coding_frames - rc->source_alt_ref_pending;
+
+  if (rc->source_alt_ref_pending)
+    is_alt_ref_flash = detect_flash(twopass, rc->baseline_gf_interval);
+
+  {
+    const double av_err = get_distribution_av_err(cpi, twopass);
+    const double mean_mod_score = twopass->mean_mod_score;
+    // If the first frame is a key frame or the overlay from a previous arf then
+    // the error score / cost of this frame has already been accounted for.
+    int start_idx = arf_active_or_kf ? 1 : 0;
+    int j;
+    for (j = start_idx; j < gop_coding_frames; ++j) {
+      int show_idx = gf_start_show_idx + j;
+      const FIRSTPASS_STATS *frame_stats =
+          fps_get_frame_stats(first_pass_info, show_idx);
+      if (frame_stats == NULL) {
+        if (cpi->ext_ratectrl.ready &&
+            (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+            cpi->ext_ratectrl.funcs.get_gop_decision != NULL) {
+          // Since in ext_ratectrl, gop_coding_frames means the count of both
+          // show and no show frames. Using this variable to access
+          // first_pass_info will trigger out-of-range error because
+          // first_pass_info only contains show frames. This part is used for
+          // computing gf_group_err which will be used to compute gf_group_bits
+          // for libvpx internal rate control. Since ext_ratectrl is using
+          // external rate control module, this part becomes non-critical.
+          // Hence, we can safely turn off this error reporting.
+          break;
+        }
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "In define_gf_group(), frame_stats is NULL when "
+                           "calculating gf_group_err.");
+        break;
+      }
+      // Accumulate error score of frames in this gf group.
+      gf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats,
+                                            mean_mod_score, av_err);
+      gf_group_raw_error += frame_stats->coded_error;
+      gf_group_noise += frame_stats->frame_noise_energy;
+      gf_group_skip_pct += frame_stats->intra_skip_pct;
+      gf_group_inactive_zone_rows += frame_stats->inactive_zone_rows;
+      gf_group_inter += frame_stats->pcnt_inter;
+      gf_group_motion += frame_stats->pcnt_motion;
+    }
+  }
 
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
+  gop_frames =
+      rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf;
+
+  // Store the average moise level measured for the group
+  // TODO(any): Experiment with removal of else condition (gop_frames = 0) so
+  // that consumption of group noise energy is based on previous gf group
+  if (gop_frames > 0)
+    twopass->gf_group.group_noise_energy = (int)(gf_group_noise / gop_frames);
+  else
+    twopass->gf_group.group_noise_energy = 0;
+
   // Calculate an estimate of the maxq needed for the group.
-  // We are more agressive about correcting for sections
+  // We are more aggressive about correcting for sections
   // where there could be significant overshoot than for easier
   // sections where we do not wish to risk creating an overshoot
   // of the allocated bit budget.
   if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
-    const int vbr_group_bits_per_frame =
-        (int)(gf_group_bits / rc->baseline_gf_interval);
-    const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
-    const double group_av_noise = gf_group_noise / rc->baseline_gf_interval;
-    const double group_av_skip_pct =
-        gf_group_skip_pct / rc->baseline_gf_interval;
-    const double group_av_inactive_zone =
-        ((gf_group_inactive_zone_rows * 2) /
-         (rc->baseline_gf_interval * (double)cm->mb_rows));
+    const int vbr_group_bits_per_frame = (int)(gf_group_bits / gop_frames);
+    const double group_av_err = gf_group_raw_error / gop_frames;
+    const double group_av_noise = gf_group_noise / gop_frames;
+    const double group_av_skip_pct = gf_group_skip_pct / gop_frames;
+    const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) /
+                                           (gop_frames * (double)cm->mb_rows));
     int tmp_q = get_twopass_worst_quality(
         cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
         group_av_noise, vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
+        (int)((tmp_q + (twopass->active_worst_quality *
+                        (twopass->active_wq_factor - 1))) /
+              twopass->active_wq_factor);
+
+#if CONFIG_ALWAYS_ADJUST_BPM
+    // Reset rolling actual and target bits counters for ARF groups.
+    twopass->rolling_arf_group_target_bits = 0;
+    twopass->rolling_arf_group_actual_bits = 0;
+#endif
   }
 
   // Context Adjustment of ARNR filter strength
   if (rc->baseline_gf_interval > 1) {
-    adjust_group_arnr_filter(cpi, (gf_group_noise / rc->baseline_gf_interval),
-                             (gf_group_inter / rc->baseline_gf_interval),
-                             (gf_group_motion / rc->baseline_gf_interval));
+    adjust_group_arnr_filter(cpi, (gf_group_noise / gop_frames),
+                             (gf_group_inter / gop_frames),
+                             (gf_group_motion / gop_frames));
   } else {
     twopass->arnr_strength_adjustment = 0;
   }
 
   // Calculate the extra bits to be used for boosted frame(s)
-  gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
-                                     gf_group_bits);
+  gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1),
+                                     rc->gfu_boost, gf_group_bits);
 
   // Adjust KF group bits and error remaining.
-  twopass->kf_group_error_left -= (int64_t)gf_group_err;
+  twopass->kf_group_error_left -= gf_group_err;
+
+  // Decide GOP structure.
+  if (gop_decision_ready) {
+    ext_rc_define_gf_group_structure(&gop_decision, &twopass->gf_group);
+    // Set the fb idx for the first frame in this GOP.
+    cpi->lst_fb_idx = twopass->gf_group.ext_rc_ref[0].last_index;
+    cpi->gld_fb_idx = twopass->gf_group.ext_rc_ref[0].golden_index;
+    cpi->alt_fb_idx = twopass->gf_group.ext_rc_ref[0].altref_index;
+  } else {
+    define_gf_group_structure(cpi);
+  }
 
   // Allocate bits to each of the frames in the GF group.
   allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
@@ -2379,32 +3040,94 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   reset_fpf_position(twopass, start_pos);
 
   // Calculate a section intra ratio used in setting max loop filter.
-  if (cpi->common.frame_type != KEY_FRAME) {
-    twopass->section_intra_rating = calculate_section_intra_ratio(
-        start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
-  }
+  twopass->section_intra_rating = calculate_section_intra_ratio(
+      start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
 
   if (oxcf->resize_mode == RESIZE_DYNAMIC) {
     // Default to starting GF groups at normal frame size.
     cpi->rc.next_frame_size_selector = UNSCALED;
   }
-
+#if !CONFIG_ALWAYS_ADJUST_BPM
   // Reset rolling actual and target bits counters for ARF groups.
   twopass->rolling_arf_group_target_bits = 0;
   twopass->rolling_arf_group_actual_bits = 0;
+#endif
+  rc->preserve_arf_as_gld = rc->preserve_next_arf_as_gld;
+  rc->preserve_next_arf_as_gld = 0;
+  // If alt ref frame is flash do not set preserve_arf_as_gld
+  if (!is_lossless_requested(&cpi->oxcf) && !cpi->use_svc &&
+      cpi->oxcf.aq_mode == NO_AQ && cpi->multi_layer_arf && !is_alt_ref_flash)
+    rc->preserve_next_arf_as_gld = 1;
+}
+
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// Clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for case where there is very low error either side of the current frame
+// but much higher just for this frame. This can help detect key frames in
+// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+                            const FIRSTPASS_STATS *last_frame,
+                            const FIRSTPASS_STATS *next_frame) {
+  return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+         (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+         (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
+
+// This test looks for anomalous changes in the nature of the intra signal
+// related to the previous and next frame as an indicator for coding a key
+// frame. This test serves to detect some additional scene cuts,
+// especially in lowish motion and low contrast sections, that are missed
+// by the other tests.
+static int intra_step_transition(const FIRSTPASS_STATS *this_frame,
+                                 const FIRSTPASS_STATS *last_frame,
+                                 const FIRSTPASS_STATS *next_frame) {
+  double last_ii_ratio;
+  double this_ii_ratio;
+  double next_ii_ratio;
+  double last_pcnt_intra = 1.0 - last_frame->pcnt_inter;
+  double this_pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double next_pcnt_intra = 1.0 - next_frame->pcnt_inter;
+  double mod_this_intra = this_pcnt_intra + this_frame->pcnt_neutral;
+
+  // Calculate ii ratio for this frame last frame and next frame.
+  last_ii_ratio =
+      last_frame->intra_error / DOUBLE_DIVIDE_CHECK(last_frame->coded_error);
+  this_ii_ratio =
+      this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  next_ii_ratio =
+      next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error);
+
+  // Return true the intra/inter ratio for the current frame is
+  // low but better in the next and previous frame and the relative usage of
+  // intra in the current frame is markedly higher than the last and next frame.
+  if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) &&
+      (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) &&
+      (this_pcnt_intra > (3 * next_pcnt_intra)) &&
+      ((this_pcnt_intra > 0.075) || (mod_this_intra > 0.85))) {
+    return 1;
+    // Very low inter intra ratio (i.e. not much gain from inter coding), most
+    // blocks neutral on coding method and better inter prediction either side
+  } else if ((this_ii_ratio < 1.25) && (mod_this_intra > 0.85) &&
+             (this_ii_ratio < last_ii_ratio * 0.9) &&
+             (this_ii_ratio < next_ii_ratio * 0.9)) {
+    return 1;
+  } else {
+    return 0;
+  }
 }
 
-// Threshold for use of the lagging second reference frame. High second ref
-// usage may point to a transient event like a flash or occlusion rather than
-// a real scene cut.
-#define SECOND_REF_USEAGE_THRESH 0.1
 // Minimum % intra coding observed in first pass (1.0 = 100%)
 #define MIN_INTRA_LEVEL 0.25
-// Minimum ratio between the % of intra coding and inter coding in the first
-// pass after discounting neutral blocks (discounting neutral blocks in this
-// way helps catch scene cuts in clips with very flat areas or letter box
-// format clips with image padding.
-#define INTRA_VS_INTER_THRESH 2.0
+// Threshold for use of the lagging second reference frame. Scene cuts do not
+// usually have a high second ref usage.
+#define SECOND_REF_USAGE_THRESH 0.2
 // Hard threshold where the first pass chooses intra for almost all blocks.
 // In such a case even if the frame is not a scene cut coding a key frame
 // may be a good option.
@@ -2412,80 +3135,75 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 // Maximum threshold for the relative ratio of intra error score vs best
 // inter error score.
 #define KF_II_ERR_THRESHOLD 2.5
-// In real scene cuts there is almost always a sharp change in the intra
-// or inter error score.
-#define ERR_CHANGE_THRESHOLD 0.4
-// For real scene cuts we expect an improvment in the intra inter error
-// ratio in the next frame.
-#define II_IMPROVEMENT_THRESHOLD 3.5
 #define KF_II_MAX 128.0
+#define II_FACTOR 12.5
+// Test for very low intra complexity which could cause false key frames
+#define V_LOW_INTRA 0.5
 
-static int test_candidate_kf(TWO_PASS *twopass,
-                             const FIRSTPASS_STATS *last_frame,
-                             const FIRSTPASS_STATS *this_frame,
-                             const FIRSTPASS_STATS *next_frame) {
+static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info,
+                             int show_idx) {
+  const FIRSTPASS_STATS *last_frame =
+      fps_get_frame_stats(first_pass_info, show_idx - 1);
+  const FIRSTPASS_STATS *this_frame =
+      fps_get_frame_stats(first_pass_info, show_idx);
+  const FIRSTPASS_STATS *next_frame =
+      fps_get_frame_stats(first_pass_info, show_idx + 1);
   int is_viable_kf = 0;
   double pcnt_intra = 1.0 - this_frame->pcnt_inter;
-  double modified_pcnt_inter =
-      this_frame->pcnt_inter - this_frame->pcnt_neutral;
 
   // Does the frame satisfy the primary criteria of a key frame?
   // See above for an explanation of the test criteria.
   // If so, then examine how well it predicts subsequent frames.
-  if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
-      (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+  detect_flash_from_frame_stats(next_frame);
+  if (!detect_flash_from_frame_stats(this_frame) &&
+      !detect_flash_from_frame_stats(next_frame) &&
+      (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) &&
       ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
-       ((pcnt_intra > MIN_INTRA_LEVEL) &&
-        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+       (slide_transition(this_frame, last_frame, next_frame)) ||
+       (intra_step_transition(this_frame, last_frame, next_frame)) ||
+       (((this_frame->coded_error > (next_frame->coded_error * 1.2)) &&
+         (this_frame->coded_error > (last_frame->coded_error * 1.2))) &&
+        (pcnt_intra > MIN_INTRA_LEVEL) &&
+        ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) &&
         ((this_frame->intra_error /
           DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
-         KF_II_ERR_THRESHOLD) &&
-        ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
-          ERR_CHANGE_THRESHOLD) ||
-         (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
-          ERR_CHANGE_THRESHOLD) ||
-         ((next_frame->intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
-          II_IMPROVEMENT_THRESHOLD))))) {
+         KF_II_ERR_THRESHOLD)))) {
     int i;
-    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
-    FIRSTPASS_STATS local_next_frame = *next_frame;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
     double decay_accumulator = 1.0;
 
     // Examine how well the key frame predicts subsequent frames.
     for (i = 0; i < 16; ++i) {
-      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
-                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+      const FIRSTPASS_STATS *frame_stats =
+          fps_get_frame_stats(first_pass_info, show_idx + 1 + i);
+      double next_iiratio = (II_FACTOR * frame_stats->intra_error /
+                             DOUBLE_DIVIDE_CHECK(frame_stats->coded_error));
 
       if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
 
       // Cumulative effect of decay in prediction quality.
-      if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator *= local_next_frame.pcnt_inter;
+      if (frame_stats->pcnt_inter > 0.85)
+        decay_accumulator *= frame_stats->pcnt_inter;
       else
-        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+        decay_accumulator *= (0.85 + frame_stats->pcnt_inter) / 2.0;
 
       // Keep a running total.
       boost_score += (decay_accumulator * next_iiratio);
 
       // Test various breakout clauses.
-      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
-          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
-            0.20) &&
+      if ((frame_stats->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((frame_stats->pcnt_inter - frame_stats->pcnt_neutral) < 0.20) &&
            (next_iiratio < 3.0)) ||
           ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)) {
+          (frame_stats->intra_error < V_LOW_INTRA)) {
         break;
       }
 
       old_boost_score = boost_score;
 
       // Get the next frame details
-      if (EOF == input_stats(twopass, &local_next_frame)) break;
+      if (show_idx + 1 + i == fps_get_num_frames(first_pass_info) - 1) break;
     }
 
     // If there is tolerable prediction for at least the next 3 frames then
@@ -2493,9 +3211,6 @@ static int test_candidate_kf(TWO_PASS *twopass,
     if (boost_score > 30.0 && (i > 3)) {
       is_viable_kf = 1;
     } else {
-      // Reset the file position
-      reset_fpf_position(twopass, start_pos);
-
       is_viable_kf = 0;
     }
   }
@@ -2504,33 +3219,105 @@ static int test_candidate_kf(TWO_PASS *twopass,
 }
 
 #define FRAMES_TO_CHECK_DECAY 8
-#define KF_MAX_FRAME_BOOST 96.0
 #define MIN_KF_TOT_BOOST 300
-#define MAX_KF_TOT_BOOST 5400
-#define KF_BOOST_SCAN_MAX_FRAMES 32
+#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32
+#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48
+#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32
+#define KF_ABS_ZOOM_THRESH 6.0
 
-static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int i, j;
+int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf,
+                               const TWO_PASS *const twopass, int kf_show_idx,
+                               int min_gf_interval) {
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+  int j;
+  int frames_to_key;
+  int max_frames_to_key = first_pass_info->num_frames - kf_show_idx;
+  max_frames_to_key = VPXMIN(max_frames_to_key, oxcf->key_freq);
+
+  // Initialize the decay rates for the recent frames to check
+  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+  // Find the next keyframe.
+  if (!oxcf->auto_key) {
+    frames_to_key = max_frames_to_key;
+  } else {
+    frames_to_key = 1;
+    while (frames_to_key < max_frames_to_key) {
+      // Provided that we are not at the end of the file...
+      if (kf_show_idx + frames_to_key + 1 < first_pass_info->num_frames) {
+        double loop_decay_rate;
+        double decay_accumulator;
+        const FIRSTPASS_STATS *next_frame = fps_get_frame_stats(
+            first_pass_info, kf_show_idx + frames_to_key + 1);
+
+        // Check for a scene cut.
+        if (test_candidate_kf(first_pass_info, kf_show_idx + frames_to_key))
+          break;
+
+        // How fast is the prediction quality decaying?
+        loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
+
+        // We want to know something about the recent past... rather than
+        // as used elsewhere where we are concerned with decay in prediction
+        // quality since the last GF or KF.
+        recent_loop_decay[(frames_to_key - 1) % FRAMES_TO_CHECK_DECAY] =
+            loop_decay_rate;
+        decay_accumulator = 1.0;
+        for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+          decay_accumulator *= recent_loop_decay[j];
+
+        // Special check for transition or high motion followed by a
+        // static scene.
+        if ((frames_to_key - 1) > min_gf_interval && loop_decay_rate >= 0.999 &&
+            decay_accumulator < 0.9) {
+          int still_interval = oxcf->key_freq - (frames_to_key - 1);
+          // TODO(angiebird): Figure out why we use "+1" here
+          int show_idx = kf_show_idx + frames_to_key;
+          if (check_transition_to_still(first_pass_info, show_idx,
+                                        still_interval)) {
+            break;
+          }
+        }
+      }
+      ++frames_to_key;
+    }
+  }
+  return frames_to_key;
+}
+
+static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
+  int i;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const FIRSTPASS_STATS first_frame = *this_frame;
+  const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+  const FRAME_INFO *frame_info = &cpi->frame_info;
   const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  const FIRSTPASS_STATS *keyframe_stats =
+      fps_get_frame_stats(first_pass_info, kf_show_idx);
   FIRSTPASS_STATS next_frame;
-  FIRSTPASS_STATS last_frame;
   int kf_bits = 0;
-  double decay_accumulator = 1.0;
+  int64_t max_kf_bits;
   double zero_motion_accumulator = 1.0;
+  double zero_motion_sum = 0.0;
+  double zero_motion_avg;
+  double motion_compensable_sum = 0.0;
+  double motion_compensable_avg;
+  int num_frames = 0;
+  int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST;
   double boost_score = 0.0;
   double kf_mod_err = 0.0;
+  double kf_raw_err = 0.0;
   double kf_group_err = 0.0;
-  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
   double sr_accumulator = 0.0;
-
+  double abs_mv_in_out_accumulator = 0.0;
+  const double av_err = get_distribution_av_err(cpi, twopass);
+  const double mean_mod_score = twopass->mean_mod_score;
   vp9_zero(next_frame);
 
   cpi->common.frame_type = KEY_FRAME;
+  rc->frames_since_key = 0;
 
   // Reset the GF group data structures.
   vp9_zero(*gf_group);
@@ -2541,116 +3328,56 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Clear the alt ref active flag and last group multi arf flags as they
   // can never be set for a key frame.
   rc->source_alt_ref_active = 0;
-  cpi->multi_arf_last_grp_enabled = 0;
 
   // KF is always a GF so clear frames till next gf counter.
   rc->frames_till_gf_update_due = 0;
 
   rc->frames_to_key = 1;
 
-  twopass->kf_group_bits = 0;        // Total bits available to kf group
-  twopass->kf_group_error_left = 0;  // Group modified error score.
+  twopass->kf_group_bits = 0;          // Total bits available to kf group
+  twopass->kf_group_error_left = 0.0;  // Group modified error score.
 
-  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+  kf_raw_err = keyframe_stats->intra_error;
+  kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats,
+                                     mean_mod_score, av_err);
 
-  // Initialize the decay rates for the recent frames to check
-  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
-
-  // Find the next keyframe.
-  i = 0;
-  while (twopass->stats_in < twopass->stats_in_end &&
-         rc->frames_to_key < cpi->oxcf.key_freq) {
-    // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
-    // Load the next frame's stats.
-    last_frame = *this_frame;
-    input_stats(twopass, this_frame);
-
-    // Provided that we are not at the end of the file...
-    if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
-      double loop_decay_rate;
-
-      // Check for a scene cut.
-      if (test_candidate_kf(twopass, &last_frame, this_frame,
-                            twopass->stats_in))
-        break;
-
-      // How fast is the prediction quality decaying?
-      loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
-
-      // We want to know something about the recent past... rather than
-      // as used elsewhere where we are concerned with decay in prediction
-      // quality since the last GF or KF.
-      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
-      decay_accumulator = 1.0;
-      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
-        decay_accumulator *= recent_loop_decay[j];
-
-      // Special check for transition or high motion followed by a
-      // static scene.
-      if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
-                                     loop_decay_rate, decay_accumulator))
-        break;
-
-      // Step on to the next frame.
-      ++rc->frames_to_key;
-
-      // If we don't have a real key frame within the next two
-      // key_freq intervals then break out of the loop.
-      if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 &&
+      cpi->ext_ratectrl.funcs.get_key_frame_decision != NULL) {
+    vpx_rc_key_frame_decision_t key_frame_decision;
+    vpx_codec_err_t codec_status = vp9_extrc_get_key_frame_decision(
+        &cpi->ext_ratectrl, &key_frame_decision);
+    if (codec_status == VPX_CODEC_OK) {
+      rc->frames_to_key = key_frame_decision.key_frame_group_size;
     } else {
-      ++rc->frames_to_key;
+      vpx_internal_error(&cpi->common.error, codec_status,
+                         "vp9_extrc_get_key_frame_decision() failed");
     }
-    ++i;
+  } else {
+    rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx,
+                                                   rc->min_gf_interval);
   }
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
   // This code centers the extra kf if the actual natural interval
   // is between 1x and 2x.
-  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
-    FIRSTPASS_STATS tmp_frame = first_frame;
-
-    rc->frames_to_key /= 2;
-
-    // Reset to the start of the group.
-    reset_fpf_position(twopass, start_position);
-
-    kf_group_err = 0.0;
-
-    // Rescan to get the correct error data for the forced kf group.
-    for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
-      input_stats(twopass, &tmp_frame);
-    }
-    rc->next_key_frame_forced = 1;
-  } else if (twopass->stats_in == twopass->stats_in_end ||
-             rc->frames_to_key >= cpi->oxcf.key_freq) {
+  if (rc->frames_to_key >= cpi->oxcf.key_freq) {
     rc->next_key_frame_forced = 1;
   } else {
     rc->next_key_frame_forced = 0;
   }
 
-  if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
-    int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
-    int new_frame_to_key = (rc->frames_to_key + count) & (~count);
-    int j;
-    for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
-      if (EOF == input_stats(twopass, this_frame)) break;
-      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-    }
-    rc->frames_to_key = new_frame_to_key;
-  }
-
-  // Special case for the last key frame of the file.
-  if (twopass->stats_in >= twopass->stats_in_end) {
+  for (i = 0; i < rc->frames_to_key; ++i) {
+    const FIRSTPASS_STATS *frame_stats =
+        fps_get_frame_stats(first_pass_info, kf_show_idx + i);
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+    kf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats,
+                                          mean_mod_score, av_err);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
-  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+  if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) {
     // Maximum number of bits for a single normal frame (not key frame).
     const int max_bits = frame_max_bits(rc, &cpi->oxcf);
 
@@ -2659,8 +3386,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // Default allocation based on bits left and relative
     // complexity of the section.
-    twopass->kf_group_bits = (int64_t)(
-        twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+    twopass->kf_group_bits =
+        (int64_t)(twopass->bits_left *
+                  (kf_group_err / twopass->normalized_score_left));
 
     // Clip based on maximum per frame rate defined by the user.
     max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
@@ -2671,37 +3399,73 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
   twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits);
 
-  // Reset the first pass file position.
-  reset_fpf_position(twopass, start_position);
-
   // Scan through the kf group collating various stats used to determine
   // how many bits to spend on it.
   boost_score = 0.0;
 
+  for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1));
+       ++i) {
+    if (EOF == input_stats(twopass, &next_frame)) break;
+
+    zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion;
+    motion_compensable_sum +=
+        1 - (double)next_frame.coded_error / next_frame.intra_error;
+    num_frames++;
+  }
+
+  if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) {
+    zero_motion_avg = zero_motion_sum / num_frames;
+    motion_compensable_avg = motion_compensable_sum / num_frames;
+    kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16,
+                                        160 * motion_compensable_avg - 112));
+    kf_boost_scan_frames =
+        clamp(kf_boost_scan_frames, MIN_SCAN_FRAMES_FOR_KF_BOOST,
+              MAX_SCAN_FRAMES_FOR_KF_BOOST);
+  }
+  reset_fpf_position(twopass, start_position);
+
   for (i = 0; i < (rc->frames_to_key - 1); ++i) {
     if (EOF == input_stats(twopass, &next_frame)) break;
 
-    if (i <= KF_BOOST_SCAN_MAX_FRAMES) {
+    // The zero motion test here insures that if we mark a kf group as static
+    // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES.
+    // It also allows for a larger boost on long static groups.
+    if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) {
       double frame_boost;
       double zm_factor;
 
       // Monitor for static sections.
-      zero_motion_accumulator = VPXMIN(
-          zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+      // First frame in kf group the second ref indicator is invalid.
+      if (i > 0) {
+        zero_motion_accumulator =
+            VPXMIN(zero_motion_accumulator,
+                   get_zero_motion_factor(twopass, &next_frame));
+      } else {
+        zero_motion_accumulator =
+            next_frame.pcnt_inter - next_frame.pcnt_motion;
+      }
 
       // Factor 0.75-1.25 based on how much of frame is static.
       zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
 
       // The second (lagging) ref error is not valid immediately after
       // a key frame because either the lag has not built up (in the case of
-      // the first key frame or it points to a refernce before the new key
+      // the first key frame or it points to a reference before the new key
       // frame.
       if (i < 2) sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0,
-                                        KF_MAX_FRAME_BOOST * zm_factor);
+      frame_boost =
+          calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor);
 
       boost_score += frame_boost;
-      if (frame_boost < 25.00) break;
+
+      // Measure of zoom. Large zoom tends to indicate reduced boost.
+      abs_mv_in_out_accumulator +=
+          fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+
+      if ((frame_boost < 25.00) ||
+          (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) ||
+          (sr_accumulator > (kf_raw_err * 1.50)))
+        break;
     } else {
       break;
     }
@@ -2713,17 +3477,30 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
   // Calculate a section intra ratio used in setting max loop filter.
-  twopass->section_intra_rating = calculate_section_intra_ratio(
+  twopass->key_frame_section_intra_rating = calculate_section_intra_ratio(
       start_position, twopass->stats_in_end, rc->frames_to_key);
 
-  // Apply various clamps for min and max boost
-  rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
-  rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
-  rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+  // Special case for static / slide show content but don't apply
+  // if the kf group is very short.
+  if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
+    rc->kf_boost = (int)(twopass->kf_max_total_boost);
+  } else {
+    // Apply various clamps for min and max oost
+    rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
+    rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
+    rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost));
+  }
 
   // Work out how many bits to allocate for the key frame itself.
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
                                  twopass->kf_group_bits);
+  // Based on the spatial complexity, increase the bits allocated to key frame.
+  kf_bits +=
+      (int)((twopass->kf_group_bits - kf_bits) * (kf_mod_err / kf_group_err));
+  max_kf_bits =
+      twopass->kf_group_bits - (rc->frames_to_key - 1) * FRAME_OVERHEAD_BITS;
+  max_kf_bits = lclamp(max_kf_bits, 0, INT_MAX);
+  kf_bits = VPXMIN(kf_bits, (int)max_kf_bits);
 
   twopass->kf_group_bits -= kf_bits;
 
@@ -2731,14 +3508,15 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   gf_group->bit_allocation[0] = kf_bits;
   gf_group->update_type[0] = KF_UPDATE;
   gf_group->rf_level[0] = KF_STD;
+  gf_group->layer_depth[0] = 0;
 
   // Note the total error score of the kf group minus the key frame itself.
-  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+  twopass->kf_group_error_left = (kf_group_err - kf_mod_err);
 
   // Adjust the count of total modified error left.
   // The count of bits left is adjusted elsewhere based on real coded frame
   // sizes.
-  twopass->modified_error_left -= kf_group_err;
+  twopass->normalized_score_left -= kf_group_err;
 
   if (oxcf->resize_mode == RESIZE_DYNAMIC) {
     // Default to normal-sized frame on keyframes.
@@ -2746,114 +3524,117 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 }
 
-// Define the reference buffers that will be updated post encode.
-static void configure_buffer_updates(VP9_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
+// Configure image size specific vizier parameters.
+// Later these will be set via additional command line options
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) {
+  // When |use_vizier_rc_params| is 1, we expect the rc parameters below to
+  // have been initialised on the command line as adjustment factors such
+  // that a factor of 1.0 will match the default behavior when
+  // |use_vizier_rc_params| is 0
+  if (twopass->use_vizier_rc_params) {
+    twopass->active_wq_factor *= AV_WQ_FACTOR;
+    twopass->err_per_mb *= BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT;
+    if (twopass->sr_default_decay_limit > 1.0)  // > 1.0 here makes no sense
+      twopass->sr_default_decay_limit = 1.0;
+    twopass->sr_diff_factor *= 1.0;
+    twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost *= MAX_GF_BOOST;
+    // NOTE: In use max boost has precedence over min boost. So even if min is
+    // somehow set higher than max the final boost value will be clamped to the
+    // appropriate maximum.
+    twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST;
+    twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST;
+    twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST;
+    twopass->zm_factor *= DEFAULT_ZM_FACTOR;
+    if (twopass->zm_factor > 1.0)  // > 1.0 here makes no sense
+      twopass->zm_factor = 1.0;
 
-  cpi->rc.is_src_frame_alt_ref = 0;
-  switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
-    case KF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-    case LF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-    case GF_UPDATE:
-      cpi->refresh_last_frame = 1;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_alt_ref_frame = 0;
-      break;
-    case OVERLAY_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 1;
-      cpi->refresh_alt_ref_frame = 0;
-      cpi->rc.is_src_frame_alt_ref = 1;
-      break;
-    case ARF_UPDATE:
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
-      cpi->refresh_alt_ref_frame = 1;
-      break;
-    default: assert(0); break;
-  }
-  if (is_two_pass_svc(cpi)) {
-    if (cpi->svc.temporal_layer_id > 0) {
-      cpi->refresh_last_frame = 0;
-      cpi->refresh_golden_frame = 0;
+    // Correction for the fact that the kf_err_per_mb_factor default is
+    // already different for different video formats and ensures that a passed
+    // in value of 1.0 on the vizier command line will still match the current
+    // default.
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb *= 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb *= 500.0;
+    } else {
+      twopass->kf_err_per_mb *= 250.0;
+    }
+  } else {
+    // When |use_vizier_rc_params| is 0, use defaults.
+    twopass->active_wq_factor = AV_WQ_FACTOR;
+    twopass->err_per_mb = BASELINE_ERR_PER_MB;
+    twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT;
+    twopass->sr_diff_factor = 1.0;
+    twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
+    twopass->gf_max_total_boost = MAX_GF_BOOST;
+    twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST;
+    twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
+    twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST;
+    twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
+    twopass->zm_factor = DEFAULT_ZM_FACTOR;
+
+    if (screen_area < 1280 * 720) {
+      twopass->kf_err_per_mb = 2000.0;
+    } else if (screen_area < 1920 * 1080) {
+      twopass->kf_err_per_mb = 500.0;
+    } else {
+      twopass->kf_err_per_mb = 250.0;
     }
-    if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
-      cpi->refresh_golden_frame = 0;
-    if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0;
   }
 }
 
-static int is_skippable_frame(const VP9_COMP *cpi) {
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const SVC *const svc = &cpi->svc;
-  const TWO_PASS *const twopass =
-      is_two_pass_svc(cpi) ? &svc->layer_context[svc->spatial_layer_id].twopass
-                           : &cpi->twopass;
-
-  return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_in_start &&
-          twopass->stats_in < twopass->stats_in_end &&
-          (twopass->stats_in - 1)->pcnt_inter -
-                  (twopass->stats_in - 1)->pcnt_motion ==
-              1 &&
-          (twopass->stats_in - 2)->pcnt_inter -
-                  (twopass->stats_in - 2)->pcnt_motion ==
-              1 &&
-          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
 void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS this_frame;
+  const int show_idx = cm->current_video_frame;
 
-  int target_rate;
-  LAYER_CONTEXT *const lc =
-      is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
-                           : 0;
+  if (cpi->common.current_frame_coding_index == 0 &&
+      cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) {
+    const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
+        &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cm->error, codec_status,
+                         "vp9_extrc_send_firstpass_stats() failed");
+    }
+  }
 
   if (!twopass->stats_in) return;
 
-  // If this is an arf frame then we dont want to read the stats file or
+  // Configure image size specific vizier parameters
+  if (cm->current_video_frame == 0) {
+    unsigned int screen_area = (cm->width * cm->height);
+
+    vp9_init_vizier_params(twopass, screen_area);
+  }
+
+  // If this is an arf frame then we don't want to read the stats file or
   // advance the input pointer as we already have what we need.
   if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
     int target_rate;
-    configure_buffer_updates(cpi);
+
+    vp9_zero(this_frame);
+    this_frame =
+        cpi->twopass.stats_in_start[cm->current_video_frame +
+                                    gf_group->arf_src_offset[gf_group->index]];
+
+    vp9_configure_buffer_updates(cpi, gf_group->index);
+
     target_rate = gf_group->bit_allocation[gf_group->index];
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
     rc->base_frame_target = target_rate;
 
     cm->frame_type = INTER_FRAME;
 
-    if (lc != NULL) {
-      if (cpi->svc.spatial_layer_id == 0) {
-        lc->is_key_frame = 0;
-      } else {
-        lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
-
-        if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
-      }
-    }
-
-    // Do the firstpass stats indicate that this frame is skippable for the
-    // partition search?
-    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
-        (!cpi->use_svc || is_two_pass_svc(cpi))) {
-      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-    }
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+    twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
 
     return;
   }
@@ -2862,15 +3643,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
   if (cpi->oxcf.rc_mode == VPX_Q) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
-  } else if (cm->current_video_frame == 0 ||
-             (lc != NULL && lc->current_video_frame_in_layer == 0)) {
+  } else if (cm->current_video_frame == 0) {
     const int frames_left =
-        (int)(twopass->total_stats.count -
-              ((lc != NULL) ? lc->current_video_frame_in_layer
-                            : cm->current_video_frame));
+        (int)(twopass->total_stats.count - cm->current_video_frame);
     // Special case code for first frame.
-    const int section_target_bandwidth =
-        (int)(twopass->bits_left / frames_left);
+    int64_t section_target_bandwidth = twopass->bits_left / frames_left;
+    section_target_bandwidth = VPXMIN(section_target_bandwidth, INT_MAX);
     const double section_length = twopass->total_left_stats.count;
     const double section_error =
         twopass->total_left_stats.coded_error / section_length;
@@ -2885,7 +3663,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
     tmp_q = get_twopass_worst_quality(
         cpi, section_error, section_intra_skip + section_inactive_zone,
-        section_noise, section_target_bandwidth);
+        section_noise, (int)section_target_bandwidth);
 
     twopass->active_worst_quality = tmp_q;
     twopass->baseline_active_worst_quality = tmp_q;
@@ -2907,87 +3685,51 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 
   // Keyframe and section processing.
   if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
-    FIRSTPASS_STATS this_frame_copy;
-    this_frame_copy = this_frame;
     // Define next KF group and assign bits to it.
-    find_next_key_frame(cpi, &this_frame);
-    this_frame = this_frame_copy;
+    find_next_key_frame(cpi, show_idx);
   } else {
     cm->frame_type = INTER_FRAME;
   }
 
-  if (lc != NULL) {
-    if (cpi->svc.spatial_layer_id == 0) {
-      lc->is_key_frame = (cm->frame_type == KEY_FRAME);
-      if (lc->is_key_frame) {
-        cpi->ref_frame_flags &=
-            (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
-        lc->frames_from_key_frame = 0;
-        // Encode an intra only empty frame since we have a key frame.
-        cpi->svc.encode_intra_empty_frame = 1;
-      }
-    } else {
-      cm->frame_type = INTER_FRAME;
-      lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
-
-      if (lc->is_key_frame) {
-        cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
-        lc->frames_from_key_frame = 0;
-      }
-    }
-  }
-
   // Define a new GF/ARF group. (Should always enter here for key frames).
   if (rc->frames_till_gf_update_due == 0) {
-    define_gf_group(cpi, &this_frame);
-
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    if (lc != NULL) cpi->refresh_golden_frame = 1;
+    define_gf_group(cpi, show_idx);
 
 #if ARF_STATS_OUTPUT
     {
       FILE *fpfile;
       fpfile = fopen("arf.stt", "a");
       ++arf_count;
-      fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame,
-              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
-              rc->gfu_boost);
+      fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
+              cm->current_video_frame, rc->baseline_gf_interval, rc->kf_boost,
+              arf_count, rc->gfu_boost, cm->frame_type);
 
       fclose(fpfile);
     }
 #endif
   }
 
-  configure_buffer_updates(cpi);
-
-  // Do the firstpass stats indicate that this frame is skippable for the
-  // partition search?
-  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
-      (!cpi->use_svc || is_two_pass_svc(cpi))) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  if (rc->frames_till_gf_update_due == 0) {
+    if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.log_file) {
+      fprintf(cpi->ext_ratectrl.log_file, "GOP_INFO show_frame_count %d\n",
+              rc->baseline_gf_interval);
+    }
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
   }
 
-  target_rate = gf_group->bit_allocation[gf_group->index];
-  rc->base_frame_target = target_rate;
+  vp9_configure_buffer_updates(cpi, gf_group->index);
 
-  {
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    // The multiplication by 256 reverses a scaling factor of (>> 8)
-    // applied when combining MB error values for the frame.
-    twopass->mb_av_energy =
-        log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
-    twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
-  }
+  rc->base_frame_target = gf_group->bit_allocation[gf_group->index];
+
+  // The multiplication by 256 reverses a scaling factor of (>> 8)
+  // applied when combining MB error values for the frame.
+  twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+  twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
 
   // Update the total stats remaining structure.
   subtract_stats(&twopass->total_left_stats, &this_frame);
 }
 
-#define MINQ_ADJ_LIMIT 48
-#define MINQ_ADJ_LIMIT_CQ 20
-#define HIGH_UNDERSHOOT_RATIO 2
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -3015,8 +3757,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
     rc->rate_error_estimate = 0;
   }
 
-  if (cpi->common.frame_type != KEY_FRAME &&
-      !vp9_is_upper_layer_key_frame(cpi)) {
+  if (cpi->common.frame_type != KEY_FRAME) {
     twopass->kf_group_bits -= bits_used;
     twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
   }
@@ -3036,7 +3777,8 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
 
     // Extend min or Max Q range to account for imbalance from the base
     // value when using AQ.
-    if (cpi->oxcf.aq_mode != NO_AQ) {
+    if (cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != PSNR_AQ &&
+        cpi->oxcf.aq_mode != PERCEPTUAL_AQ) {
       if (cm->seg.aq_av_offset < 0) {
         // The balance of the AQ map tends towarda lowering the average Q.
         aq_extend_min = 0;
@@ -3086,7 +3828,8 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
         rc->vbr_bits_off_target_fast +=
             fast_extra_thresh - rc->projected_frame_size;
         rc->vbr_bits_off_target_fast =
-            VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+            VPXMIN(rc->vbr_bits_off_target_fast,
+                   (4 * (int64_t)rc->avg_frame_bandwidth));
 
         // Fast adaptation of minQ if necessary to use up the extra bits.
         if (rc->avg_frame_bandwidth) {
@@ -3104,3 +3847,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
     }
   }
 }
+
+FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass) {
+  return twopass->this_frame_stats;
+}
+FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass) {
+  return twopass->total_stats;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h
index 5541893dc8..9fdd5fcb1d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h
@@ -8,9 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
-#define VP9_ENCODER_VP9_FIRSTPASS_H_
+#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
+#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
 
+#include <assert.h>
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/encoder/vp9_firstpass_stats.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 
@@ -18,53 +22,42 @@
 extern "C" {
 #endif
 
-#if CONFIG_FP_MB_STATS
+#define INVALID_ROW (-1)
 
-#define FPMB_DCINTRA_MASK 0x01
-
-#define FPMB_MOTION_ZERO_MASK 0x02
-#define FPMB_MOTION_LEFT_MASK 0x04
-#define FPMB_MOTION_RIGHT_MASK 0x08
-#define FPMB_MOTION_UP_MASK 0x10
-#define FPMB_MOTION_DOWN_MASK 0x20
-
-#define FPMB_ERROR_SMALL_MASK 0x40
-#define FPMB_ERROR_LARGE_MASK 0x80
-#define FPMB_ERROR_SMALL_TH 2000
-#define FPMB_ERROR_LARGE_TH 48000
+#define MAX_ARF_LAYERS 6
+#define SECTION_NOISE_DEF 250.0
 
 typedef struct {
-  uint8_t *mb_stats_start;
-  uint8_t *mb_stats_end;
-} FIRSTPASS_MB_STATS;
-#endif
+  double frame_mb_intra_factor;
+  double frame_mb_brightness_factor;
+  double frame_mb_neutral_count;
+} FP_MB_FLOAT_STATS;
 
 typedef struct {
-  double frame;
-  double weight;
-  double intra_error;
-  double coded_error;
-  double sr_coded_error;
-  double frame_noise_energy;
-  double pcnt_inter;
-  double pcnt_motion;
-  double pcnt_second_ref;
-  double pcnt_neutral;
-  double intra_skip_pct;
-  double intra_smooth_pct;    // % of blocks that are smooth
-  double inactive_zone_rows;  // Image mask rows top and bottom.
-  double inactive_zone_cols;  // Image mask columns at left and right edges.
-  double MVr;
-  double mvr_abs;
-  double MVc;
-  double mvc_abs;
-  double MVrv;
-  double MVcv;
-  double mv_in_out_count;
-  double duration;
-  double count;
-  int64_t spatial_layer_id;
-} FIRSTPASS_STATS;
+  double intra_factor;
+  double brightness_factor;
+  int64_t coded_error;
+  int64_t sr_coded_error;
+  int64_t frame_noise_energy;
+  int64_t intra_error;
+  int intercount;
+  int second_ref_count;
+  double neutral_count;
+  double intra_count_low;   // Coded intra but low variance
+  double intra_count_high;  // Coded intra high variance
+  int intra_skip_count;
+  int image_data_start_row;
+  int mvcount;
+  int sum_mvr;
+  int sum_mvr_abs;
+  int sum_mvc;
+  int sum_mvc_abs;
+  int64_t sum_mvrs;
+  int64_t sum_mvcs;
+  int sum_in_vectors;
+  int intra_smooth_count;
+  int new_mv_count;
+} FIRSTPASS_DATA;
 
 typedef enum {
   KF_UPDATE = 0,
@@ -72,7 +65,9 @@ typedef enum {
   GF_UPDATE = 2,
   ARF_UPDATE = 3,
   OVERLAY_UPDATE = 4,
-  FRAME_UPDATE_TYPES = 5
+  MID_OVERLAY_UPDATE = 5,
+  USE_BUF_FRAME = 6,  // Use show existing frame, no ref buffer update
+  FRAME_UPDATE_TYPES = 7
 } FRAME_UPDATE_TYPE;
 
 #define FC_ANIMATION_THRESH 0.15
@@ -82,38 +77,83 @@ typedef enum {
   FRAME_CONTENT_TYPES = 2
 } FRAME_CONTENT_TYPE;
 
+typedef struct ExternalRcReference {
+  int last_index;
+  int golden_index;
+  int altref_index;
+} ExternalRcReference;
+
 typedef struct {
   unsigned char index;
-  unsigned char first_inter_index;
-  RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
-  FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
-  unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
-  int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+  RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 2];
+
+  int frame_start;
+  int frame_end;
+  // TODO(jingning): The array size of arf_stack could be reduced.
+  int arf_index_stack[MAX_LAG_BUFFERS * 2];
+  int top_arf_idx;
+  int stack_size;
+  int gf_group_size;
+  int max_layer_depth;
+  int allowed_max_layer_depth;
+  int group_noise_energy;
+
+  // Structure to store the reference information from external RC.
+  // Used to override reference frame decisions in libvpx.
+  ExternalRcReference ext_rc_ref[MAX_STATIC_GF_GROUP_LENGTH + 2];
+  int ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH + 2][REFS_PER_FRAME];
 } GF_GROUP;
 
+typedef struct {
+  const FIRSTPASS_STATS *stats;
+  int num_frames;
+} FIRST_PASS_INFO;
+
+static INLINE void fps_init_first_pass_info(FIRST_PASS_INFO *first_pass_info,
+                                            const FIRSTPASS_STATS *stats,
+                                            int num_frames) {
+  first_pass_info->stats = stats;
+  first_pass_info->num_frames = num_frames;
+}
+
+static INLINE int fps_get_num_frames(const FIRST_PASS_INFO *first_pass_info) {
+  return first_pass_info->num_frames;
+}
+
+static INLINE const FIRSTPASS_STATS *fps_get_frame_stats(
+    const FIRST_PASS_INFO *first_pass_info, int show_idx) {
+  if (show_idx < 0 || show_idx >= first_pass_info->num_frames) {
+    return NULL;
+  }
+  return &first_pass_info->stats[show_idx];
+}
+
 typedef struct {
   unsigned int section_intra_rating;
+  unsigned int key_frame_section_intra_rating;
   FIRSTPASS_STATS total_stats;
   FIRSTPASS_STATS this_frame_stats;
   const FIRSTPASS_STATS *stats_in;
   const FIRSTPASS_STATS *stats_in_start;
   const FIRSTPASS_STATS *stats_in_end;
+  FIRST_PASS_INFO first_pass_info;
   FIRSTPASS_STATS total_left_stats;
   int first_pass_done;
   int64_t bits_left;
-  double modified_error_min;
-  double modified_error_max;
-  double modified_error_left;
+  double mean_mod_score;
+  double normalized_score_left;
   double mb_av_energy;
   double mb_smooth_pct;
 
-#if CONFIG_FP_MB_STATS
-  uint8_t *frame_mb_stats_buf;
-  uint8_t *this_frame_mb_stats;
-  FIRSTPASS_MB_STATS firstpass_mb_stats;
-#endif
+  FP_MB_FLOAT_STATS *fp_mb_float_stats;
+
   // An indication of the content type of the current frame
   FRAME_CONTENT_TYPE fr_content_type;
 
@@ -121,7 +161,7 @@ typedef struct {
   int64_t kf_group_bits;
 
   // Error score of frames still to be coded in kf group
-  int64_t kf_group_error_left;
+  double kf_group_error_left;
 
   double bpm_factor;
   int rolling_arf_group_target_bits;
@@ -136,20 +176,46 @@ typedef struct {
   int extend_maxq;
   int extend_minq_fast;
   int arnr_strength_adjustment;
+  int last_qindex_of_arf_layer[MAX_ARF_LAYERS];
 
   GF_GROUP gf_group;
+
+  // Vizeir project experimental two pass rate control parameters.
+  // When |use_vizier_rc_params| is 1, the following parameters will
+  // be overwritten by pass in values. Otherwise, they are initialized
+  // by default values.
+  int use_vizier_rc_params;
+  double active_wq_factor;
+  double err_per_mb;
+  double sr_default_decay_limit;
+  double sr_diff_factor;
+  double kf_err_per_mb;
+  double kf_frame_min_boost;
+  double kf_frame_max_boost_first;  // Max for first kf in a chunk.
+  double kf_frame_max_boost_subs;   // Max for subsequent mid chunk kfs.
+  double kf_max_total_boost;
+  double gf_max_total_boost;
+  double gf_frame_max_boost;
+  double zm_factor;
 } TWO_PASS;
 
 struct VP9_COMP;
+struct ThreadData;
+struct TileDataEnc;
 
 void vp9_init_first_pass(struct VP9_COMP *cpi);
-void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
 void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
 void vp9_end_first_pass(struct VP9_COMP *cpi);
 
+void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
+                                       struct ThreadData *td,
+                                       FIRSTPASS_DATA *fp_acc_data,
+                                       struct TileDataEnc *tile_data,
+                                       MV *best_ref_mv, int mb_row);
+
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
-void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area);
 
 // Post encode update of the rate control parameters for 2-pass
 void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
@@ -157,8 +223,16 @@ void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
 void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width,
                           int *scaled_frame_height);
 
+struct VP9EncoderConfig;
+int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf,
+                               const TWO_PASS *const twopass, int kf_show_idx,
+                               int min_gf_interval);
+
+FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass);
+FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
+#endif  // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h
new file mode 100644
index 0000000000..01928e7816
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
+#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  double frame;
+  double weight;
+  double intra_error;
+  double coded_error;
+  double sr_coded_error;
+  double frame_noise_energy;
+  double pcnt_inter;
+  double pcnt_motion;
+  double pcnt_second_ref;
+  double pcnt_neutral;
+  double pcnt_intra_low;   // Coded intra but low variance
+  double pcnt_intra_high;  // Coded intra high variance
+  double intra_skip_pct;
+  double intra_smooth_pct;    // % of blocks that are smooth
+  double inactive_zone_rows;  // Image mask rows top and bottom.
+  double inactive_zone_cols;  // Image mask columns at left and right edges.
+  double MVr;
+  double mvr_abs;
+  double MVc;
+  double mvc_abs;
+  double MVrv;
+  double MVcv;
+  double mv_in_out_count;
+  double duration;
+  double count;
+  double new_mv_count;
+  int64_t spatial_layer_id;
+} FIRSTPASS_STATS;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c
new file mode 100644
index 0000000000..c74d523246
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c
@@ -0,0 +1,136 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_scale/yv12config.h"
+
+void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                  YV12_BUFFER_CONFIG *dst,
+                                  INTERP_FILTER filter_type, int phase_scaler) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+                                   src->v_buffer };
+  const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+  uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+  const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+  const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
+  int x, y, i;
+
+#if HAVE_SSSE3 || HAVE_NEON
+  // TODO(linfengz): The 4:3 specialized C code is disabled by default since
+  // it's much slower than the general version which calls vpx_scaled_2d() even
+  // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference
+  // for the platforms which have faster optimization.
+  if (4 * dst->y_crop_width == 3 * src_w &&
+      4 * dst->y_crop_height == 3 * src_h) {
+    // Specialize 4 to 3 scaling.
+    // Example pixel locations.
+    // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.)
+    //      phase_scaler = 0               |      phase_scaler = 8
+    //                                     |
+    //      X     O S   O   S O     X      |      O     O     O     O     O
+    //                                     |
+    //                                     |
+    //                                     |         S       S       S
+    //                                     |
+    //                                     |
+    //      O     O     O     O     O      |      O     O     O     O     O
+    //                                     |
+    //      S       S       S       S      |
+    //                                     |
+    //                                     |
+    //                                     |         S       S       S
+    //      O     O     O     O     O      |      O     O     O     O     O
+    //                                     |
+    //                                     |
+    //                                     |
+    //      S       S       S       S      |
+    //                                     |
+    //      O     O     O     O     O      |      O     O     O     O     O
+    //                                     |         S       S       S
+    //                                     |
+    //                                     |
+    //                                     |
+    //                                     |
+    //      X     O S   O   S O     X      |      O     O     O     O     O
+
+    const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width,
+                            dst->uv_crop_width };
+    const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height,
+                            dst->uv_crop_height };
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      const int dst_w = dst_ws[i];
+      const int dst_h = dst_hs[i];
+      const int src_stride = src_strides[i];
+      const int dst_stride = dst_strides[i];
+      for (y = 0; y < dst_h; y += 3) {
+        for (x = 0; x < dst_w; x += 3) {
+          const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3;
+          uint8_t *dst_ptr = dsts[i] + y * dst_stride + x;
+
+          // Must call c function because its optimization doesn't support 3x3.
+          vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                          phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3);
+        }
+      }
+    }
+  } else
+#endif
+  {
+    const int dst_w = dst->y_crop_width;
+    const int dst_h = dst->y_crop_height;
+
+    // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires
+    // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it
+    // needs to call vp9_scale_and_extend_frame_nonnormative() that supports
+    // arbitrary scaling.
+    const int x_step_q4 = 16 * src_w / dst_w;
+    const int y_step_q4 = 16 * src_h / dst_h;
+    if (x_step_q4 > 64 || y_step_q4 > 64) {
+      // This function is only called while cm->bit_depth is VPX_BITS_8.
+#if CONFIG_VP9_HIGHBITDEPTH
+      vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8);
+#else
+      vp9_scale_and_extend_frame_nonnormative(src, dst);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      return;
+    }
+
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      const int factor = (i == 0 || i == 3 ? 1 : 2);
+      const int src_stride = src_strides[i];
+      const int dst_stride = dst_strides[i];
+      for (y = 0; y < dst_h; y += 16) {
+        const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+        for (x = 0; x < dst_w; x += 16) {
+          const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+          const uint8_t *src_ptr = srcs[i] +
+                                   (y / factor) * src_h / dst_h * src_stride +
+                                   (x / factor) * src_w / dst_w;
+          uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                        x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                        16 * src_h / dst_h, 16 / factor, 16 / factor);
+        }
+      }
+    }
+  }
+
+  vpx_extend_frame_borders(dst);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h
new file mode 100644
index 0000000000..ad09c11198
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_
+#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_
+
+typedef enum {
+  FIRST_PASS_JOB,
+  ENCODE_JOB,
+  ARNR_JOB,
+  NUM_JOB_TYPES,
+} JOB_TYPE;
+
+// Encode job parameters
+typedef struct {
+  int vert_unit_row_num;  // Index of the vertical unit row
+  int tile_col_id;        // tile col id within a tile
+  int tile_row_id;        // tile col id within a tile
+} JobNode;
+
+// Job queue element parameters
+typedef struct {
+  // Pointer to the next link in the job queue
+  void *next;
+
+  // Job information context of the module
+  JobNode job_info;
+} JobQueue;
+
+// Job queue handle
+typedef struct {
+  // Pointer to the next link in the job queue
+  void *next;
+
+  // Counter to store the number of jobs picked up for processing
+  int num_jobs_acquired;
+} JobQueueHandle;
+
+#endif  // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
index 392cd5d418..ba4fe3d3b7 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
@@ -9,6 +9,7 @@
  */
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "./vpx_config.h"
 
@@ -64,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
     unsigned int i;
     ctx->max_sz = depth;
     ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    ctx->next_show_idx = 0;
     if (!ctx->buf) goto bail;
     for (i = 0; i < depth; i++)
       if (vpx_alloc_frame_buffer(
@@ -80,20 +82,18 @@ bail:
   return NULL;
 }
 
-#define USE_PARTIAL_COPY 0
+int vp9_lookahead_full(const struct lookahead_ctx *ctx) {
+  return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz;
+}
+
+int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx) {
+  return ctx->next_show_idx;
+}
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end,
-#if CONFIG_VP9_HIGHBITDEPTH
-                       int use_highbitdepth,
-#endif
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        vpx_enc_frame_flags_t flags) {
   struct lookahead_entry *buf;
-#if USE_PARTIAL_COPY
-  int row, col, active_end;
-  int mb_rows = (src->y_height + 15) >> 4;
-  int mb_cols = (src->y_width + 15) >> 4;
-#endif
   int width = src->y_crop_width;
   int height = src->y_crop_height;
   int uv_width = src->uv_crop_width;
@@ -101,8 +101,12 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
   int subsampling_x = src->subsampling_x;
   int subsampling_y = src->subsampling_y;
   int larger_dimensions, new_dimensions;
+#if !CONFIG_VP9_HIGHBITDEPTH
+  (void)use_highbitdepth;
+  assert(use_highbitdepth == 0);
+#endif
 
-  if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1;
+  if (vp9_lookahead_full(ctx)) return 1;
   ctx->sz++;
   buf = pop(ctx, &ctx->write_idx);
 
@@ -110,80 +114,50 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                    height != buf->img.y_crop_height ||
                    uv_width != buf->img.uv_crop_width ||
                    uv_height != buf->img.uv_crop_height;
-  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
-                      uv_width > buf->img.uv_width ||
-                      uv_height > buf->img.uv_height;
+  larger_dimensions =
+      width > buf->img.y_crop_width || height > buf->img.y_crop_height ||
+      uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height;
   assert(!larger_dimensions || new_dimensions);
 
-#if USE_PARTIAL_COPY
-  // TODO(jkoleszar): This is disabled for now, as
-  // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
-
-  // Only do this partial copy if the following conditions are all met:
-  // 1. Lookahead queue has has size of 1.
-  // 2. Active map is provided.
-  // 3. This is not a key frame, golden nor altref frame.
-  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
-    for (row = 0; row < mb_rows; ++row) {
-      col = 0;
-
-      while (1) {
-        // Find the first active macroblock in this row.
-        for (; col < mb_cols; ++col) {
-          if (active_map[col]) break;
-        }
-
-        // No more active macroblock in this row.
-        if (col == mb_cols) break;
-
-        // Find the end of active region in this row.
-        active_end = col;
-
-        for (; active_end < mb_cols; ++active_end) {
-          if (!active_map[active_end]) break;
-        }
-
-        // Only copy this active region.
-        vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
-                                            16, (active_end - col) << 4);
-
-        // Start again from the end of this active region.
-        col = active_end;
-      }
-
-      active_map += mb_cols;
-    }
-  } else {
-#endif
-    if (larger_dimensions) {
-      YV12_BUFFER_CONFIG new_img;
-      memset(&new_img, 0, sizeof(new_img));
-      if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
-                                 subsampling_y,
+  if (larger_dimensions) {
+    YV12_BUFFER_CONFIG new_img;
+    memset(&new_img, 0, sizeof(new_img));
+    if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                               subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                 use_highbitdepth,
+                               use_highbitdepth,
 #endif
-                                 VP9_ENC_BORDER_IN_PIXELS, 0))
-        return 1;
-      vpx_free_frame_buffer(&buf->img);
-      buf->img = new_img;
-    } else if (new_dimensions) {
-      buf->img.y_crop_width = src->y_crop_width;
-      buf->img.y_crop_height = src->y_crop_height;
-      buf->img.uv_crop_width = src->uv_crop_width;
-      buf->img.uv_crop_height = src->uv_crop_height;
-      buf->img.subsampling_x = src->subsampling_x;
-      buf->img.subsampling_y = src->subsampling_y;
-    }
-    // Partial copy not implemented yet
-    vp9_copy_and_extend_frame(src, &buf->img);
-#if USE_PARTIAL_COPY
+                               VP9_ENC_BORDER_IN_PIXELS, 0))
+      return 1;
+    vpx_free_frame_buffer(&buf->img);
+    buf->img = new_img;
+  } else if (new_dimensions) {
+    int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+    buf->img.y_width = src->y_width;
+    buf->img.y_height = src->y_height;
+    buf->img.uv_width = src->uv_width;
+    buf->img.uv_height = src->uv_height;
+    buf->img.y_crop_width = src->y_crop_width;
+    buf->img.y_crop_height = src->y_crop_height;
+    buf->img.uv_crop_width = src->uv_crop_width;
+    buf->img.uv_crop_height = src->uv_crop_height;
+    buf->img.subsampling_x = src->subsampling_x;
+    buf->img.subsampling_y = src->subsampling_y;
+    // Here the new width (src->y_crop_width) is <= the previous width
+    // (since otherwise it would enter the "larger_dimensions" code), so
+    // it is safe here to update the stride.
+    // The stride setting is taken from vpx_alloc_frame_buffer().
+    buf->img.y_stride =
+        ALIGN_POWER_OF_TWO((aligned_width + 2 * buf->img.border), 5);
+    buf->img.uv_stride = buf->img.y_stride >> subsampling_x;
   }
-#endif
+  vp9_copy_and_extend_frame(src, &buf->img);
 
   buf->ts_start = ts_start;
   buf->ts_end = ts_end;
   buf->flags = flags;
+  buf->show_idx = ctx->next_show_idx;
+  ++ctx->next_show_idx;
   return 0;
 }
 
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h
index 88be0ffcd5..6ac6736673 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h
@@ -8,17 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_
-#define VP9_ENCODER_VP9_LOOKAHEAD_H_
+#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_
+#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_
 
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx/vpx_integer.h"
 
-#if CONFIG_SPATIAL_SVC
-#include "vpx/vp8cx.h"
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -29,6 +25,7 @@ struct lookahead_entry {
   YV12_BUFFER_CONFIG img;
   int64_t ts_start;
   int64_t ts_end;
+  int show_idx; /*The show_idx of this frame*/
   vpx_enc_frame_flags_t flags;
 };
 
@@ -36,10 +33,12 @@ struct lookahead_entry {
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  int max_sz;                  /* Absolute size of the queue */
-  int sz;                      /* Number of buffers currently in the queue */
-  int read_idx;                /* Read index */
-  int write_idx;               /* Write index */
+  int max_sz;        /* Absolute size of the queue */
+  int sz;            /* Number of buffers currently in the queue */
+  int read_idx;      /* Read index */
+  int write_idx;     /* Write index */
+  int next_show_idx; /* The show_idx that will be assigned to the next frame
+                        being pushed in the queue*/
   struct lookahead_entry *buf; /* Buffer list */
 };
 
@@ -61,26 +60,36 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
  */
 void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
 
+/**\brief Check if lookahead is full
+ *
+ * \param[in] ctx         Pointer to the lookahead context
+ *
+ * Return 1 if lookahead is full, otherwise return 0.
+ */
+int vp9_lookahead_full(const struct lookahead_ctx *ctx);
+
+/**\brief Return the next_show_idx
+ *
+ * \param[in] ctx         Pointer to the lookahead context
+ *
+ * Return the show_idx that will be assigned to the next
+ * frame pushed by vp9_lookahead_push()
+ */
+int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx);
+
 /**\brief Enqueue a source buffer
  *
  * This function will copy the source image into a new framebuffer with
  * the expected stride/border.
  *
- * If active_map is non-NULL and there is only one frame in the queue, then copy
- * only active macroblocks.
- *
  * \param[in] ctx         Pointer to the lookahead context
  * \param[in] src         Pointer to the image to enqueue
  * \param[in] ts_start    Timestamp for the start of this frame
  * \param[in] ts_end      Timestamp for the end of this frame
  * \param[in] flags       Flags set on this frame
- * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end,
-#if CONFIG_VP9_HIGHBITDEPTH
-                       int use_highbitdepth,
-#endif
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
                        vpx_enc_frame_flags_t flags);
 
 /**\brief Get the next source buffer to encode
@@ -115,4 +124,4 @@ unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_LOOKAHEAD_H_
+#endif  // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c
index e000220b97..2f20a8fe6d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -45,20 +45,24 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv,
 
   mv_sf->search_method = HEX;
   vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param,
-                        x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv,
-                        dst_mv, 0, 0);
+                        cpi->sf.mv.search_method, x->errorperbit,
+                        cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0);
   mv_sf->search_method = old_search_method;
 
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
     uint32_t distortion;
     uint32_t sse;
+    // TODO(yunqing): may use higher tap interp filter than 2 taps if needed.
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
-        &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+        &v_fn_ptr, 0, mv_sf->subpel_search_level,
         cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
-        0);
+        0, USE_2_TAPS);
   }
 
   xd->mi[0]->mode = NEWMV;
@@ -66,9 +70,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv,
 
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
 
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
   return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                       xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 }
@@ -97,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
   // If the current best reference mv is not centered on 0,0 then do a 0,0
   // based search as well.
   if (ref_mv->row != 0 || ref_mv->col != 0) {
-    unsigned int tmp_err;
-    MV zero_ref_mv = { 0, 0 }, tmp_mv;
+    MV zero_ref_mv = { 0, 0 };
 
     tmp_err =
         do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col);
@@ -218,7 +218,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
 
   int mb_col, mb_row, offset = 0;
-  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  int mb_y_offset = 0;
   MV gld_top_mv = { 0, 0 };
   MODE_INFO mi_local;
   MODE_INFO mi_above, mi_left;
@@ -237,13 +237,11 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   xd->mi[0] = &mi_local;
   mi_local.sb_type = BLOCK_16X16;
   mi_local.ref_frame[0] = LAST_FRAME;
-  mi_local.ref_frame[1] = NONE;
+  mi_local.ref_frame[1] = NO_REF_FRAME;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     MV gld_left_mv = gld_top_mv;
     int mb_y_in_offset = mb_y_offset;
-    int arf_y_in_offset = arf_y_offset;
-    int gld_y_in_offset = gld_y_offset;
 
     // Set up limit values for motion vectors to prevent them extending outside
     // the UMV borders.
@@ -265,8 +263,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
       xd->left_mi = &mi_left;
 
       mb_y_in_offset += 16;
-      gld_y_in_offset += 16;
-      arf_y_in_offset += 16;
       x->mv_limits.col_min -= 16;
       x->mv_limits.col_max -= 16;
     }
@@ -275,8 +271,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
     xd->above_mi = &mi_above;
 
     mb_y_offset += buf->y_stride * 16;
-    gld_y_offset += golden_ref->y_stride * 16;
-    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
     x->mv_limits.row_min -= 16;
     x->mv_limits.row_max -= 16;
     offset += cm->mb_cols;
@@ -294,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
   int *arf_not_zz;
 
   CHECK_MEM_ERROR(
-      cm, arf_not_zz,
+      &cm->error, arf_not_zz,
       vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
 
   // We are not interested in results beyond the alt ref itself.
@@ -339,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
     }
   }
 
-  // Only bother with segmentation if over 10% of the MBs in static segment
-  // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
-  if (1) {
-    // Note % of blocks that are marked as static
-    if (cm->MBs)
-      cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+  // Note % of blocks that are marked as static
+  if (cm->MBs)
+    cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
 
-    // This error case should not be reachable as this function should
-    // never be called with the common data structure uninitialized.
-    else
-      cpi->static_mb_pct = 0;
-
-    vp9_enable_segmentation(&cm->seg);
-  } else {
+  // This error case should not be reachable as this function should
+  // never be called with the common data structure uninitialized.
+  else
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation(&cm->seg);
-  }
+
+  vp9_enable_segmentation(&cm->seg);
 
   // Free localy allocated storage
   vpx_free(arf_not_zz);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h
index df2fb98efa..7b629861d5 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_MBGRAPH_H_
-#define VP9_ENCODER_VP9_MBGRAPH_H_
+#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_
+#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -25,7 +25,9 @@ typedef struct {
   } ref[MAX_REF_FRAMES];
 } MBGRAPH_MB_STATS;
 
-typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
+typedef struct {
+  MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
 
 struct VP9_COMP;
 
@@ -35,4 +37,4 @@ void vp9_update_mbgraph_stats(struct VP9_COMP *cpi);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_MBGRAPH_H_
+#endif  // VPX_VP9_ENCODER_VP9_MBGRAPH_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
index 70deda8421..1f7f174105 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
@@ -21,6 +21,7 @@
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_reconinter.h"
 
 #include "vp9/encoder/vp9_encoder.h"
@@ -28,11 +29,6 @@
 
 // #define NEW_DIAMOND_SEARCH
 
-static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
-                                             const MV *mv) {
-  return &buf->buf[mv->row * buf->stride + mv->col];
-}
-
 void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
   int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
   int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
@@ -52,6 +48,24 @@ void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
   if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
 }
 
+void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits,
+                                    const MvLimits *umv_window_limits,
+                                    const MV *ref_mv) {
+  subpel_mv_limits->col_min = VPXMAX(umv_window_limits->col_min * 8,
+                                     ref_mv->col - MAX_FULL_PEL_VAL * 8);
+  subpel_mv_limits->col_max = VPXMIN(umv_window_limits->col_max * 8,
+                                     ref_mv->col + MAX_FULL_PEL_VAL * 8);
+  subpel_mv_limits->row_min = VPXMAX(umv_window_limits->row_min * 8,
+                                     ref_mv->row - MAX_FULL_PEL_VAL * 8);
+  subpel_mv_limits->row_max = VPXMIN(umv_window_limits->row_max * 8,
+                                     ref_mv->row + MAX_FULL_PEL_VAL * 8);
+
+  subpel_mv_limits->col_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->col_min);
+  subpel_mv_limits->col_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->col_max);
+  subpel_mv_limits->row_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->row_min);
+  subpel_mv_limits->row_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->row_max);
+}
+
 int vp9_init_search_range(int size) {
   int sr = 0;
   // Minimum search size no matter what the passed in value.
@@ -63,14 +77,6 @@ int vp9_init_search_range(int size) {
   return sr;
 }
 
-static INLINE int mv_cost(const MV *mv, const int *joint_cost,
-                          int *const comp_cost[2]) {
-  assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
-  assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
-  return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
-         comp_cost[1][mv->col];
-}
-
 int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
                     int *mvcost[2], int weight) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
@@ -82,24 +88,13 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
                        int *mvcost[2], int error_per_bit) {
   if (mvcost) {
     const MV diff = { mv->row - ref->row, mv->col - ref->col };
-    // This product sits at a 32-bit ceiling right now and any additional
-    // accuracy in either bit cost or error cost will cause it to overflow.
-    return ROUND_POWER_OF_TWO(
-        (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+    return (int)ROUND64_POWER_OF_TWO(
+        (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
         RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT +
             PIXEL_TRANSFORM_ERROR_SCALE);
   }
   return 0;
 }
-
-static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
-                          int sad_per_bit) {
-  const MV diff = { mv->row - ref->row, mv->col - ref->col };
-  return ROUND_POWER_OF_TWO(
-      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
-      VP9_PROB_COST_SHIFT);
-}
-
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
   int len;
   int ss_count = 0;
@@ -138,17 +133,6 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
   cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
-/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
- * from the same math as in mv_err_cost(). */
-#define MVC(r, c)                                                 \
-  (mvcost                                                         \
-       ? ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] +     \
-                     mvcost[0][((r)-rr)] + mvcost[1][((c)-rc)]) * \
-              error_per_bit +                                     \
-          8192) >>                                                \
-             14                                                   \
-       : 0)
-
 // convert motion vector component to offset for sv[a]f calc
 static INLINE int sp(int x) { return x & 7; }
 
@@ -158,54 +142,65 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
 
 #if CONFIG_VP9_HIGHBITDEPTH
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    int64_t tmpmse;                                                          \
-    if (second_pred == NULL) {                                               \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
-                         src_stride, &sse);                                  \
-    } else {                                                                 \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                          src_stride, &sse, second_pred);                    \
-    }                                                                        \
-    tmpmse = thismse;                                                        \
-    tmpmse += MVC(r, c);                                                     \
-    if (tmpmse >= INT_MAX) {                                                 \
-      v = INT_MAX;                                                           \
-    } else if ((v = (uint32_t)tmpmse) < besterr) {                           \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
-  }
+#define CHECK_BETTER(v, r, c)                                                  \
+  do {                                                                         \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+      int64_t tmpmse;                                                          \
+      const MV cb_mv = { r, c };                                               \
+      const MV cb_ref_mv = { rr, rc };                                         \
+      if (second_pred == NULL) {                                               \
+        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
+                           src_stride, &sse);                                  \
+      } else {                                                                 \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                            src_stride, &sse, second_pred);                    \
+      }                                                                        \
+      tmpmse = thismse;                                                        \
+      tmpmse +=                                                                \
+          mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit);     \
+      if (tmpmse >= INT_MAX) {                                                 \
+        v = INT_MAX;                                                           \
+      } else if ((v = (uint32_t)tmpmse) < besterr) {                           \
+        besterr = v;                                                           \
+        br = r;                                                                \
+        bc = c;                                                                \
+        *distortion = thismse;                                                 \
+        *sse1 = sse;                                                           \
+      }                                                                        \
+    } else {                                                                   \
+      v = INT_MAX;                                                             \
+    }                                                                          \
+  } while (0)
 #else
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    if (second_pred == NULL)                                                 \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
-                         src_stride, &sse);                                  \
-    else                                                                     \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                          src_stride, &sse, second_pred);                    \
-    if ((v = MVC(r, c) + thismse) < besterr) {                               \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
-  }
+#define CHECK_BETTER(v, r, c)                                                  \
+  do {                                                                         \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+      const MV cb_mv = { r, c };                                               \
+      const MV cb_ref_mv = { rr, rc };                                         \
+      if (second_pred == NULL)                                                 \
+        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
+                           src_stride, &sse);                                  \
+      else                                                                     \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                            src_stride, &sse, second_pred);                    \
+      if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost,                \
+                           error_per_bit) +                                    \
+               thismse) < besterr) {                                           \
+        besterr = v;                                                           \
+        br = r;                                                                \
+        bc = c;                                                                \
+        *distortion = thismse;                                                 \
+        *sse1 = sse;                                                           \
+      }                                                                        \
+    } else {                                                                   \
+      v = INT_MAX;                                                             \
+    }                                                                          \
+  } while (0)
 
 #endif
 #define FIRST_LEVEL_CHECKS                                       \
-  {                                                              \
+  do {                                                           \
     unsigned int left, right, up, down, diag;                    \
     CHECK_BETTER(left, tr, tc - hstep);                          \
     CHECK_BETTER(right, tr, tc + hstep);                         \
@@ -218,10 +213,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
       case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
       case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
     }                                                            \
-  }
+  } while (0)
 
 #define SECOND_LEVEL_CHECKS                                       \
-  {                                                               \
+  do {                                                            \
     int kr, kc;                                                   \
     unsigned int second;                                          \
     if (tr != br && tc != bc) {                                   \
@@ -250,58 +245,41 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
         case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
       }                                                           \
     }                                                             \
-  }
+  } while (0)
 
-// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
-// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
-// later in the same way.
-#define SECOND_LEVEL_CHECKS_BEST                \
-  {                                             \
-    unsigned int second;                        \
-    int br0 = br;                               \
-    int bc0 = bc;                               \
-    assert(tr == br || tc == bc);               \
-    if (tr == br && tc != bc) {                 \
-      kc = bc - tc;                             \
-    } else if (tr != br && tc == bc) {          \
-      kr = br - tr;                             \
-    }                                           \
-    CHECK_BETTER(second, br0 + kr, bc0);        \
-    CHECK_BETTER(second, br0, bc0 + kc);        \
-    if (br0 != br || bc0 != bc) {               \
-      CHECK_BETTER(second, br0 + kr, bc0 + kc); \
-    }                                           \
-  }
-
-#define SETUP_SUBPEL_SEARCH                                                \
-  const uint8_t *const z = x->plane[0].src.buf;                            \
-  const int src_stride = x->plane[0].src.stride;                           \
-  const MACROBLOCKD *xd = &x->e_mbd;                                       \
-  unsigned int besterr = UINT_MAX;                                         \
-  unsigned int sse;                                                        \
-  unsigned int whichdir;                                                   \
-  int thismse;                                                             \
-  const unsigned int halfiters = iters_per_step;                           \
-  const unsigned int quarteriters = iters_per_step;                        \
-  const unsigned int eighthiters = iters_per_step;                         \
-  const int y_stride = xd->plane[0].pre[0].stride;                         \
-  const int offset = bestmv->row * y_stride + bestmv->col;                 \
-  const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
-                                                                           \
-  int rr = ref_mv->row;                                                    \
-  int rc = ref_mv->col;                                                    \
-  int br = bestmv->row * 8;                                                \
-  int bc = bestmv->col * 8;                                                \
-  int hstep = 4;                                                           \
-  const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); \
-  const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); \
-  const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); \
-  const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); \
-  int tr = br;                                                             \
-  int tc = bc;                                                             \
-                                                                           \
-  bestmv->row *= 8;                                                        \
-  bestmv->col *= 8;
+#define SETUP_SUBPEL_SEARCH                                                 \
+  const uint8_t *const z = x->plane[0].src.buf;                             \
+  const int src_stride = x->plane[0].src.stride;                            \
+  const MACROBLOCKD *xd = &x->e_mbd;                                        \
+  unsigned int besterr = UINT_MAX;                                          \
+  unsigned int sse;                                                         \
+  unsigned int whichdir;                                                    \
+  int thismse;                                                              \
+  const unsigned int halfiters = iters_per_step;                            \
+  const unsigned int quarteriters = iters_per_step;                         \
+  const unsigned int eighthiters = iters_per_step;                          \
+  const int y_stride = xd->plane[0].pre[0].stride;                          \
+  const int offset = bestmv->row * y_stride + bestmv->col;                  \
+  const uint8_t *const y = xd->plane[0].pre[0].buf;                         \
+                                                                            \
+  int rr = ref_mv->row;                                                     \
+  int rc = ref_mv->col;                                                     \
+  int br = bestmv->row * 8;                                                 \
+  int bc = bestmv->col * 8;                                                 \
+  int hstep = 4;                                                            \
+  int minc, maxc, minr, maxr;                                               \
+  int tr = br;                                                              \
+  int tc = bc;                                                              \
+  MvLimits subpel_mv_limits;                                                \
+                                                                            \
+  vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); \
+  minc = subpel_mv_limits.col_min;                                          \
+  maxc = subpel_mv_limits.col_max;                                          \
+  minr = subpel_mv_limits.row_min;                                          \
+  maxr = subpel_mv_limits.row_max;                                          \
+                                                                            \
+  bestmv->row *= 8;                                                         \
+  bestmv->col *= 8
 
 static unsigned int setup_center_error(
     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
@@ -314,12 +292,12 @@ static unsigned int setup_center_error(
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
-      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
-                               y_stride);
+      vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w,
+                               h, CONVERT_TO_SHORTPTR(y + offset), y_stride);
       besterr =
           vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
     } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
       vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
@@ -334,7 +312,7 @@ static unsigned int setup_center_error(
   uint32_t besterr;
   (void)xd;
   if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
     vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
     besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
@@ -346,7 +324,7 @@ static unsigned int setup_center_error(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-static INLINE int divide_and_round(const int n, const int d) {
+static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) {
   return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
 }
 
@@ -364,20 +342,21 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) {
 // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
 // The code below is an integerized version of that.
 static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
-  *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
-                         (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
-  *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
-                         (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+  const int64_t x0 = (int64_t)cost_list[1] - cost_list[3];
+  const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3];
+  const int64_t x1 = (int64_t)cost_list[4] - cost_list[2];
+  const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2];
+  const int b = 1 << (bits - 1);
+  *ic = (int)divide_and_round(x0 * b, y0);
+  *ir = (int)divide_and_round(x1 * b, y1);
 }
 
-uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv,
-                                 const MV *ref_mv, int allow_hp,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 int forced_stop, int iters_per_step,
-                                 int *cost_list, int *mvjcost, int *mvcost[2],
-                                 uint32_t *distortion, uint32_t *sse1,
-                                 const uint8_t *second_pred, int w, int h) {
+uint32_t vp9_skip_sub_pixel_tree(
+    const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+    int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+    int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+    uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+    int h, int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
                                src_stride, y, y_stride, second_pred, w, h,
@@ -400,10 +379,7 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv,
   (void)sse;
   (void)thismse;
   (void)cost_list;
-
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return UINT_MAX;
+  (void)use_accurate_subpel_search;
 
   return besterr;
 }
@@ -413,7 +389,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
     int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
-    int h) {
+    int h, int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
                                src_stride, y, y_stride, second_pred, w, h,
@@ -425,12 +401,13 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
   (void)allow_hp;
   (void)forced_stop;
   (void)hstep;
+  (void)use_accurate_subpel_search;
 
   if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
       cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
       cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
     int ir, ic;
-    unsigned int minpt;
+    unsigned int minpt = INT_MAX;
     get_cost_surf_min(cost_list, &ir, &ic, 2);
     if (ir != 0 || ic != 0) {
       CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
@@ -470,10 +447,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
   bestmv->row = br;
   bestmv->col = bc;
 
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return UINT_MAX;
-
   return besterr;
 }
 
@@ -482,8 +455,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more(
     int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
-    int h) {
+    int h, int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
+  (void)use_accurate_subpel_search;
+
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
                                src_stride, y, y_stride, second_pred, w, h,
                                offset, mvjcost, mvcost, sse1, distortion);
@@ -534,10 +509,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more(
   bestmv->row = br;
   bestmv->col = bc;
 
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return UINT_MAX;
-
   return besterr;
 }
 
@@ -546,8 +517,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned(
     int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
-    int h) {
+    int h, int use_accurate_subpel_search) {
   SETUP_SUBPEL_SEARCH;
+  (void)use_accurate_subpel_search;
+
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
                                src_stride, y, y_stride, second_pred, w, h,
                                offset, mvjcost, mvcost, sse1, distortion);
@@ -620,10 +593,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned(
   bestmv->row = br;
   bestmv->col = bc;
 
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return UINT_MAX;
-
   return besterr;
 }
 
@@ -636,12 +605,125 @@ static const MV search_step_table[12] = {
 };
 /* clang-format on */
 
+static int accurate_sub_pel_search(
+    const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf,
+    const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp,
+    const uint8_t *const src_address, const int src_stride,
+    const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred,
+    int w, int h, uint32_t *sse) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t besterr;
+  assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16);
+  assert(w != 0 && h != 0);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
+    vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride,
+                                     pred16, w, this_mv, sf, w, h, 0, kernel,
+                                     MV_PRECISION_Q3, 0, 0, xd->bd);
+    if (second_pred != NULL) {
+      DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+      vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w,
+                               h, pred16, w);
+      besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address,
+                        src_stride, sse);
+    } else {
+      besterr =
+          vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse);
+    }
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+    vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
+                              0, kernel, MV_PRECISION_Q3, 0, 0);
+    if (second_pred != NULL) {
+      DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
+      vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
+      besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
+    } else {
+      besterr = vfp->vf(pred, w, src_address, src_stride, sse);
+    }
+  }
+  if (besterr >= UINT_MAX) return UINT_MAX;
+  return (int)besterr;
+#else
+  int besterr;
+  DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+  assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16);
+  assert(w != 0 && h != 0);
+  (void)xd;
+
+  vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
+                            0, kernel, MV_PRECISION_Q3, 0, 0);
+  if (second_pred != NULL) {
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
+    besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
+  } else {
+    besterr = vfp->vf(pred, w, src_address, src_stride, sse);
+  }
+  return besterr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// TODO(yunqing): this part can be further refactored.
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c)                                                \
+  do {                                                                        \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+      int64_t tmpmse;                                                         \
+      const MV cb_mv = { r, c };                                              \
+      const MV cb_ref_mv = { rr, rc };                                        \
+      thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
+                                        src_stride, y, y_stride, second_pred, \
+                                        w, h, &sse);                          \
+      tmpmse = thismse;                                                       \
+      tmpmse +=                                                               \
+          mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit);    \
+      if (tmpmse >= INT_MAX) {                                                \
+        v = INT_MAX;                                                          \
+      } else if ((v = (uint32_t)tmpmse) < besterr) {                          \
+        besterr = v;                                                          \
+        br = r;                                                               \
+        bc = c;                                                               \
+        *distortion = thismse;                                                \
+        *sse1 = sse;                                                          \
+      }                                                                       \
+    } else {                                                                  \
+      v = INT_MAX;                                                            \
+    }                                                                         \
+  } while (0)
+#else
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c)                                                \
+  do {                                                                        \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+      const MV cb_mv = { r, c };                                              \
+      const MV cb_ref_mv = { rr, rc };                                        \
+      thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
+                                        src_stride, y, y_stride, second_pred, \
+                                        w, h, &sse);                          \
+      if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost,               \
+                           error_per_bit) +                                   \
+               thismse) < besterr) {                                          \
+        besterr = v;                                                          \
+        br = r;                                                               \
+        bc = c;                                                               \
+        *distortion = thismse;                                                \
+        *sse1 = sse;                                                          \
+      }                                                                       \
+    } else {                                                                  \
+      v = INT_MAX;                                                            \
+    }                                                                         \
+  } while (0)
+
+#endif
+
 uint32_t vp9_find_best_sub_pixel_tree(
     const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
     int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
-    int h) {
+    int h, int use_accurate_subpel_search) {
   const uint8_t *const z = x->plane[0].src.buf;
   const uint8_t *const src_address = z;
   const int src_stride = x->plane[0].src.stride;
@@ -659,16 +741,32 @@ uint32_t vp9_find_best_sub_pixel_tree(
   int bc = bestmv->col * 8;
   int hstep = 4;
   int iter, round = 3 - forced_stop;
-  const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX);
-  const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX);
-  const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX);
-  const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX);
+
+  int minc, maxc, minr, maxr;
   int tr = br;
   int tc = bc;
   const MV *search_step = search_step_table;
   int idx, best_idx = -1;
   unsigned int cost_array[5];
   int kr, kc;
+  MvLimits subpel_mv_limits;
+
+  // TODO(yunqing): need to add 4-tap filter optimization to speed up the
+  // encoder.
+  const InterpKernel *kernel =
+      (use_accurate_subpel_search > 0)
+          ? ((use_accurate_subpel_search == USE_4_TAPS)
+                 ? vp9_filter_kernels[FOURTAP]
+                 : ((use_accurate_subpel_search == USE_8_TAPS)
+                        ? vp9_filter_kernels[EIGHTTAP]
+                        : vp9_filter_kernels[EIGHTTAP_SHARP]))
+          : vp9_filter_kernels[BILINEAR];
+
+  vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+  minc = subpel_mv_limits.col_min;
+  maxc = subpel_mv_limits.col_max;
+  minr = subpel_mv_limits.row_min;
+  maxr = subpel_mv_limits.row_max;
 
   if (!(allow_hp && use_mv_hp(ref_mv)))
     if (round == 3) round = 2;
@@ -688,16 +786,25 @@ uint32_t vp9_find_best_sub_pixel_tree(
       tr = br + search_step[idx].row;
       tc = bc + search_step[idx].col;
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
         MV this_mv;
         this_mv.row = tr;
         this_mv.col = tc;
-        if (second_pred == NULL)
-          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
-                             src_stride, &sse);
-        else
-          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, &sse, second_pred);
+
+        if (use_accurate_subpel_search) {
+          thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp,
+                                            src_address, src_stride, y,
+                                            y_stride, second_pred, w, h, &sse);
+        } else {
+          const uint8_t *const pre_address =
+              y + (tr >> 3) * y_stride + (tc >> 3);
+          if (second_pred == NULL)
+            thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, src_stride, &sse);
+          else
+            thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                                src_address, src_stride, &sse, second_pred);
+        }
+
         cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
                                                 mvcost, error_per_bit);
 
@@ -719,14 +826,21 @@ uint32_t vp9_find_best_sub_pixel_tree(
     tc = bc + kc;
     tr = br + kr;
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
       MV this_mv = { tr, tc };
-      if (second_pred == NULL)
-        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
-                           src_stride, &sse);
-      else
-        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address,
-                            src_stride, &sse, second_pred);
+      if (use_accurate_subpel_search) {
+        thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp,
+                                          src_address, src_stride, y, y_stride,
+                                          second_pred, w, h, &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+                             src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+      }
+
       cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                             error_per_bit);
 
@@ -748,10 +862,48 @@ uint32_t vp9_find_best_sub_pixel_tree(
       bc = tc;
     }
 
-    if (iters_per_step > 1 && best_idx != -1) SECOND_LEVEL_CHECKS_BEST;
+    if (iters_per_step > 0 && best_idx != -1) {
+      unsigned int second;
+      const int br0 = br;
+      const int bc0 = bc;
+      assert(tr == br || tc == bc);
 
-    tr = br;
-    tc = bc;
+      if (tr == br && tc != bc) {
+        kc = bc - tc;
+        if (iters_per_step == 1) {
+          if (use_accurate_subpel_search) {
+            CHECK_BETTER1(second, br0, bc0 + kc);
+          } else {
+            CHECK_BETTER(second, br0, bc0 + kc);
+          }
+        }
+      } else if (tr != br && tc == bc) {
+        kr = br - tr;
+        if (iters_per_step == 1) {
+          if (use_accurate_subpel_search) {
+            CHECK_BETTER1(second, br0 + kr, bc0);
+          } else {
+            CHECK_BETTER(second, br0 + kr, bc0);
+          }
+        }
+      }
+
+      if (iters_per_step > 1) {
+        if (use_accurate_subpel_search) {
+          CHECK_BETTER1(second, br0 + kr, bc0);
+          CHECK_BETTER1(second, br0, bc0 + kc);
+          if (br0 != br || bc0 != bc) {
+            CHECK_BETTER1(second, br0 + kr, bc0 + kc);
+          }
+        } else {
+          CHECK_BETTER(second, br0 + kr, bc0);
+          CHECK_BETTER(second, br0, bc0 + kc);
+          if (br0 != br || bc0 != bc) {
+            CHECK_BETTER(second, br0 + kr, bc0 + kc);
+          }
+        }
+      }
+    }
 
     search_step += 4;
     hstep >>= 1;
@@ -769,15 +921,11 @@ uint32_t vp9_find_best_sub_pixel_tree(
   bestmv->row = br;
   bestmv->col = bc;
 
-  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
-      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return UINT_MAX;
-
   return besterr;
 }
 
-#undef MVC
 #undef CHECK_BETTER
+#undef CHECK_BETTER1
 
 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
                                int range) {
@@ -805,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
   }
 
 #define MAX_PATTERN_SCALES 11
-#define MAX_PATTERN_CANDIDATES 8  // max number of canddiates per scale
+#define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
 #define PATTERN_CANDIDATES_REF 3  // number of refinement candidates
 
 // Calculate and return a sad+mvcost list around an integer best pel.
@@ -819,16 +967,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
   const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
   int br = best_mv->row;
   int bc = best_mv->col;
-  MV this_mv;
+  const MV mv = { br, bc };
   int i;
   unsigned int sse;
 
-  this_mv.row = br;
-  this_mv.col = bc;
   cost_list[0] =
-      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
+      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
                  in_what->stride, &sse) +
-      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+      mvsad_err_cost(x, &mv, &fcenter_mv, sadpb);
   if (check_bounds(&x->mv_limits, br, bc, 1)) {
     for (i = 0; i < 4; i++) {
       const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
@@ -858,13 +1004,12 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
 //
-static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
-                              int sad_per_bit, int do_init_search,
-                              int *cost_list, const vp9_variance_fn_ptr_t *vfp,
-                              int use_mvcost, const MV *center_mv, MV *best_mv,
-                              const int num_candidates[MAX_PATTERN_SCALES],
-                              const MV candidates[MAX_PATTERN_SCALES]
-                                                 [MAX_PATTERN_CANDIDATES]) {
+static int vp9_pattern_search(
+    const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit,
+    int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+    int use_mvcost, const MV *center_mv, MV *best_mv,
+    const int num_candidates[MAX_PATTERN_SCALES],
+    const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
@@ -889,7 +1034,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
                      in_what->stride) +
             mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
-  // Search all possible scales upto the search param around the center point
+  // Search all possible scales up to the search param around the center point
   // pick the scale of the point that is best as the starting scale of
   // further steps around it.
   if (do_init_search) {
@@ -1010,6 +1155,9 @@ static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
     } while (s--);
   }
 
+  best_mv->row = br;
+  best_mv->col = bc;
+
   // Returns the one-away integer pel sad values around the best as follows:
   // cost_list[0]: cost at the best integer pel
   // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
@@ -1017,11 +1165,8 @@ static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
   // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
   // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
   if (cost_list) {
-    const MV best_mv = { br, bc };
-    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list);
   }
-  best_mv->row = br;
-  best_mv->col = bc;
   return bestsad;
 }
 
@@ -1063,7 +1208,7 @@ static int vp9_pattern_search_sad(
                      in_what->stride) +
             mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
 
-  // Search all possible scales upto the search param around the center point
+  // Search all possible scales up to the search param around the center point
   // pick the scale of the point that is best as the starting scale of
   // further steps around it.
   if (do_init_search) {
@@ -1489,7 +1634,7 @@ static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
 
 // Exhuastive motion search around a given centre position with a given
 // step size.
-static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
                                   int range, int step, int sad_per_bit,
                                   const vp9_variance_fn_ptr_t *fn_ptr,
                                   const MV *center_mv) {
@@ -1575,10 +1720,342 @@ static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
   return best_sad;
 }
 
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+#if CONFIG_NON_GREEDY_MV
+static int64_t exhaustive_mesh_search_multi_step(
+    MV *best_mv, const MV *center_mv, int range, int step,
+    const struct buf_2d *src, const struct buf_2d *pre, int lambda,
+    const int_mv *nb_full_mvs, int full_mv_num, const MvLimits *mv_limits,
+    const vp9_variance_fn_ptr_t *fn_ptr) {
+  int64_t best_sad;
+  int r, c;
+  int start_col, end_col, start_row, end_row;
+  *best_mv = *center_mv;
+  best_sad =
+      ((int64_t)fn_ptr->sdf(src->buf, src->stride,
+                            get_buf_from_mv(pre, center_mv), pre->stride)
+       << LOG2_PRECISION) +
+      lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num);
+  start_row = VPXMAX(center_mv->row - range, mv_limits->row_min);
+  start_col = VPXMAX(center_mv->col - range, mv_limits->col_min);
+  end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
+  end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += step) {
+      const MV mv = { r, c };
+      int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+                                         get_buf_from_mv(pre, &mv), pre->stride)
+                    << LOG2_PRECISION;
+      if (sad < best_sad) {
+        sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
+        }
+      }
+    }
+  }
+  return best_sad;
+}
+
+static int64_t exhaustive_mesh_search_single_step(
+    MV *best_mv, const MV *center_mv, int range, const struct buf_2d *src,
+    const struct buf_2d *pre, int lambda, const int_mv *nb_full_mvs,
+    int full_mv_num, const MvLimits *mv_limits,
+    const vp9_variance_fn_ptr_t *fn_ptr) {
+  int64_t best_sad;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+
+  *best_mv = *center_mv;
+  best_sad =
+      ((int64_t)fn_ptr->sdf(src->buf, src->stride,
+                            get_buf_from_mv(pre, center_mv), pre->stride)
+       << LOG2_PRECISION) +
+      lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num);
+  start_row = VPXMAX(center_mv->row - range, mv_limits->row_min);
+  start_col = VPXMAX(center_mv->col - range, mv_limits->col_min);
+  end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
+  end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
+  for (r = start_row; r <= end_row; r += 1) {
+    c = start_col;
+    while (c + 3 <= end_col) {
+      unsigned int sads[4];
+      const uint8_t *addrs[4];
+      for (i = 0; i < 4; ++i) {
+        const MV mv = { r, c + i };
+        addrs[i] = get_buf_from_mv(pre, &mv);
+      }
+      fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+
+      for (i = 0; i < 4; ++i) {
+        int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
+        if (sad < best_sad) {
+          const MV mv = { r, c + i };
+          sad +=
+              lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
+          }
+        }
+      }
+      c += 4;
+    }
+    while (c <= end_col) {
+      const MV mv = { r, c };
+      int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+                                         get_buf_from_mv(pre, &mv), pre->stride)
+                    << LOG2_PRECISION;
+      if (sad < best_sad) {
+        sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
+        }
+      }
+      c += 1;
+    }
+  }
+  return best_sad;
+}
+
+static int64_t exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv,
+                                          int range, int step,
+                                          const vp9_variance_fn_ptr_t *fn_ptr,
+                                          const MV *center_mv, int lambda,
+                                          const int_mv *nb_full_mvs,
+                                          int full_mv_num) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *src = &x->plane[0].src;
+  const struct buf_2d *pre = &xd->plane[0].pre[0];
+  assert(step >= 1);
+  assert(is_mv_in(&x->mv_limits, center_mv));
+  if (step == 1) {
+    return exhaustive_mesh_search_single_step(
+        best_mv, center_mv, range, src, pre, lambda, nb_full_mvs, full_mv_num,
+        &x->mv_limits, fn_ptr);
+  }
+  return exhaustive_mesh_search_multi_step(best_mv, center_mv, range, step, src,
+                                           pre, lambda, nb_full_mvs,
+                                           full_mv_num, &x->mv_limits, fn_ptr);
+}
+
+static int64_t full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x,
+                                         MV *centre_mv_full,
+                                         const vp9_variance_fn_ptr_t *fn_ptr,
+                                         MV *dst_mv, int lambda,
+                                         const int_mv *nb_full_mvs,
+                                         int full_mv_num) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+  int64_t bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+      (interval > range)) {
+    printf("ERROR: invalid range\n");
+    assert(0);
+  }
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = clamp(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4,
+                MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme =
+      exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv,
+                                 lambda, nb_full_mvs, full_mv_num);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhaustive_mesh_search_new(
+          x, &temp_mv, sf->mesh_patterns[i].range,
+          sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs,
+          full_mv_num);
+
+      if (sf->mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  *dst_mv = temp_mv;
+
+  return bestsme;
+}
+
+static int64_t diamond_search_sad_new(const MACROBLOCK *x,
+                                      const search_site_config *cfg,
+                                      const MV *init_full_mv, MV *best_full_mv,
+                                      int search_param, int lambda, int *num00,
+                                      const vp9_variance_fn_ptr_t *fn_ptr,
+                                      const int_mv *nb_full_mvs,
+                                      int full_mv_num) {
+  int i, j, step;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address;
+
+  int64_t bestsad;
+  int best_site = -1;
+  int last_site = -1;
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  //  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+  const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+  const int tot_steps = cfg->total_steps - search_param;
+  vpx_clear_system_state();
+
+  *best_full_mv = *init_full_mv;
+  clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  *num00 = 0;
+
+  // Work out the start point for the search
+  in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride +
+            best_full_mv->col;
+  best_address = in_what;
+
+  // Check the starting position
+  {
+    const int64_t mv_dist =
+        (int64_t)fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+        << LOG2_PRECISION;
+    const int64_t mv_cost =
+        vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num);
+    bestsad = mv_dist + lambda * mv_cost;
+  }
+
+  i = 0;
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // All_in is true if every one of the points we are checking are within
+    // the bounds of the image.
+    all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min);
+    all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max);
+    all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min);
+    all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max);
+
+    // If all the pixels are within the bounds we don't check whether the
+    // search point is valid in this loop,  otherwise we check each point
+    // for validity..
+    if (all_in) {
+      unsigned int sad_array[4];
+
+      for (j = 0; j < cfg->searches_per_step; j += 4) {
+        unsigned char const *block_offset[4];
+
+        for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
+
+        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                       sad_array);
+
+        for (t = 0; t < 4; t++, i++) {
+          const int64_t mv_dist = (int64_t)sad_array[t] << LOG2_PRECISION;
+          if (mv_dist < bestsad) {
+            const MV this_mv = { best_full_mv->row + ss_mv[i].row,
+                                 best_full_mv->col + ss_mv[i].col };
+            const int64_t mv_cost =
+                vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num);
+            const int64_t thissad = mv_dist + lambda * mv_cost;
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = i;
+            }
+          }
+        }
+      }
+    } else {
+      for (j = 0; j < cfg->searches_per_step; j++) {
+        // Trap illegal vectors
+        const MV this_mv = { best_full_mv->row + ss_mv[i].row,
+                             best_full_mv->col + ss_mv[i].col };
+
+        if (is_mv_in(&x->mv_limits, &this_mv)) {
+          const uint8_t *const check_here = ss_os[i] + best_address;
+          const int64_t mv_dist =
+              (int64_t)fn_ptr->sdf(what, what_stride, check_here,
+                                   in_what_stride)
+              << LOG2_PRECISION;
+          if (mv_dist < bestsad) {
+            const int64_t mv_cost =
+                vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num);
+            const int64_t thissad = mv_dist + lambda * mv_cost;
+            if (thissad < bestsad) {
+              bestsad = thissad;
+              best_site = i;
+            }
+          }
+        }
+        i++;
+      }
+    }
+    if (best_site != last_site) {
+      best_full_mv->row += ss_mv[best_site].row;
+      best_full_mv->col += ss_mv[best_site].col;
+      best_address += ss_os[best_site];
+      last_site = best_site;
+    } else if (best_address == in_what) {
+      (*num00)++;
+    }
+  }
+  return bestsad;
+}
+
+int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row,
+                            int mi_col, int_mv *nb_full_mvs) {
+  const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize];
+  const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } };
+  int nb_full_mv_num = 0;
+  int i;
+  assert(mi_row % mi_height == 0);
+  assert(mi_col % mi_width == 0);
+  for (i = 0; i < NB_MVS_NUM; ++i) {
+    int r = dirs[i][0];
+    int c = dirs[i][1];
+    int brow = mi_row / mi_height + r;
+    int bcol = mi_col / mi_width + c;
+    if (brow >= 0 && brow < motion_field->block_rows && bcol >= 0 &&
+        bcol < motion_field->block_cols) {
+      if (vp9_motion_field_is_mv_set(motion_field, brow, bcol)) {
+        int_mv mv = vp9_motion_field_get_mv(motion_field, brow, bcol);
+        nb_full_mvs[nb_full_mv_num].as_mv = get_full_mv(&mv.as_mv);
+        ++nb_full_mv_num;
+      }
+    }
+  }
+  return nb_full_mv_num;
+}
+#endif  // CONFIG_NON_GREEDY_MV
+
 int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
-                             MV *ref_mv, MV *best_mv, int search_param,
-                             int sad_per_bit, int *num00,
-                             const vp9_variance_fn_ptr_t *fn_ptr,
+                             MV *ref_mv, uint32_t start_mv_sad, MV *best_mv,
+                             int search_param, int sad_per_bit, int *num00,
+                             const vp9_sad_fn_ptr_t *sad_fn_ptr,
                              const MV *center_mv) {
   int i, j, step;
 
@@ -1589,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
   const int in_what_stride = xd->plane[0].pre[0].stride;
   const uint8_t *best_address;
 
-  unsigned int bestsad = INT_MAX;
+  unsigned int bestsad = start_mv_sad;
   int best_site = -1;
   int last_site = -1;
 
@@ -1607,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
   const int tot_steps = cfg->total_steps - search_param;
 
   const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
-           x->mv_limits.row_min, x->mv_limits.row_max);
   ref_row = ref_mv->row;
   ref_col = ref_mv->col;
   *num00 = 0;
@@ -1619,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
   in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
   best_address = in_what;
 
-  // Check the starting position
-  bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
-            mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
-
   i = 0;
 
   for (step = 0; step < tot_steps; step++) {
@@ -1646,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
 
         for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
 
-        fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
-                       sad_array);
+        sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+                           sad_array);
 
         for (t = 0; t < 4; t++, i++) {
           if (sad_array[t] < bestsad) {
@@ -1671,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
         if (is_mv_in(&x->mv_limits, &this_mv)) {
           const uint8_t *const check_here = ss_os[i] + best_address;
           unsigned int thissad =
-              fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+              sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
@@ -1784,12 +2255,15 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
 }
 
 static const MV search_pos[4] = {
-  { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+  { -1, 0 },
+  { 0, -1 },
+  { 0, 1 },
+  { 1, 0 },
 };
 
 unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
                                            BLOCK_SIZE bsize, int mi_row,
-                                           int mi_col) {
+                                           int mi_col, const MV *ref_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
@@ -1811,6 +2285,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   const int norm_factor = 3 + (bw >> 5);
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
       vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+  MvLimits subpel_mv_limits;
 
   if (scaled_ref_frame) {
     int i;
@@ -1822,18 +2297,19 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  {
-    unsigned int this_sad;
+  // TODO(jingning): Implement integral projection functions for high bit-depth
+  // setting and remove this part of code.
+  if (xd->bd != 8) {
+    const unsigned int sad = cpi->fn_ptr[bsize].sdf(
+        x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride);
     tmp_mv->row = 0;
     tmp_mv->col = 0;
-    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                      xd->plane[0].pre[0].buf, ref_stride);
 
     if (scaled_ref_frame) {
       int i;
       for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
     }
-    return this_sad;
+    return sad;
   }
 #endif
 
@@ -1873,7 +2349,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
 
   {
     const uint8_t *const pos[4] = {
-      ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
+      ref_buf - ref_stride,
+      ref_buf - 1,
+      ref_buf + 1,
+      ref_buf + ref_stride,
     };
 
     cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
@@ -1908,6 +2387,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   tmp_mv->row *= 8;
   tmp_mv->col *= 8;
 
+  vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+  clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
+           subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
@@ -1916,19 +2399,139 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   return best_sad;
 }
 
+static int get_exhaustive_threshold(int exhaustive_searches_thresh,
+                                    BLOCK_SIZE bsize) {
+  return exhaustive_searches_thresh >>
+         (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+}
+
+#if CONFIG_NON_GREEDY_MV
 // Runs sequence of diamond searches in smaller steps for RD.
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
-                              int step_param, int sadpb, int further_steps,
-                              int do_refine, int *cost_list,
+int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
+                               BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+                               int lambda, int do_refine,
+                               const int_mv *nb_full_mvs, int full_mv_num,
+                               MV *best_mv) {
+  const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  int n, num00 = 0;
+  int thissme;
+  int bestsme;
+  const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param;
+  const MV center_mv = { 0, 0 };
+  vpx_clear_system_state();
+  diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, best_mv, step_param, lambda,
+                         &n, fn_ptr, nb_full_mvs, full_mv_num);
+
+  bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  if (n > further_steps) do_refine = 0;
+
+  while (n < further_steps) {
+    ++n;
+    if (num00) {
+      num00--;
+    } else {
+      MV temp_mv;
+      diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+                             step_param + n, lambda, &num00, fn_ptr,
+                             nb_full_mvs, full_mv_num);
+      thissme = vp9_get_mvpred_var(x, &temp_mv, &center_mv, fn_ptr, 0);
+      // check to see if refining search is needed.
+      if (num00 > further_steps - n) do_refine = 0;
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        *best_mv = temp_mv;
+      }
+    }
+  }
+
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV temp_mv = *best_mv;
+    vp9_refining_search_sad_new(x, &temp_mv, lambda, search_range, fn_ptr,
+                                nb_full_mvs, full_mv_num);
+    thissme = vp9_get_mvpred_var(x, &temp_mv, &center_mv, fn_ptr, 0);
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      *best_mv = temp_mv;
+    }
+  }
+
+  if (sf->exhaustive_searches_thresh < INT_MAX &&
+      !cpi->rc.is_src_frame_alt_ref) {
+    const int64_t exhaustive_thr =
+        get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize);
+    if (bestsme > exhaustive_thr) {
+      full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda,
+                                nb_full_mvs, full_mv_num);
+      bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+    }
+  }
+  return bestsme;
+}
+#endif  // CONFIG_NON_GREEDY_MV
+
+// Runs sequence of diamond searches in smaller steps for RD.
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+              point as the best match, we will do a final 1-away diamond
+              refining search  */
+static int full_pixel_diamond(const VP9_COMP *const cpi,
+                              const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                              MV *mvp_full, int step_param, int sadpb,
+                              int further_steps, int do_refine,
+                              int use_downsampled_sad, int *cost_list,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *ref_mv, MV *dst_mv) {
   MV temp_mv;
   int thissme, n, num00 = 0;
-  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
-                                        step_param, sadpb, &n, fn_ptr, ref_mv);
+  int bestsme;
+  const int src_buf_stride = x->plane[0].src.stride;
+  const uint8_t *const src_buf = x->plane[0].src.buf;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int pred_buf_stride = xd->plane[0].pre[0].stride;
+  uint8_t *pred_buf;
+  vp9_sad_fn_ptr_t sad_fn_ptr;
+  unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows;
+  const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+  clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+
+  pred_buf =
+      xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+  start_mv_sad_even_rows =
+      fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+  start_mv_sad_odd_rows =
+      fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride,
+                   pred_buf + pred_buf_stride, pred_buf_stride);
+  start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1;
+  start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb);
+
+  sad_fn_ptr.sdf = fn_ptr->sdf;
+  sad_fn_ptr.sdx4df = fn_ptr->sdx4df;
+  if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) {
+    // If the absolute difference between the pred-to-src SAD of even rows and
+    // the pred-to-src SAD of odd rows is small, skip every other row in sad
+    // computation.
+    const int odd_to_even_diff_sad =
+        abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+    const int mult_thresh = 10;
+    if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+      sad_fn_ptr.sdf = fn_ptr->sdsf;
+      sad_fn_ptr.sdx4df = fn_ptr->sdsx4df;
+    }
+  }
+
+  bestsme =
+      cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv,
+                              step_param, sadpb, &n, &sad_fn_ptr, ref_mv);
   if (bestsme < INT_MAX)
     bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
   *dst_mv = temp_mv;
@@ -1943,9 +2546,9 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
     if (num00) {
       num00--;
     } else {
-      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
-                                        step_param + n, sadpb, &num00, fn_ptr,
-                                        ref_mv);
+      thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad,
+                                        &temp_mv, step_param + n, sadpb, &num00,
+                                        &sad_fn_ptr, ref_mv);
       if (thissme < INT_MAX)
         thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
 
@@ -1963,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
   if (do_refine) {
     const int search_range = 8;
     MV best_mv = *dst_mv;
-    thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
-                                      ref_mv);
+    thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
+                                      &sad_fn_ptr, ref_mv);
     if (thissme < INT_MAX)
       thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
     if (thissme < bestsme) {
@@ -1973,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
     }
   }
 
+  if (sad_fn_ptr.sdf != fn_ptr->sdf) {
+    // If we are skipping rows when we perform the motion search, we need to
+    // check the quality of skipping. If it's bad, then we run search with
+    // skip row features off.
+    const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv);
+    const int sad =
+        fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+    const int skip_sad =
+        fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+    // We will keep the result of skipping rows if it's good enough.
+    const int kSADThresh =
+        1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+    if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) {
+      // There is a large discrepancy between skipping and not skipping, so we
+      // need to redo the motion search.
+      return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb,
+                                further_steps, do_refine, 0, cost_list, fn_ptr,
+                                ref_mv, dst_mv);
+    }
+  }
+
   // Return cost list.
   if (cost_list) {
     calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
@@ -1980,13 +2604,11 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
   return bestsme;
 }
 
-#define MIN_RANGE 7
-#define MAX_RANGE 256
-#define MIN_INTERVAL 1
 // Runs an limited range exhaustive mesh search using a pattern set
 // according to the encode speed profile.
-static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
-                                 MV *centre_mv_full, int sadpb, int *cost_list,
+static int full_pixel_exhaustive(const VP9_COMP *const cpi,
+                                 const MACROBLOCK *const x, MV *centre_mv_full,
+                                 int sadpb, int *cost_list,
                                  const vp9_variance_fn_ptr_t *fn_ptr,
                                  const MV *ref_mv, MV *dst_mv) {
   const SPEED_FEATURES *const sf = &cpi->sf;
@@ -1998,9 +2620,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
   int range = sf->mesh_patterns[0].range;
   int baseline_interval_divisor;
 
-  // Keep track of number of exhaustive calls (this frame in this thread).
-  ++(*x->ex_search_count_ptr);
-
   // Trap illegal values for interval and range for this function.
   if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
       (interval > range))
@@ -2010,12 +2629,12 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Check size of proposed first range against magnitude of the centre
   // value used as a starting point.
-  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
-  range = VPXMIN(range, MAX_RANGE);
+  range = clamp(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4,
+                MAX_RANGE);
   interval = VPXMAX(interval, range / baseline_interval_divisor);
 
   // initial search
-  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+  bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
                                    sadpb, fn_ptr, &temp_mv);
 
   if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
@@ -2023,7 +2642,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
     // till we reach a step size of 1. Then break out.
     for (i = 1; i < MAX_MESH_STEP; ++i) {
       // First pass with coarser step and longer range
-      bestsme = exhuastive_mesh_search(
+      bestsme = exhaustive_mesh_search(
           x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
           sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
 
@@ -2042,200 +2661,94 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
   return bestsme;
 }
 
-int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp9_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
-  int r, c;
+#if CONFIG_NON_GREEDY_MV
+int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
+                                    int lambda, int search_range,
+                                    const vp9_variance_fn_ptr_t *fn_ptr,
+                                    const int_mv *nb_full_mvs,
+                                    int full_mv_num) {
   const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min);
-  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max);
-  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min);
-  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max);
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                  in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  *best_mv = *ref_mv;
-
-  for (r = row_min; r < row_max; ++r) {
-    for (c = col_min; c < col_max; ++c) {
-      const MV mv = { r, c };
-      const int sad =
-          fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
-                      in_what->stride) +
-          mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-      if (sad < best_sad) {
-        best_sad = sad;
-        *best_mv = mv;
-      }
-    }
+  const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv);
+  int64_t best_sad;
+  int i, j;
+  vpx_clear_system_state();
+  {
+    const int64_t mv_dist = (int64_t)fn_ptr->sdf(what->buf, what->stride,
+                                                 best_address, in_what->stride)
+                            << LOG2_PRECISION;
+    const int64_t mv_cost =
+        vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num);
+    best_sad = mv_dist + lambda * mv_cost;
   }
-  return best_sad;
-}
 
-int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp9_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
-  int r;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min);
-  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max);
-  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min);
-  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max);
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                  in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  *best_mv = *ref_mv;
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+    const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) &
+                       ((best_full_mv->row + 1) < x->mv_limits.row_max) &
+                       ((best_full_mv->col - 1) > x->mv_limits.col_min) &
+                       ((best_full_mv->col + 1) < x->mv_limits.col_max);
 
-  for (r = row_min; r < row_max; ++r) {
-    int c = col_min;
-    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
+    if (all_in) {
+      unsigned int sads[4];
+      const uint8_t *const positions[4] = { best_address - in_what->stride,
+                                            best_address - 1, best_address + 1,
+                                            best_address + in_what->stride };
 
-    if (fn_ptr->sdx3f != NULL) {
-      while ((c + 2) < col_max) {
-        int i;
-        DECLARE_ALIGNED(16, uint32_t, sads[3]);
+      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
 
-        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
-                      sads);
+      for (j = 0; j < 4; ++j) {
+        const MV mv = { best_full_mv->row + neighbors[j].row,
+                        best_full_mv->col + neighbors[j].col };
+        const int64_t mv_dist = (int64_t)sads[j] << LOG2_PRECISION;
+        const int64_t mv_cost =
+            vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+        const int64_t thissad = mv_dist + lambda * mv_cost;
+        if (thissad < best_sad) {
+          best_sad = thissad;
+          best_site = j;
+        }
+      }
+    } else {
+      for (j = 0; j < 4; ++j) {
+        const MV mv = { best_full_mv->row + neighbors[j].row,
+                        best_full_mv->col + neighbors[j].col };
 
-        for (i = 0; i < 3; ++i) {
-          unsigned int sad = sads[i];
-          if (sad < best_sad) {
-            const MV mv = { r, c };
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+        if (is_mv_in(&x->mv_limits, &mv)) {
+          const int64_t mv_dist =
+              (int64_t)fn_ptr->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &mv),
+                                   in_what->stride)
+              << LOG2_PRECISION;
+          const int64_t mv_cost =
+              vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+          const int64_t thissad = mv_dist + lambda * mv_cost;
+          if (thissad < best_sad) {
+            best_sad = thissad;
+            best_site = j;
           }
-          ++check_here;
-          ++c;
         }
       }
     }
 
-    while (c < col_max) {
-      unsigned int sad =
-          fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
-      if (sad < best_sad) {
-        const MV mv = { r, c };
-        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-        if (sad < best_sad) {
-          best_sad = sad;
-          *best_mv = mv;
-        }
-      }
-      ++check_here;
-      ++c;
-    }
-  }
-
-  return best_sad;
-}
-
-int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp9_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
-  int r;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min);
-  const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max);
-  const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min);
-  const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max);
-  const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
-  unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
-                  in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  *best_mv = *ref_mv;
-
-  for (r = row_min; r < row_max; ++r) {
-    int c = col_min;
-    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
-
-    if (fn_ptr->sdx8f != NULL) {
-      while ((c + 7) < col_max) {
-        int i;
-        DECLARE_ALIGNED(16, uint32_t, sads[8]);
-
-        fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
-                      sads);
-
-        for (i = 0; i < 8; ++i) {
-          unsigned int sad = sads[i];
-          if (sad < best_sad) {
-            const MV mv = { r, c };
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-          ++check_here;
-          ++c;
-        }
-      }
-    }
-
-    if (fn_ptr->sdx3f != NULL) {
-      while ((c + 2) < col_max) {
-        int i;
-        DECLARE_ALIGNED(16, uint32_t, sads[3]);
-
-        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
-                      sads);
-
-        for (i = 0; i < 3; ++i) {
-          unsigned int sad = sads[i];
-          if (sad < best_sad) {
-            const MV mv = { r, c };
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-          ++check_here;
-          ++c;
-        }
-      }
-    }
-
-    while (c < col_max) {
-      unsigned int sad =
-          fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride);
-      if (sad < best_sad) {
-        const MV mv = { r, c };
-        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-        if (sad < best_sad) {
-          best_sad = sad;
-          *best_mv = mv;
-        }
-      }
-      ++check_here;
-      ++c;
+    if (best_site == -1) {
+      break;
+    } else {
+      best_full_mv->row += neighbors[best_site].row;
+      best_full_mv->col += neighbors[best_site].col;
+      best_address = get_buf_from_mv(in_what, best_full_mv);
     }
   }
 
   return best_sad;
 }
+#endif  // CONFIG_NON_GREEDY_MV
 
 int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                             int search_range,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const vp9_sad_fn_ptr_t *sad_fn_ptr,
                             const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
@@ -2244,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
   const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
   const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
   unsigned int best_sad =
-      fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+      sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
       mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
@@ -2261,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                                             best_address - 1, best_address + 1,
                                             best_address + in_what->stride };
 
-      fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+      sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride,
+                         sads);
 
       for (j = 0; j < 4; ++j) {
         if (sads[j] < best_sad) {
@@ -2281,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
 
         if (is_mv_in(&x->mv_limits, &mv)) {
           unsigned int sad =
-              fn_ptr->sdf(what->buf, what->stride,
-                          get_buf_from_mv(in_what, &mv), in_what->stride);
+              sad_fn_ptr->sdf(what->buf, what->stride,
+                              get_buf_from_mv(in_what, &mv), in_what->stride);
           if (sad < best_sad) {
             sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
             if (sad < best_sad) {
@@ -2358,26 +2872,16 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
   return best_sad;
 }
 
-#define MIN_EX_SEARCH_LIMIT 128
-static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) {
+int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
+                          BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+                          int search_method, int error_per_bit, int *cost_list,
+                          const MV *ref_mv, MV *tmp_mv, int var_max, int rd) {
   const SPEED_FEATURES *const sf = &cpi->sf;
-  const int max_ex =
-      VPXMAX(MIN_EX_SEARCH_LIMIT,
-             (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
-
-  return sf->allow_exhaustive_searches &&
-         (sf->exhaustive_searches_thresh < INT_MAX) &&
-         (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
-}
-
-int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                          MV *mvp_full, int step_param, int error_per_bit,
-                          int *cost_list, const MV *ref_mv, MV *tmp_mv,
-                          int var_max, int rd) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const SEARCH_METHODS method = sf->mv.search_method;
-  vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+  const SEARCH_METHODS method = (SEARCH_METHODS)search_method;
+  const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
   int var = 0;
+  int run_exhaustive_search = 0;
+
   if (cost_list) {
     cost_list[0] = INT_MAX;
     cost_list[1] = INT_MAX;
@@ -2386,9 +2890,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     cost_list[4] = INT_MAX;
   }
 
-  // Keep track of number of searches (this frame in this thread).
-  ++(*x->m_search_count_ptr);
-
   switch (method) {
     case FAST_DIAMOND:
       var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2411,35 +2912,124 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                           fn_ptr, 1, ref_mv, tmp_mv);
       break;
     case NSTEP:
-      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                               MAX_MVSEARCH_STEPS - 1 - step_param, 1,
-                               cost_list, fn_ptr, ref_mv, tmp_mv);
-
-      // Should we allow a follow on exhaustive search?
-      if (is_exhaustive_allowed(cpi, x)) {
-        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
-        exhuastive_thr >>=
-            8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
-
-        // Threshold variance for an exhaustive full search.
-        if (var > exhuastive_thr) {
-          int var_ex;
-          MV tmp_mv_ex;
-          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit,
-                                         cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
-
-          if (var_ex < var) {
-            var = var_ex;
-            *tmp_mv = tmp_mv_ex;
-          }
-        }
-      }
+    case MESH:
+      var = full_pixel_diamond(
+          cpi, x, bsize, mvp_full, step_param, error_per_bit,
+          MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+          cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv);
       break;
-    default: assert(0 && "Invalid search method.");
+    default: assert(0 && "Unknown search method");
   }
 
-  if (method != NSTEP && rd && var < var_max)
+  if (method == NSTEP) {
+    if (sf->exhaustive_searches_thresh < INT_MAX &&
+        !cpi->rc.is_src_frame_alt_ref) {
+      const int64_t exhaustive_thr =
+          get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize);
+      if (var > exhaustive_thr) {
+        run_exhaustive_search = 1;
+      }
+    }
+  } else if (method == MESH) {
+    run_exhaustive_search = 1;
+  }
+
+  if (run_exhaustive_search) {
+    int var_ex;
+    MV tmp_mv_ex;
+    var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list,
+                                   fn_ptr, ref_mv, &tmp_mv_ex);
+    if (var_ex < var) {
+      var = var_ex;
+      *tmp_mv = tmp_mv_ex;
+    }
+  }
+
+  if (method != NSTEP && method != MESH && rd && var < var_max)
     var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
 
   return var;
 }
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test, which return extreme motion vectors allowed by the MV
+// limits.
+#define COMMON_MV_TEST \
+  SETUP_SUBPEL_SEARCH; \
+                       \
+  (void)error_per_bit; \
+  (void)vfp;           \
+  (void)z;             \
+  (void)src_stride;    \
+  (void)y;             \
+  (void)y_stride;      \
+  (void)second_pred;   \
+  (void)w;             \
+  (void)h;             \
+  (void)offset;        \
+  (void)mvjcost;       \
+  (void)mvcost;        \
+  (void)sse1;          \
+  (void)distortion;    \
+                       \
+  (void)halfiters;     \
+  (void)quarteriters;  \
+  (void)eighthiters;   \
+  (void)whichdir;      \
+  (void)allow_hp;      \
+  (void)forced_stop;   \
+  (void)hstep;         \
+  (void)rr;            \
+  (void)rc;            \
+                       \
+  (void)tr;            \
+  (void)tc;            \
+  (void)sse;           \
+  (void)thismse;       \
+  (void)cost_list;     \
+  (void)use_accurate_subpel_search
+
+// Return the maximum MV.
+uint32_t vp9_return_max_sub_pixel_mv(
+    const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+    int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+    int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+    uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+    int h, int use_accurate_subpel_search) {
+  COMMON_MV_TEST;
+
+  (void)minr;
+  (void)minc;
+
+  bestmv->row = maxr;
+  bestmv->col = maxc;
+  besterr = 0;
+
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
+  lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv));
+
+  return besterr;
+}
+// Return the minimum MV.
+uint32_t vp9_return_min_sub_pixel_mv(
+    const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+    int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+    int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+    uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+    int h, int use_accurate_subpel_search) {
+  COMMON_MV_TEST;
+
+  (void)maxr;
+  (void)maxc;
+
+  bestmv->row = minr;
+  bestmv->col = minc;
+  besterr = 0;
+
+  // In the sub-pel motion search, if hp is not used, then the last bit of mv
+  // has to be 0.
+  lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv));
+
+  return besterr;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h
index 2726b9e230..fd6a8b9aca 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h
@@ -8,10 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_MCOMP_H_
-#define VP9_ENCODER_VP9_MCOMP_H_
+#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_
+#define VPX_VP9_ENCODER_VP9_MCOMP_H_
 
 #include "vp9/encoder/vp9_block.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/encoder/vp9_non_greedy_mv.h"
+#endif  // CONFIG_NON_GREEDY_MV
 #include "vpx_dsp/variance.h"
 
 #ifdef __cplusplus
@@ -38,6 +41,16 @@ typedef struct search_site_config {
   int total_steps;
 } search_site_config;
 
+typedef struct vp9_sad_table {
+  vpx_sad_fn_t sdf;
+  vpx_sad_multi_d_fn_t sdx4df;
+} vp9_sad_fn_ptr_t;
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
 void vp9_init3smotion_compensation(search_site_config *cfg, int stride);
 
@@ -55,18 +68,20 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
 
 struct VP9_COMP;
 struct SPEED_FEATURES;
+struct vp9_sad_table;
 
 int vp9_init_search_range(int size);
 
 int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
-                            int sad_per_bit, int distance,
-                            const struct vp9_variance_vtable *fn_ptr,
+                            int error_per_bit, int search_range,
+                            const struct vp9_sad_table *sad_fn_ptr,
                             const struct mv *center_mv);
 
 // Perform integral projection based motion estimation.
 unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
                                            MACROBLOCK *x, BLOCK_SIZE bsize,
-                                           int mi_row, int mi_col);
+                                           int mi_row, int mi_col,
+                                           const MV *ref_mv);
 
 typedef uint32_t(fractional_mv_step_fp)(
     const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
@@ -74,28 +89,20 @@ typedef uint32_t(fractional_mv_step_fp)(
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
     uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
-    int h);
+    int h, int use_accurate_subpel_search);
 
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
 extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
-
-typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
-                                    int sad_per_bit, int distance,
-                                    const vp9_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, MV *best_mv);
-
-typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv,
-                                        int sad_per_bit, int distance,
-                                        const vp9_variance_fn_ptr_t *fn_ptr,
-                                        const MV *center_mv);
+extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
 
 typedef int (*vp9_diamond_search_fn_t)(
-    const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
-    int search_param, int sad_per_bit, int *num00,
-    const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+    const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv,
+    uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit,
+    int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv);
 
 int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
                              int search_range,
@@ -104,13 +111,68 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
 
 struct VP9_COMP;
 
-int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                          MV *mvp_full, int step_param, int error_per_bit,
-                          int *cost_list, const MV *ref_mv, MV *tmp_mv,
-                          int var_max, int rd);
+// "mvp_full" is the MV search starting point;
+// "ref_mv" is the context reference MV;
+// "tmp_mv" is the searched best MV.
+int vp9_full_pixel_search(const struct VP9_COMP *const cpi,
+                          const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                          MV *mvp_full, int step_param, int search_method,
+                          int error_per_bit, int *cost_list, const MV *ref_mv,
+                          MV *tmp_mv, int var_max, int rd);
 
+void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits,
+                                    const MvLimits *umv_window_limits,
+                                    const MV *ref_mv);
+
+#if CONFIG_NON_GREEDY_MV
+struct TplDepStats;
+int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
+                                    int lambda, int search_range,
+                                    const vp9_variance_fn_ptr_t *fn_ptr,
+                                    const int_mv *nb_full_mvs, int full_mv_num);
+
+int vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x,
+                               BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+                               int lambda, int do_refine,
+                               const int_mv *nb_full_mvs, int full_mv_num,
+                               MV *best_mv);
+
+static INLINE MV get_full_mv(const MV *mv) {
+  MV out_mv;
+  out_mv.row = mv->row >> 3;
+  out_mv.col = mv->col >> 3;
+  return out_mv;
+}
+struct TplDepFrame;
+int vp9_prepare_nb_full_mvs(const struct MotionField *motion_field, int mi_row,
+                            int mi_col, int_mv *nb_full_mvs);
+
+static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) {
+  BLOCK_SIZE square_bsize;
+  switch (bsize) {
+    case BLOCK_4X4:
+    case BLOCK_4X8:
+    case BLOCK_8X4: square_bsize = BLOCK_4X4; break;
+    case BLOCK_8X8:
+    case BLOCK_8X16:
+    case BLOCK_16X8: square_bsize = BLOCK_8X8; break;
+    case BLOCK_16X16:
+    case BLOCK_16X32:
+    case BLOCK_32X16: square_bsize = BLOCK_16X16; break;
+    case BLOCK_32X32:
+    case BLOCK_32X64:
+    case BLOCK_64X32:
+    case BLOCK_64X64: square_bsize = BLOCK_32X32; break;
+    default:
+      square_bsize = BLOCK_INVALID;
+      assert(0 && "ERROR: invalid block size");
+      break;
+  }
+  return square_bsize;
+}
+#endif  // CONFIG_NON_GREEDY_MV
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_MCOMP_H_
+#endif  // VPX_VP9_ENCODER_VP9_MCOMP_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
new file mode 100644
index 0000000000..8437ce7531
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -0,0 +1,342 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx_util/vpx_pthread.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id) {
+  RowMTInfo *row_mt_info;
+  JobQueueHandle *job_queue_hdl = NULL;
+  void *next = NULL;
+  JobNode *job_info = NULL;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_handle = NULL;
+#endif
+
+  row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]);
+  job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  mutex_handle = &row_mt_info->job_mutex;
+#endif
+
+// lock the mutex for queue access
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(mutex_handle);
+#endif
+  next = job_queue_hdl->next;
+  if (next != NULL) {
+    JobQueue *job_queue = (JobQueue *)next;
+    job_info = &job_queue->job_info;
+    // Update the next job in the queue
+    job_queue_hdl->next = job_queue->next;
+    job_queue_hdl->num_jobs_acquired++;
+  }
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(mutex_handle);
+#endif
+
+  return job_info;
+}
+
+void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi,
+                                TileDataEnc *const this_tile) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int i;
+
+  if (this_tile->row_base_thresh_freq_fact != NULL) {
+    if (sb_rows <= this_tile->sb_rows) {
+      return;
+    }
+    vpx_free(this_tile->row_base_thresh_freq_fact);
+    this_tile->row_base_thresh_freq_fact = NULL;
+  }
+  CHECK_MEM_ERROR(
+      &cm->error, this_tile->row_base_thresh_freq_fact,
+      (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
+                        sizeof(*(this_tile->row_base_thresh_freq_fact))));
+  for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
+    this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT;
+  this_tile->sb_rows = sb_rows;
+}
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
+  struct VP9Common *cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_row, tile_col;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int jobs_per_tile_col, total_jobs;
+
+  // Allocate memory that is large enough for all row_mt stages. First pass
+  // uses 16x16 block size.
+  jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows);
+  // Calculate the total number of jobs
+  total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->allocated_tile_cols = tile_cols;
+  multi_thread_ctxt->allocated_tile_rows = tile_rows;
+  multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
+
+  CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue,
+                  (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
+
+#if CONFIG_MULTITHREAD
+  // Create mutex for each tile
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    pthread_mutex_init(&row_mt_info->job_mutex, NULL);
+  }
+#endif
+
+  // Allocate memory for row based multi-threading
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+  }
+
+  // Assign the sync pointer of tile row zero for every tile row > 0
+  for (tile_row = 1; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileDataEnc *this_col_tile = &cpi->tile_data[tile_col];
+      this_tile->row_mt_sync = this_col_tile->row_mt_sync;
+    }
+  }
+
+  // Calculate the number of vertical units in the given tile row
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols];
+    TileInfo *tile_info = &this_tile->tile_info;
+    multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
+        get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+  }
+}
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_col;
+#if CONFIG_MULTITHREAD
+  int tile_row;
+#endif
+
+  // Deallocate memory for job queue
+  if (multi_thread_ctxt->job_queue) {
+    vpx_free(multi_thread_ctxt->job_queue);
+    multi_thread_ctxt->job_queue = NULL;
+  }
+
+#if CONFIG_MULTITHREAD
+  // Destroy mutex for each tile
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    pthread_mutex_destroy(&row_mt_info->job_mutex);
+  }
+#endif
+
+  // Free row based multi-threading sync memory
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+  }
+
+#if CONFIG_MULTITHREAD
+  for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
+       tile_row++) {
+    for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+         tile_col++) {
+      TileDataEnc *this_tile =
+          &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
+                          tile_col];
+      if (this_tile->row_base_thresh_freq_fact != NULL) {
+        vpx_free(this_tile->row_base_thresh_freq_fact);
+        this_tile->row_base_thresh_freq_fact = NULL;
+      }
+    }
+  }
+#endif
+
+  multi_thread_ctxt->allocated_tile_cols = 0;
+  multi_thread_ctxt->allocated_tile_rows = 0;
+  multi_thread_ctxt->allocated_vert_unit_rows = 0;
+}
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int i;
+
+  for (i = 0; i < tile_cols; i++) {
+    TileDataEnc *this_tile = &cpi->tile_data[i];
+    int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows;
+
+    // Initialize cur_col to -1 for all rows.
+    memset(this_tile->row_mt_sync.cur_col, -1,
+           sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col);
+    vp9_zero(this_tile->fp_data);
+    this_tile->fp_data.image_data_start_row = INVALID_ROW;
+  }
+}
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers) {
+  int tile_id = 0;
+  int i;
+
+  // Allocating the threads for the tiles
+  for (i = 0; i < num_workers; i++) {
+    multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
+    if (tile_id == tile_cols) tile_id = 0;
+  }
+}
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id) {
+  RowMTInfo *row_mt_info;
+  JobQueueHandle *job_queue_hndl;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex;
+#endif
+  int num_jobs_remaining;
+
+  row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id];
+  job_queue_hndl = &row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  mutex = &row_mt_info->job_mutex;
+#endif
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(mutex);
+#endif
+  num_jobs_remaining =
+      multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(mutex);
+#endif
+
+  return (num_jobs_remaining);
+}
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) {
+  VP9_COMMON *const cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  JobQueue *job_queue = multi_thread_ctxt->job_queue;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int tile_col, i;
+
+  switch (job_type) {
+    case ENCODE_JOB: jobs_per_tile_col = sb_rows; break;
+    case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break;
+    case ARNR_JOB:
+      jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT);
+      break;
+    default: assert(0);
+  }
+
+  total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col;
+  // memset the entire job queue buffer to zero
+  memset(job_queue, 0, total_jobs * sizeof(JobQueue));
+
+  // Job queue preparation
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col];
+    JobQueue *job_queue_curr, *job_queue_temp;
+    int tile_row = 0;
+
+    tile_ctxt->job_queue_hdl.next = (void *)job_queue;
+    tile_ctxt->job_queue_hdl.num_jobs_acquired = 0;
+
+    job_queue_curr = job_queue;
+    job_queue_temp = job_queue;
+
+    // loop over all the vertical rows
+    for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col;
+         job_row_num++, jobs_per_tile++) {
+      job_queue_curr->job_info.vert_unit_row_num = job_row_num;
+      job_queue_curr->job_info.tile_col_id = tile_col;
+      job_queue_curr->job_info.tile_row_id = tile_row;
+      job_queue_curr->next = (void *)(job_queue_temp + 1);
+      job_queue_curr = ++job_queue_temp;
+
+      if (ENCODE_JOB == job_type) {
+        if (jobs_per_tile >=
+            multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) {
+          tile_row++;
+          jobs_per_tile = -1;
+        }
+      }
+    }
+
+    // Set the last pointer to NULL
+    job_queue_curr += -1;
+    job_queue_curr->next = (void *)NULL;
+
+    // Move to the next tile
+    job_queue += jobs_per_tile_col;
+  }
+
+  for (i = 0; i < cpi->num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+    thread_data->thread_id = i;
+
+    for (tile_col = 0; tile_col < tile_cols; tile_col++)
+      thread_data->tile_completion_status[tile_col] = 0;
+  }
+}
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols) {
+  int tile_col;
+  int tile_id = -1;  // Stores the tile ID with minimum proc done
+  int max_num_jobs_remaining = 0;
+  int num_jobs_remaining;
+
+  // Mark the completion to avoid check in the loop
+  tile_completion_status[*cur_tile_id] = 1;
+  // Check for the status of all the tiles
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    if (tile_completion_status[tile_col] == 0) {
+      num_jobs_remaining =
+          vp9_get_job_queue_status(multi_thread_ctxt, tile_col);
+      // Mark the completion to avoid checks during future switches across tiles
+      if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1;
+      if (num_jobs_remaining > max_num_jobs_remaining) {
+        max_num_jobs_remaining = num_jobs_remaining;
+        tile_id = tile_col;
+      }
+    }
+  }
+
+  if (-1 == tile_id) {
+    return 1;
+  } else {
+    // Update the cur ID to the next tile ID that will be processed,
+    // which will be the least processed tile
+    *cur_tile_id = tile_id;
+    return 0;
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h
new file mode 100644
index 0000000000..a2276f4fe6
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_
+#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_job_queue.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id);
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type);
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id);
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers);
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi);
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi);
+
+void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi,
+                                TileDataEnc *const this_tile);
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi);
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols);
+
+#endif  // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c
index 2252fe16b9..4ee6e51ba8 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c
@@ -21,27 +21,42 @@
 #include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_encoder.h"
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) {
+  return (!cpi->use_svc ||
+          (cpi->use_svc &&
+           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
 void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
   ne->enabled = 0;
-  ne->level = kLowLow;
+  ne->level = (width * height < 1280 * 720) ? kLowLow : kLow;
   ne->value = 0;
   ne->count = 0;
-  ne->thresh = 100;
+  ne->thresh = 90;
   ne->last_w = 0;
   ne->last_h = 0;
   if (width * height >= 1920 * 1080) {
     ne->thresh = 200;
   } else if (width * height >= 1280 * 720) {
     ne->thresh = 140;
+  } else if (width * height >= 640 * 360) {
+    ne->thresh = 115;
   }
-  ne->num_frames_estimate = 20;
+  ne->num_frames_estimate = 15;
+  ne->adapt_thresh = (3 * ne->thresh) >> 1;
 }
 
 static int enable_noise_estimation(VP9_COMP *const cpi) {
-// Enable noise estimation if denoising is on, but not for low resolutions.
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && cpi->common.width >= 640 &&
-      cpi->common.height >= 360)
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+      cpi->common.width >= 320 && cpi->common.height >= 180)
     return 1;
 #endif
   // Only allow noise estimate under certain encoding mode.
@@ -51,8 +66,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) {
   if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
       cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
       cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc &&
-      cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 &&
-      cpi->common.height >= 360)
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+      cpi->common.width * cpi->common.height >= 640 * 360)
     return 1;
   else
     return 0;
@@ -83,7 +98,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
   } else {
     if (ne->value > ne->thresh)
       noise_level = kMedium;
-    else if (ne->value > ((9 * ne->thresh) >> 4))
+    else if (ne->value > (ne->thresh >> 1))
       noise_level = kLow;
     else
       noise_level = kLowLow;
@@ -94,24 +109,32 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
 void vp9_update_noise_estimate(VP9_COMP *const cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
   // Estimate of noise level every frame_period frames.
   int frame_period = 8;
   int thresh_consec_zeromv = 6;
-  unsigned int thresh_sum_diff = 100;
-  unsigned int thresh_sum_spatial = (200 * 200) << 8;
-  unsigned int thresh_spatial_var = (32 * 32) << 8;
-  int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
+  int frame_counter = cm->current_video_frame;
   // Estimate is between current source and last source.
   YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) last_source = &cpi->denoiser.last_source;
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+    last_source = &cpi->denoiser.last_source;
+    // Tune these thresholds for different resolutions when denoising is
+    // enabled.
+    if (cm->width > 640 && cm->width <= 1920) {
+      thresh_consec_zeromv = 2;
+    }
+  }
 #endif
   ne->enabled = enable_noise_estimation(cpi);
-  if (!ne->enabled || cm->current_video_frame % frame_period != 0 ||
-      last_source == NULL || ne->last_w != cm->width ||
-      ne->last_h != cm->height) {
+  if (cpi->svc.number_spatial_layers > 1)
+    frame_counter = cpi->svc.current_superframe;
+  if (!ne->enabled || frame_counter % frame_period != 0 ||
+      last_source == NULL ||
+      (cpi->svc.number_spatial_layers == 1 &&
+       (ne->last_w != cm->width || ne->last_h != cm->height))) {
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0)
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
       copy_frame(&cpi->denoiser.last_source, cpi->Source);
 #endif
     if (last_source != NULL) {
@@ -119,20 +142,30 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
       ne->last_h = cm->height;
     }
     return;
-  } else if (cpi->rc.avg_frame_low_motion < 50) {
+  } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+             cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+             cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+             cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
     // Force noise estimation to 0 and denoiser off if content has high motion.
     ne->level = kLowLow;
+    ne->count = 0;
+    ne->num_frames_estimate = 10;
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0)
-      vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+    if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+        cpi->svc.current_superframe > 1) {
+      vp9_denoiser_set_noise_level(cpi, ne->level);
+      copy_frame(&cpi->denoiser.last_source, cpi->Source);
+    }
 #endif
     return;
   } else {
-    int num_samples = 0;
-    uint64_t avg_est = 0;
+    unsigned int bin_size = 100;
+    unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+    unsigned int hist_avg[MAX_VAR_HIST_BINS];
+    unsigned int max_bin = 0;
+    unsigned int max_bin_count = 0;
+    unsigned int bin_cnt;
     int bsize = BLOCK_16X16;
-    static const unsigned char const_source[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
-                                                    0, 0, 0, 0, 0, 0, 0, 0 };
     // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
     // been encoded as zero/small mv at least x consecutive frames, compute
     // the variance to update estimate of noise in the source.
@@ -164,44 +197,36 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
           int bl_index1 = bl_index + 1;
           int bl_index2 = bl_index + cm->mi_cols;
           int bl_index3 = bl_index2 + 1;
-          // Only consider blocks that are likely steady background. i.e, have
-          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
-          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
-          // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
           int consec_zeromv =
               VPXMIN(cpi->consec_zero_mv[bl_index],
                      VPXMIN(cpi->consec_zero_mv[bl_index1],
                             VPXMIN(cpi->consec_zero_mv[bl_index2],
                                    cpi->consec_zero_mv[bl_index3])));
-          int is_skin = 0;
-          if (cpi->use_skin_detection) {
-            is_skin =
-                vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
-                                       src_uvstride, bsize, consec_zeromv, 0);
-          }
-          if (frame_low_motion &&
-              cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
-              cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
-              cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
-              cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
-              !is_skin) {
-            // Compute variance.
-            unsigned int sse;
-            unsigned int variance = cpi->fn_ptr[bsize].vf(
-                src_y, src_ystride, last_src_y, last_src_ystride, &sse);
-            // Only consider this block as valid for noise measurement if the
-            // average term (sse - variance = N * avg^{2}, N = 16X16) of the
-            // temporal residual is small (avoid effects from lighting change).
-            if ((sse - variance) < thresh_sum_diff) {
-              unsigned int sse2;
-              const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf(
-                  src_y, src_ystride, const_source, 0, &sse2);
-              // Avoid blocks with high brightness and high spatial variance.
-              if ((sse2 - spatial_variance) < thresh_sum_spatial &&
-                  spatial_variance < thresh_spatial_var) {
-                avg_est += variance / ((spatial_variance >> 9) + 1);
-                num_samples++;
-              }
+          // Only consider blocks that are likely steady background. i.e., have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. And exclude this frame if
+          // high_source_sad is true (i.e., scene/content change).
+          if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+              !cpi->rc.high_source_sad &&
+              !cpi->svc.high_source_sad_superframe) {
+            int is_skin = 0;
+            if (cpi->use_skin_detection) {
+              is_skin =
+                  vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
+                                         src_uvstride, bsize, consec_zeromv, 0);
+            }
+            if (!is_skin) {
+              unsigned int sse;
+              // Compute variance between co-located blocks from current and
+              // last input frames.
+              unsigned int variance = cpi->fn_ptr[bsize].vf(
+                  src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+              unsigned int hist_index = variance / bin_size;
+              if (hist_index < MAX_VAR_HIST_BINS)
+                hist[hist_index]++;
+              else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+                hist[MAX_VAR_HIST_BINS - 1]++;  // Account for the tail
             }
           }
         }
@@ -217,29 +242,61 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) {
     }
     ne->last_w = cm->width;
     ne->last_h = cm->height;
-    // Update noise estimate if we have at a minimum number of block samples,
-    // and avg_est > 0 (avg_est == 0 can happen if the application inputs
-    // duplicate frames).
-    if (num_samples > min_blocks_estimate && avg_est > 0) {
-      // Normalize.
-      avg_est = avg_est / num_samples;
-      // Update noise estimate.
-      ne->value = (int)((15 * ne->value + avg_est) >> 4);
-      ne->count++;
-      if (ne->count == ne->num_frames_estimate) {
-        // Reset counter and check noise level condition.
-        ne->num_frames_estimate = 30;
-        ne->count = 0;
-        ne->level = vp9_noise_estimate_extract_level(ne);
-#if CONFIG_VP9_TEMPORAL_DENOISING
-        if (cpi->oxcf.noise_sensitivity > 0)
-          vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
-#endif
+    // Adjust histogram to account for effect that histogram flattens
+    // and shifts to zero as scene darkens.
+    if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+      hist[0] = 0;
+      hist[1] >>= 2;
+      hist[2] >>= 2;
+      hist[3] >>= 2;
+      hist[4] >>= 1;
+      hist[5] >>= 1;
+      hist[6] = 3 * hist[6] >> 1;
+      hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+    }
+
+    // Average hist[] and find largest bin
+    for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+      if (bin_cnt == 0)
+        hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+      else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+        hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+      else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+        hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+                             (hist[bin_cnt + 1] >> 1) + 2) >>
+                            2;
+      else
+        hist_avg[bin_cnt] =
+            (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+            2;
+
+      if (hist_avg[bin_cnt] > max_bin_count) {
+        max_bin_count = hist_avg[bin_cnt];
+        max_bin = bin_cnt;
       }
     }
+
+    // Scale by 40 to work with existing thresholds
+    ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+    // Quickly increase VNR strength when the noise level increases suddenly.
+    if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+      ne->count = ne->num_frames_estimate;
+    } else {
+      ne->count++;
+    }
+    if (ne->count == ne->num_frames_estimate) {
+      // Reset counter and check noise level condition.
+      ne->num_frames_estimate = 30;
+      ne->count = 0;
+      ne->level = vp9_noise_estimate_extract_level(ne);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+      if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+        vp9_denoiser_set_noise_level(cpi, ne->level);
+#endif
+    }
   }
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0)
+  if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
     copy_frame(&cpi->denoiser.last_source, cpi->Source);
 #endif
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h
index 335cdbe643..7fc94ff8c9 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_
-#define VP9_ENCODER_NOISE_ESTIMATE_H_
+#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_
+#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_
 
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_skin_detection.h"
@@ -23,6 +23,8 @@
 extern "C" {
 #endif
 
+#define MAX_VAR_HIST_BINS 20
+
 typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
 
 typedef struct noise_estimate {
@@ -30,6 +32,7 @@ typedef struct noise_estimate {
   NOISE_LEVEL level;
   int value;
   int thresh;
+  int adapt_thresh;
   int count;
   int last_w;
   int last_h;
@@ -48,4 +51,4 @@ void vp9_update_noise_estimate(struct VP9_COMP *const cpi);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_NOISE_ESTIMATE_H_
+#endif  // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c
new file mode 100644
index 0000000000..d52801c845
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c
@@ -0,0 +1,536 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_mv.h"
+#include "vp9/encoder/vp9_non_greedy_mv.h"
+// TODO(angiebird): move non_greedy_mv related functions to this file
+
+#define LOG2_TABLE_SIZE 1024
+static const int log2_table[LOG2_TABLE_SIZE] = {
+  0,  // This is a dummy value
+  0,        1048576,  1661954,  2097152,  2434718,  2710530,  2943725,
+  3145728,  3323907,  3483294,  3627477,  3759106,  3880192,  3992301,
+  4096672,  4194304,  4286015,  4372483,  4454275,  4531870,  4605679,
+  4676053,  4743299,  4807682,  4869436,  4928768,  4985861,  5040877,
+  5093962,  5145248,  5194851,  5242880,  5289431,  5334591,  5378443,
+  5421059,  5462508,  5502851,  5542146,  5580446,  5617800,  5654255,
+  5689851,  5724629,  5758625,  5791875,  5824409,  5856258,  5887450,
+  5918012,  5947969,  5977344,  6006160,  6034437,  6062195,  6089453,
+  6116228,  6142538,  6168398,  6193824,  6218829,  6243427,  6267632,
+  6291456,  6314910,  6338007,  6360756,  6383167,  6405252,  6427019,
+  6448477,  6469635,  6490501,  6511084,  6531390,  6551427,  6571202,
+  6590722,  6609993,  6629022,  6647815,  6666376,  6684713,  6702831,
+  6720734,  6738427,  6755916,  6773205,  6790299,  6807201,  6823917,
+  6840451,  6856805,  6872985,  6888993,  6904834,  6920510,  6936026,
+  6951384,  6966588,  6981641,  6996545,  7011304,  7025920,  7040397,
+  7054736,  7068940,  7083013,  7096956,  7110771,  7124461,  7138029,
+  7151476,  7164804,  7178017,  7191114,  7204100,  7216974,  7229740,
+  7242400,  7254954,  7267405,  7279754,  7292003,  7304154,  7316208,
+  7328167,  7340032,  7351805,  7363486,  7375079,  7386583,  7398000,
+  7409332,  7420579,  7431743,  7442826,  7453828,  7464751,  7475595,
+  7486362,  7497053,  7507669,  7518211,  7528680,  7539077,  7549404,
+  7559660,  7569847,  7579966,  7590017,  7600003,  7609923,  7619778,
+  7629569,  7639298,  7648964,  7658569,  7668114,  7677598,  7687023,
+  7696391,  7705700,  7714952,  7724149,  7733289,  7742375,  7751407,
+  7760385,  7769310,  7778182,  7787003,  7795773,  7804492,  7813161,
+  7821781,  7830352,  7838875,  7847350,  7855777,  7864158,  7872493,
+  7880782,  7889027,  7897226,  7905381,  7913492,  7921561,  7929586,
+  7937569,  7945510,  7953410,  7961268,  7969086,  7976864,  7984602,
+  7992301,  7999960,  8007581,  8015164,  8022709,  8030217,  8037687,
+  8045121,  8052519,  8059880,  8067206,  8074496,  8081752,  8088973,
+  8096159,  8103312,  8110431,  8117516,  8124569,  8131589,  8138576,
+  8145532,  8152455,  8159347,  8166208,  8173037,  8179836,  8186605,
+  8193343,  8200052,  8206731,  8213380,  8220001,  8226593,  8233156,
+  8239690,  8246197,  8252676,  8259127,  8265550,  8271947,  8278316,
+  8284659,  8290976,  8297266,  8303530,  8309768,  8315981,  8322168,
+  8328330,  8334467,  8340579,  8346667,  8352730,  8358769,  8364784,
+  8370775,  8376743,  8382687,  8388608,  8394506,  8400381,  8406233,
+  8412062,  8417870,  8423655,  8429418,  8435159,  8440878,  8446576,
+  8452252,  8457908,  8463542,  8469155,  8474748,  8480319,  8485871,
+  8491402,  8496913,  8502404,  8507875,  8513327,  8518759,  8524171,
+  8529564,  8534938,  8540293,  8545629,  8550947,  8556245,  8561525,
+  8566787,  8572031,  8577256,  8582464,  8587653,  8592825,  8597980,
+  8603116,  8608236,  8613338,  8618423,  8623491,  8628542,  8633576,
+  8638593,  8643594,  8648579,  8653547,  8658499,  8663434,  8668354,
+  8673258,  8678145,  8683017,  8687874,  8692715,  8697540,  8702350,
+  8707145,  8711925,  8716690,  8721439,  8726174,  8730894,  8735599,
+  8740290,  8744967,  8749628,  8754276,  8758909,  8763528,  8768134,
+  8772725,  8777302,  8781865,  8786415,  8790951,  8795474,  8799983,
+  8804478,  8808961,  8813430,  8817886,  8822328,  8826758,  8831175,
+  8835579,  8839970,  8844349,  8848715,  8853068,  8857409,  8861737,
+  8866053,  8870357,  8874649,  8878928,  8883195,  8887451,  8891694,
+  8895926,  8900145,  8904353,  8908550,  8912734,  8916908,  8921069,
+  8925220,  8929358,  8933486,  8937603,  8941708,  8945802,  8949885,
+  8953957,  8958018,  8962068,  8966108,  8970137,  8974155,  8978162,
+  8982159,  8986145,  8990121,  8994086,  8998041,  9001986,  9005920,
+  9009844,  9013758,  9017662,  9021556,  9025440,  9029314,  9033178,
+  9037032,  9040877,  9044711,  9048536,  9052352,  9056157,  9059953,
+  9063740,  9067517,  9071285,  9075044,  9078793,  9082533,  9086263,
+  9089985,  9093697,  9097400,  9101095,  9104780,  9108456,  9112123,
+  9115782,  9119431,  9123072,  9126704,  9130328,  9133943,  9137549,
+  9141146,  9144735,  9148316,  9151888,  9155452,  9159007,  9162554,
+  9166092,  9169623,  9173145,  9176659,  9180165,  9183663,  9187152,
+  9190634,  9194108,  9197573,  9201031,  9204481,  9207923,  9211357,
+  9214784,  9218202,  9221613,  9225017,  9228412,  9231800,  9235181,
+  9238554,  9241919,  9245277,  9248628,  9251971,  9255307,  9258635,
+  9261956,  9265270,  9268577,  9271876,  9275169,  9278454,  9281732,
+  9285002,  9288266,  9291523,  9294773,  9298016,  9301252,  9304481,
+  9307703,  9310918,  9314126,  9317328,  9320523,  9323711,  9326892,
+  9330067,  9333235,  9336397,  9339552,  9342700,  9345842,  9348977,
+  9352106,  9355228,  9358344,  9361454,  9364557,  9367654,  9370744,
+  9373828,  9376906,  9379978,  9383043,  9386102,  9389155,  9392202,
+  9395243,  9398278,  9401306,  9404329,  9407345,  9410356,  9413360,
+  9416359,  9419351,  9422338,  9425319,  9428294,  9431263,  9434226,
+  9437184,  9440136,  9443082,  9446022,  9448957,  9451886,  9454809,
+  9457726,  9460638,  9463545,  9466446,  9469341,  9472231,  9475115,
+  9477994,  9480867,  9483735,  9486597,  9489454,  9492306,  9495152,
+  9497993,  9500828,  9503659,  9506484,  9509303,  9512118,  9514927,
+  9517731,  9520530,  9523324,  9526112,  9528895,  9531674,  9534447,
+  9537215,  9539978,  9542736,  9545489,  9548237,  9550980,  9553718,
+  9556451,  9559179,  9561903,  9564621,  9567335,  9570043,  9572747,
+  9575446,  9578140,  9580830,  9583514,  9586194,  9588869,  9591540,
+  9594205,  9596866,  9599523,  9602174,  9604821,  9607464,  9610101,
+  9612735,  9615363,  9617987,  9620607,  9623222,  9625832,  9628438,
+  9631040,  9633637,  9636229,  9638818,  9641401,  9643981,  9646556,
+  9649126,  9651692,  9654254,  9656812,  9659365,  9661914,  9664459,
+  9666999,  9669535,  9672067,  9674594,  9677118,  9679637,  9682152,
+  9684663,  9687169,  9689672,  9692170,  9694665,  9697155,  9699641,
+  9702123,  9704601,  9707075,  9709545,  9712010,  9714472,  9716930,
+  9719384,  9721834,  9724279,  9726721,  9729159,  9731593,  9734024,
+  9736450,  9738872,  9741291,  9743705,  9746116,  9748523,  9750926,
+  9753326,  9755721,  9758113,  9760501,  9762885,  9765266,  9767642,
+  9770015,  9772385,  9774750,  9777112,  9779470,  9781825,  9784175,
+  9786523,  9788866,  9791206,  9793543,  9795875,  9798204,  9800530,
+  9802852,  9805170,  9807485,  9809797,  9812104,  9814409,  9816710,
+  9819007,  9821301,  9823591,  9825878,  9828161,  9830441,  9832718,
+  9834991,  9837261,  9839527,  9841790,  9844050,  9846306,  9848559,
+  9850808,  9853054,  9855297,  9857537,  9859773,  9862006,  9864235,
+  9866462,  9868685,  9870904,  9873121,  9875334,  9877544,  9879751,
+  9881955,  9884155,  9886352,  9888546,  9890737,  9892925,  9895109,
+  9897291,  9899469,  9901644,  9903816,  9905985,  9908150,  9910313,
+  9912473,  9914629,  9916783,  9918933,  9921080,  9923225,  9925366,
+  9927504,  9929639,  9931771,  9933900,  9936027,  9938150,  9940270,
+  9942387,  9944502,  9946613,  9948721,  9950827,  9952929,  9955029,
+  9957126,  9959219,  9961310,  9963398,  9965484,  9967566,  9969645,
+  9971722,  9973796,  9975866,  9977934,  9980000,  9982062,  9984122,
+  9986179,  9988233,  9990284,  9992332,  9994378,  9996421,  9998461,
+  10000498, 10002533, 10004565, 10006594, 10008621, 10010644, 10012665,
+  10014684, 10016700, 10018713, 10020723, 10022731, 10024736, 10026738,
+  10028738, 10030735, 10032729, 10034721, 10036710, 10038697, 10040681,
+  10042662, 10044641, 10046617, 10048591, 10050562, 10052530, 10054496,
+  10056459, 10058420, 10060379, 10062334, 10064287, 10066238, 10068186,
+  10070132, 10072075, 10074016, 10075954, 10077890, 10079823, 10081754,
+  10083682, 10085608, 10087532, 10089453, 10091371, 10093287, 10095201,
+  10097112, 10099021, 10100928, 10102832, 10104733, 10106633, 10108529,
+  10110424, 10112316, 10114206, 10116093, 10117978, 10119861, 10121742,
+  10123620, 10125495, 10127369, 10129240, 10131109, 10132975, 10134839,
+  10136701, 10138561, 10140418, 10142273, 10144126, 10145976, 10147825,
+  10149671, 10151514, 10153356, 10155195, 10157032, 10158867, 10160699,
+  10162530, 10164358, 10166184, 10168007, 10169829, 10171648, 10173465,
+  10175280, 10177093, 10178904, 10180712, 10182519, 10184323, 10186125,
+  10187925, 10189722, 10191518, 10193311, 10195103, 10196892, 10198679,
+  10200464, 10202247, 10204028, 10205806, 10207583, 10209357, 10211130,
+  10212900, 10214668, 10216435, 10218199, 10219961, 10221721, 10223479,
+  10225235, 10226989, 10228741, 10230491, 10232239, 10233985, 10235728,
+  10237470, 10239210, 10240948, 10242684, 10244417, 10246149, 10247879,
+  10249607, 10251333, 10253057, 10254779, 10256499, 10258217, 10259933,
+  10261647, 10263360, 10265070, 10266778, 10268485, 10270189, 10271892,
+  10273593, 10275292, 10276988, 10278683, 10280376, 10282068, 10283757,
+  10285444, 10287130, 10288814, 10290495, 10292175, 10293853, 10295530,
+  10297204, 10298876, 10300547, 10302216, 10303883, 10305548, 10307211,
+  10308873, 10310532, 10312190, 10313846, 10315501, 10317153, 10318804,
+  10320452, 10322099, 10323745, 10325388, 10327030, 10328670, 10330308,
+  10331944, 10333578, 10335211, 10336842, 10338472, 10340099, 10341725,
+  10343349, 10344971, 10346592, 10348210, 10349828, 10351443, 10353057,
+  10354668, 10356279, 10357887, 10359494, 10361099, 10362702, 10364304,
+  10365904, 10367502, 10369099, 10370694, 10372287, 10373879, 10375468,
+  10377057, 10378643, 10380228, 10381811, 10383393, 10384973, 10386551,
+  10388128, 10389703, 10391276, 10392848, 10394418, 10395986, 10397553,
+  10399118, 10400682, 10402244, 10403804, 10405363, 10406920, 10408476,
+  10410030, 10411582, 10413133, 10414682, 10416230, 10417776, 10419320,
+  10420863, 10422404, 10423944, 10425482, 10427019, 10428554, 10430087,
+  10431619, 10433149, 10434678, 10436206, 10437731, 10439256, 10440778,
+  10442299, 10443819, 10445337, 10446854, 10448369, 10449882, 10451394,
+  10452905, 10454414, 10455921, 10457427, 10458932, 10460435, 10461936,
+  10463436, 10464935, 10466432, 10467927, 10469422, 10470914, 10472405,
+  10473895, 10475383, 10476870, 10478355, 10479839, 10481322, 10482802,
+  10484282,
+};
+
+static int mi_size_to_block_size(int mi_bsize, int mi_num) {
+  return (mi_num % mi_bsize) ? mi_num / mi_bsize + 1 : mi_num / mi_bsize;
+}
+
+Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info,
+                                   int frame_num, int mi_rows, int mi_cols) {
+  int frame_idx, rf_idx, square_block_idx;
+  if (motion_field_info->allocated) {
+    // TODO(angiebird): Avoid re-allocate buffer if possible
+    vp9_free_motion_field_info(motion_field_info);
+  }
+  motion_field_info->frame_num = frame_num;
+  motion_field_info->motion_field_array =
+      vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array));
+  if (!motion_field_info->motion_field_array) return STATUS_FAILED;
+  for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) {
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+           ++square_block_idx) {
+        BLOCK_SIZE bsize = square_block_idx_to_bsize(square_block_idx);
+        const int mi_height = num_8x8_blocks_high_lookup[bsize];
+        const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+        const int block_rows = mi_size_to_block_size(mi_height, mi_rows);
+        const int block_cols = mi_size_to_block_size(mi_width, mi_cols);
+        MotionField *motion_field =
+            &motion_field_info
+                 ->motion_field_array[frame_idx][rf_idx][square_block_idx];
+        Status status =
+            vp9_alloc_motion_field(motion_field, bsize, block_rows, block_cols);
+        if (status == STATUS_FAILED) {
+          return STATUS_FAILED;
+        }
+      }
+    }
+  }
+  motion_field_info->allocated = 1;
+  return STATUS_OK;
+}
+
+Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize,
+                              int block_rows, int block_cols) {
+  Status status = STATUS_OK;
+  motion_field->ready = 0;
+  motion_field->bsize = bsize;
+  motion_field->block_rows = block_rows;
+  motion_field->block_cols = block_cols;
+  motion_field->block_num = block_rows * block_cols;
+  motion_field->mf =
+      vpx_calloc(motion_field->block_num, sizeof(*motion_field->mf));
+  if (motion_field->mf == NULL) {
+    status = STATUS_FAILED;
+  }
+  motion_field->set_mv =
+      vpx_calloc(motion_field->block_num, sizeof(*motion_field->set_mv));
+  if (motion_field->set_mv == NULL) {
+    vpx_free(motion_field->mf);
+    motion_field->mf = NULL;
+    status = STATUS_FAILED;
+  }
+  motion_field->local_structure = vpx_calloc(
+      motion_field->block_num, sizeof(*motion_field->local_structure));
+  if (motion_field->local_structure == NULL) {
+    vpx_free(motion_field->mf);
+    motion_field->mf = NULL;
+    vpx_free(motion_field->set_mv);
+    motion_field->set_mv = NULL;
+    status = STATUS_FAILED;
+  }
+  return status;
+}
+
+void vp9_free_motion_field(MotionField *motion_field) {
+  vpx_free(motion_field->mf);
+  vpx_free(motion_field->set_mv);
+  vpx_free(motion_field->local_structure);
+  vp9_zero(*motion_field);
+}
+
+void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info) {
+  if (motion_field_info->allocated) {
+    int frame_idx, rf_idx, square_block_idx;
+    for (frame_idx = 0; frame_idx < motion_field_info->frame_num; ++frame_idx) {
+      for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+        for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+             ++square_block_idx) {
+          MotionField *motion_field =
+              &motion_field_info
+                   ->motion_field_array[frame_idx][rf_idx][square_block_idx];
+          vp9_free_motion_field(motion_field);
+        }
+      }
+    }
+    vpx_free(motion_field_info->motion_field_array);
+    motion_field_info->motion_field_array = NULL;
+    motion_field_info->frame_num = 0;
+    motion_field_info->allocated = 0;
+  }
+}
+
+MotionField *vp9_motion_field_info_get_motion_field(
+    MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx,
+    BLOCK_SIZE bsize) {
+  int square_block_idx = get_square_block_idx(bsize);
+  assert(frame_idx < motion_field_info->frame_num);
+  assert(motion_field_info->allocated == 1);
+  return &motion_field_info
+              ->motion_field_array[frame_idx][rf_idx][square_block_idx];
+}
+
+int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow,
+                               int bcol) {
+  assert(brow >= 0 && brow < motion_field->block_rows);
+  assert(bcol >= 0 && bcol < motion_field->block_cols);
+  return motion_field->set_mv[brow * motion_field->block_cols + bcol];
+}
+
+int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow,
+                               int bcol) {
+  assert(brow >= 0 && brow < motion_field->block_rows);
+  assert(bcol >= 0 && bcol < motion_field->block_cols);
+  return motion_field->mf[brow * motion_field->block_cols + bcol];
+}
+
+int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row,
+                                  int mi_col) {
+  const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize];
+  const int brow = mi_row / mi_height;
+  const int bcol = mi_col / mi_width;
+  assert(mi_row % mi_height == 0);
+  assert(mi_col % mi_width == 0);
+  return vp9_motion_field_get_mv(motion_field, brow, bcol);
+}
+
+void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row,
+                                int mi_col, int_mv mv) {
+  const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize];
+  const int brow = mi_row / mi_height;
+  const int bcol = mi_col / mi_width;
+  assert(mi_row % mi_height == 0);
+  assert(mi_col % mi_width == 0);
+  assert(brow >= 0 && brow < motion_field->block_rows);
+  assert(bcol >= 0 && bcol < motion_field->block_cols);
+  motion_field->mf[brow * motion_field->block_cols + bcol] = mv;
+  motion_field->set_mv[brow * motion_field->block_cols + bcol] = 1;
+}
+
+void vp9_motion_field_reset_mvs(MotionField *motion_field) {
+  memset(motion_field->set_mv, 0,
+         motion_field->block_num * sizeof(*motion_field->set_mv));
+}
+
+static int64_t log2_approximation(int64_t v) {
+  assert(v > 0);
+  if (v < LOG2_TABLE_SIZE) {
+    return log2_table[v];
+  } else {
+    // use linear approximation when v >= 2^10
+    const int slope =
+        1477;  // slope = 1 / (log(2) * 1024) * (1 << LOG2_PRECISION)
+    assert(LOG2_TABLE_SIZE == 1 << 10);
+
+    return slope * (v - LOG2_TABLE_SIZE) + (10 << LOG2_PRECISION);
+  }
+}
+
+int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs,
+                                 int mv_num) {
+  // The behavior of this function is to compute log2 of mv difference,
+  // i.e. min log2(1 + row_diff * row_diff + col_diff * col_diff)
+  // against available neighbor mvs.
+  // Since the log2 is monotonically increasing, we can compute
+  // min row_diff * row_diff + col_diff * col_diff first
+  // then apply log2 in the end.
+  int i;
+  int64_t min_abs_diff = INT64_MAX;
+  int cnt = 0;
+  assert(mv_num <= NB_MVS_NUM);
+  for (i = 0; i < mv_num; ++i) {
+    MV nb_mv = nb_full_mvs[i].as_mv;
+    const int64_t row_diff = abs(mv->row - nb_mv.row);
+    const int64_t col_diff = abs(mv->col - nb_mv.col);
+    const int64_t abs_diff = row_diff * row_diff + col_diff * col_diff;
+    assert(nb_full_mvs[i].as_int != INVALID_MV);
+    min_abs_diff = VPXMIN(abs_diff, min_abs_diff);
+    ++cnt;
+  }
+  if (cnt) {
+    return log2_approximation(1 + min_abs_diff);
+  }
+  return 0;
+}
+
+static FloatMV get_smooth_motion_vector(const FloatMV scaled_search_mv,
+                                        const FloatMV *tmp_mf,
+                                        const int (*M)[MF_LOCAL_STRUCTURE_SIZE],
+                                        int rows, int cols, int row, int col,
+                                        float alpha) {
+  const FloatMV tmp_mv = tmp_mf[row * cols + col];
+  int idx_row, idx_col;
+  FloatMV avg_nb_mv = { 0.0f, 0.0f };
+  FloatMV mv = { 0.0f, 0.0f };
+  float filter[3][3] = { { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f },
+                         { 1.0f / 6.0f, 0.0f, 1.0f / 6.0f },
+                         { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f } };
+  for (idx_row = 0; idx_row < 3; ++idx_row) {
+    int nb_row = row + idx_row - 1;
+    for (idx_col = 0; idx_col < 3; ++idx_col) {
+      int nb_col = col + idx_col - 1;
+      if (nb_row < 0 || nb_col < 0 || nb_row >= rows || nb_col >= cols) {
+        avg_nb_mv.row += (tmp_mv.row) * filter[idx_row][idx_col];
+        avg_nb_mv.col += (tmp_mv.col) * filter[idx_row][idx_col];
+      } else {
+        const FloatMV nb_mv = tmp_mf[nb_row * cols + nb_col];
+        avg_nb_mv.row += (nb_mv.row) * filter[idx_row][idx_col];
+        avg_nb_mv.col += (nb_mv.col) * filter[idx_row][idx_col];
+      }
+    }
+  }
+  {
+    // M is the local variance of reference frame
+    float M00 = M[row * cols + col][0];
+    float M01 = M[row * cols + col][1];
+    float M10 = M[row * cols + col][2];
+    float M11 = M[row * cols + col][3];
+
+    float det = (M00 + alpha) * (M11 + alpha) - M01 * M10;
+
+    float inv_M00 = (M11 + alpha) / det;
+    float inv_M01 = -M01 / det;
+    float inv_M10 = -M10 / det;
+    float inv_M11 = (M00 + alpha) / det;
+
+    float inv_MM00 = inv_M00 * M00 + inv_M01 * M10;
+    float inv_MM01 = inv_M00 * M01 + inv_M01 * M11;
+    float inv_MM10 = inv_M10 * M00 + inv_M11 * M10;
+    float inv_MM11 = inv_M10 * M01 + inv_M11 * M11;
+
+    mv.row = inv_M00 * avg_nb_mv.row * alpha + inv_M01 * avg_nb_mv.col * alpha +
+             inv_MM00 * scaled_search_mv.row + inv_MM01 * scaled_search_mv.col;
+    mv.col = inv_M10 * avg_nb_mv.row * alpha + inv_M11 * avg_nb_mv.col * alpha +
+             inv_MM10 * scaled_search_mv.row + inv_MM11 * scaled_search_mv.col;
+  }
+  return mv;
+}
+
+void vp9_get_smooth_motion_field(const MV *search_mf,
+                                 const int (*M)[MF_LOCAL_STRUCTURE_SIZE],
+                                 int rows, int cols, BLOCK_SIZE bsize,
+                                 float alpha, int num_iters, MV *smooth_mf) {
+  // M is the local variation of reference frame
+  // build two buffers
+  FloatMV *input = (FloatMV *)malloc(rows * cols * sizeof(FloatMV));
+  FloatMV *output = (FloatMV *)malloc(rows * cols * sizeof(FloatMV));
+  int idx;
+  int row, col;
+  int bw = 4 << b_width_log2_lookup[bsize];
+  int bh = 4 << b_height_log2_lookup[bsize];
+  if (!(input && output)) goto fail;
+  // copy search results to input buffer
+  for (idx = 0; idx < rows * cols; ++idx) {
+    input[idx].row = (float)search_mf[idx].row / bh;
+    input[idx].col = (float)search_mf[idx].col / bw;
+  }
+  for (idx = 0; idx < num_iters; ++idx) {
+    FloatMV *tmp;
+    for (row = 0; row < rows; ++row) {
+      for (col = 0; col < cols; ++col) {
+        // note: the scaled_search_mf and smooth_mf are all scaled by macroblock
+        // size
+        const MV search_mv = search_mf[row * cols + col];
+        FloatMV scaled_search_mv = { (float)search_mv.row / bh,
+                                     (float)search_mv.col / bw };
+        output[row * cols + col] = get_smooth_motion_vector(
+            scaled_search_mv, input, M, rows, cols, row, col, alpha);
+      }
+    }
+    // swap buffers
+    tmp = input;
+    input = output;
+    output = tmp;
+  }
+  // copy smoothed results to output
+  for (idx = 0; idx < rows * cols; ++idx) {
+    smooth_mf[idx].row = (int)(input[idx].row * bh);
+    smooth_mf[idx].col = (int)(input[idx].col * bw);
+  }
+fail:
+  free(input);
+  free(output);
+}
+
+void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame,
+                             const YV12_BUFFER_CONFIG *ref_frame,
+                             const MV *search_mf,
+                             const vp9_variance_fn_ptr_t *fn_ptr, int rows,
+                             int cols, BLOCK_SIZE bsize,
+                             int (*M)[MF_LOCAL_STRUCTURE_SIZE]) {
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int cur_stride = cur_frame->y_stride;
+  const int ref_stride = ref_frame->y_stride;
+  const int width = ref_frame->y_width;
+  const int height = ref_frame->y_height;
+  int row, col;
+  for (row = 0; row < rows; ++row) {
+    for (col = 0; col < cols; ++col) {
+      int cur_offset = row * bh * cur_stride + col * bw;
+      uint8_t *center = cur_frame->y_buffer + cur_offset;
+      int ref_h = row * bh + search_mf[row * cols + col].row;
+      int ref_w = col * bw + search_mf[row * cols + col].col;
+      int ref_offset;
+      uint8_t *target;
+      uint8_t *nb;
+      int search_dist;
+      int nb_dist;
+      int I_row = 0, I_col = 0;
+      // TODO(Dan): handle the case that when reference frame block beyond the
+      // boundary
+      ref_h = ref_h < 0 ? 0 : (ref_h >= height - bh ? height - bh - 1 : ref_h);
+      ref_w = ref_w < 0 ? 0 : (ref_w >= width - bw ? width - bw - 1 : ref_w);
+      // compute search results distortion
+      // TODO(Dan): maybe need to use vp9 function to find the reference block,
+      // to compare with the results of my python code, I first use my way to
+      // compute the reference block
+      ref_offset = ref_h * ref_stride + ref_w;
+      target = ref_frame->y_buffer + ref_offset;
+      search_dist = fn_ptr->sdf(center, cur_stride, target, ref_stride);
+      // compute target's neighbors' distortions
+      // TODO(Dan): if using padding, the boundary condition may vary
+      // up
+      if (ref_h - bh >= 0) {
+        nb = target - ref_stride * bh;
+        nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+        I_row += nb_dist - search_dist;
+      }
+      // down
+      if (ref_h + bh < height - bh) {
+        nb = target + ref_stride * bh;
+        nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+        I_row += nb_dist - search_dist;
+      }
+      if (ref_h - bh >= 0 && ref_h + bh < height - bh) {
+        I_row /= 2;
+      }
+      I_row /= (bw * bh);
+      // left
+      if (ref_w - bw >= 0) {
+        nb = target - bw;
+        nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+        I_col += nb_dist - search_dist;
+      }
+      // down
+      if (ref_w + bw < width - bw) {
+        nb = target + bw;
+        nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+        I_col += nb_dist - search_dist;
+      }
+      if (ref_w - bw >= 0 && ref_w + bw < width - bw) {
+        I_col /= 2;
+      }
+      I_col /= (bw * bh);
+      M[row * cols + col][0] = I_row * I_row;
+      M[row * cols + col][1] = I_row * I_col;
+      M[row * cols + col][2] = I_col * I_row;
+      M[row * cols + col][3] = I_col * I_col;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h
new file mode 100644
index 0000000000..c2bd69722a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_
+#define VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define NB_MVS_NUM 4
+#define LOG2_PRECISION 20
+#define MF_LOCAL_STRUCTURE_SIZE 4
+#define SQUARE_BLOCK_SIZES 4
+
+typedef enum Status { STATUS_OK = 0, STATUS_FAILED = 1 } Status;
+
+typedef struct MotionField {
+  int ready;
+  BLOCK_SIZE bsize;
+  int block_rows;
+  int block_cols;
+  int block_num;  // block_num == block_rows * block_cols
+  int (*local_structure)[MF_LOCAL_STRUCTURE_SIZE];
+  int_mv *mf;
+  int *set_mv;
+  int mv_log_scale;
+} MotionField;
+
+typedef struct MotionFieldInfo {
+  int frame_num;
+  int allocated;
+  MotionField (*motion_field_array)[MAX_INTER_REF_FRAMES][SQUARE_BLOCK_SIZES];
+} MotionFieldInfo;
+
+typedef struct {
+  float row, col;
+} FloatMV;
+
+static INLINE int get_square_block_idx(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_4X4) {
+    return 0;
+  }
+  if (bsize == BLOCK_8X8) {
+    return 1;
+  }
+  if (bsize == BLOCK_16X16) {
+    return 2;
+  }
+  if (bsize == BLOCK_32X32) {
+    return 3;
+  }
+  assert(0 && "ERROR: non-square block size");
+  return -1;
+}
+
+static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) {
+  if (square_block_idx == 0) {
+    return BLOCK_4X4;
+  }
+  if (square_block_idx == 1) {
+    return BLOCK_8X8;
+  }
+  if (square_block_idx == 2) {
+    return BLOCK_16X16;
+  }
+  if (square_block_idx == 3) {
+    return BLOCK_32X32;
+  }
+  assert(0 && "ERROR: invalid square_block_idx");
+  return BLOCK_INVALID;
+}
+
+Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info,
+                                   int frame_num, int mi_rows, int mi_cols);
+
+Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize,
+                              int block_rows, int block_cols);
+
+void vp9_free_motion_field(MotionField *motion_field);
+
+void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info);
+
+int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs,
+                                 int mv_num);
+
+void vp9_get_smooth_motion_field(const MV *search_mf,
+                                 const int (*M)[MF_LOCAL_STRUCTURE_SIZE],
+                                 int rows, int cols, BLOCK_SIZE bize,
+                                 float alpha, int num_iters, MV *smooth_mf);
+
+void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame,
+                             const YV12_BUFFER_CONFIG *ref_frame,
+                             const MV *search_mf,
+                             const vp9_variance_fn_ptr_t *fn_ptr, int rows,
+                             int cols, BLOCK_SIZE bsize,
+                             int (*M)[MF_LOCAL_STRUCTURE_SIZE]);
+
+MotionField *vp9_motion_field_info_get_motion_field(
+    MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx,
+    BLOCK_SIZE bsize);
+
+void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row,
+                                int mi_col, int_mv mv);
+
+void vp9_motion_field_reset_mvs(MotionField *motion_field);
+
+int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow,
+                               int bcol);
+int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row,
+                                  int mi_col);
+int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow,
+                               int bcol);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h
new file mode 100644
index 0000000000..09c0e30a47
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h
@@ -0,0 +1,975 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_
+#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+// Neural net model config. It defines the layout of a neural net model, such as
+// the number of inputs/outputs, number of layers, the number of nodes in each
+// layer, as well as the weights and bias of each node.
+typedef struct {
+  int num_inputs;         // Number of input nodes, i.e. features.
+  int num_outputs;        // Number of output nodes.
+  int num_hidden_layers;  // Number of hidden layers, maximum 10.
+  // Number of nodes for each hidden layer.
+  int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+  // Weight parameters, indexed by layer.
+  const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+  // Bias parameters, indexed by layer.
+  const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+} NN_CONFIG;
+
+// Partition search breakout model.
+#define FEATURES 4
+#define Q_CTX 3
+#define RESOLUTION_CTX 2
+static const float
+    vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = {
+      {
+          {
+              -0.016673f,
+              -0.001025f,
+              -0.000032f,
+              0.000833f,
+              1.94261885f - 2.1f,
+          },
+          {
+              -0.160867f,
+              -0.002101f,
+              0.000011f,
+              0.002448f,
+              1.65738142f - 2.5f,
+          },
+          {
+              -0.628934f,
+              -0.011459f,
+              -0.000009f,
+              0.013833f,
+              1.47982645f - 1.6f,
+          },
+      },
+      {
+          {
+              -0.064309f,
+              -0.006121f,
+              0.000232f,
+              0.005778f,
+              0.7989465f - 5.0f,
+          },
+          {
+              -0.314957f,
+              -0.009346f,
+              -0.000225f,
+              0.010072f,
+              2.80695581f - 5.5f,
+          },
+          {
+              -0.635535f,
+              -0.015135f,
+              0.000091f,
+              0.015247f,
+              2.90381241f - 5.0f,
+          },
+      },
+    };
+
+static const float
+    vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = {
+      {
+          {
+              -0.010554f,
+              -0.003081f,
+              -0.000134f,
+              0.004491f,
+              1.68445992f - 3.5f,
+          },
+          {
+              -0.051489f,
+              -0.007609f,
+              0.000016f,
+              0.009792f,
+              1.28089404f - 2.5f,
+          },
+          {
+              -0.163097f,
+              -0.013081f,
+              0.000022f,
+              0.019006f,
+              1.36129403f - 3.2f,
+          },
+      },
+      {
+          {
+              -0.024629f,
+              -0.006492f,
+              -0.000254f,
+              0.004895f,
+              1.27919173f - 4.5f,
+          },
+          {
+              -0.083936f,
+              -0.009827f,
+              -0.000200f,
+              0.010399f,
+              2.73731065f - 4.5f,
+          },
+          {
+              -0.279052f,
+              -0.013334f,
+              0.000289f,
+              0.023203f,
+              2.43595719f - 3.5f,
+          },
+      },
+    };
+
+static const float
+    vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = {
+      {
+          {
+              -0.013154f,
+              -0.002404f,
+              -0.000977f,
+              0.008450f,
+              2.57404566f - 5.5f,
+          },
+          {
+              -0.019146f,
+              -0.004018f,
+              0.000064f,
+              0.008187f,
+              2.15043926f - 2.5f,
+          },
+          {
+              -0.075755f,
+              -0.010858f,
+              0.000030f,
+              0.024505f,
+              2.06848121f - 2.5f,
+          },
+      },
+      {
+          {
+              -0.007636f,
+              -0.002751f,
+              -0.000682f,
+              0.005968f,
+              0.19225763f - 4.5f,
+          },
+          {
+              -0.047306f,
+              -0.009113f,
+              -0.000518f,
+              0.016007f,
+              2.61068869f - 4.0f,
+          },
+          {
+              -0.069336f,
+              -0.010448f,
+              -0.001120f,
+              0.023083f,
+              1.47591054f - 5.5f,
+          },
+      },
+    };
+
+static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX]
+                                                   [FEATURES + 1] = {
+                                                     {
+                                                         {
+                                                             -0.011807f,
+                                                             -0.009873f,
+                                                             -0.000931f,
+                                                             0.034768f,
+                                                             1.32254851f - 2.0f,
+                                                         },
+                                                         {
+                                                             -0.003861f,
+                                                             -0.002701f,
+                                                             0.000100f,
+                                                             0.013876f,
+                                                             1.96755111f - 1.5f,
+                                                         },
+                                                         {
+                                                             -0.013522f,
+                                                             -0.008677f,
+                                                             -0.000562f,
+                                                             0.034468f,
+                                                             1.53440356f - 1.5f,
+                                                         },
+                                                     },
+                                                     {
+                                                         {
+                                                             -0.003221f,
+                                                             -0.002125f,
+                                                             0.000993f,
+                                                             0.012768f,
+                                                             0.03541421f - 2.0f,
+                                                         },
+                                                         {
+                                                             -0.006069f,
+                                                             -0.007335f,
+                                                             0.000229f,
+                                                             0.026104f,
+                                                             0.17135315f - 1.5f,
+                                                         },
+                                                         {
+                                                             -0.039894f,
+                                                             -0.011419f,
+                                                             0.000070f,
+                                                             0.061817f,
+                                                             0.6739977f - 1.5f,
+                                                         },
+                                                     },
+                                                   };
+#undef FEATURES
+#undef Q_CTX
+#undef RESOLUTION_CTX
+
+// Rectangular partition search pruning model.
+#define FEATURES 8
+#define LABELS 4
+#define NODES 16
+static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * NODES] = {
+  -0.432522f, 0.133070f,  -0.169187f, 0.768340f,  0.891228f,  0.554458f,
+  0.356000f,  0.403621f,  0.809165f,  0.778214f,  -0.520357f, 0.301451f,
+  -0.386972f, -0.314402f, 0.021878f,  1.148746f,  -0.462258f, -0.175524f,
+  -0.344589f, -0.475159f, -0.232322f, 0.471147f,  -0.489948f, 0.467740f,
+  -0.391550f, 0.208601f,  0.054138f,  0.076859f,  -0.309497f, -0.095927f,
+  0.225917f,  0.011582f,  -0.520730f, -0.585497f, 0.174036f,  0.072521f,
+  0.120771f,  -0.517234f, -0.581908f, -0.034003f, -0.694722f, -0.364368f,
+  0.290584f,  0.038373f,  0.685654f,  0.394019f,  0.759667f,  1.257502f,
+  -0.610516f, -0.185434f, 0.211997f,  -0.172458f, 0.044605f,  0.145316f,
+  -0.182525f, -0.147376f, 0.578742f,  0.312412f,  -0.446135f, -0.389112f,
+  0.454033f,  0.260490f,  0.664285f,  0.395856f,  -0.231827f, 0.215228f,
+  0.014856f,  -0.395462f, 0.479646f,  -0.391445f, -0.357788f, 0.166238f,
+  -0.056818f, -0.027783f, 0.060880f,  -1.604710f, 0.531268f,  0.282184f,
+  0.714944f,  0.093523f,  -0.218312f, -0.095546f, -0.285621f, -0.190871f,
+  -0.448340f, -0.016611f, 0.413913f,  -0.286720f, -0.158828f, -0.092635f,
+  -0.279551f, 0.166509f,  -0.088162f, 0.446543f,  -0.276830f, -0.065642f,
+  -0.176346f, -0.984754f, 0.338738f,  0.403809f,  0.738065f,  1.154439f,
+  0.750764f,  0.770959f,  -0.269403f, 0.295651f,  -0.331858f, 0.367144f,
+  0.279279f,  0.157419f,  -0.348227f, -0.168608f, -0.956000f, -0.647136f,
+  0.250516f,  0.858084f,  0.809802f,  0.492408f,  0.804841f,  0.282802f,
+  0.079395f,  -0.291771f, -0.024382f, -1.615880f, -0.445166f, -0.407335f,
+  -0.483044f, 0.141126f,
+};
+
+static const float vp9_rect_part_nn_bias_16_layer0[NODES] = {
+  0.275384f,  -0.053745f, 0.000000f,  0.000000f, -0.178103f, 0.513965f,
+  -0.161352f, 0.228551f,  0.000000f,  1.013712f, 0.000000f,  0.000000f,
+  -1.144009f, -0.000006f, -0.241727f, 2.048764f,
+};
+
+static const float vp9_rect_part_nn_weights_16_layer1[NODES * LABELS] = {
+  -1.435278f, 2.204691f,  -0.410718f, 0.202708f,  0.109208f,  1.059142f,
+  -0.306360f, 0.845906f,  0.489654f,  -1.121915f, -0.169133f, -0.003385f,
+  0.660590f,  -0.018711f, 1.227158f,  -2.967504f, 1.407345f,  -1.293243f,
+  -0.386921f, 0.300492f,  0.338824f,  -0.083250f, -0.069454f, -1.001827f,
+  -0.327891f, 0.899353f,  0.367397f,  -0.118601f, -0.171936f, -0.420646f,
+  -0.803319f, 2.029634f,  0.940268f,  -0.664484f, 0.339916f,  0.315944f,
+  0.157374f,  -0.402482f, -0.491695f, 0.595827f,  0.015031f,  0.255887f,
+  -0.466327f, -0.212598f, 0.136485f,  0.033363f,  -0.796921f, 1.414304f,
+  -0.282185f, -2.673571f, -0.280994f, 0.382658f,  -0.350902f, 0.227926f,
+  0.062602f,  -1.000199f, 0.433731f,  1.176439f,  -0.163216f, -0.229015f,
+  -0.640098f, -0.438852f, -0.947700f, 2.203434f,
+};
+
+static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = {
+  -0.875510f,
+  0.982408f,
+  0.560854f,
+  -0.415209f,
+};
+
+static const NN_CONFIG vp9_rect_part_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_rect_part_nn_weights_16_layer0,
+      vp9_rect_part_nn_weights_16_layer1,
+  },
+  {
+      vp9_rect_part_nn_bias_16_layer0,
+      vp9_rect_part_nn_bias_16_layer1,
+  },
+};
+
+static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * NODES] = {
+  -0.147312f, -0.753248f, 0.540206f,  0.661415f,  0.484117f,  -0.341609f,
+  0.016183f,  0.064177f,  0.781580f,  0.902232f,  -0.505342f, 0.325183f,
+  -0.231072f, -0.120107f, -0.076216f, 0.120038f,  0.403695f,  -0.463301f,
+  -0.192158f, 0.407442f,  0.106633f,  1.072371f,  -0.446779f, 0.467353f,
+  0.318812f,  -0.505996f, -0.008768f, -0.239598f, 0.085480f,  0.284640f,
+  -0.365045f, -0.048083f, -0.112090f, -0.067089f, 0.304138f,  -0.228809f,
+  0.383651f,  -0.196882f, 0.477039f,  -0.217978f, -0.506931f, -0.125675f,
+  0.050456f,  1.086598f,  0.732128f,  0.326941f,  0.103952f,  0.121769f,
+  -0.154487f, -0.255514f, 0.030591f,  -0.382797f, -0.019981f, -0.326570f,
+  0.149691f,  -0.435633f, -0.070795f, 0.167691f,  0.251413f,  -0.153405f,
+  0.160347f,  0.455107f,  -0.968580f, -0.575879f, 0.623115f,  -0.069793f,
+  -0.379768f, -0.965807f, -0.062057f, 0.071312f,  0.457098f,  0.350372f,
+  -0.460659f, -0.985393f, 0.359963f,  -0.093677f, 0.404272f,  -0.326896f,
+  -0.277752f, 0.609322f,  -0.114193f, -0.230701f, 0.089208f,  0.645381f,
+  0.494485f,  0.467876f,  -0.166187f, 0.251044f,  -0.394661f, 0.192895f,
+  -0.344777f, -0.041893f, -0.111163f, 0.066347f,  0.378158f,  -0.455465f,
+  0.339839f,  -0.418207f, -0.356515f, -0.227536f, -0.211091f, -0.122945f,
+  0.361772f,  -0.338095f, 0.004564f,  -0.398510f, 0.060876f,  -2.132504f,
+  -0.086776f, -0.029166f, 0.039241f,  0.222534f,  -0.188565f, -0.288792f,
+  -0.160789f, -0.123905f, 0.397916f,  -0.063779f, 0.167210f,  -0.445004f,
+  0.056889f,  0.207280f,  0.000101f,  0.384507f,  -1.721239f, -2.036402f,
+  -2.084403f, -2.060483f,
+};
+
+static const float vp9_rect_part_nn_bias_32_layer0[NODES] = {
+  -0.859251f, -0.109938f, 0.091838f,  0.187817f,  -0.728265f, 0.253080f,
+  0.000000f,  -0.357195f, -0.031290f, -1.373237f, -0.761086f, 0.000000f,
+  -0.024504f, 1.765711f,  0.000000f,  1.505390f,
+};
+
+static const float vp9_rect_part_nn_weights_32_layer1[NODES * LABELS] = {
+  0.680940f,  1.367178f,  0.403075f,  0.029957f,  0.500917f,  1.407776f,
+  -0.354002f, 0.011667f,  1.663767f,  0.959155f,  0.428323f,  -0.205345f,
+  -0.081850f, -3.920103f, -0.243802f, -4.253933f, -0.034020f, -1.361057f,
+  0.128236f,  -0.138422f, -0.025790f, -0.563518f, -0.148715f, -0.344381f,
+  -1.677389f, -0.868332f, -0.063792f, 0.052052f,  0.359591f,  2.739808f,
+  -0.414304f, 3.036597f,  -0.075368f, -1.019680f, 0.642501f,  0.209779f,
+  -0.374539f, -0.718294f, -0.116616f, -0.043212f, -1.787809f, -0.773262f,
+  0.068734f,  0.508309f,  0.099334f,  1.802239f,  -0.333538f, 2.708645f,
+  -0.447682f, -2.355555f, -0.506674f, -0.061028f, -0.310305f, -0.375475f,
+  0.194572f,  0.431788f,  -0.789624f, -0.031962f, 0.358353f,  0.382937f,
+  0.232002f,  2.321813f,  -0.037523f, 2.104652f,
+};
+
+static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = {
+  -0.693383f,
+  0.773661f,
+  0.426878f,
+  -0.070619f,
+};
+
+static const NN_CONFIG vp9_rect_part_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_rect_part_nn_weights_32_layer0,
+      vp9_rect_part_nn_weights_32_layer1,
+  },
+  {
+      vp9_rect_part_nn_bias_32_layer0,
+      vp9_rect_part_nn_bias_32_layer1,
+  },
+};
+#undef NODES
+
+#define NODES 24
+static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * NODES] = {
+  0.024671f,  -0.220610f, -0.284362f, -0.069556f, -0.315700f, 0.187861f,
+  0.139782f,  0.063110f,  0.796561f,  0.172868f,  -0.662194f, -1.393074f,
+  0.085003f,  0.393381f,  0.358477f,  -0.187268f, -0.370745f, 0.218287f,
+  0.027271f,  -0.254089f, -0.048236f, -0.459137f, 0.253171f,  0.122598f,
+  -0.550107f, -0.568456f, 0.159866f,  -0.246534f, 0.096384f,  -0.255460f,
+  0.077864f,  -0.334837f, 0.026921f,  -0.697252f, 0.345262f,  1.343578f,
+  0.815984f,  1.118211f,  1.574016f,  0.578476f,  -0.285967f, -0.508672f,
+  0.118137f,  0.037695f,  1.540510f,  1.256648f,  1.163819f,  1.172027f,
+  0.661551f,  -0.111980f, -0.434204f, -0.894217f, 0.570524f,  0.050292f,
+  -0.113680f, 0.000784f,  -0.211554f, -0.369394f, 0.158306f,  -0.512505f,
+  -0.238696f, 0.091498f,  -0.448490f, -0.491268f, -0.353112f, -0.303315f,
+  -0.428438f, 0.127998f,  -0.406790f, -0.401786f, -0.279888f, -0.384223f,
+  0.026100f,  0.041621f,  -0.315818f, -0.087888f, 0.353497f,  0.163123f,
+  -0.380128f, -0.090334f, -0.216647f, -0.117849f, -0.173502f, 0.301871f,
+  0.070854f,  0.114627f,  -0.050545f, -0.160381f, 0.595294f,  0.492696f,
+  -0.453858f, -1.154139f, 0.126000f,  0.034550f,  0.456665f,  -0.236618f,
+  -0.112640f, 0.050759f,  -0.449162f, 0.110059f,  0.147116f,  0.249358f,
+  -0.049894f, 0.063351f,  -0.004467f, 0.057242f,  -0.482015f, -0.174335f,
+  -0.085617f, -0.333808f, -0.358440f, -0.069006f, 0.099260f,  -1.243430f,
+  -0.052963f, 0.112088f,  -2.661115f, -2.445893f, -2.688174f, -2.624232f,
+  0.030494f,  0.161311f,  0.012136f,  0.207564f,  -2.776856f, -2.791940f,
+  -2.623962f, -2.918820f, 1.231619f,  -0.376692f, -0.698078f, 0.110336f,
+  -0.285378f, 0.258367f,  -0.180159f, -0.376608f, -0.034348f, -0.130206f,
+  0.160020f,  0.852977f,  0.580573f,  1.450782f,  1.357596f,  0.787382f,
+  -0.544004f, -0.014795f, 0.032121f,  -0.557696f, 0.159994f,  -0.540908f,
+  0.180380f,  -0.398045f, 0.705095f,  0.515103f,  -0.511521f, -1.271374f,
+  -0.231019f, 0.423647f,  0.064907f,  -0.255338f, -0.877748f, -0.667205f,
+  0.267847f,  0.135229f,  0.617844f,  1.349849f,  1.012623f,  0.730506f,
+  -0.078571f, 0.058401f,  0.053221f,  -2.426146f, -0.098808f, -0.138508f,
+  -0.153299f, 0.149116f,  -0.444243f, 0.301807f,  0.065066f,  0.092929f,
+  -0.372784f, -0.095540f, 0.192269f,  0.237894f,  0.080228f,  -0.214074f,
+  -0.011426f, -2.352367f, -0.085394f, -0.190361f, -0.001177f, 0.089197f,
+};
+
+static const float vp9_rect_part_nn_bias_64_layer0[NODES] = {
+  0.000000f,  -0.057652f, -0.175413f, -0.175389f, -1.084097f, -1.423801f,
+  -0.076307f, -0.193803f, 0.000000f,  -0.066474f, -0.050318f, -0.019832f,
+  -0.038814f, -0.144184f, 2.652451f,  2.415006f,  0.197464f,  -0.729842f,
+  -0.173774f, 0.239171f,  0.486425f,  2.463304f,  -0.175279f, 2.352637f,
+};
+
+static const float vp9_rect_part_nn_weights_64_layer1[NODES * LABELS] = {
+  -0.063237f, 1.925696f,  -0.182145f, -0.226687f, 0.602941f,  -0.941140f,
+  0.814598f,  -0.117063f, 0.282988f,  0.066369f,  0.096951f,  1.049735f,
+  -0.188188f, -0.281227f, -4.836746f, -5.047797f, 0.892358f,  0.417145f,
+  -0.279849f, 1.335945f,  0.660338f,  -2.757938f, -0.115714f, -1.862183f,
+  -0.045980f, -1.597624f, -0.586822f, -0.615589f, -0.330537f, 1.068496f,
+  -0.167290f, 0.141290f,  -0.112100f, 0.232761f,  0.252307f,  -0.399653f,
+  0.353118f,  0.241583f,  2.635241f,  4.026119f,  -1.137327f, -0.052446f,
+  -0.139814f, -1.104256f, -0.759391f, 2.508457f,  -0.526297f, 2.095348f,
+  -0.444473f, -1.090452f, 0.584122f,  0.468729f,  -0.368865f, 1.041425f,
+  -1.079504f, 0.348837f,  0.390091f,  0.416191f,  0.212906f,  -0.660255f,
+  0.053630f,  0.209476f,  3.595525f,  2.257293f,  -0.514030f, 0.074203f,
+  -0.375862f, -1.998307f, -0.930310f, 1.866686f,  -0.247137f, 1.087789f,
+  0.100186f,  0.298150f,  0.165265f,  0.050478f,  0.249167f,  0.371789f,
+  -0.294497f, 0.202954f,  0.037310f,  0.193159f,  0.161551f,  0.301597f,
+  0.299286f,  0.185946f,  0.822976f,  2.066130f,  -1.724588f, 0.055977f,
+  -0.330747f, -0.067747f, -0.475801f, 1.555958f,  -0.025808f, -0.081516f,
+};
+
+static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = {
+  -0.090723f,
+  0.894968f,
+  0.844754f,
+  -3.496194f,
+};
+
+static const NN_CONFIG vp9_rect_part_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_rect_part_nn_weights_64_layer0,
+      vp9_rect_part_nn_weights_64_layer1,
+  },
+  {
+      vp9_rect_part_nn_bias_64_layer0,
+      vp9_rect_part_nn_bias_64_layer1,
+  },
+};
+#undef FEATURES
+#undef LABELS
+#undef NODES
+
+#define FEATURES 7
+// Partition pruning model(neural nets).
+static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = {
+  -3.571348f, 0.014835f,  -3.255393f, -0.098090f, -0.013120f, 0.000221f,
+  0.056273f,  0.190179f,  -0.268130f, -1.828242f, -0.010655f, 0.937244f,
+  -0.435120f, 0.512125f,  1.610679f,  0.190816f,  -0.799075f, -0.377348f,
+  -0.144232f, 0.614383f,  -0.980388f, 1.754150f,  -0.185603f, -0.061854f,
+  -0.807172f, 1.240177f,  1.419531f,  -0.438544f, -5.980774f, 0.139045f,
+  -0.032359f, -0.068887f, -1.237918f, 0.115706f,  0.003164f,  2.924212f,
+  1.246838f,  -0.035833f, 0.810011f,  -0.805894f, 0.010966f,  0.076463f,
+  -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f,
+  -0.168961f, -3.326450f, -2.731094f, 0.002518f,  0.018840f,  -1.656815f,
+  0.068039f,  0.010586f,
+};
+
+static const float vp9_partition_nn_bias_64x64_layer0[8] = {
+  -3.469882f, 0.683989f, 0.194010f,  0.313782f,
+  -3.153335f, 2.245849f, -1.946190f, -3.740020f,
+};
+
+static const float vp9_partition_nn_weights_64x64_layer1[8] = {
+  -8.058566f, 0.108306f, -0.280620f, -0.818823f,
+  -6.445117f, 0.865364f, -1.127127f, -8.808660f,
+};
+
+static const float vp9_partition_nn_bias_64x64_layer1[1] = {
+  6.46909416f,
+};
+
+static const NN_CONFIG vp9_partition_nnconfig_64x64 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_partition_nn_weights_64x64_layer0,
+      vp9_partition_nn_weights_64x64_layer1,
+  },
+  {
+      vp9_partition_nn_bias_64x64_layer0,
+      vp9_partition_nn_bias_64x64_layer1,
+  },
+};
+
+static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = {
+  -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f,  0.027221f,
+  -0.039137f, -0.907724f, -3.151662f, 0.007106f,  0.018726f,  -0.534928f,
+  0.022744f,  0.000159f,  -1.717189f, -3.229031f, -0.027311f, 0.269863f,
+  -0.400747f, -0.394366f, -0.108878f, 0.603027f,  0.455369f,  -0.197170f,
+  1.241746f,  -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f,
+  -0.138347f, -0.030754f, -0.200774f, 0.453795f,  0.055625f,  -3.163116f,
+  -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f,  -0.036439f,
+  -0.801228f, 0.313409f,  -0.159942f, 0.031267f,  0.886454f,  -1.531644f,
+  -0.089655f, 0.037683f,  -0.163441f, -0.130454f, -0.058344f, 0.060011f,
+  0.275387f,  1.552226f,
+};
+
+static const float vp9_partition_nn_bias_32x32_layer0[8] = {
+  -0.838372f, -2.609089f, -0.055763f, 1.329485f,
+  -1.297638f, -2.636622f, -0.826909f, 1.012644f,
+};
+
+static const float vp9_partition_nn_weights_32x32_layer1[8] = {
+  -1.792632f, -7.322353f, -0.683386f, 0.676564f,
+  -1.488118f, -7.527719f, 1.240163f,  0.614309f,
+};
+
+static const float vp9_partition_nn_bias_32x32_layer1[1] = {
+  4.97422546f,
+};
+
+static const NN_CONFIG vp9_partition_nnconfig_32x32 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_partition_nn_weights_32x32_layer0,
+      vp9_partition_nn_weights_32x32_layer1,
+  },
+  {
+      vp9_partition_nn_bias_32x32_layer0,
+      vp9_partition_nn_bias_32x32_layer1,
+  },
+};
+
+static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = {
+  -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f,
+  0.130891f,  -3.096753f, 0.174968f,  -0.188769f, -0.640796f, 1.305661f,
+  1.700638f,  -0.073806f, -4.006781f, -1.630999f, -0.064863f, -0.086410f,
+  -0.148617f, 0.172733f,  -0.018619f, 2.152595f,  0.778405f,  -0.156455f,
+  0.612995f,  -0.467878f, 0.152022f,  -0.236183f, 0.339635f,  -0.087119f,
+  -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f,  -0.793705f,
+  -6.399260f, 0.010624f,  -0.064199f, -0.650621f, 0.338087f,  -0.001531f,
+  1.023655f,  -3.700272f, -0.055281f, -0.386884f, 0.375504f,  -0.898678f,
+  0.281156f,  -0.314611f, 0.863354f,  -0.040582f, -0.145019f, 0.029329f,
+  -2.197880f, -0.108733f,
+};
+
+static const float vp9_partition_nn_bias_16x16_layer0[8] = {
+  0.411516f,  -2.143737f, -3.693192f, 2.123142f,
+  -1.356910f, -3.561016f, -0.765045f, -2.417082f,
+};
+
+static const float vp9_partition_nn_weights_16x16_layer1[8] = {
+  -0.619755f, -2.202391f, -4.337171f, 0.611319f,
+  0.377677f,  -4.998723f, -1.052235f, 1.949922f,
+};
+
+static const float vp9_partition_nn_bias_16x16_layer1[1] = {
+  3.20981717f,
+};
+
+static const NN_CONFIG vp9_partition_nnconfig_16x16 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_partition_nn_weights_16x16_layer0,
+      vp9_partition_nn_weights_16x16_layer1,
+  },
+  {
+      vp9_partition_nn_bias_16x16_layer0,
+      vp9_partition_nn_bias_16x16_layer1,
+  },
+};
+#undef FEATURES
+
+#define FEATURES 6
+static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+  -0.249572f, 0.205532f,  -2.175608f, 1.094836f,  -2.986370f, 0.193160f,
+  -0.143823f, 0.378511f,  -1.997788f, -2.166866f, -1.930158f, -1.202127f,
+  -0.611875f, -0.506422f, -0.432487f, 0.071205f,  0.578172f,  -0.154285f,
+  -0.051830f, 0.331681f,  -1.457177f, -2.443546f, -2.000302f, -1.389283f,
+  0.372084f,  -0.464917f, 2.265235f,  2.385787f,  2.312722f,  2.127868f,
+  -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f,  0.193976f,
+  -0.305611f, 0.256632f,  0.309388f,  -0.437439f, 1.702640f,  -5.007069f,
+  -0.323450f, 0.294227f,  1.267193f,  1.056601f,  0.387181f,  -0.191215f,
+};
+
+static const float vp9_var_part_nn_bias_64_layer0[8] = {
+  -0.044396f, -0.938166f, 0.000000f,  -0.916375f,
+  1.242299f,  0.000000f,  -0.405734f, 0.014206f,
+};
+
+static const float vp9_var_part_nn_weights_64_layer1[8] = {
+  1.635945f,  0.979557f,  0.455315f, 1.197199f,
+  -2.251024f, -0.464953f, 1.378676f, -0.111927f,
+};
+
+static const float vp9_var_part_nn_bias_64_layer1[1] = {
+  -0.37972447f,
+};
+
+static const NN_CONFIG vp9_var_part_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_part_nn_weights_64_layer0,
+      vp9_var_part_nn_weights_64_layer1,
+  },
+  {
+      vp9_var_part_nn_bias_64_layer0,
+      vp9_var_part_nn_bias_64_layer1,
+  },
+};
+
+static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+  0.067243f,  -0.083598f, -2.191159f, 2.726434f,  -3.324013f, 3.477977f,
+  0.323736f,  -0.510199f, 2.960693f,  2.937661f,  2.888476f,  2.938315f,
+  -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f,
+  0.665153f,  -0.273210f, 0.028279f,  0.972220f,  -0.445596f, 1.756611f,
+  -0.177892f, -0.091758f, 0.436661f,  -0.521506f, 0.133786f,  0.266743f,
+  0.637367f,  -0.160084f, -1.396269f, 1.020841f,  -1.112971f, 0.919496f,
+  -0.235883f, 0.651954f,  0.109061f,  -0.429463f, 0.740839f,  -0.962060f,
+  0.299519f,  -0.386298f, 1.550231f,  2.464915f,  1.311969f,  2.561612f,
+};
+
+static const float vp9_var_part_nn_bias_32_layer0[8] = {
+  0.368242f, 0.736617f, 0.000000f,  0.757287f,
+  0.000000f, 0.613248f, -0.776390f, 0.928497f,
+};
+
+static const float vp9_var_part_nn_weights_32_layer1[8] = {
+  0.939884f, -2.420850f, -0.410489f, -0.186690f,
+  0.063287f, -0.522011f, 0.484527f,  -0.639625f,
+};
+
+static const float vp9_var_part_nn_bias_32_layer1[1] = {
+  -0.6455006f,
+};
+
+static const NN_CONFIG vp9_var_part_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_part_nn_weights_32_layer0,
+      vp9_var_part_nn_weights_32_layer1,
+  },
+  {
+      vp9_var_part_nn_bias_32_layer0,
+      vp9_var_part_nn_bias_32_layer1,
+  },
+};
+
+static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+  0.742567f,  -0.580624f, -0.244528f, 0.331661f,  -0.113949f, -0.559295f,
+  -0.386061f, 0.438653f,  1.467463f,  0.211589f,  0.513972f,  1.067855f,
+  -0.876679f, 0.088560f,  -0.687483f, -0.380304f, -0.016412f, 0.146380f,
+  0.015318f,  0.000351f,  -2.764887f, 3.269717f,  2.752428f,  -2.236754f,
+  0.561539f,  -0.852050f, -0.084667f, 0.202057f,  0.197049f,  0.364922f,
+  -0.463801f, 0.431790f,  1.872096f,  -0.091887f, -0.055034f, 2.443492f,
+  -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f,
+  0.642021f,  -0.875117f, 2.040794f,  1.921070f,  1.792413f,  1.839727f,
+};
+
+static const float vp9_var_part_nn_bias_16_layer0[8] = {
+  2.901234f, -1.940932f, -0.198970f, -0.406524f,
+  0.059422f, -1.879207f, -0.232340f, 2.979821f,
+};
+
+static const float vp9_var_part_nn_weights_16_layer1[8] = {
+  -0.528731f, 0.375234f, -0.088422f, 0.668629f,
+  0.870449f,  0.578735f, 0.546103f,  -1.957207f,
+};
+
+static const float vp9_var_part_nn_bias_16_layer1[1] = {
+  -1.95769405f,
+};
+
+static const NN_CONFIG vp9_var_part_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  1,         // num_outputs
+  1,         // num_hidden_layers
+  {
+      8,
+  },  // num_hidden_nodes
+  {
+      vp9_var_part_nn_weights_16_layer0,
+      vp9_var_part_nn_weights_16_layer1,
+  },
+  {
+      vp9_var_part_nn_bias_16_layer0,
+      vp9_var_part_nn_bias_16_layer1,
+  },
+};
+#undef FEATURES
+
+#define FEATURES 12
+#define LABELS 1
+#define NODES 8
+static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = {
+  -0.609728f, -0.409099f, -0.472449f, 0.183769f,  -0.457740f, 0.081089f,
+  0.171003f,  0.578696f,  -0.019043f, -0.856142f, 0.557369f,  -1.779424f,
+  -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f,
+  0.200620f,  0.038013f,  -0.430093f, 0.235083f,  -0.487442f, 0.424814f,
+  -0.232758f, -0.442943f, 0.229397f,  -0.540301f, -0.648421f, -0.649747f,
+  -0.171638f, 0.603824f,  0.468497f,  -0.421580f, 0.178840f,  -0.533838f,
+  -0.029471f, -0.076296f, 0.197426f,  -0.187908f, -0.003950f, -0.065740f,
+  0.085165f,  -0.039674f, -5.640702f, 1.909538f,  -1.434604f, 3.294606f,
+  -0.788812f, 0.196864f,  0.057012f,  -0.019757f, 0.336233f,  0.075378f,
+  0.081503f,  0.491864f,  -1.899470f, -1.764173f, -1.888137f, -1.762343f,
+  0.845542f,  0.202285f,  0.381948f,  -0.150996f, 0.556893f,  -0.305354f,
+  0.561482f,  -0.021974f, -0.703117f, 0.268638f,  -0.665736f, 1.191005f,
+  -0.081568f, -0.115653f, 0.272029f,  -0.140074f, 0.072683f,  0.092651f,
+  -0.472287f, -0.055790f, -0.434425f, 0.352055f,  0.048246f,  0.372865f,
+  0.111499f,  -0.338304f, 0.739133f,  0.156519f,  -0.594644f, 0.137295f,
+  0.613350f,  -0.165102f, -1.003731f, 0.043070f,  -0.887896f, -0.174202f,
+};
+
+static const float vp9_part_split_nn_bias_64_layer0[NODES] = {
+  1.182714f,  0.000000f,  0.902019f,  0.953115f,
+  -1.372486f, -1.288740f, -0.155144f, -3.041362f,
+};
+
+static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = {
+  0.841214f,  0.456016f,  0.869270f, 1.692999f,
+  -1.700494f, -0.911761f, 0.030111f, -1.447548f,
+};
+
+static const float vp9_part_split_nn_bias_64_layer1[LABELS] = {
+  1.17782545f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_64 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_part_split_nn_weights_64_layer0,
+      vp9_part_split_nn_weights_64_layer1,
+  },
+  {
+      vp9_part_split_nn_bias_64_layer0,
+      vp9_part_split_nn_bias_64_layer1,
+  },
+};
+
+static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = {
+  -0.105488f, -0.218662f, 0.010980f,  -0.226979f, 0.028076f,  0.743430f,
+  0.789266f,  0.031907f,  -1.464200f, 0.222336f,  -1.068493f, -0.052712f,
+  -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f,
+  0.271346f,  0.133005f,  1.674203f,  0.689567f,  0.657133f,  0.283524f,
+  0.115529f,  0.738327f,  0.317184f,  -0.179736f, 0.403691f,  0.679350f,
+  0.048925f,  0.271338f,  -1.538921f, -0.900737f, -1.377845f, 0.084245f,
+  0.803122f,  -0.107806f, 0.103045f,  -0.023335f, -0.098116f, -0.127809f,
+  0.037665f,  -0.523225f, 1.622185f,  1.903999f,  1.358889f,  1.680785f,
+  0.027743f,  0.117906f,  -0.158810f, 0.057775f,  0.168257f,  0.062414f,
+  0.086228f,  -0.087381f, -3.066082f, 3.021855f,  -4.092155f, 2.550104f,
+  -0.230022f, -0.207445f, -0.000347f, 0.034042f,  0.097057f,  0.220088f,
+  -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f,  2.643355f,
+  0.319912f,  0.585531f,  -1.018225f, -0.699606f, 1.026490f,  0.169952f,
+  -0.093579f, -0.142352f, -0.107256f, 0.059598f,  0.043190f,  0.507543f,
+  -0.138617f, 0.030197f,  0.059574f,  -0.634051f, -0.586724f, -0.148020f,
+  -0.334380f, 0.459547f,  1.620600f,  0.496850f,  0.639480f,  -0.465715f,
+};
+
+static const float vp9_part_split_nn_bias_32_layer0[NODES] = {
+  -1.125885f, 0.753197f, -0.825808f, 0.004839f,
+  0.583920f,  0.718062f, 0.976741f,  0.796188f,
+};
+
+static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = {
+  -0.458745f, 0.724624f, -0.479720f, -2.199872f,
+  1.162661f,  1.194153f, -0.716896f, 0.824080f,
+};
+
+static const float vp9_part_split_nn_bias_32_layer1[LABELS] = {
+  0.71644074f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_32 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_part_split_nn_weights_32_layer0,
+      vp9_part_split_nn_weights_32_layer1,
+  },
+  {
+      vp9_part_split_nn_bias_32_layer0,
+      vp9_part_split_nn_bias_32_layer1,
+  },
+};
+
+static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = {
+  -0.003629f, -0.046852f, 0.220428f,  -0.033042f, 0.049365f,  0.112818f,
+  -0.306149f, -0.005872f, 1.066947f,  -2.290226f, 2.159505f,  -0.618714f,
+  -0.213294f, 0.451372f,  -0.199459f, 0.223730f,  -0.321709f, 0.063364f,
+  0.148704f,  -0.293371f, 0.077225f,  -0.421947f, -0.515543f, -0.240975f,
+  -0.418516f, 1.036523f,  -0.009165f, 0.032484f,  1.086549f,  0.220322f,
+  -0.247585f, -0.221232f, -0.225050f, 0.993051f,  0.285907f,  1.308846f,
+  0.707456f,  0.335152f,  0.234556f,  0.264590f,  -0.078033f, 0.542226f,
+  0.057777f,  0.163471f,  0.039245f,  -0.725960f, 0.963780f,  -0.972001f,
+  0.252237f,  -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f,
+  -0.621108f, 0.486405f,  -0.221923f, 1.519426f,  -0.857871f, 0.411595f,
+  0.947188f,  0.203339f,  0.174526f,  0.016382f,  0.256879f,  0.049818f,
+  0.057836f,  -0.659096f, 0.459894f,  0.174695f,  0.379359f,  0.062530f,
+  -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f,
+  -0.109342f, 0.002455f,  -0.078746f, -0.391871f, 0.149892f,  -0.239615f,
+  -0.520709f, 0.118568f,  -0.437975f, 0.118116f,  -0.565426f, -0.206446f,
+  0.113407f,  0.558894f,  0.534627f,  1.154350f,  -0.116833f, 1.723311f,
+};
+
+static const float vp9_part_split_nn_bias_16_layer0[NODES] = {
+  0.013109f,  -0.034341f, 0.679845f,  -0.035781f,
+  -0.104183f, 0.098055f,  -0.041130f, 0.160107f,
+};
+
+static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = {
+  1.499564f, -0.403259f, 1.366532f, -0.469868f,
+  0.482227f, -2.076697f, 0.527691f, 0.540495f,
+};
+
+static const float vp9_part_split_nn_bias_16_layer1[LABELS] = {
+  0.01134653f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_16 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_part_split_nn_weights_16_layer0,
+      vp9_part_split_nn_weights_16_layer1,
+  },
+  {
+      vp9_part_split_nn_bias_16_layer0,
+      vp9_part_split_nn_bias_16_layer1,
+  },
+};
+
+static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = {
+  -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f,
+  -0.589145f, 0.203704f,  -0.051007f, -0.113769f, -0.477511f, -0.122603f,
+  -1.329890f, 1.403386f,  0.199636f,  -0.161139f, 2.182090f,  -0.014307f,
+  0.015755f,  -0.208468f, 0.884353f,  0.815920f,  0.632464f,  0.838225f,
+  1.369483f,  -0.029068f, 0.570213f,  -0.573546f, 0.029617f,  0.562054f,
+  -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f,
+  0.173047f,  -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f,
+  -0.406704f, 0.411230f,  -0.144023f, 0.745651f,  -0.595091f, 0.111787f,
+  0.840651f,  0.030123f,  -0.242155f, 0.101486f,  -0.017889f, -0.254467f,
+  -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f,
+  1.623949f,  -0.286369f, 0.170976f,  0.016442f,  -0.598353f, -0.038540f,
+  0.202597f,  -0.933582f, 0.599510f,  0.362273f,  0.577722f,  0.477603f,
+  0.767097f,  0.431532f,  0.457034f,  0.223279f,  0.381349f,  0.033777f,
+  0.423923f,  -0.664762f, 0.385662f,  0.075744f,  0.182681f,  0.024118f,
+  0.319408f,  -0.528864f, 0.976537f,  -0.305971f, -0.189380f, -0.241689f,
+  -1.318092f, 0.088647f,  -0.109030f, -0.945654f, 1.082797f,  0.184564f,
+};
+
+static const float vp9_part_split_nn_bias_8_layer0[NODES] = {
+  -0.237472f, 2.051396f,  0.297062f, -0.730194f,
+  0.060472f,  -0.565959f, 0.560869f, -0.395448f,
+};
+
+static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = {
+  0.568121f,  1.575915f,  -0.544309f, 0.751595f,
+  -0.117911f, -1.340730f, -0.739671f, 0.661216f,
+};
+
+static const float vp9_part_split_nn_bias_8_layer1[LABELS] = {
+  -0.63375306f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_8 = {
+  FEATURES,  // num_inputs
+  LABELS,    // num_outputs
+  1,         // num_hidden_layers
+  {
+      NODES,
+  },  // num_hidden_nodes
+  {
+      vp9_part_split_nn_weights_8_layer0,
+      vp9_part_split_nn_weights_8_layer1,
+  },
+  {
+      vp9_part_split_nn_bias_8_layer0,
+      vp9_part_split_nn_bias_8_layer1,
+  },
+};
+#undef NODES
+#undef FEATURES
+#undef LABELS
+
+// Partition pruning model(linear).
+static const float vp9_partition_feature_mean[24] = {
+  303501.697372f, 3042630.372158f, 24.694696f, 1.392182f,
+  689.413511f,    162.027012f,     1.478213f,  0.0,
+  135382.260230f, 912738.513263f,  28.845217f, 1.515230f,
+  544.158492f,    131.807995f,     1.436863f,  0.0f,
+  43682.377587f,  208131.711766f,  28.084737f, 1.356677f,
+  138.254122f,    119.522553f,     1.252322f,  0.0f,
+};
+
+static const float vp9_partition_feature_std[24] = {
+  673689.212982f, 5996652.516628f, 0.024449f, 1.989792f,
+  985.880847f,    0.014638f,       2.001898f, 0.0f,
+  208798.775332f, 1812548.443284f, 0.018693f, 1.838009f,
+  396.986910f,    0.015657f,       1.332541f, 0.0f,
+  55888.847031f,  448587.962714f,  0.017900f, 1.904776f,
+  98.652832f,     0.016598f,       1.320992f, 0.0f,
+};
+
+// Error tolerance: 0.01%-0.0.05%-0.1%
+static const float vp9_partition_linear_weights[24] = {
+  0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f,
+  0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f,
+  0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f,
+  0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f,
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c
index 6fc7cd1e3c..3a620df693 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c
@@ -24,10 +24,20 @@
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_quantize.h"
 
+static unsigned int get_section_intra_rating(const VP9_COMP *cpi) {
+  unsigned int section_intra_rating;
+
+  section_intra_rating = (cpi->common.frame_type == KEY_FRAME)
+                             ? cpi->twopass.key_frame_section_intra_rating
+                             : cpi->twopass.section_intra_rating;
+
+  return section_intra_rating;
+}
+
 static int get_max_filter_level(const VP9_COMP *cpi) {
   if (cpi->oxcf.pass == 2) {
-    return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
-                                                 : MAX_LOOP_FILTER;
+    unsigned int section_intra_rating = get_section_intra_rating(cpi);
+    return section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER;
   } else {
     return MAX_LOOP_FILTER;
   }
@@ -81,6 +91,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
   // Sum squared error at each filter level
   int64_t ss_err[MAX_LOOP_FILTER + 1];
+  unsigned int section_intra_rating = get_section_intra_rating(cpi);
 
   // Set each entry to -1
   memset(ss_err, 0xFF, sizeof(ss_err));
@@ -99,8 +110,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     // Bias against raising loop filter in favor of lowering it.
     int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
-      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+    if ((cpi->oxcf.pass == 2) && (section_intra_rating < 20))
+      bias = (bias * section_intra_rating) / 20;
 
     // yx, bias less for large block size
     if (cm->tx_mode != ONLY_4X4) bias >>= 1;
@@ -150,7 +161,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
 
-  lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
+  lf->sharpness_level = 0;
 
   if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
     lf->filter_level = 0;
@@ -169,22 +180,20 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
       case VPX_BITS_10:
         filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
         break;
-      case VPX_BITS_12:
+      default:
+        assert(cm->bit_depth == VPX_BITS_12);
         filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
         break;
-      default:
-        assert(0 &&
-               "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
-               "or VPX_BITS_12");
-        return;
     }
 #else
     int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+        cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+        (cm->base_qindex < 200 || cm->width * cm->height > 320 * 240) &&
         cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME)
       filt_guess = 5 * filt_guess >> 3;
 
-#endif  // CONFIG_VP9_HIGHBITDEPTH
     if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
   } else {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h
index cecca058b4..8881b44daa 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_PICKLPF_H_
-#define VP9_ENCODER_VP9_PICKLPF_H_
+#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_
+#define VPX_VP9_ENCODER_VP9_PICKLPF_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -26,4 +26,4 @@ void vp9_pick_filter_level(const struct yv12_buffer_config *sd,
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_PICKLPF_H_
+#endif  // VPX_VP9_ENCODER_VP9_PICKLPF_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c
index 33f3f5a476..b841385baa 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c
@@ -12,6 +12,7 @@
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
+#include <stdlib.h>
 
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
@@ -19,7 +20,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
+#include "vpx_ports/compiler_attributes.h"
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
@@ -41,6 +42,17 @@ typedef struct {
   int in_use;
 } PRED_BUFFER;
 
+typedef struct {
+  PRED_BUFFER *best_pred;
+  PREDICTION_MODE best_mode;
+  TX_SIZE best_tx_size;
+  TX_SIZE best_intra_tx_size;
+  MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
+  uint8_t best_mode_skip_txfm;
+  INTERP_FILTER best_pred_filter;
+} BEST_PICKMODE;
+
 static const int pos_shift_16x16[4][4] = {
   { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
 };
@@ -117,15 +129,24 @@ static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm, const MACROBLOCK *x,
       !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
       ref_frame == LAST_FRAME) {
     // Get base layer mv.
-    MV_REF *candidate =
-        &cm->prev_frame
-             ->mvs[(mi_col >> 1) + (mi_row >> 1) * (cm->mi_cols >> 1)];
-    if (candidate->mv[0].as_int != INVALID_MV) {
-      base_mv->as_mv.row = (candidate->mv[0].as_mv.row * 2);
-      base_mv->as_mv.col = (candidate->mv[0].as_mv.col * 2);
-      clamp_mv_ref(&base_mv->as_mv, xd);
-    } else {
-      base_mv->as_int = INVALID_MV;
+    const int prev_layer = cpi->svc.spatial_layer_id - 1;
+    const int index =
+        (mi_col >> 1) + (mi_row >> 1) * cpi->svc.mi_cols[prev_layer];
+    // prev_frame->mvs[] is allocated to size mi_cols * mi_rows corresponding
+    // to the previous spatial layer, so the index check is against
+    // svc.mi_col/rows[prev_layer].
+    if (index < cpi->svc.mi_cols[prev_layer] * cpi->svc.mi_rows[prev_layer]) {
+      MV_REF *candidate = &cm->prev_frame->mvs[index];
+      // Avoid using base_mv if scaled mv is out of range, for either component.
+      if (candidate->mv[0].as_int != INVALID_MV &&
+          abs(candidate->mv[0].as_mv.row) <= INT16_MAX >> 1 &&
+          abs(candidate->mv[0].as_mv.col) <= INT16_MAX >> 1) {
+        base_mv->as_mv.row = candidate->mv[0].as_mv.row * 2;
+        base_mv->as_mv.col = candidate->mv[0].as_mv.col * 2;
+        clamp_mv_ref(&base_mv->as_mv, xd);
+      } else {
+        base_mv->as_int = INVALID_MV;
+      }
     }
   }
 
@@ -158,6 +179,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const MvLimits tmp_mv_limits = x->mv_limits;
   int rv = 0;
   int cost_list[5];
+  int search_subpel = 1;
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
       vp9_get_scaled_ref_frame(cpi, ref);
   if (scaled_ref_frame) {
@@ -170,6 +192,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   }
   vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
 
+  // Limit motion vector for large lightning change.
+  if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) {
+    x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10);
+    x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10);
+    x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10);
+    x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10);
+  }
+
   assert(x->mv_best_ref_index[ref] <= 2);
   if (x->mv_best_ref_index[ref] < 2)
     mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
@@ -184,9 +214,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   else
     center_mv = tmp_mv->as_mv;
 
-  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
-                        cond_cost_list(cpi, cost_list), &center_mv,
-                        &tmp_mv->as_mv, INT_MAX, 0);
+  if (x->sb_use_mv_part) {
+    tmp_mv->as_mv.row = x->sb_mvrow_part >> 3;
+    tmp_mv->as_mv.col = x->sb_mvcol_part >> 3;
+  } else {
+    vp9_full_pixel_search(
+        cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
+        cond_cost_list(cpi, cost_list), &center_mv, &tmp_mv->as_mv, INT_MAX, 0);
+  }
 
   x->mv_limits = tmp_mv_limits;
 
@@ -202,15 +237,28 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   rv =
       !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar);
 
-  if (rv) {
-    const int subpel_force_stop = use_base_mv && cpi->sf.base_mv_aggressive
-                                      ? 2
-                                      : cpi->sf.mv.subpel_force_stop;
+  // For SVC on non-reference frame, avoid subpel for (0, 0) motion.
+  if (cpi->use_svc && cpi->svc.non_reference_frame) {
+    if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0;
+  }
+
+  if (rv && search_subpel) {
+    SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop;
+    if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL;
+    if (cpi->sf.mv.enable_adaptive_subpel_force_stop) {
+      const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh;
+      if (abs(tmp_mv->as_mv.row) >= mv_thresh ||
+          abs(tmp_mv->as_mv.col) >= mv_thresh)
+        subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above;
+      else
+        subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below;
+    }
     cpi->find_fractional_mv_step(
         x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
         x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop,
-        cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0);
+        cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0,
+        cpi->sf.use_accurate_subpel_search);
     *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
                                x->mvcost, MV_COST_WEIGHT);
   }
@@ -230,6 +278,7 @@ static void block_variance(const uint8_t *src, int src_stride,
 #endif
                            uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
   int i, j, k = 0;
+  uint32_t k_sqr = 0;
 
   *sse = 0;
   *sum = 0;
@@ -267,7 +316,8 @@ static void block_variance(const uint8_t *src, int src_stride,
 #endif
       *sse += sse8x8[k];
       *sum += sum8x8[k];
-      var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+      k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+      var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k];
       k++;
     }
   }
@@ -281,6 +331,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
   const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
   const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
   int i, j, k = 0;
+  uint32_t k_sqr = 0;
 
   for (i = 0; i < nh; i += 2) {
     for (j = 0; j < nw; j += 2) {
@@ -288,19 +339,109 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
                  sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
       sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
                  sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
-      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
-                                       (b_width_log2_lookup[unit_size] +
-                                        b_height_log2_lookup[unit_size] + 6));
+      k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+                         (b_width_log2_lookup[unit_size] +
+                          b_height_log2_lookup[unit_size] + 6));
+      var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k];
       k++;
     }
   }
 }
 
+// Adjust the ac_thr according to speed, width, height and normalized sum
+static int ac_thr_factor(const int speed, const int width, const int height,
+                         const int norm_sum) {
+  if (speed >= 8 && norm_sum < 5) {
+    if (width <= 640 && height <= 480)
+      return 4;
+    else
+      return 2;
+  }
+  return 1;
+}
+
+static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize,
+                                 MACROBLOCKD *const xd, unsigned int var,
+                                 unsigned int sse, int64_t ac_thr,
+                                 unsigned int source_variance, int is_intra) {
+  // TODO(marpan): Tune selection for intra-modes, screen content, etc.
+  TX_SIZE tx_size;
+  unsigned int var_thresh = is_intra ? (unsigned int)ac_thr : 1;
+  int limit_tx = 1;
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      (source_variance == 0 || var < var_thresh))
+    limit_tx = 0;
+  if (cpi->common.tx_mode == TX_MODE_SELECT) {
+    if (sse > (var << 2))
+      tx_size = VPXMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+    else
+      tx_size = TX_8X8;
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && limit_tx &&
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+      tx_size = TX_8X8;
+    else if (tx_size > TX_16X16 && limit_tx)
+      tx_size = TX_16X16;
+    // For screen-content force 4X4 tx_size over 8X8, for large variance.
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 &&
+        bsize <= BLOCK_16X16 && ((var >> 5) > (unsigned int)ac_thr))
+      tx_size = TX_4X4;
+  } else {
+    tx_size = VPXMIN(max_txsize_lookup[bsize],
+                     tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+  }
+  return tx_size;
+}
+
+static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                      MACROBLOCK *x, MACROBLOCKD *xd) {
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
+  uint8_t *const src_buf_base = p->src.buf;
+  uint8_t *const dst_buf_base = pd->dst.buf;
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+  int row, col;
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int max_blocks_wide =
+      num_4x4_w + (xd->mb_to_right_edge >= 0
+                       ? 0
+                       : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high =
+      num_4x4_h + (xd->mb_to_bottom_edge >= 0
+                       ? 0
+                       : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+      vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode,
+                              x->skip_encode ? p->src.buf : pd->dst.buf,
+                              x->skip_encode ? src_stride : dst_stride,
+                              pd->dst.buf, dst_stride, col, row, 0);
+    }
+  }
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+}
+
 static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                     MACROBLOCK *x, MACROBLOCKD *xd,
                                     int *out_rate_sum, int64_t *out_dist_sum,
                                     unsigned int *var_y, unsigned int *sse_y,
-                                    int mi_row, int mi_col, int *early_term) {
+                                    int mi_row, int mi_col, int *early_term,
+                                    int *flag_preduv_computed) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -311,8 +452,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
   struct macroblockd_plane *const pd = &xd->plane[0];
   const uint32_t dc_quant = pd->dequant[0];
   const uint32_t ac_quant = pd->dequant[1];
-  const int64_t dc_thr = dc_quant * dc_quant >> 6;
-  const int64_t ac_thr = ac_quant * ac_quant >> 6;
+  int64_t dc_thr = dc_quant * dc_quant >> 6;
+  int64_t ac_thr = ac_quant * ac_quant >> 6;
   unsigned int var;
   int sum;
   int skip_dc = 0;
@@ -325,6 +466,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
   unsigned int var8x8[64] = { 0 };
   TX_SIZE tx_size;
   int i, k;
+  uint32_t sum_sqr;
 #if CONFIG_VP9_HIGHBITDEPTH
   const vpx_bit_depth_t bd = cpi->common.bit_depth;
 #endif
@@ -336,31 +478,37 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
                  cpi->common.use_highbitdepth, bd,
 #endif
                  sse8x8, sum8x8, var8x8);
-  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+  sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4);
+  var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse;
 
   *var_y = var;
   *sse_y = sse;
 
-  if (cpi->common.tx_mode == TX_MODE_SELECT) {
-    if (sse > (var << 2))
-      tx_size = VPXMIN(max_txsize_lookup[bsize],
-                       tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    else
-      tx_size = TX_8X8;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->oxcf.speed > 5)
+    ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+                                     (abs(sum) >> (bw + bh)),
+                                     cpi->svc.temporal_layer_id);
+  else
+    ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+                            cpi->common.height, abs(sum) >> (bw + bh));
+#else
+  ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+                          cpi->common.height, abs(sum) >> (bw + bh));
+#endif
 
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
-      tx_size = TX_8X8;
-    else if (tx_size > TX_16X16)
-      tx_size = TX_16X16;
-  } else {
-    tx_size = VPXMIN(max_txsize_lookup[bsize],
-                     tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-  }
-
-  assert(tx_size >= TX_8X8);
+  tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr,
+                              x->source_variance, 0);
+  // The code below for setting skip flag assumes tranform size of at least 8x8,
+  // so force this lower limit on transform.
+  if (tx_size < TX_8X8) tx_size = TX_8X8;
   xd->mi[0]->tx_size = tx_size;
 
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source &&
+      x->source_variance == 0)
+    dc_thr = dc_thr << 1;
+
   // Evaluate if the partition block is a skippable block in Y plane.
   {
     unsigned int sse16x16[16] = { 0 };
@@ -428,22 +576,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
     // Transform skipping test in UV planes.
     for (i = 1; i <= 2; i++) {
-      struct macroblock_plane *const p = &x->plane[i];
-      struct macroblockd_plane *const pd = &xd->plane[i];
-      const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
+      struct macroblock_plane *const p_uv = &x->plane[i];
+      struct macroblockd_plane *const pd_uv = &xd->plane[i];
+      const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv);
       const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
-      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv);
       const int uv_bw = b_width_log2_lookup[uv_bsize];
       const int uv_bh = b_height_log2_lookup[uv_bsize];
       const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
                      (uv_bh - b_height_log2_lookup[unit_size]);
-      const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
-      const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+      const uint32_t uv_dc_thr =
+          pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf);
+      const uint32_t uv_ac_thr =
+          pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf);
       int j = i - 1;
 
       vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
-      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(
-          p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+      flag_preduv_computed[i - 1] = 1;
+      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride,
+                                           pd_uv->dst.buf, pd_uv->dst.stride,
+                                           &sse_uv[j]);
 
       if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
           (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
@@ -457,7 +609,6 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
     if (skip_uv[0] & skip_uv[1]) {
       *early_term = 1;
     }
-
     return;
   }
 
@@ -494,7 +645,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
                               MACROBLOCKD *xd, int *out_rate_sum,
                               int64_t *out_dist_sum, unsigned int *var_y,
-                              unsigned int *sse_y) {
+                              unsigned int *sse_y, int is_intra) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -514,24 +665,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   *var_y = var;
   *sse_y = sse;
 
-  if (cpi->common.tx_mode == TX_MODE_SELECT) {
-    if (sse > (var << 2))
-      xd->mi[0]->tx_size =
-          VPXMIN(max_txsize_lookup[bsize],
-                 tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    else
-      xd->mi[0]->tx_size = TX_8X8;
-
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
-      xd->mi[0]->tx_size = TX_8X8;
-    else if (xd->mi[0]->tx_size > TX_16X16)
-      xd->mi[0]->tx_size = TX_16X16;
-  } else {
-    xd->mi[0]->tx_size =
-        VPXMIN(max_txsize_lookup[bsize],
-               tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-  }
+  xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr,
+                                         x->source_variance, is_intra);
 
   // Evaluate if the partition block is a skippable block in Y plane.
   {
@@ -590,24 +725,9 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   *out_dist_sum += dist << 4;
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
 static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
                       int *skippable, int64_t *sse, BLOCK_SIZE bsize,
-                      TX_SIZE tx_size) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int var_y, sse_y;
-
-  (void)tx_size;
-  model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
-                    &sse_y);
-  *sse = INT_MAX;
-  *skippable = 0;
-  return;
-}
-#else
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
-                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
-                      TX_SIZE tx_size) {
+                      TX_SIZE tx_size, int rd_computed, int is_intra) {
   MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblockd_plane *pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
@@ -624,20 +744,44 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
   const int bw = 4 * num_4x4_w;
   const int bh = 4 * num_4x4_h;
 
+  if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME &&
+      (bsize < BLOCK_32X32 ||
+       (cpi->use_svc &&
+        (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) {
+    unsigned int var_y, sse_y;
+    (void)tx_size;
+    if (!rd_computed)
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
+                        &var_y, &sse_y, is_intra);
+    *sse = INT_MAX;
+    *skippable = 0;
+    return;
+  }
+
   (void)cpi;
 
   // The max tx_size passed in is TX_16X16.
   assert(tx_size != TX_32X32);
-
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+                              p->src.stride, pd->dst.buf, pd->dst.stride,
+                              x->e_mbd.bd);
+  } else {
+    vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                       pd->dst.buf, pd->dst.stride);
+  }
+#else
   vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
+#endif
   *skippable = 1;
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
   for (r = 0; r < max_blocks_high; r += block_step) {
     for (c = 0; c < num_4x4_w; c += block_step) {
       if (c < max_blocks_wide) {
-        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+        const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
         tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
         tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
         tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -646,29 +790,26 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
         const int16_t *src_diff;
         src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
+        // skip block condition should be handled before this is called.
+        assert(!x->skip_block);
+
         switch (tx_size) {
           case TX_16X16:
-            vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
-            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+            vpx_hadamard_16x16(src_diff, diff_stride, coeff);
+            vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
             break;
           case TX_8X8:
-            vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
-            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+            vpx_hadamard_8x8(src_diff, diff_stride, coeff);
+            vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
             break;
-          case TX_4X4:
-            x->fwd_txm4x4(src_diff, coeff, diff_stride);
-            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
+          default:
+            assert(tx_size == TX_4X4);
+            x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+            vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            scan_order);
             break;
-          default: assert(0); break;
         }
         *skippable &= (*eob == 0);
         eob_cost += 1;
@@ -699,7 +840,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
         if (*eob == 1)
           this_rdc->rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
+          this_rdc->rate += vpx_satd(qcoeff, step << 4);
 
         this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
       }
@@ -711,7 +852,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
   this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
   this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
 }
-#endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
@@ -799,13 +939,11 @@ static void free_pred_buffer(PRED_BUFFER *p) {
   if (p != NULL) p->in_use = 0;
 }
 
-static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-                                 int mi_row, int mi_col,
-                                 MV_REFERENCE_FRAME ref_frame,
-                                 PREDICTION_MODE this_mode, unsigned int var_y,
-                                 unsigned int sse_y,
-                                 struct buf_2d yv12_mb[][MAX_MB_PLANE],
-                                 int *rate, int64_t *dist) {
+static void encode_breakout_test(
+    VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+    MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y,
+    unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate,
+    int64_t *dist, int *flag_preduv_computed) {
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
@@ -815,6 +953,8 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   // Skipping threshold for dc.
   unsigned int thresh_dc;
   int motion_low = 1;
+
+  if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return;
   if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 ||
       mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64)
     motion_low = 0;
@@ -865,9 +1005,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
       thresh_dc_uv = 0;
     }
 
-    // Skip UV prediction unless breakout is zero (lossless) to save
-    // computation with low impact on the result
-    if (x->encode_breakout == 0) {
+    if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) {
       xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
       xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
       vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
@@ -921,8 +1059,8 @@ static void estimate_block_intra(int plane, int block, int row, int col,
   VP9_COMP *const cpi = args->cpi;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[0];
-  struct macroblockd_plane *const pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
   uint8_t *const src_buf_base = p->src.buf;
   uint8_t *const dst_buf_base = pd->dst.buf;
@@ -932,8 +1070,8 @@ static void estimate_block_intra(int plane, int block, int row, int col,
 
   (void)block;
 
-  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
-  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+  p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+  pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
   // Use source buffer as an approximation for the fully reconstructed buffer.
   vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize], tx_size,
                           args->mode, x->skip_encode ? p->src.buf : pd->dst.buf,
@@ -942,13 +1080,12 @@ static void estimate_block_intra(int plane, int block, int row, int col,
 
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
-    // TODO(jingning): This needs further refactoring.
     block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
-              VPXMIN(tx_size, TX_16X16));
+              VPXMIN(tx_size, TX_16X16), 0, 1);
   } else {
     unsigned int var = 0;
     unsigned int sse = 0;
-    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
+    model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &var, &sse, plane,
                        plane);
   }
 
@@ -958,10 +1095,11 @@ static void estimate_block_intra(int plane, int block, int row, int col,
   args->rdc->dist += this_rdc.dist;
 }
 
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = {
   { THR_DC, THR_V_PRED, THR_H_PRED, THR_TM },
   { THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV },
   { THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG },
+  { THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA },
 };
 
 static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
@@ -981,8 +1119,34 @@ static int mode_offset(const PREDICTION_MODE mode) {
   }
 }
 
+static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
+                                             const int *const thresh_fact) {
+  int is_rd_less_than_thresh;
+  is_rd_less_than_thresh =
+      best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
+  return is_rd_less_than_thresh;
+}
+
+static INLINE void update_thresh_freq_fact_row_mt(
+    VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
+    int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
+    THR_MODES best_mode_idx, PREDICTION_MODE mode) {
+  THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+  int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx;
+  int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx];
+  if (thr_mode_idx == best_mode_idx)
+    *freq_fact -= (*freq_fact >> 4);
+  else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV &&
+           ref_frame == LAST_FRAME && source_variance < 5) {
+    *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32);
+  } else {
+    *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC,
+                        cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+  }
+}
+
 static INLINE void update_thresh_freq_fact(
-    VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
+    VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
     BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx,
     PREDICTION_MODE mode) {
   THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
@@ -1015,6 +1179,7 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
   const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
   bmode_costs = cpi->y_mode_costs[A][L];
+  assert(bsize >= BLOCK_8X8);
 
   (void)ctx;
   vp9_rd_cost_reset(&best_rdc);
@@ -1089,33 +1254,22 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
   { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV },
   { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV }
 };
-static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {
-  { LAST_FRAME, ZEROMV },      { GOLDEN_FRAME, ZEROMV },
-  { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
+
+#define RT_INTER_MODES_SVC 8
+static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = {
+  { LAST_FRAME, ZEROMV },      { LAST_FRAME, NEARESTMV },
+  { LAST_FRAME, NEARMV },      { GOLDEN_FRAME, ZEROMV },
   { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
   { LAST_FRAME, NEWMV },       { GOLDEN_FRAME, NEWMV }
 };
 
-static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
-  const VP9_COMMON *const cm = &cpi->common;
-  // Reduce the intra cost penalty for small blocks (<=16x16).
-  int reduction_fac =
-      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
-  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
-    // Don't reduce intra cost penalty if estimated noise level is high.
-    reduction_fac = 0;
-  return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q,
-                                    cm->bit_depth) >>
-         reduction_fac;
-}
-
 static INLINE void find_predictors(
     VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
     int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask,
-    const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col,
+    TileDataEnc *tile_data, int mi_row, int mi_col,
     struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize,
-    int force_skip_low_temp_var) {
+    int force_skip_low_temp_var, int comp_pred_allowed) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
@@ -1125,11 +1279,11 @@ static INLINE void find_predictors(
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   frame_mv[ZEROMV][ref_frame].as_int = 0;
   // this needs various further optimizations. to be continued..
-  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+  if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) {
     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
     vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
-    if (cm->use_prev_frame_mvs) {
+    if (cm->use_prev_frame_mvs || comp_pred_allowed) {
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
                        x->mbmi_ext->mode_context);
     } else {
@@ -1155,7 +1309,8 @@ static INLINE void find_predictors(
 static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd,
                                 PREDICTION_MODE this_mode, RD_COST *this_rdc,
                                 BLOCK_SIZE bsize, int mv_row, int mv_col,
-                                int is_last_frame) {
+                                int is_last_frame, int lowvar_highsumdiff,
+                                int is_skin) {
   // Bias against MVs associated with NEWMV mode that are very different from
   // top/left neighbors.
   if (this_mode == NEWMV) {
@@ -1202,9 +1357,12 @@ static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd,
   // If noise estimation is enabled, and estimated level is above threshold,
   // add a bias to LAST reference with small motion, for large blocks.
   if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 &&
-      is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) {
-    this_rdc->rdcost = 7 * this_rdc->rdcost >> 3;
-  }
+      is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8)
+    this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3);
+  else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 &&
+           is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 &&
+           mv_col > -16)
+    this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3);
 }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
@@ -1212,18 +1370,16 @@ static void vp9_pickmode_ctx_den_update(
     VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
     int ref_frame_cost[MAX_REF_FRAMES],
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred,
-    TX_SIZE best_tx_size, PREDICTION_MODE best_mode,
-    MV_REFERENCE_FRAME best_ref_frame, INTERP_FILTER best_pred_filter,
-    uint8_t best_mode_skip_txfm) {
+    BEST_PICKMODE *bp) {
   ctx_den->zero_last_cost_orig = zero_last_cost_orig;
   ctx_den->ref_frame_cost = ref_frame_cost;
   ctx_den->frame_mv = frame_mv;
   ctx_den->reuse_inter_pred = reuse_inter_pred;
-  ctx_den->best_tx_size = best_tx_size;
-  ctx_den->best_mode = best_mode;
-  ctx_den->best_ref_frame = best_ref_frame;
-  ctx_den->best_pred_filter = best_pred_filter;
-  ctx_den->best_mode_skip_txfm = best_mode_skip_txfm;
+  ctx_den->best_tx_size = bp->best_tx_size;
+  ctx_den->best_mode = bp->best_mode;
+  ctx_den->best_ref_frame = bp->best_ref_frame;
+  ctx_den->best_pred_filter = bp->best_pred_filter;
+  ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
 }
 
 static void recheck_zeromv_after_denoising(
@@ -1239,8 +1395,10 @@ static void recheck_zeromv_after_denoising(
       ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
       ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
        (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+        cpi->svc.number_spatial_layers == 1 &&
         decision == FILTER_ZEROMV_BLOCK))) {
     // Check if we should pick ZEROMV on denoised signal.
+    VP9_COMMON *const cm = &cpi->common;
     int rate = 0;
     int64_t dist = 0;
     uint32_t var_y = UINT_MAX;
@@ -1248,12 +1406,14 @@ static void recheck_zeromv_after_denoising(
     RD_COST this_rdc;
     mi->mode = ZEROMV;
     mi->ref_frame[0] = LAST_FRAME;
-    mi->ref_frame[1] = NONE;
+    mi->ref_frame[1] = NO_REF_FRAME;
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME);
     mi->mv[0].as_int = 0;
     mi->interp_filter = EIGHTTAP;
+    if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR;
     xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
     vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-    model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+    model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y, 0);
     this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] +
                     cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
                                         [INTER_OFFSET(ZEROMV)];
@@ -1265,6 +1425,7 @@ static void recheck_zeromv_after_denoising(
       this_rdc = *best_rdc;
       mi->mode = ctx_den->best_mode;
       mi->ref_frame[0] = ctx_den->best_ref_frame;
+      set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME);
       mi->interp_filter = ctx_den->best_pred_filter;
       if (ctx_den->best_ref_frame == INTRA_FRAME) {
         mi->mv[0].as_int = INVALID_MV;
@@ -1335,42 +1496,240 @@ static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
   return force_skip_low_temp_var;
 }
 
+static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                              int mi_row, int mi_col, PRED_BUFFER *tmp,
+                              BLOCK_SIZE bsize, int reuse_inter_pred,
+                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
+                              unsigned int *sse_y, int force_smooth_filter,
+                              int *this_early_term, int *flag_preduv_computed,
+                              int use_model_yrd_large) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+
+  int pf_rate[3] = { 0 };
+  int64_t pf_dist[3] = { 0 };
+  int curr_rate[3] = { 0 };
+  unsigned int pf_var[3] = { 0 };
+  unsigned int pf_sse[3] = { 0 };
+  TX_SIZE pf_tx_size[3] = { 0 };
+  int64_t best_cost = INT64_MAX;
+  INTERP_FILTER best_filter = SWITCHABLE, filter;
+  PRED_BUFFER *current_pred = *this_mode_pred;
+  uint8_t skip_txfm = SKIP_TXFM_NONE;
+  int best_early_term = 0;
+  int best_flag_preduv_computed[2] = { 0 };
+  INTERP_FILTER filter_start = force_smooth_filter ? EIGHTTAP_SMOOTH : EIGHTTAP;
+  INTERP_FILTER filter_end = EIGHTTAP_SMOOTH;
+  for (filter = filter_start; filter <= filter_end; ++filter) {
+    int64_t cost;
+    mi->interp_filter = filter;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    // For large partition blocks, extra testing is done.
+    if (use_model_yrd_large)
+      model_rd_for_sb_y_large(cpi, bsize, x, xd, &pf_rate[filter],
+                              &pf_dist[filter], &pf_var[filter],
+                              &pf_sse[filter], mi_row, mi_col, this_early_term,
+                              flag_preduv_computed);
+    else
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+                        &pf_var[filter], &pf_sse[filter], 0);
+    curr_rate[filter] = pf_rate[filter];
+    pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+    cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+    pf_tx_size[filter] = mi->tx_size;
+    if (cost < best_cost) {
+      best_filter = filter;
+      best_cost = cost;
+      skip_txfm = x->skip_txfm[0];
+      best_early_term = *this_early_term;
+      best_flag_preduv_computed[0] = flag_preduv_computed[0];
+      best_flag_preduv_computed[1] = flag_preduv_computed[1];
+
+      if (reuse_inter_pred) {
+        if (*this_mode_pred != current_pred) {
+          free_pred_buffer(*this_mode_pred);
+          *this_mode_pred = current_pred;
+        }
+        if (filter != filter_end) {
+          current_pred = &tmp[get_pred_buffer(tmp, 3)];
+          pd->dst.buf = current_pred->data;
+          pd->dst.stride = bw;
+        }
+      }
+    }
+  }
+
+  if (reuse_inter_pred && *this_mode_pred != current_pred)
+    free_pred_buffer(current_pred);
+
+  mi->interp_filter = best_filter;
+  mi->tx_size = pf_tx_size[best_filter];
+  this_rdc->rate = curr_rate[best_filter];
+  this_rdc->dist = pf_dist[best_filter];
+  *var_y = pf_var[best_filter];
+  *sse_y = pf_sse[best_filter];
+  x->skip_txfm[0] = skip_txfm;
+  *this_early_term = best_early_term;
+  flag_preduv_computed[0] = best_flag_preduv_computed[0];
+  flag_preduv_computed[1] = best_flag_preduv_computed[1];
+  if (reuse_inter_pred) {
+    pd->dst.buf = (*this_mode_pred)->data;
+    pd->dst.stride = (*this_mode_pred)->stride;
+  } else if (best_filter < filter_end) {
+    mi->interp_filter = best_filter;
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+  }
+}
+
+static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x,
+                         int_mv frame_mv[][MAX_REF_FRAMES],
+                         MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+                         BLOCK_SIZE bsize, int mi_row, int mi_col,
+                         int best_pred_sad, int *rate_mv,
+                         unsigned int best_sse_sofar, RD_COST *best_rdc) {
+  SVC *const svc = &cpi->svc;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mi = xd->mi[0];
+  SPEED_FEATURES *const sf = &cpi->sf;
+
+  if (ref_frame > LAST_FRAME && gf_temporal_ref &&
+      cpi->oxcf.rc_mode == VPX_CBR) {
+    int tmp_sad;
+    uint32_t dis;
+    int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+    if (bsize < BLOCK_16X16) return -1;
+
+    tmp_sad = vp9_int_pro_motion_estimation(
+        cpi, x, bsize, mi_row, mi_col,
+        &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv);
+
+    if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+    if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1;
+
+    frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
+    *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+                               &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+    frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+    frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+    cpi->find_fractional_mv_step(
+        x, &frame_mv[NEWMV][ref_frame].as_mv,
+        &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+        cpi->common.allow_high_precision_mv, x->errorperbit,
+        &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+        cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0,
+        cpi->sf.use_accurate_subpel_search);
+  } else if (svc->use_base_mv && svc->spatial_layer_id) {
+    if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
+      const int pre_stride = xd->plane[0].pre[0].stride;
+      unsigned int base_mv_sse = UINT_MAX;
+      int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4;
+      const uint8_t *const pre_buf =
+          xd->plane[0].pre[0].buf +
+          (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
+          (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
+      cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                            pre_buf, pre_stride, &base_mv_sse);
+
+      // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16,
+      // for SVC encoding.
+      if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 &&
+          frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+          frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+        return -1;
+
+      // Exit NEWMV search if base_mv_sse is large.
+      if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar)
+        return -1;
+      if ((base_mv_sse >> 1) < best_sse_sofar) {
+        // Base layer mv is good.
+        // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
+        // (0, 0) mode is already tested.
+        unsigned int base_mv_sse_normalized =
+            base_mv_sse >>
+            (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+        if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar &&
+            base_mv_sse_normalized < 400 &&
+            frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+            frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+          return -1;
+        if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                    &frame_mv[NEWMV][ref_frame], rate_mv,
+                                    best_rdc->rdcost, 1)) {
+          return -1;
+        }
+      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                         &frame_mv[NEWMV][ref_frame], rate_mv,
+                                         best_rdc->rdcost, 0)) {
+        return -1;
+      }
+    } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                       &frame_mv[NEWMV][ref_frame], rate_mv,
+                                       best_rdc->rdcost, 0)) {
+      return -1;
+    }
+  } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                                     &frame_mv[NEWMV][ref_frame], rate_mv,
+                                     best_rdc->rdcost, 0)) {
+    return -1;
+  }
+
+  return 0;
+}
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  bp->best_mode = ZEROMV;
+  bp->best_ref_frame = LAST_FRAME;
+  bp->best_tx_size = TX_SIZES;
+  bp->best_intra_tx_size = TX_SIZES;
+  bp->best_pred_filter = EIGHTTAP;
+  bp->best_mode_skip_txfm = SKIP_TXFM_NONE;
+  bp->best_second_ref_frame = NO_REF_FRAME;
+  bp->best_pred = NULL;
+}
+
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
                          int mi_row, int mi_col, RD_COST *rd_cost,
                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  const SVC *const svc = &cpi->svc;
+  SVC *const svc = &cpi->svc;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-  PREDICTION_MODE best_mode = ZEROMV;
-  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
-  MV_REFERENCE_FRAME usable_ref_frame;
-  TX_SIZE best_tx_size = TX_SIZES;
-  INTERP_FILTER best_pred_filter = EIGHTTAP;
+
+  BEST_PICKMODE best_pickmode;
+
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE] = { 0 };
   RD_COST this_rdc, best_rdc;
-  uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
-  const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize);
+  const int intra_cost_penalty =
+      vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
   int64_t inter_mode_thresh =
       RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0);
   const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
-  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+  const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES;
+  const int *const rd_thresh_freq_fact =
+      (cpi->sf.adaptive_rd_thresh_row_mt)
+          ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx])
+          : tile_data->thresh_freq_fact[bsize];
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  const int denoise_recheck_zeromv = 1;
+#endif
   INTERP_FILTER filter_ref;
-  const int bsl = mi_width_log2_lookup[bsize];
-  const int pred_filter_search =
-      cm->interp_filter == SWITCHABLE
-          ? (((mi_row + mi_col) >> bsl) +
-             get_chessboard_index(cm->current_video_frame)) &
-                0x1
-          : 0;
+  int pred_filter_search = cm->interp_filter == SWITCHABLE;
   int const_motion[MAX_REF_FRAMES] = { 0 };
   const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
   const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
@@ -1378,12 +1737,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   // process.
   // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
   PRED_BUFFER tmp[4];
-  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64] VPX_UNINITIALIZED);
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64] VPX_UNINITIALIZED);
 #endif
   struct buf_2d orig_dst = pd->dst;
-  PRED_BUFFER *best_pred = NULL;
   PRED_BUFFER *this_mode_pred = NULL;
   const int pixels_in_block = bh * bw;
   int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
@@ -1397,12 +1755,87 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   int use_golden_nonzeromv = 1;
   int force_skip_low_temp_var = 0;
   int skip_ref_find_pred[4] = { 0 };
+  unsigned int sse_zeromv_normalized = UINT_MAX;
+  unsigned int best_sse_sofar = UINT_MAX;
+  int gf_temporal_ref = 0;
+  int force_test_gf_zeromv = 0;
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_PICKMODE_CTX_DEN ctx_den;
   int64_t zero_last_cost_orig = INT64_MAX;
+  int denoise_svc_pickmode = 1;
 #endif
+  INTERP_FILTER filter_gf_svc = EIGHTTAP;
+  MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME;
+  const struct segmentation *const seg = &cm->seg;
+  int comp_modes = 0;
+  int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
+  int flag_svc_subpel = 0;
+  int svc_mv_col = 0;
+  int svc_mv_row = 0;
+  int no_scaling = 0;
+  int large_block = 0;
+  int use_model_yrd_large = 0;
+  unsigned int thresh_svc_skip_golden = 500;
+  unsigned int thresh_skip_golden = 500;
+  int force_smooth_filter = cpi->sf.force_smooth_interpol;
+  int scene_change_detected =
+      cpi->rc.high_source_sad ||
+      (cpi->use_svc && cpi->svc.high_source_sad_superframe);
+
+  init_best_pickmode(&best_pickmode);
+
+  x->encode_breakout = seg->enabled
+                           ? cpi->segment_encode_breakout[mi->segment_id]
+                           : cpi->encode_breakout;
+
+  x->source_variance = UINT_MAX;
+  if (cpi->sf.default_interp_filter == BILINEAR) {
+    best_pickmode.best_pred_filter = BILINEAR;
+    filter_gf_svc = BILINEAR;
+  }
+  if (cpi->use_svc && svc->spatial_layer_id > 0) {
+    int layer =
+        LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id,
+                         svc->number_temporal_layers);
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+    if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1;
+  }
+  if (svc->spatial_layer_id > 0 &&
+      (svc->high_source_sad_superframe || no_scaling))
+    thresh_svc_skip_golden = 0;
+  // Lower the skip threshold if lower spatial layer is better quality relative
+  // to current layer.
+  else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 &&
+           cm->base_qindex > svc->lower_layer_qindex + 15)
+    thresh_svc_skip_golden = 100;
+  // Increase skip threshold if lower spatial layer is lower quality relative
+  // to current layer.
+  else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 &&
+           cm->base_qindex < svc->lower_layer_qindex - 20)
+    thresh_svc_skip_golden = 1000;
+
+  if (!cpi->use_svc ||
+      (svc->use_gf_temporal_ref_current_layer &&
+       !svc->layer_context[svc->temporal_layer_id].is_key_frame)) {
+    struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf;
+    struct scale_factors *const sf_golden =
+        &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+    gf_temporal_ref = 1;
+    // For temporal long term prediction, check that the golden reference
+    // is same scale as last reference, otherwise disable.
+    if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
+        (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
+      gf_temporal_ref = 0;
+    } else {
+      if (cpi->rc.avg_frame_low_motion > 70)
+        thresh_svc_skip_golden = 500;
+      else
+        thresh_svc_skip_golden = 0;
+    }
+  }
 
   init_ref_frame_cost(cm, xd, ref_frame_cost);
+  memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES);
 
   if (reuse_inter_pred) {
     int i;
@@ -1426,23 +1859,32 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   x->skip = 0;
 
+  if (cpi->sf.cb_pred_filter_search) {
+    const int bsl = mi_width_log2_lookup[bsize];
+    pred_filter_search = cm->interp_filter == SWITCHABLE
+                             ? (((mi_row + mi_col) >> bsl) +
+                                get_chessboard_index(cm->current_video_frame)) &
+                                   0x1
+                             : 0;
+  }
   // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign
   // filter_ref, we use a less strict condition on assigning filter_ref.
   // This is to reduce the probabily of entering the flow of not assigning
   // filter_ref and then skip filter search.
-  if (xd->above_mi && is_inter_block(xd->above_mi))
-    filter_ref = xd->above_mi->interp_filter;
-  else if (xd->left_mi && is_inter_block(xd->left_mi))
-    filter_ref = xd->left_mi->interp_filter;
-  else
-    filter_ref = cm->interp_filter;
+  filter_ref = cm->interp_filter;
+  if (cpi->sf.default_interp_filter != BILINEAR) {
+    if (xd->above_mi && is_inter_block(xd->above_mi))
+      filter_ref = xd->above_mi->interp_filter;
+    else if (xd->left_mi && is_inter_block(xd->left_mi))
+      filter_ref = xd->left_mi->interp_filter;
+  }
 
   // initialize mode decisions
   vp9_rd_cost_reset(&best_rdc);
   vp9_rd_cost_reset(rd_cost);
   mi->sb_type = bsize;
-  mi->ref_frame[0] = NONE;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[0] = NO_REF_FRAME;
+  mi->ref_frame[1] = NO_REF_FRAME;
 
   mi->tx_size =
       VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]);
@@ -1456,16 +1898,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       x->source_variance =
           vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+        cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && mi->segment_id > 0 &&
+        x->zero_temp_sad_source && x->source_variance == 0) {
+      mi->segment_id = 0;
+      vp9_init_plane_quantizers(cpi, x);
+    }
   }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      cpi->denoiser.denoising_level > kDenLowLow) {
-    vp9_denoiser_reset_frame_stats(ctx);
+  if (cpi->oxcf.noise_sensitivity > 0) {
+    if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi);
+    if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+      vp9_denoiser_reset_frame_stats(ctx);
   }
 #endif
 
-  if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) {
+  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref &&
+      !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) {
     usable_ref_frame = LAST_FRAME;
   } else {
     usable_ref_frame = GOLDEN_FRAME;
@@ -1479,19 +1930,32 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       skip_ref_find_pred[LAST_FRAME] = 1;
       skip_ref_find_pred[GOLDEN_FRAME] = 1;
     }
+    if (!cm->show_frame) {
+      if (cpi->rc.frames_since_key == 1) {
+        usable_ref_frame = LAST_FRAME;
+        skip_ref_find_pred[GOLDEN_FRAME] = 1;
+        skip_ref_find_pred[ALTREF_FRAME] = 1;
+      }
+    }
   }
 
   // For svc mode, on spatial_layer_id > 0: if the reference has different scale
   // constrain the inter mode to only test zero motion.
   if (cpi->use_svc && svc->force_zero_mode_spatial_ref &&
-      cpi->svc.spatial_layer_id > 0) {
-    if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) {
-      struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
-      if (vp9_is_scaled(sf)) svc_force_zero_mode[LAST_FRAME - 1] = 1;
+      svc->spatial_layer_id > 0 && !gf_temporal_ref) {
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+      struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+      if (vp9_is_scaled(ref_sf)) {
+        svc_force_zero_mode[LAST_FRAME - 1] = 1;
+        inter_layer_ref = LAST_FRAME;
+      }
     }
-    if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) {
-      struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
-      if (vp9_is_scaled(sf)) svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+      if (vp9_is_scaled(ref_sf)) {
+        svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+        inter_layer_ref = GOLDEN_FRAME;
+      }
     }
   }
 
@@ -1507,19 +1971,83 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     }
   }
 
-  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+  if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad ||
+                                 cpi->rc.avg_frame_low_motion < 60))
+    usable_ref_frame = LAST_FRAME;
+
+  if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
         !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
     use_golden_nonzeromv = 0;
 
+  if (cpi->oxcf.speed >= 8 && !cpi->use_svc &&
+      ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content ||
+       x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120))
+    usable_ref_frame = LAST_FRAME;
+
+  // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF.
+  if (cm->reference_mode == REFERENCE_MODE_SELECT &&
+      cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
+    comp_modes = 2;
+
+  // If the segment reference frame feature is enabled and it's set to GOLDEN
+  // reference, then make sure we don't skip checking GOLDEN, this is to
+  // prevent possibility of not picking any mode.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    usable_ref_frame = GOLDEN_FRAME;
+    skip_ref_find_pred[GOLDEN_FRAME] = 0;
+    thresh_svc_skip_golden = 0;
+  }
+
   for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+    // Skip find_predictor if the reference frame is not in the
+    // ref_frame_flags (i.e., not used as a reference for this frame).
+    skip_ref_find_pred[ref_frame] =
+        !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame));
     if (!skip_ref_find_pred[ref_frame]) {
       find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
-                      &ref_frame_skip_mask, flag_list, tile_data, mi_row,
-                      mi_col, yv12_mb, bsize, force_skip_low_temp_var);
+                      &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb,
+                      bsize, force_skip_low_temp_var, comp_modes > 0);
     }
   }
 
-  for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+  if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32)
+    x->sb_use_mv_part = 0;
+
+  // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used
+  // an averaging filter for downsampling (phase = 8). If so, we will test
+  // a nonzero motion mode on the spatial reference.
+  // The nonzero motion is half pixel shifted to left and top (-4, -4).
+  if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+      svc_force_zero_mode[inter_layer_ref - 1] &&
+      svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+      !gf_temporal_ref) {
+    svc_mv_col = -4;
+    svc_mv_row = -4;
+    flag_svc_subpel = 1;
+  }
+
+  // For SVC with quality layers, when QP of lower layer is lower
+  // than current layer: force check of GF-ZEROMV before early exit
+  // due to skip flag.
+  if (svc->spatial_layer_id > 0 && no_scaling &&
+      (cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
+      cm->base_qindex > svc->lower_layer_qindex + 10)
+    force_test_gf_zeromv = 1;
+
+  // For low motion content use x->sb_is_skin in addition to VeryHighSad
+  // for setting large_block.
+  large_block = (x->content_state_sb == kVeryHighSad ||
+                 (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) ||
+                 cpi->oxcf.speed < 7)
+                    ? bsize > BLOCK_32X32
+                    : bsize >= BLOCK_32X32;
+  use_model_yrd_large =
+      cpi->oxcf.rc_mode == VPX_CBR && large_block &&
+      !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+      cm->base_qindex;
+
+  for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
     int rate_mv = 0;
     int mode_rd_thresh;
     int mode_index;
@@ -1527,20 +2055,96 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     int64_t this_sse;
     int is_skippable;
     int this_early_term = 0;
-    PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+    int rd_computed = 0;
+    int flag_preduv_computed[2] = { 0 };
+    int inter_mv_mode = 0;
+    int skip_this_mv = 0;
+    int comp_pred = 0;
+    int force_mv_inter_layer = 0;
+    PREDICTION_MODE this_mode;
+    second_ref_frame = NO_REF_FRAME;
 
-    ref_frame = ref_mode_set[idx].ref_frame;
+    if (idx < num_inter_modes) {
+      this_mode = ref_mode_set[idx].pred_mode;
+      ref_frame = ref_mode_set[idx].ref_frame;
 
-    if (cpi->use_svc) {
-      this_mode = ref_mode_set_svc[idx].pred_mode;
-      ref_frame = ref_mode_set_svc[idx].ref_frame;
+      if (cpi->use_svc) {
+        this_mode = ref_mode_set_svc[idx].pred_mode;
+        ref_frame = ref_mode_set_svc[idx].ref_frame;
+      }
+    } else {
+      // Add (0,0) compound modes.
+      this_mode = ZEROMV;
+      ref_frame = LAST_FRAME;
+      if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME;
+      second_ref_frame = ALTREF_FRAME;
+      comp_pred = 1;
     }
+
     if (ref_frame > usable_ref_frame) continue;
     if (skip_ref_find_pred[ref_frame]) continue;
 
-    if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
-        this_mode != NEARESTMV) {
+    if (svc->previous_frame_is_intra_only) {
+      if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0)
+        continue;
+    }
+
+    // If the segment reference frame feature is enabled then do nothing if the
+    // current ref frame is not allowed.
+    if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+        get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
       continue;
+
+    if (flag_svc_subpel && ref_frame == inter_layer_ref) {
+      force_mv_inter_layer = 1;
+      // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
+      // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
+      if (this_mode == NEWMV) {
+        frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
+        frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
+      } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
+                 frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
+        continue;
+      }
+    }
+
+    if (comp_pred) {
+      if (!cpi->allow_comp_inter_inter) continue;
+      // Skip compound inter modes if ARF is not available.
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use as in this case there can only be one reference.
+      if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
+    }
+
+    // For CBR mode: skip the golden reference search if sse of zeromv_last is
+    // below threshold.
+    if (ref_frame == GOLDEN_FRAME && cpi->oxcf.rc_mode == VPX_CBR &&
+        ((cpi->use_svc && sse_zeromv_normalized < thresh_svc_skip_golden) ||
+         (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden)))
+      continue;
+
+    if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue;
+
+    // For screen content. If zero_temp_sad source is computed: skip
+    // non-zero motion check for stationary blocks. If the superblock is
+    // non-stationary then for flat blocks skip the zero last check (keep golden
+    // as it may be inter-layer reference). Otherwise (if zero_temp_sad_source
+    // is not computed) skip non-zero motion check for flat blocks.
+    // TODO(marpan): Compute zero_temp_sad_source per coding block.
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+      if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
+        if ((frame_mv[this_mode][ref_frame].as_int != 0 &&
+             x->zero_temp_sad_source) ||
+            (frame_mv[this_mode][ref_frame].as_int == 0 &&
+             x->source_variance == 0 && ref_frame == LAST_FRAME &&
+             !x->zero_temp_sad_source))
+          continue;
+      } else if (frame_mv[this_mode][ref_frame].as_int != 0 &&
+                 x->source_variance == 0) {
+        continue;
+      }
     }
 
     if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue;
@@ -1551,76 +2155,92 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
            frame_mv[this_mode][ref_frame].as_int != 0))
         continue;
 
-      if (cpi->rc.alt_ref_gf_group &&
+      if (!cm->show_frame && ref_frame == ALTREF_FRAME &&
+          frame_mv[this_mode][ref_frame].as_int != 0)
+        continue;
+
+      if (cpi->rc.alt_ref_gf_group && cm->show_frame &&
           cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) &&
           ref_frame == GOLDEN_FRAME &&
           frame_mv[this_mode][ref_frame].as_int != 0)
         continue;
 
-      if (cpi->rc.alt_ref_gf_group &&
+      if (cpi->rc.alt_ref_gf_group && cm->show_frame &&
+          cpi->rc.frames_since_golden > 0 &&
           cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) &&
           ref_frame == ALTREF_FRAME &&
           frame_mv[this_mode][ref_frame].as_int != 0)
         continue;
     }
 
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
-
     if (const_motion[ref_frame] && this_mode == NEARMV) continue;
 
     // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
     // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
     // later.
-    if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+    if (!force_mv_inter_layer && force_skip_low_temp_var &&
+        ref_frame == GOLDEN_FRAME &&
         frame_mv[this_mode][ref_frame].as_int != 0) {
       continue;
     }
 
-    if (cpi->sf.short_circuit_low_temp_var >= 2 && force_skip_low_temp_var &&
-        ref_frame == LAST_FRAME && this_mode == NEWMV) {
+    if (x->content_state_sb != kVeryHighSad &&
+        (cpi->sf.short_circuit_low_temp_var >= 2 ||
+         (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) &&
+        force_skip_low_temp_var && ref_frame == LAST_FRAME &&
+        this_mode == NEWMV) {
       continue;
     }
 
     if (cpi->use_svc) {
-      if (svc_force_zero_mode[ref_frame - 1] &&
+      if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] &&
           frame_mv[this_mode][ref_frame].as_int != 0)
         continue;
     }
 
-    if (sf->reference_masking &&
-        !(frame_mv[this_mode][ref_frame].as_int == 0 &&
-          ref_frame == LAST_FRAME)) {
-      if (usable_ref_frame < ALTREF_FRAME) {
-        if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
-          i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
-          if ((cpi->ref_frame_flags & flag_list[i]))
-            if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
-              ref_frame_skip_mask |= (1 << ref_frame);
+    // Disable this drop out case if the ref frame segment level feature is
+    // enabled for this segment. This is to prevent the possibility that we end
+    // up unable to pick any mode.
+    if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+      if (sf->reference_masking &&
+          !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+            ref_frame == LAST_FRAME)) {
+        if (usable_ref_frame < ALTREF_FRAME) {
+          if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
+            i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+            if ((cpi->ref_frame_flags & ref_frame_to_flag(i)))
+              if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+                ref_frame_skip_mask |= (1 << ref_frame);
+          }
+        } else if (!cpi->rc.is_src_frame_alt_ref &&
+                   !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+                     ref_frame == ALTREF_FRAME)) {
+          int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
+          int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
+          if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) &&
+               (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
+              ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) &&
+               (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
+            ref_frame_skip_mask |= (1 << ref_frame);
         }
-      } else if (!cpi->rc.is_src_frame_alt_ref &&
-                 !(frame_mv[this_mode][ref_frame].as_int == 0 &&
-                   ref_frame == ALTREF_FRAME)) {
-        int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
-        int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
-        if (((cpi->ref_frame_flags & flag_list[ref1]) &&
-             (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
-            ((cpi->ref_frame_flags & flag_list[ref2]) &&
-             (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
-          ref_frame_skip_mask |= (1 << ref_frame);
       }
+      if (ref_frame_skip_mask & (1 << ref_frame)) continue;
     }
-    if (ref_frame_skip_mask & (1 << ref_frame)) continue;
 
     // Select prediction reference frames.
-    for (i = 0; i < MAX_MB_PLANE; i++)
+    for (i = 0; i < MAX_MB_PLANE; i++) {
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+    }
 
     mi->ref_frame[0] = ref_frame;
-    set_ref_ptrs(cm, xd, ref_frame, NONE);
+    mi->ref_frame[1] = second_ref_frame;
+    set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
     mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
-    mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1
-                                         : rd_threshes[mode_index];
+    mode_rd_thresh = best_pickmode.best_mode_skip_txfm
+                         ? rd_threshes[mode_index] << 1
+                         : rd_threshes[mode_index];
 
     // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding
     // speed with little/no subjective quality loss.
@@ -1628,76 +2248,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
         cpi->rc.frames_since_golden > 4)
       mode_rd_thresh = mode_rd_thresh << 3;
 
-    if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                            rd_thresh_freq_fact[mode_index]))
-      continue;
+    if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+         rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+                                    &rd_thresh_freq_fact[mode_index])) ||
+        (!cpi->sf.adaptive_rd_thresh_row_mt &&
+         rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                             &rd_thresh_freq_fact[mode_index])))
+      if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
 
-    if (this_mode == NEWMV) {
-      if (ref_frame > LAST_FRAME && !cpi->use_svc &&
-          cpi->oxcf.rc_mode == VPX_CBR) {
-        int tmp_sad;
-        uint32_t dis;
-        int cost_list[5];
-
-        if (bsize < BLOCK_16X16) continue;
-
-        tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
-
-        if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue;
-        if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
-          continue;
-
-        frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
-        rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
-                                  &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
-                                  x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-        frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
-        frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
-
-        cpi->find_fractional_mv_step(
-            x, &frame_mv[NEWMV][ref_frame].as_mv,
-            &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
-            cpi->common.allow_high_precision_mv, x->errorperbit,
-            &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-            cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-            x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0,
-            0);
-      } else if (svc->use_base_mv && svc->spatial_layer_id) {
-        if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
-          const int pre_stride = xd->plane[0].pre[0].stride;
-          int base_mv_sad = INT_MAX;
-          const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f;
-          const uint8_t *const pre_buf =
-              xd->plane[0].pre[0].buf +
-              (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
-              (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
-          base_mv_sad = cpi->fn_ptr[bsize].sdf(
-              x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride);
-
-          if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) {
-            // Base layer mv is good.
-            if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                        &frame_mv[NEWMV][ref_frame], &rate_mv,
-                                        best_rdc.rdcost, 1)) {
-              continue;
-            }
-          } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                             &frame_mv[NEWMV][ref_frame],
-                                             &rate_mv, best_rdc.rdcost, 0)) {
-            continue;
-          }
-        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                           &frame_mv[NEWMV][ref_frame],
-                                           &rate_mv, best_rdc.rdcost, 0)) {
-          continue;
-        }
-      } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-                                         &frame_mv[NEWMV][ref_frame], &rate_mv,
-                                         best_rdc.rdcost, 0)) {
+    if (this_mode == NEWMV && !force_mv_inter_layer) {
+      if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
+                        mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar,
+                        &best_rdc))
         continue;
+    }
+
+    // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector
+    // causes some regression, leave it for duplicate zero-mv for now, until
+    // regression issue is resolved.
+    for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) {
+      if (inter_mv_mode == this_mode || comp_pred) continue;
+      if (mode_checked[inter_mv_mode][ref_frame] &&
+          frame_mv[this_mode][ref_frame].as_int ==
+              frame_mv[inter_mv_mode][ref_frame].as_int &&
+          frame_mv[inter_mv_mode][ref_frame].as_int == 0) {
+        skip_this_mv = 1;
+        break;
       }
     }
 
+    if (skip_this_mv) continue;
+
     // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no
     // need to compute best_pred_sad which is only used to skip golden NEWMV.
     if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME &&
@@ -1712,13 +2293,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
     }
 
-    if (this_mode != NEARESTMV &&
+    if (this_mode != NEARESTMV && !comp_pred &&
         frame_mv[this_mode][ref_frame].as_int ==
             frame_mv[NEARESTMV][ref_frame].as_int)
       continue;
 
     mi->mode = this_mode;
     mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+    mi->mv[1].as_int = 0;
 
     // Search for the best prediction filter type, when the resulting
     // motion vector is at sub-pixel accuracy level for luma component, i.e.,
@@ -1736,88 +2318,47 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
         pred_filter_search &&
         (ref_frame == LAST_FRAME ||
-         (ref_frame == GOLDEN_FRAME &&
+         (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer &&
           (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
         (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
-      int pf_rate[3];
-      int64_t pf_dist[3];
-      unsigned int pf_var[3];
-      unsigned int pf_sse[3];
-      TX_SIZE pf_tx_size[3];
-      int64_t best_cost = INT64_MAX;
-      INTERP_FILTER best_filter = SWITCHABLE, filter;
-      PRED_BUFFER *current_pred = this_mode_pred;
-
-      for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
-        int64_t cost;
-        mi->interp_filter = filter;
-        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
-                          &pf_var[filter], &pf_sse[filter]);
-        pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
-        cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
-        pf_tx_size[filter] = mi->tx_size;
-        if (cost < best_cost) {
-          best_filter = filter;
-          best_cost = cost;
-          skip_txfm = x->skip_txfm[0];
-
-          if (reuse_inter_pred) {
-            if (this_mode_pred != current_pred) {
-              free_pred_buffer(this_mode_pred);
-              this_mode_pred = current_pred;
-            }
-            current_pred = &tmp[get_pred_buffer(tmp, 3)];
-            pd->dst.buf = current_pred->data;
-            pd->dst.stride = bw;
-          }
-        }
-      }
-
-      if (reuse_inter_pred && this_mode_pred != current_pred)
-        free_pred_buffer(current_pred);
-
-      mi->interp_filter = best_filter;
-      mi->tx_size = pf_tx_size[best_filter];
-      this_rdc.rate = pf_rate[best_filter];
-      this_rdc.dist = pf_dist[best_filter];
-      var_y = pf_var[best_filter];
-      sse_y = pf_sse[best_filter];
-      x->skip_txfm[0] = skip_txfm;
-      if (reuse_inter_pred) {
-        pd->dst.buf = this_mode_pred->data;
-        pd->dst.stride = this_mode_pred->stride;
-      }
+      rd_computed = 1;
+      search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
+                        reuse_inter_pred, &this_mode_pred, &var_y, &sse_y,
+                        force_smooth_filter, &this_early_term,
+                        flag_preduv_computed, use_model_yrd_large);
     } else {
-// TODO(jackychen): the low-bitdepth condition causes a segfault in
-// high-bitdepth builds.
-// https://bugs.chromium.org/p/webm/issues/detail?id=1250
-#if CONFIG_VP9_HIGHBITDEPTH
-      const int large_block = bsize > BLOCK_32X32;
-#else
-      const int large_block =
-          x->sb_is_skin ? bsize > BLOCK_32X32 : bsize >= BLOCK_32X32;
-#endif
       mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+
+      if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
+          svc_force_zero_mode[ref_frame - 1])
+        mi->interp_filter = filter_gf_svc;
+
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
 
       // For large partition blocks, extra testing is done.
-      if (cpi->oxcf.rc_mode == VPX_CBR && large_block &&
-          !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
-          cm->base_qindex) {
+      if (use_model_yrd_large) {
+        rd_computed = 1;
         model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
                                 &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
-                                &this_early_term);
+                                &this_early_term, flag_preduv_computed);
       } else {
+        rd_computed = 1;
         model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                          &var_y, &sse_y);
+                          &var_y, &sse_y, 0);
       }
+      // Save normalized sse (between current and last frame) for (0, 0) motion.
+      if (ref_frame == LAST_FRAME &&
+          frame_mv[this_mode][ref_frame].as_int == 0) {
+        sse_zeromv_normalized =
+            sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+      }
+      if (sse_y < best_sse_sofar) best_sse_sofar = sse_y;
     }
 
     if (!this_early_term) {
       this_sse = (int64_t)sse_y;
       block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
-                VPXMIN(mi->tx_size, TX_16X16));
+                VPXMIN(mi->tx_size, TX_16X16), rd_computed, 0);
       x->skip_txfm[0] = is_skippable;
       if (is_skippable) {
         this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -1837,19 +2378,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
           this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
       }
     } else {
-      this_rdc.rate += cm->interp_filter == SWITCHABLE
-                           ? vp9_get_switchable_rate(cpi, xd)
-                           : 0;
+      if (cm->interp_filter == SWITCHABLE) {
+        if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07)
+          this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+      }
       this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
     }
 
-    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+    if (!this_early_term &&
+        (x->color_sensitivity[0] || x->color_sensitivity[1])) {
       RD_COST rdc_uv;
       const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
-      if (x->color_sensitivity[0])
+      if (x->color_sensitivity[0] && !flag_preduv_computed[0]) {
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
-      if (x->color_sensitivity[1])
+        flag_preduv_computed[0] = 1;
+      }
+      if (x->color_sensitivity[1] && !flag_preduv_computed[1]) {
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+        flag_preduv_computed[1] = 1;
+      }
       model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
       this_rdc.rate += rdc_uv.rate;
       this_rdc.dist += rdc_uv.dist;
@@ -1858,6 +2405,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     this_rdc.rate += rate_mv;
     this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
                                          [INTER_OFFSET(this_mode)];
+    // TODO(marpan): Add costing for compound mode.
     this_rdc.rate += ref_frame_cost[ref_frame];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
 
@@ -1868,15 +2416,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize,
                           frame_mv[this_mode][ref_frame].as_mv.row,
                           frame_mv[this_mode][ref_frame].as_mv.col,
-                          ref_frame == LAST_FRAME);
+                          ref_frame == LAST_FRAME, x->lowvar_highsumdiff,
+                          x->sb_is_skin);
     }
 
     // Skipping checking: test to see if this block can be reconstructed by
     // prediction only.
-    if (cpi->allow_encode_breakout) {
+    if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected &&
+        !svc->high_num_blocks_with_motion) {
       encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
                            var_y, sse_y, yv12_mb, &this_rdc.rate,
-                           &this_rdc.dist);
+                           &this_rdc.dist, flag_preduv_computed);
       if (x->skip) {
         this_rdc.rate += rate_mv;
         this_rdc.rdcost =
@@ -1884,8 +2434,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       }
     }
 
+    // On spatially flat blocks for screne content: bias against zero-last
+    // if the sse_y is non-zero. Only on scene change or high motion frames.
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+        (scene_change_detected || svc->high_num_blocks_with_motion) &&
+        ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0 &&
+        svc->spatial_layer_id == 0 && x->source_variance == 0 && sse_y > 0) {
+      this_rdc.rdcost = this_rdc.rdcost << 2;
+    }
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0 &&
+    if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
         cpi->denoiser.denoising_level > kDenLowLow) {
       vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
       // Keep track of zero_last cost.
@@ -1896,88 +2455,110 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
     (void)ctx;
 #endif
 
+    mode_checked[this_mode][ref_frame] = 1;
+
     if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
       best_rdc = this_rdc;
-      best_mode = this_mode;
-      best_pred_filter = mi->interp_filter;
-      best_tx_size = mi->tx_size;
-      best_ref_frame = ref_frame;
-      best_mode_skip_txfm = x->skip_txfm[0];
       best_early_term = this_early_term;
+      best_pickmode.best_mode = this_mode;
+      best_pickmode.best_pred_filter = mi->interp_filter;
+      best_pickmode.best_tx_size = mi->tx_size;
+      best_pickmode.best_ref_frame = ref_frame;
+      best_pickmode.best_mode_skip_txfm = x->skip_txfm[0];
+      best_pickmode.best_second_ref_frame = second_ref_frame;
 
       if (reuse_inter_pred) {
-        free_pred_buffer(best_pred);
-        best_pred = this_mode_pred;
+        free_pred_buffer(best_pickmode.best_pred);
+        best_pickmode.best_pred = this_mode_pred;
       }
     } else {
       if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
     }
 
-    if (x->skip) break;
+    if (x->skip &&
+        (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME]))
+      break;
 
     // If early termination flag is 1 and at least 2 modes are checked,
     // the mode search is terminated.
-    if (best_early_term && idx > 0) {
+    if (best_early_term && idx > 0 && !scene_change_detected &&
+        (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) {
       x->skip = 1;
       break;
     }
   }
 
-  mi->mode = best_mode;
-  mi->interp_filter = best_pred_filter;
-  mi->tx_size = best_tx_size;
-  mi->ref_frame[0] = best_ref_frame;
-  mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  mi->mode = best_pickmode.best_mode;
+  mi->interp_filter = best_pickmode.best_pred_filter;
+  mi->tx_size = best_pickmode.best_tx_size;
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->mv[0].as_int =
+      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
   xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
-  x->skip_txfm[0] = best_mode_skip_txfm;
+  x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
 
   // For spatial enhancemanent layer: perform intra prediction only if base
   // layer is chosen as the reference. Always perform intra prediction if
-  // LAST is the only reference or is_key_frame is set.
-  if (cpi->svc.spatial_layer_id) {
+  // LAST is the only reference, or is_key_frame is set, or on base
+  // temporal layer.
+  if (svc->spatial_layer_id && !gf_temporal_ref) {
     perform_intra_pred =
-        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
-        !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
-        (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
-         svc_force_zero_mode[best_ref_frame - 1]);
+        svc->temporal_layer_id == 0 ||
+        svc->layer_context[svc->temporal_layer_id].is_key_frame ||
+        !(cpi->ref_frame_flags & VP9_GOLD_FLAG) ||
+        (!svc->layer_context[svc->temporal_layer_id].is_key_frame &&
+         svc_force_zero_mode[best_pickmode.best_ref_frame - 1]);
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
   }
-  if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
-      cpi->rc.is_src_frame_alt_ref)
+  if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+       cpi->rc.is_src_frame_alt_ref) ||
+      svc->previous_frame_is_intra_only)
     perform_intra_pred = 0;
+
+  // If the segment reference frame feature is enabled and set then
+  // skip the intra prediction.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0)
+    perform_intra_pred = 0;
+
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred &&
-      (best_rdc.rdcost == INT64_MAX ||
-       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
-        bsize <= cpi->sf.max_intra_bsize))) {
+  if (best_rdc.rdcost == INT64_MAX ||
+      (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0) ||
+      (scene_change_detected && perform_intra_pred) ||
+      ((!force_skip_low_temp_var || bsize < BLOCK_32X32 ||
+        x->content_state_sb == kVeryHighSad) &&
+       perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad &&
+       !x->lowvar_highsumdiff)) {
     struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+    int64_t this_sse = INT64_MAX;
     int i;
-    TX_SIZE best_intra_tx_size = TX_SIZES;
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     TX_SIZE intra_tx_size =
         VPXMIN(max_txsize_lookup[bsize],
                tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
-    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16)
-      intra_tx_size = TX_16X16;
 
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
         this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth)
-          vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
-                                   this_mode_pred->data, this_mode_pred->stride,
-                                   NULL, 0, NULL, 0, bw, bh, xd->bd);
+          vpx_highbd_convolve_copy(
+              CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
+              CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,
+              NULL, 0, 0, 0, 0, bw, bh, xd->bd);
         else
           vpx_convolve_copy(best_pred->data, best_pred->stride,
                             this_mode_pred->data, this_mode_pred->stride, NULL,
-                            0, NULL, 0, bw, bh);
+                            0, 0, 0, 0, bw, bh);
 #else
         vpx_convolve_copy(best_pred->data, best_pred->stride,
                           this_mode_pred->data, this_mode_pred->stride, NULL, 0,
-                          NULL, 0, bw, bh);
+                          0, 0, 0, bw, bh);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        best_pred = this_mode_pred;
+        best_pickmode.best_pred = this_mode_pred;
       }
     }
     pd->dst = orig_dst;
@@ -1986,18 +2567,35 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       const PREDICTION_MODE this_mode = intra_mode_list[i];
       THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
       int mode_rd_thresh = rd_threshes[mode_index];
+      // For spatially flat blocks, under short_circuit_flat_blocks flag:
+      // only check DC mode for stationary blocks, otherwise also check
+      // H and V mode.
       if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
-          this_mode != DC_PRED) {
+          ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) {
         continue;
       }
 
       if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
         continue;
 
-      if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
-                              rd_thresh_freq_fact[mode_index]))
+      if (cpi->sf.rt_intra_dc_only_low_content && this_mode != DC_PRED &&
+          x->content_state_sb != kVeryHighSad)
         continue;
 
+      if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+           rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+                                      &rd_thresh_freq_fact[mode_index])) ||
+          (!cpi->sf.adaptive_rd_thresh_row_mt &&
+           rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+                               &rd_thresh_freq_fact[mode_index]))) {
+        // Avoid this early exit for screen on base layer, for scene
+        // changes or high motion frames.
+        if (cpi->oxcf.content != VP9E_CONTENT_SCREEN ||
+            svc->spatial_layer_id > 0 ||
+            (!scene_change_detected && !svc->high_num_blocks_with_motion))
+          continue;
+      }
+
       mi->mode = this_mode;
       mi->ref_frame[0] = INTRA_FRAME;
       this_rdc.dist = this_rdc.rate = 0;
@@ -2005,8 +2603,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       args.skippable = 1;
       args.rdc = &this_rdc;
       mi->tx_size = intra_tx_size;
-      vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
-                                             &args);
+
+      compute_intra_yprediction(this_mode, bsize, x, xd);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+                        &var_y, &sse_y, 1);
+      block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize,
+                VPXMIN(mi->tx_size, TX_16X16), 1, 1);
+
       // Check skip cost here since skippable is not set for for uv, this
       // mirrors the behavior used by inter
       if (args.skippable) {
@@ -2033,68 +2636,86 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = this_rdc;
-        best_mode = this_mode;
-        best_intra_tx_size = mi->tx_size;
-        best_ref_frame = INTRA_FRAME;
+        best_pickmode.best_mode = this_mode;
+        best_pickmode.best_intra_tx_size = mi->tx_size;
+        best_pickmode.best_ref_frame = INTRA_FRAME;
+        best_pickmode.best_second_ref_frame = NO_REF_FRAME;
         mi->uv_mode = this_mode;
         mi->mv[0].as_int = INVALID_MV;
-        best_mode_skip_txfm = x->skip_txfm[0];
+        mi->mv[1].as_int = INVALID_MV;
+        best_pickmode.best_mode_skip_txfm = x->skip_txfm[0];
       }
     }
 
     // Reset mb_mode_info to the best inter mode.
-    if (best_ref_frame != INTRA_FRAME) {
-      mi->tx_size = best_tx_size;
+    if (best_pickmode.best_ref_frame != INTRA_FRAME) {
+      mi->tx_size = best_pickmode.best_tx_size;
     } else {
-      mi->tx_size = best_intra_tx_size;
+      mi->tx_size = best_pickmode.best_intra_tx_size;
     }
   }
 
   pd->dst = orig_dst;
-  mi->mode = best_mode;
-  mi->ref_frame[0] = best_ref_frame;
-  x->skip_txfm[0] = best_mode_skip_txfm;
+  mi->mode = best_pickmode.best_mode;
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm;
 
   if (!is_inter_block(mi)) {
     mi->interp_filter = SWITCHABLE_FILTERS;
   }
 
-  if (reuse_inter_pred && best_pred != NULL) {
+  if (reuse_inter_pred && best_pickmode.best_pred != NULL) {
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
     if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cm->use_highbitdepth)
-        vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
-                                 pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0,
-                                 bw, bh, xd->bd);
+        vpx_highbd_convolve_copy(
+            CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
+            CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0,
+            bw, bh, xd->bd);
       else
         vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
-                          pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
+                          pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
 #else
       vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
-                        pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
+                        pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
   }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0 && cpi->resize_pending == 0 &&
-      cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) {
+      denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+      cpi->denoiser.reset == 0) {
     VP9_DENOISER_DECISION decision = COPY_BLOCK;
+    ctx->sb_skip_denoising = 0;
+    // TODO(marpan): There is an issue with denoising when the
+    // superblock partitioning scheme is based on the pickmode.
+    // Remove this condition when the issue is resolved.
+    if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1;
     vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost,
-                                frame_mv, reuse_inter_pred, best_tx_size,
-                                best_mode, best_ref_frame, best_pred_filter,
-                                best_mode_skip_txfm);
-    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision);
-    recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb,
-                                   &best_rdc, bsize, mi_row, mi_col);
-    best_ref_frame = ctx_den.best_ref_frame;
+                                frame_mv, reuse_inter_pred, &best_pickmode);
+    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+                         gf_temporal_ref);
+    if (denoise_recheck_zeromv)
+      recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den,
+                                     yv12_mb, &best_rdc, bsize, mi_row, mi_col);
+    best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
   }
 #endif
 
-  if (cpi->sf.adaptive_rd_thresh) {
-    THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)];
+  if (best_pickmode.best_ref_frame == ALTREF_FRAME ||
+      best_pickmode.best_second_ref_frame == ALTREF_FRAME)
+    x->arf_frame_usage++;
+  else if (best_pickmode.best_ref_frame != INTRA_FRAME)
+    x->lastgolden_frame_usage++;
 
-    if (best_ref_frame == INTRA_FRAME) {
+  if (cpi->sf.adaptive_rd_thresh) {
+    THR_MODES best_mode_idx =
+        mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
+
+    if (best_pickmode.best_ref_frame == INTRA_FRAME) {
       // Only consider the modes that are included in the intra_mode_list.
       int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
       int i;
@@ -2102,16 +2723,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       // TODO(yunqingwang): Check intra mode mask and only update freq_fact
       // for those valid modes.
       for (i = 0; i < intra_modes; i++) {
-        update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
-                                INTRA_FRAME, best_mode_idx, intra_mode_list[i]);
+        if (cpi->sf.adaptive_rd_thresh_row_mt)
+          update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+                                         thresh_freq_fact_idx, INTRA_FRAME,
+                                         best_mode_idx, intra_mode_list[i]);
+        else
+          update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+                                  INTRA_FRAME, best_mode_idx,
+                                  intra_mode_list[i]);
       }
     } else {
       for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
         PREDICTION_MODE this_mode;
-        if (best_ref_frame != ref_frame) continue;
+        if (best_pickmode.best_ref_frame != ref_frame) continue;
         for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-          update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
-                                  ref_frame, best_mode_idx, this_mode);
+          if (cpi->sf.adaptive_rd_thresh_row_mt)
+            update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+                                           thresh_freq_fact_idx, ref_frame,
+                                           best_mode_idx, this_mode);
+          else
+            update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+                                    ref_frame, best_mode_idx, this_mode);
         }
       }
     }
@@ -2129,12 +2761,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
-  MV_REFERENCE_FRAME best_ref_frame = NONE;
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME;
+  MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME;
   unsigned char segment_id = mi->segment_id;
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   b_mode_info bsi[MAX_REF_FRAMES][4];
   int ref_frame_skip_mask = 0;
@@ -2150,11 +2780,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
     int_mv dummy_mv[2];
     x->pred_mv_sad[ref_frame] = INT_MAX;
 
-    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+    if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
+        (yv12 != NULL)) {
       int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
-      const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
-      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf,
-                           sf);
+      const struct scale_factors *const ref_sf =
+          &cm->frame_refs[ref_frame - 1].sf;
+      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf,
+                           ref_sf);
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
                        mbmi_ext->mode_context);
 
@@ -2169,7 +2801,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
   mi->tx_size = TX_4X4;
   mi->uv_mode = DC_PRED;
   mi->ref_frame[0] = LAST_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->interp_filter =
       cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
 
@@ -2258,12 +2890,12 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
             }
 
             vp9_set_mv_search_range(&x->mv_limits,
-                                    &mbmi_ext->ref_mvs[0]->as_mv);
+                                    &mbmi_ext->ref_mvs[ref_frame][0].as_mv);
 
-            vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
-                                  x->sadperbit4, cond_cost_list(cpi, cost_list),
-                                  &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
-                                  &tmp_mv, INT_MAX, 0);
+            vp9_full_pixel_search(
+                cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method,
+                x->sadperbit4, cond_cost_list(cpi, cost_list),
+                &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0);
 
             x->mv_limits = tmp_mv_limits;
 
@@ -2283,9 +2915,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
                 x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
                 cpi->common.allow_high_precision_mv, x->errorperbit,
                 &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-                cpi->sf.mv.subpel_iters_per_step,
-                cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
-                &dummy_dist, &x->pred_sse[ref_frame], NULL, 0, 0);
+                cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+                x->nmvjointcost, x->mvcost, &dummy_dist,
+                &x->pred_sse[ref_frame], NULL, 0, 0,
+                cpi->sf.use_accurate_subpel_search);
 
             xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
           } else {
@@ -2296,7 +2929,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
 #if CONFIG_VP9_HIGHBITDEPTH
           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             vp9_highbd_build_inter_predictor(
-                pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride,
+                CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride,
+                CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride,
                 &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf,
                 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0,
                 vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3,
@@ -2317,7 +2951,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
 #endif
 
           model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
-                            &var_y, &sse_y);
+                            &var_y, &sse_y, 0);
 
           this_rdc.rate += b_rate;
           this_rdc.rdcost =
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h
index 9aa00c4fab..15207e6cf4 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_PICKMODE_H_
-#define VP9_ENCODER_VP9_PICKMODE_H_
+#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_
+#define VPX_VP9_ENCODER_VP9_PICKMODE_H_
 
 #include "vp9/encoder/vp9_encoder.h"
 
@@ -32,4 +32,4 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_PICKMODE_H_
+#endif  // VPX_VP9_ENCODER_VP9_PICKMODE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
index de96c6e068..a1e0b4439e 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
@@ -8,12 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <math.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_encoder.h"
@@ -21,77 +24,64 @@
 #include "vp9/encoder/vp9_rd.h"
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       int skip_block, const int16_t *zbin_ptr,
-                       const int16_t *round_ptr, const int16_t *quant_ptr,
-                       const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                       tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                       uint16_t *eob_ptr, const int16_t *scan,
-                       const int16_t *iscan) {
+                       const struct macroblock_plane *const mb_plane,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const struct ScanOrder *const scan_order) {
   int i, eob = -1;
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = (tmp * quant_ptr[rc != 0]) >> 16;
 
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
 
-      if (tmp) eob = i;
-    }
+    if (tmp) eob = i;
   }
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
-                              int skip_block, const int16_t *zbin_ptr,
-                              const int16_t *round_ptr,
-                              const int16_t *quant_ptr,
-                              const int16_t *quant_shift_ptr,
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const struct macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan) {
+                              const struct ScanOrder *const scan_order) {
   int i;
   int eob = -1;
-  // TODO(jingning) Decide the need of these arguments after the
-  // quantization process is completed.
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp = abs_coeff + round_ptr[rc != 0];
-      const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-      if (abs_qcoeff) eob = i;
-    }
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+    const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
+    qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+    if (abs_qcoeff) eob = i;
   }
   *eob_ptr = eob + 1;
 }
@@ -100,108 +90,77 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
+                             const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
+                             const struct ScanOrder *const scan_order) {
   int i, eob = -1;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      int tmp = 0;
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    int tmp = 0;
+    int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
-        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      }
-
-      if (tmp) eob = i;
+    if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
     }
+
+    if (tmp) eob = i;
   }
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_quantize_fp_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const struct ScanOrder *const scan_order) {
   int i, eob = -1;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)iscan;
+  const int16_t *round_ptr = mb_plane->round_fp;
+  const int16_t *quant_ptr = mb_plane->quant_fp;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    for (i = 0; i < n_coeffs; i++) {
-      uint32_t abs_qcoeff = 0;
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  for (i = 0; i < n_coeffs; i++) {
+    int abs_qcoeff = 0;
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-        const int64_t tmp =
-            abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-        abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15);
-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      }
-
-      if (abs_qcoeff) eob = i;
+    if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+      const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
     }
+
+    if (abs_qcoeff) eob = i;
   }
   *eob_ptr = eob + 1;
 }
 #endif
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block,
-                          p->zbin, p->round, p->quant, p->quant_shift,
-                          BLOCK_OFFSET(p->qcoeff, block),
-                          BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant,
-                          &p->eobs[block], scan, iscan);
-    return;
-  }
-#endif
-  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin,
-                 p->round, p->quant, p->quant_shift,
-                 BLOCK_OFFSET(p->qcoeff, block),
-                 BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block],
-                 scan, iscan);
-}
-
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
-  unsigned t;
+  unsigned int t;
   int l, m;
-  t = d;
-  for (l = 0; t > 1; l++) t >>= 1;
+  t = (unsigned int)d;
+  l = get_msb(t);
   m = 1 + (1 << (16 + l)) / d;
   *quant = (int16_t)(m - (1 << 16));
   *shift = 1 << (16 - l);
@@ -213,10 +172,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
   switch (bit_depth) {
     case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
     case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
-    case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
     default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-      return -1;
+      assert(bit_depth == VPX_BITS_12);
+      return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
   }
 #else
   (void)bit_depth;
@@ -230,13 +188,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
   int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
-    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
-    const int qrounding_factor = q == 0 ? 64 : 48;
+    int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
+    int qrounding_factor = q == 0 ? 64 : 48;
+    const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7;
+
+    if (cpi->oxcf.sharpness > 0 && q > 0) {
+      qzbin_factor = 64 + sharpness_adjustment;
+      qrounding_factor = 64 - sharpness_adjustment;
+    }
 
     for (i = 0; i < 2; ++i) {
       int qrounding_factor_fp = i == 0 ? 48 : 42;
       if (q == 0) qrounding_factor_fp = 64;
-
+      if (cpi->oxcf.sharpness > 0)
+        qrounding_factor_fp = 64 - sharpness_adjustment;
       // y
       quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
                      : vp9_ac_quant(q, 0, cm->bit_depth);
@@ -296,7 +261,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
   xd->plane[0].dequant = cpi->y_dequant[qindex];
-
   x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
   x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
 
@@ -309,7 +273,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
     xd->plane[i].dequant = cpi->uv_dequant[qindex];
-
     x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
     x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
   }
@@ -326,13 +289,25 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) {
   vp9_init_plane_quantizers(cpi, &cpi->td.mb);
 }
 
-void vp9_set_quantizer(VP9_COMMON *cm, int q) {
+void vp9_set_quantizer(VP9_COMP *cpi, int q, int ext_rc_delta_q_uv) {
+  VP9_COMMON *cm = &cpi->common;
   // quantizer has to be reinitialized with vp9_init_quantizer() if any
   // delta_q changes.
   cm->base_qindex = q;
   cm->y_dc_delta_q = 0;
   cm->uv_dc_delta_q = 0;
   cm->uv_ac_delta_q = 0;
+
+  if (ext_rc_delta_q_uv != 0) {
+    cm->uv_dc_delta_q = cm->uv_ac_delta_q = ext_rc_delta_q_uv;
+    vp9_init_quantizer(cpi);
+    return;
+  }
+
+  if (cpi->oxcf.delta_q_uv != 0) {
+    cm->uv_dc_delta_q = cm->uv_ac_delta_q = cpi->oxcf.delta_q_uv;
+    vp9_init_quantizer(cpi);
+  }
 }
 
 // Table that converts 0-63 Q-range values passed in outside to the Qindex
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h
index 61320361b6..bca2e055a2 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_QUANTIZE_H_
-#define VP9_ENCODER_VP9_QUANTIZE_H_
+#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_
+#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_
 
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_block.h"
@@ -37,9 +37,6 @@ typedef struct {
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan);
-
 struct VP9_COMP;
 struct VP9Common;
 
@@ -49,7 +46,7 @@ void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
 
-void vp9_set_quantizer(struct VP9Common *cm, int q);
+void vp9_set_quantizer(struct VP9_COMP *cpi, int q, int ext_rc_delta_q_uv);
 
 int vp9_quantizer_to_qindex(int quantizer);
 
@@ -59,4 +56,4 @@ int vp9_qindex_to_quantizer(int qindex);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_QUANTIZE_H_
+#endif  // VPX_VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
index 1eb8b50f01..529cbfab0c 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -11,56 +11,61 @@
 #include <assert.h>
 #include <limits.h>
 #include <math.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
 
 #include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ext_ratectrl.h"
+#include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
 
-// Max rate target for 1080P and below encodes under normal circumstances
-// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+// Max rate per frame for 1080P and below encodes if no level requirement given.
+// For larger formats limit to MAX_MB_RATE bits per MB
+// 4Mbits is derived from the level requirement for level 4 (1080P 30) which
+// requires that HW can sustain a rate of 16Mbits over a 4 frame group.
+// If a lower level requirement is specified then this may over ride this value.
 #define MAX_MB_RATE 250
-#define MAXRATE_1080P 2025000
-
-#define DEFAULT_KF_BOOST 2000
-#define DEFAULT_GF_BOOST 2000
+#define MAXRATE_1080P 4000000
 
 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
-#define FRAME_OVERHEAD_BITS 200
-
-// Use this macro to turn on/off use of alt-refs in one-pass vbr mode.
-#define USE_ALTREF_FOR_ONE_PASS 0
-
 #if CONFIG_VP9_HIGHBITDEPTH
-#define ASSIGN_MINQ_TABLE(bit_depth, name)                   \
-  do {                                                       \
-    switch (bit_depth) {                                     \
-      case VPX_BITS_8: name = name##_8; break;               \
-      case VPX_BITS_10: name = name##_10; break;             \
-      case VPX_BITS_12: name = name##_12; break;             \
-      default:                                               \
-        assert(0 &&                                          \
-               "bit_depth should be VPX_BITS_8, VPX_BITS_10" \
-               " or VPX_BITS_12");                           \
-        name = NULL;                                         \
-    }                                                        \
+#define ASSIGN_MINQ_TABLE(bit_depth, name)       \
+  do {                                           \
+    switch (bit_depth) {                         \
+      case VPX_BITS_8: name = name##_8; break;   \
+      case VPX_BITS_10: name = name##_10; break; \
+      default:                                   \
+        assert(bit_depth == VPX_BITS_12);        \
+        name = name##_12;                        \
+        break;                                   \
+    }                                            \
   } while (0)
 #else
 #define ASSIGN_MINQ_TABLE(bit_depth, name) \
@@ -93,10 +98,17 @@ static int inter_minq_12[QINDEX_RANGE];
 static int rtc_minq_12[QINDEX_RANGE];
 #endif
 
+#ifdef AGGRESSIVE_VBR
+static int gf_high = 2400;
+static int gf_low = 400;
+static int kf_high = 4000;
+static int kf_low = 400;
+#else
 static int gf_high = 2000;
 static int gf_low = 400;
-static int kf_high = 5000;
-static int kf_low = 400;
+static int kf_high = 4800;
+static int kf_low = 300;
+#endif
 
 // Functions to compute the active minq lookup table entries based on a
 // formulaic approach to facilitate easier adjustment of the Q tables.
@@ -125,10 +137,15 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
   for (i = 0; i < QINDEX_RANGE; i++) {
     const double maxq = vp9_convert_qindex_to_q(i, bit_depth);
     kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
-    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+    kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+#ifdef AGGRESSIVE_VBR
+    arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth);
+    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth);
+#else
     arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
-    arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
     inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+#endif
+    arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
     rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
   }
 }
@@ -156,16 +173,26 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
   switch (bit_depth) {
     case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
     case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0;
-    case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
     default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-      return -1.0;
+      assert(bit_depth == VPX_BITS_12);
+      return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
   }
 #else
   return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
 #endif
 }
 
+int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth) {
+  int i;
+
+  for (i = 0; i < QINDEX_RANGE; ++i)
+    if (vp9_convert_qindex_to_q(i, bit_depth) >= q_val) break;
+
+  if (i == QINDEX_RANGE) i--;
+
+  return i;
+}
+
 int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
                        double correction_factor, vpx_bit_depth_t bit_depth) {
   const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
@@ -185,12 +212,13 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
   const int bpm =
       (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
   return VPXMAX(FRAME_OVERHEAD_BITS,
-                (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+                (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
 }
 
 int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
+
   const int min_frame_target =
       VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
   if (target < min_frame_target) target = min_frame_target;
@@ -201,12 +229,15 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
     // number of bits will be spent if needed for constructed ARFs.
     target = min_frame_target;
   }
+
   // Clip the frame target to the maximum allowed value.
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+
   if (oxcf->rc_max_inter_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
-    target = VPXMIN(target, max_rate);
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    // target is of type int and VPXMIN cannot evaluate to larger than target
+    target = (int)VPXMIN(target, max_rate);
   }
   return target;
 }
@@ -215,28 +246,78 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
   const RATE_CONTROL *rc = &cpi->rc;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   if (oxcf->rc_max_intra_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
-    target = VPXMIN(target, max_rate);
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+    target = (int)VPXMIN(target, max_rate);
   }
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
   return target;
 }
 
+// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same
+// way for CBR mode, for the buffering updates below. Look into removing one
+// of these (i.e., bits_off_target).
+// Update the buffer level before encoding with the per-frame-bandwidth,
+void vp9_update_buffer_level_preencode(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->bits_off_target += rc->avg_frame_bandwidth;
+  // Clip the buffer level to the maximum specified buffer size.
+  rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+  rc->buffer_level = rc->bits_off_target;
+}
+
+// Update the buffer level before encoding with the per-frame-bandwidth
+// for SVC. The current and all upper temporal layers are updated, needed
+// for the layered rate control which involves cumulative buffer levels for
+// the temporal layers. Allow for using the timestamp(pts) delta for the
+// framerate when the set_ref_frame_config is used.
+void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) {
+  SVC *const svc = &cpi->svc;
+  int i;
+  // Set this to 1 to use timestamp delta for "framerate" under
+  // ref_frame_config usage.
+  int use_timestamp = 1;
+  const int64_t ts_delta =
+      svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id];
+  for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) {
+    const int layer =
+        LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+    LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+    RATE_CONTROL *const lrc = &lc->rc;
+    if (use_timestamp && cpi->svc.use_set_ref_frame_config &&
+        svc->number_temporal_layers == 1 && ts_delta > 0 &&
+        svc->current_superframe > 0) {
+      // TODO(marpan): This may need to be modified for temporal layers.
+      const double framerate_pts = 10000000.0 / ts_delta;
+      lrc->bits_off_target += saturate_cast_double_to_int(
+          round(lc->target_bandwidth / framerate_pts));
+    } else {
+      lrc->bits_off_target += saturate_cast_double_to_int(
+          round(lc->target_bandwidth / lc->framerate));
+    }
+    // Clip buffer level to maximum buffer size for the layer.
+    lrc->bits_off_target =
+        VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+    lrc->buffer_level = lrc->bits_off_target;
+    if (i == svc->temporal_layer_id) {
+      cpi->rc.bits_off_target = lrc->bits_off_target;
+      cpi->rc.buffer_level = lrc->buffer_level;
+    }
+  }
+}
+
 // Update the buffer level for higher temporal layers, given the encoded current
 // temporal layer.
-static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
+static void update_layer_buffer_level_postencode(SVC *svc,
+                                                 int encoded_frame_size) {
   int i = 0;
-  int current_temporal_layer = svc->temporal_layer_id;
+  const int current_temporal_layer = svc->temporal_layer_id;
   for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) {
     const int layer =
         LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
     LAYER_CONTEXT *lc = &svc->layer_context[layer];
     RATE_CONTROL *lrc = &lc->rc;
-    int bits_off_for_this_layer =
-        (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size);
-    lrc->bits_off_target += bits_off_for_this_layer;
-
+    lrc->bits_off_target -= encoded_frame_size;
     // Clip buffer level to maximum buffer size for the layer.
     lrc->bits_off_target =
         VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
@@ -244,21 +325,13 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
   }
 }
 
-// Update the buffer level: leaky bucket model.
-static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
-  const VP9_COMMON *const cm = &cpi->common;
+// Update the buffer level after encoding with encoded frame size.
+static void update_buffer_level_postencode(VP9_COMP *cpi,
+                                           int encoded_frame_size) {
   RATE_CONTROL *const rc = &cpi->rc;
-
-  // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame) {
-    rc->bits_off_target -= encoded_frame_size;
-  } else {
-    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
-  }
-
+  rc->bits_off_target -= encoded_frame_size;
   // Clip the buffer level to the maximum specified buffer size.
   rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
-
   // For screen-content mode, and if frame-dropper is off, don't let buffer
   // level go below threshold, given here as -rc->maximum_ buffer_size.
   if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
@@ -267,8 +340,8 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
 
   rc->buffer_level = rc->bits_off_target;
 
-  if (is_one_pass_cbr_svc(cpi)) {
-    update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+  if (is_one_pass_svc(cpi)) {
+    update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size);
   }
 }
 
@@ -278,13 +351,13 @@ int vp9_rc_get_default_min_gf_interval(int width, int height,
   static const double factor_safe = 3840 * 2160 * 20.0;
   const double factor = width * height * framerate;
   const int default_interval =
-      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+      clamp((int)round(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
 
   if (factor <= factor_safe)
     return default_interval;
   else
     return VPXMAX(default_interval,
-                  (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+                  (int)round(MIN_GF_INTERVAL * factor / factor_safe));
   // Note this logic makes:
   // 4K24: 5
   // 4K30: 6
@@ -292,7 +365,7 @@ int vp9_rc_get_default_min_gf_interval(int width, int height,
 }
 
 int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
-  int interval = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  int interval = VPXMIN(MAX_GF_INTERVAL, (int)round(framerate * 0.75));
   interval += (interval & 0x01);  // Round to even value
   return VPXMAX(interval, min_gf_interval);
 }
@@ -329,12 +402,18 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->af_ratio_onepass_vbr = 10;
   rc->prev_avg_source_sad_lag = 0;
   rc->high_source_sad = 0;
+  rc->reset_high_source_sad = 0;
   rc->high_source_sad_lagindex = -1;
+  rc->high_num_blocks_with_motion = 0;
+  rc->hybrid_intra_scene_change = 0;
+  rc->re_encode_maxq_scene_change = 0;
   rc->alt_ref_gf_group = 0;
+  rc->last_frame_is_src_altref = 0;
   rc->fac_active_worst_inter = 150;
   rc->fac_active_worst_gf = 100;
   rc->force_qpmin = 0;
   for (i = 0; i < MAX_LAG_BUFFERS; ++i) rc->avg_source_sad[i] = 0;
+  rc->frames_to_key = 0;
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
   rc->next_key_frame_forced = 0;
@@ -342,6 +421,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->source_alt_ref_active = 0;
 
   rc->frames_till_gf_update_due = 0;
+  rc->constrain_gf_key_freq_onepass_vbr = 1;
   rc->ni_av_qi = oxcf->worst_allowed_q;
   rc->ni_tot_qi = 0;
   rc->ni_frames = 0;
@@ -351,6 +431,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
 
   for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
     rc->rate_correction_factors[i] = 1.0;
+    rc->damped_adjustment[i] = 0;
   }
 
   rc->min_gf_interval = oxcf->min_gf_interval;
@@ -362,27 +443,121 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
     rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
         oxcf->init_framerate, rc->min_gf_interval);
   rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+  }
+
+  rc->force_max_q = 0;
+  rc->last_post_encode_dropped_scene_change = 0;
+  rc->use_post_encode_drop = 0;
+  rc->ext_use_post_encode_drop = 0;
+  rc->disable_overshoot_maxq_cbr = 0;
+  rc->arf_active_best_quality_adjustment_factor = 1.0;
+  rc->arf_increase_active_best_quality = 0;
+  rc->preserve_arf_as_gld = 0;
+  rc->preserve_next_arf_as_gld = 0;
+  rc->show_arf_as_gld = 0;
 }
 
-int vp9_rc_drop_frame(VP9_COMP *cpi) {
+static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) {
+  SVC *svc = &cpi->svc;
+  if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) {
+    RATE_CONTROL *const rc = &cpi->rc;
+    return (rc->buffer_level > drop_mark);
+  } else {
+    int i;
+    // For SVC in the FULL_SUPERFRAME_DROP): the condition on
+    // buffer (if its above threshold, so no drop) is checked on current and
+    // upper spatial layers. If any spatial layer is not above threshold then
+    // we return 0.
+    for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+      const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                         svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      RATE_CONTROL *lrc = &lc->rc;
+      // Exclude check for layer whose bitrate is 0.
+      if (lc->target_bandwidth > 0) {
+        const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] *
+                                          lrc->optimal_buffer_level / 100);
+        if (!(lrc->buffer_level > drop_mark_layer)) return 0;
+      }
+    }
+    return 1;
+  }
+}
+
+static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) {
+  SVC *svc = &cpi->svc;
+  if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) {
+    RATE_CONTROL *const rc = &cpi->rc;
+    return (rc->buffer_level <= drop_mark);
+  } else {
+    int i;
+    // For SVC in the constrained framedrop mode (svc->framedrop_mode =
+    // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on
+    // buffer (if its below threshold, so drop frame) is checked on current
+    // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any
+    // spatial layer is <= threshold, then we return 1 (drop).
+    for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+      const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                         svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      RATE_CONTROL *lrc = &lc->rc;
+      // Exclude check for layer whose bitrate is 0.
+      if (lc->target_bandwidth > 0) {
+        const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] *
+                                          lrc->optimal_buffer_level / 100);
+        if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) {
+          if (lrc->buffer_level <= drop_mark_layer) return 1;
+        } else {
+          if (!(lrc->buffer_level <= drop_mark_layer)) return 0;
+        }
+      }
+    }
+    if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP)
+      return 0;
+    else
+      return 1;
+  }
+}
+
+int vp9_test_drop(VP9_COMP *cpi) {
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
-  if (!oxcf->drop_frames_water_mark ||
-      (is_one_pass_cbr_svc(cpi) &&
-       cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)) {
+  SVC *svc = &cpi->svc;
+  int drop_frames_water_mark = oxcf->drop_frames_water_mark;
+  if (cpi->use_svc) {
+    // If we have dropped max_consec_drop frames, then we don't
+    // drop this spatial layer, and reset counter to 0.
+    if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) {
+      svc->drop_count[svc->spatial_layer_id] = 0;
+      return 0;
+    } else {
+      drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id];
+    }
+  }
+  if (!drop_frames_water_mark ||
+      (svc->spatial_layer_id > 0 &&
+       svc->framedrop_mode == FULL_SUPERFRAME_DROP)) {
     return 0;
   } else {
-    if (rc->buffer_level < 0) {
+    if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) ||
+        (check_buffer_below_thresh(cpi, -1) &&
+         svc->framedrop_mode == FULL_SUPERFRAME_DROP)) {
       // Always drop if buffer is below 0.
       return 1;
     } else {
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
       int drop_mark =
-          (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
-      if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+          (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100);
+      if (check_buffer_above_thresh(cpi, drop_mark) &&
+          (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
-      } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+      } else if (check_buffer_below_thresh(cpi, drop_mark) &&
+                 rc->decimation_factor == 0) {
         rc->decimation_factor = 1;
       }
       if (rc->decimation_factor > 0) {
@@ -401,11 +576,135 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) {
   }
 }
 
+int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) {
+  size_t frame_size = *size << 3;
+  int64_t new_buffer_level =
+      cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size;
+
+  // For now we drop if new buffer level (given the encoded frame size) goes
+  // below 0.
+  if (new_buffer_level < 0) {
+    *size = 0;
+    vp9_rc_postencode_update_drop_frame(cpi);
+    // Update flag to use for next frame.
+    if (cpi->rc.high_source_sad ||
+        (cpi->use_svc && cpi->svc.high_source_sad_superframe))
+      cpi->rc.last_post_encode_dropped_scene_change = 1;
+    // Force max_q on next fame.
+    cpi->rc.force_max_q = 1;
+    cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+    cpi->last_frame_dropped = 1;
+    cpi->ext_refresh_frame_flags_pending = 0;
+    if (cpi->use_svc) {
+      SVC *svc = &cpi->svc;
+      int sl = 0;
+      int tl = 0;
+      svc->last_layer_dropped[svc->spatial_layer_id] = 1;
+      svc->drop_spatial_layer[svc->spatial_layer_id] = 1;
+      svc->drop_count[svc->spatial_layer_id]++;
+      svc->skip_enhancement_layer = 1;
+      // Postencode drop is only checked on base spatial layer,
+      // for now if max-q is set on base we force it on all layers.
+      for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+        for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          lrc->force_max_q = 1;
+          lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+        }
+      }
+    }
+    return 1;
+  }
+
+  cpi->rc.force_max_q = 0;
+  cpi->rc.last_post_encode_dropped_scene_change = 0;
+  return 0;
+}
+
+int vp9_rc_drop_frame(VP9_COMP *cpi) {
+  SVC *svc = &cpi->svc;
+  int svc_prev_layer_dropped = 0;
+  // In the constrained or full_superframe framedrop mode for svc
+  // (framedrop_mode != (LAYER_DROP && CONSTRAINED_FROM_ABOVE)),
+  // if the previous spatial layer was dropped, drop the current spatial layer.
+  if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+      svc->drop_spatial_layer[svc->spatial_layer_id - 1])
+    svc_prev_layer_dropped = 1;
+  if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP &&
+       svc->framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP) ||
+      svc->force_drop_constrained_from_above[svc->spatial_layer_id] ||
+      vp9_test_drop(cpi)) {
+    vp9_rc_postencode_update_drop_frame(cpi);
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->last_frame_dropped = 1;
+    if (cpi->use_svc) {
+      svc->last_layer_dropped[svc->spatial_layer_id] = 1;
+      svc->drop_spatial_layer[svc->spatial_layer_id] = 1;
+      svc->drop_count[svc->spatial_layer_id]++;
+      svc->skip_enhancement_layer = 1;
+      if (svc->framedrop_mode == LAYER_DROP ||
+          (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP &&
+           svc->force_drop_constrained_from_above[svc->number_spatial_layers -
+                                                  1] == 0) ||
+          svc->drop_spatial_layer[0] == 0) {
+        // For the case of constrained drop mode where full superframe is
+        // dropped, we don't increment the svc frame counters.
+        // In particular temporal layer counter (which is incremented in
+        // vp9_inc_frame_in_layer()) won't be incremented, so on a dropped
+        // frame we try the same temporal_layer_id on next incoming frame.
+        // This is to avoid an issue with temporal alignment with full
+        // superframe dropping.
+        vp9_inc_frame_in_layer(cpi);
+      }
+      if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+        int i;
+        int all_layers_drop = 1;
+        for (i = 0; i < svc->spatial_layer_id; i++) {
+          if (svc->drop_spatial_layer[i] == 0) {
+            all_layers_drop = 0;
+            break;
+          }
+        }
+        if (all_layers_drop == 1) svc->skip_enhancement_layer = 0;
+      }
+    }
+    return 1;
+  }
+  return 0;
+}
+
+static int adjust_q_cbr(const VP9_COMP *cpi, int q) {
+  // This makes sure q is between oscillating Qs to prevent resonance.
+  if (!cpi->rc.reset_high_source_sad &&
+      (!cpi->oxcf.gf_cbr_boost_pct ||
+       !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
+      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+    int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+                       VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+    // If the previous frame had overshoot and the current q needs to increase
+    // above the clamped value, reduce the clamp for faster reaction to
+    // overshoot.
+    if (cpi->rc.rc_1_frame == -1 && q > qclamp)
+      q = (q + qclamp) >> 1;
+    else
+      q = qclamp;
+  }
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_limit_q(cpi, &q);
+  return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
+
 static double get_rate_correction_factor(const VP9_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_COMMON *const cm = &cpi->common;
   double rcf;
 
-  if (cpi->common.frame_type == KEY_FRAME) {
+  if (frame_is_intra_only(cm)) {
     rcf = rc->rate_correction_factors[KF_STD];
   } else if (cpi->oxcf.pass == 2) {
     RATE_FACTOR_LEVEL rf_lvl =
@@ -425,13 +724,14 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) {
 
 static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
   RATE_CONTROL *const rc = &cpi->rc;
+  const VP9_COMMON *const cm = &cpi->common;
 
   // Normalize RCF to account for the size-dependent scaling factor.
   factor /= rcf_mult[cpi->rc.frame_size_selector];
 
   factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 
-  if (cpi->common.frame_type == KEY_FRAME) {
+  if (frame_is_intra_only(cm)) {
     rc->rate_correction_factors[KF_STD] = factor;
   } else if (cpi->oxcf.pass == 2) {
     RATE_FACTOR_LEVEL rf_lvl =
@@ -452,6 +752,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
   int correction_factor = 100;
   double rate_correction_factor = get_rate_correction_factor(cpi);
   double adjustment_limit;
+  RATE_FACTOR_LEVEL rf_lvl =
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
 
   int projected_size_based_on_q = 0;
 
@@ -468,8 +770,9 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
     projected_size_based_on_q =
         vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
   } else {
+    FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
     projected_size_based_on_q =
-        vp9_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs,
+        vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs,
                                rate_correction_factor, cm->bit_depth);
   }
   // Work out a size correction factor.
@@ -477,10 +780,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
     correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
                               projected_size_based_on_q);
 
-  // More heavily damped adjustment used if we have been oscillating either side
-  // of target.
-  adjustment_limit =
-      0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+  // Do not use damped adjustment for the first frame of each frame type
+  if (!cpi->rc.damped_adjustment[rf_lvl]) {
+    adjustment_limit = 1.0;
+    cpi->rc.damped_adjustment[rf_lvl] = 1;
+  } else {
+    // More heavily damped adjustment used if we have been oscillating either
+    // side of target.
+    adjustment_limit =
+        0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+  }
 
   cpi->rc.q_2_frame = cpi->rc.q_1_frame;
   cpi->rc.q_1_frame = cm->base_qindex;
@@ -523,6 +832,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
 int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
                       int active_best_quality, int active_worst_quality) {
   const VP9_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int q = active_worst_quality;
   int last_error = INT_MAX;
   int i, target_bits_per_mb, bits_per_mb_at_this_q;
@@ -536,42 +846,35 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
   i = active_best_quality;
 
   do {
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-        cpi->svc.temporal_layer_id == 0) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cr->apply_cyclic_refresh &&
+        (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) {
       bits_per_mb_at_this_q =
           (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
     } else {
+      FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
       bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(
-          cm->frame_type, i, correction_factor, cm->bit_depth);
+          frame_type, i, correction_factor, cm->bit_depth);
     }
 
+    int diff_bits = (int)VPXMIN(
+        VPXMAX(((int64_t)target_bits_per_mb - (int64_t)bits_per_mb_at_this_q),
+               -INT_MAX),
+        INT_MAX);
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
-      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+      if (diff_bits <= last_error)
         q = i;
       else
         q = i - 1;
 
       break;
     } else {
-      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+      last_error = -diff_bits;
     }
   } while (++i <= active_worst_quality);
 
-  // In CBR mode, this makes sure q is between oscillating Qs to prevent
-  // resonance.
-  if (cpi->oxcf.rc_mode == VPX_CBR &&
-      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
-      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
-    q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
-              VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
-  }
-#if USE_ALTREF_FOR_ONE_PASS
-  if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 &&
-      cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
-      cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) {
-    q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1);
-  }
-#endif
+  // Adjustment to q for CBR mode.
+  if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q);
+
   return q;
 }
 
@@ -600,13 +903,19 @@ static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
                             kf_low_motion_minq, kf_high_motion_minq);
 }
 
-static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+static int get_gf_active_quality(const VP9_COMP *const cpi, int q,
                                  vpx_bit_depth_t bit_depth) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
   int *arfgf_low_motion_minq;
   int *arfgf_high_motion_minq;
+  const int gfu_boost = cpi->multi_layer_arf
+                            ? gf_group->gfu_boost[gf_group->index]
+                            : rc->gfu_boost;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
-  return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+  return get_active_quality(q, gfu_boost, gf_low, gf_high,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
 
@@ -619,7 +928,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
     active_worst_quality =
         curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1;
   } else {
-    if (!rc->is_src_frame_alt_ref &&
+    if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
       active_worst_quality =
           curr_frame == 1
@@ -651,7 +960,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   int active_worst_quality;
   int ambient_qp;
   unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
-  if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+  if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q)
+    return rc->worst_quality;
   // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
   // for the first few frames following key frame. These are both initialized
   // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
@@ -661,11 +971,25 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
                    ? VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
                             rc->avg_frame_qindex[KEY_FRAME])
                    : rc->avg_frame_qindex[INTER_FRAME];
-  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2);
+  active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2);
+  // For SVC if the current base spatial layer was key frame, use the QP from
+  // that base layer for ambient_qp.
+  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
+    int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+                                 cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    if (lc->is_key_frame) {
+      const RATE_CONTROL *lrc = &lc->rc;
+      ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]);
+      active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3);
+    }
+  }
   if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
-    // Maximum limit for down adjustment, ~30%.
+    // Maximum limit for down adjustment ~30%; make it lower for screen content.
     int max_adjustment_down = active_worst_quality / 3;
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+      max_adjustment_down = active_worst_quality >> 3;
     if (max_adjustment_down) {
       buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
                        max_adjustment_down);
@@ -734,6 +1058,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
           vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
+             cpi->oxcf.gf_cbr_boost_pct &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
@@ -744,7 +1069,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
     } else {
       q = active_worst_quality;
     }
-    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     if (cm->current_video_frame > 1) {
@@ -769,21 +1094,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
   *top_index = active_worst_quality;
   *bottom_index = active_best_quality;
 
-#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
-  // Limit Q range for the adaptive loop.
-  if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-      !(cm->current_video_frame == 0)) {
-    int qdelta = 0;
-    vpx_clear_system_state();
-    qdelta = vp9_compute_qdelta_by_rate(
-        &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
-    *top_index = active_worst_quality + qdelta;
-    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
-  }
-#endif
-
   // Special case code to try and match quality with forced key frames
-  if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+  if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
     q = rc->last_boosted_qindex;
   } else {
     q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
@@ -796,6 +1108,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
         q = *top_index;
     }
   }
+
   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
   assert(*bottom_index <= rc->worst_quality &&
          *bottom_index >= rc->best_quality);
@@ -855,8 +1168,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
   if (frame_is_intra_only(cm)) {
     if (oxcf->rc_mode == VPX_Q) {
       int qindex = cq_level;
-      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
-      int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth);
+      double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex =
+          vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else if (rc->this_key_frame_forced) {
       // Handle the special case for key frames forced when we have reached
@@ -900,35 +1214,38 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
     } else {
       q = rc->avg_frame_qindex[KEY_FRAME];
     }
-    // For constrained quality dont allow Q less than the cq level
+    // For constrained quality don't allow Q less than the cq level
     if (oxcf->rc_mode == VPX_CQ) {
       if (q < cq_level) q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == VPX_Q) {
       int qindex = cq_level;
-      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex;
       if (cpi->refresh_alt_ref_frame)
-        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+        delta_qindex =
+            vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth);
       else
-        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+        delta_qindex =
+            vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
       int qindex = cq_level;
-      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
                                                0.70, 1.0, 0.85, 1.0 };
       int delta_qindex = vp9_compute_qdelta(
-          rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+          rc, qstart,
+          qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
           cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
@@ -972,6 +1289,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
       qdelta = vp9_compute_qdelta_by_rate(
           &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
     }
+    if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0;
     *top_index = active_worst_quality + qdelta;
     *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
   }
@@ -985,6 +1303,30 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
   } else {
     q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality);
+
+    // For no lookahead: if buffer_level indicates overshoot, then avoid going
+    // to very low QP. This reduces overshoot observed in Issue: 376707227.
+    // Note the buffer_level is updated for every encoded frame as:
+    // buffer_level - starting_buffer_level += (avg_frame_bandwidth -
+    // encoded_frame_size). So normalizing this with framerate and #encoded
+    // frames (current_video_frame) gives the difference/error between target
+    // and encoding bitrate. The additional avg_frame_bandwidth term is to
+    // compensate for the pre-encoded buffer update (in
+    // vp9_rc_get_one_pass_vbr_params).
+    const int qp_thresh = 32;
+    const int64_t bitrate_err =
+        (int64_t)(cpi->framerate *
+                  (rc->buffer_level - rc->starting_buffer_level -
+                   rc->avg_frame_bandwidth) /
+                  (cm->current_video_frame + 1));
+    // Threshold may be tuned, but for now condition this on low QP.
+    if (cpi->oxcf.lag_in_frames == 0 && bitrate_err / 1000 < -10 &&
+        qp_thresh < rc->worst_quality &&
+        (q < qp_thresh || *top_index < qp_thresh)) {
+      q = qp_thresh;
+      *top_index = VPXMAX(*top_index, q);
+    }
+
     if (q > *top_index) {
       // Special case when we are targeting the max allowed rate
       if (rc->this_frame_target >= rc->max_frame_bandwidth)
@@ -1009,19 +1351,122 @@ int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
     1.75,  // GF_ARF_STD
     2.00,  // KF_STD
   };
-  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
-    INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
-  };
   const VP9_COMMON *const cm = &cpi->common;
-  int qdelta =
-      vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
-                                 rate_factor_deltas[rf_level], cm->bit_depth);
+
+  int qdelta = vp9_compute_qdelta_by_rate(
+      &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth);
   return qdelta;
 }
 
 #define STATIC_MOTION_THRESH 95
-static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
-                                         int *top_index) {
+
+static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index,
+                                     int *top_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  int active_best_quality;
+  int active_worst_quality = cpi->twopass.active_worst_quality;
+
+  if (rc->this_key_frame_forced) {
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    double last_boosted_q;
+    int delta_qindex;
+    int qindex;
+
+    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+      active_best_quality = qindex;
+      last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                        last_boosted_q * 1.25, cm->bit_depth);
+      active_worst_quality =
+          VPXMIN(qindex + delta_qindex, active_worst_quality);
+    } else {
+      qindex = rc->last_boosted_qindex;
+      last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                        last_boosted_q * 0.75, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    }
+  } else {
+    // Not forced keyframe.
+    double q_adj_factor = 1.0;
+    double q_val;
+    // Baseline value derived from cpi->active_worst_quality and kf boost.
+    active_best_quality =
+        get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+    if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+      active_best_quality /= 4;
+    }
+
+    // Don't allow the active min to be lossless (q0) unlesss the max q
+    // already indicates lossless.
+    active_best_quality =
+        VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality));
+
+    // Allow somewhat lower kf minq with small image formats.
+    if ((cm->width * cm->height) <= (352 * 288)) {
+      q_adj_factor -= 0.25;
+    }
+
+    // Make a further adjustment based on the kf zero motion measure.
+    q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+    // Convert the adjustment factor to a qindex delta
+    // on active_best_quality.
+    q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+    active_best_quality +=
+        vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+  }
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+}
+
+static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
+                         int gf_group_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int is_intra_frame = frame_is_intra_only(cm);
+
+  const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
+
+  int q = cq_level;
+  int active_best_quality = cq_level;
+  int active_worst_quality = cq_level;
+
+  // Key frame qp decision
+  if (is_intra_frame && rc->frames_to_key > 1)
+    pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality);
+
+  // ARF / GF qp decision
+  if (!is_intra_frame && !rc->is_src_frame_alt_ref &&
+      cpi->refresh_alt_ref_frame) {
+    active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+
+    // Modify best quality for second level arfs. For mode VPX_Q this
+    // becomes the baseline frame q.
+    if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+      const int layer_depth = gf_group->layer_depth[gf_group_index];
+      // linearly fit the frame q depending on the layer depth index from
+      // the base layer ARF.
+      active_best_quality = ((layer_depth - 1) * cq_level +
+                             active_best_quality + layer_depth / 2) /
+                            layer_depth;
+    }
+  }
+
+  q = active_best_quality;
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+  return q;
+}
+
+int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+                                      int *top_index, int gf_group_index) {
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1031,56 +1476,20 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
   int *inter_minq;
+  int arf_active_best_quality_hl;
+  int *arfgf_high_motion_minq, *arfgf_low_motion_minq;
+  const int boost_frame =
+      !rc->is_src_frame_alt_ref &&
+      (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+
   ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
-  if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
-    // Handle the special case for key frames forced when we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
-      double last_boosted_q;
-      int delta_qindex;
-      int qindex;
+  if (oxcf->rc_mode == VPX_Q)
+    return rc_constant_q(cpi, bottom_index, top_index, gf_group_index);
 
-      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-        qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
-        active_best_quality = qindex;
-        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
-        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 1.25, cm->bit_depth);
-        active_worst_quality =
-            VPXMIN(qindex + delta_qindex, active_worst_quality);
-      } else {
-        qindex = rc->last_boosted_qindex;
-        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
-        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 0.75, cm->bit_depth);
-        active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
-      }
-    } else {
-      // Not forced keyframe.
-      double q_adj_factor = 1.0;
-      double q_val;
-      // Baseline value derived from cpi->active_worst_quality and kf boost.
-      active_best_quality =
-          get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
-
-      // Allow somewhat lower kf minq with small image formats.
-      if ((cm->width * cm->height) <= (352 * 288)) {
-        q_adj_factor -= 0.25;
-      }
-
-      // Make a further adjustment based on the kf zero motion measure.
-      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
-
-      // Convert the adjustment factor to a qindex delta
-      // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
-      active_best_quality +=
-          vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
-    }
-  } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+  if (frame_is_intra_only(cm)) {
+    pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality);
+  } else if (boost_frame) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -1090,66 +1499,81 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
     } else {
       q = active_worst_quality;
     }
-    // For constrained quality dont allow Q less than the cq level
+    // For constrained quality don't allow Q less than the cq level
     if (oxcf->rc_mode == VPX_CQ) {
       if (q < cq_level) q = cq_level;
+    }
+    active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+    arf_active_best_quality_hl = active_best_quality;
 
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    if (rc->arf_increase_active_best_quality == 1) {
+      ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_high_motion_minq);
+      arf_active_best_quality_hl = arfgf_high_motion_minq[q];
+    } else if (rc->arf_increase_active_best_quality == -1) {
+      ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_low_motion_minq);
+      arf_active_best_quality_hl = arfgf_low_motion_minq[q];
+    }
+    active_best_quality =
+        (int)((double)active_best_quality *
+                  rc->arf_active_best_quality_adjustment_factor +
+              (double)arf_active_best_quality_hl *
+                  (1.0 - rc->arf_active_best_quality_adjustment_factor));
 
-      // Constrained quality use slightly lower active best.
-      active_best_quality = active_best_quality * 15 / 16;
-
-    } else if (oxcf->rc_mode == VPX_Q) {
-      if (!cpi->refresh_alt_ref_frame) {
-        active_best_quality = cq_level;
-      } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-
-        // Modify best quality for second level arfs. For mode VPX_Q this
-        // becomes the baseline frame q.
-        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
-          active_best_quality = (active_best_quality + cq_level + 1) / 2;
-      }
-    } else {
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    // Modify best quality for second level arfs. For mode VPX_Q this
+    // becomes the baseline frame q.
+    if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+      const int layer_depth = gf_group->layer_depth[gf_group_index];
+      // linearly fit the frame q depending on the layer depth index from
+      // the base layer ARF.
+      active_best_quality =
+          ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) /
+          layer_depth;
     }
   } else {
-    if (oxcf->rc_mode == VPX_Q) {
-      active_best_quality = cq_level;
-    } else {
-      active_best_quality = inter_minq[active_worst_quality];
+    active_best_quality = inter_minq[active_worst_quality];
 
-      // For the constrained quality mode we don't want
-      // q to fall below the cq level.
-      if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
-        active_best_quality = cq_level;
-      }
+    // For the constrained quality mode we don't want
+    // q to fall below the cq level.
+    if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
+      active_best_quality = cq_level;
     }
   }
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if (cpi->oxcf.rc_mode != VPX_Q) {
-    if (frame_is_intra_only(cm) ||
-        (!rc->is_src_frame_alt_ref &&
-         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
-      active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
-      active_worst_quality += (cpi->twopass.extend_maxq / 2);
-    } else {
-      active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
-      active_worst_quality += cpi->twopass.extend_maxq;
+  if (frame_is_intra_only(cm) || boost_frame) {
+    const int layer_depth = gf_group->layer_depth[gf_group_index];
+    active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+    active_worst_quality += (cpi->twopass.extend_maxq / 2);
+
+    if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+      assert(layer_depth > 1);
+      active_best_quality =
+          VPXMAX(active_best_quality,
+                 cpi->twopass.last_qindex_of_arf_layer[layer_depth - 1]);
     }
+  } else {
+    const int max_layer_depth = gf_group->max_layer_depth;
+    assert(max_layer_depth > 0);
+
+    active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+    active_worst_quality += cpi->twopass.extend_maxq;
+
+    // For normal frames do not allow an active minq lower than the q used for
+    // the last boosted frame.
+    active_best_quality =
+        VPXMAX(active_best_quality,
+               cpi->twopass.last_qindex_of_arf_layer[max_layer_depth - 1]);
   }
 
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   vpx_clear_system_state();
   // Static forced key frames Q restrictions dealt with elsewhere.
-  if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) ||
-      !rc->this_key_frame_forced ||
-      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
-    int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+  if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced ||
+      cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) {
+    int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index],
                                        active_worst_quality);
     active_worst_quality =
         VPXMAX(active_worst_quality + qdelta, active_best_quality);
@@ -1169,17 +1593,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
   active_worst_quality =
       clamp(active_worst_quality, active_best_quality, rc->worst_quality);
 
-  if (oxcf->rc_mode == VPX_Q) {
-    q = active_best_quality;
-    // Special case code to try and match quality with forced key frames.
-  } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) &&
-             rc->this_key_frame_forced) {
+  if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
     // If static since last kf use better of last boosted and last kf q.
     if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
       q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
     } else {
       q = rc->last_boosted_qindex;
     }
+  } else if (frame_is_intra_only(cm) && !rc->this_key_frame_forced) {
+    q = active_best_quality;
   } else {
     q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality);
@@ -1191,7 +1613,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
         q = active_worst_quality;
     }
   }
-  clamp(q, active_best_quality, active_worst_quality);
 
   *top_index = active_worst_quality;
   *bottom_index = active_best_quality;
@@ -1206,13 +1627,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
 int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
                              int *top_index) {
   int q;
+  const int gf_group_index = cpi->twopass.gf_group.index;
   if (cpi->oxcf.pass == 0) {
     if (cpi->oxcf.rc_mode == VPX_CBR)
       q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
     else
       q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
   } else {
-    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+    q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+                                          gf_group_index);
   }
   if (cpi->sf.use_nonrd_pick_mode) {
     if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
@@ -1225,6 +1648,64 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
   return q;
 }
 
+void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
+  VP9_COMMON *cm = &cpi->common;
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  cpi->rc.is_src_frame_alt_ref = 0;
+  cm->show_existing_frame = 0;
+  cpi->rc.show_arf_as_gld = 0;
+  switch (twopass->gf_group.update_type[gf_group_index]) {
+    case KF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+    case LF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case GF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      if (cpi->rc.preserve_arf_as_gld) {
+        cpi->rc.show_arf_as_gld = 1;
+        cpi->refresh_golden_frame = 0;
+        cm->show_existing_frame = 1;
+        cm->refresh_frame_context = 0;
+      }
+      break;
+    case MID_OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+    case USE_BUF_FRAME:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      cm->show_existing_frame = 1;
+      cm->refresh_frame_context = 0;
+      break;
+    default:
+      assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE);
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+  }
+}
+
 void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit) {
@@ -1234,8 +1715,10 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
   } else {
     // For very small rate targets where the fractional adjustment
     // may be tiny make sure there is at least a minimum range.
-    const int tol_low = (cpi->sf.recode_tolerance_low * frame_target) / 100;
-    const int tol_high = (cpi->sf.recode_tolerance_high * frame_target) / 100;
+    const int tol_low =
+        (int)(((int64_t)cpi->sf.recode_tolerance_low * frame_target) / 100);
+    const int tol_high =
+        (int)(((int64_t)cpi->sf.recode_tolerance_high * frame_target) / 100);
     *frame_under_shoot_limit = VPXMAX(frame_target - tol_low - 100, 0);
     *frame_over_shoot_limit =
         VPXMIN(frame_target + tol_high + 100, cpi->rc.max_frame_bandwidth);
@@ -1250,13 +1733,15 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
 
   // Modify frame size target when down-scaling.
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
-      rc->frame_size_selector != UNSCALED)
+      rc->frame_size_selector != UNSCALED) {
     rc->this_frame_target = (int)(rc->this_frame_target *
                                   rate_thresh_mult[rc->frame_size_selector]);
+  }
 
   // Target rate per SB64 (including partial SB64s.
-  rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) /
-                               (cm->width * cm->height));
+  const int64_t sb64_target_rate =
+      ((int64_t)rc->this_frame_target * 64 * 64) / (cm->width * cm->height);
+  rc->sb64_target_rate = (int)VPXMIN(sb64_target_rate, INT_MAX);
 }
 
 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
@@ -1297,11 +1782,43 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
     if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
 
     rc->frames_since_golden++;
+
+    if (rc->show_arf_as_gld) {
+      rc->frames_since_golden = 0;
+      // If we are not using alt ref in the up and coming group clear the arf
+      // active flag. In multi arf group case, if the index is not 0 then
+      // we are overlaying a mid group arf so should not reset the flag.
+      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+        rc->source_alt_ref_active = 0;
+    }
   }
 }
 
-static void compute_frame_low_motion(VP9_COMP *const cpi) {
+static void update_altref_usage(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
+  int sum_ref_frame_usage = 0;
+  int arf_frame_usage = 0;
+  int mi_row, mi_col;
+  if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref &&
+      !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame)
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) {
+        int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+        sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] +
+                               cpi->count_lastgolden_frame_usage[sboffset];
+        arf_frame_usage += cpi->count_arf_frame_usage[sboffset];
+      }
+    }
+  if (sum_ref_frame_usage > 0) {
+    double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage;
+    cpi->rc.perc_arf_usage =
+        0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count;
+  }
+}
+
+void vp9_compute_frame_low_motion(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
   int mi_row, mi_col;
   MODE_INFO **mi = cm->mi_grid_visible;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1309,7 +1826,8 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) {
   int cnt_zeromv = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      if (abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16)
+      if (mi[0]->ref_frame[0] == LAST_FRAME &&
+          abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16)
         cnt_zeromv++;
       mi++;
     }
@@ -1317,17 +1835,30 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) {
   }
   cnt_zeromv = 100 * cnt_zeromv / (rows * cols);
   rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2;
+
+  // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+  // to all lower spatial layers.
+  if (cpi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+    int i;
+    for (i = 0; i < svc->number_spatial_layers - 1; ++i) {
+      const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                         svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
+    }
+  }
 }
 
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
   const int qindex = cm->base_qindex;
-
-  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-    vp9_cyclic_refresh_postencode(cpi);
-  }
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int gf_group_index = cpi->twopass.gf_group.index;
+  const int layer_depth = gf_group->layer_depth[gf_group_index];
 
   // Update rate control heuristics
   rc->projected_frame_size = (int)(bytes_used << 3);
@@ -1336,13 +1867,12 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   vp9_rc_update_rate_correction_factors(cpi);
 
   // Keep a record of last Q and ambient average Q.
-  if (cm->frame_type == KEY_FRAME) {
+  if (frame_is_intra_only(cm)) {
     rc->last_q[KEY_FRAME] = qindex;
     rc->avg_frame_qindex[KEY_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
     if (cpi->use_svc) {
-      int i = 0;
-      SVC *svc = &cpi->svc;
+      int i;
       for (i = 0; i < svc->number_temporal_layers; ++i) {
         const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
                                            svc->number_temporal_layers);
@@ -1353,7 +1883,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
       }
     }
   } else {
-    if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+    if ((cpi->use_svc) ||
         (!rc->is_src_frame_alt_ref &&
          !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
       rc->last_q[INTER_FRAME] = qindex;
@@ -1369,6 +1899,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
     }
   }
 
+  if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi);
+
   // Keep record of last boosted (KF/KF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
@@ -1380,21 +1912,31 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
-  if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
 
-  update_buffer_level(cpi, rc->projected_frame_size);
+  if ((qindex < cpi->twopass.last_qindex_of_arf_layer[layer_depth]) ||
+      (cm->frame_type == KEY_FRAME) ||
+      (!rc->constrained_gf_group &&
+       (cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+    cpi->twopass.last_qindex_of_arf_layer[layer_depth] = qindex;
+  }
+
+  if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex;
+
+  update_buffer_level_postencode(cpi, rc->projected_frame_size);
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
-  if (cm->frame_type != KEY_FRAME) {
-    rc->rolling_target_bits = ROUND_POWER_OF_TWO(
-        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
-    rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
-        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
-    rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
-        rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
-    rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
-        rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+  if (!frame_is_intra_only(cm)) {
+    rc->rolling_target_bits = (int)ROUND64_POWER_OF_TWO(
+        (int64_t)rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    rc->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO(
+        (int64_t)rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    rc->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO(
+        (int64_t)rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+    rc->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO(
+        (int64_t)rc->long_rolling_actual_bits * 31 + rc->projected_frame_size,
+        5);
   }
 
   // Actual bits spent
@@ -1403,9 +1945,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (!cpi->use_svc || is_two_pass_svc(cpi)) {
+  if (!cpi->use_svc) {
     if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
-        (cm->frame_type != KEY_FRAME))
+        (!frame_is_intra_only(cm)))
       // Update the alternate reference frame stats as appropriate.
       update_alt_ref_frame_stats(cpi);
     else
@@ -1413,7 +1955,28 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
       update_golden_frame_stats(cpi);
   }
 
-  if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+  // If second (long term) temporal reference is used for SVC,
+  // update the golden frame counter, only for base temporal layer.
+  if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
+      svc->temporal_layer_id == 0) {
+    int i = 0;
+    if (cpi->refresh_golden_frame)
+      rc->frames_since_golden = 0;
+    else
+      rc->frames_since_golden++;
+    // Decrement count down till next gf
+    if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+    // Update the frames_since_golden for all upper temporal layers.
+    for (i = 1; i < svc->number_temporal_layers; ++i) {
+      const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                         svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lrc->frames_since_golden = rc->frames_since_golden;
+    }
+  }
+
+  if (frame_is_intra_only(cm)) rc->frames_since_key = 0;
   if (cm->show_frame) {
     rc->frames_since_key++;
     rc->frames_to_key--;
@@ -1427,36 +1990,68 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   }
 
   if (oxcf->pass == 0) {
-    if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi);
+    if (!frame_is_intra_only(cm))
+      if (cpi->sf.use_altref_onepass) update_altref_usage(cpi);
+    cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
   }
+
+  if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0;
+
+  rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+  if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1)
+    svc->lower_layer_qindex = cm->base_qindex;
+  cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
-  // Update buffer level with zero size, update frame counters, and return.
-  update_buffer_level(cpi, 0);
+  cpi->common.current_video_frame++;
   cpi->rc.frames_since_key++;
   cpi->rc.frames_to_key--;
   cpi->rc.rc_2_frame = 0;
   cpi->rc.rc_1_frame = 0;
+  cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+  cpi->rc.last_q[INTER_FRAME] = cpi->common.base_qindex;
+  // For SVC on dropped frame when framedrop_mode != LAYER_DROP:
+  // in this mode the whole superframe may be dropped if only a single layer
+  // has buffer underflow (below threshold). Since this can then lead to
+  // increasing buffer levels/overflow for certain layers even though whole
+  // superframe is dropped, we cap buffer level if its already stable.
+  if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP &&
+      cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) {
+    cpi->rc.buffer_level = cpi->rc.optimal_buffer_level;
+    cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level;
+  }
+  cpi->deadline_mode_previous_frame = cpi->oxcf.mode;
 }
 
-static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const int af_ratio = rc->af_ratio_onepass_vbr;
-  int target =
+  int64_t target =
       (!rc->is_src_frame_alt_ref &&
        (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
-          ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+          ? ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+             af_ratio) /
                 (rc->baseline_gf_interval + af_ratio - 1)
-          : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+          : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
                 (rc->baseline_gf_interval + af_ratio - 1);
-  return vp9_rc_clamp_pframe_target_size(cpi, target);
+  // For SVC: refresh flags are used to define the pattern, so we can't
+  // use that for boosting the target size here.
+  // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC.
+  // For now just use the CBR logic for setting target size.
+  if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+  if (target > INT_MAX) target = INT_MAX;
+  return vp9_rc_clamp_pframe_target_size(cpi, (int)target);
 }
 
-static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
+int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
   static const int kf_ratio = 25;
   const RATE_CONTROL *rc = &cpi->rc;
-  const int target = rc->avg_frame_bandwidth * kf_ratio;
+  int target = rc->avg_frame_bandwidth;
+  if (target > INT_MAX / kf_ratio)
+    target = INT_MAX;
+  else
+    target = rc->avg_frame_bandwidth * kf_ratio;
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
@@ -1479,23 +2074,9 @@ static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) {
   }
 }
 
-void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
+void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  int target;
-  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if (!cpi->refresh_alt_ref_frame &&
-      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
-    cm->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        cm->current_video_frame != 0 && rc->frames_to_key == 0;
-    rc->frames_to_key = cpi->oxcf.key_freq;
-    rc->kf_boost = DEFAULT_KF_BOOST;
-    rc->source_alt_ref_active = 0;
-  } else {
-    cm->frame_type = INTER_FRAME;
-  }
+  VP9_COMMON *const cm = &cpi->common;
   if (rc->frames_till_gf_update_due == 0) {
     double rate_err = 1.0;
     rc->gfu_boost = DEFAULT_GF_BOOST;
@@ -1514,39 +2095,63 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
           rate_err > 3.5) {
         rc->baseline_gf_interval =
             VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
-      } else if (rc->avg_frame_low_motion < 20) {
+      } else if (rc->avg_frame_low_motion > 0 &&
+                 rc->avg_frame_low_motion < 20) {
         // Decrease gf interval for high motion case.
         rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1);
       }
-      // Adjust boost and af_ratio based on avg_frame_low_motion, which varies
-      // between 0 and 100 (stationary, 100% zero/small motion).
-      rc->gfu_boost =
-          VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
-                          (rc->avg_frame_low_motion + 100));
+      // Adjust boost and af_ratio based on avg_frame_low_motion, which
+      // varies between 0 and 100 (stationary, 100% zero/small motion).
+      if (rc->avg_frame_low_motion > 0)
+        rc->gfu_boost =
+            VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+                            (rc->avg_frame_low_motion + 100));
+      else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0)
+        rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
       rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
     }
-    adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
+    if (rc->constrain_gf_key_freq_onepass_vbr)
+      adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = 0;
     rc->alt_ref_gf_group = 0;
-#if USE_ALTREF_FOR_ONE_PASS
-    if (cpi->oxcf.enable_auto_arf) {
+    if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) {
       rc->source_alt_ref_pending = 1;
       rc->alt_ref_gf_group = 1;
     }
-#endif
   }
+}
+
+void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target;
+  if (!cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+       rc->frames_to_key == 0 ||
+       (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) {
+    cm->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_video_frame != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+  vp9_set_gf_update_one_pass_vbr(cpi);
   if (cm->frame_type == KEY_FRAME)
-    target = calc_iframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
   else
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
   vp9_rc_set_frame_target(cpi, target);
+  if (cm->show_frame) vp9_update_buffer_level_preencode(cpi);
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
     vp9_cyclic_refresh_update_parameters(cpi);
 }
 
-static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
   const SVC *const svc = &cpi->svc;
@@ -1554,20 +2159,21 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
   int min_frame_target =
       VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
-  int target;
+  int64_t target;
 
   if (oxcf->gf_cbr_boost_pct) {
     const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
     target = cpi->refresh_golden_frame
-                 ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval *
-                    af_ratio_pct) /
+                 ? ((int64_t)rc->avg_frame_bandwidth *
+                    rc->baseline_gf_interval * af_ratio_pct) /
                        (rc->baseline_gf_interval * 100 + af_ratio_pct - 100)
-                 : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+                 : ((int64_t)rc->avg_frame_bandwidth *
+                    rc->baseline_gf_interval * 100) /
                        (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
   } else {
     target = rc->avg_frame_bandwidth;
   }
-  if (is_one_pass_cbr_svc(cpi)) {
+  if (is_one_pass_svc(cpi)) {
     // Note that for layers, avg_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -1588,22 +2194,21 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
     target += (target * pct_high) / 200;
   }
   if (oxcf->rc_max_inter_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
     target = VPXMIN(target, max_rate);
   }
-  return VPXMAX(min_frame_target, target);
+  if (target > INT_MAX) target = INT_MAX;
+  return VPXMAX(min_frame_target, (int)target);
 }
 
-static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+int vp9_calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   const RATE_CONTROL *rc = &cpi->rc;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   const SVC *const svc = &cpi->svc;
-  int target;
+  int64_t target;
   if (cpi->common.current_video_frame == 0) {
-    target = ((rc->starting_buffer_level / 2) > INT_MAX)
-                 ? INT_MAX
-                 : (int)(rc->starting_buffer_level / 2);
+    target = rc->starting_buffer_level / 2;
   } else {
     int kf_boost = 32;
     double framerate = cpi->framerate;
@@ -1615,88 +2220,295 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
       const LAYER_CONTEXT *lc = &svc->layer_context[layer];
       framerate = lc->framerate;
     }
-    kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16));
+    kf_boost = VPXMAX(kf_boost, (int)round(2 * framerate - 16));
     if (rc->frames_since_key < framerate / 2) {
-      kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+      kf_boost = (int)round(kf_boost * rc->frames_since_key / (framerate / 2));
+    }
+
+    target = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  target = VPXMIN(INT_MAX, target);
+  return vp9_rc_clamp_iframe_target_size(cpi, (int)target);
+}
+
+static void set_intra_only_frame(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  // Don't allow intra_only frame for bypass/flexible SVC mode, or if number
+  // of spatial layers is 1 or if number of spatial or temporal layers > 3.
+  // Also if intra-only is inserted on very first frame, don't allow if
+  // if number of temporal layers > 1. This is because on intra-only frame
+  // only 3 reference buffers can be updated, but for temporal layers > 1
+  // we generally need to use buffer slots 4 and 5.
+  if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) ||
+      svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 ||
+      svc->number_spatial_layers == 1)
+    return;
+  cm->show_frame = 0;
+  cm->intra_only = 1;
+  cm->frame_type = INTER_FRAME;
+  cpi->ext_refresh_frame_flags_pending = 1;
+  cpi->ext_refresh_last_frame = 1;
+  cpi->ext_refresh_golden_frame = 1;
+  cpi->ext_refresh_alt_ref_frame = 1;
+  if (cm->current_video_frame == 0) {
+    cpi->lst_fb_idx = 0;
+    cpi->gld_fb_idx = 1;
+    cpi->alt_fb_idx = 2;
+  } else {
+    int i;
+    int count = 0;
+    cpi->lst_fb_idx = -1;
+    cpi->gld_fb_idx = -1;
+    cpi->alt_fb_idx = -1;
+    svc->update_buffer_slot[0] = 0;
+    // For intra-only frame we need to refresh all slots that were
+    // being used for the base layer (fb_idx_base[i] == 1).
+    // Start with assigning last first, then golden and then alt.
+    for (i = 0; i < REF_FRAMES; ++i) {
+      if (svc->fb_idx_base[i] == 1) {
+        svc->update_buffer_slot[0] |= 1 << i;
+        count++;
+      }
+      if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i;
+      if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i;
+      if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i;
+    }
+    // If golden or alt is not being used for base layer, then set them
+    // to the lst_fb_idx.
+    if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx;
+    if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx;
+    if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 0;
+      cpi->ext_refresh_alt_ref_frame = 0;
+      cpi->ref_frame_flags = 0;
     }
-    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
   }
-  return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
   int target = rc->avg_frame_bandwidth;
-  int layer =
-      LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
-                       cpi->svc.number_temporal_layers);
+  int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                               svc->number_temporal_layers);
+  if (svc->first_spatial_layer_to_encode)
+    svc->layer_context[svc->temporal_layer_id].is_key_frame = 0;
   // Periodic key frames is based on the super-frame counter
   // (svc.current_superframe), also only base spatial layer is key frame.
-  if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+  // Key frame is set for any of the following: very first frame, frame flags
+  // indicates key, superframe counter hits key frequency,(non-intra) sync
+  // flag is set for spatial layer 0, or deadline mode changes.
+  if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
       (cpi->oxcf.auto_key &&
-       (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) &&
-       cpi->svc.spatial_layer_id == 0)) {
+       (svc->current_superframe % cpi->oxcf.key_freq == 0) &&
+       !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) ||
+      (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) ||
+      (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
-    if (is_two_pass_svc(cpi)) {
-      cpi->svc.layer_context[layer].is_key_frame = 1;
-      cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
-    } else if (is_one_pass_cbr_svc(cpi)) {
-      if (cm->current_video_frame > 0) vp9_svc_reset_key_frame(cpi);
-      layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
-                               cpi->svc.temporal_layer_id,
-                               cpi->svc.number_temporal_layers);
-      cpi->svc.layer_context[layer].is_key_frame = 1;
+    if (is_one_pass_svc(cpi)) {
+      if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
+      layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                               svc->number_temporal_layers);
+      svc->layer_context[layer].is_key_frame = 1;
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
       // Assumption here is that LAST_FRAME is being updated for a keyframe.
       // Thus no change in update flags.
-      target = calc_iframe_target_size_one_pass_cbr(cpi);
+      if (cpi->oxcf.rc_mode == VPX_CBR)
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+      else
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
     }
   } else {
     cm->frame_type = INTER_FRAME;
-    if (is_two_pass_svc(cpi)) {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
-      if (cpi->svc.spatial_layer_id == 0) {
-        lc->is_key_frame = 0;
+    if (is_one_pass_svc(cpi)) {
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      // Add condition current_video_frame > 0 for the case where first frame
+      // is intra only followed by overlay/copy frame. In this case we don't
+      // want to reset is_key_frame to 0 on overlay/copy frame.
+      lc->is_key_frame =
+          (svc->spatial_layer_id == 0 && cm->current_video_frame > 0)
+              ? 0
+              : svc->layer_context[svc->temporal_layer_id].is_key_frame;
+      if (cpi->oxcf.rc_mode == VPX_CBR) {
+        target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
       } else {
-        lc->is_key_frame =
-            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
-        if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+        double rate_err = 0.0;
+        rc->fac_active_worst_inter = 140;
+        rc->fac_active_worst_gf = 100;
+        if (rc->rolling_target_bits > 0) {
+          rate_err =
+              (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+          if (rate_err < 1.0)
+            rc->fac_active_worst_inter = 120;
+          else if (rate_err > 2.0)
+            // Increase active_worst faster if rate fluctuation is high.
+            rc->fac_active_worst_inter = 160;
+        }
+        target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
       }
-      cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
-    } else if (is_one_pass_cbr_svc(cpi)) {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
-      if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) {
-        lc->is_key_frame = 0;
-      } else {
-        lc->is_key_frame =
-            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
-      }
-      target = calc_pframe_target_size_one_pass_cbr(cpi);
     }
   }
 
+  if (svc->simulcast_mode) {
+    if (svc->spatial_layer_id > 0 &&
+        svc->layer_context[layer].is_key_frame == 1) {
+      cm->frame_type = KEY_FRAME;
+      cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+      if (cpi->oxcf.rc_mode == VPX_CBR)
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+      else
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
+    }
+    // Set the buffer idx and refresh flags for key frames in simulcast mode.
+    // Note the buffer slot for long-term reference is set below (line 2255),
+    // and alt_ref is used for that on key frame. So use last and golden for
+    // the other two normal slots.
+    if (cm->frame_type == KEY_FRAME) {
+      if (svc->number_spatial_layers == 2) {
+        if (svc->spatial_layer_id == 0) {
+          cpi->lst_fb_idx = 0;
+          cpi->gld_fb_idx = 2;
+          cpi->alt_fb_idx = 6;
+        } else if (svc->spatial_layer_id == 1) {
+          cpi->lst_fb_idx = 1;
+          cpi->gld_fb_idx = 3;
+          cpi->alt_fb_idx = 6;
+        }
+      } else if (svc->number_spatial_layers == 3) {
+        if (svc->spatial_layer_id == 0) {
+          cpi->lst_fb_idx = 0;
+          cpi->gld_fb_idx = 3;
+          cpi->alt_fb_idx = 6;
+        } else if (svc->spatial_layer_id == 1) {
+          cpi->lst_fb_idx = 1;
+          cpi->gld_fb_idx = 4;
+          cpi->alt_fb_idx = 6;
+        } else if (svc->spatial_layer_id == 2) {
+          cpi->lst_fb_idx = 2;
+          cpi->gld_fb_idx = 5;
+          cpi->alt_fb_idx = 7;
+        }
+      }
+      cpi->ext_refresh_last_frame = 1;
+      cpi->ext_refresh_golden_frame = 1;
+      cpi->ext_refresh_alt_ref_frame = 1;
+    }
+  }
+
+  // Check if superframe contains a sync layer request.
+  vp9_svc_check_spatial_layer_sync(cpi);
+
+  // If long term termporal feature is enabled, set the period of the update.
+  // The update/refresh of this reference frame is always on base temporal
+  // layer frame.
+  if (svc->use_gf_temporal_ref_current_layer) {
+    // Only use gf long-term prediction on non-key superframes.
+    if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+      // Use golden for this reference, which will be used for prediction.
+      int index = svc->spatial_layer_id;
+      if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+      assert(index >= 0);
+      cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+      // Enable prediction off LAST (last reference) and golden (which will
+      // generally be further behind/long-term reference).
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+    // Check for update/refresh of reference: only refresh on base temporal
+    // layer.
+    if (svc->temporal_layer_id == 0) {
+      if (svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+        // On key frame we update the buffer index used for long term reference.
+        // Use the alt_ref since it is not used or updated on key frames.
+        int index = svc->spatial_layer_id;
+        if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+        assert(index >= 0);
+        cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+        cpi->ext_refresh_alt_ref_frame = 1;
+      } else if (rc->frames_till_gf_update_due == 0) {
+        // Set perdiod of next update. Make it a multiple of 10, as the cyclic
+        // refresh is typically ~10%, and we'd like the update to happen after
+        // a few cylces of the refresh (so it better quality frame). Note the
+        // cyclic refresh for SVC only operates on base temporal layer frames.
+        // Choose 20 as perdiod for now (2 cycles).
+        rc->baseline_gf_interval = 20;
+        rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+        cpi->ext_refresh_golden_frame = 1;
+        rc->gfu_boost = DEFAULT_GF_BOOST;
+      }
+    }
+  } else if (!svc->use_gf_temporal_ref) {
+    rc->frames_till_gf_update_due = INT_MAX;
+    rc->baseline_gf_interval = INT_MAX;
+  }
+  if (svc->set_intra_only_frame) {
+    set_intra_only_frame(cpi);
+    if (cpi->oxcf.rc_mode == VPX_CBR)
+      target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+    else
+      target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
+  }
+  // Overlay frame predicts from LAST (intra-only)
+  if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG;
+
   // Any update/change of global cyclic refresh parameters (amount/delta-qp)
   // should be done here, before the frame qp is selected.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     vp9_cyclic_refresh_update_parameters(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
-  rc->frames_till_gf_update_due = INT_MAX;
-  rc->baseline_gf_interval = INT_MAX;
+  if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi);
+
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
+      svc->spatial_layer_id == svc->first_spatial_layer_to_encode &&
+      svc->temporal_layer_id == 0) {
+    LAYER_CONTEXT *lc = NULL;
+    cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
+    if (cpi->resize_pending) {
+      int tl, width, height;
+      // Apply the same scale to all temporal layers.
+      for (tl = 0; tl < svc->number_temporal_layers; tl++) {
+        lc = &svc->layer_context[svc->spatial_layer_id *
+                                     svc->number_temporal_layers +
+                                 tl];
+        lc->scaling_factor_num_resize =
+            cpi->resize_scale_num * lc->scaling_factor_num;
+        lc->scaling_factor_den_resize =
+            cpi->resize_scale_den * lc->scaling_factor_den;
+        // Reset rate control for all temporal layers.
+        lc->rc.buffer_level = lc->rc.optimal_buffer_level;
+        lc->rc.bits_off_target = lc->rc.optimal_buffer_level;
+        lc->rc.rate_correction_factors[INTER_FRAME] =
+            rc->rate_correction_factors[INTER_FRAME];
+      }
+      // Set the size for this current temporal layer.
+      lc = &svc->layer_context[svc->spatial_layer_id *
+                                   svc->number_temporal_layers +
+                               svc->temporal_layer_id];
+      get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+                           lc->scaling_factor_num_resize,
+                           lc->scaling_factor_den_resize, &width, &height);
+      vp9_set_size_literal(cpi, width, height);
+      svc->resize_set = 1;
+    }
+  } else {
+    cpi->resize_pending = 0;
+    svc->resize_set = 0;
+  }
 }
 
 void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
-  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+  if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && rc->frames_to_key == 0) ||
+      (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) {
     cm->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        cm->current_video_frame != 0 && rc->frames_to_key == 0;
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
     rc->source_alt_ref_active = 0;
@@ -1722,12 +2534,15 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     vp9_cyclic_refresh_update_parameters(cpi);
 
-  if (cm->frame_type == KEY_FRAME)
-    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  if (frame_is_intra_only(cm))
+    target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
   else
-    target = calc_pframe_target_size_one_pass_cbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
+
+  if (cm->show_frame) vp9_update_buffer_level_preencode(cpi);
+
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
     cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
   else
@@ -1789,29 +2604,46 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
     rc->min_gf_interval = FIXED_GF_INTERVAL;
     rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
   } else {
+    double framerate = cpi->framerate;
     // Set Maximum gf/arf interval
     rc->max_gf_interval = oxcf->max_gf_interval;
     rc->min_gf_interval = oxcf->min_gf_interval;
-    if (rc->min_gf_interval == 0)
+    if (rc->min_gf_interval == 0) {
       rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-          oxcf->width, oxcf->height, cpi->framerate);
-    if (rc->max_gf_interval == 0)
-      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-          cpi->framerate, rc->min_gf_interval);
-
-    // Extended interval for genuinely static scenes
-    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
-
-    if (is_altref_enabled(cpi)) {
-      if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-        rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+          oxcf->width, oxcf->height, framerate);
     }
+    if (rc->max_gf_interval == 0) {
+      rc->max_gf_interval =
+          vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval);
+    }
+
+    // Extended max interval for genuinely static scenes like slide shows.
+    rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
 
     if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
       rc->max_gf_interval = rc->static_scene_max_gf_interval;
 
     // Clamp min to max
     rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
+
+    if (oxcf->target_level == LEVEL_AUTO) {
+      const uint32_t pic_size = cpi->common.width * cpi->common.height;
+      const uint32_t pic_breadth =
+          VPXMAX(cpi->common.width, cpi->common.height);
+      int i;
+      for (i = 0; i < VP9_LEVELS; ++i) {
+        if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+            vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
+          if (rc->min_gf_interval <=
+              (int)vp9_level_defs[i].min_altref_distance) {
+            rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance;
+            rc->max_gf_interval =
+                VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
+          }
+          break;
+        }
+      }
+    }
   }
 }
 
@@ -1819,27 +2651,29 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
-  int vbr_max_bits;
 
-  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
-  rc->min_frame_bandwidth =
-      (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+  rc->avg_frame_bandwidth = saturate_cast_double_to_int(
+      round(oxcf->target_bandwidth / cpi->framerate));
 
-  rc->min_frame_bandwidth =
-      VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+  int64_t vbr_min_bits =
+      (int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100;
+  vbr_min_bits = VPXMIN(vbr_min_bits, INT_MAX);
+
+  rc->min_frame_bandwidth = VPXMAX((int)vbr_min_bits, FRAME_OVERHEAD_BITS);
 
   // A maximum bitrate for a frame is defined.
-  // The baseline for this aligns with HW implementations that
-  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
-  // per 16x16 MB (averaged over a frame). However this limit is extended if
-  // a very high rate is given on the command line or the the rate cannnot
-  // be acheived because of a user specificed max q (e.g. when the user
-  // specifies lossless encode.
-  vbr_max_bits =
-      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
-            100);
+  // However this limit is extended if a very high rate is given on the command
+  // line or the rate can not be achieved because of a user specified max q
+  // (e.g. when the user specifies lossless encode).
+  //
+  // If a level is specified that requires a lower maximum rate then the level
+  // value take precedence.
+  int64_t vbr_max_bits =
+      (int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section / 100;
+  vbr_max_bits = VPXMIN(vbr_max_bits, INT_MAX);
+
   rc->max_frame_bandwidth =
-      VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+      VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits);
 
   vp9_rc_set_gf_interval_range(cpi, rc);
 }
@@ -1849,44 +2683,43 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
 static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
   int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
-  int max_delta;
-  int frame_window = VPXMIN(16, ((int)cpi->twopass.total_stats.count -
-                                 cpi->common.current_video_frame));
+  int64_t frame_target = *this_frame_target;
+  int frame_window = (int)VPXMIN(
+      16, cpi->twopass.total_stats.count - cpi->common.current_video_frame);
 
   // Calcluate the adjustment to rate for this frame.
   if (frame_window > 0) {
-    max_delta = (vbr_bits_off_target > 0)
-                    ? (int)(vbr_bits_off_target / frame_window)
-                    : (int)(-vbr_bits_off_target / frame_window);
+    int64_t max_delta = (vbr_bits_off_target > 0)
+                            ? (vbr_bits_off_target / frame_window)
+                            : (-vbr_bits_off_target / frame_window);
 
-    max_delta = VPXMIN(max_delta,
-                       ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+    max_delta =
+        VPXMIN(max_delta, ((frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
 
     // vbr_bits_off_target > 0 means we have extra bits to spend
     if (vbr_bits_off_target > 0) {
-      *this_frame_target += (vbr_bits_off_target > max_delta)
-                                ? max_delta
-                                : (int)vbr_bits_off_target;
+      frame_target += VPXMIN(vbr_bits_off_target, max_delta);
     } else {
-      *this_frame_target -= (vbr_bits_off_target < -max_delta)
-                                ? max_delta
-                                : (int)-vbr_bits_off_target;
+      frame_target -= VPXMIN(-vbr_bits_off_target, max_delta);
     }
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
-  // Dont do it for kf,arf,gf or overlay frames.
+  // Don't do it for kf,arf,gf or overlay frames.
   if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
       rc->vbr_bits_off_target_fast) {
-    int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
-    int fast_extra_bits;
-    fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
-    fast_extra_bits = (int)VPXMIN(
-        fast_extra_bits,
-        VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
-    *this_frame_target += (int)fast_extra_bits;
+    int64_t one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, frame_target);
+    int64_t fast_extra_bits =
+        VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits =
+        VPXMIN(fast_extra_bits,
+               VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+    frame_target += fast_extra_bits;
     rc->vbr_bits_off_target_fast -= fast_extra_bits;
   }
+
+  // Clamp the target for the frame to the maximum allowed for one frame.
+  *this_frame_target = (int)VPXMIN(frame_target, INT_MAX);
 }
 
 void vp9_set_target_rate(VP9_COMP *cpi) {
@@ -1898,9 +2731,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
   else
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
 
-  // Correction to rate target based on prior over or under shoot.
-  if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
-    vbr_rate_correction(cpi, &target_rate);
+  if (!cpi->oxcf.vbr_corpus_complexity) {
+    // Correction to rate target based on prior over or under shoot.
+    if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
+      vbr_rate_correction(cpi, &target_rate);
+  }
   vp9_rc_set_frame_target(cpi, target_rate);
 }
 
@@ -1912,9 +2747,11 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   RESIZE_ACTION resize_action = NO_RESIZE;
   int avg_qp_thr1 = 70;
   int avg_qp_thr2 = 50;
-  int min_width = 180;
-  int min_height = 180;
+  // Don't allow for resized frame to go below 320x180, resize in steps of 3/4.
+  int min_width = (320 * 4) / 3;
+  int min_height = (180 * 4) / 3;
   int down_size_on = 1;
+  int force_downsize_rate = 0;
   cpi->resize_scale_num = 1;
   cpi->resize_scale_den = 1;
   // Don't resize on key frame; reset the counters on key frame.
@@ -1923,20 +2760,9 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
     cpi->resize_count = 0;
     return 0;
   }
-  // Check current frame reslution to avoid generating frames smaller than
-  // the minimum resolution.
-  if (ONEHALFONLY_RESIZE) {
-    if ((cm->width >> 1) < min_width || (cm->height >> 1) < min_height)
-      down_size_on = 0;
-  } else {
-    if (cpi->resize_state == ORIG &&
-        (cm->width * 3 / 4 < min_width || cm->height * 3 / 4 < min_height))
-      return 0;
-    else if (cpi->resize_state == THREE_QUARTER &&
-             ((cpi->oxcf.width >> 1) < min_width ||
-              (cpi->oxcf.height >> 1) < min_height))
-      down_size_on = 0;
-  }
+
+  // No resizing down if frame size is below some limit.
+  if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   // If denoiser is on, apply a smaller qp threshold.
@@ -1946,11 +2772,32 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   }
 #endif
 
+  // Force downsize based on per-frame-bandwidth, for extreme case,
+  // for HD input.
+  if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) {
+    if (rc->avg_frame_bandwidth < 300000 / 30) {
+      resize_action = DOWN_ONEHALF;
+      cpi->resize_state = ONE_HALF;
+      force_downsize_rate = 1;
+    } else if (rc->avg_frame_bandwidth < 400000 / 30) {
+      resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR;
+      cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER;
+      force_downsize_rate = 1;
+    }
+  } else if (cpi->resize_state == THREE_QUARTER &&
+             cm->width * cm->height >= 960 * 540) {
+    if (rc->avg_frame_bandwidth < 300000 / 30) {
+      resize_action = DOWN_ONEHALF;
+      cpi->resize_state = ONE_HALF;
+      force_downsize_rate = 1;
+    }
+  }
+
   // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
-  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
-    const int window = (int)(4 * cpi->framerate);
-    cpi->resize_avg_qp += cm->base_qindex;
+  if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) {
+    const int window = VPXMIN(30, (int)round(2 * cpi->framerate));
+    cpi->resize_avg_qp += rc->last_q[INTER_FRAME];
     if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
       ++cpi->resize_buffer_underflow;
     ++cpi->resize_count;
@@ -1962,8 +2809,9 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
       // Resize back up if average QP is low, and we are currently in a resized
       // down state, i.e. 1/2 or 3/4 of original resolution.
       // Currently, use a flag to turn 3/4 resizing feature on/off.
-      if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
-        if (cpi->resize_state == THREE_QUARTER && down_size_on) {
+      if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2) &&
+          down_size_on) {
+        if (cpi->resize_state == THREE_QUARTER) {
           resize_action = DOWN_ONEHALF;
           cpi->resize_state = ONE_HALF;
         } else if (cpi->resize_state == ORIG) {
@@ -2010,7 +2858,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
     // Reset buffer level to optimal, update target size.
     rc->buffer_level = rc->optimal_buffer_level;
     rc->bits_off_target = rc->optimal_buffer_level;
-    rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+    rc->this_frame_target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
     // Get the projected qindex, based on the scaled target frame size (scaled
     // so target_bits_per_mb in vp9_rc_regulate_q will be correct target).
     target_bits_per_frame = (resize_action >= 0)
@@ -2035,7 +2883,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   return resize_action;
 }
 
-void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
+static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
+                                             uint64_t avg_sad_current) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
@@ -2046,7 +2895,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
   uint64_t avg_source_sad_lag = avg_sad_current;
   int high_source_sad_lagindex = -1;
   int steady_sad_lagindex = -1;
-  uint32_t sad_thresh1 = 60000;
+  uint32_t sad_thresh1 = 70000;
   uint32_t sad_thresh2 = 120000;
   int low_content = 0;
   int high_content = 0;
@@ -2135,9 +2984,14 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
     // Adjust factors for active_worst setting & af_ratio for next gf interval.
     rc->fac_active_worst_inter = 150;  // corresponds to 3/2 (= 150 /100).
     rc->fac_active_worst_gf = 100;
-    if (rate_err < 1.5 && !high_content) {
+    if (rate_err < 2.0 && !high_content) {
       rc->fac_active_worst_inter = 120;
       rc->fac_active_worst_gf = 90;
+    } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) {
+      // Increase active_worst faster at low Q if rate fluctuation is high.
+      rc->fac_active_worst_inter = 200;
+      if (rc->avg_frame_qindex[INTER_FRAME] < 8)
+        rc->fac_active_worst_inter = 400;
     }
     if (low_content && rc->avg_frame_low_motion > 80) {
       rc->af_ratio_onepass_vbr = 15;
@@ -2145,11 +2999,16 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
       rc->af_ratio_onepass_vbr = 5;
       rc->gfu_boost = DEFAULT_GF_BOOST >> 2;
     }
-#if USE_ALTREF_FOR_ONE_PASS
-    if (cpi->oxcf.enable_auto_arf) {
-      // Don't use alt-ref if there is a scene cut within the group,
-      // or content is not low.
-      if ((rc->high_source_sad_lagindex > 0 &&
+    if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) {
+      // Flag to disable usage of ARF based on past usage, only allow this
+      // disabling if current frame/group does not start with key frame or
+      // scene cut. Note perc_arf_usage is only computed for speed >= 5.
+      int arf_usage_low =
+          (cm->frame_type != KEY_FRAME && !rc->high_source_sad &&
+           cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5);
+      // Don't use alt-ref for this group under certain conditions.
+      if (arf_usage_low ||
+          (rc->high_source_sad_lagindex > 0 &&
            rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) ||
           (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) {
         rc->source_alt_ref_pending = 0;
@@ -2158,13 +3017,13 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
         rc->source_alt_ref_pending = 1;
         rc->alt_ref_gf_group = 1;
         // If alt-ref is used for this gf group, limit the interval.
-        if (rc->baseline_gf_interval > 10 &&
-            rc->baseline_gf_interval < rc->frames_to_key)
-          rc->baseline_gf_interval = 10;
+        if (rc->baseline_gf_interval > 12) {
+          rc->baseline_gf_interval = 12;
+          rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+        }
       }
     }
-#endif
-    target = calc_pframe_target_size_one_pass_vbr(cpi);
+    target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
     vp9_rc_set_frame_target(cpi, target);
   }
   rc->prev_avg_source_sad_lag = avg_source_sad_lag;
@@ -2175,27 +3034,59 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
 // in content and allow rate control to react.
 // This function also handles special case of lag_in_frames, to measure content
 // level in #future frames set by the lag_in_frames.
-void vp9_avg_source_sad(VP9_COMP *cpi) {
+void vp9_scene_detection_onepass(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source;
+  YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source;
+  uint8_t *src_y;
+  int src_ystride;
+  int src_width;
+  int src_height;
+  uint8_t *last_src_y;
+  int last_src_ystride;
+  int last_src_width;
+  int last_src_height;
+  if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL ||
+      (cpi->use_svc && cpi->svc.current_superframe == 0))
+    return;
+  src_y = unscaled_src->y_buffer;
+  src_ystride = unscaled_src->y_stride;
+  src_width = unscaled_src->y_width;
+  src_height = unscaled_src->y_height;
+  last_src_y = unscaled_last_src->y_buffer;
+  last_src_ystride = unscaled_last_src->y_stride;
+  last_src_width = unscaled_last_src->y_width;
+  last_src_height = unscaled_last_src->y_height;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) return;
+#endif
   rc->high_source_sad = 0;
-  if (cpi->Last_Source != NULL &&
-      cpi->Last_Source->y_width == cpi->Source->y_width &&
-      cpi->Last_Source->y_height == cpi->Source->y_height) {
+  rc->high_num_blocks_with_motion = 0;
+  // For SVC: scene detection is only checked on first spatial layer of
+  // the superframe using the original/unscaled resolutions.
+  if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+      src_width == last_src_width && src_height == last_src_height) {
     YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-    uint8_t *src_y = cpi->Source->y_buffer;
-    int src_ystride = cpi->Source->y_stride;
-    uint8_t *last_src_y = cpi->Last_Source->y_buffer;
-    int last_src_ystride = cpi->Last_Source->y_stride;
+    int num_mi_cols = cm->mi_cols;
+    int num_mi_rows = cm->mi_rows;
     int start_frame = 0;
     int frames_to_buffer = 1;
     int frame = 0;
+    int scene_cut_force_key_frame = 0;
+    int num_zero_temp_sad = 0;
     uint64_t avg_sad_current = 0;
-    uint32_t min_thresh = 4000;
+    uint32_t min_thresh = 20000;  // ~5 * 64 * 64
     float thresh = 8.0f;
-    if (cpi->oxcf.rc_mode == VPX_VBR) {
-      min_thresh = 60000;
-      thresh = 2.1f;
+    uint32_t thresh_key = 140000;
+    if (cpi->oxcf.speed <= 5) thresh_key = 240000;
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000;
+    if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f;
+    if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) {
+      const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2);
+      const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2);
+      num_mi_cols = aligned_width >> MI_SIZE_LOG2;
+      num_mi_rows = aligned_height >> MI_SIZE_LOG2;
     }
     if (cpi->oxcf.lag_in_frames > 0) {
       frames_to_buffer = (cm->current_video_frame == 1)
@@ -2220,6 +3111,8 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
         rc->high_source_sad = 1;
       else
         rc->high_source_sad = 0;
+      if (rc->high_source_sad && avg_sad_current > thresh_key)
+        scene_cut_force_key_frame = 1;
       // Update recursive average for current frame.
       if (avg_sad_current > 0)
         rc->avg_source_sad[0] =
@@ -2239,27 +3132,29 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
         const BLOCK_SIZE bsize = BLOCK_64X64;
         // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
         uint64_t avg_sad = 0;
+        uint64_t tmp_sad = 0;
         int num_samples = 0;
-        int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
-        int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+        int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+        int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
         if (cpi->oxcf.lag_in_frames > 0) {
           src_y = frames[frame]->y_buffer;
           src_ystride = frames[frame]->y_stride;
           last_src_y = frames[frame + 1]->y_buffer;
           last_src_ystride = frames[frame + 1]->y_stride;
         }
+        num_zero_temp_sad = 0;
         for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
           for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
             // Checker-board pattern, ignore boundary.
-            // If the partition copy is on, compute for every superblock.
-            if (cpi->sf.copy_partition_flag ||
-                ((sbi_row > 0 && sbi_col > 0) &&
+            if (((sbi_row > 0 && sbi_col > 0) &&
                  (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
                  ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
                   (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
+              tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+                                               last_src_ystride);
+              avg_sad += tmp_sad;
               num_samples++;
-              avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
-                                                last_src_ystride);
+              if (tmp_sad == 0) num_zero_temp_sad++;
             }
             src_y += 64;
             last_src_y += 64;
@@ -2276,17 +3171,52 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
           if (avg_sad >
                   VPXMAX(min_thresh,
                          (unsigned int)(rc->avg_source_sad[0] * thresh)) &&
-              rc->frames_since_key > 1)
+              rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+              num_zero_temp_sad < 3 * (num_samples >> 2))
             rc->high_source_sad = 1;
           else
             rc->high_source_sad = 0;
+          if (rc->high_source_sad && avg_sad > thresh_key)
+            scene_cut_force_key_frame = 1;
           if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
             rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2;
         } else {
           rc->avg_source_sad[lagframe_idx] = avg_sad;
         }
+        if (num_zero_temp_sad < (3 * num_samples >> 2))
+          rc->high_num_blocks_with_motion = 1;
       }
     }
+    // For CBR non-screen content mode, check if we should reset the rate
+    // control. Reset is done if high_source_sad is detected and the rate
+    // control is at very low QP with rate correction factor at min level.
+    if (cpi->oxcf.rc_mode == VPX_CBR &&
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) {
+      if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality &&
+          rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) &&
+          rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) {
+        rc->rate_correction_factors[INTER_NORMAL] = 0.5;
+        rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+        rc->buffer_level = rc->optimal_buffer_level;
+        rc->bits_off_target = rc->optimal_buffer_level;
+        rc->reset_high_source_sad = 1;
+      }
+      if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad)
+        rc->this_frame_target = rc->avg_frame_bandwidth;
+    }
+    // For SVC the new (updated) avg_source_sad[0] for the current superframe
+    // updates the setting for all layers.
+    if (cpi->use_svc) {
+      int sl, tl;
+      SVC *const svc = &cpi->svc;
+      for (sl = 0; sl < svc->number_spatial_layers; ++sl)
+        for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+          int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+          RATE_CONTROL *const lrc = &lc->rc;
+          lrc->avg_source_sad[0] = rc->avg_source_sad[0];
+        }
+    }
     // For VBR, under scene change/high content change, force golden refresh.
     if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME &&
         rc->high_source_sad && rc->frames_to_key > 3 &&
@@ -2294,16 +3224,16 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
         cpi->ext_refresh_frame_flags_pending == 0) {
       int target;
       cpi->refresh_golden_frame = 1;
+      if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME;
       rc->source_alt_ref_pending = 0;
-#if USE_ALTREF_FOR_ONE_PASS
-      if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1;
-#endif
+      if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf)
+        rc->source_alt_ref_pending = 1;
       rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
       rc->baseline_gf_interval =
           VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
       adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
       rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-      target = calc_pframe_target_size_one_pass_vbr(cpi);
+      target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
       vp9_rc_set_frame_target(cpi, target);
       rc->count_last_scene_change = 0;
     } else {
@@ -2317,21 +3247,65 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
 
 // Test if encoded frame will significantly overshoot the target bitrate, and
 // if so, set the QP, reset/adjust some rate control parameters, and return 1.
+// frame_size = -1 means frame has not been encoded.
 int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  int thresh_qp = 3 * (rc->worst_quality >> 2);
-  int thresh_rate = rc->avg_frame_bandwidth * 10;
-  if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  int thresh_qp = 7 * (rc->worst_quality >> 3);
+  int thresh_rate = rc->avg_frame_bandwidth << 3;
+  // Lower thresh_qp for video (more overshoot at lower Q) to be
+  // more conservative for video.
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+    thresh_qp = 3 * (rc->worst_quality >> 2);
+  // If this decision is not based on an encoded frame size but just on
+  // scene/slide change detection (i.e., re_encode_overshoot_cbr_rt ==
+  // FAST_DETECTION_MAXQ), for now skip the (frame_size > thresh_rate)
+  // condition in this case.
+  // TODO(marpan): Use a better size/rate condition for this case and
+  // adjust thresholds.
+  if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ ||
+       frame_size > thresh_rate) &&
+      cm->base_qindex < thresh_qp) {
     double rate_correction_factor =
         cpi->rc.rate_correction_factors[INTER_NORMAL];
     const int target_size = cpi->rc.avg_frame_bandwidth;
+    const uint64_t sad_thr = 64 * 64 * 32;
+    int force_maxqp = 1;
     double new_correction_factor;
     int target_bits_per_mb;
     double q2;
     int enumerator;
-    // Force a re-encode, and for now use max-QP.
-    *q = cpi->rc.worst_quality;
+    // Set a larger QP.
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+        (rc->buffer_level > (3 * rc->optimal_buffer_level) >> 2) &&
+        (cpi->rc.avg_source_sad[0] < sad_thr)) {
+      *q = (*q + cpi->rc.worst_quality) >> 1;
+      force_maxqp = 0;
+    } else {
+      *q = cpi->rc.worst_quality;
+    }
+    cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+    cpi->rc.re_encode_maxq_scene_change = 1;
+    // If the frame_size is much larger than the threshold (big content change)
+    // and the encoded frame used alot of Intra modes, then force hybrid_intra
+    // encoding for the re-encode on this scene change. hybrid_intra will
+    // use rd-based intra mode selection for small blocks.
+    if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ &&
+        frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) {
+      MODE_INFO **mi = cm->mi_grid_visible;
+      int sum_intra_usage = 0;
+      int mi_row, mi_col;
+      for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+        for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+          if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++;
+          mi++;
+        }
+        mi += 8;
+      }
+      sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols);
+      if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1;
+    }
     // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
     // these parameters will affect QP selection for subsequent frames. If they
     // have settled down to a very different (low QP) state, then not adjusting
@@ -2360,20 +3334,32 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
     }
     // For temporal layers, reset the rate control parametes across all
     // temporal layers.
+    // If the first_spatial_layer_to_encode > 0, then this superframe has
+    // skipped lower base layers. So in this case we should also reset and
+    // force max-q for spatial layers < first_spatial_layer_to_encode.
+    // For the case of no inter-layer prediction on delta frames: reset and
+    // force max-q for all spatial layers, to avoid excessive frame drops.
     if (cpi->use_svc) {
-      int i = 0;
+      int tl = 0;
+      int sl = 0;
       SVC *svc = &cpi->svc;
-      for (i = 0; i < svc->number_temporal_layers; ++i) {
-        const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
-                                           svc->number_temporal_layers);
-        LAYER_CONTEXT *lc = &svc->layer_context[layer];
-        RATE_CONTROL *lrc = &lc->rc;
-        lrc->avg_frame_qindex[INTER_FRAME] = *q;
-        lrc->buffer_level = rc->optimal_buffer_level;
-        lrc->bits_off_target = rc->optimal_buffer_level;
-        lrc->rc_1_frame = 0;
-        lrc->rc_2_frame = 0;
-        lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+      int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode);
+      if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON)
+        num_spatial_layers = svc->number_spatial_layers;
+      for (sl = 0; sl < num_spatial_layers; ++sl) {
+        for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          lrc->avg_frame_qindex[INTER_FRAME] = *q;
+          lrc->buffer_level = lrc->optimal_buffer_level;
+          lrc->bits_off_target = lrc->optimal_buffer_level;
+          lrc->rc_1_frame = 0;
+          lrc->rc_2_frame = 0;
+          lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+          lrc->force_max_q = force_maxqp;
+        }
       }
     }
     return 1;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
index 70aef03ffb..0c61ad3461 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_RATECTRL_H_
-#define VP9_ENCODER_VP9_RATECTRL_H_
+#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_
+#define VPX_VP9_ENCODER_VP9_RATECTRL_H_
 
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
@@ -21,14 +21,30 @@
 extern "C" {
 #endif
 
+// Used to control aggressive VBR mode.
+// #define AGGRESSIVE_VBR 1
+
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
 #define MIN_GF_INTERVAL 4
 #define MAX_GF_INTERVAL 16
 #define FIXED_GF_INTERVAL 8  // Used in some testing modes only
 #define ONEHALFONLY_RESIZE 0
 
+#define FRAME_OVERHEAD_BITS 200
+
+// Threshold used to define a KF group as static (e.g. a slide show).
+// Essentially this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+
+// The maximum duration of a GF group that is static (for example a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
 typedef enum {
   INTER_NORMAL = 0,
   INTER_HIGH = 1,
@@ -70,7 +86,7 @@ static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
 static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
 
 typedef struct {
-  // Rate targetting variables
+  // Rate targeting variables
   int base_frame_target;  // A baseline frame target before adjustment
                           // for previous under or over shoot.
   int this_frame_target;  // Actual frame target after rc adjustment.
@@ -147,6 +163,8 @@ typedef struct {
   int rc_2_frame;
   int q_1_frame;
   int q_2_frame;
+  // Keep track of the last target average frame bandwidth.
+  int last_avg_frame_bandwidth;
 
   // Auto frame-scaling variables.
   FRAME_SCALE_LEVEL frame_size_selector;
@@ -160,12 +178,43 @@ typedef struct {
   uint64_t avg_source_sad[MAX_LAG_BUFFERS];
   uint64_t prev_avg_source_sad_lag;
   int high_source_sad_lagindex;
+  int high_num_blocks_with_motion;
   int alt_ref_gf_group;
+  int last_frame_is_src_altref;
   int high_source_sad;
   int count_last_scene_change;
+  int hybrid_intra_scene_change;
+  int re_encode_maxq_scene_change;
   int avg_frame_low_motion;
   int af_ratio_onepass_vbr;
   int force_qpmin;
+  int reset_high_source_sad;
+  double perc_arf_usage;
+  int force_max_q;
+  // Last frame was dropped post encode on scene change.
+  int last_post_encode_dropped_scene_change;
+  // Enable post encode frame dropping for screen content. Only enabled when
+  // ext_use_post_encode_drop is enabled by user.
+  int use_post_encode_drop;
+  // External flag to enable post encode frame dropping, controlled by user.
+  int ext_use_post_encode_drop;
+  // Flag to disable CBR feature to increase Q on overshoot detection.
+  int disable_overshoot_maxq_cbr;
+  int damped_adjustment[RATE_FACTOR_LEVELS];
+  double arf_active_best_quality_adjustment_factor;
+  int arf_increase_active_best_quality;
+
+  int preserve_arf_as_gld;
+  int preserve_next_arf_as_gld;
+  int show_arf_as_gld;
+
+  // Flag to constrain golden frame interval on key frame frequency for 1 pass
+  // VBR.
+  int constrain_gf_key_freq_onepass_vbr;
+
+  // The index of the current GOP. Start from zero.
+  // When a key frame is inserted, it resets to zero.
+  int gop_global_index;
 } RATE_CONTROL;
 
 struct VP9_COMP;
@@ -174,18 +223,20 @@ struct VP9EncoderConfig;
 void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
                  RATE_CONTROL *rc);
 
-int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
                            double correction_factor, vpx_bit_depth_t bit_depth);
 
 double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
 
+int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth);
+
 void vp9_rc_init_minq_luts(void);
 
 int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
 // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
-// be passed in to ensure that the max_gf_interval returned is at least as bis
+// be passed in to ensure that the max_gf_interval returned is at least as big
 // as that.
-int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
 
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -213,6 +264,12 @@ int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
 // encode_frame_to_data_rate() function.
 void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi);
 void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi);
+int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
+int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
+int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi);
+void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi);
 void vp9_rc_get_svc_params(struct VP9_COMP *cpi);
 
 // Post encode update of the rate control parameters based
@@ -225,13 +282,18 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
 // Changes only the rate correction factors in the rate control structure.
 void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
 
+// Post encode drop for CBR mode.
+int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size);
+
+int vp9_test_drop(struct VP9_COMP *cpi);
+
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
 int vp9_rc_drop_frame(struct VP9_COMP *cpi);
 
 // Computes frame size bounds.
 void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
-                                      int this_frame_target,
+                                      int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit);
 
@@ -278,12 +340,22 @@ void vp9_set_target_rate(struct VP9_COMP *cpi);
 
 int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
 
-void vp9_avg_source_sad(struct VP9_COMP *cpi);
+void vp9_scene_detection_onepass(struct VP9_COMP *cpi);
 
 int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
 
+void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index);
+
+void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
+
+void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi);
+
+int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi,
+                                      int *bottom_index, int *top_index,
+                                      int gf_group_index);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_RATECTRL_H_
+#endif  // VPX_VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.c b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c
index 76c639a647..95c95971c5 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rd.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c
@@ -57,6 +57,30 @@ void vp9_rd_cost_init(RD_COST *rd_cost) {
   rd_cost->rdcost = 0;
 }
 
+int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) {
+  assert(mult >= 0);
+  assert(div > 0);
+  if (rate >= 0 && dist >= 0) {
+    return RDCOST(mult, div, rate, dist);
+  }
+  if (rate >= 0 && dist < 0) {
+    return RDCOST_NEG_D(mult, div, rate, -dist);
+  }
+  if (rate < 0 && dist >= 0) {
+    return RDCOST_NEG_R(mult, div, -rate, dist);
+  }
+  return -RDCOST(mult, div, -rate, -dist);
+}
+
+void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) {
+  if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX) {
+    rd_cost->rdcost =
+        vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist);
+  } else {
+    vp9_rd_cost_reset(rd_cost);
+  }
+}
+
 // The baseline rd thresholds for breaking out of the rd loop for
 // certain modes are assumed to be based on 8x8 blocks.
 // This table is used to correct for block size.
@@ -69,10 +93,12 @@ static void fill_mode_costs(VP9_COMP *cpi) {
   const FRAME_CONTEXT *const fc = cpi->common.fc;
   int i, j;
 
-  for (i = 0; i < INTRA_MODES; ++i)
-    for (j = 0; j < INTRA_MODES; ++j)
+  for (i = 0; i < INTRA_MODES; ++i) {
+    for (j = 0; j < INTRA_MODES; ++j) {
       vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                       vp9_intra_mode_tree);
+    }
+  }
 
   vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
   for (i = 0; i < INTRA_MODES; ++i) {
@@ -82,9 +108,28 @@ static void fill_mode_costs(VP9_COMP *cpi) {
                     fc->uv_mode_prob[i], vp9_intra_mode_tree);
   }
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
     vp9_cost_tokens(cpi->switchable_interp_costs[i],
                     fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
+  }
+
+  for (i = TX_8X8; i < TX_SIZES; ++i) {
+    for (j = 0; j < TX_SIZE_CONTEXTS; ++j) {
+      const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs);
+      int k;
+      for (k = 0; k <= i; ++k) {
+        int cost = 0;
+        int m;
+        for (m = 0; m <= k - (k == i); ++m) {
+          if (m == k)
+            cost += vp9_cost_zero(tx_probs[m]);
+          else
+            cost += vp9_cost_one(tx_probs[m]);
+        }
+        cpi->tx_size_cost[i - 1][j][k] = cost;
+      }
+    }
+  }
 }
 
 static void fill_token_costs(vp9_coeff_cost *c,
@@ -143,34 +188,125 @@ void vp9_init_me_luts(void) {
 
 static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                          8,  8,  4,  4,  2,  2,  1,  0 };
-static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
-                                                              128, 144 };
 
-int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
-  const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
-#if CONFIG_VP9_HIGHBITDEPTH
-  int64_t rdmult = 0;
-  switch (cpi->common.bit_depth) {
-    case VPX_BITS_8: rdmult = 88 * q * q / 24; break;
-    case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
-    case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
-    default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-      return -1;
+// Note that the element below for frame type "USE_BUF_FRAME", which indicates
+// that the show frame flag is set, should not be used as no real frame
+// is encoded so we should not reach here. However, a dummy value
+// is inserted here to make sure the data structure has the right number
+// of values assigned.
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
+                                                              128, 144, 144 };
+
+// Configure Vizier RD parameters.
+// Later this function will use passed in command line values.
+void vp9_init_rd_parameters(VP9_COMP *cpi) {
+  RD_CONTROL *const rdc = &cpi->rd_ctrl;
+
+  // When |use_vizier_rc_params| is 1, we expect the rd parameters have been
+  // initialized by the pass in values.
+  // Be careful that parameters below are only initialized to 1, if we do not
+  // pass values to them. It is desired to take care of each parameter when
+  // using |use_vizier_rc_params|.
+  if (cpi->twopass.use_vizier_rc_params) return;
+
+  // Make sure this function is floating point safe.
+  vpx_clear_system_state();
+
+  rdc->rd_mult_inter_qp_fac = 1.0;
+  rdc->rd_mult_arf_qp_fac = 1.0;
+  rdc->rd_mult_key_qp_fac = 1.0;
+}
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_inter_rd_multiplier(int qindex) {
+  return 4.15 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden Frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_arf_rd_multiplier(int qindex) {
+  return 4.25 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_kf_rd_multiplier(int qindex) {
+  return 4.35 + (0.001 * (double)qindex);
+}
+
+int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
+  const RD_CONTROL *rdc = &cpi->rd_ctrl;
+  const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
+  // largest dc_quant is 21387, therefore rdmult should fit in int32_t
+  int rdmult = q * q;
+
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+      cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+    return cpi->ext_ratectrl.ext_rdmult;
+  }
+
+  // Make sure this function is floating point safe.
+  vpx_clear_system_state();
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    double def_rd_q_mult = def_kf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac);
+  } else if (!cpi->rc.is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+    double def_rd_q_mult = def_arf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac);
+  } else {
+    double def_rd_q_mult = def_inter_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac);
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  switch (cpi->common.bit_depth) {
+    case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
+    case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
+    default: break;
   }
-#else
-  int64_t rdmult = 88 * q * q / 24;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  return rdmult > 0 ? rdmult : 1;
+}
+
+static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
+  int64_t rdmult_64 = rdmult;
   if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
     const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
-    const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100));
+    const int gfu_boost = cpi->multi_layer_arf
+                              ? gf_group->gfu_boost[gf_group->index]
+                              : cpi->rc.gfu_boost;
+    const int boost_index = VPXMIN(15, (gfu_boost / 100));
 
-    rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
-    rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+    rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7;
+    rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7);
   }
-  if (rdmult < 1) rdmult = 1;
-  return (int)rdmult;
+  return (int)rdmult_64;
+}
+
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
+  int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+      cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+    return cpi->ext_ratectrl.ext_rdmult;
+  }
+  return modulate_rdmult(cpi, rdmult);
+}
+
+int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) {
+  int rdmult =
+      vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex);
+  rdmult = (int)((double)rdmult / beta);
+  rdmult = rdmult > 0 ? rdmult : 1;
+  return modulate_rdmult(cpi, rdmult);
 }
 
 static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
@@ -179,10 +315,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
   switch (bit_depth) {
     case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break;
     case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break;
-    case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break;
     default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-      return -1;
+      assert(bit_depth == VPX_BITS_12);
+      q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
+      break;
   }
 #else
   (void)bit_depth;
@@ -203,12 +339,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
       x->sadperbit16 = sad_per_bit16lut_10[qindex];
       x->sadperbit4 = sad_per_bit4lut_10[qindex];
       break;
-    case VPX_BITS_12:
+    default:
+      assert(cpi->common.bit_depth == VPX_BITS_12);
       x->sadperbit16 = sad_per_bit16lut_12[qindex];
       x->sadperbit4 = sad_per_bit4lut_12[qindex];
       break;
-    default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
   }
 #else
   (void)cpi;
@@ -249,6 +384,15 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
   }
 }
 
+void vp9_build_inter_mode_cost(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  int i;
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+    vp9_cost_tokens((int *)cpi->inter_mode_cost[i], cm->fc->inter_mode_probs[i],
+                    vp9_inter_mode_tree);
+  }
+}
+
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
@@ -297,72 +441,68 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
             x->nmvjointcost,
             cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
             &cm->fc->nmvc, cm->allow_high_precision_mv);
-
-        for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-          vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
-                          cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
+        vp9_build_inter_mode_cost(cpi);
       }
     }
   }
 }
 
+// NOTE: The tables below must be of the same size.
+
+// The functions described below are sampled at the four most significant
+// bits of x^2 + 8 / 256.
+
+// Normalized rate:
+// This table models the rate for a Laplacian source with given variance
+// when quantized with a uniform quantizer with given stepsize. The
+// closed form expression is:
+// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+// and H(x) is the binary entropy function.
+static const int rate_tab_q10[] = {
+  65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044,
+  3958,  3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037,
+  2952,  2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179,
+  2130,  2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398,
+  1342,  1290, 1243, 1199, 1159, 1086, 1021, 963,  911,  864,  821,  781,  745,
+  680,   623,  574,  530,  490,  455,  424,  395,  345,  304,  269,  239,  213,
+  190,   171,  154,  126,  104,  87,   73,   61,   52,   44,   38,   28,   21,
+  16,    12,   10,   8,    6,    5,    3,    2,    1,    1,    1,    0,    0,
+};
+
+// Normalized distortion:
+// This table models the normalized distortion for a Laplacian source
+// with given variance when quantized with a uniform quantizer
+// with given stepsize. The closed form expression is:
+// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+// where x = qpstep / sqrt(variance).
+// Note the actual distortion is Dn * variance.
+static const int dist_tab_q10[] = {
+  0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,    5,
+  6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,   18,   21,
+  24,   26,   29,   31,   34,   36,   39,   44,   49,   54,   59,   64,   69,
+  73,   78,   88,   97,   106,  115,  124,  133,  142,  151,  167,  184,  200,
+  215,  231,  245,  260,  274,  301,  327,  351,  375,  397,  418,  439,  458,
+  495,  528,  559,  587,  613,  637,  659,  680,  717,  749,  777,  801,  823,
+  842,  859,  874,  899,  919,  936,  949,  960,  969,  977,  983,  994,  1001,
+  1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+};
+static const int xsq_iq_q10[] = {
+  0,      4,      8,      12,     16,     20,     24,     28,     32,
+  40,     48,     56,     64,     72,     80,     88,     96,     112,
+  128,    144,    160,    176,    192,    208,    224,    256,    288,
+  320,    352,    384,    416,    448,    480,    544,    608,    672,
+  736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
+  1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
+  3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
+  7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
+  16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
+  36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
+  81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
+  180192, 196576, 212960, 229344, 245728,
+};
+
 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
-  // NOTE: The tables below must be of the same size.
-
-  // The functions described below are sampled at the four most significant
-  // bits of x^2 + 8 / 256.
-
-  // Normalized rate:
-  // This table models the rate for a Laplacian source with given variance
-  // when quantized with a uniform quantizer with given stepsize. The
-  // closed form expression is:
-  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
-  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
-  // and H(x) is the binary entropy function.
-  static const int rate_tab_q10[] = {
-    65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
-    4044,  3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
-    3133,  3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
-    2290,  2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
-    1608,  1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
-    911,   864,  821,  781,  745,  680,  623,  574,  530,  490,  455,  424,
-    395,   345,  304,  269,  239,  213,  190,  171,  154,  126,  104,  87,
-    73,    61,   52,   44,   38,   28,   21,   16,   12,   10,   8,    6,
-    5,     3,    2,    1,    1,    1,    0,    0,
-  };
-
-  // Normalized distortion:
-  // This table models the normalized distortion for a Laplacian source
-  // with given variance when quantized with a uniform quantizer
-  // with given stepsize. The closed form expression is:
-  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
-  // where x = qpstep / sqrt(variance).
-  // Note the actual distortion is Dn * variance.
-  static const int dist_tab_q10[] = {
-    0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,
-    5,    6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,
-    18,   21,   24,   26,   29,   31,   34,   36,   39,   44,   49,   54,
-    59,   64,   69,   73,   78,   88,   97,   106,  115,  124,  133,  142,
-    151,  167,  184,  200,  215,  231,  245,  260,  274,  301,  327,  351,
-    375,  397,  418,  439,  458,  495,  528,  559,  587,  613,  637,  659,
-    680,  717,  749,  777,  801,  823,  842,  859,  874,  899,  919,  936,
-    949,  960,  969,  977,  983,  994,  1001, 1006, 1010, 1013, 1015, 1017,
-    1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
-  };
-  static const int xsq_iq_q10[] = {
-    0,      4,      8,      12,     16,     20,     24,     28,     32,
-    40,     48,     56,     64,     72,     80,     88,     96,     112,
-    128,    144,    160,    176,    192,    208,    224,    256,    288,
-    320,    352,    384,    416,    448,    480,    544,    608,    672,
-    736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
-    1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
-    3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
-    7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
-    16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
-    36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
-    81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
-    180192, 196576, 212960, 229344, 245728,
-  };
   const int tmp = (xsq_q10 >> 2) + 8;
   const int k = get_msb(tmp) - 3;
   const int xq = (k << 3) + ((tmp >> k) & 0x7);
@@ -373,6 +513,8 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
 }
 
+static const uint32_t MAX_XSQ_Q10 = 245727;
+
 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist) {
@@ -387,7 +529,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
     *dist = 0;
   } else {
     int d_q10, r_q10;
-    static const uint32_t MAX_XSQ_Q10 = 245727;
     const uint64_t xsq_q10_64 =
         (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
     const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
@@ -397,6 +538,12 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
   }
 }
 
+// Disable gcc 12.2 false positive warning.
+// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
 void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                               const struct macroblockd_plane *pd,
                               ENTROPY_CONTEXT t_above[16],
@@ -425,15 +572,18 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
       for (i = 0; i < num_4x4_h; i += 4)
         t_left[i] = !!*(const uint32_t *)&left[i];
       break;
-    case TX_32X32:
+    default:
+      assert(tx_size == TX_32X32);
       for (i = 0; i < num_4x4_w; i += 8)
         t_above[i] = !!*(const uint64_t *)&above[i];
       for (i = 0; i < num_4x4_h; i += 8)
         t_left[i] = !!*(const uint64_t *)&left[i];
       break;
-    default: assert(0 && "Invalid transform size."); break;
   }
 }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
 
 void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                  int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
@@ -447,8 +597,7 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
   const int num_mv_refs =
-      MAX_MV_REF_CANDIDATES +
-      (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size);
+      MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size);
 
   MV pred_mv[3];
   pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
@@ -458,11 +607,12 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
 
   near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
                       x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
+
   // Get the sad for each candidate reference mv.
   for (i = 0; i < num_mv_refs; ++i) {
     const MV *this_mv = &pred_mv[i];
     int fp_row, fp_col;
-
+    if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue;
     if (i == 1 && near_same_nearest) continue;
     fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
     fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
@@ -527,6 +677,7 @@ YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
   const VP9_COMMON *const cm = &cpi->common;
   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
   const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
   return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
              ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
              : NULL;
@@ -624,19 +775,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
   }
 }
 
-int vp9_get_intra_cost_penalty(int qindex, int qdelta,
-                               vpx_bit_depth_t bit_depth) {
-  const int q = vp9_dc_quant(qindex, qdelta, bit_depth);
-#if CONFIG_VP9_HIGHBITDEPTH
-  switch (bit_depth) {
-    case VPX_BITS_8: return 20 * q;
-    case VPX_BITS_10: return 5 * q;
-    case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
-    default:
-      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-      return -1;
-  }
-#else
-  return 20 * q;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
+                               int qindex, int qdelta) {
+  // Reduce the intra cost penalty for small blocks (<=16x16).
+  int reduction_fac =
+      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
+    // Don't reduce intra cost penalty if estimated noise level is high.
+    reduction_fac = 0;
+
+  // Always use VPX_BITS_8 as input here because the penalty is applied
+  // to rate not distortion so we want a consistent penalty for all bit
+  // depths. If the actual bit depth were passed in here then the value
+  // retured by vp9_dc_quant() would scale with the bit depth and we would
+  // then need to apply inverse scaling to correct back to a bit depth
+  // independent rate penalty.
+  return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac;
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h
index 05344b6cf0..6c61ae514a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_RD_H_
-#define VP9_ENCODER_VP9_RD_H_
+#ifndef VPX_VP9_ENCODER_VP9_RD_H_
+#define VPX_VP9_ENCODER_VP9_RD_H_
 
 #include <limits.h>
 
@@ -27,20 +27,27 @@ extern "C" {
 #define RD_EPB_SHIFT 6
 
 #define RDCOST(RM, DM, R, D) \
-  (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM))
+  ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM))
+#define RDCOST_NEG_R(RM, DM, R, D) \
+  ((D) << (DM)) - ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT)
+#define RDCOST_NEG_D(RM, DM, R, D) \
+  ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) - ((D) << (DM))
+
 #define QIDX_SKIP_THRESH 115
 
 #define MV_COST_WEIGHT 108
 #define MV_COST_WEIGHT_SUB 120
 
-#define INVALID_MV 0x80008000
-
 #define MAX_MODES 30
 #define MAX_REFS 6
 
+#define RD_THRESH_INIT_FACT 32
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC 1
 
+#define VP9_DIST_SCALE_LOG2 4
+#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2)
+
 // This enumerator type needs to be kept aligned with the mode order in
 // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
@@ -94,11 +101,18 @@ typedef enum {
   THR_INTRA,
 } THR_MODES_SUB8X8;
 
+typedef struct {
+  // RD multiplier control factors added for Vizier project.
+  double rd_mult_inter_qp_fac;
+  double rd_mult_arf_qp_fac;
+  double rd_mult_key_qp_fac;
+} RD_CONTROL;
+
 typedef struct RD_OPT {
   // Thresh_mult is used to set a threshold for the rd score. A higher value
   // means that we will accept the best mode so far more often. This number
-  // is used in combination with the current block size, and thresh_freq_fact
-  // to pick a threshold.
+  // is used in combination with the current block size, and thresh_freq_fact to
+  // pick a threshold.
   int thresh_mult[MAX_MODES];
   int thresh_mult_sub8x8[MAX_REFS];
 
@@ -107,9 +121,12 @@ typedef struct RD_OPT {
   int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
 
   int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
 
+  int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
   int RDMULT;
   int RDDIV;
+  double r0;
 } RD_OPT;
 
 typedef struct RD_COST {
@@ -122,19 +139,29 @@ typedef struct RD_COST {
 void vp9_rd_cost_reset(RD_COST *rd_cost);
 // Initialize the rate distortion cost values to zero.
 void vp9_rd_cost_init(RD_COST *rd_cost);
+// It supports negative rate and dist, which is different from RDCOST().
+int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist);
+// Update the cost value based on its rate and distortion.
+void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost);
 
 struct TileInfo;
 struct TileDataEnc;
 struct VP9_COMP;
 struct macroblock;
 
+void vp9_init_rd_parameters(struct VP9_COMP *cpi);
+
+int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex);
+
 int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
 
+int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta);
+
 void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
 
 void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
 
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                   unsigned int qstep, int *rate, int64_t *dist);
 
 int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
@@ -160,12 +187,12 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
 
 void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
 
-void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize,
-                               int best_mode_index);
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
-                                      int thresh_fact) {
-  return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+                                      const int *const thresh_fact) {
+  return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
 }
 
 static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
@@ -182,20 +209,27 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
                           const struct scale_factors *scale,
                           const struct scale_factors *scale_uv);
 
-int vp9_get_intra_cost_penalty(int qindex, int qdelta,
-                               vpx_bit_depth_t bit_depth);
+int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi,
+                               BLOCK_SIZE bsize, int qindex, int qdelta);
 
+unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi,
+                                  const struct buf_2d *ref, BLOCK_SIZE bs);
 unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi,
                                            const struct buf_2d *ref,
                                            BLOCK_SIZE bs);
 #if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi,
+                                       const struct buf_2d *ref, BLOCK_SIZE bs,
+                                       int bd);
 unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi,
                                                 const struct buf_2d *ref,
                                                 BLOCK_SIZE bs, int bd);
 #endif
 
+void vp9_build_inter_mode_cost(struct VP9_COMP *cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_RD_H_
+#endif  // VPX_VP9_ENCODER_VP9_RD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
index 27d4e9d6d3..f8c6f850fc 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -31,6 +31,9 @@
 #include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_aq_variance.h"
+#endif
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -40,7 +43,6 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_aq_variance.h"
 
 #define LAST_FRAME_MODE_MASK \
   ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
@@ -59,7 +61,9 @@ typedef struct {
   MV_REFERENCE_FRAME ref_frame[2];
 } MODE_DEFINITION;
 
-typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
 
 struct rdcost_block_args {
   const VP9_COMP *cpi;
@@ -73,34 +77,37 @@ struct rdcost_block_args {
   int64_t best_rd;
   int exit_early;
   int use_fast_coef_costing;
-  const scan_order *so;
+  const ScanOrder *so;
   uint8_t skippable;
+  struct buf_2d *this_recon;
 };
 
 #define LAST_NEW_MV_INDEX 6
+
+#if !CONFIG_REALTIME_ONLY
 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  { NEARESTMV, { LAST_FRAME, NONE } },
-  { NEARESTMV, { ALTREF_FRAME, NONE } },
-  { NEARESTMV, { GOLDEN_FRAME, NONE } },
+  { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } },
+  { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } },
+  { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } },
 
-  { DC_PRED, { INTRA_FRAME, NONE } },
+  { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } },
 
-  { NEWMV, { LAST_FRAME, NONE } },
-  { NEWMV, { ALTREF_FRAME, NONE } },
-  { NEWMV, { GOLDEN_FRAME, NONE } },
+  { NEWMV, { LAST_FRAME, NO_REF_FRAME } },
+  { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } },
+  { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } },
 
-  { NEARMV, { LAST_FRAME, NONE } },
-  { NEARMV, { ALTREF_FRAME, NONE } },
-  { NEARMV, { GOLDEN_FRAME, NONE } },
+  { NEARMV, { LAST_FRAME, NO_REF_FRAME } },
+  { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } },
+  { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } },
 
-  { ZEROMV, { LAST_FRAME, NONE } },
-  { ZEROMV, { GOLDEN_FRAME, NONE } },
-  { ZEROMV, { ALTREF_FRAME, NONE } },
+  { ZEROMV, { LAST_FRAME, NO_REF_FRAME } },
+  { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } },
+  { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } },
 
   { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
 
-  { TM_PRED, { INTRA_FRAME, NONE } },
+  { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } },
 
   { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
   { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
@@ -110,21 +117,22 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
   { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
 
-  { H_PRED, { INTRA_FRAME, NONE } },
-  { V_PRED, { INTRA_FRAME, NONE } },
-  { D135_PRED, { INTRA_FRAME, NONE } },
-  { D207_PRED, { INTRA_FRAME, NONE } },
-  { D153_PRED, { INTRA_FRAME, NONE } },
-  { D63_PRED, { INTRA_FRAME, NONE } },
-  { D117_PRED, { INTRA_FRAME, NONE } },
-  { D45_PRED, { INTRA_FRAME, NONE } },
+  { H_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { V_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } },
+  { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } },
 };
 
 static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
-  { { LAST_FRAME, NONE } },           { { GOLDEN_FRAME, NONE } },
-  { { ALTREF_FRAME, NONE } },         { { LAST_FRAME, ALTREF_FRAME } },
-  { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } },
+  { { LAST_FRAME, NO_REF_FRAME } },   { { GOLDEN_FRAME, NO_REF_FRAME } },
+  { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } },
+  { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } },
 };
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
                            int min_plane, int max_plane) {
@@ -151,10 +159,14 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
   }
 }
 
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
-                            MACROBLOCKD *xd, int *out_rate_sum,
-                            int64_t *out_dist_sum, int *skip_txfm_sb,
-                            int64_t *skip_sse_sb) {
+#if !CONFIG_REALTIME_ONLY
+// Planewise build inter prediction and compute rdcost with early termination
+// option
+static int build_inter_pred_model_rd_earlyterm(
+    VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+    MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+    int64_t best_rd) {
   // Note our transform coeffs are 8 times an orthogonal transform.
   // Hence quantizer step is also 8 times. To get effective quantizer
   // we need to divide by 8 before sending to modeling function.
@@ -164,12 +176,9 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
   const int ref = xd->mi[0]->ref_frame[0];
   unsigned int sse;
   unsigned int var = 0;
-  unsigned int sum_sse = 0;
   int64_t total_sse = 0;
   int skip_flag = 1;
   const int shift = 6;
-  int rate;
-  int64_t dist;
   const int dequant_shift =
 #if CONFIG_VP9_HIGHBITDEPTH
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
@@ -178,6 +187,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
 
   x->pred_sse[ref] = 0;
 
+  // Build prediction signal, compute stats and RD cost on per-plane basis
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
@@ -186,6 +196,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
     const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
     const int64_t dc_thr = p->quant_thred[0] >> shift;
     const int64_t ac_thr = p->quant_thred[1] >> shift;
+    unsigned int sum_sse = 0;
     // The low thresholds are used to measure if the prediction errors are
     // low enough so that we can skip the mode search.
     const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
@@ -195,9 +206,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
     int idx, idy;
     int lw = b_width_log2_lookup[unit_size] + 2;
     int lh = b_height_log2_lookup[unit_size] + 2;
+    unsigned int qstep;
+    unsigned int nlog2;
+    int64_t dist = 0;
 
-    sum_sse = 0;
+    // Build inter predictor
+    vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
 
+    // Compute useful stats
     for (idy = 0; idy < bh; ++idy) {
       for (idx = 0; idx < bw; ++idx) {
         uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
@@ -233,34 +249,38 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
     }
 
     total_sse += sum_sse;
+    qstep = pd->dequant[1] >> dequant_shift;
+    nlog2 = num_pels_log2_lookup[bs];
 
     // Fast approximate the modelling function.
     if (cpi->sf.simple_model_rd_from_var) {
       int64_t rate;
-      const int64_t square_error = sum_sse;
-      int quantizer = (pd->dequant[1] >> dequant_shift);
-
-      if (quantizer < 120)
-        rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
+      if (qstep < 120)
+        rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT);
       else
         rate = 0;
-      dist = (square_error * quantizer) >> 8;
+      dist = ((int64_t)sum_sse * qstep) >> 8;
       rate_sum += rate;
-      dist_sum += dist;
     } else {
-      vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
-                                   pd->dequant[1] >> dequant_shift, &rate,
-                                   &dist);
+      int rate;
+      vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist);
       rate_sum += rate;
-      dist_sum += dist;
+    }
+    dist_sum += dist;
+    if (do_earlyterm) {
+      if (RDCOST(x->rdmult, x->rddiv, rate_sum,
+                 dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd)
+        return 1;
     }
   }
-
   *skip_txfm_sb = skip_flag;
-  *skip_sse_sb = total_sse << 4;
+  *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2;
   *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum << 4;
+  *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2;
+
+  return 0;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_VP9_HIGHBITDEPTH
 int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
@@ -284,22 +304,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
   return error;
 }
 
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
-                                      const tran_low_t *dqcoeff,
-                                      intptr_t block_size, int64_t *ssz) {
-  // Note that the C versions of these 2 functions (vp9_block_error and
-  // vp9_highbd_block_error_8bit are the same, but the optimized assembly
-  // routines are not compatible in the non high bitdepth configuration, so
-  // they still cannot share the same name.
-  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
                                                const tran_low_t *dqcoeff,
                                                intptr_t block_size,
                                                int64_t *ssz, int bd) {
   if (bd == 8) {
-    return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+    return vp9_block_error(coeff, dqcoeff, block_size, ssz);
   } else {
     return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
   }
@@ -321,7 +331,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
   return error;
 }
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              int block_size) {
   int i;
   int64_t error = 0;
@@ -358,11 +368,11 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
       x->token_costs[tx_size][type][is_inter_block(mi)];
   uint8_t token_cache[32 * 32];
-  int c, cost;
+  int cost;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
 #else
-  const int *cat6_high_cost = vp9_get_high_cost_table(8);
+  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
 #endif
 
   // Check for consistency of tx_size with mode info
@@ -373,10 +383,10 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   if (eob == 0) {
     // single eob token
     cost = token_costs[0][0][pt][EOB_TOKEN];
-    c = 0;
   } else {
     if (use_fast_coef_costing) {
       int band_left = *band_count++;
+      int c;
 
       // dc token
       int v = qcoeff[0];
@@ -407,6 +417,7 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
 
     } else {  // !use_fast_coef_costing
       int band_left = *band_count++;
+      int c;
 
       // dc token
       int v = qcoeff[0];
@@ -447,9 +458,64 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
   return cost;
 }
 
-static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
-                                  int subsampling_dim, int blk_dim) {
-  return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+// Copy all visible 4x4s in the transform block.
+static void copy_block_visible(const MACROBLOCKD *xd,
+                               const struct macroblockd_plane *const pd,
+                               const uint8_t *src, const int src_stride,
+                               uint8_t *dst, const int dst_stride, int blk_row,
+                               int blk_col, const BLOCK_SIZE plane_bsize,
+                               const BLOCK_SIZE tx_bsize) {
+  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+  int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
+                                            pd->subsampling_x, blk_col);
+  int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
+                                             pd->subsampling_y, blk_row);
+  const int is_highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  if (tx_bsize == BLOCK_4X4 ||
+      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+    const int w = tx_4x4_w << 2;
+    const int h = tx_4x4_h << 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (is_highbd) {
+      vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), src_stride,
+                               CONVERT_TO_SHORTPTR(dst), dst_stride, NULL, 0, 0,
+                               0, 0, w, h, xd->bd);
+    } else {
+#endif
+      vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, 0, 0, 0, 0, w,
+                        h);
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif
+  } else {
+    int r, c;
+    int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+    int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+    // if we are in the unrestricted motion border.
+    for (r = 0; r < max_r; ++r) {
+      // Skip visiting the sub blocks that are wholly within the UMV.
+      for (c = 0; c < max_c; ++c) {
+        const uint8_t *src_ptr = src + r * src_stride * 4 + c * 4;
+        uint8_t *dst_ptr = dst + r * dst_stride * 4 + c * 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (is_highbd) {
+          vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                   CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+                                   NULL, 0, 0, 0, 0, 4, 4, xd->bd);
+        } else {
+#endif
+          vpx_convolve_copy(src_ptr, src_stride, dst_ptr, dst_stride, NULL, 0,
+                            0, 0, 0, 4, 4);
+#if CONFIG_VP9_HIGHBITDEPTH
+        }
+#endif
+      }
+    }
+  }
+  (void)is_highbd;
 }
 
 // Compute the pixel domain sum square error on all visible 4x4s in the
@@ -492,50 +558,17 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd,
   return sse;
 }
 
-// Compute the squares sum squares on all visible 4x4s in the transform block.
-static int64_t sum_squares_visible(const MACROBLOCKD *xd,
-                                   const struct macroblockd_plane *const pd,
-                                   const int16_t *diff, const int diff_stride,
-                                   int blk_row, int blk_col,
-                                   const BLOCK_SIZE plane_bsize,
-                                   const BLOCK_SIZE tx_bsize) {
-  int64_t sse;
-  const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
-  const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
-  int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
-                                            pd->subsampling_x, blk_col);
-  int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
-                                             pd->subsampling_y, blk_row);
-  if (tx_bsize == BLOCK_4X4 ||
-      (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
-    sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_bsize);
-  } else {
-    int r, c;
-    int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
-    int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
-    sse = 0;
-    // if we are in the unrestricted motion border.
-    for (r = 0; r < max_r; ++r) {
-      // Skip visiting the sub blocks that are wholly within the UMV.
-      for (c = 0; c < max_c; ++c) {
-        sse += (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, BLOCK_4X4);
-      }
-    }
-  }
-  return sse;
-}
-
 static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
                        BLOCK_SIZE plane_bsize, int block, int blk_row,
                        int blk_col, TX_SIZE tx_size, int64_t *out_dist,
-                       int64_t *out_sse) {
+                       int64_t *out_sse, struct buf_2d *out_recon,
+                       int sse_calc_done) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int eob = p->eobs[block];
 
-  if (x->block_tx_domain) {
+  if (!out_recon && x->block_tx_domain && eob) {
     const int ss_txfrm_size = tx_size << 1;
     int64_t this_sse;
     const int shift = tx_size == TX_32X32 ? 0 : 2;
@@ -555,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
 
     if (x->skip_encode && !is_inter_block(xd->mi[0])) {
       // TODO(jingning): tune the model to better capture the distortion.
-      const int64_t p =
+      const int64_t mean_quant_error =
           (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
 #if CONFIG_VP9_HIGHBITDEPTH
           (shift + 2 + (bd - 8) * 2);
 #else
           (shift + 2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      *out_dist += (p >> 4);
-      *out_sse += p;
+      *out_dist += (mean_quant_error >> 4);
+      *out_sse += mean_quant_error;
     }
   } else {
     const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
@@ -574,15 +607,27 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
     const int dst_idx = 4 * (blk_row * dst_stride + blk_col);
     const uint8_t *src = &p->src.buf[src_idx];
     const uint8_t *dst = &pd->dst.buf[dst_idx];
+    uint8_t *out_recon_ptr = 0;
+
     const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-    const uint16_t *eob = &p->eobs[block];
     unsigned int tmp;
 
-    tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
-                    blk_col, plane_bsize, tx_bsize);
+    if (sse_calc_done) {
+      tmp = (unsigned int)(*out_sse);
+    } else {
+      tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
+                      blk_col, plane_bsize, tx_bsize);
+    }
     *out_sse = (int64_t)tmp * 16;
+    if (out_recon) {
+      const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col);
+      out_recon_ptr = &out_recon->buf[out_recon_idx];
+      copy_block_visible(xd, pd, dst, dst_stride, out_recon_ptr,
+                         out_recon->stride, blk_row, blk_col, plane_bsize,
+                         tx_bsize);
+    }
 
-    if (*eob) {
+    if (eob) {
 #if CONFIG_VP9_HIGHBITDEPTH
       DECLARE_ALIGNED(16, uint16_t, recon16[1024]);
       uint8_t *recon = (uint8_t *)recon16;
@@ -592,42 +637,42 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
 
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        recon = CONVERT_TO_BYTEPTR(recon);
-        vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,
-                                 bs, bs, xd->bd);
+        vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
+                                 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
         if (xd->lossless) {
-          vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+          vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd);
         } else {
           switch (tx_size) {
             case TX_4X4:
-              vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd);
               break;
             case TX_8X8:
-              vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd);
               break;
             case TX_16X16:
-              vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
+              vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd);
               break;
-            case TX_32X32:
-              vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
+            default:
+              assert(tx_size == TX_32X32);
+              vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd);
               break;
-            default: assert(0 && "Invalid transform size");
           }
         }
+        recon = CONVERT_TO_BYTEPTR(recon16);
       } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
+        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
         switch (tx_size) {
-          case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;
-          case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;
-          case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break;
-          case TX_4X4:
+          case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break;
+          case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break;
+          case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break;
+          default:
+            assert(tx_size == TX_4X4);
             // this is like vp9_short_idct4x4 but has a special case around
             // eob<=1, which is significant (not just an optimization) for
             // the lossless case.
-            x->itxm_add(dqcoeff, recon, 32, *eob);
+            x->inv_txfm_add(dqcoeff, recon, 32, eob);
             break;
-          default: assert(0 && "Invalid transform size"); break;
         }
 #if CONFIG_VP9_HIGHBITDEPTH
       }
@@ -635,6 +680,10 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
 
       tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col,
                       plane_bsize, tx_bsize);
+      if (out_recon) {
+        copy_block_visible(xd, pd, recon, 32, out_recon_ptr, out_recon->stride,
+                           blk_row, blk_col, plane_bsize, tx_bsize);
+      }
     }
 
     *out_dist = (int64_t)tmp * 16;
@@ -655,34 +704,60 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   MODE_INFO *const mi = xd->mi[0];
   int64_t rd1, rd2, rd;
   int rate;
-  int64_t dist;
-  int64_t sse;
+  int64_t dist = INT64_MAX;
+  int64_t sse = INT64_MAX;
   const int coeff_ctx =
       combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]);
+  struct buf_2d *recon = args->this_recon;
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+  const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method;
+  const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh;
+  int sse_calc_done = 0;
+#if CONFIG_MISMATCH_DEBUG
+  struct encode_b_args encode_b_arg = {
+    x,    enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+    &sse, args->t_above,      args->t_left,       &mi->skip,
+    0,  // mi_row
+    0,  // mi_col
+    0   // output_enabled
+  };
+#else
+  struct encode_b_args encode_b_arg = {
+    x,    enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+    &sse, args->t_above,      args->t_left,       &mi->skip
+  };
+#endif
 
   if (args->exit_early) return;
 
   if (!is_inter_block(mi)) {
-    struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above,
-                                       args->t_left, &mi->skip };
     vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                           &intra_arg);
+                           &encode_b_arg);
+    if (recon) {
+      uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)];
+      copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride,
+                         blk_row, blk_col, plane_bsize, tx_bsize);
+    }
     if (x->block_tx_domain) {
       dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                 tx_size, &dist, &sse);
+                 tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done);
     } else {
-      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
       const struct macroblock_plane *const p = &x->plane[plane];
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
       const int src_stride = p->src.stride;
-      const int dst_stride = pd->dst.stride;
-      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
       const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
-      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
       unsigned int tmp;
-      sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
-                                plane_bsize, tx_bsize);
+      if (!sse_calc_done) {
+        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+        const int16_t *diff =
+            &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+        int visible_width, visible_height;
+        sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
+                                  plane_bsize, tx_bsize, &visible_width,
+                                  &visible_height);
+      }
 #if CONFIG_VP9_HIGHBITDEPTH
       if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
         sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
@@ -692,17 +767,33 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
                       blk_row, blk_col, plane_bsize, tx_bsize);
       dist = (int64_t)tmp * 16;
     }
-  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
-    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
-        SKIP_TXFM_NONE) {
+  } else {
+    int skip_txfm_flag = SKIP_TXFM_NONE;
+    if (max_txsize_lookup[plane_bsize] == tx_size)
+      skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))];
+
+    // This reduces the risk of bad perceptual quality due to bad prediction.
+    // We always force the encoder to perform transform and quantization.
+    if (!args->cpi->sf.allow_skip_txfm_ac_dc &&
+        skip_txfm_flag == SKIP_TXFM_AC_DC) {
+      skip_txfm_flag = SKIP_TXFM_NONE;
+    }
+
+    if (skip_txfm_flag == SKIP_TXFM_NONE ||
+        (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) {
+      const struct macroblock_plane *const p = &x->plane[plane];
+      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+      const int16_t *const diff =
+          &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+      const int use_trellis_opt =
+          do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize,
+                         tx_size, &encode_b_arg);
       // full forward transform and quantization
       vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
-      if (x->block_qcoeff_opt)
-        vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
+      if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
       dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-                 tx_size, &dist, &sse);
-    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
-               SKIP_TXFM_AC_ONLY) {
+                 tx_size, &dist, &sse, recon, sse_calc_done);
+    } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) {
       // compute DC coefficient
       tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
@@ -722,19 +813,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
         dist = VPXMAX(0, sse - dc_correct);
       }
     } else {
-      // SKIP_TXFM_AC_DC
-      // skip forward transform
-      x->plane[plane].eobs[block] = 0;
-      sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
-      dist = sse;
+      assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC.");
     }
-  } else {
-    // full forward transform and quantization
-    vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
-    if (x->block_qcoeff_opt)
-      vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
-    dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-               tx_size, &dist, &sse);
   }
 
   rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
@@ -751,9 +831,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 
   // TODO(jingning): temporarily enabled only for luma component
   rd = VPXMIN(rd1, rd2);
-  if (plane == 0)
+  if (plane == 0) {
     x->zcoeff_blk[tx_size][block] =
-        !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless);
+        !x->plane[plane].eobs[block] ||
+        (x->sharpness == 0 && rd1 > rd2 && !xd->lossless);
+    x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block];
+  }
 
   args->this_rate += rate;
   args->this_dist += dist;
@@ -771,7 +854,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
 static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                              int64_t *distortion, int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
-                             TX_SIZE tx_size, int use_fast_coef_casting) {
+                             TX_SIZE tx_size, int use_fast_coef_costing,
+                             struct buf_2d *recon) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   struct rdcost_block_args args;
@@ -779,8 +863,9 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   args.cpi = cpi;
   args.x = x;
   args.best_rd = ref_best_rd;
-  args.use_fast_coef_costing = use_fast_coef_casting;
+  args.use_fast_coef_costing = use_fast_coef_costing;
   args.skippable = 1;
+  args.this_recon = recon;
 
   if (plane == 0) xd->mi[0]->tx_size = tx_size;
 
@@ -805,7 +890,8 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 
 static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                    int64_t *distortion, int *skip, int64_t *sse,
-                                   int64_t ref_best_rd, BLOCK_SIZE bs) {
+                                   int64_t ref_best_rd, BLOCK_SIZE bs,
+                                   struct buf_2d *recon) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
@@ -815,13 +901,13 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   mi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
 
   txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
-                   mi->tx_size, cpi->sf.use_fast_coef_costing);
+                   mi->tx_size, cpi->sf.use_fast_coef_costing, recon);
 }
 
 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                    int64_t *distortion, int *skip,
                                    int64_t *psse, int64_t ref_best_rd,
-                                   BLOCK_SIZE bs) {
+                                   BLOCK_SIZE bs, struct buf_2d *recon) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -833,20 +919,34 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                               { INT64_MAX, INT64_MAX },
                               { INT64_MAX, INT64_MAX },
                               { INT64_MAX, INT64_MAX } };
-  int n, m;
+  int n;
   int s0, s1;
-  int64_t best_rd = INT64_MAX;
+  int64_t best_rd = ref_best_rd;
   TX_SIZE best_tx = max_tx_size;
   int start_tx, end_tx;
+  const int tx_size_ctx = get_tx_size_context(xd);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, recon_buf16[TX_SIZES][64 * 64]);
+  uint8_t *recon_buf[TX_SIZES];
+  for (n = 0; n < TX_SIZES; ++n) {
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      recon_buf[n] = CONVERT_TO_BYTEPTR(recon_buf16[n]);
+    } else {
+      recon_buf[n] = (uint8_t *)recon_buf16[n];
+    }
+  }
+#else
+  DECLARE_ALIGNED(16, uint8_t, recon_buf[TX_SIZES][64 * 64]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
   if (cm->tx_mode == TX_MODE_SELECT) {
     start_tx = max_tx_size;
-    end_tx = 0;
+    end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0);
+    if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx);
   } else {
     TX_SIZE chosen_tx_size =
         VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
@@ -855,15 +955,17 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   }
 
   for (n = start_tx; n >= end_tx; n--) {
-    int r_tx_size = 0;
-    for (m = 0; m <= n - (n == (int)max_tx_size); m++) {
-      if (m == n)
-        r_tx_size += vp9_cost_zero(tx_probs[m]);
-      else
-        r_tx_size += vp9_cost_one(tx_probs[m]);
+    const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n];
+    if (recon) {
+      struct buf_2d this_recon;
+      this_recon.buf = recon_buf[n];
+      this_recon.stride = recon->stride;
+      txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs,
+                       n, cpi->sf.use_fast_coef_costing, &this_recon);
+    } else {
+      txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs,
+                       n, cpi->sf.use_fast_coef_costing, 0);
     }
-    txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0,
-                     bs, n, cpi->sf.use_fast_coef_costing);
     r[n][1] = r[n][0];
     if (r[n][0] < INT_MAX) {
       r[n][1] += r_tx_size;
@@ -905,11 +1007,25 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   *rate = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT];
   *skip = s[mi->tx_size];
   *psse = sse[mi->tx_size];
+  if (recon) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      memcpy(CONVERT_TO_SHORTPTR(recon->buf),
+             CONVERT_TO_SHORTPTR(recon_buf[mi->tx_size]),
+             64 * 64 * sizeof(uint16_t));
+    } else {
+#endif
+      memcpy(recon->buf, recon_buf[mi->tx_size], 64 * 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif
+  }
 }
 
 static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int64_t *distortion, int *skip, int64_t *psse,
-                            BLOCK_SIZE bs, int64_t ref_best_rd) {
+                            BLOCK_SIZE bs, int64_t ref_best_rd,
+                            struct buf_2d *recon) {
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t sse;
   int64_t *ret_sse = psse ? psse : &sse;
@@ -918,10 +1034,10 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
     choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
-                           bs);
+                           bs, recon);
   } else {
     choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
-                           bs);
+                           bs, recon);
   }
 }
 
@@ -971,6 +1087,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
 
   xd->mi[0]->tx_size = TX_4X4;
 
+  assert(!x->skip_block);
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
@@ -995,9 +1113,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           const int block = (row + idy) * 2 + (col + idx);
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
           int16_t *const src_diff =
               vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+          tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+          tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+          tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+          uint16_t *const eob = &p->eobs[block];
           xd->mi[0]->bmi[block].as_mode = mode;
           vp9_predict_intra_block(xd, 1, TX_4X4, mode,
                                   x->skip_encode ? src : dst,
@@ -1006,29 +1128,31 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst,
                                     dst_stride, xd->bd);
           if (xd->lossless) {
-            const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+            const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
             const int coeff_ctx =
                 combine_entropy_contexts(tempa[idx], templ[idy]);
             vp9_highbd_fwht4x4(src_diff, coeff, 8);
-            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant,
+                                  eob, so);
             ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                  so->neighbors, cpi->sf.use_fast_coef_costing);
             tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
               goto next_highbd;
-            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16,
                                    dst_stride, p->eobs[block], xd->bd);
           } else {
             int64_t unused;
             const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-            const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+            const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
             const int coeff_ctx =
                 combine_entropy_contexts(tempa[idx], templ[idy]);
             if (tx_type == DCT_DCT)
               vpx_highbd_fdct4x4(src_diff, coeff, 8);
             else
               vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
-            vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+            vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant,
+                                  eob, so);
             ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                  so->neighbors, cpi->sf.use_fast_coef_costing);
             distortion += vp9_highbd_block_error_dispatch(
@@ -1039,7 +1163,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
               goto next_highbd;
             vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
-                                  dst, dst_stride, p->eobs[block], xd->bd);
+                                  dst16, dst_stride, p->eobs[block], xd->bd);
           }
         }
       }
@@ -1061,7 +1185,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
                  num_4x4_blocks_wide * 4 * sizeof(uint16_t));
         }
       }
-    next_highbd : {}
+    next_highbd: {}
     }
     if (best_rd >= rd_thresh || x->skip_encode) return best_rd;
 
@@ -1098,7 +1222,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
         int16_t *const src_diff =
             vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
         xd->mi[0]->bmi[block].as_mode = mode;
         vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride, dst,
@@ -1106,11 +1233,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
-          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
           const int coeff_ctx =
               combine_entropy_contexts(tempa[idx], templ[idy]);
           vp9_fwht4x4(src_diff, coeff, 8);
-          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+                         so);
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1121,24 +1249,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         } else {
           int64_t unused;
           const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
+          const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
           const int coeff_ctx =
               combine_entropy_contexts(tempa[idx], templ[idy]);
           vp9_fht4x4(src_diff, coeff, 8, tx_type);
-          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+                         so);
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-          distortion +=
-              vp9_highbd_block_error_8bit(
-                  coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
-              2;
-#else
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >>
                         2;
-#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
@@ -1162,7 +1284,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                num_4x4_blocks_wide * 4);
     }
-  next : {}
+  next: {}
   }
 
   if (best_rd >= rd_thresh || x->skip_encode) return best_rd;
@@ -1269,7 +1391,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
     mic->mode = mode;
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
-                    bsize, best_rd);
+                    bsize, best_rd, /*recon=*/NULL);
 
     if (this_rate_tokenonly == INT_MAX) continue;
 
@@ -1309,7 +1431,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   if (ref_best_rd < 0) is_cost_valid = 0;
 
   if (is_inter_block(mi) && is_cost_valid) {
-    int plane;
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       vp9_subtract_plane(x, bsize, plane);
   }
@@ -1321,7 +1442,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
     txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
-                     plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing);
+                     plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
+                     /*recon=*/NULL);
     if (pnrate == INT_MAX) {
       is_cost_valid = 0;
       break;
@@ -1389,6 +1511,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
+#if !CONFIG_REALTIME_ONLY
 static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                               int *rate_tokenonly, int64_t *distortion,
                               int *skippable, BLOCK_SIZE bsize) {
@@ -1462,11 +1585,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
       if (is_compound)
         this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int;
       break;
-    case ZEROMV:
+    default:
+      assert(mode == ZEROMV);
       this_mv[0].as_int = 0;
       if (is_compound) this_mv[1].as_int = 0;
       break;
-    default: break;
   }
 
   mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
@@ -1503,10 +1626,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
       &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
-  const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+  const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
   const int is_compound = has_second_ref(mi);
   const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
 
+  assert(!x->skip_block);
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const int bw = b_width_log2_lookup[BLOCK_8X8];
     const int h = 4 * (i >> bw);
@@ -1526,7 +1651,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       vp9_highbd_build_inter_predictor(
-          pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
+          CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst),
+          pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
           &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3,
           mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2),
           xd->bd);
@@ -1567,18 +1693,25 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
       const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
 #endif
       int64_t ssz, rd, rd1, rd2;
-      tran_low_t *coeff;
+      tran_low_t *coeff, *qcoeff, *dqcoeff;
+      uint16_t *eob;
       int coeff_ctx;
       k += (idy * 2 + idx);
       coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]);
       coeff = BLOCK_OFFSET(p->coeff, k);
-      x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
-                    coeff, 8);
-      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
+      qcoeff = BLOCK_OFFSET(p->qcoeff, k);
+      dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
+      eob = &p->eobs[k];
+
+      x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+                     coeff, 8);
 #if CONFIG_VP9_HIGHBITDEPTH
+      vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob,
+                            so);
       thisdistortion += vp9_highbd_block_error_dispatch(
           coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
 #else
+      vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so);
       thisdistortion +=
           vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -1599,6 +1732,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
 
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 typedef struct {
   int eobs;
@@ -1626,6 +1760,7 @@ typedef struct {
   int mvthresh;
 } BEST_SEG_INFO;
 
+#if !CONFIG_REALTIME_ONLY
 static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
   return (mv->row >> 3) < mv_limits->row_min ||
          (mv->row >> 3) > mv_limits->row_max ||
@@ -1670,7 +1805,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
                               const MV_REFERENCE_FRAME ref_frames[2]) {
   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
-      (ref_frames[1] == NONE ||
+      (ref_frames[1] == NO_REF_FRAME ||
        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
     int rfc = mode_context[ref_frames[0]];
     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
@@ -1683,7 +1818,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
       if (c2 > c3) return 0;
     } else {
       assert(this_mode == ZEROMV);
-      if (ref_frames[1] == NONE) {
+      if (ref_frames[1] == NO_REF_FRAME) {
         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
           return 0;
@@ -1699,10 +1834,80 @@ static int check_best_zero_mv(const VP9_COMP *cpi,
   return 1;
 }
 
+static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) {
+  if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
+    int_mv cur_fullpel_mv, prev_fullpel_mv;
+    cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
+    cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3;
+    prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3;
+    prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3;
+    if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1;
+  }
+  return 0;
+}
+
+// Compares motion vector and mode rate of current mode and given mode.
+static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv,
+                                       int this_mode_rate, int mode_rate,
+                                       int mv_thresh) {
+  const int mv_diff =
+      abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row);
+  if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1;
+  return 0;
+}
+
+// Skips single reference inter modes NEARMV and ZEROMV based on motion vector
+// difference and mode rate.
+static INLINE int skip_single_mode_based_on_mode_rate(
+    int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode,
+    int ref0, int this_mode_rate, int best_mode_index) {
+  MV this_mv = mode_mv[this_mode][ref0].as_mv;
+  const int mv_thresh = 3;
+
+  // Pruning is not applicable for NEARESTMV or NEWMV modes.
+  if (this_mode == NEARESTMV || this_mode == NEWMV) return 0;
+  // Pruning is not done when reference frame of the mode is same as best
+  // reference so far.
+  if (best_mode_index > 0 &&
+      ref0 == vp9_mode_order[best_mode_index].ref_frame[0])
+    return 0;
+
+  // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV
+  if (compare_mv_mode_rate(
+          this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate,
+          single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh))
+    return 1;
+
+  // Check absolute mv difference and mode rate of current mode w.r.t NEWMV
+  if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate,
+                           single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh))
+    return 1;
+
+  // Pruning w.r.t NEARMV is applicable only for ZEROMV mode
+  if (this_mode == NEARMV) return 0;
+  // Check absolute mv difference and mode rate of current mode w.r.t NEARMV
+  if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate,
+                           single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh))
+    return 1;
+  return 0;
+}
+
+#define MAX_JOINT_MV_SEARCH_ITERS 4
+static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
+  int num_iters = MAX_JOINT_MV_SEARCH_ITERS;  // sf_level = 0
+  if (sf_level >= 2)
+    num_iters = 0;
+  else if (sf_level >= 1)
+    num_iters = bsize < BLOCK_8X8
+                    ? 0
+                    : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS);
+  return num_iters;
+}
+
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                                 int_mv *frame_mv, int mi_row, int mi_col,
                                 int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv) {
+                                int *rate_mv, int num_iters) {
   const VP9_COMMON *const cm = &cpi->common;
   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -1711,6 +1916,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   const int refs[2] = { mi->ref_frame[0],
                         mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] };
   int_mv ref_mv[2];
+  int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2];
   int ite, ref;
   const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
   struct scale_factors sf;
@@ -1725,12 +1931,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
 // Prediction buffer from second frame.
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]);
   uint8_t *second_pred;
 #else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+  DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+  // Check number of iterations do not exceed the max
+  assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS);
+
   for (ref = 0; ref < 2; ++ref) {
     ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
 
@@ -1746,6 +1955,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     }
 
     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+    iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int;
   }
 
 // Since we have scaled the reference frames to match the size of the current
@@ -1760,7 +1970,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
   // Allow joint search multiple times iteratively for each reference frame
   // and break out of the search loop if it couldn't find a better mv.
-  for (ite = 0; ite < 4; ite++) {
+  for (ite = 0; ite < num_iters; ite++) {
     struct buf_2d ref_yv12[2];
     uint32_t bestsme = UINT_MAX;
     int sadpb = x->sadperbit16;
@@ -1772,6 +1982,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                        // odd iterations search in the second. The predictor
                        // found for the 'other' reference frame is factored in.
 
+    // Skip further iterations of search if in the previous iteration, the
+    // motion vector of the searched ref frame is unchanged, and the other ref
+    // frame's full-pixel mv is unchanged.
+    if (skip_iters(iter_mvs, ite, id)) break;
+
     // Initialized here because of compiler problem in Visual Studio.
     ref_yv12[0] = xd->plane[0].pre[0];
     ref_yv12[1] = xd->plane[0].pre[1];
@@ -1781,9 +1996,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
       vp9_highbd_build_inter_predictor(
-          ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
-          &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, kernel, MV_PRECISION_Q3,
-          mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
+          CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride,
+          second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0,
+          kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
     } else {
       second_pred = (uint8_t *)second_pred_alloc_16;
       vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
@@ -1824,8 +2039,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
       bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
           x->errorperbit, &cpi->fn_ptr[bsize], 0,
-          cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred, pw, ph);
+          cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost,
+          &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search);
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -1837,6 +2052,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     } else {
       break;
     }
+    if (ite < num_iters - 1) {
+      iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int;
+      iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int;
+    }
   }
 
   *rate_mv = 0;
@@ -1857,7 +2076,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
 
 static int64_t rd_pick_best_sub8x8_mode(
     VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv,
-    int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate,
+    int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate,
     int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
     int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf,
     int filter_idx, int mi_row, int mi_col) {
@@ -1879,6 +2098,8 @@ static int64_t rd_pick_best_sub8x8_mode(
   const BLOCK_SIZE bsize = mi->sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+  const int pw = num_4x4_blocks_wide << 2;
+  const int ph = num_4x4_blocks_high << 2;
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -1888,7 +2109,7 @@ static int64_t rd_pick_best_sub8x8_mode(
 
   vp9_zero(*bsi);
 
-  bsi->segment_rd = best_rd;
+  bsi->segment_rd = best_rd_so_far;
   bsi->ref_mv[0] = best_ref_mv;
   bsi->ref_mv[1] = second_best_ref_mv;
   bsi->mvp.as_int = best_ref_mv->as_int;
@@ -1914,14 +2135,14 @@ static int64_t rd_pick_best_sub8x8_mode(
       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
       PREDICTION_MODE mode_selected = ZEROMV;
       int64_t best_rd = INT64_MAX;
-      const int i = idy * 2 + idx;
+      const int block = idy * 2 + idx;
       int ref;
 
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
         const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
         frame_mv[ZEROMV][frame].as_int = 0;
         vp9_append_sub8x8_mvs_for_idx(
-            cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
+            cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
             &frame_mv[NEARMV][frame], mbmi_ext->mode_context);
       }
 
@@ -1931,7 +2152,7 @@ static int64_t rd_pick_best_sub8x8_mode(
         struct buf_2d orig_pre[2];
 
         mode_idx = INTER_OFFSET(this_mode);
-        bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        bsi->rdstat[block][mode_idx].brdcost = INT64_MAX;
         if (!(inter_mode_mask & (1 << this_mode))) continue;
 
         if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
@@ -1939,14 +2160,14 @@ static int64_t rd_pick_best_sub8x8_mode(
           continue;
 
         memcpy(orig_pre, pd->pre, sizeof(orig_pre));
-        memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
-               sizeof(bsi->rdstat[i][mode_idx].ta));
-        memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
-               sizeof(bsi->rdstat[i][mode_idx].tl));
+        memcpy(bsi->rdstat[block][mode_idx].ta, t_above,
+               sizeof(bsi->rdstat[block][mode_idx].ta));
+        memcpy(bsi->rdstat[block][mode_idx].tl, t_left,
+               sizeof(bsi->rdstat[block][mode_idx].tl));
 
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
-            seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) {
+            seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) {
           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
           uint32_t bestsme = UINT_MAX;
@@ -1956,18 +2177,19 @@ static int64_t rd_pick_best_sub8x8_mode(
           int cost_list[5];
           const MvLimits tmp_mv_limits = x->mv_limits;
 
-          /* Is the best so far sufficiently good that we cant justify doing
+          /* Is the best so far sufficiently good that we can't justify doing
            * and new motion search. */
           if (best_rd < label_mv_thresh) break;
 
           if (cpi->oxcf.mode != BEST) {
             // use previous block's result as next block's MV predictor.
-            if (i > 0) {
-              bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
-              if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
+            if (block > 0) {
+              bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int;
+              if (block == 2)
+                bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int;
             }
           }
-          if (i == 0)
+          if (block == 0)
             max_mv = x->max_mv_context[mi->ref_frame[0]];
           else
             max_mv =
@@ -1987,18 +2209,22 @@ static int64_t rd_pick_best_sub8x8_mode(
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
           if (sf->adaptive_motion_search) {
-            mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3;
-            mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3;
+            if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX &&
+                x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) {
+              mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3;
+              mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3;
+            }
             step_param = VPXMAX(step_param, 8);
           }
 
           // adjust src pointer for this block
-          mi_buf_shift(x, i);
+          mi_buf_shift(x, block);
 
           vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv);
 
           bestsme = vp9_full_pixel_search(
-              cpi, x, bsize, &mvp_full, step_param, sadpb,
+              cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method,
+              sadpb,
               sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
               &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1);
 
@@ -2009,56 +2235,60 @@ static int64_t rd_pick_best_sub8x8_mode(
             cpi->find_fractional_mv_step(
                 x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
                 x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop,
-                sf->mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+                sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list),
                 x->nmvjointcost, x->mvcost, &distortion,
-                &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0);
+                &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph,
+                cpi->sf.use_accurate_subpel_search);
 
             // save motion search result for use in compound prediction
-            seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv;
+            seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv;
           }
 
-          if (sf->adaptive_motion_search)
-            x->pred_mv[mi->ref_frame[0]] = *new_mv;
+          x->pred_mv[mi->ref_frame[0]] = *new_mv;
 
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
         if (has_second_rf) {
-          if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV ||
-              seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV)
+          if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV)
             continue;
         }
 
         if (has_second_rf && this_mode == NEWMV &&
             mi->interp_filter == EIGHTTAP) {
+          // Decide number of joint motion search iterations
+          const int num_joint_search_iters = get_joint_search_iters(
+              cpi->sf.comp_inter_joint_search_iter_level, bsize);
           // adjust src pointers
-          mi_buf_shift(x, i);
-          if (sf->comp_inter_joint_search_thresh <= bsize) {
+          mi_buf_shift(x, block);
+          if (num_joint_search_iters) {
             int rate_mv;
             joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
-                                mi_col, seg_mvs[i], &rate_mv);
-            seg_mvs[i][mi->ref_frame[0]].as_int =
+                                mi_col, seg_mvs[block], &rate_mv,
+                                num_joint_search_iters);
+            seg_mvs[block][mi->ref_frame[0]].as_int =
                 frame_mv[this_mode][mi->ref_frame[0]].as_int;
-            seg_mvs[i][mi->ref_frame[1]].as_int =
+            seg_mvs[block][mi->ref_frame[1]].as_int =
                 frame_mv[this_mode][mi->ref_frame[1]].as_int;
           }
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
-        bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs(
-            cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i],
-            bsi->ref_mv, x->nmvjointcost, x->mvcost);
+        bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs(
+            cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv,
+            seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost);
 
         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
-          bsi->rdstat[i][mode_idx].mvs[ref].as_int =
+          bsi->rdstat[block][mode_idx].mvs[ref].as_int =
               mode_mv[this_mode][ref].as_int;
           if (num_4x4_blocks_wide > 1)
-            bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
+            bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int =
                 mode_mv[this_mode][ref].as_int;
           if (num_4x4_blocks_high > 1)
-            bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
+            bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int =
                 mode_mv[this_mode][ref].as_int;
         }
 
@@ -2076,7 +2306,7 @@ static int64_t rd_pick_best_sub8x8_mode(
           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
             have_ref &= mode_mv[this_mode][ref].as_int ==
-                        ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+                        ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
           }
 
           if (filter_idx > 1 && !subpelmv && !have_ref) {
@@ -2084,53 +2314,54 @@ static int64_t rd_pick_best_sub8x8_mode(
             have_ref = 1;
             for (ref = 0; ref < 1 + has_second_rf; ++ref)
               have_ref &= mode_mv[this_mode][ref].as_int ==
-                          ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+                          ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
           }
 
           if (!subpelmv && have_ref &&
-              ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
-            memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
-                   sizeof(SEG_RDSTAT));
+              ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+            bsi->rdstat[block][mode_idx] = ref_bsi->rdstat[block][mode_idx];
             if (num_4x4_blocks_wide > 1)
-              bsi->rdstat[i + 1][mode_idx].eobs =
-                  ref_bsi->rdstat[i + 1][mode_idx].eobs;
+              bsi->rdstat[block + 1][mode_idx].eobs =
+                  ref_bsi->rdstat[block + 1][mode_idx].eobs;
             if (num_4x4_blocks_high > 1)
-              bsi->rdstat[i + 2][mode_idx].eobs =
-                  ref_bsi->rdstat[i + 2][mode_idx].eobs;
+              bsi->rdstat[block + 2][mode_idx].eobs =
+                  ref_bsi->rdstat[block + 2][mode_idx].eobs;
 
-            if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+            if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
               mode_selected = this_mode;
-              best_rd = bsi->rdstat[i][mode_idx].brdcost;
+              best_rd = bsi->rdstat[block][mode_idx].brdcost;
             }
             continue;
           }
         }
 
-        bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment(
-            cpi, x, bsi->segment_rd - this_segment_rd, i,
-            &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist,
-            &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta,
-            bsi->rdstat[i][mode_idx].tl, mi_row, mi_col);
-        if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
-          bsi->rdstat[i][mode_idx].brdcost +=
-              RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0);
-          bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
-          bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
+        bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment(
+            cpi, x, bsi->segment_rd - this_segment_rd, block,
+            &bsi->rdstat[block][mode_idx].byrate,
+            &bsi->rdstat[block][mode_idx].bdist,
+            &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta,
+            bsi->rdstat[block][mode_idx].tl, mi_row, mi_col);
+        if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+          bsi->rdstat[block][mode_idx].brdcost += RDCOST(
+              x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0);
+          bsi->rdstat[block][mode_idx].brate +=
+              bsi->rdstat[block][mode_idx].byrate;
+          bsi->rdstat[block][mode_idx].eobs = p->eobs[block];
           if (num_4x4_blocks_wide > 1)
-            bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
+            bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1];
           if (num_4x4_blocks_high > 1)
-            bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
+            bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2];
         }
 
-        if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+        if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
           mode_selected = this_mode;
-          best_rd = bsi->rdstat[i][mode_idx].brdcost;
+          best_rd = bsi->rdstat[block][mode_idx].brdcost;
         }
       } /*for each 4x4 mode*/
 
       if (best_rd == INT64_MAX) {
         int iy, midx;
-        for (iy = i + 1; iy < 4; ++iy)
+        for (iy = block + 1; iy < 4; ++iy)
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
@@ -2138,22 +2369,22 @@ static int64_t rd_pick_best_sub8x8_mode(
       }
 
       mode_idx = INTER_OFFSET(mode_selected);
-      memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
-      memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+      memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above));
+      memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left));
 
-      set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
-                           frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
-                           x->mvcost);
+      set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected,
+                           mode_mv[mode_selected], frame_mv, seg_mvs[block],
+                           bsi->ref_mv, x->nmvjointcost, x->mvcost);
 
-      br += bsi->rdstat[i][mode_idx].brate;
-      bd += bsi->rdstat[i][mode_idx].bdist;
-      block_sse += bsi->rdstat[i][mode_idx].bsse;
-      segmentyrate += bsi->rdstat[i][mode_idx].byrate;
-      this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
+      br += bsi->rdstat[block][mode_idx].brate;
+      bd += bsi->rdstat[block][mode_idx].bdist;
+      block_sse += bsi->rdstat[block][mode_idx].bsse;
+      segmentyrate += bsi->rdstat[block][mode_idx].byrate;
+      this_segment_rd += bsi->rdstat[block][mode_idx].brdcost;
 
       if (this_segment_rd > bsi->segment_rd) {
         int iy, midx;
-        for (iy = i + 1; iy < 4; ++iy)
+        for (iy = block + 1; iy < 4; ++iy)
           for (midx = 0; midx < INTER_MODES; ++midx)
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
@@ -2171,7 +2402,7 @@ static int64_t rd_pick_best_sub8x8_mode(
   // update the coding decisions
   for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
 
-  if (bsi->segment_rd > best_rd) return INT64_MAX;
+  if (bsi->segment_rd > best_rd_so_far) return INT64_MAX;
   /* set it to the best */
   for (i = 0; i < 4; i++) {
     mode_idx = INTER_OFFSET(bsi->modes[i]);
@@ -2313,6 +2544,22 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                 block_size);
 }
 
+#if CONFIG_NON_GREEDY_MV
+static int ref_frame_to_gf_rf_idx(int ref_frame) {
+  if (ref_frame == GOLDEN_FRAME) {
+    return 0;
+  }
+  if (ref_frame == LAST_FRAME) {
+    return 1;
+  }
+  if (ref_frame == ALTREF_FRAME) {
+    return 2;
+  }
+  assert(0);
+  return -1;
+}
+#endif
+
 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                                  int mi_row, int mi_col, int_mv *tmp_mv,
                                  int *rate_mv) {
@@ -2320,19 +2567,35 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   const VP9_COMMON *cm = &cpi->common;
   MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
-  int bestsme = INT_MAX;
   int step_param;
-  int sadpb = x->sadperbit16;
   MV mvp_full;
   int ref = mi->ref_frame[0];
   MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
   const MvLimits tmp_mv_limits = x->mv_limits;
   int cost_list[5];
-
+  const int best_predmv_idx = x->mv_best_ref_index[ref];
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
       vp9_get_scaled_ref_frame(cpi, ref);
-
+  const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
   MV pred_mv[3];
+
+  int bestsme = INT_MAX;
+#if CONFIG_NON_GREEDY_MV
+  int gf_group_idx = cpi->twopass.gf_group.index;
+  int gf_rf_idx = ref_frame_to_gf_rf_idx(ref);
+  BLOCK_SIZE square_bsize = get_square_block_size(bsize);
+  int_mv nb_full_mvs[NB_MVS_NUM] = { 0 };
+  MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+      &cpi->motion_field_info, gf_group_idx, gf_rf_idx, square_bsize);
+  const int nb_full_mv_num =
+      vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
+  const int lambda = (pw * ph) / 4;
+  assert(pw * ph == lambda << 2);
+#else   // CONFIG_NON_GREEDY_MV
+  int sadpb = x->sadperbit16;
+#endif  // CONFIG_NON_GREEDY_MV
+
   pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
   pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
   pred_mv[2] = x->pred_mv[ref];
@@ -2361,7 +2624,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   }
 
   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
-    int boffset =
+    const int boffset =
         2 * (b_width_log2_lookup[BLOCK_64X64] -
              VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
     step_param = VPXMAX(step_param, boffset);
@@ -2379,14 +2642,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
       int i;
       for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
         if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
-          x->pred_mv[ref].row = 0;
-          x->pred_mv[ref].col = 0;
+          x->pred_mv[ref].row = INT16_MAX;
+          x->pred_mv[ref].col = INT16_MAX;
           tmp_mv->as_int = INVALID_MV;
 
           if (scaled_ref_frame) {
-            int i;
-            for (i = 0; i < MAX_MB_PLANE; ++i)
-              xd->plane[i].pre[0] = backup_yv12[i];
+            int j;
+            for (j = 0; j < MAX_MB_PLANE; ++j)
+              xd->plane[j].pre[0] = backup_yv12[j];
           }
           return;
         }
@@ -2398,14 +2661,65 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
   // after full-pixel motion search.
   vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
 
-  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
-
+  mvp_full = pred_mv[best_predmv_idx];
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
-                                  cond_cost_list(cpi, cost_list), &ref_mv,
-                                  &tmp_mv->as_mv, INT_MAX, 1);
+#if CONFIG_NON_GREEDY_MV
+  bestsme = vp9_full_pixel_diamond_new(cpi, x, bsize, &mvp_full, step_param,
+                                       lambda, 1, nb_full_mvs, nb_full_mv_num,
+                                       &tmp_mv->as_mv);
+#else   // CONFIG_NON_GREEDY_MV
+  bestsme = vp9_full_pixel_search(
+      cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
+      cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+#endif  // CONFIG_NON_GREEDY_MV
+
+  if (cpi->sf.enhanced_full_pixel_motion_search) {
+    int i;
+    for (i = 0; i < 3; ++i) {
+      int this_me;
+      MV this_mv;
+      int diff_row;
+      int diff_col;
+      int step;
+
+      if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue;
+      if (i == best_predmv_idx) continue;
+
+      diff_row = ((int)pred_mv[i].row -
+                  pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >>
+                 3;
+      diff_col = ((int)pred_mv[i].col -
+                  pred_mv[i > 0 ? (i - 1) : best_predmv_idx].col) >>
+                 3;
+      if (diff_row == 0 && diff_col == 0) continue;
+      if (diff_row < 0) diff_row = -diff_row;
+      if (diff_col < 0) diff_col = -diff_col;
+      step = get_msb((diff_row + diff_col + 1) >> 1);
+      if (step <= 0) continue;
+
+      mvp_full = pred_mv[i];
+      mvp_full.col >>= 3;
+      mvp_full.row >>= 3;
+#if CONFIG_NON_GREEDY_MV
+      this_me = vp9_full_pixel_diamond_new(
+          cpi, x, bsize, &mvp_full,
+          VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), lambda, 1, nb_full_mvs,
+          nb_full_mv_num, &this_mv);
+#else   // CONFIG_NON_GREEDY_MV
+      this_me = vp9_full_pixel_search(
+          cpi, x, bsize, &mvp_full,
+          VPXMAX(step_param, MAX_MVSEARCH_STEPS - step),
+          cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list),
+          &ref_mv, &this_mv, INT_MAX, 1);
+#endif  // CONFIG_NON_GREEDY_MV
+      if (this_me < bestsme) {
+        tmp_mv->as_mv = this_mv;
+        bestsme = this_me;
+      }
+    }
+  }
 
   x->mv_limits = tmp_mv_limits;
 
@@ -2414,13 +2728,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
     cpi->find_fractional_mv_step(
         x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
         &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
-        cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
-        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0);
+        cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+        x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph,
+        cpi->sf.use_accurate_subpel_search);
   }
   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
                              x->mvcost, MV_COST_WEIGHT);
 
-  if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = tmp_mv->as_mv;
+  x->pred_mv[ref] = tmp_mv->as_mv;
 
   if (scaled_ref_frame) {
     int i;
@@ -2445,26 +2760,63 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
 // However, once established that vector may be usable through the nearest and
 // near mv modes to reduce distortion in subsequent blocks and also improve
 // visual quality.
-static int discount_newmv_test(const VP9_COMP *cpi, int this_mode,
-                               int_mv this_mv,
-                               int_mv (*mode_mv)[MAX_REF_FRAMES],
-                               int ref_frame) {
+static int discount_newmv_test(VP9_COMP *cpi, int this_mode, int_mv this_mv,
+                               int_mv (*mode_mv)[MAX_REF_FRAMES], int ref_frame,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize) {
+#if CONFIG_NON_GREEDY_MV
+  (void)mode_mv;
+  (void)this_mv;
+  if (this_mode == NEWMV && bsize >= BLOCK_8X8 && cpi->tpl_ready) {
+    const int gf_group_idx = cpi->twopass.gf_group.index;
+    const int gf_rf_idx = ref_frame_to_gf_rf_idx(ref_frame);
+    const TplDepFrame tpl_frame = cpi->tpl_stats[gf_group_idx];
+    const MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+        &cpi->motion_field_info, gf_group_idx, gf_rf_idx, cpi->tpl_bsize);
+    const int tpl_block_mi_h = num_8x8_blocks_high_lookup[cpi->tpl_bsize];
+    const int tpl_block_mi_w = num_8x8_blocks_wide_lookup[cpi->tpl_bsize];
+    const int tpl_mi_row = mi_row - (mi_row % tpl_block_mi_h);
+    const int tpl_mi_col = mi_col - (mi_col % tpl_block_mi_w);
+    const int mv_mode =
+        tpl_frame
+            .mv_mode_arr[gf_rf_idx][tpl_mi_row * tpl_frame.stride + tpl_mi_col];
+    if (mv_mode == NEW_MV_MODE) {
+      int_mv tpl_new_mv =
+          vp9_motion_field_mi_get_mv(motion_field, tpl_mi_row, tpl_mi_col);
+      int row_diff = abs(tpl_new_mv.as_mv.row - this_mv.as_mv.row);
+      int col_diff = abs(tpl_new_mv.as_mv.col - this_mv.as_mv.col);
+      if (VPXMAX(row_diff, col_diff) <= 8) {
+        return 1;
+      } else {
+        return 0;
+      }
+    } else {
+      return 0;
+    }
+  } else {
+    return 0;
+  }
+#else
+  (void)mi_row;
+  (void)mi_col;
+  (void)bsize;
   return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
           (this_mv.as_int != 0) &&
           ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
            (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
           ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
            (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+#endif
 }
 
 static int64_t handle_inter_mode(
     VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2,
     int64_t *distortion, int *skippable, int *rate_y, int *rate_uv,
-    int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row,
-    int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
+    struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES],
+    int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
     INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
-    int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse,
-    const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) {
+    int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate,
+    int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter,
+    int64_t filter_cache[], int best_mode_index) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
@@ -2482,9 +2834,8 @@ static int64_t handle_inter_mode(
 #else
   DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  int pred_exists = 0;
   int intpel_mv;
-  int64_t rd, tmp_rd, best_rd = INT64_MAX;
+  int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX;
   int best_needs_copy = 0;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
@@ -2493,13 +2844,12 @@ static int64_t handle_inter_mode(
   uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 };
   int64_t bsse[MAX_MB_PLANE << 2] = { 0 };
 
-  int bsl = mi_width_log2_lookup[bsize];
-  int pred_filter_search =
-      cpi->sf.cb_pred_filter_search
-          ? (((mi_row + mi_col) >> bsl) +
-             get_chessboard_index(cm->current_video_frame)) &
-                0x1
-          : 0;
+  const int bsl = mi_width_log2_lookup[bsize];
+  const int blk_parity = (((mi_row + mi_col) >> bsl) +
+                          get_chessboard_index(cm->current_video_frame)) &
+                         0x1;
+  const int pred_filter_search =
+      (cpi->sf.cb_pred_filter_search >= 2) && blk_parity;
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
@@ -2538,13 +2888,23 @@ static int64_t handle_inter_mode(
   if (this_mode == NEWMV) {
     int rate_mv;
     if (is_comp_pred) {
+      // Decide number of joint motion search iterations
+      const int num_joint_search_iters = get_joint_search_iters(
+          cpi->sf.comp_inter_joint_search_iter_level, bsize);
+
       // Initialize mv using single prediction mode result.
       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
 
-      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+      if (num_joint_search_iters) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        start_timing(cpi, joint_motion_search_time);
+#endif
         joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
-                            single_newmv, &rate_mv);
+                            single_newmv, &rate_mv, num_joint_search_iters);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+        end_timing(cpi, joint_motion_search_time);
+#endif
       } else {
         rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@@ -2556,7 +2916,13 @@ static int64_t handle_inter_mode(
       *rate2 += rate_mv;
     } else {
       int_mv tmp_mv;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, single_motion_search_time);
+#endif
       single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, single_motion_search_time);
+#endif
       if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
 
       frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
@@ -2567,7 +2933,8 @@ static int64_t handle_inter_mode(
       // under certain circumstances where we want to help initiate a weak
       // motion field, where the distortion gain for a single block may not
       // be enough to overcome the cost of a new mv.
-      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0], mi_row,
+                              mi_col, bsize)) {
         *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
       } else {
         *rate2 += rate_mv;
@@ -2600,8 +2967,8 @@ static int64_t handle_inter_mode(
   //
   // Under some circumstances we discount the cost of new mv mode to encourage
   // initiation of a motion field.
-  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
-                          refs[0])) {
+  if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0],
+                          mi_row, mi_col, bsize)) {
     *rate2 +=
         VPXMIN(cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]),
                cost_mv_ref(cpi, NEARESTMV, mbmi_ext->mode_context[refs[0]]));
@@ -2609,23 +2976,45 @@ static int64_t handle_inter_mode(
     *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
   }
 
+  if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) {
+    single_mode_rate[INTER_OFFSET(this_mode)] = *rate2;
+    // Prune NEARMV and ZEROMV modes based on motion vector difference and mode
+    // rate.
+    if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate,
+                                            this_mode, refs[0], *rate2,
+                                            best_mode_index)) {
+      // Check when the single inter mode is pruned, NEARESTMV or NEWMV modes
+      // are not early terminated. This ensures all single modes are not getting
+      // skipped when the speed feature is enabled.
+      assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX ||
+             single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX);
+      return INT64_MAX;
+    }
+  }
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
       mi->mode != NEARESTMV)
     return INT64_MAX;
 
-  pred_exists = 0;
   // Are all MVs integer pel for Y and UV
   intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
   if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, interp_filter_time);
+#endif
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
 
   if (cm->interp_filter != BILINEAR) {
+    // Use cb pattern for filter eval when filter is not switchable
+    const int enable_interp_search =
+        (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE)
+            ? blk_parity
+            : 1;
     if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
       best_filter = EIGHTTAP;
-    } else if (best_filter == SWITCHABLE) {
+    } else if (best_filter == SWITCHABLE && enable_interp_search) {
       int newbest;
       int tmp_rate_sum = 0;
       int64_t tmp_dist_sum = 0;
@@ -2635,6 +3024,9 @@ static int64_t handle_inter_mode(
         int64_t rs_rd;
         int tmp_skip_sb = 0;
         int64_t tmp_skip_sse = INT64_MAX;
+        const int enable_earlyterm =
+            cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i;
+        int64_t filt_best_rd;
 
         mi->interp_filter = i;
         rs = vp9_get_switchable_rate(cpi, xd);
@@ -2668,9 +3060,16 @@ static int64_t handle_inter_mode(
               xd->plane[j].dst.stride = 64;
             }
           }
-          vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb,
-                          &tmp_skip_sse);
+
+          filt_best_rd =
+              cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
+          if (build_inter_pred_model_rd_earlyterm(
+                  cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+                  &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+                  filt_best_rd)) {
+            filter_cache[i] = INT64_MAX;
+            continue;
+          }
 
           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
           filter_cache[i] = rd;
@@ -2703,7 +3102,6 @@ static int64_t handle_inter_mode(
         if ((cm->interp_filter == SWITCHABLE && newbest) ||
             (cm->interp_filter != SWITCHABLE &&
              cm->interp_filter == mi->interp_filter)) {
-          pred_exists = 1;
           tmp_rd = best_rd;
 
           skip_txfm_sb = tmp_skip_sb;
@@ -2715,12 +3113,15 @@ static int64_t handle_inter_mode(
       restore_dst_buf(xd, orig_dst, orig_dst_stride);
     }
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, interp_filter_time);
+#endif
   // Set the appropriate filter
   mi->interp_filter =
       cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
   rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
 
-  if (pred_exists) {
+  if (tmp_rd != INT64_MAX) {
     if (best_needs_copy) {
       // again temporarily set the buffers to local memory to prevent a memcpy
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -2735,9 +3136,9 @@ static int64_t handle_inter_mode(
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
-                    &skip_sse_sb);
+    build_inter_pred_model_rd_earlyterm(
+        cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+        &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));
@@ -2765,7 +3166,7 @@ static int64_t handle_inter_mode(
   memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
   memcpy(x->bsse, bsse, sizeof(bsse));
 
-  if (!skip_txfm_sb) {
+  if (!skip_txfm_sb || xd->lossless) {
     int skippable_y, skippable_uv;
     int64_t sseuv = INT64_MAX;
     int64_t rdcosty = INT64_MAX;
@@ -2773,7 +3174,7 @@ static int64_t handle_inter_mode(
     // Y cost and distortion
     vp9_subtract_plane(x, bsize, 0);
     super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize,
-                    ref_best_rd);
+                    ref_best_rd, recon);
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
@@ -2815,6 +3216,7 @@ static int64_t handle_inter_mode(
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
   return 0;  // The rate-distortion cost will be re-calculated by caller.
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
@@ -2829,7 +3231,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   x->skip_encode = 0;
   ctx->skip = 0;
   xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-  xd->mi[0]->ref_frame[1] = NONE;
+  xd->mi[0]->ref_frame[1] = NO_REF_FRAME;
   // Initialize interp_filter here so we do not have to check for inter block
   // modes in get_pred_context_switchable_interp()
   xd->mi[0]->interp_filter = SWITCHABLE_FILTERS;
@@ -2868,60 +3270,97 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
 }
 
+#if !CONFIG_REALTIME_ONLY
 // This function is designed to apply a bias or adjustment to an rd value based
 // on the relative variance of the source and reconstruction.
-#define LOW_VAR_THRESH 16
-#define VLOW_ADJ_MAX 25
-#define VHIGH_ADJ_MAX 8
+#define LOW_VAR_THRESH 250
+#define VAR_MULT 250
+static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 250 };
+
 static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bsize, int64_t *this_rd,
+                                   struct buf_2d *recon,
                                    MV_REFERENCE_FRAME ref_frame,
-                                   unsigned int source_variance) {
+                                   MV_REFERENCE_FRAME second_ref_frame,
+                                   PREDICTION_MODE this_mode) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int recon_variance;
-  unsigned int absvar_diff = 0;
-  int64_t var_error = 0;
-  int64_t var_factor = 0;
+  unsigned int rec_variance;
+  unsigned int src_variance;
+  unsigned int src_rec_min;
+  unsigned int var_diff = 0;
+  unsigned int var_factor = 0;
+  unsigned int adj_max;
+  unsigned int low_var_thresh = LOW_VAR_THRESH;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  vp9e_tune_content content_type = cpi->oxcf.content;
 
   if (*this_rd == INT64_MAX) return;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
-                                                        bsize, xd->bd);
+    rec_variance = vp9_high_get_sby_variance(cpi, recon, bsize, xd->bd);
+    src_variance =
+        vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd);
   } else {
-    recon_variance =
-        vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+    rec_variance = vp9_get_sby_variance(cpi, recon, bsize);
+    src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize);
   }
 #else
-  recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+  rec_variance = vp9_get_sby_variance(cpi, recon, bsize);
+  src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
-    absvar_diff = (source_variance > recon_variance)
-                      ? (source_variance - recon_variance)
-                      : (recon_variance - source_variance);
+  // Scale based on area in 8x8 blocks
+  rec_variance /= (bw * bh);
+  src_variance /= (bw * bh);
 
-    var_error = ((int64_t)200 * source_variance * recon_variance) /
-                (((int64_t)source_variance * source_variance) +
-                 ((int64_t)recon_variance * recon_variance));
-    var_error = 100 - var_error;
+  if (content_type == VP9E_CONTENT_FILM) {
+    if (cpi->oxcf.pass == 2) {
+      // Adjust low variance threshold based on estimated group noise enegry.
+      double noise_factor =
+          (double)cpi->twopass.gf_group.group_noise_energy / SECTION_NOISE_DEF;
+      low_var_thresh = (unsigned int)(low_var_thresh * noise_factor);
+
+      if (ref_frame == INTRA_FRAME) {
+        low_var_thresh *= 2;
+        if (this_mode == DC_PRED) low_var_thresh *= 5;
+      } else if (second_ref_frame > INTRA_FRAME) {
+        low_var_thresh *= 2;
+      }
+    }
+  } else {
+    low_var_thresh = LOW_VAR_THRESH / 2;
   }
 
-  // Source variance above a threshold and ref frame is intra.
-  // This case is targeted mainly at discouraging intra modes that give rise
-  // to a predictor with a low spatial complexity compared to the source.
-  if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
-      (source_variance > recon_variance)) {
-    var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error));
-    // A second possible case of interest is where the source variance
-    // is very low and we wish to discourage false texture or motion trails.
-  } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
-             (recon_variance > source_variance)) {
-    var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error));
+  // Lower of source (raw per pixel value) and recon variance. Note that
+  // if the source per pixel is 0 then the recon value here will not be per
+  // pixel (see above) so will likely be much larger.
+  src_rec_min = VPXMIN(src_variance, rec_variance);
+
+  if (src_rec_min > low_var_thresh) return;
+
+  // We care more when the reconstruction has lower variance so give this case
+  // a stronger weighting.
+  var_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) * 2
+                                           : (rec_variance - src_variance) / 2;
+
+  adj_max = max_var_adjust[content_type];
+
+  var_factor =
+      (unsigned int)((int64_t)VAR_MULT * var_diff) / VPXMAX(1, src_variance);
+  var_factor = VPXMIN(adj_max, var_factor);
+
+  if ((content_type == VP9E_CONTENT_FILM) &&
+      ((ref_frame == INTRA_FRAME) || (second_ref_frame > INTRA_FRAME))) {
+    var_factor *= 2;
   }
+
   *this_rd += (*this_rd * var_factor) / 100;
+
+  (void)xd;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 // Do we have an internal image edge (e.g. formatting bars).
 int vp9_internal_image_edge(VP9_COMP *cpi) {
@@ -2941,6 +3380,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
   // For two pass account for any formatting bars detected.
   if (cpi->oxcf.pass == 2) {
     TWO_PASS *twopass = &cpi->twopass;
+    vpx_clear_system_state();
 
     // The inactive region is specified in MBs not mi units.
     // The image edge is in the following MB row.
@@ -2968,6 +3408,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
   // For two pass account for any formatting bars detected.
   if (cpi->oxcf.pass == 2) {
     TWO_PASS *twopass = &cpi->twopass;
+    vpx_clear_system_state();
 
     // The inactive region is specified in MBs not mi units.
     // The image edge is in the following MB row.
@@ -2992,6 +3433,15 @@ int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) {
          vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
 }
 
+#if !CONFIG_REALTIME_ONLY
+static void init_frame_mv(int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]) {
+  for (int mode = 0; mode < MB_MODE_COUNT; ++mode) {
+    for (int ref_frame = 0; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+      frame_mv[mode][ref_frame].as_int = INVALID_MV;
+    }
+  }
+}
+
 void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
                                MACROBLOCK *x, int mi_row, int mi_col,
                                RD_COST *rd_cost, BLOCK_SIZE bsize,
@@ -3009,12 +3459,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   unsigned char segment_id = mi->segment_id;
   int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE] = { 0 };
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  int single_mode_rate[MAX_REF_FRAMES][INTER_MODES];
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
@@ -3032,20 +3481,39 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
-  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty =
+      vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
   int best_skip2 = 0;
-  uint8_t ref_frame_skip_mask[2] = { 0 };
+  uint8_t ref_frame_skip_mask[2] = { 0, 1 };
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
   int mode_skip_start = sf->mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
   int64_t mode_threshold[MAX_MODES];
-  int *mode_map = tile_data->mode_map[bsize];
+  int8_t *tile_mode_map = tile_data->mode_map[bsize];
+  int8_t mode_map[MAX_MODES];  // Maintain mode_map information locally to avoid
+                               // lock mechanism involved with reads from
+                               // tile_mode_map
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
+  const int is_rect_partition =
+      num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize];
   int64_t mask_filter = 0;
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
 
+  struct buf_2d *recon;
+  struct buf_2d recon_buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, recon16[64 * 64]);
+  recon_buf.buf = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH
+                      ? CONVERT_TO_BYTEPTR(recon16)
+                      : (uint8_t *)recon16;
+#else
+  DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]);
+  recon_buf.buf = recon8;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  recon_buf.stride = 64;
+  recon = cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : 0;
+
   vp9_zero(best_mbmode);
 
   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -3069,9 +3537,12 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
 
   rd_cost->rate = INT_MAX;
 
+  init_frame_mv(frame_mv);
+
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
+        !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
@@ -3081,7 +3552,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+    if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
       // Skip checking missing references in both single and compound reference
       // modes. Note that a mode will be skipped if both reference frames
       // are masked out.
@@ -3127,7 +3598,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   if (cpi->rc.is_src_frame_alt_ref) {
     if (sf->alt_ref_search_fp) {
       mode_skip_mask[ALTREF_FRAME] = 0;
-      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME) & 0xff;
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
     }
   }
@@ -3144,32 +3615,37 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
   }
 
-  if (bsize > sf->max_intra_bsize) {
+  if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) {
     ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
     ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
   }
 
   mode_skip_mask[INTRA_FRAME] |=
-      ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+      (uint16_t)~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
 
   for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
+
   for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
     mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
 
   midx = sf->schedule_mode_search ? mode_skip_start : 0;
+
   while (midx > 4) {
     uint8_t end_pos = 0;
     for (i = 5; i < midx; ++i) {
-      if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
-        uint8_t tmp = mode_map[i];
-        mode_map[i] = mode_map[i - 1];
-        mode_map[i - 1] = tmp;
+      if (mode_threshold[tile_mode_map[i - 1]] >
+          mode_threshold[tile_mode_map[i]]) {
+        uint8_t tmp = tile_mode_map[i];
+        tile_mode_map[i] = tile_mode_map[i - 1];
+        tile_mode_map[i - 1] = tmp;
         end_pos = i;
       }
     }
     midx = end_pos;
   }
 
+  memcpy(mode_map, tile_mode_map, sizeof(mode_map));
+
   for (midx = 0; midx < MAX_MODES; ++midx) {
     int mode_index = mode_map[midx];
     int mode_excluded = 0;
@@ -3187,21 +3663,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
+    vp9_zero(x->sum_y_eobs);
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (!comp_pred && ref_frame != INTRA_FRAME &&
+        sf->prune_single_mode_based_on_mv_diff_mode_rate)
+      single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX;
+
+    if (is_rect_partition) {
+      if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue;
+      if (second_ref_frame > 0 &&
+          (ctx->skip_ref_frame_mask & (1 << second_ref_frame)))
+        continue;
+    }
+
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (midx == mode_skip_start && best_mode_index >= 0) {
       switch (best_mbmode.ref_frame[0]) {
         case INTRA_FRAME: break;
-        case LAST_FRAME:
-          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
-          break;
+        case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break;
         case GOLDEN_FRAME:
           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
-          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
         case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break;
-        case NONE:
+        case NO_REF_FRAME:
         case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
       }
     }
@@ -3218,6 +3703,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
 
     if (best_rd < mode_threshold[mode_index]) continue;
 
+    // This is only used in motion vector unit test.
+    if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
     if (sf->motion_field_mode_search) {
       const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
                                   tile_info->mi_col_end - mi_col);
@@ -3231,7 +3719,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
       MODE_INFO *ref_mi;
       int const_motion = 1;
       int skip_ref_frame = !cb_partition_search_ctrl;
-      MV_REFERENCE_FRAME rf = NONE;
+      MV_REFERENCE_FRAME rf = NO_REF_FRAME;
       int_mv ref_mv;
       ref_mv.as_int = INVALID_MV;
 
@@ -3248,7 +3736,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
 
       if ((mi_col - 1) >= tile_info->mi_col_start) {
         if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0];
-        if (rf == NONE) rf = xd->mi[-1]->ref_frame[0];
+        if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0];
         for (i = 0; i < mi_height; ++i) {
           ref_mi = xd->mi[i * xd->mi_stride - 1];
           const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
@@ -3265,12 +3753,16 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         if (this_mode == NEARMV || this_mode == ZEROMV) continue;
     }
 
-    comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
 
+      if (cm->ref_frame_sign_bias[ref_frame] ==
+          cm->ref_frame_sign_bias[second_ref_frame])
+        continue;
+
       // Skip compound inter modes if ARF is not available.
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
 
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
@@ -3295,7 +3787,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         // Disable intra modes other than DC_PRED for blocks with low variance
         // Threshold for intra skipping based on source variance
         // TODO(debargha): Specialize the threshold for super block sizes
-        const unsigned int skip_intra_var_thresh = 64;
+        const unsigned int skip_intra_var_thresh =
+            (cpi->oxcf.content == VP9E_CONTENT_FILM) ? 0 : 64;
         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
             x->source_variance < skip_intra_var_thresh)
           continue;
@@ -3339,19 +3832,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       struct macroblockd_plane *const pd = &xd->plane[1];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, intra_mode_search_time);
+#endif
       memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
-                      best_rd);
+                      best_rd, recon);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, intra_mode_search_time);
+#endif
       if (rate_y == INT_MAX) continue;
 
       uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
                               [pd->subsampling_y];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, intra_mode_search_time);
+#endif
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
                              &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
                              &skip_uv[uv_tx], &mode_uv[uv_tx]);
       }
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, intra_mode_search_time);
+#endif
       rate_uv = rate_uv_tokenonly[uv_tx];
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
@@ -3362,11 +3866,18 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
     } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, handle_inter_mode_time);
+#endif
       this_rd = handle_inter_mode(
           cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
-          &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
-          single_inter_filter, single_skippable, &total_sse, best_rd,
-          &mask_filter, filter_cache);
+          recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
+          single_inter_filter, single_skippable,
+          &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter,
+          filter_cache, best_mode_index);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, handle_inter_mode_time);
+#endif
       if (this_rd == INT64_MAX) continue;
 
       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
@@ -3393,7 +3904,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
 
         // Cost the skip mb case
         rate2 += skip_cost1;
-      } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
+      } else if (ref_frame != INTRA_FRAME && !xd->lossless &&
+                 !cpi->oxcf.sharpness) {
         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0,
                    distortion2) <
             RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
@@ -3417,10 +3929,39 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
     }
 
-    // Apply an adjustment to the rd value based on the similarity of the
-    // source variance and reconstructed variance.
-    rd_variance_adjustment(cpi, x, bsize, &this_rd, ref_frame,
-                           x->source_variance);
+    if (recon) {
+      // In film mode bias against DC pred and other intra if there is a
+      // significant difference between the variance of the sub blocks in the
+      // the source. Also apply some bias against compound modes which also
+      // tend to blur fine texture such as film grain over time.
+      //
+      // The sub block test here acts in the case where one or more sub
+      // blocks have high relatively variance but others relatively low
+      // variance. Here the high variance sub blocks may push the
+      // total variance for the current block size over the thresholds
+      // used in rd_variance_adjustment() below.
+      if (cpi->oxcf.content == VP9E_CONTENT_FILM) {
+        if (bsize >= BLOCK_16X16) {
+          int min_energy, max_energy;
+          vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+                                   &max_energy);
+          if (max_energy > min_energy) {
+            if (ref_frame == INTRA_FRAME) {
+              if (this_mode == DC_PRED)
+                this_rd += (this_rd * (max_energy - min_energy));
+              else
+                this_rd += (this_rd * (max_energy - min_energy)) / 4;
+            } else if (second_ref_frame > INTRA_FRAME) {
+              this_rd += this_rd / 4;
+            }
+          }
+        }
+      }
+      // Apply an adjustment to the rd value based on the similarity of the
+      // source variance and reconstructed variance.
+      rd_variance_adjustment(cpi, x, bsize, &this_rd, recon, ref_frame,
+                             second_ref_frame, this_mode);
+    }
 
     if (ref_frame == INTRA_FRAME) {
       // Keep record of best intra rd
@@ -3466,6 +4007,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
         if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+        ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size];
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -3571,6 +4113,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   }
 
   if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+    // If adaptive interp filter is enabled, then the current leaf node of 8x8
+    // data is needed for sub8x8. Hence preserve the context.
+    if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
     return;
@@ -3685,10 +4230,12 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data,
   mi->mode = ZEROMV;
   mi->uv_mode = DC_PRED;
   mi->ref_frame[0] = LAST_FRAME;
-  mi->ref_frame[1] = NONE;
+  mi->ref_frame[1] = NO_REF_FRAME;
   mi->mv[0].as_int = 0;
   x->skip = 1;
 
+  ctx->sum_y_eobs = 0;
+
   if (cm->interp_filter != BILINEAR) {
     best_filter = EIGHTTAP;
     if (cm->interp_filter == SWITCHABLE &&
@@ -3759,9 +4306,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   unsigned char segment_id = mi->segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE] = { 0 };
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
@@ -3777,8 +4322,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
-  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int intra_cost_penalty =
+      vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
   int_mv seg_mvs[4][MAX_REF_FRAMES];
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
@@ -3787,6 +4332,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   int internal_active_edge =
       vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
+  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
 
   x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3810,7 +4356,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     } else {
@@ -3829,7 +4375,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable = 0;
-    int i;
     int this_skip2 = 0;
     int64_t total_sse = INT_MAX;
     int early_term = 0;
@@ -3838,10 +4383,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
 
+    vp9_zero(x->sum_y_eobs);
+
 #if CONFIG_BETTER_HW_COMPATIBILITY
     // forbid 8X4 and 4X8 partitions if any reference frame is scaled.
     if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) {
-      int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf);
+      int ref_scaled = ref_frame > INTRA_FRAME &&
+                       vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf);
       if (second_ref_frame > INTRA_FRAME)
         ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf);
       if (ref_scaled) continue;
@@ -3864,7 +4412,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
           case ALTREF_FRAME:
             ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
             break;
-          case NONE:
+          case NO_REF_FRAME:
           case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
         }
       }
@@ -3878,13 +4426,22 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
     if (!internal_active_edge &&
         rd_less_than_thresh(best_rd,
                             rd_opt->threshes[segment_id][bsize][ref_index],
-                            tile_data->thresh_freq_fact[bsize][ref_index]))
+                            &rd_thresh_freq_fact[ref_index]))
       continue;
 
+    // This is only used in motion vector unit test.
+    if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+
+      if (cm->ref_frame_sign_bias[ref_frame] ==
+          cm->ref_frame_sign_bias[second_ref_frame])
+        continue;
+
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
       if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
@@ -3978,7 +4535,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
                 : NULL;
 
         if (scaled_ref_frame[ref]) {
-          int i;
           // Swap out the reference frame for a version that's been scaled to
           // match the resolution of the current frame, allowing the existing
           // motion search code to be used without additional modifications.
@@ -4048,9 +4604,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
               tmp_best_sse = total_sse;
               tmp_best_skippable = skippable;
               tmp_best_mbmode = *mi;
+              x->sum_y_eobs[TX_4X4] = 0;
               for (i = 0; i < 4; i++) {
                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+                x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i];
               }
               pred_exists = 1;
               if (switchable_filter_index == 0 && sf->use_rd_breakout &&
@@ -4080,6 +4638,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
             &rate, &rate_y, &distortion, &skippable, &total_sse,
             (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col);
         if (tmp_rd == INT64_MAX) continue;
+        x->sum_y_eobs[TX_4X4] = 0;
+        for (i = 0; i < 4; i++) {
+          x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+          x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i];
+        }
       } else {
         total_sse = tmp_best_sse;
         rate = tmp_best_rate;
@@ -4108,14 +4671,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
 
       if (tmp_best_rdu > 0) {
         // If even the 'Y' rd value of split is higher than best so far
-        // then dont bother looking at UV
+        // then don't bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8);
         memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
         if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                               &uv_sse, BLOCK_8X8, tmp_best_rdu)) {
           for (ref = 0; ref < 2; ++ref) {
             if (scaled_ref_frame[ref]) {
-              int i;
               for (i = 0; i < MAX_MB_PLANE; ++i)
                 xd->plane[i].pre[ref] = backup_yv12[ref][i];
             }
@@ -4132,7 +4694,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
       for (ref = 0; ref < 2; ++ref) {
         if (scaled_ref_frame[ref]) {
           // Restore the prediction frame pointers to their unscaled versions.
-          int i;
           for (i = 0; i < MAX_MB_PLANE; ++i)
             xd->plane[i].pre[ref] = backup_yv12[ref][i];
         }
@@ -4215,6 +4776,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
         if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+        ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4];
 
         for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i];
 
@@ -4330,12 +4892,18 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   if (!is_inter_block(&best_mbmode)) {
     for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
   } else {
-    for (i = 0; i < 4; ++i)
-      memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+    for (i = 0; i < 4; ++i) xd->mi[0]->bmi[i] = best_bmodes[i];
 
     mi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
     mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
   }
+  // If the second reference does not exist, set the corresponding mv to zero.
+  if (mi->ref_frame[1] == NO_REF_FRAME) {
+    mi->mv[1].as_int = 0;
+    for (i = 0; i < 4; ++i) {
+      mi->bmi[i].as_mv[1].as_int = 0;
+    }
+  }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
@@ -4360,3 +4928,4 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
   store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_filter_diff,
                        0);
 }
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h
index 795c91aef7..e1147ff943 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_RDOPT_H_
-#define VP9_ENCODER_VP9_RDOPT_H_
+#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_
+#define VPX_VP9_ENCODER_VP9_RDOPT_H_
 
 #include "vp9/common/vp9_blockd.h"
 
@@ -29,6 +29,7 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                                struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
+#if !CONFIG_REALTIME_ONLY
 void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
                                struct TileDataEnc *tile_data,
                                struct macroblock *x, int mi_row, int mi_col,
@@ -39,21 +40,24 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(
     struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
     struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
     int64_t best_rd_so_far);
+#endif
 
 int vp9_internal_image_edge(struct VP9_COMP *cpi);
 int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
 int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
 int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
 
+#if !CONFIG_REALTIME_ONLY
 void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
                                    struct TileDataEnc *tile_data,
                                    struct macroblock *x, int mi_row, int mi_col,
                                    struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                    PICK_MODE_CONTEXT *ctx,
                                    int64_t best_rd_so_far);
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_RDOPT_H_
+#endif  // VPX_VP9_ENCODER_VP9_RDOPT_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.c b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c
index f6c4aad4d3..352d8f1273 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_resize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c
@@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) {
   while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
     ++steps;
     in_length = proj_in_length;
+    if (in_length == 1) {
+      // Special case: we break because any further calls to get_down2_length()
+      // with be with length == 1, which return 1, resulting in an infinite
+      // loop.
+      break;
+    }
   }
   return steps;
 }
@@ -424,11 +430,11 @@ void vp9_resize_plane(const uint8_t *const input, int height, int width,
                       int in_stride, uint8_t *output, int height2, int width2,
                       int out_stride) {
   int i;
-  uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
+  uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf));
   uint8_t *tmpbuf =
-      (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width));
-  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height);
-  uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2);
+      (uint8_t *)calloc(width < height ? height : width, sizeof(*tmpbuf));
+  uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf));
+  uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2));
   if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
     goto Error;
   assert(width > 0);
@@ -506,10 +512,12 @@ static void highbd_interpolate(const uint16_t *const input, int inlength,
       sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
       filter = interp_filters[sub_pel];
       sum = 0;
-      for (k = 0; k < INTERP_TAPS; ++k)
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        assert(int_pel - INTERP_TAPS / 2 + 1 + k < inlength);
         sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0
                                       ? 0
                                       : int_pel - INTERP_TAPS / 2 + 1 + k)];
+      }
       *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     }
     // Middle part.
@@ -720,6 +728,10 @@ void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width,
   uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2);
   if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
     goto Error;
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
   for (i = 0; i < height; ++i) {
     highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
                             intbuf + width2 * i, width2, tmpbuf, bd);
@@ -738,83 +750,3 @@ Error:
   free(arrbuf2);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void vp9_resize_frame420(const uint8_t *const y, int y_stride,
-                         const uint8_t *const u, const uint8_t *const v,
-                         int uv_stride, int height, int width, uint8_t *oy,
-                         int oy_stride, uint8_t *ou, uint8_t *ov,
-                         int ouv_stride, int oheight, int owidth) {
-  vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
-  vp9_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
-                   owidth / 2, ouv_stride);
-  vp9_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
-                   owidth / 2, ouv_stride);
-}
-
-void vp9_resize_frame422(const uint8_t *const y, int y_stride,
-                         const uint8_t *const u, const uint8_t *const v,
-                         int uv_stride, int height, int width, uint8_t *oy,
-                         int oy_stride, uint8_t *ou, uint8_t *ov,
-                         int ouv_stride, int oheight, int owidth) {
-  vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
-  vp9_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
-                   ouv_stride);
-  vp9_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
-                   ouv_stride);
-}
-
-void vp9_resize_frame444(const uint8_t *const y, int y_stride,
-                         const uint8_t *const u, const uint8_t *const v,
-                         int uv_stride, int height, int width, uint8_t *oy,
-                         int oy_stride, uint8_t *ou, uint8_t *ov,
-                         int ouv_stride, int oheight, int owidth) {
-  vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
-  vp9_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
-                   ouv_stride);
-  vp9_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
-                   ouv_stride);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride,
-                                const uint8_t *const u, const uint8_t *const v,
-                                int uv_stride, int height, int width,
-                                uint8_t *oy, int oy_stride, uint8_t *ou,
-                                uint8_t *ov, int ouv_stride, int oheight,
-                                int owidth, int bd) {
-  vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
-                          oy_stride, bd);
-  vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
-                          owidth / 2, ouv_stride, bd);
-  vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
-                          owidth / 2, ouv_stride, bd);
-}
-
-void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
-                                const uint8_t *const u, const uint8_t *const v,
-                                int uv_stride, int height, int width,
-                                uint8_t *oy, int oy_stride, uint8_t *ou,
-                                uint8_t *ov, int ouv_stride, int oheight,
-                                int owidth, int bd) {
-  vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
-                          oy_stride, bd);
-  vp9_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight,
-                          owidth / 2, ouv_stride, bd);
-  vp9_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight,
-                          owidth / 2, ouv_stride, bd);
-}
-
-void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
-                                const uint8_t *const u, const uint8_t *const v,
-                                int uv_stride, int height, int width,
-                                uint8_t *oy, int oy_stride, uint8_t *ou,
-                                uint8_t *ov, int ouv_stride, int oheight,
-                                int owidth, int bd) {
-  vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
-                          oy_stride, bd);
-  vp9_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
-                          ouv_stride, bd);
-  vp9_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
-                          ouv_stride, bd);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.h b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h
index d3282ee191..7a984dbc94 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_resize.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_RESIZE_H_
-#define VP9_ENCODER_VP9_RESIZE_H_
+#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_
+#define VPX_VP9_ENCODER_VP9_RESIZE_H_
 
 #include <stdio.h>
 #include "vpx/vpx_integer.h"
@@ -21,48 +21,15 @@ extern "C" {
 void vp9_resize_plane(const uint8_t *const input, int height, int width,
                       int in_stride, uint8_t *output, int height2, int width2,
                       int out_stride);
-void vp9_resize_frame420(const uint8_t *const y, int y_stride,
-                         const uint8_t *const u, const uint8_t *const v,
-                         int uv_stride, int height, int width, uint8_t *oy,
-                         int oy_stride, uint8_t *ou, uint8_t *ov,
-                         int ouv_stride, int oheight, int owidth);
-void vp9_resize_frame422(const uint8_t *const y, int y_stride,
-                         const uint8_t *const u, const uint8_t *const v,
-                         int uv_stride, int height, int width, uint8_t *oy,
-                         int oy_stride, uint8_t *ou, uint8_t *ov,
-                         int ouv_stride, int oheight, int owidth);
-void vp9_resize_frame444(const uint8_t *const y, int y_stride,
-                         const uint8_t *const u, const uint8_t *const v,
-                         int uv_stride, int height, int width, uint8_t *oy,
-                         int oy_stride, uint8_t *ou, uint8_t *ov,
-                         int ouv_stride, int oheight, int owidth);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width,
                              int in_stride, uint8_t *output, int height2,
                              int width2, int out_stride, int bd);
-void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride,
-                                const uint8_t *const u, const uint8_t *const v,
-                                int uv_stride, int height, int width,
-                                uint8_t *oy, int oy_stride, uint8_t *ou,
-                                uint8_t *ov, int ouv_stride, int oheight,
-                                int owidth, int bd);
-void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
-                                const uint8_t *const u, const uint8_t *const v,
-                                int uv_stride, int height, int width,
-                                uint8_t *oy, int oy_stride, uint8_t *ou,
-                                uint8_t *ov, int ouv_stride, int oheight,
-                                int owidth, int bd);
-void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
-                                const uint8_t *const u, const uint8_t *const v,
-                                int uv_stride, int height, int width,
-                                uint8_t *oy, int oy_stride, uint8_t *ou,
-                                uint8_t *ov, int ouv_stride, int oheight,
-                                int owidth, int bd);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_RESIZE_H_
+#endif  // VPX_VP9_ENCODER_VP9_RESIZE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c
index 4a5a68e07a..d75488a8e6 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c
@@ -9,6 +9,7 @@
  */
 
 #include <limits.h>
+#include <math.h>
 
 #include "vpx_mem/vpx_mem.h"
 
@@ -38,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
 }
 void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
                             SEG_LVL_FEATURES feature_id) {
-  seg->feature_mask[segment_id] &= ~(1 << feature_id);
+  seg->feature_mask[segment_id] &= ~(1u << feature_id);
 }
 
 void vp9_clear_segdata(struct segmentation *seg, int segment_id,
@@ -46,6 +47,59 @@ void vp9_clear_segdata(struct segmentation *seg, int segment_id,
   seg->feature_data[segment_id][feature_id] = 0;
 }
 
+void vp9_psnr_aq_mode_setup(struct segmentation *seg) {
+  int i;
+
+  vp9_enable_segmentation(seg);
+  vp9_clearall_segfeatures(seg);
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  for (i = 0; i < MAX_SEGMENTS; ++i) {
+    vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2)));
+    vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+  }
+}
+
+void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi,
+                                  struct segmentation *seg) {
+  const VP9_COMMON *cm = &cpi->common;
+  const int seg_counts = cpi->kmeans_ctr_num;
+  const int base_qindex = cm->base_qindex;
+  const double base_qstep = vp9_convert_qindex_to_q(base_qindex, cm->bit_depth);
+  const double mid_ctr = cpi->kmeans_ctr_ls[seg_counts / 2];
+  const double var_diff_scale = 4.0;
+  int i;
+
+  assert(seg_counts <= MAX_SEGMENTS);
+
+  vp9_enable_segmentation(seg);
+  vp9_clearall_segfeatures(seg);
+  seg->abs_delta = SEGMENT_DELTADATA;
+
+  for (i = 0; i < seg_counts / 2; ++i) {
+    double wiener_var_diff = mid_ctr - cpi->kmeans_ctr_ls[i];
+    double target_qstep = base_qstep / (1.0 + wiener_var_diff / var_diff_scale);
+    int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth);
+    assert(wiener_var_diff >= 0.0);
+
+    vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex);
+    vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+  }
+
+  vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 0);
+  vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+
+  for (; i < seg_counts; ++i) {
+    double wiener_var_diff = cpi->kmeans_ctr_ls[i] - mid_ctr;
+    double target_qstep = base_qstep * (1.0 + wiener_var_diff / var_diff_scale);
+    int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth);
+    assert(wiener_var_diff >= 0.0);
+
+    vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex);
+    vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+  }
+}
+
 // Based on set of segment counts calculate a probability tree
 static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) {
   // Work out probabilities of each segment
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h
index 562805543b..9404c38bc8 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_SEGMENTATION_H_
-#define VP9_ENCODER_VP9_SEGMENTATION_H_
+#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_
+#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -26,6 +26,11 @@ void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
 void vp9_clear_segdata(struct segmentation *seg, int segment_id,
                        SEG_LVL_FEATURES feature_id);
 
+void vp9_psnr_aq_mode_setup(struct segmentation *seg);
+
+void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi,
+                                  struct segmentation *seg);
+
 // The values given for each segment can be either deltas (from the default
 // value chosen for the frame) or absolute values.
 //
@@ -47,4 +52,4 @@ void vp9_reset_segment_features(struct segmentation *seg);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_SEGMENTATION_H_
+#endif  // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c
index 3f3d48fb9f..cc6c967767 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -15,75 +15,6 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_skin_detection.h"
 
-#define MODEL_MODE 1
-
-// Fixed-point skin color model parameters.
-static const int skin_mean[5][2] = { { 7463, 9614 },
-                                     { 6400, 10240 },
-                                     { 7040, 10240 },
-                                     { 8320, 9280 },
-                                     { 6800, 9614 } };
-static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 };  // q16
-static const int skin_threshold[6] = { 1570636, 1400000, 800000,
-                                       800000,  800000,  800000 };  // q18
-
-// Thresholds on luminance.
-static const int y_low = 40;
-static const int y_high = 220;
-
-// Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr, int idx) {
-  const int cb_q6 = cb << 6;
-  const int cr_q6 = cr << 6;
-  const int cb_diff_q12 =
-      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
-  const int cbcr_diff_q12 =
-      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
-  const int cr_diff_q12 =
-      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
-  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
-  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
-  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
-  const int skin_diff =
-      skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
-      skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
-  return skin_diff;
-}
-
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
-                   int motion) {
-  if (y < y_low || y > y_high) {
-    return 0;
-  } else {
-    if (MODEL_MODE == 0) {
-      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
-    } else {
-      int i = 0;
-      // Exit on grey.
-      if (cb == 128 && cr == 128) return 0;
-      // Exit on very strong cb.
-      if (cb > 150 && cr < 110) return 0;
-      for (; i < 5; i++) {
-        int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
-        if (skin_color_diff < skin_threshold[i + 1]) {
-          if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
-            return 0;
-          else if (motion == 0 &&
-                   skin_color_diff > (skin_threshold[i + 1] >> 1))
-            return 0;
-          else
-            return 1;
-        }
-        // Exit if difference is much large than the threshold.
-        if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
-          return 0;
-        }
-      }
-      return 0;
-    }
-  }
-}
-
 int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                            int stride, int strideuv, int bsize,
                            int consec_zeromv, int curr_motion_magn) {
@@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
     const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
     const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
     const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
+
     if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
-    return vp9_skin_pixel(ysource, usource, vsource, motion);
+    return vpx_skin_pixel(ysource, usource, vsource, motion);
   }
 }
 
-#ifdef OUTPUT_YUV_SKINMAP
-// For viewing skin map on input source.
-void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
-  int i, j, mi_row, mi_col, num_bl;
+void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col) {
+  int i, j, num_bl;
   VP9_COMMON *const cm = &cpi->common;
-  uint8_t *y;
   const uint8_t *src_y = cpi->Source->y_buffer;
   const uint8_t *src_u = cpi->Source->u_buffer;
   const uint8_t *src_v = cpi->Source->v_buffer;
   const int src_ystride = cpi->Source->y_stride;
   const int src_uvstride = cpi->Source->uv_stride;
-  int y_bsize = 16;  // Use 8x8 or 16x16.
-  int uv_bsize = y_bsize >> 1;
-  int ypos = y_bsize >> 1;
-  int uvpos = uv_bsize >> 1;
-  int shy = (y_bsize == 8) ? 3 : 4;
-  int shuv = shy - 1;
-  int fac = y_bsize / 8;
-  // Use center pixel or average of center 2x2 pixels.
-  int mode_filter = 0;
+  const int y_bsize = 4 << b_width_log2_lookup[bsize];
+  const int uv_bsize = y_bsize >> 1;
+  const int shy = (y_bsize == 8) ? 3 : 4;
+  const int shuv = shy - 1;
+  const int fac = y_bsize / 8;
+  const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3);
+  const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2);
+  const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2);
+  const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2);
+  src_y += y_shift;
+  src_u += uv_shift;
+  src_v += uv_shift;
+
+  for (i = mi_row; i < mi_row_limit; i += fac) {
+    num_bl = 0;
+    for (j = mi_col; j < mi_col_limit; j += fac) {
+      int consec_zeromv = 0;
+      int bl_index = i * cm->mi_cols + j;
+      int bl_index1 = bl_index + 1;
+      int bl_index2 = bl_index + cm->mi_cols;
+      int bl_index3 = bl_index2 + 1;
+      // Don't detect skin on the boundary.
+      if (i == 0 || j == 0) continue;
+      if (bsize == BLOCK_8X8)
+        consec_zeromv = cpi->consec_zero_mv[bl_index];
+      else
+        consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                               VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                      VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                             cpi->consec_zero_mv[bl_index3])));
+      cpi->skin_map[bl_index] =
+          vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+                                 bsize, consec_zeromv, 0);
+      num_bl++;
+      src_y += y_bsize;
+      src_u += uv_bsize;
+      src_v += uv_bsize;
+    }
+    src_y += (src_ystride << shy) - (num_bl << shy);
+    src_u += (src_uvstride << shuv) - (num_bl << shuv);
+    src_v += (src_uvstride << shuv) - (num_bl << shuv);
+  }
+
+  // Remove isolated skin blocks (none of its neighbors are skin) and isolated
+  // non-skin blocks (all of its neighbors are skin).
+  // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin
+  // blocks. Skip superblock borders to remove isolated non-skin blocks.
+  for (i = mi_row; i < mi_row_limit; i += fac) {
+    for (j = mi_col; j < mi_col_limit; j += fac) {
+      int bl_index = i * cm->mi_cols + j;
+      int num_neighbor = 0;
+      int mi, mj;
+      int non_skin_threshold = 8;
+      // Skip 4 corners.
+      if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) ||
+          (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac)))
+        continue;
+      // There are only 5 neighbors for non-skin blocks on the border.
+      if (i == mi_row || i == mi_row_limit - fac || j == mi_col ||
+          j == mi_col_limit - fac)
+        non_skin_threshold = 5;
+
+      for (mi = -fac; mi <= fac; mi += fac) {
+        for (mj = -fac; mj <= fac; mj += fac) {
+          if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col &&
+              j + mj < mi_col_limit) {
+            int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj;
+            if (cpi->skin_map[bl_neighbor_index]) num_neighbor++;
+          }
+        }
+      }
+
+      if (cpi->skin_map[bl_index] && num_neighbor < 2)
+        cpi->skin_map[bl_index] = 0;
+      if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold)
+        cpi->skin_map[bl_index] = 1;
+    }
+  }
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+  int i, j, mi_row, mi_col, num_bl;
+  VP9_COMMON *const cm = &cpi->common;
+  uint8_t *y;
+  const uint8_t *src_y = cpi->Source->y_buffer;
+  const int src_ystride = cpi->Source->y_stride;
+  const int y_bsize = 16;  // Use 8x8 or 16x16.
+  const int shy = (y_bsize == 8) ? 3 : 4;
+  const int fac = y_bsize / 8;
+
   YV12_BUFFER_CONFIG skinmap;
   memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
   if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x,
@@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
   for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
     num_bl = 0;
     for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
-      int is_skin = 0;
-      if (mode_filter == 1) {
-        // Use 2x2 average at center.
-        uint8_t ysource = src_y[ypos * src_ystride + ypos];
-        uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
-        uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
-        uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
-        uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
-        uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
-        uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
-        uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)];
-        uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)];
-        uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
-        uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)];
-        uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)];
-        ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
-        usource = (usource + usource2 + usource3 + usource4) >> 2;
-        vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
-        is_skin = vp9_skin_pixel(ysource, usource, vsource, 1);
-      } else {
-        int block_size = BLOCK_8X8;
-        int consec_zeromv = 0;
-        int bl_index = mi_row * cm->mi_cols + mi_col;
-        int bl_index1 = bl_index + 1;
-        int bl_index2 = bl_index + cm->mi_cols;
-        int bl_index3 = bl_index2 + 1;
-        if (y_bsize == 8)
-          consec_zeromv = cpi->consec_zero_mv[bl_index];
-        else
-          consec_zeromv =
-              VPXMIN(cpi->consec_zero_mv[bl_index],
-                     VPXMIN(cpi->consec_zero_mv[bl_index1],
-                            VPXMIN(cpi->consec_zero_mv[bl_index2],
-                                   cpi->consec_zero_mv[bl_index3])));
-        if (y_bsize == 16) block_size = BLOCK_16X16;
-        is_skin =
-            vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
-                                   src_uvstride, block_size, consec_zeromv, 0);
-      }
+      const int block_index = mi_row * cm->mi_cols + mi_col;
+      const int is_skin = cpi->skin_map[block_index];
       for (i = 0; i < y_bsize; i++) {
         for (j = 0; j < y_bsize; j++) {
-          if (is_skin)
-            y[i * src_ystride + j] = 255;
-          else
-            y[i * src_ystride + j] = src_y[i * src_ystride + j];
+          y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
         }
       }
       num_bl++;
       y += y_bsize;
       src_y += y_bsize;
-      src_u += uv_bsize;
-      src_v += uv_bsize;
     }
     y += (src_ystride << shy) - (num_bl << shy);
     src_y += (src_ystride << shy) - (num_bl << shy);
-    src_u += (src_uvstride << shuv) - (num_bl << shuv);
-    src_v += (src_uvstride << shuv) - (num_bl << shuv);
   }
-  vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+  vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
   vpx_free_frame_buffer(&skinmap);
 }
 #endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h
index c77382dbd7..46a722af9b 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -8,10 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_
-#define VP9_ENCODER_VP9_SKIN_MAP_H_
+#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_
+#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_
 
 #include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -19,23 +21,20 @@ extern "C" {
 
 struct VP9_COMP;
 
-// #define OUTPUT_YUV_SKINMAP
-
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
-                   int motion);
-
 int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                            int stride, int strideuv, int bsize,
                            int consec_zeromv, int curr_motion_magn);
 
+void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize,
+                         int mi_row, int mi_col);
+
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
-void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
-extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
+void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
 #endif
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_SKIN_MAP_H_
+#endif  // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
index 81cb431ba5..3268f64648 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
@@ -16,34 +16,33 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 
 // Mesh search patters for various speed settings
-static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
-  { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
+// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non
+// FC_GRAPHICS_ANIMATION content type.
+static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
+  { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
+  { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
 };
 
-#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+#if !CONFIG_REALTIME_ONLY
+// Define 3 mesh density levels to control the number of searches.
+#define MESH_DENSITY_LEVELS 3
 static MESH_PATTERN
-    good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
-      { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+    good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = {
       { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
       { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
       { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
-      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
-      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
     };
-static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
-  50, 25, 15, 5, 1, 1
-};
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
 static int frame_is_boosted(const VP9_COMP *cpi) {
-  return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi);
+  return frame_is_kf_gf_arf(cpi);
 }
 
 // Sets a partition size down to which the auto partition code will always
 // search (can go lower), based on the image dimensions. The logic here
 // is that the extent to which ringing artefacts are offensive, depends
-// partly on the screen area that over which they propogate. Propogation is
+// partly on the screen area that over which they propagate. Propagation is
 // limited by transform block size but the screen area take up by a given block
 // size will be larger for a small image format stretched to full screen.
 static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) {
@@ -66,56 +65,127 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
                                                        SPEED_FEATURES *sf,
                                                        int speed) {
   VP9_COMMON *const cm = &cpi->common;
+  const int min_frame_size = VPXMIN(cm->width, cm->height);
+  const int is_480p_or_larger = min_frame_size >= 480;
+  const int is_720p_or_larger = min_frame_size >= 720;
+  const int is_1080p_or_larger = min_frame_size >= 1080;
+  const int is_2160p_or_larger = min_frame_size >= 2160;
+  const int boosted = frame_is_boosted(cpi);
 
-  if (speed >= 1) {
-    if (VPXMIN(cm->width, cm->height) >= 720) {
-      sf->disable_split_mask =
-          cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
-      sf->partition_search_breakout_dist_thr = (1 << 23);
+  // speed 0 features
+  sf->partition_search_breakout_thr.dist = (1 << 20);
+  sf->partition_search_breakout_thr.rate = 80;
+  sf->use_square_only_thresh_high = BLOCK_SIZES;
+  sf->use_square_only_thresh_low = BLOCK_4X4;
+
+  if (is_480p_or_larger) {
+    // Currently, the machine-learning based partition search early termination
+    // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
+    sf->rd_ml_partition.search_early_termination = 1;
+    sf->recode_tolerance_high = 45;
+  } else {
+    sf->use_square_only_thresh_high = BLOCK_32X32;
+  }
+  if (is_720p_or_larger) {
+    sf->alt_ref_search_fp = 1;
+  }
+
+  if (!is_1080p_or_larger) {
+    sf->rd_ml_partition.search_breakout = 1;
+    if (is_720p_or_larger) {
+      sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f;
+      sf->rd_ml_partition.search_breakout_thresh[1] = 0.0f;
+      sf->rd_ml_partition.search_breakout_thresh[2] = 0.0f;
     } else {
-      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-      sf->partition_search_breakout_dist_thr = (1 << 21);
+      sf->rd_ml_partition.search_breakout_thresh[0] = 2.5f;
+      sf->rd_ml_partition.search_breakout_thresh[1] = 1.5f;
+      sf->rd_ml_partition.search_breakout_thresh[2] = 1.5f;
     }
   }
 
+  if (!is_720p_or_larger) {
+    if (is_480p_or_larger)
+      sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1;
+    else
+      sf->prune_single_mode_based_on_mv_diff_mode_rate = 1;
+  }
+
+  if (speed >= 1) {
+    sf->rd_ml_partition.search_early_termination = 0;
+    sf->rd_ml_partition.search_breakout = 1;
+    if (is_480p_or_larger)
+      sf->use_square_only_thresh_high = BLOCK_64X64;
+    else
+      sf->use_square_only_thresh_high = BLOCK_32X32;
+    sf->use_square_only_thresh_low = BLOCK_16X16;
+    if (is_720p_or_larger) {
+      sf->disable_split_mask =
+          cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+      sf->partition_search_breakout_thr.dist = (1 << 22);
+      sf->rd_ml_partition.search_breakout_thresh[0] = -5.0f;
+      sf->rd_ml_partition.search_breakout_thresh[1] = -5.0f;
+      sf->rd_ml_partition.search_breakout_thresh[2] = -9.0f;
+    } else {
+      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+      sf->partition_search_breakout_thr.dist = (1 << 21);
+      sf->rd_ml_partition.search_breakout_thresh[0] = -1.0f;
+      sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f;
+      sf->rd_ml_partition.search_breakout_thresh[2] = -1.0f;
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sf->rd_ml_partition.search_breakout_thresh[0] -= 1.0f;
+      sf->rd_ml_partition.search_breakout_thresh[1] -= 1.0f;
+      sf->rd_ml_partition.search_breakout_thresh[2] -= 1.0f;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+
   if (speed >= 2) {
-    if (VPXMIN(cm->width, cm->height) >= 720) {
+    sf->use_square_only_thresh_high = BLOCK_4X4;
+    sf->use_square_only_thresh_low = BLOCK_SIZES;
+    if (is_720p_or_larger) {
       sf->disable_split_mask =
           cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
       sf->adaptive_pred_interp_filter = 0;
-      sf->partition_search_breakout_dist_thr = (1 << 24);
-      sf->partition_search_breakout_rate_thr = 120;
+      sf->partition_search_breakout_thr.dist = (1 << 24);
+      sf->partition_search_breakout_thr.rate = 120;
+      sf->rd_ml_partition.search_breakout = 0;
     } else {
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-      sf->partition_search_breakout_dist_thr = (1 << 22);
-      sf->partition_search_breakout_rate_thr = 100;
+      sf->partition_search_breakout_thr.dist = (1 << 22);
+      sf->partition_search_breakout_thr.rate = 100;
+      sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f;
+      sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f;
+      sf->rd_ml_partition.search_breakout_thresh[2] = -4.0f;
     }
     sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
 
     // Use a set of speed features for 4k videos.
-    if (VPXMIN(cm->width, cm->height) >= 2160) {
+    if (is_2160p_or_larger) {
       sf->use_square_partition_only = 1;
       sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
       sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
       sf->alt_ref_search_fp = 1;
-      sf->cb_pred_filter_search = 1;
+      sf->cb_pred_filter_search = 2;
       sf->adaptive_interp_filter_search = 1;
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
     }
   }
 
   if (speed >= 3) {
-    if (VPXMIN(cm->width, cm->height) >= 720) {
+    sf->rd_ml_partition.search_breakout = 0;
+    if (is_720p_or_larger) {
       sf->disable_split_mask = DISABLE_ALL_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
-      sf->partition_search_breakout_dist_thr = (1 << 25);
-      sf->partition_search_breakout_rate_thr = 200;
+      sf->partition_search_breakout_thr.dist = (1 << 25);
+      sf->partition_search_breakout_thr.rate = 200;
     } else {
       sf->max_intra_bsize = BLOCK_32X32;
       sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
       sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
-      sf->partition_search_breakout_dist_thr = (1 << 23);
-      sf->partition_search_breakout_rate_thr = 120;
+      sf->partition_search_breakout_thr.dist = (1 << 23);
+      sf->partition_search_breakout_thr.rate = 120;
     }
   }
 
@@ -129,37 +199,87 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 4) {
-    if (VPXMIN(cm->width, cm->height) >= 720) {
-      sf->partition_search_breakout_dist_thr = (1 << 26);
+    sf->partition_search_breakout_thr.rate = 300;
+    if (is_720p_or_larger) {
+      sf->partition_search_breakout_thr.dist = (1 << 26);
     } else {
-      sf->partition_search_breakout_dist_thr = (1 << 24);
+      sf->partition_search_breakout_thr.dist = (1 << 24);
     }
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
   }
+
+  if (speed >= 5) {
+    sf->partition_search_breakout_thr.rate = 500;
+  }
 }
 
 static double tx_dom_thresholds[6] = { 99.0, 14.0, 12.0, 8.0, 4.0, 0.0 };
 static double qopt_thresholds[6] = { 99.0, 12.0, 10.0, 4.0, 2.0, 0.0 };
 
-static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
-                                   SPEED_FEATURES *sf, int speed) {
+static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
+                                                         VP9_COMMON *cm,
+                                                         SPEED_FEATURES *sf,
+                                                         int speed) {
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int boosted = frame_is_boosted(cpi);
+  int i;
 
-  sf->partition_search_breakout_dist_thr = (1 << 20);
-  sf->partition_search_breakout_rate_thr = 80;
-  sf->tx_size_search_breakout = 1;
+  sf->adaptive_interp_filter_search = 1;
+  sf->adaptive_pred_interp_filter = 1;
   sf->adaptive_rd_thresh = 1;
+  sf->adaptive_rd_thresh_row_mt = 0;
   sf->allow_skip_recode = 1;
   sf->less_rectangular_check = 1;
-  sf->use_square_partition_only = !frame_is_boosted(cpi);
-  sf->use_square_only_threshold = BLOCK_16X16;
+  sf->mv.auto_mv_step_size = 1;
+  sf->mv.use_downsampled_sad = 1;
+  sf->prune_ref_frame_for_rect_partitions = 1;
+  sf->temporal_filter_search_method = NSTEP;
+  sf->tx_size_search_breakout = 1;
+  sf->use_square_partition_only = !boosted;
+  sf->early_term_interp_search_plane_rd = 1;
+  sf->cb_pred_filter_search = 1;
+  sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+                                     ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE
+                                     : DISABLE_TRELLIS_OPT;
+  sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0;
+
+  sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+  sf->comp_inter_joint_search_iter_level = 1;
+
+  // Reference masking is not supported in dynamic scaling mode.
+  sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC;
+
+  sf->rd_ml_partition.var_pruning = 1;
+  sf->rd_ml_partition.prune_rect_thresh[0] = -1;
+  sf->rd_ml_partition.prune_rect_thresh[1] = 350;
+  sf->rd_ml_partition.prune_rect_thresh[2] = 325;
+  sf->rd_ml_partition.prune_rect_thresh[3] = 250;
+
+  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+    sf->exhaustive_searches_thresh = (1 << 22);
+  } else {
+    sf->exhaustive_searches_thresh = INT_MAX;
+  }
+
+  for (i = 0; i < MAX_MESH_STEP; ++i) {
+    const int mesh_density_level = 0;
+    sf->mesh_patterns[i].range =
+        good_quality_mesh_patterns[mesh_density_level][i].range;
+    sf->mesh_patterns[i].interval =
+        good_quality_mesh_patterns[mesh_density_level][i].interval;
+  }
 
   if (speed >= 1) {
-    if (cpi->oxcf.pass == 2) {
+    sf->rd_ml_partition.var_pruning = !boosted;
+    sf->rd_ml_partition.prune_rect_thresh[1] = 225;
+    sf->rd_ml_partition.prune_rect_thresh[2] = 225;
+    sf->rd_ml_partition.prune_rect_thresh[3] = 225;
+
+    if (oxcf->pass == 2) {
       TWO_PASS *const twopass = &cpi->twopass;
       if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) ||
           vp9_internal_image_edge(cpi)) {
-        sf->use_square_partition_only = !frame_is_boosted(cpi);
+        sf->use_square_partition_only = !boosted;
       } else {
         sf->use_square_partition_only = !frame_is_intra_only(cm);
       }
@@ -169,49 +289,70 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
 
     sf->allow_txfm_domain_distortion = 1;
     sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5];
-    sf->allow_quant_coeff_opt = sf->optimize_coefficients;
-    sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5];
-
-    sf->use_square_only_threshold = BLOCK_4X4;
+    sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+                                       ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR
+                                       : DISABLE_TRELLIS_OPT;
+    sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5];
     sf->less_rectangular_check = 1;
-
     sf->use_rd_breakout = 1;
     sf->adaptive_motion_search = 1;
-    sf->mv.auto_mv_step_size = 1;
     sf->adaptive_rd_thresh = 2;
-    sf->mv.subpel_iters_per_step = 1;
-    sf->mode_skip_start = 10;
-    sf->adaptive_pred_interp_filter = 1;
+    sf->mv.subpel_search_level = 1;
+    if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
     sf->allow_acl = 0;
 
-    sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    if (cpi->oxcf.content != VP9E_CONTENT_FILM) {
+      sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+      sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    }
 
     sf->recode_tolerance_low = 15;
     sf->recode_tolerance_high = 30;
+
+    sf->exhaustive_searches_thresh =
+        (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23)
+                                                                : INT_MAX;
+    sf->use_accurate_subpel_search = USE_4_TAPS;
   }
 
   if (speed >= 2) {
-    sf->recode_loop = ALLOW_RECODE_KFARFGF;
+    sf->rd_ml_partition.var_pruning = 0;
+    if (oxcf->vbr_corpus_complexity)
+      sf->recode_loop = ALLOW_RECODE_FIRST;
+    else
+      sf->recode_loop = ALLOW_RECODE_KFARFGF;
+
     sf->tx_size_search_method =
         frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
 
-    // Reference masking is not supported in dynamic scaling mode.
-    sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
-
     sf->mode_search_skip_flags =
-        (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH |
-                                                FLAG_SKIP_INTRA_BESTINTER |
-                                                FLAG_SKIP_COMP_BESTINTRA |
-                                                FLAG_SKIP_INTRA_LOWVAR;
+        (cm->frame_type == KEY_FRAME)
+            ? 0
+            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
     sf->disable_filter_search_var_thresh = 100;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->comp_inter_joint_search_iter_level = 2;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
-    sf->allow_partition_search_skip = 1;
-    sf->recode_tolerance_low = 15;
     sf->recode_tolerance_high = 45;
+    sf->enhanced_full_pixel_motion_search = 0;
+    sf->prune_ref_frame_for_rect_partitions = 0;
+    sf->rd_ml_partition.prune_rect_thresh[1] = -1;
+    sf->rd_ml_partition.prune_rect_thresh[2] = -1;
+    sf->rd_ml_partition.prune_rect_thresh[3] = -1;
+    sf->mv.subpel_search_level = 0;
+
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+      for (i = 0; i < MAX_MESH_STEP; ++i) {
+        int mesh_density_level = 1;
+        sf->mesh_patterns[i].range =
+            good_quality_mesh_patterns[mesh_density_level][i].range;
+        sf->mesh_patterns[i].interval =
+            good_quality_mesh_patterns[mesh_density_level][i].interval;
+      }
+    }
+
+    sf->use_accurate_subpel_search = USE_2_TAPS;
   }
 
   if (speed >= 3) {
@@ -222,14 +363,23 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->adaptive_pred_interp_filter = 0;
     sf->adaptive_mode_search = 1;
     sf->cb_partition_search = !boosted;
-    sf->cb_pred_filter_search = 1;
+    sf->cb_pred_filter_search = 2;
     sf->alt_ref_search_fp = 1;
     sf->recode_loop = ALLOW_RECODE_KFMAXBW;
     sf->adaptive_rd_thresh = 3;
     sf->mode_skip_start = 6;
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
-    sf->adaptive_interp_filter_search = 1;
+
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+      for (i = 0; i < MAX_MESH_STEP; ++i) {
+        int mesh_density_level = 2;
+        sf->mesh_patterns[i].range =
+            good_quality_mesh_patterns[mesh_density_level][i].range;
+        sf->mesh_patterns[i].interval =
+            good_quality_mesh_patterns[mesh_density_level][i].interval;
+      }
+    }
   }
 
   if (speed >= 4) {
@@ -245,11 +395,9 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
     sf->use_fast_coef_costing = 1;
     sf->motion_field_mode_search = !boosted;
-    sf->partition_search_breakout_rate_thr = 300;
   }
 
   if (speed >= 5) {
-    int i;
     sf->optimize_coefficients = 0;
     sf->mv.search_method = HEX;
     sf->disable_filter_search_var_thresh = 500;
@@ -257,11 +405,11 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
       sf->intra_y_mode_mask[i] = INTRA_DC;
       sf->intra_uv_mode_mask[i] = INTRA_DC;
     }
-    sf->partition_search_breakout_rate_thr = 500;
     sf->mv.reduce_first_step_size = 1;
     sf->simple_model_rd_from_var = 1;
   }
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
                                                      SPEED_FEATURES *sf,
@@ -287,10 +435,11 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 
   if (speed >= 5) {
+    sf->partition_search_breakout_thr.rate = 200;
     if (VPXMIN(cm->width, cm->height) >= 720) {
-      sf->partition_search_breakout_dist_thr = (1 << 25);
+      sf->partition_search_breakout_thr.dist = (1 << 25);
     } else {
-      sf->partition_search_breakout_dist_thr = (1 << 23);
+      sf->partition_search_breakout_thr.dist = (1 << 23);
     }
   }
 
@@ -300,24 +449,44 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
   }
 }
 
-static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
-                                 vp9e_tune_content content) {
+static void set_rt_speed_feature_framesize_independent(
+    VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) {
   VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
   const int is_keyframe = cm->frame_type == KEY_FRAME;
   const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
+  sf->adaptive_rd_thresh_row_mt = 0;
   sf->use_fast_coef_costing = 1;
-  sf->allow_exhaustive_searches = 0;
   sf->exhaustive_searches_thresh = INT_MAX;
   sf->allow_acl = 0;
   sf->copy_partition_flag = 0;
+  sf->use_source_sad = 0;
+  sf->use_simple_block_yrd = 0;
+  sf->adapt_partition_source_sad = 0;
+  sf->use_altref_onepass = 0;
+  sf->use_compound_nonrd_pickmode = 0;
+  sf->nonrd_keyframe = 0;
+  sf->svc_use_lowres_part = 0;
+  sf->overshoot_detection_cbr_rt = NO_DETECTION;
+  sf->disable_16x16part_nonkey = 0;
+  sf->disable_golden_ref = 0;
+  sf->enable_tpl_model = 0;
+  sf->enhanced_full_pixel_motion_search = 0;
+  sf->use_accurate_subpel_search = USE_2_TAPS;
+  sf->nonrd_use_ml_partition = 0;
+  sf->variance_part_thresh_mult = 1;
+  sf->cb_pred_filter_search = 0;
+  sf->force_smooth_interpol = 0;
+  sf->rt_intra_dc_only_low_content = 0;
+  sf->mv.enable_adaptive_subpel_force_stop = 0;
 
   if (speed >= 1) {
     sf->allow_txfm_domain_distortion = 1;
     sf->tx_domain_thresh = 0.0;
-    sf->allow_quant_coeff_opt = 0;
-    sf->quant_opt_thresh = 0.0;
+    sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT;
+    sf->trellis_opt_tx_rd.thresh = 0.0;
     sf->use_square_partition_only = !frame_is_intra_only(cm);
     sf->less_rectangular_check = 1;
     sf->tx_size_search_method =
@@ -336,25 +505,24 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
 
   if (speed >= 2) {
     sf->mode_search_skip_flags =
-        (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH |
-                                                FLAG_SKIP_INTRA_BESTINTER |
-                                                FLAG_SKIP_COMP_BESTINTRA |
-                                                FLAG_SKIP_INTRA_LOWVAR;
+        (cm->frame_type == KEY_FRAME)
+            ? 0
+            : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+                  FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
     sf->adaptive_pred_interp_filter = 2;
 
     // Reference masking only enabled for 1 spatial layer, and if none of the
     // references have been scaled. The latter condition needs to be checked
     // for external or internal dynamic resize.
-    sf->reference_masking = (cpi->svc.number_spatial_layers == 1);
+    sf->reference_masking = (svc->number_spatial_layers == 1);
     if (sf->reference_masking == 1 &&
         (cpi->external_resize == 1 ||
          cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
       MV_REFERENCE_FRAME ref_frame;
-      static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                        VP9_ALT_FLAG };
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-        if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+        if (yv12 != NULL &&
+            (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
           const struct scale_factors *const scale_fac =
               &cm->frame_refs[ref_frame - 1].sf;
           if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0;
@@ -363,7 +531,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
     }
 
     sf->disable_filter_search_var_thresh = 50;
-    sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
+    sf->comp_inter_joint_search_iter_level = 2;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
     sf->adjust_partitioning_from_last_frame = 1;
@@ -378,7 +546,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
     sf->disable_filter_search_var_thresh = 100;
     sf->use_uv_intra_rd_estimate = 1;
     sf->skip_encode_sb = 1;
-    sf->mv.subpel_iters_per_step = 1;
+    sf->mv.subpel_search_level = 0;
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
     sf->allow_skip_recode = 0;
@@ -389,14 +557,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
 
   if (speed >= 4) {
     int i;
-    sf->last_partitioning_redo_frequency = 4;
-    sf->adaptive_rd_thresh = 5;
-    sf->use_fast_coef_costing = 0;
-    sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX;
-    sf->adjust_partitioning_from_last_frame =
-        cm->last_frame_type != cm->frame_type ||
-        (0 == (frames_since_key + 1) % sf->last_partitioning_redo_frequency);
-    sf->mv.subpel_force_stop = 1;
+    if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0)
+      sf->use_altref_onepass = 1;
+    sf->mv.subpel_force_stop = QUARTER_PEL;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
       sf->intra_uv_mode_mask[i] = INTRA_DC;
@@ -404,16 +567,23 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
     sf->frame_parameter_update = 0;
     sf->mv.search_method = FAST_HEX;
-
-    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
-    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
-    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+    sf->allow_skip_recode = 0;
     sf->max_intra_bsize = BLOCK_32X32;
-    sf->allow_skip_recode = 1;
+    sf->use_fast_coef_costing = 0;
+    sf->use_quant_fp = !is_keyframe;
+    sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+    sf->adaptive_rd_thresh = 2;
+    sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+    sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
+    sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
+    sf->partition_search_type = VAR_BASED_PARTITION;
   }
 
   if (speed >= 5) {
+    sf->use_altref_onepass = 0;
     sf->use_quant_fp = !is_keyframe;
     sf->auto_min_max_partition_size =
         is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
@@ -437,7 +607,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
     sf->adaptive_rd_thresh = 2;
     // This feature is only enabled when partition search is disabled.
     sf->reuse_inter_pred_sby = 1;
-    sf->partition_search_breakout_rate_thr = 200;
     sf->coeff_prob_appx_step = 4;
     sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
     sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
@@ -449,7 +618,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
       int i;
       if (content == VP9E_CONTENT_SCREEN) {
         for (i = 0; i < BLOCK_SIZES; ++i)
-          sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+          if (i >= BLOCK_32X32)
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+          else
+            sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
       } else {
         for (i = 0; i < BLOCK_SIZES; ++i)
           if (i > BLOCK_16X16)
@@ -463,80 +635,260 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
       sf->short_circuit_flat_blocks = 1;
     }
     if (cpi->oxcf.rc_mode == VPX_CBR &&
-        cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) {
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
       sf->limit_newmv_early_exit = 1;
-      sf->bias_golden = 1;
+      if (!cpi->use_svc) sf->bias_golden = 1;
     }
+    // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent
+    // increase in encoding time.
+    if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1;
+    if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
+        cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) {
+      if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc &&
+          cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+        sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ;
+      else
+        sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ;
+    }
+    if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
+        cm->width <= 1280 && cm->height <= 720) {
+      sf->use_altref_onepass = 1;
+      sf->use_compound_nonrd_pickmode = 1;
+    }
+    if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2;
+    if (!cpi->external_resize) sf->use_source_sad = 1;
   }
 
   if (speed >= 6) {
+    if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
+      sf->use_altref_onepass = 1;
+      sf->use_compound_nonrd_pickmode = 1;
+    }
     sf->partition_search_type = VAR_BASED_PARTITION;
-    // Turn on this to use non-RD key frame coding mode.
-    sf->use_nonrd_pick_mode = 1;
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
-    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
-        content != VP9E_CONTENT_SCREEN) {
+
+    if (sf->use_source_sad) {
+      sf->adapt_partition_source_sad = 1;
+      sf->adapt_partition_thresh =
+          (cm->width * cm->height <= 640 * 360) ? 40000 : 60000;
+      if (cpi->content_state_sb_fd == NULL &&
+          (!cpi->use_svc ||
+           svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
+        CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd,
+                        (uint8_t *)vpx_calloc(
+                            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                            sizeof(uint8_t)));
+      }
+    }
+    if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
       // Enable short circuit for low temporal variance.
       sf->short_circuit_low_temp_var = 1;
     }
-    if (cpi->use_svc) sf->base_mv_aggressive = 1;
+    if (svc->temporal_layer_id > 0) {
+      sf->adaptive_rd_thresh = 4;
+      sf->limit_newmv_early_exit = 0;
+      sf->base_mv_aggressive = 1;
+    }
+    if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
+        cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr)
+      sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ;
   }
 
   if (speed >= 7) {
+    sf->adapt_partition_source_sad = 0;
     sf->adaptive_rd_thresh = 3;
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
-    if (cpi->svc.number_temporal_layers > 2 &&
-        cpi->svc.temporal_layer_id == 0) {
+    // For SVC: use better mv search on base temporal layer, and only
+    // on base spatial layer if highest resolution is above 640x360.
+    if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 &&
+        (svc->spatial_layer_id == 0 ||
+         cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) {
       sf->mv.search_method = NSTEP;
       sf->mv.fullpel_search_step_param = 6;
     }
+    if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) {
+      sf->use_simple_block_yrd = 1;
+      if (svc->non_reference_frame)
+        sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
+    }
+    if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
+      sf->adaptive_rd_thresh_row_mt = 1;
+    // Enable partition copy. For SVC only enabled for top spatial resolution
+    // layer.
+    cpi->max_copied_frame = 0;
+    if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
+        !cpi->external_resize &&
+        (!cpi->use_svc ||
+         (svc->spatial_layer_id == svc->number_spatial_layers - 1 &&
+          !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) {
+      sf->copy_partition_flag = 1;
+      cpi->max_copied_frame = 2;
+      // The top temporal enhancement layer (for number of temporal layers > 1)
+      // are non-reference frames, so use large/max value for max_copied_frame.
+      if (svc->number_temporal_layers > 1 &&
+          svc->temporal_layer_id == svc->number_temporal_layers - 1)
+        cpi->max_copied_frame = 255;
+    }
+    // For SVC: enable use of lower resolution partition for higher resolution,
+    // only for 3 spatial layers and when config/top resolution is above VGA.
+    // Enable only for non-base temporal layer frames.
+    if (cpi->use_svc && svc->use_partition_reuse &&
+        svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 &&
+        cpi->oxcf.width * cpi->oxcf.height > 640 * 480)
+      sf->svc_use_lowres_part = 1;
+    // For SVC when golden is used as second temporal reference: to avoid
+    // encode time increase only use this feature on base temporal layer.
+    // (i.e remove golden flag from frame_flags for temporal_layer_id > 0).
+    if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
+        svc->temporal_layer_id > 0)
+      cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+    if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2;
   }
 
   if (speed >= 8) {
     sf->adaptive_rd_thresh = 4;
-    // Disabled for now until the threshold is tuned.
-    sf->copy_partition_flag = 0;
-    if (sf->copy_partition_flag) {
-      if (cpi->prev_partition == NULL) {
-        cpi->prev_partition = (BLOCK_SIZE *)vpx_calloc(
-            cm->mi_stride * cm->mi_rows, sizeof(BLOCK_SIZE));
-      }
-      if (cpi->prev_segment_id == NULL) {
-        cpi->prev_segment_id =
-            (int8_t *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(int8_t));
-      }
+    sf->skip_encode_sb = 1;
+    if (cpi->svc.number_spatial_layers > 1 && !cpi->svc.simulcast_mode)
+      sf->nonrd_keyframe = 0;
+    else
+      sf->nonrd_keyframe = 1;
+    if (!cpi->use_svc) cpi->max_copied_frame = 4;
+    if (cpi->row_mt && cpi->oxcf.max_threads > 1)
+      sf->adaptive_rd_thresh_row_mt = 1;
+    // Enable ML based partition for low res.
+    if (!frame_is_intra_only(cm) && cm->width * cm->height <= 352 * 288) {
+      sf->nonrd_use_ml_partition = 1;
     }
-    sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
-    if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
-    // Only keep INTRA_DC mode for speed 8.
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH)
+      sf->nonrd_use_ml_partition = 0;
+#endif
+    if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = HALF_PEL;
+    sf->rt_intra_dc_only_low_content = 1;
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+        content != VP9E_CONTENT_SCREEN) {
+      // More aggressive short circuit for speed 8.
+      sf->short_circuit_low_temp_var = 3;
+      // Use level 2 for noisey cases as there is a regression in some
+      // noisy clips with level 3.
+      if (cpi->noise_estimate.enabled && cm->width >= 1280 &&
+          cm->height >= 720) {
+        NOISE_LEVEL noise_level =
+            vp9_noise_estimate_extract_level(&cpi->noise_estimate);
+        if (noise_level >= kMedium) sf->short_circuit_low_temp_var = 2;
+      }
+      // Since the short_circuit_low_temp_var is used, reduce the
+      // adaptive_rd_thresh level.
+      if (cm->width * cm->height > 352 * 288)
+        sf->adaptive_rd_thresh = 1;
+      else
+        sf->adaptive_rd_thresh = 2;
+    }
+    sf->limit_newmv_early_exit = 0;
+    sf->use_simple_block_yrd = 1;
+    if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2;
+  }
+
+  if (speed >= 9) {
+    // Only keep INTRA_DC mode for speed 9.
     if (!is_keyframe) {
       int i = 0;
       for (i = 0; i < BLOCK_SIZES; ++i)
         sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
     }
-    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
-        content != VP9E_CONTENT_SCREEN) {
-      // More aggressive short circuit for speed 8.
-      sf->short_circuit_low_temp_var = 3;
-    }
-    sf->limit_newmv_early_exit = 0;
+    sf->cb_pred_filter_search = 2;
+    sf->mv.enable_adaptive_subpel_force_stop = 1;
+    sf->mv.adapt_subpel_force_stop.mv_thresh = 1;
+    sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL;
+    sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL;
+    // Disable partition blocks below 16x16, except for low-resolutions.
+    if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240)
+      sf->disable_16x16part_nonkey = 1;
+    // Allow for disabling GOLDEN reference, for CBR mode.
+    if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1;
+    if (cpi->rc.avg_frame_low_motion < 70) sf->default_interp_filter = BILINEAR;
+    if (cm->width * cm->height >= 640 * 360) sf->variance_part_thresh_mult = 2;
   }
+
+  // Disable split to 8x8 for low-resolution at very high Q.
+  // For variance partition (speed >= 6). Ignore the first few frames
+  // as avg_frame_qindex starts at max_q (worst_quality).
+  if (cm->frame_type != KEY_FRAME && cm->width * cm->height <= 320 * 240 &&
+      sf->partition_search_type == VAR_BASED_PARTITION &&
+      cpi->rc.avg_frame_qindex[INTER_FRAME] > 208 &&
+      cpi->common.current_video_frame > 8)
+    sf->disable_16x16part_nonkey = 1;
+
+  if (sf->nonrd_use_ml_partition)
+    sf->partition_search_type = ML_BASED_PARTITION;
+
+  if (sf->use_altref_onepass) {
+    if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) {
+      sf->partition_search_type = FIXED_PARTITION;
+      sf->always_this_block_size = BLOCK_64X64;
+    }
+    if (cpi->count_arf_frame_usage == NULL) {
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->count_arf_frame_usage,
+          (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                                sizeof(*cpi->count_arf_frame_usage)));
+    }
+    if (cpi->count_lastgolden_frame_usage == NULL)
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->count_lastgolden_frame_usage,
+          (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                                sizeof(*cpi->count_lastgolden_frame_usage)));
+  }
+  if (svc->previous_frame_is_intra_only) {
+    sf->partition_search_type = FIXED_PARTITION;
+    sf->always_this_block_size = BLOCK_64X64;
+  }
+  // Special case for screen content: increase motion search on base spatial
+  // layer when high motion is detected or previous SL0 frame was dropped.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 &&
+      (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) {
+    sf->mv.search_method = NSTEP;
+    // TODO(marpan/jianj): Tune this setting for screensharing. For now use
+    // small step_param for all spatial layers.
+    sf->mv.fullpel_search_step_param = 2;
+  }
+  // TODO(marpan): There is regression for aq-mode=3 speed <= 4, force it
+  // off for now.
+  if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    cpi->oxcf.aq_mode = 0;
+  // For all speeds for rt mode: if the deadline mode changed (was good/best
+  // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to
+  // avoid entering rd pickmode. This causes issues, such as: b/310663186.
+  if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)
+    sf->nonrd_keyframe = 1;
+
+  // TODO(marpan): Force this feature off always, for the issue: 366146260
+  // Remove this disabling when underlying issue is resolved.
+  sf->svc_use_lowres_part = 0;
 }
 
-void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
+void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   RD_OPT *const rd = &cpi->rd;
   int i;
 
-  if (oxcf->mode == REALTIME) {
-    set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
-  } else if (oxcf->mode == GOOD) {
-    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
-  }
+  // best quality defaults
+  // Some speed-up features even for best quality as minimal impact on quality.
+  sf->partition_search_breakout_thr.dist = (1 << 19);
+  sf->partition_search_breakout_thr.rate = 80;
+  sf->rd_ml_partition.search_early_termination = 0;
+  sf->rd_ml_partition.search_breakout = 0;
+
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+#if !CONFIG_REALTIME_ONLY
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+#endif
 
   if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
     sf->adaptive_pred_interp_filter = 0;
@@ -553,11 +905,22 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
       rd->thresh_mult_sub8x8[i] = INT_MAX;
     }
   }
+
+  // With row based multi-threading, the following speed features
+  // have to be disabled to guarantee that bitstreams encoded with single thread
+  // and multiple threads match.
+  // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since
+  // adaptive_rd_thresh is defined per-row for non-rd pickmode.
+  if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact &&
+      oxcf->max_threads > 1)
+    sf->adaptive_rd_thresh = 0;
 }
 
-void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
+void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
   SPEED_FEATURES *const sf = &cpi->sf;
+#if !CONFIG_REALTIME_ONLY
   VP9_COMMON *const cm = &cpi->common;
+#endif
   MACROBLOCK *const x = &cpi->td.mb;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   int i;
@@ -567,20 +930,24 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->mv.search_method = NSTEP;
   sf->recode_loop = ALLOW_RECODE_FIRST;
   sf->mv.subpel_search_method = SUBPEL_TREE;
-  sf->mv.subpel_iters_per_step = 2;
-  sf->mv.subpel_force_stop = 0;
+  sf->mv.subpel_search_level = 2;
+  sf->mv.subpel_force_stop = EIGHTH_PEL;
   sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
   sf->mv.reduce_first_step_size = 0;
   sf->coeff_prob_appx_step = 1;
   sf->mv.auto_mv_step_size = 0;
   sf->mv.fullpel_search_step_param = 6;
-  sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+  sf->mv.use_downsampled_sad = 0;
+  sf->comp_inter_joint_search_iter_level = 0;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
+  sf->enhanced_full_pixel_motion_search = 1;
   sf->adaptive_pred_interp_filter = 0;
   sf->adaptive_mode_search = 0;
+  sf->prune_single_mode_based_on_mv_diff_mode_rate = 0;
   sf->cb_pred_filter_search = 0;
+  sf->early_term_interp_search_plane_rd = 0;
   sf->cb_partition_search = 0;
   sf->motion_field_mode_search = 0;
   sf->alt_ref_search_fp = 0;
@@ -589,7 +956,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
-  sf->use_square_only_threshold = BLOCK_SIZES;
+  sf->use_square_only_thresh_high = BLOCK_SIZES;
+  sf->use_square_only_thresh_low = BLOCK_4X4;
   sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->rd_auto_partition_min_limit = BLOCK_4X4;
   sf->default_max_partition_size = BLOCK_64X64;
@@ -602,12 +970,16 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->max_delta_qindex = 0;
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
-  sf->allow_partition_search_skip = 0;
   sf->allow_txfm_domain_distortion = 0;
   sf->tx_domain_thresh = 99.0;
-  sf->allow_quant_coeff_opt = sf->optimize_coefficients;
-  sf->quant_opt_thresh = 99.0;
+  sf->trellis_opt_tx_rd.method =
+      sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT;
+  sf->trellis_opt_tx_rd.thresh = 99.0;
   sf->allow_acl = 1;
+  sf->enable_tpl_model = oxcf->enable_tpl_model;
+  sf->prune_ref_frame_for_rect_partitions = 0;
+  sf->temporal_filter_search_method = MESH;
+  sf->allow_skip_txfm_ac_dc = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -629,7 +1001,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
   sf->always_this_block_size = BLOCK_16X16;
-  sf->search_type_check_frequency = 50;
   sf->encode_breakout_thresh = 0;
   // Recode loop tolerance %.
   sf->recode_tolerance_low = 12;
@@ -641,49 +1012,41 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->limit_newmv_early_exit = 0;
   sf->bias_golden = 0;
   sf->base_mv_aggressive = 0;
+  sf->rd_ml_partition.prune_rect_thresh[0] = -1;
+  sf->rd_ml_partition.prune_rect_thresh[1] = -1;
+  sf->rd_ml_partition.prune_rect_thresh[2] = -1;
+  sf->rd_ml_partition.prune_rect_thresh[3] = -1;
+  sf->rd_ml_partition.var_pruning = 0;
+  sf->use_accurate_subpel_search = USE_8_TAPS;
 
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->adaptive_rd_thresh = 1;
   sf->tx_size_search_breakout = 1;
-  sf->partition_search_breakout_dist_thr = (1 << 19);
-  sf->partition_search_breakout_rate_thr = 80;
+  sf->tx_size_search_depth = 2;
 
-  if (oxcf->mode == REALTIME)
-    set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
-  else if (oxcf->mode == GOOD)
-    set_good_speed_feature(cpi, cm, sf, oxcf->speed);
-
-  cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = vp9_diamond_search_sad;
-
-  sf->allow_exhaustive_searches = 1;
-  if (oxcf->mode == BEST) {
-    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
-      sf->exhaustive_searches_thresh = (1 << 20);
-    else
-      sf->exhaustive_searches_thresh = (1 << 21);
-    sf->max_exaustive_pct = 100;
+  sf->exhaustive_searches_thresh =
+      (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
+                                                              : INT_MAX;
+  {
+    const int mesh_density_level =
+        (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
     for (i = 0; i < MAX_MESH_STEP; ++i) {
-      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
-      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
-    }
-  } else {
-    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
-    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
-      sf->exhaustive_searches_thresh = (1 << 22);
-    else
-      sf->exhaustive_searches_thresh = (1 << 23);
-    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
-    if (speed > 0)
-      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
-
-    for (i = 0; i < MAX_MESH_STEP; ++i) {
-      sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].range =
+          best_quality_mesh_pattern[mesh_density_level][i].range;
       sf->mesh_patterns[i].interval =
-          good_quality_mesh_patterns[speed][i].interval;
+          best_quality_mesh_pattern[mesh_density_level][i].interval;
     }
   }
 
+  if (oxcf->mode == REALTIME)
+    set_rt_speed_feature_framesize_independent(cpi, sf, speed, oxcf->content);
+#if !CONFIG_REALTIME_ONLY
+  else if (oxcf->mode == GOOD)
+    set_good_speed_feature_framesize_independent(cpi, cm, sf, speed);
+#endif
+
+  cpi->diamond_search_sad = vp9_diamond_search_sad;
+
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
   if (oxcf->pass == 1) sf->optimize_coefficients = 0;
@@ -694,7 +1057,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
     sf->optimize_coefficients = 0;
   }
 
-  if (sf->mv.subpel_force_stop == 3) {
+  if (sf->mv.subpel_force_stop == FULL_PEL) {
     // Whole pel only
     cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree;
   } else if (sf->mv.subpel_search_method == SUBPEL_TREE) {
@@ -707,6 +1070,12 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
   }
 
+  // This is only used in motion vector unit test.
+  if (cpi->oxcf.motion_vector_unit_test == 1)
+    cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv;
+  else if (cpi->oxcf.motion_vector_unit_test == 2)
+    cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv;
+
   x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
 
   x->min_partition_size = sf->default_min_partition_size;
@@ -715,4 +1084,13 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   if (!cpi->oxcf.frame_periodic_boost) {
     sf->max_delta_qindex = 0;
   }
+
+  // With row based multi-threading, the following speed features
+  // have to be disabled to guarantee that bitstreams encoded with single thread
+  // and multiple threads match.
+  // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since
+  // adaptive_rd_thresh is defined per-row for non-rd pickmode.
+  if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact &&
+      oxcf->max_threads > 1)
+    sf->adaptive_rd_thresh = 0;
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
index 478684d059..92a7df767f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_
-#define VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_
 
 #include "vp9/common/vp9_enums.h"
 
@@ -57,7 +57,8 @@ typedef enum {
   BIGDIA = 3,
   SQUARE = 4,
   FAST_HEX = 5,
-  FAST_DIAMOND = 6
+  FAST_DIAMOND = 6,
+  MESH = 7
 } SEARCH_METHODS;
 
 typedef enum {
@@ -135,20 +136,20 @@ typedef enum {
 } INTERP_FILTER_MASK;
 
 typedef enum {
-  // Search partitions using RD/NONRD criterion
+  // Search partitions using RD/NONRD criterion.
   SEARCH_PARTITION,
 
-  // Always use a fixed size partition
+  // Always use a fixed size partition.
   FIXED_PARTITION,
 
   REFERENCE_PARTITION,
 
   // Use an arbitrary partitioning scheme based on source variance within
-  // a 64X64 SB
+  // a 64X64 SB.
   VAR_BASED_PARTITION,
 
-  // Use non-fixed partitions based on source variance
-  SOURCE_VAR_BASED_PARTITION
+  // Make partition decisions with machine learning models.
+  ML_BASED_PARTITION
 } PARTITION_SEARCH_TYPE;
 
 typedef enum {
@@ -161,6 +162,19 @@ typedef enum {
   ONE_LOOP_REDUCED = 1
 } FAST_COEFF_UPDATE;
 
+typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP;
+
+typedef struct ADAPT_SUBPEL_FORCE_STOP {
+  // Threshold for full pixel motion vector;
+  int mv_thresh;
+
+  // subpel_force_stop if full pixel MV is below the threshold.
+  SUBPEL_FORCE_STOP force_stop_below;
+
+  // subpel_force_stop if full pixel MV is equal to or above the threshold.
+  SUBPEL_FORCE_STOP force_stop_above;
+} ADAPT_SUBPEL_FORCE_STOP;
+
 typedef struct MV_SPEED_FEATURES {
   // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
   SEARCH_METHODS search_method;
@@ -179,20 +193,31 @@ typedef struct MV_SPEED_FEATURES {
   // the same process. Along the way it skips many diagonals.
   SUBPEL_SEARCH_METHODS subpel_search_method;
 
-  // Maximum number of steps in logarithmic subpel search before giving up.
-  int subpel_iters_per_step;
+  // Subpel MV search level. Can take values 0 - 2. Higher values mean more
+  // extensive subpel search.
+  int subpel_search_level;
 
-  // Control when to stop subpel search:
-  // 0: Full subpel search.
-  // 1: Stop at quarter pixel.
-  // 2: Stop at half pixel.
-  // 3: Stop at full pixel.
-  int subpel_force_stop;
+  // When to stop subpel motion search.
+  SUBPEL_FORCE_STOP subpel_force_stop;
+
+  // If it's enabled, different subpel_force_stop will be used for different MV.
+  int enable_adaptive_subpel_force_stop;
+
+  ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop;
 
   // This variable sets the step_param used in full pel motion search.
   int fullpel_search_step_param;
+
+  // Whether to downsample the rows in sad calculation during motion search.
+  // This is only active when there are at least 8 rows.
+  int use_downsampled_sad;
 } MV_SPEED_FEATURES;
 
+typedef struct PARTITION_SEARCH_BREAKOUT_THR {
+  int64_t dist;
+  int rate;
+} PARTITION_SEARCH_BREAKOUT_THR;
+
 #define MAX_MESH_STEP 4
 
 typedef struct MESH_PATTERN {
@@ -200,6 +225,46 @@ typedef struct MESH_PATTERN {
   int interval;
 } MESH_PATTERN;
 
+typedef enum {
+  // No reaction to rate control on a detected slide/scene change.
+  NO_DETECTION = 0,
+
+  // Set to larger Q (max_q set by user) based only on the
+  // detected slide/scene change and current/past Q.
+  FAST_DETECTION_MAXQ = 1,
+
+  // Based on (first pass) encoded frame, if large frame size is detected
+  // then set to higher Q for the second re-encode. This involves 2 pass
+  // encoding on slide change, so slower than 1, but more accurate for
+  // detecting overshoot.
+  RE_ENCODE_MAXQ = 2
+} OVERSHOOT_DETECTION_CBR_RT;
+
+typedef enum {
+  USE_2_TAPS = 0,
+  USE_4_TAPS,
+  USE_8_TAPS,
+  USE_8_TAPS_SHARP,
+} SUBPEL_SEARCH_TYPE;
+
+typedef enum {
+  // Disable trellis coefficient optimization
+  DISABLE_TRELLIS_OPT,
+  // Enable trellis coefficient optimization
+  ENABLE_TRELLIS_OPT,
+  // Enable trellis coefficient optimization based on source variance of the
+  // prediction block during transform RD
+  ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR,
+  // Enable trellis coefficient optimization based on residual mse of the
+  // transform block during transform RD
+  ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE,
+} ENABLE_TRELLIS_OPT_METHOD;
+
+typedef struct TRELLIS_OPT_CONTROL {
+  ENABLE_TRELLIS_OPT_METHOD method;
+  double thresh;
+} TRELLIS_OPT_CONTROL;
+
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
 
@@ -218,16 +283,30 @@ typedef struct SPEED_FEATURES {
   // adds overhead.
   int static_segmentation;
 
-  // If 1 we iterate finding a best reference for 2 ref frames together - via
-  // a log search that iterates 4 times (check around mv for last for best
-  // error of combined predictor then check around mv for alt). If 0 we
-  // we just use the best motion vector found for each frame by itself.
-  BLOCK_SIZE comp_inter_joint_search_thresh;
+  // The best compound predictor is found using an iterative log search process
+  // that searches for best ref0 mv using error of combined predictor and then
+  // searches for best ref1 mv. This sf determines the number of iterations of
+  // this process based on block size. The sf becomes more aggressive from level
+  // 0 to 2. The following table indicates the number of iterations w.r.t bsize:
+  //  -----------------------------------------------
+  // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 |
+  // |    0     |     4     |      4       |    4    |
+  // |    1     |     0     |      2       |    4    |
+  // |    2     |     0     |      0       |    0    |
+  //  -----------------------------------------------
+  // Here, 0 iterations indicate using the best single motion vector selected
+  // for each ref frame without any iterative refinement.
+  int comp_inter_joint_search_iter_level;
 
   // This variable is used to cap the maximum number of times we skip testing a
   // mode to be evaluated. A high value means we will be faster.
+  // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0).
   int adaptive_rd_thresh;
 
+  // Flag to use adaptive_rd_thresh when row-mt is enabled, only for non-rd
+  // pickmode.
+  int adaptive_rd_thresh_row_mt;
+
   // Enables skipping the reconstruction step (idct, recon) in the
   // intermediate steps assuming the last frame didn't have too many intra
   // blocks and the q is less than a threshold.
@@ -241,13 +320,16 @@ typedef struct SPEED_FEATURES {
   int coeff_prob_appx_step;
 
   // Enable uniform quantizer followed by trellis coefficient optimization
-  int allow_quant_coeff_opt;
-  double quant_opt_thresh;
+  // during transform RD
+  TRELLIS_OPT_CONTROL trellis_opt_tx_rd;
 
   // Enable asymptotic closed-loop encoding decision for key frame and
   // alternate reference frames.
   int allow_acl;
 
+  // Temporal dependency model based encoding mode optimization
+  int enable_tpl_model;
+
   // Use transform domain distortion. Use pixel domain distortion in speed 0
   // and certain situations in higher speed to improve the RD model precision.
   int allow_txfm_domain_distortion;
@@ -262,6 +344,9 @@ typedef struct SPEED_FEATURES {
   // for intra and model coefs for the rest.
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
 
+  // How many levels of tx size to search, starting from the largest.
+  int tx_size_search_depth;
+
   // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
   // precise but significantly faster than the non lp version.
   int use_lp32x32fdct;
@@ -276,16 +361,21 @@ typedef struct SPEED_FEATURES {
 
   PARTITION_SEARCH_TYPE partition_search_type;
 
-  // Used if partition_search_type = FIXED_SIZE_PARTITION
+  // Used if partition_search_type = FIXED_PARTITION
   BLOCK_SIZE always_this_block_size;
 
   // Skip rectangular partition test when partition type none gives better
   // rd than partition type split.
   int less_rectangular_check;
 
-  // Disable testing non square partitions. (eg 16x32)
+  // Disable testing non square partitions(eg 16x32) for block sizes larger than
+  // use_square_only_thresh_high or smaller than use_square_only_thresh_low.
   int use_square_partition_only;
-  BLOCK_SIZE use_square_only_threshold;
+  BLOCK_SIZE use_square_only_thresh_high;
+  BLOCK_SIZE use_square_only_thresh_low;
+
+  // Prune reference frames for rectangular partitions.
+  int prune_ref_frame_for_rect_partitions;
 
   // Sets min and max partition sizes for this 64x64 region based on the
   // same 64x64 in last encoded frame, and the left and above neighbor.
@@ -317,15 +407,12 @@ typedef struct SPEED_FEATURES {
   // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
 
-  // Flag for allowing some use of exhaustive searches;
-  int allow_exhaustive_searches;
+  // Do extra full pixel motion search to obtain better motion vector.
+  int enhanced_full_pixel_motion_search;
 
   // Threshold for allowing exhaistive motion search.
   int exhaustive_searches_thresh;
 
-  // Maximum number of exhaustive searches for a frame.
-  int max_exaustive_pct;
-
   // Pattern to be used for any exhaustive mesh searches.
   MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
 
@@ -340,9 +427,21 @@ typedef struct SPEED_FEATURES {
   // Adaptive prediction mode search
   int adaptive_mode_search;
 
-  // Chessboard pattern prediction filter type search
+  // Prune NEAREST and ZEROMV single reference modes based on motion vector
+  // difference and mode rate
+  int prune_single_mode_based_on_mv_diff_mode_rate;
+
+  // Chessboard pattern prediction for interp filter. Aggressiveness increases
+  // with levels.
+  // 0: disable
+  // 1: cb pattern in eval when filter is not switchable
+  // 2: cb pattern prediction for filter search
   int cb_pred_filter_search;
 
+  // This variable enables an early termination of interpolation filter eval
+  // based on the current rd cost after processing each plane
+  int early_term_interp_search_plane_rd;
+
   int cb_partition_search;
 
   int motion_field_mode_search;
@@ -415,10 +514,6 @@ typedef struct SPEED_FEATURES {
   // TODO(aconverse): Fold this into one of the other many mode skips
   BLOCK_SIZE max_intra_bsize;
 
-  // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
-  // FIXED_PARTITION search type should be used.
-  int search_type_check_frequency;
-
   // When partition is pre-set, the inter prediction result from pick_inter_mode
   // can be reused in final block encoding process. It is enabled only for real-
   // time mode speed 6.
@@ -442,11 +537,29 @@ typedef struct SPEED_FEATURES {
   INTERP_FILTER_MASK interp_filter_search_mask;
 
   // Partition search early breakout thresholds.
-  int64_t partition_search_breakout_dist_thr;
-  int partition_search_breakout_rate_thr;
+  PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr;
 
-  // Allow skipping partition search for still image frame
-  int allow_partition_search_skip;
+  struct {
+    // Use ML-based partition search early breakout.
+    int search_breakout;
+    // Higher values mean more aggressiveness for partition search breakout that
+    // results in better encoding  speed but worse compression performance.
+    float search_breakout_thresh[3];
+
+    // Machine-learning based partition search early termination
+    int search_early_termination;
+
+    // Machine-learning based partition search pruning using prediction residue
+    // variance.
+    int var_pruning;
+
+    // Threshold values used for ML based rectangular partition search pruning.
+    // If < 0, the feature is turned off.
+    // Higher values mean more aggressiveness to skip rectangular partition
+    // search that results in better encoding speed but worse coding
+    // performance.
+    int prune_rect_thresh[4];
+  } rd_ml_partition;
 
   // Fast approximation of vp9_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
@@ -456,12 +569,13 @@ typedef struct SPEED_FEATURES {
   int short_circuit_flat_blocks;
 
   // Skip a number of expensive mode evaluations for blocks with very low
-  // temporal variance.
+  // temporal variance. If the low temporal variance flag is set for a block,
+  // do the following:
   // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32.
   // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
   // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
   // 32x16.
-  // 3: Same as (2), but also skip golden zeromv for low res.
+  // 3: Same as (2), but also skip golden zeromv.
   int short_circuit_low_temp_var;
 
   // Limits the rd-threshold update for early exit for the newmv-last mode,
@@ -477,15 +591,77 @@ typedef struct SPEED_FEATURES {
 
   // Global flag to enable partition copy from the previous frame.
   int copy_partition_flag;
+
+  // Compute the source sad for every superblock of the frame,
+  // prior to encoding the frame, to be used to bypass some encoder decisions.
+  int use_source_sad;
+
+  int use_simple_block_yrd;
+
+  // If source sad of superblock is high (> adapt_partition_thresh), will switch
+  // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition
+  // based on the nonrd-pickmode).
+  int adapt_partition_source_sad;
+  int adapt_partition_thresh;
+
+  // Enable use of alt-refs in 1 pass VBR.
+  int use_altref_onepass;
+
+  // Enable use of compound prediction, for nonrd_pickmode with nonzero lag.
+  int use_compound_nonrd_pickmode;
+
+  // Always use nonrd_pick_intra for all block sizes on keyframes.
+  int nonrd_keyframe;
+
+  // For SVC: enables use of partition from lower spatial resolution.
+  int svc_use_lowres_part;
+
+  // Flag to indicate process for handling overshoot on slide/scene change,
+  // for real-time CBR mode.
+  OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt;
+
+  // Disable partitioning of 16x16 blocks.
+  int disable_16x16part_nonkey;
+
+  // Allow for disabling golden reference.
+  int disable_golden_ref;
+
+  // Allow sub-pixel search to use interpolation filters with different taps in
+  // order to achieve accurate motion search result.
+  SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
+
+  // Search method used by temporal filtering in full_pixel_motion_search.
+  SEARCH_METHODS temporal_filter_search_method;
+
+  // Use machine learning based partition search.
+  int nonrd_use_ml_partition;
+
+  // Multiplier for base threshold for variance partitioning.
+  int variance_part_thresh_mult;
+
+  // Force subpel motion filter to always use SMOOTH_FILTER.
+  int force_smooth_interpol;
+
+  // For real-time mode: force DC only under intra search when content
+  // does not have high souce SAD.
+  int rt_intra_dc_only_low_content;
+
+  // The encoder has a feature that skips forward transform and quantization
+  // based on a model rd estimation to reduce encoding time.
+  // However, this feature is dangerous since it could lead to bad perceptual
+  // quality. This flag is added to guard the feature.
+  int allow_skip_txfm_ac_dc;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
 
-void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi);
-void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi,
+                                                  int speed);
+void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi,
+                                                int speed);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#endif  // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c
index e8212ce05e..2e1810b0ee 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c
@@ -66,11 +66,13 @@ static int remap_prob(int v, int m) {
   };
   v--;
   m--;
+  assert(m >= 0);
   if ((m << 1) <= MAX_PROB)
     i = recenter_nonneg(v, m) - 1;
   else
     i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
 
+  assert(i >= 0 && (size_t)i < sizeof(map_table));
   i = map_table[i];
   return i;
 }
@@ -113,19 +115,20 @@ void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
   encode_term_subexp(w, delp);
 }
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  int bestsavings = 0;
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd) {
+  const int64_t old_b = cost_branch256(ct, oldp);
+  int64_t bestsavings = 0;
   vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
   const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
 
   if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
     for (newp = *bestp; newp != oldp; newp += step) {
-      const int new_b = cost_branch256(ct, newp);
-      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
-      const int savings = old_b - new_b - update_b;
+      const int64_t new_b = cost_branch256(ct, newp);
+      const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      const int64_t savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
         bestnewp = newp;
@@ -136,15 +139,15 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
   return bestsavings;
 }
 
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize) {
-  int i, old_b, new_b, update_b, savings, bestsavings;
-  int newp;
-  const int step_sign = *bestp > oldp ? -1 : 1;
-  const int step = stepsize * step_sign;
-  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize) {
+  int64_t i, old_b, new_b, update_b, savings, bestsavings;
+  int64_t newp;
+  const int64_t step_sign = *bestp > oldp ? -1 : 1;
+  const int64_t step = stepsize * step_sign;
+  const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
   const vpx_prob *newplist, *oldplist;
   vpx_prob bestnewp;
   oldplist = vp9_pareto8_full[oldp - 1];
@@ -161,14 +164,14 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
     for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
       if (newp < 1 || newp > 255) continue;
       newplist = vp9_pareto8_full[newp - 1];
-      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+      new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp);
       for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
         new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
-      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost;
       savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
-        bestnewp = newp;
+        bestnewp = (vpx_prob)newp;
       }
     }
   }
@@ -181,7 +184,7 @@ void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
   const vpx_prob upd = DIFF_UPDATE_PROB;
   vpx_prob newp = get_binary_prob(ct[0], ct[1]);
-  const int savings =
+  const int64_t savings =
       vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
   assert(newp >= 1);
   if (savings > 0) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h
index 26c89e2ea7..2d016d24c5 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_SUBEXP_H_
-#define VP9_ENCODER_VP9_SUBEXP_H_
+#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_
+#define VPX_VP9_ENCODER_VP9_SUBEXP_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -25,16 +25,17 @@ void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp,
 void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd);
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd);
 
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize);
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_SUBEXP_H_
+#endif  // VPX_VP9_ENCODER_VP9_SUBEXP_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
index 1d892dc148..6e9405e422 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -19,6 +19,14 @@
 #define SMALL_FRAME_WIDTH 32
 #define SMALL_FRAME_HEIGHT 16
 
+static void swap_ptr(void *a, void *b) {
+  void **a_p = (void **)a;
+  void **b_p = (void **)b;
+  void *c = *a_p;
+  *a_p = *b_p;
+  *b_p = c;
+}
+
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -29,20 +37,53 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
 
   svc->spatial_layer_id = 0;
   svc->temporal_layer_id = 0;
-  svc->first_spatial_layer_to_encode = 0;
-  svc->rc_drop_superframe = 0;
   svc->force_zero_mode_spatial_ref = 0;
   svc->use_base_mv = 0;
+  svc->use_partition_reuse = 0;
+  svc->use_gf_temporal_ref = 1;
+  svc->use_gf_temporal_ref_current_layer = 0;
   svc->scaled_temp_is_alloc = 0;
   svc->scaled_one_half = 0;
   svc->current_superframe = 0;
-  for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
-  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
-    cpi->svc.ext_frame_flags[sl] = 0;
-    cpi->svc.ext_lst_fb_idx[sl] = 0;
-    cpi->svc.ext_gld_fb_idx[sl] = 1;
-    cpi->svc.ext_alt_fb_idx[sl] = 2;
+  svc->non_reference_frame = 0;
+  svc->skip_enhancement_layer = 0;
+  svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON;
+  svc->framedrop_mode = CONSTRAINED_LAYER_DROP;
+  svc->set_intra_only_frame = 0;
+  svc->previous_frame_is_intra_only = 0;
+  svc->superframe_has_layer_sync = 0;
+  svc->use_set_ref_frame_config = 0;
+  svc->num_encoded_top_layer = 0;
+  svc->simulcast_mode = 0;
+  svc->single_layer_svc = 0;
+  svc->resize_set = 0;
+
+  for (i = 0; i < REF_FRAMES; ++i) {
+    svc->fb_idx_spatial_layer_id[i] = 0xff;
+    svc->fb_idx_temporal_layer_id[i] = 0xff;
+    svc->fb_idx_base[i] = 0;
   }
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    svc->last_layer_dropped[sl] = 0;
+    svc->drop_spatial_layer[sl] = 0;
+    svc->ext_frame_flags[sl] = 0;
+    svc->lst_fb_idx[sl] = 0;
+    svc->gld_fb_idx[sl] = 1;
+    svc->alt_fb_idx[sl] = 2;
+    svc->downsample_filter_type[sl] = BILINEAR;
+    svc->downsample_filter_phase[sl] = 8;  // Set to 8 for averaging filter.
+    svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark;
+    svc->fb_idx_upd_tl0[sl] = -1;
+    svc->drop_count[sl] = 0;
+    svc->spatial_layer_sync[sl] = 0;
+    svc->force_drop_constrained_from_above[sl] = 0;
+  }
+  svc->max_consec_drop = INT_MAX;
+
+  svc->buffer_gf_temporal_ref[1].idx = 7;
+  svc->buffer_gf_temporal_ref[0].idx = 6;
+  svc->buffer_gf_temporal_ref[1].is_used = 0;
+  svc->buffer_gf_temporal_ref[0].is_used = 0;
 
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH,
@@ -66,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
       int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       RATE_CONTROL *const lrc = &lc->rc;
-      int i;
       lc->current_video_frame_in_layer = 0;
       lc->layer_size = 0;
       lc->frames_from_key_frame = 0;
@@ -80,6 +120,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
       lrc->ni_frames = 0;
       lrc->decimation_count = 0;
       lrc->decimation_factor = 0;
+      lrc->worst_quality = oxcf->worst_allowed_q;
+      lrc->best_quality = oxcf->best_allowed_q;
 
       for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
         lrc->rate_correction_factors[i] = 1.0;
@@ -118,17 +160,20 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
         size_t consec_zero_mv_size;
         VP9_COMMON *const cm = &cpi->common;
         lc->sb_index = 0;
-        CHECK_MEM_ERROR(cm, lc->map,
+        lc->actual_num_seg1_blocks = 0;
+        lc->actual_num_seg2_blocks = 0;
+        lc->counter_encode_maxq_scene_change = 0;
+        CHECK_MEM_ERROR(&cm->error, lc->map,
                         vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
         last_coded_q_map_size =
             mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+        CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
                         vpx_malloc(last_coded_q_map_size));
         assert(MAXQ <= 255);
         memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
         consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
-        CHECK_MEM_ERROR(cm, lc->consec_zero_mv,
+        CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
                         vpx_malloc(consec_zero_mv_size));
         memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
       }
@@ -149,6 +194,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
   const RATE_CONTROL *const rc = &cpi->rc;
   int sl, tl, layer = 0, spatial_layer_target;
   float bitrate_alloc = 1.0;
+  int num_spatial_layers_nonzero_rate = 0;
+
+  cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
 
   if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
     for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
@@ -171,18 +219,21 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
         RATE_CONTROL *const lrc = &lc->rc;
 
         lc->spatial_layer_target_bandwidth = spatial_layer_target;
-        bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target;
+        if (target_bandwidth != 0) {
+          bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+        }
         lrc->starting_buffer_level =
-            (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+            (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5);
         lrc->optimal_buffer_level =
-            (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+            (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5);
         lrc->maximum_buffer_size =
-            (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+            (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5);
         lrc->bits_off_target =
             VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
         lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
         lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
-        lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+        lrc->avg_frame_bandwidth = saturate_cast_double_to_int(
+            round(lc->target_bandwidth / lc->framerate));
         lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
         lrc->worst_quality = rc->worst_quality;
         lrc->best_quality = rc->best_quality;
@@ -203,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
 
       lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
 
-      bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      if (target_bandwidth != 0) {
+        bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      }
       // Update buffer-related quantities.
       lrc->starting_buffer_level =
           (int64_t)(rc->starting_buffer_level * bitrate_alloc);
@@ -220,17 +273,29 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
       } else {
         lc->framerate = cpi->framerate;
       }
-      lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+      lrc->avg_frame_bandwidth = saturate_cast_double_to_int(
+          round(lc->target_bandwidth / lc->framerate));
       lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
       // Update qp-related quantities.
       lrc->worst_quality = rc->worst_quality;
       lrc->best_quality = rc->best_quality;
     }
   }
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    // Check bitrate of spatia layer.
+    layer = LAYER_IDS_TO_IDX(sl, oxcf->ts_number_layers - 1,
+                             oxcf->ts_number_layers);
+    if (oxcf->layer_target_bitrate[layer] > 0)
+      num_spatial_layers_nonzero_rate += 1;
+  }
+  if (num_spatial_layers_nonzero_rate == 1)
+    svc->single_layer_svc = 1;
+  else
+    svc->single_layer_svc = 0;
 }
 
 static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
-  if (is_one_pass_cbr_svc(cpi))
+  if (is_one_pass_svc(cpi))
     return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
                                        cpi->svc.number_temporal_layers +
                                    cpi->svc.temporal_layer_id];
@@ -251,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
   const int tl = svc->temporal_layer_id;
 
   lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
-  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+  lrc->avg_frame_bandwidth =
+      saturate_cast_double_to_int(round(lc->target_bandwidth / lc->framerate));
   lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
   // Update the average layer frame size (non-cumulative per-frame-bw).
   if (tl == 0) {
@@ -262,8 +328,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
     const int prev_layer_target_bandwidth =
         oxcf->layer_target_bitrate[st_idx - 1];
     lc->avg_frame_size =
-        (int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
-              (lc->framerate - prev_layer_framerate));
+        (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+                   (lc->framerate - prev_layer_framerate));
   }
 }
 
@@ -273,12 +339,14 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
   RATE_CONTROL *const lrc = &lc->rc;
 
   lc->framerate = framerate;
-  lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
-  lrc->min_frame_bandwidth =
-      (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
-  lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
-                                    oxcf->two_pass_vbrmax_section) /
-                                   100);
+  lrc->avg_frame_bandwidth =
+      saturate_cast_double_to_int(round(lc->target_bandwidth / lc->framerate));
+  const int64_t vbr_min_bits =
+      (int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100;
+  lrc->min_frame_bandwidth = (int)VPXMIN(vbr_min_bits, INT_MAX);
+  const int64_t vbr_max_bits =
+      (int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section / 100;
+  lrc->max_frame_bandwidth = (int)VPXMIN(vbr_max_bits, INT_MAX);
   vp9_rc_set_gf_interval_range(cpi, lrc);
 }
 
@@ -286,6 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   LAYER_CONTEXT *const lc = get_layer_context(cpi);
   const int old_frame_since_key = cpi->rc.frames_since_key;
   const int old_frame_to_key = cpi->rc.frames_to_key;
+  const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop;
 
   cpi->rc = lc->rc;
   cpi->twopass = lc->twopass;
@@ -293,32 +362,30 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   cpi->alt_ref_source = lc->alt_ref_source;
   // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode
   // does not use speed = 0).
-  if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) {
+  if (is_one_pass_svc(cpi) && lc->speed > 0) {
     cpi->oxcf.speed = lc->speed;
   }
+  cpi->loopfilter_ctrl = lc->loopfilter_ctrl;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   if (cpi->svc.number_temporal_layers > 1 ||
-      (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) {
+      cpi->svc.number_spatial_layers > 1) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
-
+  cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop;
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
       cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-    signed char *temp = cr->map;
-    uint8_t *temp2 = cr->last_coded_q_map;
-    uint8_t *temp3 = cpi->consec_zero_mv;
-    cr->map = lc->map;
-    lc->map = temp;
-    cr->last_coded_q_map = lc->last_coded_q_map;
-    lc->last_coded_q_map = temp2;
-    cpi->consec_zero_mv = lc->consec_zero_mv;
-    lc->consec_zero_mv = temp3;
+    swap_ptr(&cr->map, &lc->map);
+    swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
+    swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv);
     cr->sb_index = lc->sb_index;
+    cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
+    cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+    cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change;
   }
 }
 
@@ -330,6 +397,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
   lc->twopass = cpi->twopass;
   lc->target_bandwidth = (int)oxcf->target_bandwidth;
   lc->alt_ref_source = cpi->alt_ref_source;
+  lc->frame_qp = cpi->common.base_qindex;
+  lc->MBs = cpi->common.MBs;
 
   // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
   // for the base temporal layer.
@@ -346,9 +415,16 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
     lc->consec_zero_mv = cpi->consec_zero_mv;
     cpi->consec_zero_mv = temp3;
     lc->sb_index = cr->sb_index;
+    lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
+    lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+    lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+    lc->qindex_delta[0] = cr->qindex_delta[0];
+    lc->qindex_delta[1] = cr->qindex_delta[1];
+    lc->qindex_delta[2] = cr->qindex_delta[2];
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
   SVC *const svc = &cpi->svc;
   int i;
@@ -364,6 +440,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
   }
   svc->spatial_layer_id = 0;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
   LAYER_CONTEXT *const lc =
@@ -375,21 +452,18 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
     ++cpi->svc.current_superframe;
 }
 
-int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
-  return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 &&
-         cpi->svc
-             .layer_context[cpi->svc.spatial_layer_id *
-                                cpi->svc.number_temporal_layers +
-                            cpi->svc.temporal_layer_id]
-             .is_key_frame;
-}
-
-static void get_layer_resolution(const int width_org, const int height_org,
-                                 const int num, const int den, int *width_out,
-                                 int *height_out) {
+void get_layer_resolution(const int width_org, const int height_org,
+                          const int num, const int den, int *width_out,
+                          int *height_out) {
   int w, h;
 
-  if (width_out == NULL || height_out == NULL || den == 0) return;
+  if (width_out == NULL || height_out == NULL) return;
+
+  if (den == 0 || num == 0) {
+    *width_out = width_org;
+    *height_out = height_org;
+    return;
+  }
 
   w = width_org * num / den;
   h = height_org * num / den;
@@ -402,6 +476,48 @@ static void get_layer_resolution(const int width_org, const int height_org,
   *height_out = h;
 }
 
+static void reset_fb_idx_unused(VP9_COMP *const cpi) {
+  // If a reference frame is not referenced or refreshed, then set the
+  // fb_idx for that reference to the first one used/referenced.
+  // This is to avoid setting fb_idx for a reference to a slot that is not
+  // used/needed (i.e., since that reference is not referenced or refreshed).
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME first_ref = 0;
+  int first_fb_idx = 0;
+  int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx };
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
+      first_ref = ref_frame;
+      first_fb_idx = fb_idx[ref_frame - 1];
+      break;
+    }
+  }
+  if (first_ref > 0) {
+    if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) &&
+        !cpi->ext_refresh_last_frame)
+      cpi->lst_fb_idx = first_fb_idx;
+    else if (first_ref != GOLDEN_FRAME &&
+             !(cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
+             !cpi->ext_refresh_golden_frame)
+      cpi->gld_fb_idx = first_fb_idx;
+    else if (first_ref != ALTREF_FRAME &&
+             !(cpi->ref_frame_flags & VP9_ALT_FLAG) &&
+             !cpi->ext_refresh_alt_ref_frame)
+      cpi->alt_fb_idx = first_fb_idx;
+  }
+}
+
+// Never refresh any reference frame buffers on top temporal layers in
+// simulcast mode, which has interlayer prediction disabled.
+static void non_reference_frame_simulcast(VP9_COMP *const cpi) {
+  if (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 &&
+      cpi->svc.temporal_layer_id > 0) {
+    cpi->ext_refresh_last_frame = 0;
+    cpi->ext_refresh_golden_frame = 0;
+    cpi->ext_refresh_alt_ref_frame = 0;
+  }
+}
+
 // The function sets proper ref_frame_flags, buffer indices, and buffer update
 // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering
 // scheme.
@@ -505,6 +621,10 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
     cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
     cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
   }
+
+  if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi);
+
+  reset_fb_idx_unused(cpi);
 }
 
 // The function sets proper ref_frame_flags, buffer indices, and buffer update
@@ -540,6 +660,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
     if (!spatial_id) {
       cpi->ref_frame_flags = VP9_LAST_FLAG;
     } else {
+      if (spatial_id == cpi->svc.number_spatial_layers - 1)
+        cpi->ext_refresh_alt_ref_frame = 0;
       cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
     }
   }
@@ -562,6 +684,10 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
     cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
     cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
   }
+
+  if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi);
+
+  reset_fb_idx_unused(cpi);
 }
 
 // The function sets proper ref_frame_flags, buffer indices, and buffer update
@@ -594,197 +720,286 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
   } else {
     cpi->gld_fb_idx = 0;
   }
+
+  if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi);
+
+  reset_fb_idx_unused(cpi);
 }
 
-int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
-  int width = 0, height = 0;
-  LAYER_CONTEXT *lc = NULL;
-  if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1;
-  cpi->svc.force_zero_mode_spatial_ref = 1;
+static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config(
+    VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode;
+  cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl];
+  cpi->ext_refresh_frame_flags_pending = 1;
+  cpi->lst_fb_idx = svc->lst_fb_idx[sl];
+  cpi->gld_fb_idx = svc->gld_fb_idx[sl];
+  cpi->alt_fb_idx = svc->alt_fb_idx[sl];
+  cpi->ext_refresh_last_frame = 0;
+  cpi->ext_refresh_golden_frame = 0;
+  cpi->ext_refresh_alt_ref_frame = 0;
+  cpi->ref_frame_flags = 0;
+  if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG;
+  if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+  if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG;
+}
 
-  if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
-    set_flags_and_fb_idx_for_temporal_mode3(cpi);
-  } else if (cpi->svc.temporal_layering_mode ==
-             VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
-    set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
-  } else if (cpi->svc.temporal_layering_mode ==
-             VP9E_TEMPORAL_LAYERING_MODE_0101) {
-    set_flags_and_fb_idx_for_temporal_mode2(cpi);
-  } else if (cpi->svc.temporal_layering_mode ==
-             VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
-    // In the BYPASS/flexible mode, the encoder is relying on the application
-    // to specify, for each spatial layer, the flags and buffer indices for the
-    // layering.
-    // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is
-    // needed to support the case where the frame flags may be passed in via
-    // vpx_codec_encode(), which can be used for the temporal-only svc case.
-    // TODO(marpan): Consider adding an enc_config parameter to better handle
-    // this case.
-    if (cpi->ext_refresh_frame_flags_pending == 0) {
-      int sl;
-      cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
-      sl = cpi->svc.spatial_layer_id;
-      vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]);
-      cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl];
-      cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl];
-      cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl];
+void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  int sl = svc->spatial_layer_id;
+  svc->lst_fb_idx[sl] = cpi->lst_fb_idx;
+  svc->gld_fb_idx[sl] = cpi->gld_fb_idx;
+  svc->alt_fb_idx[sl] = cpi->alt_fb_idx;
+  // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to the
+  // update_buffer_slot, this is needed for the GET_SVC_REF_FRAME_CONFIG api.
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+    int ref;
+    for (ref = 0; ref < REF_FRAMES; ++ref) {
+      svc->update_buffer_slot[sl] &= ~(1 << ref);
+      if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) ||
+          (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) ||
+          (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame))
+        svc->update_buffer_slot[sl] |= (1 << ref);
     }
   }
 
-  if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode)
-    cpi->svc.rc_drop_superframe = 0;
+  // TODO(jianj): Remove these 3, deprecated.
+  svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame;
+  svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame;
+  svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame;
 
-  lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
-                                   cpi->svc.number_temporal_layers +
-                               cpi->svc.temporal_layer_id];
+  svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG);
+  svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG);
+  svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG);
+}
+
+int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) {
+  int width = 0, height = 0;
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc = NULL;
+  int scaling_factor_num = 1;
+  int scaling_factor_den = 1;
+  svc->skip_enhancement_layer = 0;
+
+  if (svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF &&
+      svc->number_spatial_layers > 1 && svc->number_spatial_layers <= 3 &&
+      svc->number_temporal_layers <= 3)
+    svc->simulcast_mode = 1;
+  else
+    svc->simulcast_mode = 0;
+
+  if (svc->number_spatial_layers > 1) {
+    svc->use_base_mv = 1;
+    svc->use_partition_reuse = 1;
+  }
+  svc->force_zero_mode_spatial_ref = 1;
+
+  // For constrained_from_above drop mode: before encoding superframe (i.e.,
+  // at SL0 frame) check all spatial layers (starting from top) for possible
+  // drop, and if so, set a flag to force drop of that layer and all its lower
+  // layers.
+  if (svc->spatial_layer_to_encode == svc->first_spatial_layer_to_encode) {
+    int sl;
+    for (sl = 0; sl < svc->number_spatial_layers; sl++)
+      svc->force_drop_constrained_from_above[sl] = 0;
+    if (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP) {
+      for (sl = svc->number_spatial_layers - 1;
+           sl >= svc->first_spatial_layer_to_encode; sl--) {
+        int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id;
+        LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer];
+        cpi->rc = sl_lc->rc;
+        cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth;
+        if (vp9_test_drop(cpi)) {
+          int sl2;
+          // Set flag to force drop in encoding for this mode.
+          for (sl2 = sl; sl2 >= svc->first_spatial_layer_to_encode; sl2--)
+            svc->force_drop_constrained_from_above[sl2] = 1;
+          break;
+        }
+      }
+    }
+  }
+
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+    set_flags_and_fb_idx_for_temporal_mode3(cpi);
+  } else if (svc->temporal_layering_mode ==
+             VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+    set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+  } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) {
+    set_flags_and_fb_idx_for_temporal_mode2(cpi);
+  } else if (svc->temporal_layering_mode ==
+                 VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+             svc->use_set_ref_frame_config) {
+    set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi);
+  }
+
+  if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx ||
+      cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx ||
+      cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx)
+    svc->buffer_gf_temporal_ref[0].is_used = 1;
+  if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx ||
+      cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx ||
+      cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx)
+    svc->buffer_gf_temporal_ref[1].is_used = 1;
+
+  // For the fixed (non-flexible/bypass) SVC mode:
+  // If long term temporal reference is enabled at the sequence level
+  // (use_gf_temporal_ref == 1), and inter_layer is disabled (on inter-frames),
+  // we can use golden as a second temporal reference
+  // (since the spatial/inter-layer reference is disabled).
+  // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx) is
+  // unused (slot 7 and 6 should be available for 3-3 layer system).
+  // For now usage of this second temporal reference will only be used for
+  // highest and next to highest spatial layer (i.e., top and middle layer for
+  // 3 spatial layers).
+  svc->use_gf_temporal_ref_current_layer = 0;
+  if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used &&
+      !svc->buffer_gf_temporal_ref[1].is_used &&
+      svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON &&
+      svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 &&
+      svc->spatial_layer_id >= svc->number_spatial_layers - 2) {
+    // Enable the second (long-term) temporal reference at the frame-level.
+    svc->use_gf_temporal_ref_current_layer = 1;
+  }
+
+  // Check if current superframe has any layer sync, only check once on
+  // base layer.
+  if (svc->spatial_layer_id == 0) {
+    int sl = 0;
+    // Default is no sync.
+    svc->superframe_has_layer_sync = 0;
+    for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+      if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1;
+    }
+  }
+
+  // Reset the drop flags for all spatial layers, on the
+  // first_spatial_layer_to_encode.
+  if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) {
+    vp9_zero(svc->drop_spatial_layer);
+    // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx
+    // causes an issue with frame dropping and temporal layers, when the frame
+    // flags are passed via the encode call (bypass mode). Issue is that we're
+    // resetting ext_refresh_frame_flags_pending to 0 on frame drops.
+    if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx));
+      memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx));
+      memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx));
+      // These are set by API before the superframe is encoded and they are
+      // passed to encoder layer by layer. Don't reset them on layer 0 in bypass
+      // mode.
+      vp9_zero(svc->update_buffer_slot);
+      vp9_zero(svc->reference_last);
+      vp9_zero(svc->reference_golden);
+      vp9_zero(svc->reference_altref);
+      // TODO(jianj): Remove these 3, deprecated.
+      vp9_zero(svc->update_last);
+      vp9_zero(svc->update_golden);
+      vp9_zero(svc->update_altref);
+    }
+  }
+
+  lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+                           svc->temporal_layer_id];
 
   // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS,
   // only for non-BYPASS mode for now.
-  if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS ||
+      svc->use_set_ref_frame_config) {
     RATE_CONTROL *const lrc = &lc->rc;
     lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
     lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q);
+    if (cpi->fixed_qp_onepass) {
+      lrc->worst_quality = cpi->rc.worst_quality;
+      lrc->best_quality = cpi->rc.best_quality;
+    }
   }
 
-  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
-                       lc->scaling_factor_num, lc->scaling_factor_den, &width,
-                       &height);
+  if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
+      svc->spatial_layer_id == svc->first_spatial_layer_to_encode &&
+      cpi->resize_state != ORIG) {
+    scaling_factor_num = lc->scaling_factor_num_resize;
+    scaling_factor_den = lc->scaling_factor_den_resize;
+  } else {
+    scaling_factor_num = lc->scaling_factor_num;
+    scaling_factor_den = lc->scaling_factor_den;
+  }
 
-  // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use
-  // of base motion vectors if spatial scale factors for any layers are not 2.
+  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, scaling_factor_num,
+                       scaling_factor_den, &width, &height);
+
+  // Use Eightap_smooth for low resolutions.
+  if (width * height <= 320 * 240)
+    svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+  // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel
+  // to source pixel).
+  if (scaling_factor_num > (3 * scaling_factor_den) >> 2)
+    svc->downsample_filter_phase[svc->spatial_layer_id] = 0;
+
+  // The usage of use_base_mv or partition_reuse assumes down-scale of 2x2.
+  // For now, turn off use of base motion vectors and partition reuse if the
+  // spatial scale factors for any layers are not 2,
+  // keep the case of 3 spatial layers with scale factor of 4x4 for base layer.
   // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2.
-  if (cpi->svc.number_spatial_layers > 1) {
+  if (svc->number_spatial_layers > 1) {
     int sl;
-    for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) {
-      lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers +
-                                   cpi->svc.temporal_layer_id];
-      if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) {
-        cpi->svc.use_base_mv = 0;
+    for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) {
+      lc = &svc->layer_context[sl * svc->number_temporal_layers +
+                               svc->temporal_layer_id];
+      if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) &&
+          !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 &&
+            svc->number_spatial_layers == 3)) {
+        svc->use_base_mv = 0;
+        svc->use_partition_reuse = 0;
         break;
       }
     }
+    // For non-zero spatial layers: if the previous spatial layer was dropped
+    // disable the base_mv and partition_reuse features.
+    if (svc->spatial_layer_id > 0 &&
+        svc->drop_spatial_layer[svc->spatial_layer_id - 1]) {
+      svc->use_base_mv = 0;
+      svc->use_partition_reuse = 0;
+    }
+  }
+
+  svc->non_reference_frame = 0;
+  if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
+      !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame)
+    svc->non_reference_frame = 1;
+  // For flexible mode, where update_buffer_slot is used, need to check if
+  // all buffer slots are not refreshed.
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+    if (svc->update_buffer_slot[svc->spatial_layer_id] != 0)
+      svc->non_reference_frame = 0;
+  }
+
+  if (svc->spatial_layer_id == 0) {
+    svc->high_source_sad_superframe = 0;
+    svc->high_num_blocks_with_motion = 0;
+  }
+
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->last_layer_dropped[svc->spatial_layer_id] &&
+      svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 &&
+      !svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+    // For fixed/non-flexible mode, if the previous frame (same spatial layer
+    // from previous superframe) was dropped, make sure the lst_fb_idx
+    // for this frame corresponds to the buffer index updated on (last) encoded
+    // TL0 frame (with same spatial layer).
+    cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id];
   }
 
   if (vp9_set_size_literal(cpi, width, height) != 0)
     return VPX_CODEC_INVALID_PARAM;
 
+  svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride;
+  svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows;
+  svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols;
   return 0;
 }
 
-#if CONFIG_SPATIAL_SVC
-#define SMALL_FRAME_FB_IDX 7
-
-int vp9_svc_start_frame(VP9_COMP *const cpi) {
-  int width = 0, height = 0;
-  LAYER_CONTEXT *lc;
-  struct lookahead_entry *buf;
-  int count = 1 << (cpi->svc.number_temporal_layers - 1);
-
-  cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
-  lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
-
-  cpi->svc.temporal_layer_id = 0;
-  while ((lc->current_video_frame_in_layer % count) != 0) {
-    ++cpi->svc.temporal_layer_id;
-    count >>= 1;
-  }
-
-  cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
-
-  cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
-
-  if (cpi->svc.spatial_layer_id == 0)
-    cpi->gld_fb_idx =
-        (lc->gold_ref_idx >= 0) ? lc->gold_ref_idx : cpi->lst_fb_idx;
-  else
-    cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
-
-  if (lc->current_video_frame_in_layer == 0) {
-    if (cpi->svc.spatial_layer_id >= 2) {
-      cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
-    } else {
-      cpi->alt_fb_idx = cpi->lst_fb_idx;
-      cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
-    }
-  } else {
-    if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) {
-      cpi->alt_fb_idx = lc->alt_ref_idx;
-      if (!lc->has_alt_frame) cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
-    } else {
-      // Find a proper alt_fb_idx for layers that don't have alt ref frame
-      if (cpi->svc.spatial_layer_id == 0) {
-        cpi->alt_fb_idx = cpi->lst_fb_idx;
-      } else {
-        LAYER_CONTEXT *lc_lower =
-            &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1];
-
-        if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] &&
-            lc_lower->alt_ref_source != NULL)
-          cpi->alt_fb_idx = lc_lower->alt_ref_idx;
-        else if (cpi->svc.spatial_layer_id >= 2)
-          cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
-        else
-          cpi->alt_fb_idx = cpi->lst_fb_idx;
-      }
-    }
-  }
-
-  get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
-                       lc->scaling_factor_num, lc->scaling_factor_den, &width,
-                       &height);
-
-  // Workaround for multiple frame contexts. In some frames we can't use prev_mi
-  // since its previous frame could be changed during decoding time. The idea is
-  // we put a empty invisible frame in front of them, then we will not use
-  // prev_mi when encoding these frames.
-
-  buf = vp9_lookahead_peek(cpi->lookahead, 0);
-  if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
-      cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
-      lc->rc.frames_to_key != 0 &&
-      !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
-    if ((cpi->svc.number_temporal_layers > 1 &&
-         cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
-        (cpi->svc.number_spatial_layers > 1 &&
-         cpi->svc.spatial_layer_id == 0)) {
-      struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0);
-
-      if (buf != NULL) {
-        cpi->svc.empty_frame.ts_start = buf->ts_start;
-        cpi->svc.empty_frame.ts_end = buf->ts_end;
-        cpi->svc.encode_empty_frame_state = ENCODING;
-        cpi->common.show_frame = 0;
-        cpi->ref_frame_flags = 0;
-        cpi->common.frame_type = INTER_FRAME;
-        cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx =
-            SMALL_FRAME_FB_IDX;
-
-        if (cpi->svc.encode_intra_empty_frame != 0) cpi->common.intra_only = 1;
-
-        width = SMALL_FRAME_WIDTH;
-        height = SMALL_FRAME_HEIGHT;
-      }
-    }
-  }
-
-  cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q);
-  cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q);
-
-  vp9_change_config(cpi, &cpi->oxcf);
-
-  if (vp9_set_size_literal(cpi, width, height) != 0)
-    return VPX_CODEC_INVALID_PARAM;
-
-  vp9_set_high_precision_mv(cpi, 1);
-
-  cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
-
-  return 0;
-}
-
-#undef SMALL_FRAME_FB_IDX
-#endif  // CONFIG_SPATIAL_SVC
-
 struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
                                               struct lookahead_ctx *ctx,
                                               int drain) {
@@ -817,7 +1032,7 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) {
 }
 
 // Reset on key frame: reset counters, references and buffer updates.
-void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
+void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) {
   int sl, tl;
   SVC *const svc = &cpi->svc;
   LAYER_CONTEXT *lc = NULL;
@@ -825,7 +1040,7 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
     for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
       lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
       lc->current_video_frame_in_layer = 0;
-      lc->frames_from_key_frame = 0;
+      if (is_key) lc->frames_from_key_frame = 0;
     }
   }
   if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
@@ -839,3 +1054,330 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
   vp9_update_temporal_layer_framerate(cpi);
   vp9_restore_layer_context(cpi);
 }
+
+void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
+  SVC *svc = &cpi->svc;
+  int sl, tl;
+  for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+    const int spatial_layer_idx = LAYER_IDS_TO_IDX(
+        sl, svc->number_temporal_layers - 1, svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx];
+    RATE_CONTROL *lrc = &lc->rc;
+    if (lrc->avg_frame_bandwidth / 3 > (lrc->last_avg_frame_bandwidth >> 1) ||
+        lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
+      // Reset for all temporal layers with spatial layer sl.
+      for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+        int temporal_layer_idx =
+            LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+        lrc = &svc->layer_context[temporal_layer_idx].rc;
+        lrc->rc_1_frame = 0;
+        lrc->rc_2_frame = 0;
+        lrc->bits_off_target = lrc->optimal_buffer_level;
+        lrc->buffer_level = lrc->optimal_buffer_level;
+      }
+    }
+  }
+}
+
+void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  const int sl = svc->spatial_layer_id;
+  // Check for disabling inter-layer (spatial) prediction, if
+  // svc.disable_inter_layer_pred is set. If the previous spatial layer was
+  // dropped then disable the prediction from this (scaled) reference.
+  // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled
+  // on key frames or if any spatial layer is a sync layer.
+  if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY &&
+       !svc->layer_context[svc->temporal_layer_id].is_key_frame &&
+       !svc->superframe_has_layer_sync) ||
+      svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
+      svc->drop_spatial_layer[sl - 1]) {
+    MV_REFERENCE_FRAME ref_frame;
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+      if (yv12 != NULL &&
+          (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
+        const struct scale_factors *const scale_fac =
+            &cm->frame_refs[ref_frame - 1].sf;
+        if (vp9_is_scaled(scale_fac)) {
+          cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame));
+          // Point golden/altref frame buffer index to last.
+          if (!svc->simulcast_mode) {
+            if (ref_frame == GOLDEN_FRAME)
+              cpi->gld_fb_idx = cpi->lst_fb_idx;
+            else if (ref_frame == ALTREF_FRAME)
+              cpi->alt_fb_idx = cpi->lst_fb_idx;
+          }
+        }
+      }
+    }
+  }
+  // For fixed/non-flexible SVC: check for disabling inter-layer prediction.
+  // If the reference for inter-layer prediction (the reference that is scaled)
+  // is not the previous spatial layer from the same superframe, then we disable
+  // inter-layer prediction. Only need to check when inter_layer prediction is
+  // not set to OFF mode.
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) {
+    // We only use LAST and GOLDEN for prediction in real-time mode, so we
+    // check both here.
+    MV_REFERENCE_FRAME ref_frame;
+    for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) {
+      struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf;
+      if (vp9_is_scaled(scale_fac)) {
+        // If this reference  was updated on the previous spatial layer of the
+        // current superframe, then we keep this reference (don't disable).
+        // Otherwise we disable the inter-layer prediction.
+        // This condition is verified by checking if the current frame buffer
+        // index is equal to any of the slots for the previous spatial layer,
+        // and if so, check if that slot was updated/refreshed. If that is the
+        // case, then this reference is valid for inter-layer prediction under
+        // the mode INTER_LAYER_PRED_ON_CONSTRAINED.
+        int fb_idx =
+            ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx;
+        int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG;
+        int disable = 1;
+        if (fb_idx < 0) continue;
+        if ((fb_idx == svc->lst_fb_idx[sl - 1] &&
+             (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) ||
+            (fb_idx == svc->gld_fb_idx[sl - 1] &&
+             (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) ||
+            (fb_idx == svc->alt_fb_idx[sl - 1] &&
+             (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))))
+          disable = 0;
+        if (disable) cpi->ref_frame_flags &= (~ref_flag);
+      }
+    }
+  }
+}
+
+void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // For fixed/non-flexible mode, the following constraint are expected,
+  // when inter-layer prediction is on (default).
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON &&
+      svc->framedrop_mode != LAYER_DROP) {
+    if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+      // On non-key frames: LAST is always temporal reference, GOLDEN is
+      // spatial reference.
+      if (svc->temporal_layer_id == 0)
+        // Base temporal only predicts from base temporal.
+        assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0);
+      else
+        // Non-base temporal only predicts from lower temporal layer.
+        assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] <
+               svc->temporal_layer_id);
+      if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG &&
+          svc->spatial_layer_id > svc->first_spatial_layer_to_encode) {
+        // Non-base spatial only predicts from lower spatial layer with same
+        // temporal_id.
+        assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] ==
+               svc->spatial_layer_id - 1);
+        assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] ==
+               svc->temporal_layer_id);
+      }
+    } else if (svc->spatial_layer_id > 0 &&
+               svc->spatial_layer_id > svc->first_spatial_layer_to_encode) {
+      // Only 1 reference for frame whose base is key; reference may be LAST
+      // or GOLDEN, so we check both.
+      if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+        assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] ==
+               svc->spatial_layer_id - 1);
+        assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] ==
+               svc->temporal_layer_id);
+      } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+        assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] ==
+               svc->spatial_layer_id - 1);
+        assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] ==
+               svc->temporal_layer_id);
+      }
+    }
+  } else if (svc->use_gf_temporal_ref_current_layer &&
+             !svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+    // For the usage of golden as second long term reference: the
+    // temporal_layer_id of that reference must be base temporal layer 0, and
+    // spatial_layer_id of that reference must be same as current
+    // spatial_layer_id. If not, disable feature.
+    // TODO(marpan): Investigate when this can happen, and maybe put this check
+    // and reset in a different place.
+    if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] !=
+            svc->spatial_layer_id ||
+        svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0)
+      svc->use_gf_temporal_ref_current_layer = 0;
+  }
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+int vp9_denoise_svc_non_key(VP9_COMP *const cpi) {
+  int layer =
+      LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+                       cpi->svc.number_temporal_layers);
+  LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+  return denoise_svc(cpi) && !lc->is_key_frame;
+}
+#endif
+
+void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // Only for superframes whose base is not key, as those are
+  // already sync frames.
+  if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+    if (svc->spatial_layer_id == 0) {
+      // On base spatial layer: if the current superframe has a layer sync then
+      // reset the pattern counters and reset to base temporal layer.
+      if (svc->superframe_has_layer_sync)
+        vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME);
+    }
+    // If the layer sync is set for this current spatial layer then
+    // disable the temporal reference.
+    if (svc->spatial_layer_id > 0 &&
+        svc->spatial_layer_sync[svc->spatial_layer_id]) {
+      cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+      if (svc->use_gf_temporal_ref_current_layer) {
+        int index = svc->spatial_layer_id;
+        // If golden is used as second reference: need to remove it from
+        // prediction, reset refresh period to 0, and update the reference.
+        svc->use_gf_temporal_ref_current_layer = 0;
+        cpi->rc.baseline_gf_interval = 0;
+        cpi->rc.frames_till_gf_update_due = 0;
+        // On layer sync frame we must update the buffer index used for long
+        // term reference. Use the alt_ref since it is not used or updated on
+        // sync frames.
+        if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+        assert(index >= 0);
+        cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+        cpi->ext_refresh_alt_ref_frame = 1;
+      }
+    }
+  }
+}
+
+void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  int i = 0;
+  // Update the usage of frame buffer index for base spatial layers.
+  if (svc->spatial_layer_id == 0) {
+    if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame)
+      svc->fb_idx_base[cpi->lst_fb_idx] = 1;
+    if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame)
+      svc->fb_idx_base[cpi->gld_fb_idx] = 1;
+    if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame)
+      svc->fb_idx_base[cpi->alt_fb_idx] = 1;
+    // For bypass/flexible mode: check for refresh slots.
+    if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1;
+    }
+  }
+}
+
+static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) {
+  // For non-flexible/bypass SVC mode: check for refreshing other buffer
+  // slots.
+  SVC *const svc = &cpi->svc;
+  VP9_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  int i;
+  for (i = 0; i < REF_FRAMES; i++) {
+    if ((cm->frame_type == KEY_FRAME && !svc->simulcast_mode) ||
+        svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) {
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+      svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id;
+    }
+  }
+}
+
+void vp9_svc_update_ref_frame(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  BufferPool *const pool = cm->buffer_pool;
+
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->use_set_ref_frame_config) {
+    vp9_svc_update_ref_frame_bypass_mode(cpi);
+  } else if (cm->frame_type == KEY_FRAME && !svc->simulcast_mode) {
+    // Keep track of frame index for each reference frame.
+    int i;
+    // On key frame update all reference frame slots.
+    for (i = 0; i < REF_FRAMES; i++) {
+      svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id;
+      // LAST/GOLDEN/ALTREF is already updated above.
+      if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx)
+        ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+    }
+  } else {
+    if (cpi->refresh_last_frame) {
+      svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id;
+    }
+    if (cpi->refresh_golden_frame) {
+      svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id;
+    }
+    if (cpi->refresh_alt_ref_frame) {
+      svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id;
+    }
+  }
+  // Copy flags from encoder to SVC struct.
+  vp9_copy_flags_ref_update_idx(cpi);
+  vp9_svc_update_ref_frame_buffer_idx(cpi);
+}
+
+void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) {
+  int64_t this_duration =
+      cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id];
+  vp9_new_framerate(cpi, 10000000.0 / this_duration);
+}
+
+void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  RATE_CONTROL *const rc = &cpi->rc;
+  // On key frames in CBR mode: reset the avg_frame_qindex for base layer
+  // (to level closer to worst_quality) if the overshoot is significant.
+  // Reset it for all temporal layers on base spatial layer.
+  if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR &&
+      !svc->simulcast_mode &&
+      rc->projected_frame_size / 3 > rc->avg_frame_bandwidth) {
+    int tl;
+    rc->avg_frame_qindex[INTER_FRAME] =
+        VPXMAX(rc->avg_frame_qindex[INTER_FRAME],
+               (cm->base_qindex + rc->worst_quality) >> 1);
+    for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      RATE_CONTROL *lrc = &lc->rc;
+      lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME];
+    }
+  }
+}
+
+// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
+// No need to set svc.skip_enhancement_layer if whole superframe will be
+// dropped.
+int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) {
+  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+      cpi->oxcf.target_bandwidth == 0 &&
+      !(cpi->svc.framedrop_mode != LAYER_DROP &&
+        (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP ||
+         cpi->svc
+             .force_drop_constrained_from_above[cpi->svc.number_spatial_layers -
+                                                1]) &&
+        cpi->svc.drop_spatial_layer[0])) {
+    cpi->svc.skip_enhancement_layer = 1;
+    vp9_rc_postencode_update_drop_frame(cpi);
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->last_frame_dropped = 1;
+    cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1;
+    cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1;
+    vp9_inc_frame_in_layer(cpi);
+    return 1;
+  }
+  return 0;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h
index ee7a6638b4..388a02789d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
-#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
 
 #include "vpx/vpx_encoder.h"
 
@@ -19,6 +19,24 @@
 extern "C" {
 #endif
 
+typedef enum {
+  // Inter-layer prediction is on on all frames.
+  INTER_LAYER_PRED_ON,
+  // Inter-layer prediction is off on all frames.
+  INTER_LAYER_PRED_OFF,
+  // Inter-layer prediction is off on non-key frames and non-sync frames.
+  INTER_LAYER_PRED_OFF_NONKEY,
+  // Inter-layer prediction is on on all frames, but constrained such
+  // that any layer S (> 0) can only predict from previous spatial
+  // layer S-1, from the same superframe.
+  INTER_LAYER_PRED_ON_CONSTRAINED
+} INTER_LAYER_PRED;
+
+typedef struct BUFFER_LONGTERM_REF {
+  int idx;
+  int is_used;
+} BUFFER_LONGTERM_REF;
+
 typedef struct {
   RATE_CONTROL rc;
   int target_bandwidth;
@@ -29,6 +47,9 @@ typedef struct {
   int min_q;
   int scaling_factor_num;
   int scaling_factor_den;
+  // Scaling factors used for internal resize scaling for single layer SVC.
+  int scaling_factor_num_resize;
+  int scaling_factor_den_resize;
   TWO_PASS twopass;
   vpx_fixed_buf_t rc_twopass_stats_in;
   unsigned int current_video_frame_in_layer;
@@ -40,24 +61,29 @@ typedef struct {
   int gold_ref_idx;
   int has_alt_frame;
   size_t layer_size;
-  struct vpx_psnr_pkt psnr_pkt;
   // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame.
+  // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct.
   int sb_index;
   signed char *map;
   uint8_t *last_coded_q_map;
   uint8_t *consec_zero_mv;
+  int actual_num_seg1_blocks;
+  int actual_num_seg2_blocks;
+  int counter_encode_maxq_scene_change;
+  int qindex_delta[3];
   uint8_t speed;
+  int loopfilter_ctrl;
+  int frame_qp;
+  int MBs;
 } LAYER_CONTEXT;
 
-typedef struct {
+typedef struct SVC {
   int spatial_layer_id;
   int temporal_layer_id;
   int number_spatial_layers;
   int number_temporal_layers;
 
   int spatial_layer_to_encode;
-  int first_spatial_layer_to_encode;
-  int rc_drop_superframe;
 
   // Workaround for multiple frame contexts
   enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state;
@@ -81,13 +107,104 @@ typedef struct {
   // Frame flags and buffer indexes for each spatial layer, set by the
   // application (external settings).
   int ext_frame_flags[VPX_MAX_LAYERS];
-  int ext_lst_fb_idx[VPX_MAX_LAYERS];
-  int ext_gld_fb_idx[VPX_MAX_LAYERS];
-  int ext_alt_fb_idx[VPX_MAX_LAYERS];
-  int ref_frame_index[REF_FRAMES];
+  int lst_fb_idx[VPX_MAX_LAYERS];
+  int gld_fb_idx[VPX_MAX_LAYERS];
+  int alt_fb_idx[VPX_MAX_LAYERS];
   int force_zero_mode_spatial_ref;
+  // Sequence level flag to enable second (long term) temporal reference.
+  int use_gf_temporal_ref;
+  // Frame level flag to enable second (long term) temporal reference.
+  int use_gf_temporal_ref_current_layer;
+  // Allow second reference for at most 2 top highest resolution layers.
+  BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2];
   int current_superframe;
+  int non_reference_frame;
   int use_base_mv;
+  int use_partition_reuse;
+  // Used to control the downscaling filter for source scaling, for 1 pass CBR.
+  // downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+  // = 8 will center the target pixel and get a symmetric averaging filter.
+  // downsample_filter_type: 4 filters may be used: eighttap_regular,
+  // eighttap_smooth, eighttap_sharp, and bilinear.
+  INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS];
+  int downsample_filter_phase[VPX_SS_MAX_LAYERS];
+
+  BLOCK_SIZE *prev_partition_svc;
+  int mi_stride[VPX_MAX_LAYERS];
+  int mi_rows[VPX_MAX_LAYERS];
+  int mi_cols[VPX_MAX_LAYERS];
+
+  int first_layer_denoise;
+
+  int skip_enhancement_layer;
+
+  int lower_layer_qindex;
+
+  int last_layer_dropped[VPX_MAX_LAYERS];
+  int drop_spatial_layer[VPX_MAX_LAYERS];
+  int framedrop_thresh[VPX_MAX_LAYERS];
+  int drop_count[VPX_MAX_LAYERS];
+  int force_drop_constrained_from_above[VPX_MAX_LAYERS];
+  int max_consec_drop;
+  SVC_LAYER_DROP_MODE framedrop_mode;
+
+  INTER_LAYER_PRED disable_inter_layer_pred;
+
+  // Flag to indicate scene change and high num of motion blocks at current
+  // superframe, scene detection is currently checked for each superframe prior
+  // to encoding, on the full resolution source.
+  int high_source_sad_superframe;
+  int high_num_blocks_with_motion;
+
+  // Flags used to get SVC pattern info.
+  int update_buffer_slot[VPX_SS_MAX_LAYERS];
+  uint8_t reference_last[VPX_SS_MAX_LAYERS];
+  uint8_t reference_golden[VPX_SS_MAX_LAYERS];
+  uint8_t reference_altref[VPX_SS_MAX_LAYERS];
+  // TODO(jianj): Remove these last 3, deprecated.
+  uint8_t update_last[VPX_SS_MAX_LAYERS];
+  uint8_t update_golden[VPX_SS_MAX_LAYERS];
+  uint8_t update_altref[VPX_SS_MAX_LAYERS];
+
+  // Keep track of the frame buffer index updated/refreshed on the base
+  // temporal superframe.
+  int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS];
+
+  // Keep track of the spatial and temporal layer id of the frame that last
+  // updated the frame buffer index.
+  uint8_t fb_idx_spatial_layer_id[REF_FRAMES];
+  uint8_t fb_idx_temporal_layer_id[REF_FRAMES];
+
+  int spatial_layer_sync[VPX_SS_MAX_LAYERS];
+  // Quantizer for each spatial layer.
+  int base_qindex[VPX_SS_MAX_LAYERS];
+  uint8_t set_intra_only_frame;
+  uint8_t previous_frame_is_intra_only;
+  uint8_t superframe_has_layer_sync;
+
+  uint8_t fb_idx_base[REF_FRAMES];
+
+  int use_set_ref_frame_config;
+
+  int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS];
+
+  int first_spatial_layer_to_encode;
+
+  // Parameters for allowing framerate per spatial layer, and buffer
+  // update based on timestamps.
+  int64_t duration[VPX_SS_MAX_LAYERS];
+  int64_t timebase_fac;
+  int64_t time_stamp_superframe;
+  int64_t time_stamp_prev[VPX_SS_MAX_LAYERS];
+
+  int num_encoded_top_layer;
+
+  // Every spatial layer on a superframe whose base is key is key too.
+  int simulcast_mode;
+
+  // Flag to indicate SVC is dynamically switched to a single layer.
+  int single_layer_svc;
+  int resize_set;
 } SVC;
 
 struct VP9_COMP;
@@ -117,6 +234,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi);
 // Initialize second pass rc for spatial svc.
 void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
 
+void get_layer_resolution(const int width_org, const int height_org,
+                          const int num, const int den, int *width_out,
+                          int *height_out);
+
 // Increment number of video frames in layer
 void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
 
@@ -131,14 +252,39 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
 // Start a frame and initialize svc parameters
 int vp9_svc_start_frame(struct VP9_COMP *const cpi);
 
-int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi);
+#endif
+
+void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi);
+
+int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi);
 
 void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
 
-void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
+void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key);
 
+void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
+
+void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi);
+
+void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi);
+
+void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi);
+
+void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi);
+
+void vp9_svc_update_ref_frame_key_simulcast(struct VP9_COMP *const cpi);
+
+void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi);
+
+void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi);
+
+void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi);
+
+int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_
+#endif  // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c
index 344658483a..46f36c3eb9 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -8,13 +8,17 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <math.h>
 #include <limits.h>
 
 #include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -30,54 +34,372 @@
 #include "vpx_scale/vpx_scale.h"
 
 static int fixed_divide[512];
+static unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                       39322, 32768, 28087, 24576, 21846,
+                                       19661, 17874, 0,     15124 };
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_index_mult[14] = { 0U,          0U,          0U,
+                                         0U,          3221225472U, 2576980378U,
+                                         2147483648U, 1840700270U, 1610612736U,
+                                         1431655766U, 1288490189U, 1171354718U,
+                                         0U,          991146300U };
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static const MV kZeroMv = { 0, 0 };
+#define TF_INTERP_EXTEND 6
+
+// Prediction function using 12-tap interpolation filter.
+void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel12 *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  int x, y;
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = filter[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < MAX_FILTER_TAP; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel12 *filter, int x0_q4,
+                           int x_step_q4, int y0_q4, int y_step_q4, int w,
+                           int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  int x, y;
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = filter[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < MAX_FILTER_TAP; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+// Copied from vpx_convolve8_c(). Possible block sizes are 32x32, 16x16, 8x8.
+void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                      ptrdiff_t dst_stride, const InterpKernel12 *filter,
+                      int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                      int h) {
+  uint8_t temp[BW * (BH + MAX_FILTER_TAP - 1)];
+  const int temp_stride = BW;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP;
+
+  vpx_convolve12_horiz_c(src - src_stride * (MAX_FILTER_TAP / 2 - 1),
+                         src_stride, temp, temp_stride, filter, x0_q4,
+                         x_step_q4, y0_q4, y_step_q4, w, intermediate_height);
+  vpx_convolve12_vert_c(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1),
+                        temp_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                        y0_q4, y_step_q4, w, h);
+}
+
+static void vp9_build_inter_predictor_12(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+    const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
+    const InterpKernel12 *kernel, enum mv_precision precision, int x, int y) {
+  (void)ref;
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+                     is_q4 ? src_mv->col : src_mv->col * 2 };
+  MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+  const int subpel_x = mv.col & SUBPEL_MASK;
+  const int subpel_y = mv.row & SUBPEL_MASK;
+
+  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+  if (subpel_x == 0 && subpel_y == 0) {
+    vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, subpel_x,
+                      sf->x_step_q4, subpel_y, sf->y_step_q4, w, h);
+  } else if (subpel_x == 0 && subpel_y != 0) {
+    vpx_convolve12_vert(src, src_stride, dst, dst_stride, kernel, subpel_x,
+                        sf->x_step_q4, subpel_y, sf->y_step_q4, w, h);
+  } else if (subpel_x != 0 && subpel_y == 0) {
+    vpx_convolve12_horiz(src, src_stride, dst, dst_stride, kernel, subpel_x,
+                         sf->x_step_q4, subpel_y, sf->y_step_q4, w, h);
+  } else {
+    vpx_convolve12(src, src_stride, dst, dst_stride, kernel, subpel_x,
+                   sf->x_step_q4, subpel_y, sf->y_step_q4, w, h);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_convolve12_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel12 *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h, int bd) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  int x, y;
+  src -= MAX_FILTER_TAP / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = filter[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < MAX_FILTER_TAP; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_highbd_convolve12_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+                                  uint16_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel12 *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h, int bd) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  int x, y;
+  src -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = filter[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < MAX_FILTER_TAP; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void highbd_convolve12(const uint16_t *src, ptrdiff_t src_stride,
+                              uint16_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel12 *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h, int bd) {
+  uint16_t temp[BW * (BH + MAX_FILTER_TAP - 1)];
+  const int temp_stride = BW;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP;
+
+  vpx_highbd_convolve12_horiz_c(src - src_stride * (MAX_FILTER_TAP / 2 - 1),
+                                src_stride, temp, temp_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w,
+                                intermediate_height, bd);
+  vpx_highbd_convolve12_vert_c(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1),
+                               temp_stride, dst, dst_stride, filter, x0_q4,
+                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
+
+// Copied from vpx_highbd_convolve8_c()
+void vpx_highbd_convolve12_c(const uint16_t *src, ptrdiff_t src_stride,
+                             uint16_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel12 *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h, int bd) {
+  highbd_convolve12(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h, bd);
+}
+
+static void vp9_highbd_build_inter_predictor_12(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride,
+    const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
+    const InterpKernel12 *kernel, enum mv_precision precision, int x, int y,
+    int bd) {
+  (void)ref;
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+                     is_q4 ? src_mv->col : src_mv->col * 2 };
+  MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+  const int subpel_x = mv.col & SUBPEL_MASK;
+  const int subpel_y = mv.row & SUBPEL_MASK;
+
+  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+  if (subpel_x == 0 && subpel_y == 0) {
+    vpx_highbd_convolve_copy(src, src_stride, dst, dst_stride, NULL, subpel_x,
+                             sf->x_step_q4, subpel_y, sf->y_step_q4, w, h, bd);
+  } else if (subpel_x == 0 && subpel_y != 0) {
+    vpx_highbd_convolve12_vert(src, src_stride, dst, dst_stride, kernel,
+                               subpel_x, sf->x_step_q4, subpel_y, sf->y_step_q4,
+                               w, h, bd);
+  } else if (subpel_x != 0 && subpel_y == 0) {
+    vpx_highbd_convolve12_horiz(src, src_stride, dst, dst_stride, kernel,
+                                subpel_x, sf->x_step_q4, subpel_y,
+                                sf->y_step_q4, w, h, bd);
+  } else {
+    vpx_highbd_convolve12(src, src_stride, dst, dst_stride, kernel, subpel_x,
+                          sf->x_step_q4, subpel_y, sf->y_step_q4, w, h, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void temporal_filter_predictors_mb_c(
     MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
     int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
-    uint8_t *pred, struct scale_factors *scale, int x, int y) {
+    uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs,
+    int use_32x32) {
   const int which_mv = 0;
-  const MV mv = { mv_row, mv_col };
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const InterpKernel12 *const kernel = sub_pel_filters_12;
+  int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1);
 
   enum mv_precision mv_precision_uv;
   int uv_stride;
-  if (uv_block_width == 8) {
+  if (uv_block_width == (BW >> 1)) {
     uv_stride = (stride + 1) >> 1;
     mv_precision_uv = MV_PRECISION_Q4;
   } else {
     uv_stride = stride;
     mv_precision_uv = MV_PRECISION_Q3;
   }
+#if !CONFIG_VP9_HIGHBITDEPTH
+  (void)xd;
+#endif
 
+  if (use_32x32) {
+    const MV mv = { mv_row, mv_col };
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale,
-                                     16, 16, which_mv, kernel, MV_PRECISION_Q3,
-                                     x, y, xd->bd);
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_build_inter_predictor_12(CONVERT_TO_SHORTPTR(y_mb_ptr), stride,
+                                          CONVERT_TO_SHORTPTR(&pred[0]), BW,
+                                          &mv, scale, BW, BH, which_mv, kernel,
+                                          MV_PRECISION_Q3, x, y, xd->bd);
 
-    vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256],
-                                     uv_block_width, &mv, scale, uv_block_width,
-                                     uv_block_height, which_mv, kernel,
-                                     mv_precision_uv, x, y, xd->bd);
+      vp9_highbd_build_inter_predictor_12(
+          CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride,
+          CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale,
+          uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x,
+          y, xd->bd);
 
-    vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512],
-                                     uv_block_width, &mv, scale, uv_block_width,
-                                     uv_block_height, which_mv, kernel,
-                                     mv_precision_uv, x, y, xd->bd);
+      vp9_highbd_build_inter_predictor_12(
+          CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride,
+          CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv,
+          scale, uv_block_width, uv_block_height, which_mv, kernel,
+          mv_precision_uv, x, y, xd->bd);
+      return;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vp9_build_inter_predictor_12(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW,
+                                 BH, which_mv, kernel, MV_PRECISION_Q3, x, y);
+
+    vp9_build_inter_predictor_12(u_mb_ptr, uv_stride, &pred[BLK_PELS],
+                                 uv_block_width, &mv, scale, uv_block_width,
+                                 uv_block_height, which_mv, kernel,
+                                 mv_precision_uv, x, y);
+
+    vp9_build_inter_predictor_12(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)],
+                                 uv_block_width, &mv, scale, uv_block_width,
+                                 uv_block_height, which_mv, kernel,
+                                 mv_precision_uv, x, y);
     return;
   }
+
+  // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16
+  // predictors.
+  // Y predictor
+  for (i = 0; i < BH; i += ys) {
+    for (j = 0; j < BW; j += xs) {
+      const MV mv = blk_mvs[k];
+      const int y_offset = i * stride + j;
+      const int p_offset = i * BW + j;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_highbd_build_inter_predictor_12(
+            CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride,
+            CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys,
+            which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd);
+      } else {
+        vp9_build_inter_predictor_12(y_mb_ptr + y_offset, stride,
+                                     &pred[p_offset], BW, &mv, scale, xs, ys,
+                                     which_mv, kernel, MV_PRECISION_Q3, x, y);
+      }
+#else
+      vp9_build_inter_predictor_12(y_mb_ptr + y_offset, stride, &pred[p_offset],
+                                   BW, &mv, scale, xs, ys, which_mv, kernel,
+                                   MV_PRECISION_Q3, x, y);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  (void)xd;
-  vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
-                            which_mv, kernel, MV_PRECISION_Q3, x, y);
+      k++;
+    }
+  }
 
-  vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width,
-                            &mv, scale, uv_block_width, uv_block_height,
-                            which_mv, kernel, mv_precision_uv, x, y);
+  // U and V predictors
+  ys = (uv_block_height >> 1);
+  xs = (uv_block_width >> 1);
+  k = 0;
 
-  vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width,
-                            &mv, scale, uv_block_width, uv_block_height,
-                            which_mv, kernel, mv_precision_uv, x, y);
+  for (i = 0; i < uv_block_height; i += ys) {
+    for (j = 0; j < uv_block_width; j += xs) {
+      const MV mv = blk_mvs[k];
+      const int uv_offset = i * uv_stride + j;
+      const int p_offset = i * uv_block_width + j;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_highbd_build_inter_predictor_12(
+            CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride,
+            CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width,
+            &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y,
+            xd->bd);
+
+        vp9_highbd_build_inter_predictor_12(
+            CONVERT_TO_SHORTPTR(v_mb_ptr + uv_offset), uv_stride,
+            CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]),
+            uv_block_width, &mv, scale, xs, ys, which_mv, kernel,
+            mv_precision_uv, x, y, xd->bd);
+      } else {
+        vp9_build_inter_predictor_12(u_mb_ptr + uv_offset, uv_stride,
+                                     &pred[BLK_PELS + p_offset], uv_block_width,
+                                     &mv, scale, xs, ys, which_mv, kernel,
+                                     mv_precision_uv, x, y);
+
+        vp9_build_inter_predictor_12(v_mb_ptr + uv_offset, uv_stride,
+                                     &pred[(BLK_PELS << 1) + p_offset],
+                                     uv_block_width, &mv, scale, xs, ys,
+                                     which_mv, kernel, mv_precision_uv, x, y);
+      }
+#else
+      vp9_build_inter_predictor_12(u_mb_ptr + uv_offset, uv_stride,
+                                   &pred[BLK_PELS + p_offset], uv_block_width,
+                                   &mv, scale, xs, ys, which_mv, kernel,
+                                   mv_precision_uv, x, y);
+
+      vp9_build_inter_predictor_12(v_mb_ptr + uv_offset, uv_stride,
+                                   &pred[(BLK_PELS << 1) + p_offset],
+                                   uv_block_width, &mv, scale, xs, ys, which_mv,
+                                   kernel, mv_precision_uv, x, y);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      k++;
+    }
+  }
 }
 
 void vp9_temporal_filter_init(void) {
@@ -87,149 +409,387 @@ void vp9_temporal_filter_init(void) {
   for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i;
 }
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
-                                 uint8_t *frame2, unsigned int block_width,
-                                 unsigned int block_height, int strength,
-                                 int filter_weight, unsigned int *accumulator,
-                                 uint16_t *count) {
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
-  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
+                            int filter_weight) {
+  int mod;
 
-  for (i = 0, k = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++, k++) {
-      int pixel_value = *frame2;
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  mod =
+      ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int highbd_mod_index(int sum_dist, int index, int rounding,
+                                   int strength, int filter_weight) {
+  int mod;
+
+  assert(index >= 0 && index <= 13);
+  assert(highbd_index_mult[index] != 0);
+
+  mod = (int)((clamp(sum_dist, 0, INT32_MAX) * highbd_index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int get_filter_weight(unsigned int i, unsigned int j,
+                                    unsigned int block_height,
+                                    unsigned int block_width,
+                                    const int *const blk_fw, int use_32x32) {
+  // blk_fw[0] ~ blk_fw[3] are the same.
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  if (i < block_height / 2) {
+    if (j < block_width / 2) {
+      return blk_fw[0];
+    }
+
+    return blk_fw[1];
+  }
+
+  if (j < block_width / 2) {
+    return blk_fw[2];
+  }
+
+  return blk_fw[3];
+}
+
+void vp9_apply_temporal_filter_c(
+    const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred,
+    int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1,
+    int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred,
+    int uv_buf_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) {
+  unsigned int i, j, k, m;
+  int modifier;
+  const int rounding = (1 << strength) >> 1;
+  const unsigned int uv_block_width = block_width >> ss_x;
+  const unsigned int uv_block_height = block_height >> ss_y;
+  DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]);
+
+  int idx = 0, idy;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+
+  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+
+  // Calculate diff^2 for each pixel of the 16x16 block.
+  // TODO(yunqing): the following code needs to be optimized.
+  for (i = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++) {
+      const int16_t diff =
+          y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j];
+      y_diff_sse[idx++] = diff * diff;
+    }
+  }
+  idx = 0;
+  for (i = 0; i < uv_block_height; i++) {
+    for (j = 0; j < uv_block_width; j++) {
+      const int16_t diffu =
+          u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j];
+      const int16_t diffv =
+          v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j];
+      u_diff_sse[idx] = diffu * diffu;
+      v_diff_sse[idx] = diffv * diffv;
+      idx++;
+    }
+  }
+
+  for (i = 0, k = 0, m = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++) {
+      const int pixel_value = y_pred[i * y_buf_stride + j];
+      const int filter_weight =
+          get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
 
       // non-local mean approach
-      int diff_sse[9] = { 0 };
-      int idx, idy, index = 0;
+      int y_index = 0;
+
+      const int uv_r = i >> ss_y;
+      const int uv_c = j >> ss_x;
+      modifier = 0;
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = (int)i + idy;
-          int col = (int)j + idx;
+          const int row = (int)i + idy;
+          const int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height && col >= 0 &&
               col < (int)block_width) {
-            int diff = frame1[byte + idy * (int)stride + idx] -
-                       frame2[idy * (int)block_width + idx];
-            diff_sse[index] = diff * diff;
-            ++index;
+            modifier += y_diff_sse[row * (int)block_width + col];
+            ++y_index;
           }
         }
       }
 
-      assert(index > 0);
+      assert(y_index > 0);
 
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+      modifier += u_diff_sse[uv_r * uv_block_width + uv_c];
+      modifier += v_diff_sse[uv_r * uv_block_width + uv_c];
 
-      modifier *= 3;
-      modifier /= index;
+      y_index += 2;
 
-      ++frame2;
+      modifier =
+          mod_index(modifier, y_index, rounding, strength, filter_weight);
 
-      modifier += rounding;
-      modifier >>= strength;
+      y_count[k] += modifier;
+      y_accumulator[k] += modifier * pixel_value;
 
-      if (modifier > 16) modifier = 16;
+      ++k;
 
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
+      // Process chroma component
+      if (!(i & ss_y) && !(j & ss_x)) {
+        const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c];
+        const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c];
 
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
+        // non-local mean approach
+        int cr_index = 0;
+        int u_mod = 0, v_mod = 0;
+        int y_diff = 0;
 
-      byte++;
+        for (idy = -1; idy <= 1; ++idy) {
+          for (idx = -1; idx <= 1; ++idx) {
+            const int row = uv_r + idy;
+            const int col = uv_c + idx;
+
+            if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
+                col < (int)uv_block_width) {
+              u_mod += u_diff_sse[row * uv_block_width + col];
+              v_mod += v_diff_sse[row * uv_block_width + col];
+              ++cr_index;
+            }
+          }
+        }
+
+        assert(cr_index > 0);
+
+        for (idy = 0; idy < 1 + ss_y; ++idy) {
+          for (idx = 0; idx < 1 + ss_x; ++idx) {
+            const int row = (uv_r << ss_y) + idy;
+            const int col = (uv_c << ss_x) + idx;
+            y_diff += y_diff_sse[row * (int)block_width + col];
+            ++cr_index;
+          }
+        }
+
+        u_mod += y_diff;
+        v_mod += y_diff;
+
+        u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight);
+        v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight);
+
+        u_count[m] += u_mod;
+        u_accumulator[m] += u_mod * u_pixel_value;
+        v_count[m] += v_mod;
+        v_accumulator[m] += v_mod * v_pixel_value;
+
+        ++m;
+      }  // Complete YUV pixel
     }
-
-    byte += stride - block_width;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_temporal_filter_apply_c(
-    uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
-    unsigned int block_width, unsigned int block_height, int strength,
-    int filter_weight, unsigned int *accumulator, uint16_t *count) {
-  uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
-  uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
-  unsigned int i, j, k;
-  int modifier;
-  int byte = 0;
-  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+void vp9_highbd_apply_temporal_filter_c(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count) {
+  const int uv_block_width = block_width >> ss_x;
+  const int uv_block_height = block_height >> ss_y;
+  const int y_diff_stride = BW;
+  const int uv_diff_stride = BW;
 
-  for (i = 0, k = 0; i < block_height; i++) {
-    for (j = 0; j < block_width; j++, k++) {
-      int pixel_value = *frame2;
-      int diff_sse[9] = { 0 };
-      int idx, idy, index = 0;
+  DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]);
+  DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]);
 
-      for (idy = -1; idy <= 1; ++idy) {
-        for (idx = -1; idx <= 1; ++idx) {
-          int row = (int)i + idy;
-          int col = (int)j + idx;
+  const int rounding = (1 << strength) >> 1;
 
-          if (row >= 0 && row < (int)block_height && col >= 0 &&
-              col < (int)block_width) {
-            int diff = frame1[byte + idy * (int)stride + idx] -
-                       frame2[idy * (int)block_width + idx];
-            diff_sse[index] = diff * diff;
-            ++index;
+  // Loop variables
+  int row, col;
+  int uv_row, uv_col;
+  int row_step, col_step;
+
+  memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+  memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+  memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+
+  // Get the square diffs
+  for (row = 0; row < (int)block_height; row++) {
+    for (col = 0; col < (int)block_width; col++) {
+      const int diff =
+          y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
+      y_diff_sse[row * y_diff_stride + col] = diff * diff;
+    }
+  }
+
+  for (row = 0; row < uv_block_height; row++) {
+    for (col = 0; col < uv_block_width; col++) {
+      const int u_diff =
+          u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
+      const int v_diff =
+          v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
+      u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff;
+    }
+  }
+
+  // Apply the filter to luma
+  for (row = 0; row < (int)block_height; row++) {
+    for (col = 0; col < (int)block_width; col++) {
+      const int filter_weight = get_filter_weight(
+          row, col, block_height, block_width, blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (row_step = -1; row_step <= 1; row_step++) {
+        for (col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+              sub_col < (int)block_width) {
+            y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
           }
         }
       }
-      assert(index > 0);
 
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      uv_row = row >> ss_y;
+      uv_col = col >> ss_x;
+      y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col];
 
-      modifier *= 3;
-      modifier /= index;
+      y_num_used += 2;
 
-      ++frame2;
-      modifier += rounding;
-      modifier >>= strength;
+      // Set the modifier
+      y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength,
+                               filter_weight);
 
-      if (modifier > 16) modifier = 16;
-
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
-
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
-
-      byte++;
+      // Accumulate the result
+      y_count[row * block_width + col] += y_mod;
+      y_accum[row * block_width + col] += y_mod * y_pixel;
     }
+  }
 
-    byte += stride - block_width;
+  // Apply the filter to chroma
+  for (uv_row = 0; uv_row < uv_block_height; uv_row++) {
+    for (uv_col = 0; uv_col < uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = get_filter_weight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
+      for (row_step = -1; row_step <= 1; row_step++) {
+        for (col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
+
+      // Sum all the luma pixels associated with the current luma pixel
+      for (row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength,
+                               filter_weight);
+      v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength,
+                               filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * uv_block_width + uv_col] += u_mod;
+      u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * uv_block_width + uv_col] += v_mod;
+      v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel;
+    }
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
-                                                   uint8_t *arf_frame_buf,
-                                                   uint8_t *frame_ptr_buf,
-                                                   int stride) {
-  MACROBLOCK *const x = &cpi->td.mb;
+static uint32_t temporal_filter_find_matching_mb_c(
+    VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf,
+    uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs,
+    int *blk_bestsme, int *is_dc_diff_large) {
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS old_search_method = mv_sf->search_method;
+  const SEARCH_METHODS search_method = MESH;
+  const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method;
   int step_param;
   int sadpb = x->sadperbit16;
   uint32_t bestsme = UINT_MAX;
   uint32_t distortion;
   uint32_t sse;
   int cost_list[5];
+  const MvLimits tmp_mv_limits = x->mv_limits;
 
   MV best_ref_mv1 = { 0, 0 };
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
 
   // Save input state
   struct buf_2d src = x->plane[0].src;
   struct buf_2d pre = xd->plane[0].pre[0];
+  int i, j, k = 0;
 
   best_ref_mv1_full.col = best_ref_mv1.col >> 3;
   best_ref_mv1_full.row = best_ref_mv1.row >> 3;
@@ -239,22 +799,60 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   x->plane[0].src.stride = stride;
   xd->plane[0].pre[0].buf = frame_ptr_buf;
   xd->plane[0].pre[0].stride = stride;
+  *is_dc_diff_large = 0;
 
   step_param = mv_sf->reduce_first_step_size;
   step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
-  mv_sf->search_method = HEX;
-  vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
-                        sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
-                        ref_mv, 0, 0);
-  mv_sf->search_method = old_search_method;
+  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
 
-  // Ignore mv costing by sending NULL pointer instead of cost array
+  vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param,
+                        search_method, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, ref_mv, 0, 0);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost
+  // calculation. The start full mv and the search result are stored in
+  // ref_mv.
   bestsme = cpi->find_fractional_mv_step(
       x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv,
-      x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0,
-      mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL,
-      &distortion, &sse, NULL, 0, 0);
+      x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW,
+      BH, USE_8_TAPS_SHARP);
+  *is_dc_diff_large = 50 * bestsme < sse;
+
+  // DO motion search on 4 16x16 sub_blocks.
+  best_ref_mv1.row = ref_mv->row;
+  best_ref_mv1.col = ref_mv->col;
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  for (i = 0; i < BH; i += SUB_BH) {
+    for (j = 0; j < BW; j += SUB_BW) {
+      // Setup frame pointers
+      x->plane[0].src.buf = arf_frame_buf + i * stride + j;
+      x->plane[0].src.stride = stride;
+      xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j;
+      xd->plane[0].pre[0].stride = stride;
+
+      vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+      vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full,
+                            step_param, search_method_16, sadpb,
+                            cond_cost_list(cpi, cost_list), &best_ref_mv1,
+                            &blk_mvs[k], 0, 0);
+      /* restore UMV window */
+      x->mv_limits = tmp_mv_limits;
+
+      blk_bestsme[k] = cpi->find_fractional_mv_step(
+          x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv,
+          x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0,
+          mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL,
+          NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP);
+      k++;
+    }
+  }
 
   // Restore input state
   x->plane[0].src = src;
@@ -263,37 +861,40 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   return bestsme;
 }
 
-static void temporal_filter_iterate_c(VP9_COMP *cpi,
-                                      YV12_BUFFER_CONFIG **frames,
-                                      int frame_count, int alt_ref_index,
-                                      int strength,
-                                      struct scale_factors *scale) {
+void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
+                                       int mb_row, int mb_col_start,
+                                       int mb_col_end) {
+  ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data;
+  YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames;
+  int frame_count = arnr_filter_data->frame_count;
+  int alt_ref_index = arnr_filter_data->alt_ref_index;
+  int strength = arnr_filter_data->strength;
+  struct scale_factors *scale = &arnr_filter_data->sf;
   int byte;
   int frame;
-  int mb_col, mb_row;
-  unsigned int filter_weight;
-  int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
-  int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
-  int mb_y_offset = 0;
-  int mb_uv_offset = 0;
-  DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
-  DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
-  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+  int mb_col;
+  int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2;
+  int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2;
+  DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]);
+  DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]);
+  MACROBLOCKD *mbd = &td->mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+  YV12_BUFFER_CONFIG *dst = arnr_filter_data->dst;
   uint8_t *dst1, *dst2;
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
-  DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]);
   uint8_t *predictor;
 #else
-  DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]);
 #endif
-  const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
-  const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+  const int mb_uv_height = BH >> mbd->plane[1].subsampling_y;
+  const int mb_uv_width = BW >> mbd->plane[1].subsampling_x;
+  // Addition of the tile col level offsets
+  int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start;
+  int mb_uv_offset =
+      mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start;
 
-  // Save input state
-  uint8_t *input_buffer[MAX_MB_PLANE];
-  int i;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     predictor = CONVERT_TO_BYTEPTR(predictor16);
@@ -302,221 +903,234 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   }
 #endif
 
-  for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+  // Source frames are extended to 16 pixels. This is different than
+  //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
+  // A 6/8/12 tap filter is used for motion search and prediction. So the
+  // largest Y mv on a border would then be 16 - TF_INTERP_EXTEND. The UV
+  // blocks are half the size of the Y and therefore only extended by 8.
+  // The largest mv that a UV block can support is 8 - TF_INTERP_EXTEND.
+  // A UV mv is half of a Y mv. (16 - TF_INTERP_EXTEND) >> 1 is greater than
+  // 8 - TF_INTERP_EXTEND. To keep the mv in play for both Y and UV planes,
+  // the max that it can be on a border is therefore 16 - (2 * TF_INTERP_EXTEND
+  // + 1).
+  td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * TF_INTERP_EXTEND));
+  td->mb.mv_limits.row_max =
+      ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * TF_INTERP_EXTEND);
 
-  for (mb_row = 0; mb_row < mb_rows; mb_row++) {
-    // Source frames are extended to 16 pixels. This is different than
-    //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
-    // A 6/8 tap filter is used for motion search.  This requires 2 pixels
-    //  before and 3 pixels after.  So the largest Y mv on a border would
-    //  then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
-    //  Y and therefore only extended by 8.  The largest mv that a UV block
-    //  can support is 8 - VP9_INTERP_EXTEND.  A UV mv is half of a Y mv.
-    //  (16 - VP9_INTERP_EXTEND) >> 1 which is greater than
-    //  8 - VP9_INTERP_EXTEND.
-    // To keep the mv in play for both Y and UV planes the max that it
-    //  can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
-    cpi->td.mb.mv_limits.row_min =
-        -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-    cpi->td.mb.mv_limits.row_max =
-        ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND);
+  for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) {
+    int i, j, k;
+    int stride;
+    MV ref_mv;
 
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int i, j, k;
-      int stride;
+    vp9_zero_array(accumulator, BLK_PELS * 3);
+    vp9_zero_array(count, BLK_PELS * 3);
 
-      memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
-      memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+    td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * TF_INTERP_EXTEND));
+    td->mb.mv_limits.col_max =
+        ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * TF_INTERP_EXTEND);
 
-      cpi->td.mb.mv_limits.col_min =
-          -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-      cpi->td.mb.mv_limits.col_max =
-          ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND);
+    if (cpi->oxcf.content == VP9E_CONTENT_FILM) {
+      unsigned int src_variance;
+      struct buf_2d src;
 
-      for (frame = 0; frame < frame_count; frame++) {
-        const uint32_t thresh_low = 10000;
-        const uint32_t thresh_high = 20000;
-
-        if (frames[frame] == NULL) continue;
-
-        mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
-        mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
-
-        if (frame == alt_ref_index) {
-          filter_weight = 2;
-        } else {
-          // Find best match in this frame by MC
-          uint32_t err = temporal_filter_find_matching_mb_c(
-              cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
-              frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride);
-
-          // Assign higher weight to matching MB if its error
-          // score is lower. If not applying MC default behavior
-          // is to weight all MBs equal.
-          filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
-        }
-
-        if (filter_weight != 0) {
-          // Construct the predictors
-          temporal_filter_predictors_mb_c(
-              mbd, frames[frame]->y_buffer + mb_y_offset,
-              frames[frame]->u_buffer + mb_uv_offset,
-              frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
-              mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
-              mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale,
-              mb_col * 16, mb_row * 16);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-          if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            int adj_strength = strength + 2 * (mbd->bd - 8);
-            // Apply the filter (YUV)
-            vp9_highbd_temporal_filter_apply_c(
-                f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
-                adj_strength, filter_weight, accumulator, count);
-            vp9_highbd_temporal_filter_apply_c(
-                f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
-                mb_uv_width, mb_uv_height, adj_strength, filter_weight,
-                accumulator + 256, count + 256);
-            vp9_highbd_temporal_filter_apply_c(
-                f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
-                mb_uv_width, mb_uv_height, adj_strength, filter_weight,
-                accumulator + 512, count + 512);
-          } else {
-            // Apply the filter (YUV)
-            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
-                                        predictor, 16, 16, strength,
-                                        filter_weight, accumulator, count);
-            vp9_temporal_filter_apply_c(
-                f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
-                mb_uv_width, mb_uv_height, strength, filter_weight,
-                accumulator + 256, count + 256);
-            vp9_temporal_filter_apply_c(
-                f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
-                mb_uv_width, mb_uv_height, strength, filter_weight,
-                accumulator + 512, count + 512);
-          }
-#else
-          // Apply the filter (YUV)
-          // TODO(jingning): Need SIMD optimization for this.
-          vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
-                                      predictor, 16, 16, strength,
-                                      filter_weight, accumulator, count);
-          vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 256, mb_uv_width,
-                                      mb_uv_height, strength, filter_weight,
-                                      accumulator + 256, count + 256);
-          vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 512, mb_uv_width,
-                                      mb_uv_height, strength, filter_weight,
-                                      accumulator + 512, count + 512);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        }
-      }
+      src.buf = f->y_buffer + mb_y_offset;
+      src.stride = f->y_stride;
 
 #if CONFIG_VP9_HIGHBITDEPTH
       if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        uint16_t *dst1_16;
-        uint16_t *dst2_16;
-        // Normalize filter output to produce AltRef frame
-        dst1 = cpi->alt_ref_buffer.y_buffer;
-        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-        stride = cpi->alt_ref_buffer.y_stride;
-        byte = mb_y_offset;
-        for (i = 0, k = 0; i < 16; i++) {
-          for (j = 0; j < 16; j++, k++) {
-            unsigned int pval = accumulator[k] + (count[k] >> 1);
-            pval *= fixed_divide[count[k]];
-            pval >>= 19;
-
-            dst1_16[byte] = (uint16_t)pval;
-
-            // move to next pixel
-            byte++;
-          }
-
-          byte += stride - 16;
-        }
-
-        dst1 = cpi->alt_ref_buffer.u_buffer;
-        dst2 = cpi->alt_ref_buffer.v_buffer;
-        dst1_16 = CONVERT_TO_SHORTPTR(dst1);
-        dst2_16 = CONVERT_TO_SHORTPTR(dst2);
-        stride = cpi->alt_ref_buffer.uv_stride;
-        byte = mb_uv_offset;
-        for (i = 0, k = 256; i < mb_uv_height; i++) {
-          for (j = 0; j < mb_uv_width; j++, k++) {
-            int m = k + 256;
-
-            // U
-            unsigned int pval = accumulator[k] + (count[k] >> 1);
-            pval *= fixed_divide[count[k]];
-            pval >>= 19;
-            dst1_16[byte] = (uint16_t)pval;
-
-            // V
-            pval = accumulator[m] + (count[m] >> 1);
-            pval *= fixed_divide[count[m]];
-            pval >>= 19;
-            dst2_16[byte] = (uint16_t)pval;
-
-            // move to next pixel
-            byte++;
-          }
-
-          byte += stride - mb_uv_width;
-        }
+        src_variance =
+            vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd);
       } else {
-        // Normalize filter output to produce AltRef frame
-        dst1 = cpi->alt_ref_buffer.y_buffer;
-        stride = cpi->alt_ref_buffer.y_stride;
-        byte = mb_y_offset;
-        for (i = 0, k = 0; i < 16; i++) {
-          for (j = 0; j < 16; j++, k++) {
-            unsigned int pval = accumulator[k] + (count[k] >> 1);
-            pval *= fixed_divide[count[k]];
-            pval >>= 19;
-
-            dst1[byte] = (uint8_t)pval;
-
-            // move to next pixel
-            byte++;
-          }
-          byte += stride - 16;
-        }
-
-        dst1 = cpi->alt_ref_buffer.u_buffer;
-        dst2 = cpi->alt_ref_buffer.v_buffer;
-        stride = cpi->alt_ref_buffer.uv_stride;
-        byte = mb_uv_offset;
-        for (i = 0, k = 256; i < mb_uv_height; i++) {
-          for (j = 0; j < mb_uv_width; j++, k++) {
-            int m = k + 256;
-
-            // U
-            unsigned int pval = accumulator[k] + (count[k] >> 1);
-            pval *= fixed_divide[count[k]];
-            pval >>= 19;
-            dst1[byte] = (uint8_t)pval;
-
-            // V
-            pval = accumulator[m] + (count[m] >> 1);
-            pval *= fixed_divide[count[m]];
-            pval >>= 19;
-            dst2[byte] = (uint8_t)pval;
-
-            // move to next pixel
-            byte++;
-          }
-          byte += stride - mb_uv_width;
-        }
+        src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK);
       }
 #else
+      src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      if (src_variance <= 2) {
+        strength = VPXMAX(0, arnr_filter_data->strength - 2);
+      }
+    }
+
+    for (frame = 0; frame < frame_count; frame++) {
+      // MVs for 4 16x16 sub blocks.
+      MV blk_mvs[4];
+      // Filter weights for 4 16x16 sub blocks.
+      int blk_fw[4] = { 0, 0, 0, 0 };
+      int use_32x32 = 0;
+
+      if (frames[frame] == NULL) continue;
+
+      ref_mv.row = 0;
+      ref_mv.col = 0;
+      blk_mvs[0] = kZeroMv;
+      blk_mvs[1] = kZeroMv;
+      blk_mvs[2] = kZeroMv;
+      blk_mvs[3] = kZeroMv;
+
+      if (frame == alt_ref_index) {
+        blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2;
+        use_32x32 = 1;
+      } else {
+        const int thresh_low = 10000;
+        const int thresh_high = 20000;
+        int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+        int is_dc_diff_large = 0;
+
+        // Find best match in this frame by MC
+        int err = temporal_filter_find_matching_mb_c(
+            cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset,
+            frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+            &ref_mv, blk_mvs, blk_bestsme, &is_dc_diff_large);
+
+        if (cpi->oxcf.enable_keyframe_filtering == 1 &&
+            cpi->common.frame_type == KEY_FRAME && is_dc_diff_large)
+          strength = VPXMIN(strength, 1);
+
+        int err16 =
+            blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3];
+        int max_err = INT_MIN, min_err = INT_MAX;
+        for (k = 0; k < 4; k++) {
+          if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k];
+          if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k];
+        }
+
+        if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) ||
+            ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) {
+          use_32x32 = 1;
+          // Assign higher weight to matching MB if it's error
+          // score is lower. If not applying MC default behavior
+          // is to weight all MBs equal.
+          blk_fw[0] = err < (thresh_low << THR_SHIFT)    ? 2
+                      : err < (thresh_high << THR_SHIFT) ? 1
+                                                         : 0;
+          blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
+        } else {
+          use_32x32 = 0;
+          for (k = 0; k < 4; k++)
+            blk_fw[k] = blk_bestsme[k] < thresh_low    ? 2
+                        : blk_bestsme[k] < thresh_high ? 1
+                                                       : 0;
+        }
+
+        for (k = 0; k < 4; k++) {
+          switch (abs(frame - alt_ref_index)) {
+            case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break;
+            case 2:
+            case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break;
+            default: break;
+          }
+        }
+      }
+
+      if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) {
+        // Construct the predictors
+        temporal_filter_predictors_mb_c(
+            mbd, frames[frame]->y_buffer + mb_y_offset,
+            frames[frame]->u_buffer + mb_uv_offset,
+            frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
+            mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale,
+            mb_col * BW, mb_row * BH, blk_mvs, use_32x32);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          int adj_strength = strength + 2 * (mbd->bd - 8);
+          // Apply the filter (YUV)
+          vp9_highbd_apply_temporal_filter(
+              CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride,
+              CONVERT_TO_SHORTPTR(predictor), BW,
+              CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset),
+              CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride,
+              CONVERT_TO_SHORTPTR(predictor + BLK_PELS),
+              CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW,
+              BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+              adj_strength, blk_fw, use_32x32, accumulator, count,
+              accumulator + BLK_PELS, count + BLK_PELS,
+              accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+        } else {
+          // Apply the filter (YUV)
+          vp9_apply_temporal_filter(
+              f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
+              f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
+              f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+              mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+              mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
+              accumulator, count, accumulator + BLK_PELS, count + BLK_PELS,
+              accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+        }
+#else
+        // Apply the filter (YUV)
+        vp9_apply_temporal_filter(
+            f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
+            f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
+            f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+            mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+            mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
+            accumulator, count, accumulator + BLK_PELS, count + BLK_PELS,
+            accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *dst1_16;
+      uint16_t *dst2_16;
       // Normalize filter output to produce AltRef frame
-      dst1 = cpi->alt_ref_buffer.y_buffer;
-      stride = cpi->alt_ref_buffer.y_stride;
+      dst1 = dst->y_buffer;
+      dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+      stride = dst->y_stride;
       byte = mb_y_offset;
-      for (i = 0, k = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++, k++) {
+      for (i = 0, k = 0; i < BH; i++) {
+        for (j = 0; j < BW; j++, k++) {
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= fixed_divide[count[k]];
+          pval >>= 19;
+
+          dst1_16[byte] = (uint16_t)pval;
+
+          // move to next pixel
+          byte++;
+        }
+
+        byte += stride - BW;
+      }
+
+      dst1 = dst->u_buffer;
+      dst2 = dst->v_buffer;
+      dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+      dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+      stride = dst->uv_stride;
+      byte = mb_uv_offset;
+      for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
+        for (j = 0; j < mb_uv_width; j++, k++) {
+          int m = k + BLK_PELS;
+
+          // U
+          unsigned int pval = accumulator[k] + (count[k] >> 1);
+          pval *= fixed_divide[count[k]];
+          pval >>= 19;
+          dst1_16[byte] = (uint16_t)pval;
+
+          // V
+          pval = accumulator[m] + (count[m] >> 1);
+          pval *= fixed_divide[count[m]];
+          pval >>= 19;
+          dst2_16[byte] = (uint16_t)pval;
+
+          // move to next pixel
+          byte++;
+        }
+
+        byte += stride - mb_uv_width;
+      }
+    } else {
+      // Normalize filter output to produce AltRef frame
+      dst1 = dst->y_buffer;
+      stride = dst->y_stride;
+      byte = mb_y_offset;
+      for (i = 0, k = 0; i < BH; i++) {
+        for (j = 0; j < BW; j++, k++) {
           unsigned int pval = accumulator[k] + (count[k] >> 1);
           pval *= fixed_divide[count[k]];
           pval >>= 19;
@@ -526,16 +1140,16 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           // move to next pixel
           byte++;
         }
-        byte += stride - 16;
+        byte += stride - BW;
       }
 
-      dst1 = cpi->alt_ref_buffer.u_buffer;
-      dst2 = cpi->alt_ref_buffer.v_buffer;
-      stride = cpi->alt_ref_buffer.uv_stride;
+      dst1 = dst->u_buffer;
+      dst2 = dst->v_buffer;
+      stride = dst->uv_stride;
       byte = mb_uv_offset;
-      for (i = 0, k = 256; i < mb_uv_height; i++) {
+      for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
         for (j = 0; j < mb_uv_width; j++, k++) {
-          int m = k + 256;
+          int m = k + BLK_PELS;
 
           // U
           unsigned int pval = accumulator[k] + (count[k] >> 1);
@@ -554,50 +1168,110 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
         }
         byte += stride - mb_uv_width;
       }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      mb_y_offset += 16;
-      mb_uv_offset += mb_uv_width;
     }
-    mb_y_offset += 16 * (f->y_stride - mb_cols);
-    mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
-  }
+#else
+    // Normalize filter output to produce AltRef frame
+    dst1 = dst->y_buffer;
+    stride = dst->y_stride;
+    byte = mb_y_offset;
+    for (i = 0, k = 0; i < BH; i++) {
+      for (j = 0; j < BW; j++, k++) {
+        unsigned int pval = accumulator[k] + (count[k] >> 1);
+        pval *= fixed_divide[count[k]];
+        pval >>= 19;
 
-  // Restore input state
-  for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+        dst1[byte] = (uint8_t)pval;
+
+        // move to next pixel
+        byte++;
+      }
+      byte += stride - BW;
+    }
+
+    dst1 = dst->u_buffer;
+    dst2 = dst->v_buffer;
+    stride = dst->uv_stride;
+    byte = mb_uv_offset;
+    for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
+      for (j = 0; j < mb_uv_width; j++, k++) {
+        int m = k + BLK_PELS;
+
+        // U
+        unsigned int pval = accumulator[k] + (count[k] >> 1);
+        pval *= fixed_divide[count[k]];
+        pval >>= 19;
+        dst1[byte] = (uint8_t)pval;
+
+        // V
+        pval = accumulator[m] + (count[m] >> 1);
+        pval *= fixed_divide[count[m]];
+        pval >>= 19;
+        dst2[byte] = (uint8_t)pval;
+
+        // move to next pixel
+        byte++;
+      }
+      byte += stride - mb_uv_width;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    mb_y_offset += BW;
+    mb_uv_offset += mb_uv_width;
+  }
+}
+
+static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row,
+                                           int tile_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  TileInfo *tile_info =
+      &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+  const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT;
+  const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT;
+  const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT;
+  const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT;
+  int mb_row;
+
+  for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) {
+    vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start,
+                                      mb_col_end);
+  }
+}
+
+static void temporal_filter_iterate_c(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  int tile_row, tile_col;
+  vp9_init_tile_data(cpi);
+
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      temporal_filter_iterate_tile_c(cpi, tile_row, tile_col);
+    }
+  }
 }
 
 // Apply buffer limits and context specific adjustments to arnr filter.
 static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost,
-                               int *arnr_frames, int *arnr_strength) {
+                               int *arnr_frames, int *frames_backward,
+                               int *frames_forward, int *arnr_strength) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int frames_after_arf =
-      vp9_lookahead_depth(cpi->lookahead) - distance - 1;
-  int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
-  int frames_bwd;
-  int q, frames, base_strength, strength;
+
+  int max_fwd =
+      VPXMAX((int)vp9_lookahead_depth(cpi->lookahead) - distance - 1, 0);
+  int max_bwd = VPXMAX(distance, 0);
+  int frames = VPXMAX(oxcf->arnr_max_frames, 1);
+  int q, base_strength, strength;
 
   // Context dependent two pass adjustment to strength.
   if (oxcf->pass == 2) {
     base_strength = oxcf->arnr_strength + cpi->twopass.arnr_strength_adjustment;
     // Clip to allowed range.
-    base_strength = VPXMIN(6, VPXMAX(0, base_strength));
+    base_strength = clamp(base_strength, 0, 6);
   } else {
     base_strength = oxcf->arnr_strength;
   }
 
-  // Define the forward and backwards filter limits for this arnr group.
-  if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
-  if (frames_fwd > distance) frames_fwd = distance;
-
-  frames_bwd = frames_fwd;
-
-  // For even length filter there is one more frame backward
-  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-  if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
-
-  // Set the baseline active filter size.
-  frames = frames_bwd + 1 + frames_fwd;
-
   // Adjust the strength based on active max q.
   if (cpi->common.current_video_frame > 1)
     q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
@@ -613,23 +1287,44 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost,
   }
 
   // Adjust number of frames in filter and strength based on gf boost level.
-  if (frames > group_boost / 150) {
-    frames = group_boost / 150;
-    frames += !(frames & 1);
-  }
+  frames = VPXMIN(frames, group_boost / 150);
 
   if (strength > group_boost / 300) {
     strength = group_boost / 300;
   }
 
-  // Adjustments for second level arf in multi arf case.
-  if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
-    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-    if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
-      strength >>= 1;
+  if (VPXMIN(max_fwd, max_bwd) >= frames / 2) {
+    // Handle the even/odd case.
+    *frames_backward = frames / 2;
+    *frames_forward = (frames - 1) / 2;
+  } else {
+    if (max_fwd < frames / 2) {
+      *frames_forward = max_fwd;
+      *frames_backward = VPXMIN(frames - 1 - *frames_forward, max_bwd);
+    } else {
+      *frames_backward = max_bwd;
+      *frames_forward = VPXMIN(frames - 1 - *frames_backward, max_fwd);
     }
   }
 
+  // Set the baseline active filter size.
+  frames = *frames_backward + 1 + *frames_forward;
+
+  // Adjustments for second level arf in multi arf case.
+  // Leave commented out place holder for possible filtering adjustment with
+  // new multi-layer arf code.
+  // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed)
+  //   if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1;
+
+  // TODO(jingning): Skip temporal filtering for intermediate frames that will
+  // be used as show_existing_frame. Need to further explore the possibility to
+  // apply certain filter.
+  if (frames <= 1) {
+    frames = 1;
+    *frames_backward = 0;
+    *frames_forward = 0;
+  }
+
   *arnr_frames = frames;
   *arnr_strength = strength;
 }
@@ -638,21 +1333,28 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data;
   int frame;
   int frames_to_blur;
   int start_frame;
   int strength;
   int frames_to_blur_backward;
   int frames_to_blur_forward;
-  struct scale_factors sf;
-  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+  struct scale_factors *sf = &arnr_filter_data->sf;
+  YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames;
+  int rdmult;
 
   // Apply context specific adjustments to the arnr filter parameters.
-  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
-  frames_to_blur_backward = (frames_to_blur / 2);
-  frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur,
+                     &frames_to_blur_backward, &frames_to_blur_forward,
+                     &strength);
   start_frame = distance + frames_to_blur_forward;
 
+  arnr_filter_data->strength = strength;
+  arnr_filter_data->frame_count = frames_to_blur;
+  arnr_filter_data->alt_ref_index = frames_to_blur_backward;
+  arnr_filter_data->dst = &cpi->tf_buffer;
+
   // Setup frame pointers, NULL indicates frame not included in filter.
   for (frame = 0; frame < frames_to_blur; ++frame) {
     const int which_buffer = start_frame - frame;
@@ -661,6 +1363,11 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
     frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
+  YV12_BUFFER_CONFIG *f = frames[arnr_filter_data->alt_ref_index];
+  xd->cur_buf = f;
+  xd->plane[1].subsampling_y = f->subsampling_y;
+  xd->plane[1].subsampling_x = f->subsampling_x;
+
   if (frames_to_blur > 0) {
     // Setup scaling factors. Scaling on each of the arnr frames is not
     // supported.
@@ -670,13 +1377,13 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
       int frame_used = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
       vp9_setup_scale_factors_for_frame(
-          &sf, get_frame_new_buffer(cm)->y_crop_width,
+          sf, get_frame_new_buffer(cm)->y_crop_width,
           get_frame_new_buffer(cm)->y_crop_height,
           get_frame_new_buffer(cm)->y_crop_width,
           get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth);
 #else
       vp9_setup_scale_factors_for_frame(
-          &sf, get_frame_new_buffer(cm)->y_crop_width,
+          sf, get_frame_new_buffer(cm)->y_crop_width,
           get_frame_new_buffer(cm)->y_crop_height,
           get_frame_new_buffer(cm)->y_crop_width,
           get_frame_new_buffer(cm)->y_crop_height);
@@ -697,7 +1404,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
                                "Failed to reallocate alt_ref_buffer");
           }
           frames[frame] = vp9_scale_if_required(
-              cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0);
+              cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0,
+              EIGHTTAP, 0);
           ++frame_used;
         }
       }
@@ -708,17 +1416,24 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
 // ARF is produced at the native frame size and resized when coded.
 #if CONFIG_VP9_HIGHBITDEPTH
       vp9_setup_scale_factors_for_frame(
-          &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+          sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
           frames[0]->y_crop_width, frames[0]->y_crop_height,
           cm->use_highbitdepth);
 #else
       vp9_setup_scale_factors_for_frame(
-          &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+          sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
           frames[0]->y_crop_width, frames[0]->y_crop_height);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     }
   }
 
-  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
-                            frames_to_blur_backward, strength, &sf);
+  // Initialize errorperbit and sabperbit.
+  rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX);
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX);
+
+  if (!cpi->row_mt)
+    temporal_filter_iterate_c(cpi);
+  else
+    vp9_temporal_filter_row_mt(cpi);
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h
index f537b8870a..59fb71fc0d 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h
@@ -8,18 +8,64 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
-#define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define ARNR_FILT_QINDEX 128
+struct VP9_COMP;
+struct ThreadData;
+
+// Block size used in temporal filtering
+#define TF_BLOCK BLOCK_32X32
+#define BH 32
+#define BH_LOG2 5
+#define BW 32
+#define BW_LOG2 5
+#define BLK_PELS ((BH) * (BW))  // Pixels in the block
+#define TF_SHIFT 2
+#define TF_ROUND 3
+#define THR_SHIFT 2
+#define TF_SUB_BLOCK BLOCK_16X16
+#define SUB_BH 16
+#define SUB_BW 16
+#define MAX_FILTER_TAP 12
+
+typedef int16_t InterpKernel12[MAX_FILTER_TAP];
+
+// 12-tap filter (used by the encoder only).
+DECLARE_ALIGNED(256, static const InterpKernel12,
+                sub_pel_filters_12[SUBPEL_SHIFTS]) = {
+  { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+  { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 },
+  { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 },
+  { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 },
+  { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 },
+  { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 },
+  { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 },
+  { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 },
+  { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 },
+  { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 },
+  { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 },
+  { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 },
+  { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 },
+  { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 },
+  { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 }
+};
+
 void vp9_temporal_filter_init(void);
-void vp9_temporal_filter(VP9_COMP *cpi, int distance);
+void vp9_temporal_filter(struct VP9_COMP *cpi, int distance);
+
+void vp9_temporal_filter_iterate_row_c(struct VP9_COMP *cpi,
+                                       struct ThreadData *td, int mb_row,
+                                       int mb_col_start, int mb_col_end);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+#endif  // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h
new file mode 100644
index 0000000000..8776dfc068
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h
@@ -0,0 +1,410 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
+#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
+#include "./vpx_config.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32758 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+  TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+  TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
+#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
+#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
+#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
+#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
+#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
+#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
+#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
+#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t
+    *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+    };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+    };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+    };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4
+    };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
+    };
+
+static const uint32_t
+    *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4
+    };
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#define DIST_STRIDE ((BW) + 2)
+
+#endif  // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c
index dc2616dbe1..6c6c04493f 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c
@@ -123,7 +123,7 @@ const int16_t vp9_cat6_low_cost[256] = {
   6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831,
   6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982
 };
-const int vp9_cat6_high_cost[64] = {
+const uint16_t vp9_cat6_high_cost[64] = {
   88,    2251,  2727,  4890,  3148,  5311,  5787,  7950,  3666,  5829,  6305,
   8468,  6726,  8889,  9365,  11528, 3666,  5829,  6305,  8468,  6726,  8889,
   9365,  11528, 7244,  9407,  9883,  12046, 10304, 12467, 12943, 15106, 3666,
@@ -133,7 +133,7 @@ const int vp9_cat6_high_cost[64] = {
 };
 
 #if CONFIG_VP9_HIGHBITDEPTH
-const int vp9_cat6_high10_high_cost[256] = {
+const uint16_t vp9_cat6_high10_high_cost[256] = {
   94,    2257,  2733,  4896,  3154,  5317,  5793,  7956,  3672,  5835,  6311,
   8474,  6732,  8895,  9371,  11534, 3672,  5835,  6311,  8474,  6732,  8895,
   9371,  11534, 7250,  9413,  9889,  12052, 10310, 12473, 12949, 15112, 3672,
@@ -159,7 +159,7 @@ const int vp9_cat6_high10_high_cost[256] = {
   18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074,
   24237, 24713, 26876
 };
-const int vp9_cat6_high12_high_cost[1024] = {
+const uint16_t vp9_cat6_high12_high_cost[1024] = {
   100,   2263,  2739,  4902,  3160,  5323,  5799,  7962,  3678,  5841,  6317,
   8480,  6738,  8901,  9377,  11540, 3678,  5841,  6317,  8480,  6738,  8901,
   9377,  11540, 7256,  9419,  9895,  12058, 10316, 12479, 12955, 15118, 3678,
@@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col,
   const PLANE_TYPE type = get_plane_type(plane);
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   const int16_t *scan, *nb;
-  const scan_order *so;
+  const ScanOrder *so;
   const int ref = is_inter_block(mi);
   unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h
index c905715d7d..6407ff9237 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_TOKENIZE_H_
-#define VP9_ENCODER_VP9_TOKENIZE_H_
+#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_
+#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_
 
 #include "vp9/common/vp9_entropy.h"
 
@@ -76,25 +76,18 @@ extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
 extern const int *vp9_dct_cat_lt_10_value_cost;
 extern const int16_t vp9_cat6_low_cost[256];
-extern const int vp9_cat6_high_cost[64];
-extern const int vp9_cat6_high10_high_cost[256];
-extern const int vp9_cat6_high12_high_cost[1024];
-static INLINE int vp9_get_cost(int16_t token, EXTRABIT extrabits,
-                               const int *cat6_high_table) {
-  if (token != CATEGORY6_TOKEN)
-    return vp9_extra_bits[token].cost[extrabits >> 1];
-  return vp9_cat6_low_cost[(extrabits >> 1) & 0xff] +
-         cat6_high_table[extrabits >> 9];
-}
+extern const uint16_t vp9_cat6_high_cost[64];
+extern const uint16_t vp9_cat6_high10_high_cost[256];
+extern const uint16_t vp9_cat6_high12_high_cost[1024];
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE const int *vp9_get_high_cost_table(int bit_depth) {
+static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) {
   return bit_depth == 8 ? vp9_cat6_high_cost
                         : (bit_depth == 10 ? vp9_cat6_high10_high_cost
                                            : vp9_cat6_high12_high_cost);
 }
 #else
-static INLINE const int *vp9_get_high_cost_table(int bit_depth) {
+static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) {
   (void)bit_depth;
   return vp9_cat6_high_cost;
 }
@@ -118,7 +111,7 @@ static INLINE int16_t vp9_get_token(int v) {
 }
 
 static INLINE int vp9_get_token_cost(int v, int16_t *token,
-                                     const int *cat6_high_table) {
+                                     const uint16_t *cat6_high_table) {
   if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
     EXTRABIT extrabits;
     *token = CATEGORY6_TOKEN;
@@ -134,4 +127,4 @@ static INLINE int vp9_get_token_cost(int v, int16_t *token,
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_TOKENIZE_H_
+#endif  // VPX_VP9_ENCODER_VP9_TOKENIZE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
new file mode 100644
index 0000000000..f65c98f779
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
@@ -0,0 +1,1791 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/common/vp9_mvref_common.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ext_ratectrl.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_tpl_model.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_ext_ratectrl.h"
+
+static int init_gop_frames_rc(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+                              const GF_GROUP *gf_group, int *tpl_group_frames) {
+  VP9_COMMON *cm = &cpi->common;
+  int frame_idx = 0;
+  int i;
+  int extend_frame_count = 0;
+  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+  int frame_gop_offset = 0;
+
+  int added_overlay = 0;
+
+  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+  int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
+
+  memset(recon_frame_index, -1, sizeof(recon_frame_index));
+
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (frame_bufs[i].ref_count == 0) {
+      alloc_frame_mvs(cm, i);
+      if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffer");
+
+      recon_frame_index[frame_idx] = i;
+      ++frame_idx;
+
+      if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
+    }
+  }
+
+  for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
+    assert(recon_frame_index[i] >= 0);
+    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+  }
+
+  *tpl_group_frames = 0;
+
+  int ref_table[3];
+
+  if (gf_group->index == 1 && gf_group->update_type[1] == ARF_UPDATE) {
+    if (gf_group->update_type[0] == KF_UPDATE) {
+      // This is the only frame in ref buffer. We need it to be on
+      // gf_picture[0].
+      for (i = 0; i < 3; ++i) ref_table[i] = -REFS_PER_FRAME;
+
+      gf_picture[0].frame =
+          &cm->buffer_pool
+               ->frame_bufs[cm->ref_frame_map[gf_group->update_ref_idx[0]]]
+               .buf;
+      ref_table[gf_group->update_ref_idx[0]] = 0;
+
+      for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -REFS_PER_FRAME;
+      gf_picture[0].update_type = gf_group->update_type[0];
+    } else {
+      for (i = 0; i < REFS_PER_FRAME; i++) {
+        if (cm->ref_frame_map[i] != -1) {
+          gf_picture[-i].frame =
+              &cm->buffer_pool->frame_bufs[cm->ref_frame_map[i]].buf;
+          ref_table[i] = -i;
+        } else {
+          ref_table[i] = -REFS_PER_FRAME;
+        }
+      }
+      for (i = 0; i < 3; ++i) {
+        gf_picture[0].ref_frame[i] = ref_table[i];
+      }
+    }
+    ++*tpl_group_frames;
+
+    // Initialize base layer ARF frame
+    gf_picture[1].frame = cpi->Source;
+    for (i = 0; i < 3; ++i) gf_picture[1].ref_frame[i] = ref_table[i];
+    gf_picture[1].update_type = gf_group->update_type[1];
+    ref_table[gf_group->update_ref_idx[1]] = 1;
+
+    ++*tpl_group_frames;
+  } else {
+    assert(gf_group->index == 0);
+    if (gf_group->update_type[0] == KF_UPDATE) {
+      // This is the only frame in ref buffer. We need it to be on
+      // gf_picture[0].
+      gf_picture[0].frame = cpi->Source;
+      for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -REFS_PER_FRAME;
+      gf_picture[0].update_type = gf_group->update_type[0];
+
+      for (i = 0; i < 3; ++i) ref_table[i] = -REFS_PER_FRAME;
+      ref_table[gf_group->update_ref_idx[0]] = 0;
+    } else {
+      // Initialize ref table
+      for (i = 0; i < REFS_PER_FRAME; i++) {
+        if (cm->ref_frame_map[i] != -1) {
+          gf_picture[-i].frame =
+              &cm->buffer_pool->frame_bufs[cm->ref_frame_map[i]].buf;
+          ref_table[i] = -i;
+        } else {
+          ref_table[i] = -REFS_PER_FRAME;
+        }
+      }
+      for (i = 0; i < 3; ++i) {
+        gf_picture[0].ref_frame[i] = ref_table[i];
+      }
+      gf_picture[0].update_type = gf_group->update_type[0];
+      if (gf_group->update_type[0] != OVERLAY_UPDATE &&
+          gf_group->update_ref_idx[0] != -1) {
+        ref_table[gf_group->update_ref_idx[0]] = 0;
+      }
+    }
+    ++*tpl_group_frames;
+  }
+
+  int has_arf =
+      gf_group->gf_group_size > 1 && gf_group->update_type[1] == ARF_UPDATE &&
+      gf_group->update_type[gf_group->gf_group_size] == OVERLAY_UPDATE;
+
+  // Initialize P frames
+  for (frame_idx = *tpl_group_frames; frame_idx < MAX_ARF_GOP_SIZE;
+       ++frame_idx) {
+    if (frame_idx >= gf_group->gf_group_size && !has_arf) break;
+    struct lookahead_entry *buf;
+    frame_gop_offset = gf_group->frame_gop_index[frame_idx];
+    buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+    if (buf == NULL) break;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    for (i = 0; i < 3; ++i) {
+      gf_picture[frame_idx].ref_frame[i] = ref_table[i];
+    }
+
+    if (gf_group->update_type[frame_idx] != OVERLAY_UPDATE &&
+        gf_group->update_ref_idx[frame_idx] != -1) {
+      ref_table[gf_group->update_ref_idx[frame_idx]] = frame_idx;
+    }
+
+    gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
+
+    ++*tpl_group_frames;
+
+    // The length of group of pictures is baseline_gf_interval, plus the
+    // beginning golden frame from last GOP, plus the last overlay frame in
+    // the same GOP.
+    if (frame_idx == gf_group->gf_group_size) {
+      added_overlay = 1;
+
+      ++frame_idx;
+      ++frame_gop_offset;
+      break;
+    }
+
+    if (frame_idx == gf_group->gf_group_size - 1 &&
+        gf_group->update_type[gf_group->gf_group_size] != OVERLAY_UPDATE) {
+      ++frame_idx;
+      ++frame_gop_offset;
+      break;
+    }
+  }
+
+  int lst_index = frame_idx - 1;
+  // Extend two frames outside the current gf group.
+  for (; has_arf && frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2;
+       ++frame_idx) {
+    struct lookahead_entry *buf =
+        vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+    if (buf == NULL) break;
+
+    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gf_picture[lst_index].ref_frame[0];
+    gf_picture[frame_idx].ref_frame[1] = gf_picture[lst_index].ref_frame[1];
+    gf_picture[frame_idx].ref_frame[2] = gf_picture[lst_index].ref_frame[2];
+
+    if (gf_picture[frame_idx].ref_frame[0] >
+            gf_picture[frame_idx].ref_frame[1] &&
+        gf_picture[frame_idx].ref_frame[0] >
+            gf_picture[frame_idx].ref_frame[2]) {
+      gf_picture[frame_idx].ref_frame[0] = lst_index;
+    } else if (gf_picture[frame_idx].ref_frame[1] >
+                   gf_picture[frame_idx].ref_frame[0] &&
+               gf_picture[frame_idx].ref_frame[1] >
+                   gf_picture[frame_idx].ref_frame[2]) {
+      gf_picture[frame_idx].ref_frame[1] = lst_index;
+    } else {
+      gf_picture[frame_idx].ref_frame[2] = lst_index;
+    }
+
+    gf_picture[frame_idx].update_type = LF_UPDATE;
+    lst_index = frame_idx;
+    ++*tpl_group_frames;
+    ++extend_frame_count;
+    ++frame_gop_offset;
+  }
+
+  return extend_frame_count + added_overlay;
+}
+
+static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+                           const GF_GROUP *gf_group, int *tpl_group_frames) {
+  if (cpi->ext_ratectrl.ready &&
+      (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) {
+    return init_gop_frames_rc(cpi, gf_picture, gf_group, tpl_group_frames);
+  }
+
+  VP9_COMMON *cm = &cpi->common;
+  int frame_idx = 0;
+  int i;
+  int gld_index = -1;
+  int alt_index = -2;
+  int lst_index = -1;
+  int arf_index_stack[MAX_ARF_LAYERS];
+  int arf_stack_size = 0;
+  int extend_frame_count = 0;
+  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+  int frame_gop_offset = 0;
+
+  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+  int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
+
+  memset(recon_frame_index, -1, sizeof(recon_frame_index));
+  stack_init(arf_index_stack, MAX_ARF_LAYERS);
+
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (frame_bufs[i].ref_count == 0) {
+      alloc_frame_mvs(cm, i);
+      if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffer");
+
+      recon_frame_index[frame_idx] = i;
+      ++frame_idx;
+
+      if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
+    }
+  }
+
+  for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
+    assert(recon_frame_index[i] >= 0);
+    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+  }
+
+  *tpl_group_frames = 0;
+
+  // Initialize Golden reference frame.
+  gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -REFS_PER_FRAME;
+  gf_picture[0].update_type = gf_group->update_type[0];
+  gld_index = 0;
+  ++*tpl_group_frames;
+
+  gf_picture[-1].frame = get_ref_frame_buffer(cpi, LAST_FRAME);
+  gf_picture[-2].frame = get_ref_frame_buffer(cpi, ALTREF_FRAME);
+
+  // Initialize base layer ARF frame
+  gf_picture[1].frame = cpi->Source;
+  gf_picture[1].ref_frame[0] = gld_index;
+  gf_picture[1].ref_frame[1] = lst_index;
+  gf_picture[1].ref_frame[2] = alt_index;
+  gf_picture[1].update_type = gf_group->update_type[1];
+  alt_index = 1;
+  ++*tpl_group_frames;
+
+  // Initialize P frames
+  for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+    struct lookahead_entry *buf;
+    frame_gop_offset = gf_group->frame_gop_index[frame_idx];
+    buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+    if (buf == NULL) break;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
+
+    switch (gf_group->update_type[frame_idx]) {
+      case ARF_UPDATE:
+        stack_push(arf_index_stack, alt_index, arf_stack_size);
+        ++arf_stack_size;
+        alt_index = frame_idx;
+        break;
+      case LF_UPDATE: lst_index = frame_idx; break;
+      case OVERLAY_UPDATE:
+        gld_index = frame_idx;
+        alt_index = stack_pop(arf_index_stack, arf_stack_size);
+        --arf_stack_size;
+        break;
+      case USE_BUF_FRAME:
+        lst_index = alt_index;
+        alt_index = stack_pop(arf_index_stack, arf_stack_size);
+        --arf_stack_size;
+        break;
+      default: break;
+    }
+
+    ++*tpl_group_frames;
+
+    // The length of group of pictures is baseline_gf_interval, plus the
+    // beginning golden frame from last GOP, plus the last overlay frame in
+    // the same GOP.
+    if (frame_idx == gf_group->gf_group_size) break;
+  }
+
+  alt_index = -1;
+  ++frame_idx;
+  ++frame_gop_offset;
+
+  // Extend two frames outside the current gf group.
+  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+    struct lookahead_entry *buf =
+        vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+    if (buf == NULL) break;
+
+    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+    gf_picture[frame_idx].frame = &buf->img;
+    gf_picture[frame_idx].ref_frame[0] = gld_index;
+    gf_picture[frame_idx].ref_frame[1] = lst_index;
+    gf_picture[frame_idx].ref_frame[2] = alt_index;
+    gf_picture[frame_idx].update_type = LF_UPDATE;
+    lst_index = frame_idx;
+    ++*tpl_group_frames;
+    ++extend_frame_count;
+    ++frame_gop_offset;
+  }
+
+  return extend_frame_count;
+}
+
+static void init_tpl_stats(VP9_COMP *cpi) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+    memset(tpl_frame->tpl_stats_ptr, 0,
+           tpl_frame->height * tpl_frame->width *
+               sizeof(*tpl_frame->tpl_stats_ptr));
+    tpl_frame->is_valid = 0;
+  }
+}
+
+static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) {
+  int frame_idx;
+  for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+    vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list);
+  }
+  vpx_free(tpl_gop_stats->frame_stats_list);
+}
+
+static void init_tpl_stats_before_propagation(
+    struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats,
+    TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width,
+    int frame_height) {
+  int frame_idx;
+  free_tpl_frame_stats_list(tpl_gop_stats);
+  CHECK_MEM_ERROR(
+      error_info, tpl_gop_stats->frame_stats_list,
+      vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list)));
+  tpl_gop_stats->size = tpl_gop_frames;
+  for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) {
+    const int mi_rows = tpl_stats[frame_idx].mi_rows;
+    const int mi_cols = tpl_stats[frame_idx].mi_cols;
+    CHECK_MEM_ERROR(
+        error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list,
+        vpx_calloc(
+            mi_rows * mi_cols,
+            sizeof(
+                *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list)));
+    tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols;
+    tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width;
+    tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height;
+  }
+}
+
+#if CONFIG_NON_GREEDY_MV
+static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+                                         MotionField *motion_field,
+                                         int frame_idx, uint8_t *cur_frame_buf,
+                                         uint8_t *ref_frame_buf, int stride,
+                                         BLOCK_SIZE bsize, int mi_row,
+                                         int mi_col, MV *mv) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  int step_param;
+  uint32_t bestsme = UINT_MAX;
+  const MvLimits tmp_mv_limits = x->mv_limits;
+  // lambda is used to adjust the importance of motion vector consistency.
+  // TODO(angiebird): Figure out lambda's proper value.
+  const int lambda = cpi->tpl_stats[frame_idx].lambda;
+  int_mv nb_full_mvs[NB_MVS_NUM];
+  int nb_full_mv_num;
+
+  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+  nb_full_mv_num =
+      vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
+  vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                             lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  return bestsme;
+}
+
+static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+                                        uint8_t *cur_frame_buf,
+                                        uint8_t *ref_frame_buf, int stride,
+                                        BLOCK_SIZE bsize, MV *mv) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  uint32_t bestsme = UINT_MAX;
+  uint32_t distortion;
+  uint32_t sse;
+  int cost_list[5];
+
+  MV best_ref_mv1 = { 0, 0 };
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  // TODO(yunqing): may use higher tap interp filter than 2 taps.
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(
+      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+      USE_2_TAPS);
+
+  return bestsme;
+}
+
+#else  // CONFIG_NON_GREEDY_MV
+static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
+                                              uint8_t *cur_frame_buf,
+                                              uint8_t *ref_frame_buf,
+                                              int stride, BLOCK_SIZE bsize,
+                                              MV *mv) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+  const SEARCH_METHODS search_method = NSTEP;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  uint32_t bestsme = UINT_MAX;
+  uint32_t distortion;
+  uint32_t sse;
+  int cost_list[5];
+  const MvLimits tmp_mv_limits = x->mv_limits;
+
+  MV best_ref_mv1 = { 0, 0 };
+  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride;
+
+  step_param = mv_sf->reduce_first_step_size;
+  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+  vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+                        search_method, sadpb, cond_cost_list(cpi, cost_list),
+                        &best_ref_mv1, mv, 0, 0);
+
+  /* restore UMV window */
+  x->mv_limits = tmp_mv_limits;
+
+  // TODO(yunqing): may use higher tap interp filter than 2 taps.
+  // Ignore mv costing by sending NULL pointer instead of cost array
+  bestsme = cpi->find_fractional_mv_step(
+      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+      USE_2_TAPS);
+
+  return bestsme;
+}
+#endif
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
+  int width = 0, height = 0;
+  int bw = 4 << b_width_log2_lookup[bsize];
+  int bh = 4 << b_height_log2_lookup[bsize];
+
+  switch (block) {
+    case 0:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 1:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = grid_pos_row + bh - ref_pos_row;
+      break;
+    case 2:
+      width = grid_pos_col + bw - ref_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    case 3:
+      width = ref_pos_col + bw - grid_pos_col;
+      height = ref_pos_row + bh - grid_pos_row;
+      break;
+    default: assert(0);
+  }
+
+  return width * height;
+}
+
+static int round_floor(int ref_pos, int bsize_pix) {
+  int round;
+  if (ref_pos < 0)
+    round = -(1 + (-ref_pos - 1) / bsize_pix);
+  else
+    round = ref_pos / bsize_pix;
+
+  return round;
+}
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, int stride) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+  int idx, idy;
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
+      const int64_t mc_flow = tpl_ptr->mc_flow;
+      const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
+      *tpl_ptr = *src_stats;
+      tpl_ptr->mc_flow = mc_flow;
+      tpl_ptr->mc_ref_cost = mc_ref_cost;
+      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+    }
+  }
+}
+
+static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
+                                         TplDepStats *tpl_stats, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int src_stride, int64_t recon_error,
+                                         int64_t pred_error, int64_t rate_cost,
+                                         int ref_frame_idx, int mi_rows,
+                                         int mi_cols) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const TplDepStats *src_stats = &tpl_stats[mi_row * src_stride + mi_col];
+  int idx, idy;
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      if (mi_row + idy >= mi_rows || mi_col + idx >= mi_cols) continue;
+      VpxTplBlockStats *tpl_block_stats_ptr =
+          &tpl_block_stats[(mi_row + idy) * mi_cols + mi_col + idx];
+      tpl_block_stats_ptr->row = mi_row * 8 + idy * 8;
+      tpl_block_stats_ptr->col = mi_col * 8 + idx * 8;
+      tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
+      tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
+      // inter/intra_cost here is calculated with SATD which should be close
+      // enough to be used as inter/intra_pred_error
+      tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost;
+      tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost;
+      tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+      tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+      tpl_block_stats_ptr->pred_error = pred_error << TPL_DEP_COST_SCALE_LOG2;
+      tpl_block_stats_ptr->mv_r = (src_stats->mv.as_mv.row >= 0 ? 1 : -1) *
+                                  (abs(src_stats->mv.as_mv.row) + 4) / 8;
+      tpl_block_stats_ptr->mv_c = (src_stats->mv.as_mv.col >= 0 ? 1 : -1) *
+                                  (abs(src_stats->mv.as_mv.col) + 4) / 8;
+      tpl_block_stats_ptr->ref_frame_index = ref_frame_idx;
+    }
+  }
+}
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  if (tpl_stats->ref_frame_index < 0) return;
+
+  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+  MV mv = tpl_stats->mv.as_mv;
+  int mv_row = mv.row >> 3;
+  int mv_col = mv.col >> 3;
+
+  int ref_pos_row = mi_row * MI_SIZE + mv_row;
+  int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int pix_num = bw * bh;
+
+  // top-left on grid block location in pixel
+  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+  int block;
+
+  for (block = 0; block < 4; ++block) {
+    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+      int overlap_area = get_overlap_area(
+          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
+      int64_t mc_flow = tpl_stats->mc_dep_cost -
+                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+                            tpl_stats->intra_cost;
+
+      int idx, idy;
+
+      for (idy = 0; idy < mi_height; ++idy) {
+        for (idx = 0; idx < mi_width; ++idx) {
+          TplDepStats *des_stats =
+              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+                         (ref_mi_col + idx)];
+
+          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+          des_stats->mc_ref_cost +=
+              ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
+              pix_num;
+          assert(overlap_area >= 0);
+        }
+      }
+    }
+  }
+}
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+  int idx, idy;
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+  for (idy = 0; idy < mi_height; ++idy) {
+    for (idx = 0; idx < mi_width; ++idx) {
+      TplDepStats *tpl_ptr =
+          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+                         BLOCK_8X8);
+    }
+  }
+}
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               TX_SIZE tx_size, int64_t *recon_error,
+                               int64_t *sse, uint16_t *eob) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+  const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  // skip block condition should be handled before this is called.
+  assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff,
+                                 pd->dequant, eob, scan_order);
+  } else {
+    vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob,
+                          scan_order);
+  }
+#else
+  vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob,
+                        scan_order);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  *recon_error = VPXMAX(*recon_error, 1);
+
+  *sse = (*sse) >> shift;
+  *sse = VPXMAX(*sse, 1);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                             TX_SIZE tx_size) {
+  // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
+  switch (tx_size) {
+    case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
+    default: assert(0);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                      TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
+    default: assert(0);
+  }
+}
+
+static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+                          int mi_col) {
+  x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+  x->mv_limits.row_max =
+      (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
+  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+  x->mv_limits.col_max =
+      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT];
+  int rate_cost = 1;
+  int idx;
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+  for (idx = 0; idx < eob; ++idx) {
+    unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+    rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0);
+  }
+
+  return (rate_cost << VP9_PROB_COST_SHIFT);
+}
+
+static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                            struct scale_factors *sf, GF_PICTURE *gf_picture,
+                            int frame_idx, TplDepFrame *tpl_frame,
+                            int16_t *src_diff, tran_low_t *coeff,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+                            int64_t *recon_error, int64_t *rate_cost,
+                            int64_t *sse, int *ref_frame_idx) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int pix_num = bw * bh;
+  int best_rf_idx = -1;
+  int_mv best_mv;
+  int64_t best_inter_cost = INT64_MAX;
+  int64_t inter_cost;
+  int rf_idx;
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE mode;
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  MODE_INFO mi_above, mi_left;
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  TplDepStats *tpl_stats =
+      &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+  xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
+  xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
+
+  // Intra prediction search
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+    uint8_t *src, *dst;
+    int src_stride, dst_stride;
+
+    src = xd->cur_buf->y_buffer + mb_y_offset;
+    src_stride = xd->cur_buf->y_stride;
+
+    dst = &predictor[0];
+    dst_stride = bw;
+
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+    vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
+                            src_stride, dst, dst_stride, 0, 0, 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                                dst_stride, xd->bd);
+      vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      intra_cost = vpx_highbd_satd(coeff, pix_num);
+    } else {
+      vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                         dst_stride);
+      vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      intra_cost = vpx_satd(coeff, pix_num);
+    }
+#else
+    vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
+    vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+    intra_cost = vpx_satd(coeff, pix_num);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+  }
+
+  // Motion compensated prediction
+  best_mv.as_int = 0;
+
+  set_mv_limits(cm, x, mi_row, mi_col);
+
+  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+    int_mv mv;
+#if CONFIG_NON_GREEDY_MV
+    MotionField *motion_field;
+#endif
+    if (ref_frame[rf_idx] == NULL) continue;
+
+#if CONFIG_NON_GREEDY_MV
+    (void)td;
+    motion_field = vp9_motion_field_info_get_motion_field(
+        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+    mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+#else
+    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                                  xd->cur_buf->y_stride, bsize, &mv.as_mv);
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vp9_highbd_build_inter_predictor(
+          CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
+          ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
+          &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
+          mi_row * MI_SIZE, xd->bd);
+      vpx_highbd_subtract_block(
+          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+      vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      inter_cost = vpx_highbd_satd(coeff, pix_num);
+    } else {
+      vp9_build_inter_predictor(
+          ref_frame[rf_idx]->y_buffer + mb_y_offset,
+          ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
+          0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+      vpx_subtract_block(bh, bw, src_diff, bw,
+                         xd->cur_buf->y_buffer + mb_y_offset,
+                         xd->cur_buf->y_stride, &predictor[0], bw);
+      vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      inter_cost = vpx_satd(coeff, pix_num);
+    }
+#else
+    vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
+                              ref_frame[rf_idx]->y_stride, &predictor[0], bw,
+                              &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
+                              mi_col * MI_SIZE, mi_row * MI_SIZE);
+    vpx_subtract_block(bh, bw, src_diff, bw,
+                       xd->cur_buf->y_buffer + mb_y_offset,
+                       xd->cur_buf->y_stride, &predictor[0], bw);
+    vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+    inter_cost = vpx_satd(coeff, pix_num);
+#endif
+
+    if (inter_cost < best_inter_cost) {
+      uint16_t eob = 0;
+      best_rf_idx = rf_idx;
+      best_inter_cost = inter_cost;
+      best_mv.as_int = mv.as_int;
+      // Since best_inter_cost is initialized as INT64_MAX, recon_error and
+      // rate_cost will be calculated with the best reference frame.
+      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
+                         sse, &eob);
+      *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+    }
+  }
+  best_intra_cost = VPXMAX(best_intra_cost, 1);
+  best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
+  tpl_stats->inter_cost = VPXMAX(
+      1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+  tpl_stats->intra_cost = VPXMAX(
+      1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+  if (best_rf_idx >= 0) {
+    tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+  }
+  tpl_stats->mv.as_int = best_mv.as_int;
+  *ref_frame_idx = best_rf_idx;
+}
+
+#if CONFIG_NON_GREEDY_MV
+static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
+                                  int frame_idx, int rf_idx, int mi_row,
+                                  int mi_col, struct buf_2d *src,
+                                  struct buf_2d *pre) {
+  const int mb_y_offset =
+      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  YV12_BUFFER_CONFIG *ref_frame = NULL;
+  int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+  if (ref_frame_idx != -1) {
+    ref_frame = gf_picture[ref_frame_idx].frame;
+    src->buf = xd->cur_buf->y_buffer + mb_y_offset;
+    src->stride = xd->cur_buf->y_stride;
+    pre->buf = ref_frame->y_buffer + mb_y_offset;
+    pre->stride = ref_frame->y_stride;
+    assert(src->stride == pre->stride);
+    return 1;
+  } else {
+    printf("invalid ref_frame_idx");
+    assert(ref_frame_idx != -1);
+    return 0;
+  }
+}
+
+#define kMvPreCheckLines 5
+#define kMvPreCheckSize 15
+
+#define MV_REF_POS_NUM 3
+POSITION mv_ref_pos[MV_REF_POS_NUM] = {
+  { -1, 0 },
+  { 0, -1 },
+  { -1, -1 },
+};
+
+static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
+                             int mi_col) {
+  return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
+}
+
+static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
+                          BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  int i;
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int_mv nearest_mv, near_mv, invalid_mv;
+  nearest_mv.as_int = INVALID_MV;
+  near_mv.as_int = INVALID_MV;
+  invalid_mv.as_int = INVALID_MV;
+  for (i = 0; i < MV_REF_POS_NUM; ++i) {
+    int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
+    int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
+    assert(mv_ref_pos[i].row <= 0);
+    assert(mv_ref_pos[i].col <= 0);
+    if (nb_row >= 0 && nb_col >= 0) {
+      if (nearest_mv.as_int == INVALID_MV) {
+        nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+      } else {
+        int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+        if (mv.as_int == nearest_mv.as_int) {
+          continue;
+        } else {
+          near_mv = mv;
+          break;
+        }
+      }
+    }
+  }
+  if (nearest_mv.as_int == INVALID_MV) {
+    nearest_mv.as_mv.row = 0;
+    nearest_mv.as_mv.col = 0;
+  }
+  if (near_mv.as_int == INVALID_MV) {
+    near_mv.as_mv.row = 0;
+    near_mv.as_mv.col = 0;
+  }
+  if (mv_mode == NEAREST_MV_MODE) {
+    return nearest_mv;
+  }
+  if (mv_mode == NEAR_MV_MODE) {
+    return near_mv;
+  }
+  assert(0);
+  return invalid_mv;
+}
+
+static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
+                                  MotionField *motion_field,
+                                  TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
+                                  int mi_row, int mi_col) {
+  int_mv mv;
+  switch (mv_mode) {
+    case ZERO_MV_MODE:
+      mv.as_mv.row = 0;
+      mv.as_mv.col = 0;
+      break;
+    case NEW_MV_MODE:
+      mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+      break;
+    case NEAREST_MV_MODE:
+      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+      break;
+    case NEAR_MV_MODE:
+      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+      break;
+    default:
+      mv.as_int = INVALID_MV;
+      assert(0);
+      break;
+  }
+  return mv;
+}
+
+static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
+                          GF_PICTURE *gf_picture, MotionField *motion_field,
+                          int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+                          BLOCK_SIZE bsize, int mi_row, int mi_col,
+                          int_mv *mv) {
+  uint32_t sse;
+  struct buf_2d src;
+  struct buf_2d pre;
+  MV full_mv;
+  *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
+                            mi_row, mi_col);
+  full_mv = get_full_mv(&mv->as_mv);
+  if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
+                             &src, &pre)) {
+    // TODO(angiebird): Consider subpixel when computing the sse.
+    cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
+                          pre.stride, &sse);
+    return (double)(sse << VP9_DIST_SCALE_LOG2);
+  } else {
+    assert(0);
+    return 0;
+  }
+}
+
+static int get_mv_mode_cost(int mv_mode) {
+  // TODO(angiebird): The probabilities are roughly inferred from
+  // default_inter_mode_probs. Check if there is a better way to set the
+  // probabilities.
+  const int zero_mv_prob = 16;
+  const int new_mv_prob = 24 * 1;
+  const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
+  assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
+  switch (mv_mode) {
+    case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
+    case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
+    case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+    case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+    default: assert(0); return -1;
+  }
+}
+
+static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
+  double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
+                        log2(1 + abs(new_mv->col - ref_mv->col));
+  mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
+  return mv_diff_cost;
+}
+static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
+                          TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
+                          int mi_col) {
+  double mv_cost = get_mv_mode_cost(mv_mode);
+  if (mv_mode == NEW_MV_MODE) {
+    MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
+                                    bsize, mi_row, mi_col)
+                    .as_mv;
+    MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
+                                        tpl_frame, bsize, mi_row, mi_col)
+                        .as_mv;
+    MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
+                                     bsize, mi_row, mi_col)
+                     .as_mv;
+    double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
+    double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
+    mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
+  }
+  return mv_cost;
+}
+
+static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
+                           GF_PICTURE *gf_picture, MotionField *motion_field,
+                           int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+                           BLOCK_SIZE bsize, int mi_row, int mi_col,
+                           int_mv *mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  double mv_dist =
+      get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
+                  tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
+  double mv_cost =
+      get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
+  double mult = 180;
+
+  return mv_cost + mult * log2f(1 + mv_dist);
+}
+
+static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                 GF_PICTURE *gf_picture,
+                                 MotionField *motion_field, int frame_idx,
+                                 TplDepFrame *tpl_frame, int rf_idx,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 double *rd, int_mv *mv) {
+  int best_mv_mode = ZERO_MV_MODE;
+  int update = 0;
+  int mv_mode;
+  *rd = 0;
+  for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
+    double this_rd;
+    int_mv this_mv;
+    if (mv_mode == NEW_MV_MODE) {
+      continue;
+    }
+    this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
+                           tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
+    if (update == 0) {
+      *rd = this_rd;
+      *mv = this_mv;
+      best_mv_mode = mv_mode;
+      update = 1;
+    } else {
+      if (this_rd < *rd) {
+        *rd = this_rd;
+        *mv = this_mv;
+        best_mv_mode = mv_mode;
+      }
+    }
+  }
+  return best_mv_mode;
+}
+
+static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                            GF_PICTURE *gf_picture, MotionField *motion_field,
+                            int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+                            BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  int tmp_mv_mode_arr[kMvPreCheckSize];
+  int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
+  double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
+  int_mv *select_mv_arr = cpi->select_mv_arr;
+  int_mv tmp_select_mv_arr[kMvPreCheckSize];
+  int stride = tpl_frame->stride;
+  double new_mv_rd = 0;
+  double no_new_mv_rd = 0;
+  double this_new_mv_rd = 0;
+  double this_no_new_mv_rd = 0;
+  int idx;
+  int tmp_idx;
+  assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
+
+  // no new mv
+  // diagonal scan order
+  tmp_idx = 0;
+  for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+    int r;
+    for (r = 0; r <= idx; ++r) {
+      int c = idx - r;
+      int nb_row = mi_row + r * mi_height;
+      int nb_col = mi_col + c * mi_width;
+      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+        double this_rd;
+        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+            bsize, nb_row, nb_col, &this_rd, mv);
+        if (r == 0 && c == 0) {
+          this_no_new_mv_rd = this_rd;
+        }
+        no_new_mv_rd += this_rd;
+        tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
+        tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
+        ++tmp_idx;
+      }
+    }
+  }
+
+  // new mv
+  mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
+  this_new_mv_rd = eval_mv_mode(
+      NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+      rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
+  new_mv_rd = this_new_mv_rd;
+  // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
+  // beforehand.
+  for (idx = 1; idx < kMvPreCheckLines; ++idx) {
+    int r;
+    for (r = 0; r <= idx; ++r) {
+      int c = idx - r;
+      int nb_row = mi_row + r * mi_height;
+      int nb_col = mi_col + c * mi_width;
+      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+        double this_rd;
+        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+            bsize, nb_row, nb_col, &this_rd, mv);
+        new_mv_rd += this_rd;
+      }
+    }
+  }
+
+  // update best_mv_mode
+  tmp_idx = 0;
+  if (no_new_mv_rd < new_mv_rd) {
+    for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+      int r;
+      for (r = 0; r <= idx; ++r) {
+        int c = idx - r;
+        int nb_row = mi_row + r * mi_height;
+        int nb_col = mi_col + c * mi_width;
+        if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+          mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
+          select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
+          ++tmp_idx;
+        }
+      }
+    }
+    rd_diff_arr[mi_row * stride + mi_col] = 0;
+  } else {
+    rd_diff_arr[mi_row * stride + mi_col] =
+        (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
+  }
+}
+
+static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
+                                GF_PICTURE *gf_picture,
+                                MotionField *motion_field, int frame_idx,
+                                TplDepFrame *tpl_frame, int rf_idx,
+                                BLOCK_SIZE bsize) {
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int unit_rows = tpl_frame->mi_rows / mi_height;
+  const int unit_cols = tpl_frame->mi_cols / mi_width;
+  const int max_diagonal_lines = unit_rows + unit_cols - 1;
+  int idx;
+  for (idx = 0; idx < max_diagonal_lines; ++idx) {
+    int r;
+    for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
+         ++r) {
+      int c = idx - r;
+      int mi_row = r * mi_height;
+      int mi_col = c * mi_width;
+      assert(c >= 0 && c < unit_cols);
+      assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
+      assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
+      predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+                      rf_idx, bsize, mi_row, mi_col);
+    }
+  }
+}
+
+static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
+                             MotionField *motion_field, int frame_idx,
+                             YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
+                             int mi_row, int mi_col) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mb_y_offset =
+      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  assert(ref_frame != NULL);
+  set_mv_limits(cm, x, mi_row, mi_col);
+  {
+    int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+    uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
+    uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
+    const int stride = xd->cur_buf->y_stride;
+    full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
+                             ref_frame_buf, stride, bsize, mi_row, mi_col,
+                             &mv.as_mv);
+    sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
+                            bsize, &mv.as_mv);
+    vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
+  }
+}
+
+static void build_motion_field(
+    VP9_COMP *cpi, int frame_idx,
+    YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
+  VP9_COMMON *cm = &cpi->common;
+  ThreadData *td = &cpi->td;
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+  const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
+  int mi_row, mi_col;
+  int rf_idx;
+
+  tpl_frame->lambda = (pw * ph) >> 2;
+  assert(pw * ph == tpl_frame->lambda << 2);
+
+  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+    MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+    if (ref_frame[rf_idx] == NULL) {
+      continue;
+    }
+    vp9_motion_field_reset_mvs(motion_field);
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+        do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
+                         bsize, mi_row, mi_col);
+      }
+    }
+  }
+}
+#endif  // CONFIG_NON_GREEDY_MV
+
+static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+                              int frame_idx, BLOCK_SIZE bsize) {
+  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+  VpxTplFrameStats *tpl_frame_stats_before_propagation =
+      &cpi->tpl_gop_stats.frame_stats_list[frame_idx];
+  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+  YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
+
+  VP9_COMMON *cm = &cpi->common;
+  struct scale_factors sf;
+  int rdmult, idx;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int mi_row, mi_col;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
+  uint8_t *predictor;
+#else
+  DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
+#endif
+  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+  tpl_frame_stats_before_propagation->frame_width = cm->width;
+  tpl_frame_stats_before_propagation->frame_height = cm->height;
+  // Setup scaling factor
+#if CONFIG_VP9_HIGHBITDEPTH
+  vp9_setup_scale_factors_for_frame(
+      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      this_frame->y_crop_width, this_frame->y_crop_height,
+      cpi->common.use_highbitdepth);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    predictor = CONVERT_TO_BYTEPTR(predictor16);
+  else
+    predictor = predictor8;
+#else
+  vp9_setup_scale_factors_for_frame(
+      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+      this_frame->y_crop_width, this_frame->y_crop_height);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Prepare reference frame pointers. If any reference frame slot is
+  // unavailable, the pointer will be set to Null.
+  for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
+    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+    if (rf_idx != -REFS_PER_FRAME) ref_frame[idx] = gf_picture[rf_idx].frame;
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+  xd->cur_buf = this_frame;
+
+  // Get rd multiplier set up.
+  rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
+
+  tpl_frame->is_valid = 1;
+
+  cm->base_qindex = tpl_frame->base_qindex;
+  vp9_frame_init_quantizer(cpi);
+
+#if CONFIG_NON_GREEDY_MV
+  {
+    int square_block_idx;
+    int rf_idx;
+    for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+         ++square_block_idx) {
+      BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
+      build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
+    }
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+      if (ref_frame_idx != -1) {
+        MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+            &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+        predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
+                            tpl_frame, rf_idx, bsize);
+      }
+    }
+  }
+#endif  // CONFIG_NON_GREEDY_MV
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+      int64_t recon_error = 0;
+      int64_t rate_cost = 0;
+      int64_t sse = 0;
+      // Ref frame index in the ref frame buffer.
+      int ref_frame_idx = -1;
+      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
+                      src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
+                      tx_size, ref_frame, predictor, &recon_error, &rate_cost,
+                      &sse, &ref_frame_idx);
+      // Motion flow dependency dispenser.
+      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                      tpl_frame->stride);
+
+      tpl_store_before_propagation(
+          tpl_frame_stats_before_propagation->block_stats_list,
+          tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride,
+          recon_error, sse, rate_cost, ref_frame_idx, tpl_frame->mi_rows,
+          tpl_frame->mi_cols);
+
+      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+                       bsize);
+    }
+  }
+}
+
+static void trim_tpl_stats(struct vpx_internal_error_info *error_info,
+                           VpxTplGopStats *tpl_gop_stats, int extra_frames) {
+  int i;
+  VpxTplFrameStats *new_frame_stats;
+  const int new_size = tpl_gop_stats->size - extra_frames;
+  if (tpl_gop_stats->size <= extra_frames)
+    vpx_internal_error(
+        error_info, VPX_CODEC_ERROR,
+        "The number of frames in VpxTplGopStats is fewer than expected.");
+  CHECK_MEM_ERROR(error_info, new_frame_stats,
+                  vpx_calloc(new_size, sizeof(*new_frame_stats)));
+  for (i = 0; i < new_size; i++) {
+    VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+    const int num_blocks = frame_stats->num_blocks;
+    new_frame_stats[i].num_blocks = frame_stats->num_blocks;
+    new_frame_stats[i].frame_width = frame_stats->frame_width;
+    new_frame_stats[i].frame_height = frame_stats->frame_height;
+    new_frame_stats[i].num_blocks = num_blocks;
+    CHECK_MEM_ERROR(
+        error_info, new_frame_stats[i].block_stats_list,
+        vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list)));
+    memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list,
+           num_blocks * sizeof(*new_frame_stats[i].block_stats_list));
+  }
+  free_tpl_frame_stats_list(tpl_gop_stats);
+  tpl_gop_stats->size = new_size;
+  tpl_gop_stats->frame_stats_list = new_frame_stats;
+}
+
+#if CONFIG_NON_GREEDY_MV
+#define DUMP_TPL_STATS 0
+#if DUMP_TPL_STATS
+static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
+  int i, j;
+  printf("%d %d\n", h, w);
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      printf("%d ", buf[(row + i) * stride + col + j]);
+    }
+  }
+  printf("\n");
+}
+
+static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
+  dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
+           frame_buf->y_width);
+  dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
+           frame_buf->uv_height, frame_buf->uv_width);
+  dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
+           frame_buf->uv_height, frame_buf->uv_width);
+}
+
+static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
+                           const GF_GROUP *gf_group,
+                           const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
+  int frame_idx;
+  const VP9_COMMON *cm = &cpi->common;
+  int rf_idx;
+  for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+      int mi_row, mi_col;
+      int ref_frame_idx;
+      const int mi_height = num_8x8_blocks_high_lookup[bsize];
+      const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+      ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+      if (ref_frame_idx != -1) {
+        YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
+        const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
+        const int ref_gf_frame_offset =
+            gf_group->frame_gop_index[ref_frame_idx];
+        printf("=\n");
+        printf(
+            "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
+            "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
+            frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
+            ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
+        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+              int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
+                                                       frame_idx, rf_idx, bsize,
+                                                       mi_row, mi_col);
+              printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
+                     mv.as_mv.col);
+            }
+          }
+        }
+        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+              const TplDepStats *tpl_ptr =
+                  &tpl_frame
+                       ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+              printf("%f ", tpl_ptr->feature_score);
+            }
+          }
+        }
+        printf("\n");
+
+        for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+          for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+            const int mv_mode =
+                tpl_frame
+                    ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
+            printf("%d ", mv_mode);
+          }
+        }
+        printf("\n");
+
+        dump_frame_buf(gf_picture[frame_idx].frame);
+        dump_frame_buf(ref_frame_buf);
+      }
+    }
+  }
+}
+#endif  // DUMP_TPL_STATS
+#endif  // CONFIG_NON_GREEDY_MV
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int frame;
+
+  const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_NON_GREEDY_MV
+  int rf_idx;
+
+  vpx_free(cpi->select_mv_arr);
+  CHECK_MEM_ERROR(
+      &cm->error, cpi->select_mv_arr,
+      vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
+#endif
+
+  // TODO(jingning): Reduce the actual memory use for tpl model build up.
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+    if (cpi->tpl_stats[frame].width >= mi_cols &&
+        cpi->tpl_stats[frame].height >= mi_rows &&
+        cpi->tpl_stats[frame].tpl_stats_ptr)
+      continue;
+
+#if CONFIG_NON_GREEDY_MV
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
+                    vpx_calloc(mi_rows * mi_cols,
+                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+    cpi->tpl_stats[frame].is_valid = 0;
+    cpi->tpl_stats[frame].width = mi_cols;
+    cpi->tpl_stats[frame].height = mi_rows;
+    cpi->tpl_stats[frame].stride = mi_cols;
+    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+  }
+
+  for (frame = 0; frame < REF_FRAMES; ++frame) {
+    cpi->enc_frame_buf[frame].mem_valid = 0;
+    cpi->enc_frame_buf[frame].released = 1;
+  }
+}
+
+void vp9_free_tpl_buffer(VP9_COMP *cpi) {
+  int frame;
+#if CONFIG_NON_GREEDY_MV
+  vp9_free_motion_field_info(&cpi->motion_field_info);
+  vpx_free(cpi->select_mv_arr);
+#endif
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+#if CONFIG_NON_GREEDY_MV
+    int rf_idx;
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    cpi->tpl_stats[frame].is_valid = 0;
+  }
+  free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
+}
+
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int gop_length = cpi->twopass.gf_group.gf_group_size;
+  int bottom_index, top_index;
+  int idx;
+  const int gf_index = cpi->twopass.gf_group.index;
+  const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+  const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+  const int sb_size = num_8x8_blocks_wide_lookup[BLOCK_64X64] * MI_SIZE;
+  const int frame_height_sb = (cm->height + sb_size - 1) / sb_size;
+  const int frame_width_sb = (cm->width + sb_size - 1) / sb_size;
+
+  vpx_codec_err_t codec_status;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  vpx_rc_encodeframe_decision_t encode_frame_decision;
+
+  CHECK_MEM_ERROR(
+      &cm->error, encode_frame_decision.sb_params_list,
+      (sb_params *)vpx_malloc(frame_height_sb * frame_width_sb *
+                              sizeof(*encode_frame_decision.sb_params_list)));
+
+  for (idx = gf_index; idx <= gop_length; ++idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+    int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+    cpi->twopass.gf_group.index = idx;
+    vp9_rc_set_frame_target(cpi, target_rate);
+    vp9_configure_buffer_updates(cpi, idx);
+    if (cpi->ext_ratectrl.ready &&
+        (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 &&
+        cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) {
+      if (idx == gop_length) break;
+      memset(encode_frame_decision.sb_params_list, 0,
+             sizeof(*encode_frame_decision.sb_params_list) * frame_height_sb *
+                 frame_width_sb);
+      codec_status = vp9_extrc_get_encodeframe_decision(
+          &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision);
+      if (codec_status != VPX_CODEC_OK) {
+        vpx_internal_error(&cm->error, codec_status,
+                           "vp9_extrc_get_encodeframe_decision() failed");
+      }
+      for (int i = 0; i < frame_height_sb * frame_width_sb; ++i) {
+        cpi->sb_mul_scale[i] =
+            (((int64_t)encode_frame_decision.sb_params_list[i].rdmult * 256) /
+             (encode_frame_decision.rdmult + 1));
+      }
+      tpl_frame->base_qindex = encode_frame_decision.q_index;
+    } else {
+      tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass(
+          cpi, &bottom_index, &top_index, idx);
+      tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+    }
+  }
+  // Reset the actual index and frame update
+  cpi->twopass.gf_group.index = gf_index;
+  cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+  cpi->common.refresh_frame_context = refresh_frame_context;
+  vp9_configure_buffer_updates(cpi, gf_index);
+
+  vpx_free(encode_frame_decision.sb_params_list);
+}
+
+void vp9_setup_tpl_stats(VP9_COMP *cpi) {
+  GF_PICTURE gf_picture_buf[MAX_ARF_GOP_SIZE + REFS_PER_FRAME];
+  GF_PICTURE *gf_picture = &gf_picture_buf[REFS_PER_FRAME];
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int tpl_group_frames = 0;
+  int frame_idx;
+  int extended_frame_count;
+  cpi->tpl_bsize = BLOCK_32X32;
+
+  memset(gf_picture_buf, 0, sizeof(gf_picture_buf));
+  extended_frame_count =
+      init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
+
+  init_tpl_stats(cpi);
+
+  init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
+                                    cpi->tpl_stats, tpl_group_frames,
+                                    cpi->common.width, cpi->common.height);
+
+  // Backward propagation from tpl_group_frames to 1.
+  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
+    if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
+    mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
+  }
+
+  if (cpi->ext_ratectrl.ready &&
+      cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) {
+    // Intra search on key frame
+    if (gf_group->update_type[0] != OVERLAY_UPDATE) {
+      mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize);
+    }
+    // TPL stats has extra frames from next GOP. Trim those extra frames for
+    // Qmode.
+    trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats,
+                   extended_frame_count);
+    const vpx_codec_err_t codec_status =
+        vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats);
+    if (codec_status != VPX_CODEC_OK) {
+      vpx_internal_error(&cpi->common.error, codec_status,
+                         "vp9_extrc_send_tpl_stats() failed");
+    }
+  }
+
+#if CONFIG_NON_GREEDY_MV
+  cpi->tpl_ready = 1;
+#if DUMP_TPL_STATS
+  dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
+#endif  // DUMP_TPL_STATS
+#endif  // CONFIG_NON_GREEDY_MV
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
new file mode 100644
index 0000000000..de0ac39a1f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+#define log2f(x) (log(x) / (float)M_LOG2_E)
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+typedef struct GF_PICTURE {
+  YV12_BUFFER_CONFIG *frame;
+  int ref_frame[3];
+  FRAME_UPDATE_TYPE update_type;
+} GF_PICTURE;
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi);
+void vp9_setup_tpl_stats(VP9_COMP *cpi);
+void vp9_free_tpl_buffer(VP9_COMP *cpi);
+void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi);
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                      TX_SIZE tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                             TX_SIZE tx_size);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h
index a8b9c2cd31..86c5fa2244 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_TREEWRITER_H_
-#define VP9_ENCODER_VP9_TREEWRITER_H_
+#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_
+#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_
 
 #include "vpx_dsp/bitwriter.h"
 
@@ -48,4 +48,4 @@ static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree,
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_TREEWRITER_H_
+#endif  // VPX_VP9_ENCODER_VP9_TREEWRITER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000000..418edd62b5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+static INLINE void highbd_shuffle_12tap_filter_avx2(const int16_t *filter,
+                                                    __m256i *f) {
+  const __m256i f_low =
+      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)filter));
+  const __m256i f_high = _mm256_broadcastsi128_si256(
+      _mm_loadl_epi64((const __m128i *)(filter + 8)));
+
+  f[0] = _mm256_shuffle_epi32(f_low, 0x00);
+  f[1] = _mm256_shuffle_epi32(f_low, 0x55);
+  f[2] = _mm256_shuffle_epi32(f_low, 0xaa);
+  f[3] = _mm256_shuffle_epi32(f_low, 0xff);
+  f[4] = _mm256_shuffle_epi32(f_high, 0x00);
+  f[5] = _mm256_shuffle_epi32(f_high, 0x55);
+}
+
+static INLINE __m256i highbd_convolve_12tap(const __m256i *s,
+                                            const __m256i *f) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], f[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], f[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], f[2]);
+  const __m256i res_3 = _mm256_madd_epi16(s[3], f[3]);
+  const __m256i res_4 = _mm256_madd_epi16(s[4], f[4]);
+  const __m256i res_5 = _mm256_madd_epi16(s[5], f[5]);
+
+  const __m256i res =
+      _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+                       _mm256_add_epi32(_mm256_add_epi32(res_2, res_3),
+                                        _mm256_add_epi32(res_4, res_5)));
+  return res;
+}
+
+static INLINE void reuse_src_data_avx2(const __m256i *src, __m256i *des) {
+  des[0] = src[0];
+  des[1] = src[1];
+  des[2] = src[2];
+  des[3] = src[3];
+  des[4] = src[4];
+}
+
+void vpx_highbd_convolve12_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint16_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel12 *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h, int bd) {
+  assert(x_step_q4 == 16);
+  (void)y0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint16_t *src_ptr = src;
+  src_ptr -= MAX_FILTER_TAP / 2 - 1;
+  __m256i s[6], f[6];
+  const __m256i rounding = _mm256_set1_epi32(1 << (FILTER_BITS - 1));
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+  highbd_shuffle_12tap_filter_avx2(filter[x0_q4], f);
+
+  for (int j = 0; j < w; j += 8) {
+    for (int i = 0; i < h; i += 2) {
+      // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015
+      const __m256i row0 =
+          _mm256_loadu_si256((const __m256i *)&src_ptr[i * src_stride + j]);
+      // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114
+      // s115
+      const __m256i row1 = _mm256_loadu_si256(
+          (const __m256i *)&src_ptr[(i + 1) * src_stride + j]);
+      // s016 s017 s018 s019 s020 s021 s022 s023
+      const __m128i row0_16 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]);
+      // s116 s117 s118 s119 s120 s121 s122 s123
+      const __m128i row1_16 = _mm_loadu_si128(
+          (const __m128i *)&src_ptr[(i + 1) * src_stride + j + 16]);
+
+      // s00 s01 s02 s03 s04 s05 s06 s07 | s10 s11 s12 s13 s14 s15 s16 s17
+      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+      // s08 s09 s010 s011 s012 s013 s014 s015 | s18 s19 s110 s111 s112 s113
+      // s114 s115
+      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+      // s016 s017 s018 s019 s020 s021 s022 s023 | s116 s117 s118 s119 s120 s121
+      // s122 s123
+      const __m256i r2 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(row0_16), row1_16, 1);
+
+      // even pixels
+      s[0] = r0;
+      s[1] = _mm256_alignr_epi8(r1, r0, 4);
+      s[2] = _mm256_alignr_epi8(r1, r0, 8);
+      s[3] = _mm256_alignr_epi8(r1, r0, 12);
+      s[4] = r1;
+      s[5] = _mm256_alignr_epi8(r2, r1, 4);
+
+      // 00 02 04 06 | 10 12 14 16
+      __m256i res_even = highbd_convolve_12tap(s, f);
+      res_even =
+          _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), FILTER_BITS);
+
+      // odd pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 2);
+      s[1] = _mm256_alignr_epi8(r1, r0, 6);
+      s[2] = _mm256_alignr_epi8(r1, r0, 10);
+      s[3] = _mm256_alignr_epi8(r1, r0, 14);
+      s[4] = _mm256_alignr_epi8(r2, r1, 2);
+      s[5] = _mm256_alignr_epi8(r2, r1, 6);
+
+      // 01 03 05 07 | 11 13 15 17
+      __m256i res_odd = highbd_convolve_12tap(s, f);
+      res_odd =
+          _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), FILTER_BITS);
+
+      // 00 01 02 03 | 10 11 12 13
+      const __m256i res_0 = _mm256_unpacklo_epi32(res_even, res_odd);
+      // 04 05 06 07 | 14 15 16 17
+      const __m256i res_1 = _mm256_unpackhi_epi32(res_even, res_odd);
+      // 00 01 02 03 | 04 05 06 07 | 10 11 12 13 | 14 15 16 17
+      const __m256i res_2 = _mm256_packus_epi32(res_0, res_1);
+      const __m256i res = _mm256_min_epi16(res_2, max);
+      _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res));
+      if (i + 1 < h) {
+        _mm_storeu_si128((__m128i *)(&dst[(i + 1) * dst_stride + j]),
+                         _mm256_extractf128_si256(res, 1));
+      }
+    }
+  }
+}
+
+void vpx_highbd_convolve12_vert_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel12 *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
+                                     int w, int h, int bd) {
+  assert(y_step_q4 == 16);
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint16_t *src_ptr = src;
+  src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+  __m256i s[12], f[6];
+  const __m256i rounding = _mm256_set1_epi32(((1 << FILTER_BITS) >> 1));
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+  highbd_shuffle_12tap_filter_avx2(filter[y0_q4], f);
+
+  for (int j = 0; j < w; j += 8) {
+    __m128i s0 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride + j));
+    __m128i s1 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride + j));
+    __m128i s2 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride + j));
+    __m128i s3 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride + j));
+    __m128i s4 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_stride + j));
+    __m128i s5 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_stride + j));
+    __m128i s6 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_stride + j));
+    __m128i s7 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_stride + j));
+    __m128i s8 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_stride + j));
+    __m128i s9 =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 9 * src_stride + j));
+    __m128i s10t =
+        _mm_loadu_si128((const __m128i *)(src_ptr + 10 * src_stride + j));
+
+    __m256i r01 = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+    __m256i r12 = _mm256_inserti128_si256(_mm256_castsi128_si256(s1), s2, 1);
+    __m256i r23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+    __m256i r34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+    __m256i r45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+    __m256i r56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+    __m256i r67 = _mm256_inserti128_si256(_mm256_castsi128_si256(s6), s7, 1);
+    __m256i r78 = _mm256_inserti128_si256(_mm256_castsi128_si256(s7), s8, 1);
+    __m256i r89 = _mm256_inserti128_si256(_mm256_castsi128_si256(s8), s9, 1);
+    __m256i r910 = _mm256_inserti128_si256(_mm256_castsi128_si256(s9), s10t, 1);
+
+    s[0] = _mm256_unpacklo_epi16(r01, r12);
+    s[1] = _mm256_unpacklo_epi16(r23, r34);
+    s[2] = _mm256_unpacklo_epi16(r45, r56);
+    s[3] = _mm256_unpacklo_epi16(r67, r78);
+    s[4] = _mm256_unpacklo_epi16(r89, r910);
+
+    s[6] = _mm256_unpackhi_epi16(r01, r12);
+    s[7] = _mm256_unpackhi_epi16(r23, r34);
+    s[8] = _mm256_unpackhi_epi16(r45, r56);
+    s[9] = _mm256_unpackhi_epi16(r67, r78);
+    s[10] = _mm256_unpackhi_epi16(r89, r910);
+    for (int i = 0; i < h; i += 2) {
+      const __m128i s10 = _mm_loadu_si128(
+          (const __m128i *)(src_ptr + (i + 10) * src_stride + j));
+      const __m128i s11 = _mm_loadu_si128(
+          (const __m128i *)(src_ptr + (i + 11) * src_stride + j));
+      const __m128i s12 = _mm_loadu_si128(
+          (const __m128i *)(src_ptr + (i + 12) * src_stride + j));
+      __m256i r1011 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s10), s11, 1);
+      __m256i r1112 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s11), s12, 1);
+
+      s[5] = _mm256_unpacklo_epi16(r1011, r1112);
+      s[11] = _mm256_unpackhi_epi16(r1011, r1112);
+
+      // 00 01 02 03 | 10 11 12 13
+      const __m256i res_a = highbd_convolve_12tap(s, f);
+      __m256i res_a_round =
+          _mm256_srai_epi32(_mm256_add_epi32(res_a, rounding), FILTER_BITS);
+      // 04 05 06 07 | 14 15 16 17
+      const __m256i res_b = highbd_convolve_12tap(s + 6, f);
+      __m256i res_b_round =
+          _mm256_srai_epi32(_mm256_add_epi32(res_b, rounding), FILTER_BITS);
+
+      // 00 01 02 03 | 04 05 06 07 | 10 11 12 13 | 14 15 16 17
+      const __m256i res_0 = _mm256_packus_epi32(res_a_round, res_b_round);
+      const __m256i res = _mm256_min_epi16(res_0, max);
+      _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                       _mm256_castsi256_si128(res));
+
+      _mm_storeu_si128((__m128i *)(&dst[(i + 1) * dst_stride + j]),
+                       _mm256_extractf128_si256(res, 1));
+
+      reuse_src_data_avx2(s + 1, s);
+      reuse_src_data_avx2(s + 7, s + 6);
+    }
+  }
+}
+
+void vpx_highbd_convolve12_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel12 *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h, int bd) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(h == 32 || h == 16 || h == 8);
+  assert(w == 32 || w == 16 || w == 8);
+  DECLARE_ALIGNED(32, uint16_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]);
+  const int temp_stride = BW;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP;
+
+  vpx_highbd_convolve12_horiz_avx2(src - src_stride * (MAX_FILTER_TAP / 2 - 1),
+                                   src_stride, temp, temp_stride, filter, x0_q4,
+                                   x_step_q4, y0_q4, y_step_q4, w,
+                                   intermediate_height, bd);
+  vpx_highbd_convolve12_vert_avx2(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1),
+                                  temp_stride, dst, dst_stride, filter, x0_q4,
+                                  x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
new file mode 100644
index 0000000000..97f182c660
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -0,0 +1,893 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from y/uv plane are).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
+  const __m128i weight_u32 = _mm_set1_epi32(weight);
+  const __m128i sixteen = _mm_set1_epi32(16);
+  const __m128i zero = _mm_setzero_si128();
+
+  // modifier * 3 / index;
+  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
+  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
+  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
+  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
+
+  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
+  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
+  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
+  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
+
+  // Now we have
+  //   mul_lo: 00 a1 00 a0
+  //   mul_hi: 00 a3 00 a2
+  // Unpack as 64 bit words to get even and odd elements
+  //   unpack_lo: 00 a2 00 a0
+  //   unpack_hi: 00 a3 00 a1
+  // Then we can shift and OR the results to get everything in 32-bits
+  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
+  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
+  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
+
+  // Round
+  *output = _mm_add_epi32(mul, rounding_u32);
+  *output = _mm_srl_epi32(*output, strength_u128);
+
+  // Multiply with the weight
+  *output = _mm_min_epu32(*output, sixteen);
+  *output = _mm_sub_epi32(sixteen, *output);
+  *output = _mm_mullo_epi32(*output, weight_u32);
+}
+
+static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
+                                    const __m128i *sum_0_u32,
+                                    const __m128i *sum_1_u32,
+                                    const __m128i *mul_constants_0,
+                                    const __m128i *mul_constants_1,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+                   weight);
+  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+                   weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
+                                                 const __m128i sum_second_u32,
+                                                 const uint16_t *pred,
+                                                 uint16_t *count,
+                                                 uint32_t *accumulator) {
+  // Cast down to 16-bit ints
+  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+  pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32);
+  pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
+                                      __m128i *reg_second) {
+  highbd_read_dist_4(dist, reg_first);
+  highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
+    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 8 entries from chroma.
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    __m128i u_reg, v_reg;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+  }
+}
+
+static void vp9_highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                    &v_first, &v_second);
+
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+    // Get modifier and store result
+    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                     &sum_row_second, &mul_first, &mul_second, strength,
+                     rounding, weight);
+    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                  y_accum);
+
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                  &v_first, &v_second);
+  }
+
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void vp9_highbd_apply_temporal_filter_luma(
+    const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_first;
+  const uint32_t *const *neighbors_second;
+
+  // Left
+  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  vp9_highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_luma_8(
+        y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+        strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_first, neighbors_second, top_weight, bottom_weight);
+  }
+
+  // Right
+  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+  vp9_highbd_apply_temporal_filter_luma_8(
+      y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+      neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in x direction, then we have 16 lumas, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
+    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
+  __m128i y_reg_fst, y_reg_snd;
+  if (!ss_x) {
+    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
+      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
+    }
+  } else {
+    // Temporary
+    __m128i y_fst, y_snd;
+
+    // First 8
+    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
+
+    // Second 8
+    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+    if (ss_y == 1) {
+      __m128i y_tmp_fst, y_tmp_snd;
+      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+    }
+
+    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
+  }
+
+  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
+  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
+  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
+  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void vp9_highbd_apply_temporal_filter_chroma_8(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+    int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+    uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+    const uint32_t *u_dist, const uint32_t *v_dist,
+    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+    int top_weight, int bottom_weight, const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_fst, mul_snd;
+
+  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+  __m128i u_sum_row_fst, v_sum_row_fst;
+  __m128i u_sum_row_snd, v_sum_row_snd;
+
+  // Loop variable
+  unsigned int h;
+
+  (void)uv_block_width;
+
+  // First row
+  mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]);
+
+  // Add chroma values
+  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]);
+  mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        blk_fw += 2;
+      } else {
+        weight = bottom_weight;
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1_fst = u_sum_row_2_fst;
+    u_sum_row_2_fst = u_sum_row_3_fst;
+    u_sum_row_1_snd = u_sum_row_2_snd;
+    u_sum_row_2_snd = u_sum_row_3_snd;
+
+    v_sum_row_1_fst = v_sum_row_2_fst;
+    v_sum_row_2_fst = v_sum_row_3_fst;
+    v_sum_row_1_snd = v_sum_row_2_snd;
+    v_sum_row_2_snd = v_sum_row_3_snd;
+
+    // Add chroma values
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
+    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
+
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
+    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
+
+    // Add luma values
+    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                         &u_sum_row_snd, &v_sum_row_fst,
+                                         &v_sum_row_snd);
+
+    // Get modifier and store result
+    if (blk_fw) {
+      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                       rounding, blk_fw[0]);
+      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                       rounding, blk_fw[1]);
+
+    } else {
+      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                       weight);
+    }
+
+    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                  u_accum);
+    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                  v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]);
+  mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]);
+
+  // Shift the rows up
+  u_sum_row_1_fst = u_sum_row_2_fst;
+  u_sum_row_2_fst = u_sum_row_3_fst;
+  u_sum_row_1_snd = u_sum_row_2_snd;
+  u_sum_row_2_snd = u_sum_row_3_snd;
+
+  v_sum_row_1_fst = v_sum_row_2_fst;
+  v_sum_row_2_fst = v_sum_row_3_fst;
+  v_sum_row_1_snd = v_sum_row_2_snd;
+  v_sum_row_2_snd = v_sum_row_3_snd;
+
+  // Add chroma values
+  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+  // Add luma values
+  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+                                       &u_sum_row_snd, &v_sum_row_fst,
+                                       &v_sum_row_snd);
+
+  // Get modifier and store result
+  if (blk_fw) {
+    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+                     rounding, blk_fw[0]);
+    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+                     rounding, blk_fw[1]);
+
+  } else {
+    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+                     weight);
+  }
+
+  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+                                u_accum);
+  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+                                v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void vp9_highbd_apply_temporal_filter_chroma(
+    const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const uint32_t *const *neighbors_fst;
+  const uint32_t *const *neighbors_snd;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    } else {
+      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      vp9_highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+    } else {
+      vp9_highbd_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  vp9_highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_highbd_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  vp9_highbd_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+      uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+      u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+      neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
+void vp9_highbd_apply_temporal_filter_sse4_1(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 4 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                          y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                          u_dist_ptr + blk_col);
+      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                          v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+                                        block_height, ss_x, ss_y, strength,
+                                        blk_fw, use_whole_blk, y_accum, y_count,
+                                        y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+  vp9_highbd_apply_temporal_filter_chroma(
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c
new file mode 100644
index 0000000000..4540dca3a6
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c
@@ -0,0 +1,233 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+static INLINE void highbd_shuffle_12tap_filter_ssse3(const int16_t *filter,
+                                                     __m128i *f) {
+  const __m128i f_low = _mm_loadu_si128((const __m128i *)filter);
+  const __m128i f_high = _mm_loadl_epi64((const __m128i *)(filter + 8));
+
+  f[0] = _mm_shuffle_epi32(f_low, 0x00);
+  f[1] = _mm_shuffle_epi32(f_low, 0x55);
+  f[2] = _mm_shuffle_epi32(f_low, 0xaa);
+  f[3] = _mm_shuffle_epi32(f_low, 0xff);
+  f[4] = _mm_shuffle_epi32(f_high, 0x00);
+  f[5] = _mm_shuffle_epi32(f_high, 0x55);
+}
+
+static INLINE void unpacklo_src_ssse3(__m128i *a, __m128i *s) {
+  s[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  s[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  s[2] = _mm_unpacklo_epi16(a[4], a[5]);
+  s[3] = _mm_unpacklo_epi16(a[6], a[7]);
+  s[4] = _mm_unpacklo_epi16(a[8], a[9]);
+}
+
+static INLINE void unpackhi_src_ssse3(__m128i *a, __m128i *s) {
+  s[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  s[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  s[2] = _mm_unpackhi_epi16(a[4], a[5]);
+  s[3] = _mm_unpackhi_epi16(a[6], a[7]);
+  s[4] = _mm_unpackhi_epi16(a[8], a[9]);
+}
+
+static INLINE __m128i highbd_convolve_12tap(const __m128i *s,
+                                            const __m128i *f) {
+  const __m128i rounding = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  const __m128i res_0 = _mm_madd_epi16(s[0], f[0]);
+  const __m128i res_1 = _mm_madd_epi16(s[1], f[1]);
+  const __m128i res_2 = _mm_madd_epi16(s[2], f[2]);
+  const __m128i res_3 = _mm_madd_epi16(s[3], f[3]);
+  const __m128i res_4 = _mm_madd_epi16(s[4], f[4]);
+  const __m128i res_5 = _mm_madd_epi16(s[5], f[5]);
+
+  const __m128i res_6 = _mm_add_epi32(
+      _mm_add_epi32(res_0, res_1),
+      _mm_add_epi32(_mm_add_epi32(res_2, res_3), _mm_add_epi32(res_4, res_5)));
+  const __m128i res =
+      _mm_srai_epi32(_mm_add_epi32(res_6, rounding), FILTER_BITS);
+  return res;
+}
+
+static INLINE void reuse_src_data_ssse3(const __m128i *src, __m128i *des) {
+  des[0] = src[0];
+  des[1] = src[1];
+  des[2] = src[2];
+  des[3] = src[3];
+  des[4] = src[4];
+}
+
+void vpx_highbd_convolve12_horiz_ssse3(const uint16_t *src,
+                                       ptrdiff_t src_stride, uint16_t *dst,
+                                       ptrdiff_t dst_stride,
+                                       const InterpKernel12 *filter, int x0_q4,
+                                       int x_step_q4, int y0_q4, int y_step_q4,
+                                       int w, int h, int bd) {
+  assert(x_step_q4 == 16);
+  (void)y0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint16_t *src_ptr = src;
+  src_ptr -= MAX_FILTER_TAP / 2 - 1;
+  __m128i s[6], f[6];
+  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i min = _mm_setzero_si128();
+  highbd_shuffle_12tap_filter_ssse3(filter[x0_q4], f);
+
+  for (int j = 0; j < w; j += 8) {
+    for (int i = 0; i < h; i++) {
+      // s00 s01 s02 s03 s04 s05 s06 s07
+      const __m128i r0 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]);
+      // s08 s09 s010 s011 s012 s013 s014 s015
+      const __m128i r1 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 8]);
+      // s016 s017 s018 s019 s020 s021 s022 s023
+      const __m128i r2 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]);
+
+      // even pixels
+      s[0] = r0;
+      s[1] = _mm_alignr_epi8(r1, r0, 4);
+      s[2] = _mm_alignr_epi8(r1, r0, 8);
+      s[3] = _mm_alignr_epi8(r1, r0, 12);
+      s[4] = r1;
+      s[5] = _mm_alignr_epi8(r2, r1, 4);
+
+      // 00 02 04 06
+      __m128i res_even = highbd_convolve_12tap(s, f);
+
+      // odd pixels
+      s[0] = _mm_alignr_epi8(r1, r0, 2);
+      s[1] = _mm_alignr_epi8(r1, r0, 6);
+      s[2] = _mm_alignr_epi8(r1, r0, 10);
+      s[3] = _mm_alignr_epi8(r1, r0, 14);
+      s[4] = _mm_alignr_epi8(r2, r1, 2);
+      s[5] = _mm_alignr_epi8(r2, r1, 6);
+
+      // 01 03 05 07
+      __m128i res_odd = highbd_convolve_12tap(s, f);
+
+      // 00 01 02 03
+      const __m128i res_0 = _mm_unpacklo_epi32(res_even, res_odd);
+      // 04 05 06 07
+      const __m128i res_1 = _mm_unpackhi_epi32(res_even, res_odd);
+      // 00 01 02 03 | 04 05 06 07
+      const __m128i res_2 = _mm_packs_epi32(res_0, res_1);
+      const __m128i res = _mm_max_epi16(_mm_min_epi16(res_2, max), min);
+      _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+    }
+  }
+}
+
+void vpx_highbd_convolve12_vert_ssse3(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint16_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel12 *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h, int bd) {
+  assert(y_step_q4 == 16);
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint16_t *src_ptr = src;
+  src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+  __m128i s[12], r[12], a[11], f[6];
+  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i min = _mm_setzero_si128();
+  highbd_shuffle_12tap_filter_ssse3(filter[y0_q4], f);
+
+  for (int j = 0; j < w; j += 8) {
+    a[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride + j));
+    a[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride + j));
+    a[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride + j));
+    a[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride + j));
+    a[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_stride + j));
+    a[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_stride + j));
+    a[6] = _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_stride + j));
+    a[7] = _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_stride + j));
+    a[8] = _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_stride + j));
+    a[9] = _mm_loadu_si128((const __m128i *)(src_ptr + 9 * src_stride + j));
+    a[10] = _mm_loadu_si128((const __m128i *)(src_ptr + 10 * src_stride + j));
+
+    // even row
+    unpacklo_src_ssse3(a, s);
+    unpackhi_src_ssse3(a, s + 6);
+    // odd row
+    unpacklo_src_ssse3(a + 1, r);
+    unpackhi_src_ssse3(a + 1, r + 6);
+
+    for (int i = 0; i < h; i += 2) {
+      const __m128i s0 = _mm_loadu_si128(
+          (const __m128i *)(src_ptr + (i + 10) * src_stride + j));
+      const __m128i s1 = _mm_loadu_si128(
+          (const __m128i *)(src_ptr + (i + 11) * src_stride + j));
+      const __m128i s2 = _mm_loadu_si128(
+          (const __m128i *)(src_ptr + (i + 12) * src_stride + j));
+
+      s[5] = _mm_unpacklo_epi16(s0, s1);
+      r[5] = _mm_unpacklo_epi16(s1, s2);
+
+      s[11] = _mm_unpackhi_epi16(s0, s1);
+      r[11] = _mm_unpackhi_epi16(s1, s2);
+
+      // 00 01 02 03
+      const __m128i res_a = highbd_convolve_12tap(s, f);
+      // 04 05 06 07
+      const __m128i res_b = highbd_convolve_12tap(s + 6, f);
+      // 10 11 12 13
+      const __m128i res_c = highbd_convolve_12tap(r, f);
+      // 14 15 16 17
+      const __m128i res_d = highbd_convolve_12tap(r + 6, f);
+
+      // 00 01 02 03 | 04 05 06 07
+      const __m128i res_0 = _mm_packs_epi32(res_a, res_b);
+      // 10 11 12 13 | 14 15 16 17
+      const __m128i res_1 = _mm_packs_epi32(res_c, res_d);
+      const __m128i res_r0 = _mm_max_epi16(_mm_min_epi16(res_0, max), min);
+      const __m128i res_r1 = _mm_max_epi16(_mm_min_epi16(res_1, max), min);
+
+      _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_r0);
+      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dst_stride + j], res_r1);
+
+      reuse_src_data_ssse3(s + 1, s);
+      reuse_src_data_ssse3(s + 7, s + 6);
+      reuse_src_data_ssse3(r + 1, r);
+      reuse_src_data_ssse3(r + 7, r + 6);
+    }
+  }
+}
+
+void vpx_highbd_convolve12_ssse3(const uint16_t *src, ptrdiff_t src_stride,
+                                 uint16_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel12 *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h, int bd) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(h == 32 || h == 16 || h == 8);
+  assert(w == 32 || w == 16 || w == 8);
+  DECLARE_ALIGNED(32, uint16_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]);
+  const int temp_stride = BW;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP;
+
+  vpx_highbd_convolve12_horiz_ssse3(src - src_stride * (MAX_FILTER_TAP / 2 - 1),
+                                    src_stride, temp, temp_stride, filter,
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                                    intermediate_height, bd);
+  vpx_highbd_convolve12_vert_ssse3(
+      temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), temp_stride, dst,
+      dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c
new file mode 100644
index 0000000000..9d94ff8d69
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c
@@ -0,0 +1,441 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+DECLARE_ALIGNED(32, static const uint8_t,
+                shuffle_src_mask1_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+                                                6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+                                                3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_src_mask2_avx2[32]) = {
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_src_mask3_avx2[32]) = {
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, shuffle_src_mask4_avx2[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+static INLINE void shuffle_12tap_filter_avx2(const int16_t *filter,
+                                             __m256i *f) {
+  const __m256i f_low =
+      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)filter));
+  const __m256i f_high = _mm256_broadcastsi128_si256(
+      _mm_loadl_epi64((const __m128i *)(filter + 8)));
+
+  f[0] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0200u));
+  f[1] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0604u));
+  f[2] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0a08u));
+  f[3] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0e0cu));
+  f[4] = _mm256_shuffle_epi8(f_high, _mm256_set1_epi16(0x0200u));
+  f[5] = _mm256_shuffle_epi8(f_high, _mm256_set1_epi16(0x0604u));
+}
+
+static INLINE void shuffle_src_data_avx2(const __m256i *r1, const __m256i *r2,
+                                         const __m256i *f, __m256i *s) {
+  s[0] = _mm256_shuffle_epi8(*r1, f[0]);
+  s[1] = _mm256_shuffle_epi8(*r1, f[1]);
+  s[2] = _mm256_shuffle_epi8(*r1, f[2]);
+  s[3] = _mm256_shuffle_epi8(*r1, f[3]);
+  s[4] = _mm256_shuffle_epi8(*r2, f[0]);
+  s[5] = _mm256_shuffle_epi8(*r2, f[1]);
+}
+
+static INLINE void reuse_src_data_avx2(const __m256i *src, __m256i *des) {
+  des[0] = src[0];
+  des[1] = src[1];
+  des[2] = src[2];
+  des[3] = src[3];
+  des[4] = src[4];
+}
+
+static INLINE __m256i convolve12_16_avx2(const __m256i *s, const __m256i *f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m256i k_64 = _mm256_set1_epi16(1 << (FILTER_BITS - 1));
+  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
+  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
+  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
+  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
+  const __m256i x4 = _mm256_maddubs_epi16(s[4], f[4]);
+  const __m256i x5 = _mm256_maddubs_epi16(s[5], f[5]);
+  __m256i sum1, sum2, sum3;
+
+  sum1 = _mm256_add_epi16(x0, x2);
+  sum2 = _mm256_add_epi16(x3, x5);
+  sum3 = _mm256_add_epi16(x1, x4);
+  sum3 = _mm256_add_epi16(sum3, k_64);
+
+  const __m256i s0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum1));
+  const __m256i s1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(sum1, 1));
+  const __m256i s2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum2));
+  const __m256i s3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(sum2, 1));
+  const __m256i s4 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum3));
+  const __m256i s5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(sum3, 1));
+
+  sum1 = _mm256_add_epi32(s0, s2);
+  sum2 = _mm256_add_epi32(s1, s3);
+  sum1 = _mm256_add_epi32(sum1, s4);
+  sum2 = _mm256_add_epi32(sum2, s5);
+
+  // round and shift by 7 bit each 32 bit
+  // 0 1 2 3 4 5 6 7
+  sum1 = _mm256_srai_epi32(sum1, FILTER_BITS);
+  // 8 9 10 11 12 13 14 15
+  sum2 = _mm256_srai_epi32(sum2, FILTER_BITS);
+
+  // 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
+  // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  __m256i const res =
+      _mm256_permute4x64_epi64(_mm256_packus_epi32(sum1, sum2), 0xD8);
+  return res;
+}
+
+void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel12 *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h) {
+  assert(x_step_q4 == 16);
+  assert(w == 32 || w == 16 || w == 8);
+  (void)y0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint8_t *src_ptr = src;
+  src_ptr -= MAX_FILTER_TAP / 2 - 1;
+  __m256i s[6], f[6], src_mask[4];
+
+  shuffle_12tap_filter_avx2(filter[x0_q4], f);
+  src_mask[0] = _mm256_load_si256((__m256i const *)shuffle_src_mask1_avx2);
+  src_mask[1] = _mm256_load_si256((__m256i const *)shuffle_src_mask2_avx2);
+  src_mask[2] = _mm256_load_si256((__m256i const *)shuffle_src_mask3_avx2);
+  src_mask[3] = _mm256_load_si256((__m256i const *)shuffle_src_mask4_avx2);
+  if (w == 8) {
+    for (int i = 0; i < h; i += 4) {
+      // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015
+      const __m128i row0 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride]);
+      // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021
+      // s022 s023
+      const __m128i row0_8 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + 8]);
+      // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 s115
+      const __m128i row1 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride]);
+      const __m128i row1_8 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride + 8]);
+      // s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s210 s211 s212 s213 s214 s215
+      const __m128i row2 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 2) * src_stride]);
+      const __m128i row2_8 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 2) * src_stride + 8]);
+      // s30 s31 s32 s33 s34 s35 s36 s37 s38 s39 s310 s311 s312 s313 s314 s115
+      const __m128i row3 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 3) * src_stride]);
+      const __m128i row3_8 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 3) * src_stride + 8]);
+      // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 |
+      // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 s115
+      const __m256i row01 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1);
+      // s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s210 s211 s212 s213 s214 s215 |
+      // s30 s31 s32 s33 s34 s35 s36 s37 s38 s39 s310 s311 s312 s313 s314 s115
+      const __m256i row23 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(row2), row3, 1);
+      // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021
+      // s022 s023 | s18 s19 s110 s111 s112 s113 s114 s115 s116 s117 s118 s119
+      // s120 s121 s122 s123
+      const __m256i row01_8 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(row0_8), row1_8, 1);
+      const __m256i row23_8 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(row2_8), row3_8, 1);
+
+      shuffle_src_data_avx2(&row01, &row01_8, src_mask, s);
+      const __m256i res_0 = convolve12_16_avx2(s, f);
+
+      shuffle_src_data_avx2(&row23, &row23_8, src_mask, s);
+      const __m256i res_1 = convolve12_16_avx2(s, f);
+
+      // 00 01 02 03 04 05 06 07 | 10 11 12 13 14 15 16 17 | 08 09 010 011 012
+      // 013 014 015 | 18 19 110 111 112 113 114 115
+      const __m256i res = _mm256_packus_epi16(res_0, res_1);
+      const __m128i res_lo = _mm256_castsi256_si128(res);
+      const __m128i res_hi = _mm256_extracti128_si256(res, 1);
+
+      _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_lo);
+      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride], res_hi);
+      _mm_storel_epi64((__m128i *)&dst[(i + 2) * dst_stride],
+                       _mm_srli_si128(res_lo, 8));
+      _mm_storel_epi64((__m128i *)&dst[(i + 3) * dst_stride],
+                       _mm_srli_si128(res_hi, 8));
+    }
+  } else {
+    for (int j = 0; j < w; j += 16) {
+      for (int i = 0; i < h; i += 2) {
+        // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015
+        const __m128i row0 =
+            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]);
+        // s016 s017 s018 s019 s020 s021 s022 s023 s024 s025 s026 s027 s028 s029
+        // s030 s031
+        const __m128i row0_16 =
+            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]);
+        // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114
+        // s115
+        const __m128i row1 = _mm_loadu_si128(
+            (const __m128i *)&src_ptr[(i + 1) * src_stride + j]);
+        // s116 s117 s118 s119 s120 s121 s122 s123 s124 s125 s126 s127 s128
+        // s129 s130 s131
+        const __m128i row1_16 = _mm_loadu_si128(
+            (const __m128i *)&src_ptr[(i + 1) * src_stride + j + 16]);
+
+        // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015
+        // | s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114
+        // s115
+        const __m256i r0 =
+            _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1);
+        // s016 s017 s018 s019 s020 s021 s022 s023 s024 s025 s026 s027 s028 s029
+        // s030 s031 | s116 s117 s118 s119 s120 s121 s122 s123 s124 s125 s126
+        // s127 s128 s129 s130 s131
+        const __m256i r2 = _mm256_inserti128_si256(
+            _mm256_castsi128_si256(row0_16), row1_16, 1);
+
+        // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021
+        // s022 s023 | s18 s19 s110 s111 s112 s113 s114 s115 s116 s117 s118 s119
+        // s120 s121 s122 s123
+        const __m256i r1 = _mm256_alignr_epi8(r2, r0, 8);
+
+        shuffle_src_data_avx2(&r0, &r1, src_mask, s);
+        const __m256i res_0 = convolve12_16_avx2(s, f);
+
+        shuffle_src_data_avx2(&r1, &r2, src_mask, s);
+        const __m256i res_1 = convolve12_16_avx2(s, f);
+
+        const __m256i res = _mm256_packus_epi16(res_0, res_1);
+
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        if (i + 1 < h) {
+          _mm_storeu_si128((__m128i *)&dst[(i + 1) * dst_stride + j],
+                           _mm256_extracti128_si256(res, 1));
+        }
+      }
+    }
+  }
+}
+
+void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel12 *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  assert(y_step_q4 == 16);
+  assert(h == 32 || h == 16 || h == 8);
+  assert(w == 32 || w == 16 || w == 8);
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint8_t *src_ptr = src;
+  src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+  __m256i s[12], f[6];
+
+  shuffle_12tap_filter_avx2(filter[y0_q4], f);
+  if (w == 8) {
+    const __m128i s0 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_stride));
+    const __m128i s1 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_stride));
+    const __m128i s2 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_stride));
+    const __m128i s3 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_stride));
+    const __m128i s4 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_stride));
+    const __m128i s5 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_stride));
+    const __m128i s6 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_stride));
+    const __m128i s7 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_stride));
+    const __m128i s8 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_stride));
+    const __m128i s9 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_stride));
+    const __m128i s10t =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_stride));
+
+    const __m256i r01 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+    const __m256i r12 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s1), s2, 1);
+    const __m256i r23 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+    const __m256i r34 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+    const __m256i r45 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+    const __m256i r56 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+    const __m256i r67 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s6), s7, 1);
+    const __m256i r78 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s7), s8, 1);
+    const __m256i r89 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s8), s9, 1);
+    const __m256i r910 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(s9), s10t, 1);
+
+    s[0] = _mm256_unpacklo_epi8(r01, r12);
+    s[1] = _mm256_unpacklo_epi8(r23, r34);
+    s[2] = _mm256_unpacklo_epi8(r45, r56);
+    s[3] = _mm256_unpacklo_epi8(r67, r78);
+    s[4] = _mm256_unpacklo_epi8(r89, r910);
+    for (int i = 0; i < h; i += 2) {
+      const __m128i s10 =
+          _mm_loadl_epi64((const __m128i *)(src_ptr + (i + 10) * src_stride));
+      const __m128i s11 =
+          _mm_loadl_epi64((const __m128i *)(src_ptr + (i + 11) * src_stride));
+      const __m128i s12 =
+          _mm_loadl_epi64((const __m128i *)(src_ptr + (i + 12) * src_stride));
+
+      const __m256i r1011 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s10), s11, 1);
+      const __m256i r1112 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s11), s12, 1);
+      s[5] = _mm256_unpacklo_epi8(r1011, r1112);
+      const __m256i res_0 = convolve12_16_avx2(s, f);
+
+      __m256i res = _mm256_packus_epi16(res_0, res_0);
+
+      _mm_storel_epi64((__m128i *)&dst[i * dst_stride],
+                       _mm256_castsi256_si128(res));
+      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride],
+                       _mm256_extracti128_si256(res, 1));
+
+      reuse_src_data_avx2(s + 1, s);
+    }
+  } else {
+    for (int j = 0; j < w; j += 16) {
+      const __m128i s0 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride + j));
+      const __m128i s1 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride + j));
+      const __m128i s2 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride + j));
+      const __m128i s3 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride + j));
+      const __m128i s4 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_stride + j));
+      const __m128i s5 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_stride + j));
+      const __m128i s6 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_stride + j));
+      const __m128i s7 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_stride + j));
+      const __m128i s8 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_stride + j));
+      const __m128i s9 =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 9 * src_stride + j));
+      const __m128i s10t =
+          _mm_loadu_si128((const __m128i *)(src_ptr + 10 * src_stride + j));
+
+      const __m256i r01 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+      const __m256i r12 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s1), s2, 1);
+      const __m256i r23 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+      const __m256i r34 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+      const __m256i r45 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+      const __m256i r56 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+      const __m256i r67 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s6), s7, 1);
+      const __m256i r78 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s7), s8, 1);
+      const __m256i r89 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s8), s9, 1);
+      const __m256i r910 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(s9), s10t, 1);
+
+      s[0] = _mm256_unpacklo_epi8(r01, r12);
+      s[1] = _mm256_unpacklo_epi8(r23, r34);
+      s[2] = _mm256_unpacklo_epi8(r45, r56);
+      s[3] = _mm256_unpacklo_epi8(r67, r78);
+      s[4] = _mm256_unpacklo_epi8(r89, r910);
+
+      s[6] = _mm256_unpackhi_epi8(r01, r12);
+      s[7] = _mm256_unpackhi_epi8(r23, r34);
+      s[8] = _mm256_unpackhi_epi8(r45, r56);
+      s[9] = _mm256_unpackhi_epi8(r67, r78);
+      s[10] = _mm256_unpackhi_epi8(r89, r910);
+      for (int i = 0; i < h; i += 2) {
+        const __m128i s10 = _mm_loadu_si128(
+            (const __m128i *)(src_ptr + (i + 10) * src_stride + j));
+        const __m128i s11 = _mm_loadu_si128(
+            (const __m128i *)(src_ptr + (i + 11) * src_stride + j));
+        const __m128i s12 = _mm_loadu_si128(
+            (const __m128i *)(src_ptr + (i + 12) * src_stride + j));
+
+        const __m256i r1011 =
+            _mm256_inserti128_si256(_mm256_castsi128_si256(s10), s11, 1);
+        const __m256i r1112 =
+            _mm256_inserti128_si256(_mm256_castsi128_si256(s11), s12, 1);
+
+        s[5] = _mm256_unpacklo_epi8(r1011, r1112);
+        s[11] = _mm256_unpackhi_epi8(r1011, r1112);
+
+        const __m256i res_0 = convolve12_16_avx2(s, f);
+        const __m256i res_1 = convolve12_16_avx2(s + 6, f);
+
+        __m256i res = _mm256_packus_epi16(res_0, res_1);
+
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+                         _mm256_castsi256_si128(res));
+        _mm_storeu_si128((__m128i *)&dst[(i + 1) * dst_stride + j],
+                         _mm256_extracti128_si256(res, 1));
+
+        reuse_src_data_avx2(s + 1, s);
+        reuse_src_data_avx2(s + 7, s + 6);
+      }
+    }
+  }
+}
+
+void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const InterpKernel12 *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(h == 32 || h == 16 || h == 8);
+  assert(w == 32 || w == 16 || w == 8);
+  DECLARE_ALIGNED(32, uint8_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]);
+  const int temp_stride = BW;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP;
+  vpx_convolve12_horiz_avx2(src - src_stride * (MAX_FILTER_TAP / 2 - 1),
+                            src_stride, temp, temp_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w,
+                            intermediate_height);
+  vpx_convolve12_vert_avx2(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1),
+                           temp_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 0000000000..7571bfccac
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,875 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+
+  __m128i dist_first;
+
+  dist_first = _mm_sub_epi16(a_first, b_first);
+  dist_first = _mm_mullo_epi16(dist_first, dist_first);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi16(a_first, b_first);
+  dist_second = _mm_sub_epi16(a_second, b_second);
+  dist_first = _mm_mullo_epi16(dist_first, dist_first);
+  dist_second = _mm_mullo_epi16(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
+  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
+                                __m128i *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants,
+                                const int strength, const int rounding,
+                                const __m128i *weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+  const __m128i weight_u16 = *weight;
+  const __m128i sixteen = _mm_set1_epi16(16);
+
+  // modifier * 3 / index;
+  sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+  sum = _mm_adds_epu16(sum, rounding_u16);
+  sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+  // So this needs to use the epu16 version which did not come until SSE4.
+  sum = _mm_min_epu16(sum, sixteen);
+
+  sum = _mm_sub_epi16(sixteen, sum);
+
+  return _mm_mullo_epi16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+                                   uint16_t *count, uint32_t *accumulator) {
+  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+  __m128i pred_0_u32, pred_1_u32;
+  __m128i accum_0_u32, accum_1_u32;
+
+  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+  _mm_storeu_si128((__m128i *)count, count_u16);
+
+  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+                                           const __m128i sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+  __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+  _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+  pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+  pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
+
+  *sum = _mm_adds_epu16(dist_reg, dist_left);
+  *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+                              __m128i *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           __m128i *u_first, __m128i *u_second,
+                                           __m128i *v_first,
+                                           __m128i *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+    read_dist_16(u_dist, u_first, u_second);
+    read_dist_16(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 8 entries
+    __m128i u_reg, v_reg;
+
+    read_dist_8(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+    read_dist_8(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+  }
+}
+
+// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit
+// int in dst.
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift_right = _mm_srli_si128(*src, 2);
+
+  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
+  const __m128i even = _mm_blend_epi16(*src, zero, 170);
+
+  *dst = _mm_add_epi32(even, odd);
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+                                                 int ss_x, int ss_y,
+                                                 __m128i *u_mod,
+                                                 __m128i *v_mod) {
+  __m128i y_reg;
+  if (!ss_x) {
+    read_dist_8(y_dist, &y_reg);
+    if (ss_y == 1) {
+      __m128i y_tmp;
+      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+      y_reg = _mm_adds_epu16(y_reg, y_tmp);
+    }
+  } else {
+    __m128i y_first, y_second;
+    read_dist_16(y_dist, &y_first, &y_second);
+    if (ss_y == 1) {
+      __m128i y_tmp_0, y_tmp_1;
+      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+      y_first = _mm_adds_epu16(y_first, y_tmp_0);
+      y_second = _mm_adds_epu16(y_second, y_tmp_1);
+    }
+
+    hadd_epu16(&y_first, &y_first);
+    hadd_epu16(&y_second, &y_second);
+
+    y_reg = _mm_packus_epi32(y_first, y_second);
+  }
+
+  *u_mod = _mm_adds_epu16(*u_mod, y_reg);
+  *v_mod = _mm_adds_epu16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void vp9_apply_temporal_filter_luma_16(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors_first,
+    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+  __m128i weight_first, weight_second;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0);
+  assert(strength <= 6);
+
+  assert(block_width == 16);
+  (void)block_width;
+
+  // Initialize the weights
+  if (blk_fw) {
+    weight_first = _mm_set1_epi16(blk_fw[0]);
+    weight_second = _mm_set1_epi16(blk_fw[1]);
+  } else {
+    weight_first = _mm_set1_epi16(top_weight);
+    weight_second = weight_first;
+  }
+
+  // First row
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                          &v_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_dist += DIST_STRIDE;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      if (blk_fw) {
+        weight_first = _mm_set1_epi16(blk_fw[2]);
+        weight_second = _mm_set1_epi16(blk_fw[3]);
+      } else {
+        weight_first = _mm_set1_epi16(bottom_weight);
+        weight_second = weight_first;
+      }
+    }
+    // Shift the rows up
+    sum_row_1_first = sum_row_2_first;
+    sum_row_1_second = sum_row_2_second;
+    sum_row_2_first = sum_row_3_first;
+    sum_row_2_second = sum_row_3_second;
+
+    // Add luma values to the modifier
+    sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+    sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+    sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
+
+    // Add chroma values to the modifier
+    if (ss_y == 0 || h % 2 == 0) {
+      // Only calculate the new chroma distortion if we are at a pixel that
+      // corresponds to a new chroma row
+      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+                              &v_first, &v_second);
+
+      u_dist += DIST_STRIDE;
+      v_dist += DIST_STRIDE;
+    }
+
+    sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+    sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+    sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+    // Get modifier and store result
+    sum_row_first =
+        average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+    sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                               &weight_second);
+    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                            y_accum);
+
+    y_pre += y_pre_stride;
+    y_count += y_pre_stride;
+    y_accum += y_pre_stride;
+    y_dist += DIST_STRIDE;
+  }
+
+  // The last row
+  mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+  // Shift the rows up
+  sum_row_1_first = sum_row_2_first;
+  sum_row_1_second = sum_row_2_second;
+  sum_row_2_first = sum_row_3_first;
+  sum_row_2_second = sum_row_3_second;
+
+  // Add luma values to the modifier
+  sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+  sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+  // Add chroma values to the modifier
+  if (ss_y == 0) {
+    // Only calculate the new chroma distortion if we are at a pixel that
+    // corresponds to a new chroma row
+    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+                            &v_second);
+  }
+
+  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+  // Get modifier and store result
+  sum_row_first =
+      average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+  sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+                             &weight_second);
+  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+                          y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void vp9_apply_temporal_filter_luma(
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The blockwidth is 16 and we are operating on a row of 16
+    // chroma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      vp9_apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, top_weight, bottom_weight, NULL);
+    } else {
+      vp9_apply_temporal_filter_luma_16(
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  vp9_apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_luma_16(
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
+  }
+
+  // Right
+  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+  vp9_apply_temporal_filter_luma_16(
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void vp9_apply_temporal_filter_chroma_8(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors, int top_weight, int bottom_weight,
+    const int *blk_fw) {
+  const int rounding = (1 << strength) >> 1;
+
+  __m128i weight;
+
+  __m128i mul;
+
+  __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
+  __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+  __m128i u_sum_row, v_sum_row;
+
+  // Loop variable
+  unsigned int h;
+
+  // Initialize weight
+  if (blk_fw) {
+    weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0],
+                            blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]);
+  } else {
+    weight = _mm_set1_epi16(top_weight);
+  }
+
+  // First row
+  mul = _mm_load_si128((const __m128i *)neighbors[0]);
+
+  // Add chroma values
+  get_sum_8(u_dist, &u_sum_row_2);
+  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+  u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
+
+  get_sum_8(v_dist, &v_sum_row_2);
+  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+  v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+  u_count += uv_pre_stride;
+  u_accum += uv_pre_stride;
+  v_count += uv_pre_stride;
+  v_accum += uv_pre_stride;
+
+  y_dist += DIST_STRIDE * (1 + ss_y);
+
+  // Then all the rows except the last one
+  mul = _mm_load_si128((const __m128i *)neighbors[1]);
+
+  for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the blocks
+    if (h == uv_block_height / 2) {
+      if (blk_fw) {
+        weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2],
+                                blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]);
+      } else {
+        weight = _mm_set1_epi16(bottom_weight);
+      }
+    }
+
+    // Shift the rows up
+    u_sum_row_1 = u_sum_row_2;
+    u_sum_row_2 = u_sum_row_3;
+
+    v_sum_row_1 = v_sum_row_2;
+    v_sum_row_2 = v_sum_row_3;
+
+    // Add chroma values
+    u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+    u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
+
+    v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+    v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
+
+    // Add luma values
+    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+    // Get modifier and store result
+    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+    u_pre += uv_pre_stride;
+    u_dist += DIST_STRIDE;
+    v_pre += uv_pre_stride;
+    v_dist += DIST_STRIDE;
+    u_count += uv_pre_stride;
+    u_accum += uv_pre_stride;
+    v_count += uv_pre_stride;
+    v_accum += uv_pre_stride;
+
+    y_dist += DIST_STRIDE * (1 + ss_y);
+  }
+
+  // The last row
+  mul = _mm_load_si128((const __m128i *)neighbors[0]);
+
+  // Shift the rows up
+  u_sum_row_1 = u_sum_row_2;
+  u_sum_row_2 = u_sum_row_3;
+
+  v_sum_row_1 = v_sum_row_2;
+  v_sum_row_2 = v_sum_row_3;
+
+  // Add chroma values
+  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+
+  // Add luma values
+  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+  // Get modifier and store result
+  u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+  v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void vp9_apply_temporal_filter_chroma(
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+  const unsigned int uv_width = block_width >> ss_x,
+                     uv_height = block_height >> ss_y;
+
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+  const unsigned int uv_mid_width = uv_width >> 1,
+                     uv_last_width = uv_width - uv_blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors;
+
+  if (uv_width == 8) {
+    // Special Case: We are subsampling in x direction on a 16x16 block. Since
+    // we are operating on a row of 8 chroma pixels, we can't use the usual
+    // left-middle-right pattern.
+    assert(ss_x);
+
+    if (ss_y) {
+      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    } else {
+      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+    }
+
+    if (use_whole_blk) {
+      vp9_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+          bottom_weight, NULL);
+    } else {
+      vp9_apply_temporal_filter_chroma_8(
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+  }
+
+  vp9_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+  }
+
+  for (; uv_blk_col < uv_mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; uv_blk_col < uv_last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    vp9_apply_temporal_filter_chroma_8(
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
+  }
+
+  // Right
+  if (ss_x && ss_y) {
+    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else if (ss_x || ss_y) {
+    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+  } else {
+    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+  }
+
+  vp9_apply_temporal_filter_chroma_8(
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+      bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_sse4_1(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be positive");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must be less than 2");
+
+  // Precompute the difference squared
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+                                 ss_x, ss_y, strength, blk_fw_ptr,
+                                 use_whole_blk, y_accum, y_count, y_dist_ptr,
+                                 u_dist_ptr, v_dist_ptr);
+
+  vp9_apply_temporal_filter_chroma(
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c
new file mode 100644
index 0000000000..abf0ae1381
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c
@@ -0,0 +1,279 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_src_mask1_ssse3[32]) = { 0, 1, 1, 2, 2, 3, 3, 4,
+                                                 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_src_mask2_ssse3[32]) = { 2, 3, 3, 4, 4, 5, 5, 6,
+                                                 6, 7, 7, 8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                shuffle_src_mask3_ssse3[32]) = { 4, 5, 5, 6,  6,  7,  7,  8,
+                                                 8, 9, 9, 10, 10, 11, 11, 12 };
+
+DECLARE_ALIGNED(16, static const uint8_t, shuffle_src_mask4_ssse3[32]) = {
+  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+static INLINE void sign_extend_16bit_to_32bit_ssse3(__m128i in, __m128i zero,
+                                                    __m128i *out_lo,
+                                                    __m128i *out_hi) {
+  const __m128i sign_bits = _mm_cmpgt_epi16(zero, in);
+  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
+static INLINE void shuffle_12tap_filter_ssse3(const int16_t *filter,
+                                              __m128i *f) {
+  const __m128i f_low = _mm_loadu_si128((const __m128i *)filter);
+  const __m128i f_high = _mm_loadl_epi64((const __m128i *)(filter + 8));
+
+  f[0] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0e0cu));
+  f[4] = _mm_shuffle_epi8(f_high, _mm_set1_epi16(0x0200u));
+  f[5] = _mm_shuffle_epi8(f_high, _mm_set1_epi16(0x0604u));
+}
+
+static INLINE void shuffle_src_data_ssse3(const __m128i *r1, const __m128i *r2,
+                                          const __m128i *f, __m128i *s) {
+  s[0] = _mm_shuffle_epi8(*r1, f[0]);
+  s[1] = _mm_shuffle_epi8(*r1, f[1]);
+  s[2] = _mm_shuffle_epi8(*r1, f[2]);
+  s[3] = _mm_shuffle_epi8(*r1, f[3]);
+  s[4] = _mm_shuffle_epi8(*r2, f[0]);
+  s[5] = _mm_shuffle_epi8(*r2, f[1]);
+}
+
+static INLINE void reuse_src_data_ssse3(const __m128i *src, __m128i *des) {
+  des[0] = src[0];
+  des[1] = src[1];
+  des[2] = src[2];
+  des[3] = src[3];
+  des[4] = src[4];
+}
+
+static INLINE __m128i convolve12_16_ssse3(const __m128i *const s,
+                                          const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  const __m128i x5 = _mm_maddubs_epi16(s[5], f[5]);
+  __m128i sum1, sum2, sum3, s0, s1, s2, s3, s4, s5;
+
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x3, x5);
+  sum3 = _mm_add_epi16(x1, x4);
+  sum3 = _mm_add_epi16(sum3, k_64);
+
+  sign_extend_16bit_to_32bit_ssse3(sum1, _mm_setzero_si128(), &s0, &s1);
+  sign_extend_16bit_to_32bit_ssse3(sum2, _mm_setzero_si128(), &s2, &s3);
+  sign_extend_16bit_to_32bit_ssse3(sum3, _mm_setzero_si128(), &s4, &s5);
+  sum1 = _mm_add_epi32(s0, s2);
+  sum2 = _mm_add_epi32(s1, s3);
+  sum1 = _mm_add_epi32(sum1, s4);
+  sum2 = _mm_add_epi32(sum2, s5);
+
+  // round and shift by 7 bit each 32 bit
+  // 0 1 2 3
+  sum1 = _mm_srai_epi32(sum1, FILTER_BITS);
+  // 4 5 6 7
+  sum2 = _mm_srai_epi32(sum2, FILTER_BITS);
+
+  // 0 1 2 3 4 5 6 7
+  __m128i const res = _mm_packs_epi32(sum1, sum2);
+  return res;
+}
+
+void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel12 *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  assert(x_step_q4 == 16);
+  assert(w == 32 || w == 16 || w == 8);
+  (void)y0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint8_t *src_ptr = src;
+  src_ptr -= MAX_FILTER_TAP / 2 - 1;
+  __m128i s[6], f[6], src_mask[4];
+
+  shuffle_12tap_filter_ssse3(filter[x0_q4], f);
+  src_mask[0] = _mm_load_si128((__m128i const *)shuffle_src_mask1_ssse3);
+  src_mask[1] = _mm_load_si128((__m128i const *)shuffle_src_mask2_ssse3);
+  src_mask[2] = _mm_load_si128((__m128i const *)shuffle_src_mask3_ssse3);
+  src_mask[3] = _mm_load_si128((__m128i const *)shuffle_src_mask4_ssse3);
+  if (w == 8) {
+    for (int i = 0; i < h; i += 2) {
+      // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015
+      const __m128i row0 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride]);
+      // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021
+      // s022 s023
+      const __m128i row0_8 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + 8]);
+      // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 s115
+      const __m128i row1 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride]);
+      const __m128i row1_8 =
+          _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride + 8]);
+
+      shuffle_src_data_ssse3(&row0, &row0_8, src_mask, s);
+      const __m128i res_0 = convolve12_16_ssse3(s, f);
+
+      shuffle_src_data_ssse3(&row1, &row1_8, src_mask, s);
+      const __m128i res_1 = convolve12_16_ssse3(s, f);
+
+      const __m128i res = _mm_packus_epi16(res_0, res_1);
+      _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res);
+      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride],
+                       _mm_srli_si128(res, 8));
+    }
+  } else {
+    for (int j = 0; j < w; j += 16) {
+      for (int i = 0; i < h; i++) {
+        // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015
+        const __m128i r0 =
+            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]);
+        // s016 s017 s018 s019 s020 s021 s022 s023 s024 s025 s026 s027 s028 s029
+        // s030 s031
+        const __m128i r2 =
+            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]);
+
+        // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021
+        // s022 s023
+        const __m128i r1 = _mm_alignr_epi8(r2, r0, 8);
+
+        shuffle_src_data_ssse3(&r0, &r1, src_mask, s);
+        const __m128i res_0 = convolve12_16_ssse3(s, f);
+
+        shuffle_src_data_ssse3(&r1, &r2, src_mask, s);
+        const __m128i res_1 = convolve12_16_ssse3(s, f);
+
+        const __m128i res = _mm_packus_epi16(res_0, res_1);
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+      }
+    }
+  }
+}
+
+void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel12 *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h) {
+  assert(y_step_q4 == 16);
+  assert(h == 32 || h == 16 || h == 8);
+  assert(w == 32 || w == 16 || w == 8);
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+  const uint8_t *src_ptr = src;
+  src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1);
+  __m128i s[12], f[6];
+
+  shuffle_12tap_filter_ssse3(filter[y0_q4], f);
+  for (int j = 0; j < w; j += 8) {
+    const __m128i s0 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_stride + j));
+    const __m128i s1 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_stride + j));
+    const __m128i s2 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_stride + j));
+    const __m128i s3 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_stride + j));
+    const __m128i s4 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_stride + j));
+    const __m128i s5 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_stride + j));
+    const __m128i s6 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_stride + j));
+    const __m128i s7 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_stride + j));
+    const __m128i s8 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_stride + j));
+    const __m128i s9 =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_stride + j));
+    const __m128i s10t =
+        _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_stride + j));
+
+    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+    s[0] = _mm_unpacklo_epi8(s0, s1);
+    s[1] = _mm_unpacklo_epi8(s2, s3);
+    s[2] = _mm_unpacklo_epi8(s4, s5);
+    s[3] = _mm_unpacklo_epi8(s6, s7);
+    s[4] = _mm_unpacklo_epi8(s8, s9);
+
+    s[6] = _mm_unpacklo_epi8(s1, s2);
+    s[7] = _mm_unpacklo_epi8(s3, s4);
+    s[8] = _mm_unpacklo_epi8(s5, s6);
+    s[9] = _mm_unpacklo_epi8(s7, s8);
+    s[10] = _mm_unpacklo_epi8(s9, s10t);
+    for (int i = 0; i < h; i += 2) {
+      const __m128i s10 = _mm_loadl_epi64(
+          (const __m128i *)(src_ptr + (i + 10) * src_stride + j));
+      const __m128i s11 = _mm_loadl_epi64(
+          (const __m128i *)(src_ptr + (i + 11) * src_stride + j));
+      const __m128i s12 = _mm_loadl_epi64(
+          (const __m128i *)(src_ptr + (i + 12) * src_stride + j));
+
+      s[5] = _mm_unpacklo_epi8(s10, s11);
+      s[11] = _mm_unpacklo_epi8(s11, s12);
+
+      const __m128i res_0 = convolve12_16_ssse3(s, f);
+      const __m128i res_1 = convolve12_16_ssse3(s + 6, f);
+
+      __m128i res = _mm_packus_epi16(res_0, res_1);
+
+      _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride + j],
+                       _mm_srli_si128(res, 8));
+
+      reuse_src_data_ssse3(s + 1, s);
+      reuse_src_data_ssse3(s + 7, s + 6);
+    }
+  }
+}
+
+void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel12 *filter, int x0_q4,
+                          int x_step_q4, int y0_q4, int y_step_q4, int w,
+                          int h) {
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(h == 32 || h == 16 || h == 8);
+  assert(w == 32 || w == 16 || w == 8);
+  DECLARE_ALIGNED(32, uint8_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]);
+  const int temp_stride = BW;
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP;
+  vpx_convolve12_horiz_ssse3(src - src_stride * (MAX_FILTER_TAP / 2 - 1),
+                             src_stride, temp, temp_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w,
+                             intermediate_height);
+  vpx_convolve12_vert_ssse3(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1),
+                            temp_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 0712779b75..e9943447fd 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -14,7 +14,9 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
 
@@ -71,7 +73,7 @@ static INLINE void transpose_4x4(__m128i *res) {
 }
 
 static void fdct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -109,7 +111,7 @@ static void fadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
   const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8];
   __m128i in7 = _mm_add_epi16(in[0], in[1]);
@@ -169,454 +171,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
-    case ADST_ADST:
+    default:
+      assert(tx_type == ADST_ADST);
       load_buffer_4x4(input, in, stride);
       fadst4_sse2(in);
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
-    default: assert(0); break;
-  }
-}
-
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
-                            int16_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                            int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan_ptr,
-                            const int16_t *iscan_ptr) {
-  __m128i zero;
-  int pass;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  __m128i *in[8];
-  int index = 0;
-
-  (void)scan_ptr;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)coeff_ptr;
-
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  in[0] = &in0;
-  in[1] = &in1;
-  in[2] = &in2;
-  in[3] = &in3;
-  in[4] = &in4;
-  in[5] = &in5;
-  in[6] = &in6;
-  in[7] = &in7;
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = _mm_add_epi16(in0, in7);
-    const __m128i q1 = _mm_add_epi16(in1, in6);
-    const __m128i q2 = _mm_add_epi16(in2, in5);
-    const __m128i q3 = _mm_add_epi16(in3, in4);
-    const __m128i q4 = _mm_sub_epi16(in3, in4);
-    const __m128i q5 = _mm_sub_epi16(in2, in5);
-    const __m128i q6 = _mm_sub_epi16(in1, in6);
-    const __m128i q7 = _mm_sub_epi16(in0, in7);
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = _mm_add_epi16(q0, q3);
-      const __m128i r1 = _mm_add_epi16(q1, q2);
-      const __m128i r2 = _mm_sub_epi16(q1, q2);
-      const __m128i r3 = _mm_sub_epi16(q0, q3);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res0 = _mm_packs_epi32(w0, w1);
-      res4 = _mm_packs_epi32(w2, w3);
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-  }
-
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-
-  if (!skip_block) {
-    __m128i eob;
-    __m128i round, quant, dequant;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        // Do DC and first 15 AC
-        coeff0 = *in[0];
-        coeff1 = *in[1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    index = 2;
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-
-        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
-        coeff0 = *in[index];
-        coeff1 = *in[index + 1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-      index += 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
   }
 }
 
@@ -708,61 +269,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
   store_output(&res[7], (output + 7 * stride));
 }
 
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 44 54 45 55 46 56 47 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 25 35
-  // 44 54 64 74 45 55 65 75
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-}
-
 static void fdct8_sse2(__m128i *in) {
   // constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -897,7 +406,7 @@ static void fdct8_sse2(__m128i *in) {
   in[7] = _mm_packs_epi32(v6, v7);
 
   // transpose
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
 }
 
 static void fadst8_sse2(__m128i *in) {
@@ -914,8 +423,8 @@ static void fadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__const_0 = _mm_setzero_si128();
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
@@ -1127,7 +636,7 @@ static void fadst8_sse2(__m128i *in) {
   in[7] = _mm_sub_epi16(k__const_0, s1);
 
   // transpose
-  array_transpose_8x8(in, in);
+  transpose_16bit_8x8(in, in);
 }
 
 void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -1150,14 +659,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
-    case ADST_ADST:
+    default:
+      assert(tx_type == ADST_ADST);
       load_buffer_8x8(input, in, stride);
       fadst8_sse2(in);
       fadst8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
-    default: assert(0); break;
   }
 }
 
@@ -1184,23 +693,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
   write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
 }
 
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
   // perform rounding operations
   right_shift_8x8(res0, 2);
@@ -1212,7 +704,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
 static void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
@@ -1559,12 +1051,12 @@ static void fadst16_8col(__m128i *in) {
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
 
   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
@@ -2004,13 +1496,13 @@ static void fadst16_8col(__m128i *in) {
 static void fdct16_sse2(__m128i *in0, __m128i *in1) {
   fdct16_8col(in0);
   fdct16_8col(in1);
-  array_transpose_16x16(in0, in1);
+  transpose_16bit_16x16(in0, in1);
 }
 
 static void fadst16_sse2(__m128i *in0, __m128i *in1) {
   fadst16_8col(in0);
   fadst16_8col(in1);
-  array_transpose_16x16(in0, in1);
+  transpose_16bit_16x16(in0, in1);
 }
 
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -2033,13 +1525,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
-    case ADST_ADST:
+    default:
+      assert(tx_type == ADST_ADST);
       load_buffer_16x16(input, in0, in1, stride);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
-    default: assert(0); break;
   }
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
index ced37bd16e..8152dce864 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -11,6 +11,7 @@
 %define private_prefix vp9
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -62,25 +63,7 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
   psllw           m0,        2
   psllw           m1,        2
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; sign extension
-  mova            m2,             m0
-  mova            m3,             m1
-  punpcklwd       m0,             m0
-  punpcklwd       m1,             m1
-  punpckhwd       m2,             m2
-  punpckhwd       m3,             m3
-  psrad           m0,             16
-  psrad           m1,             16
-  psrad           m2,             16
-  psrad           m3,             16
-  mova            [outputq],      m0
-  mova            [outputq + 16], m2
-  mova            [outputq + 32], m1
-  mova            [outputq + 48], m3
-%else
-  mova            [outputq],      m0
-  mova            [outputq + 16], m1
-%endif
+  STORE_TRAN_LOW 0, outputq, 0, 2, 3
+  STORE_TRAN_LOW 1, outputq, 8, 2, 3
 
   RET
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
deleted file mode 100644
index b3c3d7beb9..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <tmmintrin.h>  // SSSE3
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/x86/fdct.h"
-#include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-
-void vp9_fdct8x8_quant_ssse3(
-    const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs,
-    int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
-    const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-    uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
-  __m128i zero;
-  int pass;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  __m128i *in[8];
-  int index = 0;
-
-  (void)scan_ptr;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
-  (void)coeff_ptr;
-
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  in[0] = &in0;
-  in[1] = &in1;
-  in[2] = &in2;
-  in[3] = &in3;
-  in[4] = &in4;
-  in[5] = &in5;
-  in[6] = &in6;
-  in[7] = &in7;
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = _mm_add_epi16(in0, in7);
-    const __m128i q1 = _mm_add_epi16(in1, in6);
-    const __m128i q2 = _mm_add_epi16(in2, in5);
-    const __m128i q3 = _mm_add_epi16(in3, in4);
-    const __m128i q4 = _mm_sub_epi16(in3, in4);
-    const __m128i q5 = _mm_sub_epi16(in2, in5);
-    const __m128i q6 = _mm_sub_epi16(in1, in6);
-    const __m128i q7 = _mm_sub_epi16(in0, in7);
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = _mm_add_epi16(q0, q3);
-      const __m128i r1 = _mm_add_epi16(q1, q2);
-      const __m128i r2 = _mm_sub_epi16(q1, q2);
-      const __m128i r3 = _mm_sub_epi16(q0, q3);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-      // dct_const_round_shift
-
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-
-      res0 = _mm_packs_epi32(w0, w1);
-      res4 = _mm_packs_epi32(w2, w3);
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_sub_epi16(q6, q5);
-      const __m128i d1 = _mm_add_epi16(q6, q5);
-      const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
-      const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
-
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-  }
-
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-
-  if (!skip_block) {
-    __m128i eob;
-    __m128i round, quant, dequant, thr;
-    int16_t nzflag;
-    {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        // Do DC and first 15 AC
-        coeff0 = *in[0];
-        coeff1 = *in[1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // AC only loop
-    index = 2;
-    thr = _mm_srai_epi16(dequant, 1);
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-
-        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
-        coeff0 = *in[index];
-        coeff1 = *in[index + 1];
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
-                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
-        if (nzflag) {
-          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-          // Reinsert signs
-          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-        } else {
-          // Maybe a more efficient way to store 0?
-          store_zero_tran_low(qcoeff_ptr + n_coeffs);
-          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
-          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
-        }
-      }
-
-      if (nzflag) {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-      index += 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
-      store_zero_tran_low(qcoeff_ptr + n_coeffs);
-      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
-  }
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
index 91d0602f9d..5930bf491e 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -13,7 +13,6 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 
-#include "vpx_ports/emmintrin_compat.h"
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_context_tree.h"
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
deleted file mode 100644
index 2f3c66c083..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
-#include <emmintrin.h>
-#include <smmintrin.h>
-
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vp9/encoder/vp9_encoder.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __GNUC__
-#define LIKELY(v) __builtin_expect(v, 1)
-#define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-#define LIKELY(v) (v)
-#define UNLIKELY(v) (v)
-#endif
-
-static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
-  int_mv result;
-  result.as_mv.row = row;
-  result.as_mv.col = col;
-  return result;
-}
-
-static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
-  // This is simplified from the C implementation to utilise that
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[2]  and
-  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
-  return mv.as_int == 0 ? 0 : 1;
-}
-
-static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
-                          int *const comp_cost[2]) {
-  return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
-         comp_cost[1][mv.as_mv.col];
-}
-
-static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
-                          int sad_per_bit) {
-  const int_mv diff =
-      pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
-  return ROUND_POWER_OF_TWO(
-      (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
-      VP9_PROB_COST_SHIFT);
-}
-
-/*****************************************************************************
- * This function utilizes 3 properties of the cost function lookup tables,   *
- * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
- * vp9_encoder.c.                                                            *
- * For the joint cost:                                                       *
- *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
- * For the component costs:                                                  *
- *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
- *         (Equal costs for both components)                                 *
- *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
- *         (Cost function is even)                                           *
- * If these do not hold, then this function cannot be used without           *
- * modification, in which case you can revert to using the C implementation, *
- * which does not rely on these properties.                                  *
- *****************************************************************************/
-int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
-                               const search_site_config *cfg, MV *ref_mv,
-                               MV *best_mv, int search_param, int sad_per_bit,
-                               int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
-                               const MV *center_mv) {
-  const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
-  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
-  const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
-  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
-
-  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
-
-  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
-  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
-
-  // search_param determines the length of the initial step and hence the number
-  // of iterations.
-  // 0 = initial step (MAX_FIRST_STEP) pel
-  // 1 = (MAX_FIRST_STEP/2) pel,
-  // 2 = (MAX_FIRST_STEP/4) pel...
-  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
-  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
-  const int tot_steps = cfg->total_steps - search_param;
-
-  const int_mv fcenter_mv =
-      pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
-  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
-
-  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
-  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
-
-  int_mv bmv = pack_int_mv(ref_row, ref_col);
-  int_mv new_bmv = bmv;
-  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
-
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const uint8_t *const in_what =
-      x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
-
-  // Work out the start point for the search
-  const uint8_t *best_address = in_what;
-  const uint8_t *new_best_address = best_address;
-#if ARCH_X86_64
-  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-  unsigned int best_sad;
-  int i, j, step;
-
-  // Check the prerequisite cost function properties that are easy to check
-  // in an assert. See the function-level documentation for details on all
-  // prerequisites.
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
-  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
-
-  // Check the starting position
-  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
-  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
-
-  *num00 = 0;
-
-  for (i = 0, step = 0; step < tot_steps; step++) {
-    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
-      __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
-#if ARCH_X86_64
-      __m128i v_blocka[2];
-#else
-      __m128i v_blocka[1];
-#endif
-
-      // Compute the candidate motion vectors
-      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
-      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
-      // Clamp them to the search bounds
-      __m128i v_these_mv_clamp_w = v_these_mv_w;
-      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
-      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
-      // The ones that did not change are inside the search area
-      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
-
-      // If none of them are inside, then move on
-      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
-        continue;
-      }
-
-      // The inverse mask indicates which of the MVs are outside
-      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
-      // Shift right to keep the sign bit clear, we will use this later
-      // to set the cost to the maximum value.
-      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
-
-      // Compute the difference MV
-      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
-      // We utilise the fact that the cost function is even, and use the
-      // absolute difference. This allows us to use unsigned indexes later
-      // and reduces cache pressure somewhat as only a half of the table
-      // is ever referenced.
-      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
-
-      // Compute the SIMD pointer offsets.
-      {
-#if ARCH_X86_64  //  sizeof(intptr_t) == 8
-        // Load the offsets
-        __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
-        __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
-        // Set the ones falling outside to zero
-        v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
-        v_bo32_q =
-            _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d));
-        // Compute the candidate addresses
-        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
-        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
-#else  // ARCH_X86 //  sizeof(intptr_t) == 4
-        __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
-        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
-        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
-#endif
-      }
-
-      fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
-                     in_what_stride, (uint32_t *)&v_sad_d);
-
-      // Look up the component cost of the residual motion vector
-      {
-        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
-        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
-        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
-        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
-        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
-        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
-        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
-        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
-
-        // Note: This is a use case for vpgather in AVX2
-        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
-        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
-        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
-        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
-
-        __m128i v_cost_10_d, v_cost_32_d;
-        v_cost_10_d = _mm_cvtsi32_si128(cost0);
-        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
-        v_cost_32_d = _mm_cvtsi32_si128(cost2);
-        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
-        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
-      }
-
-      // Now add in the joint cost
-      {
-        const __m128i v_sel_d =
-            _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
-        const __m128i v_joint_cost_d =
-            _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
-        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
-      }
-
-      // Multiply by sad_per_bit
-      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
-      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
-      v_cost_d = _mm_add_epi32(v_cost_d,
-                               _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
-      v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
-      // Add the cost to the sad
-      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
-
-      // Make the motion vectors outside the search area have max cost
-      // by or'ing in the comparison mask, this way the minimum search won't
-      // pick them.
-      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
-
-      // Find the minimum value and index horizontally in v_sad_d
-      {
-        // Try speculatively on 16 bits, so we can use the minpos intrinsic
-        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
-        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
-
-        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
-        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
-
-        // If the local best value is not saturated, just use it, otherwise
-        // find the horizontal minimum again the hard way on 32 bits.
-        // This is executed rarely.
-        if (UNLIKELY(local_best_sad == 0xffff)) {
-          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
-
-          v_loval_d = v_sad_d;
-          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
-          v_hival_d = _mm_srli_si128(v_loval_d, 8);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-          v_hival_d = _mm_srli_si128(v_loval_d, 4);
-          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
-
-          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
-
-          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
-          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
-
-          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
-          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
-        }
-
-        // Update the global minimum if the local minimum is smaller
-        if (LIKELY(local_best_sad < best_sad)) {
-          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
-          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
-
-          best_sad = local_best_sad;
-        }
-      }
-    }
-
-    bmv = new_bmv;
-    best_address = new_best_address;
-
-    v_bmv_w = _mm_set1_epi32(bmv.as_int);
-#if ARCH_X86_64
-    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
-#else
-    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
-#endif
-
-    if (UNLIKELY(best_address == in_what)) {
-      (*num00)++;
-    }
-  }
-
-  *best_mv = bmv.as_mv;
-  return best_sad;
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
new file mode 100644
index 0000000000..99fef31d16
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+
+int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  __m256i sse_256, ssz_256;
+  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+  __m256i sse_hi, ssz_hi;
+  __m128i sse_128, ssz_128;
+  int64_t sse;
+  const __m256i zero = _mm256_setzero_si256();
+
+  // If the block size is 16 then the results will fit in 32 bits.
+  if (block_size == 16) {
+    __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi;
+    // Load 16 elements for coeff and dqcoeff.
+    coeff_256 = load_tran_low(coeff);
+    dqcoeff_256 = load_tran_low(dqcoeff);
+    // dqcoeff - coeff
+    dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256);
+    // madd (dqcoeff - coeff)
+    dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256);
+    // madd coeff
+    coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256);
+    // Save the higher 64 bit of each 128 bit lane.
+    dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8);
+    coeff_hi = _mm256_srli_si256(coeff_256, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi);
+    coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero);
+    ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero);
+  } else {
+    int i;
+    assert(block_size % 32 == 0);
+    sse_256 = zero;
+    ssz_256 = zero;
+
+    for (i = 0; i < block_size; i += 32) {
+      __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1;
+      // Load 32 elements for coeff and dqcoeff.
+      coeff_0 = load_tran_low(coeff + i);
+      dqcoeff_0 = load_tran_low(dqcoeff + i);
+      coeff_1 = load_tran_low(coeff + i + 16);
+      dqcoeff_1 = load_tran_low(dqcoeff + i + 16);
+      // dqcoeff - coeff
+      dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0);
+      dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1);
+      // madd (dqcoeff - coeff)
+      dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0);
+      dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1);
+      // madd coeff
+      coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0);
+      coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1);
+      // Add the first madd (dqcoeff - coeff) with the second.
+      dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1);
+      // Add the first madd (coeff) with the second.
+      coeff_0 = _mm256_add_epi32(coeff_0, coeff_1);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero);
+      exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero);
+      // expand each double word of madd (coeff) to quad word
+      exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero);
+      exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero);
+      // Add each quad word of madd (dqcoeff - coeff) and madd (coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo);
+      ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi);
+      ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi);
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  ssz_hi = _mm256_srli_si256(ssz_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+  ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256),
+                          _mm256_extractf128_si256(ssz_256, 1));
+
+  // Store the results.
+  _mm_storel_epi64((__m128i *)(&sse), sse_128);
+
+  _mm_storel_epi64((__m128i *)(ssz), ssz_128);
+  return sse;
+}
+
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff,
+                                const tran_low_t *dqcoeff, int block_size) {
+  int i;
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i sse_256 = zero;
+  __m256i sse_hi;
+  __m128i sse_128;
+  int64_t sse;
+
+  if (block_size == 16) {
+    // Load 16 elements for coeff and dqcoeff.
+    const __m256i _coeff = load_tran_low(coeff);
+    const __m256i _dqcoeff = load_tran_low(dqcoeff);
+    // dqcoeff - coeff
+    const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+    // madd (dqcoeff - coeff)
+    const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+    // Save the higher 64 bit of each 128 bit lane.
+    const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(error, zero);
+  } else {
+    for (i = 0; i < block_size; i += 16) {
+      // Load 16 elements for coeff and dqcoeff.
+      const __m256i _coeff = load_tran_low(coeff);
+      const __m256i _dqcoeff = load_tran_low(dqcoeff);
+      const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+      const __m256i error = _mm256_madd_epi16(diff, diff);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+      const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+      // Add each quad word of madd (dqcoeff - coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+      coeff += 16;
+      dqcoeff += 16;
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  // Store the results.
+  _mm_storel_epi64((__m128i *)&sse, sse_128);
+  return sse;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
deleted file mode 100644
index 453af2a403..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Usee of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "./vp9_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
-                             intptr_t block_size, int64_t *ssz) {
-  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
-  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
-  __m256i sse_reg_64hi, ssz_reg_64hi;
-  __m128i sse_reg128, ssz_reg128;
-  int64_t sse;
-  int i;
-  const __m256i zero_reg = _mm256_set1_epi16(0);
-
-  // init sse and ssz registerd to zero
-  sse_reg = _mm256_set1_epi16(0);
-  ssz_reg = _mm256_set1_epi16(0);
-
-  for (i = 0; i < block_size; i += 16) {
-    // load 32 bytes from coeff and dqcoeff
-    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
-    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
-    // dqcoeff - coeff
-    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
-    // madd (dqcoeff - coeff)
-    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
-    // madd coeff
-    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
-    // expand each double word of madd (dqcoeff - coeff) to quad word
-    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
-    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
-    // expand each double word of madd (coeff) to quad word
-    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
-    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
-    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
-    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
-    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
-    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
-    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
-  }
-  // save the higher 64 bit of each 128 bit lane
-  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
-  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
-  // add the higher 64 bit to the low 64 bit
-  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
-  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
-
-  // add each 64 bit from each of the 128 bit lane of the 256 bit
-  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
-                             _mm256_extractf128_si256(sse_reg, 1));
-
-  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
-                             _mm256_extractf128_si256(ssz_reg, 1));
-
-  // store the results
-  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
-
-  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
-  return sse;
-}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
index 5b0238272b..7beec130ab 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -11,6 +11,7 @@
 %define private_prefix vp9
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -22,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
   pxor      m4, m4                 ; sse accumulator
   pxor      m6, m6                 ; ssz accumulator
   pxor      m5, m5                 ; dedicated zero register
-  lea     uqcq, [uqcq+sizeq*2]
-  lea     dqcq, [dqcq+sizeq*2]
-  neg    sizeq
 .loop:
-  mova      m2, [uqcq+sizeq*2]
-  mova      m0, [dqcq+sizeq*2]
-  mova      m3, [uqcq+sizeq*2+mmsize]
-  mova      m1, [dqcq+sizeq*2+mmsize]
+  LOAD_TRAN_LOW 2, uqcq, 0
+  LOAD_TRAN_LOW 0, dqcq, 0
+  LOAD_TRAN_LOW 3, uqcq, 8
+  LOAD_TRAN_LOW 1, dqcq, 8
+  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+  sub    sizeq, 16
   psubw     m0, m2
   psubw     m1, m3
   ; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -38,32 +39,26 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
   pmaddwd   m1, m1
   pmaddwd   m2, m2
   pmaddwd   m3, m3
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
+  paddd     m2, m3
   ; accumulate in 64bit
   punpckldq m7, m0, m5
   punpckhdq m0, m5
   paddq     m4, m7
-  punpckldq m7, m1, m5
-  paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m7
   punpckldq m7, m2, m5
-  paddq     m4, m1
+  paddq     m4, m0
   punpckhdq m2, m5
   paddq     m6, m7
-  punpckldq m7, m3, m5
   paddq     m6, m2
-  punpckhdq m3, m5
-  paddq     m6, m7
-  paddq     m6, m3
-  add    sizeq, mmsize
-  jl .loop
+  jg .loop
 
   ; accumulate horizontally and store in return value
   movhlps   m5, m4
   movhlps   m7, m6
   paddq     m4, m5
   paddq     m6, m7
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
   movq    rax, m4
   movq [sszq], m6
 %else
@@ -75,44 +70,42 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
 %endif
   RET
 
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; Compute the sum of squared difference between two tran_low_t vectors.
+; Vectors are converted (if necessary) to int16_t for calculations.
+; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
 ;                            intptr_t block_size)
 
 INIT_XMM sse2
 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
   pxor      m4, m4                 ; sse accumulator
   pxor      m5, m5                 ; dedicated zero register
-  lea     uqcq, [uqcq+sizeq*2]
-  lea     dqcq, [dqcq+sizeq*2]
-  neg    sizeq
 .loop:
-  mova      m2, [uqcq+sizeq*2]
-  mova      m0, [dqcq+sizeq*2]
-  mova      m3, [uqcq+sizeq*2+mmsize]
-  mova      m1, [dqcq+sizeq*2+mmsize]
+  LOAD_TRAN_LOW 2, uqcq, 0
+  LOAD_TRAN_LOW 0, dqcq, 0
+  LOAD_TRAN_LOW 3, uqcq, 8
+  LOAD_TRAN_LOW 1, dqcq, 8
+  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+  sub    sizeq, 16
   psubw     m0, m2
   psubw     m1, m3
   ; individual errors are max. 15bit+sign, so squares are 30bit, and
   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
   pmaddwd   m0, m0
   pmaddwd   m1, m1
+  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+  paddd     m0, m1
   ; accumulate in 64bit
   punpckldq m3, m0, m5
   punpckhdq m0, m5
   paddq     m4, m3
-  punpckldq m3, m1, m5
   paddq     m4, m0
-  punpckhdq m1, m5
-  paddq     m4, m3
-  paddq     m4, m1
-  add    sizeq, mmsize
-  jl .loop
+  jnz .loop
 
   ; accumulate horizontally and store in return value
   movhlps   m5, m4
   paddq     m4, m5
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
   movq    rax, m4
 %else
   pshufd   m5, m4, 0x1
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index fa2a6449b0..628dc4fead 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -13,192 +13,895 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_scale/yv12config.h"
 
-extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
-                                         YV12_BUFFER_CONFIG *dst);
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+    const uint8_t *const src, const __m128i *const mask) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+  const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+  const __m128i a_and = _mm_and_si128(a, *mask);
+  const __m128i b_and = _mm_and_si128(b, *mask);
+  return _mm_packus_epi16(a_and, b_and);
+}
 
-static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride, int w,
-                                    int h) {
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
   const __m128i mask = _mm_set1_epi16(0x00FF);
-  const int max_width = w & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+      _mm_storeu_si128((__m128i *)dst, d);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int dst_w, const int dst_h) {
+  const int max_width = (dst_w + 15) & ~15;
+  const __m128i mask = _mm_set1_epi32(0x000000FF);
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+      const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+      const __m128i d2 = _mm_packus_epi16(d0, d1);
+      _mm_storeu_si128((__m128i *)dst, d2);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+                                                  const __m128i c0c1) {
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+  const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+  // round and shift by 7 bit each 16 bit
+  const __m128i t2 = _mm_adds_epi16(t0, k_64);
+  const __m128i t3 = _mm_adds_epi16(t1, k_64);
+  const __m128i t4 = _mm_srai_epi16(t2, 7);
+  const __m128i t5 = _mm_srai_epi16(t3, 7);
+  return _mm_packus_epi16(t4, t5);
+}
+
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[2], d[2];
+
+      // Horizontal
+      // Even rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // odd rows
+      s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+      // Vertical
+      s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+      d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 32;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 2 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+                                        const ptrdiff_t src_stride,
+                                        uint8_t *dst,
+                                        const ptrdiff_t dst_stride,
+                                        const int dst_w, const int dst_h,
+                                        const __m128i c0c1) {
+  const int max_width = (dst_w + 15) & ~15;
+  int y = dst_h;
+
+  do {
+    int x = max_width;
+    do {
+      __m128i s[8], d[8];
+
+      // Note: Using _mm_packus_epi32() in SSE4.1 could be faster.
+      //       Here we tried to not use shuffle instructions which would be slow
+      //       on some x86 CPUs.
+
+      // Horizontal
+      // 000 001 xx xx 004 005 xx xx  008 009 xx xx 00C 00D xx xx
+      // 010 011 xx xx 014 015 xx xx  018 019 xx xx 01C 01D xx xx
+      // 020 021 xx xx 024 025 xx xx  028 029 xx xx 02C 02D xx xx
+      // 030 031 xx xx 034 035 xx xx  038 039 xx xx 03C 03D xx xx
+      // 100 101 xx xx 104 105 xx xx  108 109 xx xx 10C 10D xx xx
+      // 110 111 xx xx 114 115 xx xx  118 119 xx xx 11C 11D xx xx
+      // 120 121 xx xx 124 125 xx xx  128 129 xx xx 12C 12D xx xx
+      // 130 131 xx xx 134 135 xx xx  138 139 xx xx 13C 13D xx xx
+      s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+      s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+      s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+      s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+      s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+      s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+      s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+      s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+      // 000 001 100 101 xx xx xx xx  004 005 104 105 xx xx xx xx
+      // 008 009 108 109 xx xx xx xx  00C 00D 10C 10D xx xx xx xx
+      // 010 011 110 111 xx xx xx xx  014 015 114 115 xx xx xx xx
+      // 018 019 118 119 xx xx xx xx  01C 01D 11C 11D xx xx xx xx
+      // 020 021 120 121 xx xx xx xx  024 025 124 125 xx xx xx xx
+      // 028 029 128 129 xx xx xx xx  02C 02D 12C 12D xx xx xx xx
+      // 030 031 130 131 xx xx xx xx  034 035 134 135 xx xx xx xx
+      // 038 039 138 139 xx xx xx xx  03C 03D 13C 13D xx xx xx xx
+      d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+      d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+      d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+      d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+      d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+      d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+      d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+      d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+      // 000 001 100 101 008 009 108 109  xx xx xx xx xx xx xx xx
+      // 004 005 104 105 00C 00D 10C 10D  xx xx xx xx xx xx xx xx
+      // 010 011 110 111 018 019 118 119  xx xx xx xx xx xx xx xx
+      // 014 015 114 115 01C 01D 11C 11D  xx xx xx xx xx xx xx xx
+      // 020 021 120 121 028 029 128 129  xx xx xx xx xx xx xx xx
+      // 024 025 124 125 02C 02D 12C 12D  xx xx xx xx xx xx xx xx
+      // 030 031 130 131 038 039 138 139  xx xx xx xx xx xx xx xx
+      // 034 035 134 135 03C 03D 13C 13D  xx xx xx xx xx xx xx xx
+      s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+      s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+      s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+      s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+      s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+      s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+      // 000 001 100 101 004 005 104 105  008 009 108 109 00C 00D 10C 10D
+      // 010 011 110 111 014 015 114 115  018 019 118 119 01C 01D 11C 11D
+      // 020 021 120 121 024 025 124 125  028 029 128 129 02C 02D 12C 12D
+      // 030 031 130 131 034 035 134 135  038 039 138 139 03C 03D 13C 13D
+      d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+      d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+      d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+      d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+      d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+      d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+      // Vertical
+      d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+      _mm_storeu_si128((__m128i *)dst, d[0]);
+      src += 64;
+      dst += 16;
+      x -= 16;
+    } while (x);
+    src += 4 * (src_stride - max_width);
+    dst += dst_stride - max_width;
+  } while (--y);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 3) & ~3;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 3) & ~3;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+  // horizontal 4x8
+  do {
+    load_8bit_8x8(src + 2, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[3]);
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      transpose_16bit_4x8(&s[3], &s[3]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 01 11 21 31 41 51 61 71
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 02 12 22 32 42 52 62 72
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 03 13 23 33 43 53 63 73
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      d[0] = _mm_packus_epi16(d[0], d[2]);
+      d[1] = _mm_packus_epi16(d[1], d[3]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+      d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+      d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      t += 8;
+      x -= 4;
+    } while (x);
+    src += 8 * src_stride - 2 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x4
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+    t += 6 * width_hor;
+    y = height_ver;
+
+    do {
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 77
+      loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[1], f);  // 10 11 12 13 14 15 16 17
+      d[2] = convolve8_8_ssse3(&s[2], f);  // 20 21 22 23 24 25 26 27
+      d[3] = convolve8_8_ssse3(&s[3], f);  // 30 31 32 33 34 35 36 37
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[1] = _mm_packus_epi16(d[2], d[3]);
+      store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+
+      dst += 4 * dst_stride;
+      y -= 4;
+    } while (y);
+    t -= width_hor * (2 * height_ver + 6);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  const int width_hor = (w + 1) & ~1;
+  const int width_ver = (w + 7) & ~7;
+  const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+  const int height_ver = (h + 1) & ~1;
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[11], d[4];
+  __m128i f[4];
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef, f);
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+  // horizontal 2x8
+  do {
+    load_8bit_8x8(src + 4, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75 (overlapped)
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[2]);
+      // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      transpose_16bit_4x8(&s[2], &s[2]);
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 10 20 30 40 50 60 70
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 01 11 21 31 41 51 61 71
+
+      // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+      // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+      d[0] = _mm_packus_epi16(d[0], d[0]);
+      d[1] = _mm_packus_epi16(d[1], d[1]);
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+      store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      t += 4;
+      x -= 2;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor;
+    t += 6 * width_hor;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x2
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+    s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+    t += 4 * width_hor;
+    y = height_ver;
+
+    do {
+      // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 77
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 77
+      loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+      t += 8 * width_hor;
+
+      d[0] = convolve8_8_ssse3(&s[0], f);  // 00 01 02 03 04 05 06 07
+      d[1] = convolve8_8_ssse3(&s[2], f);  // 10 11 12 13 14 15 16 17
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+
+      dst += 2 * dst_stride;
+      y -= 2;
+    } while (y);
+    t -= width_hor * (4 * height_ver + 4);
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+                                     __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+                                   const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+                                       uint8_t *dst, const int dst_stride,
+                                       const int w, const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[12], d[6], dd[4];
+  __m128i f0[4], f1[5], f2[5];
+  // The offset of the first row is always less than 1 pixel.
+  const int offset1_q4 = phase_scaler + 1 * step_q4;
+  const int offset2_q4 = phase_scaler + 2 * step_q4;
+  // offset_idxx indicates the pixel offset is even (0) or odd (1).
+  // It's used to choose the src offset and filter coefficient offset.
+  const int offset_idx1 = (offset1_q4 >> 4) & 1;
+  const int offset_idx2 = (offset2_q4 >> 4) & 1;
+  static const shuffle_filter_funcs kShuffleFilterFuncs[2] = {
+    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+  };
+  static const convolve8_funcs kConvolve8Funcs[2] = {
+    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+  };
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
+  kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+  kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+  // Sub 64 to avoid overflow.
+  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+  // When filter phase idx is 1, the two biggest coefficients are shuffled
+  // together, and the sum of them are always no less than 128. Sub 64 here.
+  // After the subtraction, when the sum of all positive coefficients are no
+  // larger than 128, and the sum of all negative coefficients are no
+  // less than -128, there will be no overflow in the convolve8 functions.
+  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+  // horizontal 6x8
+  do {
+    load_8bit_8x8(src, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[4]);
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // OC 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
+      transpose_16bit_4x8(&s[4], &s[4]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+      dd[0] = _mm_packus_epi16(d[0], d[2]);
+      dd[1] = _mm_packus_epi16(d[1], d[3]);
+      dd[2] = _mm_packus_epi16(d[4], d[4]);
+      dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
+      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
+      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
+      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
+      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
+      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
+      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
+      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+      // store 4 extra pixels
+      storeu_8bit_16x4(d, t, stride_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      t += 12;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 3 * stride_hor + 4;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+    loadu_8bit_16x4(t, stride_hor, s);
+    y = height_ver;
+
+    do {
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
+      t += 4 * stride_hor;
+      loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[2] = _mm_packus_epi16(d[2], d[3]);
+      d[4] = _mm_packus_epi16(d[4], d[5]);
+
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * 2 * height_ver / 3;
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+                                                  const __m128i *const f) {
+  __m128i ss[4], temp;
+
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+  temp = convolve8_8_ssse3(ss, f);
+  return _mm_packus_epi16(temp, temp);
+}
+
+// Only calculate odd columns since even columns are just src pixels' copies.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+                                     const int w, const __m128i *const f) {
+  int x = w;
+
+  do {
+    __m128i s[8], temp;
+    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
+    s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+    s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+    s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+    s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+    s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+    s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+    temp = scale_1_to_2_phase_0_kernel(s, f);
+    _mm_storel_epi64((__m128i *)dst, temp);
+    src += 8;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+                                       const ptrdiff_t src_stride, uint8_t *dst,
+                                       const ptrdiff_t dst_stride,
+                                       const int src_w, const int src_h,
+                                       const int16_t *const coef,
+                                       uint8_t *const temp_buffer) {
+  int max_width;
   int y;
-  for (y = 0; y < h; ++y) {
+  uint8_t *tmp[9];
+  __m128i f[4];
+
+  max_width = (src_w + 7) & ~7;
+  tmp[0] = temp_buffer + 0 * max_width;
+  tmp[1] = temp_buffer + 1 * max_width;
+  tmp[2] = temp_buffer + 2 * max_width;
+  tmp[3] = temp_buffer + 3 * max_width;
+  tmp[4] = temp_buffer + 4 * max_width;
+  tmp[5] = temp_buffer + 5 * max_width;
+  tmp[6] = temp_buffer + 6 * max_width;
+  tmp[7] = temp_buffer + 7 * max_width;
+
+  shuffle_filter_ssse3(coef, f);
+
+  scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+  scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+  scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+  scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+  scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+  scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+  scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+  y = src_h;
+  do {
     int x;
-    for (x = 0; x < max_width; x += 16) {
-      const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0));
-      const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16));
-      const __m128i a_and = _mm_and_si128(a, mask);
-      const __m128i b_and = _mm_and_si128(b, mask);
-      const __m128i c = _mm_packus_epi16(a_and, b_and);
-      _mm_storeu_si128((__m128i *)(dst + x), c);
+    scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+    for (x = 0; x < max_width; x += 8) {
+      __m128i s[8], C, D, CD;
+
+      // Even rows
+      const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+      const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      const __m128i ab = _mm_unpacklo_epi8(a, b);
+      _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+      // Odd rows
+      // Even columns
+      load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+      C = scale_1_to_2_phase_0_kernel(s, f);
+
+      // Odd columns
+      s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+      s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+      s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+      s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+      s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+      s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+      s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+      s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+      D = scale_1_to_2_phase_0_kernel(s, f);
+
+      CD = _mm_unpacklo_epi8(C, D);
+      _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
     }
-    for (; x < w; ++x) dst[x] = src[x * 2];
-    src += src_stride * 2;
-    dst += dst_stride;
-  }
-}
 
-static INLINE __m128i filter(const __m128i *const a, const __m128i *const b,
-                             const __m128i *const c, const __m128i *const d,
-                             const __m128i *const e, const __m128i *const f,
-                             const __m128i *const g, const __m128i *const h) {
-  const __m128i coeffs_ab =
-      _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1);
-  const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78,
-                                         -19, 78, -19, 78, -19, 78, -19);
-  const __m128i const64_x16 = _mm_set1_epi16(64);
-  const __m128i ab = _mm_unpacklo_epi8(*a, *b);
-  const __m128i cd = _mm_unpacklo_epi8(*c, *d);
-  const __m128i fe = _mm_unpacklo_epi8(*f, *e);
-  const __m128i hg = _mm_unpacklo_epi8(*h, *g);
-  const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab);
-  const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd);
-  const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd);
-  const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab);
-  // can not overflow
-  const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms);
-  // can not overflow
-  const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms);
-  // can overflow, use saturating add
-  const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms);
-  const __m128i round = _mm_adds_epi16(terms, const64_x16);
-  const __m128i shift = _mm_srai_epi16(round, 7);
-  return _mm_packus_epi16(shift, shift);
-}
-
-static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) {
-  const int max_width = w & ~7;
-  int x = 0;
-  for (; x < max_width; x += 8) {
-    const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0));
-    const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1));
-    const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2));
-    const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3));
-    const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4));
-    const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5));
-    const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6));
-    const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7));
-    const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h);
-    _mm_storel_epi64((__m128i *)(dst + x), pack);
-  }
-}
-
-static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride, int dst_w,
-                                  int dst_h) {
-  dst_w /= 2;
-  dst_h /= 2;
-  {
-    DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]);
-    uint8_t *tmp0 = tmp + dst_w * 0;
-    uint8_t *tmp1 = tmp + dst_w * 1;
-    uint8_t *tmp2 = tmp + dst_w * 2;
-    uint8_t *tmp3 = tmp + dst_w * 3;
-    uint8_t *tmp4 = tmp + dst_w * 4;
-    uint8_t *tmp5 = tmp + dst_w * 5;
-    uint8_t *tmp6 = tmp + dst_w * 6;
-    uint8_t *tmp7 = tmp + dst_w * 7;
-    uint8_t *tmp8 = NULL;
-    const int max_width = dst_w & ~7;
-    int y;
-    eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w);
-    eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w);
-    eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w);
-    eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w);
-    eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w);
-    eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w);
-    eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w);
-    for (y = 0; y < dst_h; y++) {
-      int x;
-      eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w);
-      for (x = 0; x < max_width; x += 8) {
-        const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x));
-        const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
-        const __m128i AB = _mm_unpacklo_epi8(A, B);
-        __m128i C, D, CD;
-        _mm_storeu_si128((__m128i *)(dst + x * 2), AB);
-        {
-          const __m128i a =
-              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3));
-          const __m128i b =
-              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2));
-          const __m128i c =
-              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1));
-          const __m128i d =
-              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0));
-          const __m128i e =
-              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1));
-          const __m128i f =
-              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2));
-          const __m128i g =
-              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3));
-          const __m128i h =
-              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4));
-          C = filter(&a, &b, &c, &d, &e, &f, &g, &h);
-        }
-        {
-          const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x));
-          const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x));
-          const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x));
-          const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
-          const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x));
-          const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x));
-          const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x));
-          const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x));
-          D = filter(&a, &b, &c, &d, &e, &f, &g, &h);
-        }
-        CD = _mm_unpacklo_epi8(C, D);
-        _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD);
-      }
-      src += src_stride;
-      dst += dst_stride * 2;
-      tmp8 = tmp0;
-      tmp0 = tmp1;
-      tmp1 = tmp2;
-      tmp2 = tmp3;
-      tmp3 = tmp4;
-      tmp4 = tmp5;
-      tmp5 = tmp6;
-      tmp6 = tmp7;
-      tmp7 = tmp8;
-    }
-  }
+    src += src_stride;
+    dst += 2 * dst_stride;
+    tmp[8] = tmp[0];
+    tmp[0] = tmp[1];
+    tmp[1] = tmp[2];
+    tmp[2] = tmp[3];
+    tmp[3] = tmp[4];
+    tmp[4] = tmp[5];
+    tmp[5] = tmp[6];
+    tmp[6] = tmp[7];
+    tmp[7] = tmp[8];
+  } while (--y);
 }
 
 void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
-                                      YV12_BUFFER_CONFIG *dst) {
+                                      YV12_BUFFER_CONFIG *dst,
+                                      uint8_t filter_type, int phase_scaler) {
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
   const int dst_h = dst->y_crop_height;
-  const int dst_uv_w = dst_w / 2;
-  const int dst_uv_h = dst_h / 2;
+  const int dst_uv_w = dst->uv_crop_width;
+  const int dst_uv_h = dst->uv_crop_height;
+  int scaled = 0;
+
+  // phase_scaler is usually 0 or 8.
+  assert(phase_scaler >= 0 && phase_scaler < 16);
 
   if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
-    downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
-                            dst->y_stride, dst_w, dst_h);
-    downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
-                            dst->uv_stride, dst_uv_w, dst_uv_h);
-    downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
-                            dst->uv_stride, dst_uv_w, dst_uv_h);
-    vpx_extend_frame_borders(dst);
-  } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
-    // The upsample() supports widths up to 1920 * 2.  If greater, fall back
-    // to vp9_scale_and_extend_frame_c().
-    if (dst_w / 2 <= 1920) {
-      upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer,
-                            dst->y_stride, dst_w, dst_h);
-      upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer,
-                            dst->uv_stride, dst_uv_w, dst_uv_h);
-      upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer,
-                            dst->uv_stride, dst_uv_w, dst_uv_h);
-      vpx_extend_frame_borders(dst);
+    // 2 to 1
+    scaled = 1;
+
+    if (phase_scaler == 0) {
+      scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+      scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0c1);
+      scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+      scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
     } else {
-      vp9_scale_and_extend_frame_c(src, dst);
+      const int buffer_stride = (dst_w + 3) & ~3;
+      const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height);
+      if (temp_buffer) {
+        scale_plane_2_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_2_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_2_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
     }
+  } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+    // 4 to 1
+    scaled = 1;
+    if (phase_scaler == 0) {
+      scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+                                 dst->y_stride, dst_w, dst_h);
+      scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+      scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h);
+    } else if (filter_type == BILINEAR) {
+      const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+      const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+      const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8));  // c0 and c1 >= 0
+      scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+                                  dst->y_stride, dst_w, dst_h, c0c1);
+      scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+      scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                  dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+    } else {
+      const int buffer_stride = (dst_w + 1) & ~1;
+      const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+      // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow
+      const int extra_padding = 16;
+      uint8_t *const temp_buffer =
+          (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+      if (temp_buffer) {
+        scale_plane_4_to_1_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+        scale_plane_4_to_1_general(
+            src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        scale_plane_4_to_1_general(
+            src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+            dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+            temp_buffer);
+        free(temp_buffer);
+      } else {
+        scaled = 0;
+      }
+    }
+  } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+    // 4 to 3
+    const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+    const int buffer_stride_ver = (dst_w + 7) & ~7;
+    const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+    // When the vertical filter reads more pixels than the horizontal filter
+    // generated in each row, we need extra padding to avoid heap read overflow.
+    // For example, the horizontal filter generates 18 pixels but the vertical
+    // filter reads 24 pixels in a row. The difference is multiplied by 2 since
+    // two rows are interlaced together in the optimization.
+    const int extra_padding = (buffer_stride_ver > buffer_stride_hor)
+                                  ? 2 * (buffer_stride_ver - buffer_stride_hor)
+                                  : 0;
+    const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+    uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+    if (temp_buffer) {
+      scaled = 1;
+      scale_plane_4_to_3_general(
+          src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+          dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+      scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h,
+                                 vp9_filter_kernels[filter_type], phase_scaler,
+                                 temp_buffer);
+      scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                 dst->uv_stride, dst_uv_w, dst_uv_h,
+                                 vp9_filter_kernels[filter_type], phase_scaler,
+                                 temp_buffer);
+      free(temp_buffer);
+    }
+  } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
+    // 1 to 2
+    uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
+    if (temp_buffer) {
+      scaled = 1;
+      scale_plane_1_to_2_phase_0(
+          src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
+          src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+      const int src_uv_w = src->uv_crop_width;
+      const int src_uv_h = src->uv_crop_height;
+      scale_plane_1_to_2_phase_0(
+          src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+          src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+      scale_plane_1_to_2_phase_0(
+          src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+          src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+      free(temp_buffer);
+    }
+  }
+
+  if (scaled) {
+    vpx_extend_frame_borders(dst);
   } else {
-    vp9_scale_and_extend_frame_c(src, dst);
+    // Call c version for all other scaling ratios.
+    vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
   }
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
index 91f627c343..d7aafe7b01 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -11,27 +11,28 @@
 #include <emmintrin.h>
 #include <stdio.h>
 
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
-                                    intptr_t block_size, int64_t *ssz,
-                                    int bps) {
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz, int bd) {
   int i, j, test;
   uint32_t temp[4];
   __m128i max, min, cmp0, cmp1, cmp2, cmp3;
   int64_t error = 0, sqcoeff = 0;
-  const int shift = 2 * (bps - 8);
+  const int shift = 2 * (bd - 8);
   const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
 
   for (i = 0; i < block_size; i += 8) {
     // Load the data into xmm registers
-    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
-    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
-    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
-    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+    __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i));
+    __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4));
+    __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i));
+    __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4));
     // Check if any values require more than 15 bit
     max = _mm_set1_epi32(0x3fff);
-    min = _mm_set1_epi32(0xffffc000);
+    min = _mm_set1_epi32((int32_t)0xffffc000);
     cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                          _mm_cmplt_epi32(mm_coeff, min));
     cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
deleted file mode 100644
index e476323e14..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +0,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
-  vzeroupper
-
-  ; If only one iteration is required, then handle this as a special case.
-  ; It is the most frequent case, so we can have a significant gain here
-  ; by not setting up a loop and accumulators.
-  cmp    sizeq, 16
-  jne   .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Common case of size == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  ; Load input vectors
-  mova      xm0, [dqcq]
-  packssdw  xm0, [dqcq+16]
-  mova      xm2, [uqcq]
-  packssdw  xm2, [uqcq+16]
-
-  mova      xm1, [dqcq+32]
-  packssdw  xm1, [dqcq+48]
-  mova      xm3, [uqcq+32]
-  packssdw  xm3, [uqcq+48]
-
-  ; Compute the errors.
-  psubw     xm0, xm2
-  psubw     xm1, xm3
-
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  pmaddwd   xm2, xm2
-  pmaddwd   xm3, xm3
-
-  pmaddwd   xm0, xm0
-  pmaddwd   xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd     xm2, xm3
-  paddd     xm0, xm1
-
-  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
-  pxor      xm5, xm5
-
-  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended  low of a pair of 32 bits
-  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits
-
-  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended  low of a pair of 32 bits
-  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits
-
-  paddq     xm2, xm3
-  paddq     xm0, xm1
-
-  psrldq    xm3, xm2, 8
-  psrldq    xm1, xm0, 8
-
-  paddq     xm2, xm3
-  paddq     xm0, xm1
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm0
-  movq   [sszq], xm2
-%else
-  movd      eax, xm0
-  pextrd    edx, xm0, 1
-  movq   [sszd], xm2
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, speculative low precision
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ALIGN 16
-.generic:
-  pxor      xm4, xm4                ; sse accumulator
-  pxor      xm5, xm5                ; overflow detection register for xm4
-  pxor      xm6, xm6                ; ssz accumulator
-  pxor      xm7, xm7                ; overflow detection register for xm6
-  lea      uqcq, [uqcq+sizeq*4]
-  lea      dqcq, [dqcq+sizeq*4]
-  neg     sizeq
-
-  ; Push the negative size as the high precision code might need it
-  push    sizeq
-
-.loop:
-  ; Load input vectors
-  mova      xm0, [dqcq+sizeq*4]
-  packssdw  xm0, [dqcq+sizeq*4+16]
-  mova      xm2, [uqcq+sizeq*4]
-  packssdw  xm2, [uqcq+sizeq*4+16]
-
-  mova      xm1, [dqcq+sizeq*4+32]
-  packssdw  xm1, [dqcq+sizeq*4+48]
-  mova      xm3, [uqcq+sizeq*4+32]
-  packssdw  xm3, [uqcq+sizeq*4+48]
-
-  add     sizeq, 16
-
-  ; Compute the squared errors.
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  psubw     xm0, xm2
-  pmaddwd   xm2, xm2
-  pmaddwd   xm0, xm0
-
-  psubw     xm1, xm3
-  pmaddwd   xm3, xm3
-  pmaddwd   xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd     xm2, xm3
-  paddd     xm0, xm1
-
-  ; We accumulate using 32 bit arithmetic, but detect potential overflow
-  ; by checking if the MSB of the accumulators have ever been a set bit.
-  ; If yes, we redo the whole compute at the end on higher precision, but
-  ; this happens extremely rarely, so we still achieve a net gain.
-  paddd     xm4, xm0
-  paddd     xm6, xm2
-  por       xm5, xm4  ; OR in the accumulator for overflow detection
-  por       xm7, xm6  ; OR in the accumulator for overflow detection
-
-  jnz .loop
-
-  ; Add pairs horizontally (still only on 32 bits)
-  phaddd    xm4, xm4
-  por       xm5, xm4  ; OR in the accumulator for overflow detection
-  phaddd    xm6, xm6
-  por       xm7, xm6  ; OR in the accumulator for overflow detection
-
-  ; Check for possibility of overflow by testing if bit 32 of each dword lane
-  ; have ever been set. If they were not, then there was no overflow and the
-  ; final sum will fit in 32 bits. If overflow happened, then
-  ; we redo the whole computation on higher precision.
-  por       xm7, xm5
-  pmovmskb   r4, xm7
-  test       r4, 0x8888
-  jnz .highprec
-
-  phaddd    xm4, xm4
-  phaddd    xm6, xm6
-  pmovzxdq  xm4, xm4
-  pmovzxdq  xm6, xm6
-
-  ; Restore stack
-  pop     sizeq
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm4
-  movq   [sszq], xm6
-%else
-  movd      eax, xm4
-  pextrd    edx, xm4, 1
-  movq   [sszd], xm6
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, high precision case
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
-  pxor      xm4, xm4                 ; sse accumulator
-  pxor      xm5, xm5                 ; dedicated zero register
-  pxor      xm6, xm6                 ; ssz accumulator
-  pop     sizeq
-
-.loophp:
-  mova      xm0, [dqcq+sizeq*4]
-  packssdw  xm0, [dqcq+sizeq*4+16]
-  mova      xm2, [uqcq+sizeq*4]
-  packssdw  xm2, [uqcq+sizeq*4+16]
-
-  mova      xm1, [dqcq+sizeq*4+32]
-  packssdw  xm1, [dqcq+sizeq*4+48]
-  mova      xm3, [uqcq+sizeq*4+32]
-  packssdw  xm3, [uqcq+sizeq*4+48]
-
-  add     sizeq, 16
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw     xm0, xm2
-  pmaddwd   xm2, xm2
-  pmaddwd   xm0, xm0
-
-  psubw     xm1, xm3
-  pmaddwd   xm3, xm3
-  pmaddwd   xm1, xm1
-
-  ; accumulate in 64bit
-  punpckldq xm7, xm0, xm5
-  punpckhdq xm0, xm5
-  paddq     xm4, xm7
-
-  punpckldq xm7, xm2, xm5
-  punpckhdq xm2, xm5
-  paddq     xm6, xm7
-
-  punpckldq xm7, xm1, xm5
-  punpckhdq xm1, xm5
-  paddq     xm4, xm7
-
-  punpckldq xm7, xm3, xm5
-  punpckhdq xm3, xm5
-  paddq     xm6, xm7
-
-  paddq     xm4, xm0
-  paddq     xm4, xm1
-  paddq     xm6, xm2
-  paddq     xm6, xm3
-
-  jnz .loophp
-
-  ; Accumulate horizontally
-  movhlps   xm5, xm4
-  movhlps   xm7, xm6
-  paddq     xm4, xm5
-  paddq     xm6, xm7
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq      rax, xm4
-  movq   [sszq], xm6
-%else
-  movd      eax, xm4
-  pextrd    edx, xm4, 1
-  movq   [sszd], xm6
-%endif
-  RET
-
-END
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
deleted file mode 100644
index f3b8f01947..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
-  pxor      m4, m4                 ; sse accumulator
-  pxor      m6, m6                 ; ssz accumulator
-  pxor      m5, m5                 ; dedicated zero register
-  lea     uqcq, [uqcq+sizeq*4]
-  lea     dqcq, [dqcq+sizeq*4]
-  neg    sizeq
-
-  ALIGN 16
-
-.loop:
-  mova      m0, [dqcq+sizeq*4]
-  packssdw  m0, [dqcq+sizeq*4+mmsize]
-  mova      m2, [uqcq+sizeq*4]
-  packssdw  m2, [uqcq+sizeq*4+mmsize]
-
-  mova      m1, [dqcq+sizeq*4+mmsize*2]
-  packssdw  m1, [dqcq+sizeq*4+mmsize*3]
-  mova      m3, [uqcq+sizeq*4+mmsize*2]
-  packssdw  m3, [uqcq+sizeq*4+mmsize*3]
-
-  add    sizeq, mmsize
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw     m0, m2
-  pmaddwd   m2, m2
-  pmaddwd   m0, m0
-
-  psubw     m1, m3
-  pmaddwd   m3, m3
-  pmaddwd   m1, m1
-
-  ; accumulate in 64bit
-  punpckldq m7, m0, m5
-  punpckhdq m0, m5
-  paddq     m4, m7
-
-  punpckldq m7, m2, m5
-  punpckhdq m2, m5
-  paddq     m6, m7
-
-  punpckldq m7, m1, m5
-  punpckhdq m1, m5
-  paddq     m4, m7
-
-  punpckldq m7, m3, m5
-  punpckhdq m3, m5
-  paddq     m6, m7
-
-  paddq     m4, m0
-  paddq     m4, m1
-  paddq     m6, m2
-  paddq     m6, m3
-
-  jnz .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps   m5, m4
-  movhlps   m7, m6
-  paddq     m4, m5
-  paddq     m6, m7
-
-%if ARCH_X86_64
-  movq    rax, m4
-  movq [sszq], m6
-%else
-  mov     eax, sszm
-  pshufd   m5, m4, 0x1
-  movq  [eax], m6
-  movd    eax, m4
-  movd    edx, m5
-%endif
-  RET
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
new file mode 100644
index 0000000000..bf44b08674
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -0,0 +1,439 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>  // AVX2
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+// Zero fill 8 positions in the output buffer.
+static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
+  const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+  _mm256_storeu_si256((__m256i *)(a), zero);
+  _mm256_storeu_si256((__m256i *)(a + 8), zero);
+#else
+  _mm256_storeu_si256((__m256i *)(a), zero);
+#endif
+}
+
+static VPX_FORCE_INLINE void load_fp_values_avx2(
+    const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant,
+    const int16_t *dequant_ptr, __m256i *dequant) {
+  *round = _mm256_castsi128_si256(
+      _mm_load_si128((const __m128i *)mb_plane->round_fp));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  *quant = _mm256_castsi128_si256(
+      _mm_load_si128((const __m128i *)mb_plane->quant_fp));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+                                                 __m256i v_eobmax,
+                                                 __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i v_iscan = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+  const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+  const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+  return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) {
+  const __m256i eob_lo = eob256;
+  // Copy upper 128 to lower 128
+  const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81);
+  __m256i eob = _mm256_max_epi16(eob_lo, eob_hi);
+  __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 1);
+  eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+  return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+  return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_16(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const int32_t nzflag =
+      _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr));
+
+  if (nzflag) {
+    const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+    const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+    const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+    const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant);
+    const __m256i nz_mask =
+        _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+    store_tran_low(qcoeff, qcoeff_ptr);
+    store_tran_low(dqcoeff, dqcoeff_ptr);
+
+    *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  } else {
+    store_zero_tran_low(qcoeff_ptr);
+    store_zero_tran_low(dqcoeff_ptr);
+  }
+}
+
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const struct macroblock_plane *const mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  thr = _mm256_setzero_si256();
+
+  quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                 iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                 dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += 8 * 2;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_srai_epi16(dequant, 1);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                   iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                   dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += 8 * 2;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+// Enable this flag when matching the optimized code to
+// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the
+// existing ssse3 code and quantize_fp_32x32_nz_c().
+//
+// #define MATCH_VP9_QUANTIZE_FP_32X32_C
+
+#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C
+static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+  const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+  const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+  const __m256i abs_dqcoeff =
+      _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+  const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+  const __m256i nz_mask =
+      _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+  store_tran_low(qcoeff, qcoeff_ptr);
+  store_tran_low(dqcoeff, dqcoeff_ptr);
+
+  *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  (void)thr;
+}
+#endif
+
+static VPX_FORCE_INLINE void quantize_fp_32x32_16(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+  const __m256i coeff = load_tran_low(coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+  const int32_t nzflag = _mm256_movemask_epi8(thr_mask);
+
+  if (nzflag) {
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+    const __m256i tmp_rnd =
+        _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask);
+#else
+    const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+#endif
+    const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+    const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+    const __m256i abs_dqcoeff =
+        _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+    const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+    const __m256i nz_mask =
+        _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+    store_tran_low(qcoeff, qcoeff_ptr);
+    store_tran_low(dqcoeff, dqcoeff_ptr);
+
+    *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+  } else {
+    store_zero_tran_low(qcoeff_ptr);
+    store_zero_tran_low(dqcoeff_ptr);
+  }
+}
+
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  thr = _mm256_srli_epi16(dequant, 2);
+  quant = _mm256_slli_epi16(quant, 1);
+  {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)1);
+    round = _mm256_add_epi16(round, rnd);
+    round = _mm256_srai_epi16(round, 1);
+  }
+
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+  // calculating the zbin mask.
+  thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
+  quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+#else
+  quantize_fp_32x32_16_no_nzflag(
+      &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+      qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+#endif
+
+  n_coeffs += 8 * 2;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                         iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                         dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += 8 * 2;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+                                                               const __m256i *y,
+                                                               int log_scale) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
+  const __m128i v = _mm_load_si128((const __m128i *)val_ptr);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(v, zero);
+  const __m128i ac = _mm_unpackhi_epi16(v, zero);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void highbd_load_fp_values(
+    const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant,
+    const int16_t *dequant_ptr, __m256i *dequant) {
+  *round = highbd_init_256(mb_plane->round_fp);
+  *quant = highbd_init_256(mb_plane->quant_fp);
+  *dequant = highbd_init_256(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
+    const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
+  const __m256i packed_nz_mask =
+      _mm256_packs_epi32(nz_mask, _mm256_setzero_si256());
+  const __m256i packed_nz_mask_perm =
+      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+  const __m256i iscan =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+  const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+  return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const struct macroblock_plane *const mb_plane,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const struct ScanOrder *const scan_order) {
+  const int step = 8;
+  __m256i round, quant, dequant;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+
+  highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                     iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                     dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += step;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+                       iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                       dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+    const __m256i *round, const __m256i *quant, const __m256i *dequant,
+    const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+  const __m256i tmp_rnd =
+      _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+  const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+  const __m256i abs_dq =
+      _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+  const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+  const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+  const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+  _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+  *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+    const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const struct ScanOrder *const scan_order) {
+  const int step = 8;
+  __m256i round, quant, dequant, thr;
+  __m256i eob_max = _mm256_setzero_si256();
+  const int16_t *iscan = scan_order->iscan;
+
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  thr = _mm256_srli_epi32(dequant, 2);
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+  // calculating the zbin mask.
+  thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
+  quant = _mm256_slli_epi32(quant, 1);
+  round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
+
+  highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+                           iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+                           dqcoeff_ptr + n_coeffs, &eob_max);
+
+  n_coeffs += step;
+
+  // remove dc constants
+  dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+  quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+  round = _mm256_permute2x128_si256(round, round, 0x31);
+  thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    highbd_quantize_fp_32x32(
+        &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+        qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+    n_coeffs += step;
+  }
+
+  *eob_ptr = get_max_eob(eob_max);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index 3f8ee5f244..2481eb366e 100644
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -8,203 +8,119 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
-void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
-                          const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
-                          int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan_ptr,
-                          const int16_t *iscan_ptr) {
-  __m128i zero;
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const struct macroblock_plane *const mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
   __m128i thr;
-  int16_t nzflag;
-  (void)scan_ptr;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
+  int nzflag;
+  int index = 16;
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+  const int16_t *iscan = scan_order->iscan;
 
-  coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
+  // Setup global values.
+  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
 
-  if (!skip_block) {
-    __m128i eob;
-    __m128i round, quant, dequant;
-    {
-      __m128i coeff0, coeff1;
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
 
-      // Setup global values
-      {
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-      }
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        // Do DC and first 15 AC
-        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
 
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
 
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  // Reinsert signs.
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
 
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-      }
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
 
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 1);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    // Poor man's abs().
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+    if (nzflag) {
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+      qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
     }
 
-    thr = _mm_srai_epi16(dequant, 1);
-
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-
-        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
-        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
-                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
-        if (nzflag) {
-          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-          // Reinsert signs
-          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
-          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
-        } else {
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
-
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-        }
-      }
-
-      if (nzflag) {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
-
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
+    index += 16;
   }
+
+  *eob_ptr = accumulate_eob(eob);
 }
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 0000000000..98decae749
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,252 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           const struct macroblock_plane *const mb_plane,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                           const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i thr;
+  int nzflag;
+  int index = 16;
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Setup global values.
+  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 1);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+    if (nzflag) {
+      __m128i eob0;
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+      eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                 const struct macroblock_plane *const mb_plane,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                 const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one_s16 = _mm_set1_epi16(1);
+  __m128i thr;
+  int nzflag;
+  int index = 16;
+  __m128i round, quant, dequant;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i eob;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Setup global values.
+  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);
+  // The 32x32 halves round.
+  round = _mm_add_epi16(round, one_s16);
+  round = _mm_srli_epi16(round, 1);
+
+  // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so
+  // upshift quant to account for this.
+  quant = _mm_slli_epi16(quant, 1);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+  qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+
+  qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+  qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  // Get the abs value of qcoeff again so we can use shifts for division.
+  qcoeff0 = _mm_abs_epi16(qcoeff0);
+  qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+  qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+  // Divide by 2.
+  qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+  qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+  // Reinsert signs.
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  store_tran_low(qcoeff0, dqcoeff_ptr);
+  store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  thr = _mm_srai_epi16(dequant, 2);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+    if (nzflag) {
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + index);
+      store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+      // Get the abs value of qcoeff again so we can use shifts for division.
+      qcoeff0 = _mm_abs_epi16(qcoeff0);
+      qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+      qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      // Divide by 2.
+      qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+      qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+      // Reinsert signs.
+      qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+      qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+      store_tran_low(qcoeff0, dqcoeff_ptr + index);
+      store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + index);
+      store_zero_tran_low(qcoeff_ptr + index + 8);
+
+      store_zero_tran_low(dqcoeff_ptr + index);
+      store_zero_tran_low(dqcoeff_ptr + index + 8);
+    }
+
+    if (nzflag) {
+      const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+      eob = _mm_max_epi16(eob, eob0);
+    }
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
deleted file mode 100644
index ec61c0c3a7..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,201 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FP 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, fp_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m1, m5
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  mov                             r3, qcoeffmp
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, fp_32x32
-  psllw                           m2, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                            r5q, [  r5q+ncoeffq*2]
-  lea                            r3q, [ r3q+ncoeffq*2]
-  lea                            r4q, [r4q+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpeqw                         m7, m7
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  mova            [r3q+ncoeffq*2+ 0], m8
-  mova            [r3q+ncoeffq*2+16], m13
-%ifidn %1, fp_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; r4[i] = r3[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-  psrlw                           m0, m3, 2
-%else
-  psrlw                           m0, m3, 1
-%endif
-  mova            [r4q+ncoeffq*2+ 0], m8
-  mova            [r4q+ncoeffq*2+16], m13
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m7                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-
-  pcmpgtw                         m7, m6,  m0
-  pcmpgtw                        m12, m11, m0
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-
-  or                              r6, r2
-  jz .skip_iter
-
-  pcmpeqw                         m7, m7
-
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  mova            [r3q+ncoeffq*2+ 0], m14
-  mova            [r3q+ncoeffq*2+16], m13
-%ifidn %1, fp_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; r4[i] = r3[i] * q
-  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-  mova            [r4q+ncoeffq*2+ 0], m14
-  mova            [r4q+ncoeffq*2+16], m13
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m7                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-  jmp .accumulate_eob
-.skip_iter:
-  mova            [r3q+ncoeffq*2+ 0], m5
-  mova            [r3q+ncoeffq*2+16], m5
-  mova            [r4q+ncoeffq*2+ 0], m5
-  mova            [r4q+ncoeffq*2+16], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                           [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-
-  lea                            r0q, [r0q+ncoeffq*2]
-  lea                            r2q, [r2q+ncoeffq*2]
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-  mova            [r0q+ncoeffq*2+ 0], m7
-  mova            [r0q+ncoeffq*2+16], m7
-  mova            [r2q+ncoeffq*2+ 0], m7
-  mova            [r2q+ncoeffq*2+16], m7
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                     word [r3q], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FP fp, 7
-QUANTIZE_FP fp_32x32, 7
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
deleted file mode 100644
index 21aaa93831..0000000000
--- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp9_temporal_filter_apply_sse2 | arg
-;  (unsigned char  *frame1,           |  0
-;   unsigned int    stride,           |  1
-;   unsigned char  *frame2,           |  2
-;   unsigned int    block_width,      |  3
-;   unsigned int    block_height,     |  4
-;   int             strength,         |  5
-;   int             filter_weight,    |  6
-;   unsigned int   *accumulator,      |  7
-;   unsigned short *count)            |  8
-global sym(vp9_temporal_filter_apply_sse2) PRIVATE
-sym(vp9_temporal_filter_apply_sse2):
-
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ALIGN_STACK 16, rax
-    %define block_width    0
-    %define block_height  16
-    %define strength      32
-    %define filter_weight 48
-    %define rounding_bit  64
-    %define rbp_backup    80
-    %define stack_size    96
-    sub         rsp,           stack_size
-    mov         [rsp + rbp_backup], rbp
-    ; end prolog
-
-        mov         edx,            arg(3)
-        mov         [rsp + block_width], rdx
-        mov         edx,            arg(4)
-        mov         [rsp + block_height], rdx
-        movd        xmm6,           arg(5)
-        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
-        ; calculate the rounding bit outside the loop
-        ; 0x8000 >> (16 - strength)
-        mov         rdx,            16
-        sub         rdx,            arg(5) ; 16 - strength
-        movq        xmm4,           rdx    ; can't use rdx w/ shift
-        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
-        psrlw       xmm5,           xmm4
-        movdqa      [rsp + rounding_bit], xmm5
-
-        mov         rsi,            arg(0) ; src/frame1
-        mov         rdx,            arg(2) ; predictor frame
-        mov         rdi,            arg(7) ; accumulator
-        mov         rax,            arg(8) ; count
-
-        ; dup the filter weight and store for later
-        movd        xmm0,           arg(6) ; filter_weight
-        pshuflw     xmm0,           xmm0, 0
-        punpcklwd   xmm0,           xmm0
-        movdqa      [rsp + filter_weight], xmm0
-
-        mov         rbp,            arg(1) ; stride
-        pxor        xmm7,           xmm7   ; zero for extraction
-
-        mov         rcx,            [rsp + block_width]
-        imul        rcx,            [rsp + block_height]
-        add         rcx,            rdx
-        cmp         dword ptr [rsp + block_width], 8
-        jne         .temporal_filter_apply_load_16
-
-.temporal_filter_apply_load_8:
-        movq        xmm0,           [rsi]  ; first row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        movq        xmm1,           [rsi]  ; second row
-        lea         rsi,            [rsi + rbp] ; += stride
-        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
-        jmp         .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
-        movdqa      xmm0,           [rsi]  ; src (frame1)
-        lea         rsi,            [rsi + rbp] ; += stride
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
-        movdqa      xmm2,           [rdx]  ; predictor (frame2)
-        movdqa      xmm3,           xmm2
-        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
-
-        ; modifier = src_byte - pixel_value
-        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
-        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
-
-        ; modifier *= modifier
-        pmullw      xmm0,           xmm0   ; modifer[ 0- 7]^2
-        pmullw      xmm1,           xmm1   ; modifer[ 8-15]^2
-
-        ; modifier *= 3
-        pmullw      xmm0,           [GLOBAL(_const_3w)]
-        pmullw      xmm1,           [GLOBAL(_const_3w)]
-
-        ; modifer += 0x8000 >> (16 - strength)
-        paddw       xmm0,           [rsp + rounding_bit]
-        paddw       xmm1,           [rsp + rounding_bit]
-
-        ; modifier >>= strength
-        psrlw       xmm0,           [rsp + strength]
-        psrlw       xmm1,           [rsp + strength]
-
-        ; modifier = 16 - modifier
-        ; saturation takes care of modifier > 16
-        movdqa      xmm3,           [GLOBAL(_const_16w)]
-        movdqa      xmm2,           [GLOBAL(_const_16w)]
-        psubusw     xmm3,           xmm1
-        psubusw     xmm2,           xmm0
-
-        ; modifier *= filter_weight
-        pmullw      xmm2,           [rsp + filter_weight]
-        pmullw      xmm3,           [rsp + filter_weight]
-
-        ; count
-        movdqa      xmm4,           [rax]
-        movdqa      xmm5,           [rax+16]
-        ; += modifier
-        paddw       xmm4,           xmm2
-        paddw       xmm5,           xmm3
-        ; write back
-        movdqa      [rax],          xmm4
-        movdqa      [rax+16],       xmm5
-        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
-
-        ; load and extract the predictor up to shorts
-        pxor        xmm7,           xmm7
-        movdqa      xmm0,           [rdx]
-        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
-        movdqa      xmm1,           xmm0
-        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
-        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
-
-        ; modifier *= pixel_value
-        pmullw      xmm0,           xmm2
-        pmullw      xmm1,           xmm3
-
-        ; expand to double words
-        movdqa      xmm2,           xmm0
-        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
-        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
-        movdqa      xmm3,           xmm1
-        punpcklwd   xmm1,           xmm7   ; [ 8-11]
-        punpckhwd   xmm3,           xmm7   ; [12-15]
-
-        ; accumulator
-        movdqa      xmm4,           [rdi]
-        movdqa      xmm5,           [rdi+16]
-        movdqa      xmm6,           [rdi+32]
-        movdqa      xmm7,           [rdi+48]
-        ; += modifier
-        paddd       xmm4,           xmm0
-        paddd       xmm5,           xmm2
-        paddd       xmm6,           xmm1
-        paddd       xmm7,           xmm3
-        ; write back
-        movdqa      [rdi],          xmm4
-        movdqa      [rdi+16],       xmm5
-        movdqa      [rdi+32],       xmm6
-        movdqa      [rdi+48],       xmm7
-        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
-        cmp         rdx,            rcx
-        je          .temporal_filter_apply_epilog
-        pxor        xmm7,           xmm7   ; zero for extraction
-        cmp         dword ptr [rsp + block_width], 16
-        je          .temporal_filter_apply_load_16
-        jmp         .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
-    ; begin epilog
-    mov         rbp,            [rsp + rbp_backup]
-    add         rsp,            stack_size
-    pop         rsp
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-_const_3w:
-    times 8 dw 3
-align 16
-_const_top_bit:
-    times 8 dw 1<<15
-align 16
-_const_16w
-    times 8 dw 16
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
new file mode 100644
index 0000000000..b29e1ec236
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc
@@ -0,0 +1,354 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vp9/ratectrl_rtc.h"
+
+#include <new>
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace libvpx {
+
+std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
+    const VP9RateControlRtcConfig &cfg) {
+  std::unique_ptr<VP9RateControlRTC> rc_api(new (std::nothrow)
+                                                VP9RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  vp9_zero(*rc_api->cpi_);
+
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
+  if (cfg.aq_mode) {
+    VP9_COMP *const cpi = rc_api->cpi_;
+    cpi->segmentation_map = static_cast<uint8_t *>(
+        vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
+                   sizeof(*cpi->segmentation_map)));
+    if (!cpi->segmentation_map) return nullptr;
+    cpi->cyclic_refresh =
+        vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
+    cpi->cyclic_refresh->content_mode = 0;
+  }
+  return rc_api;
+}
+
+VP9RateControlRTC::~VP9RateControlRTC() {
+  if (cpi_) {
+    if (cpi_->svc.number_spatial_layers > 1 ||
+        cpi_->svc.number_temporal_layers > 1) {
+      for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
+        for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
+          int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
+          LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
+          vpx_free(lc->map);
+          vpx_free(lc->last_coded_q_map);
+          vpx_free(lc->consec_zero_mv);
+        }
+      }
+    }
+    if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      vpx_free(cpi_->segmentation_map);
+      cpi_->segmentation_map = NULL;
+      vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
+    }
+    vpx_free(cpi_);
+  }
+}
+
+bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
+  VP9_COMMON *cm = &cpi_->common;
+  VP9EncoderConfig *oxcf = &cpi_->oxcf;
+  RATE_CONTROL *const rc = &cpi_->rc;
+  cm->profile = PROFILE_0;
+  cm->bit_depth = VPX_BITS_8;
+  cm->show_frame = 1;
+  oxcf->profile = cm->profile;
+  oxcf->bit_depth = cm->bit_depth;
+  oxcf->rc_mode = rc_cfg.rc_mode;
+  oxcf->pass = 0;
+  oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ;
+  oxcf->content = VP9E_CONTENT_DEFAULT;
+  oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  rc->kf_boost = DEFAULT_KF_BOOST;
+
+  if (!UpdateRateControl(rc_cfg)) return false;
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+
+  cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 ||
+                   cpi_->svc.number_temporal_layers > 1)
+                      ? 1
+                      : 0;
+
+  rc->rc_1_frame = 0;
+  rc->rc_2_frame = 0;
+  vp9_rc_init_minq_luts();
+  vp9_rc_init(oxcf, 0, rc);
+  rc->constrain_gf_key_freq_onepass_vbr = 0;
+  cpi_->sf.use_nonrd_pick_mode = 1;
+  return true;
+}
+
+bool VP9RateControlRTC::UpdateRateControl(
+    const VP9RateControlRtcConfig &rc_cfg) {
+  // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5)
+  // and VPX_TS_MAX_LAYERS (5), check all three.
+  if (rc_cfg.ss_number_layers < 1 ||
+      rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS ||
+      rc_cfg.ts_number_layers < 1 ||
+      rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS ||
+      rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) {
+    return false;
+  }
+
+  VP9_COMMON *cm = &cpi_->common;
+  VP9EncoderConfig *oxcf = &cpi_->oxcf;
+  RATE_CONTROL *const rc = &cpi_->rc;
+
+  cm->width = rc_cfg.width;
+  cm->height = rc_cfg.height;
+  oxcf->width = rc_cfg.width;
+  oxcf->height = rc_cfg.height;
+  oxcf->worst_allowed_q = vp9_quantizer_to_qindex(rc_cfg.max_quantizer);
+  oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer);
+  rc->worst_quality = oxcf->worst_allowed_q;
+  rc->best_quality = oxcf->best_allowed_q;
+  oxcf->init_framerate = rc_cfg.framerate;
+  oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth;
+  oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz;
+  oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh;
+  oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT;
+  oxcf->ss_number_layers = rc_cfg.ss_number_layers;
+  oxcf->ts_number_layers = rc_cfg.ts_number_layers;
+  oxcf->temporal_layering_mode =
+      (VP9E_TEMPORAL_LAYERING_MODE)((rc_cfg.ts_number_layers > 1)
+                                        ? rc_cfg.ts_number_layers
+                                        : 0);
+
+  cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
+  cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
+
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+
+  if (setjmp(cpi_->common.error.jmp)) {
+    cpi_->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    return false;
+  }
+  cpi_->common.error.setjmp = 1;
+
+  for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
+    oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl];
+  }
+  for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
+      const int layer =
+          LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers);
+      LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      oxcf->layer_target_bitrate[layer] =
+          1000 * rc_cfg.layer_target_bitrate[layer];
+      lrc->worst_quality =
+          vp9_quantizer_to_qindex(rc_cfg.max_quantizers[layer]);
+      lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]);
+      lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl];
+      lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl];
+    }
+  }
+  vp9_set_rc_buffer_sizes(cpi_);
+  vp9_new_framerate(cpi_, cpi_->framerate);
+  if (cpi_->svc.number_temporal_layers > 1 ||
+      cpi_->svc.number_spatial_layers > 1) {
+    if (cm->current_video_frame == 0) {
+      vp9_init_layer_context(cpi_);
+      // svc->framedrop_mode is not currently exposed, so only allow for
+      // full superframe drop for now.
+      cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP;
+    }
+    vp9_update_layer_context_change_config(cpi_,
+                                           (int)cpi_->oxcf.target_bandwidth);
+    cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop;
+  }
+  vp9_check_reset_rc_flag(cpi_);
+
+  cpi_->common.error.setjmp = 0;
+  return true;
+}
+
+// Compute the QP for the frame. If the frame is dropped this function
+// returns kDrop, and no QP is computed. If the frame is encoded (not dropped)
+// the QP is computed and kOk is returned.
+FrameDropDecision VP9RateControlRTC::ComputeQP(
+    const VP9FrameParamsQpRTC &frame_params) {
+  VP9_COMMON *const cm = &cpi_->common;
+  int width, height;
+  cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
+  cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id;
+  if (cpi_->svc.number_spatial_layers > 1) {
+    const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+                                       cpi_->svc.temporal_layer_id,
+                                       cpi_->svc.number_temporal_layers);
+    LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+    get_layer_resolution(cpi_->oxcf.width, cpi_->oxcf.height,
+                         lc->scaling_factor_num, lc->scaling_factor_den, &width,
+                         &height);
+    cm->width = width;
+    cm->height = height;
+  }
+  vp9_set_mb_mi(cm, cm->width, cm->height);
+  cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+  // This is needed to ensure key frame does not get unset in rc_get_svc_params.
+  cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
+  cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  cpi_->sf.use_nonrd_pick_mode = 1;
+  if (cpi_->svc.number_spatial_layers == 1 &&
+      cpi_->svc.number_temporal_layers == 1) {
+    int target = 0;
+    if (cpi_->oxcf.rc_mode == VPX_CBR) {
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+        vp9_cyclic_refresh_update_parameters(cpi_);
+      if (frame_is_intra_only(cm))
+        target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_);
+      else
+        target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_);
+    } else if (cpi_->oxcf.rc_mode == VPX_VBR) {
+      if (cm->frame_type == KEY_FRAME) {
+        cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0;
+        cpi_->rc.frames_to_key = cpi_->oxcf.key_freq;
+      }
+      vp9_set_gf_update_one_pass_vbr(cpi_);
+      if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+        vp9_cyclic_refresh_update_parameters(cpi_);
+      if (frame_is_intra_only(cm))
+        target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_);
+      else
+        target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_);
+    }
+    vp9_rc_set_frame_target(cpi_, target);
+    vp9_update_buffer_level_preencode(cpi_);
+  } else {
+    vp9_update_temporal_layer_framerate(cpi_);
+    vp9_restore_layer_context(cpi_);
+    vp9_rc_get_svc_params(cpi_);
+  }
+  if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer);
+  // SVC: check for skip encoding of enhancement layer if the
+  // layer target bandwidth = 0.
+  if (vp9_svc_check_skip_enhancement_layer(cpi_))
+    return FrameDropDecision::kDrop;
+  // Check for dropping this frame based on buffer level.
+  // Never drop on key frame, or if base layer is key for svc,
+  if (!frame_is_intra_only(cm) &&
+      (!cpi_->use_svc ||
+       !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) {
+    if (vp9_rc_drop_frame(cpi_)) {
+      // For FULL_SUPERFRAME_DROP mode (the only mode considered here):
+      // if the superframe drop is decided we need to save the layer context for
+      // all spatial layers, and call update_buffer_level and postencode_drop
+      // for all spatial layers.
+      if (cpi_->svc.number_spatial_layers > 1 ||
+          cpi_->svc.number_temporal_layers > 1) {
+        vp9_save_layer_context(cpi_);
+        for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) {
+          cpi_->svc.spatial_layer_id = sl;
+          vp9_restore_layer_context(cpi_);
+          vp9_update_buffer_level_svc_preencode(cpi_);
+          vp9_rc_postencode_update_drop_frame(cpi_);
+          vp9_save_layer_context(cpi_);
+        }
+      }
+      return FrameDropDecision::kDrop;
+    }
+  }
+  // Compute the QP for the frame.
+  int bottom_index, top_index;
+  cpi_->common.base_qindex =
+      vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index);
+
+  if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_);
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1)
+    vp9_save_layer_context(cpi_);
+
+  cpi_->last_frame_dropped = 0;
+  cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0;
+  if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+    cpi_->svc.num_encoded_top_layer++;
+
+  return FrameDropDecision::kOk;
+}
+
+int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; }
+
+int VP9RateControlRTC::GetLoopfilterLevel() const {
+  struct loopfilter *const lf = &cpi_->common.lf;
+  vp9_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q);
+  return lf->filter_level;
+}
+
+bool VP9RateControlRTC::GetSegmentationData(
+    VP9SegmentationData *segmentation_data) const {
+  if (!cpi_->cyclic_refresh || !cpi_->cyclic_refresh->apply_cyclic_refresh) {
+    return false;
+  }
+
+  segmentation_data->segmentation_map = cpi_->segmentation_map;
+  segmentation_data->segmentation_map_size =
+      cpi_->common.mi_cols * cpi_->common.mi_rows;
+  segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+  segmentation_data->delta_q_size = 3u;
+  return true;
+}
+
+void VP9RateControlRTC::PostEncodeUpdate(
+    uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) {
+  cpi_->common.frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
+  cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id;
+  cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id;
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1) {
+    vp9_restore_layer_context(cpi_);
+    const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id,
+                                       cpi_->svc.temporal_layer_id,
+                                       cpi_->svc.number_temporal_layers);
+    LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer];
+    cpi_->common.base_qindex = lc->frame_qp;
+    cpi_->common.MBs = lc->MBs;
+    // For spatial-svc, allow cyclic-refresh to be applied on the spatial
+    // layers, for the base temporal layer.
+    if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+        cpi_->svc.number_spatial_layers > 1 &&
+        cpi_->svc.temporal_layer_id == 0) {
+      CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh;
+      cr->qindex_delta[0] = lc->qindex_delta[0];
+      cr->qindex_delta[1] = lc->qindex_delta[1];
+      cr->qindex_delta[2] = lc->qindex_delta[2];
+    }
+  }
+  vp9_rc_postencode_update(cpi_, encoded_frame_size);
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1)
+    vp9_save_layer_context(cpi_);
+  cpi_->common.current_video_frame++;
+}
+
+}  // namespace libvpx
diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
new file mode 100644
index 0000000000..4c39255886
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h
@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_RATECTRL_RTC_H_
+#define VPX_VP9_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
+
+struct VP9_COMP;
+
+namespace libvpx {
+struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
+  VP9RateControlRtcConfig() {
+    memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate));
+    memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator));
+    scaling_factor_num[0] = 1;
+    scaling_factor_den[0] = 1;
+    max_quantizers[0] = max_quantizer;
+    min_quantizers[0] = min_quantizer;
+  }
+
+  // Number of spatial layers
+  int ss_number_layers = 1;
+  int max_quantizers[VPX_MAX_LAYERS] = {};
+  int min_quantizers[VPX_MAX_LAYERS] = {};
+  int scaling_factor_num[VPX_SS_MAX_LAYERS] = {};
+  int scaling_factor_den[VPX_SS_MAX_LAYERS] = {};
+  // This is only for SVC for now.
+  int max_consec_drop = std::numeric_limits<int>::max();
+};
+
+struct VP9FrameParamsQpRTC {
+  RcFrameType frame_type;
+  int spatial_layer_id;
+  int temporal_layer_id;
+};
+
+struct VP9SegmentationData {
+  const uint8_t *segmentation_map;
+  size_t segmentation_map_size;
+  const int *delta_q;
+  size_t delta_q_size;
+};
+
+// This interface allows using VP9 real-time rate control without initializing
+// the encoder. To use this interface, you need to link with libvpxrc.a.
+//
+// #include "vp9/ratectrl_rtc.h"
+// VP9RateControlRtcConfig cfg;
+// VP9FrameParamsQpRTC frame_params;
+//
+// YourFunctionToInitializeConfig(cfg);
+// std::unique_ptr<VP9RateControlRTC> rc_api = VP9RateControlRTC::Create(cfg);
+// // start encoding
+// while (frame_to_encode) {
+//   if (config_changed)
+//     rc_api->UpdateRateControl(cfg);
+//   YourFunctionToFillFrameParams(frame_params);
+//   rc_api->ComputeQP(frame_params);
+//   YourFunctionToUseQP(rc_api->GetQP());
+//   YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel());
+//   // After encoding
+//   rc_api->PostEncode(encoded_frame_size, frame_params);
+// }
+class VP9RateControlRTC {
+ public:
+  static std::unique_ptr<VP9RateControlRTC> Create(
+      const VP9RateControlRtcConfig &cfg);
+  ~VP9RateControlRTC();
+
+  bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  int GetLoopfilterLevel() const;
+  bool GetSegmentationData(VP9SegmentationData *segmentation_data) const;
+  // ComputeQP computes the QP if the frame is not dropped (kOk return),
+  // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate
+  // are not to be called (vp9_rc_postencode_update_drop_frame is already
+  // called via ComputeQP if drop is decided).
+  FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size,
+                        const VP9FrameParamsQpRTC &frame_params);
+
+ private:
+  VP9RateControlRTC() = default;
+  bool InitRateControl(const VP9RateControlRtcConfig &cfg);
+  struct VP9_COMP *cpi_ = nullptr;
+};
+
+}  // namespace libvpx
+
+#endif  // VPX_VP9_RATECTRL_RTC_H_
diff --git a/media/libvpx/libvpx/vp9/vp9_common.mk b/media/libvpx/libvpx/vp9/vp9_common.mk
index 5bfc0d3599..5ef2f891a8 100644
--- a/media/libvpx/libvpx/vp9/vp9_common.mk
+++ b/media/libvpx/libvpx/vp9/vp9_common.mk
@@ -10,6 +10,7 @@
 
 VP9_COMMON_SRCS-yes += vp9_common.mk
 VP9_COMMON_SRCS-yes += vp9_iface_common.h
+VP9_COMMON_SRCS-yes += vp9_iface_common.c
 VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 VP9_COMMON_SRCS-yes += common/vp9_blockd.c
@@ -63,30 +64,36 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA)   += common/mips/msa/vp9_idct4x4_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA)   += common/mips/msa/vp9_idct8x8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA)   += common/mips/msa/vp9_idct16x16_msa.c
+endif  # !CONFIG_VP9_HIGHBITDEPTH
+
+VP9_COMMON_SRCS-$(HAVE_SSE2)  += common/x86/vp9_idct_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_VSX)   += common/ppc/vp9_idct_vsx.c
+VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht16x16_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht_neon.h
+
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA)  += common/mips/msa/vp9_mfqe_msa.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
 endif
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans4_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans8_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
-endif
-
-# common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
-
-ifeq ($(CONFIG_VP9_POSTPROC),yes)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
-endif
-
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
-
-ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
+else
+VP9_COMMON_SRCS-$(HAVE_NEON)   += common/arm/neon/vp9_highbd_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON)   += common/arm/neon/vp9_highbd_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON)   += common/arm/neon/vp9_highbd_iht16x16_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c
 endif
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
index 77b13da58f..3e896848ff 100644
--- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
@@ -8,21 +8,35 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_encoder.h"
-#include "vpx_ports/vpx_once.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_timestamp.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vpx_version.h"
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vpx/vp8cx.h"
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_scale.h"
+#include "vp9/vp9_cx_iface.h"
 #include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/vp9_cx_iface.h"
 #include "vp9/vp9_iface_common.h"
 
-struct vp9_extracfg {
+#include "vpx/vpx_tpl.h"
+
+typedef struct vp9_extracfg {
   int cpu_used;  // available cpu percentage in 1/16
   unsigned int enable_auto_alt_ref;
   unsigned int noise_sensitivity;
@@ -30,6 +44,8 @@ struct vp9_extracfg {
   unsigned int static_thresh;
   unsigned int tile_columns;
   unsigned int tile_rows;
+  unsigned int enable_tpl_model;
+  unsigned int enable_keyframe_filtering;
   unsigned int arnr_max_frames;
   unsigned int arnr_strength;
   unsigned int min_gf_interval;
@@ -51,16 +67,25 @@ struct vp9_extracfg {
   vpx_color_range_t color_range;
   int render_width;
   int render_height;
-};
+  unsigned int row_mt;
+  unsigned int motion_vector_unit_test;
+  int delta_q_uv;
+} vp9_extracfg;
 
 static struct vp9_extracfg default_extra_cfg = {
-  0,                     // cpu_used
+#if CONFIG_REALTIME_ONLY
+  5,  // cpu_used
+#else
+  0,  // cpu_used
+#endif
   1,                     // enable_auto_alt_ref
   0,                     // noise_sensitivity
   0,                     // sharpness
   0,                     // static_thresh
   6,                     // tile_columns
   0,                     // tile_rows
+  1,                     // enable_tpl_model
+  0,                     // enable_keyframe_filtering
   7,                     // arnr_max_frames
   5,                     // arnr_strength
   0,                     // min_gf_interval; 0 -> default decision
@@ -82,12 +107,17 @@ static struct vp9_extracfg default_extra_cfg = {
   0,                     // color range
   0,                     // render width
   0,                     // render height
+  0,                     // row_mt
+  0,                     // motion_vector_unit_test
+  0,                     // delta_q_uv
 };
 
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t base;
   vpx_codec_enc_cfg_t cfg;
   struct vp9_extracfg extra_cfg;
+  vpx_codec_pts_t pts_offset;
+  unsigned char pts_offset_initialized;
   VP9EncoderConfig oxcf;
   VP9_COMP *cpi;
   unsigned char *cx_data;
@@ -105,8 +135,14 @@ struct vpx_codec_alg_priv {
   vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
   // BufferPool that holds all reference frames.
   BufferPool *buffer_pool;
+  vpx_fixed_buf_t global_headers;
+  int global_header_subsampling;
 };
 
+// Called by encoder_set_config() and encoder_encode() only. Must not be called
+// by encoder_init() because the `error` paramerer (cpi->common.error) will be
+// destroyed by vpx_codec_enc_init_ver() after encoder_init() returns an error.
+// See the "IMPORTANT" comment in vpx_codec_enc_init_ver().
 static vpx_codec_err_t update_error_state(
     vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) {
   const vpx_codec_err_t res = error->error_code;
@@ -124,10 +160,10 @@ static vpx_codec_err_t update_error_state(
     return VPX_CODEC_INVALID_PARAM; \
   } while (0)
 
-#define RANGE_CHECK(p, memb, lo, hi)                                 \
-  do {                                                               \
-    if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
-      ERROR(#memb " out of range [" #lo ".." #hi "]");               \
+#define RANGE_CHECK(p, memb, lo, hi)                                     \
+  do {                                                                   \
+    if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range [" #lo ".." #hi "]");                   \
   } while (0)
 
 #define RANGE_CHECK_HI(p, memb, hi)                                     \
@@ -148,8 +184,8 @@ static vpx_codec_err_t update_error_state(
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
                                        const struct vp9_extracfg *extra_cfg) {
-  RANGE_CHECK(cfg, g_w, 1, 65535);  // 16 bits available
-  RANGE_CHECK(cfg, g_h, 1, 65535);  // 16 bits available
+  RANGE_CHECK(cfg, g_w, 1, 65536);  // 16 bits available
+  RANGE_CHECK(cfg, g_h, 1, 65536);  // 16 bits available
   RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
   RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000);
   RANGE_CHECK_HI(cfg, g_profile, 3);
@@ -161,18 +197,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
   RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
   RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
-  RANGE_CHECK_HI(cfg, g_threads, 64);
+  RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
   RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
   RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+  RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000);
   RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
   RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
   RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
   RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
   RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+#if CONFIG_REALTIME_ONLY
+  RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#else
   RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+#endif
   RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
   RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
   if (extra_cfg->max_gf_interval > 0) {
@@ -183,6 +224,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
                 (MAX_LAG_BUFFERS - 1));
   }
 
+  // For formation of valid ARF groups lag_in _frames should be 0 or greater
+  // than the max_gf_interval + 2
+  if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 &&
+      cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) {
+    ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)");
+  }
+
   if (cfg->rc_resize_allowed == 1) {
     RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
     RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
@@ -198,7 +246,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
         level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 &&
         level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 &&
         level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN &&
-        level != LEVEL_MAX)
+        level != LEVEL_AUTO && level != LEVEL_MAX)
       ERROR("target_level is invalid");
   }
 
@@ -221,22 +269,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
         ERROR("ts_rate_decimator factors are not powers of 2");
   }
 
-#if CONFIG_SPATIAL_SVC
-
-  if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) &&
-      cfg->g_pass == VPX_RC_LAST_PASS) {
-    unsigned int i, alt_ref_sum = 0;
-    for (i = 0; i < cfg->ss_number_layers; ++i) {
-      if (cfg->ss_enable_auto_alt_ref[i]) ++alt_ref_sum;
-    }
-    if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
-      ERROR("Not enough ref buffers for svc alt ref frames");
-    if (cfg->ss_number_layers * cfg->ts_number_layers > 3 &&
-        cfg->g_error_resilient == 0)
-      ERROR("Multiple frame context are not supported for more than 3 layers");
-  }
-#endif
-
   // VP9 does not support a lower bound on the keyframe interval in
   // automatic keyframe placement mode.
   if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist &&
@@ -245,8 +277,10 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
         "kf_min_dist not supported in auto mode, use 0 "
         "or kf_max_dist instead.");
 
-  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
-  RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
+  RANGE_CHECK(extra_cfg, row_mt, 0, 1);
+  RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2);
+  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, MAX_ARF_LAYERS);
+  RANGE_CHECK(extra_cfg, cpu_used, -9, 9);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
   RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
@@ -259,10 +293,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK(extra_cfg, content, VP9E_CONTENT_DEFAULT,
               VP9E_CONTENT_INVALID - 1);
 
-  // TODO(yaowu): remove this when ssim tuning is implemented for vp9
-  if (extra_cfg->tuning == VP8_TUNE_SSIM)
-    ERROR("Option --tune=ssim is not currently supported in VP9.");
-
+#if !CONFIG_REALTIME_ONLY
   if (cfg->g_pass == VPX_RC_LAST_PASS) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
@@ -314,6 +345,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
         ERROR("rc_twopass_stats_in missing EOS stats packet");
     }
   }
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if !CONFIG_VP9_HIGHBITDEPTH
   if (cfg->g_profile > (unsigned int)PROFILE_1) {
@@ -333,6 +365,24 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   }
   RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
   RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE);
+
+  // The range below shall be further tuned.
+  RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1);
+  RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000);
+  RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, zm_factor.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000);
+  RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000);
+
   return VPX_CODEC_OK;
 }
 
@@ -341,14 +391,15 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_I42016: break;
+    case VPX_IMG_FMT_I42016:
+    case VPX_IMG_FMT_NV12: break;
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I444:
     case VPX_IMG_FMT_I440:
       if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
         ERROR(
-            "Invalid image format. I422, I444, I440 images are "
-            "not supported in profile.");
+            "Invalid image format. I422, I444, I440 images are not supported "
+            "in profile.");
       }
       break;
     case VPX_IMG_FMT_I42216:
@@ -363,8 +414,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
       break;
     default:
       ERROR(
-          "Invalid image format. Only YV12, I420, I422, I444 images are "
-          "supported.");
+          "Invalid image format. Only YV12, I420, I422, I444, I440, NV12 "
+          "images are supported.");
       break;
   }
 
@@ -377,6 +428,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
 static int get_image_bps(const vpx_image_t *img) {
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_NV12:
     case VPX_IMG_FMT_I420: return 12;
     case VPX_IMG_FMT_I422: return 16;
     case VPX_IMG_FMT_I444: return 24;
@@ -419,10 +471,20 @@ static void config_target_level(VP9EncoderConfig *oxcf) {
   oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63);
 
   // Adjust minimum art-ref distance.
-  if (oxcf->min_gf_interval <
-      (int)vp9_level_defs[target_level_index].min_altref_distance)
+  // min_gf_interval should be no less than min_altref_distance + 1,
+  // as the encoder may produce bitstream with alt-ref distance being
+  // min_gf_interval - 1.
+  if (oxcf->min_gf_interval <=
+      (int)vp9_level_defs[target_level_index].min_altref_distance) {
     oxcf->min_gf_interval =
-        (int)vp9_level_defs[target_level_index].min_altref_distance;
+        (int)vp9_level_defs[target_level_index].min_altref_distance + 1;
+    // If oxcf->max_gf_interval == 0, it will be assigned with a default value
+    // in vp9_rc_set_gf_interval_range().
+    if (oxcf->max_gf_interval != 0) {
+      oxcf->max_gf_interval =
+          VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval);
+    }
+  }
 
   // Adjust maximum column tiles.
   if (vp9_level_defs[target_level_index].max_col_tiles <
@@ -434,20 +496,33 @@ static void config_target_level(VP9EncoderConfig *oxcf) {
   }
 }
 
+static vpx_rational64_t get_g_timebase_in_ts(vpx_rational_t g_timebase) {
+  vpx_rational64_t g_timebase_in_ts;
+  g_timebase_in_ts.den = g_timebase.den;
+  g_timebase_in_ts.num = g_timebase.num;
+  g_timebase_in_ts.num *= TICKS_PER_SEC;
+  reduce_ratio(&g_timebase_in_ts);
+  return g_timebase_in_ts;
+}
+
 static vpx_codec_err_t set_encoder_config(
-    VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg,
+    VP9EncoderConfig *oxcf, vpx_codec_enc_cfg_t *cfg,
     const struct vp9_extracfg *extra_cfg) {
-  const int is_vbr = cfg->rc_end_usage == VPX_VBR;
   int sl, tl;
+  unsigned int raw_target_rate;
   oxcf->profile = cfg->g_profile;
   oxcf->max_threads = (int)cfg->g_threads;
   oxcf->width = cfg->g_w;
   oxcf->height = cfg->g_h;
   oxcf->bit_depth = cfg->g_bit_depth;
   oxcf->input_bit_depth = cfg->g_input_bit_depth;
+  // TODO(angiebird): Figure out if we can just use g_timebase to indicate the
+  // inverse of framerate
   // guess a frame rate if out of whack, use 30
   oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
   if (oxcf->init_framerate > 180) oxcf->init_framerate = 30;
+  oxcf->g_timebase = cfg->g_timebase;
+  oxcf->g_timebase_in_ts = get_g_timebase_in_ts(oxcf->g_timebase);
 
   oxcf->mode = GOOD;
 
@@ -461,8 +536,15 @@ static vpx_codec_err_t set_encoder_config(
       cfg->g_pass == VPX_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames;
   oxcf->rc_mode = cfg->rc_end_usage;
 
+  raw_target_rate =
+      (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 *
+                     oxcf->init_framerate / 1000);
+  // Cap target bitrate to raw rate or 1000Mbps, whichever is less
+  cfg->rc_target_bitrate =
+      VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000);
+
   // Convert target bandwidth from Kbit/s to Bit/s
-  oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+  oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
   oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
   oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
@@ -488,15 +570,16 @@ static vpx_codec_err_t set_encoder_config(
     oxcf->resize_mode = RESIZE_NONE;
   }
 
-  oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
-  oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
-  oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+  oxcf->maximum_buffer_size_ms = cfg->rc_buf_sz;
+  oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+  oxcf->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz;
 
   oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
 
   oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
   oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
   oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+  oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity;
 
   oxcf->auto_key =
       cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
@@ -506,14 +589,16 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->speed = abs(extra_cfg->cpu_used);
   oxcf->encode_breakout = extra_cfg->static_thresh;
   oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
-  oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+  if (oxcf->bit_depth == VPX_BITS_8) {
+    oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+  } else {
+    // Disable denoiser for high bitdepth since vp9_denoiser_filter only works
+    // for 8 bits.
+    oxcf->noise_sensitivity = 0;
+  }
   oxcf->sharpness = extra_cfg->sharpness;
 
-  oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
-
-#if CONFIG_FP_MB_STATS
-  oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
-#endif
+  vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in);
 
   oxcf->color_space = extra_cfg->color_space;
   oxcf->color_range = extra_cfg->color_range;
@@ -529,6 +614,10 @@ static vpx_codec_err_t set_encoder_config(
 
   oxcf->tile_columns = extra_cfg->tile_columns;
 
+  oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
+
+  oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+
   // TODO(yunqing): The dependencies between row tiles cause error in multi-
   // threaded encoding. For now, tile_rows is forced to be 0 in this case.
   // The further fix can be done by adding synchronizations after a tile row
@@ -554,20 +643,23 @@ static vpx_codec_err_t set_encoder_config(
 
   oxcf->target_level = extra_cfg->target_level;
 
+  oxcf->row_mt = extra_cfg->row_mt;
+  oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+
+  oxcf->delta_q_uv = extra_cfg->delta_q_uv;
+
   for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
-#if CONFIG_SPATIAL_SVC
-    oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
-#endif
     for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
-      oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] =
-          1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl];
+      const int layer = sl * oxcf->ts_number_layers + tl;
+      if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000)
+        oxcf->layer_target_bitrate[layer] = INT_MAX;
+      else
+        oxcf->layer_target_bitrate[layer] =
+            1000 * cfg->layer_target_bitrate[layer];
     }
   }
   if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) {
     oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
-#if CONFIG_SPATIAL_SVC
-    oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref;
-#endif
   }
   if (oxcf->ts_number_layers > 1) {
     for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) {
@@ -579,54 +671,161 @@ static vpx_codec_err_t set_encoder_config(
   }
 
   if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf);
-  /*
-  printf("Current VP9 Settings: \n");
-  printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
-  printf("target_level: %d\n", oxcf->target_level);
-  printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
-  printf("sharpness: %d\n",    oxcf->sharpness);
-  printf("cpu_used: %d\n",  oxcf->cpu_used);
-  printf("Mode: %d\n",     oxcf->mode);
-  printf("auto_key: %d\n",  oxcf->auto_key);
-  printf("key_freq: %d\n", oxcf->key_freq);
-  printf("end_usage: %d\n", oxcf->end_usage);
-  printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
-  printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
-  printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
-  printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);
-  printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
-  printf("fixed_q: %d\n",  oxcf->fixed_q);
-  printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
-  printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
-  printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling);
-  printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width);
-  printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height);
-  printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);
-  printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
-  printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
-  printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
-  printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
-  printf("Version: %d\n", oxcf->Version);
-  printf("encode_breakout: %d\n", oxcf->encode_breakout);
-  printf("error resilient: %d\n", oxcf->error_resilient_mode);
-  printf("frame parallel detokenization: %d\n",
-         oxcf->frame_parallel_decoding_mode);
-  */
+  // vp9_dump_encoder_config(oxcf, stderr);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t set_twopass_params_from_config(
+    const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) {
+  if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK;
+  if (cpi == NULL) return VPX_CODEC_ERROR;
+
+  cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params;
+
+  // The values set here are factors that will be applied to default values
+  // to get the final value used in the two pass code. Hence 1.0 will
+  // match the default behaviour when not using passed in values.
+  // We also apply limits here to prevent the user from applying settings
+  // that make no sense.
+  cpi->twopass.active_wq_factor =
+      (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den;
+  if (cpi->twopass.active_wq_factor < 0.25)
+    cpi->twopass.active_wq_factor = 0.25;
+  else if (cpi->twopass.active_wq_factor > 16.0)
+    cpi->twopass.active_wq_factor = 16.0;
+
+  cpi->twopass.err_per_mb =
+      (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den;
+  if (cpi->twopass.err_per_mb < 0.25)
+    cpi->twopass.err_per_mb = 0.25;
+  else if (cpi->twopass.err_per_mb > 4.0)
+    cpi->twopass.err_per_mb = 4.0;
+
+  cpi->twopass.sr_default_decay_limit =
+      (double)cfg->sr_default_decay_limit.num /
+      (double)cfg->sr_default_decay_limit.den;
+  if (cpi->twopass.sr_default_decay_limit < 0.25)
+    cpi->twopass.sr_default_decay_limit = 0.25;
+  // If the default changes this will need to change.
+  else if (cpi->twopass.sr_default_decay_limit > 1.33)
+    cpi->twopass.sr_default_decay_limit = 1.33;
+
+  cpi->twopass.sr_diff_factor =
+      (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den;
+  if (cpi->twopass.sr_diff_factor < 0.25)
+    cpi->twopass.sr_diff_factor = 0.25;
+  else if (cpi->twopass.sr_diff_factor > 4.0)
+    cpi->twopass.sr_diff_factor = 4.0;
+
+  cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num /
+                               (double)cfg->kf_err_per_mb_factor.den;
+  if (cpi->twopass.kf_err_per_mb < 0.25)
+    cpi->twopass.kf_err_per_mb = 0.25;
+  else if (cpi->twopass.kf_err_per_mb > 4.0)
+    cpi->twopass.kf_err_per_mb = 4.0;
+
+  cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num /
+                                    (double)cfg->kf_frame_min_boost_factor.den;
+  if (cpi->twopass.kf_frame_min_boost < 0.25)
+    cpi->twopass.kf_frame_min_boost = 0.25;
+  else if (cpi->twopass.kf_frame_min_boost > 4.0)
+    cpi->twopass.kf_frame_min_boost = 4.0;
+
+  cpi->twopass.kf_frame_max_boost_first =
+      (double)cfg->kf_frame_max_boost_first_factor.num /
+      (double)cfg->kf_frame_max_boost_first_factor.den;
+  if (cpi->twopass.kf_frame_max_boost_first < 0.25)
+    cpi->twopass.kf_frame_max_boost_first = 0.25;
+  else if (cpi->twopass.kf_frame_max_boost_first > 4.0)
+    cpi->twopass.kf_frame_max_boost_first = 4.0;
+
+  cpi->twopass.kf_frame_max_boost_subs =
+      (double)cfg->kf_frame_max_boost_subs_factor.num /
+      (double)cfg->kf_frame_max_boost_subs_factor.den;
+  if (cpi->twopass.kf_frame_max_boost_subs < 0.25)
+    cpi->twopass.kf_frame_max_boost_subs = 0.25;
+  else if (cpi->twopass.kf_frame_max_boost_subs > 4.0)
+    cpi->twopass.kf_frame_max_boost_subs = 4.0;
+
+  cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num /
+                                    (double)cfg->kf_max_total_boost_factor.den;
+  if (cpi->twopass.kf_max_total_boost < 0.25)
+    cpi->twopass.kf_max_total_boost = 0.25;
+  else if (cpi->twopass.kf_max_total_boost > 4.0)
+    cpi->twopass.kf_max_total_boost = 4.0;
+
+  cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num /
+                                    (double)cfg->gf_max_total_boost_factor.den;
+  if (cpi->twopass.gf_max_total_boost < 0.25)
+    cpi->twopass.gf_max_total_boost = 0.25;
+  else if (cpi->twopass.gf_max_total_boost > 4.0)
+    cpi->twopass.gf_max_total_boost = 4.0;
+
+  cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num /
+                                    (double)cfg->gf_frame_max_boost_factor.den;
+  if (cpi->twopass.gf_frame_max_boost < 0.25)
+    cpi->twopass.gf_frame_max_boost = 0.25;
+  else if (cpi->twopass.gf_frame_max_boost > 4.0)
+    cpi->twopass.gf_frame_max_boost = 4.0;
+
+  cpi->twopass.zm_factor =
+      (double)cfg->zm_factor.num / (double)cfg->zm_factor.den;
+  if (cpi->twopass.zm_factor < 0.25)
+    cpi->twopass.zm_factor = 0.25;
+  else if (cpi->twopass.zm_factor > 2.0)
+    cpi->twopass.zm_factor = 2.0;
+
+  cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num /
+                                      (double)cfg->rd_mult_inter_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0;
+
+  cpi->rd_ctrl.rd_mult_arf_qp_fac =
+      (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0;
+
+  cpi->rd_ctrl.rd_mult_key_qp_fac =
+      (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den;
+  if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25)
+    cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25;
+  else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0)
+    cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0;
+
   return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
                                           const vpx_codec_enc_cfg_t *cfg) {
   vpx_codec_err_t res;
-  int force_key = 0;
+  volatile int force_key = 0;
 
   if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
     if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
       ERROR("Cannot change width or height after initialization");
-    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+    // Note: function encoder_set_config() is allowed to be called multiple
+    // times. However, when the original frame width or height is less than two
+    // times of the new frame width or height, a forced key frame should be
+    // used (for the case of single spatial layer, since otherwise a previous
+    //  encoded frame at a lower layer may be the desired reference). To make
+    //  sure the correct detection of a forced key frame, we need
+    // to update the frame width and height only when the actual encoding is
+    // performed. cpi->last_coded_width and cpi->last_coded_height are used to
+    // track the actual coded frame size.
+    if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height &&
+         (!valid_ref_frame_size(ctx->cpi->last_coded_width,
+                                ctx->cpi->last_coded_height, cfg->g_w,
+                                cfg->g_h) &&
+          ctx->cpi->svc.number_spatial_layers == 1)) ||
         (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
-        (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+        (ctx->cpi->initial_height &&
+         (int)cfg->g_h > ctx->cpi->initial_height)) {
       force_key = 1;
+    }
   }
 
   // Prevent increasing lag_in_frames. This check is stricter than it needs
@@ -637,18 +836,29 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
     ERROR("Cannot increase lag_in_frames");
 
   res = validate_config(ctx, cfg, &ctx->extra_cfg);
+  if (res != VPX_CODEC_OK) return res;
 
-  if (res == VPX_CODEC_OK) {
-    ctx->cfg = *cfg;
-    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-    // On profile change, request a key frame
-    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    const vpx_codec_err_t codec_err =
+        update_error_state(ctx, &ctx->cpi->common.error);
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    assert(codec_err != VPX_CODEC_OK);
+    return codec_err;
   }
+  ctx->cpi->common.error.setjmp = 1;
+
+  ctx->cfg = *cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
+  // On profile change, request a key frame
+  force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+  vp9_change_config(ctx->cpi, &ctx->oxcf);
 
   if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
 
-  return res;
+  ctx->cpi->common.error.setjmp = 0;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
@@ -667,12 +877,32 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  int *const arg = va_arg(args, int *);
+  int i;
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  for (i = 0; i < VPX_SS_MAX_LAYERS; i++) {
+    arg[i] = ctx->cpi->svc.base_qindex[i];
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = ctx->cpi->common.lf.filter_level;
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx,
                                         const struct vp9_extracfg *extra_cfg) {
   const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
   if (res == VPX_CODEC_OK) {
     ctx->extra_cfg = *extra_cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
   return res;
@@ -681,7 +911,13 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  // Use fastest speed setting (speed 9 or -9) if it's set beyond the range.
   extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args);
+  extra_cfg.cpu_used = clamp(extra_cfg.cpu_used, -9, 9);
+#if CONFIG_REALTIME_ONLY
+  if (extra_cfg.cpu_used > -5 && extra_cfg.cpu_used < 5)
+    extra_cfg.cpu_used = (extra_cfg.cpu_used > 0) ? 5 : -5;
+#endif
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -727,6 +963,21 @@ static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_tpl_model = CAST(VP9E_SET_TPL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_keyframe_filtering(vpx_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_keyframe_filtering =
+      CAST(VP9E_SET_KEY_FRAME_FILTERING, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -774,7 +1025,7 @@ static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
     vpx_codec_alg_priv_t *ctx, va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.rc_max_inter_bitrate_pct =
-      CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+      CAST(VP9E_SET_MAX_INTER_BITRATE_PCT, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -804,6 +1055,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
   extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args);
+  if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -842,6 +1094,34 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.row_mt = CAST(VP9E_SET_ROW_MT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  if (data) {
+    cpi->compute_frame_low_motion_onepass = 0;
+    cpi->rc.constrain_gf_key_freq_onepass_vbr = 0;
+    cpi->cyclic_refresh->content_mode = 0;
+    cpi->disable_scene_detection_rtc_ratectrl = 1;
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_enable_motion_vector_unit_test(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.motion_vector_unit_test =
+      CAST(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
@@ -864,12 +1144,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
     priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
     if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR;
 
-#if CONFIG_MULTITHREAD
-    if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
-      return VPX_CODEC_MEM_ERROR;
-    }
-#endif
-
     if (ctx->config.enc) {
       // Update the reference to the config structure to an internal copy.
       priv->cfg = *ctx->config.enc;
@@ -877,21 +1151,21 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
     }
 
     priv->extra_cfg = default_extra_cfg;
-    once(vp9_initialize_enc);
+    vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
     if (res == VPX_CODEC_OK) {
+      priv->pts_offset_initialized = 0;
+      priv->global_header_subsampling = -1;
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
 #if CONFIG_VP9_HIGHBITDEPTH
       priv->oxcf.use_highbitdepth =
           (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
 #endif
       priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
-      if (priv->cpi == NULL)
-        res = VPX_CODEC_MEM_ERROR;
-      else
-        priv->cpi->output_pkt_list = &priv->pkt_list.head;
+      if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR;
+      set_twopass_params_from_config(&priv->cfg, priv->cpi);
     }
   }
 
@@ -900,29 +1174,36 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
 
 static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
+  free(ctx->global_headers.buf);
   vp9_remove_compressor(ctx->cpi);
-#if CONFIG_MULTITHREAD
-  pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
-#endif
   vpx_free(ctx->buffer_pool);
   vpx_free(ctx);
   return VPX_CODEC_OK;
 }
 
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
-                                    unsigned long duration,
-                                    unsigned long deadline) {
+static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+                                               unsigned long duration,
+                                               vpx_enc_deadline_t deadline) {
   MODE new_mode = BEST;
 
+#if CONFIG_REALTIME_ONLY
+  (void)duration;
+  deadline = VPX_DL_REALTIME;
+#else
   switch (ctx->cfg.g_pass) {
     case VPX_RC_ONE_PASS:
       if (deadline > 0) {
-        const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
-
         // Convert duration parameter from stream timebase to microseconds.
-        const uint64_t duration_us = (uint64_t)duration * 1000000 *
-                                     (uint64_t)cfg->g_timebase.num /
-                                     (uint64_t)cfg->g_timebase.den;
+        VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 &&
+                          (TICKS_PER_SEC % 1000000) == 0);
+
+        if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) {
+          ERROR("duration is too big");
+        }
+        uint64_t duration_us = duration *
+                               (uint64_t)ctx->oxcf.g_timebase_in_ts.num /
+                               ((uint64_t)ctx->oxcf.g_timebase_in_ts.den *
+                                (TICKS_PER_SEC / 1000000));
 
         // If the deadline is more that the duration this frame is to be shown,
         // use good quality mode. Otherwise use realtime mode.
@@ -934,6 +1215,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
     case VPX_RC_FIRST_PASS: break;
     case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? GOOD : BEST; break;
   }
+#endif  // CONFIG_REALTIME_ONLY
 
   if (deadline == VPX_DL_REALTIME) {
     ctx->oxcf.pass = 0;
@@ -944,6 +1226,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
     ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
+  return VPX_CODEC_OK;
 }
 
 // Turn on to test if supplemental superframe data breaks decoding
@@ -1005,71 +1288,68 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
   return index_sz;
 }
 
-static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
-                                       int64_t n) {
-  return n * TICKS_PER_SEC * timebase->num / timebase->den;
-}
-
-static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
-                                       int64_t n) {
-  const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1;
-  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
-}
-
 static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
                                                    unsigned int lib_flags) {
   vpx_codec_frame_flags_t flags = lib_flags << 16;
 
   if (lib_flags & FRAMEFLAGS_KEY ||
-      (cpi->use_svc &&
-       cpi->svc
-           .layer_context[cpi->svc.spatial_layer_id *
-                              cpi->svc.number_temporal_layers +
-                          cpi->svc.temporal_layer_id]
-           .is_key_frame))
+      (cpi->use_svc && cpi->svc
+                           .layer_context[cpi->svc.spatial_layer_id *
+                                              cpi->svc.number_temporal_layers +
+                                          cpi->svc.temporal_layer_id]
+                           .is_key_frame))
     flags |= VPX_FRAME_IS_KEY;
 
+  if (!cpi->common.show_frame) {
+    flags |= VPX_FRAME_IS_INVISIBLE;
+  }
+
   if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE;
 
   return flags;
 }
 
+static INLINE vpx_codec_cx_pkt_t get_psnr_pkt(const PSNR_STATS *psnr) {
+  vpx_codec_cx_pkt_t pkt;
+  pkt.kind = VPX_CODEC_PSNR_PKT;
+  pkt.data.psnr = *psnr;
+  return pkt;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE vpx_codec_cx_pkt_t
+get_first_pass_stats_pkt(FIRSTPASS_STATS *stats) {
+  // WARNNING: This function assumes that stats will
+  // exist and not be changed until the packet is processed
+  // TODO(angiebird): Refactor the code to avoid using the assumption.
+  vpx_codec_cx_pkt_t pkt;
+  pkt.kind = VPX_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(*stats);
+  return pkt;
+}
+#endif
+
 const size_t kMinCompressedSize = 8192;
 static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                       const vpx_image_t *img,
-                                      vpx_codec_pts_t pts,
+                                      vpx_codec_pts_t pts_val,
                                       unsigned long duration,
                                       vpx_enc_frame_flags_t enc_flags,
-                                      unsigned long deadline) {
+                                      vpx_enc_deadline_t deadline) {
   volatile vpx_codec_err_t res = VPX_CODEC_OK;
   volatile vpx_enc_frame_flags_t flags = enc_flags;
+  volatile vpx_codec_pts_t pts = pts_val;
   VP9_COMP *const cpi = ctx->cpi;
-  const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
+  const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts;
   size_t data_sz;
+  vpx_codec_cx_pkt_t pkt;
+  memset(&pkt, 0, sizeof(pkt));
 
   if (cpi == NULL) return VPX_CODEC_INVALID_PARAM;
 
-  if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 &&
-      !cpi->level_constraint.rc_config_updated) {
-    SVC *const svc = &cpi->svc;
-    const int is_two_pass_svc =
-        (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1);
-    const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-    TWO_PASS *const twopass = &cpi->twopass;
-    FIRSTPASS_STATS *stats = &twopass->total_stats;
-    if (is_two_pass_svc) {
-      const double frame_rate = 10000000.0 * stats->count / stats->duration;
-      vp9_update_spatial_layer_framerate(cpi, frame_rate);
-      twopass->bits_left =
-          (int64_t)(stats->duration *
-                    svc->layer_context[svc->spatial_layer_id].target_bandwidth /
-                    10000000.0);
-    } else {
-      twopass->bits_left =
-          (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
-    }
-    cpi->level_constraint.rc_config_updated = 1;
-  }
+  cpi->last_coded_width = ctx->oxcf.width;
+  cpi->last_coded_height = ctx->oxcf.height;
 
   if (img != NULL) {
     res = validate_img(ctx, img);
@@ -1077,7 +1357,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
       data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
-                (cpi->multi_arf_allowed ? 8 : 2);
+                (cpi->multi_layer_arf ? 8 : 2);
       if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
         ctx->cx_data_sz = data_sz;
@@ -1087,10 +1367,27 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
           return VPX_CODEC_MEM_ERROR;
         }
       }
+
+      int chroma_subsampling = -1;
+      if ((img->fmt & VPX_IMG_FMT_I420) == VPX_IMG_FMT_I420 ||
+          (img->fmt & VPX_IMG_FMT_NV12) == VPX_IMG_FMT_NV12 ||
+          (img->fmt & VPX_IMG_FMT_YV12) == VPX_IMG_FMT_YV12) {
+        chroma_subsampling = 1;  // matches default for Codec Parameter String
+      } else if ((img->fmt & VPX_IMG_FMT_I422) == VPX_IMG_FMT_I422) {
+        chroma_subsampling = 2;
+      } else if ((img->fmt & VPX_IMG_FMT_I444) == VPX_IMG_FMT_I444) {
+        chroma_subsampling = 3;
+      }
+      if (chroma_subsampling > ctx->global_header_subsampling) {
+        ctx->global_header_subsampling = chroma_subsampling;
+      }
     }
   }
 
-  pick_quickcompress_mode(ctx, duration, deadline);
+  res = pick_quickcompress_mode(ctx, duration, deadline);
+  if (res != VPX_CODEC_OK) {
+    return res;
+  }
   vpx_codec_pkt_list_init(&ctx->pkt_list);
 
   // Handle Flags
@@ -1108,7 +1405,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
   }
   cpi->common.error.setjmp = 1;
 
-  vp9_apply_encoding_flags(cpi, flags);
+  if (res == VPX_CODEC_OK) vp9_apply_encoding_flags(cpi, flags);
 
   // Handle fixed keyframe intervals
   if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -1121,33 +1418,71 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 
   if (res == VPX_CODEC_OK) {
     unsigned int lib_flags = 0;
-    YV12_BUFFER_CONFIG sd;
-    int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
-    int64_t dst_end_time_stamp =
-        timebase_units_to_ticks(timebase, pts + duration);
     size_t size, cx_data_sz;
     unsigned char *cx_data;
 
+    // Per-frame PSNR is not supported when g_lag_in_frames is greater than 0.
+    if ((flags & VPX_EFLAG_CALCULATE_PSNR) && ctx->cfg.g_lag_in_frames != 0) {
+      vpx_internal_error(
+          &ctx->cpi->common.error, VPX_CODEC_INCAPABLE,
+          "Cannot calculate per-frame PSNR when g_lag_in_frames is nonzero");
+    }
     // Set up internal flags
-    if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+#if CONFIG_INTERNAL_STATS
+    assert(cpi->b_calculate_psnr == 1);
+#else
+    cpi->b_calculate_psnr = (ctx->base.init_flags & VPX_CODEC_USE_PSNR) ||
+                            (flags & VPX_EFLAG_CALCULATE_PSNR);
+#endif
 
     if (img != NULL) {
+      YV12_BUFFER_CONFIG sd;
+
+      if (!ctx->pts_offset_initialized) {
+        ctx->pts_offset = pts;
+        ctx->pts_offset_initialized = 1;
+      }
+      if (pts < ctx->pts_offset) {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "pts is smaller than initial pts");
+      }
+      pts -= ctx->pts_offset;
+      if (pts > INT64_MAX / timebase_in_ts->num) {
+        vpx_internal_error(
+            &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts to ticks would overflow");
+      }
+      const int64_t dst_time_stamp =
+          timebase_units_to_ticks(timebase_in_ts, pts);
+
+      cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1);
+      cpi->svc.time_stamp_superframe = dst_time_stamp;
+
+#if ULONG_MAX > INT64_MAX
+      if (duration > INT64_MAX) {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "duration is too big");
+      }
+#endif
+      if (pts > INT64_MAX - (int64_t)duration) {
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM,
+                           "relative pts + duration is too big");
+      }
+      vpx_codec_pts_t pts_end = pts + (int64_t)duration;
+      if (pts_end > INT64_MAX / timebase_in_ts->num) {
+        vpx_internal_error(
+            &cpi->common.error, VPX_CODEC_INVALID_PARAM,
+            "conversion of relative pts + duration to ticks would overflow");
+      }
+      const int64_t dst_end_time_stamp =
+          timebase_units_to_ticks(timebase_in_ts, pts_end);
       res = image2yuvconfig(img, &sd);
 
-      if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
-        /* from vpx_encoder.h for g_w/g_h:
-           "Note that the frames passed as input to the encoder must have this
-           resolution"
-        */
-        ctx->base.err_detail = "Invalid input frame resolution";
-        res = VPX_CODEC_INVALID_PARAM;
-      } else {
-        // Store the original flags in to the frame buffer. Will extract the
-        // key frame flag when we actually encode this frame.
-        if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
-                                  dst_time_stamp, dst_end_time_stamp)) {
-          res = update_error_state(ctx, &cpi->common.error);
-        }
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+                                dst_time_stamp, dst_end_time_stamp)) {
+        res = update_error_state(ctx, &cpi->common.error);
       }
       ctx->next_frame_flags = 0;
     }
@@ -1157,127 +1492,154 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
 
     /* Any pending invisible frames? */
     if (ctx->pending_cx_data) {
+      assert(cx_data_sz >= ctx->pending_cx_data_sz);
       memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
       ctx->pending_cx_data = cx_data;
       cx_data += ctx->pending_cx_data_sz;
       cx_data_sz -= ctx->pending_cx_data_sz;
 
-      /* TODO: this is a minimal check, the underlying codec doesn't respect
-       * the buffer size anyway.
+      /* TODO(webm:1844): this is a minimal check, the underlying codec doesn't
+       * respect the buffer size anyway.
        */
       if (cx_data_sz < ctx->cx_data_sz / 2) {
         vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR,
                            "Compressed data buffer too small");
-        return VPX_CODEC_ERROR;
       }
     }
 
-    while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-           -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,
-                                         &dst_time_stamp, &dst_end_time_stamp,
-                                         !img)) {
-      if (size) {
-        vpx_codec_cx_pkt_t pkt;
+    if (cpi->oxcf.pass == 1 && !cpi->use_svc) {
+#if !CONFIG_REALTIME_ONLY
+      // compute first pass stats
+      if (img) {
+        int ret;
+        int64_t dst_time_stamp;
+        int64_t dst_end_time_stamp;
+        vpx_codec_cx_pkt_t fps_pkt;
+        ENCODE_FRAME_RESULT encode_frame_result;
+        vp9_init_encode_frame_result(&encode_frame_result);
+        // TODO(angiebird): Call vp9_first_pass directly
+        ret = vp9_get_compressed_data(
+            cpi, &lib_flags, &size, cx_data, cx_data_sz, &dst_time_stamp,
+            &dst_end_time_stamp, !img, &encode_frame_result);
+        assert(size == 0);  // There is no compressed data in the first pass
+        (void)ret;
+        assert(ret == 0);
+        fps_pkt = get_first_pass_stats_pkt(&cpi->twopass.this_frame_stats);
+        vpx_codec_pkt_list_add(&ctx->pkt_list.head, &fps_pkt);
+      } else {
+        if (!cpi->twopass.first_pass_done) {
+          vpx_codec_cx_pkt_t fps_pkt;
+          vp9_end_first_pass(cpi);
+          fps_pkt = get_first_pass_stats_pkt(&cpi->twopass.total_stats);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &fps_pkt);
+        }
+      }
+#else   // !CONFIG_REALTIME_ONLY
+      assert(0);
+#endif  // !CONFIG_REALTIME_ONLY
+    } else {
+      ENCODE_FRAME_RESULT encode_frame_result;
+      int64_t dst_time_stamp;
+      int64_t dst_end_time_stamp;
+      vp9_init_encode_frame_result(&encode_frame_result);
+      while (cx_data_sz >= ctx->cx_data_sz / 2 &&
+             -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,
+                                           cx_data_sz, &dst_time_stamp,
+                                           &dst_end_time_stamp, !img,
+                                           &encode_frame_result)) {
+        // Pack psnr pkt.
+        if (size > 0) {
+          PSNR_STATS psnr;
+          if (vp9_get_psnr(cpi, &psnr)) {
+            vpx_codec_cx_pkt_t psnr_pkt = get_psnr_pkt(&psnr);
+            vpx_codec_pkt_list_add(&ctx->pkt_list.head, &psnr_pkt);
+          }
+        }
 
-#if CONFIG_SPATIAL_SVC
-        if (cpi->use_svc)
-          cpi->svc
-              .layer_context[cpi->svc.spatial_layer_id *
-                             cpi->svc.number_temporal_layers]
-              .layer_size += size;
-#endif
+        if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) {
+          // Pack invisible frames with the next visible frame
+          if (!cpi->common.show_frame ||
+              (cpi->use_svc && cpi->svc.spatial_layer_id <
+                                   cpi->svc.number_spatial_layers - 1)) {
+            if (ctx->pending_cx_data == NULL) ctx->pending_cx_data = cx_data;
+            ctx->pending_cx_data_sz += size;
+            if (size)
+              ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+            ctx->pending_frame_magnitude |= size;
+            cx_data += size;
+            cx_data_sz -= size;
+            pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
+            pkt.data.frame.height[cpi->svc.spatial_layer_id] =
+                cpi->common.height;
+            pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] =
+                1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id];
 
-        // Pack invisible frames with the next visible frame
-        if (!cpi->common.show_frame ||
-            (cpi->use_svc &&
-             cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)) {
-          if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
-          ctx->pending_cx_data_sz += size;
-          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-          ctx->pending_frame_magnitude |= size;
-          cx_data += size;
-          cx_data_sz -= size;
+            if (ctx->output_cx_pkt_cb.output_cx_pkt) {
+              pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+              pkt.data.frame.pts =
+                  ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
+                  ctx->pts_offset;
+              pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
+                  timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
+              pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+              pkt.data.frame.buf = ctx->pending_cx_data;
+              pkt.data.frame.sz = size;
+              ctx->pending_cx_data = NULL;
+              ctx->pending_cx_data_sz = 0;
+              ctx->pending_frame_count = 0;
+              ctx->pending_frame_magnitude = 0;
+              ctx->output_cx_pkt_cb.output_cx_pkt(
+                  &pkt, ctx->output_cx_pkt_cb.user_priv);
+            }
+            continue;
+          }
 
-          if (ctx->output_cx_pkt_cb.output_cx_pkt) {
-            pkt.kind = VPX_CODEC_CX_FRAME_PKT;
-            pkt.data.frame.pts =
-                ticks_to_timebase_units(timebase, dst_time_stamp);
-            pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
-                timebase, dst_end_time_stamp - dst_time_stamp);
-            pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+          // Add the frame packet to the list of returned packets.
+          pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+          pkt.data.frame.pts =
+              ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) +
+              ctx->pts_offset;
+          pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
+              timebase_in_ts, dst_end_time_stamp - dst_time_stamp);
+          pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+          pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
+          pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
+          pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] =
+              1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id];
+
+          if (ctx->pending_cx_data) {
+            if (size)
+              ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+            ctx->pending_frame_magnitude |= size;
+            ctx->pending_cx_data_sz += size;
+            // write the superframe only for the case when
+            if (!ctx->output_cx_pkt_cb.output_cx_pkt)
+              size += write_superframe_index(ctx);
             pkt.data.frame.buf = ctx->pending_cx_data;
-            pkt.data.frame.sz = size;
+            pkt.data.frame.sz = ctx->pending_cx_data_sz;
             ctx->pending_cx_data = NULL;
             ctx->pending_cx_data_sz = 0;
             ctx->pending_frame_count = 0;
             ctx->pending_frame_magnitude = 0;
+          } else {
+            pkt.data.frame.buf = cx_data;
+            pkt.data.frame.sz = size;
+          }
+          pkt.data.frame.partition_id = -1;
+
+          if (ctx->output_cx_pkt_cb.output_cx_pkt)
             ctx->output_cx_pkt_cb.output_cx_pkt(
                 &pkt, ctx->output_cx_pkt_cb.user_priv);
+          else
+            vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+          cx_data += size;
+          cx_data_sz -= size;
+          if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id ==
+                                       cpi->svc.number_spatial_layers - 1)) {
+            // Encoded all spatial layers; exit loop.
+            break;
           }
-          continue;
-        }
-
-        // Add the frame packet to the list of returned packets.
-        pkt.kind = VPX_CODEC_CX_FRAME_PKT;
-        pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
-        pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
-            timebase, dst_end_time_stamp - dst_time_stamp);
-        pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
-
-        if (ctx->pending_cx_data) {
-          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-          ctx->pending_frame_magnitude |= size;
-          ctx->pending_cx_data_sz += size;
-          // write the superframe only for the case when
-          if (!ctx->output_cx_pkt_cb.output_cx_pkt)
-            size += write_superframe_index(ctx);
-          pkt.data.frame.buf = ctx->pending_cx_data;
-          pkt.data.frame.sz = ctx->pending_cx_data_sz;
-          ctx->pending_cx_data = NULL;
-          ctx->pending_cx_data_sz = 0;
-          ctx->pending_frame_count = 0;
-          ctx->pending_frame_magnitude = 0;
-        } else {
-          pkt.data.frame.buf = cx_data;
-          pkt.data.frame.sz = size;
-        }
-        pkt.data.frame.partition_id = -1;
-
-        if (ctx->output_cx_pkt_cb.output_cx_pkt)
-          ctx->output_cx_pkt_cb.output_cx_pkt(&pkt,
-                                              ctx->output_cx_pkt_cb.user_priv);
-        else
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
-
-        cx_data += size;
-        cx_data_sz -= size;
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-#if CONFIG_SPATIAL_SVC
-        if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) {
-          vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
-          int sl;
-          vp9_zero(pkt_sizes);
-          vp9_zero(pkt_psnr);
-          pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
-          pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
-          for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
-            LAYER_CONTEXT *lc =
-                &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
-            pkt_sizes.data.layer_sizes[sl] = lc->layer_size;
-            pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt;
-            lc->layer_size = 0;
-          }
-
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
-
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
-        }
-#endif
-#endif
-        if (is_one_pass_cbr_svc(cpi) &&
-            (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
-          // Encoded all spatial layers; exit loop.
-          break;
         }
       }
     }
@@ -1303,9 +1665,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
     vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
                           &sd);
     return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
@@ -1319,9 +1680,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
     vp9_copy_reference_enc(ctx->cpi,
                            ref_frame_to_vp9_reframe(frame->frame_type), &sd);
     return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
@@ -1329,14 +1689,13 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
   vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *);
 
   if (frame != NULL) {
-    YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+    const int fb_idx = ctx->cpi->common.cur_show_frame_fb_idx;
+    YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->cpi->common, fb_idx);
     if (fb == NULL) return VPX_CODEC_ERROR;
-
     yuvconfig2image(&frame->img, fb, NULL);
     return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
@@ -1346,9 +1705,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
   if (config != NULL) {
     ctx->preview_ppcfg = *config;
     return VPX_CODEC_OK;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 #else
   (void)ctx;
   (void)args;
@@ -1356,6 +1714,34 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
 #endif
 }
 
+// Returns the contents of CodecPrivate described in:
+// https://www.webmproject.org/docs/container/#vp9-codec-feature-metadata-codecprivate
+// This includes Profile, Level, Bit depth and Chroma subsampling. Each entry
+// is 3 bytes. 1 byte ID, 1 byte length (= 1) and 1 byte value.
+static vpx_fixed_buf_t *encoder_get_global_headers(vpx_codec_alg_priv_t *ctx) {
+  if (!ctx->cpi) return NULL;
+
+  const unsigned int profile = ctx->cfg.g_profile;
+  const VP9_LEVEL level = vp9_get_level(&ctx->cpi->level_info.level_spec);
+  const vpx_bit_depth_t bit_depth = ctx->cfg.g_bit_depth;
+  const int subsampling = ctx->global_header_subsampling;
+  const uint8_t buf[12] = {
+    1, 1, (uint8_t)profile,   2, 1, (uint8_t)level,
+    3, 1, (uint8_t)bit_depth, 4, 1, (uint8_t)subsampling
+  };
+
+  if (ctx->global_headers.buf) free(ctx->global_headers.buf);
+  ctx->global_headers.buf = malloc(sizeof(buf));
+  if (!ctx->global_headers.buf) return NULL;
+
+  ctx->global_headers.sz = sizeof(buf);
+  // No data or I440, which isn't mapped.
+  if (ctx->global_header_subsampling == -1) ctx->global_headers.sz -= 3;
+  memcpy(ctx->global_headers.buf, buf, ctx->global_headers.sz);
+
+  return &ctx->global_headers;
+}
+
 static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
   YV12_BUFFER_CONFIG sd;
   vp9_ppflags_t flags;
@@ -1370,17 +1756,20 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
   if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) {
     yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
-  } else {
-    return NULL;
   }
+  return NULL;
 }
 
 static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
-  (void)ctx;
-  (void)args;
+  vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
 
-  // TODO(yaowu): Need to re-implement and test for VP9.
+  if (data) {
+    vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+    return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+                           roi->delta_q, roi->delta_lf, roi->skip,
+                           roi->ref_frame);
+  }
   return VPX_CODEC_INVALID_PARAM;
 }
 
@@ -1392,11 +1781,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
     if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
                             (int)map->cols))
       return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else {
+
     return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
@@ -1407,11 +1795,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
     if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
                             (int)map->cols))
       return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else {
+
     return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
@@ -1419,13 +1806,11 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
   vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
 
   if (mode) {
-    const int res =
-        vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
-                              (VPX_SCALING)mode->v_scaling_mode);
+    const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode,
+                                          mode->v_scaling_mode);
     return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) {
@@ -1445,6 +1830,9 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) {
       cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) {
     return VPX_CODEC_INVALID_PARAM;
   }
+
+  vp9_set_row_mt(ctx->cpi);
+
   return VPX_CODEC_OK;
 }
 
@@ -1453,22 +1841,23 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
   vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *);
   VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
   SVC *const svc = &cpi->svc;
+  int sl;
 
-  svc->first_spatial_layer_to_encode = data->spatial_layer_id;
   svc->spatial_layer_to_encode = data->spatial_layer_id;
+  svc->first_spatial_layer_to_encode = data->spatial_layer_id;
+  // TODO(jianj): Deprecated to be removed.
   svc->temporal_layer_id = data->temporal_layer_id;
+  // Allow for setting temporal layer per spatial layer for superframe.
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    svc->temporal_layer_id_per_spatial[sl] =
+        data->temporal_layer_id_per_spatial[sl];
+  }
   // Checks on valid layer_id input.
   if (svc->temporal_layer_id < 0 ||
       svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  if (svc->first_spatial_layer_to_encode < 0 ||
-      svc->first_spatial_layer_to_encode >= (int)ctx->cfg.ss_number_layers) {
-    return VPX_CODEC_INVALID_PARAM;
-  }
-  // First spatial layer to encode not implemented for two-pass.
-  if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0)
-    return VPX_CODEC_INVALID_PARAM;
+
   return VPX_CODEC_OK;
 }
 
@@ -1499,29 +1888,112 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
       LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
       lc->max_q = params->max_quantizers[layer];
       lc->min_q = params->min_quantizers[layer];
+      // Checks on valid scale factors.
+      if (params->scaling_factor_num[sl] < 1 ||
+          params->scaling_factor_den[sl] < 1 ||
+          (params->scaling_factor_num[sl] > params->scaling_factor_den[sl])) {
+        return VPX_CODEC_INVALID_PARAM;
+      }
       lc->scaling_factor_num = params->scaling_factor_num[sl];
       lc->scaling_factor_den = params->scaling_factor_den[sl];
       lc->speed = params->speed_per_layer[sl];
+      lc->loopfilter_ctrl = params->loopfilter_ctrl[sl];
     }
   }
 
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
+  int sl;
+  for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) {
+    data->update_buffer_slot[sl] = cpi->svc.update_buffer_slot[sl];
+    data->reference_last[sl] = cpi->svc.reference_last[sl];
+    data->reference_golden[sl] = cpi->svc.reference_golden[sl];
+    data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl];
+    data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl];
+    data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl];
+    data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl];
+    // TODO(jianj): Remove these 3, deprecated.
+    data->update_last[sl] = cpi->svc.update_last[sl];
+    data->update_golden[sl] = cpi->svc.update_golden[sl];
+    data->update_alt_ref[sl] = cpi->svc.update_altref[sl];
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
                                                      va_list args) {
   VP9_COMP *const cpi = ctx->cpi;
   vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
   int sl;
+  cpi->svc.use_set_ref_frame_config = 1;
   for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
-    cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl];
-    cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl];
-    cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl];
-    cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl];
+    cpi->svc.update_buffer_slot[sl] = data->update_buffer_slot[sl];
+    cpi->svc.reference_last[sl] = data->reference_last[sl];
+    cpi->svc.reference_golden[sl] = data->reference_golden[sl];
+    cpi->svc.reference_altref[sl] = data->reference_alt_ref[sl];
+    cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl];
+    cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl];
+    cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl];
+    cpi->svc.duration[sl] = data->duration[sl];
   }
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_svc_inter_layer_pred(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  const int data = va_arg(args, int);
+  VP9_COMP *const cpi = ctx->cpi;
+  cpi->svc.disable_inter_layer_pred = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_frame_drop_t *data = va_arg(args, vpx_svc_frame_drop_t *);
+  int sl;
+  cpi->svc.framedrop_mode = data->framedrop_mode;
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl)
+    cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl];
+  // Don't allow max_consec_drop values below 1.
+  cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_svc_gf_temporal_ref(vpx_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  cpi->svc.use_gf_temporal_ref = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_svc_spatial_layer_sync(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_spatial_layer_sync_t *data =
+      va_arg(args, vpx_svc_spatial_layer_sync_t *);
+  int sl;
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl)
+    cpi->svc.spatial_layer_sync[sl] = data->spatial_layer_sync[sl];
+  cpi->svc.set_intra_only_frame = data->base_layer_intra_only;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_delta_q_uv(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  int data = va_arg(args, int);
+  data = clamp(data, -15, 15);
+  extra_cfg.delta_q_uv = data;
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
                                                  va_list args) {
   vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
@@ -1562,13 +2034,99 @@ static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  cpi->rc.ext_use_post_encode_drop = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr(
+    vpx_codec_alg_priv_t *ctx, va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  cpi->rc.disable_overshoot_maxq_cbr = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const unsigned int data = va_arg(args, unsigned int);
+  cpi->loopfilter_ctrl = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  vpx_rc_funcs_t funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args);
+  VP9_COMP *cpi = ctx->cpi;
+  EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl;
+  const VP9EncoderConfig *oxcf = &cpi->oxcf;
+  if (oxcf->pass == 2) {
+    const FRAME_INFO *frame_info = &cpi->frame_info;
+    vpx_rc_config_t ratectrl_config;
+    vpx_codec_err_t codec_status;
+    memset(&ratectrl_config, 0, sizeof(ratectrl_config));
+
+    ratectrl_config.frame_width = frame_info->frame_width;
+    ratectrl_config.frame_height = frame_info->frame_height;
+    ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames;
+    ratectrl_config.max_gf_interval = oxcf->max_gf_interval;
+    ratectrl_config.min_gf_interval = oxcf->min_gf_interval;
+    // TODO(angiebird): Double check whether this is the proper way to set up
+    // target_bitrate and frame_rate.
+    ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000);
+    ratectrl_config.frame_rate_num = oxcf->g_timebase.den;
+    ratectrl_config.frame_rate_den = oxcf->g_timebase.num;
+    ratectrl_config.overshoot_percent = oxcf->over_shoot_pct;
+    ratectrl_config.undershoot_percent = oxcf->under_shoot_pct;
+    ratectrl_config.min_base_q_index = oxcf->best_allowed_q;
+    ratectrl_config.max_base_q_index = oxcf->worst_allowed_q;
+    ratectrl_config.base_qp = oxcf->cq_level;
+
+    if (oxcf->rc_mode == VPX_VBR) {
+      ratectrl_config.rc_mode = VPX_RC_VBR;
+    } else if (oxcf->rc_mode == VPX_Q) {
+      ratectrl_config.rc_mode = VPX_RC_QMODE;
+    } else if (oxcf->rc_mode == VPX_CQ) {
+      ratectrl_config.rc_mode = VPX_RC_CQ;
+    }
+
+    codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl);
+    if (codec_status != VPX_CODEC_OK) {
+      return codec_status;
+    }
+  }
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  const int qp = va_arg(args, int);
+  vpx_codec_enc_cfg_t *cfg = &ctx->cfg;
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  vpx_codec_err_t res;
+
+  if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM;
+
+  cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp;
+  extra_cfg.aq_mode = 0;
+  cpi->fixed_qp_onepass = 1;
+
+  res = update_extra_cfg(ctx, &extra_cfg);
+  return res;
+}
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP8_COPY_REFERENCE, ctrl_copy_reference },
 
   // Setters
   { VP8_SET_REFERENCE, ctrl_set_reference },
   { VP8_SET_POSTPROC, ctrl_set_previewpp },
-  { VP8E_SET_ROI_MAP, ctrl_set_roi_map },
+  { VP9E_SET_ROI_MAP, ctrl_set_roi_map },
   { VP8E_SET_ACTIVEMAP, ctrl_set_active_map },
   { VP8E_SET_SCALEMODE, ctrl_set_scale_mode },
   { VP8E_SET_CPUUSED, ctrl_set_cpuused },
@@ -1577,6 +2135,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
   { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
   { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows },
+  { VP9E_SET_TPL, ctrl_set_tpl_model },
+  { VP9E_SET_KEY_FRAME_FILTERING, ctrl_set_keyframe_filtering },
   { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
   { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
   { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type },
@@ -1603,14 +2163,30 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
   { VP9E_SET_RENDER_SIZE, ctrl_set_render_size },
   { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
+  { VP9E_SET_ROW_MT, ctrl_set_row_mt },
+  { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop },
+  { VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, ctrl_set_disable_overshoot_maxq_cbr },
+  { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+  { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred },
+  { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer },
+  { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref },
+  { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync },
+  { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv },
+  { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter },
+  { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
+  { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control },
+  { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
 
   // Getters
   { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
   { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers },
+  { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level },
   { VP9_GET_REFERENCE, ctrl_get_reference },
   { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id },
   { VP9E_GET_ACTIVEMAP, ctrl_get_active_map },
   { VP9E_GET_LEVEL, ctrl_get_level },
+  { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config },
 
   { -1, NULL },
 };
@@ -1619,7 +2195,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
   { 0,
     {
         // NOLINT
-        0,  // g_usage
+        0,  // g_usage (unused)
         8,  // g_threads
         0,  // g_profile
 
@@ -1640,13 +2216,13 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
         0,   // rc_resize_allowed
         0,   // rc_scaled_width
         0,   // rc_scaled_height
-        60,  // rc_resize_down_thresold
-        30,  // rc_resize_up_thresold
+        60,  // rc_resize_down_thresh
+        30,  // rc_resize_up_thresh
 
         VPX_VBR,      // rc_end_usage
         { NULL, 0 },  // rc_twopass_stats_in
         { NULL, 0 },  // rc_firstpass_mb_stats_in
-        256,          // rc_target_bandwidth
+        256,          // rc_target_bitrate
         0,            // rc_min_quantizer
         63,           // rc_max_quantizer
         25,           // rc_undershoot_pct
@@ -1659,6 +2235,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
         50,    // rc_two_pass_vbrbias
         0,     // rc_two_pass_vbrmin_section
         2000,  // rc_two_pass_vbrmax_section
+        0,     // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr)
 
         // keyframing settings (kf)
         VPX_KF_AUTO,  // g_kfmode
@@ -1667,14 +2244,30 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
 
         VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
         { 0 },
-        { 0 },  // ss_target_bitrate
-        1,      // ts_number_layers
-        { 0 },  // ts_target_bitrate
-        { 0 },  // ts_rate_decimator
-        0,      // ts_periodicity
-        { 0 },  // ts_layer_id
-        { 0 },  // layer_taget_bitrate
-        0       // temporal_layering_mode
+        { 0 },     // ss_target_bitrate
+        1,         // ts_number_layers
+        { 0 },     // ts_target_bitrate
+        { 0 },     // ts_rate_decimator
+        0,         // ts_periodicity
+        { 0 },     // ts_layer_id
+        { 0 },     // layer_target_bitrate
+        0,         // temporal_layering_mode
+        0,         // use_vizier_rc_params
+        { 1, 1 },  // active_wq_factor
+        { 1, 1 },  // err_per_mb_factor
+        { 1, 1 },  // sr_default_decay_limit
+        { 1, 1 },  // sr_diff_factor
+        { 1, 1 },  // kf_err_per_mb_factor
+        { 1, 1 },  // kf_frame_min_boost_factor
+        { 1, 1 },  // kf_frame_max_boost_first_factor
+        { 1, 1 },  // kf_frame_max_boost_subs_factor
+        { 1, 1 },  // kf_max_total_boost_factor
+        { 1, 1 },  // gf_max_total_boost_factor
+        { 1, 1 },  // gf_frame_max_boost_factor
+        { 1, 1 },  // zm_factor
+        { 1, 1 },  // rd_mult_inter_qp_fac
+        { 1, 1 },  // rd_mult_arf_qp_fac
+        { 1, 1 },  // rd_mult_key_qp_fac
     } },
 };
 
@@ -1701,13 +2294,238 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   },
   {
       // NOLINT
-      1,                      // 1 cfg map
-      encoder_usage_cfg_map,  // vpx_codec_enc_cfg_map_t
-      encoder_encode,         // vpx_codec_encode_fn_t
-      encoder_get_cxdata,     // vpx_codec_get_cx_data_fn_t
-      encoder_set_config,     // vpx_codec_enc_config_set_fn_t
-      NULL,                   // vpx_codec_get_global_headers_fn_t
-      encoder_get_preview,    // vpx_codec_get_preview_frame_fn_t
-      NULL                    // vpx_codec_enc_mr_get_mem_loc_fn_t
+      1,                           // 1 cfg map
+      encoder_usage_cfg_map,       // vpx_codec_enc_cfg_map_t
+      encoder_encode,              // vpx_codec_encode_fn_t
+      encoder_get_cxdata,          // vpx_codec_get_cx_data_fn_t
+      encoder_set_config,          // vpx_codec_enc_config_set_fn_t
+      encoder_get_global_headers,  // vpx_codec_get_global_headers_fn_t
+      encoder_get_preview,         // vpx_codec_get_preview_frame_fn_t
+      NULL,                        // vpx_codec_enc_mr_get_mem_loc_fn_t
+      NULL                         // vpx_codec_enc_mr_free_mem_loc_fn_t
   }
 };
+
+static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height,
+                                       vpx_rational_t frame_rate,
+                                       int target_bitrate,
+                                       vpx_enc_pass enc_pass) {
+  vpx_codec_enc_cfg_t enc_cfg = encoder_usage_cfg_map[0].cfg;
+  enc_cfg.g_w = frame_width;
+  enc_cfg.g_h = frame_height;
+  enc_cfg.rc_target_bitrate = target_bitrate;
+  enc_cfg.g_pass = enc_pass;
+  // g_timebase is the inverse of frame_rate
+  enc_cfg.g_timebase.num = frame_rate.den;
+  enc_cfg.g_timebase.den = frame_rate.num;
+  return enc_cfg;
+}
+
+static vp9_extracfg get_extra_cfg(void) {
+  vp9_extracfg extra_cfg = default_extra_cfg;
+  return extra_cfg;
+}
+
+VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
+                                        vpx_rational_t frame_rate,
+                                        int target_bitrate, int encode_speed,
+                                        int target_level,
+                                        vpx_enc_pass enc_pass) {
+  /* This function will generate the same VP9EncoderConfig used by the
+   * vpxenc command given below.
+   * The configs in the vpxenc command corresponds to parameters of
+   * vp9_get_encoder_config() as follows.
+   *
+   * WIDTH:   frame_width
+   * HEIGHT:  frame_height
+   * FPS:     frame_rate
+   * BITRATE: target_bitrate
+   * CPU_USED:encode_speed
+   * TARGET_LEVEL: target_level
+   *
+   * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig
+   *
+   * vpxenc command:
+   * INPUT=bus_cif.y4m
+   * OUTPUT=output.webm
+   * WIDTH=352
+   * HEIGHT=288
+   * BITRATE=600
+   * FPS=30/1
+   * LIMIT=150
+   * CPU_USED=0
+   * TARGET_LEVEL=0
+   * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS
+   * --lag-in-frames=25 \
+   *  --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \
+   *  --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=150 \
+   *  --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 \
+   *  --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \
+   *  --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \
+   *  --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \
+   *  --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT
+   */
+
+  VP9EncoderConfig oxcf;
+  vp9_extracfg extra_cfg = get_extra_cfg();
+  vpx_codec_enc_cfg_t enc_cfg = get_enc_cfg(
+      frame_width, frame_height, frame_rate, target_bitrate, enc_pass);
+  set_encoder_config(&oxcf, &enc_cfg, &extra_cfg);
+
+  // These settings are made to match the settings of the vpxenc command.
+  oxcf.key_freq = 150;
+  oxcf.under_shoot_pct = 100;
+  oxcf.over_shoot_pct = 100;
+  oxcf.max_threads = 0;
+  oxcf.tile_columns = 0;
+  oxcf.frame_parallel_decoding_mode = 0;
+  oxcf.two_pass_vbrmax_section = 150;
+  oxcf.speed = abs(encode_speed);
+  oxcf.target_level = target_level;
+  return oxcf;
+}
+
+#define DUMP_STRUCT_VALUE(fp, structure, value) \
+  fprintf(fp, #value " %" PRId64 "\n", (int64_t)(structure)->value)
+
+void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) {
+  DUMP_STRUCT_VALUE(fp, oxcf, profile);
+  DUMP_STRUCT_VALUE(fp, oxcf, bit_depth);
+  DUMP_STRUCT_VALUE(fp, oxcf, width);
+  DUMP_STRUCT_VALUE(fp, oxcf, height);
+  DUMP_STRUCT_VALUE(fp, oxcf, input_bit_depth);
+  DUMP_STRUCT_VALUE(fp, oxcf, init_framerate);
+  // TODO(angiebird): dump g_timebase
+  // TODO(angiebird): dump g_timebase_in_ts
+
+  DUMP_STRUCT_VALUE(fp, oxcf, target_bandwidth);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, noise_sensitivity);
+  DUMP_STRUCT_VALUE(fp, oxcf, sharpness);
+  DUMP_STRUCT_VALUE(fp, oxcf, speed);
+  DUMP_STRUCT_VALUE(fp, oxcf, rc_max_intra_bitrate_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, rc_max_inter_bitrate_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, gf_cbr_boost_pct);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, pass);
+
+  // Key Framing Operations
+  DUMP_STRUCT_VALUE(fp, oxcf, auto_key);
+  DUMP_STRUCT_VALUE(fp, oxcf, key_freq);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, lag_in_frames);
+
+  // ----------------------------------------------------------------
+  // DATARATE CONTROL OPTIONS
+
+  // vbr, cbr, constrained quality or constant quality
+  DUMP_STRUCT_VALUE(fp, oxcf, rc_mode);
+
+  // buffer targeting aggressiveness
+  DUMP_STRUCT_VALUE(fp, oxcf, under_shoot_pct);
+  DUMP_STRUCT_VALUE(fp, oxcf, over_shoot_pct);
+
+  // buffering parameters
+  // TODO(angiebird): dump tarting_buffer_level_ms
+  // TODO(angiebird): dump ptimal_buffer_level_ms
+  // TODO(angiebird): dump maximum_buffer_size_ms
+
+  // Frame drop threshold.
+  DUMP_STRUCT_VALUE(fp, oxcf, drop_frames_water_mark);
+
+  // controlling quality
+  DUMP_STRUCT_VALUE(fp, oxcf, fixed_q);
+  DUMP_STRUCT_VALUE(fp, oxcf, worst_allowed_q);
+  DUMP_STRUCT_VALUE(fp, oxcf, best_allowed_q);
+  DUMP_STRUCT_VALUE(fp, oxcf, cq_level);
+  DUMP_STRUCT_VALUE(fp, oxcf, aq_mode);
+
+  // Special handling of Adaptive Quantization for AltRef frames
+  DUMP_STRUCT_VALUE(fp, oxcf, alt_ref_aq);
+
+  // Internal frame size scaling.
+  DUMP_STRUCT_VALUE(fp, oxcf, resize_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_width);
+  DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_height);
+
+  // Enable feature to reduce the frame quantization every x frames.
+  DUMP_STRUCT_VALUE(fp, oxcf, frame_periodic_boost);
+
+  // two pass datarate control
+  DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrbias);
+  DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmin_section);
+  DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmax_section);
+  DUMP_STRUCT_VALUE(fp, oxcf, vbr_corpus_complexity);
+  // END DATARATE CONTROL OPTIONS
+  // ----------------------------------------------------------------
+
+  // Spatial and temporal scalability.
+  DUMP_STRUCT_VALUE(fp, oxcf, ss_number_layers);
+  DUMP_STRUCT_VALUE(fp, oxcf, ts_number_layers);
+
+  // Bitrate allocation for spatial layers.
+  // TODO(angiebird): dump layer_target_bitrate[VPX_MAX_LAYERS]
+  // TODO(angiebird): dump ss_target_bitrate[VPX_SS_MAX_LAYERS]
+  // TODO(angiebird): dump ss_enable_auto_arf[VPX_SS_MAX_LAYERS]
+  // TODO(angiebird): dump ts_rate_decimator[VPX_TS_MAX_LAYERS]
+
+  DUMP_STRUCT_VALUE(fp, oxcf, enable_auto_arf);
+  DUMP_STRUCT_VALUE(fp, oxcf, encode_breakout);
+  DUMP_STRUCT_VALUE(fp, oxcf, error_resilient_mode);
+  DUMP_STRUCT_VALUE(fp, oxcf, frame_parallel_decoding_mode);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, arnr_max_frames);
+  DUMP_STRUCT_VALUE(fp, oxcf, arnr_strength);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, min_gf_interval);
+  DUMP_STRUCT_VALUE(fp, oxcf, max_gf_interval);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, tile_columns);
+  DUMP_STRUCT_VALUE(fp, oxcf, tile_rows);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, enable_keyframe_filtering);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, max_threads);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, target_level);
+
+  // TODO(angiebird): dump two_pass_stats_in
+  DUMP_STRUCT_VALUE(fp, oxcf, tuning);
+  DUMP_STRUCT_VALUE(fp, oxcf, content);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DUMP_STRUCT_VALUE(fp, oxcf, use_highbitdepth);
+#endif
+  DUMP_STRUCT_VALUE(fp, oxcf, color_space);
+  DUMP_STRUCT_VALUE(fp, oxcf, color_range);
+  DUMP_STRUCT_VALUE(fp, oxcf, render_width);
+  DUMP_STRUCT_VALUE(fp, oxcf, render_height);
+  DUMP_STRUCT_VALUE(fp, oxcf, temporal_layering_mode);
+
+  DUMP_STRUCT_VALUE(fp, oxcf, row_mt);
+  DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test);
+  DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv);
+}
+
+FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) {
+  FRAME_INFO frame_info;
+  int dummy;
+  frame_info.frame_width = oxcf->width;
+  frame_info.frame_height = oxcf->height;
+  frame_info.render_frame_width = oxcf->width;
+  frame_info.render_frame_height = oxcf->height;
+  frame_info.bit_depth = oxcf->bit_depth;
+  vp9_set_mi_size(&frame_info.mi_rows, &frame_info.mi_cols, &dummy,
+                  frame_info.frame_width, frame_info.frame_height);
+  vp9_set_mb_size(&frame_info.mb_rows, &frame_info.mb_cols, &frame_info.num_mbs,
+                  frame_info.mi_rows, frame_info.mi_cols);
+  // TODO(angiebird): Figure out how to get subsampling_x/y here
+  return frame_info;
+}
+
+void vp9_set_first_pass_stats(VP9EncoderConfig *oxcf,
+                              const vpx_fixed_buf_t *stats) {
+  oxcf->two_pass_stats_in = *stats;
+}
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.h b/media/libvpx/libvpx/vp9/vp9_cx_iface.h
new file mode 100644
index 0000000000..f2de8507ff
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_VP9_CX_IFACE_H_
+#define VPX_VP9_VP9_CX_IFACE_H_
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
+                                        vpx_rational_t frame_rate,
+                                        int target_bitrate, int encode_speed,
+                                        int target_level,
+                                        vpx_enc_pass enc_pass);
+
+void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp);
+
+FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf);
+
+static INLINE int64_t
+timebase_units_to_ticks(const vpx_rational64_t *timestamp_ratio, int64_t n) {
+  return n * timestamp_ratio->num / timestamp_ratio->den;
+}
+
+static INLINE int64_t
+ticks_to_timebase_units(const vpx_rational64_t *timestamp_ratio, int64_t n) {
+  int64_t round = timestamp_ratio->num / 2;
+  if (round > 0) --round;
+  return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
+}
+
+void vp9_set_first_pass_stats(VP9EncoderConfig *oxcf,
+                              const vpx_fixed_buf_t *stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_VP9_CX_IFACE_H_
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
index 0a3e84a0da..b6eeee008d 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
@@ -19,7 +19,6 @@
 #include "vpx/vpx_decoder.h"
 #include "vpx_dsp/bitreader_buffer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_frame_buffers.h"
@@ -47,12 +46,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
     ctx->priv->init_flags = ctx->init_flags;
     priv->si.sz = sizeof(priv->si);
     priv->flushed = 0;
-    // Only do frame parallel decode when threads > 1.
-    priv->frame_parallel_decode =
-        (ctx->config.dec && (ctx->config.dec->threads > 1) &&
-         (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING))
-            ? 1
-            : 0;
     if (ctx->config.dec) {
       priv->cfg = *ctx->config.dec;
       ctx->config.dec = &priv->cfg;
@@ -63,33 +56,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
 }
 
 static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
-  if (ctx->frame_workers != NULL) {
-    int i;
-    // Shutdown all threads before reclaiming any memory. The frame-level
-    // parallel decoder may access data from another worker.
-    for (i = 0; i < ctx->num_frame_workers; ++i) {
-      VPxWorker *const worker = &ctx->frame_workers[i];
-      vpx_get_worker_interface()->end(worker);
-    }
-    for (i = 0; i < ctx->num_frame_workers; ++i) {
-      VPxWorker *const worker = &ctx->frame_workers[i];
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      vp9_remove_common(&frame_worker_data->pbi->common);
-#if CONFIG_VP9_POSTPROC
-      vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
-#endif
-      vp9_decoder_remove(frame_worker_data->pbi);
-      vpx_free(frame_worker_data->scratch_buffer);
-#if CONFIG_MULTITHREAD
-      pthread_mutex_destroy(&frame_worker_data->stats_mutex);
-      pthread_cond_destroy(&frame_worker_data->stats_cond);
-#endif
-      vpx_free(frame_worker_data);
-    }
-#if CONFIG_MULTITHREAD
-    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
-#endif
+  if (ctx->pbi != NULL) {
+    vp9_decoder_remove(ctx->pbi);
   }
 
   if (ctx->buffer_pool) {
@@ -97,7 +65,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
     vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
   }
 
-  vpx_free(ctx->frame_workers);
   vpx_free(ctx->buffer_pool);
   vpx_free(ctx);
   return VPX_CODEC_OK;
@@ -129,7 +96,7 @@ static vpx_codec_err_t decoder_peek_si_internal(
     const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si,
     int *is_intra_only, vpx_decrypt_cb decrypt_cb, void *decrypt_state) {
   int intra_only_flag = 0;
-  uint8_t clear_buffer[10];
+  uint8_t clear_buffer[11];
 
   if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM;
 
@@ -190,6 +157,9 @@ static vpx_codec_err_t decoder_peek_si_internal(
         if (profile > PROFILE_0) {
           if (!parse_bitdepth_colorspace_sampling(profile, &rb))
             return VPX_CODEC_UNSUP_BITSTREAM;
+          // The colorspace info may cause vp9_read_frame_size() to need 11
+          // bytes.
+          if (data_sz < 11) return VPX_CODEC_UNSUP_BITSTREAM;
         }
         rb.bit_offset += REF_FRAMES;  // refresh_frame_flags
         vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
@@ -230,34 +200,32 @@ static vpx_codec_err_t update_error_state(
   return error->error_code;
 }
 
-static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
-  int i;
+static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
+  VP9_COMMON *const cm = &ctx->pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
 
-  for (i = 0; i < ctx->num_frame_workers; ++i) {
-    VPxWorker *const worker = &ctx->frame_workers[i];
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
-    BufferPool *const pool = cm->buffer_pool;
+  cm->new_fb_idx = INVALID_IDX;
+  cm->byte_alignment = ctx->byte_alignment;
+  cm->skip_loop_filter = ctx->skip_loop_filter;
 
-    cm->new_fb_idx = INVALID_IDX;
-    cm->byte_alignment = ctx->byte_alignment;
-    cm->skip_loop_filter = ctx->skip_loop_filter;
+  if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+    pool->get_fb_cb = ctx->get_ext_fb_cb;
+    pool->release_fb_cb = ctx->release_ext_fb_cb;
+    pool->cb_priv = ctx->ext_priv;
+  } else {
+    pool->get_fb_cb = vp9_get_frame_buffer;
+    pool->release_fb_cb = vp9_release_frame_buffer;
 
-    if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
-      pool->get_fb_cb = ctx->get_ext_fb_cb;
-      pool->release_fb_cb = ctx->release_ext_fb_cb;
-      pool->cb_priv = ctx->ext_priv;
-    } else {
-      pool->get_fb_cb = vp9_get_frame_buffer;
-      pool->release_fb_cb = vp9_release_frame_buffer;
-
-      if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
-        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to initialize internal frame buffers");
-
-      pool->cb_priv = &pool->int_frame_buffers;
+    if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) {
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to initialize internal frame buffers");
+      return VPX_CODEC_MEM_ERROR;
     }
+
+    pool->cb_priv = &pool->int_frame_buffers;
   }
+
+  return VPX_CODEC_OK;
 }
 
 static void set_default_ppflags(vp8_postproc_cfg_t *cfg) {
@@ -273,138 +241,62 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) {
   flags->noise_level = ctx->postproc_cfg.noise_level;
 }
 
-static int frame_worker_hook(void *arg1, void *arg2) {
-  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
-  const uint8_t *data = frame_worker_data->data;
-  (void)arg2;
+#undef ERROR
+#define ERROR(str)                  \
+  do {                              \
+    ctx->base.err_detail = str;     \
+    return VPX_CODEC_INVALID_PARAM; \
+  } while (0)
 
-  frame_worker_data->result = vp9_receive_compressed_data(
-      frame_worker_data->pbi, frame_worker_data->data_size, &data);
-  frame_worker_data->data_end = data;
-
-  if (frame_worker_data->pbi->frame_parallel_decode) {
-    // In frame parallel decoding, a worker thread must successfully decode all
-    // the compressed data.
-    if (frame_worker_data->result != 0 ||
-        frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
-      VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner;
-      BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
-      // Signal all the other threads that are waiting for this frame.
-      vp9_frameworker_lock_stats(worker);
-      frame_worker_data->frame_context_ready = 1;
-      lock_buffer_pool(pool);
-      frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
-      unlock_buffer_pool(pool);
-      frame_worker_data->pbi->need_resync = 1;
-      vp9_frameworker_signal_stats(worker);
-      vp9_frameworker_unlock_stats(worker);
-      return 0;
-    }
-  } else if (frame_worker_data->result != 0) {
-    // Check decode result in serial decode.
-    frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
-    frame_worker_data->pbi->need_resync = 1;
-  }
-  return !frame_worker_data->result;
-}
+#define RANGE_CHECK(p, memb, lo, hi)                                     \
+  do {                                                                   \
+    if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+      ERROR(#memb " out of range [" #lo ".." #hi "]");                   \
+  } while (0)
 
 static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
-  int i;
-  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-
+  vpx_codec_err_t res;
   ctx->last_show_frame = -1;
-  ctx->next_submit_worker_id = 0;
-  ctx->last_submit_worker_id = 0;
-  ctx->next_output_worker_id = 0;
-  ctx->frame_cache_read = 0;
-  ctx->frame_cache_write = 0;
-  ctx->num_cache_frames = 0;
   ctx->need_resync = 1;
-  ctx->num_frame_workers =
-      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
-  if (ctx->num_frame_workers > MAX_DECODE_THREADS)
-    ctx->num_frame_workers = MAX_DECODE_THREADS;
-  ctx->available_threads = ctx->num_frame_workers;
   ctx->flushed = 0;
 
   ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
   if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR;
 
-#if CONFIG_MULTITHREAD
-  if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
-    set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+  ctx->pbi = vp9_decoder_create(ctx->buffer_pool);
+  if (ctx->pbi == NULL) {
+    vpx_free(ctx->buffer_pool);
+    ctx->buffer_pool = NULL;
+    set_error_detail(ctx, "Failed to allocate decoder");
     return VPX_CODEC_MEM_ERROR;
   }
-#endif
+  ctx->pbi->max_threads = ctx->cfg.threads;
+  ctx->pbi->inv_tile_order = ctx->invert_tile_order;
 
-  ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers *
-                                               sizeof(*ctx->frame_workers));
-  if (ctx->frame_workers == NULL) {
-    set_error_detail(ctx, "Failed to allocate frame_workers");
-    return VPX_CODEC_MEM_ERROR;
-  }
+  RANGE_CHECK(ctx, row_mt, 0, 1);
+  ctx->pbi->row_mt = ctx->row_mt;
 
-  for (i = 0; i < ctx->num_frame_workers; ++i) {
-    VPxWorker *const worker = &ctx->frame_workers[i];
-    FrameWorkerData *frame_worker_data = NULL;
-    winterface->init(worker);
-    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
-    if (worker->data1 == NULL) {
-      set_error_detail(ctx, "Failed to allocate frame_worker_data");
-      return VPX_CODEC_MEM_ERROR;
-    }
-    frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
-    if (frame_worker_data->pbi == NULL) {
-      set_error_detail(ctx, "Failed to allocate frame_worker_data");
-      return VPX_CODEC_MEM_ERROR;
-    }
-    frame_worker_data->pbi->frame_worker_owner = worker;
-    frame_worker_data->worker_id = i;
-    frame_worker_data->scratch_buffer = NULL;
-    frame_worker_data->scratch_buffer_size = 0;
-    frame_worker_data->frame_context_ready = 0;
-    frame_worker_data->received_frame = 0;
-#if CONFIG_MULTITHREAD
-    if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
-      set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
-      return VPX_CODEC_MEM_ERROR;
-    }
-
-    if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
-      set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
-      return VPX_CODEC_MEM_ERROR;
-    }
-#endif
-    // If decoding in serial mode, FrameWorker thread could create tile worker
-    // thread or loopfilter thread.
-    frame_worker_data->pbi->max_threads =
-        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
-
-    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
-    frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
-    frame_worker_data->pbi->common.frame_parallel_decode =
-        ctx->frame_parallel_decode;
-    worker->hook = (VPxWorkerHook)frame_worker_hook;
-    if (!winterface->reset(worker)) {
-      set_error_detail(ctx, "Frame Worker thread creation failed");
-      return VPX_CODEC_MEM_ERROR;
-    }
-  }
+  RANGE_CHECK(ctx, lpf_opt, 0, 1);
+  ctx->pbi->lpf_mt_opt = ctx->lpf_opt;
 
   // If postprocessing was enabled by the application and a
   // configuration has not been provided, default it.
   if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
     set_default_ppflags(&ctx->postproc_cfg);
 
-  init_buffer_callbacks(ctx);
-
-  return VPX_CODEC_OK;
+  res = init_buffer_callbacks(ctx);
+  if (res != VPX_CODEC_OK) {
+    vpx_free(ctx->buffer_pool);
+    ctx->buffer_pool = NULL;
+    vp9_decoder_remove(ctx->pbi);
+    ctx->pbi = NULL;
+  }
+  return res;
 }
 
 static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
                                 const VP9Decoder *const pbi) {
-  // Clear resync flag if worker got a key frame or intra only frame.
+  // Clear resync flag if the decoder got a key frame or intra only frame.
   if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
       (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
     ctx->need_resync = 0;
@@ -412,10 +304,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
 
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                   const uint8_t **data, unsigned int data_sz,
-                                  void *user_priv, int64_t deadline) {
-  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  (void)deadline;
-
+                                  void *user_priv) {
   // Determine the stream parameters. Note that we rely on peek_si to
   // validate that we have a buffer that does not wrap around the top
   // of the heap.
@@ -429,108 +318,29 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
     if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR;
   }
 
-  if (!ctx->frame_parallel_decode) {
-    VPxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->data = *data;
-    frame_worker_data->data_size = data_sz;
-    frame_worker_data->user_priv = user_priv;
-    frame_worker_data->received_frame = 1;
+  ctx->user_priv = user_priv;
 
-    // Set these even if already initialized.  The caller may have changed the
-    // decrypt config between frames.
-    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
-    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+  // Set these even if already initialized.  The caller may have changed the
+  // decrypt config between frames.
+  ctx->pbi->decrypt_cb = ctx->decrypt_cb;
+  ctx->pbi->decrypt_state = ctx->decrypt_state;
 
-    worker->had_error = 0;
-    winterface->execute(worker);
-
-    // Update data pointer after decode.
-    *data = frame_worker_data->data_end;
-
-    if (worker->had_error)
-      return update_error_state(ctx, &frame_worker_data->pbi->common.error);
-
-    check_resync(ctx, frame_worker_data->pbi);
-  } else {
-    VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    // Copy context from last worker thread to next worker thread.
-    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
-      vp9_frameworker_copy_context(
-          &ctx->frame_workers[ctx->next_submit_worker_id],
-          &ctx->frame_workers[ctx->last_submit_worker_id]);
-
-    frame_worker_data->pbi->ready_for_new_data = 0;
-    // Copy the compressed data into worker's internal buffer.
-    // TODO(hkuang): Will all the workers allocate the same size
-    // as the size of the first intra frame be better? This will
-    // avoid too many deallocate and allocate.
-    if (frame_worker_data->scratch_buffer_size < data_sz) {
-      vpx_free(frame_worker_data->scratch_buffer);
-      frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz);
-      if (frame_worker_data->scratch_buffer == NULL) {
-        set_error_detail(ctx, "Failed to reallocate scratch buffer");
-        return VPX_CODEC_MEM_ERROR;
-      }
-      frame_worker_data->scratch_buffer_size = data_sz;
-    }
-    frame_worker_data->data_size = data_sz;
-    memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
-
-    frame_worker_data->frame_decoded = 0;
-    frame_worker_data->frame_context_ready = 0;
-    frame_worker_data->received_frame = 1;
-    frame_worker_data->data = frame_worker_data->scratch_buffer;
-    frame_worker_data->user_priv = user_priv;
-
-    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
-      ctx->last_submit_worker_id =
-          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
-
-    ctx->next_submit_worker_id =
-        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
-    --ctx->available_threads;
-    worker->had_error = 0;
-    winterface->launch(worker);
+  if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) {
+    ctx->pbi->cur_buf->buf.corrupted = 1;
+    ctx->pbi->need_resync = 1;
+    ctx->need_resync = 1;
+    return update_error_state(ctx, &ctx->pbi->common.error);
   }
 
+  check_resync(ctx, ctx->pbi);
+
   return VPX_CODEC_OK;
 }
 
-static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
-  YV12_BUFFER_CONFIG sd;
-  vp9_ppflags_t flags = { 0, 0, 0 };
-  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
-  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-  ctx->next_output_worker_id =
-      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
-  // TODO(hkuang): Add worker error handling here.
-  winterface->sync(worker);
-  frame_worker_data->received_frame = 0;
-  ++ctx->available_threads;
-
-  check_resync(ctx, frame_worker_data->pbi);
-
-  if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
-    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
-    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
-    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
-                    frame_worker_data->user_priv);
-    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
-        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-    ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
-    ++ctx->num_cache_frames;
-  }
-}
-
 static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                       const uint8_t *data, unsigned int data_sz,
-                                      void *user_priv, long deadline) {
+                                      void *user_priv) {
   const uint8_t *data_start = data;
-  const uint8_t *const data_end = data + data_sz;
   vpx_codec_err_t res;
   uint32_t frame_sizes[8];
   int frame_count;
@@ -543,9 +353,9 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
   // Reset flushed when receiving a valid frame.
   ctx->flushed = 0;
 
-  // Initialize the decoder workers on the first frame.
-  if (ctx->frame_workers == NULL) {
-    const vpx_codec_err_t res = init_decoder(ctx);
+  // Initialize the decoder on the first frame.
+  if (ctx->pbi == NULL) {
+    res = init_decoder(ctx);
     if (res != VPX_CODEC_OK) return res;
   }
 
@@ -556,91 +366,37 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
   if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1)
     frame_count = ctx->svc_spatial_layer + 1;
 
-  if (ctx->frame_parallel_decode) {
-    // Decode in frame parallel mode. When decoding in this mode, the frame
-    // passed to the decoder must be either a normal frame or a superframe with
-    // superframe index so the decoder could get each frame's start position
-    // in the superframe.
-    if (frame_count > 0) {
-      int i;
+  // Decode in serial mode.
+  if (frame_count > 0) {
+    const uint8_t *const data_end = data + data_sz;
+    int i;
 
-      for (i = 0; i < frame_count; ++i) {
-        const uint8_t *data_start_copy = data_start;
-        const uint32_t frame_size = frame_sizes[i];
-        if (data_start < data ||
-            frame_size > (uint32_t)(data_end - data_start)) {
-          set_error_detail(ctx, "Invalid frame size in index");
-          return VPX_CODEC_CORRUPT_FRAME;
-        }
-
-        if (ctx->available_threads == 0) {
-          // No more threads for decoding. Wait until the next output worker
-          // finishes decoding. Then copy the decoded frame into cache.
-          if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
-            wait_worker_and_cache_frame(ctx);
-          } else {
-            // TODO(hkuang): Add unit test to test this path.
-            set_error_detail(ctx, "Frame output cache is full.");
-            return VPX_CODEC_ERROR;
-          }
-        }
-
-        res =
-            decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
-        if (res != VPX_CODEC_OK) return res;
-        data_start += frame_size;
-      }
-    } else {
-      if (ctx->available_threads == 0) {
-        // No more threads for decoding. Wait until the next output worker
-        // finishes decoding. Then copy the decoded frame into cache.
-        if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
-          wait_worker_and_cache_frame(ctx);
-        } else {
-          // TODO(hkuang): Add unit test to test this path.
-          set_error_detail(ctx, "Frame output cache is full.");
-          return VPX_CODEC_ERROR;
-        }
+    for (i = 0; i < frame_count; ++i) {
+      const uint8_t *data_start_copy = data_start;
+      const uint32_t frame_size = frame_sizes[i];
+      if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) {
+        set_error_detail(ctx, "Invalid frame size in index");
+        return VPX_CODEC_CORRUPT_FRAME;
       }
 
-      res = decode_one(ctx, &data, data_sz, user_priv, deadline);
+      res = decode_one(ctx, &data_start_copy, frame_size, user_priv);
       if (res != VPX_CODEC_OK) return res;
+
+      data_start += frame_size;
     }
   } else {
-    // Decode in serial mode.
-    if (frame_count > 0) {
-      int i;
+    const uint8_t *const data_end = data + data_sz;
+    while (data_start < data_end) {
+      const uint32_t frame_size = (uint32_t)(data_end - data_start);
+      res = decode_one(ctx, &data_start, frame_size, user_priv);
+      if (res != VPX_CODEC_OK) return res;
 
-      for (i = 0; i < frame_count; ++i) {
-        const uint8_t *data_start_copy = data_start;
-        const uint32_t frame_size = frame_sizes[i];
-        vpx_codec_err_t res;
-        if (data_start < data ||
-            frame_size > (uint32_t)(data_end - data_start)) {
-          set_error_detail(ctx, "Invalid frame size in index");
-          return VPX_CODEC_CORRUPT_FRAME;
-        }
-
-        res =
-            decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline);
-        if (res != VPX_CODEC_OK) return res;
-
-        data_start += frame_size;
-      }
-    } else {
+      // Account for suboptimal termination by the encoder.
       while (data_start < data_end) {
-        const uint32_t frame_size = (uint32_t)(data_end - data_start);
-        const vpx_codec_err_t res =
-            decode_one(ctx, &data_start, frame_size, user_priv, deadline);
-        if (res != VPX_CODEC_OK) return res;
-
-        // Account for suboptimal termination by the encoder.
-        while (data_start < data_end) {
-          const uint8_t marker =
-              read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start);
-          if (marker) break;
-          ++data_start;
-        }
+        const uint8_t marker =
+            read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start);
+        if (marker) break;
+        ++data_start;
       }
     }
   }
@@ -648,80 +404,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
   return res;
 }
 
-static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
-  RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
-  // Decrease reference count of last output frame in frame parallel mode.
-  if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
-    BufferPool *const pool = ctx->buffer_pool;
-    lock_buffer_pool(pool);
-    decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
-    unlock_buffer_pool(pool);
-  }
-}
-
 static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
                                       vpx_codec_iter_t *iter) {
   vpx_image_t *img = NULL;
 
-  // Only return frame when all the cpu are busy or
-  // application fluhsed the decoder in frame parallel decode.
-  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
-      !ctx->flushed) {
-    return NULL;
-  }
+  // Legacy parameter carried over from VP8. Has no effect for VP9 since we
+  // always return only 1 frame per decode call.
+  (void)iter;
 
-  // Output the frames in the cache first.
-  if (ctx->num_cache_frames > 0) {
-    release_last_output_frame(ctx);
-    ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
-    if (ctx->need_resync) return NULL;
-    img = &ctx->frame_cache[ctx->frame_cache_read].img;
-    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
-    --ctx->num_cache_frames;
-    return img;
-  }
-
-  // iter acts as a flip flop, so an image is only returned on the first
-  // call to get_frame.
-  if (*iter == NULL && ctx->frame_workers != NULL) {
-    do {
-      YV12_BUFFER_CONFIG sd;
-      vp9_ppflags_t flags = { 0, 0, 0 };
-      const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-      VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      ctx->next_output_worker_id =
-          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
-      if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
-        set_ppflags(ctx, &flags);
-      // Wait for the frame from worker thread.
-      if (winterface->sync(worker)) {
-        // Check if worker has received any frames.
-        if (frame_worker_data->received_frame == 1) {
-          ++ctx->available_threads;
-          frame_worker_data->received_frame = 0;
-          check_resync(ctx, frame_worker_data->pbi);
-        }
-        if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
-          VP9_COMMON *const cm = &frame_worker_data->pbi->common;
-          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
-          release_last_output_frame(ctx);
-          ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
-          if (ctx->need_resync) return NULL;
-          yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
-          ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
-          img = &ctx->img;
-          return img;
-        }
-      } else {
-        // Decoding failed. Release the worker thread.
-        frame_worker_data->received_frame = 0;
-        ++ctx->available_threads;
-        ctx->need_resync = 1;
-        if (ctx->flushed != 1) return NULL;
-      }
-    } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+  if (ctx->pbi != NULL) {
+    YV12_BUFFER_CONFIG sd;
+    vp9_ppflags_t flags = { 0, 0, 0 };
+    if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags);
+    if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
+      VP9_COMMON *const cm = &ctx->pbi->common;
+      RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+      ctx->last_show_frame = ctx->pbi->common.new_fb_idx;
+      if (ctx->need_resync) return NULL;
+      yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
+      ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+      img = &ctx->img;
+      return img;
+    }
   }
   return NULL;
 }
@@ -731,7 +435,7 @@ static vpx_codec_err_t decoder_set_fb_fn(
     vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
   if (cb_get == NULL || cb_release == NULL) {
     return VPX_CODEC_INVALID_PARAM;
-  } else if (ctx->frame_workers == NULL) {
+  } else if (ctx->pbi == NULL) {
     // If the decoder has already been initialized, do not accept changes to
     // the frame buffer functions.
     ctx->get_ext_fb_cb = cb_get;
@@ -747,21 +451,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
   vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (data) {
     vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
     YV12_BUFFER_CONFIG sd;
-    VPxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_set_reference_dec(&frame_worker_data->pbi->common,
-                                 ref_frame_to_vp9_reframe(frame->frame_type),
-                                 &sd);
+    return vp9_set_reference_dec(
+        &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
@@ -771,20 +466,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
                                            va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (data) {
     vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
     YV12_BUFFER_CONFIG sd;
-    VPxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
-    return vp9_copy_reference_dec(frame_worker_data->pbi,
-                                  (VP9_REFFRAME)frame->frame_type, &sd);
+    return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type,
+                                  &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
@@ -794,20 +481,16 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (data) {
-    YV12_BUFFER_CONFIG *fb;
-    VPxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
-    if (fb == NULL) return VPX_CODEC_ERROR;
-    yuvconfig2image(&data->img, fb, NULL);
-    return VPX_CODEC_OK;
+    if (ctx->pbi) {
+      const int fb_idx = ctx->pbi->common.cur_show_frame_fb_idx;
+      YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->pbi->common, fb_idx);
+      if (fb == NULL) return VPX_CODEC_ERROR;
+      yuvconfig2image(&data->img, fb, NULL);
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
@@ -832,22 +515,21 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
 #endif
 }
 
+static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = ctx->pbi->common.base_qindex;
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
                                                  va_list args) {
   int *const update_info = va_arg(args, int *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (update_info) {
-    if (ctx->frame_workers) {
-      VPxWorker *const worker = ctx->frame_workers;
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      *update_info = frame_worker_data->pbi->refresh_frame_flags;
+    if (ctx->pbi != NULL) {
+      *update_info = ctx->pbi->refresh_frame_flags;
       return VPX_CODEC_OK;
     } else {
       return VPX_CODEC_ERROR;
@@ -862,14 +544,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
   int *corrupted = va_arg(args, int *);
 
   if (corrupted) {
-    if (ctx->frame_workers) {
-      VPxWorker *const worker = ctx->frame_workers;
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      RefCntBuffer *const frame_bufs =
-          frame_worker_data->pbi->common.buffer_pool->frame_bufs;
-      if (frame_worker_data->pbi->common.frame_to_show == NULL)
-        return VPX_CODEC_ERROR;
+    if (ctx->pbi != NULL) {
+      RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs;
+      if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR;
       if (ctx->last_show_frame >= 0)
         *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
       return VPX_CODEC_OK;
@@ -885,18 +562,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
                                            va_list args) {
   int *const frame_size = va_arg(args, int *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (frame_size) {
-    if (ctx->frame_workers) {
-      VPxWorker *const worker = ctx->frame_workers;
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    if (ctx->pbi != NULL) {
+      const VP9_COMMON *const cm = &ctx->pbi->common;
       frame_size[0] = cm->width;
       frame_size[1] = cm->height;
       return VPX_CODEC_OK;
@@ -912,18 +580,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
                                             va_list args) {
   int *const render_size = va_arg(args, int *);
 
-  // Only support this function in serial decode.
-  if (ctx->frame_parallel_decode) {
-    set_error_detail(ctx, "Not supported in frame parallel decode");
-    return VPX_CODEC_INCAPABLE;
-  }
-
   if (render_size) {
-    if (ctx->frame_workers) {
-      VPxWorker *const worker = ctx->frame_workers;
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    if (ctx->pbi != NULL) {
+      const VP9_COMMON *const cm = &ctx->pbi->common;
       render_size[0] = cm->render_width;
       render_size[1] = cm->render_height;
       return VPX_CODEC_OK;
@@ -938,13 +597,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx,
 static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
   unsigned int *const bit_depth = va_arg(args, unsigned int *);
-  VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
 
   if (bit_depth) {
-    if (worker) {
-      FrameWorkerData *const frame_worker_data =
-          (FrameWorkerData *)worker->data1;
-      const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+    if (ctx->pbi != NULL) {
+      const VP9_COMMON *const cm = &ctx->pbi->common;
       *bit_depth = cm->bit_depth;
       return VPX_CODEC_OK;
     } else {
@@ -983,10 +639,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
     return VPX_CODEC_INVALID_PARAM;
 
   ctx->byte_alignment = byte_alignment;
-  if (ctx->frame_workers) {
-    VPxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+  if (ctx->pbi != NULL) {
+    ctx->pbi->common.byte_alignment = byte_alignment;
   }
   return VPX_CODEC_OK;
 }
@@ -995,10 +649,8 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
                                                  va_list args) {
   ctx->skip_loop_filter = va_arg(args, int);
 
-  if (ctx->frame_workers) {
-    VPxWorker *const worker = ctx->frame_workers;
-    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
-    frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+  if (ctx->pbi != NULL) {
+    ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
   }
 
   return VPX_CODEC_OK;
@@ -1014,6 +666,20 @@ static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx,
     return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  ctx->row_mt = va_arg(args, int);
+
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  ctx->lpf_opt = va_arg(args, int);
+
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { VP8_COPY_REFERENCE, ctrl_copy_reference },
 
@@ -1025,8 +691,11 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
   { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
   { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc },
+  { VP9D_SET_ROW_MT, ctrl_set_row_mt },
+  { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt },
 
   // Getters
+  { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer },
   { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
   { VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
   { VP9_GET_REFERENCE, ctrl_get_reference },
@@ -1043,7 +712,10 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
 CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   "WebM Project VP9 Decoder" VERSION_STRING,
   VPX_CODEC_INTERNAL_ABI_VERSION,
-  VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
+      VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
       VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER,  // vpx_codec_caps_t
   decoder_init,                             // vpx_codec_init_fn_t
   decoder_destroy,                          // vpx_codec_destroy_fn_t
@@ -1065,6 +737,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
       NULL,  // vpx_codec_enc_config_set_fn_t
       NULL,  // vpx_codec_get_global_headers_fn_t
       NULL,  // vpx_codec_get_preview_frame_fn_t
-      NULL   // vpx_codec_enc_mr_get_mem_loc_fn_t
+      NULL,  // vpx_codec_enc_mr_get_mem_loc_fn_t
+      NULL   // vpx_codec_enc_mr_free_mem_loc_fn_t
   }
 };
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.h b/media/libvpx/libvpx/vp9/vp9_dx_iface.h
index c1559599b8..f60688c4db 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.h
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.h
@@ -8,26 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_VP9_DX_IFACE_H_
-#define VP9_VP9_DX_IFACE_H_
+#ifndef VPX_VP9_VP9_DX_IFACE_H_
+#define VPX_VP9_VP9_DX_IFACE_H_
 
 #include "vp9/decoder/vp9_decoder.h"
 
 typedef vpx_codec_stream_info_t vp9_stream_info_t;
 
-// This limit is due to framebuffer numbers.
-// TODO(hkuang): Remove this limit after implementing ondemand framebuffers.
-#define FRAME_CACHE_SIZE 6  // Cache maximum 6 decoded frames.
-
-typedef struct cache_frame {
-  int fb_idx;
-  vpx_image_t img;
-} cache_frame;
-
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t base;
   vpx_codec_dec_cfg_t cfg;
   vp9_stream_info_t si;
+  VP9Decoder *pbi;
+  void *user_priv;
   int postproc_cfg_set;
   vp8_postproc_cfg_t postproc_cfg;
   vpx_decrypt_cb decrypt_cb;
@@ -40,20 +33,8 @@ struct vpx_codec_alg_priv {
   int byte_alignment;
   int skip_loop_filter;
 
-  // Frame parallel related.
-  int frame_parallel_decode;  // frame-based threading.
-  VPxWorker *frame_workers;
-  int num_frame_workers;
-  int next_submit_worker_id;
-  int last_submit_worker_id;
-  int next_output_worker_id;
-  int available_threads;
-  cache_frame frame_cache[FRAME_CACHE_SIZE];
-  int frame_cache_write;
-  int frame_cache_read;
-  int num_cache_frames;
   int need_resync;  // wait for key/intra-only frame
-  // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+  // BufferPool that holds all reference frames.
   BufferPool *buffer_pool;
 
   // External frame buffer info to save for VP9 common.
@@ -64,6 +45,8 @@ struct vpx_codec_alg_priv {
   // Allow for decoding up to a given spatial layer for SVC stream.
   int svc_decoding;
   int svc_spatial_layer;
+  int row_mt;
+  int lpf_opt;
 };
 
-#endif  // VP9_VP9_DX_IFACE_H_
+#endif  // VPX_VP9_VP9_DX_IFACE_H_
diff --git a/media/libvpx/libvpx/vp9/vp9_iface_common.c b/media/libvpx/libvpx/vp9/vp9_iface_common.c
new file mode 100644
index 0000000000..8d031694d8
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/vp9_iface_common.c
@@ -0,0 +1,136 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed  by a BSD-style license that can be
+ *  found in the LICENSE file in the root of the source tree. An additional
+ *  intellectual property  rights grant can  be found in the  file PATENTS.
+ *  All contributing  project authors may be  found in the AUTHORS  file in
+ *  the root of the source tree.
+ */
+
+#include "vp9/vp9_iface_common.h"
+void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+                     void *user_priv) {
+  /** vpx_img_wrap() doesn't allow specifying independent strides for
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.*/
+  int bps;
+  if (!yv12->subsampling_y) {
+    if (!yv12->subsampling_x) {
+      img->fmt = VPX_IMG_FMT_I444;
+      bps = 24;
+    } else {
+      img->fmt = VPX_IMG_FMT_I422;
+      bps = 16;
+    }
+  } else {
+    if (!yv12->subsampling_x) {
+      img->fmt = VPX_IMG_FMT_I440;
+      bps = 16;
+    } else {
+      img->fmt = VPX_IMG_FMT_I420;
+      bps = 12;
+    }
+  }
+  img->cs = yv12->color_space;
+  img->range = yv12->color_range;
+  img->bit_depth = 8;
+  img->w = yv12->y_stride;
+  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+  img->planes[VPX_PLANE_U] = yv12->u_buffer;
+  img->planes[VPX_PLANE_V] = yv12->v_buffer;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->stride[VPX_PLANE_Y] = yv12->y_stride;
+  img->stride[VPX_PLANE_U] = yv12->uv_stride;
+  img->stride[VPX_PLANE_V] = yv12->uv_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // vpx_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH);
+    img->bit_depth = yv12->bit_depth;
+    img->planes[VPX_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[VPX_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[VPX_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->planes[VPX_PLANE_ALPHA] = NULL;
+    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  img->bps = bps;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+}
+
+vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                YV12_BUFFER_CONFIG *yv12) {
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+  yv12->y_crop_width = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->render_width = img->r_w;
+  yv12->render_height = img->r_h;
+  yv12->y_width = img->d_w;
+  yv12->y_height = img->d_h;
+
+  yv12->uv_width = img->x_chroma_shift == 1 || img->fmt == VPX_IMG_FMT_NV12
+                       ? (1 + yv12->y_width) / 2
+                       : yv12->y_width;
+  yv12->uv_height =
+      img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->color_space = img->cs;
+  yv12->color_range = img->range;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+    // In vpx_image_t
+    //     planes point to uint8 address of start of data
+    //     stride counts uint8s to reach next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+    //     stride and border counts in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 ponter and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->flags = 0;
+  }
+  yv12->border = (yv12->y_stride - img->w) / 2;
+#else
+  yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
+  // When reading the data, UV are in one plane for NV12 format, thus
+  // x_chroma_shift is 0. After converting, UV are in separate planes, and
+  // subsampling_x should be set to 1.
+  if (img->fmt == VPX_IMG_FMT_NV12) yv12->subsampling_x = 1;
+  return VPX_CODEC_OK;
+}
diff --git a/media/libvpx/libvpx/vp9/vp9_iface_common.h b/media/libvpx/libvpx/vp9/vp9_iface_common.h
index d68872750b..e646917c69 100644
--- a/media/libvpx/libvpx/vp9/vp9_iface_common.h
+++ b/media/libvpx/libvpx/vp9/vp9_iface_common.h
@@ -7,133 +7,27 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VP9_VP9_IFACE_COMMON_H_
-#define VP9_VP9_IFACE_COMMON_H_
+#ifndef VPX_VP9_VP9_IFACE_COMMON_H_
+#define VPX_VP9_VP9_IFACE_COMMON_H_
 
+#include <assert.h>
 #include "vpx_ports/mem.h"
+#include "vpx/vp8.h"
+#include "vpx_scale/yv12config.h"
+#include "common/vp9_enums.h"
 
-static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
-                            void *user_priv) {
-  /** vpx_img_wrap() doesn't allow specifying independent strides for
-    * the Y, U, and V planes, nor other alignment adjustments that
-    * might be representable by a YV12_BUFFER_CONFIG, so we just
-    * initialize all the fields.*/
-  int bps;
-  if (!yv12->subsampling_y) {
-    if (!yv12->subsampling_x) {
-      img->fmt = VPX_IMG_FMT_I444;
-      bps = 24;
-    } else {
-      img->fmt = VPX_IMG_FMT_I422;
-      bps = 16;
-    }
-  } else {
-    if (!yv12->subsampling_x) {
-      img->fmt = VPX_IMG_FMT_I440;
-      bps = 16;
-    } else {
-      img->fmt = VPX_IMG_FMT_I420;
-      bps = 12;
-    }
-  }
-  img->cs = yv12->color_space;
-  img->range = yv12->color_range;
-  img->bit_depth = 8;
-  img->w = yv12->y_stride;
-  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
-  img->d_w = yv12->y_crop_width;
-  img->d_h = yv12->y_crop_height;
-  img->r_w = yv12->render_width;
-  img->r_h = yv12->render_height;
-  img->x_chroma_shift = yv12->subsampling_x;
-  img->y_chroma_shift = yv12->subsampling_y;
-  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
-  img->planes[VPX_PLANE_U] = yv12->u_buffer;
-  img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
-  img->stride[VPX_PLANE_Y] = yv12->y_stride;
-  img->stride[VPX_PLANE_U] = yv12->uv_stride;
-  img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
-    // vpx_image_t uses byte strides and a pointer to the first byte
-    // of the image.
-    img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH);
-    img->bit_depth = yv12->bit_depth;
-    img->planes[VPX_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
-    img->planes[VPX_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
-    img->planes[VPX_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
-    img->planes[VPX_PLANE_ALPHA] = NULL;
-    img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
-    img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
-    img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
-    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  img->bps = bps;
-  img->user_priv = user_priv;
-  img->img_data = yv12->buffer_alloc;
-  img->img_data_owner = 0;
-  img->self_allocd = 0;
-}
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
-                                       YV12_BUFFER_CONFIG *yv12) {
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
+void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+                     void *user_priv);
 
-  yv12->y_crop_width = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->render_width = img->r_w;
-  yv12->render_height = img->r_h;
-  yv12->y_width = img->d_w;
-  yv12->y_height = img->d_h;
+vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                YV12_BUFFER_CONFIG *yv12);
 
-  yv12->uv_width =
-      img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
-  yv12->uv_height =
-      img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
-  yv12->uv_crop_width = yv12->uv_width;
-  yv12->uv_crop_height = yv12->uv_height;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-  yv12->color_space = img->cs;
-  yv12->color_range = img->range;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-    // In vpx_image_t
-    //     planes point to uint8 address of start of data
-    //     stride counts uint8s to reach next row
-    // In YV12_BUFFER_CONFIG
-    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
-    //     stride and border counts in uint16s
-    // This means that all the address calculations in the main body of code
-    // should work correctly.
-    // However, before we do any pixel operations we need to cast the address
-    // to a uint16 ponter and double its value.
-    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
-    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
-    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
-    yv12->y_stride >>= 1;
-    yv12->uv_stride >>= 1;
-    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
-  } else {
-    yv12->flags = 0;
-  }
-  yv12->border = (yv12->y_stride - img->w) / 2;
-#else
-  yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  yv12->subsampling_x = img->x_chroma_shift;
-  yv12->subsampling_y = img->y_chroma_shift;
-  return VPX_CODEC_OK;
-}
-
-static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+static INLINE VP9_REFFRAME
+ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
   switch (frame) {
     case VP8_LAST_FRAME: return VP9_LAST_FLAG;
     case VP8_GOLD_FRAME: return VP9_GOLD_FLAG;
@@ -142,4 +36,9 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
   assert(0 && "Invalid Reference Frame");
   return VP9_LAST_FLAG;
 }
-#endif  // VP9_VP9_IFACE_COMMON_H_
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_VP9_IFACE_COMMON_H_
diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk
index a8ca0d5935..04af1c51e9 100644
--- a/media/libvpx/libvpx/vp9/vp9cx.mk
+++ b/media/libvpx/libvpx/vp9/vp9cx.mk
@@ -16,6 +16,7 @@ VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes)
 VP9_CX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
+VP9_CX_SRCS-yes += vp9_cx_iface.h
 
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
@@ -39,9 +40,14 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
 VP9_CX_SRCS-yes += encoder/vp9_extend.h
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h
+VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c
+VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h
 VP9_CX_SRCS-yes += encoder/vp9_encoder.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
@@ -60,6 +66,7 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rd.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
 VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
+VP9_CX_SRCS-yes += encoder/vp9_partition_models.h
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
 VP9_CX_SRCS-yes += encoder/vp9_speed_features.c
@@ -70,6 +77,9 @@ VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h
+VP9_CX_SRCS-$(CONFIG_NON_GREEDY_MV) += encoder/vp9_non_greedy_mv.c
+VP9_CX_SRCS-$(CONFIG_NON_GREEDY_MV) += encoder/vp9_non_greedy_mv.h
 
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
@@ -87,57 +97,97 @@ VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
 VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
 VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c
 VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h
+VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.c
+VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
 endif
 VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
 VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
+VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c
+VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/temporal_filter_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/temporal_filter_avx2.c
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h
+VP9_CX_SRCS-$(HAVE_NEON_DOTPROD) += encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c
+VP9_CX_SRCS-$(HAVE_NEON_I8MM) += encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
+
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/highbd_temporal_filter_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/highbd_temporal_filter_avx2.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
+VP9_CX_SRCS-$(HAVE_SVE2) += encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c
 endif
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-endif
-
-ifeq ($(ARCH_X86_64),yes)
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-endif
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
-ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
-endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c
 endif
 
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c
 
-ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
-endif
+VP9_CX_SRCS-$(HAVE_SVE)  += encoder/arm/neon/vp9_error_sve.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c
+endif
 
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
+endif  # !CONFIG_VP9_HIGHBITDEPTH
+
+VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c
+
+# Strip unnecessary files with CONFIG_REALTIME_ONLY
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_ssse3.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_avx2.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_ssse3.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_avx2.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.h
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.h
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.h
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/media/libvpx/libvpx/vp9/vp9dx.mk b/media/libvpx/libvpx/vp9/vp9dx.mk
index 4c6fd00715..93a5f368bd 100644
--- a/media/libvpx/libvpx/vp9/vp9dx.mk
+++ b/media/libvpx/libvpx/vp9/vp9dx.mk
@@ -24,11 +24,11 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
 VP9_DX_SRCS-yes += decoder/vp9_decoder.c
 VP9_DX_SRCS-yes += decoder/vp9_decoder.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
+VP9_DX_SRCS-yes += decoder/vp9_job_queue.c
+VP9_DX_SRCS-yes += decoder/vp9_job_queue.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
diff --git a/media/libvpx/libvpx/vpx/exports_spatial_svc b/media/libvpx/libvpx/vpx/exports_spatial_svc
deleted file mode 100644
index d258a1d618..0000000000
--- a/media/libvpx/libvpx/vpx/exports_spatial_svc
+++ /dev/null
@@ -1,6 +0,0 @@
-text vpx_svc_dump_statistics
-text vpx_svc_encode
-text vpx_svc_get_message
-text vpx_svc_init
-text vpx_svc_release
-text vpx_svc_set_options
diff --git a/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h b/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h
index 522e5c1684..ff51881eea 100644
--- a/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h
@@ -27,25 +27,29 @@
  *     </pre>
  *
  * An application instantiates a specific decoder instance by using
- * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ * vpx_codec_dec_init() and a pointer to the algorithm's interface structure:
  *     <pre>
  *     my_app.c:
  *       extern vpx_codec_iface_t my_codec;
  *       {
  *           vpx_codec_ctx_t algo;
- *           res = vpx_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ *           res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
  *
  * Once initialized, the instance is manged using other functions from
  * the vpx_codec_* family.
  */
-#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
-#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#ifndef VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#define VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
 #include "../vpx_decoder.h"
 #include "../vpx_encoder.h"
 #include <stdarg.h>
 
+#include "vpx_config.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -66,7 +70,7 @@ typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
 /*!\brief init function pointer prototype
  *
  * Performs algorithm-specific initialization of the decoder context. This
- * function is called by the generic vpx_codec_init() wrapper function, so
+ * function is called by vpx_codec_dec_init() and vpx_codec_enc_init(), so
  * plugins implementing this interface may trust the input parameters to be
  * properly initialized.
  *
@@ -175,16 +179,15 @@ typedef const struct vpx_codec_ctrl_fn_map {
 /*!\brief decode data function pointer prototype
  *
  * Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, #VPX_CODEC_CB_PUT_SLICE and
- * #VPX_CODEC_CB_PUT_FRAME events are generated as appropriate. This
- * function is called by the generic vpx_codec_decode() wrapper function,
- * so plugins implementing this interface may trust the input parameters
- * to be properly initialized.
+ * decoded frame becoming available, put_slice and put_frame callbacks
+ * are invoked as appropriate. This function is called by the generic
+ * vpx_codec_decode() wrapper function, so plugins implementing this
+ * interface may trust the input parameters to be properly initialized.
  *
  * \param[in] ctx          Pointer to this instance's context
  * \param[in] data         Pointer to this block of new coded data. If
- *                         NULL, a #VPX_CODEC_CB_PUT_FRAME event is posted
- *                         for the previously decoded frame.
+ *                         NULL, the put_frame callback is invoked for
+ *                         the previously decoded frame.
  * \param[in] data_sz      Size of the coded data, in bytes.
  *
  * \return Returns #VPX_CODEC_OK if the coded data was processed completely
@@ -195,8 +198,7 @@ typedef const struct vpx_codec_ctrl_fn_map {
 typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
                                                  const uint8_t *data,
                                                  unsigned int data_sz,
-                                                 void *user_priv,
-                                                 long deadline);
+                                                 void *user_priv);
 
 /*!\brief Decoded frames iterator
  *
@@ -252,7 +254,7 @@ typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx,
                                                  vpx_codec_pts_t pts,
                                                  unsigned long duration,
                                                  vpx_enc_frame_flags_t flags,
-                                                 unsigned long deadline);
+                                                 vpx_enc_deadline_t deadline);
 typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(
     vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter);
 
@@ -267,6 +269,8 @@ typedef vpx_image_t *(*vpx_codec_get_preview_frame_fn_t)(
 typedef vpx_codec_err_t (*vpx_codec_enc_mr_get_mem_loc_fn_t)(
     const vpx_codec_enc_cfg_t *cfg, void **mem_loc);
 
+typedef void (*vpx_codec_enc_mr_free_mem_loc_fn_t)(void *mem_loc);
+
 /*!\brief usage configuration mapping
  *
  * This structure stores the mapping between usage identifiers and
@@ -282,7 +286,7 @@ typedef const struct vpx_codec_enc_cfg_map {
   vpx_codec_enc_cfg_t cfg;
 } vpx_codec_enc_cfg_map_t;
 
-/*!\brief Decoder algorithm interface interface
+/*!\brief Decoder algorithm interface
  *
  * All decoders \ref MUST expose a variable of this type.
  */
@@ -316,6 +320,8 @@ struct vpx_codec_iface {
         get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */
     vpx_codec_enc_mr_get_mem_loc_fn_t
         mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */
+    vpx_codec_enc_mr_free_mem_loc_fn_t
+        mr_free_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_free_mem_loc_fn_t */
   } enc;
 };
 
@@ -426,6 +432,27 @@ struct vpx_internal_error_info {
   jmp_buf jmp;
 };
 
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(error, lval, expr)                                  \
+  do {                                                                      \
+    assert((error)->setjmp);                                                \
+    (lval) = (expr);                                                        \
+    if (!(lval))                                                            \
+      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,                        \
+                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
+                         __LINE__);                                         \
+  } while (0)
+#else
+#define CHECK_MEM_ERROR(error, lval, expr)             \
+  do {                                                 \
+    assert((error)->setjmp);                           \
+    (lval) = (expr);                                   \
+    if (!(lval))                                       \
+      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,   \
+                         "Failed to allocate " #lval); \
+  } while (0)
+#endif
+
 #define CLANG_ANALYZER_NORETURN
 #if defined(__has_feature)
 #if __has_feature(attribute_analyzer_noreturn)
@@ -434,12 +461,24 @@ struct vpx_internal_error_info {
 #endif
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef LIBVPX_FORMAT_PRINTF
+#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 void vpx_internal_error(struct vpx_internal_error_info *info,
-                        vpx_codec_err_t error, const char *fmt,
-                        ...) CLANG_ANALYZER_NORETURN;
+                        vpx_codec_err_t error, const char *fmt, ...)
+    LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN;
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#endif  // VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
new file mode 100644
index 0000000000..2643b5578a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx {
+
+enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 };
+
+enum class FrameDropDecision {
+  kOk,    // Frame is encoded.
+  kDrop,  // Frame is dropped.
+};
+
+struct UVDeltaQP {
+  // For the UV channel: the QP for the dc/ac value is given as
+  // GetQP() + uvdc/ac_delta_q, where the uvdc/ac_delta_q are negative numbers.
+  int uvdc_delta_q;
+  int uvac_delta_q;
+};
+
+struct VpxRateControlRtcConfig {
+  VpxRateControlRtcConfig() {
+    width = 1280;
+    height = 720;
+    max_quantizer = 63;
+    min_quantizer = 2;
+    target_bandwidth = 1000;
+    buf_initial_sz = 600;
+    buf_optimal_sz = 600;
+    buf_sz = 1000;
+    undershoot_pct = overshoot_pct = 50;
+    max_intra_bitrate_pct = 50;
+    max_inter_bitrate_pct = 0;
+    framerate = 30.0;
+    ts_number_layers = 1;
+    rc_mode = VPX_CBR;
+    aq_mode = 0;
+    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+    ts_rate_decimator[0] = 1;
+    frame_drop_thresh = 0;
+    is_screen = false;
+  }
+
+  int width;
+  int height;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  double framerate;
+  // Number of temporal layers
+  int ts_number_layers;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+  // vbr, cbr
+  enum vpx_rc_mode rc_mode;
+  int aq_mode;
+  int frame_drop_thresh;
+  bool is_screen;
+};
+}  // namespace libvpx
+#endif  // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
diff --git a/media/libvpx/libvpx/vpx/src/vpx_codec.c b/media/libvpx/libvpx/vpx/src/vpx_codec.c
index 10331aa21b..c54a18ecd4 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_codec.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_codec.c
@@ -13,6 +13,7 @@
  *
  */
 #include <stdarg.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include "vpx/vpx_integer.h"
 #include "vpx/internal/vpx_codec_internal.h"
@@ -50,12 +51,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) {
   return "Unrecognized error code";
 }
 
-const char *vpx_codec_error(vpx_codec_ctx_t *ctx) {
+const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) {
   return (ctx) ? vpx_codec_err_to_string(ctx->err)
                : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM);
 }
 
-const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) {
+const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) {
   if (ctx && ctx->err)
     return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
 
@@ -82,7 +83,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) {
 }
 
 vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) {
-  return (iface) ? iface->caps : 0;
+  return iface ? iface->caps : 0;
 }
 
 vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) {
@@ -97,7 +98,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) {
 
     res = VPX_CODEC_INCAPABLE;
 
-    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
+    for (entry = ctx->iface->ctrl_maps; entry->fn; entry++) {
       if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
         va_list ap;
 
diff --git a/media/libvpx/libvpx/vpx/src/vpx_decoder.c b/media/libvpx/libvpx/vpx/src/vpx_decoder.c
index fc1c2bccae..c79cc708cd 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_decoder.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_decoder.c
@@ -105,6 +105,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
                                  unsigned int data_sz, void *user_priv,
                                  long deadline) {
   vpx_codec_err_t res;
+  (void)deadline;
 
   /* Sanity checks */
   /* NULL data ptr allowed if data_sz is 0 too */
@@ -112,10 +113,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
     res = VPX_CODEC_INVALID_PARAM;
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
-  else {
-    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
-                                 deadline);
-  }
+  else
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
 
   return SAVE_STATUS(ctx, res);
 }
@@ -138,9 +137,10 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
 
   if (!ctx || !cb)
     res = VPX_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv ||
-           !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
+  else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
+  else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
+    res = VPX_CODEC_INCAPABLE;
   else {
     ctx->priv->dec.put_frame_cb.u.put_frame = cb;
     ctx->priv->dec.put_frame_cb.user_priv = user_priv;
@@ -157,9 +157,10 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
 
   if (!ctx || !cb)
     res = VPX_CODEC_INVALID_PARAM;
-  else if (!ctx->iface || !ctx->priv ||
-           !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE))
+  else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
+  else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE))
+    res = VPX_CODEC_INCAPABLE;
   else {
     ctx->priv->dec.put_slice_cb.u.put_slice = cb;
     ctx->priv->dec.put_slice_cb.user_priv = user_priv;
@@ -176,9 +177,10 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
 
   if (!ctx || !cb_get || !cb_release) {
     res = VPX_CODEC_INVALID_PARAM;
-  } else if (!ctx->iface || !ctx->priv ||
-             !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+  } else if (!ctx->iface || !ctx->priv) {
     res = VPX_CODEC_ERROR;
+  } else if (!(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    res = VPX_CODEC_INCAPABLE;
   } else {
     res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                     cb_priv);
diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
index 42d49970fe..36dfa51897 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c
@@ -14,13 +14,13 @@
  */
 #include <assert.h>
 #include <limits.h>
-#include <stdlib.h>
+#include <stdint.h>
 #include <string.h>
-#include "vp8/common/blockd.h"
 #include "vpx_config.h"
+#include "vpx/vpx_encoder.h"
 #include "vpx/internal/vpx_codec_internal.h"
 
-#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
 
 static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
   return (vpx_codec_alg_priv_t *)ctx->priv;
@@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
     res = ctx->iface->init(ctx, NULL);
 
     if (res) {
+      // IMPORTANT: ctx->priv->err_detail must be null or point to a string
+      // that remains valid after ctx->priv is destroyed, such as a C string
+      // literal. This makes it safe to call vpx_codec_error_detail() after
+      // vpx_codec_enc_init_ver() failed.
       ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
       vpx_codec_destroy(ctx);
     }
@@ -63,13 +67,14 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
 }
 
 vpx_codec_err_t vpx_codec_enc_init_multi_ver(
-    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg,
-    int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) {
+    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface,
+    const vpx_codec_enc_cfg_t *cfg, int num_enc, vpx_codec_flags_t flags,
+    const vpx_rational_t *dsf, int ver) {
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (ver != VPX_ENCODER_ABI_VERSION)
     res = VPX_CODEC_ABI_MISMATCH;
-  else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1))
+  else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1) || !dsf)
     res = VPX_CODEC_INVALID_PARAM;
   else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION)
     res = VPX_CODEC_ABI_MISMATCH;
@@ -82,7 +87,10 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
     res = VPX_CODEC_INCAPABLE;
   else {
     int i;
-    void *mem_loc = NULL;
+#if CONFIG_MULTI_RES_ENCODING
+    int mem_loc_owned = 0;
+#endif
+    void *mem_loc;
 
     if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE;
 
@@ -94,27 +102,20 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
         if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
             dsf->den > dsf->num) {
           res = VPX_CODEC_INVALID_PARAM;
-          break;
         } else {
-
           mr_cfg.mr_low_res_mode_info = mem_loc;
           mr_cfg.mr_total_resolutions = num_enc;
           mr_cfg.mr_encoder_id = num_enc - 1 - i;
-          mr_cfg.mr_down_sampling_factor.num = dsf->num;
-          mr_cfg.mr_down_sampling_factor.den = dsf->den;
-
-          /* Force Key-frame synchronization. Namely, encoder at higher
-           * resolution always use the same frame_type chosen by the
-           * lowest-resolution encoder.
-           */
-          if (mr_cfg.mr_encoder_id)
-            cfg->kf_mode = VPX_KF_DISABLED;
+          mr_cfg.mr_down_sampling_factor = *dsf;
 
           ctx->iface = iface;
           ctx->name = iface->name;
           ctx->priv = NULL;
           ctx->init_flags = flags;
           ctx->config.enc = cfg;
+          // ctx takes ownership of mr_cfg.mr_low_res_mode_info if and only if
+          // this call succeeds. The first ctx entry in the array is
+          // responsible for freeing the memory.
           res = ctx->iface->init(ctx, &mr_cfg);
         }
 
@@ -132,13 +133,16 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
             i--;
           }
 #if CONFIG_MULTI_RES_ENCODING
-          assert(mem_loc);
-          free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info);
-          free(mem_loc);
+          if (!mem_loc_owned) {
+            assert(mem_loc);
+            iface->enc.mr_free_mem_loc(mem_loc);
+          }
 #endif
           return SAVE_STATUS(ctx, res);
         }
-
+#if CONFIG_MULTI_RES_ENCODING
+        mem_loc_owned = 1;
+#endif
         ctx++;
         cfg++;
         dsf++;
@@ -154,52 +158,42 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
                                              vpx_codec_enc_cfg_t *cfg,
                                              unsigned int usage) {
   vpx_codec_err_t res;
-  vpx_codec_enc_cfg_map_t *map;
-  int i;
 
-  if (!iface || !cfg || usage > INT_MAX)
+  if (!iface || !cfg || usage != 0)
     res = VPX_CODEC_INVALID_PARAM;
   else if (!(iface->caps & VPX_CODEC_CAP_ENCODER))
     res = VPX_CODEC_INCAPABLE;
   else {
-    res = VPX_CODEC_INVALID_PARAM;
-
-    for (i = 0; i < iface->enc.cfg_map_count; ++i) {
-      map = iface->enc.cfg_maps + i;
-      if (map->usage == (int)usage) {
-        *cfg = map->cfg;
-        cfg->g_usage = usage;
-        res = VPX_CODEC_OK;
-        break;
-      }
-    }
+    assert(iface->enc.cfg_map_count == 1);
+    *cfg = iface->enc.cfg_maps->cfg;
+    res = VPX_CODEC_OK;
   }
 
   return res;
 }
 
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
 /* On X86, disable the x87 unit's internal 80 bit precision for better
  * consistency with the SSE unit's 64 bit precision.
  */
 #include "vpx_ports/x86.h"
 #define FLOATING_POINT_INIT() \
   do {                        \
-    unsigned short x87_orig_mode = x87_set_double_precision();
+  unsigned short x87_orig_mode = x87_set_double_precision()
 #define FLOATING_POINT_RESTORE()       \
   x87_set_control_word(x87_orig_mode); \
   }                                    \
   while (0)
 
 #else
-static void FLOATING_POINT_INIT() {}
-static void FLOATING_POINT_RESTORE() {}
+static void FLOATING_POINT_INIT(void) {}
+static void FLOATING_POINT_RESTORE(void) {}
 #endif
 
 vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
                                  vpx_codec_pts_t pts, unsigned long duration,
                                  vpx_enc_frame_flags_t flags,
-                                 unsigned long deadline) {
+                                 vpx_enc_deadline_t deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (!ctx || (img && !duration))
@@ -208,6 +202,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
     res = VPX_CODEC_ERROR;
   else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
     res = VPX_CODEC_INCAPABLE;
+#if ULONG_MAX > UINT32_MAX
+  else if (duration > UINT32_MAX || deadline > UINT32_MAX)
+    res = VPX_CODEC_INVALID_PARAM;
+#endif
   else {
     unsigned int num_enc = ctx->priv->enc.total_encoders;
 
diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c
index dba439c10a..7f9f6cd4d0 100644
--- a/media/libvpx/libvpx/vpx/src/vpx_image.c
+++ b/media/libvpx/libvpx/vpx/src/vpx_image.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -20,9 +22,22 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
                                      unsigned int buf_align,
                                      unsigned int stride_align,
                                      unsigned char *img_data) {
-  unsigned int h, w, s, xcs, ycs, bps;
-  unsigned int stride_in_bytes;
-  int align;
+  unsigned int h, w, xcs, ycs, bps;
+  uint64_t s;
+  int stride_in_bytes;
+  unsigned int align;
+
+  if (img != NULL) memset(img, 0, sizeof(vpx_image_t));
+
+  if (fmt == VPX_IMG_FMT_NONE) goto fail;
+
+  /* Impose maximum values on input parameters so that this function can
+   * perform arithmetic operations without worrying about overflows.
+   */
+  if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 ||
+      stride_align > 65536) {
+    goto fail;
+  }
 
   /* Treat align==0 like align==1 */
   if (!buf_align) buf_align = 1;
@@ -38,23 +53,9 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
 
   /* Get sample size for this format */
   switch (fmt) {
-    case VPX_IMG_FMT_RGB32:
-    case VPX_IMG_FMT_RGB32_LE:
-    case VPX_IMG_FMT_ARGB:
-    case VPX_IMG_FMT_ARGB_LE: bps = 32; break;
-    case VPX_IMG_FMT_RGB24:
-    case VPX_IMG_FMT_BGR24: bps = 24; break;
-    case VPX_IMG_FMT_RGB565:
-    case VPX_IMG_FMT_RGB565_LE:
-    case VPX_IMG_FMT_RGB555:
-    case VPX_IMG_FMT_RGB555_LE:
-    case VPX_IMG_FMT_UYVY:
-    case VPX_IMG_FMT_YUY2:
-    case VPX_IMG_FMT_YVYU: bps = 16; break;
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_YV12:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12: bps = 12; break;
+    case VPX_IMG_FMT_NV12: bps = 12; break;
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I440: bps = 16; break;
     case VPX_IMG_FMT_I444: bps = 24; break;
@@ -66,11 +67,11 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
   }
 
   /* Get chroma shift values for this format */
+  // For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at
+  // once.
   switch (fmt) {
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_YV12:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I42016:
     case VPX_IMG_FMT_I42216: xcs = 1; break;
@@ -79,23 +80,36 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
 
   switch (fmt) {
     case VPX_IMG_FMT_I420:
+    case VPX_IMG_FMT_NV12:
     case VPX_IMG_FMT_I440:
     case VPX_IMG_FMT_YV12:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
     case VPX_IMG_FMT_I42016:
     case VPX_IMG_FMT_I44016: ycs = 1; break;
     default: ycs = 0; break;
   }
 
-  /* Calculate storage sizes given the chroma subsampling */
-  align = (1 << xcs) - 1;
-  w = (d_w + align) & ~align;
-  align = (1 << ycs) - 1;
-  h = (d_h + align) & ~align;
-  s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8;
-  s = (s + stride_align - 1) & ~(stride_align - 1);
-  stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+  /* Calculate storage sizes. */
+  if (img_data) {
+    /* If the buffer was allocated externally, the width and height shouldn't
+     * be adjusted. */
+    w = d_w;
+    h = d_h;
+  } else {
+    /* Calculate storage sizes given the chroma subsampling */
+    align = (1 << xcs) - 1;
+    w = (d_w + align) & ~align;
+    assert(d_w <= w);
+    align = (1 << ycs) - 1;
+    h = (d_h + align) & ~align;
+    assert(d_h <= h);
+  }
+
+  s = (fmt & VPX_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / 8;
+  s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+  s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1);
+  if (s > INT_MAX) goto fail;
+  stride_in_bytes = (int)s;
+  s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s / 2 : s;
 
   /* Allocate the new image */
   if (!img) {
@@ -104,16 +118,14 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
     if (!img) goto fail;
 
     img->self_allocd = 1;
-  } else {
-    memset(img, 0, sizeof(vpx_image_t));
   }
 
   img->img_data = img_data;
 
   if (!img_data) {
-    const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR)
-                                    ? (uint64_t)h * s * bps / 8
-                                    : (uint64_t)h * s;
+    uint64_t alloc_size;
+    alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? (uint64_t)h * s * bps / 8
+                                            : (uint64_t)h * s;
 
     if (alloc_size != (size_t)alloc_size) goto fail;
 
@@ -135,8 +147,12 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt,
   img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes;
   img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs;
 
-  /* Default viewport to entire image */
-  if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) return img;
+  /* Default viewport to entire image. (This vpx_img_set_rect call always
+   * succeeds.) */
+  int ret = vpx_img_set_rect(img, 0, 0, d_w, d_h);
+  assert(ret == 0);
+  (void)ret;
+  return img;
 
 fail:
   vpx_img_free(img);
@@ -152,16 +168,15 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt,
 vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
                           unsigned int d_h, unsigned int stride_align,
                           unsigned char *img_data) {
-  /* By setting buf_align = 1, we don't change buffer alignment in this
-   * function. */
+  /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is
+   * not NULL. */
   return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data);
 }
 
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h) {
-  unsigned char *data;
-
-  if (x + w <= img->w && y + h <= img->h) {
+  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
+      y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;
 
@@ -172,34 +187,38 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
     } else {
       const int bytes_per_sample =
           (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
-      data = img->img_data;
+      unsigned char *data = img->img_data;
 
       if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) {
         img->planes[VPX_PLANE_ALPHA] =
             data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA];
-        data += img->h * img->stride[VPX_PLANE_ALPHA];
+        data += (size_t)img->h * img->stride[VPX_PLANE_ALPHA];
       }
 
       img->planes[VPX_PLANE_Y] =
           data + x * bytes_per_sample + y * img->stride[VPX_PLANE_Y];
-      data += img->h * img->stride[VPX_PLANE_Y];
+      data += (size_t)img->h * img->stride[VPX_PLANE_Y];
 
-      if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
+      unsigned int uv_x = x >> img->x_chroma_shift;
+      unsigned int uv_y = y >> img->y_chroma_shift;
+      if (img->fmt == VPX_IMG_FMT_NV12) {
         img->planes[VPX_PLANE_U] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
-        data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+            data + uv_x + uv_y * img->stride[VPX_PLANE_U];
+        img->planes[VPX_PLANE_V] = img->planes[VPX_PLANE_U] + 1;
+      } else if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) {
+        img->planes[VPX_PLANE_U] =
+            data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_U];
+        data +=
+            (size_t)(img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
         img->planes[VPX_PLANE_V] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_V];
       } else {
         img->planes[VPX_PLANE_V] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
-        data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_V];
+        data +=
+            (size_t)(img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V];
         img->planes[VPX_PLANE_U] =
-            data + (x >> img->x_chroma_shift) * bytes_per_sample +
-            (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U];
+            data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_U];
       }
     }
     return 0;
diff --git a/media/libvpx/libvpx/vpx/vp8.h b/media/libvpx/libvpx/vpx/vp8.h
index 059c9d0f65..f30dafed58 100644
--- a/media/libvpx/libvpx/vpx/vp8.h
+++ b/media/libvpx/libvpx/vpx/vp8.h
@@ -10,7 +10,7 @@
 
 /*!\defgroup vp8 VP8
  * \ingroup codecs
- * VP8 is vpx's newest video compression algorithm that uses motion
+ * VP8 is a video compression algorithm that uses motion
  * compensated prediction, Discrete Cosine Transform (DCT) coding of the
  * prediction error signal and context dependent entropy coding techniques
  * based on arithmetic principles. It features:
@@ -27,8 +27,8 @@
 /*!\file
  * \brief Provides controls common to both the VP8 encoder and decoder.
  */
-#ifndef VPX_VP8_H_
-#define VPX_VP8_H_
+#ifndef VPX_VPX_VP8_H_
+#define VPX_VPX_VP8_H_
 
 #include "./vpx_codec.h"
 #include "./vpx_image.h"
@@ -47,10 +47,6 @@ enum vp8_com_control_id {
   VP8_SET_REFERENCE = 1,
   VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
   VP8_SET_POSTPROC = 3,   /**< set the decoder's post processing settings  */
-  VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */
-  VP8_SET_DBG_COLOR_MB_MODES = 5,  /**< \deprecated */
-  VP8_SET_DBG_COLOR_B_MODES = 6,   /**< \deprecated */
-  VP8_SET_DBG_DISPLAY_MV = 7,      /**< \deprecated */
 
   /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
    * for its control ids. These should be migrated to something like the
@@ -70,12 +66,7 @@ enum vp8_postproc_level {
   VP8_DEBLOCK = 1 << 0,
   VP8_DEMACROBLOCK = 1 << 1,
   VP8_ADDNOISE = 1 << 2,
-  VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */
-  VP8_DEBUG_TXT_MBLK_MODES =
-      1 << 4, /**< print macro block modes over each macro block */
-  VP8_DEBUG_TXT_DC_DIFF = 1 << 5,   /**< print dc diff for each macro block */
-  VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */
-  VP8_MFQE = 1 << 10
+  VP8_MFQE = 1 << 3
 };
 
 /*!\brief post process flags
@@ -132,14 +123,6 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *)
 #define VPX_CTRL_VP8_COPY_REFERENCE
 VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *)
 #define VPX_CTRL_VP8_SET_POSTPROC
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int)
-#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int)
-#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int)
-#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES
-VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int)
-#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *)
 #define VPX_CTRL_VP9_GET_REFERENCE
 
@@ -150,4 +133,4 @@ VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *)
 }  // extern "C"
 #endif
 
-#endif  // VPX_VP8_H_
+#endif  // VPX_VPX_VP8_H_
diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h
index cc90159bc3..3a432cc12f 100644
--- a/media/libvpx/libvpx/vpx/vp8cx.h
+++ b/media/libvpx/libvpx/vpx/vp8cx.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPX_VP8CX_H_
-#define VPX_VP8CX_H_
+#ifndef VPX_VPX_VP8CX_H_
+#define VPX_VPX_VP8CX_H_
 
 /*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
  * \ingroup vp8
@@ -17,6 +17,7 @@
  */
 #include "./vp8.h"
 #include "./vpx_encoder.h"
+#include "./vpx_ext_ratectrl.h"
 
 /*!\file
  * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the
@@ -32,7 +33,15 @@ extern "C" {
  * This interface provides the capability to encode raw VP8 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP8 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp8_cx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
+
+/*!\brief The interface to the VP8 encoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -41,7 +50,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
  * This interface provides the capability to encode raw VP9 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP9 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp9_cx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp9_cx_algo;
+
+/*!\brief The interface to the VP9 encoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -125,7 +142,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
 enum vp8e_enc_control_id {
   /*!\brief Codec control function to pass an ROI map to encoder.
    *
-   * Supported in codecs: VP8, VP9
+   * Supported in codecs: VP8
    */
   VP8E_SET_ROI_MAP = 8,
 
@@ -148,13 +165,17 @@ enum vp8e_enc_control_id {
    * speed at the expense of quality.
    *
    * \note Valid range for VP8: -16..16
-   * \note Valid range for VP9: -8..8
+   * \note Valid range for VP9: -9..9
+   * \note A negative value (-n) is treated as its absolute value (n) in VP9.
    *
    * Supported in codecs: VP8, VP9
    */
   VP8E_SET_CPUUSED = 13,
 
-  /*!\brief Codec control function to enable automatic set and use alf frames.
+  /*!\brief Codec control function to enable automatic use of arf frames.
+   *
+   * \note Valid range for VP8: 0..1
+   * \note Valid range for VP9: 0..6
    *
    * Supported in codecs: VP8, VP9
    */
@@ -169,7 +190,10 @@ enum vp8e_enc_control_id {
    */
   VP8E_SET_NOISE_SENSITIVITY,
 
-  /*!\brief Codec control function to set sharpness.
+  /*!\brief Codec control function to set higher sharpness at the expense
+   * of a lower PSNR.
+   *
+   * \note Valid range: 0..7
    *
    * Supported in codecs: VP8, VP9
    */
@@ -225,10 +249,10 @@ enum vp8e_enc_control_id {
    */
   VP8E_SET_TUNING,
 
-  /*!\brief Codec control function to set constrained quality level.
+  /*!\brief Codec control function to set constrained / constant quality level.
    *
-   * \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be
-   *            set to #VPX_CQ.
+   * \attention For this value to be used vpx_codec_enc_cfg_t::rc_end_usage must
+   *            be set to #VPX_CQ or #VPX_Q
    * \note Valid range: 0..63
    *
    * Supported in codecs: VP8, VP9
@@ -279,7 +303,7 @@ enum vp8e_enc_control_id {
    * the feature is off, i.e., no golden frame boost in CBR mode and
    * average bitrate target is used.
    *
-   * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+   * For example, to allow 100% more bits, i.e., 2X, in a golden frame
    * than average frame, set this to 100.
    *
    * Supported in codecs: VP9
@@ -333,11 +357,12 @@ enum vp8e_enc_control_id {
    *             2 = 4 tile columns
    *             .....
    *             n = 2**n tile columns
-   * The requested tile columns will be capped by encoder based on image size
-   * limitation (The minimum width of a tile column is 256 pixel, the maximum
-   * is 4096).
+   * The requested tile columns will be capped by the encoder based on image
+   * size limitations (The minimum width of a tile column is 256 pixels, the
+   * maximum is 4096).
    *
-   * By default, the value is 0, i.e. one single column tile for entire image.
+   * By default, the value is 6, i.e., the maximum number of tiles supported by
+   * the resolution.
    *
    * Supported in codecs: VP9
    */
@@ -368,10 +393,10 @@ enum vp8e_enc_control_id {
    * VP9 has a bitstream feature to reduce decoding dependency between frames
    * by turning off backward update of probability context used in encoding
    * and decoding. This allows staged parallel processing of more than one
-   * video frames in the decoder. This control function provides a mean to
+   * video frame in the decoder. This control function provides a means to
    * turn this feature on or off for bitstreams produced by encoder.
    *
-   * By default, this feature is off.
+   * By default, this feature is on.
    *
    * Supported in codecs: VP9
    */
@@ -407,7 +432,7 @@ enum vp8e_enc_control_id {
 
   /*!\brief Codec control function to set noise sensitivity.
    *
-   *  0: off, 1: On(YOnly)
+   *  0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly)
    *
    * Supported in codecs: VP9
    */
@@ -422,6 +447,12 @@ enum vp8e_enc_control_id {
    */
   VP9E_SET_SVC,
 
+  /*!\brief Codec control function to pass an ROI map to encoder.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_ROI_MAP,
+
   /*!\brief Codec control function to set parameters for SVC.
    * \note Parameters contain min_q, max_q, scaling factor for each of the
    *       SVC layers.
@@ -443,6 +474,7 @@ enum vp8e_enc_control_id {
    * \note Valid parameter range:
    *              VP9E_CONTENT_DEFAULT = Regular video content (Default)
    *              VP9E_CONTENT_SCREEN  = Screen capture content
+   *              VP9E_CONTENT_FILM    = Film content: improves grain retention
    *
    * Supported in codecs: VP9
    */
@@ -479,25 +511,13 @@ enum vp8e_enc_control_id {
    */
   VP9E_SET_COLOR_SPACE,
 
-  /*!\brief Codec control function to set temporal layering mode.
-   * \note Valid ranges: 0..3, default is "0"
-   * (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING).
-   *                     0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING
-   *                     1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS
-   *                     2 = VP9E_TEMPORAL_LAYERING_MODE_0101
-   *                     3 = VP9E_TEMPORAL_LAYERING_MODE_0212
-   *
-   * Supported in codecs: VP9
-   */
-  VP9E_SET_TEMPORAL_LAYERING_MODE,
-
   /*!\brief Codec control function to set minimum interval between GF/ARF frames
    *
    * By default the value is set as 4.
    *
    * Supported in codecs: VP9
    */
-  VP9E_SET_MIN_GF_INTERVAL,
+  VP9E_SET_MIN_GF_INTERVAL = 48,
 
   /*!\brief Codec control function to set minimum interval between GF/ARF frames
    *
@@ -527,7 +547,7 @@ enum vp8e_enc_control_id {
    * struct #vpx_svc_ref_frame_config defined below.
    *
    * Supported in codecs: VP9
-  */
+   */
   VP9E_SET_SVC_REF_FRAME_CONFIG,
 
   /*!\brief Codec control function to set intended rendering image size.
@@ -547,6 +567,14 @@ enum vp8e_enc_control_id {
    */
   VP9E_SET_TARGET_LEVEL,
 
+  /*!\brief Codec control function to set row level multi-threading.
+   *
+   * 0 : off, 1 : on
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_ROW_MT,
+
   /*!\brief Codec control function to get bitstream level.
    *
    * Supported in codecs: VP9
@@ -564,19 +592,190 @@ enum vp8e_enc_control_id {
   VP9E_SET_ALT_REF_AQ,
 
   /*!\brief Boost percentage for Golden Frame in CBR mode.
-    *
-    * This value controls the amount of boost given to Golden Frame in
-    * CBR mode. It is expressed as a percentage of the average
-    * per-frame bitrate, with the special (and default) value 0 meaning
-    * the feature is off, i.e., no golden frame boost in CBR mode and
-    * average bitrate target is used.
-    *
-    * For example, to allow 100% more bits, i.e, 2X, in a golden frame
-    * than average frame, set this to 100.
-    *
-    * Supported in codecs: VP8
-    */
+   *
+   * This value controls the amount of boost given to Golden Frame in
+   * CBR mode. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * the feature is off, i.e., no golden frame boost in CBR mode and
+   * average bitrate target is used.
+   *
+   * For example, to allow 100% more bits, i.e., 2X, in a golden frame
+   * than average frame, set this to 100.
+   *
+   * Supported in codecs: VP8
+   */
   VP8E_SET_GF_CBR_BOOST_PCT,
+
+  /*!\brief Codec control function to enable the extreme motion vector unit test
+   * in VP9. Please note that this is only used in motion vector unit test.
+   *
+   * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST,
+
+  /*!\brief Codec control function to constrain the inter-layer prediction
+   * (prediction of lower spatial resolution) in VP9 SVC.
+   *
+   * 0 : inter-layer prediction on, 1 : off, 2 : off only on non-key frames
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_INTER_LAYER_PRED,
+
+  /*!\brief Codec control function to set mode and thresholds for frame
+   *  dropping in SVC. Drop frame thresholds are set per-layer. Mode is set as:
+   * 0 : layer-dependent dropping, 1 : constrained dropping, current layer drop
+   * forces drop on all upper layers. Default mode is 0.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_FRAME_DROP_LAYER,
+
+  /*!\brief Codec control function to get the refresh and reference flags and
+   * the buffer indices, up to the last encoded spatial layer.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_SVC_REF_FRAME_CONFIG,
+
+  /*!\brief Codec control function to enable/disable use of golden reference as
+   * a second temporal reference for SVC. Only used when inter-layer prediction
+   * is disabled on INTER frames.
+   *
+   * 0: Off, 1: Enabled (default)
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_GF_TEMPORAL_REF,
+
+  /*!\brief Codec control function to enable spatial layer sync frame, for any
+   * spatial layer. Enabling it for layer k means spatial layer k will disable
+   * all temporal prediction, but keep the inter-layer prediction. It will
+   * refresh any temporal reference buffer for that layer, and reset the
+   * temporal layer for the superframe to 0. Setting the layer sync for base
+   * spatial layer forces a key frame. Default is off (0) for all spatial
+   * layers. Spatial layer sync flag is reset to 0 after each encoded layer,
+   * so when control is invoked it is only used for the current superframe.
+   *
+   * 0: Off (default), 1: Enabled
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_SVC_SPATIAL_LAYER_SYNC,
+
+  /*!\brief Codec control function to enable temporal dependency model.
+   *
+   * Vp9 allows the encoder to run temporal dependency model and use it to
+   * improve the compression performance. To enable, set this parameter to be
+   * 1. The default value is set to be 1.
+   */
+  VP9E_SET_TPL,
+
+  /*!\brief Codec control function to enable post encode frame drop.
+   *
+   * This will allow encoder to drop frame after it's encoded.
+   *
+   * 0: Off (default), 1: Enabled
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_POSTENCODE_DROP,
+
+  /*!\brief Codec control function to set delta q for uv.
+   *
+   * Cap it at +/-15.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_DELTA_Q_UV,
+
+  /*!\brief Codec control function to disable increase Q on overshoot in CBR.
+   *
+   * 0: On (default), 1: Disable.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR,
+
+  /*!\brief Codec control function to disable loopfilter.
+   *
+   * 0: Loopfilter on all frames, 1: Disable on non reference frames.
+   * 2: Disable on all frames.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_DISABLE_LOOPFILTER,
+
+  /*!\brief Codec control function to enable external rate control library.
+   *
+   * args[0]: path of the rate control library
+   *
+   * args[1]: private config of the rate control library
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_EXTERNAL_RATE_CONTROL,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will do 3 things, only for 1 pass:
+   *  - Turn off low motion computation
+   *  - Turn off gf update constraint on key frame frequency
+   *  - Turn off content mode for cyclic refresh
+   *
+   * With those, the rate control is expected to work exactly the same as the
+   * interface provided in ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_RTC_EXTERNAL_RATECTRL,
+
+  /*!\brief Codec control function to get loopfilter level in the encoder.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LOOPFILTER_LEVEL,
+
+  /*!\brief Codec control to get last quantizers for all spatial layers.
+   *
+   * Return value uses an array of internal quantizers scale defined by the
+   * codec, for all spatial layers.
+   * The size of the array passed in should be #VPX_SS_MAX_LAYERS.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LAST_QUANTIZER_SVC_LAYERS,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will turn off cyclic refresh for vp8.
+   *
+   * With this, the rate control is expected to work exactly the same as the
+   * interface provided in vp8_ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_RTC_EXTERNAL_RATECTRL,
+
+  /*!\brief Codec control to set quantizer for the next frame.
+   *
+   * This will turn off cyclic refresh. Only applicable to 1-pass without
+   * spatial layers.
+   *
+   * Supported in codecs: VP9
+   *
+   */
+  VP9E_SET_QUANTIZER_ONE_PASS,
+
+  /*!\brief Codec control function to enable key frame temporal filtering.
+   *
+   * Vp9 allows the encoder to run key frame temporal filtering and use it to
+   * improve the compression performance. To enable, set this parameter to be
+   * 1. The default value is set to be 0.
+   */
+  VP9E_SET_KEY_FRAME_FILTERING,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -624,16 +823,20 @@ typedef enum vp9e_temporal_layering_mode {
  */
 
 typedef struct vpx_roi_map {
-  /*! An id between 0 and 3 for each 16x16 region within a frame. */
+  /*! If ROI is enabled. */
+  uint8_t enabled;
+  /*! An id between 0-3 (0-7 for vp9) for each 16x16 (8x8 for VP9)
+   * region within a frame. */
   unsigned char *roi_map;
   unsigned int rows; /**< Number of rows. */
   unsigned int cols; /**< Number of columns. */
-  // TODO(paulwilkins): broken for VP9 which has 8 segments
-  // q and loop filter deltas for each segment
-  // (see MAX_MB_SEGMENTS)
-  int delta_q[4];  /**< Quantizer deltas. */
-  int delta_lf[4]; /**< Loop filter deltas. */
-  /*! Static breakout threshold for each segment. */
+  /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */
+  int delta_q[8];  /**< Quantizer deltas. Valid range: [-63, 63].*/
+  int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/
+  /*! skip and ref frame segment is only used in VP9. */
+  int skip[8];      /**< Skip this block. */
+  int ref_frame[8]; /**< Reference frame for this block. */
+  /*! Static breakout threshold for each segment. Only used in VP8. */
   unsigned int static_threshold[4];
 } vpx_roi_map_t;
 
@@ -674,10 +877,11 @@ typedef enum {
   VP8_EIGHT_TOKENPARTITION = 3
 } vp8e_token_partitions;
 
-/*!brief VP9 encoder content type */
+/*!\brief VP9 encoder content type */
 typedef enum {
   VP9E_CONTENT_DEFAULT,
   VP9E_CONTENT_SCREEN,
+  VP9E_CONTENT_FILM,
   VP9E_CONTENT_INVALID
 } vp9e_tune_content;
 
@@ -696,11 +900,13 @@ typedef enum { VP8_TUNE_PSNR, VP8_TUNE_SSIM } vp8e_tuning;
  *
  */
 typedef struct vpx_svc_layer_id {
-  int spatial_layer_id;  /**< Spatial layer id number. */
+  int spatial_layer_id; /**< First spatial layer to start encoding. */
+  // TODO(jianj): Deprecated, to be removed.
   int temporal_layer_id; /**< Temporal layer id number. */
+  int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; /**< Temp layer id. */
 } vpx_svc_layer_id_t;
 
-/*!\brief  vp9 svc frame flag parameters.
+/*!\brief vp9 svc frame flag parameters.
  *
  * This defines the frame flags and buffer indices for each spatial layer for
  * svc encoding.
@@ -709,12 +915,58 @@ typedef struct vpx_svc_layer_id {
  *
  */
 typedef struct vpx_svc_ref_frame_config {
-  int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */
-  int lst_fb_idx[VPX_TS_MAX_LAYERS];  /**< Last buffer index. */
-  int gld_fb_idx[VPX_TS_MAX_LAYERS];  /**< Golden buffer index. */
-  int alt_fb_idx[VPX_TS_MAX_LAYERS];  /**< Altref buffer index. */
+  int lst_fb_idx[VPX_SS_MAX_LAYERS];         /**< Last buffer index. */
+  int gld_fb_idx[VPX_SS_MAX_LAYERS];         /**< Golden buffer index. */
+  int alt_fb_idx[VPX_SS_MAX_LAYERS];         /**< Altref buffer index. */
+  int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */
+  // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated.
+  int update_last[VPX_SS_MAX_LAYERS];       /**< Update last. */
+  int update_golden[VPX_SS_MAX_LAYERS];     /**< Update golden. */
+  int update_alt_ref[VPX_SS_MAX_LAYERS];    /**< Update altref. */
+  int reference_last[VPX_SS_MAX_LAYERS];    /**< Last as reference. */
+  int reference_golden[VPX_SS_MAX_LAYERS];  /**< Golden as reference. */
+  int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */
+  int64_t duration[VPX_SS_MAX_LAYERS];      /**< Duration per spatial layer. */
 } vpx_svc_ref_frame_config_t;
 
+/*!\brief VP9 svc frame dropping mode.
+ *
+ * This defines the frame drop mode for SVC.
+ *
+ */
+typedef enum {
+  CONSTRAINED_LAYER_DROP,
+  /**< Upper layers are constrained to drop if current layer drops. */
+  LAYER_DROP,           /**< Any spatial layer can drop. */
+  FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */
+  CONSTRAINED_FROM_ABOVE_DROP,
+  /**< Lower layers are constrained to drop if current layer drops. */
+} SVC_LAYER_DROP_MODE;
+
+/*!\brief vp9 svc frame dropping parameters.
+ *
+ * This defines the frame drop thresholds for each spatial layer, and
+ * the frame dropping mode: 0 = layer based frame dropping (default),
+ * 1 = constrained dropping where current layer drop forces all upper
+ * spatial layers to drop.
+ */
+typedef struct vpx_svc_frame_drop {
+  int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */
+  SVC_LAYER_DROP_MODE
+  framedrop_mode;      /**< Layer-based or constrained dropping. */
+  int max_consec_drop; /**< Maximum consecutive drops, for any layer. */
+} vpx_svc_frame_drop_t;
+
+/*!\brief vp9 svc spatial layer sync parameters.
+ *
+ * This defines the spatial layer sync flag, defined per spatial layer.
+ *
+ */
+typedef struct vpx_svc_spatial_layer_sync {
+  int spatial_layer_sync[VPX_SS_MAX_LAYERS]; /**< Sync layer flags */
+  int base_layer_intra_only; /**< Flag for setting Intra-only frame on base */
+} vpx_svc_spatial_layer_sync_t;
+
 /*!\cond */
 /*!\brief VP8 encoder control function parameter type
  *
@@ -723,26 +975,12 @@ typedef struct vpx_svc_ref_frame_config {
  *
  */
 
-VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
-#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
-VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
-#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
 VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
 #define VPX_CTRL_VP8E_SET_ROI_MAP
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
 #define VPX_CTRL_VP8E_SET_ACTIVEMAP
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
 #define VPX_CTRL_VP8E_SET_SCALEMODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
-#define VPX_CTRL_VP9E_SET_SVC
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
-#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
-VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
-#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
-#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
-
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
 #define VPX_CTRL_VP8E_SET_CPUUSED
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int)
@@ -755,7 +993,10 @@ VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int)
 #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD
 VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */
 #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS
-
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int)
 #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int)
@@ -766,80 +1007,107 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */
 #define VPX_CTRL_VP8E_SET_TUNING
 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int)
 #define VPX_CTRL_VP8E_SET_CQ_LEVEL
-
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
+#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
+#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
+VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
+#define VPX_CTRL_VP9E_SET_LOSSLESS
 VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int)
 #define VPX_CTRL_VP9E_SET_TILE_COLUMNS
 VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int)
 #define VPX_CTRL_VP9E_SET_TILE_ROWS
-
-VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
-#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
-VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
-#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
-VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
-#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
-
-VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
-#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
-VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
-#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT
-
-VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
-#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
-
-VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
-#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
-#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
-
-VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
-#define VPX_CTRL_VP9E_SET_LOSSLESS
-
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING
-
 VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int)
 #define VPX_CTRL_VP9E_SET_AQ_MODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int)
-#define VPX_CTRL_VP9E_SET_ALT_REF_AQ
-
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST
-
 VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int)
 #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY
-
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+#define VPX_CTRL_VP9E_SET_SVC
+VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *)
+#define VPX_CTRL_VP9E_SET_ROI_MAP
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
+#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
 #define VPX_CTRL_VP9E_SET_TUNE_CONTENT
-
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
+#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 #define VPX_CTRL_VP9E_SET_COLOR_SPACE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int)
 #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int)
 #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
-
 VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
 #define VPX_CTRL_VP9E_GET_ACTIVEMAP
-
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int)
 #define VPX_CTRL_VP9E_SET_COLOR_RANGE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
-
 VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
-
+VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int)
+#define VPX_CTRL_VP9E_SET_ROW_MT
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
+VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int)
+#define VPX_CTRL_VP9E_SET_ALT_REF_AQ
+VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
+VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
+#define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int)
+#define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *)
+#define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
+#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int)
+#define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC,
+                  vpx_svc_spatial_layer_sync_t *)
+#define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC
+VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int)
+#define VPX_CTRL_VP9E_SET_TPL
+VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int)
+#define VPX_CTRL_VP9E_SET_POSTENCODE_DROP
+VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int)
+#define VPX_CTRL_VP9E_SET_DELTA_Q_UV
+VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int)
+#define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR
+VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int)
+#define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER
+VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *)
+#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL
+VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL
+VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
+#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
+VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int)
+#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS
+VPX_CTRL_USE_TYPE(VP9E_SET_KEY_FRAME_FILTERING, int)
+#define VPX_CTRL_VP9E_SET_KEY_FRAME_FILTERING
 
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
@@ -847,4 +1115,4 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 }  // extern "C"
 #endif
 
-#endif  // VPX_VP8CX_H_
+#endif  // VPX_VPX_VP8CX_H_
diff --git a/media/libvpx/libvpx/vpx/vp8dx.h b/media/libvpx/libvpx/vpx/vp8dx.h
index 0d7759eb25..8c13649f4a 100644
--- a/media/libvpx/libvpx/vpx/vp8dx.h
+++ b/media/libvpx/libvpx/vpx/vp8dx.h
@@ -17,8 +17,8 @@
  * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
  *        interface.
  */
-#ifndef VPX_VP8DX_H_
-#define VPX_VP8DX_H_
+#ifndef VPX_VPX_VP8DX_H_
+#define VPX_VPX_VP8DX_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,15 @@ extern "C" {
  * This interface provides the capability to decode VP8 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP8 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp8_dx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
+
+/*!\brief The interface to the VP8 decoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -41,7 +49,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
  * This interface provides the capability to decode VP9 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP9 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp9_dx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp9_dx_algo;
+
+/*!\brief The interface to the VP9 decoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -116,6 +132,32 @@ enum vp8_dec_control_id {
    */
   VP9_DECODE_SVC_SPATIAL_LAYER,
 
+  /*!\brief Codec control function to get last decoded frame quantizer.
+   *
+   * Return value uses internal quantizer scale defined by the codec.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VPXD_GET_LAST_QUANTIZER,
+
+  /*!\brief Codec control function to set row level multi-threading.
+   *
+   * 0 : off, 1 : on
+   *
+   * Supported in codecs: VP9
+   */
+  VP9D_SET_ROW_MT,
+
+  /*!\brief Codec control function to set loopfilter optimization.
+   *
+   * 0 : off, Loop filter is done after all tiles have been decoded
+   * 1 : on, Loop filter is done immediately after decode without
+   *     waiting for all threads to sync.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9D_SET_LOOP_FILTER_OPT,
+
   VP8_DECODER_CTRL_ID_MAX
 };
 
@@ -137,10 +179,6 @@ typedef struct vpx_decrypt_init {
   void *decrypt_state;
 } vpx_decrypt_init;
 
-/*!\brief A deprecated alias for vpx_decrypt_init.
- */
-typedef vpx_decrypt_init vp8_decrypt_init;
-
 /*!\cond */
 /*!\brief VP8 decoder control function parameter type
  *
@@ -159,16 +197,26 @@ VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
 #define VPX_CTRL_VPXD_SET_DECRYPTOR
 VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
 #define VPX_CTRL_VP8D_SET_DECRYPTOR
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
+#define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
 #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
 #define VPX_CTRL_VP9D_GET_BIT_DEPTH
-VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
-#define VPX_CTRL_VP9D_GET_FRAME_SIZE
+VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int)
+#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
-#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int)
+#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
 VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *)
+#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER
+VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int)
+#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
+VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int)
+#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
 
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
@@ -177,4 +225,4 @@ VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
 }  // extern "C"
 #endif
 
-#endif  // VPX_VP8DX_H_
+#endif  // VPX_VPX_VP8DX_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_codec.h b/media/libvpx/libvpx/vpx/vpx_codec.h
index fe75d23872..602889773d 100644
--- a/media/libvpx/libvpx/vpx/vpx_codec.h
+++ b/media/libvpx/libvpx/vpx/vpx_codec.h
@@ -22,58 +22,62 @@
  * video codec algorithm.
  *
  * An application instantiates a specific codec instance by using
- * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ * vpx_codec_dec_init() or vpx_codec_enc_init() and a pointer to the
+ * algorithm's interface structure:
  *     <pre>
  *     my_app.c:
  *       extern vpx_codec_iface_t my_codec;
  *       {
  *           vpx_codec_ctx_t algo;
- *           res = vpx_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ *           res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     </pre>
  *
  * Once initialized, the instance is manged using other functions from
  * the vpx_codec_* family.
  */
-#ifndef VPX_VPX_CODEC_H_
-#define VPX_VPX_CODEC_H_
+#ifndef VPX_VPX_VPX_CODEC_H_
+#define VPX_VPX_VPX_CODEC_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "./vpx_integer.h"
 #include "./vpx_image.h"
+#include "./vpx_integer.h"
 
 /*!\brief Decorator indicating a function is deprecated */
-#ifndef DEPRECATED
-#if defined(__GNUC__) && __GNUC__
-#define DEPRECATED __attribute__((deprecated))
+#ifndef VPX_DEPRECATED
+#if defined(__GNUC__)
+#define VPX_DEPRECATED __attribute__((deprecated))
 #elif defined(_MSC_VER)
-#define DEPRECATED
+#define VPX_DEPRECATED
 #else
-#define DEPRECATED
+#define VPX_DEPRECATED
 #endif
-#endif /* DEPRECATED */
+#endif /* VPX_DEPRECATED */
 
-#ifndef DECLSPEC_DEPRECATED
-#if defined(__GNUC__) && __GNUC__
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#ifndef VPX_DECLSPEC_DEPRECATED
+#if defined(__GNUC__)
+#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */
 #elif defined(_MSC_VER)
-/*!\brief \copydoc #DEPRECATED */
-#define DECLSPEC_DEPRECATED __declspec(deprecated)
+/*!\brief \copydoc #VPX_DEPRECATED */
+#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated)
 #else
-#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
+#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */
 #endif
-#endif /* DECLSPEC_DEPRECATED */
+#endif /* VPX_DECLSPEC_DEPRECATED */
 
 /*!\brief Decorator indicating a function is potentially unused */
-#ifdef UNUSED
-#elif defined(__GNUC__) || defined(__clang__)
-#define UNUSED __attribute__((unused))
+#ifndef VPX_UNUSED
+#if defined(__GNUC__) || defined(__clang__)
+#define VPX_UNUSED __attribute__((unused))
 #else
-#define UNUSED
+#define VPX_UNUSED
 #endif
+#endif /* VPX_UNUSED */
 
 /*!\brief Current ABI version number
  *
@@ -83,7 +87,7 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_CODEC_ABI_VERSION (4 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
 
 /*!\brief Algorithm return codes */
 typedef enum {
@@ -152,6 +156,10 @@ typedef long vpx_codec_caps_t;
 #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
 #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
 
+/*! Can support images at greater than 8 bitdepth.
+ */
+#define VPX_CODEC_CAP_HIGHBITDEPTH 0x4
+
 /*! \brief Initialization-time Feature Enabling
  *
  *  Certain codec features must be known at initialization time, to allow for
@@ -236,11 +244,11 @@ typedef enum vpx_bit_depth {
  */
 int vpx_codec_version(void);
 #define VPX_VERSION_MAJOR(v) \
-  ((v >> 16) & 0xff) /**< extract major from packed version */
+  (((v) >> 16) & 0xff) /**< extract major from packed version */
 #define VPX_VERSION_MINOR(v) \
-  ((v >> 8) & 0xff) /**< extract minor from packed version */
+  (((v) >> 8) & 0xff) /**< extract minor from packed version */
 #define VPX_VERSION_PATCH(v) \
-  ((v >> 0) & 0xff) /**< extract patch from packed version */
+  (((v) >> 0) & 0xff) /**< extract patch from packed version */
 
 /*!\brief Return the version major number */
 #define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff)
@@ -310,19 +318,21 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err);
  * \param[in]    ctx     Pointer to this instance's context.
  *
  */
-const char *vpx_codec_error(vpx_codec_ctx_t *ctx);
+const char *vpx_codec_error(const vpx_codec_ctx_t *ctx);
 
 /*!\brief Retrieve detailed error information for codec context
  *
  * Returns a human readable string providing detailed information about
- * the last error.
+ * the last error. The returned string is only valid until the next
+ * vpx_codec_* function call (except vpx_codec_error and
+ * vpx_codec_error_detail) on the codec context.
  *
  * \param[in]    ctx     Pointer to this instance's context.
  *
  * \retval NULL
  *     No detailed information is available.
  */
-const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx);
+const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx);
 
 /* REQUIRED FUNCTIONS
  *
@@ -337,9 +347,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx);
  * \param[in] ctx   Pointer to this instance's context
  *
  * \retval #VPX_CODEC_OK
- *     The codec algorithm initialized.
- * \retval #VPX_CODEC_MEM_ERROR
- *     Memory allocation failed.
+ *     The codec instance has been destroyed.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ *     ctx is a null pointer.
+ * \retval #VPX_CODEC_ERROR
+ *     Codec context not initialized.
  */
 vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx);
 
@@ -409,7 +421,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
  */
 #define VPX_CTRL_USE_TYPE(id, typ)                                           \
   static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \
-      UNUSED;                                                                \
+      VPX_UNUSED;                                                            \
                                                                              \
   static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx,        \
                                                 int ctrl_id, typ data) {     \
@@ -426,13 +438,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
  * It defines a static function with the correctly typed arguments as a
  * wrapper to the type-unsafe internal function.
  */
-#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ)                        \
-  DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
-      vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED;                \
-                                                                     \
-  DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
-      vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {                 \
-    return vpx_codec_control_(ctx, ctrl_id, data);                   \
+#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ)                            \
+  VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
+      vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED;            \
+                                                                         \
+  VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
+      vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {                     \
+    return vpx_codec_control_(ctx, ctrl_id, data);                       \
   } /**<\hideinitializer*/
 
 /*!\brief vpx_codec_control void type definition macro
@@ -447,7 +459,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
  */
 #define VPX_CTRL_VOID(id)                                               \
   static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \
-      UNUSED;                                                           \
+      VPX_UNUSED;                                                       \
                                                                         \
   static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx,   \
                                                 int ctrl_id) {          \
@@ -460,4 +472,4 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
 #ifdef __cplusplus
 }
 #endif
-#endif  // VPX_VPX_CODEC_H_
+#endif  // VPX_VPX_VPX_CODEC_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_codec.mk b/media/libvpx/libvpx/vpx/vpx_codec.mk
index b77f45817b..778f1a6146 100644
--- a/media/libvpx/libvpx/vpx/vpx_codec.mk
+++ b/media/libvpx/libvpx/vpx/vpx_codec.mk
@@ -15,10 +15,6 @@ API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
 API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
 API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
 API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
-ifeq ($(CONFIG_VP9_ENCODER),yes)
-  API_SRCS-$(CONFIG_SPATIAL_SVC) += src/svc_encodeframe.c
-  API_SRCS-$(CONFIG_SPATIAL_SVC) += svc_context.h
-endif
 
 API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
 API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
@@ -28,14 +24,17 @@ API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
 API_DOC_SRCS-yes += vpx_codec.h
 API_DOC_SRCS-yes += vpx_decoder.h
 API_DOC_SRCS-yes += vpx_encoder.h
+API_DOC_SRCS-$(CONFIG_ENCODERS) += vpx_ext_ratectrl.h
 API_DOC_SRCS-yes += vpx_frame_buffer.h
 API_DOC_SRCS-yes += vpx_image.h
+API_DOC_SRCS-$(CONFIG_ENCODERS) += vpx_tpl.h
 
 API_SRCS-yes += src/vpx_decoder.c
 API_SRCS-yes += vpx_decoder.h
 API_SRCS-yes += src/vpx_encoder.c
 API_SRCS-yes += vpx_encoder.h
 API_SRCS-yes += internal/vpx_codec_internal.h
+API_SRCS-yes += internal/vpx_ratectrl_rtc.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
 API_SRCS-yes += vpx_codec.h
@@ -43,3 +42,5 @@ API_SRCS-yes += vpx_codec.mk
 API_SRCS-yes += vpx_frame_buffer.h
 API_SRCS-yes += vpx_image.h
 API_SRCS-yes += vpx_integer.h
+API_SRCS-yes += vpx_ext_ratectrl.h
+API_SRCS-yes += vpx_tpl.h
diff --git a/media/libvpx/libvpx/vpx/vpx_decoder.h b/media/libvpx/libvpx/vpx/vpx_decoder.h
index 2ff12112bc..0536d5d5ad 100644
--- a/media/libvpx/libvpx/vpx/vpx_decoder.h
+++ b/media/libvpx/libvpx/vpx/vpx_decoder.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPX_VPX_DECODER_H_
-#define VPX_VPX_DECODER_H_
+#ifndef VPX_VPX_VPX_DECODER_H_
+#define VPX_VPX_VPX_DECODER_H_
 
 /*!\defgroup decoder Decoder Algorithm Interface
  * \ingroup codec
@@ -29,7 +29,7 @@
 extern "C" {
 #endif
 
-#include "./vpx_codec.h"
+#include "./vpx_codec.h"  // IWYU pragma: export
 #include "./vpx_frame_buffer.h"
 
 /*!\brief Current ABI version number
@@ -58,6 +58,10 @@ extern "C" {
 #define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000
 /*!\brief Can receive encoded frames one fragment at a time */
 #define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000
+/*!\brief Can support frame-based multi-threading */
+#define VPX_CODEC_CAP_FRAME_THREADING 0x200000
+/*!\brief Can support external frame buffers */
+#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
 
 /*! \brief Initialization-time Feature Enabling
  *
@@ -66,11 +70,6 @@ extern "C" {
  *
  *  The available flags are specified by VPX_CODEC_USE_* defines.
  */
-/*!\brief Can support frame-based multi-threading */
-#define VPX_CODEC_CAP_FRAME_THREADING 0x200000
-/*!brief Can support external frame buffers */
-#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
-
 #define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
 /*!\brief Conceal errors in decoded frames */
 #define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000
@@ -128,7 +127,7 @@ typedef struct vpx_codec_dec_cfg {
  * \param[in]    ver     ABI version number. Must be set to
  *                       VPX_DECODER_ABI_VERSION
  * \retval #VPX_CODEC_OK
- *     The decoder algorithm initialized.
+ *     The decoder algorithm has been initialized.
  * \retval #VPX_CODEC_MEM_ERROR
  *     Memory allocation failed.
  */
@@ -153,7 +152,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
  * \param[in]      iface   Pointer to the algorithm interface
  * \param[in]      data    Pointer to a block of data to parse
  * \param[in]      data_sz Size of the data buffer
- * \param[in,out]  si      Pointer to stream info to update. The size member
+ * \param[in,out]  si      Pointer to stream info to update. The sz member
  *                         \ref MUST be properly initialized, but \ref MAY be
  *                         clobbered by the algorithm. This parameter \ref MAY
  *                         be NULL.
@@ -171,7 +170,7 @@ vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface,
  * Returns information about the stream that has been parsed during decoding.
  *
  * \param[in]      ctx     Pointer to this instance's context
- * \param[in,out]  si      Pointer to stream info to update. The size member
+ * \param[in,out]  si      Pointer to stream info to update. The sz member
  *                         \ref MUST be properly initialized, but \ref MAY be
  *                         clobbered by the algorithm. This parameter \ref MAY
  *                         be NULL.
@@ -185,8 +184,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
 /*!\brief Decode data
  *
  * Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
- * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
+ * decoded frame becoming available, put_slice and put_frame callbacks may be
+ * invoked, as appropriate. Encoded data \ref MUST be passed in DTS (decode
  * time stamp) order. Frames produced will always be in PTS (presentation
  * time stamp) order.
  * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled,
@@ -199,13 +198,15 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
  *
  * \param[in] ctx          Pointer to this instance's context
  * \param[in] data         Pointer to this block of new coded data. If
- *                         NULL, a VPX_CODEC_CB_PUT_FRAME event is posted
- *                         for the previously decoded frame.
+ *                         NULL, the put_frame callback is invoked for
+ *                         the previously decoded frame.
  * \param[in] data_sz      Size of the coded data, in bytes.
  * \param[in] user_priv    Application specific data to associate with
  *                         this frame.
  * \param[in] deadline     Soft deadline the decoder should attempt to meet,
  *                         in us. Set to zero for unlimited.
+ *                         NOTE: The deadline parameter is ignored. Always
+ *                         pass 0.
  *
  * \return Returns #VPX_CODEC_OK if the coded data was processed completely
  *         and future pictures can be decoded without error. Otherwise,
@@ -236,11 +237,10 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter);
 
 /*!\defgroup cap_put_frame Frame-Based Decoding Functions
  *
- * The following functions are required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these
- * functions
- * for codecs that don't advertise this capability will result in an error
- * code being returned, usually VPX_CODEC_ERROR
+ * The following function is required to be implemented for all decoders
+ * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling this
+ * function for codecs that don't advertise this capability will result in
+ * an error code being returned, usually VPX_CODEC_INCAPABLE.
  * @{
  */
 
@@ -264,8 +264,9 @@ typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv,
  * \retval #VPX_CODEC_OK
  *     Callback successfully registered.
  * \retval #VPX_CODEC_ERROR
- *     Decoder context not initialized, or algorithm not capable of
- *     posting slice completion.
+ *     Decoder context not initialized.
+ * \retval #VPX_CODEC_INCAPABLE
+ *     Algorithm not capable of posting frame completion.
  */
 vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
                                                 vpx_codec_put_frame_cb_fn_t cb,
@@ -275,18 +276,17 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
 
 /*!\defgroup cap_put_slice Slice-Based Decoding Functions
  *
- * The following functions are required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these
- * functions
- * for codecs that don't advertise this capability will result in an error
- * code being returned, usually VPX_CODEC_ERROR
+ * The following function is required to be implemented for all decoders
+ * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling this
+ * function for codecs that don't advertise this capability will result in
+ * an error code being returned, usually VPX_CODEC_INCAPABLE.
  * @{
  */
 
 /*!\brief put slice callback prototype
  *
  * This callback is invoked by the decoder to notify the application of
- * the availability of partially decoded image data. The
+ * the availability of partially decoded image data.
  */
 typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv,
                                             const vpx_image_t *img,
@@ -305,8 +305,9 @@ typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv,
  * \retval #VPX_CODEC_OK
  *     Callback successfully registered.
  * \retval #VPX_CODEC_ERROR
- *     Decoder context not initialized, or algorithm not capable of
- *     posting slice completion.
+ *     Decoder context not initialized.
+ * \retval #VPX_CODEC_INCAPABLE
+ *     Algorithm not capable of posting slice completion.
  */
 vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
                                                 vpx_codec_put_slice_cb_fn_t cb,
@@ -316,10 +317,10 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
 
 /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
  *
- * The following section is required to be implemented for all decoders
+ * The following function is required to be implemented for all decoders
  * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
  * Calling this function for codecs that don't advertise this capability
- * will result in an error code being returned, usually VPX_CODEC_ERROR.
+ * will result in an error code being returned, usually VPX_CODEC_INCAPABLE.
  *
  * \note
  * Currently this only works with VP9.
@@ -344,8 +345,9 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
  * \retval #VPX_CODEC_INVALID_PARAM
  *     One or more of the callbacks were NULL.
  * \retval #VPX_CODEC_ERROR
- *     Decoder context not initialized, or algorithm not capable of
- *     using external frame buffers.
+ *     Decoder context not initialized.
+ * \retval #VPX_CODEC_INCAPABLE
+ *     Algorithm not capable of using external frame buffers.
  *
  * \note
  * When decoding VP9, the application may be required to pass in at least
@@ -362,4 +364,4 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
 #ifdef __cplusplus
 }
 #endif
-#endif  // VPX_VPX_DECODER_H_
+#endif  // VPX_VPX_VPX_DECODER_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h
index 28fcd5f999..04e6b028f3 100644
--- a/media/libvpx/libvpx/vpx/vpx_encoder.h
+++ b/media/libvpx/libvpx/vpx/vpx_encoder.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPX_VPX_ENCODER_H_
-#define VPX_VPX_ENCODER_H_
+#ifndef VPX_VPX_VPX_ENCODER_H_
+#define VPX_VPX_VPX_ENCODER_H_
 
 /*!\defgroup encoder Encoder Algorithm Interface
  * \ingroup codec
@@ -29,7 +29,8 @@
 extern "C" {
 #endif
 
-#include "./vpx_codec.h"
+#include "./vpx_codec.h"  // IWYU pragma: export
+#include "./vpx_ext_ratectrl.h"
 
 /*! Temporal Scalability: Maximum length of the sequence defining frame
  * layer membership
@@ -39,15 +40,9 @@ extern "C" {
 /*! Temporal Scalability: Maximum number of coding layers */
 #define VPX_TS_MAX_LAYERS 5
 
-/*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */
-#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY
-
 /*! Temporal+Spatial Scalability: Maximum number of coding layers */
 #define VPX_MAX_LAYERS 12  // 3 temporal + 4 spatial layers are allowed.
 
-/*!\deprecated Use #VPX_MAX_LAYERS instead. */
-#define MAX_LAYERS VPX_MAX_LAYERS  // 3 temporal + 4 spatial layers allowed.
-
 /*! Spatial Scalability: Maximum number of coding layers */
 #define VPX_SS_MAX_LAYERS 5
 
@@ -61,9 +56,15 @@ extern "C" {
  * must be bumped.  Examples include, but are not limited to, changing
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
+ *
+ * \note
+ * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component
+ * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses
+ * vpx_rc_funcs_t.
  */
 #define VPX_ENCODER_ABI_VERSION \
-  (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+  (18 + VPX_CODEC_ABI_VERSION + \
+   VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -83,10 +84,6 @@ extern "C" {
  */
 #define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000
 
-/*! Can support input images at greater than 8 bitdepth.
- */
-#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000
-
 /*! \brief Initialization-time Feature Enabling
  *
  *  Certain codec features must be known at initialization time, to allow
@@ -123,14 +120,14 @@ typedef int64_t vpx_codec_pts_t;
  * support frame types that are codec specific (MPEG-1 D-frames for example)
  */
 typedef uint32_t vpx_codec_frame_flags_t;
-#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */
 /*!\brief frame can be dropped without affecting the stream (no future frame
  * depends on this one) */
-#define VPX_FRAME_IS_DROPPABLE 0x2
+#define VPX_FRAME_IS_DROPPABLE 0x2u
 /*!\brief frame should be decoded but will not be shown */
-#define VPX_FRAME_IS_INVISIBLE 0x4
+#define VPX_FRAME_IS_INVISIBLE 0x4u
 /*!\brief this is a fragment of the encoded frame */
-#define VPX_FRAME_IS_FRAGMENT 0x8
+#define VPX_FRAME_IS_FRAGMENT 0x8u
 
 /*!\brief Error Resilient flags
  *
@@ -140,12 +137,13 @@ typedef uint32_t vpx_codec_frame_flags_t;
  */
 typedef uint32_t vpx_codec_er_flags_t;
 /*!\brief Improve resiliency against losses of whole frames */
-#define VPX_ERROR_RESILIENT_DEFAULT 0x1
+#define VPX_ERROR_RESILIENT_DEFAULT 0x1u
 /*!\brief The frame partitions are independently decodable by the bool decoder,
  * meaning that partitions can be decoded even though earlier partitions have
  * been lost. Note that intra prediction is still done over the partition
- * boundary. */
-#define VPX_ERROR_RESILIENT_PARTITIONS 0x2
+ * boundary.
+ * \note This is only supported by VP8.*/
+#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u
 
 /*!\brief Encoder output packet variants
  *
@@ -154,16 +152,10 @@ typedef uint32_t vpx_codec_er_flags_t;
  * extend this list to provide additional functionality.
  */
 enum vpx_codec_cx_pkt_kind {
-  VPX_CODEC_CX_FRAME_PKT,   /**< Compressed video frame */
-  VPX_CODEC_STATS_PKT,      /**< Two-pass statistics for this frame */
-  VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */
-  VPX_CODEC_PSNR_PKT,       /**< PSNR statistics for this frame */
-// Spatial SVC is still experimental and may be removed before the next ABI
-// bump.
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-  VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
-  VPX_CODEC_SPATIAL_SVC_LAYER_PSNR,  /**< PSNR for each layer in this frame*/
-#endif
+  VPX_CODEC_CX_FRAME_PKT,    /**< Compressed video frame */
+  VPX_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
+  VPX_CODEC_FPMB_STATS_PKT,  /**< first pass mb statistics for this frame */
+  VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
   VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
 };
 
@@ -187,6 +179,13 @@ typedef struct vpx_codec_cx_pkt {
        * Only applicable when "output partition" mode is enabled. First
        * partition has id 0.*/
       int partition_id;
+      /*!\brief Width and height of frames in this packet. VP8 will only use the
+       * first one.*/
+      unsigned int width[VPX_SS_MAX_LAYERS];  /**< frame width */
+      unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */
+      /*!\brief Flag to indicate if spatial layer frame in this packet is
+       * encoded or dropped. VP8 will always be set to 1.*/
+      uint8_t spatial_layer_encoded[VPX_SS_MAX_LAYERS];
     } frame;                            /**< data for compressed frame packet */
     vpx_fixed_buf_t twopass_stats;      /**< data for two-pass packet */
     vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
@@ -194,14 +193,9 @@ typedef struct vpx_codec_cx_pkt {
       unsigned int samples[4]; /**< Number of samples, total/y/u/v */
       uint64_t sse[4];         /**< sum squared error, total/y/u/v */
       double psnr[4];          /**< PSNR, total/y/u/v */
+      int spatial_layer_id;    /**< Spatial layer id */
     } psnr;                    /**< data for PSNR packet */
     vpx_fixed_buf_t raw;       /**< data for arbitrary packets */
-// Spatial SVC is still experimental and may be removed before the next
-// ABI bump.
-#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
-    size_t layer_sizes[VPX_SS_MAX_LAYERS];
-    struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
-#endif
 
     /* This packet size is fixed to allow codecs to extend this
      * interface without having to manage storage for raw packets,
@@ -217,8 +211,6 @@ typedef struct vpx_codec_cx_pkt {
  * This callback function, when registered, returns with packets when each
  * spatial layer is encoded.
  */
-// putting the definitions here for now. (agrange: find if there
-// is a better place for this)
 typedef void (*vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt,
                                                     void *user_data);
 
@@ -238,11 +230,11 @@ typedef struct vpx_rational {
 } vpx_rational_t; /**< alias for struct vpx_rational */
 
 /*!\brief Multi-pass Encoding Pass */
-enum vpx_enc_pass {
+typedef enum vpx_enc_pass {
   VPX_RC_ONE_PASS,   /**< Single pass mode */
   VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */
   VPX_RC_LAST_PASS   /**< Final pass of multi-pass mode */
-};
+} vpx_enc_pass;
 
 /*!\brief Rate control mode */
 enum vpx_rc_mode {
@@ -275,6 +267,8 @@ enum vpx_kf_mode {
  */
 typedef long vpx_enc_frame_flags_t;
 #define VPX_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
+/** Calculate PSNR on this frame, requires g_lag_in_frames to be 0 */
+#define VPX_EFLAG_CALCULATE_PSNR (1 << 1)
 
 /*!\brief Encoder configuration structure
  *
@@ -287,12 +281,9 @@ typedef struct vpx_codec_enc_cfg {
    * generic settings (g)
    */
 
-  /*!\brief Algorithm specific "usage" value
+  /*!\brief Deprecated: Algorithm specific "usage" value
    *
-   * Algorithms may define multiple values for usage, which may convey the
-   * intent of how the application intends to use the stream. If this value
-   * is non-zero, consult the documentation for the codec to determine its
-   * meaning.
+   * This value must be zero.
    */
   unsigned int g_usage;
 
@@ -403,9 +394,6 @@ typedef struct vpx_codec_enc_cfg {
    * trade-off is often acceptable, but for many applications is not. It can
    * be disabled in these cases.
    *
-   * Note that not all codecs support this feature. All vpx VPx codecs do.
-   * For other codecs, consult the documentation for that algorithm.
-   *
    * This threshold is described as a percentage of the target data buffer.
    * When the data buffer falls below this percentage of fullness, a
    * dropped frame is indicated. Set the threshold to zero (0) to disable
@@ -478,7 +466,9 @@ typedef struct vpx_codec_enc_cfg {
 
   /*!\brief Target data rate
    *
-   * Target bandwidth to use for this stream, in kilobits per second.
+   * Target bitrate to use for this stream, in kilobits per second.
+   * Internally capped to the smaller of the uncompressed bitrate and
+   * 1000000 kilobits per second.
    */
   unsigned int rc_target_bitrate;
 
@@ -491,8 +481,7 @@ typedef struct vpx_codec_enc_cfg {
    * The quantizer is the most direct control over the quality of the
    * encoded image. The range of valid values for the quantizer is codec
    * specific. Consult the documentation for the codec to determine the
-   * values to use. To determine the range programmatically, call
-   * vpx_codec_enc_config_default() with a usage value of 0.
+   * values to use.
    */
   unsigned int rc_min_quantizer;
 
@@ -501,8 +490,7 @@ typedef struct vpx_codec_enc_cfg {
    * The quantizer is the most direct control over the quality of the
    * encoded image. The range of valid values for the quantizer is codec
    * specific. Consult the documentation for the codec to determine the
-   * values to use. To determine the range programmatically, call
-   * vpx_codec_enc_config_default() with a usage value of 0.
+   * values to use.
    */
   unsigned int rc_max_quantizer;
 
@@ -512,25 +500,31 @@ typedef struct vpx_codec_enc_cfg {
 
   /*!\brief Rate control adaptation undershoot control
    *
-   * This value, expressed as a percentage of the target bitrate,
+   * VP8: Expressed as a percentage of the target bitrate,
    * controls the maximum allowed adaptation speed of the codec.
    * This factor controls the maximum amount of bits that can
    * be subtracted from the target bitrate in order to compensate
    * for prior overshoot.
-   *
-   * Valid values in the range 0-1000.
+   * VP9: Expressed as a percentage of the target bitrate, a threshold
+   * undershoot level (current rate vs target) beyond which more aggressive
+   * corrective measures are taken.
+   *   *
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_undershoot_pct;
 
   /*!\brief Rate control adaptation overshoot control
    *
-   * This value, expressed as a percentage of the target bitrate,
+   * VP8: Expressed as a percentage of the target bitrate,
    * controls the maximum allowed adaptation speed of the codec.
    * This factor controls the maximum amount of bits that can
    * be added to the target bitrate in order to compensate for
    * prior undershoot.
+   * VP9: Expressed as a percentage of the target bitrate, a threshold
+   * overshoot level (current rate vs target) beyond which more aggressive
+   * corrective measures are taken.
    *
-   * Valid values in the range 0-1000.
+   * Valid values in the range VP8:0-100 VP9: 0-100.
    */
   unsigned int rc_overshoot_pct;
 
@@ -595,6 +589,13 @@ typedef struct vpx_codec_enc_cfg {
    */
   unsigned int rc_2pass_vbr_maxsection_pct;
 
+  /*!\brief Two-pass corpus vbr mode complexity control
+   * Used only in VP9: A value representing the corpus midpoint complexity
+   * for corpus vbr mode. This value defaults to 0 which disables corpus vbr
+   * mode in favour of normal vbr mode.
+   */
+  unsigned int rc_2pass_vbr_corpus_complexity;
+
   /*
    * keyframing settings (kf)
    */
@@ -645,7 +646,7 @@ typedef struct vpx_codec_enc_cfg {
   /*!\brief Target bitrate for each spatial layer.
    *
    * These values specify the target coding bitrate to be used for each
-   * spatial layer.
+   * spatial layer. (in kbps)
    */
   unsigned int ss_target_bitrate[VPX_SS_MAX_LAYERS];
 
@@ -658,7 +659,7 @@ typedef struct vpx_codec_enc_cfg {
   /*!\brief Target bitrate for each temporal layer.
    *
    * These values specify the target coding bitrate to be used for each
-   * temporal layer.
+   * temporal layer. (in kbps)
    */
   unsigned int ts_target_bitrate[VPX_TS_MAX_LAYERS];
 
@@ -675,7 +676,7 @@ typedef struct vpx_codec_enc_cfg {
    * membership of frames to temporal layers. For example, if the
    * ts_periodicity = 8, then the frames are assigned to coding layers with a
    * repeated sequence of length 8.
-  */
+   */
   unsigned int ts_periodicity;
 
   /*!\brief Template defining the membership of frames to temporal layers.
@@ -684,13 +685,13 @@ typedef struct vpx_codec_enc_cfg {
    * For a 2-layer encoding that assigns even numbered frames to one temporal
    * layer (0) and odd numbered frames to a second temporal layer (1) with
    * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1).
-  */
+   */
   unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY];
 
   /*!\brief Target bitrate for each spatial/temporal layer.
    *
    * These values specify the target coding bitrate to be used for each
-   * spatial/temporal layer.
+   * spatial/temporal layer. (in kbps)
    *
    */
   unsigned int layer_target_bitrate[VPX_MAX_LAYERS];
@@ -703,6 +704,151 @@ typedef struct vpx_codec_enc_cfg {
    *
    */
   int temporal_layering_mode;
+
+  /*!\brief A flag indicating whether to use external rate control parameters.
+   * By default is 0. If set to 1, the following parameters will be used in the
+   * rate control system.
+   */
+  int use_vizier_rc_params;
+
+  /*!\brief Active worst quality factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t active_wq_factor;
+
+  /*!\brief Error per macroblock adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t err_per_mb_factor;
+
+  /*!\brief Second reference default decay limit.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t sr_default_decay_limit;
+
+  /*!\brief Second reference difference factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t sr_diff_factor;
+
+  /*!\brief Keyframe error per macroblock adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_err_per_mb_factor;
+
+  /*!\brief Keyframe minimum boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_min_boost_factor;
+
+  /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe
+   * in a chunk.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_max_boost_first_factor;
+
+  /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_frame_max_boost_subs_factor;
+
+  /*!\brief Keyframe maximum total boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t kf_max_total_boost_factor;
+
+  /*!\brief Golden frame maximum total boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t gf_max_total_boost_factor;
+
+  /*!\brief Golden frame maximum boost adjustment factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t gf_frame_max_boost_factor;
+
+  /*!\brief Zero motion power factor.
+   *
+   * Rate control parameters, set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t zm_factor;
+
+  /*!\brief Rate-distortion multiplier for inter frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_inter_qp_fac;
+
+  /*!\brief Rate-distortion multiplier for alt-ref frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_arf_qp_fac;
+
+  /*!\brief Rate-distortion multiplier for key frames.
+   * The multiplier is a crucial parameter in the calculation of rate distortion
+   * cost. It is often related to the qp (qindex) value.
+   * Rate control parameters, could be set from external experiment results.
+   * Only when |use_vizier_rc_params| is set to 1, the pass in value will be
+   * used. Otherwise, the default value is used.
+   *
+   */
+  vpx_rational_t rd_mult_key_qp_fac;
 } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
 
 /*!\brief  vp9 svc extra configure parameters
@@ -717,11 +863,12 @@ typedef struct vpx_svc_parameters {
   int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
   int speed_per_layer[VPX_MAX_LAYERS];    /**< Speed setting for each sl */
   int temporal_layering_mode;             /**< Temporal layering mode */
+  int loopfilter_ctrl[VPX_MAX_LAYERS];    /**< Loopfilter ctrl for each sl */
 } vpx_svc_extra_cfg_t;
 
 /*!\brief Initialize an encoder instance
  *
- * Initializes a encoder context using the given interface. Applications
+ * Initializes an encoder context using the given interface. Applications
  * should call the vpx_codec_enc_init convenience macro instead of this
  * function directly, to ensure that the ABI version number parameter
  * is properly initialized.
@@ -730,9 +877,12 @@ typedef struct vpx_svc_parameters {
  * is not thread safe and should be guarded with a lock if being used
  * in a multithreaded context.
  *
+ * If vpx_codec_enc_init_ver() fails, it is not necessary to call
+ * vpx_codec_destroy() on the encoder context.
+ *
  * \param[in]    ctx     Pointer to this instance's context.
  * \param[in]    iface   Pointer to the algorithm interface to use.
- * \param[in]    cfg     Configuration to use, if known. May be NULL.
+ * \param[in]    cfg     Configuration to use.
  * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
  * \param[in]    ver     ABI version number. Must be set to
  *                       VPX_ENCODER_ABI_VERSION
@@ -755,27 +905,32 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
 
 /*!\brief Initialize multi-encoder instance
  *
- * Initializes multi-encoder context using the given interface.
+ * Initializes multiple encoder contexts using the given interface.
  * Applications should call the vpx_codec_enc_init_multi convenience macro
  * instead of this function directly, to ensure that the ABI version number
  * parameter is properly initialized.
  *
- * \param[in]    ctx     Pointer to this instance's context.
+ * \param[in]    ctx     Pointer to an array of num_enc instances' contexts.
  * \param[in]    iface   Pointer to the algorithm interface to use.
- * \param[in]    cfg     Configuration to use, if known. May be NULL.
+ * \param[in]    cfg     An array of num_enc configurations to use.
  * \param[in]    num_enc Total number of encoders.
  * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
- * \param[in]    dsf     Pointer to down-sampling factors.
+ * \param[in]    dsf     Pointer to an array of num_enc down-sampling factors.
  * \param[in]    ver     ABI version number. Must be set to
  *                       VPX_ENCODER_ABI_VERSION
  * \retval #VPX_CODEC_OK
- *     The decoder algorithm initialized.
+ *     The encoder algorithm has been initialized.
  * \retval #VPX_CODEC_MEM_ERROR
  *     Memory allocation failed.
+ *
+ * \note
+ * This is only supported by VP8. iface must point to the interface to the VP8
+ * encoder.
  */
 vpx_codec_err_t vpx_codec_enc_init_multi_ver(
-    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg,
-    int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver);
+    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface,
+    const vpx_codec_enc_cfg_t *cfg, int num_enc, vpx_codec_flags_t flags,
+    const vpx_rational_t *dsf, int ver);
 
 /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
  *
@@ -795,7 +950,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
  *
  * \param[in]    iface     Pointer to the algorithm interface to use.
  * \param[out]   cfg       Configuration buffer to populate.
- * \param[in]    reserved  Must set to 0 for VP8 and VP9.
+ * \param[in]    usage     Must be set to 0.
  *
  * \retval #VPX_CODEC_OK
  *     The configuration was populated.
@@ -806,7 +961,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
  */
 vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
                                              vpx_codec_enc_cfg_t *cfg,
-                                             unsigned int reserved);
+                                             unsigned int usage);
 
 /*!\brief Set or change configuration
  *
@@ -829,21 +984,36 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx,
  *
  * Retrieves a stream level global header packet, if supported by the codec.
  *
+ * \li VP8: Unsupported
+ * \li VP9: Returns a buffer of <tt>ID (1 byte)|Length (1 byte)|Length
+ * bytes</tt> values. The function should be called after encoding to retrieve
+ * the most accurate information.
+ *
  * \param[in]    ctx     Pointer to this instance's context
  *
  * \retval NULL
  *     Encoder does not support global header
  * \retval Non-NULL
- *     Pointer to buffer containing global header packet
+ *     Pointer to buffer containing global header packet. The buffer pointer
+ *     and its contents are only valid for the lifetime of \a ctx. The contents
+ *     may change in subsequent calls to the function.
+ * \sa
+ * https://www.webmproject.org/docs/container/#vp9-codec-feature-metadata-codecprivate
  */
 vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
 
+/*!\brief Encode Deadline
+ *
+ * This type indicates a deadline, in microseconds, to be passed to
+ * vpx_codec_encode().
+ */
+typedef unsigned long vpx_enc_deadline_t;
 /*!\brief deadline parameter analogous to VPx REALTIME mode. */
-#define VPX_DL_REALTIME (1)
+#define VPX_DL_REALTIME 1ul
 /*!\brief deadline parameter analogous to  VPx GOOD QUALITY mode. */
-#define VPX_DL_GOOD_QUALITY (1000000)
+#define VPX_DL_GOOD_QUALITY 1000000ul
 /*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */
-#define VPX_DL_BEST_QUALITY (0)
+#define VPX_DL_BEST_QUALITY 0ul
 /*!\brief Encode a frame
  *
  * Encodes a video frame at the given "presentation time." The presentation
@@ -855,7 +1025,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
  * implicit that limiting the available time to encode will degrade the
  * output quality. The encoder can be given an unlimited time to produce the
  * best possible frame by specifying a deadline of '0'. This deadline
- * supercedes the VPx notion of "best quality, good quality, realtime".
+ * supersedes the VPx notion of "best quality, good quality, realtime".
  * Applications that wish to map these former settings to the new deadline
  * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY,
  * and #VPX_DL_BEST_QUALITY.
@@ -868,6 +1038,8 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
  *
  * \param[in]    ctx       Pointer to this instance's context
  * \param[in]    img       Image data to encode, NULL to flush.
+ *                         Encoding sample values outside the range
+ *                         [0..(1<<img->bit_depth)-1] is undefined behavior.
  * \param[in]    pts       Presentation time stamp, in timebase units.
  * \param[in]    duration  Duration to show frame, in timebase units.
  * \param[in]    flags     Flags to use for encoding this frame.
@@ -883,7 +1055,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
 vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
                                  vpx_codec_pts_t pts, unsigned long duration,
                                  vpx_enc_frame_flags_t flags,
-                                 unsigned long deadline);
+                                 vpx_enc_deadline_t deadline);
 
 /*!\brief Set compressed data output buffer
  *
@@ -927,6 +1099,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
  *     The buffer was set successfully.
  * \retval #VPX_CODEC_INVALID_PARAM
  *     A parameter was NULL, the image format is unsupported, etc.
+ *
+ * \note
+ * `duration` and `deadline` are of the unsigned long type, which can be 32
+ * or 64 bits. `duration` and `deadline` must be less than or equal to
+ * UINT32_MAX so that their ranges are independent of the size of unsigned
+ * long.
  */
 vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
                                           const vpx_fixed_buf_t *buf,
@@ -977,4 +1155,4 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx);
 #ifdef __cplusplus
 }
 #endif
-#endif  // VPX_VPX_ENCODER_H_
+#endif  // VPX_VPX_VPX_ENCODER_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
new file mode 100644
index 0000000000..1c502f8101
--- /dev/null
+++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h
@@ -0,0 +1,605 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\file
+ * \brief Defines structs and callbacks needed for external rate control.
+ *
+ */
+#ifndef VPX_VPX_VPX_EXT_RATECTRL_H_
+#define VPX_VPX_VPX_EXT_RATECTRL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "./vpx_integer.h"
+#include "./vpx_tpl.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures.
+ */
+#define VPX_EXT_RATECTRL_ABI_VERSION (7 + VPX_TPL_ABI_VERSION)
+
+/*!\brief Corresponds to MAX_STATIC_GF_GROUP_LENGTH defined in vp9_ratectrl.h
+ */
+#define VPX_RC_MAX_STATIC_GF_GROUP_LENGTH 250
+
+/*!\brief Max number of ref frames returned by the external RC.
+ *
+ * Corresponds to MAX_REF_FRAMES defined in vp9_blockd.h.
+ */
+#define VPX_RC_MAX_REF_FRAMES 4
+
+/*!\brief The type of the external rate control.
+ *
+ * This controls what encoder parameters are determined by the external rate
+ * control.
+ */
+typedef enum vpx_rc_type {
+  /*!
+   * The external rate control doesn't determine anything.
+   * This mode is used as baseline.
+   */
+  VPX_RC_NONE = 0,
+  /*!
+   * The external rate control model determines the quantization parameter (QP)
+   * for each frame.
+   */
+  VPX_RC_QP = 1 << 0,
+  /*!
+   * The external rate control model determines the group of picture (GOP) of
+   * the video sequence.
+   */
+  VPX_RC_GOP = 1 << 1,
+  /*!
+   * The external rate control model determines the rate-distortion multiplier
+   * (rdmult) for the current frame.
+   */
+  VPX_RC_RDMULT = 1 << 2,
+  /*!
+   * The external rate control model determines both QP and GOP.
+   */
+  VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP,
+  /*!
+   * The external rate control model determines the QP, GOP and the rdmult.
+   */
+  VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT
+} vpx_rc_type_t;
+
+/*!\brief The rate control mode for the external rate control model.
+ */
+typedef enum vpx_ext_rc_mode {
+  VPX_RC_QMODE = 0,
+  VPX_RC_VBR = 1,
+  VPX_RC_CQ = 2,
+} vpx_ext_rc_mode_t;
+
+/*!\brief Corresponds to FRAME_UPDATE_TYPE defined in vp9_firstpass.h.
+ */
+typedef enum vpx_rc_frame_update_type {
+  VPX_RC_INVALID_UPDATE_TYPE = -1,
+  VPX_RC_KF_UPDATE = 0,
+  VPX_RC_LF_UPDATE = 1,
+  VPX_RC_GF_UPDATE = 2,
+  VPX_RC_ARF_UPDATE = 3,
+  VPX_RC_OVERLAY_UPDATE = 4,
+  VPX_RC_MID_OVERLAY_UPDATE = 5,
+  VPX_RC_USE_BUF_FRAME = 6,
+} vpx_rc_frame_update_type_t;
+
+/*!\brief Name for the ref frames returned by the external RC.
+ *
+ * Corresponds to the ref frames defined in vp9_blockd.h.
+ */
+typedef enum vpx_rc_ref_name {
+  VPX_RC_INVALID_REF_FRAME = -1,
+  VPX_RC_INTRA_FRAME = 0,
+  VPX_RC_LAST_FRAME = 1,
+  VPX_RC_GOLDEN_FRAME = 2,
+  VPX_RC_ALTREF_FRAME = 3,
+} vpx_rc_ref_name_t;
+
+/*!\brief Abstract rate control model handler
+ *
+ * The encoder will receive the model handler from
+ * vpx_rc_funcs_t::create_model().
+ */
+typedef void *vpx_rc_model_t;
+
+/*!\brief A reserved value for the q index.
+ * If the external rate control model returns this value,
+ * the encoder will use the default q selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_Q -1
+
+/*!\brief A reserved value for the rdmult.
+ * If the external rate control model returns this value,
+ * the encoder will use the default rdmult selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_RDMULT -1
+
+/*!\brief Superblock quantization parameters
+ * Store the superblock quantization parameters
+ */
+typedef struct sb_parameters {
+  int q_index; /**< Quantizer step index [0..255]*/
+  int rdmult;  /**< Superblock level Lagrangian multiplier*/
+} sb_params;
+
+/*!\brief Encode frame decision made by the external rate control model
+ *
+ * The encoder will receive the decision from the external rate control model
+ * through vpx_rc_funcs_t::get_encodeframe_decision().
+ */
+typedef struct vpx_rc_encodeframe_decision {
+  int q_index;    /**< Required: Quantizer step index [0..255]*/
+  int rdmult;     /**< Required: Frame level Lagrangian multiplier*/
+  int delta_q_uv; /**< Required: Delta QP for UV */
+  /*!
+   * Optional: Superblock quantization parameters
+   * It is zero initialized by default. It will be set for key and ARF frames
+   * but not leaf frames.
+   */
+  sb_params *sb_params_list;
+} vpx_rc_encodeframe_decision_t;
+
+/*!\brief Information for the frame to be encoded.
+ *
+ * The encoder will send the information to external rate control model through
+ * vpx_rc_funcs_t::get_encodeframe_decision().
+ *
+ */
+typedef struct vpx_rc_encodeframe_info {
+  /*!
+   * 0: Key frame
+   * 1: Inter frame
+   * 2: Alternate reference frame
+   * 3: Overlay frame
+   * 4: Golden frame
+   */
+  int frame_type;
+  int show_index;   /**< display index, starts from zero*/
+  int coding_index; /**< coding index, starts from zero*/
+  /*!
+   * index of the current frame in this group of picture, starts from zero.
+   */
+  int gop_index;
+  int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/
+  /*!
+   * The validity of the three reference frames.
+   * 0: Invalid
+   * 1: Valid
+   */
+  int ref_frame_valid_list[3];
+  /*!
+   * The length of the current GOP.
+   */
+  int gop_size;
+  /*!
+   * Whether the current GOP uses an alt ref.
+   */
+  int use_alt_ref;
+} vpx_rc_encodeframe_info_t;
+
+/*!\brief Frame coding result
+ *
+ * The encoder will send the result to the external rate control model through
+ * vpx_rc_funcs_t::update_encodeframe_result().
+ */
+typedef struct vpx_rc_encodeframe_result {
+  int64_t bit_count;          /**< number of bits spent on coding the frame*/
+  int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/
+} vpx_rc_encodeframe_result_t;
+
+/*!\brief Status returned by rate control callback functions.
+ */
+typedef enum vpx_rc_status {
+  VPX_RC_OK = 0,
+  VPX_RC_ERROR = 1,
+} vpx_rc_status_t;
+
+/*!\brief First pass frame stats
+ * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is
+ * omitted
+ */
+typedef struct vpx_rc_frame_stats {
+  /*!
+   * Frame number in display order, if stats are for a single frame.
+   * No real meaning for a collection of frames.
+   */
+  double frame;
+  /*!
+   * Weight assigned to this frame (or total weight for the collection of
+   * frames) currently based on intra factor and brightness factor. This is used
+   * to distribute bits between easier and harder frames.
+   */
+  double weight;
+  /*!
+   * Intra prediction error.
+   */
+  double intra_error;
+  /*!
+   * Best of intra pred error and inter pred error using last frame as ref.
+   */
+  double coded_error;
+  /*!
+   * Best of intra pred error and inter pred error using golden frame as ref.
+   */
+  double sr_coded_error;
+  /*!
+   * Estimate the noise energy of the current frame.
+   */
+  double frame_noise_energy;
+  /*!
+   * Percentage of blocks with inter pred error < intra pred error.
+   */
+  double pcnt_inter;
+  /*!
+   * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+   */
+  double pcnt_motion;
+  /*!
+   * Percentage of blocks where golden frame was better than last or intra:
+   * inter pred error using golden frame < inter pred error using last frame and
+   * inter pred error using golden frame < intra pred error
+   */
+  double pcnt_second_ref;
+  /*!
+   * Percentage of blocks where intra and inter prediction errors were very
+   * close.
+   */
+  double pcnt_neutral;
+  /*!
+   * Percentage of blocks that have intra error < inter error and inter error <
+   * LOW_I_THRESH
+   * - bit_depth 8: LOW_I_THRESH = 24000
+   * - bit_depth 10: LOW_I_THRESH = 24000 << 4
+   * - bit_depth 12: LOW_I_THRESH = 24000 << 8
+   */
+  double pcnt_intra_low;
+  /*!
+   * Percentage of blocks that have intra error < inter error and intra error <
+   * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH
+   * - bit_depth 8: LOW_I_THRESH = 24000
+   * - bit_depth 10: LOW_I_THRESH = 24000 << 4
+   * - bit_depth 12: LOW_I_THRESH = 24000 << 8
+   */
+  double pcnt_intra_high;
+  /*!
+   * Percentage of blocks that have almost no intra error residual
+   * (i.e. are in effect completely flat and untextured in the intra
+   * domain). In natural videos this is uncommon, but it is much more
+   * common in animations, graphics and screen content, so may be used
+   * as a signal to detect these types of content.
+   */
+  double intra_skip_pct;
+  /*!
+   * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH
+   * - bit_depth 8:  SMOOTH_INTRA_THRESH = 4000
+   * - bit_depth 10: SMOOTH_INTRA_THRESH = 4000 << 4
+   * - bit_depth 12: SMOOTH_INTRA_THRESH = 4000 << 8
+   */
+  double intra_smooth_pct;
+  /*!
+   * Image mask rows top and bottom.
+   */
+  double inactive_zone_rows;
+  /*!
+   * Image mask columns at left and right edges.
+   */
+  double inactive_zone_cols;
+  /*!
+   * Mean of row motion vectors.
+   */
+  double MVr;
+  /*!
+   * Mean of absolute value of row motion vectors.
+   */
+  double mvr_abs;
+  /*!
+   * Mean of column motion vectors.
+   */
+  double MVc;
+  /*!
+   * Mean of absolute value of column motion vectors.
+   */
+  double mvc_abs;
+  /*!
+   * Variance of row motion vectors.
+   */
+  double MVrv;
+  /*!
+   * Variance of column motion vectors.
+   */
+  double MVcv;
+  /*!
+   * Value in range [-1,1] indicating fraction of row and column motion vectors
+   * that point inwards (negative MV value) or outwards (positive MV value).
+   * For example, value of 1 indicates, all row/column MVs are inwards.
+   */
+  double mv_in_out_count;
+  /*!
+   * Duration of the frame / collection of frames.
+   */
+  double duration;
+  /*!
+   * 1.0 if stats are for a single frame, or
+   * number of frames whose stats are accumulated.
+   */
+  double count;
+  /*!
+   * Number of new mv in a frame.
+   */
+  double new_mv_count;
+} vpx_rc_frame_stats_t;
+
+/*!\brief Collection of first pass frame stats
+ */
+typedef struct vpx_rc_firstpass_stats {
+  /*!
+   * Pointer to first pass frame stats.
+   * The pointed array of vpx_rc_frame_stats_t should have length equal to
+   * number of show frames in the video.
+   */
+  vpx_rc_frame_stats_t *frame_stats;
+  /*!
+   * Number of show frames in the video.
+   */
+  int num_frames;
+} vpx_rc_firstpass_stats_t;
+
+/*!\brief Encode config sent to external rate control model
+ */
+typedef struct vpx_rc_config {
+  int frame_width;      /**< frame width */
+  int frame_height;     /**< frame height */
+  int show_frame_count; /**< number of visible frames in the video */
+  int max_gf_interval;  /**< max GOP size in number of show frames */
+  int min_gf_interval;  /**< min GOP size in number of show frames */
+  /*!
+   * Target bitrate in kilobytes per second
+   */
+  int target_bitrate_kbps;
+  int frame_rate_num; /**< numerator of frame rate */
+  int frame_rate_den; /**< denominator of frame rate */
+  /*!
+   * The following fields are only for external rate control models that support
+   * different rate control modes.
+   */
+  vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */
+  int overshoot_percent;     /**< for VBR mode only */
+  int undershoot_percent;    /**< for VBR mode only */
+  int min_base_q_index;      /**< for VBR mode only */
+  int max_base_q_index;      /**< for VBR mode only */
+  int base_qp;               /**< base QP for leaf frames, 0-255 */
+} vpx_rc_config_t;
+
+/*!\brief Control what ref frame to use and its index.
+ */
+typedef struct vpx_rc_ref_frame {
+  /*!
+   * Ref frame index. Corresponding to |lst_fb_idx|, |gld_fb_idx| or
+   * |alt_fb_idx| in VP9_COMP depending on the ref frame #name.
+   */
+  int index[VPX_RC_MAX_REF_FRAMES];
+  /*!
+   * Ref frame name. This decides whether the #index is used as
+   * |lst_fb_idx|, |gld_fb_idx| or |alt_fb_idx| in VP9_COMP.
+   *
+   */
+  vpx_rc_ref_name_t name[VPX_RC_MAX_REF_FRAMES];
+} vpx_rc_ref_frame_t;
+
+/*!\brief The decision made by the external rate control model to set the
+ * group of picture.
+ */
+typedef struct vpx_rc_gop_decision {
+  int gop_coding_frames; /**< The number of frames of this GOP */
+  int use_alt_ref;       /**< Whether to use alt ref for this GOP */
+  int use_key_frame;     /**< Whether to set key frame for this GOP */
+  /*!
+   * Frame type for each frame in this GOP.
+   * This will be populated to |update_type| in GF_GROUP defined in
+   * vp9_firstpass.h
+   */
+  vpx_rc_frame_update_type_t update_type[VPX_RC_MAX_STATIC_GF_GROUP_LENGTH + 2];
+  /*! Ref frame buffer index to be updated for each frame in this GOP. */
+  int update_ref_index[VPX_RC_MAX_STATIC_GF_GROUP_LENGTH + 2];
+  /*! Ref frame list to be used for each frame in this GOP. */
+  vpx_rc_ref_frame_t ref_frame_list[VPX_RC_MAX_STATIC_GF_GROUP_LENGTH + 2];
+} vpx_rc_gop_decision_t;
+
+/*!\brief The decision made by the external rate control model to set the
+ * key frame location and the show frame count in the key frame group
+ */
+typedef struct vpx_rc_key_frame_decision {
+  int key_frame_show_index; /**< This key frame's show index in the video */
+  int key_frame_group_size; /**< Show frame count of this key frame group */
+} vpx_rc_key_frame_decision_t;
+
+/*!\brief Create an external rate control model callback prototype
+ *
+ * This callback is invoked by the encoder to create an external rate control
+ * model.
+ *
+ * \param[in]  priv                Callback's private data
+ * \param[in]  ratectrl_config     Pointer to vpx_rc_config_t
+ * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t
+ */
+typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)(
+    void *priv, const vpx_rc_config_t *ratectrl_config,
+    vpx_rc_model_t *rate_ctrl_model_ptr);
+
+/*!\brief Send first pass stats to the external rate control model callback
+ * prototype
+ *
+ * This callback is invoked by the encoder to send first pass stats to the
+ * external rate control model.
+ *
+ * \param[in]  rate_ctrl_model    rate control model
+ * \param[in]  first_pass_stats   first pass stats
+ */
+typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_firstpass_stats_t *first_pass_stats);
+
+/*!\brief Send TPL stats for the current GOP to the external rate control model
+ * callback prototype
+ *
+ * This callback is invoked by the encoder to send TPL stats for the GOP to the
+ * external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  tpl_gop_stats    TPL stats for current GOP
+ */
+typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Receive encode frame decision callback prototype
+ *
+ * This callback is invoked by the encoder to receive encode frame decision from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model    rate control model
+ * \param[in]  frame_gop_index    index of the frame in current gop
+ * \param[out] frame_decision     encode decision of the coding frame
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const int frame_gop_index,
+    vpx_rc_encodeframe_decision_t *frame_decision);
+
+/*!\brief Update encode frame result callback prototype
+ *
+ * This callback is invoked by the encoder to update encode frame result to the
+ * external rate control model.
+ *
+ * \param[in]  rate_ctrl_model     rate control model
+ * \param[out] encode_frame_result encode result of the coding frame
+ */
+typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model,
+    const vpx_rc_encodeframe_result_t *encode_frame_result);
+
+/*!\brief Get the key frame decision from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get key frame decision from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model    rate control model
+ * \param[out] key_frame_decision key frame decision from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_key_frame_decision_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model,
+    vpx_rc_key_frame_decision_t *key_frame_decision);
+
+/*!\brief Get the GOP structure from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get GOP decisions from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[out] gop_decision     GOP decision from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision);
+
+/*!\brief Get the frame rdmult from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get rdmult from
+ * the external rate control model.
+ *
+ * \param[in]  rate_ctrl_model  rate control model
+ * \param[in]  frame_info       information collected from the encoder
+ * \param[out] rdmult           frame rate-distortion multiplier from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info,
+    int *rdmult);
+
+/*!\brief Delete the external rate control model callback prototype
+ *
+ * This callback is invoked by the encoder to delete the external rate control
+ * model.
+ *
+ * \param[in]  rate_ctrl_model     rate control model
+ */
+typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)(
+    vpx_rc_model_t rate_ctrl_model);
+
+/*!\brief Callback function set for external rate control.
+ *
+ * The user can enable external rate control by registering
+ * a set of callback functions with the codec control flag
+ * #VP9E_SET_EXTERNAL_RATE_CONTROL.
+ */
+typedef struct vpx_rc_funcs {
+  /*!
+   * The rate control type of this API.
+   */
+  vpx_rc_type_t rc_type;
+  /*!
+   * Create an external rate control model.
+   */
+  vpx_rc_create_model_cb_fn_t create_model;
+  /*!
+   * Send first pass stats to the external rate control model.
+   */
+  vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats;
+  /*!
+   * Send TPL stats for current GOP to the external rate control model.
+   */
+  vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats;
+  /*!
+   * Get encodeframe decision from the external rate control model.
+   */
+  vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision;
+  /*!
+   * Update encodeframe result to the external rate control model.
+   */
+  vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result;
+  /*!
+   * Get key frame decision from the external rate control model.
+   */
+  vpx_rc_get_key_frame_decision_cb_fn_t get_key_frame_decision;
+  /*!
+   * Get GOP decisions from the external rate control model.
+   */
+  vpx_rc_get_gop_decision_cb_fn_t get_gop_decision;
+  /*!
+   * Get rdmult for the frame from the external rate control model.
+   */
+  vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult;
+  /*!
+   * Delete the external rate control model.
+   */
+  vpx_rc_delete_model_cb_fn_t delete_model;
+
+  /*!
+   * Rate control log path.
+   */
+  const char *rate_ctrl_log_path;
+  /*!
+   * Private data for the external rate control model.
+   */
+  void *priv;
+} vpx_rc_funcs_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_VPX_EXT_RATECTRL_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_frame_buffer.h b/media/libvpx/libvpx/vpx/vpx_frame_buffer.h
index ad70cdd572..fc8320017b 100644
--- a/media/libvpx/libvpx/vpx/vpx_frame_buffer.h
+++ b/media/libvpx/libvpx/vpx/vpx_frame_buffer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_VPX_FRAME_BUFFER_H_
-#define VPX_VPX_FRAME_BUFFER_H_
+#ifndef VPX_VPX_VPX_FRAME_BUFFER_H_
+#define VPX_VPX_VPX_FRAME_BUFFER_H_
 
 /*!\file
  * \brief Describes the decoder external frame buffer interface.
@@ -52,12 +52,12 @@ typedef struct vpx_codec_frame_buffer {
  * data. The callback is triggered when the decoder needs a frame buffer to
  * decode a compressed image into. This function may be called more than once
  * for every call to vpx_codec_decode. The application may set fb->priv to
- * some data which will be passed back in the ximage and the release function
- * call. |fb| is guaranteed to not be NULL. On success the callback must
- * return 0. Any failure the callback must return a value less than 0.
+ * some data which will be passed back in the vpx_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. Any failure the callback must return a value less than 0.
  *
  * \param[in] priv         Callback's private data
- * \param[in] new_size     Size in bytes needed by the buffer
+ * \param[in] min_size     Size in bytes needed by the buffer
  * \param[in,out] fb       Pointer to vpx_codec_frame_buffer_t
  */
 typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
@@ -80,4 +80,4 @@ typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv,
 }  // extern "C"
 #endif
 
-#endif  // VPX_VPX_FRAME_BUFFER_H_
+#endif  // VPX_VPX_VPX_FRAME_BUFFER_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_image.h b/media/libvpx/libvpx/vpx/vpx_image.h
index d6d3166d2f..cca1c42f00 100644
--- a/media/libvpx/libvpx/vpx/vpx_image.h
+++ b/media/libvpx/libvpx/vpx/vpx_image.h
@@ -12,8 +12,8 @@
  * \brief Describes the vpx image descriptor and associated operations
  *
  */
-#ifndef VPX_VPX_IMAGE_H_
-#define VPX_VPX_IMAGE_H_
+#ifndef VPX_VPX_VPX_IMAGE_H_
+#define VPX_VPX_VPX_IMAGE_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,7 +27,7 @@ extern "C" {
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures
  */
-#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (5) /**<\hideinitializer*/
 
 #define VPX_IMG_FMT_PLANAR 0x100       /**< Image is a planar format. */
 #define VPX_IMG_FMT_UV_FLIP 0x200      /**< V plane precedes U in memory. */
@@ -37,29 +37,13 @@ extern "C" {
 /*!\brief List of supported image formats */
 typedef enum vpx_img_fmt {
   VPX_IMG_FMT_NONE,
-  VPX_IMG_FMT_RGB24,     /**< 24 bit per pixel packed RGB */
-  VPX_IMG_FMT_RGB32,     /**< 32 bit per pixel packed 0RGB */
-  VPX_IMG_FMT_RGB565,    /**< 16 bit per pixel, 565 */
-  VPX_IMG_FMT_RGB555,    /**< 16 bit per pixel, 555 */
-  VPX_IMG_FMT_UYVY,      /**< UYVY packed YUV */
-  VPX_IMG_FMT_YUY2,      /**< YUYV packed YUV */
-  VPX_IMG_FMT_YVYU,      /**< YVYU packed YUV */
-  VPX_IMG_FMT_BGR24,     /**< 24 bit per pixel packed BGR */
-  VPX_IMG_FMT_RGB32_LE,  /**< 32 bit packed BGR0 */
-  VPX_IMG_FMT_ARGB,      /**< 32 bit packed ARGB, alpha=255 */
-  VPX_IMG_FMT_ARGB_LE,   /**< 32 bit packed BGRA, alpha=255 */
-  VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */
-  VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
   VPX_IMG_FMT_YV12 =
       VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
   VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2,
-  VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP |
-                        3, /** < planar 4:2:0 format with vpx color space */
-  VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
   VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5,
   VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6,
   VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7,
-  VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6,
+  VPX_IMG_FMT_NV12 = VPX_IMG_FMT_PLANAR | 9,
   VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
   VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
   VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH,
@@ -80,8 +64,12 @@ typedef enum vpx_color_space {
 
 /*!\brief List of supported color range */
 typedef enum vpx_color_range {
-  VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
-  VPX_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
+  VPX_CR_STUDIO_RANGE = 0, /**<- Y  [16..235],  UV  [16..240]  (bit depth 8) */
+                           /**<- Y  [64..940],  UV  [64..960]  (bit depth 10) */
+                           /**<- Y [256..3760], UV [256..3840] (bit depth 12) */
+  VPX_CR_FULL_RANGE = 1    /**<- YUV/RGB [0..255]  (bit depth 8) */
+                           /**<- YUV/RGB [0..1023] (bit depth 10) */
+                           /**<- YUV/RGB [0..4095] (bit depth 12) */
 } vpx_color_range_t;       /**< alias for enum vpx_color_range */
 
 /**\brief Image Descriptor */
@@ -142,16 +130,19 @@ typedef struct vpx_image_rect {
 /*!\brief Open a descriptor, allocating storage for the underlying image
  *
  * Returns a descriptor for storing an image of the given format. The
- * storage for the descriptor is allocated on the heap.
+ * storage for the image is allocated on the heap.
  *
  * \param[in]    img       Pointer to storage for descriptor. If this parameter
  *                         is NULL, the storage for the descriptor will be
  *                         allocated on the heap.
  * \param[in]    fmt       Format for the image
- * \param[in]    d_w       Width of the image
- * \param[in]    d_h       Height of the image
+ * \param[in]    d_w       Width of the image. Must not exceed 0x08000000
+ *                         (2^27).
+ * \param[in]    d_h       Height of the image. Must not exceed 0x08000000
+ *                         (2^27).
  * \param[in]    align     Alignment, in bytes, of the image buffer and
- *                         each row in the image(stride).
+ *                         each row in the image (stride). Must not exceed
+ *                         65536.
  *
  * \return Returns a pointer to the initialized image descriptor. If the img
  *         parameter is non-null, the value of the img parameter will be
@@ -164,30 +155,65 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt,
 /*!\brief Open a descriptor, using existing storage for the underlying image
  *
  * Returns a descriptor for storing an image of the given format. The
- * storage for descriptor has been allocated elsewhere, and a descriptor is
+ * storage for the image has been allocated elsewhere, and a descriptor is
  * desired to "wrap" that storage.
  *
- * \param[in]    img       Pointer to storage for descriptor. If this parameter
- *                         is NULL, the storage for the descriptor will be
- *                         allocated on the heap.
- * \param[in]    fmt       Format for the image
- * \param[in]    d_w       Width of the image
- * \param[in]    d_h       Height of the image
- * \param[in]    align     Alignment, in bytes, of each row in the image.
- * \param[in]    img_data  Storage to use for the image
+ * \param[in]    img           Pointer to storage for descriptor. If this
+ *                             parameter is NULL, the storage for the descriptor
+ *                             will be allocated on the heap.
+ * \param[in]    fmt           Format for the image
+ * \param[in]    d_w           Width of the image. Must not exceed 0x08000000
+ *                             (2^27).
+ * \param[in]    d_h           Height of the image. Must not exceed 0x08000000
+ *                             (2^27).
+ * \param[in]    stride_align  Alignment, in bytes, of each row in the image
+ *                             (stride). Must not exceed 65536.
+ * \param[in]    img_data      Storage to use for the image. The storage must
+ *                             outlive the returned image descriptor; it can be
+ *                             disposed of after calling vpx_img_free().
  *
  * \return Returns a pointer to the initialized image descriptor. If the img
  *         parameter is non-null, the value of the img parameter will be
  *         returned.
+ *
+ * \note \a img_data is required to have a minimum allocation size that
+ *       satisfies the requirements of the \a fmt, \a d_w, \a d_h and \a
+ *       stride_align parameters. This size can be calculated as follows (see
+ *       \c img_alloc_helper in the vpx_image.c file in the libvpx source tree
+ *       for more detail):
+ * \code
+ * align = (1 << x_chroma_shift) - 1;
+ * w = (d_w + align) & ~align;
+ * align = (1 << y_chroma_shift) - 1;
+ * h = (d_h + align) & ~align;
+ *
+ * s = (fmt & VPX_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / 8;
+ * s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+ * s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1);
+ * s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s / 2 : s;
+ * alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? (uint64_t)h * s * bps / 8
+ *                                         : (uint64_t)h * s;
+ * \endcode
+ * \a x_chroma_shift, \a y_chroma_shift and \a bps can be obtained by calling
+ * \ref vpx_img_wrap with a non-\c NULL \a img_data parameter. The \c
+ * vpx_image_t pointer should \em not be used in other API calls until \em after
+ * a successful call to \ref vpx_img_wrap with a valid image buffer. For
+ * example:
+ * \code
+ * vpx_img_wrap(img, fmt, d_w, d_h, stride_align, (unsigned char *)1);
+ * ... calculate buffer size and allocate buffer as described earlier
+ * vpx_img_wrap(img, fmt, d_w, d_h, stride_align, img_data);
+ * \endcode
  */
 vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
-                          unsigned int d_h, unsigned int align,
+                          unsigned int d_h, unsigned int stride_align,
                           unsigned char *img_data);
 
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
- * match the specified coordinates and size.
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
  *
  * \param[in]    img       Image descriptor
  * \param[in]    x         leftmost column
@@ -195,7 +221,7 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
  * \param[in]    w         width
  * \param[in]    h         height
  *
- * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
  */
 int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h);
@@ -221,4 +247,4 @@ void vpx_img_free(vpx_image_t *img);
 }  // extern "C"
 #endif
 
-#endif  // VPX_VPX_IMAGE_H_
+#endif  // VPX_VPX_VPX_IMAGE_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_integer.h b/media/libvpx/libvpx/vpx/vpx_integer.h
index 0c27142ff9..34e3796411 100644
--- a/media/libvpx/libvpx/vpx/vpx_integer.h
+++ b/media/libvpx/libvpx/vpx/vpx_integer.h
@@ -8,39 +8,22 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_VPX_INTEGER_H_
-#define VPX_VPX_INTEGER_H_
+#ifndef VPX_VPX_VPX_INTEGER_H_
+#define VPX_VPX_VPX_INTEGER_H_
 
 /* get ptrdiff_t, size_t, wchar_t, NULL */
-#include <stddef.h>
+#include <stddef.h>  // IWYU pragma: export
 
 #if defined(_MSC_VER)
 #define VPX_FORCE_INLINE __forceinline
 #define VPX_INLINE __inline
 #else
-#define VPX_FORCE_INLINE __inline__ __attribute__(always_inline)
+#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline))
 // TODO(jbb): Allow a way to force inline off for older compilers.
 #define VPX_INLINE inline
 #endif
 
-#if !defined(VPX_DONT_DEFINE_STDINT_TYPES)
-
-#if defined(VPX_EMULATE_INTTYPES)
-typedef signed char int8_t;
-typedef signed short int16_t;
-typedef signed int int32_t;
-
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
-#ifndef _UINTPTR_T_DEFINED
-typedef size_t uintptr_t;
-#endif
-
-#else
-
-/* Most platforms have the C99 standard integer types. */
+/* Assume platforms have the C99 standard integer types. */
 
 #if defined(__cplusplus)
 #if !defined(__STDC_FORMAT_MACROS)
@@ -51,17 +34,7 @@ typedef size_t uintptr_t;
 #endif
 #endif  // __cplusplus
 
-#include <stdint.h>
+#include <inttypes.h>  // IWYU pragma: export
+#include <stdint.h>    // IWYU pragma: export
 
-#endif
-
-#endif // VPX_DONT_DEFINE_STDINT_TYPES
-
-/* VS2010 defines stdint.h, but not inttypes.h */
-#if defined(_MSC_VER) && _MSC_VER < 1800
-#define PRId64 "I64d"
-#else
-#include <inttypes.h>
-#endif
-
-#endif  // VPX_VPX_INTEGER_H_
+#endif  // VPX_VPX_VPX_INTEGER_H_
diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h
new file mode 100644
index 0000000000..ecee5d1efe
--- /dev/null
+++ b/media/libvpx/libvpx/vpx/vpx_tpl.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\file
+ * \brief Describes the TPL stats descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_VPX_TPL_H_
+#define VPX_VPX_VPX_TPL_H_
+
+#include "./vpx_integer.h"
+#include "./vpx_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped.  Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_TPL_ABI_VERSION 5 /**<\hideinitializer*/
+
+/*!\brief Temporal dependency model stats for each block before propagation */
+typedef struct VpxTplBlockStats {
+  int16_t row;            /**< Pixel row of the top left corner */
+  int16_t col;            /**< Pixel col of the top left corner */
+  int64_t intra_cost;     /**< Intra cost */
+  int64_t inter_cost;     /**< Inter cost */
+  int16_t mv_r;           /**< Motion vector row in pixel */
+  int16_t mv_c;           /**< Motion vector col in pixel */
+  int64_t srcrf_rate;     /**< Rate from source ref frame */
+  int64_t srcrf_dist;     /**< Distortion from source ref frame */
+  int64_t pred_error;     /**< Prediction error */
+  int64_t inter_pred_err; /**< Inter prediction error */
+  int64_t intra_pred_err; /**< Intra prediction error */
+  int ref_frame_index;    /**< Ref frame index in the ref frame buffer */
+} VpxTplBlockStats;
+
+/*!\brief Temporal dependency model stats for each frame before propagation */
+typedef struct VpxTplFrameStats {
+  int frame_width;  /**< Frame width */
+  int frame_height; /**< Frame height */
+  int num_blocks;   /**< Number of blocks. Size of block_stats_list */
+  VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
+} VpxTplFrameStats;
+
+/*!\brief Temporal dependency model stats for each GOP before propagation */
+typedef struct VpxTplGopStats {
+  int size; /**< GOP size, also the size of frame_stats_list. */
+  VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */
+} VpxTplGopStats;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_VPX_TPL_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/add_noise.c b/media/libvpx/libvpx/vpx_dsp/add_noise.c
index a2b4c9010f..6839e97928 100644
--- a/media/libvpx/libvpx/vpx_dsp/add_noise.c
+++ b/media/libvpx/libvpx/vpx_dsp/add_noise.c
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/postproc.h"
 #include "vpx_ports/mem.h"
 
 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
@@ -51,6 +52,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
     const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
     if (a_i) {
       for (j = 0; j < a_i; ++j) {
+        if (next + j >= 256) goto set_noise;
         char_dist[next + j] = (int8_t)i;
       }
       next = next + j;
@@ -62,6 +64,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
     char_dist[next] = 0;
   }
 
+set_noise:
   for (i = 0; i < size; ++i) {
     noise[i] = char_dist[rand() & 0xff];  // NOLINT
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
index 001517d33e..1b17a326b4 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c
@@ -15,145 +15,124 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
 
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
+uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+  return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4;
 }
 
-unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
-  uint16x8_t v_sum;
-  uint32x2_t v_s0 = vdup_n_u32(0);
-  uint32x2_t v_s1 = vdup_n_u32(0);
-  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
-  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
-  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
-  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
-  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
-  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
-}
+uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  int i;
+  uint8x8_t b, c;
+  uint16x8_t sum;
+  b = vld1_u8(a);
+  a += a_stride;
+  c = vld1_u8(a);
+  a += a_stride;
+  sum = vaddl_u8(b, c);
 
-unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
-  uint8x8_t v_s0 = vld1_u8(s);
-  const uint8x8_t v_s1 = vld1_u8(s + p);
-  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+  for (i = 0; i < 6; ++i) {
+    const uint8x8_t d = vld1_u8(a);
+    a += a_stride;
+    sum = vaddw_u8(sum, d);
+  }
 
-  v_s0 = vld1_u8(s + 2 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 3 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 4 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 5 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 6 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 7 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
 }
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_neon(const int16_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
+// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+int vpx_satd_neon(const tran_low_t *coeff, int length) {
+  int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
 
   do {
-    const int16x8_t src0 = vld1q_s16(coeff);
-    const int16x8_t src8 = vld1q_s16(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    int16x8_t abs0, abs1;
+    const int16x8_t s0 = load_tran_low_to_s16q(coeff);
+    const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8);
+
+    abs0 = vabsq_s16(s0);
+    sum_s32[0] = vpadalq_s16(sum_s32[0], abs0);
+    abs1 = vabsq_s16(s1);
+    sum_s32[1] = vpadalq_s16(sum_s32[1], abs1);
+
     length -= 16;
     coeff += 16;
   } while (length != 0);
 
-  {
-    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int satd = vget_lane_s32(s1, 0);
-    return satd;
-  }
+  return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
 }
 
 void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+  uint8x16_t r0, r1, r2, r3;
+  uint16x8_t sum_lo[2], sum_hi[2];
+  uint16x8_t tmp_lo[2], tmp_hi[2];
+  int16x8_t avg_lo, avg_hi;
 
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+  const int norm_factor = (height >> 5) + 3;
+  const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+  assert(height >= 4 && height % 4 == 0);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+  r0 = vld1q_u8(ref + 0 * ref_stride);
+  r1 = vld1q_u8(ref + 1 * ref_stride);
+  r2 = vld1q_u8(ref + 2 * ref_stride);
+  r3 = vld1q_u8(ref + 3 * ref_stride);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+  sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+  sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+  sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+  sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+  ref += 4 * ref_stride;
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+  for (i = 4; i < height; i += 4) {
+    r0 = vld1q_u8(ref + 0 * ref_stride);
+    r1 = vld1q_u8(ref + 1 * ref_stride);
+    r2 = vld1q_u8(ref + 2 * ref_stride);
+    r3 = vld1q_u8(ref + 3 * ref_stride);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+    tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+    tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+    tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+    tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+    sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]);
+    sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]);
+    sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]);
+    sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]);
 
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
+    ref += 4 * ref_stride;
   }
 
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+  sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+  sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
 
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+  avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+  avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+  vst1q_s16(hbuf, avg_lo);
+  vst1q_s16(hbuf + 8, avg_hi);
 }
 
 int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  uint16x8_t sum;
   int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
 
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
+  assert(width >= 16 && width % 16 == 0);
+
+  sum = vpaddlq_u8(vld1q_u8(ref));
+  for (i = 16; i < width; i += 16) {
+    sum = vpadalq_u8(sum, vld1q_u8(ref + i));
   }
 
-  return horizontal_add_u16x8(vec_sum);
+  return (int16_t)horizontal_add_uint16x8(sum);
 }
 
 // ref, src = [0, 510] - max diff = 16-bits
@@ -183,7 +162,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
 
   {
     // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
+    // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired
     // with the summation of 'sse' performed better on a Cortex-A15.
     const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
     const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
@@ -232,11 +211,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
   const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
 
-  // Split to D and start doing pairwise.
+#if VPX_ARCH_AARCH64
+  *min = *max = 0;  // Clear high bits
+  *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+  *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
   uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
   uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
 
-  // Enough runs of vpmax/min propogate the max/min values to every position.
+  // Enough runs of vpmax/min propagate the max/min values to every position.
   ab_max = vpmax_u8(ab_max, ab_max);
   ab_min = vpmin_u8(ab_min, ab_min);
 
@@ -250,4 +234,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   // Store directly to avoid costly neon->gpr transfer.
   vst1_lane_u8((uint8_t *)max, ab_max, 0);
   vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
new file mode 100644
index 0000000000..5afdece0ab
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  if (width > 8) {
+    int x, y = height;
+    do {
+      for (x = 0; x < width; x += 16) {
+        const uint8x16_t p = vld1q_u8(pred + x);
+        const uint8x16_t r = vld1q_u8(ref + x);
+        const uint8x16_t avg = vrhaddq_u8(p, r);
+        vst1q_u8(comp + x, avg);
+      }
+      comp += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--y);
+  } else if (width == 8) {
+    int i = width * height;
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      uint8x16_t r;
+      const uint8x8_t r_0 = vld1_u8(ref);
+      const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+      r = vcombine_u8(r_0, r_1);
+      ref += 2 * ref_stride;
+      r = vrhaddq_u8(r, p);
+      vst1q_u8(comp, r);
+
+      pred += 16;
+      comp += 16;
+      i -= 16;
+    } while (i);
+  } else {
+    int i = width * height;
+    assert(width == 4);
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      uint8x16_t r;
+
+      r = load_unaligned_u8q(ref, ref_stride);
+      ref += 4 * ref_stride;
+      r = vrhaddq_u8(r, p);
+      vst1q_u8(comp, r);
+
+      pred += 16;
+      comp += 16;
+      i -= 16;
+    } while (i);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
index ed1a4df25c..7efce32735 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c
@@ -15,6 +15,8 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 
+extern const int16_t vpx_rv[];
+
 static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
                                const uint8x8_t v0, const uint8x8_t b1,
                                const uint8x8_t b2) {
@@ -89,11 +91,6 @@ void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
   int row;
   int col;
 
-  // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for
-  // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8
-  // (for U/V).
-  assert((size == 8 || size == 16) && cols % 8 == 0);
-
   // While columns of length 16 can be processed, load them.
   for (col = 0; col < cols - 8; col += 16) {
     uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
@@ -384,3 +381,100 @@ void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
     src += pitch;
   }
 }
+
+// Apply filter of (vpx_rv + sum + s[c]) >> 4.
+static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
+                                  const int16x8_t rv) {
+  const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s));
+  const int16x8_t sum_s = vaddq_s16(sum, s16);
+  const int16x8_t rounded = vaddq_s16(sum_s, rv);
+
+  return vqshrun_n_s16(rounded, 4);
+}
+
+void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
+                               int flimit) {
+  int row, col, i;
+  const int32x4_t f = vdupq_n_s32(flimit);
+  uint8x8_t below_context = vdup_n_u8(0);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  // Load and keep the first 8 values in memory. Process a vertical stripe that
+  // is 8 wide.
+  for (col = 0; col < cols; col += 8) {
+    uint8x8_t s, above_context[8];
+    int16x8_t sum, sum_tmp;
+    int32x4_t sumsq_low, sumsq_high;
+
+    // Load and extend the top border.
+    s = vld1_u8(dst);
+    for (i = 0; i < 8; i++) {
+      above_context[i] = s;
+    }
+
+    sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s));
+
+    // sum * 9
+    sum = vmulq_n_s16(sum_tmp, 9);
+
+    // (sum * 9) * sum == sum * sum * 9
+    sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp));
+    sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp));
+
+    // Load and discard the next 6 values to prime sum and sumsq.
+    for (i = 1; i <= 6; ++i) {
+      const uint8x8_t a = vld1_u8(dst + i * pitch);
+      const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a));
+      sum = vaddq_s16(sum, b);
+
+      sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b));
+      sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b));
+    }
+
+    for (row = 0; row < rows; ++row) {
+      uint8x8_t mask, output;
+      int16x8_t x, y;
+      int32x4_t xy_low, xy_high;
+
+      s = vld1_u8(dst + row * pitch);
+
+      // Extend the bottom border.
+      if (row + 7 < rows) {
+        below_context = vld1_u8(dst + (row + 7) * pitch);
+      }
+
+      x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0]));
+      y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0]));
+      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
+      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));
+
+      sum = vaddq_s16(sum, x);
+
+      sumsq_low = vaddq_s32(sumsq_low, xy_low);
+      sumsq_high = vaddq_s32(sumsq_high, xy_high);
+
+      mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
+                          sumsq_high, f);
+
+      output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));
+      output = vbsl_u8(mask, output, s);
+
+      vst1_u8(dst + row * pitch, output);
+
+      above_context[0] = above_context[1];
+      above_context[1] = above_context[2];
+      above_context[2] = above_context[3];
+      above_context[3] = above_context[4];
+      above_context[4] = above_context[5];
+      above_context[5] = above_context[6];
+      above_context[6] = above_context[7];
+      above_context[7] = s;
+    }
+
+    dst += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
new file mode 100644
index 0000000000..fde71ff30d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -0,0 +1,439 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
+
+// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
+// functions.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
+
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
+  vpx_fdct16x16_c(input, output, stride);
+}
+
+#else
+
+// Main body of fdct16x16.
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+                              int16x8_t *out /*[16]*/) {
+  int16x8_t s[8];
+  int16x8_t x[4];
+  int16x8_t step[8];
+
+  // stage 1
+  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+  // even_results);"
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[8]);
+  // out[4]  = fdct_round_shift(x3 * cospi_8_64  + x2 * cospi_24_64);
+  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
+
+  //  Stage 2
+  // Re-using source s5/s6
+  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+  //  Stage 3
+  x[0] = vaddq_s16(s[4], s[5]);
+  x[1] = vsubq_s16(s[4], s[5]);
+  x[2] = vsubq_s16(s[7], s[6]);
+  x[3] = vaddq_s16(s[7], s[6]);
+
+  // Stage 4
+  // out[2]  = fdct_round_shift(x3 * cospi_4_64  + x0 * cospi_28_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+  // out[6]  = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
+
+  // step 2
+  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1" but the only
+  // difference is that "in_high" is the first 8 values and "step 1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+  butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+  butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+  // step 3
+  s[0] = vaddq_s16(in[8], s[3]);
+  s[1] = vaddq_s16(in[9], s[2]);
+  x[0] = vsubq_s16(in[9], s[2]);
+  x[1] = vsubq_s16(in[8], s[3]);
+  x[2] = vsubq_s16(in[15], s[4]);
+  x[3] = vsubq_s16(in[14], s[5]);
+  s[6] = vaddq_s16(in[14], s[5]);
+  s[7] = vaddq_s16(in[15], s[4]);
+
+  // step 4
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64  + step3[1] *
+  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+  // * cospi_8_64)
+  butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
+
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64  - step3[5] *
+  // cospi_24_64)
+  butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
+
+  // step 5
+  step[0] = vaddq_s16(s[0], s[1]);
+  step[1] = vsubq_s16(s[0], s[1]);
+  step[2] = vaddq_s16(x[1], s[2]);
+  step[3] = vsubq_s16(x[1], s[2]);
+  step[4] = vsubq_s16(x[2], s[5]);
+  step[5] = vaddq_s16(x[2], s[5]);
+  step[6] = vsubq_s16(s[7], s[6]);
+  step[7] = vaddq_s16(s[7], s[6]);
+
+  // step 6
+  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+  butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
+                      &out[7]);
+  // out[1]  = fdct_round_shift(step1[7] * cospi_2_64  + step1[0] * cospi_30_64)
+  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+  butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
+                      &out[15]);
+
+  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+  // out[3]  = fdct_round_shift(step1[4] * cospi_6_64  - step1[3] * cospi_26_64)
+  butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
+                      &out[3]);
+
+  // out[5]  = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+  butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
+                      &out[11]);
+}
+
+void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x8_t temp0[16];
+  int16x8_t temp1[16];
+  int16x8_t temp2[16];
+  int16x8_t temp3[16];
+
+  // Left half.
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp1);
+  vpx_fdct8x16_body(temp1, temp0);
+
+  // Right half.
+  load_cross(input + 8, stride, temp1);
+  scale_input(temp1, temp2);
+  vpx_fdct8x16_body(temp2, temp1);
+
+  // Transpose top left and top right quarters into one contiguous location to
+  // process to the top half.
+
+  transpose_s16_8x8q(&temp0[0], &temp2[0]);
+  transpose_s16_8x8q(&temp1[0], &temp2[8]);
+  partial_round_shift(temp2);
+  cross_input(temp2, temp3);
+  vpx_fdct8x16_body(temp3, temp2);
+  transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
+                    &temp2[5], &temp2[6], &temp2[7]);
+  transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
+                    &temp2[13], &temp2[14], &temp2[15]);
+  store(output, temp2);
+  store(output + 8, temp2 + 8);
+  output += 8 * 16;
+
+  // Transpose bottom left and bottom right quarters into one contiguous
+  // location to process to the bottom half.
+  transpose_s16_8x8q(&temp0[8], &temp1[0]);
+
+  transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
+                    &temp1[13], &temp1[14], &temp1[15]);
+  partial_round_shift(temp1);
+  cross_input(temp1, temp0);
+  vpx_fdct8x16_body(temp0, temp1);
+  transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
+                    &temp1[5], &temp1[6], &temp1[7]);
+  transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
+                    &temp1[13], &temp1[14], &temp1[15]);
+  store(output, temp1);
+  store(output + 8, temp1 + 8);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Main body of fdct8x16 column
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+                                     int32x4_t *right /* [16] */) {
+  int32x4_t sl[8];
+  int32x4_t sr[8];
+  int32x4_t xl[4];
+  int32x4_t xr[4];
+  int32x4_t inl[8];
+  int32x4_t inr[8];
+  int32x4_t stepl[8];
+  int32x4_t stepr[8];
+
+  // stage 1
+  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+  // even_results);"
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // Copy values 8-15 as we're storing in-place
+  inl[0] = left[8];
+  inr[0] = right[8];
+  inl[1] = left[9];
+  inr[1] = right[9];
+  inl[2] = left[10];
+  inr[2] = right[10];
+  inl[3] = left[11];
+  inr[3] = right[11];
+  inl[4] = left[12];
+  inr[4] = right[12];
+  inl[5] = left[13];
+  inr[5] = right[13];
+  inl[6] = left[14];
+  inr[6] = right[14];
+  inl[7] = left[15];
+  inr[7] = right[15];
+
+  // fdct4(step, step);
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[8], &right[8]);
+
+  // out[4]  = fdct_round_shift(x3 * cospi_8_64  + x2 * cospi_24_64);
+  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[4], &right[4],
+                                     &left[12], &right[12]);
+
+  //  Stage 2
+  // Re-using source s5/s6
+  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+                               &sr[6], &sl[5], &sr[5]);
+
+  //  Stage 3
+  xl[0] = vaddq_s32(sl[4], sl[5]);
+  xr[0] = vaddq_s32(sr[4], sr[5]);
+  xl[1] = vsubq_s32(sl[4], sl[5]);
+  xr[1] = vsubq_s32(sr[4], sr[5]);
+  xl[2] = vsubq_s32(sl[7], sl[6]);
+  xr[2] = vsubq_s32(sr[7], sr[6]);
+  xl[3] = vaddq_s32(sl[7], sl[6]);
+  xr[3] = vaddq_s32(sr[7], sr[6]);
+
+  // Stage 4
+  // out[2]  = fdct_round_shift(x3 * cospi_4_64  + x0 * cospi_28_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[2], &right[2],
+                                     &left[14], &right[14]);
+  // out[6]  = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[10], &right[10],
+                                     &left[6], &right[6]);
+
+  // step 2
+  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1" but the only
+  // difference is that "in_high" is the first 8 values and "step 1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+                               &sl[5], &sr[5], &sl[2], &sr[2]);
+  butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+                               &sl[4], &sr[4], &sl[3], &sr[3]);
+
+  // step 3
+  sl[0] = vaddq_s32(inl[0], sl[3]);
+  sr[0] = vaddq_s32(inr[0], sr[3]);
+  sl[1] = vaddq_s32(inl[1], sl[2]);
+  sr[1] = vaddq_s32(inr[1], sr[2]);
+  xl[0] = vsubq_s32(inl[1], sl[2]);
+  xr[0] = vsubq_s32(inr[1], sr[2]);
+  xl[1] = vsubq_s32(inl[0], sl[3]);
+  xr[1] = vsubq_s32(inr[0], sr[3]);
+  xl[2] = vsubq_s32(inl[7], sl[4]);
+  xr[2] = vsubq_s32(inr[7], sr[4]);
+  xl[3] = vsubq_s32(inl[6], sl[5]);
+  xr[3] = vsubq_s32(inr[6], sr[5]);
+  sl[6] = vaddq_s32(inl[6], sl[5]);
+  sr[6] = vaddq_s32(inr[6], sr[5]);
+  sl[7] = vaddq_s32(inl[7], sl[4]);
+  sr[7] = vaddq_s32(inr[7], sr[4]);
+
+  // step 4
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64  + step3[1] *
+  // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1]
+  // * cospi_8_64)
+  butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+                                     cospi_24_64, &sl[6], &sr[6], &sl[1],
+                                     &sr[1]);
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64  - step3[5] *
+  // cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+                                     cospi_8_64, &sl[2], &sr[2], &sl[5],
+                                     &sr[5]);
+
+  // step 5
+  stepl[0] = vaddq_s32(sl[0], sl[1]);
+  stepr[0] = vaddq_s32(sr[0], sr[1]);
+  stepl[1] = vsubq_s32(sl[0], sl[1]);
+  stepr[1] = vsubq_s32(sr[0], sr[1]);
+  stepl[2] = vaddq_s32(xl[1], sl[2]);
+  stepr[2] = vaddq_s32(xr[1], sr[2]);
+  stepl[3] = vsubq_s32(xl[1], sl[2]);
+  stepr[3] = vsubq_s32(xr[1], sr[2]);
+  stepl[4] = vsubq_s32(xl[2], sl[5]);
+  stepr[4] = vsubq_s32(xr[2], sr[5]);
+  stepl[5] = vaddq_s32(xl[2], sl[5]);
+  stepr[5] = vaddq_s32(xr[2], sr[5]);
+  stepl[6] = vsubq_s32(sl[7], sl[6]);
+  stepr[6] = vsubq_s32(sr[7], sr[6]);
+  stepl[7] = vaddq_s32(sl[7], sl[6]);
+  stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+  // step 6
+  // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+  // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+                                     cospi_18_64, cospi_14_64, &left[9],
+                                     &right[9], &left[7], &right[7]);
+  // out[1]  = fdct_round_shift(step1[7] * cospi_2_64  + step1[0] * cospi_30_64)
+  // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+                                     cospi_2_64, cospi_30_64, &left[1],
+                                     &right[1], &left[15], &right[15]);
+  // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+  // out[3]  = fdct_round_shift(step1[4] * cospi_6_64  - step1[3] * cospi_26_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+                                     cospi_26_64, cospi_6_64, &left[13],
+                                     &right[13], &left[3], &right[3]);
+  // out[5]  = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+  // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+  butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+                                     cospi_10_64, cospi_22_64, &left[5],
+                                     &right[5], &left[11], &right[11]);
+}
+
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
+                               int stride) {
+  int16x8_t temp0[16];
+  int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
+      right3[16], right4[16];
+
+  // Left half.
+  load_cross(input, stride, temp0);
+  highbd_scale_input(temp0, left1, right1);
+  vpx_highbd_fdct8x16_body(left1, right1);
+
+  // right half.
+  load_cross(input + 8, stride, temp0);
+  highbd_scale_input(temp0, left2, right2);
+  vpx_highbd_fdct8x16_body(left2, right2);
+
+  // Transpose top left and top right quarters into one contiguous location to
+  // process to the top half.
+
+  transpose_s32_8x8_2(left1, right1, left3, right3);
+  transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+
+  highbd_partial_round_shift(left3, right3);
+  highbd_cross_input(left3, right3, left1, right1);
+  vpx_highbd_fdct8x16_body(left1, right1);
+
+  // Transpose bottom left and bottom right quarters into one contiguous
+  // location to process to the bottom half.
+
+  highbd_partial_round_shift(left4, right4);
+  highbd_cross_input(left4, right4, left2, right2);
+  vpx_highbd_fdct8x16_body(left2, right2);
+
+  transpose_s32_8x8_2(left1, right1, left3, right3);
+  transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+  store16_s32(output, left3);
+  output += 4;
+  store16_s32(output, right3);
+  output += 4;
+
+  store16_s32(output, left4);
+  output += 4;
+  store16_s32(output, right4);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+        // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 0000000000..cd58675ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,318 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+
+#include <arm_neon.h>
+
+#include "fdct_neon.h"
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+  b[0] = vld1q_s16(a);
+  a += stride;
+  b[1] = vld1q_s16(a);
+  a += stride;
+  b[2] = vld1q_s16(a);
+  a += stride;
+  b[3] = vld1q_s16(a);
+  a += stride;
+  b[4] = vld1q_s16(a);
+  a += stride;
+  b[5] = vld1q_s16(a);
+  a += stride;
+  b[6] = vld1q_s16(a);
+  a += stride;
+  b[7] = vld1q_s16(a);
+  a += stride;
+  b[8] = vld1q_s16(a);
+  a += stride;
+  b[9] = vld1q_s16(a);
+  a += stride;
+  b[10] = vld1q_s16(a);
+  a += stride;
+  b[11] = vld1q_s16(a);
+  a += stride;
+  b[12] = vld1q_s16(a);
+  a += stride;
+  b[13] = vld1q_s16(a);
+  a += stride;
+  b[14] = vld1q_s16(a);
+  a += stride;
+  b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+  store_s16q_to_tran_low(a, b[0]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[1]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[2]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[3]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[4]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[5]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[6]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void scale_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/) {
+  b[0] = vshlq_n_s16(a[0], 2);
+  b[1] = vshlq_n_s16(a[1], 2);
+  b[2] = vshlq_n_s16(a[2], 2);
+  b[3] = vshlq_n_s16(a[3], 2);
+  b[4] = vshlq_n_s16(a[4], 2);
+  b[5] = vshlq_n_s16(a[5], 2);
+  b[6] = vshlq_n_s16(a[6], 2);
+  b[7] = vshlq_n_s16(a[7], 2);
+
+  b[8] = vshlq_n_s16(a[8], 2);
+  b[9] = vshlq_n_s16(a[9], 2);
+  b[10] = vshlq_n_s16(a[10], 2);
+  b[11] = vshlq_n_s16(a[11], 2);
+  b[12] = vshlq_n_s16(a[12], 2);
+  b[13] = vshlq_n_s16(a[13], 2);
+  b[14] = vshlq_n_s16(a[14], 2);
+  b[15] = vshlq_n_s16(a[15], 2);
+}
+
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/) {
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+}
+
+static INLINE void load_cross(const int16_t *a, int stride,
+                              int16x8_t *b /*[16]*/) {
+  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+
+  b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+  b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+  b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+  b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+  b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+  b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+  b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+  b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+  const int16x8_t one = vdupq_n_s16(1);
+  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
+                                      int32x4_t *left /*[16]*/,
+                                      int32x4_t *right /* [16] */) {
+  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
+                                      int32x4_t *a_right /*[16]*/,
+                                      int32x4_t *b_left /*[16]*/,
+                                      int32x4_t *b_right /*[16]*/) {
+  b_left[0] = vaddq_s32(a_left[0], a_left[15]);
+  b_left[1] = vaddq_s32(a_left[1], a_left[14]);
+  b_left[2] = vaddq_s32(a_left[2], a_left[13]);
+  b_left[3] = vaddq_s32(a_left[3], a_left[12]);
+  b_left[4] = vaddq_s32(a_left[4], a_left[11]);
+  b_left[5] = vaddq_s32(a_left[5], a_left[10]);
+  b_left[6] = vaddq_s32(a_left[6], a_left[9]);
+  b_left[7] = vaddq_s32(a_left[7], a_left[8]);
+
+  b_right[0] = vaddq_s32(a_right[0], a_right[15]);
+  b_right[1] = vaddq_s32(a_right[1], a_right[14]);
+  b_right[2] = vaddq_s32(a_right[2], a_right[13]);
+  b_right[3] = vaddq_s32(a_right[3], a_right[12]);
+  b_right[4] = vaddq_s32(a_right[4], a_right[11]);
+  b_right[5] = vaddq_s32(a_right[5], a_right[10]);
+  b_right[6] = vaddq_s32(a_right[6], a_right[9]);
+  b_right[7] = vaddq_s32(a_right[7], a_right[8]);
+
+  b_left[8] = vsubq_s32(a_left[7], a_left[8]);
+  b_left[9] = vsubq_s32(a_left[6], a_left[9]);
+  b_left[10] = vsubq_s32(a_left[5], a_left[10]);
+  b_left[11] = vsubq_s32(a_left[4], a_left[11]);
+  b_left[12] = vsubq_s32(a_left[3], a_left[12]);
+  b_left[13] = vsubq_s32(a_left[2], a_left[13]);
+  b_left[14] = vsubq_s32(a_left[1], a_left[14]);
+  b_left[15] = vsubq_s32(a_left[0], a_left[15]);
+
+  b_right[8] = vsubq_s32(a_right[7], a_right[8]);
+  b_right[9] = vsubq_s32(a_right[6], a_right[9]);
+  b_right[10] = vsubq_s32(a_right[5], a_right[10]);
+  b_right[11] = vsubq_s32(a_right[4], a_right[11]);
+  b_right[12] = vsubq_s32(a_right[3], a_right[12]);
+  b_right[13] = vsubq_s32(a_right[2], a_right[13]);
+  b_right[14] = vsubq_s32(a_right[1], a_right[14]);
+  b_right[15] = vsubq_s32(a_right[0], a_right[15]);
+}
+
+static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
+                                              int32x4_t *right /* [16] */) {
+  const int32x4_t one = vdupq_n_s32(1);
+  left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
+  left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
+  left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
+  left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
+  left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
+  left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
+  left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
+  left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
+  left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
+  left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
+  left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
+  left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
+  left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
+  left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
+  left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
+  left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+  right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+  right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+  right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+  right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+  right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+  right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+  right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+  right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+  right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+  right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+  right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+  right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+  right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+  right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+  right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+  right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
+}
+
+// Store 16 32x4 vectors, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) {
+  vst1q_s32(a, b[0]);
+  a += 16;
+  vst1q_s32(a, b[1]);
+  a += 16;
+  vst1q_s32(a, b[2]);
+  a += 16;
+  vst1q_s32(a, b[3]);
+  a += 16;
+  vst1q_s32(a, b[4]);
+  a += 16;
+  vst1q_s32(a, b[5]);
+  a += 16;
+  vst1q_s32(a, b[6]);
+  a += 16;
+  vst1q_s32(a, b[7]);
+  a += 16;
+  vst1q_s32(a, b[8]);
+  a += 16;
+  vst1q_s32(a, b[9]);
+  a += 16;
+  vst1q_s32(a, b[10]);
+  a += 16;
+  vst1q_s32(a, b[11]);
+  a += 16;
+  vst1q_s32(a, b[12]);
+  a += 16;
+  vst1q_s32(a, b[13]);
+  a += 16;
+  vst1q_s32(a, b[14]);
+  a += 16;
+  vst1q_s32(a, b[15]);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
new file mode 100644
index 0000000000..a91730ce8b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -0,0 +1,419 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
+
+// Most gcc 4.9 distributions outside of Android do not generate correct code
+// for this function.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+    __GNUC__ == 4 && __GNUC_MINOR__ <= 9
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+  vpx_fdct32x32_c(input, output, stride);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+                           int stride) {
+  vpx_fdct32x32_rd_c(input, output, stride);
+}
+
+#else
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x8_t temp0[32];
+  int16x8_t temp1[32];
+  int16x8_t temp2[32];
+  int16x8_t temp3[32];
+  int16x8_t temp4[32];
+  int16x8_t temp5[32];
+
+  // Process in 8x32 columns.
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp1);
+
+  load_cross(input + 8, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp2);
+
+  load_cross(input + 16, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp3);
+
+  load_cross(input + 24, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp4);
+
+  // Generate the top row by munging the first set of 8 from each one together.
+  transpose_s16_8x8q(&temp1[0], &temp0[0]);
+  transpose_s16_8x8q(&temp2[0], &temp0[8]);
+  transpose_s16_8x8q(&temp3[0], &temp0[16]);
+  transpose_s16_8x8q(&temp4[0], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output, temp5);
+
+  // Second row of 8x32.
+  transpose_s16_8x8q(&temp1[8], &temp0[0]);
+  transpose_s16_8x8q(&temp2[8], &temp0[8]);
+  transpose_s16_8x8q(&temp3[8], &temp0[16]);
+  transpose_s16_8x8q(&temp4[8], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 8 * 32, temp5);
+
+  // Third row of 8x32
+  transpose_s16_8x8q(&temp1[16], &temp0[0]);
+  transpose_s16_8x8q(&temp2[16], &temp0[8]);
+  transpose_s16_8x8q(&temp3[16], &temp0[16]);
+  transpose_s16_8x8q(&temp4[16], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 16 * 32, temp5);
+
+  // Final row of 8x32.
+  transpose_s16_8x8q(&temp1[24], &temp0[0]);
+  transpose_s16_8x8q(&temp2[24], &temp0[8]);
+  transpose_s16_8x8q(&temp3[24], &temp0[16]);
+  transpose_s16_8x8q(&temp4[24], &temp0[24]);
+
+  dct_body_second_pass(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 24 * 32, temp5);
+}
+
+void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+                           int stride) {
+  int16x8_t temp0[32];
+  int16x8_t temp1[32];
+  int16x8_t temp2[32];
+  int16x8_t temp3[32];
+  int16x8_t temp4[32];
+  int16x8_t temp5[32];
+
+  // Process in 8x32 columns.
+  load_cross(input, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp1);
+
+  load_cross(input + 8, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp2);
+
+  load_cross(input + 16, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp3);
+
+  load_cross(input + 24, stride, temp0);
+  scale_input(temp0, temp5);
+  dct_body_first_pass(temp5, temp4);
+
+  // Generate the top row by munging the first set of 8 from each one together.
+  transpose_s16_8x8q(&temp1[0], &temp0[0]);
+  transpose_s16_8x8q(&temp2[0], &temp0[8]);
+  transpose_s16_8x8q(&temp3[0], &temp0[16]);
+  transpose_s16_8x8q(&temp4[0], &temp0[24]);
+
+  dct_body_second_pass_rd(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output, temp5);
+
+  // Second row of 8x32.
+  transpose_s16_8x8q(&temp1[8], &temp0[0]);
+  transpose_s16_8x8q(&temp2[8], &temp0[8]);
+  transpose_s16_8x8q(&temp3[8], &temp0[16]);
+  transpose_s16_8x8q(&temp4[8], &temp0[24]);
+
+  dct_body_second_pass_rd(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 8 * 32, temp5);
+
+  // Third row of 8x32
+  transpose_s16_8x8q(&temp1[16], &temp0[0]);
+  transpose_s16_8x8q(&temp2[16], &temp0[8]);
+  transpose_s16_8x8q(&temp3[16], &temp0[16]);
+  transpose_s16_8x8q(&temp4[16], &temp0[24]);
+
+  dct_body_second_pass_rd(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 16 * 32, temp5);
+
+  // Final row of 8x32.
+  transpose_s16_8x8q(&temp1[24], &temp0[0]);
+  transpose_s16_8x8q(&temp2[24], &temp0[8]);
+  transpose_s16_8x8q(&temp3[24], &temp0[16]);
+  transpose_s16_8x8q(&temp4[24], &temp0[24]);
+
+  dct_body_second_pass_rd(temp0, temp5);
+
+  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+                    &temp5[5], &temp5[6], &temp5[7]);
+  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+                    &temp5[13], &temp5[14], &temp5[15]);
+  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+                    &temp5[21], &temp5[22], &temp5[23]);
+  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+                    &temp5[29], &temp5[30], &temp5[31]);
+  store(output + 24 * 32, temp5);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+                               int stride) {
+  int16x8_t temp0[32];
+  int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+      right3[32], right4[32];
+  int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+      left8[32], right8[32];
+  int32x4_t temp1[32], temp2[32];
+
+  // Process in 8x32 columns.
+  load_cross(input, stride, temp0);
+  highbd_scale_input(temp0, left1, right1);
+  highbd_dct8x32_body_first_pass(left1, right1);
+  highbd_partial_sub_round_shift(left1, right1);
+
+  load_cross(input + 8, stride, temp0);
+  highbd_scale_input(temp0, left2, right2);
+  highbd_dct8x32_body_first_pass(left2, right2);
+  highbd_partial_sub_round_shift(left2, right2);
+
+  load_cross(input + 16, stride, temp0);
+  highbd_scale_input(temp0, left3, right3);
+  highbd_dct8x32_body_first_pass(left3, right3);
+  highbd_partial_sub_round_shift(left3, right3);
+
+  load_cross(input + 24, stride, temp0);
+  highbd_scale_input(temp0, left4, right4);
+  highbd_dct8x32_body_first_pass(left4, right4);
+  highbd_partial_sub_round_shift(left4, right4);
+
+  // Generate the top row by munging the first set of 8 from each one together.
+  transpose_s32_8x8_2(left1, right1, temp1, temp2);
+  transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left5, right5);
+  highbd_dct8x32_body_second_pass(left5, right5);
+  highbd_partial_add_round_shift(left5, right5);
+
+  // Second row of 8x32.
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left6, right6);
+  highbd_dct8x32_body_second_pass(left6, right6);
+  highbd_partial_add_round_shift(left6, right6);
+
+  // Third row of 8x32
+  transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left7, right7);
+  highbd_dct8x32_body_second_pass(left7, right7);
+  highbd_partial_add_round_shift(left7, right7);
+
+  // Final row of 8x32.
+  transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left8, right8);
+  highbd_dct8x32_body_second_pass(left8, right8);
+  highbd_partial_add_round_shift(left8, right8);
+
+  // Final transpose
+  transpose_s32_8x8_2(left5, right5, left1, right1);
+  transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+  transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+  transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+  transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+  transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+  transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+  transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+  transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+  transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+  transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+  transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+  transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+  transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+  store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+                 right4);
+}
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+                                  int stride) {
+  int16x8_t temp0[32];
+  int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+      right3[32], right4[32];
+  int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+      left8[32], right8[32];
+  int32x4_t temp1[32], temp2[32];
+
+  // Process in 8x32 columns.
+  load_cross(input, stride, temp0);
+  highbd_scale_input(temp0, left1, right1);
+  highbd_dct8x32_body_first_pass(left1, right1);
+  highbd_partial_sub_round_shift(left1, right1);
+
+  load_cross(input + 8, stride, temp0);
+  highbd_scale_input(temp0, left2, right2);
+  highbd_dct8x32_body_first_pass(left2, right2);
+  highbd_partial_sub_round_shift(left2, right2);
+
+  load_cross(input + 16, stride, temp0);
+  highbd_scale_input(temp0, left3, right3);
+  highbd_dct8x32_body_first_pass(left3, right3);
+  highbd_partial_sub_round_shift(left3, right3);
+
+  load_cross(input + 24, stride, temp0);
+  highbd_scale_input(temp0, left4, right4);
+  highbd_dct8x32_body_first_pass(left4, right4);
+  highbd_partial_sub_round_shift(left4, right4);
+
+  // Generate the top row by munging the first set of 8 from each one together.
+  transpose_s32_8x8_2(left1, right1, temp1, temp2);
+  transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left5, right5);
+  highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+  // Second row of 8x32.
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left6, right6);
+  highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+  // Third row of 8x32
+  transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left7, right7);
+  highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+  // Final row of 8x32.
+  transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+  transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+  transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+  transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+  highbd_cross_input(temp1, temp2, left8, right8);
+  highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+  // Final transpose
+  transpose_s32_8x8_2(left5, right5, left1, right1);
+  transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+  transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+  transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+  transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+  transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+  transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+  transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+  transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+  transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+  transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+  transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+  transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+  transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+  transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+  store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+                 right4);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+        // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 0000000000..3b9e64c6df
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load & cross the first 8 and last 8, then the middle
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+  b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+  b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+  b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+  b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+  b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+  b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+  b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+  b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+  b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+  b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+  b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+  b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+  b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+  b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+  b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+  b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+  b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+  b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
+
+#define STORE_S16(src, index, dest)           \
+  do {                                        \
+    store_s16q_to_tran_low(dest, src[index]); \
+    dest += 8;                                \
+  } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+  STORE_S16(b, 0, a);
+  STORE_S16(b, 8, a);
+  STORE_S16(b, 16, a);
+  STORE_S16(b, 24, a);
+  STORE_S16(b, 1, a);
+  STORE_S16(b, 9, a);
+  STORE_S16(b, 17, a);
+  STORE_S16(b, 25, a);
+  STORE_S16(b, 2, a);
+  STORE_S16(b, 10, a);
+  STORE_S16(b, 18, a);
+  STORE_S16(b, 26, a);
+  STORE_S16(b, 3, a);
+  STORE_S16(b, 11, a);
+  STORE_S16(b, 19, a);
+  STORE_S16(b, 27, a);
+  STORE_S16(b, 4, a);
+  STORE_S16(b, 12, a);
+  STORE_S16(b, 20, a);
+  STORE_S16(b, 28, a);
+  STORE_S16(b, 5, a);
+  STORE_S16(b, 13, a);
+  STORE_S16(b, 21, a);
+  STORE_S16(b, 29, a);
+  STORE_S16(b, 6, a);
+  STORE_S16(b, 14, a);
+  STORE_S16(b, 22, a);
+  STORE_S16(b, 30, a);
+  STORE_S16(b, 7, a);
+  STORE_S16(b, 15, a);
+  STORE_S16(b, 23, a);
+  STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+                               int16x8_t *out /*32*/) {
+  out[0] = vshlq_n_s16(in[0], 2);
+  out[1] = vshlq_n_s16(in[1], 2);
+  out[2] = vshlq_n_s16(in[2], 2);
+  out[3] = vshlq_n_s16(in[3], 2);
+  out[4] = vshlq_n_s16(in[4], 2);
+  out[5] = vshlq_n_s16(in[5], 2);
+  out[6] = vshlq_n_s16(in[6], 2);
+  out[7] = vshlq_n_s16(in[7], 2);
+
+  out[8] = vshlq_n_s16(in[8], 2);
+  out[9] = vshlq_n_s16(in[9], 2);
+  out[10] = vshlq_n_s16(in[10], 2);
+  out[11] = vshlq_n_s16(in[11], 2);
+  out[12] = vshlq_n_s16(in[12], 2);
+  out[13] = vshlq_n_s16(in[13], 2);
+  out[14] = vshlq_n_s16(in[14], 2);
+  out[15] = vshlq_n_s16(in[15], 2);
+
+  out[16] = vshlq_n_s16(in[16], 2);
+  out[17] = vshlq_n_s16(in[17], 2);
+  out[18] = vshlq_n_s16(in[18], 2);
+  out[19] = vshlq_n_s16(in[19], 2);
+  out[20] = vshlq_n_s16(in[20], 2);
+  out[21] = vshlq_n_s16(in[21], 2);
+  out[22] = vshlq_n_s16(in[22], 2);
+  out[23] = vshlq_n_s16(in[23], 2);
+
+  out[24] = vshlq_n_s16(in[24], 2);
+  out[25] = vshlq_n_s16(in[25], 2);
+  out[26] = vshlq_n_s16(in[26], 2);
+  out[27] = vshlq_n_s16(in[27], 2);
+  out[28] = vshlq_n_s16(in[28], 2);
+  out[29] = vshlq_n_s16(in[29], 2);
+  out[30] = vshlq_n_s16(in[30], 2);
+  out[31] = vshlq_n_s16(in[31], 2);
+}
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  a[0] = vaddq_s16(in[0], in[15]);
+  a[1] = vaddq_s16(in[1], in[14]);
+  a[2] = vaddq_s16(in[2], in[13]);
+  a[3] = vaddq_s16(in[3], in[12]);
+  a[4] = vaddq_s16(in[4], in[11]);
+  a[5] = vaddq_s16(in[5], in[10]);
+  a[6] = vaddq_s16(in[6], in[9]);
+  a[7] = vaddq_s16(in[7], in[8]);
+
+  a[8] = vsubq_s16(in[7], in[8]);
+  a[9] = vsubq_s16(in[6], in[9]);
+  a[10] = vsubq_s16(in[5], in[10]);
+  a[11] = vsubq_s16(in[4], in[11]);
+  a[12] = vsubq_s16(in[3], in[12]);
+  a[13] = vsubq_s16(in[2], in[13]);
+  a[14] = vsubq_s16(in[1], in[14]);
+  a[15] = vsubq_s16(in[0], in[15]);
+
+  a[16] = in[16];
+  a[17] = in[17];
+  a[18] = in[18];
+  a[19] = in[19];
+
+  butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+                                     &a[20]);
+  butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+                                     &a[21]);
+  butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+                                     &a[22]);
+  butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+                                     &a[23]);
+
+  a[28] = in[28];
+  a[29] = in[29];
+  a[30] = in[30];
+  a[31] = in[31];
+
+  // Stage 3.
+  b[0] = vaddq_s16(a[0], a[7]);
+  b[1] = vaddq_s16(a[1], a[6]);
+  b[2] = vaddq_s16(a[2], a[5]);
+  b[3] = vaddq_s16(a[3], a[4]);
+
+  b[4] = vsubq_s16(a[3], a[4]);
+  b[5] = vsubq_s16(a[2], a[5]);
+  b[6] = vsubq_s16(a[1], a[6]);
+  b[7] = vsubq_s16(a[0], a[7]);
+
+  b[8] = a[8];
+  b[9] = a[9];
+
+  butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+  butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+  b[14] = a[14];
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(in[16], a[23]);
+  b[17] = vaddq_s16(in[17], a[22]);
+  b[18] = vaddq_s16(in[18], a[21]);
+  b[19] = vaddq_s16(in[19], a[20]);
+
+  b[20] = vsubq_s16(in[19], a[20]);
+  b[21] = vsubq_s16(in[18], a[21]);
+  b[22] = vsubq_s16(in[17], a[22]);
+  b[23] = vsubq_s16(in[16], a[23]);
+
+  b[24] = vsubq_s16(in[31], a[24]);
+  b[25] = vsubq_s16(in[30], a[25]);
+  b[26] = vsubq_s16(in[29], a[26]);
+  b[27] = vsubq_s16(in[28], a[27]);
+
+  b[28] = vaddq_s16(in[28], a[27]);
+  b[29] = vaddq_s16(in[29], a[26]);
+  b[30] = vaddq_s16(in[30], a[25]);
+  b[31] = vaddq_s16(in[31], a[24]);
+
+  // Stage 4.
+  a[0] = vaddq_s16(b[0], b[3]);
+  a[1] = vaddq_s16(b[1], b[2]);
+  a[2] = vsubq_s16(b[1], b[2]);
+  a[3] = vsubq_s16(b[0], b[3]);
+
+  a[4] = b[4];
+
+  butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+  a[7] = b[7];
+
+  a[8] = vaddq_s16(b[8], b[11]);
+  a[9] = vaddq_s16(b[9], b[10]);
+  a[10] = vsubq_s16(b[9], b[10]);
+  a[11] = vsubq_s16(b[8], b[11]);
+  a[12] = vsubq_s16(b[15], b[12]);
+  a[13] = vsubq_s16(b[14], b[13]);
+  a[14] = vaddq_s16(b[14], b[13]);
+  a[15] = vaddq_s16(b[15], b[12]);
+
+  a[16] = b[16];
+  a[17] = b[17];
+
+  butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+  butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+  butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+  butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+  a[22] = b[22];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[25] = b[25];
+
+  a[30] = b[30];
+  a[31] = b[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+  b[4] = vaddq_s16(a[4], a[5]);
+  b[5] = vsubq_s16(a[4], a[5]);
+  b[6] = vsubq_s16(a[7], a[6]);
+  b[7] = vaddq_s16(a[7], a[6]);
+
+  b[8] = a[8];
+
+  butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+  butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+  b[11] = a[11];
+  b[12] = a[12];
+
+  b[15] = a[15];
+
+  b[16] = vaddq_s16(a[19], a[16]);
+  b[17] = vaddq_s16(a[18], a[17]);
+  b[18] = vsubq_s16(a[17], a[18]);
+  b[19] = vsubq_s16(a[16], a[19]);
+  b[20] = vsubq_s16(a[23], a[20]);
+  b[21] = vsubq_s16(a[22], a[21]);
+  b[22] = vaddq_s16(a[21], a[22]);
+  b[23] = vaddq_s16(a[20], a[23]);
+  b[24] = vaddq_s16(a[27], a[24]);
+  b[25] = vaddq_s16(a[26], a[25]);
+  b[26] = vsubq_s16(a[25], a[26]);
+  b[27] = vsubq_s16(a[24], a[27]);
+  b[28] = vsubq_s16(a[31], a[28]);
+  b[29] = vsubq_s16(a[30], a[29]);
+  b[30] = vaddq_s16(a[29], a[30]);
+  b[31] = vaddq_s16(a[28], a[31]);
+
+  // Stage 6.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+  butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+  butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+  a[8] = vaddq_s16(b[8], b[9]);
+  a[9] = vsubq_s16(b[8], b[9]);
+  a[10] = vsubq_s16(b[11], b[10]);
+  a[11] = vaddq_s16(b[11], b[10]);
+  a[12] = vaddq_s16(b[12], b[13]);
+  a[13] = vsubq_s16(b[12], b[13]);
+  a[14] = vsubq_s16(b[15], b[14]);
+  a[15] = vaddq_s16(b[15], b[14]);
+
+  a[16] = b[16];
+  a[19] = b[19];
+  a[20] = b[20];
+  a[23] = b[23];
+  a[24] = b[24];
+  a[27] = b[27];
+  a[28] = b[28];
+  a[31] = b[31];
+
+  butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+  butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+  butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+  // Stage 7.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+  b[4] = a[4];
+  b[5] = a[5];
+  b[6] = a[6];
+  b[7] = a[7];
+
+  butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+  butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+  butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+  butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+  b[16] = vaddq_s16(a[16], a[17]);
+  b[17] = vsubq_s16(a[16], a[17]);
+  b[18] = vsubq_s16(a[19], a[18]);
+  b[19] = vaddq_s16(a[19], a[18]);
+  b[20] = vaddq_s16(a[20], a[21]);
+  b[21] = vsubq_s16(a[20], a[21]);
+  b[22] = vsubq_s16(a[23], a[22]);
+  b[23] = vaddq_s16(a[23], a[22]);
+  b[24] = vaddq_s16(a[24], a[25]);
+  b[25] = vsubq_s16(a[24], a[25]);
+  b[26] = vsubq_s16(a[27], a[26]);
+  b[27] = vaddq_s16(a[27], a[26]);
+  b[28] = vaddq_s16(a[28], a[29]);
+  b[29] = vsubq_s16(a[28], a[29]);
+  b[30] = vsubq_s16(a[31], a[30]);
+  b[31] = vaddq_s16(a[31], a[30]);
+
+  // Final stage.
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  out[0] = sub_round_shift_s16(b[0]);
+  out[16] = sub_round_shift_s16(b[1]);
+  out[8] = sub_round_shift_s16(b[2]);
+  out[24] = sub_round_shift_s16(b[3]);
+  out[4] = sub_round_shift_s16(b[4]);
+  out[20] = sub_round_shift_s16(b[5]);
+  out[12] = sub_round_shift_s16(b[6]);
+  out[28] = sub_round_shift_s16(b[7]);
+  out[2] = sub_round_shift_s16(b[8]);
+  out[18] = sub_round_shift_s16(b[9]);
+  out[10] = sub_round_shift_s16(b[10]);
+  out[26] = sub_round_shift_s16(b[11]);
+  out[6] = sub_round_shift_s16(b[12]);
+  out[22] = sub_round_shift_s16(b[13]);
+  out[14] = sub_round_shift_s16(b[14]);
+  out[30] = sub_round_shift_s16(b[15]);
+
+  butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+  out[1] = sub_round_shift_s16(a[1]);
+  out[31] = sub_round_shift_s16(a[31]);
+
+  butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+  out[17] = sub_round_shift_s16(a[17]);
+  out[15] = sub_round_shift_s16(a[15]);
+
+  butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+  out[9] = sub_round_shift_s16(a[9]);
+  out[23] = sub_round_shift_s16(a[23]);
+
+  butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+  out[25] = sub_round_shift_s16(a[25]);
+  out[7] = sub_round_shift_s16(a[7]);
+
+  butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+  out[5] = sub_round_shift_s16(a[5]);
+  out[27] = sub_round_shift_s16(a[27]);
+
+  butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+  out[21] = sub_round_shift_s16(a[21]);
+  out[11] = sub_round_shift_s16(a[11]);
+
+  butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+  out[13] = sub_round_shift_s16(a[13]);
+  out[19] = sub_round_shift_s16(a[19]);
+
+  butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+  out[29] = sub_round_shift_s16(a[29]);
+  out[3] = sub_round_shift_s16(a[3]);
+}
+
+#define PASS_THROUGH(src, dst, element)    \
+  do {                                     \
+    dst##_lo[element] = src##_lo[element]; \
+    dst##_hi[element] = src##_hi[element]; \
+  } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                        \
+    b##_lo[b_index] =                                                         \
+        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
+                                vget_high_s16(a[right_index]));               \
+  } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
+  do {                                                                       \
+    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
+    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+  } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+  do {                                                                     \
+    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
+    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
+    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
+    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
+  } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index)                   \
+  do {                                                                    \
+    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+  } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
+                              add_index, sub_index)                      \
+  do {                                                                   \
+    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+                                &b##_lo[add_index], &b##_hi[add_index],  \
+                                &b##_lo[sub_index], &b##_hi[sub_index]); \
+  } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index,  \
+                          sub_index)                                           \
+  do {                                                                         \
+    butterfly_one_coeff_s32_fast(                                              \
+        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index],           \
+        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+        &b##_lo[sub_index], &b##_hi[sub_index]);                               \
+  } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
+                          right_constant, b, add_index, sub_index)             \
+  do {                                                                         \
+    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
+                            a##_lo[right_index], a##_hi[right_index],          \
+                            left_constant, right_constant, &b##_lo[add_index], \
+                            &b##_hi[add_index], &b##_lo[sub_index],            \
+                            &b##_hi[sub_index]);                               \
+  } while (0)
+
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+  int32x4_t c_lo[32];
+  int32x4_t c_hi[32];
+  int32x4_t d_lo[32];
+  int32x4_t d_hi[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  b[0] = vaddq_s16(a[0], a[15]);
+  b[1] = vaddq_s16(a[1], a[14]);
+  b[2] = vaddq_s16(a[2], a[13]);
+  b[3] = vaddq_s16(a[3], a[12]);
+  b[4] = vaddq_s16(a[4], a[11]);
+  b[5] = vaddq_s16(a[5], a[10]);
+  b[6] = vaddq_s16(a[6], a[9]);
+  b[7] = vaddq_s16(a[7], a[8]);
+
+  b[8] = vsubq_s16(a[7], a[8]);
+  b[9] = vsubq_s16(a[6], a[9]);
+  b[10] = vsubq_s16(a[5], a[10]);
+  b[11] = vsubq_s16(a[4], a[11]);
+  b[12] = vsubq_s16(a[3], a[12]);
+  b[13] = vsubq_s16(a[2], a[13]);
+  b[14] = vsubq_s16(a[1], a[14]);
+  b[15] = vsubq_s16(a[0], a[15]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+  b[18] = a[18];
+  b[19] = a[19];
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+  b[28] = a[28];
+  b[29] = a[29];
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 3. With extreme values for input this calculation rolls over int16_t.
+  // The sources for b[0] get added multiple times and, through testing, have
+  // been shown to overflow starting here.
+  ADD_S16_S32(b, 0, 7, c, 0);
+  ADD_S16_S32(b, 1, 6, c, 1);
+  ADD_S16_S32(b, 2, 5, c, 2);
+  ADD_S16_S32(b, 3, 4, c, 3);
+  SUB_S16_S32(b, 3, 4, c, 4);
+  SUB_S16_S32(b, 2, 5, c, 5);
+  SUB_S16_S32(b, 1, 6, c, 6);
+  SUB_S16_S32(b, 0, 7, c, 7);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  ADD_S16_S32(b, 16, 23, c, 16);
+  ADD_S16_S32(b, 17, 22, c, 17);
+  ADD_S16_S32(b, 18, 21, c, 18);
+  ADD_S16_S32(b, 19, 20, c, 19);
+  SUB_S16_S32(b, 19, 20, c, 20);
+  SUB_S16_S32(b, 18, 21, c, 21);
+  SUB_S16_S32(b, 17, 22, c, 22);
+  SUB_S16_S32(b, 16, 23, c, 23);
+  SUB_S16_S32(b, 31, 24, c, 24);
+  SUB_S16_S32(b, 30, 25, c, 25);
+  SUB_S16_S32(b, 29, 26, c, 26);
+  SUB_S16_S32(b, 28, 27, c, 27);
+  ADD_S16_S32(b, 28, 27, c, 28);
+  ADD_S16_S32(b, 29, 26, c, 29);
+  ADD_S16_S32(b, 30, 25, c, 30);
+  ADD_S16_S32(b, 31, 24, c, 31);
+
+  // Stage 4.
+  ADD_S32(c, 0, 3, d, 0);
+  ADD_S32(c, 1, 2, d, 1);
+  SUB_S32(c, 1, 2, d, 2);
+  SUB_S32(c, 0, 3, d, 3);
+
+  PASS_THROUGH(c, d, 4);
+
+  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+  PASS_THROUGH(c, d, 7);
+
+  ADDW_S16_S32(c, 11, a, 8, d, 8);
+  ADDW_S16_S32(c, 10, a, 9, d, 9);
+  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+  ADDW_S16_S32(c, 13, b, 14, d, 14);
+  ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 17);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+  PASS_THROUGH(c, d, 22);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 25);
+
+  PASS_THROUGH(c, d, 30);
+  PASS_THROUGH(c, d, 31);
+
+  // Stage 5.
+  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+  BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+  ADD_S32(d, 4, 5, c, 4);
+  SUB_S32(d, 4, 5, c, 5);
+  SUB_S32(d, 7, 6, c, 6);
+  ADD_S32(d, 7, 6, c, 7);
+
+  PASS_THROUGH(d, c, 8);
+
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+  PASS_THROUGH(d, c, 11);
+  PASS_THROUGH(d, c, 12);
+  PASS_THROUGH(d, c, 15);
+
+  ADD_S32(d, 16, 19, c, 16);
+  ADD_S32(d, 17, 18, c, 17);
+  SUB_S32(d, 17, 18, c, 18);
+  SUB_S32(d, 16, 19, c, 19);
+  SUB_S32(d, 23, 20, c, 20);
+  SUB_S32(d, 22, 21, c, 21);
+  ADD_S32(d, 22, 21, c, 22);
+  ADD_S32(d, 23, 20, c, 23);
+  ADD_S32(d, 24, 27, c, 24);
+  ADD_S32(d, 25, 26, c, 25);
+  SUB_S32(d, 25, 26, c, 26);
+  SUB_S32(d, 24, 27, c, 27);
+  SUB_S32(d, 31, 28, c, 28);
+  SUB_S32(d, 30, 29, c, 29);
+  ADD_S32(d, 30, 29, c, 30);
+  ADD_S32(d, 31, 28, c, 31);
+
+  // Stage 6.
+  PASS_THROUGH(c, d, 0);
+  PASS_THROUGH(c, d, 1);
+  PASS_THROUGH(c, d, 2);
+  PASS_THROUGH(c, d, 3);
+
+  BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+  BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+  ADD_S32(c, 8, 9, d, 8);
+  SUB_S32(c, 8, 9, d, 9);
+  SUB_S32(c, 11, 10, d, 10);
+  ADD_S32(c, 11, 10, d, 11);
+  ADD_S32(c, 12, 13, d, 12);
+  SUB_S32(c, 12, 13, d, 13);
+  SUB_S32(c, 15, 14, d, 14);
+  ADD_S32(c, 15, 14, d, 15);
+
+  PASS_THROUGH(c, d, 16);
+  PASS_THROUGH(c, d, 19);
+  PASS_THROUGH(c, d, 20);
+  PASS_THROUGH(c, d, 23);
+  PASS_THROUGH(c, d, 24);
+  PASS_THROUGH(c, d, 27);
+  PASS_THROUGH(c, d, 28);
+  PASS_THROUGH(c, d, 31);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+  // Stage 7.
+  PASS_THROUGH(d, c, 0);
+  PASS_THROUGH(d, c, 1);
+  PASS_THROUGH(d, c, 2);
+  PASS_THROUGH(d, c, 3);
+  PASS_THROUGH(d, c, 4);
+  PASS_THROUGH(d, c, 5);
+  PASS_THROUGH(d, c, 6);
+  PASS_THROUGH(d, c, 7);
+
+  BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+  BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+  BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+  BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+  ADD_S32(d, 16, 17, c, 16);
+  SUB_S32(d, 16, 17, c, 17);
+  SUB_S32(d, 19, 18, c, 18);
+  ADD_S32(d, 19, 18, c, 19);
+  ADD_S32(d, 20, 21, c, 20);
+  SUB_S32(d, 20, 21, c, 21);
+  SUB_S32(d, 23, 22, c, 22);
+  ADD_S32(d, 23, 22, c, 23);
+  ADD_S32(d, 24, 25, c, 24);
+  SUB_S32(d, 24, 25, c, 25);
+  SUB_S32(d, 27, 26, c, 26);
+  ADD_S32(d, 27, 26, c, 27);
+  ADD_S32(d, 28, 29, c, 28);
+  SUB_S32(d, 28, 29, c, 29);
+  SUB_S32(d, 31, 30, c, 30);
+  ADD_S32(d, 31, 30, c, 31);
+
+  // Final stage.
+  // Roll rounding into this function so we can pass back int16x8.
+
+  out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+  out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+  out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+  out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+  out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+  out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+  out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+  out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+  out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+  out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+  out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+  out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+  out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+  out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+  out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+  out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+  out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+  out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+  BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+  out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+  out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+  BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+  out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+  out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+  BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+  out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+  out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+  BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+  out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+  out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+  BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+  out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+  out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+  BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+  out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+  out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+  BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+  out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+  out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+                                           int16x8_t *out) {
+  int16x8_t a[32];
+  int16x8_t b[32];
+
+  // Stage 1. Done as part of the load for the first pass.
+  a[0] = vaddq_s16(in[0], in[31]);
+  a[1] = vaddq_s16(in[1], in[30]);
+  a[2] = vaddq_s16(in[2], in[29]);
+  a[3] = vaddq_s16(in[3], in[28]);
+  a[4] = vaddq_s16(in[4], in[27]);
+  a[5] = vaddq_s16(in[5], in[26]);
+  a[6] = vaddq_s16(in[6], in[25]);
+  a[7] = vaddq_s16(in[7], in[24]);
+  a[8] = vaddq_s16(in[8], in[23]);
+  a[9] = vaddq_s16(in[9], in[22]);
+  a[10] = vaddq_s16(in[10], in[21]);
+  a[11] = vaddq_s16(in[11], in[20]);
+  a[12] = vaddq_s16(in[12], in[19]);
+  a[13] = vaddq_s16(in[13], in[18]);
+  a[14] = vaddq_s16(in[14], in[17]);
+  a[15] = vaddq_s16(in[15], in[16]);
+  a[16] = vsubq_s16(in[15], in[16]);
+  a[17] = vsubq_s16(in[14], in[17]);
+  a[18] = vsubq_s16(in[13], in[18]);
+  a[19] = vsubq_s16(in[12], in[19]);
+  a[20] = vsubq_s16(in[11], in[20]);
+  a[21] = vsubq_s16(in[10], in[21]);
+  a[22] = vsubq_s16(in[9], in[22]);
+  a[23] = vsubq_s16(in[8], in[23]);
+  a[24] = vsubq_s16(in[7], in[24]);
+  a[25] = vsubq_s16(in[6], in[25]);
+  a[26] = vsubq_s16(in[5], in[26]);
+  a[27] = vsubq_s16(in[4], in[27]);
+  a[28] = vsubq_s16(in[3], in[28]);
+  a[29] = vsubq_s16(in[2], in[29]);
+  a[30] = vsubq_s16(in[1], in[30]);
+  a[31] = vsubq_s16(in[0], in[31]);
+
+  // Stage 2.
+  // For the "rd" version, all the values are rounded down after stage 2 to keep
+  // the values in 16 bits.
+  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+  b[16] = add_round_shift_s16(a[16]);
+  b[17] = add_round_shift_s16(a[17]);
+  b[18] = add_round_shift_s16(a[18]);
+  b[19] = add_round_shift_s16(a[19]);
+
+  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+  b[20] = add_round_shift_s16(b[20]);
+  b[21] = add_round_shift_s16(b[21]);
+  b[22] = add_round_shift_s16(b[22]);
+  b[23] = add_round_shift_s16(b[23]);
+  b[24] = add_round_shift_s16(b[24]);
+  b[25] = add_round_shift_s16(b[25]);
+  b[26] = add_round_shift_s16(b[26]);
+  b[27] = add_round_shift_s16(b[27]);
+
+  b[28] = add_round_shift_s16(a[28]);
+  b[29] = add_round_shift_s16(a[29]);
+  b[30] = add_round_shift_s16(a[30]);
+  b[31] = add_round_shift_s16(a[31]);
+
+  // Stage 3.
+  a[0] = vaddq_s16(b[0], b[7]);
+  a[1] = vaddq_s16(b[1], b[6]);
+  a[2] = vaddq_s16(b[2], b[5]);
+  a[3] = vaddq_s16(b[3], b[4]);
+
+  a[4] = vsubq_s16(b[3], b[4]);
+  a[5] = vsubq_s16(b[2], b[5]);
+  a[6] = vsubq_s16(b[1], b[6]);
+  a[7] = vsubq_s16(b[0], b[7]);
+
+  a[8] = b[8];
+  a[9] = b[9];
+
+  butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+  butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+  a[14] = b[14];
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[16], b[23]);
+  a[17] = vaddq_s16(b[17], b[22]);
+  a[18] = vaddq_s16(b[18], b[21]);
+  a[19] = vaddq_s16(b[19], b[20]);
+
+  a[20] = vsubq_s16(b[19], b[20]);
+  a[21] = vsubq_s16(b[18], b[21]);
+  a[22] = vsubq_s16(b[17], b[22]);
+  a[23] = vsubq_s16(b[16], b[23]);
+
+  a[24] = vsubq_s16(b[31], b[24]);
+  a[25] = vsubq_s16(b[30], b[25]);
+  a[26] = vsubq_s16(b[29], b[26]);
+  a[27] = vsubq_s16(b[28], b[27]);
+
+  a[28] = vaddq_s16(b[28], b[27]);
+  a[29] = vaddq_s16(b[29], b[26]);
+  a[30] = vaddq_s16(b[30], b[25]);
+  a[31] = vaddq_s16(b[31], b[24]);
+
+  // Stage 4.
+  b[0] = vaddq_s16(a[0], a[3]);
+  b[1] = vaddq_s16(a[1], a[2]);
+  b[2] = vsubq_s16(a[1], a[2]);
+  b[3] = vsubq_s16(a[0], a[3]);
+
+  b[4] = a[4];
+
+  butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+  b[7] = a[7];
+
+  b[8] = vaddq_s16(a[8], a[11]);
+  b[9] = vaddq_s16(a[9], a[10]);
+  b[10] = vsubq_s16(a[9], a[10]);
+  b[11] = vsubq_s16(a[8], a[11]);
+  b[12] = vsubq_s16(a[15], a[12]);
+  b[13] = vsubq_s16(a[14], a[13]);
+  b[14] = vaddq_s16(a[14], a[13]);
+  b[15] = vaddq_s16(a[15], a[12]);
+
+  b[16] = a[16];
+  b[17] = a[17];
+
+  butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+  butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+  butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+  butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+  b[22] = a[22];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[25] = a[25];
+
+  b[30] = a[30];
+  b[31] = a[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+  butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+  a[4] = vaddq_s16(b[4], b[5]);
+  a[5] = vsubq_s16(b[4], b[5]);
+  a[6] = vsubq_s16(b[7], b[6]);
+  a[7] = vaddq_s16(b[7], b[6]);
+
+  a[8] = b[8];
+
+  butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+  butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+  a[11] = b[11];
+  a[12] = b[12];
+
+  a[15] = b[15];
+
+  a[16] = vaddq_s16(b[19], b[16]);
+  a[17] = vaddq_s16(b[18], b[17]);
+  a[18] = vsubq_s16(b[17], b[18]);
+  a[19] = vsubq_s16(b[16], b[19]);
+  a[20] = vsubq_s16(b[23], b[20]);
+  a[21] = vsubq_s16(b[22], b[21]);
+  a[22] = vaddq_s16(b[21], b[22]);
+  a[23] = vaddq_s16(b[20], b[23]);
+  a[24] = vaddq_s16(b[27], b[24]);
+  a[25] = vaddq_s16(b[26], b[25]);
+  a[26] = vsubq_s16(b[25], b[26]);
+  a[27] = vsubq_s16(b[24], b[27]);
+  a[28] = vsubq_s16(b[31], b[28]);
+  a[29] = vsubq_s16(b[30], b[29]);
+  a[30] = vaddq_s16(b[29], b[30]);
+  a[31] = vaddq_s16(b[28], b[31]);
+
+  // Stage 6.
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+
+  butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+  butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+  b[8] = vaddq_s16(a[8], a[9]);
+  b[9] = vsubq_s16(a[8], a[9]);
+  b[10] = vsubq_s16(a[11], a[10]);
+  b[11] = vaddq_s16(a[11], a[10]);
+  b[12] = vaddq_s16(a[12], a[13]);
+  b[13] = vsubq_s16(a[12], a[13]);
+  b[14] = vsubq_s16(a[15], a[14]);
+  b[15] = vaddq_s16(a[15], a[14]);
+
+  b[16] = a[16];
+  b[19] = a[19];
+  b[20] = a[20];
+  b[23] = a[23];
+  b[24] = a[24];
+  b[27] = a[27];
+  b[28] = a[28];
+  b[31] = a[31];
+
+  butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+  butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+  butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+  butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+  // Stage 7.
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+
+  butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+  butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+  butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+  butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+  a[16] = vaddq_s16(b[16], b[17]);
+  a[17] = vsubq_s16(b[16], b[17]);
+  a[18] = vsubq_s16(b[19], b[18]);
+  a[19] = vaddq_s16(b[19], b[18]);
+  a[20] = vaddq_s16(b[20], b[21]);
+  a[21] = vsubq_s16(b[20], b[21]);
+  a[22] = vsubq_s16(b[23], b[22]);
+  a[23] = vaddq_s16(b[23], b[22]);
+  a[24] = vaddq_s16(b[24], b[25]);
+  a[25] = vsubq_s16(b[24], b[25]);
+  a[26] = vsubq_s16(b[27], b[26]);
+  a[27] = vaddq_s16(b[27], b[26]);
+  a[28] = vaddq_s16(b[28], b[29]);
+  a[29] = vsubq_s16(b[28], b[29]);
+  a[30] = vsubq_s16(b[31], b[30]);
+  a[31] = vaddq_s16(b[31], b[30]);
+
+  // Final stage.
+  out[0] = a[0];
+  out[16] = a[1];
+  out[8] = a[2];
+  out[24] = a[3];
+  out[4] = a[4];
+  out[20] = a[5];
+  out[12] = a[6];
+  out[28] = a[7];
+  out[2] = a[8];
+  out[18] = a[9];
+  out[10] = a[10];
+  out[26] = a[11];
+  out[6] = a[12];
+  out[22] = a[13];
+  out[14] = a[14];
+  out[30] = a[15];
+
+  butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+  butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+                      &out[15]);
+  butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+  butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+  butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+  butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+                      &out[11]);
+  butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+                      &out[19]);
+  butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store 32 32x4 vectors, assuming stride == 32.
+static INLINE void store32x32_s32(
+    tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/,
+    const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/,
+    const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/,
+    const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) {
+  int i;
+  for (i = 0; i < 32; i++) {
+    vst1q_s32(a, l1[i]);
+    vst1q_s32(a + 4, r1[i]);
+    vst1q_s32(a + 8, l2[i]);
+    vst1q_s32(a + 12, r2[i]);
+    vst1q_s32(a + 16, l3[i]);
+    vst1q_s32(a + 20, r3[i]);
+    vst1q_s32(a + 24, l4[i]);
+    vst1q_s32(a + 28, r4[i]);
+    a += 32;
+  }
+}
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+                                      int32x4_t *left /*[32]*/,
+                                      int32x4_t *right /* [32] */) {
+  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+  left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+  left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+  left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+  left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+  left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+  left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+  left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+  left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+  left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+  left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+  left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+  left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+  left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+  left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+  left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+  left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+  right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+  right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+  right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+  right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+  right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+  right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+  right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+  right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+  right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+  right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+  right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+  right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+  right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+  right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+  right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+  right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+                                      int32x4_t *a_right /*[32]*/,
+                                      int32x4_t *b_left /*[32]*/,
+                                      int32x4_t *b_right /*[32]*/) {
+  // Stage 1. Done as part of the load for the first pass.
+  b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+  b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+  b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+  b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+  b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+  b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+  b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+  b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+  b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+  b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+  b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+  b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+  b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+  b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+  b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+  b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+  b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+  b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+  b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+  b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+  b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+  b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+  b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+  b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+  b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+  b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+  b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+  b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+  b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+  b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+  b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+  b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+  b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+  b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+  b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+  b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+  b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+  b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+  b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+  b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+  b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+  b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+  b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+  b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+  b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+  b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+  b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+  b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+  b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+  b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+  b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+  b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+  b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+  b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+  b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+  b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+  b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+  b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+  b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+  b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+  b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+  b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+  b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+  b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+  left[0] = add_round_shift_s32(left[0]);
+  left[1] = add_round_shift_s32(left[1]);
+  left[2] = add_round_shift_s32(left[2]);
+  left[3] = add_round_shift_s32(left[3]);
+  left[4] = add_round_shift_s32(left[4]);
+  left[5] = add_round_shift_s32(left[5]);
+  left[6] = add_round_shift_s32(left[6]);
+  left[7] = add_round_shift_s32(left[7]);
+  left[8] = add_round_shift_s32(left[8]);
+  left[9] = add_round_shift_s32(left[9]);
+  left[10] = add_round_shift_s32(left[10]);
+  left[11] = add_round_shift_s32(left[11]);
+  left[12] = add_round_shift_s32(left[12]);
+  left[13] = add_round_shift_s32(left[13]);
+  left[14] = add_round_shift_s32(left[14]);
+  left[15] = add_round_shift_s32(left[15]);
+  left[16] = add_round_shift_s32(left[16]);
+  left[17] = add_round_shift_s32(left[17]);
+  left[18] = add_round_shift_s32(left[18]);
+  left[19] = add_round_shift_s32(left[19]);
+  left[20] = add_round_shift_s32(left[20]);
+  left[21] = add_round_shift_s32(left[21]);
+  left[22] = add_round_shift_s32(left[22]);
+  left[23] = add_round_shift_s32(left[23]);
+  left[24] = add_round_shift_s32(left[24]);
+  left[25] = add_round_shift_s32(left[25]);
+  left[26] = add_round_shift_s32(left[26]);
+  left[27] = add_round_shift_s32(left[27]);
+  left[28] = add_round_shift_s32(left[28]);
+  left[29] = add_round_shift_s32(left[29]);
+  left[30] = add_round_shift_s32(left[30]);
+  left[31] = add_round_shift_s32(left[31]);
+
+  right[0] = add_round_shift_s32(right[0]);
+  right[1] = add_round_shift_s32(right[1]);
+  right[2] = add_round_shift_s32(right[2]);
+  right[3] = add_round_shift_s32(right[3]);
+  right[4] = add_round_shift_s32(right[4]);
+  right[5] = add_round_shift_s32(right[5]);
+  right[6] = add_round_shift_s32(right[6]);
+  right[7] = add_round_shift_s32(right[7]);
+  right[8] = add_round_shift_s32(right[8]);
+  right[9] = add_round_shift_s32(right[9]);
+  right[10] = add_round_shift_s32(right[10]);
+  right[11] = add_round_shift_s32(right[11]);
+  right[12] = add_round_shift_s32(right[12]);
+  right[13] = add_round_shift_s32(right[13]);
+  right[14] = add_round_shift_s32(right[14]);
+  right[15] = add_round_shift_s32(right[15]);
+  right[16] = add_round_shift_s32(right[16]);
+  right[17] = add_round_shift_s32(right[17]);
+  right[18] = add_round_shift_s32(right[18]);
+  right[19] = add_round_shift_s32(right[19]);
+  right[20] = add_round_shift_s32(right[20]);
+  right[21] = add_round_shift_s32(right[21]);
+  right[22] = add_round_shift_s32(right[22]);
+  right[23] = add_round_shift_s32(right[23]);
+  right[24] = add_round_shift_s32(right[24]);
+  right[25] = add_round_shift_s32(right[25]);
+  right[26] = add_round_shift_s32(right[26]);
+  right[27] = add_round_shift_s32(right[27]);
+  right[28] = add_round_shift_s32(right[28]);
+  right[29] = add_round_shift_s32(right[29]);
+  right[30] = add_round_shift_s32(right[30]);
+  right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+                                                  int32x4_t *right /* [32] */) {
+  // Also compute partial rounding shift:
+  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+  left[0] = sub_round_shift_s32(left[0]);
+  left[1] = sub_round_shift_s32(left[1]);
+  left[2] = sub_round_shift_s32(left[2]);
+  left[3] = sub_round_shift_s32(left[3]);
+  left[4] = sub_round_shift_s32(left[4]);
+  left[5] = sub_round_shift_s32(left[5]);
+  left[6] = sub_round_shift_s32(left[6]);
+  left[7] = sub_round_shift_s32(left[7]);
+  left[8] = sub_round_shift_s32(left[8]);
+  left[9] = sub_round_shift_s32(left[9]);
+  left[10] = sub_round_shift_s32(left[10]);
+  left[11] = sub_round_shift_s32(left[11]);
+  left[12] = sub_round_shift_s32(left[12]);
+  left[13] = sub_round_shift_s32(left[13]);
+  left[14] = sub_round_shift_s32(left[14]);
+  left[15] = sub_round_shift_s32(left[15]);
+  left[16] = sub_round_shift_s32(left[16]);
+  left[17] = sub_round_shift_s32(left[17]);
+  left[18] = sub_round_shift_s32(left[18]);
+  left[19] = sub_round_shift_s32(left[19]);
+  left[20] = sub_round_shift_s32(left[20]);
+  left[21] = sub_round_shift_s32(left[21]);
+  left[22] = sub_round_shift_s32(left[22]);
+  left[23] = sub_round_shift_s32(left[23]);
+  left[24] = sub_round_shift_s32(left[24]);
+  left[25] = sub_round_shift_s32(left[25]);
+  left[26] = sub_round_shift_s32(left[26]);
+  left[27] = sub_round_shift_s32(left[27]);
+  left[28] = sub_round_shift_s32(left[28]);
+  left[29] = sub_round_shift_s32(left[29]);
+  left[30] = sub_round_shift_s32(left[30]);
+  left[31] = sub_round_shift_s32(left[31]);
+
+  right[0] = sub_round_shift_s32(right[0]);
+  right[1] = sub_round_shift_s32(right[1]);
+  right[2] = sub_round_shift_s32(right[2]);
+  right[3] = sub_round_shift_s32(right[3]);
+  right[4] = sub_round_shift_s32(right[4]);
+  right[5] = sub_round_shift_s32(right[5]);
+  right[6] = sub_round_shift_s32(right[6]);
+  right[7] = sub_round_shift_s32(right[7]);
+  right[8] = sub_round_shift_s32(right[8]);
+  right[9] = sub_round_shift_s32(right[9]);
+  right[10] = sub_round_shift_s32(right[10]);
+  right[11] = sub_round_shift_s32(right[11]);
+  right[12] = sub_round_shift_s32(right[12]);
+  right[13] = sub_round_shift_s32(right[13]);
+  right[14] = sub_round_shift_s32(right[14]);
+  right[15] = sub_round_shift_s32(right[15]);
+  right[16] = sub_round_shift_s32(right[16]);
+  right[17] = sub_round_shift_s32(right[17]);
+  right[18] = sub_round_shift_s32(right[18]);
+  right[19] = sub_round_shift_s32(right[19]);
+  right[20] = sub_round_shift_s32(right[20]);
+  right[21] = sub_round_shift_s32(right[21]);
+  right[22] = sub_round_shift_s32(right[22]);
+  right[23] = sub_round_shift_s32(right[23]);
+  right[24] = sub_round_shift_s32(right[24]);
+  right[25] = sub_round_shift_s32(right[25]);
+  right[26] = sub_round_shift_s32(right[26]);
+  right[27] = sub_round_shift_s32(right[27]);
+  right[28] = sub_round_shift_s32(right[28]);
+  right[29] = sub_round_shift_s32(right[29]);
+  right[30] = sub_round_shift_s32(right[30]);
+  right[31] = sub_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+                                                  int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];
+  int32x4_t bl[32], br[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  al[0] = vaddq_s32(left[0], left[15]);
+  ar[0] = vaddq_s32(right[0], right[15]);
+  al[1] = vaddq_s32(left[1], left[14]);
+  ar[1] = vaddq_s32(right[1], right[14]);
+  al[2] = vaddq_s32(left[2], left[13]);
+  ar[2] = vaddq_s32(right[2], right[13]);
+  al[3] = vaddq_s32(left[3], left[12]);
+  ar[3] = vaddq_s32(right[3], right[12]);
+  al[4] = vaddq_s32(left[4], left[11]);
+  ar[4] = vaddq_s32(right[4], right[11]);
+  al[5] = vaddq_s32(left[5], left[10]);
+  ar[5] = vaddq_s32(right[5], right[10]);
+  al[6] = vaddq_s32(left[6], left[9]);
+  ar[6] = vaddq_s32(right[6], right[9]);
+  al[7] = vaddq_s32(left[7], left[8]);
+  ar[7] = vaddq_s32(right[7], right[8]);
+
+  al[8] = vsubq_s32(left[7], left[8]);
+  ar[8] = vsubq_s32(right[7], right[8]);
+  al[9] = vsubq_s32(left[6], left[9]);
+  ar[9] = vsubq_s32(right[6], right[9]);
+  al[10] = vsubq_s32(left[5], left[10]);
+  ar[10] = vsubq_s32(right[5], right[10]);
+  al[11] = vsubq_s32(left[4], left[11]);
+  ar[11] = vsubq_s32(right[4], right[11]);
+  al[12] = vsubq_s32(left[3], left[12]);
+  ar[12] = vsubq_s32(right[3], right[12]);
+  al[13] = vsubq_s32(left[2], left[13]);
+  ar[13] = vsubq_s32(right[2], right[13]);
+  al[14] = vsubq_s32(left[1], left[14]);
+  ar[14] = vsubq_s32(right[1], right[14]);
+  al[15] = vsubq_s32(left[0], left[15]);
+  ar[15] = vsubq_s32(right[0], right[15]);
+
+  al[16] = left[16];
+  ar[16] = right[16];
+  al[17] = left[17];
+  ar[17] = right[17];
+  al[18] = left[18];
+  ar[18] = right[18];
+  al[19] = left[19];
+  ar[19] = right[19];
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[28] = left[28];
+  ar[28] = right[28];
+  al[29] = left[29];
+  ar[29] = right[29];
+  al[30] = left[30];
+  ar[30] = right[30];
+  al[31] = left[31];
+  ar[31] = right[31];
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(left[16], al[23]);
+  br[16] = vaddq_s32(right[16], ar[23]);
+  bl[17] = vaddq_s32(left[17], al[22]);
+  br[17] = vaddq_s32(right[17], ar[22]);
+  bl[18] = vaddq_s32(left[18], al[21]);
+  br[18] = vaddq_s32(right[18], ar[21]);
+  bl[19] = vaddq_s32(left[19], al[20]);
+  br[19] = vaddq_s32(right[19], ar[20]);
+
+  bl[20] = vsubq_s32(left[19], al[20]);
+  br[20] = vsubq_s32(right[19], ar[20]);
+  bl[21] = vsubq_s32(left[18], al[21]);
+  br[21] = vsubq_s32(right[18], ar[21]);
+  bl[22] = vsubq_s32(left[17], al[22]);
+  br[22] = vsubq_s32(right[17], ar[22]);
+  bl[23] = vsubq_s32(left[16], al[23]);
+  br[23] = vsubq_s32(right[16], ar[23]);
+
+  bl[24] = vsubq_s32(left[31], al[24]);
+  br[24] = vsubq_s32(right[31], ar[24]);
+  bl[25] = vsubq_s32(left[30], al[25]);
+  br[25] = vsubq_s32(right[30], ar[25]);
+  bl[26] = vsubq_s32(left[29], al[26]);
+  br[26] = vsubq_s32(right[29], ar[26]);
+  bl[27] = vsubq_s32(left[28], al[27]);
+  br[27] = vsubq_s32(right[28], ar[27]);
+
+  bl[28] = vaddq_s32(left[28], al[27]);
+  br[28] = vaddq_s32(right[28], ar[27]);
+  bl[29] = vaddq_s32(left[29], al[26]);
+  br[29] = vaddq_s32(right[29], ar[26]);
+  bl[30] = vaddq_s32(left[30], al[25]);
+  br[30] = vaddq_s32(right[30], ar[25]);
+  bl[31] = vaddq_s32(left[31], al[24]);
+  br[31] = vaddq_s32(right[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                     cospi_24_64, &al[29], &ar[29], &al[18],
+                                     &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                     cospi_24_64, &al[28], &ar[28], &al[19],
+                                     &ar[19]);
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                     &al[20], &ar[20]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                     cospi_24_64, &bl[2], &br[2], &bl[3],
+                                     &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                     cospi_24_64, &bl[14], &br[14], &bl[9],
+                                     &br[9]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                     &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                     cospi_28_64, &al[4], &ar[4], &al[7],
+                                     &ar[7]);
+  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                     cospi_12_64, &al[5], &ar[5], &al[6],
+                                     &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                     cospi_28_64, &al[30], &ar[30], &al[17],
+                                     &ar[17]);
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                     &al[18], &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_12_64, -cospi_20_64, &al[25],
+                                     &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                     cospi_30_64, &bl[8], &br[8], &bl[15],
+                                     &br[15]);
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                     cospi_14_64, &bl[9], &br[9], &bl[14],
+                                     &br[14]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                     &bl[13], &br[13]);
+  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                     &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                     cospi_31_64, &al[1], &ar[1], &al[31],
+                                     &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                     &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                     cospi_23_64, &al[9], &ar[9], &al[23],
+                                     &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                     &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                     cospi_27_64, &al[5], &ar[5], &al[27],
+                                     &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                     &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                     &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                     &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+                                                   int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];
+  int32x4_t bl[32], br[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // Mini cross. X the first 16 values and the middle 8 of the second half.
+  al[0] = vaddq_s32(left[0], left[15]);
+  ar[0] = vaddq_s32(right[0], right[15]);
+  al[1] = vaddq_s32(left[1], left[14]);
+  ar[1] = vaddq_s32(right[1], right[14]);
+  al[2] = vaddq_s32(left[2], left[13]);
+  ar[2] = vaddq_s32(right[2], right[13]);
+  al[3] = vaddq_s32(left[3], left[12]);
+  ar[3] = vaddq_s32(right[3], right[12]);
+  al[4] = vaddq_s32(left[4], left[11]);
+  ar[4] = vaddq_s32(right[4], right[11]);
+  al[5] = vaddq_s32(left[5], left[10]);
+  ar[5] = vaddq_s32(right[5], right[10]);
+  al[6] = vaddq_s32(left[6], left[9]);
+  ar[6] = vaddq_s32(right[6], right[9]);
+  al[7] = vaddq_s32(left[7], left[8]);
+  ar[7] = vaddq_s32(right[7], right[8]);
+
+  al[8] = vsubq_s32(left[7], left[8]);
+  ar[8] = vsubq_s32(right[7], right[8]);
+  al[9] = vsubq_s32(left[6], left[9]);
+  ar[9] = vsubq_s32(right[6], right[9]);
+  al[10] = vsubq_s32(left[5], left[10]);
+  ar[10] = vsubq_s32(right[5], right[10]);
+  al[11] = vsubq_s32(left[4], left[11]);
+  ar[11] = vsubq_s32(right[4], right[11]);
+  al[12] = vsubq_s32(left[3], left[12]);
+  ar[12] = vsubq_s32(right[3], right[12]);
+  al[13] = vsubq_s32(left[2], left[13]);
+  ar[13] = vsubq_s32(right[2], right[13]);
+  al[14] = vsubq_s32(left[1], left[14]);
+  ar[14] = vsubq_s32(right[1], right[14]);
+  al[15] = vsubq_s32(left[0], left[15]);
+  ar[15] = vsubq_s32(right[0], right[15]);
+
+  al[16] = left[16];
+  ar[16] = right[16];
+  al[17] = left[17];
+  ar[17] = right[17];
+  al[18] = left[18];
+  ar[18] = right[18];
+  al[19] = left[19];
+  ar[19] = right[19];
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[28] = left[28];
+  ar[28] = right[28];
+  al[29] = left[29];
+  ar[29] = right[29];
+  al[30] = left[30];
+  ar[30] = right[30];
+  al[31] = left[31];
+  ar[31] = right[31];
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(left[16], al[23]);
+  br[16] = vaddq_s32(right[16], ar[23]);
+  bl[17] = vaddq_s32(left[17], al[22]);
+  br[17] = vaddq_s32(right[17], ar[22]);
+  bl[18] = vaddq_s32(left[18], al[21]);
+  br[18] = vaddq_s32(right[18], ar[21]);
+  bl[19] = vaddq_s32(left[19], al[20]);
+  br[19] = vaddq_s32(right[19], ar[20]);
+
+  bl[20] = vsubq_s32(left[19], al[20]);
+  br[20] = vsubq_s32(right[19], ar[20]);
+  bl[21] = vsubq_s32(left[18], al[21]);
+  br[21] = vsubq_s32(right[18], ar[21]);
+  bl[22] = vsubq_s32(left[17], al[22]);
+  br[22] = vsubq_s32(right[17], ar[22]);
+  bl[23] = vsubq_s32(left[16], al[23]);
+  br[23] = vsubq_s32(right[16], ar[23]);
+
+  bl[24] = vsubq_s32(left[31], al[24]);
+  br[24] = vsubq_s32(right[31], ar[24]);
+  bl[25] = vsubq_s32(left[30], al[25]);
+  br[25] = vsubq_s32(right[30], ar[25]);
+  bl[26] = vsubq_s32(left[29], al[26]);
+  br[26] = vsubq_s32(right[29], ar[26]);
+  bl[27] = vsubq_s32(left[28], al[27]);
+  br[27] = vsubq_s32(right[28], ar[27]);
+
+  bl[28] = vaddq_s32(left[28], al[27]);
+  br[28] = vaddq_s32(right[28], ar[27]);
+  bl[29] = vaddq_s32(left[29], al[26]);
+  br[29] = vaddq_s32(right[29], ar[26]);
+  bl[30] = vaddq_s32(left[30], al[25]);
+  br[30] = vaddq_s32(right[30], ar[25]);
+  bl[31] = vaddq_s32(left[31], al[24]);
+  br[31] = vaddq_s32(right[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                                     cospi_24_64, &al[29], &ar[29], &al[18],
+                                     &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                                     cospi_24_64, &al[28], &ar[28], &al[19],
+                                     &ar[19]);
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+                                     &al[20], &ar[20]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+                                     cospi_24_64, &bl[2], &br[2], &bl[3],
+                                     &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+                                     cospi_24_64, &bl[14], &br[14], &bl[9],
+                                     &br[9]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+                                     &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+                                     cospi_28_64, &al[4], &ar[4], &al[7],
+                                     &ar[7]);
+  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+                                     cospi_12_64, &al[5], &ar[5], &al[6],
+                                     &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                                     cospi_28_64, &al[30], &ar[30], &al[17],
+                                     &ar[17]);
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+                                     &al[18], &ar[18]);
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
+                                     &al[21], &ar[21]);
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_12_64, -cospi_20_64, &al[25],
+                                     &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+                                     cospi_30_64, &bl[8], &br[8], &bl[15],
+                                     &br[15]);
+  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                                     cospi_14_64, &bl[9], &br[9], &bl[14],
+                                     &br[14]);
+  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
+                                     &bl[13], &br[13]);
+  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
+                                     &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                                     cospi_31_64, &al[1], &ar[1], &al[31],
+                                     &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
+                                     &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                                     cospi_23_64, &al[9], &ar[9], &al[23],
+                                     &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
+                                     &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                                     cospi_27_64, &al[5], &ar[5], &al[27],
+                                     &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
+                                     &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
+                                     &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
+                                     &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+                                                      int32x4_t *right /*32*/) {
+  int32x4_t al[32], ar[32];
+  int32x4_t bl[32], br[32];
+
+  // Stage 1: Done as part of the load.
+
+  // Stage 2.
+  // For the "rd" version, all the values are rounded down after stage 2 to keep
+  // the values in 16 bits.
+  al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+  ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+  al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+  ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+  al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+  ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+  al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+  ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+  al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+  ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+  al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+  ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+  al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+  ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+  al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+  ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+  al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+  ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+  al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+  ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+  al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+  ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+  al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+  ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+  al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+  ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+  al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+  ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+  al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+  ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+  al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+  ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+  al[16] = add_round_shift_s32(left[16]);
+  ar[16] = add_round_shift_s32(right[16]);
+  al[17] = add_round_shift_s32(left[17]);
+  ar[17] = add_round_shift_s32(right[17]);
+  al[18] = add_round_shift_s32(left[18]);
+  ar[18] = add_round_shift_s32(right[18]);
+  al[19] = add_round_shift_s32(left[19]);
+  ar[19] = add_round_shift_s32(right[19]);
+
+  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+  al[20] = add_round_shift_s32(al[20]);
+  ar[20] = add_round_shift_s32(ar[20]);
+  al[21] = add_round_shift_s32(al[21]);
+  ar[21] = add_round_shift_s32(ar[21]);
+  al[22] = add_round_shift_s32(al[22]);
+  ar[22] = add_round_shift_s32(ar[22]);
+  al[23] = add_round_shift_s32(al[23]);
+  ar[23] = add_round_shift_s32(ar[23]);
+  al[24] = add_round_shift_s32(al[24]);
+  ar[24] = add_round_shift_s32(ar[24]);
+  al[25] = add_round_shift_s32(al[25]);
+  ar[25] = add_round_shift_s32(ar[25]);
+  al[26] = add_round_shift_s32(al[26]);
+  ar[26] = add_round_shift_s32(ar[26]);
+  al[27] = add_round_shift_s32(al[27]);
+  ar[27] = add_round_shift_s32(ar[27]);
+
+  al[28] = add_round_shift_s32(left[28]);
+  ar[28] = add_round_shift_s32(right[28]);
+  al[29] = add_round_shift_s32(left[29]);
+  ar[29] = add_round_shift_s32(right[29]);
+  al[30] = add_round_shift_s32(left[30]);
+  ar[30] = add_round_shift_s32(right[30]);
+  al[31] = add_round_shift_s32(left[31]);
+  ar[31] = add_round_shift_s32(right[31]);
+
+  // Stage 3.
+  bl[0] = vaddq_s32(al[0], al[7]);
+  br[0] = vaddq_s32(ar[0], ar[7]);
+  bl[1] = vaddq_s32(al[1], al[6]);
+  br[1] = vaddq_s32(ar[1], ar[6]);
+  bl[2] = vaddq_s32(al[2], al[5]);
+  br[2] = vaddq_s32(ar[2], ar[5]);
+  bl[3] = vaddq_s32(al[3], al[4]);
+  br[3] = vaddq_s32(ar[3], ar[4]);
+
+  bl[4] = vsubq_s32(al[3], al[4]);
+  br[4] = vsubq_s32(ar[3], ar[4]);
+  bl[5] = vsubq_s32(al[2], al[5]);
+  br[5] = vsubq_s32(ar[2], ar[5]);
+  bl[6] = vsubq_s32(al[1], al[6]);
+  br[6] = vsubq_s32(ar[1], ar[6]);
+  bl[7] = vsubq_s32(al[0], al[7]);
+  br[7] = vsubq_s32(ar[0], ar[7]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+  bl[9] = al[9];
+  br[9] = ar[9];
+
+  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+                               &bl[13], &br[13], &bl[10], &br[10]);
+  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+                               &bl[12], &br[12], &bl[11], &br[11]);
+
+  bl[14] = al[14];
+  br[14] = ar[14];
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[16], al[23]);
+  br[16] = vaddq_s32(ar[16], ar[23]);
+  bl[17] = vaddq_s32(al[17], al[22]);
+  br[17] = vaddq_s32(ar[17], ar[22]);
+  bl[18] = vaddq_s32(al[18], al[21]);
+  br[18] = vaddq_s32(ar[18], ar[21]);
+  bl[19] = vaddq_s32(al[19], al[20]);
+  br[19] = vaddq_s32(ar[19], ar[20]);
+
+  bl[20] = vsubq_s32(al[19], al[20]);
+  br[20] = vsubq_s32(ar[19], ar[20]);
+  bl[21] = vsubq_s32(al[18], al[21]);
+  br[21] = vsubq_s32(ar[18], ar[21]);
+  bl[22] = vsubq_s32(al[17], al[22]);
+  br[22] = vsubq_s32(ar[17], ar[22]);
+  bl[23] = vsubq_s32(al[16], al[23]);
+  br[23] = vsubq_s32(ar[16], ar[23]);
+
+  bl[24] = vsubq_s32(al[31], al[24]);
+  br[24] = vsubq_s32(ar[31], ar[24]);
+  bl[25] = vsubq_s32(al[30], al[25]);
+  br[25] = vsubq_s32(ar[30], ar[25]);
+  bl[26] = vsubq_s32(al[29], al[26]);
+  br[26] = vsubq_s32(ar[29], ar[26]);
+  bl[27] = vsubq_s32(al[28], al[27]);
+  br[27] = vsubq_s32(ar[28], ar[27]);
+
+  bl[28] = vaddq_s32(al[28], al[27]);
+  br[28] = vaddq_s32(ar[28], ar[27]);
+  bl[29] = vaddq_s32(al[29], al[26]);
+  br[29] = vaddq_s32(ar[29], ar[26]);
+  bl[30] = vaddq_s32(al[30], al[25]);
+  br[30] = vaddq_s32(ar[30], ar[25]);
+  bl[31] = vaddq_s32(al[31], al[24]);
+  br[31] = vaddq_s32(ar[31], ar[24]);
+
+  // Stage 4.
+  al[0] = vaddq_s32(bl[0], bl[3]);
+  ar[0] = vaddq_s32(br[0], br[3]);
+  al[1] = vaddq_s32(bl[1], bl[2]);
+  ar[1] = vaddq_s32(br[1], br[2]);
+  al[2] = vsubq_s32(bl[1], bl[2]);
+  ar[2] = vsubq_s32(br[1], br[2]);
+  al[3] = vsubq_s32(bl[0], bl[3]);
+  ar[3] = vsubq_s32(br[0], br[3]);
+
+  al[4] = bl[4];
+  ar[4] = br[4];
+
+  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+                               &ar[6], &al[5], &ar[5]);
+
+  al[7] = bl[7];
+  ar[7] = br[7];
+
+  al[8] = vaddq_s32(bl[8], bl[11]);
+  ar[8] = vaddq_s32(br[8], br[11]);
+  al[9] = vaddq_s32(bl[9], bl[10]);
+  ar[9] = vaddq_s32(br[9], br[10]);
+  al[10] = vsubq_s32(bl[9], bl[10]);
+  ar[10] = vsubq_s32(br[9], br[10]);
+  al[11] = vsubq_s32(bl[8], bl[11]);
+  ar[11] = vsubq_s32(br[8], br[11]);
+  al[12] = vsubq_s32(bl[15], bl[12]);
+  ar[12] = vsubq_s32(br[15], br[12]);
+  al[13] = vsubq_s32(bl[14], bl[13]);
+  ar[13] = vsubq_s32(br[14], br[13]);
+  al[14] = vaddq_s32(bl[14], bl[13]);
+  ar[14] = vaddq_s32(br[14], br[13]);
+  al[15] = vaddq_s32(bl[15], bl[12]);
+  ar[15] = vaddq_s32(br[15], br[12]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[17] = bl[17];
+  ar[17] = br[17];
+
+  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+                          cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+                          cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+                          -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+                          -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+  al[22] = bl[22];
+  ar[22] = br[22];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[25] = bl[25];
+  ar[25] = br[25];
+
+  al[30] = bl[30];
+  ar[30] = br[30];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  // Stage 5.
+  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+                               &br[0], &bl[1], &br[1]);
+  butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+                          &bl[2], &br[2], &bl[3], &br[3]);
+
+  bl[4] = vaddq_s32(al[4], al[5]);
+  br[4] = vaddq_s32(ar[4], ar[5]);
+  bl[5] = vsubq_s32(al[4], al[5]);
+  br[5] = vsubq_s32(ar[4], ar[5]);
+  bl[6] = vsubq_s32(al[7], al[6]);
+  br[6] = vsubq_s32(ar[7], ar[6]);
+  bl[7] = vaddq_s32(al[7], al[6]);
+  br[7] = vaddq_s32(ar[7], ar[6]);
+
+  bl[8] = al[8];
+  br[8] = ar[8];
+
+  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+                          &bl[14], &br[14], &bl[9], &br[9]);
+  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+                          -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+  bl[11] = al[11];
+  br[11] = ar[11];
+  bl[12] = al[12];
+  br[12] = ar[12];
+
+  bl[15] = al[15];
+  br[15] = ar[15];
+
+  bl[16] = vaddq_s32(al[19], al[16]);
+  br[16] = vaddq_s32(ar[19], ar[16]);
+  bl[17] = vaddq_s32(al[18], al[17]);
+  br[17] = vaddq_s32(ar[18], ar[17]);
+  bl[18] = vsubq_s32(al[17], al[18]);
+  br[18] = vsubq_s32(ar[17], ar[18]);
+  bl[19] = vsubq_s32(al[16], al[19]);
+  br[19] = vsubq_s32(ar[16], ar[19]);
+  bl[20] = vsubq_s32(al[23], al[20]);
+  br[20] = vsubq_s32(ar[23], ar[20]);
+  bl[21] = vsubq_s32(al[22], al[21]);
+  br[21] = vsubq_s32(ar[22], ar[21]);
+  bl[22] = vaddq_s32(al[21], al[22]);
+  br[22] = vaddq_s32(ar[21], ar[22]);
+  bl[23] = vaddq_s32(al[20], al[23]);
+  br[23] = vaddq_s32(ar[20], ar[23]);
+  bl[24] = vaddq_s32(al[27], al[24]);
+  br[24] = vaddq_s32(ar[27], ar[24]);
+  bl[25] = vaddq_s32(al[26], al[25]);
+  br[25] = vaddq_s32(ar[26], ar[25]);
+  bl[26] = vsubq_s32(al[25], al[26]);
+  br[26] = vsubq_s32(ar[25], ar[26]);
+  bl[27] = vsubq_s32(al[24], al[27]);
+  br[27] = vsubq_s32(ar[24], ar[27]);
+  bl[28] = vsubq_s32(al[31], al[28]);
+  br[28] = vsubq_s32(ar[31], ar[28]);
+  bl[29] = vsubq_s32(al[30], al[29]);
+  br[29] = vsubq_s32(ar[30], ar[29]);
+  bl[30] = vaddq_s32(al[29], al[30]);
+  br[30] = vaddq_s32(ar[29], ar[30]);
+  bl[31] = vaddq_s32(al[28], al[31]);
+  br[31] = vaddq_s32(ar[28], ar[31]);
+
+  // Stage 6.
+  al[0] = bl[0];
+  ar[0] = br[0];
+  al[1] = bl[1];
+  ar[1] = br[1];
+  al[2] = bl[2];
+  ar[2] = br[2];
+  al[3] = bl[3];
+  ar[3] = br[3];
+
+  butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+                          &al[4], &ar[4], &al[7], &ar[7]);
+  butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+                          &al[5], &ar[5], &al[6], &ar[6]);
+
+  al[8] = vaddq_s32(bl[8], bl[9]);
+  ar[8] = vaddq_s32(br[8], br[9]);
+  al[9] = vsubq_s32(bl[8], bl[9]);
+  ar[9] = vsubq_s32(br[8], br[9]);
+  al[10] = vsubq_s32(bl[11], bl[10]);
+  ar[10] = vsubq_s32(br[11], br[10]);
+  al[11] = vaddq_s32(bl[11], bl[10]);
+  ar[11] = vaddq_s32(br[11], br[10]);
+  al[12] = vaddq_s32(bl[12], bl[13]);
+  ar[12] = vaddq_s32(br[12], br[13]);
+  al[13] = vsubq_s32(bl[12], bl[13]);
+  ar[13] = vsubq_s32(br[12], br[13]);
+  al[14] = vsubq_s32(bl[15], bl[14]);
+  ar[14] = vsubq_s32(br[15], br[14]);
+  al[15] = vaddq_s32(bl[15], bl[14]);
+  ar[15] = vaddq_s32(br[15], br[14]);
+
+  al[16] = bl[16];
+  ar[16] = br[16];
+  al[19] = bl[19];
+  ar[19] = br[19];
+  al[20] = bl[20];
+  ar[20] = br[20];
+  al[23] = bl[23];
+  ar[23] = br[23];
+  al[24] = bl[24];
+  ar[24] = br[24];
+  al[27] = bl[27];
+  ar[27] = br[27];
+  al[28] = bl[28];
+  ar[28] = br[28];
+  al[31] = bl[31];
+  ar[31] = br[31];
+
+  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+                          cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+                          -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+                          cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+                          -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+  // Stage 7.
+  bl[0] = al[0];
+  br[0] = ar[0];
+  bl[1] = al[1];
+  br[1] = ar[1];
+  bl[2] = al[2];
+  br[2] = ar[2];
+  bl[3] = al[3];
+  br[3] = ar[3];
+  bl[4] = al[4];
+  br[4] = ar[4];
+  bl[5] = al[5];
+  br[5] = ar[5];
+  bl[6] = al[6];
+  br[6] = ar[6];
+  bl[7] = al[7];
+  br[7] = ar[7];
+
+  butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+                          &bl[8], &br[8], &bl[15], &br[15]);
+  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+                          cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+                          cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+  butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+                          cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+  bl[16] = vaddq_s32(al[16], al[17]);
+  br[16] = vaddq_s32(ar[16], ar[17]);
+  bl[17] = vsubq_s32(al[16], al[17]);
+  br[17] = vsubq_s32(ar[16], ar[17]);
+  bl[18] = vsubq_s32(al[19], al[18]);
+  br[18] = vsubq_s32(ar[19], ar[18]);
+  bl[19] = vaddq_s32(al[19], al[18]);
+  br[19] = vaddq_s32(ar[19], ar[18]);
+  bl[20] = vaddq_s32(al[20], al[21]);
+  br[20] = vaddq_s32(ar[20], ar[21]);
+  bl[21] = vsubq_s32(al[20], al[21]);
+  br[21] = vsubq_s32(ar[20], ar[21]);
+  bl[22] = vsubq_s32(al[23], al[22]);
+  br[22] = vsubq_s32(ar[23], ar[22]);
+  bl[23] = vaddq_s32(al[23], al[22]);
+  br[23] = vaddq_s32(ar[23], ar[22]);
+  bl[24] = vaddq_s32(al[24], al[25]);
+  br[24] = vaddq_s32(ar[24], ar[25]);
+  bl[25] = vsubq_s32(al[24], al[25]);
+  br[25] = vsubq_s32(ar[24], ar[25]);
+  bl[26] = vsubq_s32(al[27], al[26]);
+  br[26] = vsubq_s32(ar[27], ar[26]);
+  bl[27] = vaddq_s32(al[27], al[26]);
+  br[27] = vaddq_s32(ar[27], ar[26]);
+  bl[28] = vaddq_s32(al[28], al[29]);
+  br[28] = vaddq_s32(ar[28], ar[29]);
+  bl[29] = vsubq_s32(al[28], al[29]);
+  br[29] = vsubq_s32(ar[28], ar[29]);
+  bl[30] = vsubq_s32(al[31], al[30]);
+  br[30] = vsubq_s32(ar[31], ar[30]);
+  bl[31] = vaddq_s32(al[31], al[30]);
+  br[31] = vaddq_s32(ar[31], ar[30]);
+
+  // Final stage.
+  left[0] = bl[0];
+  right[0] = br[0];
+  left[16] = bl[1];
+  right[16] = br[1];
+  left[8] = bl[2];
+  right[8] = br[2];
+  left[24] = bl[3];
+  right[24] = br[3];
+  left[4] = bl[4];
+  right[4] = br[4];
+  left[20] = bl[5];
+  right[20] = br[5];
+  left[12] = bl[6];
+  right[12] = br[6];
+  left[28] = bl[7];
+  right[28] = br[7];
+  left[2] = bl[8];
+  right[2] = br[8];
+  left[18] = bl[9];
+  right[18] = br[9];
+  left[10] = bl[10];
+  right[10] = br[10];
+  left[26] = bl[11];
+  right[26] = br[11];
+  left[6] = bl[12];
+  right[6] = br[12];
+  left[22] = bl[13];
+  right[22] = br[13];
+  left[14] = bl[14];
+  right[14] = br[14];
+  left[30] = bl[15];
+  right[30] = br[15];
+
+  butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+                          cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+  left[1] = al[1];
+  right[1] = ar[1];
+  left[31] = al[31];
+  right[31] = ar[31];
+
+  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+                          cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+  left[17] = al[17];
+  right[17] = ar[17];
+  left[15] = al[15];
+  right[15] = ar[15];
+
+  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+                          cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+  left[9] = al[9];
+  right[9] = ar[9];
+  left[23] = al[23];
+  right[23] = ar[23];
+
+  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+                          cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+  left[25] = al[25];
+  right[25] = ar[25];
+  left[7] = al[7];
+  right[7] = ar[7];
+
+  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+                          cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+  left[5] = al[5];
+  right[5] = ar[5];
+  left[27] = al[27];
+  right[27] = ar[27];
+
+  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+                          cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+  left[21] = al[21];
+  right[21] = ar[21];
+  left[11] = al[11];
+  right[11] = ar[11];
+
+  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+                          cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+  left[13] = al[13];
+  right[13] = ar[13];
+  left[19] = al[19];
+  right[19] = ar[19];
+
+  butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+                          cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+  left[29] = al[29];
+  right[29] = ar[29];
+  left[3] = al[3];
+  right[3] = ar[3];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
new file mode 100644
index 0000000000..4bc968ecba
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  // input[M * stride] * 16
+  int16x4_t in[4];
+  in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+    in[0] = vadd_s16(in[0], one);
+  }
+  vpx_fdct4x4_pass1_neon(in);
+  vpx_fdct4x4_pass2_neon(in);
+  {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2.
+    const int16x8_t one = vdupq_n_s16(1);
+    int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+    int16x8_t out_23 = vcombine_s16(in[2], in[3]);
+    out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+    out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+    store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+    store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  const int32x4_t const_one = vdupq_n_s32(1);
+
+  // input[M * stride] * 16
+  int32x4_t in[4];
+  in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    static const int32_t k1000[4] = { 1, 0, 0, 0 };
+    in[0] = vaddq_s32(in[0], vld1q_s32(k1000));
+  }
+
+  vpx_highbd_fdct4x4_pass1_neon(in);
+  vpx_highbd_fdct4x4_pass1_neon(in);
+  {
+    // Not quite a rounding shift. Only add 1 despite shifting by 2.
+    in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+    in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+    in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+    in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+    vst1q_s32(final_output, in[0]);
+    vst1q_s32(final_output + 4, in[1]);
+    vst1q_s32(final_output + 8, in[2]);
+    vst1q_s32(final_output + 12, in[3]);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 0000000000..de3db9774c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+  int16x4_t out[4];
+
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+  butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+                                               &out[2]);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+  transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+  int32x4_t out[4];
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+  const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+  const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+  const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+  butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+  // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+  butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+                                          &out[1], &out[3]);
+
+  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 0000000000..75ee6f2230
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                      int stride) {
+  // stage 1
+  int16x8_t in[8];
+  in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+  vpx_fdct8x8_pass1_neon(in);
+  vpx_fdct8x8_pass2_neon(in);
+  {
+    // from vpx_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+    in[0] = vhsubq_s16(in[0], sign_in0);
+    in[1] = vhsubq_s16(in[1], sign_in1);
+    in[2] = vhsubq_s16(in[2], sign_in2);
+    in[3] = vhsubq_s16(in[3], sign_in3);
+    in[4] = vhsubq_s16(in[4], sign_in4);
+    in[5] = vhsubq_s16(in[5], sign_in5);
+    in[6] = vhsubq_s16(in[6], sign_in6);
+    in[7] = vhsubq_s16(in[7], sign_in7);
+    // store results
+    store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+    store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+    store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+    store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+    store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+    store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+    store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+    store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+                             int stride) {
+  // input[M * stride] * 16
+  int32x4_t left[8], right[8];
+  int16x8_t in[8];
+  in[0] = vld1q_s16(input + 0 * stride);
+  in[1] = vld1q_s16(input + 1 * stride);
+  in[2] = vld1q_s16(input + 2 * stride);
+  in[3] = vld1q_s16(input + 3 * stride);
+  in[4] = vld1q_s16(input + 4 * stride);
+  in[5] = vld1q_s16(input + 5 * stride);
+  in[6] = vld1q_s16(input + 6 * stride);
+  in[7] = vld1q_s16(input + 7 * stride);
+
+  left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+  left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+  left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+  left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+  left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+  left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+  left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+  left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+  right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+  right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+  right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+  right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+  right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+  right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+  right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+  right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+  vpx_highbd_fdct8x8_pass1_neon(left, right);
+  vpx_highbd_fdct8x8_pass2_neon(left, right);
+  {
+    left[0] = add_round_shift_half_s32(left[0]);
+    left[1] = add_round_shift_half_s32(left[1]);
+    left[2] = add_round_shift_half_s32(left[2]);
+    left[3] = add_round_shift_half_s32(left[3]);
+    left[4] = add_round_shift_half_s32(left[4]);
+    left[5] = add_round_shift_half_s32(left[5]);
+    left[6] = add_round_shift_half_s32(left[6]);
+    left[7] = add_round_shift_half_s32(left[7]);
+    right[0] = add_round_shift_half_s32(right[0]);
+    right[1] = add_round_shift_half_s32(right[1]);
+    right[2] = add_round_shift_half_s32(right[2]);
+    right[3] = add_round_shift_half_s32(right[3]);
+    right[4] = add_round_shift_half_s32(right[4]);
+    right[5] = add_round_shift_half_s32(right[5]);
+    right[6] = add_round_shift_half_s32(right[6]);
+    right[7] = add_round_shift_half_s32(right[7]);
+
+    // store results
+    vst1q_s32(final_output, left[0]);
+    vst1q_s32(final_output + 4, right[0]);
+    vst1q_s32(final_output + 8, left[1]);
+    vst1q_s32(final_output + 12, right[1]);
+    vst1q_s32(final_output + 16, left[2]);
+    vst1q_s32(final_output + 20, right[2]);
+    vst1q_s32(final_output + 24, left[3]);
+    vst1q_s32(final_output + 28, right[3]);
+    vst1q_s32(final_output + 32, left[4]);
+    vst1q_s32(final_output + 36, right[4]);
+    vst1q_s32(final_output + 40, left[5]);
+    vst1q_s32(final_output + 44, right[5]);
+    vst1q_s32(final_output + 48, left[6]);
+    vst1q_s32(final_output + 52, right[6]);
+    vst1q_s32(final_output + 56, left[7]);
+    vst1q_s32(final_output + 60, right[7]);
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 0000000000..cc65157430
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,307 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  int16x8_t s[8], x[4], t[2];
+
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+                                          &out[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+                                          &t[0]);
+
+  // Stage 3
+  x[0] = vaddq_s16(s[4], t[0]);
+  x[1] = vsubq_s16(s[4], t[0]);
+  x[2] = vsubq_s16(s[7], t[1]);
+  x[3] = vaddq_s16(s[7], t[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass2_notranspose_neon(in, out);
+  // transpose 8x8
+  transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+  in[0] = out[0];
+  in[1] = out[1];
+  in[2] = out[2];
+  in[3] = out[3];
+  in[4] = out[4];
+  in[5] = out[5];
+  in[6] = out[6];
+  in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+                          &left[2], &right[2], &left[6], &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+                          &left[1], &right[1], &left[7], &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+                          &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+                                                             int32x4_t *right) {
+  int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+  sl[0] = vaddq_s32(left[0], left[7]);
+  sl[1] = vaddq_s32(left[1], left[6]);
+  sl[2] = vaddq_s32(left[2], left[5]);
+  sl[3] = vaddq_s32(left[3], left[4]);
+  sl[4] = vsubq_s32(left[3], left[4]);
+  sl[5] = vsubq_s32(left[2], left[5]);
+  sl[6] = vsubq_s32(left[1], left[6]);
+  sl[7] = vsubq_s32(left[0], left[7]);
+  sr[0] = vaddq_s32(right[0], right[7]);
+  sr[1] = vaddq_s32(right[1], right[6]);
+  sr[2] = vaddq_s32(right[2], right[5]);
+  sr[3] = vaddq_s32(right[3], right[4]);
+  sr[4] = vsubq_s32(right[3], right[4]);
+  sr[5] = vsubq_s32(right[2], right[5]);
+  sr[6] = vsubq_s32(right[1], right[6]);
+  sr[7] = vsubq_s32(right[0], right[7]);
+
+  // fdct4(step, step);
+  // x0 = s0 + s3;
+  xl[0] = vaddq_s32(sl[0], sl[3]);
+  xr[0] = vaddq_s32(sr[0], sr[3]);
+  // x1 = s1 + s2;
+  xl[1] = vaddq_s32(sl[1], sl[2]);
+  xr[1] = vaddq_s32(sr[1], sr[2]);
+  // x2 = s1 - s2;
+  xl[2] = vsubq_s32(sl[1], sl[2]);
+  xr[2] = vsubq_s32(sr[1], sr[2]);
+  // x3 = s0 - s3;
+  xl[3] = vsubq_s32(sl[0], sl[3]);
+  xr[3] = vsubq_s32(sr[0], sr[3]);
+
+  // fdct4(step, step);
+  // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+                               &left[0], &right[0], &left[4], &right[4]);
+  // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+  // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+                                     cospi_24_64, &left[2], &right[2], &left[6],
+                                     &right[6]);
+
+  // Stage 2
+  // t0 = (s6 - s5) * cospi_16_64;
+  // t1 = (s6 + s5) * cospi_16_64;
+  butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+                               &tr[1], &tl[0], &tr[0]);
+
+  // Stage 3
+  xl[0] = vaddq_s32(sl[4], tl[0]);
+  xr[0] = vaddq_s32(sr[4], tr[0]);
+  xl[1] = vsubq_s32(sl[4], tl[0]);
+  xr[1] = vsubq_s32(sr[4], tr[0]);
+  xl[2] = vsubq_s32(sl[7], tl[1]);
+  xr[2] = vsubq_s32(sr[7], tr[1]);
+  xl[3] = vaddq_s32(sl[7], tl[1]);
+  xr[3] = vaddq_s32(sr[7], tr[1]);
+
+  // Stage 4
+  // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+                                     cospi_28_64, &left[1], &right[1], &left[7],
+                                     &right[7]);
+
+  // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+  // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+                                     cospi_12_64, &left[5], &right[5], &left[3],
+                                     &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+  transpose_s32_8x8_2(left, right, left, right);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+                                                 int32x4_t *right) {
+  vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+  transpose_s32_8x8_2(left, right, left, right);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 0000000000..16f5c5fc0e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,542 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on half vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+                                                     const int16x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int16x4_t *add,
+                                                     int16x4_t *sub) {
+  int16x4_t c = vdup_n_s16(2 * constant);
+  *add = vqrdmulh_s16(vadd_s16(a, b), c);
+  *sub = vqrdmulh_s16(vsub_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulh_s16 operation on full vector
+// can be slightly less accurate, adequate for pass1
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+                                                const int16x8_t b,
+                                                const tran_coef_t constant,
+                                                int16x8_t *add,
+                                                int16x8_t *sub) {
+  int16x8_t c = vdupq_n_s16(2 * constant);
+  *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+  *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  const int16x4_t a_lo = vget_low_s16(a);
+  const int16x4_t a_hi = vget_high_s16(a);
+  const int16x4_t b_lo = vget_low_s16(b);
+  const int16x4_t b_hi = vget_high_s16(b);
+  *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+  butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+                                   &sub_hi);
+  *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+  *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns full 32-bit values, high/low
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int32x4_t *add, int32x4_t *sub) {
+  int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+  *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on half vector
+// more accurate does 32-bit processing, takes 16-bit input values,
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+    const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+    int16x4_t *add, int16x4_t *sub) {
+  int32x4_t add32, sub32;
+  butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+  *add = vmovn_s32(add32);
+  *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+    int32x4_t *sub_hi) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original Variant that performs normal implementation on full vector
+// fully accurate does 32-bit processing, takes 16-bit values
+// returns narrowed down 16-bit values
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+    const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+    int16x8_t *add, int16x8_t *sub) {
+  int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+  butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+                              &sub32_hi);
+  *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+  *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+  *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+  *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+                                                     const int32x4_t b,
+                                                     const tran_coef_t constant,
+                                                     int32x4_t *add,
+                                                     int32x4_t *sub) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+  *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs fast vqrdmulhq_s32 operation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values,
+// high/low
+static INLINE void butterfly_one_coeff_s32_fast(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t c = vdupq_n_s32(constant << 17);
+  *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+  *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+  *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+  *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_one_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac holds the following values:
+  // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c,
+  //     vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c
+  int64x2_t ac[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant);
+  ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant);
+  ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant);
+  ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant);
+
+  sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant);
+  sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant);
+  sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant);
+  sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant);
+  diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant);
+  diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant);
+  diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+    const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+  const int32x2_t a_lo = vget_low_s32(a);
+  const int32x2_t a_hi = vget_high_s32(a);
+  const int32x2_t b_lo = vget_low_s32(b);
+  const int32x2_t b_hi = vget_high_s32(b);
+
+  const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+  const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+  const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+  const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+  const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+  const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+  const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+  const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+  *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+  *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+                      vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 64-bit values
+// returns results without rounding
+static INLINE void butterfly_two_coeff_s32_s64_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/,
+    int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/,
+    int64x2_t *sub_hi /*[2]*/) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+
+  sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on full vector
+// more accurate does 64-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // ac1/ac2 hold the following values:
+  // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+  //      vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+  // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+  //      vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+  int64x2_t ac1[4];
+  int64x2_t ac2[4];
+  int64x2_t sum[4];
+  int64x2_t diff[4];
+
+  ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+  ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+  ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+  ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+  ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+  ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+  ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+  ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+  sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+  sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+  sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+  sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+  *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+  *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+  diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+  diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+  diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+  diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+  *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+  *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+                         vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+    const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+    const int16x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+  const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+  const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+  const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+  *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+  *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+  *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+  *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32_noround(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+  *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+  *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+  *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant that performs normal implementation on half vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+                                            const int16x4_t b,
+                                            const tran_coef_t constant1,
+                                            const tran_coef_t constant2,
+                                            int16x4_t *add, int16x4_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(a, constant1);
+  const int32x4_t a2 = vmull_n_s16(a, constant2);
+  const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+  const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+  *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+  *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 16-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t constant1,
+                                       const tran_coef_t constant2,
+                                       int16x8_t *add, int16x8_t *sub) {
+  const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+  const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+  const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+  const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+  const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+  const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+  const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+  const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original Variant that performs normal implementation on full vector
+// more accurate does 32-bit processing, takes and returns 32-bit values
+// returns narrowed results
+static INLINE void butterfly_two_coeff_s32(
+    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+    const int32x4_t b_hi, const tran_coef_t constant1,
+    const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+    int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+  const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+  const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+  const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+  const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+  const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+  const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+  const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+  const int16x8_t one = vdupq_n_s16(1);
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift and round,
+// return narrowed results
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+                                                   const int32x4_t a_hi) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+  const int16x4_t b_lo =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+  const int16x4_t b_hi =
+      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+  return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift and round
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+  const int32x4_t one = vdupq_n_s32(1);
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+  const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+  const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+  return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
+}
+
+static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/,
+                                             const int64x2_t *b /*[2]*/) {
+  int64x2_t result[2];
+  result[0] = vaddq_s64(a[0], b[0]);
+  result[1] = vaddq_s64(a[1], b[1]);
+  return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+                      vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/,
+                                             const int64x2_t *b /*[2]*/) {
+  int64x2_t result[2];
+  result[0] = vsubq_s64(a[0], b[0]);
+  result[1] = vsubq_s64(a[1], b[1]);
+  return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS),
+                      vrshrn_n_s64(result[1], DCT_CONST_BITS));
+}
+
+static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a,
+                                           const int32x4_t b) {
+  int64x2_t a64[2], b64[2], result[2];
+  a64[0] = vmovl_s32(vget_low_s32(a));
+  a64[1] = vmovl_s32(vget_high_s32(a));
+  b64[0] = vmovl_s32(vget_low_s32(b));
+  b64[1] = vmovl_s32(vget_high_s32(b));
+  result[0] = vaddq_s64(a64[0], b64[0]);
+  result[1] = vaddq_s64(a64[1], b64[1]);
+  return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a,
+                                           const int32x4_t b) {
+  int64x2_t a64[2], b64[2], result[2];
+  a64[0] = vmovl_s32(vget_low_s32(a));
+  a64[1] = vmovl_s32(vget_high_s32(a));
+  b64[0] = vmovl_s32(vget_low_s32(b));
+  b64[1] = vmovl_s32(vget_high_s32(b));
+  result[0] = vsubq_s64(a64[0], b64[0]);
+  result[1] = vsubq_s64(a64[1], b64[1]);
+  return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1]));
+}
+
+#endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
new file mode 100644
index 0000000000..ee9e599e09
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -0,0 +1,183 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int16x4_t a0, a1, a2, a3;
+  int16x8_t b0, b1;
+  int16x8_t c;
+
+  a0 = vld1_s16(input);
+  input += stride;
+  a1 = vld1_s16(input);
+  input += stride;
+  a2 = vld1_s16(input);
+  input += stride;
+  a3 = vld1_s16(input);
+
+  b0 = vcombine_s16(a0, a1);
+  b1 = vcombine_s16(a2, a3);
+
+  c = vaddq_s16(b0, b1);
+
+  output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1);
+  output[1] = 0;
+}
+
+// Visual Studio 2022 (cl.exe) < 17.7 targeting AArch64 with optimizations
+// enabled will fail with an internal compiler error. See:
+// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110
+#if defined(_MSC_VER) && _MSC_VER < 1937 && defined(_M_ARM64) && \
+    !defined(__clang__)
+#define AOM_WORK_AROUND_MSVC_BUG_10346110
+#endif
+
+#ifdef AOM_WORK_AROUND_MSVC_BUG_10346110
+#pragma optimize("", off)
+#endif
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
+  int r;
+  int16x8_t sum = vld1q_s16(&input[0]);
+
+  for (r = 1; r < 8; ++r) {
+    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+    sum = vaddq_s16(sum, input_00);
+  }
+
+  output[0] = (tran_low_t)horizontal_add_int16x8(sum);
+  output[1] = 0;
+}
+#ifdef AOM_WORK_AROUND_MSVC_BUG_10346110
+#pragma optimize("", on)
+#endif
+#undef AOM_WORK_AROUND_MSVC_BUG_10346110
+
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  int r;
+  int16x8_t left = vld1q_s16(input);
+  int16x8_t right = vld1q_s16(input + 8);
+  int32_t sum;
+  input += stride;
+
+  for (r = 1; r < 16; ++r) {
+    const int16x8_t a = vld1q_s16(input);
+    const int16x8_t b = vld1q_s16(input + 8);
+    input += stride;
+    left = vaddq_s16(left, a);
+    right = vaddq_s16(right, b);
+  }
+
+  sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right);
+
+  output[0] = (tran_low_t)(sum >> 1);
+  output[1] = 0;
+}
+
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  int r;
+  int16x8_t a0 = vld1q_s16(input);
+  int16x8_t a1 = vld1q_s16(input + 8);
+  int16x8_t a2 = vld1q_s16(input + 16);
+  int16x8_t a3 = vld1q_s16(input + 24);
+  int32_t sum;
+  input += stride;
+
+  for (r = 1; r < 32; ++r) {
+    const int16x8_t b0 = vld1q_s16(input);
+    const int16x8_t b1 = vld1q_s16(input + 8);
+    const int16x8_t b2 = vld1q_s16(input + 16);
+    const int16x8_t b3 = vld1q_s16(input + 24);
+    input += stride;
+    a0 = vaddq_s16(a0, b0);
+    a1 = vaddq_s16(a1, b1);
+    a2 = vaddq_s16(a2, b2);
+    a3 = vaddq_s16(a3, b3);
+  }
+
+  sum = horizontal_add_int16x8(a0);
+  sum += horizontal_add_int16x8(a1);
+  sum += horizontal_add_int16x8(a2);
+  sum += horizontal_add_int16x8(a3);
+  output[0] = (tran_low_t)(sum >> 3);
+  output[1] = 0;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+                                 int stride) {
+  int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                               vdupq_n_s32(0) };
+  int32_t sum;
+
+  int r = 0;
+  do {
+    const int16x8_t a = vld1q_s16(input);
+    const int16x8_t b = vld1q_s16(input + 8);
+    input += stride;
+    partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a));
+    partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a));
+    partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b));
+    partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b));
+    r++;
+  } while (r < 16);
+
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+  partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+  sum = horizontal_add_int32x4(partial_sum[0]);
+
+  output[0] = (tran_low_t)(sum >> 1);
+  output[1] = 0;
+}
+
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+                                 int stride) {
+  int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                               vdupq_n_s32(0) };
+
+  int32_t sum;
+
+  int r = 0;
+  do {
+    const int16x8_t a0 = vld1q_s16(input);
+    const int16x8_t a1 = vld1q_s16(input + 8);
+    const int16x8_t a2 = vld1q_s16(input + 16);
+    const int16x8_t a3 = vld1q_s16(input + 24);
+    input += stride;
+    partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0));
+    partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0));
+    partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1));
+    partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1));
+    partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2));
+    partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2));
+    partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3));
+    partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3));
+    r++;
+  } while (r < 32);
+
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+  partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+  partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+  sum = horizontal_add_int32x4(partial_sum[0]);
+
+  output[0] = (tran_low_t)(sum >> 3);
+  output[1] = 0;
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index e9503f13d7..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-
-void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
-  int i;
-  // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }  // for
-  {
-    // from vpx_dct_sse2.c
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
-    // store results
-    vst1q_s16(&final_output[0 * 8], input_0);
-    vst1q_s16(&final_output[1 * 8], input_1);
-    vst1q_s16(&final_output[2 * 8], input_2);
-    vst1q_s16(&final_output[3 * 8], input_3);
-    vst1q_s16(&final_output[4 * 8], input_4);
-    vst1q_s16(&final_output[5 * 8], input_5);
-    vst1q_s16(&final_output[6 * 8], input_6);
-    vst1q_s16(&final_output[7 * 8], input_7);
-  }
-}
-
-void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
-  int r;
-  int16x8_t sum = vld1q_s16(&input[0]);
-  for (r = 1; r < 8; ++r) {
-    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
-    sum = vaddq_s16(sum, input_00);
-  }
-  {
-    const int32x4_t a = vpaddlq_s16(sum);
-    const int64x2_t b = vpaddlq_s32(a);
-    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                                 vreinterpret_s32_s64(vget_high_s64(b)));
-    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
-    output[1] = 0;
-  }
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
index 977323497a..f5a044be4d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -11,6 +11,9 @@
 #include <arm_neon.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 
 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
@@ -44,8 +47,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
   *a7 = vaddq_s16(c1, c5);
 }
 
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
-                           int16_t *coeff) {
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
   int16x8_t a0 = vld1q_s16(src_diff);
   int16x8_t a1 = vld1q_s16(src_diff + src_stride);
   int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
@@ -63,18 +66,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
 
   // Skip the second transpose because it is not required.
 
-  vst1q_s16(coeff + 0, a0);
-  vst1q_s16(coeff + 8, a1);
-  vst1q_s16(coeff + 16, a2);
-  vst1q_s16(coeff + 24, a3);
-  vst1q_s16(coeff + 32, a4);
-  vst1q_s16(coeff + 40, a5);
-  vst1q_s16(coeff + 48, a6);
-  vst1q_s16(coeff + 56, a7);
+  store_s16q_to_tran_low(coeff + 0, a0);
+  store_s16q_to_tran_low(coeff + 8, a1);
+  store_s16q_to_tran_low(coeff + 16, a2);
+  store_s16q_to_tran_low(coeff + 24, a3);
+  store_s16q_to_tran_low(coeff + 32, a4);
+  store_s16q_to_tran_low(coeff + 40, a5);
+  store_s16q_to_tran_low(coeff + 48, a6);
+  store_s16q_to_tran_low(coeff + 56, a7);
 }
 
-void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
-                             int16_t *coeff) {
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
   int i;
 
   /* Rearrange 16x16 to 8x32 and remove stride.
@@ -88,10 +91,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
   vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
 
   for (i = 0; i < 64; i += 8) {
-    const int16x8_t a0 = vld1q_s16(coeff + 0);
-    const int16x8_t a1 = vld1q_s16(coeff + 64);
-    const int16x8_t a2 = vld1q_s16(coeff + 128);
-    const int16x8_t a3 = vld1q_s16(coeff + 192);
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
 
     const int16x8_t b0 = vhaddq_s16(a0, a1);
     const int16x8_t b1 = vhsubq_s16(a0, a1);
@@ -103,10 +106,52 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
     const int16x8_t c2 = vsubq_s16(b0, b2);
     const int16x8_t c3 = vsubq_s16(b1, b3);
 
-    vst1q_s16(coeff + 0, c0);
-    vst1q_s16(coeff + 64, c1);
-    vst1q_s16(coeff + 128, c2);
-    vst1q_s16(coeff + 192, c3);
+    store_s16q_to_tran_low(coeff + 0, c0);
+    store_s16q_to_tran_low(coeff + 64, c1);
+    store_s16q_to_tran_low(coeff + 128, c2);
+    store_s16q_to_tran_low(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
+
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  int i;
+
+  /* Rearrange 32x32 to 16x64 and remove stride.
+   * Top left first. */
+  vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+                          coeff + 256);
+  /* Bottom left. */
+  vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+                          coeff + 512);
+  /* Bottom right. */
+  vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+                          coeff + 768);
+
+  for (i = 0; i < 256; i += 8) {
+    const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+    const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+    const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+    const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+    const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1);
+    const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1);
+    const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1);
+    const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    store_s16q_to_tran_low(coeff + 0, c0);
+    store_s16q_to_tran_low(coeff + 256, c1);
+    store_s16q_to_tran_low(coeff + 512, c2);
+    store_s16q_to_tran_low(coeff + 768, c3);
 
     coeff += 8;
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..4265596c8c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,140 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p);
+  const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p);
+  return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4;
+}
+
+uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+  load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  sum = vaddq_u16(a0, a1);
+  sum = vaddq_u16(sum, a2);
+  sum = vaddq_u16(sum, a3);
+  sum = vaddq_u16(sum, a4);
+  sum = vaddq_u16(sum, a5);
+  sum = vaddq_u16(sum, a6);
+  sum = vaddq_u16(sum, a7);
+
+  return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6;
+}
+
+// coeff: 32 bits, dynamic range [-2147483648, 2147483647].
+// length: value range {16, 64, 256, 1024}.
+// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024]
+int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) {
+  int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    int32x4_t abs0, abs1;
+    const int32x4_t s0 = load_tran_low_to_s32q(coeff);
+    const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4);
+
+    abs0 = vabsq_s32(s0);
+    sum_s64[0] = vpadalq_s32(sum_s64[0], abs0);
+    abs1 = vabsq_s32(s1);
+    sum_s64[1] = vpadalq_s32(sum_s64[1], abs1);
+
+    length -= 8;
+    coeff += 8;
+  } while (length != 0);
+
+  return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1]));
+}
+
+void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+                                int dp, int *min, int *max) {
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+  const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+  const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+  const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+  const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+  const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+  const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+  const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+  const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+  const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+  const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+  const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+  const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+  const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+  const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+  const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+  const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+  const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+  const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+  const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+  const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+  const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+  const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+  const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+  const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+  const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+  const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+  const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+  const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+  const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+  const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+  const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+  const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+  const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+  const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+  const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+  const uint16x8_t min0123 = vminq_u16(min01, min23);
+  const uint16x8_t min4567 = vminq_u16(min45, min67);
+  const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if VPX_ARCH_AARCH64
+  *min = *max = 0;  // Clear high bits
+  *((uint16_t *)max) = vmaxvq_u16(max07);
+  *((uint16_t *)min) = vminvq_u16(min07);
+#else
+  // Split into 64-bit vectors and execute pairwise min/max.
+  uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+  uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+  // Enough runs of vpmax/min propagate the max/min values to every position.
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  ab_max = vpmax_u16(ab_max, ab_max);
+  ab_min = vpmin_u16(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u16((uint16_t *)max, ab_max, 0);
+  vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..3063acbb3e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {
+  int i = height;
+  if (width > 8) {
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t p = vld1q_u16(pred + j);
+        const uint16x8_t r = vld1q_u16(ref + j);
+
+        uint16x8_t avg = vrhaddq_u16(p, r);
+        vst1q_u16(comp_pred + j, avg);
+
+        j += 8;
+      } while (j < width);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else if (width == 8) {
+    do {
+      const uint16x8_t p = vld1q_u16(pred);
+      const uint16x8_t r = vld1q_u16(ref);
+
+      uint16x8_t avg = vrhaddq_u16(p, r);
+      vst1q_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  } else {
+    assert(width == 4);
+    do {
+      const uint16x4_t p = vld1_u16(pred);
+      const uint16x4_t r = vld1_u16(ref);
+
+      uint16x4_t avg = vrhadd_u16(p, r);
+      vst1_u16(comp_pred, avg);
+
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h
new file mode 100644
index 0000000000..ccc4a797b7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
+#define VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE uint16x4_t highbd_convolve4_4_neon(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) {
+  int32x4_t sum = vmull_lane_s16(s0, filters, 0);
+  sum = vmlal_lane_s16(sum, s1, filters, 1);
+  sum = vmlal_lane_s16(sum, s2, filters, 2);
+  sum = vmlal_lane_s16(sum, s3, filters, 3);
+
+  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+  return vmin_u16(res, max);
+}
+
+static INLINE uint16x8_t highbd_convolve4_8_neon(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) {
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2);
+  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2);
+  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3);
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                                vqrshrun_n_s32(sum1, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+#endif  // VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h
new file mode 100644
index 0000000000..bc90d9b4dd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h
@@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_SVE_H_
+#define VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_SVE_H_
+
+#include <arm_neon.h>
+
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+static INLINE uint16x4_t highbd_convolve4_4_sve(const int16x4_t s[4],
+                                                const int16x8_t filter,
+                                                const uint16x4_t max) {
+  int16x8_t s01 = vcombine_s16(s[0], s[1]);
+  int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0);
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0);
+
+  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+  uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+  return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve4_8_sve(const int16x8_t s[4],
+                                                const int16x8_t filter,
+                                                const uint16x8_t max,
+                                                uint16x8_t idx) {
+  int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0);
+  int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0);
+  int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0);
+  int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0);
+
+  int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+                                vqrshrun_n_s32(res1, FILTER_BITS));
+
+  res = vpx_tbl_u16(res, idx);
+
+  return vminq_u16(res, max);
+}
+
+static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4],
+                                            const int16x8_t filter,
+                                            const uint16x4_t max) {
+  int64x2_t sum[4];
+
+  sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+  sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+  sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+  sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+
+  sum[0] = vpaddq_s64(sum[0], sum[1]);
+  sum[2] = vpaddq_s64(sum[2], sum[3]);
+
+  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2]));
+
+  uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS);
+  return vmin_u16(res_u16, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8],
+                                            const int16x8_t filter,
+                                            const uint16x8_t max) {
+  int64x2_t sum[8];
+
+  sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter);
+  sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter);
+  sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter);
+  sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter);
+  sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter);
+  sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter);
+  sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter);
+  sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter);
+
+  int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
+  int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
+  int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
+  int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);
+
+  int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+  int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
+                                vqrshrun_n_s32(res1, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+#endif  // VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_SVE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..7be88f6bcb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,215 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+                                                   int16x8_t *a2, int16x8_t *a3,
+                                                   int16x8_t *a4, int16x8_t *a5,
+                                                   int16x8_t *a6,
+                                                   int16x8_t *a7) {
+  int16x8_t b0 = vaddq_s16(*a0, *a1);
+  int16x8_t b1 = vsubq_s16(*a0, *a1);
+  int16x8_t b2 = vaddq_s16(*a2, *a3);
+  int16x8_t b3 = vsubq_s16(*a2, *a3);
+  int16x8_t b4 = vaddq_s16(*a4, *a5);
+  int16x8_t b5 = vsubq_s16(*a4, *a5);
+  int16x8_t b6 = vaddq_s16(*a6, *a7);
+  int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  int16x8_t c0 = vaddq_s16(b0, b2);
+  int16x8_t c2 = vsubq_s16(b0, b2);
+  int16x8_t c1 = vaddq_s16(b1, b3);
+  int16x8_t c3 = vsubq_s16(b1, b3);
+  int16x8_t c4 = vaddq_s16(b4, b6);
+  int16x8_t c6 = vsubq_s16(b4, b6);
+  int16x8_t c5 = vaddq_s16(b5, b7);
+  int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a2 = vsubq_s16(c0, c4);
+  *a7 = vaddq_s16(c1, c5);
+  *a6 = vsubq_s16(c1, c5);
+  *a3 = vaddq_s16(c2, c6);
+  *a1 = vsubq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+}
+
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+                                                    int16x4_t a2, int16x4_t a3,
+                                                    int16x4_t a4, int16x4_t a5,
+                                                    int16x4_t a6, int16x4_t a7,
+                                                    tran_low_t *coeff) {
+  int32x4_t b0 = vaddl_s16(a0, a1);
+  int32x4_t b1 = vsubl_s16(a0, a1);
+  int32x4_t b2 = vaddl_s16(a2, a3);
+  int32x4_t b3 = vsubl_s16(a2, a3);
+  int32x4_t b4 = vaddl_s16(a4, a5);
+  int32x4_t b5 = vsubl_s16(a4, a5);
+  int32x4_t b6 = vaddl_s16(a6, a7);
+  int32x4_t b7 = vsubl_s16(a6, a7);
+
+  int32x4_t c0 = vaddq_s32(b0, b2);
+  int32x4_t c2 = vsubq_s32(b0, b2);
+  int32x4_t c1 = vaddq_s32(b1, b3);
+  int32x4_t c3 = vsubq_s32(b1, b3);
+  int32x4_t c4 = vaddq_s32(b4, b6);
+  int32x4_t c6 = vsubq_s32(b4, b6);
+  int32x4_t c5 = vaddq_s32(b5, b7);
+  int32x4_t c7 = vsubq_s32(b5, b7);
+
+  int32x4_t d0 = vaddq_s32(c0, c4);
+  int32x4_t d2 = vsubq_s32(c0, c4);
+  int32x4_t d7 = vaddq_s32(c1, c5);
+  int32x4_t d6 = vsubq_s32(c1, c5);
+  int32x4_t d3 = vaddq_s32(c2, c6);
+  int32x4_t d1 = vsubq_s32(c2, c6);
+  int32x4_t d4 = vaddq_s32(c3, c7);
+  int32x4_t d5 = vsubq_s32(c3, c7);
+
+  store_s32q_to_tran_low(coeff + 0, d0);
+  store_s32q_to_tran_low(coeff + 4, d1);
+  store_s32q_to_tran_low(coeff + 8, d2);
+  store_s32q_to_tran_low(coeff + 12, d3);
+  store_s32q_to_tran_low(coeff + 16, d4);
+  store_s32q_to_tran_low(coeff + 20, d5);
+  store_s32q_to_tran_low(coeff + 24, d6);
+  store_s32q_to_tran_low(coeff + 28, d7);
+}
+
+void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  int16x4_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+  int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride);
+  int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride);
+  int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
+  hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+  // For the second pass we need to widen to 32-bit elements, so we're
+  // processing 4 columns at a time.
+  // Skip the second transpose because it is not required.
+
+  b0 = vget_low_s16(s0);
+  b1 = vget_low_s16(s1);
+  b2 = vget_low_s16(s2);
+  b3 = vget_low_s16(s3);
+  b4 = vget_low_s16(s4);
+  b5 = vget_low_s16(s5);
+  b6 = vget_low_s16(s6);
+  b7 = vget_low_s16(s7);
+
+  hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff);
+
+  b0 = vget_high_s16(s0);
+  b1 = vget_high_s16(s1);
+  b2 = vget_high_s16(s2);
+  b3 = vget_high_s16(s3);
+  b4 = vget_high_s16(s4);
+  b5 = vget_high_s16(s5);
+  b6 = vget_high_s16(s6);
+  b7 = vget_high_s16(s7);
+
+  hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32);
+}
+
+void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int i = 0;
+
+  // Rearrange 16x16 to 8x32 and remove stride.
+  // Top left first.
+  vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+  // Top right.
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+  // Bottom left.
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+                               coeff + 128);
+  // Bottom right.
+  vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+                               coeff + 192);
+
+  do {
+    int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i);
+    int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64);
+    int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128);
+    int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192);
+
+    int32x4_t b0 = vhaddq_s32(a0, a1);
+    int32x4_t b1 = vhsubq_s32(a0, a1);
+    int32x4_t b2 = vhaddq_s32(a2, a3);
+    int32x4_t b3 = vhsubq_s32(a2, a3);
+
+    int32x4_t c0 = vaddq_s32(b0, b2);
+    int32x4_t c1 = vaddq_s32(b1, b3);
+    int32x4_t c2 = vsubq_s32(b0, b2);
+    int32x4_t c3 = vsubq_s32(b1, b3);
+
+    store_s32q_to_tran_low(coeff + 4 * i, c0);
+    store_s32q_to_tran_low(coeff + 4 * i + 64, c1);
+    store_s32q_to_tran_low(coeff + 4 * i + 128, c2);
+    store_s32q_to_tran_low(coeff + 4 * i + 192, c3);
+  } while (++i < 16);
+}
+
+void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int i = 0;
+
+  // Rearrange 32x32 to 16x64 and remove stride.
+  // Top left first.
+  vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+  // Top right.
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+  // Bottom left.
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+                                 coeff + 512);
+  // Bottom right.
+  vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+                                 coeff + 768);
+
+  do {
+    int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i);
+    int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256);
+    int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512);
+    int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768);
+
+    int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+    int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+    int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+    int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+    int32x4_t c0 = vaddq_s32(b0, b2);
+    int32x4_t c1 = vaddq_s32(b1, b3);
+    int32x4_t c2 = vsubq_s32(b0, b2);
+    int32x4_t c3 = vsubq_s32(b1, b3);
+
+    store_s32q_to_tran_low(coeff + 4 * i, c0);
+    store_s32q_to_tran_low(coeff + 4 * i + 256, c1);
+    store_s32q_to_tran_low(coeff + 4 * i + 512, c2);
+    store_s32q_to_tran_low(coeff + 4 * i + 768, c3);
+  } while (++i < 64);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
new file mode 100644
index 0000000000..654ab42ca4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -0,0 +1,1361 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) {
+  int32x2x2_t t32;
+
+  t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS);
+  t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS);
+  return vcombine_s32(t32.val[0], t32.val[1]);
+}
+
+static INLINE void dct_const_round_shift_high_4_dual(
+    const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) {
+  *d0 = dct_const_round_shift_high_4(in[0]);
+  *d1 = dct_const_round_shift_high_4(in[1]);
+}
+
+static INLINE int32x4x2_t
+dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) {
+  int32x4x2_t out;
+  out.val[0] = dct_const_round_shift_high_4(in[0]);
+  out.val[1] = dct_const_round_shift_high_4(in[1]);
+  return out;
+}
+
+static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in,
+                                                    int32x4x2_t *const d0,
+                                                    int32x4x2_t *const d1) {
+  *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0);
+  *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2);
+}
+
+static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
+                                          const int32x4x2_t s1,
+                                          const int32x4_t cospi_2_30_10_22,
+                                          int32x4x2_t *const d0,
+                                          int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 1);
+  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_2_30_10_22), 0);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0,
+                                          const int32x4x2_t s1,
+                                          const int32x4_t cospi_4_12_20N_28,
+                                          int32x4x2_t *const d0,
+                                          int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 1);
+  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 0);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0,
+                                          const int32x4x2_t s1,
+                                          const int32x4_t cospi_6_26N_14_18N,
+                                          int32x4x2_t *const d0,
+                                          int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 0);
+  t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_6_26N_14_18N), 1);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0,
+                                           const int32x4x2_t s1,
+                                           const int32x4_t cospi_2_30_10_22,
+                                           int32x4x2_t *const d0,
+                                           int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 1);
+  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_2_30_10_22), 0);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0,
+                                           const int32x4x2_t s1,
+                                           const int32x4_t cospi_4_12_20N_28,
+                                           int32x4x2_t *const d0,
+                                           int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_4_12_20N_28), 1);
+  t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_4_12_20N_28), 0);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0,
+                                           const int32x4x2_t s1,
+                                           const int32x4_t cospi_6_26N_14_18N,
+                                           int32x4x2_t *const d0,
+                                           int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 0);
+  t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_6_26N_14_18N), 1);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_q_kernel(
+    const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24,
+    int64x2x2_t *const t) {
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 1);
+  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_d_kernel(
+    const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24,
+    int64x2x2_t *const t) {
+  t[0].val[0] =
+      vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+  t[0].val[1] =
+      vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+  t[1].val[0] =
+      vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+  t[1].val[1] =
+      vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0),
+                               vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0,
+                                            const int32x4x2_t s1,
+                                            const int32x4_t cospi_0_8_16_24,
+                                            int32x4x2_t *const d0,
+                                            int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0,
+                                            const int32x4_t s1,
+                                            const int32x4_t cospi_0_8_16_24,
+                                            int32x4_t *const d0,
+                                            int32x4_t *const d1) {
+  int64x2x2_t t[2];
+
+  highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+  dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,
+                                                const int32x4x2_t s1,
+                                                const int32x4_t cospi_0_8_16_24,
+                                                int32x4x2_t *const d0,
+                                                int32x4x2_t *const d1) {
+  int64x2x2_t t[4];
+
+  highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+  t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]);
+  t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]);
+  t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]);
+  t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,
+                                                const int32x4_t s1,
+                                                const int32x4_t cospi_0_8_16_24,
+                                                int32x4_t *const d0,
+                                                int32x4_t *const d1) {
+  int64x2x2_t t[2];
+
+  highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+  t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]);
+  t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]);
+  dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0,
+                                             const int32x4x2_t s1,
+                                             const int32x4_t cospi_0_8_16_24,
+                                             int32x4x2_t *const d0,
+                                             int32x4x2_t *const d1) {
+  int64x2x2_t t[6];
+
+  t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  dct_const_round_shift_high_4x2x2(t, d0, d1);
+}
+
+static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0,
+                                             const int32x4_t s1,
+                                             const int32x4_t cospi_0_8_16_24,
+                                             int32x4_t *const d0,
+                                             int32x4_t *const d1) {
+  int64x2x2_t t[3];
+
+  t[2].val[0] =
+      vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+  t[2].val[1] =
+      vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+  t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0),
+                               vget_high_s32(cospi_0_8_16_24), 0);
+  dct_const_round_shift_high_4_dual(t, d0, d1);
+}
+
+static INLINE void highbd_idct16x16_add_stage7_dual(
+    const int32x4x2_t *const step2, int32x4x2_t *const out) {
+  out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]);
+  out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]);
+  out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]);
+  out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]);
+  out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]);
+  out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]);
+  out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]);
+  out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]);
+  out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]);
+  out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]);
+  out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]);
+  out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]);
+  out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]);
+  out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]);
+  out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]);
+  out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]);
+  out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]);
+  out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]);
+  out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]);
+  out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]);
+  out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]);
+  out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]);
+  out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]);
+  out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]);
+  out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]);
+  out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]);
+  out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]);
+  out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]);
+  out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]);
+  out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]);
+  out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]);
+  out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
+                                               int32x4_t *const out) {
+  out[0] = vaddq_s32(step2[0], step2[15]);
+  out[1] = vaddq_s32(step2[1], step2[14]);
+  out[2] = vaddq_s32(step2[2], step2[13]);
+  out[3] = vaddq_s32(step2[3], step2[12]);
+  out[4] = vaddq_s32(step2[4], step2[11]);
+  out[5] = vaddq_s32(step2[5], step2[10]);
+  out[6] = vaddq_s32(step2[6], step2[9]);
+  out[7] = vaddq_s32(step2[7], step2[8]);
+  out[8] = vsubq_s32(step2[7], step2[8]);
+  out[9] = vsubq_s32(step2[6], step2[9]);
+  out[10] = vsubq_s32(step2[5], step2[10]);
+  out[11] = vsubq_s32(step2[4], step2[11]);
+  out[12] = vsubq_s32(step2[3], step2[12]);
+  out[13] = vsubq_s32(step2[2], step2[13]);
+  out[14] = vsubq_s32(step2[1], step2[14]);
+  out[15] = vsubq_s32(step2[0], step2[15]);
+}
+
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+                                         uint16_t *dest, const int stride,
+                                         const int bd) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4x2_t in[16], step1[16], step2[16], out[16];
+
+  // Load input (16x8)
+  in[0].val[0] = vld1q_s32(input);
+  in[0].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[8].val[0] = vld1q_s32(input);
+  in[8].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[1].val[0] = vld1q_s32(input);
+  in[1].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[9].val[0] = vld1q_s32(input);
+  in[9].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[2].val[0] = vld1q_s32(input);
+  in[2].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[10].val[0] = vld1q_s32(input);
+  in[10].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[3].val[0] = vld1q_s32(input);
+  in[3].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[11].val[0] = vld1q_s32(input);
+  in[11].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[4].val[0] = vld1q_s32(input);
+  in[4].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[12].val[0] = vld1q_s32(input);
+  in[12].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[5].val[0] = vld1q_s32(input);
+  in[5].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[13].val[0] = vld1q_s32(input);
+  in[13].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[6].val[0] = vld1q_s32(input);
+  in[6].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[14].val[0] = vld1q_s32(input);
+  in[14].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[7].val[0] = vld1q_s32(input);
+  in[7].val[1] = vld1q_s32(input + 4);
+  input += 8;
+  in[15].val[0] = vld1q_s32(input);
+  in[15].val[1] = vld1q_s32(input + 4);
+
+  // Transpose
+  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[1] = in[16 / 2];
+  step1[2] = in[8 / 2];
+  step1[3] = in[24 / 2];
+  step1[4] = in[4 / 2];
+  step1[5] = in[20 / 2];
+  step1[6] = in[12 / 2];
+  step1[7] = in[28 / 2];
+  step1[8] = in[2 / 2];
+  step1[9] = in[18 / 2];
+  step1[10] = in[10 / 2];
+  step1[11] = in[26 / 2];
+  step1[12] = in[6 / 2];
+  step1[13] = in[22 / 2];
+  step1[14] = in[14 / 2];
+  step1[15] = in[30 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+  highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8],
+                         &step2[15]);
+  highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
+                          &step2[14]);
+  highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+                          &step2[13]);
+  highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
+                         &step2[12]);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4],
+                         &step1[7]);
+  highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5],
+                          &step1[6]);
+  step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]);
+  step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]);
+  step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]);
+  step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]);
+  step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]);
+  step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]);
+  step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]);
+  step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]);
+  step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]);
+  step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]);
+  step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]);
+  step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]);
+  step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]);
+  step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]);
+  step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]);
+  step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]);
+
+  // stage 4
+  highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1],
+                            &step2[0]);
+  highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2],
+                           &step2[3]);
+  step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]);
+  step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]);
+  step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]);
+  step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]);
+  step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]);
+  step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]);
+  step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]);
+  step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]);
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]);
+  step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]);
+  step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]);
+  step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]);
+  step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]);
+  step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]);
+  step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]);
+  step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]);
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]);
+  step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]);
+  step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]);
+  step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]);
+  step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]);
+  step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]);
+  step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]);
+  step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]);
+  step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]);
+  step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]);
+  step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]);
+  step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]);
+  step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]);
+  step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]);
+  step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]);
+  step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]);
+
+  // stage 6
+  step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]);
+  step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]);
+  step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]);
+  step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]);
+  step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]);
+  step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]);
+  step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]);
+  step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]);
+  step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]);
+  step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]);
+  step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]);
+  step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]);
+  step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]);
+  step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]);
+  step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]);
+  step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]);
+  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7_dual(step2, out);
+
+  if (output) {
+    highbd_idct16x16_store_pass1(out, output);
+  } else {
+    highbd_idct16x16_add_store(out, dest, stride, bd);
+  }
+}
+
+static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s,
+                                                       const int32x2_t coef) {
+  int64x2x2_t t[2];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0);
+  return dct_const_round_shift_high_4x2_int64x2x2(t);
+}
+
+static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s,
+                                                const int32x2_t coef) {
+  int64x2x2_t t;
+
+  t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0);
+  t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0);
+  return dct_const_round_shift_high_4(t);
+}
+
+static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s,
+                                                       const int32x2_t coef) {
+  int64x2x2_t t[2];
+
+  t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1);
+  t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1);
+  t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1);
+  t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1);
+  return dct_const_round_shift_high_4x2_int64x2x2(t);
+}
+
+static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s,
+                                                const int32x2_t coef) {
+  int64x2x2_t t;
+
+  t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1);
+  t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1);
+  return dct_const_round_shift_high_4(t);
+}
+
+static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input,
+                                               int32_t *output, uint16_t *dest,
+                                               const int stride, const int bd) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4x2_t in[8], step1[16], step2[16], out[16];
+
+  // Load input (8x8)
+  in[0].val[0] = vld1q_s32(input);
+  in[0].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[1].val[0] = vld1q_s32(input);
+  in[1].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[2].val[0] = vld1q_s32(input);
+  in[2].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[3].val[0] = vld1q_s32(input);
+  in[3].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[4].val[0] = vld1q_s32(input);
+  in[4].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[5].val[0] = vld1q_s32(input);
+  in[5].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[6].val[0] = vld1q_s32(input);
+  in[6].val[1] = vld1q_s32(input + 4);
+  input += 16;
+  in[7].val[0] = vld1q_s32(input);
+  in[7].val[1] = vld1q_s32(input + 4);
+
+  // Transpose
+  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[2] = in[8 / 2];
+  step1[4] = in[4 / 2];
+  step1[6] = in[12 / 2];
+  step1[8] = in[2 / 2];
+  step1[10] = in[10 / 2];
+  step1[12] = in[6 / 2];
+  step1[14] = in[14 / 2];  // 0 in pass 1
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[2] = step1[2];
+  step2[4] = step1[4];
+  step2[6] = step1[6];
+  step2[8] =
+      highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+  step2[9] = highbd_idct_cospi_lane1_dual(step1[14],
+                                          vget_high_s32(cospi_6_26N_14_18N));
+  step2[10] =
+      highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+  step2[11] =
+      highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[12] =
+      highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[13] =
+      highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+  step2[14] = highbd_idct_cospi_lane0_dual(step1[14],
+                                           vget_high_s32(cospi_6_26N_14_18N));
+  step2[15] =
+      highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[4] =
+      highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+  step1[5] =
+      highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28));
+  step1[6] =
+      highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28));
+  step1[7] =
+      highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+  step1[8] = highbd_idct_add_dual(step2[8], step2[9]);
+  step1[9] = highbd_idct_sub_dual(step2[8], step2[9]);
+  step1[10] = highbd_idct_sub_dual(step2[11], step2[10]);
+  step1[11] = highbd_idct_add_dual(step2[11], step2[10]);
+  step1[12] = highbd_idct_add_dual(step2[12], step2[13]);
+  step1[13] = highbd_idct_sub_dual(step2[12], step2[13]);
+  step1[14] = highbd_idct_sub_dual(step2[15], step2[14]);
+  step1[15] = highbd_idct_add_dual(step2[15], step2[14]);
+
+  // stage 4
+  step2[0] = step2[1] =
+      highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+  step2[2] =
+      highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24));
+  step2[3] =
+      highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24));
+  step2[4] = highbd_idct_add_dual(step1[4], step1[5]);
+  step2[5] = highbd_idct_sub_dual(step1[4], step1[5]);
+  step2[6] = highbd_idct_sub_dual(step1[7], step1[6]);
+  step2[7] = highbd_idct_add_dual(step1[7], step1[6]);
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = highbd_idct_add_dual(step2[0], step2[3]);
+  step1[1] = highbd_idct_add_dual(step2[1], step2[2]);
+  step1[2] = highbd_idct_sub_dual(step2[1], step2[2]);
+  step1[3] = highbd_idct_sub_dual(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+  step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+  step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+  step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+  step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+  step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+  step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+  step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7_dual(step2, out);
+
+  if (output) {
+    highbd_idct16x16_store_pass1(out, output);
+  } else {
+    highbd_idct16x16_add_store(out, dest, stride, bd);
+  }
+}
+
+static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+                                                 int32_t *output) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x4)
+  in[0] = vld1q_s32(input);
+  input += 16;
+  in[1] = vld1q_s32(input);
+  input += 16;
+  in[2] = vld1q_s32(input);
+  input += 16;
+  in[3] = vld1q_s32(input);
+
+  // Transpose
+  transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22));
+  step2[11] =
+      highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[12] =
+      highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] =
+      highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28));
+  step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28));
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] =
+      highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24));
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s32(step2[8], step2[11]);
+  step1[9] = vaddq_s32(step2[9], step2[10]);
+  step1[10] = vsubq_s32(step2[9], step2[10]);
+  step1[11] = vsubq_s32(step2[8], step2[11]);
+  step1[12] = vsubq_s32(step2[15], step2[12]);
+  step1[13] = vsubq_s32(step2[14], step2[13]);
+  step1[14] = vaddq_s32(step2[14], step2[13]);
+  step1[15] = vaddq_s32(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vaddq_s32(step1[0], step1[7]);
+  step2[1] = vaddq_s32(step1[1], step1[6]);
+  step2[2] = vaddq_s32(step1[2], step1[5]);
+  step2[3] = vaddq_s32(step1[3], step1[4]);
+  step2[4] = vsubq_s32(step1[3], step1[4]);
+  step2[5] = vsubq_s32(step1[2], step1[5]);
+  step2[6] = vsubq_s32(step1[1], step1[6]);
+  step2[7] = vsubq_s32(step1[0], step1[7]);
+  highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7(step2, out);
+
+  // pass 1: save the result into output
+  vst1q_s32(output, out[0]);
+  output += 4;
+  vst1q_s32(output, out[1]);
+  output += 4;
+  vst1q_s32(output, out[2]);
+  output += 4;
+  vst1q_s32(output, out[3]);
+  output += 4;
+  vst1q_s32(output, out[4]);
+  output += 4;
+  vst1q_s32(output, out[5]);
+  output += 4;
+  vst1q_s32(output, out[6]);
+  output += 4;
+  vst1q_s32(output, out[7]);
+  output += 4;
+  vst1q_s32(output, out[8]);
+  output += 4;
+  vst1q_s32(output, out[9]);
+  output += 4;
+  vst1q_s32(output, out[10]);
+  output += 4;
+  vst1q_s32(output, out[11]);
+  output += 4;
+  vst1q_s32(output, out[12]);
+  output += 4;
+  vst1q_s32(output, out[13]);
+  output += 4;
+  vst1q_s32(output, out[14]);
+  output += 4;
+  vst1q_s32(output, out[15]);
+}
+
+static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
+                                                 int32_t *const output,
+                                                 uint16_t *const dest,
+                                                 const int stride,
+                                                 const int bd) {
+  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
+  int32x4x2_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x8)
+  in[0].val[0] = vld1q_s32(input);
+  input += 4;
+  in[0].val[1] = vld1q_s32(input);
+  input += 4;
+  in[1].val[0] = vld1q_s32(input);
+  input += 4;
+  in[1].val[1] = vld1q_s32(input);
+  input += 4;
+  in[2].val[0] = vld1q_s32(input);
+  input += 4;
+  in[2].val[1] = vld1q_s32(input);
+  input += 4;
+  in[3].val[0] = vld1q_s32(input);
+  input += 4;
+  in[3].val[1] = vld1q_s32(input);
+
+  // Transpose
+  transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1],
+                    &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] =
+      highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+  step2[11] =
+      highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[12] =
+      highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
+  step2[15] =
+      highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] =
+      highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+  step1[7] =
+      highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] =
+      highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                           &step2[14]);
+  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+                               &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+                            &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+  step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+  step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+  step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+  step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+  step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+  step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+  step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                            &step2[13]);
+  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                            &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  highbd_idct16x16_add_stage7_dual(step2, out);
+
+  if (output) {
+    highbd_idct16x16_store_pass1(out, output);
+  } else {
+    highbd_idct16x16_add_store(out, dest, stride, bd);
+  }
+}
+
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
+                                       int stride, int bd) {
+  if (bd == 8) {
+    int16_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1);
+
+    // Parallel idct on the lower 8 rows
+    vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
+                                 stride, 1);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+    // Parallel idct to get the right 8 columns
+    vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8,
+                                 stride, 1);
+  } else {
+    int32_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride,
+                                        bd);
+
+    // Parallel idct on the lower 8 rows
+    vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8,
+                                        dest, stride, bd);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride,
+                                        bd);
+
+    // Parallel idct to get the right 8 columns
+    vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL,
+                                        dest + 8, stride, bd);
+  }
+}
+
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  if (bd == 8) {
+    int16_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+    // Parallel idct to get the right 8 columns
+    vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+                                stride, 1);
+  } else {
+    int32_t row_idct_output[16 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride,
+                                       bd);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd);
+
+    // Parallel idct to get the right 8 columns
+    vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+                                       stride, bd);
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  if (bd == 8) {
+    int16_t row_idct_output[4 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1);
+
+    // Parallel idct to get the right 8 columns
+    vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+                                      stride, 1);
+  } else {
+    int32_t row_idct_output[4 * 16];
+
+    // pass 1
+    // Parallel idct on the upper 8 rows
+    highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+    // pass 2
+    // Parallel idct to get the left 8 columns
+    highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride,
+                                         bd);
+
+    // Parallel idct to get the right 8 columns
+    highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL,
+                                         dest + 8, stride, bd);
+  }
+}
+
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest + 0);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest + 0);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  vst1q_u16(*dest + 0, c0);
+  vst1q_u16(*dest + 8, c1);
+  *dest += stride;
+}
+
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  int i;
+
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 4; ++i) {
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 4; ++i) {
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
new file mode 100644
index 0000000000..5b36f73367
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -0,0 +1,640 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void load_from_transformed(const int32_t *const trans_buf,
+                                         const int first, const int second,
+                                         int32x4x2_t *const q0,
+                                         int32x4x2_t *const q1) {
+  q0->val[0] = vld1q_s32(trans_buf + first * 8);
+  q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4);
+  q1->val[0] = vld1q_s32(trans_buf + second * 8);
+  q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4);
+}
+
+static INLINE void load_from_output(const int32_t *const out, const int first,
+                                    const int second, int32x4x2_t *const q0,
+                                    int32x4x2_t *const q1) {
+  q0->val[0] = vld1q_s32(out + first * 32);
+  q0->val[1] = vld1q_s32(out + first * 32 + 4);
+  q1->val[0] = vld1q_s32(out + second * 32);
+  q1->val[1] = vld1q_s32(out + second * 32 + 4);
+}
+
+static INLINE void store_in_output(int32_t *const out, const int first,
+                                   const int second, const int32x4x2_t q0,
+                                   const int32x4x2_t q1) {
+  vst1q_s32(out + first * 32, q0.val[0]);
+  vst1q_s32(out + first * 32 + 4, q0.val[1]);
+  vst1q_s32(out + second * 32, q1.val[0]);
+  vst1q_s32(out + second * 32 + 4, q1.val[1]);
+}
+
+static INLINE void highbd_store_combine_results(
+    uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0,
+    const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3,
+    const int16x8_t max) {
+  int16x8_t o[4];
+  uint16x8_t d[4];
+
+  d[0] = vld1q_u16(p1);
+  p1 += stride;
+  d[1] = vld1q_u16(p1);
+  d[3] = vld1q_u16(p2);
+  p2 -= stride;
+  d[2] = vld1q_u16(p2);
+
+  o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6));
+  o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6));
+  o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6));
+  o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6));
+
+  o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0]));
+  o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1]));
+  o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2]));
+  o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3]));
+  o[0] = vminq_s16(o[0], max);
+  o[1] = vminq_s16(o[1], max);
+  o[2] = vminq_s16(o[2], max);
+  o[3] = vminq_s16(o[3], max);
+  d[0] = vqshluq_n_s16(o[0], 0);
+  d[1] = vqshluq_n_s16(o[1], 0);
+  d[2] = vqshluq_n_s16(o[2], 0);
+  d[3] = vqshluq_n_s16(o[3], 0);
+
+  vst1q_u16(p1, d[1]);
+  p1 -= stride;
+  vst1q_u16(p1, d[0]);
+  vst1q_u16(p2, d[2]);
+  p2 += stride;
+  vst1q_u16(p2, d[3]);
+}
+
+static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1,
+                                const int32_t first_const,
+                                const int32_t second_const,
+                                int32x4x2_t *const qOut0,
+                                int32x4x2_t *const qOut1) {
+  int64x2x2_t q[4];
+  int32x2_t d[6];
+
+  // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9.
+  d[4] = vdup_n_s32(first_const);
+  d[5] = vdup_n_s32(second_const);
+
+  q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]);
+  q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]);
+  q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]);
+  q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]);
+  q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]);
+  q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]);
+  q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]);
+  q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]);
+
+  q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]);
+  q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]);
+  q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]);
+  q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]);
+  q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]);
+  q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]);
+  q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]);
+  q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]);
+
+  qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS));
+  qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS));
+  qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS));
+  qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS),
+                               vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS));
+}
+
+static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) {
+  s[0].val[0] = vld1q_s32(in);
+  s[0].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[1].val[0] = vld1q_s32(in);
+  s[1].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[2].val[0] = vld1q_s32(in);
+  s[2].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[3].val[0] = vld1q_s32(in);
+  s[3].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[4].val[0] = vld1q_s32(in);
+  s[4].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[5].val[0] = vld1q_s32(in);
+  s[5].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[6].val[0] = vld1q_s32(in);
+  s[6].val[1] = vld1q_s32(in + 4);
+  in += 32;
+  s[7].val[0] = vld1q_s32(in);
+  s[7].val[1] = vld1q_s32(in + 4);
+}
+
+static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a,
+                                               int32_t **out) {
+  transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+
+  vst1q_s32(*out, a[0].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[0].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[1].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[1].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[2].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[2].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[3].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[3].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[4].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[4].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[5].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[5].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[6].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[6].val[1]);
+  *out += 4;
+  vst1q_s32(*out, a[7].val[0]);
+  *out += 4;
+  vst1q_s32(*out, a[7].val[1]);
+  *out += 4;
+}
+
+static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) {
+  int i;
+  int32x4x2_t s[8];
+
+  for (i = 0; i < 4; i++, input += 8) {
+    load_s32x4q_dual(input, s);
+    transpose_and_store_s32_8x8(s, &t_buf);
+  }
+}
+
+static INLINE void idct32_bands_end_1st_pass(int32_t *const out,
+                                             int32x4x2_t *const q) {
+  store_in_output(out, 16, 17, q[6], q[7]);
+  store_in_output(out, 14, 15, q[8], q[9]);
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 30, 31, q[6], q[7]);
+  store_in_output(out, 0, 1, q[4], q[5]);
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[10], q[1]);
+  q[3] = highbd_idct_add_dual(q[11], q[0]);
+  q[4] = highbd_idct_sub_dual(q[11], q[0]);
+  q[5] = highbd_idct_sub_dual(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  store_in_output(out, 18, 19, q[6], q[7]);
+  store_in_output(out, 12, 13, q[8], q[9]);
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 28, 29, q[6], q[7]);
+  store_in_output(out, 2, 3, q[4], q[5]);
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[12], q[1]);
+  q[3] = highbd_idct_add_dual(q[13], q[0]);
+  q[4] = highbd_idct_sub_dual(q[13], q[0]);
+  q[5] = highbd_idct_sub_dual(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  store_in_output(out, 20, 21, q[6], q[7]);
+  store_in_output(out, 10, 11, q[8], q[9]);
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 26, 27, q[6], q[7]);
+  store_in_output(out, 4, 5, q[4], q[5]);
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[14], q[1]);
+  q[3] = highbd_idct_add_dual(q[15], q[0]);
+  q[4] = highbd_idct_sub_dual(q[15], q[0]);
+  q[5] = highbd_idct_sub_dual(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  store_in_output(out, 22, 23, q[6], q[7]);
+  store_in_output(out, 8, 9, q[8], q[9]);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  store_in_output(out, 24, 25, q[6], q[7]);
+  store_in_output(out, 6, 7, q[4], q[5]);
+}
+
+static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
+                                             uint16_t *const dest,
+                                             const int stride,
+                                             const int16x8_t max,
+                                             int32x4x2_t *const q) {
+  uint16_t *dest0 = dest + 0 * stride;
+  uint16_t *dest1 = dest + 31 * stride;
+  uint16_t *dest2 = dest + 16 * stride;
+  uint16_t *dest3 = dest + 15 * stride;
+  const int str2 = stride << 1;
+
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[10], q[1]);
+  q[3] = highbd_idct_add_dual(q[11], q[0]);
+  q[4] = highbd_idct_sub_dual(q[11], q[0]);
+  q[5] = highbd_idct_sub_dual(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[12], q[1]);
+  q[3] = highbd_idct_add_dual(q[13], q[0]);
+  q[4] = highbd_idct_sub_dual(q[13], q[0]);
+  q[5] = highbd_idct_sub_dual(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = highbd_idct_add_dual(q[14], q[1]);
+  q[3] = highbd_idct_add_dual(q[15], q[0]);
+  q[4] = highbd_idct_sub_dual(q[15], q[0]);
+  q[5] = highbd_idct_sub_dual(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = highbd_idct_add_dual(q[4], q[1]);
+  q[9] = highbd_idct_add_dual(q[5], q[0]);
+  q[6] = highbd_idct_sub_dual(q[5], q[0]);
+  q[7] = highbd_idct_sub_dual(q[4], q[1]);
+  highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9],
+                               max);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = highbd_idct_add_dual(q[2], q[1]);
+  q[5] = highbd_idct_add_dual(q[3], q[0]);
+  q[6] = highbd_idct_sub_dual(q[3], q[0]);
+  q[7] = highbd_idct_sub_dual(q[2], q[1]);
+  highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7],
+                               max);
+}
+
+static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
+                                             uint16_t *dst, const int stride,
+                                             const int bd) {
+  int i, idct32_pass_loop;
+  int32_t trans_buf[32 * 8];
+  int32_t pass1[32 * 32];
+  int32_t pass2[32 * 32];
+  int32_t *out;
+  int32x4x2_t q[16];
+
+  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+       idct32_pass_loop++, input = pass1, out = pass2) {
+    for (i = 0; i < 4; i++, out += 8) {  // idct32_bands_loop
+      idct32_transpose_pair(input, trans_buf);
+      input += 32 * 8;
+
+      // -----------------------------------------
+      // BLOCK A: 16-19,28-31
+      // -----------------------------------------
+      // generate 16,17,30,31
+      // part of stage 1
+      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
+      // part of stage 2
+      q[4] = highbd_idct_add_dual(q[0], q[1]);
+      q[13] = highbd_idct_sub_dual(q[0], q[1]);
+      q[6] = highbd_idct_add_dual(q[2], q[3]);
+      q[14] = highbd_idct_sub_dual(q[2], q[3]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
+
+      // generate 18,19,28,29
+      // part of stage 1
+      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
+      // part of stage 2
+      q[13] = highbd_idct_sub_dual(q[3], q[2]);
+      q[3] = highbd_idct_add_dual(q[3], q[2]);
+      q[14] = highbd_idct_sub_dual(q[1], q[0]);
+      q[2] = highbd_idct_add_dual(q[1], q[0]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
+      // part of stage 4
+      q[8] = highbd_idct_add_dual(q[4], q[2]);
+      q[9] = highbd_idct_add_dual(q[5], q[0]);
+      q[10] = highbd_idct_add_dual(q[7], q[1]);
+      q[15] = highbd_idct_add_dual(q[6], q[3]);
+      q[13] = highbd_idct_sub_dual(q[5], q[0]);
+      q[14] = highbd_idct_sub_dual(q[7], q[1]);
+      store_in_output(out, 16, 31, q[8], q[15]);
+      store_in_output(out, 17, 30, q[9], q[10]);
+      // part of stage 5
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
+      store_in_output(out, 29, 18, q[1], q[0]);
+      // part of stage 4
+      q[13] = highbd_idct_sub_dual(q[4], q[2]);
+      q[14] = highbd_idct_sub_dual(q[6], q[3]);
+      // part of stage 5
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
+      store_in_output(out, 19, 28, q[4], q[6]);
+
+      // -----------------------------------------
+      // BLOCK B: 20-23,24-27
+      // -----------------------------------------
+      // generate 20,21,26,27
+      // part of stage 1
+      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
+      // part of stage 2
+      q[13] = highbd_idct_sub_dual(q[0], q[1]);
+      q[0] = highbd_idct_add_dual(q[0], q[1]);
+      q[14] = highbd_idct_sub_dual(q[2], q[3]);
+      q[2] = highbd_idct_add_dual(q[2], q[3]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+
+      // generate 22,23,24,25
+      // part of stage 1
+      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
+      // part of stage 2
+      q[14] = highbd_idct_sub_dual(q[4], q[5]);
+      q[5] = highbd_idct_add_dual(q[4], q[5]);
+      q[13] = highbd_idct_sub_dual(q[6], q[7]);
+      q[6] = highbd_idct_add_dual(q[6], q[7]);
+      // part of stage 3
+      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
+      // part of stage 4
+      q[10] = highbd_idct_add_dual(q[7], q[1]);
+      q[11] = highbd_idct_add_dual(q[5], q[0]);
+      q[12] = highbd_idct_add_dual(q[6], q[2]);
+      q[15] = highbd_idct_add_dual(q[4], q[3]);
+      // part of stage 6
+      load_from_output(out, 16, 17, &q[14], &q[13]);
+      q[8] = highbd_idct_add_dual(q[14], q[11]);
+      q[9] = highbd_idct_add_dual(q[13], q[10]);
+      q[13] = highbd_idct_sub_dual(q[13], q[10]);
+      q[11] = highbd_idct_sub_dual(q[14], q[11]);
+      store_in_output(out, 17, 16, q[9], q[8]);
+      load_from_output(out, 30, 31, &q[14], &q[9]);
+      q[8] = highbd_idct_sub_dual(q[9], q[12]);
+      q[10] = highbd_idct_add_dual(q[14], q[15]);
+      q[14] = highbd_idct_sub_dual(q[14], q[15]);
+      q[12] = highbd_idct_add_dual(q[9], q[12]);
+      store_in_output(out, 30, 31, q[10], q[12]);
+      // part of stage 7
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 25, 22, q[14], q[13]);
+      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 24, 23, q[14], q[13]);
+      // part of stage 4
+      q[14] = highbd_idct_sub_dual(q[5], q[0]);
+      q[13] = highbd_idct_sub_dual(q[6], q[2]);
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
+      q[14] = highbd_idct_sub_dual(q[7], q[1]);
+      q[13] = highbd_idct_sub_dual(q[4], q[3]);
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
+      // part of stage 6
+      load_from_output(out, 18, 19, &q[14], &q[13]);
+      q[8] = highbd_idct_add_dual(q[14], q[1]);
+      q[9] = highbd_idct_add_dual(q[13], q[6]);
+      q[13] = highbd_idct_sub_dual(q[13], q[6]);
+      q[1] = highbd_idct_sub_dual(q[14], q[1]);
+      store_in_output(out, 18, 19, q[8], q[9]);
+      load_from_output(out, 28, 29, &q[8], &q[9]);
+      q[14] = highbd_idct_sub_dual(q[8], q[5]);
+      q[10] = highbd_idct_add_dual(q[8], q[5]);
+      q[11] = highbd_idct_add_dual(q[9], q[0]);
+      q[0] = highbd_idct_sub_dual(q[9], q[0]);
+      store_in_output(out, 28, 29, q[10], q[11]);
+      // part of stage 7
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 20, 27, q[13], q[14]);
+      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
+      store_in_output(out, 21, 26, q[1], q[0]);
+
+      // -----------------------------------------
+      // BLOCK C: 8-10,11-15
+      // -----------------------------------------
+      // generate 8,9,14,15
+      // part of stage 2
+      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
+      // part of stage 3
+      q[13] = highbd_idct_sub_dual(q[0], q[1]);
+      q[0] = highbd_idct_add_dual(q[0], q[1]);
+      q[14] = highbd_idct_sub_dual(q[2], q[3]);
+      q[2] = highbd_idct_add_dual(q[2], q[3]);
+      // part of stage 4
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
+
+      // generate 10,11,12,13
+      // part of stage 2
+      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
+      // part of stage 3
+      q[14] = highbd_idct_sub_dual(q[4], q[5]);
+      q[5] = highbd_idct_add_dual(q[4], q[5]);
+      q[13] = highbd_idct_sub_dual(q[6], q[7]);
+      q[6] = highbd_idct_add_dual(q[6], q[7]);
+      // part of stage 4
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
+      // part of stage 5
+      q[8] = highbd_idct_add_dual(q[0], q[5]);
+      q[9] = highbd_idct_add_dual(q[1], q[7]);
+      q[13] = highbd_idct_sub_dual(q[1], q[7]);
+      q[14] = highbd_idct_sub_dual(q[3], q[4]);
+      q[10] = highbd_idct_add_dual(q[3], q[4]);
+      q[15] = highbd_idct_add_dual(q[2], q[6]);
+      store_in_output(out, 8, 15, q[8], q[15]);
+      store_in_output(out, 9, 14, q[9], q[10]);
+      // part of stage 6
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+      store_in_output(out, 13, 10, q[3], q[1]);
+      q[13] = highbd_idct_sub_dual(q[0], q[5]);
+      q[14] = highbd_idct_sub_dual(q[2], q[6]);
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+      store_in_output(out, 11, 12, q[1], q[3]);
+
+      // -----------------------------------------
+      // BLOCK D: 0-3,4-7
+      // -----------------------------------------
+      // generate 4,5,6,7
+      // part of stage 3
+      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
+      // part of stage 4
+      q[13] = highbd_idct_sub_dual(q[0], q[1]);
+      q[0] = highbd_idct_add_dual(q[0], q[1]);
+      q[14] = highbd_idct_sub_dual(q[2], q[3]);
+      q[2] = highbd_idct_add_dual(q[2], q[3]);
+      // part of stage 5
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+
+      // generate 0,1,2,3
+      // part of stage 4
+      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
+      // part of stage 5
+      q[4] = highbd_idct_add_dual(q[7], q[6]);
+      q[7] = highbd_idct_sub_dual(q[7], q[6]);
+      q[6] = highbd_idct_sub_dual(q[5], q[14]);
+      q[5] = highbd_idct_add_dual(q[5], q[14]);
+      // part of stage 6
+      q[8] = highbd_idct_add_dual(q[4], q[2]);
+      q[9] = highbd_idct_add_dual(q[5], q[3]);
+      q[10] = highbd_idct_add_dual(q[6], q[1]);
+      q[11] = highbd_idct_add_dual(q[7], q[0]);
+      q[12] = highbd_idct_sub_dual(q[7], q[0]);
+      q[13] = highbd_idct_sub_dual(q[6], q[1]);
+      q[14] = highbd_idct_sub_dual(q[5], q[3]);
+      q[15] = highbd_idct_sub_dual(q[4], q[2]);
+      // part of stage 7
+      load_from_output(out, 14, 15, &q[0], &q[1]);
+      q[2] = highbd_idct_add_dual(q[8], q[1]);
+      q[3] = highbd_idct_add_dual(q[9], q[0]);
+      q[4] = highbd_idct_sub_dual(q[9], q[0]);
+      q[5] = highbd_idct_sub_dual(q[8], q[1]);
+      load_from_output(out, 16, 17, &q[0], &q[1]);
+      q[8] = highbd_idct_add_dual(q[4], q[1]);
+      q[9] = highbd_idct_add_dual(q[5], q[0]);
+      q[6] = highbd_idct_sub_dual(q[5], q[0]);
+      q[7] = highbd_idct_sub_dual(q[4], q[1]);
+
+      if (idct32_pass_loop == 0) {
+        idct32_bands_end_1st_pass(out, q);
+      } else {
+        const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+        idct32_bands_end_2nd_pass(out, dst, stride, max, q);
+        dst += 8;
+      }
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  if (bd == 8) {
+    vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
+  } else {
+    vpx_highbd_idct32_32_neon(input, dest, stride, bd);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
new file mode 100644
index 0000000000..6750c1a426
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -0,0 +1,757 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void load_8x8_s32_dual(
+    const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1,
+    int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4,
+    int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) {
+  in0->val[0] = vld1q_s32(input);
+  in0->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in1->val[0] = vld1q_s32(input);
+  in1->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in2->val[0] = vld1q_s32(input);
+  in2->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in3->val[0] = vld1q_s32(input);
+  in3->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in4->val[0] = vld1q_s32(input);
+  in4->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in5->val[0] = vld1q_s32(input);
+  in5->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in6->val[0] = vld1q_s32(input);
+  in6->val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in7->val[0] = vld1q_s32(input);
+  in7->val[1] = vld1q_s32(input + 4);
+}
+
+static INLINE void load_4x8_s32_dual(const tran_low_t *input,
+                                     int32x4_t *const in0, int32x4_t *const in1,
+                                     int32x4_t *const in2, int32x4_t *const in3,
+                                     int32x4_t *const in4, int32x4_t *const in5,
+                                     int32x4_t *const in6,
+                                     int32x4_t *const in7) {
+  *in0 = vld1q_s32(input);
+  input += 32;
+  *in1 = vld1q_s32(input);
+  input += 32;
+  *in2 = vld1q_s32(input);
+  input += 32;
+  *in3 = vld1q_s32(input);
+  input += 32;
+  *in4 = vld1q_s32(input);
+  input += 32;
+  *in5 = vld1q_s32(input);
+  input += 32;
+  *in6 = vld1q_s32(input);
+  input += 32;
+  *in7 = vld1q_s32(input);
+}
+
+// Only for the first pass of the  _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+//      0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
+//  0   0   2   5  10  17  25  38  47  62  83 101 121
+//  1   1   4   8  15  22  30  45  58  74  92 112 133
+//  2   3   7  12  18  28  36  52  64  82 102 118
+//  3   6  11  16  23  31  43  60  73  90 109 126
+//  4   9  14  19  29  37  50  65  78  98 116 134
+//  5  13  20  26  35  44  54  72  85 105 123
+//  6  21  27  33  42  53  63  80  94 113 132
+//  7  24  32  39  48  57  71  88 104 120
+//  8  34  40  46  56  68  81  96 111 130
+//  9  41  49  55  67  77  91 107 124
+// 10  51  59  66  76  89  99 119 131
+// 11  61  69  75  87 100 114 129
+// 12  70  79  86  97 108 122
+// 13  84  93 103 110 125
+// 14  98 106 115 127
+// 15 117 128
+static void vpx_highbd_idct32_12_neon(const tran_low_t *const input,
+                                      int32_t *output) {
+  int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+      s8[32];
+
+  load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
+                    &in[6], &in[7]);
+  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+
+  load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0],
+                    &in[9].val[1], &in[10].val[0], &in[10].val[1],
+                    &in[11].val[0], &in[11].val[1]);
+  transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1],
+                    &in[10].val[0], &in[10].val[1], &in[11].val[0],
+                    &in[11].val[1]);
+
+  // stage 1
+  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
+  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);
+
+  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
+  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);
+
+  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+  // stage 2
+  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
+  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
+
+  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
+  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
+  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
+  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
+  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
+  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
+  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
+  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
+
+  // stage 3
+  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
+  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
+  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
+  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
+
+  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+                                                         s1[31], cospi_28_64);
+  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+                                                         s1[31], cospi_4_64);
+
+  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
+                                                         s2[29], -cospi_4_64);
+  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
+                                                         s2[29], cospi_28_64);
+
+  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
+                                                         s2[26], cospi_12_64);
+  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
+                                                         s2[26], cospi_20_64);
+
+  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+                                                         s1[24], -cospi_20_64);
+  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+                                                         s1[24], cospi_12_64);
+
+  // stage 4
+  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
+  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);
+
+  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+                                                        s2[15], cospi_24_64);
+  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+                                                         s2[15], cospi_8_64);
+
+  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
+                                                         s3[13], -cospi_8_64);
+  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
+                                                         s3[13], cospi_24_64);
+
+  s4[16] = highbd_idct_add_dual(s1[16], s2[19]);
+  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
+  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
+  s4[19] = highbd_idct_sub_dual(s1[16], s2[19]);
+  s4[20] = highbd_idct_sub_dual(s1[23], s2[20]);
+  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
+  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
+  s4[23] = highbd_idct_add_dual(s2[20], s1[23]);
+  s4[24] = highbd_idct_add_dual(s1[24], s2[27]);
+  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
+  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
+  s4[27] = highbd_idct_sub_dual(s1[24], s2[27]);
+  s4[28] = highbd_idct_sub_dual(s1[31], s2[28]);
+  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
+  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
+  s4[31] = highbd_idct_add_dual(s2[28], s1[31]);
+
+  // stage 5
+  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
+  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
+  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
+  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);
+
+  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64);
+  s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64);
+
+  s5[8] = highbd_idct_add_dual(s2[8], s3[11]);
+  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
+  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
+  s5[11] = highbd_idct_sub_dual(s2[8], s3[11]);
+  s5[12] = highbd_idct_sub_dual(s2[15], s3[12]);
+  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
+  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
+  s5[15] = highbd_idct_add_dual(s2[15], s3[12]);
+
+  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
+                                                         s4[29], cospi_24_64);
+  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
+                                                         s4[29], cospi_8_64);
+
+  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
+                                                         s4[28], cospi_24_64);
+  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
+                                                         s4[28], cospi_8_64);
+
+  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
+                                                         s4[27], -cospi_8_64);
+  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
+                                                         s4[27], cospi_24_64);
+
+  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
+                                                         s4[26], -cospi_8_64);
+  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
+                                                         s4[26], cospi_24_64);
+
+  // stage 6
+  s6[0] = highbd_idct_add_dual(s5[0], s3[7]);
+  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
+  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
+  s6[3] = highbd_idct_add_dual(s5[3], s3[4]);
+  s6[4] = highbd_idct_sub_dual(s5[3], s3[4]);
+  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
+  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
+  s6[7] = highbd_idct_sub_dual(s5[0], s3[7]);
+
+  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
+  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
+
+  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
+  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);
+
+  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
+  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
+  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
+  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
+  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
+  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
+  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
+  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
+
+  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
+  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
+  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
+  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
+  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
+  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
+  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
+  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);
+
+  // stage 7
+  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
+  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
+  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
+  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
+  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
+  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
+  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
+  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
+  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
+  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
+  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
+  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
+  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
+  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
+  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
+  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);
+
+  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
+  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
+
+  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
+  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
+
+  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
+  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
+
+  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
+  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);
+
+  // final stage
+  s8[0] = highbd_idct_add_dual(s7[0], s6[31]);
+  s8[1] = highbd_idct_add_dual(s7[1], s6[30]);
+  s8[2] = highbd_idct_add_dual(s7[2], s6[29]);
+  s8[3] = highbd_idct_add_dual(s7[3], s6[28]);
+  s8[4] = highbd_idct_add_dual(s7[4], s7[27]);
+  s8[5] = highbd_idct_add_dual(s7[5], s7[26]);
+  s8[6] = highbd_idct_add_dual(s7[6], s7[25]);
+  s8[7] = highbd_idct_add_dual(s7[7], s7[24]);
+  s8[8] = highbd_idct_add_dual(s7[8], s7[23]);
+  s8[9] = highbd_idct_add_dual(s7[9], s7[22]);
+  s8[10] = highbd_idct_add_dual(s7[10], s7[21]);
+  s8[11] = highbd_idct_add_dual(s7[11], s7[20]);
+  s8[12] = highbd_idct_add_dual(s7[12], s6[19]);
+  s8[13] = highbd_idct_add_dual(s7[13], s6[18]);
+  s8[14] = highbd_idct_add_dual(s7[14], s6[17]);
+  s8[15] = highbd_idct_add_dual(s7[15], s6[16]);
+  s8[16] = highbd_idct_sub_dual(s7[15], s6[16]);
+  s8[17] = highbd_idct_sub_dual(s7[14], s6[17]);
+  s8[18] = highbd_idct_sub_dual(s7[13], s6[18]);
+  s8[19] = highbd_idct_sub_dual(s7[12], s6[19]);
+  s8[20] = highbd_idct_sub_dual(s7[11], s7[20]);
+  s8[21] = highbd_idct_sub_dual(s7[10], s7[21]);
+  s8[22] = highbd_idct_sub_dual(s7[9], s7[22]);
+  s8[23] = highbd_idct_sub_dual(s7[8], s7[23]);
+  s8[24] = highbd_idct_sub_dual(s7[7], s7[24]);
+  s8[25] = highbd_idct_sub_dual(s7[6], s7[25]);
+  s8[26] = highbd_idct_sub_dual(s7[5], s7[26]);
+  s8[27] = highbd_idct_sub_dual(s7[4], s7[27]);
+  s8[28] = highbd_idct_sub_dual(s7[3], s6[28]);
+  s8[29] = highbd_idct_sub_dual(s7[2], s6[29]);
+  s8[30] = highbd_idct_sub_dual(s7[1], s6[30]);
+  s8[31] = highbd_idct_sub_dual(s7[0], s6[31]);
+
+  vst1q_s32(output + 0, s8[0].val[0]);
+  vst1q_s32(output + 4, s8[0].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[1].val[0]);
+  vst1q_s32(output + 4, s8[1].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[2].val[0]);
+  vst1q_s32(output + 4, s8[2].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[3].val[0]);
+  vst1q_s32(output + 4, s8[3].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[4].val[0]);
+  vst1q_s32(output + 4, s8[4].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[5].val[0]);
+  vst1q_s32(output + 4, s8[5].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[6].val[0]);
+  vst1q_s32(output + 4, s8[6].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[7].val[0]);
+  vst1q_s32(output + 4, s8[7].val[1]);
+  output += 16;
+
+  vst1q_s32(output + 0, s8[8].val[0]);
+  vst1q_s32(output + 4, s8[8].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[9].val[0]);
+  vst1q_s32(output + 4, s8[9].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[10].val[0]);
+  vst1q_s32(output + 4, s8[10].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[11].val[0]);
+  vst1q_s32(output + 4, s8[11].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[12].val[0]);
+  vst1q_s32(output + 4, s8[12].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[13].val[0]);
+  vst1q_s32(output + 4, s8[13].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[14].val[0]);
+  vst1q_s32(output + 4, s8[14].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[15].val[0]);
+  vst1q_s32(output + 4, s8[15].val[1]);
+  output += 16;
+
+  vst1q_s32(output + 0, s8[16].val[0]);
+  vst1q_s32(output + 4, s8[16].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[17].val[0]);
+  vst1q_s32(output + 4, s8[17].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[18].val[0]);
+  vst1q_s32(output + 4, s8[18].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[19].val[0]);
+  vst1q_s32(output + 4, s8[19].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[20].val[0]);
+  vst1q_s32(output + 4, s8[20].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[21].val[0]);
+  vst1q_s32(output + 4, s8[21].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[22].val[0]);
+  vst1q_s32(output + 4, s8[22].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[23].val[0]);
+  vst1q_s32(output + 4, s8[23].val[1]);
+  output += 16;
+
+  vst1q_s32(output + 0, s8[24].val[0]);
+  vst1q_s32(output + 4, s8[24].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[25].val[0]);
+  vst1q_s32(output + 4, s8[25].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[26].val[0]);
+  vst1q_s32(output + 4, s8[26].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[27].val[0]);
+  vst1q_s32(output + 4, s8[27].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[28].val[0]);
+  vst1q_s32(output + 4, s8[28].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[29].val[0]);
+  vst1q_s32(output + 4, s8[29].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[30].val[0]);
+  vst1q_s32(output + 4, s8[30].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, s8[31].val[0]);
+  vst1q_s32(output + 4, s8[31].val[1]);
+}
+
+static void vpx_highbd_idct32_16_neon(const int32_t *const input,
+                                      uint16_t *const output, const int stride,
+                                      const int bd) {
+  int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+      out[32];
+
+  load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+                             &in[5], &in[6], &in[7]);
+
+  load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+                             &in[12], &in[13], &in[14], &in[15]);
+
+  // stage 1
+  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+  s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64);
+  s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64);
+
+  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
+  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);
+
+  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
+  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);
+
+  s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64);
+  s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64);
+
+  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+  // stage 2
+  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+  s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64);
+  s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64);
+
+  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
+  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);
+
+  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+  s2[16] = highbd_idct_add_dual(s1[16], s1[17]);
+  s2[17] = highbd_idct_sub_dual(s1[16], s1[17]);
+  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
+  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
+  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
+  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
+  s2[22] = highbd_idct_sub_dual(s1[23], s1[22]);
+  s2[23] = highbd_idct_add_dual(s1[22], s1[23]);
+  s2[24] = highbd_idct_add_dual(s1[24], s1[25]);
+  s2[25] = highbd_idct_sub_dual(s1[24], s1[25]);
+  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
+  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
+  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
+  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);
+  s2[30] = highbd_idct_sub_dual(s1[31], s1[30]);
+  s2[31] = highbd_idct_add_dual(s1[30], s1[31]);
+
+  // stage 3
+  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+  s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64);
+  s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64);
+
+  s3[8] = highbd_idct_add_dual(s2[8], s2[9]);
+  s3[9] = highbd_idct_sub_dual(s2[8], s2[9]);
+  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
+  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
+  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
+  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);
+  s3[14] = highbd_idct_sub_dual(s2[15], s2[14]);
+  s3[15] = highbd_idct_add_dual(s2[14], s2[15]);
+
+  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64,
+                                                         s2[30], cospi_28_64);
+  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64,
+                                                         s2[30], cospi_4_64);
+
+  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
+                                                         s2[29], -cospi_4_64);
+  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
+                                                         s2[29], cospi_28_64);
+
+  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
+                                                         s2[26], cospi_12_64);
+  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
+                                                         s2[26], cospi_20_64);
+
+  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64,
+                                                         s2[25], -cospi_20_64);
+  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64,
+                                                         s2[25], cospi_12_64);
+
+  // stage 4
+  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
+  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);
+
+  s4[4] = highbd_idct_add_dual(s3[4], s3[5]);
+  s4[5] = highbd_idct_sub_dual(s3[4], s3[5]);
+  s4[6] = highbd_idct_sub_dual(s3[7], s3[6]);
+  s4[7] = highbd_idct_add_dual(s3[6], s3[7]);
+
+  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64,
+                                                        s3[14], cospi_24_64);
+  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64,
+                                                         s3[14], cospi_8_64);
+
+  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
+                                                         s3[13], -cospi_8_64);
+  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
+                                                         s3[13], cospi_24_64);
+
+  s4[16] = highbd_idct_add_dual(s2[16], s2[19]);
+  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
+  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
+  s4[19] = highbd_idct_sub_dual(s2[16], s2[19]);
+  s4[20] = highbd_idct_sub_dual(s2[23], s2[20]);
+  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
+  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
+  s4[23] = highbd_idct_add_dual(s2[20], s2[23]);
+  s4[24] = highbd_idct_add_dual(s2[24], s2[27]);
+  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
+  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
+  s4[27] = highbd_idct_sub_dual(s2[24], s2[27]);
+  s4[28] = highbd_idct_sub_dual(s2[31], s2[28]);
+  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
+  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
+  s4[31] = highbd_idct_add_dual(s2[28], s2[31]);
+
+  // stage 5
+  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
+  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
+  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
+  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);
+
+  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64);
+  s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64);
+
+  s5[8] = highbd_idct_add_dual(s3[8], s3[11]);
+  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
+  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
+  s5[11] = highbd_idct_sub_dual(s3[8], s3[11]);
+  s5[12] = highbd_idct_sub_dual(s3[15], s3[12]);
+  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
+  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
+  s5[15] = highbd_idct_add_dual(s3[15], s3[12]);
+
+  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
+                                                         s4[29], cospi_24_64);
+  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
+                                                         s4[29], cospi_8_64);
+
+  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
+                                                         s4[28], cospi_24_64);
+  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
+                                                         s4[28], cospi_8_64);
+
+  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
+                                                         s4[27], -cospi_8_64);
+  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
+                                                         s4[27], cospi_24_64);
+
+  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
+                                                         s4[26], -cospi_8_64);
+  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
+                                                         s4[26], cospi_24_64);
+
+  // stage 6
+  s6[0] = highbd_idct_add_dual(s5[0], s4[7]);
+  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
+  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
+  s6[3] = highbd_idct_add_dual(s5[3], s4[4]);
+  s6[4] = highbd_idct_sub_dual(s5[3], s4[4]);
+  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
+  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
+  s6[7] = highbd_idct_sub_dual(s5[0], s4[7]);
+
+  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
+  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);
+
+  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
+  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);
+
+  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
+  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
+  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
+  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
+  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
+  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
+  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
+  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);
+  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
+  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
+  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
+  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
+  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
+  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
+  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
+  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);
+
+  // stage 7
+  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
+  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
+  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
+  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
+  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
+  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
+  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
+  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
+  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
+  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
+  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
+  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
+  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
+  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
+  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
+  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);
+
+  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
+  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);
+
+  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
+  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);
+
+  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
+  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);
+
+  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
+  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);
+
+  // final stage
+  out[0] = highbd_idct_add_dual(s7[0], s6[31]);
+  out[1] = highbd_idct_add_dual(s7[1], s6[30]);
+  out[2] = highbd_idct_add_dual(s7[2], s6[29]);
+  out[3] = highbd_idct_add_dual(s7[3], s6[28]);
+  out[4] = highbd_idct_add_dual(s7[4], s7[27]);
+  out[5] = highbd_idct_add_dual(s7[5], s7[26]);
+  out[6] = highbd_idct_add_dual(s7[6], s7[25]);
+  out[7] = highbd_idct_add_dual(s7[7], s7[24]);
+  out[8] = highbd_idct_add_dual(s7[8], s7[23]);
+  out[9] = highbd_idct_add_dual(s7[9], s7[22]);
+  out[10] = highbd_idct_add_dual(s7[10], s7[21]);
+  out[11] = highbd_idct_add_dual(s7[11], s7[20]);
+  out[12] = highbd_idct_add_dual(s7[12], s6[19]);
+  out[13] = highbd_idct_add_dual(s7[13], s6[18]);
+  out[14] = highbd_idct_add_dual(s7[14], s6[17]);
+  out[15] = highbd_idct_add_dual(s7[15], s6[16]);
+  out[16] = highbd_idct_sub_dual(s7[15], s6[16]);
+  out[17] = highbd_idct_sub_dual(s7[14], s6[17]);
+  out[18] = highbd_idct_sub_dual(s7[13], s6[18]);
+  out[19] = highbd_idct_sub_dual(s7[12], s6[19]);
+  out[20] = highbd_idct_sub_dual(s7[11], s7[20]);
+  out[21] = highbd_idct_sub_dual(s7[10], s7[21]);
+  out[22] = highbd_idct_sub_dual(s7[9], s7[22]);
+  out[23] = highbd_idct_sub_dual(s7[8], s7[23]);
+  out[24] = highbd_idct_sub_dual(s7[7], s7[24]);
+  out[25] = highbd_idct_sub_dual(s7[6], s7[25]);
+  out[26] = highbd_idct_sub_dual(s7[5], s7[26]);
+  out[27] = highbd_idct_sub_dual(s7[4], s7[27]);
+  out[28] = highbd_idct_sub_dual(s7[3], s6[28]);
+  out[29] = highbd_idct_sub_dual(s7[2], s6[29]);
+  out[30] = highbd_idct_sub_dual(s7[1], s6[30]);
+  out[31] = highbd_idct_sub_dual(s7[0], s6[31]);
+
+  highbd_idct16x16_add_store(out, output, stride, bd);
+  highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
+}
+
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
+                                       int stride, int bd) {
+  int i;
+
+  if (bd == 8) {
+    int16_t temp[32 * 16];
+    int16_t *t = temp;
+    vpx_idct32_12_neon(input, temp);
+    vpx_idct32_12_neon(input + 32 * 8, temp + 8);
+
+    for (i = 0; i < 32; i += 8) {
+      vpx_idct32_16_neon(t, dest, stride, 1);
+      t += (16 * 8);
+      dest += 8;
+    }
+  } else {
+    int32_t temp[32 * 16];
+    int32_t *t = temp;
+    vpx_highbd_idct32_12_neon(input, temp);
+    vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8);
+
+    for (i = 0; i < 32; i += 8) {
+      vpx_highbd_idct32_16_neon(t, dest, stride, bd);
+      t += (16 * 8);
+      dest += 8;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
new file mode 100644
index 0000000000..f05932cec3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -0,0 +1,625 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the  _34_ variant. Since it only uses values from
+// the top left 8x8 it can safely assume all the remaining values are 0 and skip
+// an awful lot of calculations. In fact, only the first 6 columns make the cut.
+// None of the elements in the 7th or 8th column are used so it skips any calls
+// to input[67] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 8x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
+// coefficients as follows:
+//    0  1  2  3  4  5  6  7
+// 0  0  2  5 10 17 25
+// 1  1  4  8 15 22 30
+// 2  3  7 12 18 28
+// 3  6 11 16 23 31
+// 4  9 14 19 29
+// 5 13 20 26
+// 6 21 27 33
+// 7 24 32
+static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) {
+  int32x4x2_t in[8], s1[32], s2[32], s3[32];
+
+  in[0].val[0] = vld1q_s32(input);
+  in[0].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[1].val[0] = vld1q_s32(input);
+  in[1].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[2].val[0] = vld1q_s32(input);
+  in[2].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[3].val[0] = vld1q_s32(input);
+  in[3].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[4].val[0] = vld1q_s32(input);
+  in[4].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[5].val[0] = vld1q_s32(input);
+  in[5].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[6].val[0] = vld1q_s32(input);
+  in[6].val[1] = vld1q_s32(input + 4);
+  input += 32;
+  in[7].val[0] = vld1q_s32(input);
+  in[7].val[1] = vld1q_s32(input + 4);
+  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+
+  // stage 1
+  // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
+  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+  // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
+  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+  // stage 2
+  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+  // stage 3
+  s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+  s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+  s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+                                                         s1[31], cospi_28_64);
+  s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+                                                         s1[31], cospi_4_64);
+
+  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
+                                                         s1[27], cospi_12_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
+                                                         s1[27], cospi_20_64);
+
+  s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+                                                         s1[24], -cospi_20_64);
+  s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+                                                         s1[24], cospi_12_64);
+
+  // stage 4
+  s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+
+  s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+                                                        s2[15], cospi_24_64);
+  s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+                                                         s2[15], cospi_8_64);
+
+  s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
+  s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
+  s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
+  s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
+  s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
+  s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
+  s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
+  s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
+
+  // stage 5
+  s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
+  s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
+
+  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64,
+                                                         s1[30], cospi_24_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64,
+                                                         s1[30], cospi_8_64);
+
+  s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64,
+                                                         s1[31], cospi_24_64);
+  s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64,
+                                                         s1[31], cospi_8_64);
+
+  s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
+                                                         s2[27], -cospi_8_64);
+  s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
+                                                         s2[27], cospi_24_64);
+
+  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
+                                                         s2[26], -cospi_8_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
+                                                         s2[26], cospi_24_64);
+
+  // stage 6
+  s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
+  s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
+  s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
+  s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
+  s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
+  s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
+  s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
+  s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
+
+  s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64);
+  s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64);
+
+  s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64);
+  s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64);
+
+  s2[16] = highbd_idct_add_dual(s1[16], s2[23]);
+  s2[17] = highbd_idct_add_dual(s1[17], s2[22]);
+  s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
+  s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
+  s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
+  s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
+  s2[22] = highbd_idct_sub_dual(s1[17], s2[22]);
+  s2[23] = highbd_idct_sub_dual(s1[16], s2[23]);
+
+  s3[24] = highbd_idct_sub_dual(s1[31], s2[24]);
+  s3[25] = highbd_idct_sub_dual(s1[30], s2[25]);
+  s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
+  s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
+  s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
+  s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
+  s2[30] = highbd_idct_add_dual(s2[25], s1[30]);
+  s2[31] = highbd_idct_add_dual(s2[24], s1[31]);
+
+  // stage 7
+  s1[0] = highbd_idct_add_dual(s2[0], s2[15]);
+  s1[1] = highbd_idct_add_dual(s2[1], s2[14]);
+  s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
+  s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
+  s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
+  s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
+  s1[6] = highbd_idct_add_dual(s2[6], s2[9]);
+  s1[7] = highbd_idct_add_dual(s2[7], s2[8]);
+  s1[8] = highbd_idct_sub_dual(s2[7], s2[8]);
+  s1[9] = highbd_idct_sub_dual(s2[6], s2[9]);
+  s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
+  s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
+  s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
+  s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
+  s1[14] = highbd_idct_sub_dual(s2[1], s2[14]);
+  s1[15] = highbd_idct_sub_dual(s2[0], s2[15]);
+
+  s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
+  s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
+
+  s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
+  s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
+
+  s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64);
+  s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64);
+
+  s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64);
+  s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64);
+
+  // final stage
+  s3[0] = highbd_idct_add_dual(s1[0], s2[31]);
+  s3[1] = highbd_idct_add_dual(s1[1], s2[30]);
+  s3[2] = highbd_idct_add_dual(s1[2], s2[29]);
+  s3[3] = highbd_idct_add_dual(s1[3], s2[28]);
+  s3[4] = highbd_idct_add_dual(s1[4], s1[27]);
+  s3[5] = highbd_idct_add_dual(s1[5], s1[26]);
+  s3[6] = highbd_idct_add_dual(s1[6], s1[25]);
+  s3[7] = highbd_idct_add_dual(s1[7], s1[24]);
+  s3[8] = highbd_idct_add_dual(s1[8], s1[23]);
+  s3[9] = highbd_idct_add_dual(s1[9], s1[22]);
+  s3[10] = highbd_idct_add_dual(s1[10], s1[21]);
+  s3[11] = highbd_idct_add_dual(s1[11], s1[20]);
+  s3[12] = highbd_idct_add_dual(s1[12], s2[19]);
+  s3[13] = highbd_idct_add_dual(s1[13], s2[18]);
+  s3[14] = highbd_idct_add_dual(s1[14], s2[17]);
+  s3[15] = highbd_idct_add_dual(s1[15], s2[16]);
+  s3[16] = highbd_idct_sub_dual(s1[15], s2[16]);
+  s3[17] = highbd_idct_sub_dual(s1[14], s2[17]);
+  s3[18] = highbd_idct_sub_dual(s1[13], s2[18]);
+  s3[19] = highbd_idct_sub_dual(s1[12], s2[19]);
+  s3[20] = highbd_idct_sub_dual(s1[11], s1[20]);
+  s3[21] = highbd_idct_sub_dual(s1[10], s1[21]);
+  s3[22] = highbd_idct_sub_dual(s1[9], s1[22]);
+  s3[23] = highbd_idct_sub_dual(s1[8], s1[23]);
+  s3[24] = highbd_idct_sub_dual(s1[7], s1[24]);
+  s3[25] = highbd_idct_sub_dual(s1[6], s1[25]);
+  s3[26] = highbd_idct_sub_dual(s1[5], s1[26]);
+  s3[27] = highbd_idct_sub_dual(s1[4], s1[27]);
+  s3[28] = highbd_idct_sub_dual(s1[3], s2[28]);
+  s3[29] = highbd_idct_sub_dual(s1[2], s2[29]);
+  s3[30] = highbd_idct_sub_dual(s1[1], s2[30]);
+  s3[31] = highbd_idct_sub_dual(s1[0], s2[31]);
+
+  vst1q_s32(output, s3[0].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[0].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[1].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[1].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[2].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[2].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[3].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[3].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[4].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[4].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[5].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[5].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[6].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[6].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[7].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[7].val[1]);
+  output += 4;
+
+  vst1q_s32(output, s3[8].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[8].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[9].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[9].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[10].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[10].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[11].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[11].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[12].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[12].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[13].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[13].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[14].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[14].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[15].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[15].val[1]);
+  output += 4;
+
+  vst1q_s32(output, s3[16].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[16].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[17].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[17].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[18].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[18].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[19].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[19].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[20].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[20].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[21].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[21].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[22].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[22].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[23].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[23].val[1]);
+  output += 4;
+
+  vst1q_s32(output, s3[24].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[24].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[25].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[25].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[26].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[26].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[27].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[27].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[28].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[28].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[29].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[29].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[30].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[30].val[1]);
+  output += 4;
+  vst1q_s32(output, s3[31].val[0]);
+  output += 4;
+  vst1q_s32(output, s3[31].val[1]);
+}
+
+static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
+                                     int stride, const int bd) {
+  int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32];
+
+  load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+                             &in[5], &in[6], &in[7]);
+
+  // stage 1
+  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);
+
+  // Different for _8_
+  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);
+
+  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);
+
+  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);
+
+  // stage 2
+  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);
+
+  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);
+
+  // stage 3
+  s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
+  s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);
+
+  s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
+                                                         s1[31], cospi_28_64);
+  s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
+                                                         s1[31], cospi_4_64);
+
+  // Different for _8_
+  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64,
+                                                         s1[28], -cospi_4_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64,
+                                                         s1[28], cospi_28_64);
+
+  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64,
+                                                         s1[27], cospi_12_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64,
+                                                         s1[27], cospi_20_64);
+
+  s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
+                                                         s1[24], -cospi_20_64);
+  s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
+                                                         s1[24], cospi_12_64);
+
+  // stage 4
+  s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
+
+  s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
+                                                        s2[15], cospi_24_64);
+  s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
+                                                         s2[15], cospi_8_64);
+
+  s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64,
+                                                         s2[12], -cospi_8_64);
+  s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64,
+                                                         s2[12], cospi_24_64);
+
+  s2[16] = highbd_idct_add_dual(s1[16], s1[19]);
+
+  s2[17] = highbd_idct_add_dual(s1[17], s1[18]);
+  s2[18] = highbd_idct_sub_dual(s1[17], s1[18]);
+
+  s2[19] = highbd_idct_sub_dual(s1[16], s1[19]);
+
+  s2[20] = highbd_idct_sub_dual(s1[23], s1[20]);
+  s2[21] = highbd_idct_sub_dual(s1[22], s1[21]);
+
+  s2[22] = highbd_idct_add_dual(s1[21], s1[22]);
+  s2[23] = highbd_idct_add_dual(s1[20], s1[23]);
+
+  s2[24] = highbd_idct_add_dual(s1[24], s1[27]);
+  s2[25] = highbd_idct_add_dual(s1[25], s1[26]);
+  s2[26] = highbd_idct_sub_dual(s1[25], s1[26]);
+  s2[27] = highbd_idct_sub_dual(s1[24], s1[27]);
+
+  s2[28] = highbd_idct_sub_dual(s1[31], s1[28]);
+  s2[29] = highbd_idct_sub_dual(s1[30], s1[29]);
+  s2[30] = highbd_idct_add_dual(s1[29], s1[30]);
+  s2[31] = highbd_idct_add_dual(s1[28], s1[31]);
+
+  // stage 5
+  s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64);
+  s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64);
+
+  s1[8] = highbd_idct_add_dual(s2[8], s2[11]);
+  s1[9] = highbd_idct_add_dual(s2[9], s2[10]);
+  s1[10] = highbd_idct_sub_dual(s2[9], s2[10]);
+  s1[11] = highbd_idct_sub_dual(s2[8], s2[11]);
+  s1[12] = highbd_idct_sub_dual(s2[15], s2[12]);
+  s1[13] = highbd_idct_sub_dual(s2[14], s2[13]);
+  s1[14] = highbd_idct_add_dual(s2[13], s2[14]);
+  s1[15] = highbd_idct_add_dual(s2[12], s2[15]);
+
+  s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64,
+                                                         s2[29], cospi_24_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64,
+                                                         s2[29], cospi_8_64);
+
+  s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64,
+                                                         s2[28], cospi_24_64);
+  s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64,
+                                                         s2[28], cospi_8_64);
+
+  s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64,
+                                                         s2[27], -cospi_8_64);
+  s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64,
+                                                         s2[27], cospi_24_64);
+
+  s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64,
+                                                         s2[26], -cospi_8_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64,
+                                                         s2[26], cospi_24_64);
+
+  // stage 6
+  s2[0] = highbd_idct_add_dual(s1[0], s1[7]);
+  s2[1] = highbd_idct_add_dual(s1[0], s1[6]);
+  s2[2] = highbd_idct_add_dual(s1[0], s1[5]);
+  s2[3] = highbd_idct_add_dual(s1[0], s1[4]);
+  s2[4] = highbd_idct_sub_dual(s1[0], s1[4]);
+  s2[5] = highbd_idct_sub_dual(s1[0], s1[5]);
+  s2[6] = highbd_idct_sub_dual(s1[0], s1[6]);
+  s2[7] = highbd_idct_sub_dual(s1[0], s1[7]);
+
+  s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64);
+  s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64);
+
+  s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64);
+  s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64);
+
+  s1[16] = highbd_idct_add_dual(s2[16], s2[23]);
+  s1[17] = highbd_idct_add_dual(s2[17], s2[22]);
+  s2[18] = highbd_idct_add_dual(s1[18], s1[21]);
+  s2[19] = highbd_idct_add_dual(s1[19], s1[20]);
+  s2[20] = highbd_idct_sub_dual(s1[19], s1[20]);
+  s2[21] = highbd_idct_sub_dual(s1[18], s1[21]);
+  s1[22] = highbd_idct_sub_dual(s2[17], s2[22]);
+  s1[23] = highbd_idct_sub_dual(s2[16], s2[23]);
+
+  s3[24] = highbd_idct_sub_dual(s2[31], s2[24]);
+  s3[25] = highbd_idct_sub_dual(s2[30], s2[25]);
+  s3[26] = highbd_idct_sub_dual(s1[29], s1[26]);
+  s3[27] = highbd_idct_sub_dual(s1[28], s1[27]);
+  s2[28] = highbd_idct_add_dual(s1[27], s1[28]);
+  s2[29] = highbd_idct_add_dual(s1[26], s1[29]);
+  s2[30] = highbd_idct_add_dual(s2[25], s2[30]);
+  s2[31] = highbd_idct_add_dual(s2[24], s2[31]);
+
+  // stage 7
+  s1[0] = highbd_idct_add_dual(s2[0], s1[15]);
+  s1[1] = highbd_idct_add_dual(s2[1], s1[14]);
+  s1[2] = highbd_idct_add_dual(s2[2], s2[13]);
+  s1[3] = highbd_idct_add_dual(s2[3], s2[12]);
+  s1[4] = highbd_idct_add_dual(s2[4], s2[11]);
+  s1[5] = highbd_idct_add_dual(s2[5], s2[10]);
+  s1[6] = highbd_idct_add_dual(s2[6], s1[9]);
+  s1[7] = highbd_idct_add_dual(s2[7], s1[8]);
+  s1[8] = highbd_idct_sub_dual(s2[7], s1[8]);
+  s1[9] = highbd_idct_sub_dual(s2[6], s1[9]);
+  s1[10] = highbd_idct_sub_dual(s2[5], s2[10]);
+  s1[11] = highbd_idct_sub_dual(s2[4], s2[11]);
+  s1[12] = highbd_idct_sub_dual(s2[3], s2[12]);
+  s1[13] = highbd_idct_sub_dual(s2[2], s2[13]);
+  s1[14] = highbd_idct_sub_dual(s2[1], s1[14]);
+  s1[15] = highbd_idct_sub_dual(s2[0], s1[15]);
+
+  s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64);
+  s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64);
+
+  s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64);
+  s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64);
+
+  s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64);
+  s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64);
+
+  s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64);
+  s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64);
+
+  // final stage
+  out[0] = highbd_idct_add_dual(s1[0], s2[31]);
+  out[1] = highbd_idct_add_dual(s1[1], s2[30]);
+  out[2] = highbd_idct_add_dual(s1[2], s2[29]);
+  out[3] = highbd_idct_add_dual(s1[3], s2[28]);
+  out[4] = highbd_idct_add_dual(s1[4], s1[27]);
+  out[5] = highbd_idct_add_dual(s1[5], s1[26]);
+  out[6] = highbd_idct_add_dual(s1[6], s1[25]);
+  out[7] = highbd_idct_add_dual(s1[7], s1[24]);
+  out[8] = highbd_idct_add_dual(s1[8], s2[23]);
+  out[9] = highbd_idct_add_dual(s1[9], s2[22]);
+  out[10] = highbd_idct_add_dual(s1[10], s1[21]);
+  out[11] = highbd_idct_add_dual(s1[11], s1[20]);
+  out[12] = highbd_idct_add_dual(s1[12], s2[19]);
+  out[13] = highbd_idct_add_dual(s1[13], s2[18]);
+  out[14] = highbd_idct_add_dual(s1[14], s1[17]);
+  out[15] = highbd_idct_add_dual(s1[15], s1[16]);
+  out[16] = highbd_idct_sub_dual(s1[15], s1[16]);
+  out[17] = highbd_idct_sub_dual(s1[14], s1[17]);
+  out[18] = highbd_idct_sub_dual(s1[13], s2[18]);
+  out[19] = highbd_idct_sub_dual(s1[12], s2[19]);
+  out[20] = highbd_idct_sub_dual(s1[11], s1[20]);
+  out[21] = highbd_idct_sub_dual(s1[10], s1[21]);
+  out[22] = highbd_idct_sub_dual(s1[9], s2[22]);
+  out[23] = highbd_idct_sub_dual(s1[8], s2[23]);
+  out[24] = highbd_idct_sub_dual(s1[7], s1[24]);
+  out[25] = highbd_idct_sub_dual(s1[6], s1[25]);
+  out[26] = highbd_idct_sub_dual(s1[5], s1[26]);
+  out[27] = highbd_idct_sub_dual(s1[4], s1[27]);
+  out[28] = highbd_idct_sub_dual(s1[3], s2[28]);
+  out[29] = highbd_idct_sub_dual(s1[2], s2[29]);
+  out[30] = highbd_idct_sub_dual(s1[1], s2[30]);
+  out[31] = highbd_idct_sub_dual(s1[0], s2[31]);
+
+  highbd_idct16x16_add_store(out, output, stride, bd);
+  highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
+}
+
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  int i;
+
+  if (bd == 8) {
+    int16_t temp[32 * 8];
+    int16_t *t = temp;
+
+    vpx_idct32_6_neon(input, t);
+
+    for (i = 0; i < 32; i += 8) {
+      vpx_idct32_8_neon(t, dest, stride, 1);
+      t += (8 * 8);
+      dest += 8;
+    }
+  } else {
+    int32_t temp[32 * 8];
+    int32_t *t = temp;
+
+    vpx_highbd_idct32_6_neon(input, t);
+
+    for (i = 0; i < 32; i += 8) {
+      vpx_highbd_idct32_8_neon(t, dest, stride, bd);
+      t += (8 * 8);
+      dest += 8;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
new file mode 100644
index 0000000000..c1354c0c1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  const int16x8_t c2 = vminq_s16(b2, max);
+  const int16x8_t c3 = vminq_s16(b3, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+  vst1q_u16(*dest, c0);
+  vst1q_u16(*dest + 8, c1);
+  vst1q_u16(*dest + 16, c2);
+  vst1q_u16(*dest + 24, c3);
+  *dest += stride;
+}
+
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  int i;
+
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
index b9e226a684..7be1dad1d3 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -11,27 +11,10 @@
 #include <arm_neon.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
 #include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
-static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest,
-                                                const int stride,
-                                                const int16x8_t res,
-                                                const int16x8_t max) {
-  const uint16x4_t a0 = vld1_u16(*dest);
-  const uint16x4_t a1 = vld1_u16(*dest + stride);
-  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1));
-  // Note: In some profile tests, res is quite close to +/-32767.
-  // We use saturating addition.
-  const int16x8_t b = vqaddq_s16(res, a);
-  const int16x8_t c = vminq_s16(b, max);
-  const uint16x8_t d = vqshluq_n_s16(c, 0);
-  vst1_u16(*dest, vget_low_u16(d));
-  *dest += stride;
-  vst1_u16(*dest, vget_high_u16(d));
-  *dest += stride;
-}
-
 // res is in reverse row order
 static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
                                                 const int stride,
@@ -51,123 +34,56 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
   *dest += stride;
 }
 
-void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
   const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
-  const tran_low_t out0 =
-      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  const tran_low_t out1 =
-      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
   const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
   highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
 }
 
-static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
-                                          int32x4_t *const a0,
-                                          int32x4_t *const a1,
-                                          int32x4_t *const a2,
-                                          int32x4_t *const a3) {
-  int32x4_t b0, b1, b2, b3;
-
-  transpose_s32_4x4(a0, a1, a2, a3);
-  b0 = vaddq_s32(*a0, *a2);
-  b1 = vsubq_s32(*a0, *a2);
-  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
-  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
-  b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);
-  b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
-  b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
-  b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
-  b0 = vrshrq_n_s32(b0, 14);
-  b1 = vrshrq_n_s32(b1, 14);
-  b2 = vrshrq_n_s32(b2, 14);
-  b3 = vrshrq_n_s32(b3, 14);
-  *a0 = vaddq_s32(b0, b3);
-  *a1 = vaddq_s32(b1, b2);
-  *a2 = vsubq_s32(b1, b2);
-  *a3 = vsubq_s32(b0, b3);
-}
-
-static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
-                                          int32x4_t *const a0,
-                                          int32x4_t *const a1,
-                                          int32x4_t *const a2,
-                                          int32x4_t *const a3) {
-  int32x4_t b0, b1, b2, b3;
-  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;
-
-  transpose_s32_4x4(a0, a1, a2, a3);
-  b0 = vaddq_s32(*a0, *a2);
-  b1 = vsubq_s32(*a0, *a2);
-  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
-  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
-  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
-  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
-  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
-  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
-  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
-  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
-  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
-  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
-  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
-  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
-  c4 = vsubq_s64(c4, c8);
-  c5 = vsubq_s64(c5, c9);
-  c6 = vaddq_s64(c6, c10);
-  c7 = vaddq_s64(c7, c11);
-  b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
-  b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
-  b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
-  b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
-  *a0 = vaddq_s32(b0, b3);
-  *a1 = vaddq_s32(b1, b2);
-  *a2 = vsubq_s32(b1, b2);
-  *a3 = vsubq_s32(b0, b3);
-}
-
-void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
-  DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,
-                                                             6270 };
   const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
-  int32x4_t c0 = vld1q_s32(input);
-  int32x4_t c1 = vld1q_s32(input + 4);
-  int32x4_t c2 = vld1q_s32(input + 8);
-  int32x4_t c3 = vld1q_s32(input + 12);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int16x8_t a0, a1;
+  int16x8_t a[2];
+  int32x4_t c[4];
+
+  c[0] = vld1q_s32(input);
+  c[1] = vld1q_s32(input + 4);
+  c[2] = vld1q_s32(input + 8);
+  c[3] = vld1q_s32(input + 12);
 
   if (bd == 8) {
-    const int16x4_t cospis = vld1_s16(kCospi);
-
     // Rows
-    a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1));
-    a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3));
-    idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+    a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
+    a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
+    transpose_idct4x4_16_bd8(a);
 
     // Columns
-    a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
-    idct4x4_16_kernel_bd8(cospis, &a0, &a1);
-    a0 = vrshrq_n_s16(a0, 4);
-    a1 = vrshrq_n_s16(a1, 4);
+    a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+    transpose_idct4x4_16_bd8(a);
+    a[0] = vrshrq_n_s16(a[0], 4);
+    a[1] = vrshrq_n_s16(a[1], 4);
   } else {
     const int32x4_t cospis = vld1q_s32(kCospi32);
 
     if (bd == 10) {
-      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
-      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
+      idct4x4_16_kernel_bd10(cospis, c);
+      idct4x4_16_kernel_bd10(cospis, c);
     } else {
-      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
-      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
+      idct4x4_16_kernel_bd12(cospis, c);
+      idct4x4_16_kernel_bd12(cospis, c);
     }
-    a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4));
-    a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4));
+    a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
+    a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4));
   }
 
-  highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max);
-  highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max);
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+  highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index c1c0f645d1..bed3227ca7 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -11,41 +11,61 @@
 #include <arm_neon.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
 #include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
-static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
-                                               const int stride,
-                                               const int16x8_t res,
-                                               const int16x8_t max) {
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res,
+                                                   const int16x8_t max) {
   const uint16x8_t a = vld1q_u16(*dest);
   const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
   const int16x8_t c = vminq_s16(b, max);
-  const uint16x8_t d = vqshluq_n_s16(c, 0);
-  vst1q_u16(*dest, d);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c));
   *dest += stride;
 }
 
-void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res) {
+  const uint16x8_t a = vld1q_u16(*dest);
+  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+  const uint16x8_t c = vqshluq_n_s16(b, 0);
+  vst1q_u16(*dest, c);
+  *dest += stride;
+}
+
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
-  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
-  const tran_low_t out0 =
-      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  const tran_low_t out1 =
-      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+  const tran_low_t out0 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  const tran_low_t out1 = HIGHBD_WRAPLOW(
+      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
   const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+  } else {
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+  }
 }
 
 static INLINE void idct8x8_12_half1d_bd10(
@@ -62,18 +82,18 @@ static INLINE void idct8x8_12_half1d_bd10(
   step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
   step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
   step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
   step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
   step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -89,8 +109,8 @@ static INLINE void idct8x8_12_half1d_bd10(
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -108,7 +128,7 @@ static INLINE void idct8x8_12_half1d_bd12(
     int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
     int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
     int32x4_t *const io7) {
-  int32x2_t input_1l, input_1h, input_3l, input_3h;
+  int32x2_t input1l, input1h, input3l, input3h;
   int32x2_t step1l[2], step1h[2];
   int32x4_t step1[8], step2[8];
   int64x2_t t64[8];
@@ -117,31 +137,31 @@ static INLINE void idct8x8_12_half1d_bd12(
   transpose_s32_4x4(io0, io1, io2, io3);
 
   // stage 1
-  input_1l = vget_low_s32(*io1);
-  input_1h = vget_high_s32(*io1);
-  input_3l = vget_low_s32(*io3);
-  input_3h = vget_high_s32(*io3);
+  input1l = vget_low_s32(*io1);
+  input1h = vget_high_s32(*io1);
+  input3l = vget_low_s32(*io3);
+  input3h = vget_high_s32(*io3);
   step1l[0] = vget_low_s32(*io0);
   step1h[0] = vget_high_s32(*io0);
   step1l[1] = vget_low_s32(*io2);
   step1h[1] = vget_high_s32(*io2);
 
-  t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
-  t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
-  t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
-  t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
-  t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
-  t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
-  t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
-  t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
+  t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
+  t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
+  t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
+  t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
+  t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
+  t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
+  t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -154,12 +174,12 @@ static INLINE void idct8x8_12_half1d_bd12(
   t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
   t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
   t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
   step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -185,10 +205,10 @@ static INLINE void idct8x8_12_half1d_bd12(
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                           vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
@@ -203,83 +223,15 @@ static INLINE void idct8x8_12_half1d_bd12(
   *io7 = vsubq_s32(step1[0], step2[7]);
 }
 
-static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
-                                 int16x8_t a3, int16x8_t a4, int16x8_t a5,
-                                 int16x8_t a6, int16x8_t a7, uint16_t *dest,
-                                 const int stride, const int bd) {
-  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
-  const uint16_t *dst = dest;
-  uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
-  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
-  int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;
-
-  d0 = vld1q_u16(dst);
-  dst += stride;
-  d1 = vld1q_u16(dst);
-  dst += stride;
-  d2 = vld1q_u16(dst);
-  dst += stride;
-  d3 = vld1q_u16(dst);
-  dst += stride;
-  d4 = vld1q_u16(dst);
-  dst += stride;
-  d5 = vld1q_u16(dst);
-  dst += stride;
-  d6 = vld1q_u16(dst);
-  dst += stride;
-  d7 = vld1q_u16(dst);
-
-  d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0));
-  d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1));
-  d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2));
-  d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3));
-  d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4));
-  d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5));
-  d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6));
-  d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7));
-
-  d0_s16 = vminq_s16(d0_s16, max);
-  d1_s16 = vminq_s16(d1_s16, max);
-  d2_s16 = vminq_s16(d2_s16, max);
-  d3_s16 = vminq_s16(d3_s16, max);
-  d4_s16 = vminq_s16(d4_s16, max);
-  d5_s16 = vminq_s16(d5_s16, max);
-  d6_s16 = vminq_s16(d6_s16, max);
-  d7_s16 = vminq_s16(d7_s16, max);
-  d0_u16 = vqshluq_n_s16(d0_s16, 0);
-  d1_u16 = vqshluq_n_s16(d1_s16, 0);
-  d2_u16 = vqshluq_n_s16(d2_s16, 0);
-  d3_u16 = vqshluq_n_s16(d3_s16, 0);
-  d4_u16 = vqshluq_n_s16(d4_s16, 0);
-  d5_u16 = vqshluq_n_s16(d5_s16, 0);
-  d6_u16 = vqshluq_n_s16(d6_s16, 0);
-  d7_u16 = vqshluq_n_s16(d7_s16, 0);
-
-  vst1q_u16(dest, d0_u16);
-  dest += stride;
-  vst1q_u16(dest, d1_u16);
-  dest += stride;
-  vst1q_u16(dest, d2_u16);
-  dest += stride;
-  vst1q_u16(dest, d3_u16);
-  dest += stride;
-  vst1q_u16(dest, d4_u16);
-  dest += stride;
-  vst1q_u16(dest, d5_u16);
-  dest += stride;
-  vst1q_u16(dest, d6_u16);
-  dest += stride;
-  vst1q_u16(dest, d7_u16);
-}
-
-void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int32x4_t a0 = vld1q_s32(input);
-  int32x4_t a1 = vld1q_s32(input + 8);
-  int32x4_t a2 = vld1q_s32(input + 16);
-  int32x4_t a3 = vld1q_s32(input + 24);
-  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+  int32x4_t a[16];
+  int16x8_t c[8];
+
+  a[0] = vld1q_s32(input);
+  a[1] = vld1q_s32(input + 8);
+  a[2] = vld1q_s32(input + 16);
+  a[3] = vld1q_s32(input + 24);
 
   if (bd == 8) {
     const int16x8_t cospis = vld1q_s16(kCospi);
@@ -287,328 +239,133 @@ void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
     const int16x4_t cospis0 = vget_low_s16(cospis);     // cospi 0, 8, 16, 24
     const int16x4_t cospisd0 = vget_low_s16(cospisd);   // doubled 0, 8, 16, 24
     const int16x4_t cospisd1 = vget_high_s16(cospisd);  // doubled 4, 12, 20, 28
-    int16x4_t b0 = vmovn_s32(a0);
-    int16x4_t b1 = vmovn_s32(a1);
-    int16x4_t b2 = vmovn_s32(a2);
-    int16x4_t b3 = vmovn_s32(a3);
-    int16x4_t b4, b5, b6, b7;
+    int16x4_t b[8];
 
-    idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4,
-                         &b5, &b6, &b7);
-    idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5,
-                         b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7);
-    c0 = vrshrq_n_s16(c0, 5);
-    c1 = vrshrq_n_s16(c1, 5);
-    c2 = vrshrq_n_s16(c2, 5);
-    c3 = vrshrq_n_s16(c3, 5);
-    c4 = vrshrq_n_s16(c4, 5);
-    c5 = vrshrq_n_s16(c5, 5);
-    c6 = vrshrq_n_s16(c6, 5);
-    c7 = vrshrq_n_s16(c7, 5);
+    b[0] = vmovn_s32(a[0]);
+    b[1] = vmovn_s32(a[1]);
+    b[2] = vmovn_s32(a[2]);
+    b[3] = vmovn_s32(a[3]);
+
+    idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b);
+    idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c);
+    c[0] = vrshrq_n_s16(c[0], 5);
+    c[1] = vrshrq_n_s16(c[1], 5);
+    c[2] = vrshrq_n_s16(c[2], 5);
+    c[3] = vrshrq_n_s16(c[3], 5);
+    c[4] = vrshrq_n_s16(c[4], 5);
+    c[5] = vrshrq_n_s16(c[5], 5);
+    c[6] = vrshrq_n_s16(c[6], 5);
+    c[7] = vrshrq_n_s16(c[7], 5);
   } else {
     const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
     const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
-    int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15;
 
     if (bd == 10) {
-      idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
-                             &a6, &a7);
-      idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
-                             &a10, &a11);
-      idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
-                             &a14, &a15);
+      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[8], &a[9], &a[10], &a[11]);
+      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
+                             &a[12], &a[13], &a[14], &a[15]);
     } else {
-      idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
-                             &a6, &a7);
-      idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
-                             &a10, &a11);
-      idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
-                             &a14, &a15);
+      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[8], &a[9], &a[10], &a[11]);
+      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
+                             &a[12], &a[13], &a[14], &a[15]);
     }
-    c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
-    c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
-    c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
-    c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
-    c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
-    c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
-    c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
-    c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
+    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+    c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+    c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+    c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+    c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+    c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+    c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
   }
-  highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
+  highbd_add8x8(c, dest, stride, bd);
 }
 
-static INLINE void idct8x8_64_half1d_bd10(
-    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
-    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
-    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
-    int32x4_t *const io7) {
-  int32x4_t step1[8], step2[8];
-
-  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
-
-  // stage 1
-  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
-  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
-  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
-  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
-
-  step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
-  step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
-  step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
-  step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
-
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
-
-  // stage 2
-  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
-  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
-  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
-
-  step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
-  step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
-  step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
-  step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
-
-  step2[0] = vrshrq_n_s32(step2[0], 14);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
-
-  step2[4] = vaddq_s32(step1[4], step1[5]);
-  step2[5] = vsubq_s32(step1[4], step1[5]);
-  step2[6] = vsubq_s32(step1[7], step1[6]);
-  step2[7] = vaddq_s32(step1[7], step1[6]);
-
-  // stage 3
-  step1[0] = vaddq_s32(step2[0], step2[3]);
-  step1[1] = vaddq_s32(step2[1], step2[2]);
-  step1[2] = vsubq_s32(step2[1], step2[2]);
-  step1[3] = vsubq_s32(step2[0], step2[3]);
-
-  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
-  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-
-  // stage 4
-  *io0 = vaddq_s32(step1[0], step2[7]);
-  *io1 = vaddq_s32(step1[1], step1[6]);
-  *io2 = vaddq_s32(step1[2], step1[5]);
-  *io3 = vaddq_s32(step1[3], step2[4]);
-  *io4 = vsubq_s32(step1[3], step2[4]);
-  *io5 = vsubq_s32(step1[2], step1[5]);
-  *io6 = vsubq_s32(step1[1], step1[6]);
-  *io7 = vsubq_s32(step1[0], step2[7]);
-}
-
-static INLINE void idct8x8_64_half1d_bd12(
-    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
-    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
-    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
-    int32x4_t *const io7) {
-  int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
-      input_7l, input_7h;
-  int32x2_t step1l[4], step1h[4];
-  int32x4_t step1[8], step2[8];
-  int64x2_t t64[8];
-  int32x2_t t32[8];
-
-  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
-
-  // stage 1
-  input_1l = vget_low_s32(*io1);
-  input_1h = vget_high_s32(*io1);
-  input_3l = vget_low_s32(*io3);
-  input_3h = vget_high_s32(*io3);
-  input_5l = vget_low_s32(*io5);
-  input_5h = vget_high_s32(*io5);
-  input_7l = vget_low_s32(*io7);
-  input_7h = vget_high_s32(*io7);
-  step1l[0] = vget_low_s32(*io0);
-  step1h[0] = vget_high_s32(*io0);
-  step1l[1] = vget_low_s32(*io2);
-  step1h[1] = vget_high_s32(*io2);
-  step1l[2] = vget_low_s32(*io4);
-  step1h[2] = vget_high_s32(*io4);
-  step1l[3] = vget_low_s32(*io6);
-  step1h[3] = vget_high_s32(*io6);
-
-  t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
-  t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
-  t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
-  t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
-  t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
-  t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
-  t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
-  t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
-  t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0);
-  t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0);
-  t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1);
-  t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1);
-  t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0);
-  t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
-  t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
-  t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
-  step1[4] = vcombine_s32(t32[0], t32[1]);
-  step1[5] = vcombine_s32(t32[2], t32[3]);
-  step1[6] = vcombine_s32(t32[4], t32[5]);
-  step1[7] = vcombine_s32(t32[6], t32[7]);
-
-  // stage 2
-  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
-  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
-  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
-  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
-  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
-  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
-  t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
-  t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
-  t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
-  t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
-  t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
-  t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
-  t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
-  t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
-  step2[0] = vcombine_s32(t32[0], t32[1]);
-  step2[1] = vcombine_s32(t32[2], t32[3]);
-  step2[2] = vcombine_s32(t32[4], t32[5]);
-  step2[3] = vcombine_s32(t32[6], t32[7]);
-
-  step2[4] = vaddq_s32(step1[4], step1[5]);
-  step2[5] = vsubq_s32(step1[4], step1[5]);
-  step2[6] = vsubq_s32(step1[7], step1[6]);
-  step2[7] = vaddq_s32(step1[7], step1[6]);
-
-  // stage 3
-  step1[0] = vaddq_s32(step2[0], step2[3]);
-  step1[1] = vaddq_s32(step2[1], step2[2]);
-  step1[2] = vsubq_s32(step2[1], step2[2]);
-  step1[3] = vsubq_s32(step2[0], step2[3]);
-
-  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
-  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
-  t64[0] =
-      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
-  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
-                          vget_high_s32(cospis0), 0);
-  t64[2] =
-      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
-  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
-                          vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  step1[5] = vcombine_s32(t32[0], t32[1]);
-  step1[6] = vcombine_s32(t32[2], t32[3]);
-
-  // stage 4
-  *io0 = vaddq_s32(step1[0], step2[7]);
-  *io1 = vaddq_s32(step1[1], step1[6]);
-  *io2 = vaddq_s32(step1[2], step1[5]);
-  *io3 = vaddq_s32(step1[3], step2[4]);
-  *io4 = vsubq_s32(step1[3], step2[4]);
-  *io5 = vsubq_s32(step1[2], step1[5]);
-  *io6 = vsubq_s32(step1[1], step1[6]);
-  *io7 = vsubq_s32(step1[0], step2[7]);
-}
-
-void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int32x4_t a0 = vld1q_s32(input);
-  int32x4_t a1 = vld1q_s32(input + 4);
-  int32x4_t a2 = vld1q_s32(input + 8);
-  int32x4_t a3 = vld1q_s32(input + 12);
-  int32x4_t a4 = vld1q_s32(input + 16);
-  int32x4_t a5 = vld1q_s32(input + 20);
-  int32x4_t a6 = vld1q_s32(input + 24);
-  int32x4_t a7 = vld1q_s32(input + 28);
-  int32x4_t a8 = vld1q_s32(input + 32);
-  int32x4_t a9 = vld1q_s32(input + 36);
-  int32x4_t a10 = vld1q_s32(input + 40);
-  int32x4_t a11 = vld1q_s32(input + 44);
-  int32x4_t a12 = vld1q_s32(input + 48);
-  int32x4_t a13 = vld1q_s32(input + 52);
-  int32x4_t a14 = vld1q_s32(input + 56);
-  int32x4_t a15 = vld1q_s32(input + 60);
-  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+  int32x4_t a[16];
+  int16x8_t c[8];
+
+  a[0] = vld1q_s32(input);
+  a[1] = vld1q_s32(input + 4);
+  a[2] = vld1q_s32(input + 8);
+  a[3] = vld1q_s32(input + 12);
+  a[4] = vld1q_s32(input + 16);
+  a[5] = vld1q_s32(input + 20);
+  a[6] = vld1q_s32(input + 24);
+  a[7] = vld1q_s32(input + 28);
+  a[8] = vld1q_s32(input + 32);
+  a[9] = vld1q_s32(input + 36);
+  a[10] = vld1q_s32(input + 40);
+  a[11] = vld1q_s32(input + 44);
+  a[12] = vld1q_s32(input + 48);
+  a[13] = vld1q_s32(input + 52);
+  a[14] = vld1q_s32(input + 56);
+  a[15] = vld1q_s32(input + 60);
 
   if (bd == 8) {
     const int16x8_t cospis = vld1q_s16(kCospi);
     const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
     const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
-    int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1));
-    int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3));
-    int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5));
-    int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7));
-    int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9));
-    int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11));
-    int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13));
-    int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15));
+    int16x8_t b[8];
 
-    idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
-    idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
+    b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
+    b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
+    b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
+    b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
+    b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
+    b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
+    b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
+    b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));
 
-    c0 = vrshrq_n_s16(b0, 5);
-    c1 = vrshrq_n_s16(b1, 5);
-    c2 = vrshrq_n_s16(b2, 5);
-    c3 = vrshrq_n_s16(b3, 5);
-    c4 = vrshrq_n_s16(b4, 5);
-    c5 = vrshrq_n_s16(b5, 5);
-    c6 = vrshrq_n_s16(b6, 5);
-    c7 = vrshrq_n_s16(b7, 5);
+    idct8x8_64_1d_bd8(cospis0, cospis1, b);
+    idct8x8_64_1d_bd8(cospis0, cospis1, b);
+
+    c[0] = vrshrq_n_s16(b[0], 5);
+    c[1] = vrshrq_n_s16(b[1], 5);
+    c[2] = vrshrq_n_s16(b[2], 5);
+    c[3] = vrshrq_n_s16(b[3], 5);
+    c[4] = vrshrq_n_s16(b[4], 5);
+    c[5] = vrshrq_n_s16(b[5], 5);
+    c[6] = vrshrq_n_s16(b[6], 5);
+    c[7] = vrshrq_n_s16(b[7], 5);
   } else {
     const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
     const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
 
     if (bd == 10) {
-      idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
-                             &a6, &a7);
-      idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
-                             &a14, &a15);
-      idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
-                             &a3, &a11);
-      idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
-                             &a7, &a15);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                             &a[12], &a[13], &a[14], &a[15]);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                             &a[2], &a[10], &a[3], &a[11]);
+      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                             &a[6], &a[14], &a[7], &a[15]);
     } else {
-      idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
-                             &a6, &a7);
-      idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
-                             &a14, &a15);
-      idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
-                             &a3, &a11);
-      idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
-                             &a7, &a15);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
+                             &a[4], &a[5], &a[6], &a[7]);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
+                             &a[12], &a[13], &a[14], &a[15]);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
+                             &a[2], &a[10], &a[3], &a[11]);
+      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
+                             &a[6], &a[14], &a[7], &a[15]);
     }
-    c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
-    c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
-    c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
-    c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
-    c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
-    c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
-    c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
-    c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
+    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
+    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
+    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
+    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
+    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
+    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
+    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
+    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
   }
-  highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
+  highbd_add8x8(c, dest, stride, bd);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
new file mode 100644
index 0000000000..518ef4336e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h
@@ -0,0 +1,474 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest,
+                                                const int stride,
+                                                const int16x8_t res,
+                                                const int16x8_t max) {
+  const uint16x4_t a0 = vld1_u16(*dest);
+  const uint16x4_t a1 = vld1_u16(*dest + stride);
+  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1));
+  // Note: In some profile tests, res is quite close to +/-32767.
+  // We use saturating addition.
+  const int16x8_t b = vqaddq_s16(res, a);
+  const int16x8_t c = vminq_s16(b, max);
+  const uint16x8_t d = vqshluq_n_s16(c, 0);
+  vst1_u16(*dest, vget_low_u16(d));
+  *dest += stride;
+  vst1_u16(*dest, vget_high_u16(d));
+  *dest += stride;
+}
+
+static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
+                                          int32x4_t *const a) {
+  int32x4_t b0, b1, b2, b3;
+
+  transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
+  b0 = vaddq_s32(a[0], a[2]);
+  b1 = vsubq_s32(a[0], a[2]);
+  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
+  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
+  b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1);
+  b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1);
+  b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1);
+  b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1);
+  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
+  a[0] = vaddq_s32(b0, b3);
+  a[1] = vaddq_s32(b1, b2);
+  a[2] = vsubq_s32(b1, b2);
+  a[3] = vsubq_s32(b0, b3);
+}
+
+static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
+                                          int32x4_t *const a) {
+  int32x4_t b0, b1, b2, b3;
+  int64x2_t c[12];
+
+  transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
+  b0 = vaddq_s32(a[0], a[2]);
+  b1 = vsubq_s32(a[0], a[2]);
+  c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
+  c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
+  c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
+  c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
+  c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1);
+  c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1);
+  c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1);
+  c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1);
+  c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1);
+  c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1);
+  c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1);
+  c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1);
+  c[4] = vsubq_s64(c[4], c[8]);
+  c[5] = vsubq_s64(c[5], c[9]);
+  c[6] = vaddq_s64(c[6], c[10]);
+  c[7] = vaddq_s64(c[7], c[11]);
+  b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[1], DCT_CONST_BITS));
+  b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[3], DCT_CONST_BITS));
+  b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[5], DCT_CONST_BITS));
+  b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS),
+                    vrshrn_n_s64(c[7], DCT_CONST_BITS));
+  a[0] = vaddq_s32(b0, b3);
+  a[1] = vaddq_s32(b1, b2);
+  a[2] = vsubq_s32(b1, b2);
+  a[3] = vsubq_s32(b0, b3);
+}
+
+static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest,
+                                 const int stride, const int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  const uint16_t *dst = dest;
+  uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
+  int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;
+
+  d0 = vld1q_u16(dst);
+  dst += stride;
+  d1 = vld1q_u16(dst);
+  dst += stride;
+  d2 = vld1q_u16(dst);
+  dst += stride;
+  d3 = vld1q_u16(dst);
+  dst += stride;
+  d4 = vld1q_u16(dst);
+  dst += stride;
+  d5 = vld1q_u16(dst);
+  dst += stride;
+  d6 = vld1q_u16(dst);
+  dst += stride;
+  d7 = vld1q_u16(dst);
+
+  d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0));
+  d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1));
+  d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2));
+  d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3));
+  d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4));
+  d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5));
+  d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6));
+  d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7));
+
+  d0_s16 = vminq_s16(d0_s16, max);
+  d1_s16 = vminq_s16(d1_s16, max);
+  d2_s16 = vminq_s16(d2_s16, max);
+  d3_s16 = vminq_s16(d3_s16, max);
+  d4_s16 = vminq_s16(d4_s16, max);
+  d5_s16 = vminq_s16(d5_s16, max);
+  d6_s16 = vminq_s16(d6_s16, max);
+  d7_s16 = vminq_s16(d7_s16, max);
+  d0_u16 = vqshluq_n_s16(d0_s16, 0);
+  d1_u16 = vqshluq_n_s16(d1_s16, 0);
+  d2_u16 = vqshluq_n_s16(d2_s16, 0);
+  d3_u16 = vqshluq_n_s16(d3_s16, 0);
+  d4_u16 = vqshluq_n_s16(d4_s16, 0);
+  d5_u16 = vqshluq_n_s16(d5_s16, 0);
+  d6_u16 = vqshluq_n_s16(d6_s16, 0);
+  d7_u16 = vqshluq_n_s16(d7_s16, 0);
+
+  vst1q_u16(dest, d0_u16);
+  dest += stride;
+  vst1q_u16(dest, d1_u16);
+  dest += stride;
+  vst1q_u16(dest, d2_u16);
+  dest += stride;
+  vst1q_u16(dest, d3_u16);
+  dest += stride;
+  vst1q_u16(dest, d4_u16);
+  dest += stride;
+  vst1q_u16(dest, d5_u16);
+  dest += stride;
+  vst1q_u16(dest, d6_u16);
+  dest += stride;
+  vst1q_u16(dest, d7_u16);
+}
+
+static INLINE void idct8x8_64_half1d_bd10(
+    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+    int32x4_t *const io7) {
+  int32x4_t step1[8], step2[8];
+
+  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+  // stage 1
+  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
+  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
+  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
+  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
+
+  step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
+  step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
+  step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
+  step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
+
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
+
+  // stage 2
+  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
+  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
+  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
+
+  step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+  step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
+  step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
+  step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
+
+  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
+
+  step2[4] = vaddq_s32(step1[4], step1[5]);
+  step2[5] = vsubq_s32(step1[4], step1[5]);
+  step2[6] = vsubq_s32(step1[7], step1[6]);
+  step2[7] = vaddq_s32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = vaddq_s32(step2[0], step2[3]);
+  step1[1] = vaddq_s32(step2[1], step2[2]);
+  step1[2] = vsubq_s32(step2[1], step2[2]);
+  step1[3] = vsubq_s32(step2[0], step2[3]);
+
+  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
+  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+
+  // stage 4
+  *io0 = vaddq_s32(step1[0], step2[7]);
+  *io1 = vaddq_s32(step1[1], step1[6]);
+  *io2 = vaddq_s32(step1[2], step1[5]);
+  *io3 = vaddq_s32(step1[3], step2[4]);
+  *io4 = vsubq_s32(step1[3], step2[4]);
+  *io5 = vsubq_s32(step1[2], step1[5]);
+  *io6 = vsubq_s32(step1[1], step1[6]);
+  *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_64_half1d_bd12(
+    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
+    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
+    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
+    int32x4_t *const io7) {
+  int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
+      input7h;
+  int32x2_t step1l[4], step1h[4];
+  int32x4_t step1[8], step2[8];
+  int64x2_t t64[8];
+  int32x2_t t32[8];
+
+  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
+
+  // stage 1
+  input1l = vget_low_s32(*io1);
+  input1h = vget_high_s32(*io1);
+  input3l = vget_low_s32(*io3);
+  input3h = vget_high_s32(*io3);
+  input5l = vget_low_s32(*io5);
+  input5h = vget_high_s32(*io5);
+  input7l = vget_low_s32(*io7);
+  input7h = vget_high_s32(*io7);
+  step1l[0] = vget_low_s32(*io0);
+  step1h[0] = vget_high_s32(*io0);
+  step1l[1] = vget_low_s32(*io2);
+  step1h[1] = vget_high_s32(*io2);
+  step1l[2] = vget_low_s32(*io4);
+  step1h[2] = vget_high_s32(*io4);
+  step1l[3] = vget_low_s32(*io6);
+  step1h[3] = vget_high_s32(*io6);
+
+  t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
+  t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
+  t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
+  t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
+  t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
+  t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
+  t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
+  t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
+  t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0);
+  t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0);
+  t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1);
+  t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1);
+  t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0);
+  t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0);
+  t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1);
+  t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+  step1[4] = vcombine_s32(t32[0], t32[1]);
+  step1[5] = vcombine_s32(t32[2], t32[3]);
+  step1[6] = vcombine_s32(t32[4], t32[5]);
+  step1[7] = vcombine_s32(t32[6], t32[7]);
+
+  // stage 2
+  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
+  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
+  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
+  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
+  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
+  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
+  t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+  t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+  t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
+  t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
+  t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
+  t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
+  t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
+  t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
+  step2[0] = vcombine_s32(t32[0], t32[1]);
+  step2[1] = vcombine_s32(t32[2], t32[3]);
+  step2[2] = vcombine_s32(t32[4], t32[5]);
+  step2[3] = vcombine_s32(t32[6], t32[7]);
+
+  step2[4] = vaddq_s32(step1[4], step1[5]);
+  step2[5] = vsubq_s32(step1[4], step1[5]);
+  step2[6] = vsubq_s32(step1[7], step1[6]);
+  step2[7] = vaddq_s32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = vaddq_s32(step2[0], step2[3]);
+  step1[1] = vaddq_s32(step2[1], step2[2]);
+  step1[2] = vsubq_s32(step2[1], step2[2]);
+  step1[3] = vsubq_s32(step2[0], step2[3]);
+
+  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
+  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
+  t64[0] =
+      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
+                          vget_high_s32(cospis0), 0);
+  t64[2] =
+      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
+  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
+                          vget_high_s32(cospis0), 0);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  step1[5] = vcombine_s32(t32[0], t32[1]);
+  step1[6] = vcombine_s32(t32[2], t32[3]);
+
+  // stage 4
+  *io0 = vaddq_s32(step1[0], step2[7]);
+  *io1 = vaddq_s32(step1[1], step1[6]);
+  *io2 = vaddq_s32(step1[2], step1[5]);
+  *io3 = vaddq_s32(step1[3], step2[4]);
+  *io4 = vsubq_s32(step1[3], step2[4]);
+  *io5 = vsubq_s32(step1[2], step1[5]);
+  *io6 = vsubq_s32(step1[1], step1[6]);
+  *io7 = vsubq_s32(step1[0], step2[7]);
+}
+
+static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
+                                                int32_t *output) {
+  // Save the result into output
+  vst1q_s32(output + 0, out[0].val[0]);
+  vst1q_s32(output + 4, out[0].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[1].val[0]);
+  vst1q_s32(output + 4, out[1].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[2].val[0]);
+  vst1q_s32(output + 4, out[2].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[3].val[0]);
+  vst1q_s32(output + 4, out[3].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[4].val[0]);
+  vst1q_s32(output + 4, out[4].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[5].val[0]);
+  vst1q_s32(output + 4, out[5].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[6].val[0]);
+  vst1q_s32(output + 4, out[6].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[7].val[0]);
+  vst1q_s32(output + 4, out[7].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[8].val[0]);
+  vst1q_s32(output + 4, out[8].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[9].val[0]);
+  vst1q_s32(output + 4, out[9].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[10].val[0]);
+  vst1q_s32(output + 4, out[10].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[11].val[0]);
+  vst1q_s32(output + 4, out[11].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[12].val[0]);
+  vst1q_s32(output + 4, out[12].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[13].val[0]);
+  vst1q_s32(output + 4, out[13].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[14].val[0]);
+  vst1q_s32(output + 4, out[14].val[1]);
+  output += 16;
+  vst1q_s32(output + 0, out[15].val[0]);
+  vst1q_s32(output + 4, out[15].val[1]);
+}
+
+static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
+                                              uint16_t *dest, const int stride,
+                                              const int bd) {
+  // Add the result to dest
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int16x8_t o[16];
+  o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
+                      vrshrn_n_s32(out[0].val[1], 6));
+  o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
+                      vrshrn_n_s32(out[1].val[1], 6));
+  o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
+                      vrshrn_n_s32(out[2].val[1], 6));
+  o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
+                      vrshrn_n_s32(out[3].val[1], 6));
+  o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
+                      vrshrn_n_s32(out[4].val[1], 6));
+  o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
+                      vrshrn_n_s32(out[5].val[1], 6));
+  o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
+                      vrshrn_n_s32(out[6].val[1], 6));
+  o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
+                      vrshrn_n_s32(out[7].val[1], 6));
+  o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
+                      vrshrn_n_s32(out[8].val[1], 6));
+  o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
+                      vrshrn_n_s32(out[9].val[1], 6));
+  o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
+                       vrshrn_n_s32(out[10].val[1], 6));
+  o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
+                       vrshrn_n_s32(out[11].val[1], 6));
+  o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
+                       vrshrn_n_s32(out[12].val[1], 6));
+  o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
+                       vrshrn_n_s32(out[13].val[1], 6));
+  o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
+                       vrshrn_n_s32(out[14].val[1], 6));
+  o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
+                       vrshrn_n_s32(out[15].val[1], 6));
+  highbd_idct16x16_add8x1(o[0], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[1], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[2], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[3], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[4], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[5], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[6], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[7], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[8], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[9], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[10], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[11], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[12], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[13], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[14], max, &dest, stride);
+  highbd_idct16x16_add8x1(o[15], max, &dest, stride);
+}
+
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+                                         uint16_t *dest, const int stride,
+                                         const int bd);
+
+#endif  // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
index 6f7e5da762..235cb5b996 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -12,23 +12,22 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "sum_neon.h"
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
 // DC 4x4
 
-static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
+static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
   const uint16x4_t ref_u16 = vld1_u16(ref);
-  const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
-  return vpadd_u16(p0, p0);
+  return horizontal_add_uint16x4(ref_u16);
 }
 
 static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                 const uint16x4_t dc) {
-  const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    vst1_u16(dst, dc_dup);
+    vst1_u16(dst, dc);
   }
 }
 
@@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *left, int bd) {
   const uint16x4_t a = vld1_u16(above);
   const uint16x4_t l = vld1_u16(left);
-  uint16x4_t sum;
-  uint16x4_t dc;
+  const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l));
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3);
   (void)bd;
-  sum = vadd_u16(a, l);
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vrshr_n_u16(sum, 3);
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_4(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  const uint16_t sum = dc_sum_4(left);
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
   (void)above;
   (void)bd;
   dc_store_4x4(dst, stride, dc);
@@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_4(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  const uint16_t sum = dc_sum_4(above);
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
   (void)left;
   (void)bd;
   dc_store_4x4(dst, stride, dc);
@@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 8x8
 
-static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
+static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
   const uint16x8_t ref_u16 = vld1q_u16(ref);
-  uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+  return horizontal_add_uint16x8(ref_u16);
 }
 
 static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
-                                const uint16x4_t dc) {
-  const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
+                                const uint16x8_t dc) {
   int i;
   for (i = 0; i < 8; ++i, dst += stride) {
-    vst1q_u16(dst, dc_dup);
+    vst1q_u16(dst, dc);
   }
 }
 
@@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
   const uint16x8_t above_u16 = vld1q_u16(above);
   const uint16x8_t left_u16 = vld1q_u16(left);
   const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  uint16x4_t dc;
+  const uint16_t sum = horizontal_add_uint16x8(p0);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)bd;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vrshr_n_u16(sum, 4);
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_8(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  const uint16_t sum = dc_sum_8(left);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
   (void)above;
   (void)bd;
   dc_store_8x8(dst, stride, dc);
@@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_8(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  const uint16_t sum = dc_sum_8(above);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
   (void)left;
   (void)bd;
   dc_store_8x8(dst, stride, dc);
@@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_8x8(dst, stride, dc);
@@ -142,47 +131,43 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 16x16
 
-static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
-  const uint16x8x2_t ref_u16 = vld2q_u16(ref);
-  const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_16(const uint16_t *ref) {
+  const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0);
+  const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8);
+  const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1);
+  return horizontal_add_uint16x8(p0);
 }
 
 static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
-                                  const uint16x4_t dc) {
-  uint16x8x2_t dc_dup;
+                                  const uint16x8_t dc) {
   int i;
-  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
   for (i = 0; i < 16; ++i, dst += stride) {
-    vst2q_u16(dst, dc_dup);
+    vst1q_u16(dst + 0, dc);
+    vst1q_u16(dst + 8, dc);
   }
 }
 
 void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
-  const uint16x8x2_t a = vld2q_u16(above);
-  const uint16x8x2_t l = vld2q_u16(left);
-  const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]);
-  const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]);
+  const uint16x8_t a0 = vld1q_u16(above + 0);
+  const uint16x8_t a1 = vld1q_u16(above + 8);
+  const uint16x8_t l0 = vld1q_u16(left + 0);
+  const uint16x8_t l1 = vld1q_u16(left + 8);
+  const uint16x8_t pa = vaddq_u16(a0, a1);
+  const uint16x8_t pl = vaddq_u16(l0, l1);
   const uint16x8_t pal0 = vaddq_u16(pa, pl);
-  uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
-  uint32x2_t sum;
-  uint16x4_t dc;
+  const uint32_t sum = horizontal_add_uint16x8(pal0);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
   (void)bd;
-  pal1 = vpadd_u16(pal1, pal1);
-  sum = vpaddl_u16(pal1);
-  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
   dc_store_16x16(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_16(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 4);
+  const uint16_t sum = dc_sum_16(left);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)above;
   (void)bd;
   dc_store_16x16(dst, stride, dc);
@@ -191,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_16(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 4);
+  const uint16_t sum = dc_sum_16(above);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)left;
   (void)bd;
   dc_store_16x16(dst, stride, dc);
@@ -201,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_16x16(dst, stride, dc);
@@ -210,56 +195,58 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 32x32
 
-static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
-  const uint16x8x4_t r = vld4q_u16(ref);
-  const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]);
-  const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]);
+static INLINE uint32_t dc_sum_32(const uint16_t *ref) {
+  const uint16x8_t r0 = vld1q_u16(ref + 0);
+  const uint16x8_t r1 = vld1q_u16(ref + 8);
+  const uint16x8_t r2 = vld1q_u16(ref + 16);
+  const uint16x8_t r3 = vld1q_u16(ref + 24);
+  const uint16x8_t p0 = vaddq_u16(r0, r1);
+  const uint16x8_t p1 = vaddq_u16(r2, r3);
   const uint16x8_t p2 = vaddq_u16(p0, p1);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-  sum = vpadd_u16(sum, sum);
-  return vpaddl_u16(sum);
+  return horizontal_add_uint16x8(p2);
 }
 
 static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
-                                  const uint16x4_t dc) {
-  uint16x8x2_t dc_dup;
+                                  const uint16x8_t dc) {
   int i;
-  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
-
   for (i = 0; i < 32; ++i) {
-    vst2q_u16(dst, dc_dup);
-    dst += 16;
-    vst2q_u16(dst, dc_dup);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, dc);
+    vst1q_u16(dst + 8, dc);
+    vst1q_u16(dst + 16, dc);
+    vst1q_u16(dst + 24, dc);
+    dst += stride;
   }
 }
 
 void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
-  const uint16x8x4_t a = vld4q_u16(above);
-  const uint16x8x4_t l = vld4q_u16(left);
-  const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]);
-  const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]);
-  const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]);
-  const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]);
+  const uint16x8_t a0 = vld1q_u16(above + 0);
+  const uint16x8_t a1 = vld1q_u16(above + 8);
+  const uint16x8_t a2 = vld1q_u16(above + 16);
+  const uint16x8_t a3 = vld1q_u16(above + 24);
+  const uint16x8_t l0 = vld1q_u16(left + 0);
+  const uint16x8_t l1 = vld1q_u16(left + 8);
+  const uint16x8_t l2 = vld1q_u16(left + 16);
+  const uint16x8_t l3 = vld1q_u16(left + 24);
+  const uint16x8_t pa0 = vaddq_u16(a0, a1);
+  const uint16x8_t pa1 = vaddq_u16(a2, a3);
+  const uint16x8_t pl0 = vaddq_u16(l0, l1);
+  const uint16x8_t pl1 = vaddq_u16(l2, l3);
   const uint16x8_t pa = vaddq_u16(pa0, pa1);
   const uint16x8_t pl = vaddq_u16(pl0, pl1);
   const uint16x8_t pal0 = vaddq_u16(pa, pl);
-  const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
-  uint32x2_t sum = vpaddl_u16(pal1);
-  uint16x4_t dc;
+  const uint32_t sum = horizontal_add_uint16x8(pal0);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0);
   (void)bd;
-  sum = vpadd_u32(sum, sum);
-  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6));
   dc_store_32x32(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above,
                                              const uint16_t *left, int bd) {
-  const uint32x2_t sum = dc_sum_32(left);
-  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  const uint32_t sum = dc_sum_32(left);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
   (void)above;
   (void)bd;
   dc_store_32x32(dst, stride, dc);
@@ -268,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint32x2_t sum = dc_sum_32(above);
-  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  const uint32_t sum = dc_sum_32(above);
+  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
   (void)left;
   (void)bd;
   dc_store_32x32(dst, stride, dc);
@@ -278,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_32x32(dst, stride, dc);
@@ -289,166 +276,1304 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8_t ABCDEFGH = vld1q_u16(above);
-  const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
-  const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
-  const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
-  const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
-  const uint16x4_t avg2_low = vget_low_u16(avg2);
-  const uint16x4_t avg2_high = vget_high_u16(avg2);
-  const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
-  const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
-  const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
+  uint16x8_t a0, a1, a2, d0;
+  uint16_t a7;
   (void)left;
   (void)bd;
-  vst1_u16(dst, avg2_low);
-  dst += stride;
-  vst1_u16(dst, r1);
-  dst += stride;
-  vst1_u16(dst, r2);
-  dst += stride;
-  vst1_u16(dst, r3);
-  vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
-}
 
-static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
-                               const uint16x8_t above_right, uint16x8_t *row) {
-  *row = vextq_u16(*row, above_right, 1);
-  vst1q_u16(*dst, *row);
-  *dst += stride;
+  a0 = vld1q_u16(above);
+  a7 = above[7];
+
+  // [ above[1], ..., above[6], x, x ]
+  a1 = vextq_u16(a0, a0, 1);
+  // [ above[2], ..., above[7], x, x ]
+  a2 = vextq_u16(a0, a0, 2);
+
+  // d0[0] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[5] = AVG3(above[5], above[6], above[7]);
+  // d0[6] = x (don't care)
+  // d0[7] = x (don't care)
+  d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want:
+  // stride=0 [ d0[0], d0[1], d0[2],    d0[3] ]
+  // stride=1 [ d0[1], d0[2], d0[3],    d0[4] ]
+  // stride=2 [ d0[2], d0[3], d0[4],    d0[5] ]
+  // stride=2 [ d0[3], d0[4], d0[5], above[7] ]
+  vst1_u16(dst + 0 * stride, vget_low_u16(d0));
+  vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1)));
+  vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2)));
+  vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3)));
+
+  // We stored d0[6] above, so fixup into above[7].
+  dst[3 * stride + 3] = a7;
 }
 
 void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8_t A0 = vld1q_u16(above);
-  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
-  const uint16x8_t A1 = vld1q_u16(above + 1);
-  const uint16x8_t A2 = vld1q_u16(above + 2);
-  const uint16x8_t avg1 = vhaddq_u16(A0, A2);
-  uint16x8_t row = vrhaddq_u16(avg1, A1);
+  uint16x8_t ax0, a0, a1, a7, d0;
   (void)left;
   (void)bd;
 
-  vst1q_u16(dst, row);
-  dst += stride;
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  vst1q_u16(dst, above_right);
-}
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_dup_u16(above + 7);
 
-static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
-                                const uint16x8_t above_right, uint16x8_t *row_0,
-                                uint16x8_t *row_1) {
-  *row_0 = vextq_u16(*row_0, *row_1, 1);
-  *row_1 = vextq_u16(*row_1, above_right, 1);
-  vst1q_u16(*dst, *row_0);
-  *dst += 8;
-  vst1q_u16(*dst, *row_1);
-  *dst += stride - 8;
+  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
+  // shift in above[7] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[7] = AVG3(above[6], above[7], above[8]);
+  d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[7].
+  vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1));
+  vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2));
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7));
+  vst1q_u16(dst + 7 * stride, a7);
 }
 
 void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
-  const uint16x8_t A0_0 = vld1q_u16(above);
-  const uint16x8_t A0_1 = vld1q_u16(above + 8);
-  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
-  const uint16x8_t A1_0 = vld1q_u16(above + 1);
-  const uint16x8_t A1_1 = vld1q_u16(above + 9);
-  const uint16x8_t A2_0 = vld1q_u16(above + 2);
-  const uint16x8_t A2_1 = vld1q_u16(above + 10);
-  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
-  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
-  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
-  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
+  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2];
   (void)left;
   (void)bd;
 
-  vst1q_u16(dst, row_0);
-  vst1q_u16(dst + 8, row_1);
-  dst += stride;
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
-  vst1q_u16(dst, above_right);
-  vst1q_u16(dst + 8, above_right);
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a15 = vld1q_dup_u16(above + 15);
+
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  // We have one unused lane here to leave room to shift in above[15] in the
+  // last lane:
+  // d0[0][1] = x (don't care)
+  // d0[0][1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[0][7] = AVG3(above[6], above[7], above[8]);
+  // d0[1][0] = AVG3(above[7], above[8], above[9]);
+  // ...
+  // d0[1][7] = AVG3(above[14], above[15], above[16]);
+  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+
+  // Incrementally shift in duplicates of above[15].
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7));
+  vst1q_u16(dst + 7 * stride + 0, d0[1]);
+  vst1q_u16(dst + 7 * stride + 8, a15);
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1));
+  vst1q_u16(dst + 8 * stride + 8, a15);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2));
+  vst1q_u16(dst + 9 * stride + 8, a15);
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3));
+  vst1q_u16(dst + 10 * stride + 8, a15);
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4));
+  vst1q_u16(dst + 11 * stride + 8, a15);
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5));
+  vst1q_u16(dst + 12 * stride + 8, a15);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6));
+  vst1q_u16(dst + 13 * stride + 8, a15);
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, a15);
+  vst1q_u16(dst + 15 * stride + 8, a15);
 }
 
 void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
-  const uint16x8_t A0_0 = vld1q_u16(above);
-  const uint16x8_t A0_1 = vld1q_u16(above + 8);
-  const uint16x8_t A0_2 = vld1q_u16(above + 16);
-  const uint16x8_t A0_3 = vld1q_u16(above + 24);
-  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
-  const uint16x8_t A1_0 = vld1q_u16(above + 1);
-  const uint16x8_t A1_1 = vld1q_u16(above + 9);
-  const uint16x8_t A1_2 = vld1q_u16(above + 17);
-  const uint16x8_t A1_3 = vld1q_u16(above + 25);
-  const uint16x8_t A2_0 = vld1q_u16(above + 2);
-  const uint16x8_t A2_1 = vld1q_u16(above + 10);
-  const uint16x8_t A2_2 = vld1q_u16(above + 18);
-  const uint16x8_t A2_3 = vld1q_u16(above + 26);
-  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
-  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
-  const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
-  const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
-  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
-  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
-  uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
-  uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
+  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4];
   int i;
   (void)left;
   (void)bd;
 
-  vst1q_u16(dst, row_0);
-  dst += 8;
-  vst1q_u16(dst, row_1);
-  dst += 8;
-  vst1q_u16(dst, row_2);
-  dst += 8;
-  vst1q_u16(dst, row_3);
-  dst += stride - 24;
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a17 = vld1q_u16(above + 17);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  a25 = vld1q_u16(above + 25);
+  a31 = vld1q_dup_u16(above + 31);
 
-  for (i = 0; i < 30; ++i) {
-    row_0 = vextq_u16(row_0, row_1, 1);
-    row_1 = vextq_u16(row_1, row_2, 1);
-    row_2 = vextq_u16(row_2, row_3, 1);
-    row_3 = vextq_u16(row_3, above_right, 1);
-    vst1q_u16(dst, row_0);
-    dst += 8;
-    vst1q_u16(dst, row_1);
-    dst += 8;
-    vst1q_u16(dst, row_2);
-    dst += 8;
-    vst1q_u16(dst, row_3);
-    dst += stride - 24;
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vextq_u16(a0, a0, 7);
+
+  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
+  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
+  d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16);
+  d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24);
+
+  for (i = 0; i < 32; ++i) {
+    d0[0] = vextq_u16(d0[0], d0[1], 1);
+    d0[1] = vextq_u16(d0[1], d0[2], 1);
+    d0[2] = vextq_u16(d0[2], d0[3], 1);
+    d0[3] = vextq_u16(d0[3], a31, 1);
+    vst1q_u16(dst + 0, d0[0]);
+    vst1q_u16(dst + 8, d0[1]);
+    vst1q_u16(dst + 16, d0[2]);
+    vst1q_u16(dst + 24, d0[3]);
+    dst += stride;
   }
+}
 
-  vst1q_u16(dst, above_right);
-  dst += 8;
-  vst1q_u16(dst, above_right);
-  dst += 8;
-  vst1q_u16(dst, above_right);
-  dst += 8;
-  vst1q_u16(dst, above_right);
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1_u16(above + 0);
+  a1 = vld1_u16(above + 1);
+  a2 = vld1_u16(above + 2);
+  a3 = vld1_u16(above + 3);
+
+  d0 = vrhadd_u16(a0, a1);
+  d1 = vrhadd_u16(vhadd_u16(a0, a2), a1);
+  d2 = vrhadd_u16(a1, a2);
+  d3 = vrhadd_u16(vhadd_u16(a1, a3), a2);
+
+  // Note that here we are performing a full avg calculation for the final
+  // elements rather than storing a duplicate of above[3], which differs
+  // (correctly) from the general scheme employed by the bs={8,16,32}
+  // implementations in order to match the original C implementation.
+  vst1_u16(dst + 0 * stride, d0);
+  vst1_u16(dst + 1 * stride, d1);
+  vst1_u16(dst + 2 * stride, d2);
+  vst1_u16(dst + 3 * stride, d3);
+}
+
+void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a7 = vld1q_dup_u16(above + 7);
+
+  d0 = vrhaddq_u16(a0, a1);
+  d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+
+  // We want to store:
+  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7] ]
+  // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7] ]
+  // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7] ]
+  // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7] ]
+  // stride=6 [ d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7],  a[7] ]
+  // stride=7 [ d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7],  a[7] ]
+  // Note in particular that d0[7] and d1[7] are only ever referenced in the
+  // stride=0 and stride=1 cases respectively, and in later strides are
+  // replaced by a copy of above[7]. These are equivalent if for i>7,
+  // above[i]==above[7], however that is not always the case.
+
+  // Strip out d0[7] and d1[7] so that we can replace it with an additional
+  // copy of above[7], the first vector here doesn't matter so just reuse
+  // d0/d1.
+  d0_ext = vextq_u16(d0, d0, 7);
+  d1_ext = vextq_u16(d1, d1, 7);
+
+  // Shuffle in duplicates of above[7] and store.
+  vst1q_u16(dst + 0 * stride, d0);
+  vst1q_u16(dst + 1 * stride, d1);
+  vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
+  vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
+}
+
+void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a15 = vld1q_dup_u16(above + 15);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[7], the first vector here doesn't matter so just
+  // reuse the same vector.
+  d0_ext = vextq_u16(d0[1], d0[1], 7);
+  d1_ext = vextq_u16(d1[1], d1[1], 7);
+
+  // Shuffle in duplicates of above[7] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 14 * stride + 8, a15);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 15 * stride + 8, a15);
+}
+
+void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
+  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
+      d1[4], d0_ext, d1_ext;
+  (void)left;
+  (void)bd;
+
+  a0 = vld1q_u16(above + 0);
+  a1 = vld1q_u16(above + 1);
+  a2 = vld1q_u16(above + 2);
+  a8 = vld1q_u16(above + 8);
+  a9 = vld1q_u16(above + 9);
+  a10 = vld1q_u16(above + 10);
+  a16 = vld1q_u16(above + 16);
+  a17 = vld1q_u16(above + 17);
+  a18 = vld1q_u16(above + 18);
+  a24 = vld1q_u16(above + 24);
+  a25 = vld1q_u16(above + 25);
+  a26 = vld1q_u16(above + 26);
+  a31 = vld1q_dup_u16(above + 31);
+
+  d0[0] = vrhaddq_u16(a0, a1);
+  d0[1] = vrhaddq_u16(a8, a9);
+  d0[2] = vrhaddq_u16(a16, a17);
+  d0[3] = vrhaddq_u16(a24, a25);
+  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);
+
+  // Strip out the final element of d0/d1 so that we can replace it with an
+  // additional copy of above[7], the first vector here doesn't matter so just
+  // reuse the same vector.
+  d0_ext = vextq_u16(d0[3], d0[3], 7);
+  d1_ext = vextq_u16(d1[3], d1[3], 7);
+
+  // Shuffle in duplicates of above[7] and store. Note that cases involving
+  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
+  // element from above.
+
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 0 * stride + 16, d0[2]);
+  vst1q_u16(dst + 0 * stride + 24, d0[3]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 1 * stride + 16, d1[2]);
+  vst1q_u16(dst + 1 * stride + 24, d1[3]);
+
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
+  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3));
+
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
+  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5));
+
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
+  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7));
+
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 14 * stride + 24, a31);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 15 * stride + 24, a31);
+
+  vst1q_u16(dst + 16 * stride + 0, d0[1]);
+  vst1q_u16(dst + 16 * stride + 8, d0[2]);
+  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1));
+  vst1q_u16(dst + 16 * stride + 24, a31);
+  vst1q_u16(dst + 17 * stride + 0, d1[1]);
+  vst1q_u16(dst + 17 * stride + 8, d1[2]);
+  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1));
+  vst1q_u16(dst + 17 * stride + 24, a31);
+
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2));
+  vst1q_u16(dst + 18 * stride + 24, a31);
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2));
+  vst1q_u16(dst + 19 * stride + 24, a31);
+
+  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
+  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3));
+  vst1q_u16(dst + 20 * stride + 24, a31);
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3));
+  vst1q_u16(dst + 21 * stride + 24, a31);
+
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4));
+  vst1q_u16(dst + 22 * stride + 24, a31);
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4));
+  vst1q_u16(dst + 23 * stride + 24, a31);
+
+  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
+  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5));
+  vst1q_u16(dst + 24 * stride + 24, a31);
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5));
+  vst1q_u16(dst + 25 * stride + 24, a31);
+
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6));
+  vst1q_u16(dst + 26 * stride + 24, a31);
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6));
+  vst1q_u16(dst + 27 * stride + 24, a31);
+
+  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
+  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7));
+  vst1q_u16(dst + 28 * stride + 24, a31);
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7));
+  vst1q_u16(dst + 29 * stride + 24, a31);
+
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 30 * stride + 16, a31);
+  vst1q_u16(dst + 30 * stride + 24, a31);
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 31 * stride + 16, a31);
+  vst1q_u16(dst + 31 * stride + 24, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
+  (void)bd;
+
+  az = vld1_u16(above - 1);
+  a0 = vld1_u16(above + 0);
+  // [ left[0], above[-1], above[0], above[1] ]
+  l0az = vext_u16(vld1_dup_u16(left), az, 3);
+
+  l0 = vld1_u16(left + 0);
+  // The last lane here is unused, reading left[4] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], left[2], left[3], x ]
+  l1 = vext_u16(l0, l0, 1);
+  // [ above[-1], left[0], left[1], left[2] ]
+  azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
+
+  d0 = vrhadd_u16(az, a0);
+  d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
+
+  col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
+  col0_even = vdup_lane_u16(col0, 0);
+  col0_odd = vdup_lane_u16(col0, 1);
+
+  vst1_u16(dst + 0 * stride, d0);
+  vst1_u16(dst + 1 * stride, d1);
+  vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3));
+  vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3));
+}
+
+void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  // [ left[0], above[-1], ..., left[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vextq_u16(l0, l0, 1);
+  // [ above[-1], left[0], ..., left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], above[0])
+  // ...
+  // d0[7] = AVG2(above[6], above[7])
+  d0 = vrhaddq_u16(az, a0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vector to put the elements to be shifted in
+  // at the end:
+  // col0[7] = AVG3(above[-1], left[0], left[1])
+  // col0[6] = AVG3(left[0], left[1], left[2])
+  // ...
+  // col0[0] = AVG3(left[6], left[7], left[8])
+  col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  col0 = vrev64q_u16(vextq_u16(col0, col0, 4));
+
+  // We don't care about the first parameter to this uzp since we only ever use
+  // the high three elements, we just use col0 again since it is already
+  // available:
+  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+  col0_even = vuzpq_u16(col0, col0).val[1];
+  col0_odd = vuzpq_u16(col0, col0).val[0];
+
+  // Incrementally shift more elements from col0 into d0/1:
+  // stride=0 [ d0[0],   d0[1],   d0[2],   d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0],   d1[1],   d1[2],   d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ col0[7], d0[0],   d0[1],   d0[2], d0[3], d0[4], d0[5], d0[6] ]
+  // stride=3 [ col0[6], d1[0],   d1[1],   d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=4 [ col0[5], col0[7], d0[0],   d0[1], d0[2], d0[3], d0[4], d0[5] ]
+  // stride=5 [ col0[4], col0[6], d1[0],   d1[1], d1[2], d1[3], d1[4], d1[5] ]
+  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  vst1q_u16(dst + 0 * stride, d0);
+  vst1q_u16(dst + 1 * stride, d1);
+  vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7));
+  vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7));
+  vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6));
+  vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6));
+  vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5));
+  vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5));
+}
+
+void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo,
+      col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi;
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  // [ left[0], above[-1], ..., left[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[8] to avoid needing to
+  // materialize a zero:
+  // [ left[9], ... , left[15], x ]
+  l9 = vextq_u16(l8, l8, 1);
+  // [ above[-1], left[0], ..., left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0_lo = vrhaddq_u16(az, a0);
+  d0_hi = vrhaddq_u16(a7, a8);
+  d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+
+  col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+
+  // Reverse within each vector, then swap the array indices in the uzp to
+  // complete the reversal across all 16 elements.
+  col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4));
+  col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4));
+  col0_even = vuzpq_u16(col0_hi, col0_lo).val[1];
+  col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0];
+
+  vst1q_u16(dst + 0 * stride + 0, d0_lo);
+  vst1q_u16(dst + 0 * stride + 8, d0_hi);
+  vst1q_u16(dst + 1 * stride + 0, d1_lo);
+  vst1q_u16(dst + 1 * stride + 8, d1_hi);
+
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6));
+
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4));
+
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2));
+
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1));
+}
+
+void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4],
+      col0_even[2], col0_odd[2];
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a14 = vld1q_u16(above + 14);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a22 = vld1q_u16(above + 22);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  // [ left[0], above[-1], ..., left[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  l9 = vld1q_u16(left + 9);
+  l15 = vld1q_u16(left + 15);
+  l16 = vld1q_u16(left + 16);
+  l17 = vld1q_u16(left + 17);
+  l23 = vld1q_u16(left + 23);
+  l24 = vld1q_u16(left + 24);
+  l25 = vld1q_u16(left + 25);
+  // The last lane here is unused, reading left[32] could cause a buffer
+  // over-read, so just fill with a duplicate of left[24] to avoid needing to
+  // materialize a zero:
+  // [ left[25], ... , left[31], x ]
+  l25 = vextq_u16(l24, l24, 1);
+  // [ above[-1], left[0], ..., left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0[0] = vrhaddq_u16(az, a0);
+  d0[1] = vrhaddq_u16(a7, a8);
+  d0[2] = vrhaddq_u16(a15, a16);
+  d0[3] = vrhaddq_u16(a23, a24);
+  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+  col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+  col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+  col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+  // Reverse within each vector, then swap the array indices in both the uzp
+  // and the col0_{even,odd} assignment to complete the reversal across all
+  // 32-elements.
+  col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4));
+  col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4));
+  col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4));
+  col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4));
+
+  col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1];
+  col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1];
+  col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0];
+  col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0];
+
+  vst1q_u16(dst + 0 * stride + 0, d0[0]);
+  vst1q_u16(dst + 0 * stride + 8, d0[1]);
+  vst1q_u16(dst + 0 * stride + 16, d0[2]);
+  vst1q_u16(dst + 0 * stride + 24, d0[3]);
+  vst1q_u16(dst + 1 * stride + 0, d1[0]);
+  vst1q_u16(dst + 1 * stride + 8, d1[1]);
+  vst1q_u16(dst + 1 * stride + 16, d1[2]);
+  vst1q_u16(dst + 1 * stride + 24, d1[3]);
+
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6));
+
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4));
+
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2));
+
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+  vst1q_u16(dst + 16 * stride + 0, col0_even[1]);
+  vst1q_u16(dst + 16 * stride + 8, d0[0]);
+  vst1q_u16(dst + 16 * stride + 16, d0[1]);
+  vst1q_u16(dst + 16 * stride + 24, d0[2]);
+  vst1q_u16(dst + 17 * stride + 0, col0_odd[1]);
+  vst1q_u16(dst + 17 * stride + 8, d1[0]);
+  vst1q_u16(dst + 17 * stride + 16, d1[1]);
+  vst1q_u16(dst + 17 * stride + 24, d1[2]);
+
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7));
+  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7));
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+
+  vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6));
+  vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6));
+  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6));
+  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6));
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6));
+  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6));
+
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5));
+  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5));
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+
+  vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4));
+  vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4));
+  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4));
+  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4));
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4));
+  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4));
+
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3));
+  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3));
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+
+  vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2));
+  vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2));
+  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2));
+  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2));
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2));
+  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2));
+
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1));
+  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1));
+  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1));
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1));
+  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+  uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi;
+  (void)bd;
+
+  az = vld1_u16(above - 1);
+  a0 = vld1_u16(above + 0);
+  // [ left[0], above[-1], above[0], above[1] ]
+  l0az = vext_u16(vld1_dup_u16(left), az, 3);
+
+  l0 = vld1_u16(left);
+  // The last lane here is unused, reading left[4] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], left[2], left[3], x ]
+  l1 = vext_u16(l0, l0, 1);
+  // [ above[-1], left[0], left[1], left[2] ]
+  azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
+
+  d0 = vrhadd_u16(azl0, l0);
+  d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
+  d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
+
+  d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0];
+  d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1];
+
+  // Incrementally shift more elements from d0/d2 reversed into d1:
+  // stride=0 [ d0[0], d1[0], d1[1], d1[2] ]
+  // stride=1 [ d0[1], d2[0], d0[0], d1[0] ]
+  // stride=2 [ d0[2], d2[1], d0[1], d2[0] ]
+  // stride=3 [ d0[3], d2[2], d0[2], d2[1] ]
+  vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3));
+  vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1));
+  vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3));
+  vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1));
+}
+
+void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo,
+      d20_hi;
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vextq_u16(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], left[0])
+  // d0[1] = AVG2(left[0], left[1])
+  // ...
+  // d0[7] = AVG2(left[6], left[7])
+  d0 = vrhaddq_u16(azl0, l0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+
+  // d2[0] = AVG3(above[-1], left[0], left[1])
+  // d2[1] = AVG3(left[0], left[1], left[2])
+  // ...
+  // d2[7] = AVG3(left[6], left[7], left[8])
+  d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vectors to put the elements to be shifted
+  // in at the end:
+  d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4));
+  d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4));
+
+  d20_lo = vzipq_u16(d2_rev, d0_rev).val[0];
+  d20_hi = vzipq_u16(d2_rev, d0_rev).val[1];
+
+  // Incrementally shift more elements from d0/d2 reversed into d1:
+  // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
+  // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
+  // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
+  // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
+  // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
+  // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
+  vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7));
+  vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5));
+  vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3));
+  vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1));
+  vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7));
+  vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5));
+  vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3));
+  vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1));
+}
+
+void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2],
+      d2[2], d20[4];
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[8] to avoid needing to
+  // materialize a zero:
+  // [ left[9], ... , left[15], x ]
+  l9 = vextq_u16(l8, l8, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0[0] = vrhaddq_u16(azl0, l0);
+  d0[1] = vrhaddq_u16(l7, l8);
+  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+  d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+
+  d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+  d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+  d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+  d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+
+  d20[0] = vzipq_u16(d2[1], d0[1]).val[0];
+  d20[1] = vzipq_u16(d2[1], d0[1]).val[1];
+  d20[2] = vzipq_u16(d2[0], d0[0]).val[0];
+  d20[3] = vzipq_u16(d2[0], d0[0]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+}
+
+void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
+  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
+      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8];
+  (void)bd;
+
+  az = vld1q_u16(above - 1);
+  a0 = vld1q_u16(above + 0);
+  a6 = vld1q_u16(above + 6);
+  a7 = vld1q_u16(above + 7);
+  a8 = vld1q_u16(above + 8);
+  a14 = vld1q_u16(above + 14);
+  a15 = vld1q_u16(above + 15);
+  a16 = vld1q_u16(above + 16);
+  a22 = vld1q_u16(above + 22);
+  a23 = vld1q_u16(above + 23);
+  a24 = vld1q_u16(above + 24);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l7 = vld1q_u16(left + 7);
+  l8 = vld1q_u16(left + 8);
+  l9 = vld1q_u16(left + 9);
+  l15 = vld1q_u16(left + 15);
+  l16 = vld1q_u16(left + 16);
+  l17 = vld1q_u16(left + 17);
+  l23 = vld1q_u16(left + 23);
+  l24 = vld1q_u16(left + 24);
+  // The last lane here is unused, reading left[32] could cause a buffer
+  // over-read, so just fill with a duplicate of left[24] to avoid needing to
+  // materialize a zero:
+  // [ left[25], ... , left[31], x ]
+  l25 = vextq_u16(l24, l24, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
+
+  d0[0] = vrhaddq_u16(azl0, l0);
+  d0[1] = vrhaddq_u16(l7, l8);
+  d0[2] = vrhaddq_u16(l15, l16);
+  d0[3] = vrhaddq_u16(l23, l24);
+
+  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
+  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
+  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
+  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
+
+  d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
+  d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
+  d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
+  d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
+
+  d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
+  d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
+  d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4));
+  d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4));
+  d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
+  d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
+  d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4));
+  d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4));
+
+  d20[0] = vzipq_u16(d2[3], d0[3]).val[0];
+  d20[1] = vzipq_u16(d2[3], d0[3]).val[1];
+  d20[2] = vzipq_u16(d2[2], d0[2]).val[0];
+  d20[3] = vzipq_u16(d2[2], d0[2]).val[1];
+  d20[4] = vzipq_u16(d2[1], d0[1]).val[0];
+  d20[5] = vzipq_u16(d2[1], d0[1]).val[1];
+  d20[6] = vzipq_u16(d2[0], d0[0]).val[0];
+  d20[7] = vzipq_u16(d2[0], d0[0]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7));
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1));
+
+  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7));
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1));
+
+  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7));
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1));
+
+  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7));
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1));
+  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1));
+
+  vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7));
+  vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5));
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3));
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1));
+  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1));
+
+  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7));
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5));
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3));
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1));
+  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1));
+
+  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7));
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5));
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3));
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1));
+  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1));
+
+  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7));
+  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7));
+  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7));
+  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7));
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5));
+  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5));
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3));
+  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3));
+  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3));
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1));
+  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1));
+  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1));
 }
 
 // -----------------------------------------------------------------------------
@@ -696,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
 
 //------------------------------------------------------------------------------
 
+void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi;
+  (void)above;
+  (void)bd;
+
+  l0 = vld1_u16(left + 0);
+  l3 = vld1_dup_u16(left + 3);
+
+  // [ left[1], left[2], left[3], left[3] ]
+  l1 = vext_u16(l0, l3, 1);
+  // [ left[2], left[3], left[3], left[3] ]
+  l2 = vext_u16(l0, l3, 2);
+
+  c0 = vrhadd_u16(l0, l1);
+  c1 = vrhadd_u16(vhadd_u16(l0, l2), l1);
+
+  c01_lo = vzip_u16(c0, c1).val[0];
+  c01_hi = vzip_u16(c0, c1).val[1];
+
+  // stride=0 [ c0[0], c1[0],   c0[1],   c1[1] ]
+  // stride=1 [ c0[1], c1[1],   c0[2],   c1[2] ]
+  // stride=2 [ c0[2], c1[2],   c0[3],   c1[3] ]
+  // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+  vst1_u16(dst + 0 * stride, c01_lo);
+  vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2));
+  vst1_u16(dst + 2 * stride, c01_hi);
+  vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2));
+}
+
+void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi;
+  (void)above;
+  (void)bd;
+
+  l0 = vld1q_u16(left + 0);
+  l7 = vld1q_dup_u16(left + 7);
+
+  // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+  l1 = vextq_u16(l0, l7, 1);
+  // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+  l2 = vextq_u16(l0, l7, 2);
+
+  c0 = vrhaddq_u16(l0, l1);
+  c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+
+  c01_lo = vzipq_u16(c0, c1).val[0];
+  c01_hi = vzipq_u16(c0, c1).val[1];
+
+  vst1q_u16(dst + 0 * stride, c01_lo);
+  vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2));
+  vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4));
+  vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6));
+  vst1q_u16(dst + 4 * stride, c01_hi);
+  vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2));
+  vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4));
+  vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6));
+}
+
+void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4];
+  (void)above;
+  (void)bd;
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l2 = vld1q_u16(left + 2);
+  l8 = vld1q_u16(left + 8);
+  l15 = vld1q_dup_u16(left + 15);
+
+  l9 = vextq_u16(l8, l15, 1);
+  l10 = vextq_u16(l8, l15, 2);
+
+  c0[0] = vrhaddq_u16(l0, l1);
+  c0[1] = vrhaddq_u16(l8, l9);
+  c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+  c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+
+  c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+  c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+  c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+  c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, c01[0]);
+  vst1q_u16(dst + 0 * stride + 8, c01[1]);
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+
+  vst1q_u16(dst + 4 * stride + 0, c01[1]);
+  vst1q_u16(dst + 4 * stride + 8, c01[2]);
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+
+  vst1q_u16(dst + 8 * stride + 0, c01[2]);
+  vst1q_u16(dst + 8 * stride + 8, c01[3]);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6));
+
+  vst1q_u16(dst + 12 * stride + 0, c01[3]);
+  vst1q_u16(dst + 12 * stride + 8, l15);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2));
+  vst1q_u16(dst + 13 * stride + 8, l15);
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4));
+  vst1q_u16(dst + 14 * stride + 8, l15);
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6));
+  vst1q_u16(dst + 15 * stride + 8, l15);
+}
+
+void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4],
+      c1[4], c01[8];
+  (void)above;
+  (void)bd;
+
+  l0 = vld1q_u16(left + 0);
+  l1 = vld1q_u16(left + 1);
+  l2 = vld1q_u16(left + 2);
+  l8 = vld1q_u16(left + 8);
+  l9 = vld1q_u16(left + 9);
+  l10 = vld1q_u16(left + 10);
+  l16 = vld1q_u16(left + 16);
+  l17 = vld1q_u16(left + 17);
+  l18 = vld1q_u16(left + 18);
+  l24 = vld1q_u16(left + 24);
+  l31 = vld1q_dup_u16(left + 31);
+
+  l25 = vextq_u16(l24, l31, 1);
+  l26 = vextq_u16(l24, l31, 2);
+
+  c0[0] = vrhaddq_u16(l0, l1);
+  c0[1] = vrhaddq_u16(l8, l9);
+  c0[2] = vrhaddq_u16(l16, l17);
+  c0[3] = vrhaddq_u16(l24, l25);
+  c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
+  c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
+  c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17);
+  c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25);
+
+  c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
+  c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
+  c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
+  c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
+  c01[4] = vzipq_u16(c0[2], c1[2]).val[0];
+  c01[5] = vzipq_u16(c0[2], c1[2]).val[1];
+  c01[6] = vzipq_u16(c0[3], c1[3]).val[0];
+  c01[7] = vzipq_u16(c0[3], c1[3]).val[1];
+
+  vst1q_u16(dst + 0 * stride + 0, c01[0]);
+  vst1q_u16(dst + 0 * stride + 8, c01[1]);
+  vst1q_u16(dst + 0 * stride + 16, c01[2]);
+  vst1q_u16(dst + 0 * stride + 24, c01[3]);
+  vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
+  vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
+  vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
+  vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
+  vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6));
+
+  vst1q_u16(dst + 4 * stride + 0, c01[1]);
+  vst1q_u16(dst + 4 * stride + 8, c01[2]);
+  vst1q_u16(dst + 4 * stride + 16, c01[3]);
+  vst1q_u16(dst + 4 * stride + 24, c01[4]);
+  vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
+  vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
+  vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
+  vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6));
+  vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6));
+
+  vst1q_u16(dst + 8 * stride + 0, c01[2]);
+  vst1q_u16(dst + 8 * stride + 8, c01[3]);
+  vst1q_u16(dst + 8 * stride + 16, c01[4]);
+  vst1q_u16(dst + 8 * stride + 24, c01[5]);
+  vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
+  vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
+  vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
+  vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6));
+  vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6));
+  vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6));
+
+  vst1q_u16(dst + 12 * stride + 0, c01[3]);
+  vst1q_u16(dst + 12 * stride + 8, c01[4]);
+  vst1q_u16(dst + 12 * stride + 16, c01[5]);
+  vst1q_u16(dst + 12 * stride + 24, c01[6]);
+  vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2));
+  vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4));
+  vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6));
+  vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6));
+  vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6));
+  vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6));
+
+  vst1q_u16(dst + 16 * stride + 0, c01[4]);
+  vst1q_u16(dst + 16 * stride + 8, c01[5]);
+  vst1q_u16(dst + 16 * stride + 16, c01[6]);
+  vst1q_u16(dst + 16 * stride + 24, c01[7]);
+  vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2));
+  vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4));
+  vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6));
+  vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6));
+  vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6));
+  vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6));
+
+  vst1q_u16(dst + 20 * stride + 0, c01[5]);
+  vst1q_u16(dst + 20 * stride + 8, c01[6]);
+  vst1q_u16(dst + 20 * stride + 16, c01[7]);
+  vst1q_u16(dst + 20 * stride + 24, l31);
+  vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2));
+  vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4));
+  vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6));
+  vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6));
+  vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6));
+  vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6));
+
+  vst1q_u16(dst + 24 * stride + 0, c01[6]);
+  vst1q_u16(dst + 24 * stride + 8, c01[7]);
+  vst1q_u16(dst + 24 * stride + 16, l31);
+  vst1q_u16(dst + 24 * stride + 24, l31);
+  vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2));
+  vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4));
+  vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6));
+  vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6));
+  vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6));
+  vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6));
+
+  vst1q_u16(dst + 28 * stride + 0, c01[7]);
+  vst1q_u16(dst + 28 * stride + 8, l31);
+  vst1q_u16(dst + 28 * stride + 16, l31);
+  vst1q_u16(dst + 28 * stride + 24, l31);
+  vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2));
+  vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2));
+  vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4));
+  vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4));
+  vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6));
+  vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6));
+  vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6));
+  vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6));
+}
+
+//------------------------------------------------------------------------------
+
 void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
@@ -725,30 +2155,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row = vld2q_u16(above);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
   int i;
   (void)left;
   (void)bd;
 
-  for (i = 0; i < 16; i++, dst += stride) {
-    vst2q_u16(dst, row);
+  for (i = 0; i < 16; i++) {
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    dst += stride;
   }
 }
 
 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
-  const uint16x8x2_t row0 = vld2q_u16(above);
-  const uint16x8x2_t row1 = vld2q_u16(above + 16);
+  const uint16x8_t row0 = vld1q_u16(above + 0);
+  const uint16x8_t row1 = vld1q_u16(above + 8);
+  const uint16x8_t row2 = vld1q_u16(above + 16);
+  const uint16x8_t row3 = vld1q_u16(above + 24);
   int i;
   (void)left;
   (void)bd;
 
   for (i = 0; i < 32; i++) {
-    vst2q_u16(dst, row0);
-    dst += 16;
-    vst2q_u16(dst, row1);
-    dst += stride - 16;
+    vst1q_u16(dst + 0, row0);
+    vst1q_u16(dst + 8, row1);
+    vst1q_u16(dst + 16, row2);
+    vst1q_u16(dst + 24, row3);
+    dst += stride;
   }
 }
 
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
index 5530c6425b..8d6e8acc4c 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c
@@ -661,6 +661,17 @@ void vpx_highbd_lpf_vertical_8_dual_neon(
   vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd);
 }
 
+// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 static void lpf_horizontal_16_kernel(uint16_t *s, int p,
                                      const uint16x8_t blimit_vec,
                                      const uint16x8_t limit_vec,
@@ -723,6 +734,10 @@ static void lpf_vertical_16_kernel(uint16_t *s, int p,
   }
 }
 
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
 void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..c2ad34a695
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,300 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+    const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+    tran_low_t *dqcoeff_ptr) {
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+    const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+    const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+    int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+  const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+  const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+  const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+  const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+  // Calculate 2 masks of elements outside the bin
+  const int32x4_t zbin_mask_0 =
+      vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+  const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+      vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+  // Get the rounded values
+  const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+  const int32x4_t rounded_1 =
+      vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+  // (round * (quant << 15) * 2) >> 16 == (round * quant)
+  int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+  int32x4_t qcoeff_tmp_1 =
+      vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+  // Add rounded values
+  qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+  qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+  // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift)
+  qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+  qcoeff_tmp_1 =
+      vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+  // Restore the sign bit.
+  qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+  qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+  qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+  qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+  // Only keep the relevant coeffs
+  *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+  *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+                       const int32x4_t round, const int32x4_t quant,
+                       const int32x4_t quant_shift, const int32x4_t dequant) {
+  int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+  const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+  const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+  highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+                         &qcoeff_0, &qcoeff_1);
+
+  // Store the 32-bit qcoeffs
+  vst1q_s32(qcoeff_ptr, qcoeff_0);
+  vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+  // Calculate and store the dqcoeffs
+  dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+  dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+  highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+  return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  // High half has identical elements, but we can reconstruct it from the low
+  // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+  // vector
+  int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin));
+  int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round));
+  // Extend the quant, quant_shift vectors to ones of 32-bit elements
+  // scale to high-half, so we can use vqdmulhq_s32
+  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
+  int32x4_t quant_shift =
+      vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15);
+  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                               quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  n_coeffs -= 8;
+
+  {
+    zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+    round = vdupq_lane_s32(vget_low_s32(round), 1);
+    quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+    quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+    dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+    do {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                 round, quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+      n_coeffs -= 8;
+    } while (n_coeffs > 0);
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+    int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+  dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+  dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+    const int32x4_t quant, const int32x4_t quant_shift,
+    const int32x4_t dequant) {
+  int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
+  const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+  const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+  highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+                         &qcoeff_0, &qcoeff_1);
+
+  // Store the 32-bit qcoeffs
+  vst1q_s32(qcoeff_ptr, qcoeff_0);
+  vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+  // Calculate and store the dqcoeffs
+  dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+  dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+  highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+  return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  int i;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  // High half has identical elements, but we can reconstruct it from the low
+  // half by duplicating the 2nd element. So we only need to pass a 4x32-bit
+  // vector
+  int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1);
+  int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1);
+  // Extend the quant, quant_shift vectors to ones of 32-bit elements
+  // scale to high-half, so we can use vqdmulhq_s32
+  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
+  int32x4_t quant_shift =
+      vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16);
+  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                     round, quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  {
+    zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+    round = vdupq_lane_s32(vget_low_s32(round), 1);
+    quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+    quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+    dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+    for (i = 1; i < 32 * 32 / 8; ++i) {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+                                       round, quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+    }
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
new file mode 100644
index 0000000000..a6684b0534
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c
@@ -0,0 +1,273 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *const ref_ptr[4],
+                                         int ref_stride, uint32_t res[4],
+                                         int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+    uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+    uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+    uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+    uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+    sum[0] = vabal_u16(sum[0], s, r0);
+    sum[1] = vabal_u16(sum[1], s, r1);
+    sum[2] = vabal_u16(sum[2], s, r2);
+    sum[3] = vabal_u16(sum[3], s, r3);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *const ref_ptr[4],
+                                         int ref_stride, uint32_t res[4],
+                                         int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+    sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+    sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+    sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+    sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+  } while (++i < h);
+
+  sum_u32[0] = vpaddlq_u16(sum[0]);
+  sum_u32[1] = vpaddlq_u16(sum[1]);
+  sum_u32[2] = vpaddlq_u16(sum[2]);
+  sum_u32[3] = vpaddlq_u16(sum[3]);
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+                             uint32x4_t *const sad_sum) {
+  uint16x8_t abs_diff = vabdq_u16(src, ref);
+  *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *const ref_ptr[4],
+                                          int ref_stride, uint32_t res[4],
+                                          int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint16x8_t s0, s1;
+
+    s0 = vld1q_u16(src16_ptr + i * src_stride);
+    sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+    sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+    sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+    sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *const ref_ptr[4],
+                                         int ref_stride, uint32_t res[4], int w,
+                                         int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+  const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+  const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+  const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0, s1, s2, s3;
+
+      s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+      sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+      sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+      s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+      sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+      sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+      s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+      sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+                &sum_lo[0]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+                &sum_lo[1]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+                &sum_lo[2]);
+      sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+                &sum_lo[3]);
+
+      s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+      sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+                &sum_hi[0]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+                &sum_hi[1]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+                &sum_hi[2]);
+      sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+                &sum_hi[3]);
+
+      j += 32;
+    } while (j < w);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *const ref_ptr[4],
+                                          int ref_stride, uint32_t res[4],
+                                          int h) {
+  highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h);
+}
+
+static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *const ref_ptr[4],
+                                          int ref_stride, uint32_t res[4],
+                                          int h) {
+  highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h);
+}
+
+#define HBD_SAD_WXH_4D_NEON(w, h)                                            \
+  void vpx_highbd_sad##w##x##h##x4d_neon(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride,        \
+                              sad_array, (h));                               \
+  }
+
+HBD_SAD_WXH_4D_NEON(4, 4)
+HBD_SAD_WXH_4D_NEON(4, 8)
+
+HBD_SAD_WXH_4D_NEON(8, 4)
+HBD_SAD_WXH_4D_NEON(8, 8)
+HBD_SAD_WXH_4D_NEON(8, 16)
+
+HBD_SAD_WXH_4D_NEON(16, 8)
+HBD_SAD_WXH_4D_NEON(16, 16)
+HBD_SAD_WXH_4D_NEON(16, 32)
+
+HBD_SAD_WXH_4D_NEON(32, 16)
+HBD_SAD_WXH_4D_NEON(32, 32)
+HBD_SAD_WXH_4D_NEON(32, 64)
+
+HBD_SAD_WXH_4D_NEON(64, 32)
+HBD_SAD_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_WXH_4D_NEON
+
+#define HBD_SAD_SKIP_WXH_4D_NEON(w, h)                                        \
+  void vpx_highbd_sad_skip_##w##x##h##x4d_neon(                               \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],  \
+      int ref_stride, uint32_t sad_array[4]) {                                \
+    highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \
+                              sad_array, ((h) >> 1));                         \
+    sad_array[0] <<= 1;                                                       \
+    sad_array[1] <<= 1;                                                       \
+    sad_array[2] <<= 1;                                                       \
+    sad_array[3] <<= 1;                                                       \
+  }
+
+HBD_SAD_SKIP_WXH_4D_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_4D_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..af36431036
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,452 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr);
+    uint16x4_t r = vld1_u16(ref16_ptr);
+    sum = vabal_u16(sum, s, r);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+  } while (--h != 0);
+
+  return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16x8_t sum = vdupq_n_u16(0);
+  assert(h <= 16);
+
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr);
+    uint16x8_t r = vld1q_u16(ref16_ptr);
+    sum = vabaq_u16(sum, s, r);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+  } while (--h != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t sum_u32 = vdupq_n_u32(0);
+
+  // 'h_overflow' is the number of 16-wide rows we can process before 16-bit
+  // accumulators overflow. After hitting this limit accumulate into 32-bit
+  // elements. 65535 / 4095 ~= 16, so 16 16-wide rows using two accumulators.
+  const int h_overflow = 16;
+  // If block height 'h' is smaller than this limit, use 'h' instead.
+  const int h_limit = h < h_overflow ? h : h_overflow;
+  assert(h % h_limit == 0);
+
+  do {
+    uint16x8_t sum_u16[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+    int i = h_limit;
+    do {
+      uint16x8_t s0, s1, r0, r1;
+
+      s0 = vld1q_u16(src16_ptr);
+      r0 = vld1q_u16(ref16_ptr);
+      sum_u16[0] = vabaq_u16(sum_u16[0], s0, r0);
+
+      s1 = vld1q_u16(src16_ptr + 8);
+      r1 = vld1q_u16(ref16_ptr + 8);
+      sum_u16[1] = vabaq_u16(sum_u16[1], s1, r1);
+
+      src16_ptr += src_stride;
+      ref16_ptr += ref_stride;
+    } while (--i != 0);
+
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]);
+    h -= h_limit;
+  } while (h != 0);
+  return horizontal_add_uint32x4(sum_u32);
+}
+
+static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int w, int h,
+                                          const int h_overflow) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint32x4_t sum_u32 = vdupq_n_u32(0);
+  const int h_limit = h < h_overflow ? h : h_overflow;
+  assert(h % h_limit == 0);
+
+  do {
+    uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                              vdupq_n_u16(0) };
+    int i = 0;
+    do {
+      int j = 0;
+      do {
+        uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
+
+        s0 = vld1q_u16(src16_ptr + j);
+        r0 = vld1q_u16(ref16_ptr + j);
+        sum_u16[0] = vabaq_u16(sum_u16[0], s0, r0);
+
+        s1 = vld1q_u16(src16_ptr + j + 8);
+        r1 = vld1q_u16(ref16_ptr + j + 8);
+        sum_u16[1] = vabaq_u16(sum_u16[1], s1, r1);
+
+        s2 = vld1q_u16(src16_ptr + j + 16);
+        r2 = vld1q_u16(ref16_ptr + j + 16);
+        sum_u16[2] = vabaq_u16(sum_u16[2], s2, r2);
+
+        s3 = vld1q_u16(src16_ptr + j + 24);
+        r3 = vld1q_u16(ref16_ptr + j + 24);
+        sum_u16[3] = vabaq_u16(sum_u16[3], s3, r3);
+
+        j += 32;
+      } while (j < w);
+      src16_ptr += src_stride;
+      ref16_ptr += ref_stride;
+    } while (++i != h_limit);
+
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[2]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[3]);
+    h -= h_limit;
+  } while (h != 0);
+  return horizontal_add_uint32x4(sum_u32);
+}
+
+static INLINE uint32_t highbd_sad32xh_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h) {
+  // 'h_overflow' is the number of 32-wide rows we can process before 16-bit
+  // accumulators overflow. After hitting this limit accumulate into 32-bit
+  // elements. 65535 / 4095 ~= 16, so 16 32-wide rows using four accumulators.
+  const int h_overflow = 16;
+  return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                            h_overflow);
+}
+
+static INLINE uint32_t highbd_sad64xh_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h) {
+  // 'h_overflow' is the number of 64-wide rows we can process before 16-bit
+  // accumulators overflow. After hitting this limit accumulate into 32-bit
+  // elements. 65535 / 4095 ~= 16, so 8 64-wide rows using four accumulators.
+  const int h_overflow = 8;
+  return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+                            h_overflow);
+}
+
+#define HBD_SAD_WXH_NEON(w, h)                                            \
+  unsigned int vpx_highbd_sad##w##x##h##_neon(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,             \
+      int ref_stride) {                                                   \
+    return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+  }
+
+HBD_SAD_WXH_NEON(4, 4)
+HBD_SAD_WXH_NEON(4, 8)
+
+HBD_SAD_WXH_NEON(8, 4)
+HBD_SAD_WXH_NEON(8, 8)
+HBD_SAD_WXH_NEON(8, 16)
+
+HBD_SAD_WXH_NEON(16, 8)
+HBD_SAD_WXH_NEON(16, 16)
+HBD_SAD_WXH_NEON(16, 32)
+
+HBD_SAD_WXH_NEON(32, 16)
+HBD_SAD_WXH_NEON(32, 32)
+HBD_SAD_WXH_NEON(32, 64)
+
+HBD_SAD_WXH_NEON(64, 32)
+HBD_SAD_WXH_NEON(64, 64)
+
+#undef HBD_SAD_WXH_NEON
+
+#define HBD_SAD_SKIP_WXH_NEON(w, h)                             \
+  unsigned int vpx_highbd_sad_skip_##w##x##h##_neon(            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,   \
+      int ref_stride) {                                         \
+    return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \
+                                      2 * ref_stride, (h) / 2); \
+  }
+
+HBD_SAD_SKIP_WXH_NEON(4, 4)
+HBD_SAD_SKIP_WXH_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_NEON(8, 4)
+HBD_SAD_SKIP_WXH_NEON(8, 8)
+HBD_SAD_SKIP_WXH_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_NEON(16, 8)
+HBD_SAD_SKIP_WXH_NEON(16, 16)
+HBD_SAD_SKIP_WXH_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_NEON(32, 16)
+HBD_SAD_SKIP_WXH_NEON(32, 32)
+HBD_SAD_SKIP_WXH_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_NEON(64, 32)
+HBD_SAD_SKIP_WXH_NEON(64, 64)
+
+#undef HBD_SAD_SKIP_WXH_NEON
+
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum = vdupq_n_u32(0);
+
+  do {
+    uint16x4_t s = vld1_u16(src16_ptr);
+    uint16x4_t r = vld1_u16(ref16_ptr);
+    uint16x4_t p = vld1_u16(pred16_ptr);
+
+    uint16x4_t avg = vrhadd_u16(r, p);
+    sum = vabal_u16(sum, s, avg);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 4;
+  } while (--h != 0);
+
+  return horizontal_add_uint32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint16x8_t sum = vdupq_n_u16(0);
+  assert(h <= 16);
+
+  do {
+    uint16x8_t s = vld1q_u16(src16_ptr);
+    uint16x8_t r = vld1q_u16(ref16_ptr);
+    uint16x8_t p = vld1q_u16(pred16_ptr);
+
+    uint16x8_t avg = vrhaddq_u16(r, p);
+    sum = vabaq_u16(sum, s, avg);
+
+    src16_ptr += src_stride;
+    ref16_ptr += ref_stride;
+    pred16_ptr += 8;
+  } while (--h != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum_u32 = vdupq_n_u32(0);
+
+  // 'h_overflow' is the number of 16-wide rows we can process before 16-bit
+  // accumulators overflow. After hitting this limit accumulate into 32-bit
+  // elements. 65535 / 4095 ~= 16, so 16 16-wide rows using two accumulators.
+  const int h_overflow = 16;
+  // If block height 'h' is smaller than this limit, use 'h' instead.
+  const int h_limit = h < h_overflow ? h : h_overflow;
+  assert(h % h_limit == 0);
+
+  do {
+    uint16x8_t sum_u16[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+    int i = h_limit;
+    do {
+      uint16x8_t s0, s1, r0, r1, p0, p1;
+      uint16x8_t avg0, avg1;
+
+      s0 = vld1q_u16(src16_ptr);
+      r0 = vld1q_u16(ref16_ptr);
+      p0 = vld1q_u16(pred16_ptr);
+      avg0 = vrhaddq_u16(r0, p0);
+      sum_u16[0] = vabaq_u16(sum_u16[0], s0, avg0);
+
+      s1 = vld1q_u16(src16_ptr + 8);
+      r1 = vld1q_u16(ref16_ptr + 8);
+      p1 = vld1q_u16(pred16_ptr + 8);
+      avg1 = vrhaddq_u16(r1, p1);
+      sum_u16[1] = vabaq_u16(sum_u16[1], s1, avg1);
+
+      src16_ptr += src_stride;
+      ref16_ptr += ref_stride;
+      pred16_ptr += 16;
+    } while (--i != 0);
+
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]);
+    h -= h_limit;
+  } while (h != 0);
+  return horizontal_add_uint32x4(sum_u32);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              const uint8_t *second_pred,
+                                              const int h_overflow) {
+  const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+  const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+  const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+  uint32x4_t sum_u32 = vdupq_n_u32(0);
+  const int h_limit = h < h_overflow ? h : h_overflow;
+  assert(h % h_limit == 0);
+
+  do {
+    uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                              vdupq_n_u16(0) };
+
+    int i = h_limit;
+    do {
+      int j = 0;
+      do {
+        uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+        uint16x8_t avg0, avg1, avg2, avg3;
+
+        s0 = vld1q_u16(src16_ptr + j);
+        r0 = vld1q_u16(ref16_ptr + j);
+        p0 = vld1q_u16(pred16_ptr + j);
+        avg0 = vrhaddq_u16(r0, p0);
+        sum_u16[0] = vabaq_u16(sum_u16[0], s0, avg0);
+
+        s1 = vld1q_u16(src16_ptr + j + 8);
+        r1 = vld1q_u16(ref16_ptr + j + 8);
+        p1 = vld1q_u16(pred16_ptr + j + 8);
+        avg1 = vrhaddq_u16(r1, p1);
+        sum_u16[1] = vabaq_u16(sum_u16[1], s1, avg1);
+
+        s2 = vld1q_u16(src16_ptr + j + 16);
+        r2 = vld1q_u16(ref16_ptr + j + 16);
+        p2 = vld1q_u16(pred16_ptr + j + 16);
+        avg2 = vrhaddq_u16(r2, p2);
+        sum_u16[2] = vabaq_u16(sum_u16[2], s2, avg2);
+
+        s3 = vld1q_u16(src16_ptr + j + 24);
+        r3 = vld1q_u16(ref16_ptr + j + 24);
+        p3 = vld1q_u16(pred16_ptr + j + 24);
+        avg3 = vrhaddq_u16(r3, p3);
+        sum_u16[3] = vabaq_u16(sum_u16[3], s3, avg3);
+
+        j += 32;
+      } while (j < w);
+
+      src16_ptr += src_stride;
+      ref16_ptr += ref_stride;
+      pred16_ptr += w;
+    } while (--i != 0);
+
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[2]);
+    sum_u32 = vpadalq_u16(sum_u32, sum_u16[3]);
+    h -= h_limit;
+  } while (h != 0);
+  return horizontal_add_uint32x4(sum_u32);
+}
+
+static INLINE uint32_t highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  // 'h_overflow' is the number of 32-wide rows we can process before 16-bit
+  // accumulators overflow. After hitting this limit accumulate into 32-bit
+  // elements. 65535 / 4095 ~= 16, so 16 32-wide rows using four accumulators.
+  const int h_overflow = 16;
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                                second_pred, h_overflow);
+}
+
+static INLINE uint32_t highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int h,
+                                               const uint8_t *second_pred) {
+  // 'h_overflow' is the number of 64-wide rows we can process before 16-bit
+  // accumulators overflow. After hitting this limit accumulate into 32-bit
+  // elements. 65535 / 4095 ~= 16, so 8 64-wide rows using four accumulators.
+  const int h_overflow = 8;
+  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+                                second_pred, h_overflow);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h)                                            \
+  unsigned int vpx_highbd_sad##w##x##h##_avg_neon(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),  \
+                                      second_pred);                           \
+  }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+
+#undef HBD_SAD_WXH_AVG_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000000..91dfebf900
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,238 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+                                            const uint16_t *ref,
+                                            uint32x4_t *sse_acc0,
+                                            uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+                                       uint32x4_t *sse_acc0,
+                                       uint32x4_t *sse_acc1) {
+  uint16x8_t s = vld1q_u16(src);
+  uint16x8_t r = vld1q_u16(ref);
+
+  uint16x8_t abs_diff = vabdq_u16(s, r);
+  uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+  uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+  *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+  *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+  highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+    highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[8];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+  highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+  highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+    highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+    highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+                                           const uint16_t *ref, int ref_stride,
+                                           int height) {
+  uint32x4_t sse[4];
+  highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+  highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+    highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  uint32x4_t sse[2];
+  highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int height) {
+  // Peel the first loop iteration.
+  uint16x4_t s = vld1_u16(src);
+  uint16x4_t r = vld1_u16(ref);
+
+  uint16x4_t abs_diff = vabd_u16(s, r);
+  uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+  src += src_stride;
+  ref += ref_stride;
+
+  while (--height != 0) {
+    s = vld1_u16(src);
+    r = vld1_u16(ref);
+
+    abs_diff = vabd_u16(s, r);
+    sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  return horizontal_long_add_uint32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          int width, int height) {
+  // { 0, 1, 2, 3, 4, 5, 6, 7 }
+  uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+  uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
+  uint64_t sse = 0;
+
+  do {
+    int w = width;
+    int offset = 0;
+
+    do {
+      uint16x8_t s = vld1q_u16(src + offset);
+      uint16x8_t r = vld1q_u16(ref + offset);
+      uint16x8_t abs_diff;
+      uint16x4_t abs_diff_lo;
+      uint16x4_t abs_diff_hi;
+      uint32x4_t sse_u32;
+
+      if (w < 8) {
+        // Mask out-of-range elements.
+        s = vandq_u16(s, remainder_mask);
+        r = vandq_u16(r, remainder_mask);
+      }
+
+      abs_diff = vabdq_u16(s, r);
+      abs_diff_lo = vget_low_u16(abs_diff);
+      abs_diff_hi = vget_high_u16(abs_diff);
+
+      sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+      sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+      sse += horizontal_long_add_uint32x4(sse_u32);
+
+      offset += 8;
+      w -= 8;
+    } while (w > 0);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--height != 0);
+
+  return sse;
+}
+
+int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride,
+                            const uint8_t *ref8, int ref_stride, int width,
+                            int height) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  switch (width) {
+    case 4:
+      return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+    default:
+      return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+                                 height);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..f8b94620d4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,586 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128,  0 }, { 112, 16 }, { 96, 32 }, { 80,  48 },
+//  {  64, 64 }, {  48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common multiple, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, blend);
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+  } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+                                                uint16_t *dst_ptr,
+                                                int src_stride, int pixel_step,
+                                                int dst_width, int dst_height,
+                                                int filter_offset) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, blend);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+                                             uint16_t *dst_ptr, int src_stride,
+                                             int pixel_step, int dst_height,
+                                             int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      8, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      16, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      32, dst_height, filter_offset);
+}
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+                                              uint16_t *dst_ptr, int src_stride,
+                                              int pixel_step, int dst_height,
+                                              int filter_offset) {
+  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+                                      64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+                                          uint16_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                           \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
+                                       xoffset);                               \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+                                                                               \
+    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
+  }
+
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)               \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+                                      h);                                      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
+                                           src_stride, h, yoffset);            \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + 1));                                \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + 1), xoffset);                  \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+  const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint16x4_t s0 = load_unaligned_u16(src_ptr);
+    uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step);
+    uint16x4_t p = vld1_u16(second_pred);
+
+    uint16x4_t blend = vmul_u16(s0, f0);
+    blend = vmla_u16(blend, s1, f1);
+    blend = vrshr_n_u16(blend, 3);
+
+    vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+    src_ptr += src_stride;
+    dst_ptr += 4;
+    second_pred += 4;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint16_t *second_pred) {
+  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+  const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t blend = vmulq_u16(s0, f0);
+      blend = vmlaq_u16(blend, s1, f1);
+      blend = vrshrq_n_u16(blend, 3);
+
+      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 8, dst_height,
+                                               filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 16, dst_height,
+                                               filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 32, dst_height,
+                                               filter_offset, second_pred);
+}
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint16_t *second_pred) {
+  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                               pixel_step, 64, dst_height,
+                                               filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+      uint16x8_t p = vld1q_u16(second_pred);
+      avg = vrhaddq_u16(avg, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+                            int src_stride, int dst_width, int dst_height,
+                            const uint16_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t p = vld1q_u16(second_pred);
+
+      uint16x8_t avg = vrhaddq_u16(s, p);
+
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+      second_pred += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                       \
+  uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
+      const uint8_t *second_pred) {                                            \
+    uint16_t tmp0[w * (h + 1)];                                                \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
+                                       xoffset);                               \
+    highbd_avg_pred_var_filter_block2d_bil_w##w(                               \
+        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));       \
+                                                                               \
+    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
+  }
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      uint16_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
+                        CONVERT_TO_SHORTPTR(second_pred));                     \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else if (yoffset == 4) {                                               \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp, source_stride, source_stride, w, h,                  \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            src_ptr, tmp0, source_stride, 1, w, h,                             \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + 1)];                                            \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
+                                      (h + 1));                                \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + 1)];                                              \
+      if (yoffset == 0) {                                                      \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
+            CONVERT_TO_SHORTPTR(second_pred));                                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_avg(                                \
+            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
+                                           (h + 1), xoffset);                  \
+        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
+            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
+        return vpx_highbd_##bitdepth##_variance##w##x##h(                      \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..309ae7fd35
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,436 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+
+  int i = h;
+  do {
+    const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_int32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696
+// for a 64x64 block).
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+                                              int src_stride,
+                                              const uint16_t *ref_ptr,
+                                              int ref_stride, int w, int h,
+                                              uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint16x8_t s = vld1q_u16(src_ptr + j);
+      const uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+      sum_s32 = vpadalq_s16(sum_s32, diff);
+
+      sse_s32[0] =
+          vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+      sse_s32[1] =
+          vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = horizontal_long_add_uint32x4(vaddq_u32(
+      vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1])));
+}
+
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum);
+}
+
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum);
+}
+
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+                                             int src_stride,
+                                             const uint16_t *ref,
+                                             int ref_stride, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+// For 12-bit data, we can only accumulate up to 128 elements in the sum of
+// squares (4095*4095*128 = 2146435200), and because we're using two int32x4
+// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128)
+// or 16 64-element rows before we have to accumulate into 64-bit elements.
+// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different
+// helper function.
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+static INLINE void highbd_variance_xlarge_neon(
+    const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+    int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+  // accumulator overflows. After hitting this limit we accumulate into 64-bit
+  // elements.
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  int i = 0;
+  do {
+    int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+    do {
+      int j = 0;
+      do {
+        const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+        const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+        const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+        sum_s32 = vpadalq_s16(sum_s32, diff);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+        j += 8;
+      } while (j < w);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      i++;
+    } while (i < h_tmp);
+
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+    sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+    h_tmp += h_limit;
+  } while (i < h);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = (uint64_t)horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_32xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse,
+                              sum);
+}
+
+static INLINE void highbd_variance_64xh_xlarge_neon(
+    const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+    int h, uint64_t *sse, int64_t *sum) {
+  highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse,
+                              sum);
+}
+
+#define HBD_VARIANCE_WXH_8_NEON(w, h)                                 \
+  uint32_t vpx_highbd_8_variance##w##x##h##_neon(                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)sse_long;                                        \
+    sum = (int)sum_long;                                              \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));         \
+  }
+
+#define HBD_VARIANCE_WXH_10_NEON(w, h)                                \
+  uint32_t vpx_highbd_10_variance##w##x##h##_neon(                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }
+
+#define HBD_VARIANCE_WXH_12_NEON(w, h)                                \
+  uint32_t vpx_highbd_12_variance##w##x##h##_neon(                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }
+
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h)                                \
+  uint32_t vpx_highbd_12_variance##w##x##h##_neon(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride, uint32_t *sse) {                                       \
+    int sum;                                                                 \
+    int64_t var;                                                             \
+    uint64_t sse_long = 0;                                                   \
+    int64_t sum_long = 0;                                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                            \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                            \
+    highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \
+                                        &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                        \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                              \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));                \
+    return (var >= 0) ? (uint32_t)var : 0;                                   \
+  }
+
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+
+#define HIGHBD_GET_VAR(S)                                             \
+  void vpx_highbd_8_get##S##x##S##var_neon(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)sse_long;                                        \
+    *sum = (int)sum_long;                                             \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_10_get##S##x##S##var_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                      \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_12_get##S##x##S##var_neon(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \
+                                 &sse_long, &sum_long);               \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                      \
+  }
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int w, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      uint16x8_t diff = vabdq_u16(s, r);
+
+      sse_u32[0] =
+          vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff));
+      sse_u32[1] =
+          vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff));
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h) {
+  return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr,
+                                             int src_stride,
+                                             const uint16_t *ref_ptr,
+                                             int ref_stride, int h) {
+  return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h);
+}
+
+#define HIGHBD_MSE_WXH_NEON(w, h)                                         \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t vpx_highbd_10_mse##w##x##h##_neon(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h);   \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                   \
+    return *sse;                                                          \
+  }                                                                       \
+                                                                          \
+  uint32_t vpx_highbd_12_mse##w##x##h##_neon(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,     \
+      int ref_stride, uint32_t *sse) {                                    \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                         \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                         \
+    *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h);   \
+    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                   \
+    return *sse;                                                          \
+  }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..1a88720172
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0, s1, r0, r1;
+    uint8x16_t s, r, diff;
+
+    s0 = vld1q_u16(src_ptr);
+    s1 = vld1q_u16(src_ptr + 8);
+    r0 = vld1q_u16(ref_ptr);
+    r1 = vld1q_u16(ref_ptr + 8);
+
+    s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse_u32);
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                      \
+  uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, uint32_t *sse) {                                         \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                              \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                              \
+    *sse =                                                                     \
+        highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \
+    return *sse;                                                               \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..cebe06b099
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_ports/mem.h"
+
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+                                          int src_stride,
+                                          const uint16_t *ref_ptr,
+                                          int ref_stride, int w, int h) {
+  uint64x2_t sse = vdupq_n_u64(0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s = vld1q_u16(src_ptr + j);
+      uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+      uint16x8_t diff = vabdq_u16(s, r);
+
+      sse = vpx_dotq_u16(sse, diff, diff);
+
+      j += 8;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  return (uint32_t)horizontal_add_uint64x2(sse);
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h)                                      \
+  uint32_t vpx_highbd_10_mse##w##x##h##_sve(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    uint32_t sse_tmp =                                                \
+        highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h);   \
+    sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4);                         \
+    *sse = sse_tmp;                                                   \
+    return sse_tmp;                                                   \
+  }                                                                   \
+                                                                      \
+  uint32_t vpx_highbd_12_mse##w##x##h##_sve(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    uint32_t sse_tmp =                                                \
+        highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h);   \
+    sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8);                         \
+    *sse = sse_tmp;                                                   \
+    return sse_tmp;                                                   \
+  }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h, uint64_t *sse,
+                                           int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  do {
+    const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+    sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    h -= 2;
+  } while (h != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h, uint64_t *sse,
+                                           int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  do {
+    const uint16x8_t s = vld1q_u16(src_ptr);
+    const uint16x8_t r = vld1q_u16(ref_ptr);
+
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s32 = vpadalq_s16(sum_s32, diff);
+    sse_s64 = vpx_dotq_s16(sse_s64, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = horizontal_add_int64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    const uint16x8_t s0 = vld1q_u16(src_ptr);
+    const uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+    const uint16x8_t r0 = vld1q_u16(ref_ptr);
+    const uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+    const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+    const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+
+    sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+    sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+
+    sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+    sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+  sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+
+  *sum = horizontal_add_int32x4(sum_s32[0]);
+  *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                           vdupq_n_s32(0) };
+  int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                           vdupq_n_s64(0) };
+
+  do {
+    int i = 0;
+    do {
+      const uint16x8_t s0 = vld1q_u16(src_ptr + i);
+      const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8);
+      const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16);
+      const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24);
+
+      const uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+      const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+      const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16);
+      const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24);
+
+      const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+      const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1));
+      const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2));
+      const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3));
+
+      sum_s32[0] = vpadalq_s16(sum_s32[0], diff0);
+      sum_s32[1] = vpadalq_s16(sum_s32[1], diff1);
+      sum_s32[2] = vpadalq_s16(sum_s32[2], diff2);
+      sum_s32[3] = vpadalq_s16(sum_s32[3], diff3);
+
+      sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0);
+      sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1);
+      sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2);
+      sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3);
+
+      i += 32;
+    } while (i < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+  sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+  sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]);
+
+  sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+  sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+  sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]);
+
+  *sum = horizontal_add_int32x4(sum_s32[0]);
+  *sse = horizontal_add_int64x2(sse_s64[0]);
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+                                            const uint16_t *ref, int ref_stride,
+                                            int h, uint64_t *sse,
+                                            int64_t *sum) {
+  highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_SVE(w, h)                                    \
+  uint32_t vpx_highbd_8_variance##w##x##h##_sve(                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)sse_long;                                        \
+    sum = (int)sum_long;                                              \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));         \
+  }                                                                   \
+                                                                      \
+  uint32_t vpx_highbd_10_variance##w##x##h##_sve(                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }                                                                   \
+                                                                      \
+  uint32_t vpx_highbd_12_variance##w##x##h##_sve(                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse) {                                \
+    int sum;                                                          \
+    int64_t var;                                                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h));         \
+    return (var >= 0) ? (uint32_t)var : 0;                            \
+  }
+
+HBD_VARIANCE_WXH_SVE(4, 4)
+HBD_VARIANCE_WXH_SVE(4, 8)
+
+HBD_VARIANCE_WXH_SVE(8, 4)
+HBD_VARIANCE_WXH_SVE(8, 8)
+HBD_VARIANCE_WXH_SVE(8, 16)
+
+HBD_VARIANCE_WXH_SVE(16, 8)
+HBD_VARIANCE_WXH_SVE(16, 16)
+HBD_VARIANCE_WXH_SVE(16, 32)
+
+HBD_VARIANCE_WXH_SVE(32, 16)
+HBD_VARIANCE_WXH_SVE(32, 32)
+HBD_VARIANCE_WXH_SVE(32, 64)
+
+HBD_VARIANCE_WXH_SVE(64, 32)
+HBD_VARIANCE_WXH_SVE(64, 64)
+
+#define HIGHBD_GET_VAR_SVE(s)                                         \
+  void vpx_highbd_8_get##s##x##s##var_sve(                            \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)sse_long;                                        \
+    *sum = (int)sum_long;                                             \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_10_get##s##x##s##var_sve(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);                      \
+  }                                                                   \
+                                                                      \
+  void vpx_highbd_12_get##s##x##s##var_sve(                           \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride, uint32_t *sse, int *sum) {                      \
+    uint64_t sse_long = 0;                                            \
+    int64_t sum_long = 0;                                             \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                     \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                     \
+    highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s,  \
+                                &sse_long, &sum_long);                \
+    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);                 \
+    *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);                      \
+  }
+
+HIGHBD_GET_VAR_SVE(8)
+HIGHBD_GET_VAR_SVE(16)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
index 1fde13e8d6..cc6307f923 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -14,84 +14,22 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/highbd_convolve8_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
-static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0,
-                            int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) {
-  *s0 = vld1_s16(s);
-  s += p;
-  *s1 = vld1_s16(s);
-  s += p;
-  *s2 = vld1_s16(s);
-  s += p;
-  *s3 = vld1_s16(s);
-}
-
-static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0,
-                            uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) {
-  *s0 = vld1q_u16(s);
-  s += p;
-  *s1 = vld1q_u16(s);
-  s += p;
-  *s2 = vld1q_u16(s);
-  s += p;
-  *s3 = vld1q_u16(s);
-}
-
-static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0,
-                            int16x8_t *s1, int16x8_t *s2, int16x8_t *s3,
-                            int16x8_t *s4, int16x8_t *s5, int16x8_t *s6,
-                            int16x8_t *s7) {
-  *s0 = vld1q_s16(s);
-  s += p;
-  *s1 = vld1q_s16(s);
-  s += p;
-  *s2 = vld1q_s16(s);
-  s += p;
-  *s3 = vld1q_s16(s);
-  s += p;
-  *s4 = vld1q_s16(s);
-  s += p;
-  *s5 = vld1q_s16(s);
-  s += p;
-  *s6 = vld1q_s16(s);
-  s += p;
-  *s7 = vld1q_s16(s);
-}
-
-static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0,
-                             const uint16x8_t s1, const uint16x8_t s2,
-                             const uint16x8_t s3, const uint16x8_t s4,
-                             const uint16x8_t s5, const uint16x8_t s6,
-                             const uint16x8_t s7) {
-  vst1q_u16(s, s0);
-  s += p;
-  vst1q_u16(s, s1);
-  s += p;
-  vst1q_u16(s, s2);
-  s += p;
-  vst1q_u16(s, s3);
-  s += p;
-  vst1q_u16(s, s4);
-  s += p;
-  vst1q_u16(s, s5);
-  s += p;
-  vst1q_u16(s, s6);
-  s += p;
-  vst1q_u16(s, s7);
-}
-
-static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
-                                    const int16x4_t s2, const int16x4_t s3,
-                                    const int16x4_t s4, const int16x4_t s5,
-                                    const int16x4_t s6, const int16x4_t s7,
-                                    const int16x8_t filters) {
+static INLINE uint16x4_t
+highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                   const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                   const int16x4_t s6, const int16x4_t s7,
+                   const int16x8_t filters, const uint16x4_t max) {
   const int16x4_t filters_lo = vget_low_s16(filters);
   const int16x4_t filters_hi = vget_high_s16(filters);
-  int32x4_t sum = vdupq_n_s32(0);
 
-  sum = vmlal_lane_s16(sum, s0, filters_lo, 0);
+  int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0);
   sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
   sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
   sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
@@ -99,22 +37,20 @@ static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
   sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
   sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
   sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
-  return sum;
+
+  uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS);
+  return vmin_u16(res, max);
 }
 
-static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
-                                     const int16x8_t s2, const int16x8_t s3,
-                                     const int16x8_t s4, const int16x8_t s5,
-                                     const int16x8_t s6, const int16x8_t s7,
-                                     const int16x8_t filters,
-                                     const uint16x8_t max) {
+static INLINE uint16x8_t
+highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                   const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                   const int16x8_t s6, const int16x8_t s7,
+                   const int16x8_t filters, const uint16x8_t max) {
   const int16x4_t filters_lo = vget_low_s16(filters);
   const int16x4_t filters_hi = vget_high_s16(filters);
-  int32x4_t sum0 = vdupq_n_s32(0);
-  int32x4_t sum1 = vdupq_n_s32(0);
-  uint16x8_t d;
 
-  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0);
+  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
@@ -122,7 +58,8 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
   sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
-  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0);
+
+  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
@@ -130,285 +67,421 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
   sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
-  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
-  d = vminq_u16(d, max);
-  return d;
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+                                vqrshrun_n_s32(sum1, FILTER_BITS));
+  return vminq_u16(res, max);
 }
 
-void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
-                                     uint8_t *dst8, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y,  // unused
-                                     int y_step_q4,            // unused
+static INLINE void highbd_convolve_4tap_horiz_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 =
+          highbd_convolve4_4_neon(s0[0], s0[1], s0[2], s0[3], filter, max);
+      uint16x4_t d1 =
+          highbd_convolve4_4_neon(s1[0], s1[1], s1[2], s1[3], filter, max);
+      uint16x4_t d2 =
+          highbd_convolve4_4_neon(s2[0], s2[1], s2[2], s2[3], filter, max);
+      uint16x4_t d3 =
+          highbd_convolve4_4_neon(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[4], s1[4], s2[4], s3[4];
+        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+        uint16x8_t d0 =
+            highbd_convolve4_8_neon(s0[0], s0[1], s0[2], s0[3], filter, max);
+        uint16x8_t d1 =
+            highbd_convolve4_8_neon(s1[0], s1[1], s1[2], s1[3], filter, max);
+        uint16x8_t d2 =
+            highbd_convolve4_8_neon(s2[0], s2[1], s2[2], s2[3], filter, max);
+        uint16x8_t d3 =
+            highbd_convolve4_8_neon(s3[0], s3[1], s3[2], s3[3], filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[8], s1[8], s2[8], s3[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5], &s0[6], &s0[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5], &s1[6], &s1[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5], &s2[6], &s2[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5], &s3[6], &s3[7]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], s0[6], s0[7], filter, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                         s1[5], s1[6], s1[7], filter, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                         s2[5], s2[6], s2[7], filter, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                         s3[5], s3[6], s3[7], filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                           s0[5], s0[6], s0[7], filter, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                           s1[5], s1[6], s1[7], filter, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                           s2[5], s2[6], s2[7], filter, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                           s3[5], s3[6], s3[7], filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h, int bd) {
   if (x_step_q4 != 16) {
-    vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
+    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+    highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+                                    x_filter_4tap, bd);
   } else {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-    const int16x8_t filters = vld1q_s16(filter_x);
-    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-    uint16x8_t t0, t1, t2, t3;
-
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3;
-
-    if (h == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23;
-
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u16_8x4(&t0, &t1, &t2, &t3);
-      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
-      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
-      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
-      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
-      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
-      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
-      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      src += 7;
-
-      do {
-        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
-        transpose_s16_4x4d(&s7, &s8, &s9, &s10);
-
-        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
-        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        d01 = vminq_u16(d01, max);
-        d23 = vminq_u16(d23, max);
-        transpose_u16_4x4q(&d01, &d23);
-
-        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
-        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
-        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        src += 4;
-        dst += 4;
-        w -= 4;
-      } while (w > 0);
-    } else {
-      int16x8_t t4, t5, t6, t7;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3;
-
-      if (w == 4) {
-        do {
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
-                   &t4, &t5, &t6, &t7);
-          src += 8 * src_stride;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
-          transpose_u16_8x4(&d0, &d1, &d2, &d3);
-          vst1_u16(dst, vget_low_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d3));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d3));
-          dst += dst_stride;
-          h -= 8;
-        } while (h > 0);
-      } else {
-        int width;
-        const uint16_t *s;
-        uint16_t *d;
-        int16x8_t s11, s12, s13, s14;
-        uint16x8_t d4, d5, d6, d7;
-
-        do {
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          width = w;
-          s = src + 7;
-          d = dst;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-
-          do {
-            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
-                     &s12, &s13, &s14);
-            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
-            d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-            d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-            d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-            d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-            d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
-            d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
-            d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
-            d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
-
-            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-            s0 = s8;
-            s1 = s9;
-            s2 = s10;
-            s3 = s11;
-            s4 = s12;
-            s5 = s13;
-            s6 = s14;
-            s += 8;
-            d += 8;
-            width -= 8;
-          } while (width > 0);
-          src += 8 * src_stride;
-          dst += 8 * dst_stride;
-          h -= 8;
-        } while (h > 0);
-      }
-    }
+    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+    highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+                                    x_filter_8tap, bd);
   }
 }
 
-void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8,
-                                         ptrdiff_t src_stride, uint8_t *dst8,
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
+                                         ptrdiff_t src_stride, uint16_t *dst,
                                          ptrdiff_t dst_stride,
-                                         const int16_t *filter_x, int x_step_q4,
-                                         const int16_t *filter_y,  // unused
-                                         int y_step_q4,            // unused
-                                         int w, int h, int bd) {
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h, int bd) {
   if (x_step_q4 != 16) {
-    vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride,
-                                     filter_x, x_step_q4, filter_y, y_step_q4,
-                                     w, h, bd);
+    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                     bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[8], s1[8], s2[8], s3[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5], &s0[6], &s0[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5], &s1[6], &s1[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5], &s2[6], &s2[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                   &s3[4], &s3[5], &s3[6], &s3[7]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                         s0[5], s0[6], s0[7], filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                         s1[5], s1[6], s1[7], filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                         s2[5], s2[6], s2[7], filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                         s3[5], s3[6], s3[7], filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   } else {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-    const int16x8_t filters = vld1q_s16(filter_x);
     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-    uint16x8_t t0, t1, t2, t3;
 
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3;
-
-    if (h == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23, t01, t23;
-
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u16_8x4(&t0, &t1, &t2, &t3);
-      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
-      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
-      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
-      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
-      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
-      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
-      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      src += 7;
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
 
       do {
-        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
-        transpose_s16_4x4d(&s7, &s8, &s9, &s10);
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
 
-        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+        uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
+                                           s0[5], s0[6], s0[7], filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
+                                           s1[5], s1[6], s1[7], filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
+                                           s2[5], s2[6], s2[7], filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
+                                           s3[5], s3[6], s3[7], filters, max);
 
-        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        t01 = vminq_u16(t01, max);
-        t23 = vminq_u16(t23, max);
-        transpose_u16_4x4q(&t01, &t23);
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
 
-        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
-                           vld1_u16(dst + 2 * dst_stride));
-        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
-                           vld1_u16(dst + 3 * dst_stride));
-        d01 = vrhaddq_u16(d01, t01);
-        d23 = vrhaddq_u16(d23, t23);
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
-        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
-        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
-        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
-        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_4tap_vert_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2;
+    load_s16_4x3(s, src_stride, &s0, &s1, &s2);
+
+    s += 3 * src_stride;
+
+    do {
+      int16x4_t s3, s4, s5, s6;
+      load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+      uint16x4_t d0 = highbd_convolve4_4_neon(s0, s1, s2, s3, filter, max);
+      uint16x4_t d1 = highbd_convolve4_4_neon(s1, s2, s3, s4, filter, max);
+      uint16x4_t d2 = highbd_convolve4_4_neon(s2, s3, s4, s5, filter, max);
+      uint16x4_t d3 = highbd_convolve4_4_neon(s3, s4, s5, s6, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2;
+      load_s16_8x3(s, src_stride, &s0, &s1, &s2);
+
+      s += 3 * src_stride;
+
+      do {
+        int16x8_t s3, s4, s5, s6;
+        load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+        uint16x8_t d0 = highbd_convolve4_8_neon(s0, s1, s2, s3, filter, max);
+        uint16x8_t d1 = highbd_convolve4_8_neon(s1, s2, s3, s4, filter, max);
+        uint16x8_t d2 = highbd_convolve4_8_neon(s2, s3, s4, s5, filter, max);
+        uint16x8_t d3 = highbd_convolve4_8_neon(s3, s4, s5, s6, filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_convolve_8tap_vert_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 =
+          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+      uint16x4_t d1 =
+          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+      uint16x4_t d2 =
+          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+      uint16x4_t d3 =
+          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+      s += 7 * src_stride;
+
+      do {
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
+        uint16x8_t d1 =
+            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
+        uint16x8_t d2 =
+            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
+        uint16x8_t d3 =
+            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -417,410 +490,140 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8,
         s4 = s8;
         s5 = s9;
         s6 = s10;
-        src += 4;
-        dst += 4;
-        w -= 4;
-      } while (w > 0);
-    } else {
-      int16x8_t t4, t5, t6, t7;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
-
-      if (w == 4) {
-        do {
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
-                   &t4, &t5, &t6, &t7);
-          src += 8 * src_stride;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
-
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-          transpose_u16_8x4(&t0, &t1, &t2, &t3);
-
-          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
-                            vld1_u16(dst + 4 * dst_stride));
-          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
-                            vld1_u16(dst + 5 * dst_stride));
-          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
-                            vld1_u16(dst + 6 * dst_stride));
-          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
-                            vld1_u16(dst + 7 * dst_stride));
-          d0 = vrhaddq_u16(d0, t0);
-          d1 = vrhaddq_u16(d1, t1);
-          d2 = vrhaddq_u16(d2, t2);
-          d3 = vrhaddq_u16(d3, t3);
-
-          vst1_u16(dst, vget_low_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_low_u16(d3));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d0));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d1));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d2));
-          dst += dst_stride;
-          vst1_u16(dst, vget_high_u16(d3));
-          dst += dst_stride;
-          h -= 8;
-        } while (h > 0);
-      } else {
-        int width;
-        const uint16_t *s;
-        uint16_t *d;
-        int16x8_t s11, s12, s13, s14;
-        uint16x8_t d4, d5, d6, d7;
-
-        do {
-          __builtin_prefetch(src + 0 * src_stride);
-          __builtin_prefetch(src + 1 * src_stride);
-          __builtin_prefetch(src + 2 * src_stride);
-          __builtin_prefetch(src + 3 * src_stride);
-          __builtin_prefetch(src + 4 * src_stride);
-          __builtin_prefetch(src + 5 * src_stride);
-          __builtin_prefetch(src + 6 * src_stride);
-          __builtin_prefetch(src + 7 * src_stride);
-          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
-                   &s5, &s6, &s7);
-          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-
-          width = w;
-          s = src + 7;
-          d = dst;
-          __builtin_prefetch(dst + 0 * dst_stride);
-          __builtin_prefetch(dst + 1 * dst_stride);
-          __builtin_prefetch(dst + 2 * dst_stride);
-          __builtin_prefetch(dst + 3 * dst_stride);
-          __builtin_prefetch(dst + 4 * dst_stride);
-          __builtin_prefetch(dst + 5 * dst_stride);
-          __builtin_prefetch(dst + 6 * dst_stride);
-          __builtin_prefetch(dst + 7 * dst_stride);
-
-          do {
-            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
-                     &s12, &s13, &s14);
-            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
-
-            d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-            d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-            d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-            d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-            d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
-            d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
-            d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
-            d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
-
-            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-
-            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
-            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
-            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
-            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
-            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
-            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
-            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
-            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
-
-            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
-
-            s0 = s8;
-            s1 = s9;
-            s2 = s10;
-            s3 = s11;
-            s4 = s12;
-            s5 = s13;
-            s6 = s14;
-            s += 8;
-            d += 8;
-            width -= 8;
-          } while (width > 0);
-          src += 8 * src_stride;
-          dst += 8 * dst_stride;
-          h -= 8;
-        } while (h > 0);
-      }
-    }
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
 
-void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
-                                    uint8_t *dst8, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x,  // unused
-                                    int x_step_q4,            // unused
-                                    const int16_t *filter_y, int y_step_q4,
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                    uint16_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
   if (y_step_q4 != 16) {
-    vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
-                                x_step_q4, filter_y, y_step_q4, w, h, bd);
+    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
+    highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
+                                   dst_stride, w, h, y_filter_4tap, bd);
   } else {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-    const int16x8_t filters = vld1q_s16(filter_y);
-    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
-
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
-
-    src -= 3 * src_stride;
-
-    if (w == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23;
-
-      s0 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s1 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s2 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s3 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s4 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s5 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s6 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-
-      do {
-        s7 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s8 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s9 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s10 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
-
-        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        d01 = vminq_u16(d01, max);
-        d23 = vminq_u16(d23, max);
-        vst1_u16(dst, vget_low_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_low_u16(d23));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d23));
-        dst += dst_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        h -= 4;
-      } while (h > 0);
-    } else {
-      int height;
-      const uint16_t *s;
-      uint16_t *d;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3;
-
-      do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        s = src;
-        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        d = dst;
-        height = h;
-
-        do {
-          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-
-          __builtin_prefetch(d + 0 * dst_stride);
-          __builtin_prefetch(d + 1 * dst_stride);
-          __builtin_prefetch(d + 2 * dst_stride);
-          __builtin_prefetch(d + 3 * dst_stride);
-          __builtin_prefetch(s + 0 * src_stride);
-          __builtin_prefetch(s + 1 * src_stride);
-          __builtin_prefetch(s + 2 * src_stride);
-          __builtin_prefetch(s + 3 * src_stride);
-          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
-          vst1q_u16(d, d0);
-          d += dst_stride;
-          vst1q_u16(d, d1);
-          d += dst_stride;
-          vst1q_u16(d, d2);
-          d += dst_stride;
-          vst1q_u16(d, d3);
-          d += dst_stride;
-
-          s0 = s4;
-          s1 = s5;
-          s2 = s6;
-          s3 = s7;
-          s4 = s8;
-          s5 = s9;
-          s6 = s10;
-          height -= 4;
-        } while (height > 0);
-        src += 8;
-        dst += 8;
-        w -= 8;
-      } while (w > 0);
-    }
+    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+    highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
+                                   dst_stride, w, h, y_filter_8tap, bd);
   }
 }
 
-void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8,
-                                        ptrdiff_t src_stride, uint8_t *dst8,
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
+                                        ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
-                                        const int16_t *filter_x,  // unused
-                                        int x_step_q4,            // unused
-                                        const int16_t *filter_y, int y_step_q4,
+                                        const InterpKernel *filter, int x0_q4,
+                                        int x_step_q4, int y0_q4, int y_step_q4,
                                         int w, int h, int bd) {
   if (y_step_q4 != 16) {
-    vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride,
-                                    filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                    h, bd);
+    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                    bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t s7, s8, s9, s10;
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+      uint16x4_t d0 =
+          highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+      uint16x4_t d1 =
+          highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+      uint16x4_t d2 =
+          highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+      uint16x4_t d3 =
+          highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   } else {
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-    const int16x8_t filters = vld1q_s16(filter_y);
     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
 
-    assert(!((intptr_t)dst & 3));
-    assert(!(dst_stride & 3));
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
 
-    src -= 3 * src_stride;
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
 
-    if (w == 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      int32x4_t d0, d1, d2, d3;
-      uint16x8_t d01, d23, t01, t23;
-
-      s0 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s1 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s2 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s3 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s4 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s5 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      s6 = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
+      s += 7 * src_stride;
 
       do {
-        s7 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s8 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s9 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        s10 = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
+        int16x8_t s7, s8, s9, s10;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
-        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
-        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
-        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+        uint16x8_t d0 =
+            highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
+        uint16x8_t d1 =
+            highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
+        uint16x8_t d2 =
+            highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
+        uint16x8_t d3 =
+            highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
 
-        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
-        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
-        t01 = vminq_u16(t01, max);
-        t23 = vminq_u16(t23, max);
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
 
-        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
-                           vld1_u16(dst + 1 * dst_stride));
-        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
-                           vld1_u16(dst + 3 * dst_stride));
-        d01 = vrhaddq_u16(d01, t01);
-        d23 = vrhaddq_u16(d23, t23);
-
-        vst1_u16(dst, vget_low_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d01));
-        dst += dst_stride;
-        vst1_u16(dst, vget_low_u16(d23));
-        dst += dst_stride;
-        vst1_u16(dst, vget_high_u16(d23));
-        dst += dst_stride;
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -829,95 +632,600 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8,
         s4 = s8;
         s5 = s9;
         s6 = s10;
-        h -= 4;
-      } while (h > 0);
-    } else {
-      int height;
-      const uint16_t *s;
-      uint16_t *d;
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
-
-      do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        s = src;
-        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
-        s += src_stride;
-        d = dst;
-        height = h;
-
-        do {
-          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
-          s += src_stride;
-
-          __builtin_prefetch(d + 0 * dst_stride);
-          __builtin_prefetch(d + 1 * dst_stride);
-          __builtin_prefetch(d + 2 * dst_stride);
-          __builtin_prefetch(d + 3 * dst_stride);
-          __builtin_prefetch(s + 0 * src_stride);
-          __builtin_prefetch(s + 1 * src_stride);
-          __builtin_prefetch(s + 2 * src_stride);
-          __builtin_prefetch(s + 3 * src_stride);
-          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
-          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
-          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
-          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
-
-          d0 = vld1q_u16(d + 0 * dst_stride);
-          d1 = vld1q_u16(d + 1 * dst_stride);
-          d2 = vld1q_u16(d + 2 * dst_stride);
-          d3 = vld1q_u16(d + 3 * dst_stride);
-          d0 = vrhaddq_u16(d0, t0);
-          d1 = vrhaddq_u16(d1, t1);
-          d2 = vrhaddq_u16(d2, t2);
-          d3 = vrhaddq_u16(d3, t3);
-
-          vst1q_u16(d, d0);
-          d += dst_stride;
-          vst1q_u16(d, d1);
-          d += dst_stride;
-          vst1q_u16(d, d2);
-          d += dst_stride;
-          vst1q_u16(d, d3);
-          d += dst_stride;
-
-          s0 = s4;
-          s1 = s5;
-          s2 = s6;
-          s3 = s7;
-          s4 = s8;
-          s5 = s9;
-          s6 = s10;
-          height -= 4;
-        } while (height > 0);
-        src += 8;
-        dst += 8;
-        w -= 8;
-      } while (w > 0);
-    }
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
+
+static INLINE void highbd_convolve_2d_4tap_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
+    const int16x4_t y_filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[4], h_s1[4], h_s2[4];
+    load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+    load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+    load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+    int16x4_t v_s0 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+        h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+    int16x4_t v_s1 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+        h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+    int16x4_t v_s2 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+        h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+    s += 3 * src_stride;
+
+    do {
+      int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+                   &h_s3[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+                   &h_s4[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+                   &h_s5[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+                   &h_s6[3]);
+
+      int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+      int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+      int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+      int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4_neon(
+          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+      uint16x4_t d0 =
+          highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+      uint16x4_t d1 =
+          highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+      uint16x4_t d2 =
+          highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+      uint16x4_t d3 =
+          highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+    return;
+  }
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t h_s0[4], h_s1[4], h_s2[4];
+    load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+    load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+    load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+    int16x8_t v_s0 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+        h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
+    int16x8_t v_s1 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+        h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
+    int16x8_t v_s2 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+        h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
+
+    s += 3 * src_stride;
+
+    do {
+      int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+                   &h_s3[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+                   &h_s4[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+                   &h_s5[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+                   &h_s6[3]);
+
+      int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+          h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
+      int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+          h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
+      int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+          h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
+      int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8_neon(
+          h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
+
+      uint16x8_t d0 =
+          highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max);
+      uint16x8_t d1 =
+          highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max);
+      uint16x8_t d2 =
+          highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max);
+      uint16x8_t d3 =
+          highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+static INLINE void highbd_convolve_2d_8tap_neon(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
+    const int16x8_t y_filter, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x4_t v_s0 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x4_t v_s1 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x4_t v_s2 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x4_t v_s3 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x4_t v_s4 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x4_t v_s5 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x4_t v_s6 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x4_t v_s7 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x4_t v_s8 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x4_t v_s9 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x4_t v_s10 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+    return;
+  }
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x8_t v_s0 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x8_t v_s1 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x8_t v_s2 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x8_t v_s3 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x8_t v_s4 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x8_t v_s5 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x8_t v_s6 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x8_t v_s7 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x8_t v_s8 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x8_t v_s9 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x8_t v_s10 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
+
+void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
+                               uint16_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h, int bd) {
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+  // lines post both horizontally and vertically.
+  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+  if (x_filter_taps == 4 && y_filter_taps == 4) {
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
+
+    highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
+                                 dst, dst_stride, w, h, x_filter, y_filter, bd);
+    return;
+  }
+
+  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+  highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
+                               dst, dst_stride, w, h, x_filter, y_filter, bd);
+}
+
+void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h, int bd) {
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  // Averaging convolution always uses an 8-tap filter.
+  const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
+  const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;
+  // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2
+  // lines post both horizontally and vertically.
+  src = src - horiz_offset - vert_offset;
+
+  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x4_t v_s0 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x4_t v_s1 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x4_t v_s2 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x4_t v_s3 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x4_t v_s4 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x4_t v_s5 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x4_t v_s6 = vreinterpret_s16_u16(
+        highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x4_t v_s7 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x4_t v_s8 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x4_t v_s9 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x4_t v_s10 = vreinterpret_s16_u16(
+          highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+    return;
+  }
+
+  const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+  do {
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
+    load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
+                 &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
+    load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
+                 &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
+    load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
+                 &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
+    load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
+                 &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
+    load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
+                 &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
+    load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
+                 &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
+    load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
+                 &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
+
+    int16x8_t v_s0 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
+                           h_s0[6], h_s0[7], x_filter, max));
+    int16x8_t v_s1 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
+                           h_s1[6], h_s1[7], x_filter, max));
+    int16x8_t v_s2 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
+                           h_s2[6], h_s2[7], x_filter, max));
+    int16x8_t v_s3 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
+                           h_s3[6], h_s3[7], x_filter, max));
+    int16x8_t v_s4 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
+                           h_s4[6], h_s4[7], x_filter, max));
+    int16x8_t v_s5 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
+                           h_s5[6], h_s5[7], x_filter, max));
+    int16x8_t v_s6 = vreinterpretq_s16_u16(
+        highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
+                           h_s6[6], h_s6[7], x_filter, max));
+
+    s += 7 * src_stride;
+
+    do {
+      int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
+      load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
+                   &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
+      load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
+                   &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
+      load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
+                   &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
+      load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
+                   &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
+
+      int16x8_t v_s7 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
+                             h_s7[5], h_s7[6], h_s7[7], x_filter, max));
+      int16x8_t v_s8 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
+                             h_s8[5], h_s8[6], h_s8[7], x_filter, max));
+      int16x8_t v_s9 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
+                             h_s9[5], h_s9[6], h_s9[7], x_filter, max));
+      int16x8_t v_s10 = vreinterpretq_s16_u16(
+          highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
+                             h_s10[5], h_s10[6], h_s10[7], x_filter, max));
+
+      uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
+                                         v_s6, v_s7, y_filter, max);
+      uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
+                                         v_s7, v_s8, y_filter, max);
+      uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
+                                         v_s8, v_s9, y_filter, max);
+      uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
+                                         v_s9, v_s10, y_filter, max);
+
+      d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+      d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+      d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+      d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+      store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      v_s3 = v_s7;
+      v_s4 = v_s8;
+      v_s5 = v_s9;
+      v_s6 = v_s10;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+    src += 8;
+    dst += 8;
+    w -= 8;
+  } while (w != 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
new file mode 100644
index 0000000000..f909e06a18
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/highbd_convolve8_sve.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
+                                                               1, 3, 5, 7 };
+
+static INLINE void highbd_convolve_4tap_horiz_sve(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) {
+  const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0));
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x4_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve4_4_sve(s0, filter, max);
+      uint16x4_t d1 = highbd_convolve4_4_sve(s1, filter, max);
+      uint16x4_t d2 = highbd_convolve4_4_sve(s2, filter, max);
+      uint16x4_t d3 = highbd_convolve4_4_sve(s3, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+    const uint16x8_t idx = vld1q_u16(kTblConv4_8);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[4], s1[4], s2[4], s3[4];
+        load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+        load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+        load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+        load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+        uint16x8_t d0 = highbd_convolve4_8_sve(s0, filter, max, idx);
+        uint16x8_t d1 = highbd_convolve4_8_sve(s1, filter, max, idx);
+        uint16x8_t d2 = highbd_convolve4_8_sve(s2, filter, max, idx);
+        uint16x8_t d3 = highbd_convolve4_8_sve(s3, filter, max, idx);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void highbd_convolve_8tap_horiz_sve(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) {
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x8_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride,
+                                    uint16_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
+    highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h,
+                                   x_filter_4tap, bd);
+  } else {
+    const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
+    highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h,
+                                   x_filter_8tap, bd);
+  }
+}
+
+void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src,
+                                        ptrdiff_t src_stride, uint16_t *dst,
+                                        ptrdiff_t dst_stride,
+                                        const InterpKernel *filter, int x0_q4,
+                                        int x_step_q4, int y0_q4, int y_step_q4,
+                                        int w, int h, int bd) {
+  if (x_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                     bd);
+    return;
+  }
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x8_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
new file mode 100644
index 0000000000..8408f98f4a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c
@@ -0,0 +1,660 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/highbd_convolve8_neon.h"
+#include "vpx_dsp/arm/highbd_convolve8_sve.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h"
+
+// clang-format off
+DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = {
+  // Shift left and insert new last column in transposed 4x4 block.
+  1, 2, 3, 0, 5, 6, 7, 4,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2, 3, 0, 1, 6, 7, 4, 5,
+  // Shift left and insert three new columns in transposed 4x4 block.
+  3, 0, 1, 2, 7, 4, 5, 6,
+};
+// clang-format on
+
+DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6,
+                                                               1, 3, 5, 7 };
+
+static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2],
+                                              int16x8_t s_hi[2],
+                                              int16x8_t filter,
+                                              uint16x4_t max) {
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+  sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+  sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+
+  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
+  return vmin_u16(res, max);
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4],
+                                              const int16x8_t s_hi[4],
+                                              const int16x8_t filter,
+                                              const uint16x8_t max) {
+  int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0);
+  sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1);
+
+  int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0);
+  sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1);
+
+  int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0);
+  sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1);
+
+  int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0);
+  sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1);
+
+  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
+  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));
+
+  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
+                                vqrshrun_n_s32(sum4567, FILTER_BITS));
+  return vminq_u16(res, max);
+}
+
+static INLINE void highbd_convolve8_8tap_vert_sve2(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
+  assert(w >= 4 && h >= 4);
+
+  do {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int height = h;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+    transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]);
+    transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]);
+    transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]);
+    transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]);
+
+    do {
+      int16x4_t s7, s8, s9, sA;
+
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+      transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]);
+      transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]);
+      transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]);
+      transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]);
+
+      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max);
+      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max);
+      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max);
+      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0123[0] = s4567[0];
+      s0123[1] = s4567[1];
+      s1234[0] = s5678[0];
+      s1234[1] = s5678[1];
+      s2345[0] = s6789[0];
+      s2345[1] = s6789[1];
+      s3456[0] = s789A[0];
+      s3456[1] = s789A[1];
+
+      s4 = s8;
+      s5 = s9;
+      s6 = sA;
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      height -= 4;
+    } while (height != 0);
+
+    src += 4;
+    dst += 4;
+    w -= 4;
+  } while (w != 0);
+}
+
+void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                                    uint16_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter,
+                                   x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                   bd);
+  } else {
+    const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
+    highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst,
+                                    dst_stride, w, h, y_filter_8tap, bd);
+  }
+}
+
+void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src,
+                                        ptrdiff_t src_stride, uint16_t *dst,
+                                        ptrdiff_t dst_stride,
+                                        const InterpKernel *filter, int x0_q4,
+                                        int x_step_q4, int y0_q4, int y_step_q4,
+                                        int w, int h, int bd) {
+  if (y_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                    bd);
+    return;
+  }
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);
+
+  src -= 3 * src_stride;
+
+  uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl);
+
+  // Correct indices by the size of vector length.
+  merge_tbl_idx.val[0] = vaddq_u16(
+      merge_tbl_idx.val[0],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)));
+  merge_tbl_idx.val[1] = vaddq_u16(
+      merge_tbl_idx.val[1],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)));
+  merge_tbl_idx.val[2] = vaddq_u16(
+      merge_tbl_idx.val[2],
+      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)));
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t s0, s1, s2, s3, s4, s5, s6;
+    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    s += 7 * src_stride;
+
+    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
+    transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]);
+    transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]);
+    transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]);
+    transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]);
+
+    do {
+      int16x4_t s7, s8, s9, sA;
+
+      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+      int16x8_t s4567[2], s5678[2], s6789[2], s789A[2];
+      transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]);
+
+      vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+      vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+      vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max);
+
+      d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride));
+      d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride));
+      d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride));
+      d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride));
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s0123[0] = s4567[0];
+      s0123[1] = s4567[1];
+      s1234[0] = s5678[0];
+      s1234[1] = s5678[1];
+      s2345[0] = s6789[0];
+      s2345[1] = s6789[1];
+      s3456[0] = s789A[0];
+      s3456[1] = s789A[1];
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
+      transpose_concat_s16_8x4(s0, s1, s2, s3, &s0123[0], &s0123[1], &s0123[2],
+                               &s0123[3]);
+      transpose_concat_s16_8x4(s1, s2, s3, s4, &s1234[0], &s1234[1], &s1234[2],
+                               &s1234[3]);
+      transpose_concat_s16_8x4(s2, s3, s4, s5, &s2345[0], &s2345[1], &s2345[2],
+                               &s2345[3]);
+      transpose_concat_s16_8x4(s3, s4, s5, s6, &s3456[0], &s3456[1], &s3456[2],
+                               &s3456[3]);
+
+      do {
+        int16x8_t s7, s8, s9, sA;
+        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA);
+
+        int16x8_t s4567[4], s5678[5], s6789[4], s789A[4];
+        transpose_concat_s16_8x4(s7, s8, s9, sA, &s789A[0], &s789A[1],
+                                 &s789A[2], &s789A[3]);
+
+        vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]);
+        vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]);
+        vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]);
+
+        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max);
+
+        d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
+        d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
+        d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
+        d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0123[0] = s4567[0];
+        s0123[1] = s4567[1];
+        s0123[2] = s4567[2];
+        s0123[3] = s4567[3];
+        s1234[0] = s5678[0];
+        s1234[1] = s5678[1];
+        s1234[2] = s5678[2];
+        s1234[3] = s5678[3];
+        s2345[0] = s6789[0];
+        s2345[1] = s6789[1];
+        s2345[2] = s6789[2];
+        s2345[3] = s6789[3];
+        s3456[0] = s789A[0];
+        s3456[1] = s789A[1];
+        s3456[2] = s789A[2];
+        s3456[3] = s789A[3];
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_convolve_2d_4tap_sve2(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filters,
+    const int16x4_t y_filters, int bd) {
+  const int16x8_t x_filter = vcombine_s16(x_filters, vdup_n_s16(0));
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    int16x4_t h_s0[4], h_s1[4], h_s2[4];
+    load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
+    load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
+    load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
+
+    int16x4_t v_s0 =
+        vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s0, x_filter, max));
+    int16x4_t v_s1 =
+        vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s1, x_filter, max));
+    int16x4_t v_s2 =
+        vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s2, x_filter, max));
+
+    s += 3 * src_stride;
+
+    do {
+      int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+      load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+                   &h_s3[3]);
+      load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+                   &h_s4[3]);
+      load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+                   &h_s5[3]);
+      load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+                   &h_s6[3]);
+
+      int16x4_t v_s3 =
+          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s3, x_filter, max));
+      int16x4_t v_s4 =
+          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s4, x_filter, max));
+      int16x4_t v_s5 =
+          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s5, x_filter, max));
+      int16x4_t v_s6 =
+          vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s6, x_filter, max));
+
+      uint16x4_t d0 =
+          highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filters, max);
+      uint16x4_t d1 =
+          highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filters, max);
+      uint16x4_t d2 =
+          highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filters, max);
+      uint16x4_t d3 =
+          highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filters, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      v_s0 = v_s4;
+      v_s1 = v_s5;
+      v_s2 = v_s6;
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+    const uint16x8_t idx = vld1q_u16(kTblConv4_8);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int height = h;
+
+      int16x8_t h_s0[4], h_s1[4], h_s2[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2],
+                   &h_s0[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2],
+                   &h_s1[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2],
+                   &h_s2[3]);
+
+      int16x8_t v_s0 = vreinterpretq_s16_u16(
+          highbd_convolve4_8_sve(h_s0, x_filter, max, idx));
+      int16x8_t v_s1 = vreinterpretq_s16_u16(
+          highbd_convolve4_8_sve(h_s1, x_filter, max, idx));
+      int16x8_t v_s2 = vreinterpretq_s16_u16(
+          highbd_convolve4_8_sve(h_s2, x_filter, max, idx));
+
+      s += 3 * src_stride;
+
+      do {
+        int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
+        load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
+                     &h_s3[3]);
+        load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
+                     &h_s4[3]);
+        load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
+                     &h_s5[3]);
+        load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
+                     &h_s6[3]);
+
+        int16x8_t v_s3 = vreinterpretq_s16_u16(
+            highbd_convolve4_8_sve(h_s3, x_filter, max, idx));
+        int16x8_t v_s4 = vreinterpretq_s16_u16(
+            highbd_convolve4_8_sve(h_s4, x_filter, max, idx));
+        int16x8_t v_s5 = vreinterpretq_s16_u16(
+            highbd_convolve4_8_sve(h_s5, x_filter, max, idx));
+        int16x8_t v_s6 = vreinterpretq_s16_u16(
+            highbd_convolve4_8_sve(h_s6, x_filter, max, idx));
+
+        uint16x8_t d0 =
+            highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filters, max);
+        uint16x8_t d1 =
+            highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filters, max);
+        uint16x8_t d2 =
+            highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filters, max);
+        uint16x8_t d3 =
+            highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filters, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        v_s0 = v_s4;
+        v_s1 = v_s5;
+        v_s2 = v_s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void highbd_convolve8_2d_horiz_sve2(
+    const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4,
+    int y0_q4, int y_step_q4, int w, int h, int bd) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+  assert(h % 4 == 3 && h >= 7);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+
+    do {
+      int16x8_t s0[4], s1[4], s2[4], s3[4];
+      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
+
+      uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+      uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+      uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+      uint16x4_t d3 = highbd_convolve8_4(s3, filters, max);
+
+      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+
+      s += 4 * src_stride;
+      d += 4 * dst_stride;
+      h -= 4;
+    } while (h != 3);
+
+    // Process final three rows (h % 4 == 3).
+    int16x8_t s0[4], s1[4], s2[4];
+    load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
+    load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
+    load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
+
+    uint16x4_t d0 = highbd_convolve8_4(s0, filters, max);
+    uint16x4_t d1 = highbd_convolve8_4(s1, filters, max);
+    uint16x4_t d2 = highbd_convolve8_4(s2, filters, max);
+
+    store_u16_4x3(d, dst_stride, d0, d1, d2);
+  } else {
+    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+    do {
+      const int16_t *s = (const int16_t *)src;
+      uint16_t *d = dst;
+      int width = w;
+
+      do {
+        int16x8_t s0[8], s1[8], s2[8], s3[8];
+        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                     &s0[4], &s0[5], &s0[6], &s0[7]);
+        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                     &s1[4], &s1[5], &s1[6], &s1[7]);
+        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                     &s2[4], &s2[5], &s2[6], &s2[7]);
+        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
+                     &s3[4], &s3[5], &s3[6], &s3[7]);
+
+        uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+        uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+        uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+        uint16x8_t d3 = highbd_convolve8_8(s3, filters, max);
+
+        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 3);
+
+    // Process final three rows (h % 4 == 3).
+    const int16_t *s = (const int16_t *)src;
+    uint16_t *d = dst;
+    int width = w;
+
+    do {
+      int16x8_t s0[8], s1[8], s2[8];
+      load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
+                   &s0[4], &s0[5], &s0[6], &s0[7]);
+      load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
+                   &s1[4], &s1[5], &s1[6], &s1[7]);
+      load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
+                   &s2[4], &s2[5], &s2[6], &s2[7]);
+
+      uint16x8_t d0 = highbd_convolve8_8(s0, filters, max);
+      uint16x8_t d1 = highbd_convolve8_8(s1, filters, max);
+      uint16x8_t d2 = highbd_convolve8_8(s2, filters, max);
+
+      store_u16_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+  }
+}
+
+void vpx_highbd_convolve8_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                               uint16_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h, int bd) {
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  const int horiz_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+
+  if (horiz_filter_taps == 4 || vert_filter_taps == 4) {
+    const ptrdiff_t horiz_offset = horiz_filter_taps / 2 - 1;
+    const ptrdiff_t vert_offset = (vert_filter_taps / 2 - 1) * src_stride;
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
+
+    highbd_convolve_2d_4tap_sve2(src - horiz_offset - vert_offset, src_stride,
+                                 dst, dst_stride, w, h, x_filter, y_filter, bd);
+    return;
+  }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7).
+  DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
+  const int im_stride = 64;
+
+  // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior
+  // and SUBPEL_TAPS / 2 lines post.
+  const int im_height = h + SUBPEL_TAPS - 1;
+  const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
+
+  highbd_convolve8_2d_horiz_sve2(src - src_stride * border_offset, src_stride,
+                                 im_block, im_stride, filter, x0_q4, x_step_q4,
+                                 y0_q4, y_step_q4, w, im_height, bd);
+
+  // Step into the temporary buffer border_offset rows to get actual frame data.
+  vpx_highbd_convolve8_vert_sve2(im_block + im_stride * border_offset,
+                                 im_stride, dst, dst_stride, filter, x0_q4,
+                                 x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
+
+void vpx_highbd_convolve8_avg_sve2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h, int bd) {
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                               x_step_q4, y0_q4, y_step_q4, w, h, bd);
+    return;
+  }
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7).
+  DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
+  const int im_stride = 64;
+
+  // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior
+  // and SUBPEL_TAPS / 2 lines post.
+  const int im_height = h + SUBPEL_TAPS - 1;
+  const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
+
+  highbd_convolve8_2d_horiz_sve2(src - src_stride * border_offset, src_stride,
+                                 im_block, im_stride, filter, x0_q4, x_step_q4,
+                                 y0_q4, y_step_q4, w, im_height, bd);
+
+  // Step into the temporary buffer border_offset rows to get actual frame data.
+  vpx_highbd_convolve8_avg_vert_sve2(im_block + im_stride * border_offset,
+                                     im_stride, dst, dst_stride, filter, x0_q4,
+                                     x_step_q4, y0_q4, y_step_q4, w, h, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
index f4d70761eb..765a054f8d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -13,18 +13,16 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int filter_x_stride,
-                                  const int16_t *filter_y, int filter_y_stride,
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                  uint16_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
   (void)bd;
 
   if (w < 8) {  // avg4
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
index a980ab1a38..7751082083 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -13,91 +13,101 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst8, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
   (void)bd;
 
   if (w < 8) {  // copy4
+    uint16x4_t s0, s1;
     do {
-      vst1_u16(dst, vld1_u16(src));
+      s0 = vld1_u16(src);
       src += src_stride;
+      s1 = vld1_u16(src);
+      src += src_stride;
+
+      vst1_u16(dst, s0);
       dst += dst_stride;
-      vst1_u16(dst, vld1_u16(src));
-      src += src_stride;
+      vst1_u16(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // copy8
+    uint16x8_t s0, s1;
     do {
-      vst1q_u16(dst, vld1q_u16(src));
+      s0 = vld1q_u16(src);
       src += src_stride;
+      s1 = vld1q_u16(src);
+      src += src_stride;
+
+      vst1q_u16(dst, s0);
       dst += dst_stride;
-      vst1q_u16(dst, vld1q_u16(src));
-      src += src_stride;
+      vst1q_u16(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // copy16
+    uint16x8_t s0, s1, s2, s3;
     do {
-      vst2q_u16(dst, vld2q_u16(src));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
       src += src_stride;
-      dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
+      s2 = vld1q_u16(src);
+      s3 = vld1q_u16(src + 8);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
       dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
-      src += src_stride;
+      vst1q_u16(dst, s2);
+      vst1q_u16(dst + 8, s3);
       dst += dst_stride;
-      vst2q_u16(dst, vld2q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+      h -= 2;
+    } while (h != 0);
   } else if (w == 32) {  // copy32
+    uint16x8_t s0, s1, s2, s3;
     do {
-      vst4q_u16(dst, vld4q_u16(src));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
       dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+    } while (--h != 0);
   } else {  // copy64
+    uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
     do {
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
+      s0 = vld1q_u16(src);
+      s1 = vld1q_u16(src + 8);
+      s2 = vld1q_u16(src + 16);
+      s3 = vld1q_u16(src + 24);
+      s4 = vld1q_u16(src + 32);
+      s5 = vld1q_u16(src + 40);
+      s6 = vld1q_u16(src + 48);
+      s7 = vld1q_u16(src + 56);
       src += src_stride;
+
+      vst1q_u16(dst, s0);
+      vst1q_u16(dst + 8, s1);
+      vst1q_u16(dst + 16, s2);
+      vst1q_u16(dst + 24, s3);
+      vst1q_u16(dst + 32, s4);
+      vst1q_u16(dst + 40, s5);
+      vst1q_u16(dst + 48, s6);
+      vst1q_u16(dst + 56, s7);
       dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      vst4q_u16(dst, vld4q_u16(src));
-      vst4q_u16(dst + 32, vld4q_u16(src + 32));
-      src += src_stride;
-      dst += dst_stride;
-      h -= 4;
-    } while (h > 0);
+    } while (--h != 0);
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
deleted file mode 100644
index 4e6e109920..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
-#include "vpx_ports/mem.h"
-
-void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
-  // + 1 to make it divisible by 4
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  /* Filter starting 3 lines back. The neon implementation will ignore the given
-   * height and filter a multiple of 4 lines. Since this goes in to the temp
-   * buffer which has lots of extra room and is subsequently discarded this is
-   * safe if somewhat less than ideal.   */
-  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
-                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                  intermediate_height, bd);
-
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
-                                 dst_stride, filter_x, x_step_q4, filter_y,
-                                 y_step_q4, w, h, bd);
-}
-
-void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
-  // + 1 to make it divisible by 4
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
-                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                  intermediate_height, bd);
-  vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
-                                     dst_stride, filter_x, x_step_q4, filter_y,
-                                     y_step_q4, w, h, bd);
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
deleted file mode 100644
index d648840df4..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vpx_idct16x16_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int stride)
-
-|vpx_idct16x16_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; cospi_16_64 = 11585
-    movw             r12, #0x2d41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 6)
-    add              r0, r0, #32               ; + (1 <<((6) - 1))
-    asr              r0, r0, #6                ; >> 6
-
-    vdup.s16         q0, r0                    ; duplicate a1
-    mov              r0, #8
-    sub              r2, #8
-
-    ; load destination data row0 - row3
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row4 - row7
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row8 - row11
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    ; load destination data row12 - row15
-    vld1.64          {d2}, [r1], r0
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r0
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r0
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r0
-    vld1.64          {d17}, [r1], r2
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r0
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r0
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |vpx_idct16x16_1_add_neon|
-
-    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
index 968bc5cc3a..bf5192a683 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c
@@ -32,7 +32,8 @@ static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
 
 void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
-  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  const int16_t out0 =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
   const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
 
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
deleted file mode 100644
index ea6b099d3b..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
+++ /dev/null
@@ -1,1176 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    INCLUDE vpx_dsp/arm/idct_neon.asm.S
-
-    EXPORT  |vpx_idct16x16_256_add_neon_pass1|
-    EXPORT  |vpx_idct16x16_256_add_neon_pass2|
-    IF CONFIG_VP9_HIGHBITDEPTH
-    EXPORT  |vpx_idct16x16_256_add_neon_pass1_tran_low|
-    EXPORT  |vpx_idct16x16_256_add_neon_pass2_tran_low|
-    ENDIF
-    EXPORT  |vpx_idct16x16_10_add_neon_pass1|
-    EXPORT  |vpx_idct16x16_10_add_neon_pass2|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output)
-;
-; r0  const int16_t *input
-; r1  int16_t *output
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_256_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q1,q2}, [r0]!
-    vmov.s16        q15, q1
-
-idct16x16_256_add_neon_pass1
-    ; cospi_28_64 = 3196
-    movw            r3, #0x0c7c
-
-    ; cospi_4_64  = 16069
-    movw            r12, #0x3ec5
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r12                   ; duplicate cospi_4_64
-
-    ; preloading to avoid stall
-    ; cospi_12_64 = 13623
-    movw            r3, #0x3537
-
-    ; cospi_20_64 = 9102
-    movw            r12, #0x238e
-
-    ; step2[4] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; step2[4] * cospi_4_64
-    vmull.s16       q5, d18, d1
-    vmull.s16       q6, d19, d1
-
-    ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
-    vmlal.s16       q5, d30, d0
-    vmlal.s16       q6, d31, d0
-
-    vdup.16         d2, r3                    ; duplicate cospi_12_64
-    vdup.16         d3, r12                   ; duplicate cospi_20_64
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d8, q2, #14               ; >> 14
-    vrshrn.s32      d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d14, q5, #14              ; >> 14
-    vrshrn.s32      d15, q6, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; cospi_16_64 = 11585
-    movw            r3, #0x2d41
-
-    ; cospi_24_64 = 6270
-    movw            r12, #0x187e
-
-    ; step2[5] * cospi_12_64
-    vmull.s16       q2, d26, d2
-    vmull.s16       q3, d27, d2
-
-    ; step2[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q15, d27, d3
-
-    ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q2, d22, d3
-    vmlsl.s16       q3, d23, d3
-
-    ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q15, d23, d2
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d10, q2, #14              ; >> 14
-    vrshrn.s32      d11, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d12, q9, #14              ; >> 14
-    vrshrn.s32      d13, q15, #14             ; >> 14
-
-    ; stage 4
-    vdup.16         d30, r3                   ; cospi_16_64
-
-    ; step1[0] * cospi_16_64
-    vmull.s16       q2, d16, d30
-    vmull.s16       q11, d17, d30
-
-    ; step1[1] * cospi_16_64
-    vmull.s16       q0, d24, d30
-    vmull.s16       q1, d25, d30
-
-    ; cospi_8_64 = 15137
-    movw            r3, #0x3b21
-
-    vdup.16         d30, r12                  ; duplicate cospi_24_64
-    vdup.16         d31, r3                   ; duplicate cospi_8_64
-
-    ; temp1 = (step1[0] + step1[1]) * cospi_16_64
-    vadd.s32        q3, q2, q0
-    vadd.s32        q12, q11, q1
-
-    ; temp2 = (step1[0] - step1[1]) * cospi_16_64
-    vsub.s32        q13, q2, q0
-    vsub.s32        q1, q11, q1
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d16, q3, #14              ; >> 14
-    vrshrn.s32      d17, q12, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d18, q13, #14             ; >> 14
-    vrshrn.s32      d19, q1, #14              ; >> 14
-
-    ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-    ; step1[2] * cospi_8_64
-    vmull.s16       q0, d20, d31
-    vmull.s16       q1, d21, d31
-
-    ; step1[2] * cospi_24_64
-    vmull.s16       q12, d20, d30
-    vmull.s16       q13, d21, d30
-
-    ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q0, d28, d30
-    vmlal.s16       q1, d29, d30
-
-    ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q12, d28, d31
-    vmlsl.s16       q13, d29, d31
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d22, q0, #14              ; >> 14
-    vrshrn.s32      d23, q1, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d20, q12, #14             ; >> 14
-    vrshrn.s32      d21, q13, #14             ; >> 14
-
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5];
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5];
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
-    vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
-
-    ; cospi_16_64 = 11585
-    movw            r3, #0x2d41
-
-    ; stage 5
-    vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
-    vadd.s16        q1, q9, q10               ; step1[1] = step2[1] + step2[2];
-    vsub.s16        q2, q9, q10               ; step1[2] = step2[1] - step2[2];
-    vsub.s16        q3, q8, q11               ; step1[3] = step2[0] - step2[3];
-
-    vdup.16         d16, r3;                  ; duplicate cospi_16_64
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q11, d26, d16
-    vmull.s16       q12, d27, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q6, q9, q11
-    vsub.s32        q13, q10, q12
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d10, q6, #14              ; >> 14
-    vrshrn.s32      d11, q13, #14             ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d12, q9, #14              ; >> 14
-    vrshrn.s32      d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q8, q0, q15               ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; step2[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; step2[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q15              ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {q8-q9}, [r1]!
-    vst1.64         {q10-q11}, [r1]!
-    vst1.64         {q12-q13}, [r1]!
-    vst1.64         {q14-q15}, [r1]
-
-    bx              lr
-    ENDP  ; |vpx_idct16x16_256_add_neon_pass1|
-
-    IF CONFIG_VP9_HIGHBITDEPTH
-;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input,
-;                                                 int16_t *output)
-;
-; r0  const tran_low_t *input
-; r1  int16_t *output
-
-|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC
-    LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
-    LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
-    LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
-    LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
-    LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
-    LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
-    LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
-    LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
-    vmov.s16        q15, q1
-
-    b               idct16x16_256_add_neon_pass1
-    ENDP  ; |vpx_idct16x16_256_add_neon_pass1_tran_low|
-    ENDIF  ; CONFIG_VP9_HIGHBITDEPTH
-
-;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
-;                                      int16_t *output,
-;                                      int16_t *pass1_output,
-;                                      int16_t skip_adding,
-;                                      uint8_t *dest,
-;                                      int stride)
-;
-; r0  const int16_t *src
-; r1  int16_t *output
-; r2  int16_t *pass1_output
-; r3  int16_t skip_adding
-; r4  uint8_t *dest
-; r5  int stride
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_256_add_neon_pass2| PROC
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    vld2.s16        {q8,q9}, [r0]!
-    vld2.s16        {q9,q10}, [r0]!
-    vld2.s16        {q10,q11}, [r0]!
-    vld2.s16        {q11,q12}, [r0]!
-    vld2.s16        {q12,q13}, [r0]!
-    vld2.s16        {q13,q14}, [r0]!
-    vld2.s16        {q14,q15}, [r0]!
-    vld2.s16        {q0,q1}, [r0]!
-    vmov.s16        q15, q0;
-
-idct16x16_256_add_neon_pass2
-    push            {r3-r9}
-
-    ; cospi_30_64 = 1606
-    movw            r3, #0x0646
-
-    ; cospi_2_64  = 16305
-    movw            r12, #0x3fb1
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         d12, r3                   ; duplicate cospi_30_64
-    vdup.16         d13, r12                  ; duplicate cospi_2_64
-
-    ; preloading to avoid stall
-    ; cospi_14_64 = 12665
-    movw            r3, #0x3179
-
-    ; cospi_18_64 = 10394
-    movw            r12, #0x289a
-
-    ; step1[8] * cospi_30_64
-    vmull.s16       q2, d16, d12
-    vmull.s16       q3, d17, d12
-
-    ; step1[8] * cospi_2_64
-    vmull.s16       q1, d16, d13
-    vmull.s16       q4, d17, d13
-
-    ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
-    vmlsl.s16       q2, d30, d13
-    vmlsl.s16       q3, d31, d13
-
-    ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
-    vmlal.s16       q1, d30, d12
-    vmlal.s16       q4, d31, d12
-
-    vdup.16         d30, r3                   ; duplicate cospi_14_64
-    vdup.16         d31, r12                  ; duplicate cospi_18_64
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d0, q2, #14               ; >> 14
-    vrshrn.s32      d1, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d14, q1, #14              ; >> 14
-    vrshrn.s32      d15, q4, #14              ; >> 14
-
-    ; preloading to avoid stall
-    ; cospi_22_64 = 7723
-    movw            r3, #0x1e2b
-
-    ; cospi_10_64 = 14449
-    movw            r12, #0x3871
-
-    ; step1[9] * cospi_14_64
-    vmull.s16       q2, d24, d30
-    vmull.s16       q3, d25, d30
-
-    ; step1[9] * cospi_18_64
-    vmull.s16       q4, d24, d31
-    vmull.s16       q5, d25, d31
-
-    ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
-    vmlsl.s16       q2, d22, d31
-    vmlsl.s16       q3, d23, d31
-
-    ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
-    vmlal.s16       q4, d22, d30
-    vmlal.s16       q5, d23, d30
-
-    vdup.16         d30, r3                   ; duplicate cospi_22_64
-    vdup.16         d31, r12                  ; duplicate cospi_10_64
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d2, q2, #14               ; >> 14
-    vrshrn.s32      d3, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d12, q4, #14              ; >> 14
-    vrshrn.s32      d13, q5, #14              ; >> 14
-
-    ; step1[10] * cospi_22_64
-    vmull.s16       q11, d20, d30
-    vmull.s16       q12, d21, d30
-
-    ; step1[10] * cospi_10_64
-    vmull.s16       q4, d20, d31
-    vmull.s16       q5, d21, d31
-
-    ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
-    vmlsl.s16       q11, d26, d31
-    vmlsl.s16       q12, d27, d31
-
-    ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
-    vmlal.s16       q4, d26, d30
-    vmlal.s16       q5, d27, d30
-
-    ; preloading to avoid stall
-    ; cospi_6_64 = 15679
-    movw            r3, #0x3d3f
-
-    ; cospi_26_64 = 4756
-    movw            r12, #0x1294
-
-    vdup.16         d30, r3                   ; duplicate cospi_6_64
-    vdup.16         d31, r12                  ; duplicate cospi_26_64
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d4, q11, #14              ; >> 14
-    vrshrn.s32      d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d11, q5, #14              ; >> 14
-    vrshrn.s32      d10, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_6_64
-    vmull.s16       q10, d28, d30
-    vmull.s16       q11, d29, d30
-
-    ; step1[11] * cospi_26_64
-    vmull.s16       q12, d28, d31
-    vmull.s16       q13, d29, d31
-
-    ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
-    vmlsl.s16       q10, d18, d31
-    vmlsl.s16       q11, d19, d31
-
-    ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
-    vmlal.s16       q12, d18, d30
-    vmlal.s16       q13, d19, d30
-
-    vsub.s16        q9, q0, q1                ; step1[9]=step2[8]-step2[9]
-    vadd.s16        q0, q0, q1                ; step1[8]=step2[8]+step2[9]
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d6, q10, #14              ; >> 14
-    vrshrn.s32      d7, q11, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d8, q12, #14              ; >> 14
-    vrshrn.s32      d9, q13, #14              ; >> 14
-
-    ; stage 3
-    vsub.s16        q10, q3, q2               ; step1[10]=-step2[10]+step2[11]
-    vadd.s16        q11, q2, q3               ; step1[11]=step2[10]+step2[11]
-    vadd.s16        q12, q4, q5               ; step1[12]=step2[12]+step2[13]
-    vsub.s16        q13, q4, q5               ; step1[13]=step2[12]-step2[13]
-    vsub.s16        q14, q7, q6               ; step1[14]=-step2[14]+tep2[15]
-    vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
-
-    ; stage 4
-    ; cospi_24_64 = 6270
-    movw            r3, #0x187e
-
-    ; cospi_8_64 = 15137
-    movw            r12, #0x3b21
-
-    ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d18, d31
-    vmull.s16       q3, d19, d31
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q4, d28, d31
-    vmull.s16       q5, d29, d31
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d28, d30
-    vmlal.s16       q3, d29, d30
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q4, d18, d30
-    vmlsl.s16       q5, d19, d30
-
-    rsb             r12, #0
-    vdup.16         d30, r12                  ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d12, q2, #14              ; >> 14
-    vrshrn.s32      d13, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d2, q4, #14               ; >> 14
-    vrshrn.s32      d3, q5, #14               ; >> 14
-
-    vmov.s16        q3, q11
-    vmov.s16        q4, q12
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q11, d26, d30
-    vmull.s16       q12, d27, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d20, d30
-    vmull.s16       q9, d21, d30
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlsl.s16       q11, d20, d31
-    vmlsl.s16       q12, d21, d31
-
-    ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d26, d31
-    vmlal.s16       q9, d27, d31
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d4, q11, #14              ; >> 14
-    vrshrn.s32      d5, q12, #14              ; >> 14
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d10, q8, #14              ; >> 14
-    vrshrn.s32      d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; cospi_16_64 = 11585
-    movw            r12, #0x2d41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q10, q3, q0
-    vadd.s32        q4, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d4, q5, #14               ; >> 14
-    vrshrn.s32      d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d10, q10, #14             ; >> 14
-    vrshrn.s32      d11, q4, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d6, q10, #14              ; >> 14
-    vrshrn.s32      d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d8, q13, #14              ; >> 14
-    vrshrn.s32      d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1_output stride
-    ldr              r3, [sp]                 ; load skip_adding
-    cmp              r3, #0                   ; check if need adding dest data
-    beq              skip_adding_dest
-
-    ldr              r7, [sp, #28]            ; dest used to save element 0-7
-    mov              r9, r7                   ; save dest pointer for later use
-    ldr              r8, [sp, #32]            ; load stride
-
-    ; stage 7
-    ; load the data in pass1
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vrshr.s16       q12, q12, #6              ; ROUND_POWER_OF_TWO
-    vrshr.s16       q13, q13, #6              ; ROUND_POWER_OF_TWO
-    vaddw.u8        q12, q12, d12             ; + dest[j * stride + i]
-    vaddw.u8        q13, q13, d13             ; + dest[j * stride + i]
-    vqmovun.s16     d12, q12                  ; clip pixel
-    vqmovun.s16     d13, q13                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vrshr.s16       q8, q8, #6                ; ROUND_POWER_OF_TWO
-    vaddw.u8        q8, q8, d12               ; + dest[j * stride + i]
-    vqmovun.s16     d12, q8                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q9, q9, #6
-    vaddw.u8        q9, q9, d13               ; + dest[j * stride + i]
-    vqmovun.s16     d13, q9                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q2, q2, #6
-    vaddw.u8        q2, q2, d12               ; + dest[j * stride + i]
-    vqmovun.s16     d12, q2                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q3, q3, #6
-    vaddw.u8        q3, q3, d13               ; + dest[j * stride + i]
-    vqmovun.s16     d13, q3                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q4, q4, #6
-    vaddw.u8        q4, q4, d12               ; + dest[j * stride + i]
-    vqmovun.s16     d12, q4                   ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q5, q5, #6
-    vaddw.u8        q5, q5, d13               ; + dest[j * stride + i]
-    vqmovun.s16     d13, q5                   ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    vld1.64         {d13}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q14, q14, #6
-    vaddw.u8        q14, q14, d12             ; + dest[j * stride + i]
-    vqmovun.s16     d12, q14                  ; clip pixel
-    vst1.64         {d12}, [r9], r8           ; store the data
-    vld1.64         {d12}, [r7], r8           ; load destinatoin data
-    vrshr.s16       q15, q15, #6
-    vaddw.u8        q15, q15, d13             ; + dest[j * stride + i]
-    vqmovun.s16     d13, q15                  ; clip pixel
-    vst1.64         {d13}, [r9], r8           ; store the data
-    b               end_idct16x16_pass2
-
-skip_adding_dest
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |vpx_idct16x16_256_add_neon_pass2|
-
-    IF CONFIG_VP9_HIGHBITDEPTH
-;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
-;                                               int16_t *output,
-;                                               int16_t *pass1_output,
-;                                               int16_t skip_adding,
-;                                               uint8_t *dest,
-;                                               int stride)
-;
-; r0  const tran_low_t *src
-; r1  int16_t *output
-; r2  int16_t *pass1_output
-; r3  int16_t skip_adding
-; r4  uint8_t *dest
-; r5  int stride
-
-|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC
-    LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
-    LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
-    LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
-    LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
-    LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
-    LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
-    LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
-    LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
-    vmov.s16        q15, q0
-
-    b               idct16x16_256_add_neon_pass2
-    ENDP  ; |vpx_idct16x16_256_add_neon_pass2_tran_low|
-    ENDIF  ; CONFIG_VP9_HIGHBITDEPTH
-
-;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input,
-;                                       int16_t *output)
-;
-; r0  const tran_low_t *input
-; r1  int16_t *output
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_10_add_neon_pass1| PROC
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
-    LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
-    LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
-    LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
-    LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
-    LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
-    LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
-    LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
-    LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
-    vmov.s16        q15, q1
-
-    ; cospi_28_64*2 = 6392
-    movw            r3, #0x18f8
-
-    ; cospi_4_64*2  = 32138
-    movw            r12, #0x7d8a
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q0, r3                    ; duplicate cospi_28_64*2
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply,
-    ; double, and return the high 16 bits, effectively giving >> 15. Doubling
-    ; the constant will change this to >> 14.
-    ; dct_const_round_shift(step2[4] * cospi_28_64);
-    vqrdmulh.s16    q4, q9, q0
-
-    ; preloading to avoid stall
-    ; cospi_16_64*2 = 23170
-    movw            r3, #0x5a82
-
-    ; dct_const_round_shift(step2[4] * cospi_4_64);
-    vqrdmulh.s16    q7, q9, q1
-
-    ; stage 4
-    vdup.16         q1, r3                    ; cospi_16_64*2
-
-    ; cospi_16_64 = 11585
-    movw            r3, #0x2d41
-
-    vdup.16         d4, r3;                   ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(step1[0] * cospi_16_64)
-    vqrdmulh.s16    q8, q8, q1
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d14, d4
-    vmull.s16       q10, d15, d4
-
-    ; step2[5] * cospi_16_64
-    vmull.s16       q12, d9, d4
-    vmull.s16       q11, d8, d4
-
-    ; temp1 = (step2[6] - step2[5]) * cospi_16_64
-    vsub.s32        q15, q10, q12
-    vsub.s32        q6, q9, q11
-
-    ; temp2 = (step2[5] + step2[6]) * cospi_16_64
-    vadd.s32        q9, q9, q11
-    vadd.s32        q10, q10, q12
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d11, q15, #14             ; >> 14
-    vrshrn.s32      d10, q6, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d12, q9, #14              ; >> 14
-    vrshrn.s32      d13, q10, #14             ; >> 14
-
-    ; stage 6
-    vadd.s16        q2, q8, q7                ; step2[0] = step1[0] + step1[7];
-    vadd.s16        q10, q8, q5               ; step2[2] = step1[2] + step1[5];
-    vadd.s16        q11, q8, q4               ; step2[3] = step1[3] + step1[4];
-    vadd.s16        q9, q8, q6                ; step2[1] = step1[1] + step1[6];
-    vsub.s16        q12, q8, q4               ; step2[4] = step1[3] - step1[4];
-    vsub.s16        q13, q8, q5               ; step2[5] = step1[2] - step1[5];
-    vsub.s16        q14, q8, q6               ; step2[6] = step1[1] - step1[6];
-    vsub.s16        q15, q8, q7               ; step2[7] = step1[0] - step1[7];
-
-    ; store the data
-    vst1.64         {q2}, [r1]!
-    vst1.64         {q9-q10}, [r1]!
-    vst1.64         {q11-q12}, [r1]!
-    vst1.64         {q13-q14}, [r1]!
-    vst1.64         {q15}, [r1]
-
-    bx              lr
-    ENDP  ; |vpx_idct16x16_10_add_neon_pass1|
-
-;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
-;                                     int16_t *pass1_output)
-;
-; r0  const tran_low_t *src
-; r1  int16_t *output
-; r2  int16_t *pass1_output
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_10_add_neon_pass2| PROC
-    push            {r3-r9}
-
-    ; TODO(hkuang): Find a better way to load the elements.
-    ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
-    LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
-    LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
-    LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
-    LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
-    LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
-    LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
-    LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
-    LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
-    vmov.s16        q15, q0;
-
-    ; 2*cospi_30_64 = 3212
-    movw            r3, #0x0c8c
-
-    ; 2*cospi_2_64  = 32610
-    movw            r12, #0x7f62
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; stage 3
-    vdup.16         q6, r3                    ; duplicate 2*cospi_30_64
-
-    ; dct_const_round_shift(step1[8] * cospi_30_64)
-    vqrdmulh.s16    q0, q8, q6
-
-    vdup.16         q6, r12                   ; duplicate 2*cospi_2_64
-
-    ; dct_const_round_shift(step1[8] * cospi_2_64)
-    vqrdmulh.s16    q7, q8, q6
-
-    ; preloading to avoid stall
-    ; 2*cospi_26_64 = 9512
-    movw            r12, #0x2528
-    rsb             r12, #0
-    vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
-
-    ; 2*cospi_6_64 = 31358
-    movw            r3, #0x7a7e
-    vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
-
-    ; dct_const_round_shift(- step1[12] * cospi_26_64)
-    vqrdmulh.s16    q3, q9, q15
-
-    ; dct_const_round_shift(step1[12] * cospi_6_64)
-    vqrdmulh.s16    q4, q9, q14
-
-    ; stage 4
-    ; cospi_24_64 = 6270
-    movw            r3, #0x187e
-    vdup.16         d31, r3                   ; duplicate cospi_24_64
-
-    ; cospi_8_64 = 15137
-    movw            r12, #0x3b21
-    vdup.16         d30, r12                  ; duplicate cospi_8_64
-
-    ; step1[14] * cospi_24_64
-    vmull.s16       q12, d14, d31
-    vmull.s16       q5, d15, d31
-
-    ; step1[9] * cospi_24_64
-    vmull.s16       q2, d0, d31
-    vmull.s16       q11, d1, d31
-
-    ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
-    vmlsl.s16       q12, d0, d30
-    vmlsl.s16       q5, d1, d30
-
-    ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
-    vmlal.s16       q2, d14, d30
-    vmlal.s16       q11, d15, d30
-
-    rsb              r12, #0
-    vdup.16          d30, r12                 ; duplicate -cospi_8_64
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d2, q12, #14              ; >> 14
-    vrshrn.s32      d3, q5, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d12, q2, #14              ; >> 14
-    vrshrn.s32      d13, q11, #14             ; >> 14
-
-    ; - step1[13] * cospi_8_64
-    vmull.s16       q10, d8, d30
-    vmull.s16       q13, d9, d30
-
-    ; -step1[10] * cospi_8_64
-    vmull.s16       q8, d6, d30
-    vmull.s16       q9, d7, d30
-
-    ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
-    vmlsl.s16       q10, d6, d31
-    vmlsl.s16       q13, d7, d31
-
-    ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
-    vmlal.s16       q8, d8, d31
-    vmlal.s16       q9, d9, d31
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d4, q10, #14              ; >> 14
-    vrshrn.s32      d5, q13, #14              ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d10, q8, #14              ; >> 14
-    vrshrn.s32      d11, q9, #14              ; >> 14
-
-    ; stage 5
-    vadd.s16        q8, q0, q3                ; step1[8] = step2[8]+step2[11];
-    vadd.s16        q9, q1, q2                ; step1[9] = step2[9]+step2[10];
-    vsub.s16        q10, q1, q2               ; step1[10] = step2[9]-step2[10];
-    vsub.s16        q11, q0, q3               ; step1[11] = step2[8]-step2[11];
-    vsub.s16        q12, q7, q4               ; step1[12] =-step2[12]+step2[15];
-    vsub.s16        q13, q6, q5               ; step1[13] =-step2[13]+step2[14];
-    vadd.s16        q14, q6, q5               ; step1[14] =step2[13]+step2[14];
-    vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
-
-    ; stage 6.
-    ; cospi_16_64 = 11585
-    movw            r12, #0x2d41
-
-    vdup.16         d14, r12                  ; duplicate cospi_16_64
-
-    ; step1[13] * cospi_16_64
-    vmull.s16       q3, d26, d14
-    vmull.s16       q4, d27, d14
-
-    ; step1[10] * cospi_16_64
-    vmull.s16       q0, d20, d14
-    vmull.s16       q1, d21, d14
-
-    ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
-    vsub.s32        q5, q3, q0
-    vsub.s32        q6, q4, q1
-
-    ; temp2 = (step1[10] + step1[13]) * cospi_16_64
-    vadd.s32        q0, q3, q0
-    vadd.s32        q1, q4, q1
-
-    ; dct_const_round_shift(temp1)
-    vrshrn.s32      d4, q5, #14               ; >> 14
-    vrshrn.s32      d5, q6, #14               ; >> 14
-
-    ; dct_const_round_shift(temp2)
-    vrshrn.s32      d10, q0, #14              ; >> 14
-    vrshrn.s32      d11, q1, #14              ; >> 14
-
-    ; step1[11] * cospi_16_64
-    vmull.s16       q0, d22, d14
-    vmull.s16       q1, d23, d14
-
-    ; step1[12] * cospi_16_64
-    vmull.s16       q13, d24, d14
-    vmull.s16       q6, d25, d14
-
-    ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
-    vsub.s32        q10, q13, q0
-    vsub.s32        q4, q6, q1
-
-    ; temp2 = (step1[11] + step1[12]) * cospi_16_64
-    vadd.s32        q13, q13, q0
-    vadd.s32        q6, q6, q1
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d6, q10, #14              ; >> 14
-    vrshrn.s32      d7, q4, #14               ; >> 14
-
-    ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
-    vrshrn.s32      d8, q13, #14              ; >> 14
-    vrshrn.s32      d9, q6, #14               ; >> 14
-
-    mov              r4, #16                  ; pass1_output stride
-    ldr              r3, [sp]                 ; load skip_adding
-
-    ; stage 7
-    ; load the data in pass1
-    mov              r5, #24
-    mov              r3, #8
-
-    vld1.s16        {q0}, [r2], r4            ; load data step2[0]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[1]
-    vadd.s16        q12, q0, q15              ; step2[0] + step2[15]
-    vadd.s16        q13, q1, q14              ; step2[1] + step2[14]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[2]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[3]
-    vst1.64         {d24}, [r1], r3           ; store output[0]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[1]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q5              ; step2[2] + step2[13]
-    vadd.s16        q13, q11, q4              ; step2[3] + step2[12]
-    vsub.s16        q14, q1, q14              ; step2[1] - step2[14]
-    vsub.s16        q15, q0, q15              ; step2[0] - step2[15]
-    vst1.64         {d24}, [r1], r3           ; store output[2]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[3]
-    vst1.64         {d27}, [r1], r5
-    vsub.s16        q4, q11, q4               ; step2[3] - step2[12]
-    vsub.s16        q5, q10, q5               ; step2[2] - step2[13]
-    vld1.s16        {q0}, [r2], r4            ; load data step2[4]
-    vld1.s16        {q1}, [r2], r4            ; load data step2[5]
-    vadd.s16        q12, q0, q3               ; step2[4] + step2[11]
-    vadd.s16        q13, q1, q2               ; step2[5] + step2[10]
-    vld1.s16        {q10}, [r2], r4           ; load data step2[6]
-    vld1.s16        {q11}, [r2], r4           ; load data step2[7]
-    vst1.64         {d24}, [r1], r3           ; store output[4]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[5]
-    vst1.64         {d27}, [r1], r5
-    vadd.s16        q12, q10, q9              ; step2[6] + step2[9]
-    vadd.s16        q13, q11, q8              ; step2[7] + step2[8]
-    vsub.s16        q2, q1, q2                ; step2[5] - step2[10]
-    vsub.s16        q3, q0, q3                ; step2[4] - step2[11]
-    vsub.s16        q8, q11, q8               ; step2[7] - step2[8]
-    vsub.s16        q9, q10, q9               ; step2[6] - step2[9]
-    vst1.64         {d24}, [r1], r3           ; store output[6]
-    vst1.64         {d25}, [r1], r5
-    vst1.64         {d26}, [r1], r3           ; store output[7]
-    vst1.64         {d27}, [r1], r5
-
-    ; store the data  output 8,9,10,11,12,13,14,15
-    vst1.64         {d16}, [r1], r3
-    vst1.64         {d17}, [r1], r5
-    vst1.64         {d18}, [r1], r3
-    vst1.64         {d19}, [r1], r5
-    vst1.64         {d4}, [r1], r3
-    vst1.64         {d5}, [r1], r5
-    vst1.64         {d6}, [r1], r3
-    vst1.64         {d7}, [r1], r5
-    vst1.64         {d8}, [r1], r3
-    vst1.64         {d9}, [r1], r5
-    vst1.64         {d10}, [r1], r3
-    vst1.64         {d11}, [r1], r5
-    vst1.64         {d28}, [r1], r3
-    vst1.64         {d29}, [r1], r5
-    vst1.64         {d30}, [r1], r3
-    vst1.64         {d31}, [r1], r5
-end_idct10_16x16_pass2
-    pop             {r3-r9}
-    bx              lr
-    ENDP  ; |vpx_idct16x16_10_add_neon_pass2|
-    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
index f4eb24615e..fc7f4a7747 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -10,1255 +10,755 @@
 
 #include <arm_neon.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
-static void idct16x16_256_add_neon_pass1(const int16x8_t s0, const int16x8_t s1,
-                                         const int16x8_t s2, const int16x8_t s3,
-                                         const int16x8_t s4, const int16x8_t s5,
-                                         const int16x8_t s6, const int16x8_t s7,
-                                         int16_t *out) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
+                                int16x4_t *const d1) {
+  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+}
 
-  q8s16 = s0;
-  q9s16 = s1;
-  q10s16 = s2;
-  q11s16 = s3;
-  q12s16 = s4;
-  q13s16 = s5;
-  q14s16 = s6;
-  q15s16 = s7;
+static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
+                                            const int16x4_t s1,
+                                            const int16x4_t cospi_0_8_16_24,
+                                            int32x4_t *const t32) {
+  t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
+  t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
+  t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
+}
 
-  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                    &q15s16);
+static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
+                                     const int16x4_t cospi_0_8_16_24,
+                                     int16x4_t *const d0, int16x4_t *const d1) {
+  int32x4_t t32[2];
 
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-  d30s16 = vget_low_s16(q15s16);
-  d31s16 = vget_high_s16(q15s16);
+  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+  wrap_low_4x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
+                                         const int16x4_t cospi_0_8_16_24,
+                                         int16x4_t *const d0,
+                                         int16x4_t *const d1) {
+  int32x4_t t32[2];
+
+  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
+  t32[1] = vnegq_s32(t32[1]);
+  wrap_low_4x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t cospi_0_8_16_24,
+                                      int16x4_t *const d0,
+                                      int16x4_t *const d1) {
+  int32x4_t t32[3];
+
+  t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
+  t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+  t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
+  wrap_low_4x2(t32, d0, d1);
+}
+
+void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
+                                  void *const dest, const int stride,
+                                  const int highbd_flag) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
+  const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
+  const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
+  int16x8_t in[16], step1[16], step2[16], out[16];
+
+  // Load input (16x8)
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[8] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[9] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[10] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[11] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[12] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[13] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[14] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[7] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[15] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 8;
+    in[8] = vld1q_s16(inputT);
+    inputT += 8;
+    in[1] = vld1q_s16(inputT);
+    inputT += 8;
+    in[9] = vld1q_s16(inputT);
+    inputT += 8;
+    in[2] = vld1q_s16(inputT);
+    inputT += 8;
+    in[10] = vld1q_s16(inputT);
+    inputT += 8;
+    in[3] = vld1q_s16(inputT);
+    inputT += 8;
+    in[11] = vld1q_s16(inputT);
+    inputT += 8;
+    in[4] = vld1q_s16(inputT);
+    inputT += 8;
+    in[12] = vld1q_s16(inputT);
+    inputT += 8;
+    in[5] = vld1q_s16(inputT);
+    inputT += 8;
+    in[13] = vld1q_s16(inputT);
+    inputT += 8;
+    in[6] = vld1q_s16(inputT);
+    inputT += 8;
+    in[14] = vld1q_s16(inputT);
+    inputT += 8;
+    in[7] = vld1q_s16(inputT);
+    inputT += 8;
+    in[15] = vld1q_s16(inputT);
+  }
+
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[1] = in[16 / 2];
+  step1[2] = in[8 / 2];
+  step1[3] = in[24 / 2];
+  step1[4] = in[4 / 2];
+  step1[5] = in[20 / 2];
+  step1[6] = in[12 / 2];
+  step1[7] = in[28 / 2];
+  step1[8] = in[2 / 2];
+  step1[9] = in[18 / 2];
+  step1[10] = in[10 / 2];
+  step1[11] = in[26 / 2];
+  step1[12] = in[6 / 2];
+  step1[13] = in[22 / 2];
+  step1[14] = in[14 / 2];
+  step1[15] = in[30 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+  idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
+  idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
+                   &step2[14]);
+  idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+                   &step2[13]);
+  idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
+                  &step2[12]);
 
   // stage 3
-  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
-  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
-
-  q2s32 = vmull_s16(d18s16, d0s16);
-  q3s32 = vmull_s16(d19s16, d0s16);
-  q5s32 = vmull_s16(d18s16, d1s16);
-  q6s32 = vmull_s16(d19s16, d1s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
-  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
-  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
-
-  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
-  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
-
-  d8s16 = vrshrn_n_s32(q2s32, 14);
-  d9s16 = vrshrn_n_s32(q3s32, 14);
-  d14s16 = vrshrn_n_s32(q5s32, 14);
-  d15s16 = vrshrn_n_s32(q6s32, 14);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  q2s32 = vmull_s16(d26s16, d2s16);
-  q3s32 = vmull_s16(d27s16, d2s16);
-  q9s32 = vmull_s16(d26s16, d3s16);
-  q15s32 = vmull_s16(d27s16, d3s16);
-
-  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
-  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
-  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
-  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
-
-  d10s16 = vrshrn_n_s32(q2s32, 14);
-  d11s16 = vrshrn_n_s32(q3s32, 14);
-  d12s16 = vrshrn_n_s32(q9s32, 14);
-  d13s16 = vrshrn_n_s32(q15s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
+  idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
+  step1[8] = vaddq_s16(step2[8], step2[9]);
+  step1[9] = vsubq_s16(step2[8], step2[9]);
+  step1[10] = vsubq_s16(step2[11], step2[10]);
+  step1[11] = vaddq_s16(step2[11], step2[10]);
+  step1[12] = vaddq_s16(step2[12], step2[13]);
+  step1[13] = vsubq_s16(step2[12], step2[13]);
+  step1[14] = vsubq_s16(step2[15], step2[14]);
+  step1[15] = vaddq_s16(step2[15], step2[14]);
 
   // stage 4
-  d30s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q2s32 = vmull_s16(d16s16, d30s16);
-  q11s32 = vmull_s16(d17s16, d30s16);
-  q0s32 = vmull_s16(d24s16, d30s16);
-  q1s32 = vmull_s16(d25s16, d30s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_24_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_8_64);
-
-  q3s32 = vaddq_s32(q2s32, q0s32);
-  q12s32 = vaddq_s32(q11s32, q1s32);
-  q13s32 = vsubq_s32(q2s32, q0s32);
-  q1s32 = vsubq_s32(q11s32, q1s32);
-
-  d16s16 = vrshrn_n_s32(q3s32, 14);
-  d17s16 = vrshrn_n_s32(q12s32, 14);
-  d18s16 = vrshrn_n_s32(q13s32, 14);
-  d19s16 = vrshrn_n_s32(q1s32, 14);
-  q8s16 = vcombine_s16(d16s16, d17s16);
-  q9s16 = vcombine_s16(d18s16, d19s16);
-
-  q0s32 = vmull_s16(d20s16, d31s16);
-  q1s32 = vmull_s16(d21s16, d31s16);
-  q12s32 = vmull_s16(d20s16, d30s16);
-  q13s32 = vmull_s16(d21s16, d30s16);
-
-  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
-  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
-  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
-  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
-
-  d22s16 = vrshrn_n_s32(q0s32, 14);
-  d23s16 = vrshrn_n_s32(q1s32, 14);
-  d20s16 = vrshrn_n_s32(q12s32, 14);
-  d21s16 = vrshrn_n_s32(q13s32, 14);
-  q10s16 = vcombine_s16(d20s16, d21s16);
-  q11s16 = vcombine_s16(d22s16, d23s16);
-
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q4s16 = vaddq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q15s16 = vaddq_s16(q6s16, q7s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
+  idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
+  idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
+  step2[4] = vaddq_s16(step1[4], step1[5]);
+  step2[5] = vsubq_s16(step1[4], step1[5]);
+  step2[6] = vsubq_s16(step1[7], step1[6]);
+  step2[7] = vaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
 
   // stage 5
-  q0s16 = vaddq_s16(q8s16, q11s16);
-  q1s16 = vaddq_s16(q9s16, q10s16);
-  q2s16 = vsubq_s16(q9s16, q10s16);
-  q3s16 = vsubq_s16(q8s16, q11s16);
-
-  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q11s32 = vmull_s16(d26s16, d16s16);
-  q12s32 = vmull_s16(d27s16, d16s16);
-  q9s32 = vmull_s16(d28s16, d16s16);
-  q10s32 = vmull_s16(d29s16, d16s16);
-
-  q6s32 = vsubq_s32(q9s32, q11s32);
-  q13s32 = vsubq_s32(q10s32, q12s32);
-  q9s32 = vaddq_s32(q9s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q12s32);
-
-  d10s16 = vrshrn_n_s32(q6s32, 14);
-  d11s16 = vrshrn_n_s32(q13s32, 14);
-  d12s16 = vrshrn_n_s32(q9s32, 14);
-  d13s16 = vrshrn_n_s32(q10s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
+  step1[0] = vaddq_s16(step2[0], step2[3]);
+  step1[1] = vaddq_s16(step2[1], step2[2]);
+  step1[2] = vsubq_s16(step2[1], step2[2]);
+  step1[3] = vsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s16(step2[8], step2[11]);
+  step1[9] = vaddq_s16(step2[9], step2[10]);
+  step1[10] = vsubq_s16(step2[9], step2[10]);
+  step1[11] = vsubq_s16(step2[8], step2[11]);
+  step1[12] = vsubq_s16(step2[15], step2[12]);
+  step1[13] = vsubq_s16(step2[14], step2[13]);
+  step1[14] = vaddq_s16(step2[14], step2[13]);
+  step1[15] = vaddq_s16(step2[15], step2[12]);
 
   // stage 6
-  q8s16 = vaddq_s16(q0s16, q15s16);
-  q9s16 = vaddq_s16(q1s16, q6s16);
-  q10s16 = vaddq_s16(q2s16, q5s16);
-  q11s16 = vaddq_s16(q3s16, q4s16);
-  q12s16 = vsubq_s16(q3s16, q4s16);
-  q13s16 = vsubq_s16(q2s16, q5s16);
-  q14s16 = vsubq_s16(q1s16, q6s16);
-  q15s16 = vsubq_s16(q0s16, q15s16);
-
-  // store the data
-  vst1q_s16(out, q8s16);
-  out += 8;
-  vst1q_s16(out, q9s16);
-  out += 8;
-  vst1q_s16(out, q10s16);
-  out += 8;
-  vst1q_s16(out, q11s16);
-  out += 8;
-  vst1q_s16(out, q12s16);
-  out += 8;
-  vst1q_s16(out, q13s16);
-  out += 8;
-  vst1q_s16(out, q14s16);
-  out += 8;
-  vst1q_s16(out, q15s16);
-}
-
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
-  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-  int16x8x2_t v;
-
-  v = vld2q_s16(in);
-  s0 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s1 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s2 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s3 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s4 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s5 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s6 = v.val[0];
-  in += 16;
-  v = vld2q_s16(in);
-  s7 = v.val[0];
-
-  idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *in,
-                                               int16_t *out) {
-  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-  int16x8x2_t v;
-
-  v = load_tran_low_to_s16x2q(in);
-  s0 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s1 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s2 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s3 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s4 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s5 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s6 = v.val[0];
-  in += 16;
-  v = load_tran_low_to_s16x2q(in);
-  s7 = v.val[0];
-
-  idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static void idct16x16_256_add_neon_pass2(const int16x8_t s0, const int16x8_t s1,
-                                         const int16x8_t s2, const int16x8_t s3,
-                                         const int16x8_t s4, const int16x8_t s5,
-                                         const int16x8_t s6, const int16x8_t s7,
-                                         int16_t *out, int16_t *pass1_output,
-                                         int16_t skip_adding, uint8_t *dest,
-                                         int stride) {
-  uint8_t *d;
-  uint8x8_t d12u8, d13u8;
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64;
-  int64x1_t d12s64, d13s64;
-  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
-  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32;
-
-  q8s16 = s0;
-  q9s16 = s1;
-  q10s16 = s2;
-  q11s16 = s3;
-  q12s16 = s4;
-  q13s16 = s5;
-  q14s16 = s6;
-  q15s16 = s7;
-
-  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                    &q15s16);
-
-  d16s16 = vget_low_s16(q8s16);
-  d17s16 = vget_high_s16(q8s16);
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-  d30s16 = vget_low_s16(q15s16);
-  d31s16 = vget_high_s16(q15s16);
-
-  // stage 3
-  d12s16 = vdup_n_s16((int16_t)cospi_30_64);
-  d13s16 = vdup_n_s16((int16_t)cospi_2_64);
-
-  q2s32 = vmull_s16(d16s16, d12s16);
-  q3s32 = vmull_s16(d17s16, d12s16);
-  q1s32 = vmull_s16(d16s16, d13s16);
-  q4s32 = vmull_s16(d17s16, d13s16);
-
-  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
-  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
-  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
-  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
-
-  d0s16 = vrshrn_n_s32(q2s32, 14);
-  d1s16 = vrshrn_n_s32(q3s32, 14);
-  d14s16 = vrshrn_n_s32(q1s32, 14);
-  d15s16 = vrshrn_n_s32(q4s32, 14);
-  q0s16 = vcombine_s16(d0s16, d1s16);
-  q7s16 = vcombine_s16(d14s16, d15s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_14_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_18_64);
-
-  q2s32 = vmull_s16(d24s16, d30s16);
-  q3s32 = vmull_s16(d25s16, d30s16);
-  q4s32 = vmull_s16(d24s16, d31s16);
-  q5s32 = vmull_s16(d25s16, d31s16);
-
-  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
-  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
-  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
-  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
-
-  d2s16 = vrshrn_n_s32(q2s32, 14);
-  d3s16 = vrshrn_n_s32(q3s32, 14);
-  d12s16 = vrshrn_n_s32(q4s32, 14);
-  d13s16 = vrshrn_n_s32(q5s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_22_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_10_64);
-
-  q11s32 = vmull_s16(d20s16, d30s16);
-  q12s32 = vmull_s16(d21s16, d30s16);
-  q4s32 = vmull_s16(d20s16, d31s16);
-  q5s32 = vmull_s16(d21s16, d31s16);
-
-  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
-  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
-  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
-  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
-
-  d4s16 = vrshrn_n_s32(q11s32, 14);
-  d5s16 = vrshrn_n_s32(q12s32, 14);
-  d11s16 = vrshrn_n_s32(q5s32, 14);
-  d10s16 = vrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_6_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_26_64);
-
-  q10s32 = vmull_s16(d28s16, d30s16);
-  q11s32 = vmull_s16(d29s16, d30s16);
-  q12s32 = vmull_s16(d28s16, d31s16);
-  q13s32 = vmull_s16(d29s16, d31s16);
-
-  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
-  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
-  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
-  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
-
-  d6s16 = vrshrn_n_s32(q10s32, 14);
-  d7s16 = vrshrn_n_s32(q11s32, 14);
-  d8s16 = vrshrn_n_s32(q12s32, 14);
-  d9s16 = vrshrn_n_s32(q13s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
-
-  // stage 3
-  q9s16 = vsubq_s16(q0s16, q1s16);
-  q0s16 = vaddq_s16(q0s16, q1s16);
-  q10s16 = vsubq_s16(q3s16, q2s16);
-  q11s16 = vaddq_s16(q2s16, q3s16);
-  q12s16 = vaddq_s16(q4s16, q5s16);
-  q13s16 = vsubq_s16(q4s16, q5s16);
-  q14s16 = vsubq_s16(q7s16, q6s16);
-  q7s16 = vaddq_s16(q6s16, q7s16);
-
-  // stage 4
-  d18s16 = vget_low_s16(q9s16);
-  d19s16 = vget_high_s16(q9s16);
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-  d28s16 = vget_low_s16(q14s16);
-  d29s16 = vget_high_s16(q14s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  q2s32 = vmull_s16(d18s16, d31s16);
-  q3s32 = vmull_s16(d19s16, d31s16);
-  q4s32 = vmull_s16(d28s16, d31s16);
-  q5s32 = vmull_s16(d29s16, d31s16);
-
-  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
-  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
-  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
-  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
-
-  d12s16 = vrshrn_n_s32(q2s32, 14);
-  d13s16 = vrshrn_n_s32(q3s32, 14);
-  d2s16 = vrshrn_n_s32(q4s32, 14);
-  d3s16 = vrshrn_n_s32(q5s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  q3s16 = q11s16;
-  q4s16 = q12s16;
-
-  d30s16 = vdup_n_s16(-cospi_8_64);
-  q11s32 = vmull_s16(d26s16, d30s16);
-  q12s32 = vmull_s16(d27s16, d30s16);
-  q8s32 = vmull_s16(d20s16, d30s16);
-  q9s32 = vmull_s16(d21s16, d30s16);
-
-  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
-  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
-  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
-  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
-
-  d4s16 = vrshrn_n_s32(q11s32, 14);
-  d5s16 = vrshrn_n_s32(q12s32, 14);
-  d10s16 = vrshrn_n_s32(q8s32, 14);
-  d11s16 = vrshrn_n_s32(q9s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  // stage 5
-  q8s16 = vaddq_s16(q0s16, q3s16);
-  q9s16 = vaddq_s16(q1s16, q2s16);
-  q10s16 = vsubq_s16(q1s16, q2s16);
-  q11s16 = vsubq_s16(q0s16, q3s16);
-  q12s16 = vsubq_s16(q7s16, q4s16);
-  q13s16 = vsubq_s16(q6s16, q5s16);
-  q14s16 = vaddq_s16(q6s16, q5s16);
-  q15s16 = vaddq_s16(q7s16, q4s16);
-
-  // stage 6
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q3s32 = vmull_s16(d26s16, d14s16);
-  q4s32 = vmull_s16(d27s16, d14s16);
-  q0s32 = vmull_s16(d20s16, d14s16);
-  q1s32 = vmull_s16(d21s16, d14s16);
-
-  q5s32 = vsubq_s32(q3s32, q0s32);
-  q6s32 = vsubq_s32(q4s32, q1s32);
-  q10s32 = vaddq_s32(q3s32, q0s32);
-  q4s32 = vaddq_s32(q4s32, q1s32);
-
-  d4s16 = vrshrn_n_s32(q5s32, 14);
-  d5s16 = vrshrn_n_s32(q6s32, 14);
-  d10s16 = vrshrn_n_s32(q10s32, 14);
-  d11s16 = vrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q0s32 = vmull_s16(d22s16, d14s16);
-  q1s32 = vmull_s16(d23s16, d14s16);
-  q13s32 = vmull_s16(d24s16, d14s16);
-  q6s32 = vmull_s16(d25s16, d14s16);
-
-  q10s32 = vsubq_s32(q13s32, q0s32);
-  q4s32 = vsubq_s32(q6s32, q1s32);
-  q13s32 = vaddq_s32(q13s32, q0s32);
-  q6s32 = vaddq_s32(q6s32, q1s32);
-
-  d6s16 = vrshrn_n_s32(q10s32, 14);
-  d7s16 = vrshrn_n_s32(q4s32, 14);
-  d8s16 = vrshrn_n_s32(q13s32, 14);
-  d9s16 = vrshrn_n_s32(q6s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
+  step2[0] = vaddq_s16(step1[0], step1[7]);
+  step2[1] = vaddq_s16(step1[1], step1[6]);
+  step2[2] = vaddq_s16(step1[2], step1[5]);
+  step2[3] = vaddq_s16(step1[3], step1[4]);
+  step2[4] = vsubq_s16(step1[3], step1[4]);
+  step2[5] = vsubq_s16(step1[2], step1[5]);
+  step2[6] = vsubq_s16(step1[1], step1[6]);
+  step2[7] = vsubq_s16(step1[0], step1[7]);
+  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
 
   // stage 7
-  if (skip_adding != 0) {
-    d = dest;
-    // load the data in pass1
-    q0s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q1s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
+  idct16x16_add_stage7(step2, out);
 
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += stride;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q11s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += stride;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q1s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += stride;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q11s16 = vld1q_s16(pass1_output);
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    d13s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    q12s16 = vrshrq_n_s16(q12s16, 6);
-    q13s16 = vrshrq_n_s16(q13s16, 6);
-    q12u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
-    q13u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
-    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
-    d += stride;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    // store the data  out 8,9,10,11,12,13,14,15
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q8s16 = vrshrq_n_s16(q8s16, 6);
-    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q9s16 = vrshrq_n_s16(q9s16, 6);
-    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q2s16 = vrshrq_n_s16(q2s16, 6);
-    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q3s16 = vrshrq_n_s16(q3s16, 6);
-    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q4s16 = vrshrq_n_s16(q4s16, 6);
-    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q5s16 = vrshrq_n_s16(q5s16, 6);
-    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    dest += stride;
-    q14s16 = vrshrq_n_s16(q14s16, 6);
-    q14u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-    d += stride;
-
-    d12s64 = vld1_s64((int64_t *)dest);
-    q15s16 = vrshrq_n_s16(q15s16, 6);
-    q15u16 =
-        vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
-    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
-    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
-  } else {  // skip_adding_dest
-    q0s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q1s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q12s16 = vaddq_s16(q0s16, q15s16);
-    q13s16 = vaddq_s16(q1s16, q14s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q14s16 = vsubq_s16(q1s16, q14s16);
-    q15s16 = vsubq_s16(q0s16, q15s16);
-
-    q10s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q11s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q12s16 = vaddq_s16(q10s16, q5s16);
-    q13s16 = vaddq_s16(q11s16, q4s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q4s16 = vsubq_s16(q11s16, q4s16);
-    q5s16 = vsubq_s16(q10s16, q5s16);
-
-    q0s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q1s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q12s16 = vaddq_s16(q0s16, q3s16);
-    q13s16 = vaddq_s16(q1s16, q2s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q2s16 = vsubq_s16(q1s16, q2s16);
-    q3s16 = vsubq_s16(q0s16, q3s16);
-
-    q10s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q11s16 = vld1q_s16(pass1_output);
-    pass1_output += 8;
-    q12s16 = vaddq_s16(q10s16, q9s16);
-    q13s16 = vaddq_s16(q11s16, q8s16);
-    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-    vst1_u64((uint64_t *)out, d24u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d25u64);
-    out += 12;
-    vst1_u64((uint64_t *)out, d26u64);
-    out += 4;
-    vst1_u64((uint64_t *)out, d27u64);
-    out += 12;
-    q8s16 = vsubq_s16(q11s16, q8s16);
-    q9s16 = vsubq_s16(q10s16, q9s16);
-
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
-    out += 12;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
-    out += 4;
-    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
   }
 }
 
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
-                                      int16_t *pass1_output,
-                                      int16_t skip_adding, uint8_t *dest,
-                                      int stride) {
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int16x8x2_t q0x2s16;
+void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+                                 void *const dest, const int stride,
+                                 const int highbd_flag) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+  int16x8_t in[8], step1[16], step2[16], out[16];
 
-  q0x2s16 = vld2q_s16(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q15s16 = q0x2s16.val[0];
+  // Load input (8x8)
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 16;
+    in[7] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 16;
+    in[1] = vld1q_s16(inputT);
+    inputT += 16;
+    in[2] = vld1q_s16(inputT);
+    inputT += 16;
+    in[3] = vld1q_s16(inputT);
+    inputT += 16;
+    in[4] = vld1q_s16(inputT);
+    inputT += 16;
+    in[5] = vld1q_s16(inputT);
+    inputT += 16;
+    in[6] = vld1q_s16(inputT);
+    inputT += 16;
+    in[7] = vld1q_s16(inputT);
+  }
 
-  idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
-                               q14s16, q15s16, out, pass1_output, skip_adding,
-                               dest, stride);
-}
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
-                                               int16_t *out,
-                                               int16_t *pass1_output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest, int stride) {
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int16x8x2_t q0x2s16;
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[2] = in[8 / 2];
+  step1[4] = in[4 / 2];
+  step1[6] = in[12 / 2];
+  step1[8] = in[2 / 2];
+  step1[10] = in[10 / 2];
+  step1[12] = in[6 / 2];
+  step1[14] = in[14 / 2];  // 0 in pass 1
 
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q15s16 = q0x2s16.val[0];
-
-  idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
-                               q14s16, q15s16, out, pass1_output, skip_adding,
-                               dest, stride);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) {
-  int16x4_t d4s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q6s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q15s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q8s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q9s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q10s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q11s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q12s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q13s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q14s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(in);
-  q15s16 = q0x2s16.val[0];
-
-  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                    &q15s16);
+  // stage 2
+  step2[0] = step1[0];
+  step2[2] = step1[2];
+  step2[4] = step1[4];
+  step2[6] = step1[6];
+  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+  step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
+  step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
+  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+  step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
+  step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
+  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
 
   // stage 3
-  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
-  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
-
-  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
-  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+  step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
+  step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
+  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+  step1[8] = vaddq_s16(step2[8], step2[9]);
+  step1[9] = vsubq_s16(step2[8], step2[9]);
+  step1[10] = vsubq_s16(step2[11], step2[10]);
+  step1[11] = vaddq_s16(step2[11], step2[10]);
+  step1[12] = vaddq_s16(step2[12], step2[13]);
+  step1[13] = vsubq_s16(step2[12], step2[13]);
+  step1[14] = vsubq_s16(step2[15], step2[14]);
+  step1[15] = vaddq_s16(step2[15], step2[14]);
 
   // stage 4
-  q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
-  d4s16 = vdup_n_s16((int16_t)cospi_16_64);
-
-  q8s16 = vqrdmulhq_s16(q8s16, q1s16);
-
-  d8s16 = vget_low_s16(q4s16);
-  d9s16 = vget_high_s16(q4s16);
-  d14s16 = vget_low_s16(q7s16);
-  d15s16 = vget_high_s16(q7s16);
-  q9s32 = vmull_s16(d14s16, d4s16);
-  q10s32 = vmull_s16(d15s16, d4s16);
-  q12s32 = vmull_s16(d9s16, d4s16);
-  q11s32 = vmull_s16(d8s16, d4s16);
-
-  q15s32 = vsubq_s32(q10s32, q12s32);
-  q6s32 = vsubq_s32(q9s32, q11s32);
-  q9s32 = vaddq_s32(q9s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q12s32);
-
-  d11s16 = vrshrn_n_s32(q15s32, 14);
-  d10s16 = vrshrn_n_s32(q6s32, 14);
-  d12s16 = vrshrn_n_s32(q9s32, 14);
-  d13s16 = vrshrn_n_s32(q10s32, 14);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  // stage 6
-  q2s16 = vaddq_s16(q8s16, q7s16);
-  q9s16 = vaddq_s16(q8s16, q6s16);
-  q10s16 = vaddq_s16(q8s16, q5s16);
-  q11s16 = vaddq_s16(q8s16, q4s16);
-  q12s16 = vsubq_s16(q8s16, q4s16);
-  q13s16 = vsubq_s16(q8s16, q5s16);
-  q14s16 = vsubq_s16(q8s16, q6s16);
-  q15s16 = vsubq_s16(q8s16, q7s16);
-
-  // store the data
-  vst1q_s16(out, q2s16);
-  out += 8;
-  vst1q_s16(out, q9s16);
-  out += 8;
-  vst1q_s16(out, q10s16);
-  out += 8;
-  vst1q_s16(out, q11s16);
-  out += 8;
-  vst1q_s16(out, q12s16);
-  out += 8;
-  vst1q_s16(out, q13s16);
-  out += 8;
-  vst1q_s16(out, q14s16);
-  out += 8;
-  vst1q_s16(out, q15s16);
-}
-
-void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *out,
-                                     int16_t *pass1_output) {
-  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-  int16x4_t d20s16, d21s16, d22s16, d23s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
-  uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
-  uint64x1_t d16u64, d17u64, d18u64, d19u64;
-  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
-  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
-  int32x4_t q10s32, q11s32, q12s32, q13s32;
-  int16x8x2_t q0x2s16;
-
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = load_tran_low_to_s16x2q(src);
-  q15s16 = q0x2s16.val[0];
-
-  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
-                    &q15s16);
-
-  // stage 3
-  q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
-  q0s16 = vqrdmulhq_s16(q8s16, q6s16);
-  q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
-  q7s16 = vqrdmulhq_s16(q8s16, q6s16);
-
-  q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
-  q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
-  q3s16 = vqrdmulhq_s16(q9s16, q15s16);
-  q4s16 = vqrdmulhq_s16(q9s16, q14s16);
-
-  // stage 4
-  d0s16 = vget_low_s16(q0s16);
-  d1s16 = vget_high_s16(q0s16);
-  d6s16 = vget_low_s16(q3s16);
-  d7s16 = vget_high_s16(q3s16);
-  d8s16 = vget_low_s16(q4s16);
-  d9s16 = vget_high_s16(q4s16);
-  d14s16 = vget_low_s16(q7s16);
-  d15s16 = vget_high_s16(q7s16);
-
-  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
-  d31s16 = vdup_n_s16((int16_t)cospi_24_64);
-
-  q12s32 = vmull_s16(d14s16, d31s16);
-  q5s32 = vmull_s16(d15s16, d31s16);
-  q2s32 = vmull_s16(d0s16, d31s16);
-  q11s32 = vmull_s16(d1s16, d31s16);
-
-  q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
-  q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
-  q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
-  q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
-
-  d2s16 = vrshrn_n_s32(q12s32, 14);
-  d3s16 = vrshrn_n_s32(q5s32, 14);
-  d12s16 = vrshrn_n_s32(q2s32, 14);
-  d13s16 = vrshrn_n_s32(q11s32, 14);
-  q1s16 = vcombine_s16(d2s16, d3s16);
-  q6s16 = vcombine_s16(d12s16, d13s16);
-
-  d30s16 = vdup_n_s16(-cospi_8_64);
-  q10s32 = vmull_s16(d8s16, d30s16);
-  q13s32 = vmull_s16(d9s16, d30s16);
-  q8s32 = vmull_s16(d6s16, d30s16);
-  q9s32 = vmull_s16(d7s16, d30s16);
-
-  q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
-  q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
-  q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
-  q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
-
-  d4s16 = vrshrn_n_s32(q10s32, 14);
-  d5s16 = vrshrn_n_s32(q13s32, 14);
-  d10s16 = vrshrn_n_s32(q8s32, 14);
-  d11s16 = vrshrn_n_s32(q9s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
+  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+  step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
+  step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
+  step2[4] = vaddq_s16(step1[4], step1[5]);
+  step2[5] = vsubq_s16(step1[4], step1[5]);
+  step2[6] = vsubq_s16(step1[7], step1[6]);
+  step2[7] = vaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
 
   // stage 5
-  q8s16 = vaddq_s16(q0s16, q3s16);
-  q9s16 = vaddq_s16(q1s16, q2s16);
-  q10s16 = vsubq_s16(q1s16, q2s16);
-  q11s16 = vsubq_s16(q0s16, q3s16);
-  q12s16 = vsubq_s16(q7s16, q4s16);
-  q13s16 = vsubq_s16(q6s16, q5s16);
-  q14s16 = vaddq_s16(q6s16, q5s16);
-  q15s16 = vaddq_s16(q7s16, q4s16);
+  step1[0] = vaddq_s16(step2[0], step2[3]);
+  step1[1] = vaddq_s16(step2[1], step2[2]);
+  step1[2] = vsubq_s16(step2[1], step2[2]);
+  step1[3] = vsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s16(step2[8], step2[11]);
+  step1[9] = vaddq_s16(step2[9], step2[10]);
+  step1[10] = vsubq_s16(step2[9], step2[10]);
+  step1[11] = vsubq_s16(step2[8], step2[11]);
+  step1[12] = vsubq_s16(step2[15], step2[12]);
+  step1[13] = vsubq_s16(step2[14], step2[13]);
+  step1[14] = vaddq_s16(step2[14], step2[13]);
+  step1[15] = vaddq_s16(step2[15], step2[12]);
 
   // stage 6
-  d20s16 = vget_low_s16(q10s16);
-  d21s16 = vget_high_s16(q10s16);
-  d22s16 = vget_low_s16(q11s16);
-  d23s16 = vget_high_s16(q11s16);
-  d24s16 = vget_low_s16(q12s16);
-  d25s16 = vget_high_s16(q12s16);
-  d26s16 = vget_low_s16(q13s16);
-  d27s16 = vget_high_s16(q13s16);
-
-  d14s16 = vdup_n_s16((int16_t)cospi_16_64);
-  q3s32 = vmull_s16(d26s16, d14s16);
-  q4s32 = vmull_s16(d27s16, d14s16);
-  q0s32 = vmull_s16(d20s16, d14s16);
-  q1s32 = vmull_s16(d21s16, d14s16);
-
-  q5s32 = vsubq_s32(q3s32, q0s32);
-  q6s32 = vsubq_s32(q4s32, q1s32);
-  q0s32 = vaddq_s32(q3s32, q0s32);
-  q4s32 = vaddq_s32(q4s32, q1s32);
-
-  d4s16 = vrshrn_n_s32(q5s32, 14);
-  d5s16 = vrshrn_n_s32(q6s32, 14);
-  d10s16 = vrshrn_n_s32(q0s32, 14);
-  d11s16 = vrshrn_n_s32(q4s32, 14);
-  q2s16 = vcombine_s16(d4s16, d5s16);
-  q5s16 = vcombine_s16(d10s16, d11s16);
-
-  q0s32 = vmull_s16(d22s16, d14s16);
-  q1s32 = vmull_s16(d23s16, d14s16);
-  q13s32 = vmull_s16(d24s16, d14s16);
-  q6s32 = vmull_s16(d25s16, d14s16);
-
-  q10s32 = vsubq_s32(q13s32, q0s32);
-  q4s32 = vsubq_s32(q6s32, q1s32);
-  q13s32 = vaddq_s32(q13s32, q0s32);
-  q6s32 = vaddq_s32(q6s32, q1s32);
-
-  d6s16 = vrshrn_n_s32(q10s32, 14);
-  d7s16 = vrshrn_n_s32(q4s32, 14);
-  d8s16 = vrshrn_n_s32(q13s32, 14);
-  d9s16 = vrshrn_n_s32(q6s32, 14);
-  q3s16 = vcombine_s16(d6s16, d7s16);
-  q4s16 = vcombine_s16(d8s16, d9s16);
+  step2[0] = vaddq_s16(step1[0], step1[7]);
+  step2[1] = vaddq_s16(step1[1], step1[6]);
+  step2[2] = vaddq_s16(step1[2], step1[5]);
+  step2[3] = vaddq_s16(step1[3], step1[4]);
+  step2[4] = vsubq_s16(step1[3], step1[4]);
+  step2[5] = vsubq_s16(step1[2], step1[5]);
+  step2[6] = vsubq_s16(step1[1], step1[6]);
+  step2[7] = vsubq_s16(step1[0], step1[7]);
+  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
 
   // stage 7
-  q0s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q1s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q12s16 = vaddq_s16(q0s16, q15s16);
-  q13s16 = vaddq_s16(q1s16, q14s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q14s16 = vsubq_s16(q1s16, q14s16);
-  q15s16 = vsubq_s16(q0s16, q15s16);
+  idct16x16_add_stage7(step2, out);
 
-  q10s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q11s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q12s16 = vaddq_s16(q10s16, q5s16);
-  q13s16 = vaddq_s16(q11s16, q4s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q4s16 = vsubq_s16(q11s16, q4s16);
-  q5s16 = vsubq_s16(q10s16, q5s16);
-
-  q0s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q1s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q12s16 = vaddq_s16(q0s16, q3s16);
-  q13s16 = vaddq_s16(q1s16, q2s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q2s16 = vsubq_s16(q1s16, q2s16);
-  q3s16 = vsubq_s16(q0s16, q3s16);
-
-  q10s16 = vld1q_s16(pass1_output);
-  pass1_output += 8;
-  q11s16 = vld1q_s16(pass1_output);
-  q12s16 = vaddq_s16(q10s16, q9s16);
-  q13s16 = vaddq_s16(q11s16, q8s16);
-  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
-  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
-  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
-  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
-  vst1_u64((uint64_t *)out, d24u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d25u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d26u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d27u64);
-  out += 12;
-  q8s16 = vsubq_s16(q11s16, q8s16);
-  q9s16 = vsubq_s16(q10s16, q9s16);
-
-  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
-  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
-  d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
-  d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
-  d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
-  d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
-  d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
-  d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
-  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
-  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
-  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
-  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
-  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
-  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
-  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
-  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
-
-  vst1_u64((uint64_t *)out, d16u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d17u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d18u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d19u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d4u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d5u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d6u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d7u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d8u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d9u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d10u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d11u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d28u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d29u64);
-  out += 12;
-  vst1_u64((uint64_t *)out, d30u64);
-  out += 4;
-  vst1_u64((uint64_t *)out, d31u64);
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
+  }
+}
+
+void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+                                       int16_t *output) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+  int16x4_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x4)
+  in[0] = load_tran_low_to_s16d(input);
+  input += 16;
+  in[1] = load_tran_low_to_s16d(input);
+  input += 16;
+  in[2] = load_tran_low_to_s16d(input);
+  input += 16;
+  in[3] = load_tran_low_to_s16d(input);
+
+  // Transpose
+  transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
+  step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+  step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+  step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+  step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vadd_s16(step2[8], step2[11]);
+  step1[9] = vadd_s16(step2[9], step2[10]);
+  step1[10] = vsub_s16(step2[9], step2[10]);
+  step1[11] = vsub_s16(step2[8], step2[11]);
+  step1[12] = vsub_s16(step2[15], step2[12]);
+  step1[13] = vsub_s16(step2[14], step2[13]);
+  step1[14] = vadd_s16(step2[14], step2[13]);
+  step1[15] = vadd_s16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vadd_s16(step1[0], step1[7]);
+  step2[1] = vadd_s16(step1[1], step1[6]);
+  step2[2] = vadd_s16(step1[2], step1[5]);
+  step2[3] = vadd_s16(step1[3], step1[4]);
+  step2[4] = vsub_s16(step1[3], step1[4]);
+  step2[5] = vsub_s16(step1[2], step1[5]);
+  step2[6] = vsub_s16(step1[1], step1[6]);
+  step2[7] = vsub_s16(step1[0], step1[7]);
+  idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  out[0] = vadd_s16(step2[0], step2[15]);
+  out[1] = vadd_s16(step2[1], step2[14]);
+  out[2] = vadd_s16(step2[2], step2[13]);
+  out[3] = vadd_s16(step2[3], step2[12]);
+  out[4] = vadd_s16(step2[4], step2[11]);
+  out[5] = vadd_s16(step2[5], step2[10]);
+  out[6] = vadd_s16(step2[6], step2[9]);
+  out[7] = vadd_s16(step2[7], step2[8]);
+  out[8] = vsub_s16(step2[7], step2[8]);
+  out[9] = vsub_s16(step2[6], step2[9]);
+  out[10] = vsub_s16(step2[5], step2[10]);
+  out[11] = vsub_s16(step2[4], step2[11]);
+  out[12] = vsub_s16(step2[3], step2[12]);
+  out[13] = vsub_s16(step2[2], step2[13]);
+  out[14] = vsub_s16(step2[1], step2[14]);
+  out[15] = vsub_s16(step2[0], step2[15]);
+
+  // pass 1: save the result into output
+  vst1_s16(output, out[0]);
+  output += 4;
+  vst1_s16(output, out[1]);
+  output += 4;
+  vst1_s16(output, out[2]);
+  output += 4;
+  vst1_s16(output, out[3]);
+  output += 4;
+  vst1_s16(output, out[4]);
+  output += 4;
+  vst1_s16(output, out[5]);
+  output += 4;
+  vst1_s16(output, out[6]);
+  output += 4;
+  vst1_s16(output, out[7]);
+  output += 4;
+  vst1_s16(output, out[8]);
+  output += 4;
+  vst1_s16(output, out[9]);
+  output += 4;
+  vst1_s16(output, out[10]);
+  output += 4;
+  vst1_s16(output, out[11]);
+  output += 4;
+  vst1_s16(output, out[12]);
+  output += 4;
+  vst1_s16(output, out[13]);
+  output += 4;
+  vst1_s16(output, out[14]);
+  output += 4;
+  vst1_s16(output, out[15]);
+}
+
+void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
+                                       int16_t *const output, void *const dest,
+                                       const int stride,
+                                       const int highbd_flag) {
+  const int16x8_t cospis0 = vld1q_s16(kCospi);
+  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+  int16x4_t ind[8];
+  int16x8_t in[4], step1[16], step2[16], out[16];
+
+  // Load input (4x8)
+  ind[0] = vld1_s16(input);
+  input += 4;
+  ind[1] = vld1_s16(input);
+  input += 4;
+  ind[2] = vld1_s16(input);
+  input += 4;
+  ind[3] = vld1_s16(input);
+  input += 4;
+  ind[4] = vld1_s16(input);
+  input += 4;
+  ind[5] = vld1_s16(input);
+  input += 4;
+  ind[6] = vld1_s16(input);
+  input += 4;
+  ind[7] = vld1_s16(input);
+
+  // Transpose
+  transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
+                    ind[7], &in[0], &in[1], &in[2], &in[3]);
+
+  // stage 1
+  step1[0] = in[0 / 2];
+  step1[4] = in[4 / 2];
+  step1[8] = in[2 / 2];
+  step1[12] = in[6 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[4] = step1[4];
+  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+  step2[8] = step1[8];
+  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+                    &step2[14]);
+  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+                        &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+  step1[8] = vaddq_s16(step2[8], step2[11]);
+  step1[9] = vaddq_s16(step2[9], step2[10]);
+  step1[10] = vsubq_s16(step2[9], step2[10]);
+  step1[11] = vsubq_s16(step2[8], step2[11]);
+  step1[12] = vsubq_s16(step2[15], step2[12]);
+  step1[13] = vsubq_s16(step2[14], step2[13]);
+  step1[14] = vaddq_s16(step2[14], step2[13]);
+  step1[15] = vaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = vaddq_s16(step1[0], step1[7]);
+  step2[1] = vaddq_s16(step1[1], step1[6]);
+  step2[2] = vaddq_s16(step1[2], step1[5]);
+  step2[3] = vaddq_s16(step1[3], step1[4]);
+  step2[4] = vsubq_s16(step1[3], step1[4]);
+  step2[5] = vsubq_s16(step1[2], step1[5]);
+  step2[6] = vsubq_s16(step1[1], step1[6]);
+  step2[7] = vsubq_s16(step1[0], step1[7]);
+  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+                     &step2[13]);
+  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+                     &step2[12]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  idct16x16_add_stage7(step2, out);
+
+  if (output) {
+    idct16x16_store_pass1(out, output);
+  } else {
+    if (highbd_flag) {
+      idct16x16_add_store_bd8(out, dest, stride);
+    } else {
+      idct16x16_add_store(out, dest, stride);
+    }
+  }
+}
+
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  int16_t row_idct_output[16 * 16];
+
+  // pass 1
+  // Parallel idct on the upper 8 rows
+  vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
+
+  // Parallel idct on the lower 8 rows
+  vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
+                               stride, 0);
+
+  // pass 2
+  // Parallel idct to get the left 8 columns
+  vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+  // Parallel idct to get the right 8 columns
+  vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+                               0);
+}
+
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  int16_t row_idct_output[16 * 16];
+
+  // pass 1
+  // Parallel idct on the upper 8 rows
+  vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
+
+  // pass 2
+  // Parallel idct to get the left 8 columns
+  vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+  // Parallel idct to get the right 8 columns
+  vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+                              0);
+}
+
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  int16_t row_idct_output[4 * 16];
+
+  // pass 1
+  // Parallel idct on the upper 8 rows
+  vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+  // pass 2
+  // Parallel idct to get the left 8 columns
+  vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
+
+  // Parallel idct to get the right 8 columns
+  vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+                                    stride, 0);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c
deleted file mode 100644
index 47366bcb7d..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output);
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
-                                      int16_t *pass1_output,
-                                      int16_t skip_adding, uint8_t *dest,
-                                      int stride);
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input,
-                                               int16_t *output);
-void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1_output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest, int stride);
-#else
-#define vpx_idct16x16_256_add_neon_pass1_tran_low \
-  vpx_idct16x16_256_add_neon_pass1
-#define vpx_idct16x16_256_add_neon_pass2_tran_low \
-  vpx_idct16x16_256_add_neon_pass2
-#endif
-
-void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output);
-void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
-                                     int16_t *pass1_output);
-
-#if HAVE_NEON_ASM
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void vpx_push_neon(int64_t *store);
-extern void vpx_pop_neon(int64_t *store);
-#endif  // HAVE_NEON_ASM
-
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
-                                int stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16 * 16] = { 0 };
-  int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  vpx_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output,
-                                            pass1_output, 0, dest, stride);
-
-  /* Parallel idct on the lower 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2_tran_low(
-      input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride);
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
-                                   pass1_output, 1, dest, stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                   row_idct_output + 8, pass1_output, 1,
-                                   dest + 8, stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  vpx_pop_neon(store_reg);
-#endif
-}
-
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-#if HAVE_NEON_ASM
-  int64_t store_reg[8];
-#endif
-  int16_t pass1_output[16 * 16] = { 0 };
-  int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
-  // save d8-d15 register values.
-  vpx_push_neon(store_reg);
-#endif
-
-  /* Parallel idct on the upper 8 rows */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_10_add_neon_pass1(input, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7
-  // which will be saved into row_idct_output.
-  vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output);
-
-  /* Skip Parallel idct on the lower 8 rows as they are all 0s */
-
-  /* Parallel idct on the left 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
-                                   pass1_output, 1, dest, stride);
-
-  /* Parallel idct on the right 8 columns */
-  // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
-  // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output);
-
-  // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
-  // with result in pass1(pass1_output) to calculate final result in stage 7.
-  // Then add the result to the destination data.
-  vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
-                                   row_idct_output + 8, pass1_output, 1,
-                                   dest + 8, stride);
-
-#if HAVE_NEON_ASM
-  // restore d8-d15 register values.
-  vpx_pop_neon(store_reg);
-#endif
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
index 28b9465584..057731ad92 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
@@ -87,614 +88,573 @@ static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0,
 // 13  84  93 103 110 125
 // 14  98 106 115 127
 // 15 117 128
-static void idct32_12_neon(const tran_low_t *input, int16_t *output) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
-  int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int16x8_t in8, in9, in10, in11;
-  int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27,
-      s1_28, s1_29, s1_31;
-  int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21,
-      s2_26, s2_27, s2_28, s2_29;
-  int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22,
-      s3_25, s3_26, s3_29, s3_30;
-  int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18,
-      s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28,
-      s4_29, s4_30, s4_31;
-  int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
-      s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
-      s5_29;
-  int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
-      s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
-      s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
-  int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
-      s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
-      s7_25, s7_26, s7_27;
+void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) {
+  int16x4_t tmp[8];
+  int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32];
 
-  load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
-  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+  load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+               &in[7]);
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
 
-  load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6,
-               &tmp7);
-  transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9,
-                    &in10, &in11);
+  load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5],
+               &tmp[6], &tmp[7]);
+  transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6],
+                    tmp[7], &in[8], &in[9], &in[10], &in[11]);
 
   // stage 1
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
-  s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
-  s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+  s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+  s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
 
-  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
-  s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+  s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+  s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
-  s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
-  s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+  s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+  s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
 
-  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
 
-  s2_18 = vsubq_s16(s1_19, s1_18);
-  s2_19 = vaddq_s16(s1_18, s1_19);
-  s2_20 = vaddq_s16(s1_20, s1_21);
-  s2_21 = vsubq_s16(s1_20, s1_21);
-  s2_26 = vsubq_s16(s1_27, s1_26);
-  s2_27 = vaddq_s16(s1_26, s1_27);
-  s2_28 = vaddq_s16(s1_28, s1_29);
-  s2_29 = vsubq_s16(s1_28, s1_29);
+  s2[18] = vsubq_s16(s1[19], s1[18]);
+  s2[19] = vaddq_s16(s1[18], s1[19]);
+  s2[20] = vaddq_s16(s1[20], s1[21]);
+  s2[21] = vsubq_s16(s1[20], s1[21]);
+  s2[26] = vsubq_s16(s1[27], s1[26]);
+  s2[27] = vaddq_s16(s1[26], s1[27]);
+  s2[28] = vaddq_s16(s1[28], s1[29]);
+  s2[29] = vsubq_s16(s1[28], s1[29]);
 
   // stage 3
-  s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s3_10 = vsubq_s16(s2_11, s2_10);
-  s3_11 = vaddq_s16(s2_10, s2_11);
-  s3_12 = vaddq_s16(s2_12, s2_13);
-  s3_13 = vsubq_s16(s2_12, s2_13);
+  s3[10] = vsubq_s16(s2[11], s2[10]);
+  s3[11] = vaddq_s16(s2[10], s2[11]);
+  s3[12] = vaddq_s16(s2[12], s2[13]);
+  s3[13] = vsubq_s16(s2[12], s2[13]);
 
-  s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
-                                                   cospi_28_64);
-  s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
-                                                   cospi_4_64);
+  s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+                                                    cospi_28_64);
+  s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+                                                    cospi_4_64);
 
-  s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
-                                                   -cospi_4_64);
-  s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
-                                                   cospi_28_64);
+  s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+                                                    s2[29], -cospi_4_64);
+  s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+                                                    cospi_28_64);
 
-  s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
-                                                   cospi_12_64);
-  s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
-                                                   cospi_20_64);
+  s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+                                                    s2[26], cospi_12_64);
+  s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+                                                    cospi_20_64);
 
-  s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
-                                                   -cospi_20_64);
-  s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
-                                                   cospi_12_64);
+  s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+                                                    s1[24], -cospi_20_64);
+  s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+                                                    s1[24], cospi_12_64);
 
   // stage 4
-  s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
-  s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
-  s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+  s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+  s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+  s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
 
-  s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
-                                                  cospi_24_64);
-  s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
-                                                   cospi_8_64);
-
-  s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
-                                                   -cospi_8_64);
-  s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+  s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                    cospi_24_64);
+  s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+                                                    cospi_8_64);
 
-  s4_16 = vaddq_s16(s1_16, s2_19);
-  s4_17 = vaddq_s16(s3_17, s3_18);
-  s4_18 = vsubq_s16(s3_17, s3_18);
-  s4_19 = vsubq_s16(s1_16, s2_19);
-  s4_20 = vsubq_s16(s1_23, s2_20);
-  s4_21 = vsubq_s16(s3_22, s3_21);
-  s4_22 = vaddq_s16(s3_21, s3_22);
-  s4_23 = vaddq_s16(s2_20, s1_23);
-  s4_24 = vaddq_s16(s1_24, s2_27);
-  s4_25 = vaddq_s16(s3_25, s3_26);
-  s4_26 = vsubq_s16(s3_25, s3_26);
-  s4_27 = vsubq_s16(s1_24, s2_27);
-  s4_28 = vsubq_s16(s1_31, s2_28);
-  s4_29 = vsubq_s16(s3_30, s3_29);
-  s4_30 = vaddq_s16(s3_29, s3_30);
-  s4_31 = vaddq_s16(s2_28, s1_31);
+  s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+                                                    s3[13], -cospi_8_64);
+  s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+                                                    cospi_24_64);
+
+  s4[16] = vaddq_s16(s1[16], s2[19]);
+  s4[17] = vaddq_s16(s3[17], s3[18]);
+  s4[18] = vsubq_s16(s3[17], s3[18]);
+  s4[19] = vsubq_s16(s1[16], s2[19]);
+  s4[20] = vsubq_s16(s1[23], s2[20]);
+  s4[21] = vsubq_s16(s3[22], s3[21]);
+  s4[22] = vaddq_s16(s3[21], s3[22]);
+  s4[23] = vaddq_s16(s2[20], s1[23]);
+  s4[24] = vaddq_s16(s1[24], s2[27]);
+  s4[25] = vaddq_s16(s3[25], s3[26]);
+  s4[26] = vsubq_s16(s3[25], s3[26]);
+  s4[27] = vsubq_s16(s1[24], s2[27]);
+  s4[28] = vsubq_s16(s1[31], s2[28]);
+  s4[29] = vsubq_s16(s3[30], s3[29]);
+  s4[30] = vaddq_s16(s3[29], s3[30]);
+  s4[31] = vaddq_s16(s2[28], s1[31]);
 
   // stage 5
-  s5_0 = vaddq_s16(s4_0, s4_3);
-  s5_1 = vaddq_s16(s4_0, s4_2);
-  s5_2 = vsubq_s16(s4_0, s4_2);
-  s5_3 = vsubq_s16(s4_0, s4_3);
+  s5[0] = vaddq_s16(s4[0], s4[3]);
+  s5[1] = vaddq_s16(s4[0], s4[2]);
+  s5[2] = vsubq_s16(s4[0], s4[2]);
+  s5[3] = vsubq_s16(s4[0], s4[3]);
 
-  s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64);
-  s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64);
+  s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64);
+  s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64);
 
-  s5_8 = vaddq_s16(s2_8, s3_11);
-  s5_9 = vaddq_s16(s4_9, s4_10);
-  s5_10 = vsubq_s16(s4_9, s4_10);
-  s5_11 = vsubq_s16(s2_8, s3_11);
-  s5_12 = vsubq_s16(s2_15, s3_12);
-  s5_13 = vsubq_s16(s4_14, s4_13);
-  s5_14 = vaddq_s16(s4_13, s4_14);
-  s5_15 = vaddq_s16(s2_15, s3_12);
+  s5[8] = vaddq_s16(s2[8], s3[11]);
+  s5[9] = vaddq_s16(s4[9], s4[10]);
+  s5[10] = vsubq_s16(s4[9], s4[10]);
+  s5[11] = vsubq_s16(s2[8], s3[11]);
+  s5[12] = vsubq_s16(s2[15], s3[12]);
+  s5[13] = vsubq_s16(s4[14], s4[13]);
+  s5[14] = vaddq_s16(s4[13], s4[14]);
+  s5[15] = vaddq_s16(s2[15], s3[12]);
 
-  s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
-                                                   cospi_24_64);
-  s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
-                                                   cospi_8_64);
+  s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+                                                    cospi_24_64);
+  s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+                                                    cospi_8_64);
 
-  s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
-                                                   cospi_24_64);
-  s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
-                                                   cospi_8_64);
+  s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+                                                    cospi_24_64);
+  s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+                                                    cospi_8_64);
 
-  s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
-                                                   -cospi_8_64);
-  s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
-                                                   cospi_24_64);
+  s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+                                                    s4[27], -cospi_8_64);
+  s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+                                                    cospi_24_64);
 
-  s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
-                                                   -cospi_8_64);
-  s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
-                                                   cospi_24_64);
+  s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+                                                    s4[26], -cospi_8_64);
+  s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s6_0 = vaddq_s16(s5_0, s3_7);
-  s6_1 = vaddq_s16(s5_1, s5_6);
-  s6_2 = vaddq_s16(s5_2, s5_5);
-  s6_3 = vaddq_s16(s5_3, s3_4);
-  s6_4 = vsubq_s16(s5_3, s3_4);
-  s6_5 = vsubq_s16(s5_2, s5_5);
-  s6_6 = vsubq_s16(s5_1, s5_6);
-  s6_7 = vsubq_s16(s5_0, s3_7);
+  s6[0] = vaddq_s16(s5[0], s3[7]);
+  s6[1] = vaddq_s16(s5[1], s5[6]);
+  s6[2] = vaddq_s16(s5[2], s5[5]);
+  s6[3] = vaddq_s16(s5[3], s3[4]);
+  s6[4] = vsubq_s16(s5[3], s3[4]);
+  s6[5] = vsubq_s16(s5[2], s5[5]);
+  s6[6] = vsubq_s16(s5[1], s5[6]);
+  s6[7] = vsubq_s16(s5[0], s3[7]);
 
-  s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
-  s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+  s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+  s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
 
-  s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
-  s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+  s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+  s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
 
-  s6_16 = vaddq_s16(s4_16, s4_23);
-  s6_17 = vaddq_s16(s4_17, s4_22);
-  s6_18 = vaddq_s16(s5_18, s5_21);
-  s6_19 = vaddq_s16(s5_19, s5_20);
-  s6_20 = vsubq_s16(s5_19, s5_20);
-  s6_21 = vsubq_s16(s5_18, s5_21);
-  s6_22 = vsubq_s16(s4_17, s4_22);
-  s6_23 = vsubq_s16(s4_16, s4_23);
+  s6[16] = vaddq_s16(s4[16], s4[23]);
+  s6[17] = vaddq_s16(s4[17], s4[22]);
+  s6[18] = vaddq_s16(s5[18], s5[21]);
+  s6[19] = vaddq_s16(s5[19], s5[20]);
+  s6[20] = vsubq_s16(s5[19], s5[20]);
+  s6[21] = vsubq_s16(s5[18], s5[21]);
+  s6[22] = vsubq_s16(s4[17], s4[22]);
+  s6[23] = vsubq_s16(s4[16], s4[23]);
 
-  s6_24 = vsubq_s16(s4_31, s4_24);
-  s6_25 = vsubq_s16(s4_30, s4_25);
-  s6_26 = vsubq_s16(s5_29, s5_26);
-  s6_27 = vsubq_s16(s5_28, s5_27);
-  s6_28 = vaddq_s16(s5_27, s5_28);
-  s6_29 = vaddq_s16(s5_26, s5_29);
-  s6_30 = vaddq_s16(s4_25, s4_30);
-  s6_31 = vaddq_s16(s4_24, s4_31);
+  s6[24] = vsubq_s16(s4[31], s4[24]);
+  s6[25] = vsubq_s16(s4[30], s4[25]);
+  s6[26] = vsubq_s16(s5[29], s5[26]);
+  s6[27] = vsubq_s16(s5[28], s5[27]);
+  s6[28] = vaddq_s16(s5[27], s5[28]);
+  s6[29] = vaddq_s16(s5[26], s5[29]);
+  s6[30] = vaddq_s16(s4[25], s4[30]);
+  s6[31] = vaddq_s16(s4[24], s4[31]);
 
   // stage 7
-  s7_0 = vaddq_s16(s6_0, s5_15);
-  s7_1 = vaddq_s16(s6_1, s5_14);
-  s7_2 = vaddq_s16(s6_2, s6_13);
-  s7_3 = vaddq_s16(s6_3, s6_12);
-  s7_4 = vaddq_s16(s6_4, s6_11);
-  s7_5 = vaddq_s16(s6_5, s6_10);
-  s7_6 = vaddq_s16(s6_6, s5_9);
-  s7_7 = vaddq_s16(s6_7, s5_8);
-  s7_8 = vsubq_s16(s6_7, s5_8);
-  s7_9 = vsubq_s16(s6_6, s5_9);
-  s7_10 = vsubq_s16(s6_5, s6_10);
-  s7_11 = vsubq_s16(s6_4, s6_11);
-  s7_12 = vsubq_s16(s6_3, s6_12);
-  s7_13 = vsubq_s16(s6_2, s6_13);
-  s7_14 = vsubq_s16(s6_1, s5_14);
-  s7_15 = vsubq_s16(s6_0, s5_15);
+  s7[0] = vaddq_s16(s6[0], s5[15]);
+  s7[1] = vaddq_s16(s6[1], s5[14]);
+  s7[2] = vaddq_s16(s6[2], s6[13]);
+  s7[3] = vaddq_s16(s6[3], s6[12]);
+  s7[4] = vaddq_s16(s6[4], s6[11]);
+  s7[5] = vaddq_s16(s6[5], s6[10]);
+  s7[6] = vaddq_s16(s6[6], s5[9]);
+  s7[7] = vaddq_s16(s6[7], s5[8]);
+  s7[8] = vsubq_s16(s6[7], s5[8]);
+  s7[9] = vsubq_s16(s6[6], s5[9]);
+  s7[10] = vsubq_s16(s6[5], s6[10]);
+  s7[11] = vsubq_s16(s6[4], s6[11]);
+  s7[12] = vsubq_s16(s6[3], s6[12]);
+  s7[13] = vsubq_s16(s6[2], s6[13]);
+  s7[14] = vsubq_s16(s6[1], s5[14]);
+  s7[15] = vsubq_s16(s6[0], s5[15]);
 
-  s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
-  s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+  s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+  s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
 
-  s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
-  s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+  s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+  s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
 
-  s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
-  s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+  s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+  s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
 
-  s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
-  s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+  s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
 
   // final stage
-  vst1q_s16(output, vaddq_s16(s7_0, s6_31));
+  vst1q_s16(output, vaddq_s16(s7[0], s6[31]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_1, s6_30));
+  vst1q_s16(output, vaddq_s16(s7[1], s6[30]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_2, s6_29));
+  vst1q_s16(output, vaddq_s16(s7[2], s6[29]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_3, s6_28));
+  vst1q_s16(output, vaddq_s16(s7[3], s6[28]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_4, s7_27));
+  vst1q_s16(output, vaddq_s16(s7[4], s7[27]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_5, s7_26));
+  vst1q_s16(output, vaddq_s16(s7[5], s7[26]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_6, s7_25));
+  vst1q_s16(output, vaddq_s16(s7[6], s7[25]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_7, s7_24));
+  vst1q_s16(output, vaddq_s16(s7[7], s7[24]));
   output += 16;
 
-  vst1q_s16(output, vaddq_s16(s7_8, s7_23));
+  vst1q_s16(output, vaddq_s16(s7[8], s7[23]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_9, s7_22));
+  vst1q_s16(output, vaddq_s16(s7[9], s7[22]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_10, s7_21));
+  vst1q_s16(output, vaddq_s16(s7[10], s7[21]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_11, s7_20));
+  vst1q_s16(output, vaddq_s16(s7[11], s7[20]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_12, s6_19));
+  vst1q_s16(output, vaddq_s16(s7[12], s6[19]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_13, s6_18));
+  vst1q_s16(output, vaddq_s16(s7[13], s6[18]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_14, s6_17));
+  vst1q_s16(output, vaddq_s16(s7[14], s6[17]));
   output += 16;
-  vst1q_s16(output, vaddq_s16(s7_15, s6_16));
+  vst1q_s16(output, vaddq_s16(s7[15], s6[16]));
   output += 16;
 
-  vst1q_s16(output, vsubq_s16(s7_15, s6_16));
+  vst1q_s16(output, vsubq_s16(s7[15], s6[16]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_14, s6_17));
+  vst1q_s16(output, vsubq_s16(s7[14], s6[17]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_13, s6_18));
+  vst1q_s16(output, vsubq_s16(s7[13], s6[18]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_12, s6_19));
+  vst1q_s16(output, vsubq_s16(s7[12], s6[19]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_11, s7_20));
+  vst1q_s16(output, vsubq_s16(s7[11], s7[20]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_10, s7_21));
+  vst1q_s16(output, vsubq_s16(s7[10], s7[21]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_9, s7_22));
+  vst1q_s16(output, vsubq_s16(s7[9], s7[22]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_8, s7_23));
+  vst1q_s16(output, vsubq_s16(s7[8], s7[23]));
   output += 16;
 
-  vst1q_s16(output, vsubq_s16(s7_7, s7_24));
+  vst1q_s16(output, vsubq_s16(s7[7], s7[24]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_6, s7_25));
+  vst1q_s16(output, vsubq_s16(s7[6], s7[25]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_5, s7_26));
+  vst1q_s16(output, vsubq_s16(s7[5], s7[26]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_4, s7_27));
+  vst1q_s16(output, vsubq_s16(s7[4], s7[27]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_3, s6_28));
+  vst1q_s16(output, vsubq_s16(s7[3], s6[28]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_2, s6_29));
+  vst1q_s16(output, vsubq_s16(s7[2], s6[29]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_1, s6_30));
+  vst1q_s16(output, vsubq_s16(s7[1], s6[30]));
   output += 16;
-  vst1q_s16(output, vsubq_s16(s7_0, s6_31));
+  vst1q_s16(output, vsubq_s16(s7[0], s6[31]));
 }
 
-static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
-      in13, in14, in15;
-  int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24,
-      s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31;
-  int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17,
-      s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27,
-      s2_28, s2_29, s2_30, s2_31;
-  int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13,
-      s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30;
-  int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14,
-      s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25,
-      s4_26, s4_27, s4_28, s4_29, s4_30, s4_31;
-  int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
-      s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
-      s5_29;
-  int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
-      s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
-      s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
-  int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
-      s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
-      s7_25, s7_26, s7_27;
-  int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
+                        const int stride, const int highbd_flag) {
+  int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
+      out[32];
 
-  load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5,
-                             &in6, &in7);
+  load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4],
+                             &in[5], &in[6], &in[7]);
 
-  load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12,
-                             &in13, &in14, &in15);
+  load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11],
+                             &in[12], &in[13], &in[14], &in[15]);
 
   // stage 1
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
-  s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64);
-  s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64);
+  s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64);
+  s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64);
 
-  s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
-  s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+  s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64);
+  s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64);
 
-  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
-  s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+  s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64);
+  s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64);
 
-  s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64);
-  s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64);
+  s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64);
+  s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
-  s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64);
-  s2_14 = multiply_shift_and_narrow_s16(in14, cospi_14_64);
+  s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64);
+  s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64);
 
-  s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
-  s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+  s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64);
+  s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64);
 
-  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
 
-  s2_16 = vaddq_s16(s1_16, s1_17);
-  s2_17 = vsubq_s16(s1_16, s1_17);
-  s2_18 = vsubq_s16(s1_19, s1_18);
-  s2_19 = vaddq_s16(s1_18, s1_19);
-  s2_20 = vaddq_s16(s1_20, s1_21);
-  s2_21 = vsubq_s16(s1_20, s1_21);
-  s2_22 = vsubq_s16(s1_23, s1_22);
-  s2_23 = vaddq_s16(s1_22, s1_23);
-  s2_24 = vaddq_s16(s1_24, s1_25);
-  s2_25 = vsubq_s16(s1_24, s1_25);
-  s2_26 = vsubq_s16(s1_27, s1_26);
-  s2_27 = vaddq_s16(s1_26, s1_27);
-  s2_28 = vaddq_s16(s1_28, s1_29);
-  s2_29 = vsubq_s16(s1_28, s1_29);
-  s2_30 = vsubq_s16(s1_31, s1_30);
-  s2_31 = vaddq_s16(s1_30, s1_31);
+  s2[16] = vaddq_s16(s1[16], s1[17]);
+  s2[17] = vsubq_s16(s1[16], s1[17]);
+  s2[18] = vsubq_s16(s1[19], s1[18]);
+  s2[19] = vaddq_s16(s1[18], s1[19]);
+  s2[20] = vaddq_s16(s1[20], s1[21]);
+  s2[21] = vsubq_s16(s1[20], s1[21]);
+  s2[22] = vsubq_s16(s1[23], s1[22]);
+  s2[23] = vaddq_s16(s1[22], s1[23]);
+  s2[24] = vaddq_s16(s1[24], s1[25]);
+  s2[25] = vsubq_s16(s1[24], s1[25]);
+  s2[26] = vsubq_s16(s1[27], s1[26]);
+  s2[27] = vaddq_s16(s1[26], s1[27]);
+  s2[28] = vaddq_s16(s1[28], s1[29]);
+  s2[29] = vsubq_s16(s1[28], s1[29]);
+  s2[30] = vsubq_s16(s1[31], s1[30]);
+  s2[31] = vaddq_s16(s1[30], s1[31]);
 
   // stage 3
-  s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64);
-  s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64);
+  s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64);
+  s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64);
 
-  s3_8 = vaddq_s16(s2_8, s2_9);
-  s3_9 = vsubq_s16(s2_8, s2_9);
-  s3_10 = vsubq_s16(s2_11, s2_10);
-  s3_11 = vaddq_s16(s2_10, s2_11);
-  s3_12 = vaddq_s16(s2_12, s2_13);
-  s3_13 = vsubq_s16(s2_12, s2_13);
-  s3_14 = vsubq_s16(s2_15, s2_14);
-  s3_15 = vaddq_s16(s2_14, s2_15);
+  s3[8] = vaddq_s16(s2[8], s2[9]);
+  s3[9] = vsubq_s16(s2[8], s2[9]);
+  s3[10] = vsubq_s16(s2[11], s2[10]);
+  s3[11] = vaddq_s16(s2[10], s2[11]);
+  s3[12] = vaddq_s16(s2[12], s2[13]);
+  s3[13] = vsubq_s16(s2[12], s2[13]);
+  s3[14] = vsubq_s16(s2[15], s2[14]);
+  s3[15] = vaddq_s16(s2[14], s2[15]);
 
-  s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30,
-                                                   cospi_28_64);
-  s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30,
-                                                   cospi_4_64);
+  s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30],
+                                                    cospi_28_64);
+  s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30],
+                                                    cospi_4_64);
 
-  s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
-                                                   -cospi_4_64);
-  s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
-                                                   cospi_28_64);
+  s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64,
+                                                    s2[29], -cospi_4_64);
+  s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29],
+                                                    cospi_28_64);
 
-  s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
-                                                   cospi_12_64);
-  s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
-                                                   cospi_20_64);
+  s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64,
+                                                    s2[26], cospi_12_64);
+  s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26],
+                                                    cospi_20_64);
 
-  s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25,
-                                                   -cospi_20_64);
-  s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25,
-                                                   cospi_12_64);
+  s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64,
+                                                    s2[25], -cospi_20_64);
+  s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64,
+                                                    s2[25], cospi_12_64);
 
   // stage 4
-  s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
-  s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
-  s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+  s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
+  s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64);
+  s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64);
 
-  s4_4 = vaddq_s16(s3_4, s3_5);
-  s4_5 = vsubq_s16(s3_4, s3_5);
-  s4_6 = vsubq_s16(s3_7, s3_6);
-  s4_7 = vaddq_s16(s3_6, s3_7);
+  s4[4] = vaddq_s16(s3[4], s3[5]);
+  s4[5] = vsubq_s16(s3[4], s3[5]);
+  s4[6] = vsubq_s16(s3[7], s3[6]);
+  s4[7] = vaddq_s16(s3[6], s3[7]);
 
-  s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14,
-                                                  cospi_24_64);
-  s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14,
-                                                   cospi_8_64);
-
-  s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
-                                                   -cospi_8_64);
-  s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+  s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14],
                                                    cospi_24_64);
+  s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14],
+                                                    cospi_8_64);
 
-  s4_16 = vaddq_s16(s2_16, s2_19);
-  s4_17 = vaddq_s16(s3_17, s3_18);
-  s4_18 = vsubq_s16(s3_17, s3_18);
-  s4_19 = vsubq_s16(s2_16, s2_19);
-  s4_20 = vsubq_s16(s2_23, s2_20);
-  s4_21 = vsubq_s16(s3_22, s3_21);
-  s4_22 = vaddq_s16(s3_21, s3_22);
-  s4_23 = vaddq_s16(s2_20, s2_23);
-  s4_24 = vaddq_s16(s2_24, s2_27);
-  s4_25 = vaddq_s16(s3_25, s3_26);
-  s4_26 = vsubq_s16(s3_25, s3_26);
-  s4_27 = vsubq_s16(s2_24, s2_27);
-  s4_28 = vsubq_s16(s2_31, s2_28);
-  s4_29 = vsubq_s16(s3_30, s3_29);
-  s4_30 = vaddq_s16(s3_29, s3_30);
-  s4_31 = vaddq_s16(s2_28, s2_31);
+  s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64,
+                                                    s3[13], -cospi_8_64);
+  s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13],
+                                                    cospi_24_64);
+
+  s4[16] = vaddq_s16(s2[16], s2[19]);
+  s4[17] = vaddq_s16(s3[17], s3[18]);
+  s4[18] = vsubq_s16(s3[17], s3[18]);
+  s4[19] = vsubq_s16(s2[16], s2[19]);
+  s4[20] = vsubq_s16(s2[23], s2[20]);
+  s4[21] = vsubq_s16(s3[22], s3[21]);
+  s4[22] = vaddq_s16(s3[21], s3[22]);
+  s4[23] = vaddq_s16(s2[20], s2[23]);
+  s4[24] = vaddq_s16(s2[24], s2[27]);
+  s4[25] = vaddq_s16(s3[25], s3[26]);
+  s4[26] = vsubq_s16(s3[25], s3[26]);
+  s4[27] = vsubq_s16(s2[24], s2[27]);
+  s4[28] = vsubq_s16(s2[31], s2[28]);
+  s4[29] = vsubq_s16(s3[30], s3[29]);
+  s4[30] = vaddq_s16(s3[29], s3[30]);
+  s4[31] = vaddq_s16(s2[28], s2[31]);
 
   // stage 5
-  s5_0 = vaddq_s16(s4_0, s4_3);
-  s5_1 = vaddq_s16(s4_0, s4_2);
-  s5_2 = vsubq_s16(s4_0, s4_2);
-  s5_3 = vsubq_s16(s4_0, s4_3);
+  s5[0] = vaddq_s16(s4[0], s4[3]);
+  s5[1] = vaddq_s16(s4[0], s4[2]);
+  s5[2] = vsubq_s16(s4[0], s4[2]);
+  s5[3] = vsubq_s16(s4[0], s4[3]);
 
-  s5_5 = sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64);
-  s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64);
+  s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64);
+  s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64);
 
-  s5_8 = vaddq_s16(s3_8, s3_11);
-  s5_9 = vaddq_s16(s4_9, s4_10);
-  s5_10 = vsubq_s16(s4_9, s4_10);
-  s5_11 = vsubq_s16(s3_8, s3_11);
-  s5_12 = vsubq_s16(s3_15, s3_12);
-  s5_13 = vsubq_s16(s4_14, s4_13);
-  s5_14 = vaddq_s16(s4_13, s4_14);
-  s5_15 = vaddq_s16(s3_15, s3_12);
+  s5[8] = vaddq_s16(s3[8], s3[11]);
+  s5[9] = vaddq_s16(s4[9], s4[10]);
+  s5[10] = vsubq_s16(s4[9], s4[10]);
+  s5[11] = vsubq_s16(s3[8], s3[11]);
+  s5[12] = vsubq_s16(s3[15], s3[12]);
+  s5[13] = vsubq_s16(s4[14], s4[13]);
+  s5[14] = vaddq_s16(s4[13], s4[14]);
+  s5[15] = vaddq_s16(s3[15], s3[12]);
 
-  s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
-                                                   cospi_24_64);
-  s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
-                                                   cospi_8_64);
+  s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29],
+                                                    cospi_24_64);
+  s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29],
+                                                    cospi_8_64);
 
-  s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
-                                                   cospi_24_64);
-  s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
-                                                   cospi_8_64);
+  s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28],
+                                                    cospi_24_64);
+  s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28],
+                                                    cospi_8_64);
 
-  s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
-                                                   -cospi_8_64);
-  s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
-                                                   cospi_24_64);
+  s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64,
+                                                    s4[27], -cospi_8_64);
+  s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27],
+                                                    cospi_24_64);
 
-  s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
-                                                   -cospi_8_64);
-  s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
-                                                   cospi_24_64);
+  s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64,
+                                                    s4[26], -cospi_8_64);
+  s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s6_0 = vaddq_s16(s5_0, s4_7);
-  s6_1 = vaddq_s16(s5_1, s5_6);
-  s6_2 = vaddq_s16(s5_2, s5_5);
-  s6_3 = vaddq_s16(s5_3, s4_4);
-  s6_4 = vsubq_s16(s5_3, s4_4);
-  s6_5 = vsubq_s16(s5_2, s5_5);
-  s6_6 = vsubq_s16(s5_1, s5_6);
-  s6_7 = vsubq_s16(s5_0, s4_7);
+  s6[0] = vaddq_s16(s5[0], s4[7]);
+  s6[1] = vaddq_s16(s5[1], s5[6]);
+  s6[2] = vaddq_s16(s5[2], s5[5]);
+  s6[3] = vaddq_s16(s5[3], s4[4]);
+  s6[4] = vsubq_s16(s5[3], s4[4]);
+  s6[5] = vsubq_s16(s5[2], s5[5]);
+  s6[6] = vsubq_s16(s5[1], s5[6]);
+  s6[7] = vsubq_s16(s5[0], s4[7]);
 
-  s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
-  s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+  s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64);
+  s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64);
 
-  s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
-  s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+  s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64);
+  s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64);
 
-  s6_16 = vaddq_s16(s4_16, s4_23);
-  s6_17 = vaddq_s16(s4_17, s4_22);
-  s6_18 = vaddq_s16(s5_18, s5_21);
-  s6_19 = vaddq_s16(s5_19, s5_20);
-  s6_20 = vsubq_s16(s5_19, s5_20);
-  s6_21 = vsubq_s16(s5_18, s5_21);
-  s6_22 = vsubq_s16(s4_17, s4_22);
-  s6_23 = vsubq_s16(s4_16, s4_23);
-  s6_24 = vsubq_s16(s4_31, s4_24);
-  s6_25 = vsubq_s16(s4_30, s4_25);
-  s6_26 = vsubq_s16(s5_29, s5_26);
-  s6_27 = vsubq_s16(s5_28, s5_27);
-  s6_28 = vaddq_s16(s5_27, s5_28);
-  s6_29 = vaddq_s16(s5_26, s5_29);
-  s6_30 = vaddq_s16(s4_25, s4_30);
-  s6_31 = vaddq_s16(s4_24, s4_31);
+  s6[16] = vaddq_s16(s4[16], s4[23]);
+  s6[17] = vaddq_s16(s4[17], s4[22]);
+  s6[18] = vaddq_s16(s5[18], s5[21]);
+  s6[19] = vaddq_s16(s5[19], s5[20]);
+  s6[20] = vsubq_s16(s5[19], s5[20]);
+  s6[21] = vsubq_s16(s5[18], s5[21]);
+  s6[22] = vsubq_s16(s4[17], s4[22]);
+  s6[23] = vsubq_s16(s4[16], s4[23]);
+  s6[24] = vsubq_s16(s4[31], s4[24]);
+  s6[25] = vsubq_s16(s4[30], s4[25]);
+  s6[26] = vsubq_s16(s5[29], s5[26]);
+  s6[27] = vsubq_s16(s5[28], s5[27]);
+  s6[28] = vaddq_s16(s5[27], s5[28]);
+  s6[29] = vaddq_s16(s5[26], s5[29]);
+  s6[30] = vaddq_s16(s4[25], s4[30]);
+  s6[31] = vaddq_s16(s4[24], s4[31]);
 
   // stage 7
-  s7_0 = vaddq_s16(s6_0, s5_15);
-  s7_1 = vaddq_s16(s6_1, s5_14);
-  s7_2 = vaddq_s16(s6_2, s6_13);
-  s7_3 = vaddq_s16(s6_3, s6_12);
-  s7_4 = vaddq_s16(s6_4, s6_11);
-  s7_5 = vaddq_s16(s6_5, s6_10);
-  s7_6 = vaddq_s16(s6_6, s5_9);
-  s7_7 = vaddq_s16(s6_7, s5_8);
-  s7_8 = vsubq_s16(s6_7, s5_8);
-  s7_9 = vsubq_s16(s6_6, s5_9);
-  s7_10 = vsubq_s16(s6_5, s6_10);
-  s7_11 = vsubq_s16(s6_4, s6_11);
-  s7_12 = vsubq_s16(s6_3, s6_12);
-  s7_13 = vsubq_s16(s6_2, s6_13);
-  s7_14 = vsubq_s16(s6_1, s5_14);
-  s7_15 = vsubq_s16(s6_0, s5_15);
+  s7[0] = vaddq_s16(s6[0], s5[15]);
+  s7[1] = vaddq_s16(s6[1], s5[14]);
+  s7[2] = vaddq_s16(s6[2], s6[13]);
+  s7[3] = vaddq_s16(s6[3], s6[12]);
+  s7[4] = vaddq_s16(s6[4], s6[11]);
+  s7[5] = vaddq_s16(s6[5], s6[10]);
+  s7[6] = vaddq_s16(s6[6], s5[9]);
+  s7[7] = vaddq_s16(s6[7], s5[8]);
+  s7[8] = vsubq_s16(s6[7], s5[8]);
+  s7[9] = vsubq_s16(s6[6], s5[9]);
+  s7[10] = vsubq_s16(s6[5], s6[10]);
+  s7[11] = vsubq_s16(s6[4], s6[11]);
+  s7[12] = vsubq_s16(s6[3], s6[12]);
+  s7[13] = vsubq_s16(s6[2], s6[13]);
+  s7[14] = vsubq_s16(s6[1], s5[14]);
+  s7[15] = vsubq_s16(s6[0], s5[15]);
 
-  s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
-  s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+  s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64);
+  s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64);
 
-  s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
-  s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+  s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64);
+  s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64);
 
-  s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
-  s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+  s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64);
+  s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64);
 
-  s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
-  s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+  s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64);
+  s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64);
 
   // final stage
-  out0 = vaddq_s16(s7_0, s6_31);
-  out1 = vaddq_s16(s7_1, s6_30);
-  out2 = vaddq_s16(s7_2, s6_29);
-  out3 = vaddq_s16(s7_3, s6_28);
-  out4 = vaddq_s16(s7_4, s7_27);
-  out5 = vaddq_s16(s7_5, s7_26);
-  out6 = vaddq_s16(s7_6, s7_25);
-  out7 = vaddq_s16(s7_7, s7_24);
+  out[0] = final_add(s7[0], s6[31]);
+  out[1] = final_add(s7[1], s6[30]);
+  out[2] = final_add(s7[2], s6[29]);
+  out[3] = final_add(s7[3], s6[28]);
+  out[4] = final_add(s7[4], s7[27]);
+  out[5] = final_add(s7[5], s7[26]);
+  out[6] = final_add(s7[6], s7[25]);
+  out[7] = final_add(s7[7], s7[24]);
+  out[8] = final_add(s7[8], s7[23]);
+  out[9] = final_add(s7[9], s7[22]);
+  out[10] = final_add(s7[10], s7[21]);
+  out[11] = final_add(s7[11], s7[20]);
+  out[12] = final_add(s7[12], s6[19]);
+  out[13] = final_add(s7[13], s6[18]);
+  out[14] = final_add(s7[14], s6[17]);
+  out[15] = final_add(s7[15], s6[16]);
+  out[16] = final_sub(s7[15], s6[16]);
+  out[17] = final_sub(s7[14], s6[17]);
+  out[18] = final_sub(s7[13], s6[18]);
+  out[19] = final_sub(s7[12], s6[19]);
+  out[20] = final_sub(s7[11], s7[20]);
+  out[21] = final_sub(s7[10], s7[21]);
+  out[22] = final_sub(s7[9], s7[22]);
+  out[23] = final_sub(s7[8], s7[23]);
+  out[24] = final_sub(s7[7], s7[24]);
+  out[25] = final_sub(s7[6], s7[25]);
+  out[26] = final_sub(s7[5], s7[26]);
+  out[27] = final_sub(s7[4], s7[27]);
+  out[28] = final_sub(s7[3], s6[28]);
+  out[29] = final_sub(s7[2], s6[29]);
+  out[30] = final_sub(s7[1], s6[30]);
+  out[31] = final_sub(s7[0], s6[31]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
-                       stride);
-
-  out0 = vaddq_s16(s7_8, s7_23);
-  out1 = vaddq_s16(s7_9, s7_22);
-  out2 = vaddq_s16(s7_10, s7_21);
-  out3 = vaddq_s16(s7_11, s7_20);
-  out4 = vaddq_s16(s7_12, s6_19);
-  out5 = vaddq_s16(s7_13, s6_18);
-  out6 = vaddq_s16(s7_14, s6_17);
-  out7 = vaddq_s16(s7_15, s6_16);
-
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (8 * stride), stride);
-
-  out0 = vsubq_s16(s7_15, s6_16);
-  out1 = vsubq_s16(s7_14, s6_17);
-  out2 = vsubq_s16(s7_13, s6_18);
-  out3 = vsubq_s16(s7_12, s6_19);
-  out4 = vsubq_s16(s7_11, s7_20);
-  out5 = vsubq_s16(s7_10, s7_21);
-  out6 = vsubq_s16(s7_9, s7_22);
-  out7 = vsubq_s16(s7_8, s7_23);
-
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (16 * stride), stride);
-
-  out0 = vsubq_s16(s7_7, s7_24);
-  out1 = vsubq_s16(s7_6, s7_25);
-  out2 = vsubq_s16(s7_5, s7_26);
-  out3 = vsubq_s16(s7_4, s7_27);
-  out4 = vsubq_s16(s7_3, s6_28);
-  out5 = vsubq_s16(s7_2, s6_29);
-  out6 = vsubq_s16(s7_1, s6_30);
-  out7 = vsubq_s16(s7_0, s6_31);
-
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (24 * stride), stride);
+  if (highbd_flag) {
+    highbd_add_and_store_bd8(out, output, stride);
+  } else {
+    uint8_t *const outputT = (uint8_t *)output;
+    add_and_store_u8_s16(out + 0, outputT, stride);
+    add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
+    add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
+    add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
+  }
 }
 
 void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
@@ -703,11 +663,11 @@ void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest,
   int16_t temp[32 * 16];
   int16_t *t = temp;
 
-  idct32_12_neon(input, temp);
-  idct32_12_neon(input + 32 * 8, temp + 8);
+  vpx_idct32_12_neon(input, temp);
+  vpx_idct32_12_neon(input + 32 * 8, temp + 8);
 
   for (i = 0; i < 32; i += 8) {
-    idct32_16_neon(t, dest, stride);
+    vpx_idct32_16_neon(t, dest, stride, 0);
     t += (16 * 8);
     dest += 8;
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
index 604d82abd1..8920b93363 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c
@@ -39,7 +39,8 @@ static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride,
 void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
   int i;
-  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  const int16_t out0 =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
   const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
 
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
index b56deeea6d..f570547e44 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
@@ -35,487 +36,465 @@
 // 5 13 20 26
 // 6 21 27 33
 // 7 24 32
-static void idct32_6_neon(const tran_low_t *input, int16_t *output) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
-  int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
-      s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
-      s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
-      s1_31;
-  int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
-      s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
-      s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
-      s2_31;
-  int16x8_t s3_24, s3_25, s3_26, s3_27;
+void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) {
+  int16x8_t in[8], s1[32], s2[32], s3[32];
 
-  in0 = load_tran_low_to_s16q(input);
+  in[0] = load_tran_low_to_s16q(input);
   input += 32;
-  in1 = load_tran_low_to_s16q(input);
+  in[1] = load_tran_low_to_s16q(input);
   input += 32;
-  in2 = load_tran_low_to_s16q(input);
+  in[2] = load_tran_low_to_s16q(input);
   input += 32;
-  in3 = load_tran_low_to_s16q(input);
+  in[3] = load_tran_low_to_s16q(input);
   input += 32;
-  in4 = load_tran_low_to_s16q(input);
+  in[4] = load_tran_low_to_s16q(input);
   input += 32;
-  in5 = load_tran_low_to_s16q(input);
+  in[5] = load_tran_low_to_s16q(input);
   input += 32;
-  in6 = load_tran_low_to_s16q(input);
+  in[6] = load_tran_low_to_s16q(input);
   input += 32;
-  in7 = load_tran_low_to_s16q(input);
-  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+  in[7] = load_tran_low_to_s16q(input);
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
 
   // stage 1
   // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
   // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
   // stage 3
-  s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
-                                                   cospi_28_64);
-  s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
-                                                   cospi_4_64);
+  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+                                                    cospi_28_64);
+  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+                                                    cospi_4_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
-                                                   cospi_12_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
-                                                   cospi_20_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+                                                    s1[27], cospi_12_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+                                                    cospi_20_64);
 
-  s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
-                                                   -cospi_20_64);
-  s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
-                                                   cospi_12_64);
+  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+                                                    s1[24], -cospi_20_64);
+  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+                                                    s1[24], cospi_12_64);
 
   // stage 4
-  s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
 
-  s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
-                                                  cospi_24_64);
-  s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
-                                                   cospi_8_64);
+  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
+                                                   cospi_24_64);
+  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+                                                    cospi_8_64);
 
-  s2_20 = vsubq_s16(s1_23, s1_20);
-  s2_21 = vsubq_s16(s1_22, s1_21);
-  s2_22 = vaddq_s16(s1_21, s1_22);
-  s2_23 = vaddq_s16(s1_20, s1_23);
-  s2_24 = vaddq_s16(s1_24, s1_27);
-  s2_25 = vaddq_s16(s1_25, s1_26);
-  s2_26 = vsubq_s16(s1_25, s1_26);
-  s2_27 = vsubq_s16(s1_24, s1_27);
+  s2[20] = vsubq_s16(s1[23], s1[20]);
+  s2[21] = vsubq_s16(s1[22], s1[21]);
+  s2[22] = vaddq_s16(s1[21], s1[22]);
+  s2[23] = vaddq_s16(s1[20], s1[23]);
+  s2[24] = vaddq_s16(s1[24], s1[27]);
+  s2[25] = vaddq_s16(s1[25], s1[26]);
+  s2[26] = vsubq_s16(s1[25], s1[26]);
+  s2[27] = vsubq_s16(s1[24], s1[27]);
 
   // stage 5
-  s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
-  s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
 
-  s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30,
-                                                   cospi_24_64);
-  s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30,
-                                                   cospi_8_64);
+  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30],
+                                                    cospi_24_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30],
+                                                    cospi_8_64);
 
-  s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31,
-                                                   cospi_24_64);
-  s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31,
-                                                   cospi_8_64);
+  s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31],
+                                                    cospi_24_64);
+  s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31],
+                                                    cospi_8_64);
 
-  s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
-                                                   -cospi_8_64);
-  s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
-                                                   cospi_24_64);
+  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+                                                    s2[27], -cospi_8_64);
+  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+                                                    cospi_24_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
-                                                   -cospi_8_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
-                                                   cospi_24_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+                                                    s2[26], -cospi_8_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s2_0 = vaddq_s16(s1_0, s1_7);
-  s2_1 = vaddq_s16(s1_0, s1_6);
-  s2_2 = vaddq_s16(s1_0, s1_5);
-  s2_3 = vaddq_s16(s1_0, s1_4);
-  s2_4 = vsubq_s16(s1_0, s1_4);
-  s2_5 = vsubq_s16(s1_0, s1_5);
-  s2_6 = vsubq_s16(s1_0, s1_6);
-  s2_7 = vsubq_s16(s1_0, s1_7);
+  s2[0] = vaddq_s16(s1[0], s1[7]);
+  s2[1] = vaddq_s16(s1[0], s1[6]);
+  s2[2] = vaddq_s16(s1[0], s1[5]);
+  s2[3] = vaddq_s16(s1[0], s1[4]);
+  s2[4] = vsubq_s16(s1[0], s1[4]);
+  s2[5] = vsubq_s16(s1[0], s1[5]);
+  s2[6] = vsubq_s16(s1[0], s1[6]);
+  s2[7] = vsubq_s16(s1[0], s1[7]);
 
-  s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64);
-  s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64);
+  s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64);
+  s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64);
 
-  s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64);
-  s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64);
+  s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64);
+  s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64);
 
-  s2_16 = vaddq_s16(s1_16, s2_23);
-  s2_17 = vaddq_s16(s1_17, s2_22);
-  s2_18 = vaddq_s16(s1_18, s1_21);
-  s2_19 = vaddq_s16(s1_19, s1_20);
-  s2_20 = vsubq_s16(s1_19, s1_20);
-  s2_21 = vsubq_s16(s1_18, s1_21);
-  s2_22 = vsubq_s16(s1_17, s2_22);
-  s2_23 = vsubq_s16(s1_16, s2_23);
+  s2[16] = vaddq_s16(s1[16], s2[23]);
+  s2[17] = vaddq_s16(s1[17], s2[22]);
+  s2[18] = vaddq_s16(s1[18], s1[21]);
+  s2[19] = vaddq_s16(s1[19], s1[20]);
+  s2[20] = vsubq_s16(s1[19], s1[20]);
+  s2[21] = vsubq_s16(s1[18], s1[21]);
+  s2[22] = vsubq_s16(s1[17], s2[22]);
+  s2[23] = vsubq_s16(s1[16], s2[23]);
 
-  s3_24 = vsubq_s16(s1_31, s2_24);
-  s3_25 = vsubq_s16(s1_30, s2_25);
-  s3_26 = vsubq_s16(s1_29, s1_26);
-  s3_27 = vsubq_s16(s1_28, s1_27);
-  s2_28 = vaddq_s16(s1_27, s1_28);
-  s2_29 = vaddq_s16(s1_26, s1_29);
-  s2_30 = vaddq_s16(s2_25, s1_30);
-  s2_31 = vaddq_s16(s2_24, s1_31);
+  s3[24] = vsubq_s16(s1[31], s2[24]);
+  s3[25] = vsubq_s16(s1[30], s2[25]);
+  s3[26] = vsubq_s16(s1[29], s1[26]);
+  s3[27] = vsubq_s16(s1[28], s1[27]);
+  s2[28] = vaddq_s16(s1[27], s1[28]);
+  s2[29] = vaddq_s16(s1[26], s1[29]);
+  s2[30] = vaddq_s16(s2[25], s1[30]);
+  s2[31] = vaddq_s16(s2[24], s1[31]);
 
   // stage 7
-  s1_0 = vaddq_s16(s2_0, s2_15);
-  s1_1 = vaddq_s16(s2_1, s2_14);
-  s1_2 = vaddq_s16(s2_2, s2_13);
-  s1_3 = vaddq_s16(s2_3, s2_12);
-  s1_4 = vaddq_s16(s2_4, s2_11);
-  s1_5 = vaddq_s16(s2_5, s2_10);
-  s1_6 = vaddq_s16(s2_6, s2_9);
-  s1_7 = vaddq_s16(s2_7, s2_8);
-  s1_8 = vsubq_s16(s2_7, s2_8);
-  s1_9 = vsubq_s16(s2_6, s2_9);
-  s1_10 = vsubq_s16(s2_5, s2_10);
-  s1_11 = vsubq_s16(s2_4, s2_11);
-  s1_12 = vsubq_s16(s2_3, s2_12);
-  s1_13 = vsubq_s16(s2_2, s2_13);
-  s1_14 = vsubq_s16(s2_1, s2_14);
-  s1_15 = vsubq_s16(s2_0, s2_15);
+  s1[0] = vaddq_s16(s2[0], s2[15]);
+  s1[1] = vaddq_s16(s2[1], s2[14]);
+  s1[2] = vaddq_s16(s2[2], s2[13]);
+  s1[3] = vaddq_s16(s2[3], s2[12]);
+  s1[4] = vaddq_s16(s2[4], s2[11]);
+  s1[5] = vaddq_s16(s2[5], s2[10]);
+  s1[6] = vaddq_s16(s2[6], s2[9]);
+  s1[7] = vaddq_s16(s2[7], s2[8]);
+  s1[8] = vsubq_s16(s2[7], s2[8]);
+  s1[9] = vsubq_s16(s2[6], s2[9]);
+  s1[10] = vsubq_s16(s2[5], s2[10]);
+  s1[11] = vsubq_s16(s2[4], s2[11]);
+  s1[12] = vsubq_s16(s2[3], s2[12]);
+  s1[13] = vsubq_s16(s2[2], s2[13]);
+  s1[14] = vsubq_s16(s2[1], s2[14]);
+  s1[15] = vsubq_s16(s2[0], s2[15]);
 
-  s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
-  s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
 
-  s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
-  s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
 
-  s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64);
-  s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64);
+  s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64);
+  s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64);
 
-  s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64);
-  s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64);
+  s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64);
+  s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64);
 
   // final stage
-  vst1q_s16(output, vaddq_s16(s1_0, s2_31));
+  vst1q_s16(output, vaddq_s16(s1[0], s2[31]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_1, s2_30));
+  vst1q_s16(output, vaddq_s16(s1[1], s2[30]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_2, s2_29));
+  vst1q_s16(output, vaddq_s16(s1[2], s2[29]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_3, s2_28));
+  vst1q_s16(output, vaddq_s16(s1[3], s2[28]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_4, s1_27));
+  vst1q_s16(output, vaddq_s16(s1[4], s1[27]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_5, s1_26));
+  vst1q_s16(output, vaddq_s16(s1[5], s1[26]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_6, s1_25));
+  vst1q_s16(output, vaddq_s16(s1[6], s1[25]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_7, s1_24));
+  vst1q_s16(output, vaddq_s16(s1[7], s1[24]));
   output += 8;
 
-  vst1q_s16(output, vaddq_s16(s1_8, s1_23));
+  vst1q_s16(output, vaddq_s16(s1[8], s1[23]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_9, s1_22));
+  vst1q_s16(output, vaddq_s16(s1[9], s1[22]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_10, s1_21));
+  vst1q_s16(output, vaddq_s16(s1[10], s1[21]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_11, s1_20));
+  vst1q_s16(output, vaddq_s16(s1[11], s1[20]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_12, s2_19));
+  vst1q_s16(output, vaddq_s16(s1[12], s2[19]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_13, s2_18));
+  vst1q_s16(output, vaddq_s16(s1[13], s2[18]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_14, s2_17));
+  vst1q_s16(output, vaddq_s16(s1[14], s2[17]));
   output += 8;
-  vst1q_s16(output, vaddq_s16(s1_15, s2_16));
+  vst1q_s16(output, vaddq_s16(s1[15], s2[16]));
   output += 8;
 
-  vst1q_s16(output, vsubq_s16(s1_15, s2_16));
+  vst1q_s16(output, vsubq_s16(s1[15], s2[16]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_14, s2_17));
+  vst1q_s16(output, vsubq_s16(s1[14], s2[17]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_13, s2_18));
+  vst1q_s16(output, vsubq_s16(s1[13], s2[18]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_12, s2_19));
+  vst1q_s16(output, vsubq_s16(s1[12], s2[19]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_11, s1_20));
+  vst1q_s16(output, vsubq_s16(s1[11], s1[20]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_10, s1_21));
+  vst1q_s16(output, vsubq_s16(s1[10], s1[21]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_9, s1_22));
+  vst1q_s16(output, vsubq_s16(s1[9], s1[22]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_8, s1_23));
+  vst1q_s16(output, vsubq_s16(s1[8], s1[23]));
   output += 8;
 
-  vst1q_s16(output, vsubq_s16(s1_7, s1_24));
+  vst1q_s16(output, vsubq_s16(s1[7], s1[24]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_6, s1_25));
+  vst1q_s16(output, vsubq_s16(s1[6], s1[25]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_5, s1_26));
+  vst1q_s16(output, vsubq_s16(s1[5], s1[26]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_4, s1_27));
+  vst1q_s16(output, vsubq_s16(s1[4], s1[27]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_3, s2_28));
+  vst1q_s16(output, vsubq_s16(s1[3], s2[28]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_2, s2_29));
+  vst1q_s16(output, vsubq_s16(s1[2], s2[29]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_1, s2_30));
+  vst1q_s16(output, vsubq_s16(s1[1], s2[30]));
   output += 8;
-  vst1q_s16(output, vsubq_s16(s1_0, s2_31));
+  vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
 }
 
-static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
-  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
-  int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
-  int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
-      s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
-      s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30,
-      s1_31;
-  int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10,
-      s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20,
-      s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30,
-      s2_31;
-  int16x8_t s3_24, s3_25, s3_26, s3_27;
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
+                       const int highbd_flag) {
+  int16x8_t in[8], s1[32], s2[32], s3[32], out[32];
 
-  load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6,
-                             &in7);
+  load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
+                             &in[5], &in[6], &in[7]);
 
   // stage 1
-  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
+  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);
 
   // Different for _8_
-  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
+  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);
 
-  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
+  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);
 
-  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
+  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);
 
   // stage 2
-  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
+  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);
 
-  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
+  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);
 
   // stage 3
-  s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
-  s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
+  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);
 
-  s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
-                                                   cospi_28_64);
-  s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
-                                                   cospi_4_64);
+  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
+                                                    cospi_28_64);
+  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
+                                                    cospi_4_64);
 
   // Different for _8_
-  s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28,
-                                                   -cospi_4_64);
-  s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28,
-                                                   cospi_28_64);
+  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64,
+                                                    s1[28], -cospi_4_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28],
+                                                    cospi_28_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
-                                                   cospi_12_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
-                                                   cospi_20_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
+                                                    s1[27], cospi_12_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
+                                                    cospi_20_64);
 
-  s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
-                                                   -cospi_20_64);
-  s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
-                                                   cospi_12_64);
+  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
+                                                    s1[24], -cospi_20_64);
+  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
+                                                    s1[24], cospi_12_64);
 
   // stage 4
-  s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);
 
-  s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
-                                                  cospi_24_64);
-  s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
-                                                   cospi_8_64);
-
-  s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12,
-                                                   -cospi_8_64);
-  s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12,
+  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                    cospi_24_64);
+  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
+                                                    cospi_8_64);
 
-  s2_16 = vaddq_s16(s1_16, s1_19);
+  s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64,
+                                                    s2[12], -cospi_8_64);
+  s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12],
+                                                    cospi_24_64);
 
-  s2_17 = vaddq_s16(s1_17, s1_18);
-  s2_18 = vsubq_s16(s1_17, s1_18);
+  s2[16] = vaddq_s16(s1[16], s1[19]);
 
-  s2_19 = vsubq_s16(s1_16, s1_19);
+  s2[17] = vaddq_s16(s1[17], s1[18]);
+  s2[18] = vsubq_s16(s1[17], s1[18]);
 
-  s2_20 = vsubq_s16(s1_23, s1_20);
-  s2_21 = vsubq_s16(s1_22, s1_21);
+  s2[19] = vsubq_s16(s1[16], s1[19]);
 
-  s2_22 = vaddq_s16(s1_21, s1_22);
-  s2_23 = vaddq_s16(s1_20, s1_23);
+  s2[20] = vsubq_s16(s1[23], s1[20]);
+  s2[21] = vsubq_s16(s1[22], s1[21]);
 
-  s2_24 = vaddq_s16(s1_24, s1_27);
-  s2_25 = vaddq_s16(s1_25, s1_26);
-  s2_26 = vsubq_s16(s1_25, s1_26);
-  s2_27 = vsubq_s16(s1_24, s1_27);
+  s2[22] = vaddq_s16(s1[21], s1[22]);
+  s2[23] = vaddq_s16(s1[20], s1[23]);
 
-  s2_28 = vsubq_s16(s1_31, s1_28);
-  s2_29 = vsubq_s16(s1_30, s1_29);
-  s2_30 = vaddq_s16(s1_29, s1_30);
-  s2_31 = vaddq_s16(s1_28, s1_31);
+  s2[24] = vaddq_s16(s1[24], s1[27]);
+  s2[25] = vaddq_s16(s1[25], s1[26]);
+  s2[26] = vsubq_s16(s1[25], s1[26]);
+  s2[27] = vsubq_s16(s1[24], s1[27]);
+
+  s2[28] = vsubq_s16(s1[31], s1[28]);
+  s2[29] = vsubq_s16(s1[30], s1[29]);
+  s2[30] = vaddq_s16(s1[29], s1[30]);
+  s2[31] = vaddq_s16(s1[28], s1[31]);
 
   // stage 5
-  s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
-  s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);
+  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
+  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);
 
-  s1_8 = vaddq_s16(s2_8, s2_11);
-  s1_9 = vaddq_s16(s2_9, s2_10);
-  s1_10 = vsubq_s16(s2_9, s2_10);
-  s1_11 = vsubq_s16(s2_8, s2_11);
-  s1_12 = vsubq_s16(s2_15, s2_12);
-  s1_13 = vsubq_s16(s2_14, s2_13);
-  s1_14 = vaddq_s16(s2_13, s2_14);
-  s1_15 = vaddq_s16(s2_12, s2_15);
+  s1[8] = vaddq_s16(s2[8], s2[11]);
+  s1[9] = vaddq_s16(s2[9], s2[10]);
+  s1[10] = vsubq_s16(s2[9], s2[10]);
+  s1[11] = vsubq_s16(s2[8], s2[11]);
+  s1[12] = vsubq_s16(s2[15], s2[12]);
+  s1[13] = vsubq_s16(s2[14], s2[13]);
+  s1[14] = vaddq_s16(s2[13], s2[14]);
+  s1[15] = vaddq_s16(s2[12], s2[15]);
 
-  s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29,
-                                                   cospi_24_64);
-  s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29,
-                                                   cospi_8_64);
+  s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29],
+                                                    cospi_24_64);
+  s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29],
+                                                    cospi_8_64);
 
-  s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28,
-                                                   cospi_24_64);
-  s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28,
-                                                   cospi_8_64);
+  s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28],
+                                                    cospi_24_64);
+  s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28],
+                                                    cospi_8_64);
 
-  s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
-                                                   -cospi_8_64);
-  s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
-                                                   cospi_24_64);
+  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
+                                                    s2[27], -cospi_8_64);
+  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
+                                                    cospi_24_64);
 
-  s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
-                                                   -cospi_8_64);
-  s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
-                                                   cospi_24_64);
+  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
+                                                    s2[26], -cospi_8_64);
+  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
+                                                    cospi_24_64);
 
   // stage 6
-  s2_0 = vaddq_s16(s1_0, s1_7);
-  s2_1 = vaddq_s16(s1_0, s1_6);
-  s2_2 = vaddq_s16(s1_0, s1_5);
-  s2_3 = vaddq_s16(s1_0, s1_4);
-  s2_4 = vsubq_s16(s1_0, s1_4);
-  s2_5 = vsubq_s16(s1_0, s1_5);
-  s2_6 = vsubq_s16(s1_0, s1_6);
-  s2_7 = vsubq_s16(s1_0, s1_7);
+  s2[0] = vaddq_s16(s1[0], s1[7]);
+  s2[1] = vaddq_s16(s1[0], s1[6]);
+  s2[2] = vaddq_s16(s1[0], s1[5]);
+  s2[3] = vaddq_s16(s1[0], s1[4]);
+  s2[4] = vsubq_s16(s1[0], s1[4]);
+  s2[5] = vsubq_s16(s1[0], s1[5]);
+  s2[6] = vsubq_s16(s1[0], s1[6]);
+  s2[7] = vsubq_s16(s1[0], s1[7]);
 
-  s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64);
-  s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64);
+  s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64);
+  s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64);
 
-  s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64);
-  s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64);
+  s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64);
+  s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64);
 
-  s1_16 = vaddq_s16(s2_16, s2_23);
-  s1_17 = vaddq_s16(s2_17, s2_22);
-  s2_18 = vaddq_s16(s1_18, s1_21);
-  s2_19 = vaddq_s16(s1_19, s1_20);
-  s2_20 = vsubq_s16(s1_19, s1_20);
-  s2_21 = vsubq_s16(s1_18, s1_21);
-  s1_22 = vsubq_s16(s2_17, s2_22);
-  s1_23 = vsubq_s16(s2_16, s2_23);
+  s1[16] = vaddq_s16(s2[16], s2[23]);
+  s1[17] = vaddq_s16(s2[17], s2[22]);
+  s2[18] = vaddq_s16(s1[18], s1[21]);
+  s2[19] = vaddq_s16(s1[19], s1[20]);
+  s2[20] = vsubq_s16(s1[19], s1[20]);
+  s2[21] = vsubq_s16(s1[18], s1[21]);
+  s1[22] = vsubq_s16(s2[17], s2[22]);
+  s1[23] = vsubq_s16(s2[16], s2[23]);
 
-  s3_24 = vsubq_s16(s2_31, s2_24);
-  s3_25 = vsubq_s16(s2_30, s2_25);
-  s3_26 = vsubq_s16(s1_29, s1_26);
-  s3_27 = vsubq_s16(s1_28, s1_27);
-  s2_28 = vaddq_s16(s1_27, s1_28);
-  s2_29 = vaddq_s16(s1_26, s1_29);
-  s2_30 = vaddq_s16(s2_25, s2_30);
-  s2_31 = vaddq_s16(s2_24, s2_31);
+  s3[24] = vsubq_s16(s2[31], s2[24]);
+  s3[25] = vsubq_s16(s2[30], s2[25]);
+  s3[26] = vsubq_s16(s1[29], s1[26]);
+  s3[27] = vsubq_s16(s1[28], s1[27]);
+  s2[28] = vaddq_s16(s1[27], s1[28]);
+  s2[29] = vaddq_s16(s1[26], s1[29]);
+  s2[30] = vaddq_s16(s2[25], s2[30]);
+  s2[31] = vaddq_s16(s2[24], s2[31]);
 
   // stage 7
-  s1_0 = vaddq_s16(s2_0, s1_15);
-  s1_1 = vaddq_s16(s2_1, s1_14);
-  s1_2 = vaddq_s16(s2_2, s2_13);
-  s1_3 = vaddq_s16(s2_3, s2_12);
-  s1_4 = vaddq_s16(s2_4, s2_11);
-  s1_5 = vaddq_s16(s2_5, s2_10);
-  s1_6 = vaddq_s16(s2_6, s1_9);
-  s1_7 = vaddq_s16(s2_7, s1_8);
-  s1_8 = vsubq_s16(s2_7, s1_8);
-  s1_9 = vsubq_s16(s2_6, s1_9);
-  s1_10 = vsubq_s16(s2_5, s2_10);
-  s1_11 = vsubq_s16(s2_4, s2_11);
-  s1_12 = vsubq_s16(s2_3, s2_12);
-  s1_13 = vsubq_s16(s2_2, s2_13);
-  s1_14 = vsubq_s16(s2_1, s1_14);
-  s1_15 = vsubq_s16(s2_0, s1_15);
+  s1[0] = vaddq_s16(s2[0], s1[15]);
+  s1[1] = vaddq_s16(s2[1], s1[14]);
+  s1[2] = vaddq_s16(s2[2], s2[13]);
+  s1[3] = vaddq_s16(s2[3], s2[12]);
+  s1[4] = vaddq_s16(s2[4], s2[11]);
+  s1[5] = vaddq_s16(s2[5], s2[10]);
+  s1[6] = vaddq_s16(s2[6], s1[9]);
+  s1[7] = vaddq_s16(s2[7], s1[8]);
+  s1[8] = vsubq_s16(s2[7], s1[8]);
+  s1[9] = vsubq_s16(s2[6], s1[9]);
+  s1[10] = vsubq_s16(s2[5], s2[10]);
+  s1[11] = vsubq_s16(s2[4], s2[11]);
+  s1[12] = vsubq_s16(s2[3], s2[12]);
+  s1[13] = vsubq_s16(s2[2], s2[13]);
+  s1[14] = vsubq_s16(s2[1], s1[14]);
+  s1[15] = vsubq_s16(s2[0], s1[15]);
 
-  s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
-  s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);
+  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
+  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);
 
-  s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
-  s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);
+  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
+  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);
 
-  s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64);
-  s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64);
+  s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64);
+  s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64);
 
-  s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64);
-  s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64);
+  s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64);
+  s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);
 
   // final stage
-  out0 = vaddq_s16(s1_0, s2_31);
-  out1 = vaddq_s16(s1_1, s2_30);
-  out2 = vaddq_s16(s1_2, s2_29);
-  out3 = vaddq_s16(s1_3, s2_28);
-  out4 = vaddq_s16(s1_4, s1_27);
-  out5 = vaddq_s16(s1_5, s1_26);
-  out6 = vaddq_s16(s1_6, s1_25);
-  out7 = vaddq_s16(s1_7, s1_24);
+  out[0] = final_add(s1[0], s2[31]);
+  out[1] = final_add(s1[1], s2[30]);
+  out[2] = final_add(s1[2], s2[29]);
+  out[3] = final_add(s1[3], s2[28]);
+  out[4] = final_add(s1[4], s1[27]);
+  out[5] = final_add(s1[5], s1[26]);
+  out[6] = final_add(s1[6], s1[25]);
+  out[7] = final_add(s1[7], s1[24]);
+  out[8] = final_add(s1[8], s2[23]);
+  out[9] = final_add(s1[9], s2[22]);
+  out[10] = final_add(s1[10], s1[21]);
+  out[11] = final_add(s1[11], s1[20]);
+  out[12] = final_add(s1[12], s2[19]);
+  out[13] = final_add(s1[13], s2[18]);
+  out[14] = final_add(s1[14], s1[17]);
+  out[15] = final_add(s1[15], s1[16]);
+  out[16] = final_sub(s1[15], s1[16]);
+  out[17] = final_sub(s1[14], s1[17]);
+  out[18] = final_sub(s1[13], s2[18]);
+  out[19] = final_sub(s1[12], s2[19]);
+  out[20] = final_sub(s1[11], s1[20]);
+  out[21] = final_sub(s1[10], s1[21]);
+  out[22] = final_sub(s1[9], s2[22]);
+  out[23] = final_sub(s1[8], s2[23]);
+  out[24] = final_sub(s1[7], s1[24]);
+  out[25] = final_sub(s1[6], s1[25]);
+  out[26] = final_sub(s1[5], s1[26]);
+  out[27] = final_sub(s1[4], s1[27]);
+  out[28] = final_sub(s1[3], s2[28]);
+  out[29] = final_sub(s1[2], s2[29]);
+  out[30] = final_sub(s1[1], s2[30]);
+  out[31] = final_sub(s1[0], s2[31]);
 
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
-                       stride);
-
-  out0 = vaddq_s16(s1_8, s2_23);
-  out1 = vaddq_s16(s1_9, s2_22);
-  out2 = vaddq_s16(s1_10, s1_21);
-  out3 = vaddq_s16(s1_11, s1_20);
-  out4 = vaddq_s16(s1_12, s2_19);
-  out5 = vaddq_s16(s1_13, s2_18);
-  out6 = vaddq_s16(s1_14, s1_17);
-  out7 = vaddq_s16(s1_15, s1_16);
-
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (8 * stride), stride);
-
-  out0 = vsubq_s16(s1_15, s1_16);
-  out1 = vsubq_s16(s1_14, s1_17);
-  out2 = vsubq_s16(s1_13, s2_18);
-  out3 = vsubq_s16(s1_12, s2_19);
-  out4 = vsubq_s16(s1_11, s1_20);
-  out5 = vsubq_s16(s1_10, s1_21);
-  out6 = vsubq_s16(s1_9, s2_22);
-  out7 = vsubq_s16(s1_8, s2_23);
-
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (16 * stride), stride);
-
-  out0 = vsubq_s16(s1_7, s1_24);
-  out1 = vsubq_s16(s1_6, s1_25);
-  out2 = vsubq_s16(s1_5, s1_26);
-  out3 = vsubq_s16(s1_4, s1_27);
-  out4 = vsubq_s16(s1_3, s2_28);
-  out5 = vsubq_s16(s1_2, s2_29);
-  out6 = vsubq_s16(s1_1, s2_30);
-  out7 = vsubq_s16(s1_0, s2_31);
-
-  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
-                       output + (24 * stride), stride);
+  if (highbd_flag) {
+    highbd_add_and_store_bd8(out, output, stride);
+  } else {
+    uint8_t *const outputT = (uint8_t *)output;
+    add_and_store_u8_s16(out + 0, outputT, stride);
+    add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
+    add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
+    add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
+  }
 }
 
 void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
@@ -524,10 +503,10 @@ void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
   int16_t temp[32 * 8];
   int16_t *t = temp;
 
-  idct32_6_neon(input, t);
+  vpx_idct32_6_neon(input, t);
 
   for (i = 0; i < 32; i += 8) {
-    idct32_8_neon(t, dest, stride);
+    vpx_idct32_8_neon(t, dest, stride, 0);
     t += (8 * 8);
     dest += 8;
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
index de1bf97875..9f4589ea96 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -13,147 +13,143 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
-#define LOAD_FROM_TRANSPOSED(prev, first, second) \
-  q14s16 = vld1q_s16(trans_buf + first * 8);      \
-  q13s16 = vld1q_s16(trans_buf + second * 8);
+static INLINE void load_from_transformed(const int16_t *const trans_buf,
+                                         const int first, const int second,
+                                         int16x8_t *const q0,
+                                         int16x8_t *const q1) {
+  *q0 = vld1q_s16(trans_buf + first * 8);
+  *q1 = vld1q_s16(trans_buf + second * 8);
+}
 
-#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
-  qA = vld1q_s16(out + first * 32);                   \
-  qB = vld1q_s16(out + second * 32);
+static INLINE void load_from_output(const int16_t *const out, const int first,
+                                    const int second, int16x8_t *const q0,
+                                    int16x8_t *const q1) {
+  *q0 = vld1q_s16(out + first * 32);
+  *q1 = vld1q_s16(out + second * 32);
+}
 
-#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
-  vst1q_s16(out + first * 32, qA);                   \
-  vst1q_s16(out + second * 32, qB);
+static INLINE void store_in_output(int16_t *const out, const int first,
+                                   const int second, const int16x8_t q0,
+                                   const int16x8_t q1) {
+  vst1q_s16(out + first * 32, q0);
+  vst1q_s16(out + second * 32, q1);
+}
 
-#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
-  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
-static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
-                                                  int stride, int16x8_t q6s16,
-                                                  int16x8_t q7s16,
-                                                  int16x8_t q8s16,
-                                                  int16x8_t q9s16) {
-  int16x4_t d8s16, d9s16, d10s16, d11s16;
+static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2,
+                                         const int stride, int16x8_t q0,
+                                         int16x8_t q1, int16x8_t q2,
+                                         int16x8_t q3) {
+  uint8x8_t d[4];
 
-  d8s16 = vld1_s16((int16_t *)p1);
+  d[0] = vld1_u8(p1);
   p1 += stride;
-  d11s16 = vld1_s16((int16_t *)p2);
+  d[1] = vld1_u8(p1);
+  d[3] = vld1_u8(p2);
   p2 -= stride;
-  d9s16 = vld1_s16((int16_t *)p1);
-  d10s16 = vld1_s16((int16_t *)p2);
+  d[2] = vld1_u8(p2);
 
-  q7s16 = vrshrq_n_s16(q7s16, 6);
-  q8s16 = vrshrq_n_s16(q8s16, 6);
-  q9s16 = vrshrq_n_s16(q9s16, 6);
-  q6s16 = vrshrq_n_s16(q6s16, 6);
+  q0 = vrshrq_n_s16(q0, 6);
+  q1 = vrshrq_n_s16(q1, 6);
+  q2 = vrshrq_n_s16(q2, 6);
+  q3 = vrshrq_n_s16(q3, 6);
 
-  q7s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
-  q8s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
-  q9s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
-  q6s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
+  q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0]));
+  q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1]));
+  q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2]));
+  q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3]));
 
-  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
-  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
-  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+  d[0] = vqmovun_s16(q0);
+  d[1] = vqmovun_s16(q1);
+  d[2] = vqmovun_s16(q2);
+  d[3] = vqmovun_s16(q3);
 
-  vst1_s16((int16_t *)p1, d9s16);
+  vst1_u8(p1, d[1]);
   p1 -= stride;
-  vst1_s16((int16_t *)p2, d10s16);
+  vst1_u8(p1, d[0]);
+  vst1_u8(p2, d[2]);
   p2 += stride;
-  vst1_s16((int16_t *)p1, d8s16);
-  vst1_s16((int16_t *)p2, d11s16);
+  vst1_u8(p2, d[3]);
 }
 
-#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
-  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
-static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
-                                                   int stride, int16x8_t q4s16,
-                                                   int16x8_t q5s16,
-                                                   int16x8_t q6s16,
-                                                   int16x8_t q7s16) {
-  int16x4_t d4s16, d5s16, d6s16, d7s16;
+static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2,
+                                                    const int stride,
+                                                    int16x8_t q0, int16x8_t q1,
+                                                    int16x8_t q2,
+                                                    int16x8_t q3) {
+  uint16x8_t d[4];
 
-  d4s16 = vld1_s16((int16_t *)p1);
+  d[0] = vld1q_u16(p1);
   p1 += stride;
-  d7s16 = vld1_s16((int16_t *)p2);
+  d[1] = vld1q_u16(p1);
+  d[3] = vld1q_u16(p2);
   p2 -= stride;
-  d5s16 = vld1_s16((int16_t *)p1);
-  d6s16 = vld1_s16((int16_t *)p2);
+  d[2] = vld1q_u16(p2);
 
-  q5s16 = vrshrq_n_s16(q5s16, 6);
-  q6s16 = vrshrq_n_s16(q6s16, 6);
-  q7s16 = vrshrq_n_s16(q7s16, 6);
-  q4s16 = vrshrq_n_s16(q4s16, 6);
+  q0 = vrshrq_n_s16(q0, 6);
+  q1 = vrshrq_n_s16(q1, 6);
+  q2 = vrshrq_n_s16(q2, 6);
+  q3 = vrshrq_n_s16(q3, 6);
 
-  q5s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
-  q6s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
-  q7s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
-  q4s16 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
+  q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0]));
+  q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1]));
+  q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2]));
+  q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3]));
 
-  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
-  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
-  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
-  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+  d[0] = vmovl_u8(vqmovun_s16(q0));
+  d[1] = vmovl_u8(vqmovun_s16(q1));
+  d[2] = vmovl_u8(vqmovun_s16(q2));
+  d[3] = vmovl_u8(vqmovun_s16(q3));
 
-  vst1_s16((int16_t *)p1, d5s16);
+  vst1q_u16(p1, d[1]);
   p1 -= stride;
-  vst1_s16((int16_t *)p2, d6s16);
+  vst1q_u16(p1, d[0]);
+  vst1q_u16(p2, d[2]);
   p2 += stride;
-  vst1_s16((int16_t *)p2, d7s16);
-  vst1_s16((int16_t *)p1, d4s16);
+  vst1q_u16(p2, d[3]);
 }
 
-#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
-  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
-static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
-                                int16_t first_const, int16_t second_const,
-                                int16x8_t *qAs16, int16x8_t *qBs16) {
-  int16x4_t d30s16, d31s16;
-  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
-  int16x4_t dCs16, dDs16, dAs16, dBs16;
+static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1,
+                                const int16_t first_const,
+                                const int16_t second_const,
+                                int16x8_t *const qOut0,
+                                int16x8_t *const qOut1) {
+  int32x4_t q[4];
+  int16x4_t d[6];
 
-  dCs16 = vget_low_s16(q14s16);
-  dDs16 = vget_high_s16(q14s16);
-  dAs16 = vget_low_s16(q13s16);
-  dBs16 = vget_high_s16(q13s16);
+  d[0] = vget_low_s16(qIn0);
+  d[1] = vget_high_s16(qIn0);
+  d[2] = vget_low_s16(qIn1);
+  d[3] = vget_high_s16(qIn1);
 
-  d30s16 = vdup_n_s16(first_const);
-  d31s16 = vdup_n_s16(second_const);
+  // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9.
+  d[4] = vdup_n_s16(first_const);
+  d[5] = vdup_n_s16(second_const);
 
-  q8s32 = vmull_s16(dCs16, d30s16);
-  q10s32 = vmull_s16(dAs16, d31s16);
-  q9s32 = vmull_s16(dDs16, d30s16);
-  q11s32 = vmull_s16(dBs16, d31s16);
-  q12s32 = vmull_s16(dCs16, d31s16);
+  q[0] = vmull_s16(d[0], d[4]);
+  q[1] = vmull_s16(d[1], d[4]);
+  q[0] = vmlsl_s16(q[0], d[2], d[5]);
+  q[1] = vmlsl_s16(q[1], d[3], d[5]);
 
-  q8s32 = vsubq_s32(q8s32, q10s32);
-  q9s32 = vsubq_s32(q9s32, q11s32);
+  q[2] = vmull_s16(d[0], d[5]);
+  q[3] = vmull_s16(d[1], d[5]);
+  q[2] = vmlal_s16(q[2], d[2], d[4]);
+  q[3] = vmlal_s16(q[3], d[3], d[4]);
 
-  q10s32 = vmull_s16(dDs16, d31s16);
-  q11s32 = vmull_s16(dAs16, d30s16);
-  q15s32 = vmull_s16(dBs16, d30s16);
-
-  q11s32 = vaddq_s32(q12s32, q11s32);
-  q10s32 = vaddq_s32(q10s32, q15s32);
-
-  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+  *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS),
+                        vrshrn_n_s32(q[1], DCT_CONST_BITS));
+  *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS),
+                        vrshrn_n_s32(q[3], DCT_CONST_BITS));
 }
 
-static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
-                               int16x8_t *s2, int16x8_t *s3, int16x8_t *s4,
-                               int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) {
+static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0,
+                               int16x8_t *const s1, int16x8_t *const s2,
+                               int16x8_t *const s3, int16x8_t *const s4,
+                               int16x8_t *const s5, int16x8_t *const s6,
+                               int16x8_t *const s7) {
   *s0 = vld1q_s16(in);
   in += 32;
   *s1 = vld1q_s16(in);
@@ -207,11 +203,10 @@ static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0,
-                                        int16x8_t *s1, int16x8_t *s2,
-                                        int16x8_t *s3, int16x8_t *s4,
-                                        int16x8_t *s5, int16x8_t *s6,
-                                        int16x8_t *s7) {
+static INLINE void load_s16x8q_tran_low(
+    const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1,
+    int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4,
+    int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) {
   *s0 = load_tran_low_to_s16q(in);
   in += 32;
   *s1 = load_tran_low_to_s16q(in);
@@ -243,197 +238,287 @@ static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input,
 #define idct32_transpose_pair_tran_low idct32_transpose_pair
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
-                                             int16x8_t q3s16, int16x8_t q6s16,
-                                             int16x8_t q7s16, int16x8_t q8s16,
-                                             int16x8_t q9s16, int16x8_t q10s16,
-                                             int16x8_t q11s16, int16x8_t q12s16,
-                                             int16x8_t q13s16, int16x8_t q14s16,
-                                             int16x8_t q15s16) {
-  int16x8_t q0s16, q1s16, q4s16, q5s16;
+static INLINE void idct32_bands_end_1st_pass(int16_t *const out,
+                                             int16x8_t *const q) {
+  store_in_output(out, 16, 17, q[6], q[7]);
+  store_in_output(out, 14, 15, q[8], q[9]);
 
-  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
-  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 30, 31, q[6], q[7]);
+  store_in_output(out, 0, 1, q[4], q[5]);
 
-  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
-  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[10], q[1]);
+  q[3] = vaddq_s16(q[11], q[0]);
+  q[4] = vsubq_s16(q[11], q[0]);
+  q[5] = vsubq_s16(q[10], q[1]);
 
-  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
-  q2s16 = vaddq_s16(q10s16, q1s16);
-  q3s16 = vaddq_s16(q11s16, q0s16);
-  q4s16 = vsubq_s16(q11s16, q0s16);
-  q5s16 = vsubq_s16(q10s16, q1s16);
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = vaddq_s16(q[4], q[1]);
+  q[9] = vaddq_s16(q[5], q[0]);
+  q[6] = vsubq_s16(q[5], q[0]);
+  q[7] = vsubq_s16(q[4], q[1]);
+  store_in_output(out, 18, 19, q[6], q[7]);
+  store_in_output(out, 12, 13, q[8], q[9]);
 
-  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
-  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 28, 29, q[6], q[7]);
+  store_in_output(out, 2, 3, q[4], q[5]);
 
-  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
-  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[12], q[1]);
+  q[3] = vaddq_s16(q[13], q[0]);
+  q[4] = vsubq_s16(q[13], q[0]);
+  q[5] = vsubq_s16(q[12], q[1]);
 
-  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
-  q2s16 = vaddq_s16(q12s16, q1s16);
-  q3s16 = vaddq_s16(q13s16, q0s16);
-  q4s16 = vsubq_s16(q13s16, q0s16);
-  q5s16 = vsubq_s16(q12s16, q1s16);
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = vaddq_s16(q[4], q[1]);
+  q[9] = vaddq_s16(q[5], q[0]);
+  q[6] = vsubq_s16(q[5], q[0]);
+  q[7] = vsubq_s16(q[4], q[1]);
+  store_in_output(out, 20, 21, q[6], q[7]);
+  store_in_output(out, 10, 11, q[8], q[9]);
 
-  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
-  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 26, 27, q[6], q[7]);
+  store_in_output(out, 4, 5, q[4], q[5]);
 
-  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
-  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[14], q[1]);
+  q[3] = vaddq_s16(q[15], q[0]);
+  q[4] = vsubq_s16(q[15], q[0]);
+  q[5] = vsubq_s16(q[14], q[1]);
 
-  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
-  q2s16 = vaddq_s16(q14s16, q1s16);
-  q3s16 = vaddq_s16(q15s16, q0s16);
-  q4s16 = vsubq_s16(q15s16, q0s16);
-  q5s16 = vsubq_s16(q14s16, q1s16);
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = vaddq_s16(q[4], q[1]);
+  q[9] = vaddq_s16(q[5], q[0]);
+  q[6] = vsubq_s16(q[5], q[0]);
+  q[7] = vsubq_s16(q[4], q[1]);
+  store_in_output(out, 22, 23, q[6], q[7]);
+  store_in_output(out, 8, 9, q[8], q[9]);
 
-  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
-  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
-
-  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
-  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = vaddq_s16(q[2], q[1]);
+  q[5] = vaddq_s16(q[3], q[0]);
+  q[6] = vsubq_s16(q[3], q[0]);
+  q[7] = vsubq_s16(q[2], q[1]);
+  store_in_output(out, 24, 25, q[6], q[7]);
+  store_in_output(out, 6, 7, q[4], q[5]);
 }
 
-static INLINE void idct32_bands_end_2nd_pass(
-    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
-    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
-    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
-    int16x8_t q14s16, int16x8_t q15s16) {
-  uint8_t *r6 = dest + 31 * stride;
-  uint8_t *r7 = dest /* +  0 * stride*/;
-  uint8_t *r9 = dest + 15 * stride;
-  uint8_t *r10 = dest + 16 * stride;
-  int str2 = stride << 1;
-  int16x8_t q0s16, q1s16, q4s16, q5s16;
+static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
+                                             uint8_t *const dest,
+                                             const int stride,
+                                             int16x8_t *const q) {
+  uint8_t *dest0 = dest + 0 * stride;
+  uint8_t *dest1 = dest + 31 * stride;
+  uint8_t *dest2 = dest + 16 * stride;
+  uint8_t *dest3 = dest + 15 * stride;
+  const int str2 = stride << 1;
 
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  dest2 += str2;
+  dest3 -= str2;
 
-  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  dest0 += str2;
+  dest1 -= str2;
 
-  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
-  q2s16 = vaddq_s16(q10s16, q1s16);
-  q3s16 = vaddq_s16(q11s16, q0s16);
-  q4s16 = vsubq_s16(q11s16, q0s16);
-  q5s16 = vsubq_s16(q10s16, q1s16);
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[10], q[1]);
+  q[3] = vaddq_s16(q[11], q[0]);
+  q[4] = vsubq_s16(q[11], q[0]);
+  q[5] = vsubq_s16(q[10], q[1]);
 
-  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  dest2 += str2;
+  dest3 -= str2;
 
-  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  dest0 += str2;
+  dest1 -= str2;
 
-  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
-  q2s16 = vaddq_s16(q12s16, q1s16);
-  q3s16 = vaddq_s16(q13s16, q0s16);
-  q4s16 = vsubq_s16(q13s16, q0s16);
-  q5s16 = vsubq_s16(q12s16, q1s16);
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[12], q[1]);
+  q[3] = vaddq_s16(q[13], q[0]);
+  q[4] = vsubq_s16(q[13], q[0]);
+  q[5] = vsubq_s16(q[12], q[1]);
 
-  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
-  r10 += str2;
-  r9 -= str2;
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  dest2 += str2;
+  dest3 -= str2;
 
-  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
-  r7 += str2;
-  r6 -= str2;
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  dest0 += str2;
+  dest1 -= str2;
 
-  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
-  q2s16 = vaddq_s16(q14s16, q1s16);
-  q3s16 = vaddq_s16(q15s16, q0s16);
-  q4s16 = vsubq_s16(q15s16, q0s16);
-  q5s16 = vsubq_s16(q14s16, q1s16);
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[14], q[1]);
+  q[3] = vaddq_s16(q[15], q[0]);
+  q[4] = vsubq_s16(q[15], q[0]);
+  q[5] = vsubq_s16(q[14], q[1]);
 
-  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
-  q8s16 = vaddq_s16(q4s16, q1s16);
-  q9s16 = vaddq_s16(q5s16, q0s16);
-  q6s16 = vsubq_s16(q5s16, q0s16);
-  q7s16 = vsubq_s16(q4s16, q1s16);
-  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
 
-  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
-  q4s16 = vaddq_s16(q2s16, q1s16);
-  q5s16 = vaddq_s16(q3s16, q0s16);
-  q6s16 = vsubq_s16(q3s16, q0s16);
-  q7s16 = vsubq_s16(q2s16, q1s16);
-  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
 }
 
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
-                                 int stride) {
+static INLINE void highbd_idct32_bands_end_2nd_pass_bd8(
+    const int16_t *const out, uint16_t *const dest, const int stride,
+    int16x8_t *const q) {
+  uint16_t *dest0 = dest + 0 * stride;
+  uint16_t *dest1 = dest + 31 * stride;
+  uint16_t *dest2 = dest + 16 * stride;
+  uint16_t *dest3 = dest + 15 * stride;
+  const int str2 = stride << 1;
+
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 30, 31, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 12, 13, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[10], q[1]);
+  q[3] = vaddq_s16(q[11], q[0]);
+  q[4] = vsubq_s16(q[11], q[0]);
+  q[5] = vsubq_s16(q[10], q[1]);
+
+  load_from_output(out, 18, 19, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 28, 29, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 10, 11, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[12], q[1]);
+  q[3] = vaddq_s16(q[13], q[0]);
+  q[4] = vsubq_s16(q[13], q[0]);
+  q[5] = vsubq_s16(q[12], q[1]);
+
+  load_from_output(out, 20, 21, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+  dest2 += str2;
+  dest3 -= str2;
+
+  load_from_output(out, 26, 27, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+  dest0 += str2;
+  dest1 -= str2;
+
+  load_from_output(out, 8, 9, &q[0], &q[1]);
+  q[2] = vaddq_s16(q[14], q[1]);
+  q[3] = vaddq_s16(q[15], q[0]);
+  q[4] = vsubq_s16(q[15], q[0]);
+  q[5] = vsubq_s16(q[14], q[1]);
+
+  load_from_output(out, 22, 23, &q[0], &q[1]);
+  q[8] = final_add(q[4], q[1]);
+  q[9] = final_add(q[5], q[0]);
+  q[6] = final_sub(q[5], q[0]);
+  q[7] = final_sub(q[4], q[1]);
+  highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8],
+                                   q[9]);
+
+  load_from_output(out, 24, 25, &q[0], &q[1]);
+  q[4] = final_add(q[2], q[1]);
+  q[5] = final_add(q[3], q[0]);
+  q[6] = final_sub(q[3], q[0]);
+  q[7] = final_sub(q[2], q[1]);
+  highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6],
+                                   q[7]);
+}
+
+void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
+                        const int stride, const int highbd_flag) {
   int i, idct32_pass_loop;
   int16_t trans_buf[32 * 8];
   int16_t pass1[32 * 32];
   int16_t pass2[32 * 32];
   const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1
   int16_t *out;
-  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
-  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int16x8_t q[16];
+  uint16_t *dst = CAST_TO_SHORTPTR(dest);
 
   for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
        idct32_pass_loop++, out = pass2) {
@@ -451,237 +536,241 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
       // -----------------------------------------
       // generate 16,17,30,31
       // part of stage 1
-      LOAD_FROM_TRANSPOSED(0, 1, 31)
-      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(31, 17, 15)
-      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+      load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]);
       // part of stage 2
-      q4s16 = vaddq_s16(q0s16, q1s16);
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q6s16 = vaddq_s16(q2s16, q3s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
+      q[4] = vaddq_s16(q[0], q[1]);
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[6] = vaddq_s16(q[2], q[3]);
+      q[14] = vsubq_s16(q[2], q[3]);
       // part of stage 3
-      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]);
 
       // generate 18,19,28,29
       // part of stage 1
-      LOAD_FROM_TRANSPOSED(15, 9, 23)
-      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(23, 25, 7)
-      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+      load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]);
       // part of stage 2
-      q13s16 = vsubq_s16(q3s16, q2s16);
-      q3s16 = vaddq_s16(q3s16, q2s16);
-      q14s16 = vsubq_s16(q1s16, q0s16);
-      q2s16 = vaddq_s16(q1s16, q0s16);
+      q[13] = vsubq_s16(q[3], q[2]);
+      q[3] = vaddq_s16(q[3], q[2]);
+      q[14] = vsubq_s16(q[1], q[0]);
+      q[2] = vaddq_s16(q[1], q[0]);
       // part of stage 3
-      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+      do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]);
       // part of stage 4
-      q8s16 = vaddq_s16(q4s16, q2s16);
-      q9s16 = vaddq_s16(q5s16, q0s16);
-      q10s16 = vaddq_s16(q7s16, q1s16);
-      q15s16 = vaddq_s16(q6s16, q3s16);
-      q13s16 = vsubq_s16(q5s16, q0s16);
-      q14s16 = vsubq_s16(q7s16, q1s16);
-      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
-      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+      q[8] = vaddq_s16(q[4], q[2]);
+      q[9] = vaddq_s16(q[5], q[0]);
+      q[10] = vaddq_s16(q[7], q[1]);
+      q[15] = vaddq_s16(q[6], q[3]);
+      q[13] = vsubq_s16(q[5], q[0]);
+      q[14] = vsubq_s16(q[7], q[1]);
+      store_in_output(out, 16, 31, q[8], q[15]);
+      store_in_output(out, 17, 30, q[9], q[10]);
       // part of stage 5
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
-      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]);
+      store_in_output(out, 29, 18, q[1], q[0]);
       // part of stage 4
-      q13s16 = vsubq_s16(q4s16, q2s16);
-      q14s16 = vsubq_s16(q6s16, q3s16);
+      q[13] = vsubq_s16(q[4], q[2]);
+      q[14] = vsubq_s16(q[6], q[3]);
       // part of stage 5
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
-      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]);
+      store_in_output(out, 19, 28, q[4], q[6]);
 
       // -----------------------------------------
       // BLOCK B: 20-23,24-27
       // -----------------------------------------
       // generate 20,21,26,27
       // part of stage 1
-      LOAD_FROM_TRANSPOSED(7, 5, 27)
-      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(27, 21, 11)
-      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+      load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]);
       // part of stage 2
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[0] = vaddq_s16(q[0], q[1]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      q[2] = vaddq_s16(q[2], q[3]);
       // part of stage 3
-      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
 
       // generate 22,23,24,25
       // part of stage 1
-      LOAD_FROM_TRANSPOSED(11, 13, 19)
-      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(19, 29, 3)
-      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+      load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]);
       // part of stage 2
-      q14s16 = vsubq_s16(q4s16, q5s16);
-      q5s16 = vaddq_s16(q4s16, q5s16);
-      q13s16 = vsubq_s16(q6s16, q7s16);
-      q6s16 = vaddq_s16(q6s16, q7s16);
+      q[14] = vsubq_s16(q[4], q[5]);
+      q[5] = vaddq_s16(q[4], q[5]);
+      q[13] = vsubq_s16(q[6], q[7]);
+      q[6] = vaddq_s16(q[6], q[7]);
       // part of stage 3
-      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+      do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]);
       // part of stage 4
-      q10s16 = vaddq_s16(q7s16, q1s16);
-      q11s16 = vaddq_s16(q5s16, q0s16);
-      q12s16 = vaddq_s16(q6s16, q2s16);
-      q15s16 = vaddq_s16(q4s16, q3s16);
+      q[10] = vaddq_s16(q[7], q[1]);
+      q[11] = vaddq_s16(q[5], q[0]);
+      q[12] = vaddq_s16(q[6], q[2]);
+      q[15] = vaddq_s16(q[4], q[3]);
       // part of stage 6
-      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
-      q8s16 = vaddq_s16(q14s16, q11s16);
-      q9s16 = vaddq_s16(q13s16, q10s16);
-      q13s16 = vsubq_s16(q13s16, q10s16);
-      q11s16 = vsubq_s16(q14s16, q11s16);
-      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
-      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
-      q8s16 = vsubq_s16(q9s16, q12s16);
-      q10s16 = vaddq_s16(q14s16, q15s16);
-      q14s16 = vsubq_s16(q14s16, q15s16);
-      q12s16 = vaddq_s16(q9s16, q12s16);
-      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+      load_from_output(out, 16, 17, &q[14], &q[13]);
+      q[8] = vaddq_s16(q[14], q[11]);
+      q[9] = vaddq_s16(q[13], q[10]);
+      q[13] = vsubq_s16(q[13], q[10]);
+      q[11] = vsubq_s16(q[14], q[11]);
+      store_in_output(out, 17, 16, q[9], q[8]);
+      load_from_output(out, 30, 31, &q[14], &q[9]);
+      q[8] = vsubq_s16(q[9], q[12]);
+      q[10] = vaddq_s16(q[14], q[15]);
+      q[14] = vsubq_s16(q[14], q[15]);
+      q[12] = vaddq_s16(q[9], q[12]);
+      store_in_output(out, 30, 31, q[10], q[12]);
       // part of stage 7
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
-      q13s16 = q11s16;
-      q14s16 = q8s16;
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 25, 22, q[14], q[13]);
+      do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 24, 23, q[14], q[13]);
       // part of stage 4
-      q14s16 = vsubq_s16(q5s16, q0s16);
-      q13s16 = vsubq_s16(q6s16, q2s16);
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
-      q14s16 = vsubq_s16(q7s16, q1s16);
-      q13s16 = vsubq_s16(q4s16, q3s16);
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+      q[14] = vsubq_s16(q[5], q[0]);
+      q[13] = vsubq_s16(q[6], q[2]);
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]);
+      q[14] = vsubq_s16(q[7], q[1]);
+      q[13] = vsubq_s16(q[4], q[3]);
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]);
       // part of stage 6
-      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
-      q8s16 = vaddq_s16(q14s16, q1s16);
-      q9s16 = vaddq_s16(q13s16, q6s16);
-      q13s16 = vsubq_s16(q13s16, q6s16);
-      q1s16 = vsubq_s16(q14s16, q1s16);
-      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
-      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
-      q14s16 = vsubq_s16(q8s16, q5s16);
-      q10s16 = vaddq_s16(q8s16, q5s16);
-      q11s16 = vaddq_s16(q9s16, q0s16);
-      q0s16 = vsubq_s16(q9s16, q0s16);
-      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+      load_from_output(out, 18, 19, &q[14], &q[13]);
+      q[8] = vaddq_s16(q[14], q[1]);
+      q[9] = vaddq_s16(q[13], q[6]);
+      q[13] = vsubq_s16(q[13], q[6]);
+      q[1] = vsubq_s16(q[14], q[1]);
+      store_in_output(out, 18, 19, q[8], q[9]);
+      load_from_output(out, 28, 29, &q[8], &q[9]);
+      q[14] = vsubq_s16(q[8], q[5]);
+      q[10] = vaddq_s16(q[8], q[5]);
+      q[11] = vaddq_s16(q[9], q[0]);
+      q[0] = vsubq_s16(q[9], q[0]);
+      store_in_output(out, 28, 29, q[10], q[11]);
       // part of stage 7
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
-      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
-      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
-      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]);
+      store_in_output(out, 20, 27, q[13], q[14]);
+      do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]);
+      store_in_output(out, 21, 26, q[1], q[0]);
 
       // -----------------------------------------
       // BLOCK C: 8-10,11-15
       // -----------------------------------------
       // generate 8,9,14,15
       // part of stage 2
-      LOAD_FROM_TRANSPOSED(3, 2, 30)
-      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(30, 18, 14)
-      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+      load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]);
       // part of stage 3
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[0] = vaddq_s16(q[0], q[1]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      q[2] = vaddq_s16(q[2], q[3]);
       // part of stage 4
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]);
 
       // generate 10,11,12,13
       // part of stage 2
-      LOAD_FROM_TRANSPOSED(14, 10, 22)
-      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(22, 26, 6)
-      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+      load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]);
       // part of stage 3
-      q14s16 = vsubq_s16(q4s16, q5s16);
-      q5s16 = vaddq_s16(q4s16, q5s16);
-      q13s16 = vsubq_s16(q6s16, q7s16);
-      q6s16 = vaddq_s16(q6s16, q7s16);
+      q[14] = vsubq_s16(q[4], q[5]);
+      q[5] = vaddq_s16(q[4], q[5]);
+      q[13] = vsubq_s16(q[6], q[7]);
+      q[6] = vaddq_s16(q[6], q[7]);
       // part of stage 4
-      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+      do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]);
       // part of stage 5
-      q8s16 = vaddq_s16(q0s16, q5s16);
-      q9s16 = vaddq_s16(q1s16, q7s16);
-      q13s16 = vsubq_s16(q1s16, q7s16);
-      q14s16 = vsubq_s16(q3s16, q4s16);
-      q10s16 = vaddq_s16(q3s16, q4s16);
-      q15s16 = vaddq_s16(q2s16, q6s16);
-      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
-      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+      q[8] = vaddq_s16(q[0], q[5]);
+      q[9] = vaddq_s16(q[1], q[7]);
+      q[13] = vsubq_s16(q[1], q[7]);
+      q[14] = vsubq_s16(q[3], q[4]);
+      q[10] = vaddq_s16(q[3], q[4]);
+      q[15] = vaddq_s16(q[2], q[6]);
+      store_in_output(out, 8, 15, q[8], q[15]);
+      store_in_output(out, 9, 14, q[9], q[10]);
       // part of stage 6
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
-      q13s16 = vsubq_s16(q0s16, q5s16);
-      q14s16 = vsubq_s16(q2s16, q6s16);
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
-      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+      store_in_output(out, 13, 10, q[3], q[1]);
+      q[13] = vsubq_s16(q[0], q[5]);
+      q[14] = vsubq_s16(q[2], q[6]);
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
+      store_in_output(out, 11, 12, q[1], q[3]);
 
       // -----------------------------------------
       // BLOCK D: 0-3,4-7
       // -----------------------------------------
       // generate 4,5,6,7
       // part of stage 3
-      LOAD_FROM_TRANSPOSED(6, 4, 28)
-      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
-      LOAD_FROM_TRANSPOSED(28, 20, 12)
-      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+      load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]);
+      load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]);
       // part of stage 4
-      q13s16 = vsubq_s16(q0s16, q1s16);
-      q0s16 = vaddq_s16(q0s16, q1s16);
-      q14s16 = vsubq_s16(q2s16, q3s16);
-      q2s16 = vaddq_s16(q2s16, q3s16);
+      q[13] = vsubq_s16(q[0], q[1]);
+      q[0] = vaddq_s16(q[0], q[1]);
+      q[14] = vsubq_s16(q[2], q[3]);
+      q[2] = vaddq_s16(q[2], q[3]);
       // part of stage 5
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]);
 
       // generate 0,1,2,3
       // part of stage 4
-      LOAD_FROM_TRANSPOSED(12, 0, 16)
-      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
-      LOAD_FROM_TRANSPOSED(16, 8, 24)
-      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+      load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]);
+      load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]);
+      do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]);
       // part of stage 5
-      q4s16 = vaddq_s16(q7s16, q6s16);
-      q7s16 = vsubq_s16(q7s16, q6s16);
-      q6s16 = vsubq_s16(q5s16, q14s16);
-      q5s16 = vaddq_s16(q5s16, q14s16);
+      q[4] = vaddq_s16(q[7], q[6]);
+      q[7] = vsubq_s16(q[7], q[6]);
+      q[6] = vsubq_s16(q[5], q[14]);
+      q[5] = vaddq_s16(q[5], q[14]);
       // part of stage 6
-      q8s16 = vaddq_s16(q4s16, q2s16);
-      q9s16 = vaddq_s16(q5s16, q3s16);
-      q10s16 = vaddq_s16(q6s16, q1s16);
-      q11s16 = vaddq_s16(q7s16, q0s16);
-      q12s16 = vsubq_s16(q7s16, q0s16);
-      q13s16 = vsubq_s16(q6s16, q1s16);
-      q14s16 = vsubq_s16(q5s16, q3s16);
-      q15s16 = vsubq_s16(q4s16, q2s16);
+      q[8] = vaddq_s16(q[4], q[2]);
+      q[9] = vaddq_s16(q[5], q[3]);
+      q[10] = vaddq_s16(q[6], q[1]);
+      q[11] = vaddq_s16(q[7], q[0]);
+      q[12] = vsubq_s16(q[7], q[0]);
+      q[13] = vsubq_s16(q[6], q[1]);
+      q[14] = vsubq_s16(q[5], q[3]);
+      q[15] = vsubq_s16(q[4], q[2]);
       // part of stage 7
-      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
-      q2s16 = vaddq_s16(q8s16, q1s16);
-      q3s16 = vaddq_s16(q9s16, q0s16);
-      q4s16 = vsubq_s16(q9s16, q0s16);
-      q5s16 = vsubq_s16(q8s16, q1s16);
-      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
-      q8s16 = vaddq_s16(q4s16, q1s16);
-      q9s16 = vaddq_s16(q5s16, q0s16);
-      q6s16 = vsubq_s16(q5s16, q0s16);
-      q7s16 = vsubq_s16(q4s16, q1s16);
+      load_from_output(out, 14, 15, &q[0], &q[1]);
+      q[2] = vaddq_s16(q[8], q[1]);
+      q[3] = vaddq_s16(q[9], q[0]);
+      q[4] = vsubq_s16(q[9], q[0]);
+      q[5] = vsubq_s16(q[8], q[1]);
+      load_from_output(out, 16, 17, &q[0], &q[1]);
+      q[8] = final_add(q[4], q[1]);
+      q[9] = final_add(q[5], q[0]);
+      q[6] = final_sub(q[5], q[0]);
+      q[7] = final_sub(q[4], q[1]);
 
       if (idct32_pass_loop == 0) {
-        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
-                                  q10s16, q11s16, q12s16, q13s16, q14s16,
-                                  q15s16);
+        idct32_bands_end_1st_pass(out, q);
       } else {
-        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
-                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
-                                  q14s16, q15s16);
-        dest += 8;
+        if (highbd_flag) {
+          highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q);
+          dst += 8;
+        } else {
+          idct32_bands_end_2nd_pass(out, dest, stride, q);
+          dest += 8;
+        }
       }
     }
   }
 }
+
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {
+  vpx_idct32_32_neon(input, dest, stride, 0);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
index d1eae24a22..a14b895431 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
 static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
@@ -31,7 +32,8 @@ static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
 
 void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
                             int stride) {
-  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  const int16_t out0 =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
   const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
   const int16x8_t dc = vdupq_n_s16(a1);
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
index 184d218941..175ba7fbc2 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -17,7 +17,7 @@
 
     INCLUDE vpx_dsp/arm/idct_neon.asm.S
 
-    AREA     Block, CODE, READONLY ; name this block of code
+    AREA     Block, CODE, READONLY
 ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
 ;
 ; r0  int16_t input
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
index bff98cbc16..8192ee4cf8 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c
@@ -13,52 +13,47 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
   const uint8_t *dst = dest;
-  const int16x4_t cospis = vld1_s16(kCospi);
-  uint32x2_t dest01_u32 = vdup_n_u32(0);
-  uint32x2_t dest32_u32 = vdup_n_u32(0);
-  int16x8_t a0, a1;
-  uint8x8_t d01, d32;
-  uint16x8_t d01_u16, d32_u16;
+  uint32x2_t s32 = vdup_n_u32(0);
+  int16x8_t a[2];
+  uint8x8_t s, d[2];
+  uint16x8_t sum[2];
 
   assert(!((intptr_t)dest % sizeof(uint32_t)));
   assert(!(stride % sizeof(uint32_t)));
 
   // Rows
-  a0 = load_tran_low_to_s16q(input);
-  a1 = load_tran_low_to_s16q(input + 8);
-  idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+  a[0] = load_tran_low_to_s16q(input);
+  a[1] = load_tran_low_to_s16q(input + 8);
+  transpose_idct4x4_16_bd8(a);
 
   // Columns
-  a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
-  idct4x4_16_kernel_bd8(cospis, &a0, &a1);
-  a0 = vrshrq_n_s16(a0, 4);
-  a1 = vrshrq_n_s16(a1, 4);
+  a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+  transpose_idct4x4_16_bd8(a);
+  a[0] = vrshrq_n_s16(a[0], 4);
+  a[1] = vrshrq_n_s16(a[1], 4);
 
-  dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
+  s = load_u8(dst, stride);
+  dst += 2 * stride;
+  // The elements are loaded in reverse order.
+  s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1);
   dst += stride;
-  dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
-  dst += stride;
-  dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
-  dst += stride;
-  dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
+  s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0);
 
-  d01_u16 =
-      vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
-  d32_u16 =
-      vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
-  d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
-  d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
+  sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s);
+  sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32));
+  d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
+  d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
 
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
+  store_u8(dest, stride, d[0]);
+  dest += 2 * stride;
+  // The elements are stored in reverse order.
+  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1);
   dest += stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
-  dest += stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
-  dest += stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
+  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
deleted file mode 100644
index 29f678a038..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vpx_idct8x8_1_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int stride)
-
-|vpx_idct8x8_1_add_neon| PROC
-    ldrsh            r0, [r0]
-
-    ; cospi_16_64 = 11585
-    movw             r12, #0x2d41
-
-    ; out = dct_const_round_shift(input[0] * cospi_16_64)
-    mul              r0, r0, r12               ; input[0] * cospi_16_64
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; out = dct_const_round_shift(out * cospi_16_64)
-    mul              r0, r0, r12               ; out * cospi_16_64
-    mov              r12, r1                   ; save dest
-    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
-    asr              r0, r0, #14               ; >> DCT_CONST_BITS
-
-    ; a1 = ROUND_POWER_OF_TWO(out, 5)
-    add              r0, r0, #16               ; + (1 <<((5) - 1))
-    asr              r0, r0, #5                ; >> 5
-
-    vdup.s16         q0, r0                    ; duplicate a1
-
-    ; load destination data
-    vld1.64          {d2}, [r1], r2
-    vld1.64          {d3}, [r1], r2
-    vld1.64          {d4}, [r1], r2
-    vld1.64          {d5}, [r1], r2
-    vld1.64          {d6}, [r1], r2
-    vld1.64          {d7}, [r1], r2
-    vld1.64          {d16}, [r1], r2
-    vld1.64          {d17}, [r1]
-
-    vaddw.u8         q9, q0, d2                ; dest[x] + a1
-    vaddw.u8         q10, q0, d3               ; dest[x] + a1
-    vaddw.u8         q11, q0, d4               ; dest[x] + a1
-    vaddw.u8         q12, q0, d5               ; dest[x] + a1
-    vqmovun.s16      d2, q9                    ; clip_pixel
-    vqmovun.s16      d3, q10                   ; clip_pixel
-    vqmovun.s16      d30, q11                  ; clip_pixel
-    vqmovun.s16      d31, q12                  ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
-    vaddw.u8         q10, q0, d7                ; dest[x] + a1
-    vaddw.u8         q11, q0, d16               ; dest[x] + a1
-    vaddw.u8         q12, q0, d17               ; dest[x] + a1
-    vqmovun.s16      d2, q9                     ; clip_pixel
-    vqmovun.s16      d3, q10                    ; clip_pixel
-    vqmovun.s16      d30, q11                   ; clip_pixel
-    vqmovun.s16      d31, q12                   ; clip_pixel
-    vst1.64          {d2}, [r12], r2
-    vst1.64          {d3}, [r12], r2
-    vst1.64          {d30}, [r12], r2
-    vst1.64          {d31}, [r12], r2
-
-    bx               lr
-    ENDP             ; |vpx_idct8x8_1_add_neon|
-
-    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
index 7bcce913bd..ce9b459589 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c
@@ -36,7 +36,8 @@ static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride,
 
 void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest,
                             int stride) {
-  const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  const int16_t out0 =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
   const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
 
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
deleted file mode 100644
index 2bfbcc5a52..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
+++ /dev/null
@@ -1,507 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-    EXPORT  |vpx_idct8x8_64_add_neon|
-    EXPORT  |vpx_idct8x8_12_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    INCLUDE vpx_dsp/arm/idct_neon.asm.S
-
-    ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
-    ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
-    ; This macro will touch q0-q7 registers and use them as buffer during
-    ; calculation.
-    MACRO
-    IDCT8x8_1D
-    ; stage 1
-    vdup.16         d0, r3                    ; duplicate cospi_28_64
-    vdup.16         d1, r4                    ; duplicate cospi_4_64
-    vdup.16         d2, r5                    ; duplicate cospi_12_64
-    vdup.16         d3, r6                    ; duplicate cospi_20_64
-
-    ; input[1] * cospi_28_64
-    vmull.s16       q2, d18, d0
-    vmull.s16       q3, d19, d0
-
-    ; input[5] * cospi_12_64
-    vmull.s16       q5, d26, d2
-    vmull.s16       q6, d27, d2
-
-    ; input[1]*cospi_28_64-input[7]*cospi_4_64
-    vmlsl.s16       q2, d30, d1
-    vmlsl.s16       q3, d31, d1
-
-    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
-    vmlsl.s16       q5, d22, d3
-    vmlsl.s16       q6, d23, d3
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d8, q2, #14               ; >> 14
-    vrshrn.s32      d9, q3, #14               ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d10, q5, #14              ; >> 14
-    vrshrn.s32      d11, q6, #14              ; >> 14
-
-    ; input[1] * cospi_4_64
-    vmull.s16       q2, d18, d1
-    vmull.s16       q3, d19, d1
-
-    ; input[5] * cospi_20_64
-    vmull.s16       q9, d26, d3
-    vmull.s16       q13, d27, d3
-
-    ; input[1]*cospi_4_64+input[7]*cospi_28_64
-    vmlal.s16       q2, d30, d0
-    vmlal.s16       q3, d31, d0
-
-    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
-    vmlal.s16       q9, d22, d2
-    vmlal.s16       q13, d23, d2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d14, q2, #14              ; >> 14
-    vrshrn.s32      d15, q3, #14              ; >> 14
-
-    ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7                    ; duplicate cospi_16_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d12, q9, #14              ; >> 14
-    vrshrn.s32      d13, q13, #14             ; >> 14
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q2, d16, d0
-    vmull.s16       q3, d17, d0
-
-    ; input[0] * cospi_16_64
-    vmull.s16       q13, d16, d0
-    vmull.s16       q15, d17, d0
-
-    ; (input[0] + input[2]) * cospi_16_64
-    vmlal.s16       q2,  d24, d0
-    vmlal.s16       q3, d25, d0
-
-    ; (input[0] - input[2]) * cospi_16_64
-    vmlsl.s16       q13, d24, d0
-    vmlsl.s16       q15, d25, d0
-
-    vdup.16         d0, r8                    ; duplicate cospi_24_64
-    vdup.16         d1, r9                    ; duplicate cospi_8_64
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d18, q2, #14              ; >> 14
-    vrshrn.s32      d19, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d22, q13, #14             ; >> 14
-    vrshrn.s32      d23, q15, #14             ; >> 14
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    ; input[1] * cospi_24_64
-    vmull.s16       q2, d20, d0
-    vmull.s16       q3, d21, d0
-
-    ; input[1] * cospi_8_64
-    vmull.s16       q8, d20, d1
-    vmull.s16       q12, d21, d1
-
-    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vmlsl.s16       q2, d28, d1
-    vmlsl.s16       q3, d29, d1
-
-    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-    vmlal.s16       q8, d28, d0
-    vmlal.s16       q12, d29, d0
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d26, q2, #14              ; >> 14
-    vrshrn.s32      d27, q3, #14              ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d30, q8, #14              ; >> 14
-    vrshrn.s32      d31, q12, #14             ; >> 14
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d10, q9, #14              ; >> 14
-    vrshrn.s32      d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d12, q11, #14             ; >> 14
-    vrshrn.s32      d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-    MEND
-
-    ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
-    MACRO
-    TRANSPOSE8X8
-    vswp            d17, d24
-    vswp            d23, d30
-    vswp            d21, d28
-    vswp            d19, d26
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.32         q12, q14
-    vtrn.32         q13, q15
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.16         q12, q13
-    vtrn.16         q14, q15
-    MEND
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int stride)
-
-|vpx_idct8x8_64_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
-    LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
-    LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
-    LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; cospi_28_64 = 3196
-    movw            r3, #0x0c7c
-
-    ; cospi_4_64  = 16069
-    movw            r4, #0x3ec5
-
-    ; cospi_12_64 = 13623
-    movw            r5, #0x3537
-
-    ; cospi_20_64 = 9102
-    movw            r6, #0x238e
-
-    ; cospi_16_64 = 11585
-    movw            r7, #0x2d41
-
-    ; cospi_24_64 = 6270
-    movw            r8, #0x187e
-
-    ; cospi_8_64 = 15137
-    movw            r9, #0x3b21
-
-    ; First transform rows
-    IDCT8x8_1D
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |vpx_idct8x8_64_add_neon|
-
-;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0  int16_t input
-; r1  uint8_t *dest
-; r2  int stride)
-
-|vpx_idct8x8_12_add_neon| PROC
-    push            {r4-r9}
-    vpush           {d8-d15}
-    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
-    LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
-    LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
-    LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
-
-    ; transpose the input data
-    TRANSPOSE8X8
-
-    ; cospi_28_64 = 3196
-    movw            r3, #0x0c7c
-
-    ; cospi_4_64  = 16069
-    movw            r4, #0x3ec5
-
-    ; cospi_12_64 = 13623
-    movw            r5, #0x3537
-
-    ; cospi_20_64 = 9102
-    movw            r6, #0x238e
-
-    ; cospi_16_64 = 11585
-    movw            r7, #0x2d41
-
-    ; cospi_24_64 = 6270
-    movw            r8, #0x187e
-
-    ; cospi_8_64 = 15137
-    movw            r9, #0x3b21
-
-    ; First transform rows
-    ; stage 1
-    ; The following instructions use vqrdmulh to do the
-    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
-    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
-    ; to double the constants before multiplying to compensate this.
-    mov             r12, r3, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
-    mov             r12, r4, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_28_64)
-    vqrdmulh.s16    q4, q9, q0
-
-    mov             r12, r6, lsl #1
-    rsb             r12, #0
-    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_4_64)
-    vqrdmulh.s16    q7, q9, q1
-
-    mov             r12, r5, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
-
-    ; dct_const_round_shift(- input[3] * cospi_20_64)
-    vqrdmulh.s16    q5, q11, q0
-
-    mov             r12, r7, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
-
-    ; dct_const_round_shift(input[3] * cospi_12_64)
-    vqrdmulh.s16    q6, q11, q1
-
-    ; stage 2 & stage 3 - even half
-    mov             r12, r8, lsl #1
-    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vqrdmulh.s16    q9, q8, q0
-
-    mov             r12, r9, lsl #1
-    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
-
-    ; dct_const_round_shift(input[1] * cospi_24_64)
-    vqrdmulh.s16    q13, q10, q1
-
-    ; dct_const_round_shift(input[1] * cospi_8_64)
-    vqrdmulh.s16    q15, q10, q0
-
-    ; stage 3 -odd half
-    vdup.16         d16, r7                   ; duplicate cospi_16_64
-
-    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
-    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
-    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
-    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
-
-    ; stage 2 - odd half
-    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
-    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
-    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
-    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q9, d28, d16
-    vmull.s16       q10, d29, d16
-
-    ; step2[6] * cospi_16_64
-    vmull.s16       q11, d28, d16
-    vmull.s16       q12, d29, d16
-
-    ; (step2[6] - step2[5]) * cospi_16_64
-    vmlsl.s16       q9, d26, d16
-    vmlsl.s16       q10, d27, d16
-
-    ; (step2[5] + step2[6]) * cospi_16_64
-    vmlal.s16       q11, d26, d16
-    vmlal.s16       q12, d27, d16
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d10, q9, #14              ; >> 14
-    vrshrn.s32      d11, q10, #14             ; >> 14
-
-    ; dct_const_round_shift(input_dc * cospi_16_64)
-    vrshrn.s32      d12, q11, #14             ; >> 14
-    vrshrn.s32      d13, q12, #14             ; >> 14
-
-    ; stage 4
-    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
-
-    ; Transpose the matrix
-    TRANSPOSE8X8
-
-    ; Then transform columns
-    IDCT8x8_1D
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
-    vrshr.s16       q8, q8, #5
-    vrshr.s16       q9, q9, #5
-    vrshr.s16       q10, q10, #5
-    vrshr.s16       q11, q11, #5
-    vrshr.s16       q12, q12, #5
-    vrshr.s16       q13, q13, #5
-    vrshr.s16       q14, q14, #5
-    vrshr.s16       q15, q15, #5
-
-    ; save dest pointer
-    mov             r0, r1
-
-    ; load destination data
-    vld1.64         {d0}, [r1], r2
-    vld1.64         {d1}, [r1], r2
-    vld1.64         {d2}, [r1], r2
-    vld1.64         {d3}, [r1], r2
-    vld1.64         {d4}, [r1], r2
-    vld1.64         {d5}, [r1], r2
-    vld1.64         {d6}, [r1], r2
-    vld1.64         {d7}, [r1]
-
-    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
-    vaddw.u8        q8, q8, d0
-    vaddw.u8        q9, q9, d1
-    vaddw.u8        q10, q10, d2
-    vaddw.u8        q11, q11, d3
-    vaddw.u8        q12, q12, d4
-    vaddw.u8        q13, q13, d5
-    vaddw.u8        q14, q14, d6
-    vaddw.u8        q15, q15, d7
-
-    ; clip_pixel
-    vqmovun.s16     d0, q8
-    vqmovun.s16     d1, q9
-    vqmovun.s16     d2, q10
-    vqmovun.s16     d3, q11
-    vqmovun.s16     d4, q12
-    vqmovun.s16     d5, q13
-    vqmovun.s16     d6, q14
-    vqmovun.s16     d7, q15
-
-    ; store the data
-    vst1.64         {d0}, [r0], r2
-    vst1.64         {d1}, [r0], r2
-    vst1.64         {d2}, [r0], r2
-    vst1.64         {d3}, [r0], r2
-    vst1.64         {d4}, [r0], r2
-    vst1.64         {d5}, [r0], r2
-    vst1.64         {d6}, [r0], r2
-    vst1.64         {d7}, [r0], r2
-
-    vpop            {d8-d15}
-    pop             {r4-r9}
-    bx              lr
-    ENDP  ; |vpx_idct8x8_12_add_neon|
-
-    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
index 279da67d74..7471387e47 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c
@@ -13,94 +13,29 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
-static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
-                          int16x8_t a3, int16x8_t a4, int16x8_t a5,
-                          int16x8_t a6, int16x8_t a7, uint8_t *dest,
-                          const int stride) {
-  const uint8_t *dst = dest;
-  uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
-  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
-
-  a0 = vrshrq_n_s16(a0, 5);
-  a1 = vrshrq_n_s16(a1, 5);
-  a2 = vrshrq_n_s16(a2, 5);
-  a3 = vrshrq_n_s16(a3, 5);
-  a4 = vrshrq_n_s16(a4, 5);
-  a5 = vrshrq_n_s16(a5, 5);
-  a6 = vrshrq_n_s16(a6, 5);
-  a7 = vrshrq_n_s16(a7, 5);
-
-  d0 = vld1_u8(dst);
-  dst += stride;
-  d1 = vld1_u8(dst);
-  dst += stride;
-  d2 = vld1_u8(dst);
-  dst += stride;
-  d3 = vld1_u8(dst);
-  dst += stride;
-  d4 = vld1_u8(dst);
-  dst += stride;
-  d5 = vld1_u8(dst);
-  dst += stride;
-  d6 = vld1_u8(dst);
-  dst += stride;
-  d7 = vld1_u8(dst);
-
-  d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0);
-  d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1);
-  d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2);
-  d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3);
-  d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4);
-  d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5);
-  d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6);
-  d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7);
-
-  d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16));
-  d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16));
-  d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16));
-  d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16));
-  d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16));
-  d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16));
-  d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16));
-  d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16));
-
-  vst1_u8(dest, d0);
-  dest += stride;
-  vst1_u8(dest, d1);
-  dest += stride;
-  vst1_u8(dest, d2);
-  dest += stride;
-  vst1_u8(dest, d3);
-  dest += stride;
-  vst1_u8(dest, d4);
-  dest += stride;
-  vst1_u8(dest, d5);
-  dest += stride;
-  vst1_u8(dest, d6);
-  dest += stride;
-  vst1_u8(dest, d7);
-}
-
 void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
   const int16x8_t cospis = vld1q_s16(kCospi);
   const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
   const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
-  int16x8_t a0 = load_tran_low_to_s16q(input);
-  int16x8_t a1 = load_tran_low_to_s16q(input + 8);
-  int16x8_t a2 = load_tran_low_to_s16q(input + 16);
-  int16x8_t a3 = load_tran_low_to_s16q(input + 24);
-  int16x8_t a4 = load_tran_low_to_s16q(input + 32);
-  int16x8_t a5 = load_tran_low_to_s16q(input + 40);
-  int16x8_t a6 = load_tran_low_to_s16q(input + 48);
-  int16x8_t a7 = load_tran_low_to_s16q(input + 56);
+  int16x8_t a[8];
 
-  idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-  idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
-  add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride);
+  a[0] = load_tran_low_to_s16q(input);
+  a[1] = load_tran_low_to_s16q(input + 8);
+  a[2] = load_tran_low_to_s16q(input + 16);
+  a[3] = load_tran_low_to_s16q(input + 24);
+  a[4] = load_tran_low_to_s16q(input + 32);
+  a[5] = load_tran_low_to_s16q(input + 40);
+  a[6] = load_tran_low_to_s16q(input + 48);
+  a[7] = load_tran_low_to_s16q(input + 56);
+
+  idct8x8_64_1d_bd8(cospis0, cospis1, a);
+  idct8x8_64_1d_bd8(cospis0, cospis1, a);
+  idct8x8_add8x8_neon(a, dest, stride);
 }
 
 void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
@@ -110,17 +45,15 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
   const int16x4_t cospis0 = vget_low_s16(cospis);     // cospi 0, 8, 16, 24
   const int16x4_t cospisd0 = vget_low_s16(cospisd);   // doubled 0, 8, 16, 24
   const int16x4_t cospisd1 = vget_high_s16(cospisd);  // doubled 4, 12, 20, 28
-  int16x4_t a0, a1, a2, a3, a4, a5, a6, a7;
-  int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+  int16x4_t a[8];
+  int16x8_t b[8];
 
-  a0 = load_tran_low_to_s16d(input);
-  a1 = load_tran_low_to_s16d(input + 8);
-  a2 = load_tran_low_to_s16d(input + 16);
-  a3 = load_tran_low_to_s16d(input + 24);
+  a[0] = load_tran_low_to_s16d(input);
+  a[1] = load_tran_low_to_s16d(input + 8);
+  a[2] = load_tran_low_to_s16d(input + 16);
+  a[3] = load_tran_low_to_s16d(input + 24);
 
-  idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4,
-                       &a5, &a6, &a7);
-  idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6,
-                       a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
-  add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride);
+  idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a);
+  idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b);
+  idct8x8_add8x8_neon(b, dest, stride);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
index aecc543dbb..c02311326b 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h
@@ -8,87 +8,113 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_ARM_IDCT_NEON_H_
-#define VPX_DSP_ARM_IDCT_NEON_H_
+#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_IDCT_NEON_H_
 
 #include <arm_neon.h>
 
 #include "./vpx_config.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-DECLARE_ALIGNED(16, static const int16_t, kCospi[8]) = {
-  16384 /*  cospi_0_64 */,  15137 /* cospi_8_64 */,
-  11585 /*  cospi_16_64 */, 6270 /* cospi_24_64 */,
-  16069 /*  cospi_4_64 */,  13623 /* cospi_12_64 */,
-  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
+static const int16_t kCospi[16] = {
+  16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
+  11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
+  16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
+  -9102 /* -cospi_20_64 */, 3196 /*  cospi_28_64 */,
+  16305 /*  cospi_2_64  */, 1606 /*  cospi_30_64 */,
+  14449 /*  cospi_10_64 */, 7723 /*  cospi_22_64 */,
+  15679 /*  cospi_6_64  */, -4756 /* -cospi_26_64 */,
+  12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
 };
 
-DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
-  16384 /*  cospi_0_64 */,  15137 /* cospi_8_64 */,
-  11585 /*  cospi_16_64 */, 6270 /* cospi_24_64 */,
-  16069 /*  cospi_4_64 */,  13623 /* cospi_12_64 */,
-  -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
+static const int32_t kCospi32[16] = {
+  16384 /*  cospi_0_64  */, 15137 /*  cospi_8_64  */,
+  11585 /*  cospi_16_64 */, 6270 /*  cospi_24_64 */,
+  16069 /*  cospi_4_64  */, 13623 /*  cospi_12_64 */,
+  -9102 /* -cospi_20_64 */, 3196 /*  cospi_28_64 */,
+  16305 /*  cospi_2_64  */, 1606 /*  cospi_30_64 */,
+  14449 /*  cospi_10_64 */, 7723 /*  cospi_22_64 */,
+  15679 /*  cospi_6_64  */, -4756 /* -cospi_26_64 */,
+  12665 /*  cospi_14_64 */, -10394 /* -cospi_18_64 */
 };
 
 //------------------------------------------------------------------------------
-// Helper functions used to load tran_low_t into int16, narrowing if necessary.
-
-static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
+static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4x2_t v0 = vld2q_s32(buf);
-  const int32x4x2_t v1 = vld2q_s32(buf + 8);
-  const int16x4_t s0 = vmovn_s32(v0.val[0]);
-  const int16x4_t s1 = vmovn_s32(v0.val[1]);
-  const int16x4_t s2 = vmovn_s32(v1.val[0]);
-  const int16x4_t s3 = vmovn_s32(v1.val[1]);
-  int16x8x2_t res;
-  res.val[0] = vcombine_s16(s0, s2);
-  res.val[1] = vcombine_s16(s1, s3);
-  return res;
+  return vqaddq_s16(a, b);
 #else
-  return vld2q_s16(buf);
+  return vaddq_s16(a, b);
 #endif
 }
 
-static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vld1q_s32(buf);
-  const int32x4_t v1 = vld1q_s32(buf + 4);
-  const int16x4_t s0 = vmovn_s32(v0);
-  const int16x4_t s1 = vmovn_s32(v1);
-  return vcombine_s16(s0, s1);
+  return vqsubq_s16(a, b);
 #else
-  return vld1q_s16(buf);
-#endif
-}
-
-static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vld1q_s32(buf);
-  return vmovn_s32(v0);
-#else
-  return vld1_s16(buf);
+  return vsubq_s16(a, b);
 #endif
 }
 
 //------------------------------------------------------------------------------
 
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
+                                               const int32x4x2_t s1) {
+  int32x4x2_t t;
+  t.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
+  t.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
+  return t;
+}
+
+static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
+                                               const int32x4x2_t s1) {
+  int32x4x2_t t;
+  t.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
+  t.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
+  return t;
+}
+
+//------------------------------------------------------------------------------
+
+static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) {
+  return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS),
+                      vrshrn_n_s32(in[1], DCT_CONST_BITS));
+}
+
+static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32,
+                                                    int16x8_t *const d0,
+                                                    int16x8_t *const d1) {
+  *d0 = dct_const_round_shift_low_8(t32 + 0);
+  *d1 = dct_const_round_shift_low_8(t32 + 2);
+}
+
+static INLINE int32x4x2_t
+dct_const_round_shift_high_4x2(const int64x2_t *const in) {
+  int32x4x2_t out;
+  out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS),
+                            vrshrn_n_s64(in[1], DCT_CONST_BITS));
+  out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS),
+                            vrshrn_n_s64(in[3], DCT_CONST_BITS));
+  return out;
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                       const int16_t a_const) {
-  // Shift by 14 + rounding will be within 16 bits for well formed streams.
-  // See WRAPLOW and dct_const_round_shift for details.
+  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+  // streams. See WRAPLOW and dct_const_round_shift for details.
   // This instruction doubles the result and returns the high half, essentially
   // resulting in a right shift by 15. By multiplying the constant first that
-  // becomes a right shift by 14.
+  // becomes a right shift by DCT_CONST_BITS.
   // The largest possible value used here is
   // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just*
   // within the range of int16_t (+32767 / -32768) even when negated.
   return vqrdmulhq_n_s16(a, a_const * 2);
 }
 
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   // In both add_ and it's pair, sub_, the input for well-formed streams will be
@@ -98,94 +124,162 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
   // input) this function can not use vaddq_s16.
   // In order to match existing behavior and intentionally out of range tests,
   // expand the addition up to 32 bits to prevent truncation.
-  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
-  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
-  temp_low = vmulq_n_s32(temp_low, ab_const);
-  temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  int32x4_t t[2];
+  t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+  t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+  t[0] = vmulq_n_s32(t[0], ab_const);
+  t[1] = vmulq_n_s32(t[1], ab_const);
+  return dct_const_round_shift_low_8(t);
 }
 
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
 static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
-  int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
-  int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
-  temp_low = vmulq_n_s32(temp_low, ab_const);
-  temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  int32x4_t t[2];
+  t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+  t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+  t[0] = vmulq_n_s32(t[0], ab_const);
+  t[1] = vmulq_n_s32(t[1], ab_const);
+  return dct_const_round_shift_low_8(t);
 }
 
 // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
 static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
     const int16x8_t a, const int16_t a_const, const int16x8_t b,
     const int16_t b_const) {
-  int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
-  int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
-  temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
-  temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  int32x4_t t[2];
+  t[0] = vmull_n_s16(vget_low_s16(a), a_const);
+  t[1] = vmull_n_s16(vget_high_s16(a), a_const);
+  t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const);
+  t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const);
+  return dct_const_round_shift_low_8(t);
+}
+
+//------------------------------------------------------------------------------
+
+// Note: The following 4 functions could use 32-bit operations for bit-depth 10.
+//       However, although it's 20% faster with gcc, it's 20% slower with clang.
+//       Use 64-bit operations for now.
+
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
+static INLINE int32x4x2_t
+multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) {
+  int64x2_t b[4];
+
+  b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
+  b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
+  b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
+  b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
+  return dct_const_round_shift_high_4x2(b);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
+static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual(
+    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
+  int32x4_t t[2];
+  int64x2_t c[4];
+
+  t[0] = vaddq_s32(a.val[0], b.val[0]);
+  t[1] = vaddq_s32(a.val[1], b.val[1]);
+  c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const);
+  c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const);
+  c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const);
+  c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const);
+  return dct_const_round_shift_high_4x2(c);
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual(
+    const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) {
+  int32x4_t t[2];
+  int64x2_t c[4];
+
+  t[0] = vsubq_s32(a.val[0], b.val[0]);
+  t[1] = vsubq_s32(a.val[1], b.val[1]);
+  c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const);
+  c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const);
+  c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const);
+  c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const);
+  return dct_const_round_shift_high_4x2(c);
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// DCT_CONST_BITS.
+static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual(
+    const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b,
+    const int32_t b_const) {
+  int64x2_t c[4];
+  c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const);
+  c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const);
+  c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const);
+  c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const);
+  c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const);
+  c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const);
+  c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const);
+  c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const);
+  return dct_const_round_shift_high_4x2(c);
 }
 
 // Shift the output down by 6 and add it to the destination buffer.
-static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
-                                        const int16x8_t a2, const int16x8_t a3,
-                                        const int16x8_t a4, const int16x8_t a5,
-                                        const int16x8_t a6, const int16x8_t a7,
-                                        uint8_t *b, const int b_stride) {
-  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
-  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
-  b0 = vld1_u8(b);
-  b += b_stride;
-  b1 = vld1_u8(b);
-  b += b_stride;
-  b2 = vld1_u8(b);
-  b += b_stride;
-  b3 = vld1_u8(b);
-  b += b_stride;
-  b4 = vld1_u8(b);
-  b += b_stride;
-  b5 = vld1_u8(b);
-  b += b_stride;
-  b6 = vld1_u8(b);
-  b += b_stride;
-  b7 = vld1_u8(b);
-  b -= (7 * b_stride);
+static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d,
+                                        const int stride) {
+  uint8x8_t b[8];
+  int16x8_t c[8];
+
+  b[0] = vld1_u8(d);
+  d += stride;
+  b[1] = vld1_u8(d);
+  d += stride;
+  b[2] = vld1_u8(d);
+  d += stride;
+  b[3] = vld1_u8(d);
+  d += stride;
+  b[4] = vld1_u8(d);
+  d += stride;
+  b[5] = vld1_u8(d);
+  d += stride;
+  b[6] = vld1_u8(d);
+  d += stride;
+  b[7] = vld1_u8(d);
+  d -= (7 * stride);
 
   // c = b + (a >> 6)
-  c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
-  c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
-  c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
-  c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
-  c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
-  c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
-  c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
-  c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);
+  c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6);
+  c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6);
+  c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6);
+  c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6);
+  c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6);
+  c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6);
+  c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6);
+  c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6);
 
-  b0 = vqmovun_s16(c0);
-  b1 = vqmovun_s16(c1);
-  b2 = vqmovun_s16(c2);
-  b3 = vqmovun_s16(c3);
-  b4 = vqmovun_s16(c4);
-  b5 = vqmovun_s16(c5);
-  b6 = vqmovun_s16(c6);
-  b7 = vqmovun_s16(c7);
+  b[0] = vqmovun_s16(c[0]);
+  b[1] = vqmovun_s16(c[1]);
+  b[2] = vqmovun_s16(c[2]);
+  b[3] = vqmovun_s16(c[3]);
+  b[4] = vqmovun_s16(c[4]);
+  b[5] = vqmovun_s16(c[5]);
+  b[6] = vqmovun_s16(c[6]);
+  b[7] = vqmovun_s16(c[7]);
 
-  vst1_u8(b, b0);
-  b += b_stride;
-  vst1_u8(b, b1);
-  b += b_stride;
-  vst1_u8(b, b2);
-  b += b_stride;
-  vst1_u8(b, b3);
-  b += b_stride;
-  vst1_u8(b, b4);
-  b += b_stride;
-  vst1_u8(b, b5);
-  b += b_stride;
-  vst1_u8(b, b6);
-  b += b_stride;
-  vst1_u8(b, b7);
+  vst1_u8(d, b[0]);
+  d += stride;
+  vst1_u8(d, b[1]);
+  d += stride;
+  vst1_u8(d, b[2]);
+  d += stride;
+  vst1_u8(d, b[3]);
+  d += stride;
+  vst1_u8(d, b[4]);
+  d += stride;
+  vst1_u8(d, b[5]);
+  d += stride;
+  vst1_u8(d, b[6]);
+  d += stride;
+  vst1_u8(d, b[7]);
 }
 
 static INLINE uint8x16_t create_dcq(const int16_t dc) {
@@ -194,56 +288,53 @@ static INLINE uint8x16_t create_dcq(const int16_t dc) {
   return vdupq_n_u8((uint8_t)t);
 }
 
-static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
-                                         int16x8_t *const a0,
-                                         int16x8_t *const a1) {
-  int16x4_t b0, b1, b2, b3;
-  int32x4_t c0, c1, c2, c3;
-  int16x8_t d0, d1;
+static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) {
+  const int16x4_t cospis = vld1_s16(kCospi);
+  int16x4_t b[4];
+  int32x4_t c[4];
+  int16x8_t d[2];
 
-  transpose_s16_4x4q(a0, a1);
-  b0 = vget_low_s16(*a0);
-  b1 = vget_high_s16(*a0);
-  b2 = vget_low_s16(*a1);
-  b3 = vget_high_s16(*a1);
-  c0 = vmull_lane_s16(b0, cospis, 2);
-  c2 = vmull_lane_s16(b1, cospis, 2);
-  c1 = vsubq_s32(c0, c2);
-  c0 = vaddq_s32(c0, c2);
-  c2 = vmull_lane_s16(b2, cospis, 3);
-  c3 = vmull_lane_s16(b2, cospis, 1);
-  c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
-  c3 = vmlal_lane_s16(c3, b3, cospis, 3);
-  b0 = vrshrn_n_s32(c0, 14);
-  b1 = vrshrn_n_s32(c1, 14);
-  b2 = vrshrn_n_s32(c2, 14);
-  b3 = vrshrn_n_s32(c3, 14);
-  d0 = vcombine_s16(b0, b1);
-  d1 = vcombine_s16(b3, b2);
-  *a0 = vaddq_s16(d0, d1);
-  *a1 = vsubq_s16(d0, d1);
+  b[0] = vget_low_s16(a[0]);
+  b[1] = vget_high_s16(a[0]);
+  b[2] = vget_low_s16(a[1]);
+  b[3] = vget_high_s16(a[1]);
+  c[0] = vmull_lane_s16(b[0], cospis, 2);
+  c[2] = vmull_lane_s16(b[1], cospis, 2);
+  c[1] = vsubq_s32(c[0], c[2]);
+  c[0] = vaddq_s32(c[0], c[2]);
+  c[3] = vmull_lane_s16(b[2], cospis, 3);
+  c[2] = vmull_lane_s16(b[2], cospis, 1);
+  c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1);
+  c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3);
+  dct_const_round_shift_low_8_dual(c, &d[0], &d[1]);
+  a[0] = vaddq_s16(d[0], d[1]);
+  a[1] = vsubq_s16(d[0], d[1]);
 }
 
-static INLINE void idct8x8_12_pass1_bd8(
-    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
-    int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2,
-    int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5,
-    int16x4_t *const io6, int16x4_t *const io7) {
+static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) {
+  transpose_s16_4x4q(&a[0], &a[1]);
+  idct4x4_16_kernel_bd8(a);
+}
+
+static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0,
+                                        const int16x4_t cospisd0,
+                                        const int16x4_t cospisd1,
+                                        int16x4_t *const io) {
   int16x4_t step1[8], step2[8];
   int32x4_t t32[2];
 
-  transpose_s16_4x4d(io0, io1, io2, io3);
+  transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]);
 
   // stage 1
-  step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3);
-  step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2);
-  step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1);
-  step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0);
+  step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3);
+  step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2);
+  step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1);
+  step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0);
 
   // stage 2
-  step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2);
-  step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3);
-  step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1);
+  step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2);
+  step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3);
+  step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1);
 
   step2[4] = vadd_s16(step1[4], step1[5]);
   step2[5] = vsub_s16(step1[4], step1[5]);
@@ -259,36 +350,31 @@ static INLINE void idct8x8_12_pass1_bd8(
   t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
   t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
   t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
-  step1[5] = vrshrn_n_s32(t32[0], 14);
-  step1[6] = vrshrn_n_s32(t32[1], 14);
+  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 
   // stage 4
-  *io0 = vadd_s16(step1[0], step2[7]);
-  *io1 = vadd_s16(step1[1], step1[6]);
-  *io2 = vadd_s16(step1[2], step1[5]);
-  *io3 = vadd_s16(step1[3], step2[4]);
-  *io4 = vsub_s16(step1[3], step2[4]);
-  *io5 = vsub_s16(step1[2], step1[5]);
-  *io6 = vsub_s16(step1[1], step1[6]);
-  *io7 = vsub_s16(step1[0], step2[7]);
+  io[0] = vadd_s16(step1[0], step2[7]);
+  io[1] = vadd_s16(step1[1], step1[6]);
+  io[2] = vadd_s16(step1[2], step1[5]);
+  io[3] = vadd_s16(step1[3], step2[4]);
+  io[4] = vsub_s16(step1[3], step2[4]);
+  io[5] = vsub_s16(step1[2], step1[5]);
+  io[6] = vsub_s16(step1[1], step1[6]);
+  io[7] = vsub_s16(step1[0], step2[7]);
 }
 
-static INLINE void idct8x8_12_pass2_bd8(
-    const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1,
-    const int16x4_t input0, const int16x4_t input1, const int16x4_t input2,
-    const int16x4_t input3, const int16x4_t input4, const int16x4_t input5,
-    const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0,
-    int16x8_t *const output1, int16x8_t *const output2,
-    int16x8_t *const output3, int16x8_t *const output4,
-    int16x8_t *const output5, int16x8_t *const output6,
-    int16x8_t *const output7) {
+static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0,
+                                        const int16x4_t cospisd0,
+                                        const int16x4_t cospisd1,
+                                        const int16x4_t *const input,
+                                        int16x8_t *const output) {
   int16x8_t in[4];
   int16x8_t step1[8], step2[8];
   int32x4_t t32[8];
-  int16x4_t t16[8];
 
-  transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6,
-                    input7, &in[0], &in[1], &in[2], &in[3]);
+  transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5],
+                    input[6], input[7], &in[0], &in[1], &in[2], &in[3]);
 
   // stage 1
   step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3);
@@ -318,86 +404,64 @@ static INLINE void idct8x8_12_pass2_bd8(
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  step1[5] = vcombine_s16(t16[0], t16[1]);
-  step1[6] = vcombine_s16(t16[2], t16[3]);
+  dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]);
 
   // stage 4
-  *output0 = vaddq_s16(step1[0], step2[7]);
-  *output1 = vaddq_s16(step1[1], step1[6]);
-  *output2 = vaddq_s16(step1[2], step1[5]);
-  *output3 = vaddq_s16(step1[3], step2[4]);
-  *output4 = vsubq_s16(step1[3], step2[4]);
-  *output5 = vsubq_s16(step1[2], step1[5]);
-  *output6 = vsubq_s16(step1[1], step1[6]);
-  *output7 = vsubq_s16(step1[0], step2[7]);
+  output[0] = vaddq_s16(step1[0], step2[7]);
+  output[1] = vaddq_s16(step1[1], step1[6]);
+  output[2] = vaddq_s16(step1[2], step1[5]);
+  output[3] = vaddq_s16(step1[3], step2[4]);
+  output[4] = vsubq_s16(step1[3], step2[4]);
+  output[5] = vsubq_s16(step1[2], step1[5]);
+  output[6] = vsubq_s16(step1[1], step1[6]);
+  output[7] = vsubq_s16(step1[0], step2[7]);
 }
 
-static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
-                                     const int16x4_t cospis1,
-                                     int16x8_t *const io0, int16x8_t *const io1,
-                                     int16x8_t *const io2, int16x8_t *const io3,
-                                     int16x8_t *const io4, int16x8_t *const io5,
-                                     int16x8_t *const io6,
-                                     int16x8_t *const io7) {
-  int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
-      input_7l, input_7h;
+static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0,
+                                            const int16x4_t cospis1,
+                                            int16x8_t *const io) {
+  int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
+      input7h;
   int16x4_t step1l[4], step1h[4];
   int16x8_t step1[8], step2[8];
   int32x4_t t32[8];
-  int16x4_t t16[8];
-
-  transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7);
 
   // stage 1
-  input_1l = vget_low_s16(*io1);
-  input_1h = vget_high_s16(*io1);
-  input_3l = vget_low_s16(*io3);
-  input_3h = vget_high_s16(*io3);
-  input_5l = vget_low_s16(*io5);
-  input_5h = vget_high_s16(*io5);
-  input_7l = vget_low_s16(*io7);
-  input_7h = vget_high_s16(*io7);
-  step1l[0] = vget_low_s16(*io0);
-  step1h[0] = vget_high_s16(*io0);
-  step1l[1] = vget_low_s16(*io2);
-  step1h[1] = vget_high_s16(*io2);
-  step1l[2] = vget_low_s16(*io4);
-  step1h[2] = vget_high_s16(*io4);
-  step1l[3] = vget_low_s16(*io6);
-  step1h[3] = vget_high_s16(*io6);
+  input1l = vget_low_s16(io[1]);
+  input1h = vget_high_s16(io[1]);
+  input3l = vget_low_s16(io[3]);
+  input3h = vget_high_s16(io[3]);
+  input5l = vget_low_s16(io[5]);
+  input5h = vget_high_s16(io[5]);
+  input7l = vget_low_s16(io[7]);
+  input7h = vget_high_s16(io[7]);
+  step1l[0] = vget_low_s16(io[0]);
+  step1h[0] = vget_high_s16(io[0]);
+  step1l[1] = vget_low_s16(io[2]);
+  step1h[1] = vget_high_s16(io[2]);
+  step1l[2] = vget_low_s16(io[4]);
+  step1h[2] = vget_high_s16(io[4]);
+  step1l[3] = vget_low_s16(io[6]);
+  step1h[3] = vget_high_s16(io[6]);
 
-  t32[0] = vmull_lane_s16(input_1l, cospis1, 3);
-  t32[1] = vmull_lane_s16(input_1h, cospis1, 3);
-  t32[2] = vmull_lane_s16(input_3l, cospis1, 2);
-  t32[3] = vmull_lane_s16(input_3h, cospis1, 2);
-  t32[4] = vmull_lane_s16(input_3l, cospis1, 1);
-  t32[5] = vmull_lane_s16(input_3h, cospis1, 1);
-  t32[6] = vmull_lane_s16(input_1l, cospis1, 0);
-  t32[7] = vmull_lane_s16(input_1h, cospis1, 0);
-  t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0);
-  t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0);
-  t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1);
-  t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1);
-  t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2);
-  t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
-  t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
-  t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
-  step1[4] = vcombine_s16(t16[0], t16[1]);
-  step1[5] = vcombine_s16(t16[2], t16[3]);
-  step1[6] = vcombine_s16(t16[4], t16[5]);
-  step1[7] = vcombine_s16(t16[6], t16[7]);
+  t32[0] = vmull_lane_s16(input1l, cospis1, 3);
+  t32[1] = vmull_lane_s16(input1h, cospis1, 3);
+  t32[2] = vmull_lane_s16(input3l, cospis1, 2);
+  t32[3] = vmull_lane_s16(input3h, cospis1, 2);
+  t32[4] = vmull_lane_s16(input3l, cospis1, 1);
+  t32[5] = vmull_lane_s16(input3h, cospis1, 1);
+  t32[6] = vmull_lane_s16(input1l, cospis1, 0);
+  t32[7] = vmull_lane_s16(input1h, cospis1, 0);
+  t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0);
+  t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0);
+  t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1);
+  t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1);
+  t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2);
+  t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2);
+  t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3);
+  t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3);
+  dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]);
+  dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]);
 
   // stage 2
   t32[2] = vmull_lane_s16(step1l[0], cospis0, 2);
@@ -414,18 +478,8 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
   t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
   t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
   t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
-  step2[0] = vcombine_s16(t16[0], t16[1]);
-  step2[1] = vcombine_s16(t16[2], t16[3]);
-  step2[2] = vcombine_s16(t16[4], t16[5]);
-  step2[3] = vcombine_s16(t16[6], t16[7]);
+  dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]);
+  dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]);
 
   step2[4] = vaddq_s16(step1[4], step1[5]);
   step2[5] = vsubq_s16(step1[4], step1[5]);
@@ -444,22 +498,422 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  step1[5] = vcombine_s16(t16[0], t16[1]);
-  step1[6] = vcombine_s16(t16[2], t16[3]);
+  dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]);
 
   // stage 4
-  *io0 = vaddq_s16(step1[0], step2[7]);
-  *io1 = vaddq_s16(step1[1], step1[6]);
-  *io2 = vaddq_s16(step1[2], step1[5]);
-  *io3 = vaddq_s16(step1[3], step2[4]);
-  *io4 = vsubq_s16(step1[3], step2[4]);
-  *io5 = vsubq_s16(step1[2], step1[5]);
-  *io6 = vsubq_s16(step1[1], step1[6]);
-  *io7 = vsubq_s16(step1[0], step2[7]);
+  io[0] = vaddq_s16(step1[0], step2[7]);
+  io[1] = vaddq_s16(step1[1], step1[6]);
+  io[2] = vaddq_s16(step1[2], step1[5]);
+  io[3] = vaddq_s16(step1[3], step2[4]);
+  io[4] = vsubq_s16(step1[3], step2[4]);
+  io[5] = vsubq_s16(step1[2], step1[5]);
+  io[6] = vsubq_s16(step1[1], step1[6]);
+  io[7] = vsubq_s16(step1[0], step2[7]);
 }
 
-#endif  // VPX_DSP_ARM_IDCT_NEON_H_
+static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
+                                     const int16x4_t cospis1,
+                                     int16x8_t *const io) {
+  transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6],
+                    &io[7]);
+  idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io);
+}
+
+static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
+                                            const int16x8_t s1,
+                                            const int16x4_t cospi_0_8_16_24,
+                                            int32x4_t *const t32) {
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1);
+  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1);
+  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1);
+  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1);
+}
+
+static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1,
+                                     const int16x4_t cospi_0_8_16_24,
+                                     int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x4_t cospi_0_8_16_24,
+                                         int16x8_t *const d0,
+                                         int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32);
+  t32[2] = vnegq_s32(t32[2]);
+  t32[3] = vnegq_s32(t32[3]);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
+                                      const int16x4_t cospi_0_8_16_24,
+                                      int16x8_t *const d0,
+                                      int16x8_t *const d1) {
+  int32x4_t t32[6];
+
+  t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2);
+  t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2);
+  t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+  t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+  t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2);
+  t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
+                                   const int16x4_t cospi_2_30_10_22,
+                                   int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
+  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
+  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
+  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
+  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
+                                   const int16x4_t cospi_4_12_20N_28,
+                                   int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
+  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
+  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
+  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
+                                   const int16x4_t cospi_6_26N_14_18N,
+                                   int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0);
+  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1);
+  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1);
+  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1);
+  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x4_t cospi_2_30_10_22,
+                                    int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
+  t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
+  t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
+  t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
+  t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x4_t cospi_4_12_20N_28,
+                                    int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
+  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
+  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
+  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
+  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x4_t cospi_6_26N_14_18N,
+                                    int16x8_t *const d0, int16x8_t *const d1) {
+  int32x4_t t32[4];
+
+  t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2);
+  t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2);
+  t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2);
+  t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2);
+  t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3);
+  t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3);
+  t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3);
+  t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3);
+  dct_const_round_shift_low_8_dual(t32, d0, d1);
+}
+
+static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
+                                        int16x8_t *const out) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Use saturating add/sub to avoid overflow in 2nd pass
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+#else
+  out[0] = vaddq_s16(step2[0], step2[15]);
+  out[1] = vaddq_s16(step2[1], step2[14]);
+  out[2] = vaddq_s16(step2[2], step2[13]);
+  out[3] = vaddq_s16(step2[3], step2[12]);
+  out[4] = vaddq_s16(step2[4], step2[11]);
+  out[5] = vaddq_s16(step2[5], step2[10]);
+  out[6] = vaddq_s16(step2[6], step2[9]);
+  out[7] = vaddq_s16(step2[7], step2[8]);
+  out[8] = vsubq_s16(step2[7], step2[8]);
+  out[9] = vsubq_s16(step2[6], step2[9]);
+  out[10] = vsubq_s16(step2[5], step2[10]);
+  out[11] = vsubq_s16(step2[4], step2[11]);
+  out[12] = vsubq_s16(step2[3], step2[12]);
+  out[13] = vsubq_s16(step2[2], step2[13]);
+  out[14] = vsubq_s16(step2[1], step2[14]);
+  out[15] = vsubq_s16(step2[0], step2[15]);
+#endif
+}
+
+static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
+                                         int16_t *output) {
+  // Save the result into output
+  vst1q_s16(output, out[0]);
+  output += 16;
+  vst1q_s16(output, out[1]);
+  output += 16;
+  vst1q_s16(output, out[2]);
+  output += 16;
+  vst1q_s16(output, out[3]);
+  output += 16;
+  vst1q_s16(output, out[4]);
+  output += 16;
+  vst1q_s16(output, out[5]);
+  output += 16;
+  vst1q_s16(output, out[6]);
+  output += 16;
+  vst1q_s16(output, out[7]);
+  output += 16;
+  vst1q_s16(output, out[8]);
+  output += 16;
+  vst1q_s16(output, out[9]);
+  output += 16;
+  vst1q_s16(output, out[10]);
+  output += 16;
+  vst1q_s16(output, out[11]);
+  output += 16;
+  vst1q_s16(output, out[12]);
+  output += 16;
+  vst1q_s16(output, out[13]);
+  output += 16;
+  vst1q_s16(output, out[14]);
+  output += 16;
+  vst1q_s16(output, out[15]);
+}
+
+static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest,
+                                  const int stride) {
+  const uint8x8_t s = vld1_u8(*dest);
+  const int16x8_t res = vrshrq_n_s16(a, 5);
+  const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s);
+  const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q));
+  vst1_u8(*dest, d);
+  *dest += stride;
+}
+
+static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest,
+                                       const int stride) {
+  idct8x8_add8x1(out[0], &dest, stride);
+  idct8x8_add8x1(out[1], &dest, stride);
+  idct8x8_add8x1(out[2], &dest, stride);
+  idct8x8_add8x1(out[3], &dest, stride);
+  idct8x8_add8x1(out[4], &dest, stride);
+  idct8x8_add8x1(out[5], &dest, stride);
+  idct8x8_add8x1(out[6], &dest, stride);
+  idct8x8_add8x1(out[7], &dest, stride);
+}
+
+static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest,
+                                    const int stride) {
+  const uint8x8_t s = vld1_u8(*dest);
+  const int16x8_t res = vrshrq_n_s16(a, 6);
+  const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s);
+  const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q));
+  vst1_u8(*dest, d);
+  *dest += stride;
+}
+
+static INLINE void idct16x16_add_store(const int16x8_t *const out,
+                                       uint8_t *dest, const int stride) {
+  // Add the result to dest
+  idct16x16_add8x1(out[0], &dest, stride);
+  idct16x16_add8x1(out[1], &dest, stride);
+  idct16x16_add8x1(out[2], &dest, stride);
+  idct16x16_add8x1(out[3], &dest, stride);
+  idct16x16_add8x1(out[4], &dest, stride);
+  idct16x16_add8x1(out[5], &dest, stride);
+  idct16x16_add8x1(out[6], &dest, stride);
+  idct16x16_add8x1(out[7], &dest, stride);
+  idct16x16_add8x1(out[8], &dest, stride);
+  idct16x16_add8x1(out[9], &dest, stride);
+  idct16x16_add8x1(out[10], &dest, stride);
+  idct16x16_add8x1(out[11], &dest, stride);
+  idct16x16_add8x1(out[12], &dest, stride);
+  idct16x16_add8x1(out[13], &dest, stride);
+  idct16x16_add8x1(out[14], &dest, stride);
+  idct16x16_add8x1(out[15], &dest, stride);
+}
+
+static INLINE void highbd_idct16x16_add8x1(const int16x8_t a,
+                                           const int16x8_t max,
+                                           uint16_t **const dest,
+                                           const int stride) {
+  const uint16x8_t s = vld1q_u16(*dest);
+  const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s));
+  const int16x8_t res1 = vminq_s16(res0, max);
+  const uint16x8_t d = vqshluq_n_s16(res1, 0);
+  vst1q_u16(*dest, d);
+  *dest += stride;
+}
+
+static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
+                                           const int stride) {
+  // Add the result to dest
+  const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
+  out[0] = vrshrq_n_s16(out[0], 6);
+  out[1] = vrshrq_n_s16(out[1], 6);
+  out[2] = vrshrq_n_s16(out[2], 6);
+  out[3] = vrshrq_n_s16(out[3], 6);
+  out[4] = vrshrq_n_s16(out[4], 6);
+  out[5] = vrshrq_n_s16(out[5], 6);
+  out[6] = vrshrq_n_s16(out[6], 6);
+  out[7] = vrshrq_n_s16(out[7], 6);
+  out[8] = vrshrq_n_s16(out[8], 6);
+  out[9] = vrshrq_n_s16(out[9], 6);
+  out[10] = vrshrq_n_s16(out[10], 6);
+  out[11] = vrshrq_n_s16(out[11], 6);
+  out[12] = vrshrq_n_s16(out[12], 6);
+  out[13] = vrshrq_n_s16(out[13], 6);
+  out[14] = vrshrq_n_s16(out[14], 6);
+  out[15] = vrshrq_n_s16(out[15], 6);
+  highbd_idct16x16_add8x1(out[0], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[1], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[2], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[3], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[4], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[5], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[6], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[7], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[8], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[9], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[10], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[11], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[12], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[13], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[14], max, &dest, stride);
+  highbd_idct16x16_add8x1(out[15], max, &dest, stride);
+}
+
+static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a,
+                                               uint16_t **const dest,
+                                               const int stride) {
+  const uint16x8_t s = vld1q_u16(*dest);
+  const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6);
+  const uint16x8_t d = vmovl_u8(vqmovun_s16(res));
+  vst1q_u16(*dest, d);
+  *dest += stride;
+}
+
+static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
+                                            uint16_t *out, const int stride) {
+  highbd_idct16x16_add8x1_bd8(a[0], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[1], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[2], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[3], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[4], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[5], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[6], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[7], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[8], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[9], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[10], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[11], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[12], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[13], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[14], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[15], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[16], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[17], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[18], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[19], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[20], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[21], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[22], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[23], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[24], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[25], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[26], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[27], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[28], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[29], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[30], &out, stride);
+  highbd_idct16x16_add8x1_bd8(a[31], &out, stride);
+}
+
+void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
+                                  void *const dest, const int stride,
+                                  const int highbd_flag);
+
+void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+                                 void *const dest, const int stride,
+                                 const int highbd_flag);
+
+void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+                                       int16_t *output);
+
+void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
+                                       int16_t *const output, void *const dest,
+                                       const int stride, const int highbd_flag);
+
+void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest,
+                        const int stride, const int highbd_flag);
+
+void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output);
+void vpx_idct32_16_neon(const int16_t *const input, void *const output,
+                        const int stride, const int highbd_flag);
+
+void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output);
+void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
+                       const int highbd_flag);
+
+#endif  // VPX_VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
index fb1fa6b681..4f909e4935 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c
@@ -12,51 +12,47 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
 // DC 4x4
 
-static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) {
-  const uint8x8_t ref_u8 = vld1_u8(ref);
-  const uint16x4_t p0 = vpaddl_u8(ref_u8);
-  return vpadd_u16(p0, p0);
+static INLINE uint16_t dc_sum_4(const uint8_t *ref) {
+  return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref));
 }
 
 static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
                                 const uint8x8_t dc) {
-  const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0);
   }
 }
 
 void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t a = vld1_u8(above);
-  const uint8x8_t l = vld1_u8(left);
-  const uint16x8_t al = vaddl_u8(a, l);
-  uint16x4_t sum;
-  uint8x8_t dc;
-  sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al));
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+  const uint8x8_t a = load_unaligned_u8_4x1(above);
+  const uint8x8_t l = load_unaligned_u8_4x1(left);
+  const uint16x4_t al = vget_low_u16(vaddl_u8(a, l));
+  const uint16_t sum = horizontal_add_uint16x4(al);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_4(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
+  const uint16_t sum = dc_sum_4(left);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
   (void)above;
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_4(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2));
+  const uint16_t sum = dc_sum_4(above);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
   (void)left;
   dc_store_4x4(dst, stride, dc);
 }
@@ -72,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 8x8
 
-static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) {
-  const uint8x8_t ref_u8 = vld1_u8(ref);
-  uint16x4_t sum = vpaddl_u8(ref_u8);
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_8(const uint8_t *ref) {
+  return horizontal_add_uint8x8(vld1_u8(ref));
 }
 
 static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
                                 const uint8x8_t dc) {
-  const uint8x8_t dc_dup = vdup_lane_u8(dc, 0);
   int i;
   for (i = 0; i < 8; ++i, dst += stride) {
-    vst1_u8(dst, dc_dup);
+    vst1_u8(dst, dc);
   }
 }
 
@@ -92,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   const uint8x8_t above_u8 = vld1_u8(above);
   const uint8x8_t left_u8 = vld1_u8(left);
-  const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8);
-  const uint16x8_t p0 = vpaddlq_u8(above_and_left);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  uint8x8_t dc;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+  const uint16x8_t al = vaddl_u8(above_u8, left_u8);
+  const uint16_t sum = horizontal_add_uint16x8(al);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4);
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_8(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+  const uint16_t sum = dc_sum_8(left);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
   (void)above;
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_8(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3));
+  const uint16_t sum = dc_sum_8(above);
+  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
   (void)left;
   dc_store_8x8(dst, stride, dc);
 }
@@ -129,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 16x16
 
-static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) {
-  const uint8x16_t ref_u8 = vld1q_u8(ref);
-  const uint16x8_t p0 = vpaddlq_u8(ref_u8);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_16(const uint8_t *ref) {
+  return horizontal_add_uint8x16(vld1q_u8(ref));
 }
 
 static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
-                                  const uint8x8_t dc) {
-  const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0);
+                                  const uint8x16_t dc) {
   int i;
   for (i = 0; i < 16; ++i, dst += stride) {
-    vst1q_u8(dst, dc_dup);
+    vst1q_u8(dst + 0, dc);
   }
 }
 
@@ -150,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8x16_t ref0 = vld1q_u8(above);
   const uint8x16_t ref1 = vld1q_u8(left);
-  const uint16x8_t p0 = vpaddlq_u8(ref0);
-  const uint16x8_t p1 = vpaddlq_u8(ref1);
-  const uint16x8_t p2 = vaddq_u16(p0, p1);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-  uint8x8_t dc;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+  const uint16x8_t a = vpaddlq_u8(ref0);
+  const uint16x8_t l = vpaddlq_u8(ref1);
+  const uint16x8_t al = vaddq_u16(a, l);
+  const uint16_t sum = horizontal_add_uint16x8(al);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
   dc_store_16x16(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_16(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+  const uint16_t sum = dc_sum_16(left);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
   (void)above;
   dc_store_16x16(dst, stride, dc);
 }
@@ -173,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_16(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4));
+  const uint16_t sum = dc_sum_16(above);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
   (void)left;
   dc_store_16x16(dst, stride, dc);
 }
@@ -182,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc = vdup_n_u8(0x80);
+  const uint8x16_t dc = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
   dc_store_16x16(dst, stride, dc);
@@ -191,51 +171,41 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 32x32
 
-static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) {
-  const uint8x16x2_t r = vld2q_u8(ref);
-  const uint16x8_t p0 = vpaddlq_u8(r.val[0]);
-  const uint16x8_t p1 = vpaddlq_u8(r.val[1]);
-  const uint16x8_t p2 = vaddq_u16(p0, p1);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+static INLINE uint16_t dc_sum_32(const uint8_t *ref) {
+  const uint8x16_t r0 = vld1q_u8(ref + 0);
+  const uint8x16_t r1 = vld1q_u8(ref + 16);
+  const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
+  return horizontal_add_uint16x8(r01);
 }
 
 static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
-                                  const uint8x8_t dc) {
-  uint8x16x2_t dc_dup;
+                                  const uint8x16_t dc) {
   int i;
-  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0);
-
   for (i = 0; i < 32; ++i, dst += stride) {
-    vst2q_u8(dst, dc_dup);
+    vst1q_u8(dst + 0, dc);
+    vst1q_u8(dst + 16, dc);
   }
 }
 
 void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  const uint8x16x2_t a = vld2q_u8(above);
-  const uint8x16x2_t l = vld2q_u8(left);
-  const uint16x8_t pa0 = vpaddlq_u8(a.val[0]);
-  const uint16x8_t pl0 = vpaddlq_u8(l.val[0]);
-  const uint16x8_t pa1 = vpaddlq_u8(a.val[1]);
-  const uint16x8_t pl1 = vpaddlq_u8(l.val[1]);
-  const uint16x8_t pa = vaddq_u16(pa0, pa1);
-  const uint16x8_t pl = vaddq_u16(pl0, pl1);
-  const uint16x8_t pal = vaddq_u16(pa, pl);
-  uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal));
-  uint8x8_t dc;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6));
+  const uint8x16_t a0 = vld1q_u8(above + 0);
+  const uint8x16_t a1 = vld1q_u8(above + 16);
+  const uint8x16_t l0 = vld1q_u8(left + 0);
+  const uint8x16_t l1 = vld1q_u8(left + 16);
+  const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1));
+  const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1));
+  const uint16x8_t al = vaddq_u16(a01, l01);
+  const uint16_t sum = horizontal_add_uint16x8(al);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0);
   dc_store_32x32(dst, stride, dc);
 }
 
 void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_32(left);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+  const uint16_t sum = dc_sum_32(left);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
   (void)above;
   dc_store_32x32(dst, stride, dc);
 }
@@ -243,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint16x4_t sum = dc_sum_32(above);
-  const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5));
+  const uint16_t sum = dc_sum_32(above);
+  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
   (void)left;
   dc_store_32x32(dst, stride, dc);
 }
@@ -252,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
-  const uint8x8_t dc = vdup_n_u8(0x80);
+  const uint8x16_t dc = vdupq_n_u8(0x80);
   (void)above;
   (void)left;
   dc_store_32x32(dst, stride, dc);
@@ -262,123 +232,629 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 
 void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t ABCDEFGH = vld1_u8(above);
-  const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8);
-  const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16);
-  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
-  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
-  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
-  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
-  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
-  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
-  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
-  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
-  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+  uint8x8_t a0, a1, a2, d0;
+  uint8_t a7;
   (void)left;
-  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
-  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
-  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
-  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
-  vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7);
-}
 
-static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride,
-                               const uint8x8_t above_right, uint8x8_t *row) {
-  *row = vext_u8(*row, above_right, 1);
-  vst1_u8(*dst, *row);
-  *dst += stride;
+  a0 = vld1_u8(above);
+  a7 = above[7];
+
+  // [ above[1], ..., above[6], x, x ]
+  a1 = vext_u8(a0, a0, 1);
+  // [ above[2], ..., above[7], x, x ]
+  a2 = vext_u8(a0, a0, 2);
+
+  // d0[0] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[5] = AVG3(above[5], above[6], above[7]);
+  // d0[6] = x (don't care)
+  // d0[7] = x (don't care)
+  d0 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+
+  // We want:
+  // stride=0 [ d0[0], d0[1], d0[2],    d0[3] ]
+  // stride=1 [ d0[1], d0[2], d0[3],    d0[4] ]
+  // stride=2 [ d0[2], d0[3], d0[4],    d0[5] ]
+  // stride=2 [ d0[3], d0[4], d0[5], above[7] ]
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1));
+  store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2));
+  store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3));
+
+  // We stored d0[6] above, so fixup into above[7].
+  dst[3 * stride + 3] = a7;
 }
 
 void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t A0 = vld1_u8(above);
-  const uint8x8_t above_right = vdup_lane_u8(A0, 7);
-  const uint8x8_t A1 = vext_u8(A0, above_right, 1);
-  const uint8x8_t A2 = vext_u8(A0, above_right, 2);
-  const uint8x8_t avg1 = vhadd_u8(A0, A2);
-  uint8x8_t row = vrhadd_u8(avg1, A1);
+  uint8x8_t ax0, a0, a1, a7, d0;
   (void)left;
 
-  vst1_u8(dst, row);
-  dst += stride;
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  d45_store_8(&dst, stride, above_right, &row);
-  vst1_u8(dst, above_right);
-}
+  a0 = vld1_u8(above + 0);
+  a1 = vld1_u8(above + 1);
+  a7 = vld1_dup_u8(above + 7);
 
-static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride,
-                                const uint8x16_t above_right, uint8x16_t *row) {
-  *row = vextq_u8(*row, above_right, 1);
-  vst1q_u8(*dst, *row);
-  *dst += stride;
+  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
+  // shift in above[7] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[6] ]
+  ax0 = vext_u8(a0, a0, 7);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[7] = AVG3(above[6], above[7], above[8]);
+  d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[7].
+  vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1));
+  vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2));
+  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3));
+  vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4));
+  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5));
+  vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6));
+  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7));
+  vst1_u8(dst + 7 * stride, a7);
 }
 
 void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
-  const uint8x16_t A0 = vld1q_u8(above);
-  const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7);
-  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
-  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
-  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
-  uint8x16_t row = vrhaddq_u8(avg1, A1);
+  uint8x16_t ax0, a0, a1, a15, d0;
   (void)left;
 
-  vst1q_u8(dst, row);
-  dst += stride;
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  d45_store_16(&dst, stride, above_right, &row);
-  vst1q_u8(dst, above_right);
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a15 = vld1q_dup_u8(above + 15);
+
+  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
+  // shift in above[15] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[14] ]
+  ax0 = vextq_u8(a0, a0, 15);
+
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[15] = AVG3(above[14], above[15], above[16]);
+  d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[15].
+  vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1));
+  vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2));
+  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15));
+  vst1q_u8(dst + 15 * stride, a15);
 }
 
 void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
-  const uint8x16_t A0_0 = vld1q_u8(above);
-  const uint8x16_t A0_1 = vld1q_u8(above + 16);
-  const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7);
-  const uint8x16_t A1_0 = vld1q_u8(above + 1);
-  const uint8x16_t A1_1 = vld1q_u8(above + 17);
-  const uint8x16_t A2_0 = vld1q_u8(above + 2);
-  const uint8x16_t A2_1 = vld1q_u8(above + 18);
-  const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0);
-  const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1);
-  uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0);
-  uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1);
-  int i;
+  uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2];
   (void)left;
 
-  vst1q_u8(dst, row_0);
-  dst += 16;
-  vst1q_u8(dst, row_1);
-  dst += stride - 16;
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  a17 = vld1q_u8(above + 17);
+  a31 = vld1q_dup_u8(above + 31);
 
-  for (i = 0; i < 30; ++i) {
-    row_0 = vextq_u8(row_0, row_1, 1);
-    row_1 = vextq_u8(row_1, above_right, 1);
-    vst1q_u8(dst, row_0);
-    dst += 16;
-    vst1q_u8(dst, row_1);
-    dst += stride - 16;
-  }
+  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
+  // shift in above[15] later, so shift a0 across by one to get the right
+  // inputs:
+  // [ x, above[0], ... , above[14] ]
+  ax0 = vextq_u8(a0, a0, 15);
 
-  vst1q_u8(dst, above_right);
-  dst += 16;
-  vst1q_u8(dst, row_1);
+  // d0[0] = x (don't care)
+  // d0[1] = AVG3(above[0], above[1], above[2]);
+  // ...
+  // d0[15] = AVG3(above[14], above[15], above[16]);
+  d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
+  d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16);
+
+  // Undo the earlier ext, incrementally shift in duplicates of above[15].
+  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1));
+  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1));
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15));
+  vst1q_u8(dst + 15 * stride + 0, d0[1]);
+  vst1q_u8(dst + 15 * stride + 16, a31);
+
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1));
+  vst1q_u8(dst + 16 * stride + 16, a31);
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2));
+  vst1q_u8(dst + 17 * stride + 16, a31);
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3));
+  vst1q_u8(dst + 18 * stride + 16, a31);
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4));
+  vst1q_u8(dst + 19 * stride + 16, a31);
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5));
+  vst1q_u8(dst + 20 * stride + 16, a31);
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6));
+  vst1q_u8(dst + 21 * stride + 16, a31);
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7));
+  vst1q_u8(dst + 22 * stride + 16, a31);
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8));
+  vst1q_u8(dst + 23 * stride + 16, a31);
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9));
+  vst1q_u8(dst + 24 * stride + 16, a31);
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10));
+  vst1q_u8(dst + 25 * stride + 16, a31);
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11));
+  vst1q_u8(dst + 26 * stride + 16, a31);
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12));
+  vst1q_u8(dst + 27 * stride + 16, a31);
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13));
+  vst1q_u8(dst + 28 * stride + 16, a31);
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14));
+  vst1q_u8(dst + 29 * stride + 16, a31);
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15));
+  vst1q_u8(dst + 30 * stride + 16, a31);
+  vst1q_u8(dst + 31 * stride + 0, a31);
+  vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3;
+  (void)left;
+
+  a0 = load_unaligned_u8_4x1(above + 0);
+  a1 = load_unaligned_u8_4x1(above + 1);
+  a2 = load_unaligned_u8_4x1(above + 2);
+  a3 = load_unaligned_u8_4x1(above + 3);
+
+  d0 = vrhadd_u8(a0, a1);
+  d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+  d2 = vrhadd_u8(a1, a2);
+  d3 = vrhadd_u8(vhadd_u8(a1, a3), a2);
+
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1(dst + 2 * stride, d2);
+  store_u8_4x1(dst + 3 * stride, d3);
+}
+
+void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  uint8x8_t a0, a1, a2, a7, d0, d1;
+  (void)left;
+
+  a0 = vld1_u8(above + 0);
+  a1 = vld1_u8(above + 1);
+  a2 = vld1_u8(above + 2);
+  a7 = vld1_dup_u8(above + 7);
+
+  d0 = vrhadd_u8(a0, a1);
+  d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
+
+  vst1_u8(dst + 0 * stride, d0);
+  vst1_u8(dst + 1 * stride, d1);
+
+  d0 = vext_u8(d0, d0, 7);
+  d1 = vext_u8(d1, d1, 7);
+
+  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2));
+  vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2));
+  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3));
+  vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3));
+  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4));
+  vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4));
+}
+
+void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a0, a1, a2, a15, d0, d1;
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a2 = vld1q_u8(above + 2);
+  a15 = vld1q_dup_u8(above + 15);
+
+  d0 = vrhaddq_u8(a0, a1);
+  d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+
+  vst1q_u8(dst + 0 * stride, d0);
+  vst1q_u8(dst + 1 * stride, d1);
+
+  d0 = vextq_u8(d0, d0, 15);
+  d1 = vextq_u8(d1, d1, 15);
+
+  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8));
+  vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8));
+}
+
+void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi;
+  (void)left;
+
+  a0 = vld1q_u8(above + 0);
+  a1 = vld1q_u8(above + 1);
+  a2 = vld1q_u8(above + 2);
+  a16 = vld1q_u8(above + 16);
+  a17 = vld1q_u8(above + 17);
+  a18 = vld1q_u8(above + 18);
+  a31 = vld1q_dup_u8(above + 31);
+
+  d0_lo = vrhaddq_u8(a0, a1);
+  d0_hi = vrhaddq_u8(a16, a17);
+  d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17);
+
+  vst1q_u8(dst + 0 * stride + 0, d0_lo);
+  vst1q_u8(dst + 0 * stride + 16, d0_hi);
+  vst1q_u8(dst + 1 * stride + 0, d1_lo);
+  vst1q_u8(dst + 1 * stride + 16, d1_hi);
+
+  d0_hi = vextq_u8(d0_lo, d0_hi, 15);
+  d0_lo = vextq_u8(d0_lo, d0_lo, 15);
+  d1_hi = vextq_u8(d1_lo, d1_hi, 15);
+  d1_lo = vextq_u8(d1_lo, d1_lo, 15);
+
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15));
+  vst1q_u8(dst + 30 * stride + 0, d0_hi);
+  vst1q_u8(dst + 30 * stride + 16, a31);
+  vst1q_u8(dst + 31 * stride + 0, d1_hi);
+  vst1q_u8(dst + 31 * stride + 16, a31);
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1;
+
+  az = load_unaligned_u8_4x1(above - 1);
+  a0 = load_unaligned_u8_4x1(above + 0);
+  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2);
+  col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2);
+
+  d0 = vrhadd_u8(az, a0);
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+  d2 = vext_u8(col0, d0, 7);
+  d3 = vext_u8(col1, d1, 7);
+
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1(dst + 2 * stride, d2);
+  store_u8_4x1(dst + 3 * stride, d3);
+}
+
+void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+  az = vld1_u8(above - 1);
+  a0 = vld1_u8(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = vld1_u8(left + 0);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vext_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], above[0])
+  // d0[1] = AVG2(above[0], above[1])
+  // ...
+  // d0[7] = AVG2(above[6], above[7])
+  d0 = vrhadd_u8(az, a0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vector to put the elements to be shifted in
+  // at the end. The lowest two lanes here are unused:
+  // col0[7] = AVG3(above[-1], left[0], left[1])
+  // col0[6] = AVG3(left[0], left[1], left[2])
+  // ...
+  // col0[2] = AVG3(left[4], left[5], left[6])
+  // col0[1] = x (don't care)
+  // col0[0] = x (don't care)
+  col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0));
+
+  // We don't care about the first parameter to this uzp since we only ever use
+  // the high three elements, we just use col0 again since it is already
+  // available:
+  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
+  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
+  col0_even = vuzp_u8(col0, col0).val[1];
+  col0_odd = vuzp_u8(col0, col0).val[0];
+
+  // Incrementally shift more elements from col0 into d0/1:
+  // stride=0 [ d0[0],   d0[1],   d0[2],   d0[3], d0[4], d0[5], d0[6], d0[7] ]
+  // stride=1 [ d1[0],   d1[1],   d1[2],   d1[3], d1[4], d1[5], d1[6], d1[7] ]
+  // stride=2 [ col0[7], d0[0],   d0[1],   d0[2], d0[3], d0[4], d0[5], d0[6] ]
+  // stride=3 [ col0[6], d1[0],   d1[1],   d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=4 [ col0[5], col0[7], d0[0],   d0[1], d0[2], d0[3], d0[4], d0[5] ]
+  // stride=5 [ col0[4], col0[6], d1[0],   d1[1], d1[2], d1[3], d1[4], d1[5] ]
+  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
+  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  vst1_u8(dst + 0 * stride, d0);
+  vst1_u8(dst + 1 * stride, d1);
+  vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7));
+  vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7));
+  vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6));
+  vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6));
+  vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5));
+  vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5));
+}
+
+void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[15], x ]
+  l1 = vextq_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0 = vrhaddq_u8(az, a0);
+  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+
+  col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  col0 = vrev64q_u8(vextq_u8(col0, col0, 8));
+
+  // The low nine lanes here are unused so the first input to the uzp is
+  // unused, so just use a duplicate of col0 since we have it already. This
+  // also means that the lowest lane of col0 here is unused.
+  col0_even = vuzpq_u8(col0, col0).val[1];
+  col0_odd = vuzpq_u8(col0, col0).val[0];
+
+  vst1q_u8(dst + 0 * stride, d0);
+  vst1q_u8(dst + 1 * stride, d1);
+  vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15));
+  vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15));
+  vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14));
+  vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14));
+  vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13));
+  vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13));
+  vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12));
+  vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12));
+  vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11));
+  vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10));
+  vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10));
+  vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9));
+  vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9));
+}
+
+void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1,
+      l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  a14 = vld1q_u8(above + 14);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  l1 = vld1q_u8(left + 1);
+  l15 = vld1q_u8(left + 15);
+  l16 = vld1q_u8(left + 16);
+  // The last lane here is unused, reading left[32] would cause a buffer
+  // over-read (observed as an address-sanitizer failure), so just fill with a
+  // duplicate of left[16] to avoid needing to materialize a zero:
+  // [ left[17], ... , left[31], x ]
+  l17 = vextq_u8(l16, l16, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0_lo = vrhaddq_u8(az, a0);
+  d0_hi = vrhaddq_u8(a15, a16);
+  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+  // The last lane of col0_hi is unused here.
+  col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+  col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8));
+  col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8));
+
+  // The first lane of these are unused since they are only ever called as
+  // ext(col0, _, i) where i >= 1.
+  col0_even = vuzpq_u8(col0_hi, col0_lo).val[1];
+  col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0];
+
+  vst1q_u8(dst + 0 * stride + 0, d0_lo);
+  vst1q_u8(dst + 0 * stride + 16, d0_hi);
+  vst1q_u8(dst + 1 * stride + 0, d1_lo);
+  vst1q_u8(dst + 1 * stride + 16, d1_hi);
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
+  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
+  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
 }
 
 // -----------------------------------------------------------------------------
@@ -390,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x8_t L3210 = vrev64_u8(L0123);
   const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4);
   const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5);
-  const uint8x8_t L10XA0123_ =
-      vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8));
+  const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1);
   const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012);
   const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123);
-  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
-  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
-  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
-  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
-  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
-  vst1_lane_u32((uint32_t *)dst, r0, 0);
-  dst += stride;
-  vst1_lane_u32((uint32_t *)dst, r1, 0);
-  dst += stride;
-  vst1_lane_u32((uint32_t *)dst, r2, 0);
-  dst += stride;
-  vst1_lane_u32((uint32_t *)dst, r3, 0);
+
+  store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3));
+  store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2));
+  store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1));
+  store_u8_4x1(dst + 3 * stride, avg2);
 }
 
 void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
@@ -422,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
   const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
   const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);
-  const uint8x8_t row_0 = vget_low_u8(row);
-  const uint8x8_t row_1 = vget_high_u8(row);
-  const uint8x8_t r0 = vext_u8(row_0, row_1, 7);
-  const uint8x8_t r1 = vext_u8(row_0, row_1, 6);
-  const uint8x8_t r2 = vext_u8(row_0, row_1, 5);
-  const uint8x8_t r3 = vext_u8(row_0, row_1, 4);
-  const uint8x8_t r4 = vext_u8(row_0, row_1, 3);
-  const uint8x8_t r5 = vext_u8(row_0, row_1, 2);
-  const uint8x8_t r6 = vext_u8(row_0, row_1, 1);
 
-  vst1_u8(dst, r0);
-  dst += stride;
-  vst1_u8(dst, r1);
-  dst += stride;
-  vst1_u8(dst, r2);
-  dst += stride;
-  vst1_u8(dst, r3);
-  dst += stride;
-  vst1_u8(dst, r4);
-  dst += stride;
-  vst1_u8(dst, r5);
-  dst += stride;
-  vst1_u8(dst, r6);
-  dst += stride;
-  vst1_u8(dst, row_0);
+  vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7)));
+  vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6)));
+  vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5)));
+  vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4)));
+  vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3)));
+  vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2)));
+  vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1)));
+  vst1_u8(dst + 7 * stride, vget_low_u8(row));
 }
 
 static INLINE void d135_store_16x8(
@@ -489,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
   const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
   const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);
+
   const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
   const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
   const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
@@ -496,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
   const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
   const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
   const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
-  const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1));
+  const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8);
   const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
   const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
   const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
@@ -669,6 +1122,452 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 
 // -----------------------------------------------------------------------------
 
+void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02;
+
+  az = load_unaligned_u8_4x1(above - 1);
+  a0 = load_unaligned_u8_4x1(above + 0);
+  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = load_unaligned_u8_4x1(left + 0);
+  l1 = load_unaligned_u8_4x1(left + 1);
+  // [ above[-1], left[0], left[1], left[2], x, x, x, x ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  d0 = vrhadd_u8(azl0, l0);
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
+
+  d02 = vrev64_u8(vzip_u8(d0, d2).val[0]);
+
+  store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7));
+  store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5));
+  store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3));
+  store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1));
+}
+
+void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
+
+  az = vld1_u8(above - 1);
+  a0 = vld1_u8(above + 0);
+  // [ left[0], above[-1], ... , above[5] ]
+  l0az = vext_u8(vld1_dup_u8(left), az, 7);
+
+  l0 = vld1_u8(left);
+  // The last lane here is unused, reading left[8] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[7], x ]
+  l1 = vext_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[6] ]
+  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
+
+  // d0[0] = AVG2(above[-1], left[0])
+  // d0[1] = AVG2(left[0], left[1])
+  // ...
+  // d0[7] = AVG2(left[6], left[7])
+  d0 = vrhadd_u8(azl0, l0);
+
+  // d1[0] = AVG3(left[0], above[-1], above[0])
+  // d1[1] = AVG3(above[-1], above[0], above[1])
+  // ...
+  // d1[7] = AVG3(above[5], above[6], above[7])
+  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
+
+  // d2[0] = AVG3(above[-1], left[0], left[1])
+  // d2[1] = AVG3(left[0], left[1], left[2])
+  // ...
+  // d2[6] = AVG3(left[5], left[6], left[7])
+  // d2[7] = x (don't care)
+  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
+
+  // The ext instruction shifts elements in from the end of the vector rather
+  // than the start, so reverse the vectors to put the elements to be shifted
+  // in at the end. The lowest lane of d02_lo is unused.
+  d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0];
+  d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1];
+
+  // Incrementally shift more elements from d0/d2 reversed into d1:
+  // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
+  // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
+  // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
+  // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
+  // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
+  // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
+  // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
+  // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
+  vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7));
+  vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5));
+  vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3));
+  vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1));
+  vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7));
+  vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5));
+  vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3));
+  vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1));
+}
+
+void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left + 0);
+  // The last lane here is unused, reading left[16] could cause a buffer
+  // over-read, so just fill with a duplicate of left[0] to avoid needing to
+  // materialize a zero:
+  // [ left[1], ... , left[15], x ]
+  l1 = vextq_u8(l0, l0, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0 = vrhaddq_u8(azl0, l0);
+  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+
+  d0 = vrev64q_u8(vextq_u8(d0, d0, 8));
+  d2 = vrev64q_u8(vextq_u8(d2, d2, 8));
+
+  // The lowest lane of d02_lo is unused.
+  d02_lo = vzipq_u8(d2, d0).val[0];
+  d02_hi = vzipq_u8(d2, d0).val[1];
+
+  vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15));
+  vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13));
+  vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11));
+  vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9));
+  vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7));
+  vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5));
+  vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3));
+  vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1));
+  vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15));
+  vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13));
+  vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11));
+  vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9));
+  vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7));
+  vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5));
+  vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3));
+  vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1));
+}
+
+void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
+  uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo,
+      d0_hi, d1_lo, d1_hi, d2_lo, d2_hi;
+  uint8x16x2_t d02_hi, d02_lo;
+
+  az = vld1q_u8(above - 1);
+  a0 = vld1q_u8(above + 0);
+  a14 = vld1q_u8(above + 14);
+  a15 = vld1q_u8(above + 15);
+  a16 = vld1q_u8(above + 16);
+  // [ left[0], above[-1], ... , above[13] ]
+  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
+
+  l0 = vld1q_u8(left);
+  l1 = vld1q_u8(left + 1);
+  l15 = vld1q_u8(left + 15);
+  l16 = vld1q_u8(left + 16);
+  // The last lane here is unused, reading left[32] would cause a buffer
+  // over-read (observed as an address-sanitizer failure), so just fill with a
+  // duplicate of left[16] to avoid needing to materialize a zero:
+  // [ left[17], ... , left[31], x ]
+  l17 = vextq_u8(l16, l16, 1);
+  // [ above[-1], left[0], ... , left[14] ]
+  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
+
+  d0_lo = vrhaddq_u8(azl0, l0);
+  d0_hi = vrhaddq_u8(l15, l16);
+
+  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
+  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
+
+  // The highest lane of d2_hi is unused.
+  d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
+  d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
+
+  d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8));
+  d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8));
+
+  d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8));
+  d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8));
+
+  // d02_hi.val[0][0] is unused here.
+  d02_hi = vzipq_u8(d2_hi, d0_hi);
+  d02_lo = vzipq_u8(d2_lo, d0_lo);
+
+  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15));
+  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
+  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
+  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15));
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1));
+  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
+  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
+  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15));
+  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13));
+  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11));
+  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9));
+  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7));
+  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5));
+  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3));
+  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1));
+  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1;
+  (void)above;
+
+  // We need the low half lanes here for the c0/c1 arithmetic but the high half
+  // lanes for the ext:
+  // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ]
+  l0 = load_replicate_u8_4x1(left + 0);
+  l3 = vld1_dup_u8(left + 3);
+
+  // [ left[1], left[2], left[3], left[3], x, x, x, x ]
+  l1 = vext_u8(l0, l3, 5);
+  // [ left[2], left[3], left[3], left[3], x, x, x, x ]
+  l2 = vext_u8(l0, l3, 6);
+
+  c0 = vrhadd_u8(l0, l1);
+  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
+
+  // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ]
+  c01 = vzip_u8(c0, c1).val[0];
+
+  d0 = c01;
+  d1 = vext_u8(c01, l3, 2);
+
+  // Store the high half of the vector for stride={2,3} to avoid needing
+  // additional ext instructions:
+  // stride=0 [ c0[0], c1[0],   c0[1],   c1[1] ]
+  // stride=1 [ c0[1], c1[1],   c0[2],   c1[2] ]
+  // stride=2 [ c0[2], c1[2],   c0[3],   c1[3] ]
+  // stride=3 [ c0[3], c1[3], left[3], left[3] ]
+  store_u8_4x1(dst + 0 * stride, d0);
+  store_u8_4x1(dst + 1 * stride, d1);
+  store_u8_4x1_high(dst + 2 * stride, d0);
+  store_u8_4x1_high(dst + 3 * stride, d1);
+}
+
+void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi;
+  (void)above;
+
+  l0 = vld1_u8(left + 0);
+  l7 = vld1_dup_u8(left + 7);
+
+  // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
+  l1 = vext_u8(l0, l7, 1);
+  // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
+  l2 = vext_u8(l0, l7, 2);
+
+  c0 = vrhadd_u8(l0, l1);
+  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
+
+  c01_lo = vzip_u8(c0, c1).val[0];
+  c01_hi = vzip_u8(c0, c1).val[1];
+
+  vst1_u8(dst + 0 * stride, c01_lo);
+  vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2));
+  vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4));
+  vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6));
+  vst1_u8(dst + 4 * stride, c01_hi);
+  vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2));
+  vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4));
+  vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6));
+}
+
+void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi;
+  (void)above;
+
+  l0 = vld1q_u8(left + 0);
+  l15 = vld1q_dup_u8(left + 15);
+
+  l1 = vextq_u8(l0, l15, 1);
+  l2 = vextq_u8(l0, l15, 2);
+
+  c0 = vrhaddq_u8(l0, l1);
+  c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1);
+
+  c01_lo = vzipq_u8(c0, c1).val[0];
+  c01_hi = vzipq_u8(c0, c1).val[1];
+
+  vst1q_u8(dst + 0 * stride, c01_lo);
+  vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2));
+  vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4));
+  vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6));
+  vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8));
+  vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10));
+  vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12));
+  vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14));
+  vst1q_u8(dst + 8 * stride, c01_hi);
+  vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2));
+  vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4));
+  vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6));
+  vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8));
+  vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10));
+  vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12));
+  vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14));
+}
+
+void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo,
+      c1_hi, c01[4];
+  (void)above;
+
+  l0_lo = vld1q_u8(left + 0);
+  l0_hi = vld1q_u8(left + 16);
+  l31 = vld1q_dup_u8(left + 31);
+
+  l1_lo = vextq_u8(l0_lo, l0_hi, 1);
+  l1_hi = vextq_u8(l0_hi, l31, 1);
+  l2_lo = vextq_u8(l0_lo, l0_hi, 2);
+  l2_hi = vextq_u8(l0_hi, l31, 2);
+
+  c0_lo = vrhaddq_u8(l0_lo, l1_lo);
+  c0_hi = vrhaddq_u8(l0_hi, l1_hi);
+  c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo);
+  c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi);
+
+  c01[0] = vzipq_u8(c0_lo, c1_lo).val[0];
+  c01[1] = vzipq_u8(c0_lo, c1_lo).val[1];
+  c01[2] = vzipq_u8(c0_hi, c1_hi).val[0];
+  c01[3] = vzipq_u8(c0_hi, c1_hi).val[1];
+
+  vst1q_u8(dst + 0 * stride + 0, c01[0]);
+  vst1q_u8(dst + 0 * stride + 16, c01[1]);
+  vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2));
+  vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2));
+  vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4));
+  vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4));
+  vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6));
+  vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6));
+  vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8));
+  vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8));
+  vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10));
+  vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10));
+  vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12));
+  vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12));
+  vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14));
+  vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14));
+  vst1q_u8(dst + 8 * stride + 0, c01[1]);
+  vst1q_u8(dst + 8 * stride + 16, c01[2]);
+  vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2));
+  vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2));
+  vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4));
+  vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4));
+  vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6));
+  vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6));
+  vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8));
+  vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8));
+  vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10));
+  vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10));
+  vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12));
+  vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12));
+  vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14));
+  vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14));
+  vst1q_u8(dst + 16 * stride + 0, c01[2]);
+  vst1q_u8(dst + 16 * stride + 16, c01[3]);
+  vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2));
+  vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2));
+  vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4));
+  vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4));
+  vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6));
+  vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6));
+  vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8));
+  vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8));
+  vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10));
+  vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10));
+  vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12));
+  vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12));
+  vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14));
+  vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14));
+  vst1q_u8(dst + 24 * stride + 0, c01[3]);
+  vst1q_u8(dst + 24 * stride + 16, l31);
+  vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2));
+  vst1q_u8(dst + 25 * stride + 16, l31);
+  vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4));
+  vst1q_u8(dst + 26 * stride + 16, l31);
+  vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6));
+  vst1q_u8(dst + 27 * stride + 16, l31);
+  vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8));
+  vst1q_u8(dst + 28 * stride + 16, l31);
+  vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10));
+  vst1q_u8(dst + 29 * stride + 16, l31);
+  vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12));
+  vst1q_u8(dst + 30 * stride + 16, l31);
+  vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14));
+  vst1q_u8(dst + 31 * stride + 16, l31);
+}
+
+// -----------------------------------------------------------------------------
+
 #if !HAVE_NEON_ASM
 
 void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
index a042d40acb..a81a9d1013 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -201,7 +201,7 @@
     str         lr, [sp, #16]              ; thresh1
     add         sp, #4
     pop         {r0-r1, lr}
-    add         r0, r1, lsl #3             ; s + 8 * pitch
+    add         r0, r0, r1, lsl #3         ; s + 8 * pitch
     b           vpx_lpf_vertical_8_neon
     ENDP        ; |vpx_lpf_vertical_8_dual_neon|
 
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
index 7419cea022..579096d78a 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c
@@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_)  // flip_sign_16
 
 #define FUN_FLIP_SIGN_BACK(w, r)                                         \
   static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
-    const int8x##w##_t sign_bit = vdup##r##n_s8(0x80);                   \
+    const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80);           \
     return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit));             \
   }
 
@@ -975,6 +975,17 @@ FUN_LPF_16_KERNEL(_, 8)        // lpf_16_kernel
 FUN_LPF_16_KERNEL(_dual_, 16)  // lpf_16_dual_kernel
 #undef FUN_LPF_16_KERNEL
 
+// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42|
+// warning: 'oq1' may be used uninitialized in this function
+// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding
+// an additional branch this warning cannot be silenced otherwise. The
+// loopfilter is only called when needed for a block so these output pixels
+// will be set.
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
   uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6,
@@ -1090,3 +1101,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
               vget_high_u8(oq0), vget_high_u8(oq1));
   }
 }
+
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..7088eb5450
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h
@@ -0,0 +1,757 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Support for these xN intrinsics is lacking in older versions of GCC.
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ < 8 || defined(__arm__)
+static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) {
+  uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+  return res;
+}
+#endif
+
+#if __GNUC__ < 9 || defined(__arm__)
+static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) {
+  uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+                         vld1q_u8(ptr + 2 * 16) } };
+  return res;
+}
+#endif
+#endif
+
+static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1,
+                                          const int16_t c2, const int16_t c3) {
+  return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) |
+                     ((uint64_t)(uint16_t)c2 << 32) | ((uint64_t)c3 << 48));
+}
+
+static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) {
+  return vcreate_s32((uint32_t)c0 | ((uint64_t)(uint32_t)c1 << 32));
+}
+
+static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1,
+                                          const int32_t c2, const int32_t c3) {
+  return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3));
+}
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4x2_t v0 = vld2q_s32(buf);
+  const int32x4x2_t v1 = vld2q_s32(buf + 8);
+  const int16x4_t s0 = vmovn_s32(v0.val[0]);
+  const int16x4_t s1 = vmovn_s32(v0.val[1]);
+  const int16x4_t s2 = vmovn_s32(v1.val[0]);
+  const int16x4_t s3 = vmovn_s32(v1.val[1]);
+  int16x8x2_t res;
+  res.val[0] = vcombine_s16(s0, s2);
+  res.val[1] = vcombine_s16(s1, s3);
+  return res;
+#else
+  return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  const int32x4_t v1 = vld1q_s32(buf + 4);
+  const int16x4_t s0 = vmovn_s32(v0);
+  const int16x4_t s1 = vmovn_s32(v1);
+  return vcombine_s16(s0, s1);
+#else
+  return vld1q_s16(buf);
+#endif
+}
+
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  return vmovn_s32(v0);
+#else
+  return vld1_s16(buf);
+#endif
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+  vst1q_s32(buf, v0);
+  vst1q_s32(buf + 4, v1);
+#else
+  vst1q_s16(buf, a);
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) {
+  vst1q_s32(buf, a);
+}
+
+static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) {
+  return vld1q_s32(buf);
+}
+#endif
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of uint32_t (4 bytes) and add alignment hints
+// to the memory access.
+//
+// This is used for functions operating on uint8_t which wish to load or store 4
+// values at a time but which may not be on 4 byte boundaries.
+static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
+  memcpy(buf, &a, 4);
+}
+
+// Load 4 contiguous bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+  uint32_t a;
+  uint32x2_t a_u32;
+  memcpy(&a, buf, 4);
+  a_u32 = vdup_n_u32(0);
+  a_u32 = vset_lane_u32(a, a_u32, 0);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 4 contiguous bytes and replicate across a vector when alignment is not
+// guaranteed.
+static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) {
+  uint32_t a;
+  memcpy(&a, buf, 4);
+  return vreinterpret_u8_u32(vdup_n_u32(a));
+}
+
+// Store 4 contiguous bytes from the low half of an 8x8 vector.
+static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) {
+  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0);
+}
+
+// Store 4 contiguous bytes from the high half of an 8x8 vector.
+static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) {
+  vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
+                                          ptrdiff_t stride) {
+  uint32_t a;
+  uint32x2_t a_u32 = vdup_n_u32(0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vset_lane_u32(a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  a_u32 = vset_lane_u32(a, a_u32, 1);
+  return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 8 bytes when alignment is not guaranteed.
+static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) {
+  uint64_t a;
+  uint64x1_t a_u64 = vdup_n_u64(0);
+  memcpy(&a, buf, 8);
+  a_u64 = vset_lane_u64(a, a_u64, 0);
+  return vreinterpret_u16_u64(a_u64);
+}
+
+// Load 2 sets of 8 bytes when alignment is not guaranteed.
+static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
+                                             ptrdiff_t stride) {
+  uint64_t a;
+  uint64x2_t a_u64 = vdupq_n_u64(0);
+  memcpy(&a, buf, 8);
+  buf += stride;
+  a_u64 = vsetq_lane_u64(a, a_u64, 0);
+  memcpy(&a, buf, 8);
+  a_u64 = vsetq_lane_u64(a, a_u64, 1);
+  return vreinterpretq_u16_u64(a_u64);
+}
+
+// Store 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
+                                      const uint8x8_t a) {
+  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+  uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
+  buf += stride;
+  uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
+                                            ptrdiff_t stride) {
+  uint32_t a;
+  uint32x4_t a_u32 = vdupq_n_u32(0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 0);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 1);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 2);
+  memcpy(&a, buf, 4);
+  buf += stride;
+  a_u32 = vsetq_lane_u32(a, a_u32, 3);
+  return vreinterpretq_u8_u32(a_u32);
+}
+
+// Store 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride,
+                                       const uint8x16_t a) {
+  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
+  buf += stride;
+  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
+  buf += stride;
+  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
+  buf += stride;
+  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
+}
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) {
+  uint32x2_t a = vdup_n_u32(0);
+
+  assert(!((intptr_t)buf % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
+
+  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
+  buf += stride;
+  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
+  return vreinterpret_u8_u32(a);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
+  uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+
+  assert(!((intptr_t)buf % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
+
+  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
+  buf += stride;
+  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
+}
+
+static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+}
+
+static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+}
+
+static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+                                 const uint8x16_t s0, const uint8x16_t s1,
+                                 const uint8x16_t s2, const uint8x16_t s3) {
+  vst1q_u8(s, s0);
+  s += p;
+  vst1q_u8(s, s1);
+  s += p;
+  vst1q_u8(s, s2);
+  s += p;
+  vst1q_u8(s, s3);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+  s += p;
+  *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
+                                uint8x8_t *const s0, uint8x8_t *const s1,
+                                uint8x8_t *const s2, uint8x8_t *const s3,
+                                uint8x8_t *const s4, uint8x8_t *const s5,
+                                uint8x8_t *const s6, uint8x8_t *const s7,
+                                uint8x8_t *const s8, uint8x8_t *const s9,
+                                uint8x8_t *const s10) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+  s += p;
+  *s7 = vld1_u8(s);
+  s += p;
+  *s8 = vld1_u8(s);
+  s += p;
+  *s9 = vld1_u8(s);
+  s += p;
+  *s10 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3,
+                                const uint8x8_t s4, const uint8x8_t s5,
+                                const uint8x8_t s6, const uint8x8_t s7) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+  s += p;
+  vst1_u8(s, s4);
+  s += p;
+  vst1_u8(s, s5);
+  s += p;
+  vst1_u8(s, s6);
+  s += p;
+  vst1_u8(s, s7);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3,
+                                uint8x16_t *const s4, uint8x16_t *const s5,
+                                uint8x16_t *const s6, uint8x16_t *const s7) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+  s += p;
+  *s4 = vld1q_u8(s);
+  s += p;
+  *s5 = vld1q_u8(s);
+  s += p;
+  *s6 = vld1q_u8(s);
+  s += p;
+  *s7 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+                                 const uint8x16_t s0, const uint8x16_t s1,
+                                 const uint8x16_t s2, const uint8x16_t s3,
+                                 const uint8x16_t s4, const uint8x16_t s5,
+                                 const uint8x16_t s6, const uint8x16_t s7) {
+  vst1q_u8(s, s0);
+  s += p;
+  vst1q_u8(s, s1);
+  s += p;
+  vst1q_u8(s, s2);
+  s += p;
+  vst1q_u8(s, s3);
+  s += p;
+  vst1q_u8(s, s4);
+  s += p;
+  vst1q_u8(s, s5);
+  s += p;
+  vst1q_u8(s, s6);
+  s += p;
+  vst1q_u8(s, s7);
+}
+
+static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2) {
+  vst1_u16(s, s0);
+  s += p;
+  vst1_u16(s, s1);
+  s += p;
+  vst1_u16(s, s2);
+}
+
+static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                int16x4_t *s3) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x11(const int16_t *s, const ptrdiff_t p,
+                                 int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                 int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+                                 int16x4_t *s6, int16x4_t *s7, int16x4_t *s8,
+                                 int16x4_t *s9, int16x4_t *s10) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+  s += p;
+  *s7 = vld1_s16(s);
+  s += p;
+  *s8 = vld1_s16(s);
+  s += p;
+  *s9 = vld1_s16(s);
+  s += p;
+  *s10 = vld1_s16(s);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2, const uint16x4_t s3) {
+  vst1_u16(s, s0);
+  s += p;
+  vst1_u16(s, s1);
+  s += p;
+  vst1_u16(s, s2);
+  s += p;
+  vst1_u16(s, s3);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+                                int16x4_t *s6) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                int16x8_t *s3) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3) {
+  vst1q_u16(s, s0);
+  s += p;
+  vst1q_u16(s, s1);
+  s += p;
+  vst1q_u16(s, s2);
+  s += p;
+  vst1q_u16(s, s3);
+}
+
+static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2) {
+  vst1q_u16(s, s0);
+  s += p;
+  vst1q_u16(s, s1);
+  s += p;
+  vst1q_u16(s, s2);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+                                int16x8_t *s6) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+                                uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+                                uint16x8_t *s6, uint16x8_t *s7) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+  *s4 = vld1q_u16(s);
+  s += p;
+  *s5 = vld1q_u16(s);
+  s += p;
+  *s6 = vld1q_u16(s);
+  s += p;
+  *s7 = vld1q_u16(s);
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p,
+                                int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
+                                int16x4_t *s3, int16x4_t *s4, int16x4_t *s5,
+                                int16x4_t *s6, int16x4_t *s7) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 = vld1_s16(s);
+  s += p;
+  *s5 = vld1_s16(s);
+  s += p;
+  *s6 = vld1_s16(s);
+  s += p;
+  *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_8x11(const int16_t *s, const ptrdiff_t p,
+                                 int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                 int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+                                 int16x8_t *s6, int16x8_t *s7, int16x8_t *s8,
+                                 int16x8_t *s9, int16x8_t *s10) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+  s += p;
+  *s8 = vld1q_s16(s);
+  s += p;
+  *s9 = vld1q_s16(s);
+  s += p;
+  *s10 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x12(const int16_t *s, const ptrdiff_t p,
+                                 int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                 int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+                                 int16x8_t *s6, int16x8_t *s7, int16x8_t *s8,
+                                 int16x8_t *s9, int16x8_t *s10,
+                                 int16x8_t *s11) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+  s += p;
+  *s8 = vld1q_s16(s);
+  s += p;
+  *s9 = vld1q_s16(s);
+  s += p;
+  *s10 = vld1q_s16(s);
+  s += p;
+  *s11 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p,
+                                int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
+                                int16x8_t *s3, int16x8_t *s4, int16x8_t *s5,
+                                int16x8_t *s6, int16x8_t *s7) {
+  *s0 = vld1q_s16(s);
+  s += p;
+  *s1 = vld1q_s16(s);
+  s += p;
+  *s2 = vld1q_s16(s);
+  s += p;
+  *s3 = vld1q_s16(s);
+  s += p;
+  *s4 = vld1q_s16(s);
+  s += p;
+  *s5 = vld1q_s16(s);
+  s += p;
+  *s6 = vld1q_s16(s);
+  s += p;
+  *s7 = vld1q_s16(s);
+}
+
+#endif  // VPX_VPX_DSP_ARM_MEM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
new file mode 100644
index 0000000000..e2351fa2cc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -0,0 +1,286 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
+                                               const int16x8_t dequant,
+                                               tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t dqcoeff_0 =
+      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  const int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+#else
+  vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                const int16x8_t round, const int16x8_t quant,
+                const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+  qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the relevant coeffs
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const struct macroblock_plane *const mb_plane,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  int16_t const *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vld1q_s16(mb_plane->zbin);
+  int16x8_t round = vld1q_s16(mb_plane->round);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+                        quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  n_coeffs -= 8;
+
+  {
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
+
+    do {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                          quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+      n_coeffs -= 8;
+    } while (n_coeffs > 0);
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
+
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
+                                                     const int16x8_t dequant,
+                                                     tran_low_t *dqcoeff_ptr) {
+  int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+  int32x4_t dqcoeff_1 =
+      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+  dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+#else
+  vst1q_s16(dqcoeff_ptr,
+            vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                      const int16x8_t round, const int16x8_t quant,
+                      const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+  qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the relevant coeffs
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
+// Main difference is that zbin values are halved before comparison and dqcoeff
+// values are divided by 2. zbin is rounded but dqcoeff is not.
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *mb_plane,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const struct ScanOrder *const scan_order) {
+  const int16x8_t neg_one = vdupq_n_s16(-1);
+  uint16x8_t eob_max;
+  int i;
+  const int16_t *iscan = scan_order->iscan;
+
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1);
+  int16x8_t quant = vld1q_s16(mb_plane->quant);
+  int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
+
+  // Process first 8 values which include a dc component.
+  {
+    const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+    const int16x8_t qcoeff =
+        quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                              quant, quant_shift, dequant);
+
+    // Set non-zero elements to -1 and use that to extract values for eob.
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+    __builtin_prefetch(coeff_ptr + 64);
+    coeff_ptr += 8;
+    iscan += 8;
+    qcoeff_ptr += 8;
+    dqcoeff_ptr += 8;
+  }
+
+  {
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
+
+    for (i = 1; i < 32 * 32 / 8; ++i) {
+      const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+      const int16x8_t qcoeff =
+          quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                                quant, quant_shift, dequant);
+
+      // Set non-zero elements to -1 and use that to extract values for eob.
+      eob_max =
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+      __builtin_prefetch(coeff_ptr + 64);
+      coeff_ptr += 8;
+      iscan += 8;
+      qcoeff_ptr += 8;
+      dqcoeff_ptr += 8;
+    }
+  }
+
+#if VPX_ARCH_AARCH64
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
+  {
+    const uint16x4_t eob_max_0 =
+        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+    vst1_lane_u16(eob_ptr, eob_max_2, 0);
+  }
+#endif  // VPX_ARCH_AARCH64
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
index dc20398000..713eec7a92 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -10,215 +10,219 @@
 
 #include <arm_neon.h>
 
+#include <assert.h>
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
 
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint16x8_t *const sad_sum) {
+  uint8x16_t abs_diff = vabdq_u8(src, ref);
+  *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
 }
 
-// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
-// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
-// and vec_sum_ref_hi.
-static void sad_neon_64(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16,
-                        const uint8x16_t vec_src_32,
-                        const uint8x16_t vec_src_48, const uint8_t *ref,
-                        uint16x8_t *vec_sum_ref_lo,
-                        uint16x8_t *vec_sum_ref_hi) {
-  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
 
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
-                             vget_low_u8(vec_ref_00));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
-                             vget_high_u8(vec_ref_00));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
-                             vget_low_u8(vec_ref_16));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
-                             vget_high_u8(vec_ref_16));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
-                             vget_low_u8(vec_ref_32));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
-                             vget_high_u8(vec_ref_32));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
-                             vget_low_u8(vec_ref_48));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
-                             vget_high_u8(vec_ref_48));
+  int i = 0;
+  do {
+    uint8x16_t s0, s1, s2, s3;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    s2 = vld1q_u8(src + i * src_stride + 32);
+    sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+    sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+    sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+    sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+    s3 = vld1q_u8(src + i * src_stride + 48);
+    sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+    sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+    sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+    sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+    i++;
+  } while (i < h);
+
+  vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
 }
 
-// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
-// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
-static void sad_neon_32(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16, const uint8_t *ref,
-                        uint16x8_t *vec_sum_ref_lo,
-                        uint16x8_t *vec_sum_ref_hi) {
-  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
+  uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                           vdupq_n_u16(0) };
 
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
-                             vget_low_u8(vec_ref_00));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
-                             vget_high_u8(vec_ref_00));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
-                             vget_low_u8(vec_ref_16));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
-                             vget_high_u8(vec_ref_16));
+  int i = 0;
+  do {
+    uint8x16_t s0, s1;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    i++;
+  } while (i < h);
+
+  vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi));
 }
 
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+                                   const uint8_t *const ref[4], int ref_stride,
+                                   uint32_t res[4], int h) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
 
-  for (i = 0; i < 64; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+  int i = 0;
+  do {
+    const uint8x16_t s = vld1q_u8(src + i * src_stride);
+    sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+    sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+    sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+    sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
 
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
-                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
-                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
-                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
-                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+    i++;
+  } while (i < h);
 
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
+                             uint16x8_t *const sad_sum) {
+  uint8x8_t abs_diff = vabd_u8(src, ref);
+  *sad_sum = vaddw_u8(*sad_sum, abs_diff);
+}
+
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[4], int ref_stride,
+                                  uint32_t res[4], int h) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  int i = 0;
+  do {
+    const uint8x8_t s = vld1_u8(src + i * src_stride);
+    sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
+    sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
+    sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
+    sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
+
+    i++;
+  } while (i < h);
+
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+                                  const uint8_t *const ref[4], int ref_stride,
+                                  uint32_t res[4], int h) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+
+  int i = 0;
+  do {
+    uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
+    uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
+    uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
+    uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
+    uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
+
+    sad8_neon(s, r0, &sum[0]);
+    sad8_neon(s, r1, &sum[1]);
+    sad8_neon(s, r2, &sum[2]);
+    sad8_neon(s, r3, &sum[3]);
+
+    i += 2;
+  } while (i < h);
+
+  vst1q_u32(res, horizontal_add_4d_uint16x8(sum));
+}
+
+#define SAD_WXH_4D_NEON(w, h)                                                 \
+  void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride,     \
+                                  const uint8_t *const ref_array[4],          \
+                                  int ref_stride, uint32_t sad_array[4]) {    \
+    sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \
+                       (h));                                                  \
   }
 
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
 
-void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
 
-  for (i = 0; i < 32; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
 
-    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
-                &vec_sum_ref0_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
-                &vec_sum_ref1_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
-                &vec_sum_ref2_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
-                &vec_sum_ref3_hi);
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
 
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+
+#undef SAD_WXH_4D_NEON
+
+#define SAD_SKIP_WXH_4D_NEON(w, h)                                         \
+  void vpx_sad_skip_##w##x##h##x4d_neon(                                   \
+      const uint8_t *src_ptr, int src_stride,                              \
+      const uint8_t *const ref_array[4], int ref_stride,                   \
+      uint32_t sad_array[4]) {                                             \
+    sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \
+                       sad_array, ((h) >> 1));                             \
+    sad_array[0] <<= 1;                                                    \
+    sad_array[1] <<= 1;                                                    \
+    sad_array[2] <<= 1;                                                    \
+    sad_array[3] <<= 1;                                                    \
   }
 
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
 
-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
 
-  for (i = 0; i < 16; ++i) {
-    const uint8x16_t vec_src = vld1q_u8(src);
-    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
-    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
-    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
-    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
 
-    vec_sum_ref0_lo =
-        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
-    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref0));
-    vec_sum_ref1_lo =
-        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
-    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref1));
-    vec_sum_ref2_lo =
-        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
-    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref2));
-    vec_sum_ref3_lo =
-        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
-    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref3));
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
 
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
 
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c
new file mode 100644
index 0000000000..933fc48b8c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c
@@ -0,0 +1,176 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+                              uint32x4_t *const sad_sum) {
+  uint8x16_t abs_diff = vabdq_u8(src, ref);
+  *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1, s2, s3;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+    s2 = vld1q_u8(src + i * src_stride + 32);
+    sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
+    sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
+    sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
+    sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
+
+    s3 = vld1q_u8(src + i * src_stride + 48);
+    sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
+    sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
+    sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
+    sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                           vdupq_n_u32(0) };
+  uint32x4_t sum[4];
+
+  int i = 0;
+  do {
+    uint8x16_t s0, s1;
+
+    s0 = vld1q_u8(src + i * src_stride);
+    sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
+    sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
+    sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
+    sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
+
+    s1 = vld1q_u8(src + i * src_stride + 16);
+    sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
+    sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
+    sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
+    sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
+
+  } while (++i < h);
+
+  sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+  sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+  sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+  sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+                                           const uint8_t *const ref[4],
+                                           int ref_stride, uint32_t res[4],
+                                           int h) {
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+
+  int i = 0;
+  do {
+    const uint8x16_t s = vld1q_u8(src + i * src_stride);
+    sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
+    sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
+    sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
+    sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
+
+  } while (++i < h);
+
+  vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
+}
+
+#define SAD_WXH_4D_NEON_DOTPROD(w, h)                                      \
+  void vpx_sad##w##x##h##x4d_neon_dotprod(                                 \
+      const uint8_t *src_ptr, int src_stride,                              \
+      const uint8_t *const ref_array[4], int ref_stride,                   \
+      uint32_t sad_array[4]) {                                             \
+    sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \
+                               sad_array, (h));                            \
+  }
+
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h)                             \
+  void vpx_sad_skip_##w##x##h##x4d_neon_dotprod(                       \
+      const uint8_t *src_ptr, int src_stride,                          \
+      const uint8_t *const ref_array[4], int ref_stride,               \
+      uint32_t sad_array[4]) {                                         \
+    sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array,     \
+                               2 * ref_stride, sad_array, ((h) >> 1)); \
+    sad_array[0] <<= 1;                                                \
+    sad_array[1] <<= 1;                                                \
+    sad_array[2] <<= 1;                                                \
+    sad_array[3] <<= 1;                                                \
+  }
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
index ff3228768c..4dd87ddc0f 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c
@@ -11,213 +11,381 @@
 #include <arm_neon.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
 
-unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
-  uint8x8_t d0, d8;
-  uint16x8_t q12;
-  uint32x4_t q1;
-  uint64x2_t q3;
-  uint32x2_t d5;
-  int i;
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32;
 
-  d0 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(d0, d8);
+  int i = h;
+  do {
+    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+    uint8x16_t diff0, diff1, diff2, diff3;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    s1 = vld1q_u8(src_ptr + 16);
+    r1 = vld1q_u8(ref_ptr + 16);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    s2 = vld1q_u8(src_ptr + 32);
+    r2 = vld1q_u8(ref_ptr + 32);
+    diff2 = vabdq_u8(s2, r2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    s3 = vld1q_u8(src_ptr + 48);
+    r3 = vld1q_u8(ref_ptr + 48);
+    diff3 = vabdq_u8(s3, r3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
 
-  for (i = 0; i < 15; i++) {
-    d0 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, d0, d8);
-  }
+  } while (--i != 0);
 
-  q1 = vpaddlq_u16(q12);
-  q3 = vpaddlq_u32(q1);
-  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                vreinterpret_u32_u64(vget_high_u64(q3)));
+  sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
 
-  return vget_lane_u32(d5, 0);
+  return horizontal_add_uint32x4(sum_u32);
 }
 
-unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
-                             unsigned char *ref_ptr, int ref_stride) {
-  uint8x8_t d0, d8;
-  uint16x8_t q12;
-  uint32x2_t d1;
-  uint64x1_t d3;
-  int i;
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint32x4_t sum = vdupq_n_u32(0);
 
-  d0 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(d0, d8);
+  int i = h;
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t diff0 = vabdq_u8(s0, r0);
+    uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t diff1 = vabdq_u8(s1, r1);
+    uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+    sum = vpadalq_u16(sum, sum0);
+    sum = vpadalq_u16(sum, sum1);
 
-  for (i = 0; i < 3; i++) {
-    d0 = vld1_u8(src_ptr);
     src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
     ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, d0, d8);
-  }
+  } while (--i != 0);
 
-  d1 = vpaddl_u16(vget_low_u16(q12));
-  d3 = vpaddl_u32(d1);
-
-  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+  return horizontal_add_uint32x4(sum);
 }
 
-unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
-  uint8x16_t q0, q4;
-  uint16x8_t q12, q13;
-  uint32x4_t q1;
-  uint64x2_t q3;
-  uint32x2_t d5;
-  int i;
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  uint16x8_t sum = vdupq_n_u16(0);
 
-  q0 = vld1q_u8(src_ptr);
-  src_ptr += src_stride;
-  q4 = vld1q_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+  int i = h;
+  do {
+    uint8x16_t s = vld1q_u8(src_ptr);
+    uint8x16_t r = vld1q_u8(ref_ptr);
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sum = vpadalq_u8(sum, diff);
 
-  for (i = 0; i < 7; i++) {
-    q0 = vld1q_u8(src_ptr);
     src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
     ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint8x8_t s = vld1_u8(src_ptr);
+    uint8x8_t r = vld1_u8(ref_ptr);
+
+    sum = vabal_u8(sum, s, r);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int h) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h / 2;
+  do {
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+
+    sum = vabal_u8(sum, s, r);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+#define SAD_WXH_NEON(w, h)                                                   \
+  unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride,   \
+                                       const uint8_t *ref, int ref_stride) { \
+    return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h));           \
   }
 
-  q12 = vaddq_u16(q12, q13);
-  q1 = vpaddlq_u16(q12);
-  q3 = vpaddlq_u32(q1);
-  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                vreinterpret_u32_u64(vget_high_u64(q3)));
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
 
-  return vget_lane_u32(d5, 0);
-}
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
 
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
-  const uint32x4_t a = vpaddlq_u16(vec_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
 
-unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-  for (i = 0; i < 64; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
-                            vget_low_u8(vec_ref_32));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
-                            vget_high_u8(vec_ref_32));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
-                            vget_low_u8(vec_ref_48));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
-                            vget_high_u8(vec_ref_48));
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+
+#undef SAD_WXH_NEON
+
+#define SAD_SKIP_WXH_NEON(w, h)                                                \
+  unsigned int vpx_sad_skip_##w##x##h##_neon(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
   }
-  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+
+#undef SAD_SKIP_WXH_NEON
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                        vdupq_n_u16(0) };
+  uint32x4_t sum_u32;
+
+  int i = h;
+  do {
+    uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+    uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vpadalq_u8(sum[0], diff0);
+
+    s1 = vld1q_u8(src_ptr + 16);
+    r1 = vld1q_u8(ref_ptr + 16);
+    p1 = vld1q_u8(second_pred + 16);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vpadalq_u8(sum[1], diff1);
+
+    s2 = vld1q_u8(src_ptr + 32);
+    r2 = vld1q_u8(ref_ptr + 32);
+    p2 = vld1q_u8(second_pred + 32);
+    avg2 = vrhaddq_u8(r2, p2);
+    diff2 = vabdq_u8(s2, avg2);
+    sum[2] = vpadalq_u8(sum[2], diff2);
+
+    s3 = vld1q_u8(src_ptr + 48);
+    r3 = vld1q_u8(ref_ptr + 48);
+    p3 = vld1q_u8(second_pred + 48);
+    avg3 = vrhaddq_u8(r3, p3);
+    diff3 = vabdq_u8(s3, avg3);
+    sum[3] = vpadalq_u8(sum[3], diff3);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
+  } while (--i != 0);
+
+  sum_u32 = vpaddlq_u16(sum[0]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+  sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+  return horizontal_add_uint32x4(sum_u32);
 }
 
-unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint32x4_t sum = vdupq_n_u32(0);
 
-  for (i = 0; i < 32; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
+  int i = h;
+  do {
+    uint8x16_t s0 = vld1q_u8(src_ptr);
+    uint8x16_t r0 = vld1q_u8(ref_ptr);
+    uint8x16_t p0 = vld1q_u8(second_pred);
+    uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+    uint8x16_t diff0 = vabdq_u8(s0, avg0);
+    uint16x8_t sum0 = vpaddlq_u8(diff0);
+
+    uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+    uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+    uint8x16_t p1 = vld1q_u8(second_pred + 16);
+    uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+    uint8x16_t diff1 = vabdq_u8(s1, avg1);
+    uint16x8_t sum1 = vpaddlq_u8(diff1);
+
+    sum = vpadalq_u16(sum, sum0);
+    sum = vpadalq_u16(sum, sum1);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sum);
+}
+
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vld1q_u8(src_ptr);
+    uint8x16_t r = vld1q_u8(ref_ptr);
+    uint8x16_t p = vld1q_u8(second_pred);
+
+    uint8x16_t avg = vrhaddq_u8(r, p);
+    uint8x16_t diff = vabdq_u8(s, avg);
+    sum = vpadalq_u8(sum, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           const uint8_t *second_pred) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h;
+  do {
+    uint8x8_t s = vld1_u8(src_ptr);
+    uint8x8_t r = vld1_u8(ref_ptr);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t avg = vrhadd_u8(r, p);
+    sum = vabal_u8(sum, s, avg);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           const uint8_t *second_pred) {
+  uint16x8_t sum = vdupq_n_u16(0);
+
+  int i = h / 2;
+  do {
+    uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    uint8x8_t p = vld1_u8(second_pred);
+
+    uint8x8_t avg = vrhadd_u8(r, p);
+    sum = vabal_u8(sum, s, avg);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    second_pred += 8;
+  } while (--i != 0);
+
+  return horizontal_add_uint16x8(sum);
+}
+
+#define SAD_WXH_AVG_NEON(w, h)                                             \
+  uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       const uint8_t *second_pred) {       \
+    return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h),      \
+                               second_pred);                               \
   }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
 
-unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
 
-  for (i = 0; i < 16; ++i) {
-    const uint8x16_t vec_src = vld1q_u8(src);
-    const uint8x16_t vec_ref = vld1q_u8(ref);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo =
-        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
-    vec_accum_hi =
-        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
-  }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
 
-unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum = vdupq_n_u16(0);
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
 
-  for (i = 0; i < 8; ++i) {
-    const uint8x8_t vec_src = vld1_u8(src);
-    const uint8x8_t vec_ref = vld1_u8(ref);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
-  }
-  return horizontal_add_16x8(vec_accum);
-}
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
+
+#undef SAD_WXH_AVG_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000000..fbc0b8d75f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,247 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      diff0 = vabdq_u8(s0, r0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      diff1 = vabdq_u8(s1, r1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+                                                int src_stride,
+                                                const uint8_t *ref_ptr,
+                                                int ref_stride, int h) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    diff0 = vabdq_u8(s0, r0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    diff1 = vabdq_u8(s1, r1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h)                                         \
+  unsigned int vpx_sad##w##x##h##_neon_dotprod(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,              \
+      int ref_stride) {                                                    \
+    return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+  }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h)                          \
+  unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod(            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,    \
+      int ref_stride) {                                          \
+    return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+                                       2 * ref_stride, (h) / 2); \
+  }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+                                                   int src_stride,
+                                                   const uint8_t *ref_ptr,
+                                                   int ref_stride, int w, int h,
+                                                   const uint8_t *second_pred) {
+  // Only two accumulators are required for optimal instruction throughput of
+  // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+                                 h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred) {
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    r0 = vld1q_u8(ref_ptr);
+    p0 = vld1q_u8(second_pred);
+    avg0 = vrhaddq_u8(r0, p0);
+    diff0 = vabdq_u8(s0, avg0);
+    sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+
+    s1 = vld1q_u8(src_ptr);
+    r1 = vld1q_u8(ref_ptr);
+    p1 = vld1q_u8(second_pred);
+    avg1 = vrhaddq_u8(r1, p1);
+    diff1 = vabdq_u8(s1, avg1);
+    sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h)                                        \
+  uint32_t vpx_sad##w##x##h##_avg_neon_dotprod(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+                                       second_pred);                          \
+  }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c
new file mode 100644
index 0000000000..2dd57e596c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c
@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+                                 uint32x4_t *sse) {
+  uint8x16_t s = vld1q_u8(src);
+  uint8x16_t r = vld1q_u8(ref);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+  uint8x8_t abs_diff_lo = vget_low_u8(abs_diff);
+  uint8x8_t abs_diff_hi = vget_high_u8(abs_diff);
+
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo));
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
+}
+
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+                                uint32x4_t *sse) {
+  uint8x8_t s = vld1_u8(src);
+  uint8x8_t r = vld1_u8(ref);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+                                const uint8_t *ref, int ref_stride,
+                                uint32x4_t *sse) {
+  uint8x8_t s = load_unaligned_u8(src, src_stride);
+  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int width, int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  if ((width & 0x07) && ((width & 0x07) < 5)) {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon(src + j, ref + j, &sse);
+        sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse);
+        j += 8;
+      } while (j + 4 < width);
+
+      sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse);
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  } else {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon(src + j, ref + j, &sse);
+        j += 8;
+      } while (j < width);
+
+      src += src_stride;
+      ref += ref_stride;
+    } while (--i != 0);
+  }
+  return horizontal_add_uint32x4(sse);
+}
+
+static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon(src, ref, &sse[0]);
+    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+    sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+    sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon(src, ref, &sse[0]);
+    sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_16x1_neon(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  int i = height;
+  do {
+    sse_8x1_neon(src, ref, &sse);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(sse);
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    int height) {
+  uint32x4_t sse = vdupq_n_u32(0);
+
+  int i = height;
+  do {
+    sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(sse);
+}
+
+int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+                     int ref_stride, int width, int height) {
+  switch (width) {
+    case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+    case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+    case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+    case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+    case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+    default:
+      return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 0000000000..8777773918
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                         uint32x4_t *sse) {
+  uint8x16_t s = vld1q_u8(src);
+  uint8x16_t r = vld1q_u8(ref);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+
+  *sse = vdotq_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+                                        uint32x2_t *sse) {
+  uint8x8_t s = vld1_u8(src);
+  uint8x8_t r = vld1_u8(ref);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+                                        const uint8_t *ref, int ref_stride,
+                                        uint32x2_t *sse) {
+  uint8x8_t s = load_unaligned_u8(src, src_stride);
+  uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+  uint8x8_t abs_diff = vabd_u8(s, r);
+
+  *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int width, int height) {
+  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  if ((width & 0x07) && ((width & 0x07) < 5)) {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+        sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+                             &sse[1]);
+        j += 8;
+      } while (j + 4 < width);
+
+      sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  } else {
+    int i = height;
+    do {
+      int j = 0;
+      do {
+        sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+        sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+                             &sse[1]);
+        j += 8;
+      } while (j < width);
+
+      src += 2 * src_stride;
+      ref += 2 * ref_stride;
+      i -= 2;
+    } while (i != 0);
+  }
+  return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+    sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+    sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+
+    src += src_stride;
+    ref += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                             const uint8_t *ref, int ref_stride,
+                                             int height) {
+  uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_16x1_neon_dotprod(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_16x1_neon_dotprod(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  int i = height;
+  do {
+    sse_8x1_neon_dotprod(src, ref, &sse[0]);
+    src += src_stride;
+    ref += ref_stride;
+    sse_8x1_neon_dotprod(src, ref, &sse[1]);
+    src += src_stride;
+    ref += ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+                                            const uint8_t *ref, int ref_stride,
+                                            int height) {
+  uint32x2_t sse = vdup_n_u32(0);
+
+  int i = height;
+  do {
+    sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  return horizontal_add_uint32x2(sse);
+}
+
+int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride, int width,
+                             int height) {
+  switch (width) {
+    case 4:
+      return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 8:
+      return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 16:
+      return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 32:
+      return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    case 64:
+      return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+    default:
+      return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
+                                  height);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
index f044e11a15..d92f1615d7 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -12,122 +12,478 @@
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
-#include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
 #include "vpx_dsp/variance.h"
+#include "vpx_dsp/arm/mem_neon.h"
 
-static const uint8_t bilinear_filters[8][2] = {
-  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
-  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
-};
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
 
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      unsigned int output_width,
-                                      const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; ++i) {
-    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
-    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    i -= 2;
+  } while (i != 0);
+}
+
+// Process a block exactly 8 wide and any height.
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                      int src_stride, int pixel_step,
+                                      int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+    vst1_u8(dst_ptr, blend_u8);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+  } while (--i != 0);
+}
+
+// Process a block which is a mutiple of 16 wide and any height.
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+                                         uint8_t *dst_ptr, int src_stride,
+                                         int pixel_step, int dst_width,
+                                         int dst_height, int filter_offset) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l =
+          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+      uint16x8_t blend_h =
+          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+      uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+      uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+      vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+                               dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+                               dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                       int src_stride, int pixel_step,
+                                       int dst_height, int filter_offset) {
+  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+                               dst_height, filter_offset);
+}
+
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                   int src_stride, int pixel_step,
+                                   int dst_width, int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                   \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
+    uint8_t tmp0[w * (h + padding)];                                     \
+    uint8_t tmp1[w * h];                                                 \
+    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                xoffset);                                \
+    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
   }
-}
 
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
-                                       uint8_t *output_ptr,
-                                       unsigned int src_pixels_per_line,
-                                       int pixel_step,
-                                       unsigned int output_height,
-                                       unsigned int output_width,
-                                       const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
-  unsigned int i, j;
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 16) {
-      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
-      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
-      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
-      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
-      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
-      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
-      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
-      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
-    }
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_neon(                       \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
+    if (xoffset == 0) {                                                      \
+      if (yoffset == 0) {                                                    \
+        return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp[w * h];                                                  \
+        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
+      } else {                                                               \
+        uint8_t tmp[w * h];                                                  \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
+                                    yoffset);                                \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
+      }                                                                      \
+    } else if (xoffset == 4) {                                               \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      } else {                                                               \
+        uint8_t tmp1[w * (h + padding)];                                     \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      }                                                                      \
+    } else {                                                                 \
+      uint8_t tmp0[w * (h + padding)];                                       \
+      if (yoffset == 0) {                                                    \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
+      } else if (yoffset == 4) {                                             \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset);                                \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      } else {                                                               \
+        uint8_t tmp1[w * h];                                                 \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+                                    xoffset);                                \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
+      }                                                                      \
+    }                                                                        \
   }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+    uint8x8_t p = vld1_u8(second_pred);
+    uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += 2 * src_stride;
+    dst_ptr += 2 * 4;
+    second_pred += 2 * 4;
+    i -= 2;
+  } while (i != 0);
 }
 
-unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *dst, int dst_stride,
-                                            unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int pixel_step, int dst_height,
+                                               int filter_offset,
+                                               const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
 
-  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
-                            bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
-                            bilinear_filters[yoffset]);
-  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+  int i = dst_height;
+  do {
+    uint8x8_t s0 = vld1_u8(src_ptr);
+    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+    uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+    uint8x8_t p = vld1_u8(second_pred);
+    uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+    vst1_u8(dst_ptr, avg);
+
+    src_ptr += src_stride;
+    dst_ptr += 8;
+    second_pred += 8;
+  } while (--i > 0);
 }
 
-unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_width, int dst_height, int filter_offset,
+    const uint8_t *second_pred) {
+  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+  const uint8x8_t f1 = vdup_n_u8(filter_offset);
 
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
-                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
-                             bilinear_filters[yoffset]);
-  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+  int i = dst_height;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint16x8_t blend_l =
+          vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+      uint16x8_t blend_h =
+          vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+      uint8x16_t blend_u8 =
+          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
 }
 
-unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
-                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
-                             bilinear_filters[yoffset]);
-  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 16, dst_height,
+                                        filter_offset, second_pred);
 }
 
-unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
-                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
-                             bilinear_filters[yoffset]);
-  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 32, dst_height,
+                                        filter_offset, second_pred);
 }
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+    int dst_height, int filter_offset, const uint8_t *second_pred) {
+  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+                                        pixel_step, 64, dst_height,
+                                        filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+                                            uint8_t *dst_ptr, int src_stride,
+                                            int pixel_step, int dst_width,
+                                            int dst_height,
+                                            const uint8_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr + j);
+      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+      uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+      uint8x16_t p = vld1q_u8(second_pred);
+      avg = vrhaddq_u8(avg, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+                     int dst_width, int dst_height,
+                     const uint8_t *second_pred) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s = vld1q_u8(src_ptr + j);
+      uint8x16_t p = vld1q_u8(second_pred);
+
+      uint8x16_t avg = vrhaddq_u8(s, p);
+
+      vst1q_u8(dst_ptr + j, avg);
+
+      j += 16;
+      second_pred += 16;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                  \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
+      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
+      const uint8_t *second_pred) {                                         \
+    uint8_t tmp0[w * (h + padding)];                                        \
+    uint8_t tmp1[w * h];                                                    \
+    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+                                xoffset);                                   \
+    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,      \
+                                         second_pred);                      \
+    return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
+  }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon(                     \
+      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
+      const uint8_t *ref, int ref_stride, unsigned int *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    if (xoffset == 0) {                                                        \
+      uint8_t tmp[w * h];                                                      \
+      if (yoffset == 0) {                                                      \
+        avg_pred(src, tmp, source_stride, w, h, second_pred);                  \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      } else if (yoffset == 4) {                                               \
+        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,               \
+                                        source_stride, w, h, second_pred);     \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      } else {                                                                 \
+        avg_pred_var_filter_block2d_bil_w##w(                                  \
+            src, tmp, source_stride, source_stride, h, yoffset, second_pred);  \
+        return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse);            \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,     \
+                                        second_pred);                          \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      } else {                                                                 \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
+                                             second_pred);                     \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      }                                                                        \
+    } else {                                                                   \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,   \
+                                             xoffset, second_pred);            \
+        return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse);           \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
+                                    (h + padding), xoffset);                   \
+        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred);  \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      } else {                                                                 \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,               \
+                                    (h + padding), xoffset);                   \
+        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,     \
+                                             second_pred);                     \
+        return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse);           \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
index ce81fb630f..2c008e48ab 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -9,71 +9,129 @@
  */
 
 #include <arm_neon.h>
+#include <assert.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
 
 void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
                              ptrdiff_t diff_stride, const uint8_t *src,
                              ptrdiff_t src_stride, const uint8_t *pred,
                              ptrdiff_t pred_stride) {
-  int r, c;
+  int r = rows, c;
 
   if (cols > 16) {
-    for (r = 0; r < rows; ++r) {
+    do {
       for (c = 0; c < cols; c += 32) {
-        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
-        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
-        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
-        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
-        const uint16x8_t v_diff_lo_00 =
-            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
-        const uint16x8_t v_diff_hi_00 =
-            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
-        const uint16x8_t v_diff_lo_16 =
-            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
-        const uint16x8_t v_diff_hi_16 =
-            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
-        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
-        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
-        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
-        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+        const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
+        const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
+        const uint8x16_t p0 = vld1q_u8(&pred[c + 0]);
+        const uint8x16_t p1 = vld1q_u8(&pred[c + 16]);
+        const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0));
+        const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0));
+        const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1));
+        const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1));
+        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0));
+        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1));
+        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2));
+        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3));
       }
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r);
   } else if (cols > 8) {
-    for (r = 0; r < rows; ++r) {
-      const uint8x16_t v_src = vld1q_u8(&src[0]);
-      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
-      const uint16x8_t v_diff_lo =
-          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
-      const uint16x8_t v_diff_hi =
-          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
-      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
-      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+    do {
+      const uint8x16_t s = vld1q_u8(&src[0]);
+      const uint8x16_t p = vld1q_u8(&pred[0]);
+      const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
+      const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0));
+      vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1));
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r);
   } else if (cols > 4) {
-    for (r = 0; r < rows; ++r) {
-      const uint8x8_t v_src = vld1_u8(&src[0]);
-      const uint8x8_t v_pred = vld1_u8(&pred[0]);
-      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+    do {
+      const uint8x8_t s = vld1_u8(&src[0]);
+      const uint8x8_t p = vld1_u8(&pred[0]);
+      const uint16x8_t v_diff = vsubl_u8(s, p);
       vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
       diff += diff_stride;
       pred += pred_stride;
       src += src_stride;
-    }
+    } while (--r);
   } else {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
-
-      diff += diff_stride;
-      pred += pred_stride;
-      src += src_stride;
-    }
+    assert(cols == 4);
+    do {
+      const uint8x8_t s = load_unaligned_u8(src, (int)src_stride);
+      const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride);
+      const uint16x8_t d = vsubl_u8(s, p);
+      vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d)));
+      vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d)));
+      diff += 2 * diff_stride;
+      pred += 2 * pred_stride;
+      src += 2 * src_stride;
+      r -= 2;
+    } while (r);
   }
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+                                    ptrdiff_t diff_stride,
+                                    const uint8_t *src8_ptr,
+                                    ptrdiff_t src_stride,
+                                    const uint8_t *pred8_ptr,
+                                    ptrdiff_t pred_stride, int bd) {
+  int r = rows, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+  (void)bd;
+
+  if (cols >= 16) {
+    do {
+      for (c = 0; c < cols; c += 16) {
+        const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+        const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+        const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+        const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+        const uint16x8_t d0 = vsubq_u16(s0, p0);
+        const uint16x8_t d1 = vsubq_u16(s1, p1);
+        vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+        vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols >= 8) {
+    do {
+      for (c = 0; c < cols; c += 8) {
+        const uint16x8_t s = vld1q_u16(&src[c]);
+        const uint16x8_t p = vld1q_u16(&pred[c]);
+        const uint16x8_t d0 = vsubq_u16(s, p);
+        vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  } else if (cols >= 4) {
+    do {
+      for (c = 0; c < cols; c += 4) {
+        const uint16x4_t s = vld1_u16(&src[c]);
+        const uint16x4_t p = vld1_u16(&pred[c]);
+        const uint16x4_t v_diff = vsub_u16(s, p);
+        vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+      }
+      diff_ptr += diff_stride;
+      pred += pred_stride;
+      src += src_stride;
+    } while (--r);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
new file mode 100644
index 0000000000..11821dc10e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h
@@ -0,0 +1,275 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_
+#define VPX_VPX_DSP_ARM_SUM_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t b = vpaddl_u8(a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  return vget_lane_u16(c, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t b = vpaddl_u8(a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  const uint16x4_t d = vpadd_u16(c, c);
+  return vget_lane_u16(d, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u8(a);
+#else
+  const uint16x8_t b = vpaddlq_u8(a);
+  const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b));
+  const uint16x4_t d = vpadd_u16(c, c);
+  const uint16x4_t e = vpadd_u16(d, d);
+  return vget_lane_u16(e, 0);
+#endif
+}
+
+static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_u16(a);
+#else
+  const uint16x4_t b = vpadd_u16(a, a);
+  const uint16x4_t c = vpadd_u16(b, b);
+  return vget_lane_u16(c, 0);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_s16(a);
+#else
+  const int32x4_t b = vpaddlq_s16(a);
+  const int64x2_t c = vpaddlq_s32(b);
+  const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+                               vreinterpret_s32_s64(vget_high_s64(c)));
+  return vget_lane_s32(d, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u16(a);
+#else
+  const uint32x4_t b = vpaddlq_u16(a);
+  const uint64x2_t c = vpaddlq_u32(b);
+  const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+                                vreinterpret_u32_u64(vget_high_u64(c)));
+  return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
+#if VPX_ARCH_AARCH64
+  const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+  const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+  const uint16x8_t b0 = vpaddq_u16(a0, a1);
+  return vpaddlq_u16(b0);
+#else
+  const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+  const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+  const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+  const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+  const uint16x4_t b0 = vpadd_u16(a0, a1);
+  const uint16x4_t b1 = vpadd_u16(a2, a3);
+  return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
+
+static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
+    const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+  const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
+  const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
+  const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
+  const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
+  const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
+  const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
+  const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
+  const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
+#if VPX_ARCH_AARCH64
+  const uint32x4_t c0 = vpaddq_u32(b0, b1);
+  const uint32x4_t c1 = vpaddq_u32(b2, b3);
+  return vpaddq_u32(c0, c1);
+#else
+  const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+  const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+  const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+  const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+  const uint32x2_t d0 = vpadd_u32(c0, c1);
+  const uint32x2_t d1 = vpadd_u32(c2, c3);
+  return vcombine_u32(d0, d1);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_s32(a);
+#else
+  return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddv_u32(a);
+#else
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif
+}
+
+static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_s32(a);
+#else
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
+#if VPX_ARCH_AARCH64
+  uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
+  uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
+  return vpaddq_u32(res01, res23);
+#else
+  uint32x4_t res = vdupq_n_u32(0);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
+  res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
+  return res;
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddlvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_s64(a);
+#else
+  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
+#if VPX_ARCH_AARCH64
+  return vaddvq_u64(a);
+#else
+  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) {
+  return horizontal_long_add_uint32x4(a[0]) +
+         horizontal_long_add_uint32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) {
+  uint64x2_t sum = vpaddlq_u32(a[0]);
+  sum = vpadalq_u32(sum, a[1]);
+  sum = vpadalq_u32(sum, a[2]);
+  sum = vpadalq_u32(sum, a[3]);
+
+  return horizontal_add_uint64x2(sum);
+}
+
+static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) {
+  uint64x2_t sum[2];
+  sum[0] = vpaddlq_u32(a[0]);
+  sum[1] = vpaddlq_u32(a[1]);
+  sum[0] = vpadalq_u32(sum[0], a[2]);
+  sum[1] = vpadalq_u32(sum[1], a[3]);
+  sum[0] = vpadalq_u32(sum[0], a[4]);
+  sum[1] = vpadalq_u32(sum[1], a[5]);
+  sum[0] = vpadalq_u32(sum[0], a[6]);
+  sum[1] = vpadalq_u32(sum[1], a[7]);
+
+  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t
+horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) {
+  uint64x2_t sum[2];
+  sum[0] = vpaddlq_u32(a[0]);
+  sum[1] = vpaddlq_u32(a[1]);
+  sum[0] = vpadalq_u32(sum[0], a[2]);
+  sum[1] = vpadalq_u32(sum[1], a[3]);
+  sum[0] = vpadalq_u32(sum[0], a[4]);
+  sum[1] = vpadalq_u32(sum[1], a[5]);
+  sum[0] = vpadalq_u32(sum[0], a[6]);
+  sum[1] = vpadalq_u32(sum[1], a[7]);
+  sum[0] = vpadalq_u32(sum[0], a[8]);
+  sum[1] = vpadalq_u32(sum[1], a[9]);
+  sum[0] = vpadalq_u32(sum[0], a[10]);
+  sum[1] = vpadalq_u32(sum[1], a[11]);
+  sum[0] = vpadalq_u32(sum[0], a[12]);
+  sum[1] = vpadalq_u32(sum[1], a[13]);
+  sum[0] = vpadalq_u32(sum[0], a[14]);
+  sum[1] = vpadalq_u32(sum[1], a[15]);
+
+  return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+#endif  // VPX_VPX_DSP_ARM_SUM_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..074afe3258
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,100 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) {
+  if (size == 4) {
+    int16x4_t s[4];
+    int32x4_t sum_s32;
+
+    s[0] = vld1_s16(src + 0 * stride);
+    s[1] = vld1_s16(src + 1 * stride);
+    s[2] = vld1_s16(src + 2 * stride);
+    s[3] = vld1_s16(src + 3 * stride);
+
+    sum_s32 = vmull_s16(s[0], s[0]);
+    sum_s32 = vmlal_s16(sum_s32, s[1], s[1]);
+    sum_s32 = vmlal_s16(sum_s32, s[2], s[2]);
+    sum_s32 = vmlal_s16(sum_s32, s[3], s[3]);
+
+    return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32));
+  } else {
+    uint64x2_t sum_u64 = vdupq_n_u64(0);
+    int rows = size;
+
+    do {
+      const int16_t *src_ptr = src;
+      int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+      int cols = size;
+
+      do {
+        int16x8_t s[8];
+
+        s[0] = vld1q_s16(src_ptr + 0 * stride);
+        s[1] = vld1q_s16(src_ptr + 1 * stride);
+        s[2] = vld1q_s16(src_ptr + 2 * stride);
+        s[3] = vld1q_s16(src_ptr + 3 * stride);
+        s[4] = vld1q_s16(src_ptr + 4 * stride);
+        s[5] = vld1q_s16(src_ptr + 5 * stride);
+        s[6] = vld1q_s16(src_ptr + 6 * stride);
+        s[7] = vld1q_s16(src_ptr + 7 * stride);
+
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6]));
+        sum_s32[0] =
+            vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7]));
+
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6]));
+        sum_s32[1] =
+            vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7]));
+
+        src_ptr += 8;
+        cols -= 8;
+      } while (cols);
+
+      sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0]));
+      sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1]));
+      src += 8 * stride;
+      rows -= 8;
+    } while (rows);
+
+    return horizontal_add_uint64x2(sum_u64);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..a18cbbd736
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_dsp/arm/vpx_neon_sve_bridge.h"
+
+uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) {
+  if (size == 4) {
+    int16x4_t s[4];
+    int64x2_t sum = vdupq_n_s64(0);
+
+    s[0] = vld1_s16(src + 0 * stride);
+    s[1] = vld1_s16(src + 1 * stride);
+    s[2] = vld1_s16(src + 2 * stride);
+    s[3] = vld1_s16(src + 3 * stride);
+
+    int16x8_t s01 = vcombine_s16(s[0], s[1]);
+    int16x8_t s23 = vcombine_s16(s[2], s[3]);
+
+    sum = vpx_dotq_s16(sum, s01, s01);
+    sum = vpx_dotq_s16(sum, s23, s23);
+
+    return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum));
+  } else {
+    int rows = size;
+    int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                         vdupq_n_s64(0) };
+
+    do {
+      const int16_t *src_ptr = src;
+      int cols = size;
+
+      do {
+        int16x8_t s[8];
+        load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+                     &s[6], &s[7]);
+
+        sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]);
+        sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]);
+        sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]);
+        sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]);
+        sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]);
+        sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]);
+        sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]);
+        sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]);
+
+        src_ptr += 8;
+        cols -= 8;
+      } while (cols);
+
+      src += 8 * stride;
+      rows -= 8;
+    } while (rows);
+
+    sum[0] = vaddq_s64(sum[0], sum[1]);
+    sum[2] = vaddq_s64(sum[2], sum[3]);
+    sum[0] = vaddq_s64(sum[0], sum[2]);
+
+    return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0]));
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
index 8366ce50b8..b6b46fc47e 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_
-#define VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
+#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
 
 #include <arm_neon.h>
 
@@ -23,44 +23,77 @@
 // b0.val[1]: 04 05 06 07 20 21 22 23
 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_s16_s64(
+      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+  b0.val[1] = vreinterpretq_s16_s64(
+      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
   b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                            vreinterpret_s16_s32(vget_low_s32(a1)));
   b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                            vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
   return b0;
 }
 
 static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
   int32x4x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_s32_s64(
+      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+  b0.val[1] = vreinterpretq_s32_s64(
+      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
   b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
   b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
   return b0;
 }
 
 static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
   int64x2x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+  b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+#else
   b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                            vreinterpret_s64_s32(vget_low_s32(a1)));
   b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                            vreinterpret_s64_s32(vget_high_s32(a1)));
+#endif
   return b0;
 }
 
 static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
   uint8x16x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_u8_u64(
+      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+  b0.val[1] = vreinterpretq_u8_u64(
+      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
   b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                           vreinterpret_u8_u32(vget_low_u32(a1)));
   b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                           vreinterpret_u8_u32(vget_high_u32(a1)));
+#endif
   return b0;
 }
 
 static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
   uint16x8x2_t b0;
+#if VPX_ARCH_AARCH64
+  b0.val[0] = vreinterpretq_u16_u64(
+      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+  b0.val[1] = vreinterpretq_u16_u64(
+      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
   b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                            vreinterpret_u16_u32(vget_low_u32(a1)));
   b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                            vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
   return b0;
 }
 
@@ -138,20 +171,16 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
       vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));
 
   // Swap 64 bit elements resulting in:
-  // c0.val[0]: 00 01 20 21  02 03 22 23
-  // c0.val[1]: 10 11 30 31  12 13 32 33
+  // c0: 00 01 20 21  02 03 22 23
+  // c1: 10 11 30 31  12 13 32 33
 
-  const int32x4_t c0 =
-      vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
-  const int32x4_t c1 =
-      vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));
+  const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);
 
   // Swap 16 bit elements resulting in:
   // d0.val[0]: 00 10 20 30  02 12 22 32
   // d0.val[1]: 01 11 21 31  03 13 23 33
 
-  const int16x8x2_t d0 =
-      vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));
+  const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);
 
   *a0 = d0.val[0];
   *a1 = d0.val[1];
@@ -169,20 +198,16 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
       vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));
 
   // Swap 64 bit elements resulting in:
-  // c0.val[0]: 00 01 20 21  02 03 22 23
-  // c0.val[1]: 10 11 30 31  12 13 32 33
+  // c0: 00 01 20 21  02 03 22 23
+  // c1: 10 11 30 31  12 13 32 33
 
-  const uint32x4_t c0 =
-      vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
-  const uint32x4_t c1 =
-      vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));
+  const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);
 
   // Swap 16 bit elements resulting in:
   // d0.val[0]: 00 10 20 30  02 12 22 32
   // d0.val[1]: 01 11 21 31  03 13 23 33
 
-  const uint16x8x2_t d0 =
-      vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));
+  const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);
 
   *a0 = d0.val[0];
   *a1 = d0.val[1];
@@ -281,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                      const int16x4_t a6, const int16x4_t a7,
                                      int16x8_t *const o0, int16x8_t *const o1,
                                      int16x8_t *const o2, int16x8_t *const o3) {
-  // Swap 16 bit elements. Goes from:
+  // Combine rows. Goes from:
   // a0: 00 01 02 03
   // a1: 10 11 12 13
   // a2: 20 21 22 23
@@ -291,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
   // a6: 60 61 62 63
   // a7: 70 71 72 73
   // to:
-  // b0.val[0]: 00 10 02 12
-  // b0.val[1]: 01 11 03 13
-  // b1.val[0]: 20 30 22 32
-  // b1.val[1]: 21 31 23 33
-  // b2.val[0]: 40 50 42 52
-  // b2.val[1]: 41 51 43 53
-  // b3.val[0]: 60 70 62 72
-  // b3.val[1]: 61 71 63 73
+  // b0: 00 01 02 03 40 41 42 43
+  // b1: 10 11 12 13 50 51 52 53
+  // b2: 20 21 22 23 60 61 62 63
+  // b3: 30 31 32 33 70 71 72 73
 
-  const int16x4x2_t b0 = vtrn_s16(a0, a1);
-  const int16x4x2_t b1 = vtrn_s16(a2, a3);
-  const int16x4x2_t b2 = vtrn_s16(a4, a5);
-  const int16x4x2_t b3 = vtrn_s16(a6, a7);
+  const int16x8_t b0 = vcombine_s16(a0, a4);
+  const int16x8_t b1 = vcombine_s16(a1, a5);
+  const int16x8_t b2 = vcombine_s16(a2, a6);
+  const int16x8_t b3 = vcombine_s16(a3, a7);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 02 12 40 50 42 52
+  // c0.val[1]: 01 11 03 13 41 51 43 53
+  // c1.val[0]: 20 30 22 32 60 70 62 72
+  // c1.val[1]: 21 31 23 33 61 71 63 73
+
+  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+  const int16x8x2_t c1 = vtrnq_s16(b2, b3);
 
   // Swap 32 bit elements resulting in:
-  // c0.val[0]: 00 10 20 30
-  // c0.val[1]: 02 12 22 32
-  // c1.val[0]: 01 11 21 31
-  // c1.val[1]: 03 13 23 33
-  // c2.val[0]: 40 50 60 70
-  // c2.val[1]: 42 52 62 72
-  // c3.val[0]: 41 51 61 71
-  // c3.val[1]: 43 53 63 73
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 02 12 22 32 42 52 62 72
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 03 13 23 33 43 53 63 73
 
-  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
-                                  vreinterpret_s32_s16(b1.val[0]));
-  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
-                                  vreinterpret_s32_s16(b1.val[1]));
-  const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
-                                  vreinterpret_s32_s16(b3.val[0]));
-  const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
-                                  vreinterpret_s32_s16(b3.val[1]));
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
 
-  // Swap 64 bit elements resulting in:
-  // o0: 00 10 20 30 40 50 60 70
-  // o1: 01 11 21 31 41 51 61 71
-  // o2: 02 12 22 32 42 52 62 72
-  // o3: 03 13 23 33 43 53 63 73
-
-  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
-                     vreinterpret_s16_s32(c2.val[0]));
-  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
-                     vreinterpret_s16_s32(c3.val[0]));
-  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
-                     vreinterpret_s16_s32(c2.val[1]));
-  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
-                     vreinterpret_s16_s32(c3.val[1]));
+  *o0 = vreinterpretq_s16_s32(d0.val[0]);
+  *o1 = vreinterpretq_s16_s32(d1.val[0]);
+  *o2 = vreinterpretq_s16_s32(d0.val[1]);
+  *o3 = vreinterpretq_s16_s32(d1.val[1]);
 }
 
 static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
@@ -512,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
   *a7 = vreinterpretq_s32_s64(c3.val[1]);
 }
 
-// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
-// 'q' registers here to save some instructions.
 static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                     uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                     uint8x8_t *a6, uint8x8_t *a7) {
-  // Swap 8 bit elements. Goes from:
+  // Widen to 128-bit registers (usually a no-op once inlined.)
+  const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0));
+  const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0));
+  const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0));
+  const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0));
+  const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0));
+  const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0));
+  const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0));
+  const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0));
+
+  // Zip 8 bit elements. Goes from:
   // a0: 00 01 02 03 04 05 06 07
   // a1: 10 11 12 13 14 15 16 17
   // a2: 20 21 22 23 24 25 26 27
@@ -527,47 +547,115 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
   // a6: 60 61 62 63 64 65 66 67
   // a7: 70 71 72 73 74 75 76 77
   // to:
-  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
-  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
-  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
-  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
+  // b0: 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  // b1: 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  // b2: 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+  // b3: 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+  const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0];
+  const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0];
+  const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0];
+  const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0];
 
-  const uint8x16x2_t b0 =
-      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
-  const uint8x16x2_t b1 =
-      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+  // Zip 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  // c0.val[1]: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  // c1.val[0]: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  // c1.val[1]: 44 54 64 74 45 55 65 75  46 66 56 76 47 67 57 77
+  const uint16x8x2_t c0 =
+      vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1));
+  const uint16x8x2_t c1 =
+      vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3));
 
-  // Swap 16 bit elements resulting in:
-  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
-  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
-  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
-  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
-
-  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
-                                    vreinterpretq_u16_u8(b1.val[0]));
-  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
-                                    vreinterpretq_u16_u8(b1.val[1]));
-
-  // Unzip 32 bit elements resulting in:
+  // Zip 32 bit elements resulting in:
   // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d0.val[1]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // d1.val[0]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
   // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+  const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                     vreinterpretq_u32_u16(c1.val[0]));
-  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+  const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                     vreinterpretq_u32_u16(c1.val[1]));
 
   *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
   *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
-  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
-  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
-  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
-  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+  *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+  *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+  *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
   *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
   *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
 }
 
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+  // b2.val[0]: 40 50 42 52 44 54 46 56
+  // b2.val[1]: 41 51 43 53 45 55 47 57
+  // b3.val[0]: 60 70 62 72 64 74 66 76
+  // b3.val[1]: 61 71 63 73 65 75 67 77
+
+  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+  // c2.val[0]: 40 50 60 70 44 54 64 74
+  // c2.val[1]: 42 52 62 72 46 56 66 76
+  // c3.val[0]: 41 51 61 71 45 55 65 75
+  // c3.val[1]: 43 53 63 73 47 57 67 77
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+                                   vreinterpretq_s32_s16(b3.val[0]));
+  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+                                   vreinterpretq_s32_s16(b3.val[1]));
+
+  // Swap 64 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 04 14 24 34 44 54 64 74
+  // d1.val[0]: 01 11 21 31 41 51 61 71
+  // d1.val[1]: 05 15 25 35 45 55 65 75
+  // d2.val[0]: 02 12 22 32 42 52 62 72
+  // d2.val[1]: 06 16 26 36 46 56 66 76
+  // d3.val[0]: 03 13 23 33 43 53 63 73
+  // d3.val[1]: 07 17 27 37 47 57 67 77
+
+  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+  out[0] = d0.val[0];
+  out[1] = d1.val[0];
+  out[2] = d2.val[0];
+  out[3] = d3.val[0];
+  out[4] = d0.val[1];
+  out[5] = d1.val[1];
+  out[6] = d2.val[1];
+  out[7] = d3.val[1];
+}
+
 static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                      int16x8_t *a2, int16x8_t *a3,
                                      int16x8_t *a4, int16x8_t *a5,
@@ -624,6 +712,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
   // d2.val[1]: 06 16 26 36 46 56 66 76
   // d3.val[0]: 03 13 23 33 43 53 63 73
   // d3.val[1]: 07 17 27 37 47 57 67 77
+
   const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
   const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
   const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
@@ -695,6 +784,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
   // d2.val[1]: 06 16 26 36 46 56 66 76
   // d3.val[0]: 03 13 23 33 43 53 63 73
   // d3.val[1]: 07 17 27 37 47 57 67 77
+
   const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
   const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
   const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
@@ -710,6 +800,190 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
   *a7 = d3.val[1];
 }
 
+static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
+                                     int32x4x2_t *a2, int32x4x2_t *a3,
+                                     int32x4x2_t *a4, int32x4x2_t *a5,
+                                     int32x4x2_t *a6, int32x4x2_t *a7) {
+  // Swap 32 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // a4: 40 41 42 43 44 45 46 47
+  // a5: 50 51 52 53 54 55 56 57
+  // a6: 60 61 62 63 64 65 66 67
+  // a7: 70 71 72 73 74 75 76 77
+  // to:
+  // b0: 00 10 02 12 01 11 03 13
+  // b1: 20 30 22 32 21 31 23 33
+  // b2: 40 50 42 52 41 51 43 53
+  // b3: 60 70 62 72 61 71 63 73
+  // b4: 04 14 06 16 05 15 07 17
+  // b5: 24 34 26 36 25 35 27 37
+  // b6: 44 54 46 56 45 55 47 57
+  // b7: 64 74 66 76 65 75 67 77
+
+  const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
+  const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
+  const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
+  const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
+  const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
+  const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
+  const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
+  const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);
+
+  // Swap 64 bit elements resulting in:
+  // c0: 00 10 20 30 02 12 22 32
+  // c1: 01 11 21 31 03 13 23 33
+  // c2: 40 50 60 70 42 52 62 72
+  // c3: 41 51 61 71 43 53 63 73
+  // c4: 04 14 24 34 06 16 26 36
+  // c5: 05 15 25 35 07 17 27 37
+  // c6: 44 54 64 74 46 56 66 76
+  // c7: 45 55 65 75 47 57 67 77
+  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+  const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
+  const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
+  const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
+  const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
+  const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
+  const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);
+
+  // Swap 128 bit elements resulting in:
+  // a0: 00 10 20 30 40 50 60 70
+  // a1: 01 11 21 31 41 51 61 71
+  // a2: 02 12 22 32 42 52 62 72
+  // a3: 03 13 23 33 43 53 63 73
+  // a4: 04 14 24 34 44 54 64 74
+  // a5: 05 15 25 35 45 55 65 75
+  // a6: 06 16 26 36 46 56 66 76
+  // a7: 07 17 27 37 47 57 67 77
+  a0->val[0] = c0.val[0];
+  a0->val[1] = c2.val[0];
+  a1->val[0] = c1.val[0];
+  a1->val[1] = c3.val[0];
+  a2->val[0] = c0.val[1];
+  a2->val[1] = c2.val[1];
+  a3->val[0] = c1.val[1];
+  a3->val[1] = c3.val[1];
+  a4->val[0] = c4.val[0];
+  a4->val[1] = c6.val[0];
+  a5->val[0] = c5.val[0];
+  a5->val[1] = c7.val[0];
+  a6->val[0] = c4.val[1];
+  a6->val[1] = c6.val[1];
+  a7->val[0] = c5.val[1];
+  a7->val[1] = c7.val[1];
+}
+
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+                                       int32x4_t *right /*[8]*/,
+                                       int32x4_t *out_left /*[8]*/,
+                                       int32x4_t *out_right /*[8]*/) {
+  int32x4x2_t out[8];
+
+  out[0].val[0] = left[0];
+  out[0].val[1] = right[0];
+  out[1].val[0] = left[1];
+  out[1].val[1] = right[1];
+  out[2].val[0] = left[2];
+  out[2].val[1] = right[2];
+  out[3].val[0] = left[3];
+  out[3].val[1] = right[3];
+  out[4].val[0] = left[4];
+  out[4].val[1] = right[4];
+  out[5].val[0] = left[5];
+  out[5].val[1] = right[5];
+  out[6].val[0] = left[6];
+  out[6].val[1] = right[6];
+  out[7].val[0] = left[7];
+  out[7].val[1] = right[7];
+
+  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+                    &out[6], &out[7]);
+
+  out_left[0] = out[0].val[0];
+  out_left[1] = out[1].val[0];
+  out_left[2] = out[2].val[0];
+  out_left[3] = out[3].val[0];
+  out_left[4] = out[4].val[0];
+  out_left[5] = out[5].val[0];
+  out_left[6] = out[6].val[0];
+  out_left[7] = out[7].val[0];
+  out_right[0] = out[0].val[1];
+  out_right[1] = out[1].val[1];
+  out_right[2] = out[2].val[1];
+  out_right[3] = out[3].val[1];
+  out_right[4] = out[4].val[1];
+  out_right[5] = out[5].val[1];
+  out_right[6] = out[6].val[1];
+  out_right[7] = out[7].val[1];
+}
+
+static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1,
+                                       int32x4_t *left2, int32x4_t *right2) {
+  int32x4_t tl[16], tr[16];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  tl[0] = left1[8];
+  tl[1] = left1[9];
+  tl[2] = left1[10];
+  tl[3] = left1[11];
+  tl[4] = left1[12];
+  tl[5] = left1[13];
+  tl[6] = left1[14];
+  tl[7] = left1[15];
+  tr[0] = right1[8];
+  tr[1] = right1[9];
+  tr[2] = right1[10];
+  tr[3] = right1[11];
+  tr[4] = right1[12];
+  tr[5] = right1[13];
+  tr[6] = right1[14];
+  tr[7] = right1[15];
+
+  left1[8] = left2[0];
+  left1[9] = left2[1];
+  left1[10] = left2[2];
+  left1[11] = left2[3];
+  left1[12] = left2[4];
+  left1[13] = left2[5];
+  left1[14] = left2[6];
+  left1[15] = left2[7];
+  right1[8] = right2[0];
+  right1[9] = right2[1];
+  right1[10] = right2[2];
+  right1[11] = right2[3];
+  right1[12] = right2[4];
+  right1[13] = right2[5];
+  right1[14] = right2[6];
+  right1[15] = right2[7];
+
+  left2[0] = tl[0];
+  left2[1] = tl[1];
+  left2[2] = tl[2];
+  left2[3] = tl[3];
+  left2[4] = tl[4];
+  left2[5] = tl[5];
+  left2[6] = tl[6];
+  left2[7] = tl[7];
+  right2[0] = tr[0];
+  right2[1] = tr[1];
+  right2[2] = tr[2];
+  right2[3] = tr[3];
+  right2[4] = tr[4];
+  right2[5] = tr[5];
+  right2[6] = tr[6];
+  right2[7] = tr[7];
+
+  transpose_s32_8x8_2(left1, right1, left1, right1);
+  transpose_s32_8x8_2(left2, right2, left2, right2);
+  transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8);
+  transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8);
+}
+
 static INLINE void transpose_u8_16x8(
     const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
     const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
@@ -1107,6 +1381,45 @@ static INLINE void transpose_u8_16x16(
   *o15 = e7.val[1];
 }
 
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t t[8];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  t[0] = in0[8];
+  t[1] = in0[9];
+  t[2] = in0[10];
+  t[3] = in0[11];
+  t[4] = in0[12];
+  t[5] = in0[13];
+  t[6] = in0[14];
+  t[7] = in0[15];
+  in0[8] = in1[0];
+  in0[9] = in1[1];
+  in0[10] = in1[2];
+  in0[11] = in1[3];
+  in0[12] = in1[4];
+  in0[13] = in1[5];
+  in0[14] = in1[6];
+  in0[15] = in1[7];
+  in1[0] = t[0];
+  in1[1] = t[1];
+  in1[2] = t[2];
+  in1[3] = t[3];
+  in1[4] = t[4];
+  in1[5] = t[5];
+  in1[6] = t[6];
+  in1[7] = t[7];
+
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
 static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                              const int a_stride, uint8x8_t *a0,
                                              uint8x8_t *a1, uint8x8_t *a2,
@@ -1204,4 +1517,190 @@ static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
 
   transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
 }
-#endif  // VPX_DSP_ARM_TRANSPOSE_NEON_H_
+
+static INLINE void load_and_transpose_s32_8x8(
+    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
+    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
+    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
+    int32x4x2_t *const a7) {
+  a0->val[0] = vld1q_s32(a);
+  a0->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a1->val[0] = vld1q_s32(a);
+  a1->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a2->val[0] = vld1q_s32(a);
+  a2->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a3->val[0] = vld1q_s32(a);
+  a3->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a4->val[0] = vld1q_s32(a);
+  a4->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a5->val[0] = vld1q_s32(a);
+  a5->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a6->val[0] = vld1q_s32(a);
+  a6->val[1] = vld1q_s32(a + 4);
+  a += a_stride;
+  a7->val[0] = vld1q_s32(a);
+  a7->val[1] = vld1q_s32(a + 4);
+
+  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void transpose_concat_s16_4x4(const int16x4_t a0,
+                                            const int16x4_t a1,
+                                            const int16x4_t a2,
+                                            const int16x4_t a3, int16x8_t *b0,
+                                            int16x8_t *b1) {
+  // Transpose 16-bit elements:
+  // a0: 00, 01, 02, 03
+  // a1: 10, 11, 12, 13
+  // a2: 20, 21, 22, 23
+  // a3: 30, 31, 32, 33
+  //
+  // b0: 00 10 20 30 01 11 21 31
+  // b1: 02 12 22 32 03 13 23 33
+
+  int16x8_t a0q = vcombine_s16(a0, vdup_n_s16(0));
+  int16x8_t a1q = vcombine_s16(a1, vdup_n_s16(0));
+  int16x8_t a2q = vcombine_s16(a2, vdup_n_s16(0));
+  int16x8_t a3q = vcombine_s16(a3, vdup_n_s16(0));
+
+  int16x8_t a02 = vzipq_s16(a0q, a2q).val[0];
+  int16x8_t a13 = vzipq_s16(a1q, a3q).val[0];
+
+  int16x8x2_t a0123 = vzipq_s16(a02, a13);
+
+  *b0 = a0123.val[0];
+  *b1 = a0123.val[1];
+}
+
+static INLINE void transpose_concat_s16_8x4(const int16x8_t a0,
+                                            const int16x8_t a1,
+                                            const int16x8_t a2,
+                                            const int16x8_t a3, int16x8_t *b0,
+                                            int16x8_t *b1, int16x8_t *b2,
+                                            int16x8_t *b3) {
+  // Transpose 16-bit elements:
+  // a0: 00, 01, 02, 03, 04, 05, 06, 07
+  // a1: 10, 11, 12, 13, 14, 15, 16, 17
+  // a2: 20, 21, 22, 23, 24, 25, 26, 27
+  // a3: 30, 31, 32, 33, 34, 35, 36, 37
+  //
+  // b0: 00 10 20 30 01 11 21 31
+  // b1: 02 12 22 32 03 13 23 33
+  // b2: 04 14 24 34 05 15 25 35
+  // b3: 06 16 26 36 07 17 27 37
+
+  int16x8x2_t a02 = vzipq_s16(a0, a2);
+  int16x8x2_t a13 = vzipq_s16(a1, a3);
+
+  int16x8x2_t a0123_lo = vzipq_s16(a02.val[0], a13.val[0]);
+  int16x8x2_t a0123_hi = vzipq_s16(a02.val[1], a13.val[1]);
+
+  *b0 = a0123_lo.val[0];
+  *b1 = a0123_lo.val[1];
+  *b2 = a0123_hi.val[0];
+  *b3 = a0123_hi.val[1];
+}
+
+static INLINE void transpose_concat_s8_8x4(int8x8_t a0, int8x8_t a1,
+                                           int8x8_t a2, int8x8_t a3,
+                                           int8x16_t *b0, int8x16_t *b1) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, 04, 05, 06, 07
+  // a1: 10, 11, 12, 13, 14, 15, 16, 17
+  // a2: 20, 21, 22, 23, 24, 25, 26, 27
+  // a3: 30, 31, 32, 33, 34, 35, 36, 37
+  //
+  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+  int8x16_t a02 = vzipq_s8(a0q, a2q).val[0];
+  int8x16_t a13 = vzipq_s8(a1q, a3q).val[0];
+
+  int8x16x2_t a0123 = vzipq_s8(a02, a13);
+
+  *b0 = a0123.val[0];
+  *b1 = a0123.val[1];
+}
+
+static INLINE void transpose_concat_u8_8x4(uint8x8_t a0, uint8x8_t a1,
+                                           uint8x8_t a2, uint8x8_t a3,
+                                           uint8x16_t *b0, uint8x16_t *b1) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, 04, 05, 06, 07
+  // a1: 10, 11, 12, 13, 14, 15, 16, 17
+  // a2: 20, 21, 22, 23, 24, 25, 26, 27
+  // a3: 30, 31, 32, 33, 34, 35, 36, 37
+  //
+  // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+  // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+
+  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+  uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0];
+  uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0];
+
+  uint8x16x2_t a0123 = vzipq_u8(a02, a13);
+
+  *b0 = a0123.val[0];
+  *b1 = a0123.val[1];
+}
+
+static INLINE void transpose_concat_s8_4x4(int8x8_t a0, int8x8_t a1,
+                                           int8x8_t a2, int8x8_t a3,
+                                           int8x16_t *b) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, XX, XX, XX, XX
+  // a1: 10, 11, 12, 13, XX, XX, XX, XX
+  // a2: 20, 21, 22, 23, XX, XX, XX, XX
+  // a3: 30, 31, 32, 33, XX, XX, XX, XX
+  //
+  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+  int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
+  int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
+  int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
+  int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
+
+  int8x16_t a02 = vzipq_s8(a0q, a2q).val[0];
+  int8x16_t a13 = vzipq_s8(a1q, a3q).val[0];
+
+  *b = vzipq_s8(a02, a13).val[0];
+}
+
+static INLINE void transpose_concat_u8_4x4(uint8x8_t a0, uint8x8_t a1,
+                                           uint8x8_t a2, uint8x8_t a3,
+                                           uint8x16_t *b) {
+  // Transpose 8-bit elements and concatenate result rows as follows:
+  // a0: 00, 01, 02, 03, XX, XX, XX, XX
+  // a1: 10, 11, 12, 13, XX, XX, XX, XX
+  // a2: 20, 21, 22, 23, XX, XX, XX, XX
+  // a3: 30, 31, 32, 33, XX, XX, XX, XX
+  //
+  // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+
+  uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0));
+  uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0));
+  uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0));
+  uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0));
+
+  uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0];
+  uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0];
+
+  *b = vzipq_u8(a02, a13).val[0];
+}
+
+#endif  // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
index b6d7f86a4b..efb2c1d8da 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c
@@ -9,391 +9,324 @@
  */
 
 #include <arm_neon.h>
+#include <assert.h>
 
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
 #include "vpx_ports/mem.h"
 
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
+// Process a block of width 4 two rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+  int i = h;
+
+  // Number of rows we can process before 'sum_s16' overflows:
+  // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
+  assert(h <= 256);
+
+  do {
+    const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+    const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_int32x4(sse_s32);
 }
 
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
+// Process a block of width 8 one row at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride,
+                                     int h, uint32_t *sse, int *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int i = h;
+
+  // Number of rows we can process before 'sum_s16' overflows:
+  // 32767 / 255 ~= 128
+  assert(h <= 128);
+
+  do {
+    const uint8x8_t s = vld1_u8(src_ptr);
+    const uint8x8_t r = vld1_u8(ref_ptr);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int16x8(sum_s16);
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
 }
 
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, uint32_t *sse,
-                             int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      int h, uint32_t *sse, int *sum) {
+  int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int i = h;
 
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo =
-          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
-      v_sse_hi =
-          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
+  // Number of rows we can process before 'sum_s16' accumulators overflow:
+  // 32767 / 255 ~= 128, so 128 16-wide rows.
+  assert(h <= 128);
+
+  do {
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    const uint8x16_t r = vld1q_u8(ref_ptr);
+
+    const int16x8_t diff_l =
+        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+    const int16x8_t diff_h =
+        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+    sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+    sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+    sse_s32[0] =
+        vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+    sse_s32[0] =
+        vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+    sse_s32[1] =
+        vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       int w, int h, int h_limit,
+                                       unsigned int *sse, int *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+  // accumulator overflows. After hitting this limit we accumulate into 32-bit
+  // elements.
+  int h_tmp = h > h_limit ? h_limit : h;
+
+  int i = 0;
+  do {
+    int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+    do {
+      int j = 0;
+      do {
+        const uint8x16_t s = vld1q_u8(src_ptr + j);
+        const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+        const int16x8_t diff_l =
+            vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+        const int16x8_t diff_h =
+            vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+        sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+        sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+        sse_s32[0] =
+            vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+        sse_s32[1] =
+            vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+        j += 16;
+      } while (j < w);
+
+      src_ptr += src_stride;
+      ref_ptr += ref_stride;
+      i++;
+    } while (i < h_tmp);
+
+    sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+    sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+    h_tmp += h_limit;
+  } while (i < h);
+
+  *sum = horizontal_add_int32x4(sum_s32);
+  *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride, int h,
+                                      uint32_t *sse, int *sum) {
+  variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *ref_ptr, int ref_stride,
+                        unsigned int *sse, int *sum) {
+  variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+}
+
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *ref_ptr, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
+}
+
+#define VARIANCE_WXH_NEON(w, h, shift)                                        \
+  unsigned int vpx_variance##w##x##h##_neon(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum);    \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);                  \
   }
 
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+
+#undef VARIANCE_WXH_NEON
+
+static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr,
+                                           int src_stride,
+                                           const unsigned char *ref_ptr,
+                                           int ref_stride, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x8_t s0, s1, r0, r1, diff0, diff1;
+    uint16x8_t sse0, sse1;
+
+    s0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff0 = vabd_u8(s0, r0);
+    diff1 = vabd_u8(s1, r1);
+
+    sse0 = vmull_u8(diff0, diff0);
+    sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+    sse1 = vmull_u8(diff1, diff1);
+    sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
 }
 
-void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                        int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr,
+                                            int src_stride,
+                                            const unsigned char *ref_ptr,
+                                            int ref_stride, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    uint8x16_t s, r, diff;
+    uint16x8_t sse0, sse1;
+
+    s = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    r = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff = vabdq_u8(s, r);
+
+    sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff));
+    sse_u32[0] = vpadalq_u16(sse_u32[0], sse0);
+    sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff));
+    sse_u32[1] = vpadalq_u16(sse_u32[1], sse1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
 }
 
-void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
-}
-
-unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - ((sum * sum) >> 6);
-}
-
-unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-}
-
-unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
-                   32, 32, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
-
-unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
-                                   int source_stride,
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
                                    const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
+                                   int ref_stride) {
+  uint8x8_t s[2], r[2];
+  uint16x8_t abs_diff[2];
+  uint32x4_t sse;
 
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
+  s[0] = load_u8(src_ptr, src_stride);
+  r[0] = load_u8(ref_ptr, ref_stride);
+  src_ptr += 2 * src_stride;
+  ref_ptr += 2 * ref_stride;
+  s[1] = load_u8(src_ptr, src_stride);
+  r[1] = load_u8(ref_ptr, ref_stride);
 
-  for (i = 0; i < 4; i++) {
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
+  abs_diff[0] = vabdl_u8(s[0], r[0]);
+  abs_diff[1] = vabdl_u8(s[1], r[1]);
 
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
+  sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0]));
+  sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0]));
+  sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1]));
+  sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1]));
 
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+  return horizontal_add_uint32x4(sse);
+}
 
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+#define VPX_MSE_WXH_NEON(w, h)                                               \
+  unsigned int vpx_mse##w##x##h##_neon(                                      \
+      const unsigned char *src_ptr, int src_stride,                          \
+      const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) {     \
+    *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+    return *sse;                                                             \
   }
 
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
+VPX_MSE_WXH_NEON(8, 8)
+VPX_MSE_WXH_NEON(8, 16)
+VPX_MSE_WXH_NEON(16, 8)
+VPX_MSE_WXH_NEON(16, 16)
 
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  uint8x8_t d0u8, d2u8, d4u8, d6u8;
-  int16x4_t d22s16, d23s16, d24s16, d25s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint16x8_t q11u16, q12u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    d4u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d6u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(d0u8, d4u8);
-    q12u16 = vsubl_u8(d2u8, d6u8);
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
-                               const unsigned char *ref_ptr, int recon_stride,
-                               unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  int64x1_t d0s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  q7s32 = vdupq_n_s32(0);
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q10s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q10s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride) {
-  int16x4_t d22s16, d24s16, d26s16, d28s16;
-  int64x1_t d0s64;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  d0u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d4u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d1u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d5u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d2u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d6u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d3u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d7u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-
-  q11u16 = vsubl_u8(d0u8, d4u8);
-  q12u16 = vsubl_u8(d1u8, d5u8);
-  q13u16 = vsubl_u8(d2u8, d6u8);
-  q14u16 = vsubl_u8(d3u8, d7u8);
-
-  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
-  q7s32 = vmull_s16(d22s16, d22s16);
-  q8s32 = vmull_s16(d24s16, d24s16);
-  q9s32 = vmull_s16(d26s16, d26s16);
-  q10s32 = vmull_s16(d28s16, d28s16);
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q9s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q9s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
+#undef VPX_MSE_WXH_NEON
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..ab843e9fca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,298 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+// Process a block of width 4 four rows at a time.
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr,
+                                             int src_stride,
+                                             const uint8_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    src_ptr += 4 * src_stride;
+    ref_ptr += 4 * ref_stride;
+    i -= 4;
+  } while (i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 8 two rows at a time.
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr,
+                                             int src_stride,
+                                             const uint8_t *ref_ptr,
+                                             int ref_stride, int h,
+                                             uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    const uint8x16_t s =
+        vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+    const uint8x16_t r =
+        vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr,
+                                              int src_stride,
+                                              const uint8_t *ref_ptr,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    const uint8x16_t r = vld1q_u8(ref_ptr);
+
+    const uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr,
+                                               int src_stride,
+                                               const uint8_t *ref_ptr,
+                                               int ref_stride, int w, int h,
+                                               uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      const uint8x16_t s = vld1q_u8(src_ptr + j);
+      const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+      const uint8x16_t abs_diff = vabdq_u8(s, r);
+      sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+      src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+      ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+      j += 16;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sum = horizontal_add_int32x4(
+      vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+  *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse,
+                              sum);
+}
+
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+                                              int src_stride,
+                                              const uint8_t *ref,
+                                              int ref_stride, int h,
+                                              uint32_t *sse, int *sum) {
+  variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse,
+                              sum);
+}
+
+void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
+                                unsigned int *sse, int *sum) {
+  variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse,
+                            sum);
+}
+
+void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse, int *sum) {
+  variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse,
+                             sum);
+}
+
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift)                                \
+  unsigned int vpx_variance##w##x##h##_neon_dotprod(                          \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      unsigned int *sse) {                                                    \
+    int sum;                                                                  \
+    variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse,   \
+                                  &sum);                                      \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);                  \
+  }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
+
+static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr,
+                                                   int src_stride,
+                                                   const unsigned char *ref_ptr,
+                                                   int ref_stride, int h) {
+  uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x8_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff0 = vabd_u8(s0, r0);
+    diff1 = vabd_u8(s1, r1);
+
+    sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0);
+    sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1]));
+}
+
+static INLINE unsigned int vpx_mse16xh_neon_dotprod(
+    const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr,
+    int ref_stride, int h) {
+  uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h / 2;
+  do {
+    uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+    s0 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    s1 = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
+    r0 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+    r1 = vld1q_u8(ref_ptr);
+    ref_ptr += ref_stride;
+
+    diff0 = vabdq_u8(s0, r0);
+    diff1 = vabdq_u8(s1, r1);
+
+    sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0);
+    sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1);
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr,
+                                           int src_stride,
+                                           const unsigned char *ref_ptr,
+                                           int ref_stride) {
+  uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+  uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
+
+  uint8x16_t abs_diff = vabdq_u8(s, r);
+
+  uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff);
+
+  return horizontal_add_uint32x4(sse);
+}
+
+#define VPX_MSE_WXH_NEON_DOTPROD(w, h)                                   \
+  unsigned int vpx_mse##w##x##h##_neon_dotprod(                          \
+      const unsigned char *src_ptr, int src_stride,                      \
+      const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \
+    *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr,     \
+                                       ref_stride, h);                   \
+    return *sse;                                                         \
+  }
+
+VPX_MSE_WXH_NEON_DOTPROD(8, 8)
+VPX_MSE_WXH_NEON_DOTPROD(8, 16)
+VPX_MSE_WXH_NEON_DOTPROD(16, 8)
+VPX_MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef VPX_MSE_WXH_NEON_DOTPROD
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000000..d8e4bcc3a7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
@@ -0,0 +1,438 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers*****************************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_avg_horiz_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlsl.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlal.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlal.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlsl.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlal.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u8         {d6},   [r1]
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u8         {d7},   [r6]
+    vrhadd.u8       d20,    d20,    d6
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vrhadd.u8       d8,     d8,     d7
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlsl.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlal.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlal.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    add             r7,     r1,     #8
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vld1.u8         {d0},   [r1]
+    vmlal.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u8         {d2},   [r7]
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlsl.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vrhadd.u8       d8,     d8,     d0
+    vrhadd.u8       d9,     d9,     d2
+    vmlsl.u8        q11,    d1,     d24
+    vmlsl.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlal.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    vmlal.u8        q11,    d13,    d28
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    subeq           r14,    r14,    #2
+    vhadd.s16       q5,     q5,     q10
+    vmlal.u8        q11,    d15,    d29
+    addeq           r1,     r1,     r8
+    vmlsl.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vdup.16         q10,    r7
+    vld1.u32        {q3},   [r12],  r11
+    add             r7,     r6,     #8
+    moveq           r5,     r10
+    vld1.u8         {d0},   [r6]
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u8         {d2},   [r7]
+    vqrshrun.s16    d11,    q11,    #6
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q6},   [r12],  r11
+    vrhadd.u8       d10,    d10,    d0
+    vld1.u32        {q7},   [r12],  r11
+    vrhadd.u8       d11,    d11,    d2
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    mov             r7,     #0xc000
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    add             r7,     r6,     #8
+    vld1.u8         {d20},  [r6]
+    vld1.u8         {d21},  [r7]
+    vrhadd.u8       d10,    d10,    d20
+    vrhadd.u8       d11,    d11,    d21
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i iteration and ii
+                                            ; interation in single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlsl.u8        q4,     d1,     d25     ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlal.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlal.u8        q4,     d5,     d29
+    vmlsl.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vld1.u32        {d10[0]},       [r1]
+    vld1.u32        {d10[1]},       [r6]
+    vrhadd.u8       d8,     d8,     d10
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht by 4
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000000..7a77747fec
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
@@ -0,0 +1,439 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_avg_horiz_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlal.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlsl.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlal.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlsl.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlsl.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u8         {d6},   [r1]
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u8         {d7},   [r6]
+    vrhadd.u8       d20,    d20,    d6
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vrhadd.u8       d8,     d8,     d7
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlal.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlsl.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlsl.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    add             r7,     r1,     #8
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vld1.u8         {d0},   [r1]
+    vmlsl.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u8         {d2},   [r7]
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vrhadd.u8       d8,     d8,     d0
+    vrhadd.u8       d9,     d9,     d2
+    vmlsl.u8        q11,    d1,     d24
+    vmlal.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlsl.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    vmlal.u8        q11,    d13,    d28
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    subeq           r14,    r14,    #2
+    vhadd.s16       q5,     q5,     q10
+    vmlsl.u8        q11,    d15,    d29
+    addeq           r1,     r1,     r8
+    vmlal.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vdup.16         q10,    r7
+    vld1.u32        {q3},   [r12],  r11
+    add             r7,     r6,     #8
+    moveq           r5,     r10
+    vld1.u8         {d0},   [r6]
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u8         {d2},   [r7]
+    vqrshrun.s16    d11,    q11,    #6
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q6},   [r12],  r11
+    vrhadd.u8       d10,    d10,    d0
+    vld1.u32        {q7},   [r12],  r11
+    vrhadd.u8       d11,    d11,    d2
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    mov             r7,     #0xc000
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    add             r7,     r6,     #8
+    vld1.u8         {d20},  [r6]
+    vld1.u8         {d21},  [r7]
+    vrhadd.u8       d10,    d10,    d20
+    vrhadd.u8       d11,    d11,    d21
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i iteration and ii
+                                            ; interation in single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlal.u8        q4,     d1,     d25     ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlsl.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlsl.u8        q4,     d5,     d29
+    vmlal.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vld1.u32        {d10[0]},       [r1]
+    vld1.u32        {d10[1]},       [r6]
+    vrhadd.u8       d8,     d8,     d10
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht by 4
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
deleted file mode 100644
index e279d570fc..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
+++ /dev/null
@@ -1,292 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
-
-    EXPORT  |vpx_convolve8_avg_horiz_neon|
-    EXPORT  |vpx_convolve8_avg_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|vpx_convolve8_avg_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-vpx_convolve8_avg_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-vpx_convolve8_avg_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; slightly out of order load to match the existing data
-    vld1.u32        {d6[0]}, [r2], r3
-    vld1.u32        {d7[0]}, [r2], r3
-    vld1.u32        {d6[1]}, [r2], r3
-    vld1.u32        {d7[1]}, [r2], r3
-
-    sub             r2, r2, r3, lsl #2      ; reset for store
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_avg_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|vpx_convolve8_avg_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-vpx_convolve8_avg_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-vpx_convolve8_avg_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    vld1.u32        {d6[0]}, [r5@32], r3
-    vld1.u32        {d6[1]}, [r8@32], r3
-    vld1.u32        {d7[0]}, [r5@32], r3
-    vld1.u32        {d7[1]}, [r8@32], r3
-
-    pld             [r7]
-    pld             [r4]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; average the new value and the dst value
-    vrhadd.u8       q1, q1, q3
-
-    sub             r5, r5, r3, lsl #1      ; reset for store
-    sub             r8, r8, r3, lsl #1
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_avg_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..d310a83dad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
@@ -0,0 +1,486 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  ht
+;    r3 =>  wd
+
+    EXPORT          |vpx_convolve8_avg_vert_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;coeff = vld1_s8(pi1_coeff)
+    sub             r12,    r2,     r2,     lsl #2 ;src_ctrd & pi1_coeff
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;r0->pu1_src    r12->pi1_coeff
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r3->ht
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;core loop wd 4 jump
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r5 ->wd
+    rsb             r9,     r4,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r4,     r2,     lsl #2 ;r2->src_strd
+    mov             r3,     r5,     lsr #3  ;divide by 8
+    mul             r7,     r3              ;multiply height by width
+    sub             r7,     #4              ;subtract by one for epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r3,     r3,     r2
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    add             r14,    r1,     r6
+    vmlal.u8        q6,     d7,     d27
+    vmlsl.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlal.u8        q7,     d5,     d24
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d6,     d25
+    vrhadd.u8       d10,    d10,    d20
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d20},  [r14]
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    addle           r0,     r0,     r8
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vrhadd.u8       d12,    d12,    d20
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    add             r10,    r10,    r2      ; 11*strd
+    vmlal.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlal.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6,     d16,    d28
+    add             r10,    r10,    r2      ;12*strd
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    subs            r7,     r7,     #4
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vld1.u8         {d20},  [r14]
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vrhadd.u8       d12,    d12,    d20
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vst1.8          {d12},  [r14],  r6
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vst1.8          {d14},  [r14],  r6
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vmlal.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlal.u8        q6,     d7,     d27
+    add             r14,    r1,     r6
+    vmlsl.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlal.u8        q7,     d16,    d27
+    vmlsl.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d12,    d12,    d20
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d14,    d14,    d20
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlsl.u8        q0,     d5,     d23     ;mul_res1 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlal.u8        q0,     d6,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlsl.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlal.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlal.u8        q1,     d5,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlal.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlsl.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vld1.u32        {d20[0]},       [r1]
+    vld1.u32        {d20[1]},       [r3]
+    vrhadd.u8       d0,     d0,     d20
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    mov             r4,     r3
+    vld1.u32        {d20[0]},       [r4],   r6
+    vld1.u32        {d20[1]},       [r4]
+    vrhadd.u8       d8,     d8,     d20
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
new file mode 100644
index 0000000000..c5695fbda8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
@@ -0,0 +1,487 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  ht
+;    r3 =>  wd
+
+    EXPORT          |vpx_convolve8_avg_vert_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;coeff = vld1_s8(pi1_coeff)
+    sub             r12,    r2,     r2,     lsl #2 ;src_ctrd & pi1_coeff
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;r0->pu1_src    r12->pi1_coeff
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r3->ht
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;core loop wd 4 jump
+
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r5 ->wd
+    rsb             r9,     r4,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r4,     r2,     lsl #2 ;r2->src_strd
+    mov             r3,     r5,     lsr #3  ;divide by 8
+    mul             r7,     r3              ;multiply height by width
+    sub             r7,     #4              ;subtract by one for epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r3,     r3,     r2
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    add             r14,    r1,     r6
+    vmlsl.u8        q6,     d7,     d27
+    vmlal.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlal.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlsl.u8        q7,     d5,     d24
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d6,     d25
+    vrhadd.u8       d10,    d10,    d20
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d20},  [r14]
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    addle           r0,     r0,     r8
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vrhadd.u8       d12,    d12,    d20
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlal.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    add             r10,    r10,    r2      ; 11*strd
+    vmlsl.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlsl.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlal.u8        q6,     d16,    d28
+    add             r10,    r10,    r2      ;12*strd
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vld1.u8         {d20},  [r14]
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vrhadd.u8       d12,    d12,    d20
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vst1.8          {d12},  [r14],  r6
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vrhadd.u8       d14,    d14,    d20
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vst1.8          {d14},  [r14],  r6
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vld1.u8         {d20},  [r1]
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vrhadd.u8       d8,     d8,     d20
+    vmlsl.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlsl.u8        q6,     d7,     d27
+    add             r14,    r1,     r6
+    vmlal.u8        q6,     d16,    d28
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vmlsl.u8        q6,     d17,    d29
+    vld1.u8         {d20},  [r14]
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vrhadd.u8       d10,    d10,    d20
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlsl.u8        q7,     d16,    d27
+    vmlal.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d12,    d12,    d20
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d20},  [r14]
+    vrhadd.u8       d14,    d14,    d20
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlal.u8        q0,     d5,     d23     ;mul_res1 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q0,     d6,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlal.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlsl.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlsl.u8        q1,     d5,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlal.u8        q0,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlsl.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlal.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vld1.u32        {d20[0]},       [r1]
+    vld1.u32        {d20[1]},       [r3]
+    vrhadd.u8       d0,     d0,     d20
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    mov             r4,     r3
+    vld1.u32        {d20[0]},       [r4],   r6
+    vld1.u32        {d20[1]},       [r4]
+    vrhadd.u8       d8,     d8,     d20
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
new file mode 100644
index 0000000000..fa1b732466
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
@@ -0,0 +1,415 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_horiz_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlsl.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlal.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlal.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlsl.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlal.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vmlsl.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlsl.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlal.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlal.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vmlal.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlsl.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vmlsl.u8        q11,    d1,     d24
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    vmlsl.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlal.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    addeq           r1,     r1,     r8
+    subeq           r14,    r14,    #2
+    vmlal.u8        q11,    d13,    d28
+    vhadd.s16       q5,     q5,     q10
+    vmlal.u8        q11,    d15,    d29
+    vmlsl.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vld1.u32        {q7},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q9},   [r12],  r11
+    vqrshrun.s16    d11,    q11,    #6
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    moveq           r5,     r10
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vdup.16         q10,    r7
+    vmlal.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlsl.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i iteration and ii
+                                            ; interation in single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlsl.u8        q4,     d1,     d25     ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlal.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlal.u8        q4,     d5,     d29
+    vmlsl.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht by 4
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
new file mode 100644
index 0000000000..90b2c8fef7
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
@@ -0,0 +1,415 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r3 =>  dst_stride
+;    r4 => filter_x0
+;    r8 =>  ht
+;    r10 =>  wd
+
+    EXPORT          |vpx_convolve8_horiz_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+
+start_loop_count
+    ldr             r4,     [sp,    #104]   ;loads pi1_coeff
+    ldr             r8,     [sp,    #108]   ;loads x0_q4
+    add             r4,     r4,     r8,     lsl #4 ;r4 = filter[x0_q4]
+    ldr             r8,     [sp,    #128]   ;loads ht
+    ldr             r10,    [sp,    #124]   ;loads wd
+    vld2.8          {d0,    d1},    [r4]    ;coeff = vld1_s8(pi1_coeff)
+    mov             r11,    #1
+    subs            r14,    r8,     #0      ;checks for ht == 0
+    vabs.s8         d2,     d0              ;vabs_s8(coeff)
+    vdup.8          d24,    d2[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    vdup.8          d25,    d2[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4,     r12,    r2      ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26,    d2[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9,     r10,    r2,     lsl #1 ;2*src_strd - wd
+    vdup.8          d27,    d2[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8,     r10,    r3,     lsl #1 ;2*dst_strd - wd
+    vdup.8          d28,    d2[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29,    d2[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30,    d2[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31,    d2[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7,     r1
+    cmp             r10,    #4
+    ble             outer_loop_4
+
+    cmp             r10,    #24
+    moveq           r10,    #16
+    addeq           r8,     #8
+    addeq           r9,     #8
+    cmp             r10,    #16
+    bge             outer_loop_16
+
+    cmp             r10,    #12
+    addeq           r8,     #4
+    addeq           r9,     #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    mov             r14,    #32
+    add             r1,     #16
+    add             r12,    #16
+    mov             r10,    #8
+    add             r8,     #8
+    add             r9,     #8
+
+outer_loop_8
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7,     #0xc000
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {d1},   [r12],  r11
+    vdup.16         q5,     r7
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    mov             r7,     #0x4000
+    vld1.u32        {d4},   [r12],  r11
+    vmlal.u8        q4,     d1,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5},   [r12],  r11
+    vmlal.u8        q4,     d3,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7},   [r12],  r11
+    vmlsl.u8        q4,     d2,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlal.u8        q4,     d4,     d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13},  [r4],   r11
+    vmlsl.u8        q4,     d5,     d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14},  [r4],   r11
+    vmlal.u8        q4,     d6,     d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15},  [r4],   r11
+    vmlsl.u8        q4,     d7,     d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16},  [r4],   r11     ;vector load pu1_src + src_strd
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d15,    d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17},  [r4],   r11
+    vmlsl.u8        q5,     d14,    d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {d18},  [r4],   r11
+    vmlal.u8        q5,     d16,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19},  [r4],   r11     ;vector load pu1_src + src_strd
+    vmlsl.u8        q5,     d17,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vmlal.u8        q5,     d18,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d19,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vqrshrun.s16    d20,    q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5,     d12,    d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d13,    d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20},  [r1]!           ;store the result pu1_dst
+    vhadd.s16       q5,     q5,     q11
+    subs            r5,     r5,     #8      ;decrement the wd loop
+    vqrshrun.s16    d8,     q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    vst1.8          {d8},   [r6]!           ;store the result pu1_dst
+    cmp             r5,     #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14,    r14,    #2      ;decrement the ht loop
+    add             r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10,    [sp,    #120]   ;loads wd
+    cmp             r10,    #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0,     [sp,  #-4]!
+    str             r7,     [sp,  #-4]!
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    and             r0,     r12,    #31
+    mov             r7,     #0xc000
+    sub             r5,     r10,    #0      ;checks wd
+    pld             [r4,    r2,     lsl #1]
+    pld             [r12,   r2,     lsl #1]
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    vdup.16         q4,     r7
+    vld1.u32        {q1},   [r12],  r11
+    vld1.u32        {q2},   [r12],  r11
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7},   [r12],  r11
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8},   [r12],  r11
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9},   [r12],  r11
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10,    r7
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10,    d1,     d24
+    vdup.16         q5,     r7
+    vmlal.u8        q10,    d3,     d25
+    mov             r7,     #0x4000
+    vdup.16         q11,    r7
+    vmlsl.u8        q10,    d5,     d26
+    vld1.u32        {q0},   [r4],   r11     ;vector load pu1_src
+    vhadd.s16       q4,     q4,     q11
+    vld1.u32        {q1},   [r4],   r11
+    vmlal.u8        q10,    d7,     d27
+    add             r12,    #8
+    subs            r5,     r5,     #16
+    vmlal.u8        q10,    d13,    d28
+    vld1.u32        {q2},   [r4],   r11
+    vmlsl.u8        q10,    d15,    d29
+    vld1.u32        {q3},   [r4],   r11
+    vqrshrun.s16    d8,     q4,     #6      ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q10,    d17,    d30
+    vld1.u32        {q6},   [r4],   r11
+    vmlsl.u8        q10,    d19,    d31
+    vld1.u32        {q7},   [r4],   r11
+    vmlsl.u8        q5,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r4],   r11
+    vhadd.s16       q10,    q10,    q11
+    vld1.u32        {q9},   [r4],   r11
+    vmlsl.u8        q5,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vmlal.u8        q5,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4,     #8
+    mov             r7,     #0xc000
+    vmlal.u8        q5,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q5,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9,     q10,    #6
+    vdup.16         q11,    r7
+    vmlal.u8        q5,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7,     #0x4000
+    vmlsl.u8        q11,    d1,     d24
+    vst1.8          {q4},   [r1]!           ;store the result pu1_dst
+    vmlal.u8        q11,    d3,     d25
+    vdup.16         q10,    r7
+    vmlsl.u8        q11,    d5,     d26
+    pld             [r12,   r2,     lsl #2]
+    pld             [r4,    r2,     lsl #2]
+    addeq           r12,    r12,    r9      ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4,     r12,    r2      ;pu1_src + src_strd
+    vmlal.u8        q11,    d7,     d27
+    addeq           r1,     r1,     r8
+    subeq           r14,    r14,    #2
+    vmlal.u8        q11,    d13,    d28
+    vhadd.s16       q5,     q5,     q10
+    vmlsl.u8        q11,    d15,    d29
+    vmlal.u8        q11,    d17,    d30
+    cmp             r14,    #0
+    vmlsl.u8        q11,    d19,    d31
+    vqrshrun.s16    d10,    q5,     #6      ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0},   [r12],  r11     ;vector load pu1_src
+    mov             r7,     #0xc000
+    cmp             r5,     #0
+    vld1.u32        {q1},   [r12],  r11
+    vhadd.s16       q11,    q11,    q10
+    vld1.u32        {q2},   [r12],  r11
+    vdup.16         q4,     r7
+    vld1.u32        {q3},   [r12],  r11
+    vmlsl.u8        q4,     d0,     d24     ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6},   [r12],  r11
+    vld1.u32        {q7},   [r12],  r11
+    vmlal.u8        q4,     d2,     d25     ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8},   [r12],  r11
+    vmlsl.u8        q4,     d4,     d26     ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q9},   [r12],  r11
+    vqrshrun.s16    d11,    q11,    #6
+    vmlal.u8        q4,     d6,     d27     ;mul_res = vmull_u8(src[0_3],
+                                            ; coeffabs_3);
+    moveq           r5,     r10
+    vmlal.u8        q4,     d12,    d28     ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vdup.16         q10,    r7
+    vmlsl.u8        q4,     d14,    d29     ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    vmlal.u8        q4,     d16,    d30     ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4,     d18,    d31     ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6,     r1,     r3      ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7,     #0x4000
+    ldr             r0,     [sp],   #4
+    ldr             r10,    [sp,    #120]
+    vdup.16         q10,    r7
+    vhadd.s16       q11,    q11,    q10
+    vqrshrun.s16    d11,    q11,    #6
+    vst1.8          {q5},   [r6]!           ;store the result pu1_dst
+    ldr             r7,     [sp],   #4
+    cmp             r10,    #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12,    r0,     #3      ;pu1_src - 3
+    mov             r1,     r7
+    add             r1,     #8
+    mov             r10,    #4
+    add             r12,    #8
+    mov             r14,    #16
+    add             r8,     #4
+    add             r9,     #4
+
+outer_loop_4
+    add             r6,     r1,     r3      ;pu1_dst + dst_strd
+    add             r4,     r12,    r2      ;pu1_src + src_strd
+    subs            r5,     r10,    #0      ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0},   [r12],  r11     ;vector load pu1_src
+    vld1.u32        {d1},   [r12],  r11
+    vld1.u32        {d2},   [r12],  r11
+    vld1.u32        {d3},   [r12],  r11
+    vld1.u32        {d4},   [r12],  r11
+    vld1.u32        {d5},   [r12],  r11
+    vld1.u32        {d6},   [r12],  r11
+    vld1.u32        {d7},   [r12],  r11
+    sub             r12,    r12,    #4
+    vld1.u32        {d12},  [r4],   r11     ;vector load pu1_src + src_strd
+    vld1.u32        {d13},  [r4],   r11
+    vzip.32         d0,     d12             ;vector zip the i iteration and ii
+                                            ; interation in single register
+    vld1.u32        {d14},  [r4],   r11
+    vzip.32         d1,     d13
+    vld1.u32        {d15},  [r4],   r11
+    vzip.32         d2,     d14
+    vld1.u32        {d16},  [r4],   r11
+    vzip.32         d3,     d15
+    vld1.u32        {d17},  [r4],   r11
+    vzip.32         d4,     d16
+    vld1.u32        {d18},  [r4],   r11
+    vzip.32         d5,     d17
+    vld1.u32        {d19},  [r4],   r11
+    mov             r7,     #0xc000
+    vdup.16         q4,     r7
+    sub             r4,     r4,     #4
+    vzip.32         d6,     d18
+    vzip.32         d7,     d19
+    vmlal.u8        q4,     d1,     d25     ;arithmetic operations for ii
+                                            ; iteration in the same time
+    vmlsl.u8        q4,     d0,     d24
+    vmlsl.u8        q4,     d2,     d26
+    vmlal.u8        q4,     d3,     d27
+    vmlal.u8        q4,     d4,     d28
+    vmlsl.u8        q4,     d5,     d29
+    vmlal.u8        q4,     d6,     d30
+    vmlsl.u8        q4,     d7,     d31
+    mov             r7,     #0x4000
+    vdup.16         q10,    r7
+    vhadd.s16       q4,     q4,     q10
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5,     r5,     #4      ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14,    r14,    #2      ;decrement the ht by 4
+    add             r12,    r12,    r9      ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1,     r1,     r8      ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 1386838eea..037ea1142d 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -14,198 +14,122 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
-// Note:
-// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
-// 2. After refactoring the shared code in kernel loops with inline functions,
-// the decoder speed dropped a lot when using gcc compiler. Therefore there is
-// no refactoring for those parts by now.
-// 3. For horizontal convolve, there is an alternative optimization that
-// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8
-// samples in each are read from memory: src, (src+1), (src+2), (src+3),
-// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract
-// instructions. This optimization is much faster in speed unit test, but slowed
-// down the whole decoder by 5%.
+static INLINE void convolve_4tap_horiz_neon(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride, int w, int h,
+                                            const int16x8_t filter) {
+  // 4-tap and bilinear filter values are even, so halve them to reduce
+  // intermediate precision requirements.
+  const uint8x8_t x_filter =
+      vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
 
-static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
-                            uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) {
-  *s0 = vld1_u8(s);
-  s += p;
-  *s1 = vld1_u8(s);
-  s += p;
-  *s2 = vld1_u8(s);
-  s += p;
-  *s3 = vld1_u8(s);
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2),
+                                     vdup_lane_u8(x_filter, 3),
+                                     vdup_lane_u8(x_filter, 4),
+                                     vdup_lane_u8(x_filter, 5) };
+
+  if (w == 4) {
+    do {
+      uint8x8_t s01[4];
+
+      s01[0] = load_unaligned_u8(src + 0, src_stride);
+      s01[1] = load_unaligned_u8(src + 1, src_stride);
+      s01[2] = load_unaligned_u8(src + 2, src_stride);
+      s01[3] = load_unaligned_u8(src + 3, src_stride);
+
+      uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps);
+
+      store_unaligned_u8(dst, dst_stride, d01);
+
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x8_t s0[4], s1[4];
+
+        s0[0] = vld1_u8(s + 0);
+        s0[1] = vld1_u8(s + 1);
+        s0[2] = vld1_u8(s + 2);
+        s0[3] = vld1_u8(s + 3);
+
+        s1[0] = vld1_u8(s + src_stride + 0);
+        s1[1] = vld1_u8(s + src_stride + 1);
+        s1[2] = vld1_u8(s + src_stride + 2);
+        s1[3] = vld1_u8(s + src_stride + 3);
+
+        uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps);
+        uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps);
+
+        vst1_u8(d, d0);
+        vst1_u8(d + dst_stride, d1);
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  }
 }
 
-static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0,
-                            uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3,
-                            uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6,
-                            uint8x8_t *s7) {
-  *s0 = vld1_u8(s);
-  s += p;
-  *s1 = vld1_u8(s);
-  s += p;
-  *s2 = vld1_u8(s);
-  s += p;
-  *s3 = vld1_u8(s);
-  s += p;
-  *s4 = vld1_u8(s);
-  s += p;
-  *s5 = vld1_u8(s);
-  s += p;
-  *s6 = vld1_u8(s);
-  s += p;
-  *s7 = vld1_u8(s);
-}
-
-static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
-                             const uint8x8_t s1, const uint8x8_t s2,
-                             const uint8x8_t s3, const uint8x8_t s4,
-                             const uint8x8_t s5, const uint8x8_t s6,
-                             const uint8x8_t s7) {
-  vst1_u8(s, s0);
-  s += p;
-  vst1_u8(s, s1);
-  s += p;
-  vst1_u8(s, s2);
-  s += p;
-  vst1_u8(s, s3);
-  s += p;
-  vst1_u8(s, s4);
-  s += p;
-  vst1_u8(s, s5);
-  s += p;
-  vst1_u8(s, s6);
-  s += p;
-  vst1_u8(s, s7);
-}
-
-static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2,
-                                    int16x4_t s3, int16x4_t s4, int16x4_t s5,
-                                    int16x4_t s6, int16x4_t s7,
-                                    int16x8_t filters, int16x4_t filter3,
-                                    int16x4_t filter4) {
-  const int16x4_t filters_lo = vget_low_s16(filters);
-  const int16x4_t filters_hi = vget_high_s16(filters);
-  int16x4_t sum = vdup_n_s16(0);
-
-  sum = vmla_lane_s16(sum, s0, filters_lo, 0);
-  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
-  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
-  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
-  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
-  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
-  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
-  return sum;
-}
-
-static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2,
-                                    int16x8_t s3, int16x8_t s4, int16x8_t s5,
-                                    int16x8_t s6, int16x8_t s7,
-                                    int16x8_t filters, int16x8_t filter3,
-                                    int16x8_t filter4) {
-  const int16x4_t filters_lo = vget_low_s16(filters);
-  const int16x4_t filters_hi = vget_high_s16(filters);
-  int16x8_t sum = vdupq_n_s16(0);
-
-  sum = vmlaq_lane_s16(sum, s0, filters_lo, 0);
-  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
-  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
-  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
-  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
-  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
-  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
-  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
-  return sum;
-}
-
-void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y,  // unused
-                              int y_step_q4,            // unused
-                              int w, int h) {
-  const int16x8_t filters = vld1q_s16(filter_x);
-  uint8x8_t t0, t1, t2, t3;
-
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
-  assert(x_step_q4 == 16);
-
-  (void)x_step_q4;
-  (void)y_step_q4;
-  (void)filter_y;
-
-  src -= 3;
-
+static INLINE void convolve_8tap_horiz_neon(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride, int w, int h,
+                                            const int16x8_t filter) {
   if (h == 4) {
-    uint8x8_t d01, d23;
-    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
-        d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
+    uint8x8_t t0, t1, t2, t3;
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
 
-    __builtin_prefetch(src + 0 * src_stride);
-    __builtin_prefetch(src + 1 * src_stride);
-    __builtin_prefetch(src + 2 * src_stride);
-    __builtin_prefetch(src + 3 * src_stride);
-    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
-    load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-    s0 = vget_low_s16(tt0);
-    s1 = vget_low_s16(tt1);
-    s2 = vget_low_s16(tt2);
-    s3 = vget_low_s16(tt3);
-    s4 = vget_high_s16(tt0);
-    s5 = vget_high_s16(tt1);
-    s6 = vget_high_s16(tt2);
-    __builtin_prefetch(dst + 0 * dst_stride);
-    __builtin_prefetch(dst + 1 * dst_stride);
-    __builtin_prefetch(dst + 2 * dst_stride);
-    __builtin_prefetch(dst + 3 * dst_stride);
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     src += 7;
 
     do {
-      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s7 = vget_low_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s9 = vget_low_s16(tt2);
-      s10 = vget_low_s16(tt3);
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
 
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
+      transpose_u8_8x4(&t7, &t8, &t9, &t10);
+      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
       transpose_u8_4x4(&d01, &d23);
 
-      vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride),
-                    vreinterpret_u32_u8(d01), 0);
-      vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride),
-                    vreinterpret_u32_u8(d23), 0);
-      vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride),
-                    vreinterpret_u32_u8(d01), 1);
-      vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride),
-                    vreinterpret_u32_u8(d23), 1);
+      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -217,157 +141,94 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
       src += 4;
       dst += 4;
       w -= 4;
-    } while (w > 0);
+    } while (w != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
-    int width;
-    const uint8_t *s;
-    uint8x8_t t4, t5, t6, t7;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
     if (w == 4) {
       do {
-        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                    &t7);
+
+        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+        uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+        uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+        transpose_u8_8x4(&d04, &d15, &d26, &d37);
+
+        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
 
-        load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         src += 8 * src_stride;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        t0 = vqrshrun_n_s16(d0, 7);
-        t1 = vqrshrun_n_s16(d1, 7);
-        t2 = vqrshrun_n_s16(d2, 7);
-        t3 = vqrshrun_n_s16(d3, 7);
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1);
-        dst += dst_stride;
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1);
-        dst += dst_stride;
+        dst += 8 * dst_stride;
         h -= 8;
       } while (h > 0);
     } else {
-      uint8_t *d;
-      int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;
-
       do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        width = w;
-        s = src + 7;
-        d = dst;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        const uint8_t *s = src + 7;
+        uint8_t *d = dst;
+        int width = w;
 
         do {
-          load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+          uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+          load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+                      &t15);
 
-          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                           filter4);
-          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                           filter4);
-          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                           filter4);
-          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                           filter4);
-          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
-                           filter4);
-          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
-                           filter4);
-          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
-                           filter4);
-          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
-                           filter3, filter4);
+          transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
 
-          t0 = vqrshrun_n_s16(d0, 7);
-          t1 = vqrshrun_n_s16(d1, 7);
-          t2 = vqrshrun_n_s16(d2, 7);
-          t3 = vqrshrun_n_s16(d3, 7);
-          t4 = vqrshrun_n_s16(d4, 7);
-          t5 = vqrshrun_n_s16(d5, 7);
-          t6 = vqrshrun_n_s16(d6, 7);
-          t7 = vqrshrun_n_s16(d7, 7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+          uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+          uint8x8_t d6 =
+              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+          uint8x8_t d7 =
+              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+
+          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
           s0 = s8;
           s1 = s9;
@@ -379,7 +240,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
           s += 8;
           d += 8;
           width -= 8;
-        } while (width > 0);
+        } while (width != 0);
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
@@ -388,93 +249,89 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h,
+                             x_filter);
+  } else {
+    convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h,
+                             x_filter);
+  }
+}
+
 void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y,  // unused
-                                  int y_step_q4,            // unused
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
-  const int16x8_t filters = vld1q_s16(filter_x);
-  uint8x8_t t0, t1, t2, t3;
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(x_step_q4 == 16);
 
   (void)x_step_q4;
+  (void)y0_q4;
   (void)y_step_q4;
-  (void)filter_y;
 
   src -= 3;
 
   if (h == 4) {
-    uint8x8_t d01, d23;
-    int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
-        d1, d2, d3;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint32x4_t d0123 = vdupq_n_u32(0);
+    uint8x8_t t0, t1, t2, t3;
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
 
-    __builtin_prefetch(src + 0 * src_stride);
-    __builtin_prefetch(src + 1 * src_stride);
-    __builtin_prefetch(src + 2 * src_stride);
-    __builtin_prefetch(src + 3 * src_stride);
-    filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
-    load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
     transpose_u8_8x4(&t0, &t1, &t2, &t3);
-    tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-    tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-    tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-    tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-    s0 = vget_low_s16(tt0);
-    s1 = vget_low_s16(tt1);
-    s2 = vget_low_s16(tt2);
-    s3 = vget_low_s16(tt3);
-    s4 = vget_high_s16(tt0);
-    s5 = vget_high_s16(tt1);
-    s6 = vget_high_s16(tt2);
-    __builtin_prefetch(dst + 0 * dst_stride);
-    __builtin_prefetch(dst + 1 * dst_stride);
-    __builtin_prefetch(dst + 2 * dst_stride);
-    __builtin_prefetch(dst + 3 * dst_stride);
+    int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+    int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+    int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
     src += 7;
 
     do {
-      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s7 = vget_low_s16(tt0);
-      s8 = vget_low_s16(tt1);
-      s9 = vget_low_s16(tt2);
-      s10 = vget_low_s16(tt3);
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
 
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
+      transpose_u8_8x4(&t7, &t8, &t9, &t10);
+      int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
       transpose_u8_4x4(&d01, &d23);
 
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
-      d0123 = vreinterpretq_u32_u8(
-          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);
 
-      vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2);
-      vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1);
-      vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
+      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -486,190 +343,114 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
       src += 4;
       dst += 4;
       w -= 4;
-    } while (w > 0);
+    } while (w != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
-    int width;
-    const uint8_t *s;
-    uint8x8_t t4, t5, t6, t7;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
     if (w == 4) {
-      uint32x4_t d0415 = vdupq_n_u32(0);
-      uint32x4_t d2637 = vdupq_n_u32(0);
       do {
-        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                    &t7);
+
+        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+
+        transpose_u8_8x4(&d04, &d15, &d26, &d37);
+
+        uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
+        uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);
+
+        d04 = vrhadd_u8(d04, dd04);
+        d15 = vrhadd_u8(d15, dd15);
+        d26 = vrhadd_u8(d26, dd26);
+        d37 = vrhadd_u8(d37, dd37);
+
+        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
+        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
+        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
+        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);
 
-        load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
         src += 8 * src_stride;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        t0 = vqrshrun_n_s16(d0, 7);
-        t1 = vqrshrun_n_s16(d1, 7);
-        t2 = vqrshrun_n_s16(d2, 7);
-        t3 = vqrshrun_n_s16(d3, 7);
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1);
-        d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1);
-        d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3);
-        d0415 = vreinterpretq_u32_u8(
-            vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1)));
-        d2637 = vreinterpretq_u32_u8(
-            vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3)));
-
-        vst1q_lane_u32((uint32_t *)dst, d0415, 0);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 2);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 0);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 2);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 1);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d0415, 3);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 1);
-        dst += dst_stride;
-        vst1q_lane_u32((uint32_t *)dst, d2637, 3);
-        dst += dst_stride;
+        dst += 8 * dst_stride;
         h -= 8;
-      } while (h > 0);
+      } while (h != 0);
     } else {
-      uint8_t *d;
-      int16x8_t s11, s12, s13, s14, d4, d5, d6, d7;
-      uint8x16_t d01, d23, d45, d67;
-
       do {
-        __builtin_prefetch(src + 0 * src_stride);
-        __builtin_prefetch(src + 1 * src_stride);
-        __builtin_prefetch(src + 2 * src_stride);
-        __builtin_prefetch(src + 3 * src_stride);
-        __builtin_prefetch(src + 4 * src_stride);
-        __builtin_prefetch(src + 5 * src_stride);
-        __builtin_prefetch(src + 6 * src_stride);
-        __builtin_prefetch(src + 7 * src_stride);
-        load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        width = w;
-        s = src + 7;
-        d = dst;
-        __builtin_prefetch(dst + 0 * dst_stride);
-        __builtin_prefetch(dst + 1 * dst_stride);
-        __builtin_prefetch(dst + 2 * dst_stride);
-        __builtin_prefetch(dst + 3 * dst_stride);
-        __builtin_prefetch(dst + 4 * dst_stride);
-        __builtin_prefetch(dst + 5 * dst_stride);
-        __builtin_prefetch(dst + 6 * dst_stride);
-        __builtin_prefetch(dst + 7 * dst_stride);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+        const uint8_t *s = src + 7;
+        uint8_t *d = dst;
+        int width = w;
 
         do {
-          load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+          uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+          load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+                      &t15);
 
-          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                           filter4);
-          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                           filter4);
-          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                           filter4);
-          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                           filter4);
-          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
-                           filter4);
-          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
-                           filter4);
-          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
-                           filter4);
-          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
-                           filter3, filter4);
+          transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+          int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+          int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+          int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+          int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+          int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+          int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+          int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
 
-          t0 = vqrshrun_n_s16(d0, 7);
-          t1 = vqrshrun_n_s16(d1, 7);
-          t2 = vqrshrun_n_s16(d2, 7);
-          t3 = vqrshrun_n_s16(d3, 7);
-          t4 = vqrshrun_n_s16(d4, 7);
-          t5 = vqrshrun_n_s16(d5, 7);
-          t6 = vqrshrun_n_s16(d6, 7);
-          t7 = vqrshrun_n_s16(d7, 7);
-          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+          uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+          uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+          uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+          uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+          uint8x8_t d5 =
+              convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+          uint8x8_t d6 =
+              convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+          uint8x8_t d7 =
+              convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
 
-          d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
-                            vld1_u8(d + 1 * dst_stride));
-          d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
-                            vld1_u8(d + 3 * dst_stride));
-          d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride),
-                            vld1_u8(d + 5 * dst_stride));
-          d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride),
-                            vld1_u8(d + 7 * dst_stride));
-          d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1));
-          d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3));
-          d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5));
-          d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7));
+          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
 
-          store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01),
-                    vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45),
-                    vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67));
+          d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+          d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+          d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+          d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
+          d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride));
+          d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride));
+          d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride));
+          d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride));
+
+          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
 
           s0 = s8;
           s1 = s9;
@@ -681,272 +462,187 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
           s += 8;
           d += 8;
           width -= 8;
-        } while (width > 0);
+        } while (width != 0);
         src += 8 * src_stride;
         dst += 8 * dst_stride;
         h -= 8;
-      } while (h > 0);
+      } while (h != 0);
     }
   }
 }
 
+static INLINE void convolve_8tap_vert_neon(const uint8_t *src,
+                                           ptrdiff_t src_stride, uint8_t *dst,
+                                           ptrdiff_t dst_stride, int w, int h,
+                                           const int16x8_t filter) {
+  if (w == 4) {
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
+
+    src += 7 * src_stride;
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
+
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      s0 = s4;
+      s1 = s5;
+      s2 = s6;
+      s3 = s7;
+      s4 = s8;
+      s5 = s9;
+      s6 = s10;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      const uint8_t *s = src + 7 * src_stride;
+      uint8_t *d = dst;
+      int height = h;
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+
+        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
 void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,  // unused
-                             int x_step_q4,            // unused
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
-  const int16x8_t filters = vld1q_s16(filter_y);
-
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
+  (void)x0_q4;
   (void)x_step_q4;
   (void)y_step_q4;
-  (void)filter_x;
 
-  src -= 3 * src_stride;
+  const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
 
-  if (w == 4) {
-    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
-    uint8x8_t d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-
-    do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
-
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
-      dst += dst_stride;
-
-      s0 = s4;
-      s1 = s5;
-      s2 = s6;
-      s3 = s7;
-      s4 = s8;
-      s5 = s9;
-      s6 = s10;
-      h -= 4;
-    } while (h > 0);
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+                            y_filter);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
-    int height;
-    const uint8_t *s;
-    uint8_t *d;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
-    do {
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      __builtin_prefetch(src + 4 * src_stride);
-      __builtin_prefetch(src + 5 * src_stride);
-      __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      d = dst;
-      height = h;
-
-      do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-        __builtin_prefetch(s + 0 * src_stride);
-        __builtin_prefetch(s + 1 * src_stride);
-        __builtin_prefetch(s + 2 * src_stride);
-        __builtin_prefetch(s + 3 * src_stride);
-        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
-
-        vst1_u8(d, vqrshrun_n_s16(d0, 7));
-        d += dst_stride;
-        vst1_u8(d, vqrshrun_n_s16(d1, 7));
-        d += dst_stride;
-        vst1_u8(d, vqrshrun_n_s16(d2, 7));
-        d += dst_stride;
-        vst1_u8(d, vqrshrun_n_s16(d3, 7));
-        d += dst_stride;
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        height -= 4;
-      } while (height > 0);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w > 0);
+    convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride,
+                            w, h, y_filter);
   }
 }
 
 void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,  // unused
-                                 int x_step_q4,            // unused
-                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                  int h) {
-  const int16x8_t filters = vld1q_s16(filter_y);
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);
 
-  assert(!((intptr_t)dst & 3));
-  assert(!(dst_stride & 3));
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
   assert(y_step_q4 == 16);
 
+  (void)x0_q4;
   (void)x_step_q4;
   (void)y_step_q4;
-  (void)filter_x;
 
   src -= 3 * src_stride;
 
   if (w == 4) {
-    const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
-    const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
-    uint8x8_t d01, d23;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    uint32x4_t d0123 = vdupq_n_u32(0);
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
+    int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
+    int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
+    int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
+    int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
+    int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
+    int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));
 
-    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
-    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-    src += src_stride;
+    src += 7 * src_stride;
 
     do {
-      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
-      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
-      src += src_stride;
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+      int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7)));
+      int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8)));
+      int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9)));
+      int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10)));
 
-      __builtin_prefetch(dst + 0 * dst_stride);
-      __builtin_prefetch(dst + 1 * dst_stride);
-      __builtin_prefetch(dst + 2 * dst_stride);
-      __builtin_prefetch(dst + 3 * dst_stride);
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                       filter4);
-      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                       filter4);
-      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                       filter4);
-      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                       filter4);
+      int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+      int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+      int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+      int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
 
-      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
-      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
 
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2);
-      d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3);
-      d0123 = vreinterpretq_u32_u8(
-          vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23)));
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
 
-      vst1q_lane_u32((uint32_t *)dst, d0123, 0);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 1);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 2);
-      dst += dst_stride;
-      vst1q_lane_u32((uint32_t *)dst, d0123, 3);
-      dst += dst_stride;
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
 
       s0 = s4;
       s1 = s5;
@@ -955,87 +651,45 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
       s4 = s8;
       s5 = s9;
       s6 = s10;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
       h -= 4;
-    } while (h > 0);
+    } while (h != 0);
   } else {
-    const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
-    const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
-    int height;
-    const uint8_t *s;
-    uint8_t *d;
-    uint8x16_t d01, d23, dd01, dd23;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-
     do {
-      __builtin_prefetch(src + 0 * src_stride);
-      __builtin_prefetch(src + 1 * src_stride);
-      __builtin_prefetch(src + 2 * src_stride);
-      __builtin_prefetch(src + 3 * src_stride);
-      __builtin_prefetch(src + 4 * src_stride);
-      __builtin_prefetch(src + 5 * src_stride);
-      __builtin_prefetch(src + 6 * src_stride);
-      s = src;
-      s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-      s += src_stride;
-      d = dst;
-      height = h;
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+      const uint8_t *s = src + 7 * src_stride;
+      uint8_t *d = dst;
+      int height = h;
 
       do {
-        s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
-        s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
-        s += src_stride;
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
 
-        __builtin_prefetch(d + 0 * dst_stride);
-        __builtin_prefetch(d + 1 * dst_stride);
-        __builtin_prefetch(d + 2 * dst_stride);
-        __builtin_prefetch(d + 3 * dst_stride);
-        __builtin_prefetch(s + 0 * src_stride);
-        __builtin_prefetch(s + 1 * src_stride);
-        __builtin_prefetch(s + 2 * src_stride);
-        __builtin_prefetch(s + 3 * src_stride);
-        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
-                         filter4);
-        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
-                         filter4);
-        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
-                         filter4);
-        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
-                         filter4);
+        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+        uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+        uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+        uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
 
-        d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7));
-        d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7));
-        dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride),
-                           vld1_u8(d + 1 * dst_stride));
-        dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride),
-                           vld1_u8(d + 3 * dst_stride));
-        dd01 = vrhaddq_u8(dd01, d01);
-        dd23 = vrhaddq_u8(dd23, d23);
+        d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
+        d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
+        d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
+        d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
 
-        vst1_u8(d, vget_low_u8(dd01));
-        d += dst_stride;
-        vst1_u8(d, vget_high_u8(dd01));
-        d += dst_stride;
-        vst1_u8(d, vget_low_u8(dd23));
-        d += dst_stride;
-        vst1_u8(d, vget_high_u8(dd23));
-        d += dst_stride;
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         s0 = s4;
         s1 = s5;
@@ -1045,10 +699,12 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
         s5 = s9;
         s6 = s10;
         height -= 4;
-      } while (height > 0);
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+      } while (height != 0);
       src += 8;
       dst += 8;
       w -= 8;
-    } while (w > 0);
+    } while (w != 0);
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
new file mode 100644
index 0000000000..10cc761ccd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -0,0 +1,172 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
+#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+                                    const int16x4_t s2, const int16x4_t s3,
+                                    const int16x4_t s4, const int16x4_t s5,
+                                    const int16x4_t s6, const int16x4_t s7,
+                                    const int16x8_t filters) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int16x4_t sum;
+
+  sum = vmul_lane_s16(s0, filters_lo, 0);
+  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
+  sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
+  sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
+  return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+                                    const int16x8_t s2, const int16x8_t s3,
+                                    const int16x8_t s4, const int16x8_t s5,
+                                    const int16x8_t s6, const int16x8_t s7,
+                                    const int16x8_t filters) {
+  const int16x4_t filters_lo = vget_low_s16(filters);
+  const int16x4_t filters_hi = vget_high_s16(filters);
+  int16x8_t sum;
+
+  sum = vmulq_lane_s16(s0, filters_lo, 0);
+  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
+  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
+  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
+  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
+  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
+  sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
+  return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
+                                       const int16x8_t filters) {
+  int16x8_t ss[8];
+
+  ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
+  ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
+  ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
+  ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
+  ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
+  ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
+  ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
+  ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
+
+  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
+                     filters);
+}
+
+// 2-tap (bilinear) filter values are always positive, but 4-tap filter values
+// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much
+// greater positive values to compensate. To use instructions that operate on
+// 8-bit types we also need the types to be unsigned. Subtracting the products
+// of taps 0 and 3 from the products of taps 1 and 2 always works given that
+// 2-tap filters are 0-padded.
+static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1,
+                                    const uint8x8_t s2, const uint8x8_t s3,
+                                    const uint8x8_t filter_taps[4]) {
+  uint16x8_t sum = vmull_u8(s1, filter_taps[1]);
+  sum = vmlal_u8(sum, s2, filter_taps[2]);
+  sum = vmlsl_u8(sum, s0, filter_taps[0]);
+  sum = vmlsl_u8(sum, s3, filter_taps[3]);
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1);
+}
+
+static INLINE void convolve_4tap_vert_neon(const uint8_t *src,
+                                           ptrdiff_t src_stride, uint8_t *dst,
+                                           ptrdiff_t dst_stride, int w, int h,
+                                           const int16x8_t filter) {
+  // 4-tap and bilinear filter values are even, so halve them to reduce
+  // intermediate precision requirements.
+  const uint8x8_t y_filter =
+      vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1);
+
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+                                     vdup_lane_u8(y_filter, 3),
+                                     vdup_lane_u8(y_filter, 4),
+                                     vdup_lane_u8(y_filter, 5) };
+
+  if (w == 4) {
+    uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+    uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+
+    src += 2 * src_stride;
+
+    do {
+      uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride);
+      uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride);
+      uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride);
+      uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride);
+
+      uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps);
+      uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps);
+
+      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      s01 = s45;
+      s12 = s56;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x8_t s0, s1, s2;
+      load_u8_8x3(s, src_stride, &s0, &s1, &s2);
+
+      s += 3 * src_stride;
+
+      do {
+        uint8x8_t s3, s4, s5, s6;
+        load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6);
+
+        uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps);
+        uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps);
+        uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps);
+        uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+#endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
deleted file mode 100644
index 2d0f2ae065..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
+++ /dev/null
@@ -1,270 +0,0 @@
-;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    ; These functions are only valid when:
-    ; x_step_q4 == 16
-    ; w%4 == 0
-    ; h%4 == 0
-    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
-
-    EXPORT  |vpx_convolve8_horiz_neon|
-    EXPORT  |vpx_convolve8_vert_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-    ; Multiply and accumulate by q0
-    MACRO
-    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
-    vmull.s16 $dst, $src0, d0[0]
-    vmlal.s16 $dst, $src1, d0[1]
-    vmlal.s16 $dst, $src2, d0[2]
-    vmlal.s16 $dst, $src3, d0[3]
-    vmlal.s16 $dst, $src4, d1[0]
-    vmlal.s16 $dst, $src5, d1[1]
-    vmlal.s16 $dst, $src6, d1[2]
-    vmlal.s16 $dst, $src7, d1[3]
-    MEND
-
-; r0    const uint8_t *src
-; r1    int src_stride
-; r2    uint8_t *dst
-; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
-; sp[]int w
-; sp[]int h
-
-|vpx_convolve8_horiz_neon| PROC
-    push            {r4-r10, lr}
-
-    sub             r0, r0, #3              ; adjust for taps
-
-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
-
-    vld1.s16        {q0}, [r5]              ; filter_x
-
-    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
-    add             r8, r8, #4              ; -src_stride * 3 + 4
-
-    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
-    add             r4, r4, #4              ; -dst_stride * 3 + 4
-
-    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
-    sub             r9, r9, #7
-    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
-
-    mov             r10, r6                 ; w loop counter
-
-vpx_convolve8_loop_horiz_v
-    vld1.8          {d24}, [r0], r1
-    vld1.8          {d25}, [r0], r1
-    vld1.8          {d26}, [r0], r1
-    vld1.8          {d27}, [r0], r8
-
-    vtrn.16         q12, q13
-    vtrn.8          d24, d25
-    vtrn.8          d26, d27
-
-    pld             [r0, r1, lsl #2]
-
-    vmovl.u8        q8, d24
-    vmovl.u8        q9, d25
-    vmovl.u8        q10, d26
-    vmovl.u8        q11, d27
-
-    ; save a few instructions in the inner loop
-    vswp            d17, d18
-    vmov            d23, d21
-
-    add             r0, r0, #3
-
-vpx_convolve8_loop_horiz
-    add             r5, r0, #64
-
-    vld1.32         {d28[]}, [r0], r1
-    vld1.32         {d29[]}, [r0], r1
-    vld1.32         {d31[]}, [r0], r1
-    vld1.32         {d30[]}, [r0], r8
-
-    pld             [r5]
-
-    vtrn.16         d28, d31
-    vtrn.16         d29, d30
-    vtrn.8          d28, d29
-    vtrn.8          d31, d30
-
-    pld             [r5, r1]
-
-    ; extract to s16
-    vtrn.32         q14, q15
-    vmovl.u8        q12, d28
-    vmovl.u8        q13, d29
-
-    pld             [r5, r1, lsl #1]
-
-    ; src[] * filter_x
-    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
-    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
-    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
-    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
-
-    pld             [r5, -r8]
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    ; transpose
-    vtrn.16         d2, d3
-    vtrn.32         d2, d3
-    vtrn.8          d2, d3
-
-    vst1.u32        {d2[0]}, [r2@32], r3
-    vst1.u32        {d3[0]}, [r2@32], r3
-    vst1.u32        {d2[1]}, [r2@32], r3
-    vst1.u32        {d3[1]}, [r2@32], r4
-
-    vmov            q8,  q9
-    vmov            d20, d23
-    vmov            q11, q12
-    vmov            q9,  q13
-
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_horiz
-
-    ; outer loop
-    mov             r6, r10                 ; restore w counter
-    add             r0, r0, r9              ; src += src_stride * 4 - w
-    add             r2, r2, r12             ; dst += dst_stride * 4 - w
-    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_loop_horiz_v
-
-    pop             {r4-r10, pc}
-
-    ENDP
-
-|vpx_convolve8_vert_neon| PROC
-    push            {r4-r8, lr}
-
-    ; adjust for taps
-    sub             r0, r0, r1
-    sub             r0, r0, r1, lsl #1
-
-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
-
-    vld1.s16        {q0}, [r4]              ; filter_y
-
-    lsl             r1, r1, #1
-    lsl             r3, r3, #1
-
-vpx_convolve8_loop_vert_h
-    mov             r4, r0
-    add             r7, r0, r1, asr #1
-    mov             r5, r2
-    add             r8, r2, r3, asr #1
-    mov             r12, lr                 ; h loop counter
-
-    vld1.u32        {d16[0]}, [r4], r1
-    vld1.u32        {d16[1]}, [r7], r1
-    vld1.u32        {d18[0]}, [r4], r1
-    vld1.u32        {d18[1]}, [r7], r1
-    vld1.u32        {d20[0]}, [r4], r1
-    vld1.u32        {d20[1]}, [r7], r1
-    vld1.u32        {d22[0]}, [r4], r1
-
-    vmovl.u8        q8, d16
-    vmovl.u8        q9, d18
-    vmovl.u8        q10, d20
-    vmovl.u8        q11, d22
-
-vpx_convolve8_loop_vert
-    ; always process a 4x4 block at a time
-    vld1.u32        {d24[0]}, [r7], r1
-    vld1.u32        {d26[0]}, [r4], r1
-    vld1.u32        {d26[1]}, [r7], r1
-    vld1.u32        {d24[1]}, [r4], r1
-
-    ; extract to s16
-    vmovl.u8        q12, d24
-    vmovl.u8        q13, d26
-
-    pld             [r5]
-    pld             [r8]
-
-    ; src[] * filter_y
-    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
-
-    pld             [r5, r3]
-    pld             [r8, r3]
-
-    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
-
-    pld             [r7]
-    pld             [r4]
-
-    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
-
-    pld             [r7, r1]
-    pld             [r4, r1]
-
-    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
-
-    ; += 64 >> 7
-    vqrshrun.s32    d2, q1, #7
-    vqrshrun.s32    d3, q2, #7
-    vqrshrun.s32    d4, q14, #7
-    vqrshrun.s32    d5, q15, #7
-
-    ; saturate
-    vqmovn.u16      d2, q1
-    vqmovn.u16      d3, q2
-
-    vst1.u32        {d2[0]}, [r5@32], r3
-    vst1.u32        {d2[1]}, [r8@32], r3
-    vst1.u32        {d3[0]}, [r5@32], r3
-    vst1.u32        {d3[1]}, [r8@32], r3
-
-    vmov            q8, q10
-    vmov            d18, d22
-    vmov            d19, d24
-    vmov            q10, q13
-    vmov            d22, d25
-
-    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_loop_vert
-
-    ; outer loop
-    add             r0, r0, #4
-    add             r2, r2, #4
-    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_vert_h
-
-    pop             {r4-r8, pc}
-
-    ENDP
-    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
new file mode 100644
index 0000000000..c4177c5385
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h"
+
+/* Type1 and Type2 functions are called depending on the position of the
+ * negative and positive coefficients in the filter. In type1, the filter kernel
+ * used is sub_pel_filters_8lp, in which only the first two and the last two
+ * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 &
+ * 7.
+ */
+
+#define DEFINE_FILTER(dir)                                                   \
+  void vpx_convolve8_##dir##_neon(                                           \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    if (filter == vp9_filter_kernels[1]) {                                   \
+      vpx_convolve8_##dir##_filter_type1_neon(                               \
+          src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+          y_step_q4, w, h);                                                  \
+    } else {                                                                 \
+      vpx_convolve8_##dir##_filter_type2_neon(                               \
+          src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+          y_step_q4, w, h);                                                  \
+    }                                                                        \
+  }
+
+DEFINE_FILTER(horiz)
+DEFINE_FILTER(avg_horiz)
+DEFINE_FILTER(vert)
+DEFINE_FILTER(avg_vert)
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
new file mode 100644
index 0000000000..f1c7d62ed0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+
+#define DECLARE_FILTER(dir, type)                                  \
+  void vpx_convolve8_##dir##_filter_##type##_neon(                 \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,      \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+DECLARE_FILTER(horiz, type1)
+DECLARE_FILTER(avg_horiz, type1)
+DECLARE_FILTER(horiz, type2)
+DECLARE_FILTER(avg_horiz, type2)
+DECLARE_FILTER(vert, type1)
+DECLARE_FILTER(avg_vert, type1)
+DECLARE_FILTER(vert, type2)
+DECLARE_FILTER(avg_vert, type2)
+
+#endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
new file mode 100644
index 0000000000..b919843de6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c
@@ -0,0 +1,1024 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+// Filter values always sum to 128.
+#define FILTER_SUM 128
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  // Shift left and insert new last column in transposed 4x4 block.
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  // Shift left and insert three new columns in transposed 4x4 block.
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+  // by 2 since we halved the filter values.)
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+  int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
+
+  // Further narrowing and packing is performed by the caller.
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide
+  // by 2 since we halved the filter values.)
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2);
+  // First 4 output values.
+  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+  // Second 4 output values.
+  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+  sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1);
+
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
+
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x3_t permute_tbl) {
+  // Transform sample range to [-128, 127] for 8-bit signed dot product.
+  int8x16_t samples_128 =
+      vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[1]),
+                                vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  // First 4 output values.
+  int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
+  sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
+  sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve_4tap_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void convolve_8tap_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    // Load 4-tap filter into first 4 elements of the vector.
+    // All 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int8x8_t x_filter_4tap =
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+    convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h,
+                                     x_filter_4tap);
+
+  } else {
+    const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+    convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h,
+                                     x_filter_8tap);
+  }
+}
+
+void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src,
+                                          ptrdiff_t src_stride, uint8_t *dst,
+                                          ptrdiff_t dst_stride,
+                                          const InterpKernel *filter, int x0_q4,
+                                          int x_step_q4, int y0_q4,
+                                          int y_step_q4, int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+
+        uint8x8_t dd0, dd1, dd2, dd3;
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo,
+                                      const int8x16_t samples_hi,
+                                      const int8x8_t filters) {
+  // The sample range transform and permutation are performed by the caller.
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
+
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
+
+static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo,
+                                      const int8x16_t samples0_hi,
+                                      const int8x16_t samples1_lo,
+                                      const int8x16_t samples1_hi,
+                                      const int8x8_t filters) {
+  // The sample range transform and permutation are performed by the caller.
+
+  // Accumulate into 128 * FILTER_SUM to account for range transform.
+  int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM);
+  // First 4 output values.
+  int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve_8tap_vert_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+
+  if (w == 4) {
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    // Transform sample range to [-128, 127] for 8-bit signed dot product.
+    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    int8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_s8_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_s8_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_s8_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_s8_4x4(s3, s4, s5, s6, &s3456);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
+
+      int8x16_t s78910;
+      transpose_concat_s8_4x4(s7, s8, s9, s10, &s78910);
+
+      // Merge new data into block from previous iteration.
+      int8x16x2_t samples_LUT = { { s3456, s78910 } };
+      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      // Transform sample range to [-128, 127] for 8-bit signed dot product.
+      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
+
+        int8x16_t s78910_lo, s78910_hi;
+        transpose_concat_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+
+        // Merge new data into block from previous iteration.
+        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
+                                     int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+                            y_filter);
+  } else {
+    const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+    convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst,
+                                    dst_stride, w, h, y_filter);
+  }
+}
+
+void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+    src += 7 * src_stride;
+
+    // Transform sample range to [-128, 127] for 8-bit signed dot product.
+    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    int8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_s8_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_s8_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_s8_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_s8_4x4(s3, s4, s5, s6, &s3456);
+
+    do {
+      uint8x8_t t7, t8, t9, t10;
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+      int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+      int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+      int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+      int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
+
+      int8x16_t s78910;
+      transpose_concat_s8_4x4(s7, s8, s9, s10, &s78910);
+
+      // Merge new data into block from previous iteration.
+      int8x16x2_t samples_LUT = { { s3456, s78910 } };
+      int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+      load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+      s += 7 * src_stride;
+
+      // Transform sample range to [-128, 127] for 8-bit signed dot product.
+      int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+      int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+      int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+      int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+      int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+      int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+      int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+
+      do {
+        uint8x8_t t7, t8, t9, t10;
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+        int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
+
+        int8x16_t s78910_lo, s78910_hi;
+        transpose_concat_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+
+        // Merge new data into block from previous iteration.
+        int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+        uint8x8_t dd0, dd1, dd2, dd3;
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t *dst,
+                                                 ptrdiff_t dst_stride, int w,
+                                                 int h, const int8x8_t x_filter,
+                                                 const uint8x8_t y_filter) {
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+                                       vdup_lane_u8(y_filter, 3),
+                                       vdup_lane_u8(y_filter, 4),
+                                       vdup_lane_u8(y_filter, 5) };
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+    uint8x16_t h_s0, h_s1, h_s2;
+    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+    int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+    int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+    int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+    // We halved the filter values so -1 from right shift.
+    uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+    uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+    src += 3 * src_stride;
+
+    do {
+      uint8x16_t h_s3, h_s4, h_s5, h_s6;
+      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+      int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+      int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+      int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+      int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+      uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+      uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+      uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+      uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+      uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      v_s01 = v_s45;
+      v_s12 = v_s56;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x16_t h_s0, h_s1, h_s2;
+      load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+      uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+      uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+      uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+      s += 3 * src_stride;
+
+      do {
+        uint8x16_t h_s3, h_s4, h_s5, h_s6;
+        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+        uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+        uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+        uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+        uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+        uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+        uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+        uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+        uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        v_s0 = v_s4;
+        v_s1 = v_s5;
+        v_s2 = v_s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void convolve_8tap_2d_horiz_neon_dotprod(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    uint8x16_t s0, s1, s2;
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+    int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+    int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+    uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+    uint8x8_t d23 =
+        vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0, s1, s2;
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+      uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+      uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+  // lines post both horizontally and vertically.
+  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+  if (x_filter_taps == 4 && y_filter_taps == 4) {
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    // 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int8x8_t x_filter_4tap =
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+    const uint8x8_t y_filter_4tap =
+        vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+    convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride,
+                                  dst, dst_stride, w, h, x_filter_4tap,
+                                  y_filter_4tap);
+    return;
+  }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7).
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+  const int im_height = h + SUBPEL_TAPS - 1;
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+  convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset,
+                                      src_stride, im_block, im_stride, w,
+                                      im_height, x_filter_8tap);
+
+  convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h,
+                                  y_filter_8tap);
+}
+
+void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
+                                    int w, int h) {
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+
+  // Averaging convolution always uses an 8-tap filter.
+  // Account for the vertical phase needing 3 lines prior and 4 lines post.
+  const int im_height = h + SUBPEL_TAPS - 1;
+  const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride,
+                                      src_stride, im_block, im_stride, w,
+                                      im_height, x_filter_8tap);
+
+  vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride,
+                                      dst, dst_stride, filter, x0_q4, x_step_q4,
+                                      y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
new file mode 100644
index 0000000000..b9d88bcfb1
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c
@@ -0,0 +1,943 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+  0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+  4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
+  8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  // Shift left and insert new last column in transposed 4x4 block.
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  // Shift left and insert two new columns in transposed 4x4 block.
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  // Shift left and insert three new columns in transposed 4x4 block.
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl);
+
+  int32x4_t sum =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0);
+
+  // Further narrowing and packing is performed by the caller.
+  return vmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+  // First 4 output values.
+  int32x4_t sum0 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  // Second 4 output values.
+  int32x4_t sum1 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1));
+  // We halved the filter values so -1 from right shift.
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x2_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[1]) };
+
+  int32x4_t sum =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
+
+static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples,
+                                      const int8x8_t filters,
+                                      const uint8x16x3_t permute_tbl) {
+  // Permute samples ready for dot product.
+  // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+  // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+  // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+  uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[1]),
+                                     vqtbl1q_u8(samples, permute_tbl.val[2]) };
+
+  // First 4 output values.
+  int32x4_t sum0 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+  sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 =
+      vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+  sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t *dst,
+                                                 ptrdiff_t dst_stride, int w,
+                                                 int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t *dst,
+                                                 ptrdiff_t dst_stride, int w,
+                                                 int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
+    // Load 4-tap filter into first 4 elements of the vector.
+    // All 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int8x8_t x_filter_4tap =
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+
+    convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h,
+                                  x_filter_4tap);
+
+  } else {
+    const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+    convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h,
+                                  x_filter_8tap);
+  }
+}
+
+void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const InterpKernel *filter, int x0_q4,
+                                       int x_step_q4, int y0_q4, int y_step_q4,
+                                       int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  src -= 3;
+
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl);
+      int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl);
+      int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl);
+      int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
+
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl);
+
+        uint8x8_t dd0, dd1, dd2, dd3;
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  }
+}
+
+static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo,
+                                      const uint8x16_t samples_hi,
+                                      const int8x8_t filters) {
+  // Sample permutation is performed by the caller.
+  int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+  sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
+
+  // Further narrowing and packing is performed by the caller.
+  return vshrn_n_s32(sum, 1);
+}
+
+static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo,
+                                      const uint8x16_t samples0_hi,
+                                      const uint8x16_t samples1_lo,
+                                      const uint8x16_t samples1_hi,
+                                      const int8x8_t filters) {
+  // Sample permutation is performed by the caller.
+
+  // First 4 output values.
+  int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+  sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  // Second 4 output values.
+  int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+  sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  // Narrow and re-pack.
+  int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1));
+  return vqrshrun_n_s16(sum, FILTER_BITS - 1);
+}
+
+static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src,
+                                                ptrdiff_t src_stride,
+                                                uint8_t *dst,
+                                                ptrdiff_t dst_stride, int w,
+                                                int h, const int8x8_t filter) {
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  if (w == 4) {
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_u8_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_u8_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_u8_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_u8_4x4(s3, s4, s5, s6, &s3456);
+
+    do {
+      uint8x8_t s7, s8, s9, s10;
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      uint8x16_t s78910;
+      transpose_concat_u8_4x4(s7, s8, s9, s10, &s78910);
+
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+      uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filter);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filter);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filter);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+
+      do {
+        uint8x8_t s7, s8, s9, s10;
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint8x16_t s78910_lo, s78910_hi;
+        transpose_concat_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+
+        // Merge new data into block from previous iteration.
+        uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        // Prepare block for next iteration - re-using as much as possible.
+        // Shuffle everything up four rows.
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h,
+                            y_filter);
+  } else {
+    const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+    convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst,
+                                 dst_stride, w, h, y_filter);
+  }
+}
+
+void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
+                                      int w, int h) {
+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+
+  assert((intptr_t)dst % 4 == 0);
+  assert(dst_stride % 4 == 0);
+  assert(y_step_q4 == 16);
+
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  src -= 3 * src_stride;
+
+  if (w == 4) {
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+    load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src += 7 * src_stride;
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_u8_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_u8_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_u8_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_u8_4x4(s3, s4, s5, s6, &s3456);
+
+    do {
+      uint8x8_t s7, s8, s9, s10;
+      load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+      uint8x16_t s78910;
+      transpose_concat_u8_4x4(s7, s8, s9, s10, &s78910);
+
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+      uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 = convolve8_4_v(s0123, s4567, filters);
+      int16x4_t d1 = convolve8_4_v(s1234, s5678, filters);
+      int16x4_t d2 = convolve8_4_v(s2345, s6789, filters);
+      int16x4_t d3 = convolve8_4_v(s3456, s78910, filters);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+      uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+      d01 = vrhadd_u8(d01, dd01);
+      d23 = vrhadd_u8(d23, dd23);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
+
+      // This operation combines a conventional transpose and the sample permute
+      // (see horizontal case) required before computing the dot product.
+      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+
+      do {
+        uint8x8_t s7, s8, s9, s10;
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+        uint8x16_t s78910_lo, s78910_hi;
+        transpose_concat_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
+
+        // Merge new data into block from previous iteration.
+        uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } };
+        uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+        uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+        uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+        uint8x8_t d0 =
+            convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters);
+        uint8x8_t d1 =
+            convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters);
+        uint8x8_t d2 =
+            convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters);
+        uint8x8_t d3 =
+            convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters);
+
+        uint8x8_t dd0, dd1, dd2, dd3;
+        load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+        d0 = vrhadd_u8(d0, dd0);
+        d1 = vrhadd_u8(d1, dd1);
+        d2 = vrhadd_u8(d2, dd2);
+        d3 = vrhadd_u8(d3, dd3);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
+
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src,
+                                              ptrdiff_t src_stride,
+                                              uint8_t *dst,
+                                              ptrdiff_t dst_stride, int w,
+                                              int h, const int8x8_t x_filter,
+                                              const uint8x8_t y_filter) {
+  // Neon does not have lane-referencing multiply or multiply-accumulate
+  // instructions that operate on vectors of 8-bit elements. This means we have
+  // to duplicate filter taps into a whole vector and use standard multiply /
+  // multiply-accumulate instructions.
+  const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2),
+                                       vdup_lane_u8(y_filter, 3),
+                                       vdup_lane_u8(y_filter, 4),
+                                       vdup_lane_u8(y_filter, 5) };
+
+  if (w == 4) {
+    const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl);
+
+    uint8x16_t h_s0, h_s1, h_s2;
+    load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
+
+    int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl);
+    int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl);
+    int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl);
+    // We halved the filter values so -1 from right shift.
+    uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
+    uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1);
+
+    src += 3 * src_stride;
+
+    do {
+      uint8x16_t h_s3, h_s4, h_s5, h_s6;
+      load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+      int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl);
+      int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl);
+      int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl);
+      int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl);
+      // We halved the filter values so -1 from right shift.
+      uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1);
+      uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1);
+      uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4);
+      uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4);
+
+      uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps);
+      uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps);
+
+      store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      v_s01 = v_s45;
+      v_s12 = v_s56;
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int height = h;
+
+      uint8x16_t h_s0, h_s1, h_s2;
+      load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
+
+      uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl);
+      uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl);
+      uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl);
+
+      s += 3 * src_stride;
+
+      do {
+        uint8x16_t h_s3, h_s4, h_s5, h_s6;
+        load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
+
+        uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl);
+        uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl);
+        uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl);
+        uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl);
+
+        uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps);
+        uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps);
+        uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps);
+        uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        v_s0 = v_s4;
+        v_s1 = v_s5;
+        v_s2 = v_s6;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src += 8;
+      dst += 8;
+      w -= 8;
+    } while (w != 0);
+  }
+}
+
+static INLINE void convolve_8tap_2d_horiz_neon_i8mm(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) {
+  if (w == 4) {
+    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+      int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+      int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+      int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1);
+
+      store_u8(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    uint8x16_t s0, s1, s2;
+    load_u8_16x3(src, src_stride, &s0, &s1, &s2);
+
+    int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl);
+    int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl);
+    int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl);
+    uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1);
+    uint8x8_t d23 =
+        vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1);
+
+    store_u8(dst + 0 * dst_stride, dst_stride, d01);
+    store_u8_4x1(dst + 2 * dst_stride, d23);
+  } else {
+    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+
+    do {
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+      int width = w;
+
+      do {
+        uint8x16_t s0, s1, s2, s3;
+        load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+        uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+        uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+        uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+        uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width > 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 3);
+
+    // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm()
+    // below for further details on possible values of block height.
+    const uint8_t *s = src;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      uint8x16_t s0, s1, s2;
+      load_u8_16x3(s, src_stride, &s0, &s1, &s2);
+
+      uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl);
+      uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl);
+      uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl);
+
+      store_u8_8x3(d, dst_stride, d0, d1, d2);
+
+      s += 8;
+      d += 8;
+      width -= 8;
+    } while (width > 0);
+  }
+}
+
+void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
+  const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
+  // lines post both horizontally and vertically.
+  const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
+  const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
+
+  if (x_filter_taps == 4 && y_filter_taps == 4) {
+    const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
+    const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
+
+    // 4-tap and bilinear filter values are even, so halve them to reduce
+    // intermediate precision requirements.
+    const int8x8_t x_filter_4tap =
+        vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
+    const uint8x8_t y_filter_4tap =
+        vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1);
+
+    convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+                               dst, dst_stride, w, h, x_filter_4tap,
+                               y_filter_4tap);
+    return;
+  }
+
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7).
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+  const int im_height = h + SUBPEL_TAPS - 1;
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+  const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4]));
+
+  convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride,
+                                   im_block, im_stride, w, im_height,
+                                   x_filter_8tap);
+
+  convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h,
+                               y_filter_8tap);
+}
+
+void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]);
+  const int im_stride = 64;
+
+  // Averaging convolution always uses an 8-tap filter.
+  // Account for the vertical phase needing 3 lines prior and 4 lines post.
+  const int im_height = h + SUBPEL_TAPS - 1;
+  const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4]));
+
+  convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride,
+                                   src_stride, im_block, im_stride, w,
+                                   im_height, x_filter_8tap);
+
+  vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride,
+                                   dst, dst_stride, filter, x0_q4, x_step_q4,
+                                   y0_q4, y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
new file mode 100644
index 0000000000..2666d4253e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
@@ -0,0 +1,457 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  ht
+;    r3 =>  wd
+
+    EXPORT          |vpx_convolve8_vert_filter_type1_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type1_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;coeff = vld1_s8(pi1_coeff)
+    sub             r12,    r2,     r2,     lsl #2 ;src_ctrd & pi1_coeff
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;r0->pu1_src    r12->pi1_coeff
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r3->ht
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;core loop wd 4 jump
+
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r5 ->wd
+    rsb             r9,     r4,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r4,     r2,     lsl #2 ;r2->src_strd
+    mov             r3,     r5,     lsr #3  ;divide by 8
+    mul             r7,     r3              ;multiply height by width
+    sub             r7,     #4              ;subtract by one for epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    pld             [r3,    r2,     lsl #1]
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r3,     r3,     r2
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    vmlal.u8        q6,     d7,     d27
+    vmlsl.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlsl.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    addle           r1,     r1,     r9
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    add             r10,    r10,    r2      ; 11*strd
+    vmlal.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlal.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    vmlsl.u8        q6,     d16,    d28
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6,     d17,    d29
+    add             r10,    r10,    r2      ;12*strd
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    subs            r7,     r7,     #4
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vmlsl.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlal.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlal.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlsl.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlal.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlal.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlsl.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vst1.8          {d14},  [r14],  r6
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vmlal.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlal.u8        q6,     d7,     d27
+    vmlsl.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vmlal.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlal.u8        q7,     d16,    d27
+    vmlsl.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from
+                                            ; sp
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlsl.u8        q0,     d5,     d23     ;mul_res1 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlal.u8        q0,     d6,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlsl.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlal.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlal.u8        q1,     d5,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlal.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlsl.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
new file mode 100644
index 0000000000..cb5d6d3fe5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
@@ -0,0 +1,455 @@
+;
+;  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+;    r0 => src
+;    r1 => dst
+;    r2 =>  src_stride
+;    r6 =>  dst_stride
+;    r12 => filter_y0
+;    r5 =>  ht
+;    r3 =>  wd
+
+    EXPORT          |vpx_convolve8_vert_filter_type2_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA  ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type2_neon| PROC
+
+    stmfd           sp!,    {r4  -  r12,    r14} ;stack stores the values of
+                                                 ; the arguments
+    vpush           {d8  -  d15}                 ; stack offset by 64
+    mov             r4,     r1
+    mov             r1,     r2
+    mov             r2,     r4
+    vmov.i16        q15,    #0x4000
+    mov             r11,    #0xc000
+    ldr             r12,    [sp,    #104]   ;load filter
+    ldr             r6,     [sp,    #116]   ;load y0_q4
+    add             r12,    r12,    r6,     lsl #4 ;r12 = filter[y0_q4]
+    mov             r6,     r3
+    ldr             r5,     [sp,    #124]   ;load wd
+    vld2.8          {d0,    d1},    [r12]   ;coeff = vld1_s8(pi1_coeff)
+    sub             r12,    r2,     r2,     lsl #2 ;src_ctrd & pi1_coeff
+    vabs.s8         d0,     d0              ;vabs_s8(coeff)
+    add             r0,     r0,     r12     ;r0->pu1_src    r12->pi1_coeff
+    ldr             r3,     [sp,    #128]   ;load ht
+    subs            r7,     r3,     #0      ;r3->ht
+    vdup.u8         d22,    d0[0]           ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0);
+    cmp             r5,     #8
+    vdup.u8         d23,    d0[1]           ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1);
+    vdup.u8         d24,    d0[2]           ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2);
+    vdup.u8         d25,    d0[3]           ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3);
+    vdup.u8         d26,    d0[4]           ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4);
+    vdup.u8         d27,    d0[5]           ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5);
+    vdup.u8         d28,    d0[6]           ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6);
+    vdup.u8         d29,    d0[7]           ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7);
+    blt             core_loop_wd_4          ;core loop wd 4 jump
+
+    str             r0,     [sp,  #-4]!
+    str             r1,     [sp,  #-4]!
+    bic             r4,     r5,     #7      ;r5 ->wd
+    rsb             r9,     r4,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r4,     r2,     lsl #2 ;r2->src_strd
+    mov             r3,     r5,     lsr #3  ;divide by 8
+    mul             r7,     r3              ;multiply height by width
+    sub             r7,     #4              ;subtract by one for epilog
+
+prolog
+    and             r10,    r0,     #31
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vdup.16         q4,     r11
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    subs            r4,     r4,     #8
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vdup.16         q5,     r11
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    pld             [r3]
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    pld             [r3,    r2]
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r3,     r3,     r2
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    pld             [r3,    r2,     lsl #1]
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d2,     d22
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q6,     d4,     d24
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d5,     d25
+    vmlal.u8        q6,     d6,     d26
+    vmlsl.u8        q6,     d7,     d27
+    vmlal.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    addle           r1,     r1,     r9
+    vmlal.u8        q7,     d4,     d23
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d3,     d22
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+    blt             epilog_end              ;jumps to epilog_end
+
+    beq             epilog                  ;jumps to epilog
+
+main_loop_8
+    subs            r4,     r4,     #8
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+                                            ; coeffabs_1);
+    addle           r0,     r0,     r8
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    bicle           r4,     r5,     #7      ;r5 ->wd
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    add             r3,     r0,     r2      ;pu1_src_tmp += src_strd;
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vld1.u8         {d0},   [r0]!           ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vld1.u8         {d1},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vst1.8          {d14},  [r14],  r6
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    add             r14,    r1,     #0
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    add             r1,     r1,     #8
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    addle           r1,     r1,     r9
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vmlal.u8        q6,     d3,     d23
+    add             r10,    r3,     r2,     lsl #3 ; 10*strd - 8+2
+    vmlsl.u8        q6,     d2,     d22
+    add             r10,    r10,    r2      ; 11*strd
+    vmlsl.u8        q6,     d4,     d24
+    vld1.u8         {d2},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vst1.8          {d8},   [r14],  r6      ;vst1_u8(pu1_dst,sto_res);
+    pld             [r10]                   ;11+ 0
+    vmlsl.u8        q6,     d7,     d27
+    pld             [r10,   r2]             ;11+ 1*strd
+    vmlal.u8        q6,     d16,    d28
+    pld             [r10,   r2,     lsl #1] ;11+ 2*strd
+    vmlsl.u8        q6,     d17,    d29
+    add             r10,    r10,    r2      ;12*strd
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    pld             [r10,   r2,     lsl #1] ;11+ 3*strd
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    subs            r7,     r7,     #4
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vld1.u8         {d3},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vhadd.s16       q6,     q6,     q15
+    vdup.16         q4,     r11
+    vmlal.u8        q7,     d7,     d26
+    vld1.u8         {d4},   [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d16,    d27
+    vld1.u8         {d5},   [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d17,    d28
+    vld1.u8         {d6},   [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlsl.u8        q7,     d18,    d29
+    vld1.u8         {d7},   [r3],   r2      ;src_tmp4 = vld1_u8(pu1_src_tmp);
+    vqrshrun.s16    d12,    q6,     #6
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    bgt             main_loop_8             ;jumps to main_loop_8
+
+epilog
+    vmlal.u8        q4,     d1,     d23     ;mul_res1 = vmull_u8(src_tmp2,
+    vmlsl.u8        q4,     d0,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_0);
+    vmlsl.u8        q4,     d2,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_2);
+    vmlal.u8        q4,     d3,     d25     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_3);
+    vhadd.s16       q7,     q7,     q15
+    vdup.16         q5,     r11
+    vmlal.u8        q4,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp1, coeffabs_4);
+    vmlsl.u8        q4,     d5,     d27     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp2, coeffabs_5);
+    vmlal.u8        q4,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; src_tmp3, coeffabs_6);
+    vmlsl.u8        q4,     d7,     d29     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; src_tmp4, coeffabs_7);
+    vst1.8          {d12},  [r14],  r6
+    vqrshrun.s16    d14,    q7,     #6
+    vld1.u8         {d16},  [r3],   r2      ;src_tmp1 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q5,     d2,     d23     ;mul_res2 = vmull_u8(src_tmp3,
+                                            ; coeffabs_1);
+    vmlsl.u8        q5,     d1,     d22     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_0);
+    vmlsl.u8        q5,     d3,     d24     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_2);
+    vmlal.u8        q5,     d4,     d25     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_3);
+    vhadd.s16       q4,     q4,     q15
+    vdup.16         q6,     r11
+    vmlal.u8        q5,     d5,     d26     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp2, coeffabs_4);
+    vmlsl.u8        q5,     d6,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp3, coeffabs_5);
+    vmlal.u8        q5,     d7,     d28     ;mul_res2 = vmlal_u8(mul_res2,
+                                            ; src_tmp4, coeffabs_6);
+    vmlsl.u8        q5,     d16,    d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; src_tmp1, coeffabs_7);
+    vst1.8          {d14},  [r14],  r6
+    vqrshrun.s16    d8,     q4,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d17},  [r3],   r2      ;src_tmp2 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q6,     d3,     d23
+    vmlsl.u8        q6,     d2,     d22
+    vmlsl.u8        q6,     d4,     d24
+    vmlal.u8        q6,     d5,     d25
+    vhadd.s16       q5,     q5,     q15
+    vdup.16         q7,     r11
+    vmlal.u8        q6,     d6,     d26
+    vmlsl.u8        q6,     d7,     d27
+    vmlal.u8        q6,     d16,    d28
+    vmlsl.u8        q6,     d17,    d29
+    add             r14,    r1,     r6
+    vst1.8          {d8},   [r1]!           ;vst1_u8(pu1_dst,sto_res);
+    vqrshrun.s16    d10,    q5,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u8         {d18},  [r3],   r2      ;src_tmp3 = vld1_u8(pu1_src_tmp);
+    vmlal.u8        q7,     d4,     d23
+    vmlsl.u8        q7,     d3,     d22
+    vmlsl.u8        q7,     d5,     d24
+    vmlal.u8        q7,     d6,     d25
+    vhadd.s16       q6,     q6,     q15
+    vmlal.u8        q7,     d7,     d26
+    vmlsl.u8        q7,     d16,    d27
+    vmlal.u8        q7,     d17,    d28
+    vmlsl.u8        q7,     d18,    d29
+    vst1.8          {d10},  [r14],  r6      ;vst1_u8(pu1_dst_tmp,sto_res);
+    vqrshrun.s16    d12,    q6,     #6
+
+epilog_end
+    vst1.8          {d12},  [r14],  r6
+    vhadd.s16       q7,     q7,     q15
+    vqrshrun.s16    d14,    q7,     #6
+    vst1.8          {d14},  [r14],  r6
+
+end_loops
+    tst             r5,     #7
+    ldr             r1,     [sp],   #4
+    ldr             r0,     [sp],   #4
+    vpopeq          {d8  -  d15}
+    ldmfdeq         sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+    mov             r5,     #4
+    add             r0,     r0,     #8
+    add             r1,     r1,     #8
+    mov             r7,     #16
+
+core_loop_wd_4
+    rsb             r9,     r5,     r6,     lsl #2 ;r6->dst_strd    r5    ->wd
+    rsb             r8,     r5,     r2,     lsl #2 ;r2->src_strd
+    vmov.i8         d4,     #0
+
+outer_loop_wd_4
+    subs            r12,    r5,     #0
+    ble             end_inner_loop_wd_4     ;outer loop jump
+
+inner_loop_wd_4
+    add             r3,     r0,     r2
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    subs            r12,    r12,    #4
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vld1.u32        {d4[0]},[r0]            ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 0);
+    vdup.16         q0,     r11
+    vmlal.u8        q0,     d5,     d23     ;mul_res1 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    add             r0,     r0,     #4
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlsl.u8        q0,     d4,     d22     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q0,     d6,     d24     ;mul_res1 = vmlsl_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+    vdup.16         q4,     r11
+    vmlal.u8        q4,     d7,     d23
+    vdup.u32        d4,     d7[1]           ;src_tmp1 = vdup_lane_u32(src_tmp4,
+                                            ; 1);
+    vmull.u8        q1,     d7,     d25     ;mul_res2 =
+                                            ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+    vld1.u32        {d4[1]},[r3],   r2      ;src_tmp1 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp1, 1);
+    vmlsl.u8        q4,     d6,     d22
+    vmlal.u8        q0,     d4,     d26     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+    vdup.u32        d5,     d4[1]           ;src_tmp2 = vdup_lane_u32(src_tmp1,
+                                            ; 1);
+    vmlsl.u8        q4,     d4,     d24
+    vld1.u32        {d5[1]},[r3],   r2      ;src_tmp2 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp2, 1);
+    vmlsl.u8        q1,     d5,     d27     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+    vdup.u32        d6,     d5[1]           ;src_tmp3 = vdup_lane_u32(src_tmp2,
+                                            ; 1);
+    vmlal.u8        q4,     d5,     d25
+    vld1.u32        {d6[1]},[r3],   r2      ;src_tmp3 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp3, 1);
+    vmlal.u8        q0,     d6,     d28     ;mul_res1 = vmlal_u8(mul_res1,
+                                            ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+    vdup.u32        d7,     d6[1]           ;src_tmp4 = vdup_lane_u32(src_tmp3,
+                                            ; 1);
+    vmlal.u8        q4,     d6,     d26
+    vld1.u32        {d7[1]},[r3],   r2      ;src_tmp4 = vld1_lane_u32((uint32_t
+                                            ; *)pu1_src_tmp, src_tmp4, 1);
+    vmlsl.u8        q1,     d7,     d29     ;mul_res2 = vmlsl_u8(mul_res2,
+                                            ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+    vdup.u32        d4,     d7[1]
+    vadd.i16        q0,     q0,     q1      ;mul_res1 = vaddq_u16(mul_res1,
+                                            ; mul_res2);
+    vmlsl.u8        q4,     d7,     d27
+    vld1.u32        {d4[1]},[r3],   r2
+    vmlal.u8        q4,     d4,     d28
+    vdup.u32        d5,     d4[1]
+    vhadd.s16       q0,     q0,     q15
+    vqrshrun.s16    d0,     q0,     #6      ;sto_res = vqmovun_s16(sto_res_tmp);
+    vld1.u32        {d5[1]},[r3]
+    add             r3,     r1,     r6
+    vst1.32         {d0[0]},[r1]            ;vst1_lane_u32((uint32_t *)pu1_dst,
+                                            ; vreinterpret_u32_u8(sto_res), 0);
+    vmlsl.u8        q4,     d5,     d29
+    vst1.32         {d0[1]},[r3],   r6      ;vst1_lane_u32((uint32_t
+                                            ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+    vhadd.s16       q4,     q4,     q15
+    vqrshrun.s16    d8,     q4,     #6
+    vst1.32         {d8[0]},[r3],   r6
+    add             r1,     r1,     #4
+    vst1.32         {d8[1]},[r3]
+    bgt             inner_loop_wd_4
+
+end_inner_loop_wd_4
+    subs            r7,     r7,     #4
+    add             r1,     r1,     r9
+    add             r0,     r0,     r8
+    bgt             outer_loop_wd_4
+
+    vpop            {d8  -  d15}
+    ldmfd           sp!,    {r4  -  r12,    r15} ;reload the registers from sp
+
+    ENDP
+
+    END
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
index 04cb835fa3..8e3ee599f4 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -15,13 +15,13 @@
 
 void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int filter_x_stride,
-                           const int16_t *filter_y, int filter_y_stride, int w,
-                           int h) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   if (w < 8) {  // avg4
     uint8x8_t s0, s1;
@@ -43,7 +43,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // avg8
     uint8x8_t s0, s1, d0, d1;
     uint8x16_t s01, d01;
@@ -64,7 +64,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1_u8(dst, vget_high_u8(d01));
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // avg16
     uint8x16_t s0, s1, d0, d1;
     do {
@@ -83,7 +83,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst, d1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 32) {  // avg32
     uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
     do {
@@ -110,7 +110,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst + 16, d3);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else {  // avg64
     uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3;
     do {
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
index 97e6189fda..efd6574f1f 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -17,7 +17,7 @@
 
 |vpx_convolve_avg_neon| PROC
     push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #32]
+    ldrd                r4, r5, [sp, #36]
     mov                 r6, r2
 
     cmp                 r4, #32
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
index a8f690acd4..bea7c98437 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -9,30 +9,32 @@
  */
 
 #include <arm_neon.h>
+#include <string.h>
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
 void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   if (w < 8) {  // copy4
     do {
-      *(uint32_t *)dst = *(const uint32_t *)src;
+      memcpy(dst, src, 4);
       src += src_stride;
       dst += dst_stride;
-      *(uint32_t *)dst = *(const uint32_t *)src;
+      memcpy(dst, src, 4);
       src += src_stride;
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 8) {  // copy8
     uint8x8_t s0, s1;
     do {
@@ -46,7 +48,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1_u8(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w < 32) {  // copy16
     uint8x16_t s0, s1;
     do {
@@ -60,7 +62,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst, s1);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else if (w == 32) {  // copy32
     uint8x16_t s0, s1, s2, s3;
     do {
@@ -78,7 +80,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
       vst1q_u8(dst + 16, s3);
       dst += dst_stride;
       h -= 2;
-    } while (h > 0);
+    } while (h != 0);
   } else {  // copy64
     uint8x16_t s0, s1, s2, s3;
     do {
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
index 89164ad48b..7a66e3ce2f 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -17,7 +17,7 @@
 
 |vpx_convolve_copy_neon| PROC
     push                {r4-r5, lr}
-    ldrd                r4, r5, [sp, #28]
+    ldrd                r4, r5, [sp, #32]
 
     cmp                 r4, #32
     bgt                 copy64
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
index 6ca0e501b3..de5fa29471 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c
@@ -12,53 +12,61 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
-   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
-   */
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+  // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
+  // maximum buffer size to 64 * (64 + 7) (+1 row to make it divisible by 4).
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+  const int im_stride = 64;
 
-  // Account for the vertical phase needing 3 lines prior and 4 lines post
-  const int intermediate_height = h + 7;
+  const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
+  // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
+  // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.)
+  const int im_height = h + vert_filter_taps;
+  const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* Filter starting 3 lines back. The neon implementation will ignore the given
-   * height and filter a multiple of 4 lines. Since this goes in to the temp
-   * buffer which has lots of extra room and is subsequently discarded this is
-   * safe if somewhat less than ideal.   */
-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
-                           intermediate_height);
+  // Filter starting border_offset rows back. The Neon implementation will
+  // ignore the given height and filter a multiple of 4 lines. Since this goes
+  // into the temporary buffer which has lots of extra room and is subsequently
+  // discarded this is safe if somewhat less than ideal.
+  vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+                           im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+                           y_step_q4, w, im_height);
 
-  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4,
-                          filter_y, y_step_q4, w, h);
+  // Step into the temporary buffer border_offset rows to get actual frame data.
+  vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst,
+                          dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+                          y_step_q4, w, h);
 }
 
 void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
-  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
-  const int intermediate_height = h + 7;
+  DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]);
+  const int im_stride = 64;
+  const int im_height = h + SUBPEL_TAPS;
+  const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
 
   assert(y_step_q4 == 16);
   assert(x_step_q4 == 16);
 
-  /* This implementation has the same issues as above. In addition, we only want
-   * to average the values after both passes.
-   */
-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
-                           intermediate_height);
-  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+  // This implementation has the same issues as above. In addition, we only want
+  // to average the values after both passes.
+  vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride,
+                           im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4,
+                           y_step_q4, w, im_height);
+
+  vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride,
+                              dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+                              y_step_q4, w, h);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
new file mode 100644
index 0000000000..f30143e3e3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Some very useful instructions are exclusive to the SVE2 instruction set.
+// However, we can access these instructions from a predominantly Neon context
+// by making use of the Neon-SVE bridge intrinsics to reinterpret Neon vectors
+// as SVE vectors - with the high part of the SVE vector (if it's longer than
+// 128 bits) being "don't care".
+
+static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1,
+                                     uint16x8_t tbl) {
+  svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0),
+                                      svset_neonq_s16(svundef_s16(), s1));
+  return svget_neonq_s16(
+      svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl)));
+}
+
+static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4],
+                                  int16x8_t res[4], uint16x8_t idx) {
+  res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+  res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+  res[2] = vpx_tbl2_s16(s0[2], s1[2], idx);
+  res[3] = vpx_tbl2_s16(s0[3], s1[3], idx);
+}
+
+static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2],
+                                  int16x8_t res[2], uint16x8_t idx) {
+  res[0] = vpx_tbl2_s16(s0[0], s1[0], idx);
+  res[1] = vpx_tbl2_s16(s0[1], s1[1], idx);
+}
+
+#endif  // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
new file mode 100644
index 0000000000..2a2d89f42a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2024 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// Dot product instructions operating on 16-bit input elements are exclusive to
+// the SVE instruction set. However, we can access these instructions from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+
+static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x,
+                                      uint16x8_t y) {
+  return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+                                   svset_neonq_u16(svundef_u16(), x),
+                                   svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+  return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+                                   svset_neonq_s16(svundef_s16(), x),
+                                   svset_neonq_s16(svundef_s16(), y)));
+}
+
+#define vpx_dotq_lane_s16(acc, x, y, lane)                            \
+  svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \
+                                 svset_neonq_s16(svundef_s16(), x),   \
+                                 svset_neonq_s16(svundef_s16(), y), lane))
+
+static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) {
+  return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data),
+                                   svset_neonq_u16(svundef_u16(), indices)));
+}
+
+static INLINE int16x8_t vpx_tbl_s16(int16x8_t data, uint16x8_t indices) {
+  return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), data),
+                                   svset_neonq_u16(svundef_u16(), indices)));
+}
+
+#endif  // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
new file mode 100644
index 0000000000..f40b6a907f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -0,0 +1,316 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void scaledconvolve_horiz_neon(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const x_filter,
+    const int x0_q4, const int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  if (w == 4) {
+    do {
+      int x_q4 = x0_q4;
+
+      // Process a 4x4 tile.
+      for (int r = 0; r < 4; ++r) {
+        const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
+
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+          uint8x8_t t0, t1, t2, t3;
+          load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+          transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+          int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+          int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+          int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+          int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+          int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+          int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+          int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+          int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+          int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+          uint8x8_t d0 =
+              vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+          store_u8_4x1(&temp[4 * r], d0);
+        } else {
+          // Memcpy for non-subpel locations.
+          s += SUBPEL_TAPS / 2 - 1;
+
+          for (int c = 0; c < 4; ++c) {
+            temp[r * 4 + c] = s[c * src_stride];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // Transpose the 4x4 result tile and store.
+      uint8x8_t d01 = vld1_u8(temp + 0);
+      uint8x8_t d23 = vld1_u8(temp + 8);
+
+      transpose_u8_4x4(&d01, &d23);
+
+      store_u8_4x1(dst + 0 * dst_stride, d01);
+      store_u8_4x1(dst + 1 * dst_stride, d23);
+      store_u8_4x1_high(dst + 2 * dst_stride, d01);
+      store_u8_4x1_high(dst + 3 * dst_stride, d23);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+    return;
+  }
+
+  do {
+    int x_q4 = x0_q4;
+    uint8_t *d = dst;
+    int width = w;
+
+    do {
+      // Process an 8x8 tile.
+      for (int r = 0; r < 8; ++r) {
+        const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];
+
+        if (x_q4 & SUBPEL_MASK) {
+          const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]);
+
+          uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+          int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+          int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+          int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+          int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+          int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+          int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+          int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+          int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+          uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+          vst1_u8(&temp[r * 8], d0);
+        } else {
+          // Memcpy for non-subpel locations.
+          s += SUBPEL_TAPS / 2 - 1;
+
+          for (int c = 0; c < 8; ++c) {
+            temp[r * 8 + c] = s[c * src_stride];
+          }
+        }
+        x_q4 += x_step_q4;
+      }
+
+      // Transpose the 8x8 result tile and store.
+      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
+      load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+      transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
+
+      store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+      d += 8;
+      width -= 8;
+    } while (width != 0);
+
+    src += 8 * src_stride;
+    dst += 8 * dst_stride;
+    h -= 8;
+  } while (h > 0);
+}
+
+static INLINE void scaledconvolve_vert_neon(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filter,
+    const int y0_q4, const int y_step_q4, int w, int h) {
+  int y_q4 = y0_q4;
+
+  if (w == 4) {
+    do {
+      const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+      if (y_q4 & SUBPEL_MASK) {
+        const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+        int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+        int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+        int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7)));
+
+        int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+        uint8x8_t d0 =
+            vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS);
+
+        store_u8_4x1(dst, d0);
+      } else {
+        // Memcpy for non-subpel locations.
+        memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4);
+      }
+
+      y_q4 += y_step_q4;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  if (w == 8) {
+    do {
+      const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+
+      if (y_q4 & SUBPEL_MASK) {
+        const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+        uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+
+        vst1_u8(dst, d0);
+      } else {
+        // Memcpy for non-subpel locations.
+        memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8);
+      }
+
+      y_q4 += y_step_q4;
+      dst += dst_stride;
+    } while (--h != 0);
+    return;
+  }
+
+  do {
+    const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    uint8_t *d = dst;
+    int width = w;
+
+    if (y_q4 & SUBPEL_MASK) {
+      do {
+        const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]);
+
+        uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+        load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+        int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+        s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+        s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+        s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+        s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+        s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4)));
+        s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5)));
+        s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6)));
+        s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7)));
+
+        s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+        s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+        s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+        s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+        s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4)));
+        s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5)));
+        s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6)));
+        s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7)));
+
+        uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0],
+                                   s6[0], s7[0], filter);
+        uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1],
+                                   s6[1], s7[1], filter);
+
+        vst1q_u8(d, vcombine_u8(d0, d1));
+
+        s += 16;
+        d += 16;
+        width -= 16;
+      } while (width != 0);
+    } else {
+      // Memcpy for non-subpel locations.
+      s += (SUBPEL_TAPS / 2 - 1) * src_stride;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s);
+        vst1q_u8(d, s0);
+        s += 16;
+        d += 16;
+        width -= 16;
+      } while (width != 0);
+    }
+
+    y_q4 += y_step_q4;
+    dst += dst_stride;
+  } while (--h != 0);
+}
+
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  // Fixed size intermediate buffer, im_block, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the im_block buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]);
+  const int im_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+  const ptrdiff_t im_stride = 64;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                            src_stride, im_block, im_stride, filter, x0_q4,
+                            x_step_q4, w, im_height);
+
+  scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4,
+                           y_step_q4, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/avg.c b/media/libvpx/libvpx/vpx_dsp/avg.c
index 4d9abb8de3..a8dcab7dae 100644
--- a/media/libvpx/libvpx/vpx_dsp/avg.c
+++ b/media/libvpx/libvpx/vpx_dsp/avg.c
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include <assert.h>
 #include <stdlib.h>
 
 #include "./vpx_dsp_rtcd.h"
@@ -32,9 +34,169 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
   return (sum + 8) >> 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+// src_diff: 13 bit, dynamic range [-4095, 4095]
+// coeff: 16 bit
+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
+                                            ptrdiff_t src_stride,
+                                            int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// src_diff: 16 bit, dynamic range [-32760, 32760]
+// coeff: 19 bit
+static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
+                                             ptrdiff_t src_stride,
+                                             int32_t *coeff) {
+  int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int32_t c0 = b0 + b2;
+  int32_t c1 = b1 + b3;
+  int32_t c2 = b0 - b2;
+  int32_t c3 = b1 - b3;
+  int32_t c4 = b4 + b6;
+  int32_t c5 = b5 + b7;
+  int32_t c6 = b4 - b6;
+  int32_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped.
+void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                               tran_low_t *coeff) {
+  int idx;
+  int16_t buffer[64];
+  int32_t buffer2[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    // src_diff: 13 bit
+    // buffer: 16 bit, dynamic range [-32760, 32760]
+    hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    // buffer: 16 bit
+    // buffer2: 19 bit, dynamic range [-262080, 262080]
+    hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
+    ++tmp_buf;
+  }
+
+  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
+
+// In place 16x16 2D Hadamard transform
+void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  // coeff: 19 bit, dynamic range [-262080, 262080]
+  for (idx = 0; idx < 64; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[64];
+    tran_low_t a2 = coeff[128];
+    tran_low_t a3 = coeff[192];
+
+    tran_low_t b0 = (a0 + a1) >> 1;
+    tran_low_t b1 = (a0 - a1) >> 1;
+    tran_low_t b2 = (a2 + a3) >> 1;
+    tran_low_t b3 = (a2 - a3) >> 1;
+
+    // new coeff dynamic range: 20 bit
+    coeff[0] = b0 + b2;
+    coeff[64] = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+  }
+
+  // coeff: 20 bit
+  for (idx = 0; idx < 256; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[256];
+    tran_low_t a2 = coeff[512];
+    tran_low_t a3 = coeff[768];
+
+    tran_low_t b0 = (a0 + a1) >> 2;
+    tran_low_t b1 = (a0 - a1) >> 2;
+    tran_low_t b2 = (a2 + a3) >> 2;
+    tran_low_t b3 = (a2 - a3) >> 2;
+
+    // new coeff dynamic range: 20 bit
+    coeff[0] = b0 + b2;
+    coeff[256] = b1 + b3;
+    coeff[512] = b0 - b2;
+    coeff[768] = b1 - b3;
+
+    ++coeff;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
+static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
                           int16_t *coeff) {
   int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
   int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
@@ -66,10 +228,11 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
 
 // The order of the output coeff of the hadamard is not important. For
 // optimization purposes the final transpose may be skipped.
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
-                        int16_t *coeff) {
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                        tran_low_t *coeff) {
   int idx;
   int16_t buffer[64];
+  int16_t buffer2[64];
   int16_t *tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
     hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
@@ -80,17 +243,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
 
   tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;                        // coeff: 15 bit
-                                       // dynamic range [-16320, 16320]
+    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
+    // dynamic range [-2040, 2040]
+    // buffer2: 15 bit
+    // dynamic range [-16320, 16320]
     ++tmp_buf;
   }
+
+  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
 }
 
 // In place 16x16 2D Hadamard transform
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                          tran_low_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
@@ -101,15 +266,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
 
   // coeff: 15 bit, dynamic range [-16320, 16320]
   for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[64];
+    tran_low_t a2 = coeff[128];
+    tran_low_t a3 = coeff[192];
 
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
+    tran_low_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    tran_low_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    tran_low_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    tran_low_t b3 = (a2 - a3) >> 1;
 
     coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
     coeff[64] = b1 + b3;
@@ -120,9 +285,53 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
   }
 }
 
+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                          tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+  }
+
+  // coeff: 16 bit, dynamic range [-32768, 32767]
+  for (idx = 0; idx < 256; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[256];
+    tran_low_t a2 = coeff[512];
+    tran_low_t a3 = coeff[768];
+
+    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 17 bit, [-65536, 65535]
+    tran_low_t b1 = (a0 - a1) >> 2;  // b0-b3: 15 bit, dynamic range
+    tran_low_t b2 = (a2 + a3) >> 2;  // [-16384, 16383]
+    tran_low_t b3 = (a2 - a3) >> 2;
+
+    coeff[0] = b0 + b2;  // 16 bit, [-32768, 32767]
+    coeff[256] = b1 + b3;
+    coeff[512] = b0 - b2;
+    coeff[768] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// coeff: dynamic range 20 bit.
+// length: value range {16, 64, 256, 1024}.
+int vpx_highbd_satd_c(const tran_low_t *coeff, int length) {
+  int i;
+  int satd = 0;
+  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+  // satd: 30 bits
+  return satd;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_c(const int16_t *coeff, int length) {
+int vpx_satd_c(const tran_low_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i) satd += abs(coeff[i]);
@@ -137,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                        const int ref_stride, const int height) {
   int idx;
   const int norm_factor = height >> 1;
+  assert(height >= 2);
   for (idx = 0; idx < 16; ++idx) {
     int i;
     hbuf[idx] = 0;
@@ -218,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
   int i, j;
   const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
   const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
+  *min = 65535;
   *max = 0;
   for (i = 0; i < 8; ++i, s += p, d += dp) {
     for (j = 0; j < 8; ++j) {
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader.h b/media/libvpx/libvpx/vpx_dsp/bitreader.h
index 6ee2a58632..a5927ea2ad 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitreader.h
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader.h
@@ -8,10 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_BITREADER_H_
-#define VPX_DSP_BITREADER_H_
+#ifndef VPX_VPX_DSP_BITREADER_H_
+#define VPX_VPX_DSP_BITREADER_H_
 
 #include <stddef.h>
+#include <stdio.h>
 #include <limits.h>
 
 #include "./vpx_config.h"
@@ -19,6 +20,9 @@
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/prob.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
 
 #ifdef __cplusplus
 extern "C" {
@@ -94,7 +98,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) {
   }
 
   {
-    register int shift = vpx_norm[range];
+    const unsigned char shift = vpx_norm[(unsigned char)range];
     range <<= shift;
     value <<= shift;
     count -= shift;
@@ -103,6 +107,31 @@ static INLINE int vpx_read(vpx_reader *r, int prob) {
   r->count = count;
   r->range = range;
 
+#if CONFIG_BITSTREAM_DEBUG
+  {
+    const int queue_r = bitstream_queue_get_read();
+    const int frame_idx = bitstream_queue_get_frame_read();
+    int ref_result, ref_prob;
+    bitstream_queue_pop(&ref_result, &ref_prob);
+    if ((int)bit != ref_result) {
+      fprintf(stderr,
+              "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d "
+              "queue_r %d\n",
+              frame_idx, bit, ref_result, queue_r);
+
+      assert(0);
+    }
+    if (prob != ref_prob) {
+      fprintf(stderr,
+              "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d "
+              "queue_r %d\n",
+              frame_idx, prob, ref_prob, queue_r);
+
+      assert(0);
+    }
+  }
+#endif
+
   return bit;
 }
 
@@ -131,4 +160,4 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree,
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_BITREADER_H_
+#endif  // VPX_VPX_DSP_BITREADER_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c
index e99fffb605..f59f1f7cb9 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c
@@ -23,7 +23,7 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) {
     rb->bit_offset = off + 1;
     return bit;
   } else {
-    rb->error_handler(rb->error_handler_data);
+    if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data);
     return 0;
   }
 }
@@ -40,11 +40,5 @@ int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
 }
 
 int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) {
-#if CONFIG_MISC_FIXES
-  const int nbits = sizeof(unsigned) * 8 - bits - 1;
-  const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
-  return ((int)value) >> nbits;
-#else
   return vpx_rb_read_signed_literal(rb, bits);
-#endif
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h
index 8a48a95ed1..b27703a4db 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h
+++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_BITREADER_BUFFER_H_
-#define VPX_DSP_BITREADER_BUFFER_H_
+#ifndef VPX_VPX_DSP_BITREADER_BUFFER_H_
+#define VPX_VPX_DSP_BITREADER_BUFFER_H_
 
 #include <limits.h>
 
@@ -44,4 +44,4 @@ int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits);
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_BITREADER_BUFFER_H_
+#endif  // VPX_VPX_DSP_BITREADER_BUFFER_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.c b/media/libvpx/libvpx/vpx_dsp/bitwriter.c
index 81e28b309f..d3ef9bd89a 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitwriter.c
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.c
@@ -9,23 +9,47 @@
  */
 
 #include <assert.h>
+#include <limits.h>
 
 #include "./bitwriter.h"
 
-void vpx_start_encode(vpx_writer *br, uint8_t *source) {
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif
+
+void vpx_start_encode(vpx_writer *br, uint8_t *source, size_t size) {
   br->lowvalue = 0;
   br->range = 255;
   br->count = -24;
-  br->buffer = source;
+  br->error = 0;
   br->pos = 0;
+  // Make sure it is safe to cast br->pos to int in vpx_write().
+  if (size > INT_MAX) size = INT_MAX;
+  br->size = (unsigned int)size;
+  br->buffer = source;
   vpx_write_bit(br, 0);
 }
 
-void vpx_stop_encode(vpx_writer *br) {
+int vpx_stop_encode(vpx_writer *br) {
   int i;
 
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_set_skip_write(1);
+#endif
   for (i = 0; i < 32; i++) vpx_write_bit(br, 0);
 
   // Ensure there's no ambigous collision with any index marker bytes
-  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
+  if (!br->error && (br->buffer[br->pos - 1] & 0xe0) == 0xc0) {
+    if (br->pos < br->size) {
+      br->buffer[br->pos++] = 0;
+    } else {
+      br->error = 1;
+    }
+  }
+
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_set_skip_write(0);
+#endif
+
+  return br->error ? -1 : 0;
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.h b/media/libvpx/libvpx/vpx_dsp/bitwriter.h
index 41040cf935..daff331daf 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitwriter.h
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.h
@@ -8,12 +8,18 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_BITWRITER_H_
-#define VPX_DSP_BITWRITER_H_
+#ifndef VPX_VPX_DSP_BITWRITER_H_
+#define VPX_VPX_DSP_BITWRITER_H_
 
+#include <stdio.h>
+
+#include "vpx_ports/compiler_attributes.h"
 #include "vpx_ports/mem.h"
 
 #include "vpx_dsp/prob.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,19 +29,43 @@ typedef struct vpx_writer {
   unsigned int lowvalue;
   unsigned int range;
   int count;
+  // Whether there has been an error.
+  int error;
+  // We maintain the invariant that pos <= size, i.e., we never write beyond
+  // the end of the buffer. If pos would be incremented to be greater than
+  // size, leave pos unchanged and set error to 1.
   unsigned int pos;
+  unsigned int size;
   uint8_t *buffer;
 } vpx_writer;
 
-void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
-void vpx_stop_encode(vpx_writer *bc);
+void vpx_start_encode(vpx_writer *br, uint8_t *source, size_t size);
+// Returns 0 on success and returns -1 in case of error.
+int vpx_stop_encode(vpx_writer *br);
 
-static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
+static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br,
+                                                         int bit,
+                                                         int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
   unsigned int lowvalue = br->lowvalue;
-  register int shift;
+  int shift;
+
+#if CONFIG_BITSTREAM_DEBUG
+  /*
+  int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = bitstream_queue_get_frame_write();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+            frame_idx_w, queue_w);
+    assert(0);
+  }
+  */
+  bitstream_queue_push(bit, probability);
+#endif
 
   split = 1 + (((range - 1) * probability) >> 8);
 
@@ -54,18 +84,25 @@ static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   if (count >= 0) {
     int offset = shift - count;
 
-    if ((lowvalue << (offset - 1)) & 0x80000000) {
-      int x = br->pos - 1;
+    if (!br->error) {
+      if ((lowvalue << (offset - 1)) & 0x80000000) {
+        int x = (int)br->pos - 1;
 
-      while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = 0;
-        x--;
+        while (x >= 0 && br->buffer[x] == 0xff) {
+          br->buffer[x] = 0;
+          x--;
+        }
+
+        // TODO(wtc): How to prove x >= 0?
+        br->buffer[x] += 1;
       }
 
-      br->buffer[x] += 1;
+      if (br->pos < br->size) {
+        br->buffer[br->pos++] = (lowvalue >> (24 - offset)) & 0xff;
+      } else {
+        br->error = 1;
+      }
     }
-
-    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
     lowvalue <<= offset;
     shift = count;
     lowvalue &= 0xffffff;
@@ -94,4 +131,4 @@ static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) {
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_BITWRITER_H_
+#endif  // VPX_VPX_DSP_BITWRITER_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c
index 1043cdc784..b3a2490f2c 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c
@@ -8,24 +8,43 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <limits.h>
 #include <stdlib.h>
 
 #include "./vpx_config.h"
 #include "./bitwriter_buffer.h"
 
+void vpx_wb_init(struct vpx_write_bit_buffer *wb, uint8_t *bit_buffer,
+                 size_t size) {
+  wb->error = 0;
+  wb->bit_offset = 0;
+  wb->size = size;
+  wb->bit_buffer = bit_buffer;
+}
+
+int vpx_wb_has_error(const struct vpx_write_bit_buffer *wb) {
+  return wb->error;
+}
+
 size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) {
+  assert(!wb->error);
   return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
 }
 
 void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) {
+  if (wb->error) return;
   const int off = (int)wb->bit_offset;
   const int p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if ((size_t)p >= wb->size) {
+    wb->error = 1;
+    return;
+  }
   if (q == CHAR_BIT - 1) {
     wb->bit_buffer[p] = bit << q;
   } else {
-    wb->bit_buffer[p] &= ~(1 << q);
+    assert((wb->bit_buffer[p] & (1 << q)) == 0);
     wb->bit_buffer[p] |= bit << q;
   }
   wb->bit_offset = off + 1;
@@ -38,10 +57,6 @@ void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) {
 
 void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
                                      int bits) {
-#if CONFIG_MISC_FIXES
-  vpx_wb_write_literal(wb, data, bits + 1);
-#else
   vpx_wb_write_literal(wb, abs(data), bits);
   vpx_wb_write_bit(wb, data < 0);
-#endif
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h
index a123a2fe8c..3ee0e9658b 100644
--- a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h
+++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_BITWRITER_BUFFER_H_
-#define VPX_DSP_BITWRITER_BUFFER_H_
+#ifndef VPX_VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_VPX_DSP_BITWRITER_BUFFER_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -18,10 +18,24 @@ extern "C" {
 #endif
 
 struct vpx_write_bit_buffer {
-  uint8_t *bit_buffer;
+  // Whether there has been an error.
+  int error;
+  // We maintain the invariant that bit_offset <= size * CHAR_BIT, i.e., we
+  // never write beyond the end of bit_buffer. If bit_offset would be
+  // incremented to be greater than size * CHAR_BIT, leave bit_offset unchanged
+  // and set error to 1.
   size_t bit_offset;
+  // Size of bit_buffer in bytes.
+  size_t size;
+  uint8_t *bit_buffer;
 };
 
+void vpx_wb_init(struct vpx_write_bit_buffer *wb, uint8_t *bit_buffer,
+                 size_t size);
+
+int vpx_wb_has_error(const struct vpx_write_bit_buffer *wb);
+
+// Must not be called if vpx_wb_has_error(wb) returns true.
 size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb);
 
 void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit);
@@ -35,4 +49,4 @@ void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_BITWRITER_BUFFER_H_
+#endif  // VPX_VPX_DSP_BITWRITER_BUFFER_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/deblock.c b/media/libvpx/libvpx/vpx_dsp/deblock.c
index 6c27484979..455b73bbce 100644
--- a/media/libvpx/libvpx/vpx_dsp/deblock.c
+++ b/media/libvpx/libvpx/vpx_dsp/deblock.c
@@ -7,7 +7,9 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <assert.h>
 #include <stdlib.h>
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
 const int16_t vpx_rv[] = {
@@ -37,32 +39,36 @@ const int16_t vpx_rv[] = {
   9,  10, 13,
 };
 
-void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line, int cols,
-                                            unsigned char *f, int size) {
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,
+                                            unsigned char *dst, int src_pitch,
+                                            int dst_pitch, int cols,
+                                            unsigned char *flimits, int size) {
   unsigned char *p_src, *p_dst;
   int row;
   int col;
   unsigned char v;
   unsigned char d[4];
 
+  assert(size >= 8);
+  assert(cols >= 8);
+
   for (row = 0; row < size; row++) {
     /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
+    p_src = src;
+    p_dst = dst;
 
     for (col = 0; col < cols; col++) {
-      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
-      unsigned char p_above1 = p_src[col - src_pixels_per_line];
-      unsigned char p_below1 = p_src[col + src_pixels_per_line];
-      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+      unsigned char p_above2 = p_src[col - 2 * src_pitch];
+      unsigned char p_above1 = p_src[col - src_pitch];
+      unsigned char p_below1 = p_src[col + src_pitch];
+      unsigned char p_below2 = p_src[col + 2 * src_pitch];
 
       v = p_src[col];
 
-      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
-          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+      if ((abs(v - p_above2) < flimits[col]) &&
+          (abs(v - p_above1) < flimits[col]) &&
+          (abs(v - p_below1) < flimits[col]) &&
+          (abs(v - p_below2) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_above2 + p_above1 + 1) >> 1;
         k2 = (p_below2 + p_below1 + 1) >> 1;
@@ -74,8 +80,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     }
 
     /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
+    p_src = dst;
+    p_dst = dst;
 
     p_src[-2] = p_src[-1] = p_src[0];
     p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
@@ -83,10 +89,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     for (col = 0; col < cols; col++) {
       v = p_src[col];
 
-      if ((abs(v - p_src[col - 2]) < f[col]) &&
-          (abs(v - p_src[col - 1]) < f[col]) &&
-          (abs(v - p_src[col + 1]) < f[col]) &&
-          (abs(v - p_src[col + 2]) < f[col])) {
+      if ((abs(v - p_src[col - 2]) < flimits[col]) &&
+          (abs(v - p_src[col - 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 2]) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
         k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@@ -104,8 +110,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     p_dst[col - 1] = d[(col - 1) & 3];
 
     /* next row */
-    src_ptr += src_pixels_per_line;
-    dst_ptr += dst_pixels_per_line;
+    src += src_pitch;
+    dst += dst_pitch;
   }
 }
 
diff --git a/media/libvpx/libvpx/vpx_dsp/fastssim.c b/media/libvpx/libvpx/vpx_dsp/fastssim.c
index 0469071a17..4d32a02a55 100644
--- a/media/libvpx/libvpx/vpx_dsp/fastssim.c
+++ b/media/libvpx/libvpx/vpx_dsp/fastssim.c
@@ -47,7 +47,7 @@ struct fs_ctx {
   unsigned *col_buf;
 };
 
-static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
   unsigned char *data;
   size_t data_size;
   int lw;
@@ -71,6 +71,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
     lh = (lh + 1) >> 1;
   }
   data = (unsigned char *)malloc(data_size);
+  if (!data) return -1;
   _ctx->level = (fs_level *)data;
   _ctx->nlevels = _nlevels;
   data += _nlevels * sizeof(*_ctx->level);
@@ -95,6 +96,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
     lh = (lh + 1) >> 1;
   }
   _ctx->col_buf = (unsigned *)data;
+  return 0;
 }
 
 static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
@@ -128,10 +130,12 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) {
       int i1;
       i0 = 2 * i;
       i1 = FS_MINI(i0 + 1, w2);
-      dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
-                        src1[j1offs + i0] + src1[j1offs + i1];
-      dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
-                        src2[j1offs + i0] + src2[j1offs + i1];
+      dst1[j * w + i] =
+          (uint32_t)((int64_t)src1[j0offs + i0] + src1[j0offs + i1] +
+                     src1[j1offs + i0] + src1[j1offs + i1]);
+      dst2[j * w + i] =
+          (uint32_t)((int64_t)src2[j0offs + i0] + src2[j0offs + i1] +
+                     src2[j1offs + i0] + src2[j1offs + i1]);
     }
   }
 }
@@ -220,12 +224,12 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
   ssim = _ctx->level[_l].ssim;
   c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
   for (j = 0; j < h; j++) {
-    unsigned mux;
-    unsigned muy;
+    int64_t mux;
+    int64_t muy;
     int i0;
     int i1;
-    mux = 5 * col_sums_x[0];
-    muy = 5 * col_sums_y[0];
+    mux = (int64_t)5 * col_sums_x[0];
+    muy = (int64_t)5 * col_sums_y[0];
     for (i = 1; i < 4; i++) {
       i1 = FS_MINI(i, w - 1);
       mux += col_sums_x[i1];
@@ -237,8 +241,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
       if (i + 1 < w) {
         i0 = FS_MAXI(0, i - 4);
         i1 = FS_MINI(i + 4, w - 1);
-        mux += col_sums_x[i1] - col_sums_x[i0];
-        muy += col_sums_x[i1] - col_sums_x[i0];
+        mux += (int)col_sums_x[i1] - (int)col_sums_x[i0];
+        muy += (int)col_sums_x[i1] - (int)col_sums_x[i0];
       }
     }
     if (j + 1 < h) {
@@ -246,8 +250,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
       for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
       for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
       j1offs = FS_MINI(j + 4, h - 1) * w;
-      for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
-      for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+      for (i = 0; i < w; i++)
+        col_sums_x[i] = (uint32_t)((int64_t)col_sums_x[i] + im1[j1offs + i]);
+      for (i = 0; i < w; i++)
+        col_sums_y[i] = (uint32_t)((int64_t)col_sums_y[i] + im2[j1offs + i]);
     }
   }
 }
@@ -343,18 +349,18 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
   for (j = 0; j < h + 4; j++) {
     if (j < h - 1) {
       for (i = 0; i < w - 1; i++) {
-        unsigned g1;
-        unsigned g2;
-        unsigned gx;
-        unsigned gy;
-        g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
-        g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
+        int64_t g1;
+        int64_t g2;
+        int64_t gx;
+        int64_t gy;
+        g1 = labs((int64_t)im1[(j + 1) * w + i + 1] - (int64_t)im1[j * w + i]);
+        g2 = labs((int64_t)im1[(j + 1) * w + i] - (int64_t)im1[j * w + i + 1]);
         gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
-        g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
-        g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
-        gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
-        gx_buf[(j & 7) * stride + i + 4] = gx;
-        gy_buf[(j & 7) * stride + i + 4] = gy;
+        g1 = labs((int64_t)im2[(j + 1) * w + i + 1] - (int64_t)im2[j * w + i]);
+        g2 = labs((int64_t)im2[(j + 1) * w + i] - (int64_t)im2[j * w + i + 1]);
+        gy = ((int64_t)4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2));
+        gx_buf[(j & 7) * stride + i + 4] = (uint32_t)gx;
+        gy_buf[(j & 7) * stride + i + 4] = (uint32_t)gy;
       }
     } else {
       memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
@@ -452,7 +458,7 @@ static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
   double ret;
   int l;
   ret = 1;
-  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+  if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0;
   fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
                        _shift);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c
index aa59601094..ef66de0247 100644
--- a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c
+++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c
@@ -84,14 +84,14 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
   for (r = 0; r < 4; ++r)
     for (c = 0; c < 4; ++c) sum += input[r * stride + c];
 
-  output[0] = sum << 1;
+  output[0] = sum * 2;
 }
 
-void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
   tran_low_t intermediate[64];
   int pass;
-  tran_low_t *output = intermediate;
+  tran_low_t *out = intermediate;
   const tran_low_t *in = NULL;
 
   // Transform columns
@@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
       t1 = (x0 - x1) * cospi_16_64;
       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0] = (tran_low_t)fdct_round_shift(t0);
-      output[2] = (tran_low_t)fdct_round_shift(t2);
-      output[4] = (tran_low_t)fdct_round_shift(t1);
-      output[6] = (tran_low_t)fdct_round_shift(t3);
+      out[0] = (tran_low_t)fdct_round_shift(t0);
+      out[2] = (tran_low_t)fdct_round_shift(t2);
+      out[4] = (tran_low_t)fdct_round_shift(t1);
+      out[6] = (tran_low_t)fdct_round_shift(t3);
 
       // Stage 2
       t0 = (s6 - s5) * cospi_16_64;
@@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1] = (tran_low_t)fdct_round_shift(t0);
-      output[3] = (tran_low_t)fdct_round_shift(t2);
-      output[5] = (tran_low_t)fdct_round_shift(t1);
-      output[7] = (tran_low_t)fdct_round_shift(t3);
-      output += 8;
+      out[1] = (tran_low_t)fdct_round_shift(t0);
+      out[3] = (tran_low_t)fdct_round_shift(t2);
+      out[5] = (tran_low_t)fdct_round_shift(t1);
+      out[7] = (tran_low_t)fdct_round_shift(t3);
+      out += 8;
     }
     in = intermediate;
-    output = final_output;
+    out = output;
   }
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+    for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;
   }
 }
 
@@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
-  tran_high_t output[32 * 32];
+  tran_high_t out[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
@@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     vpx_fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     vpx_fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
+      output[j + i * 32] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
   }
 }
@@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
 // Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
-  tran_high_t output[32 * 32];
+  tran_high_t out[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
@@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
       //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     vpx_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+    for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];
   }
 }
 
@@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
   vpx_fdct4x4_c(input, output, stride);
 }
 
-void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,
                           int stride) {
-  vpx_fdct8x8_c(input, final_output, stride);
+  vpx_fdct8x8_c(input, output, stride);
 }
 
-void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,
                             int stride) {
-  vpx_fdct8x8_1_c(input, final_output, stride);
+  vpx_fdct8x8_1_c(input, output, stride);
 }
 
 void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
@@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
   vpx_fdct16x16_1_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  vpx_fdct32x32_c(input, out, stride);
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vpx_fdct32x32_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,
                                int stride) {
-  vpx_fdct32x32_rd_c(input, out, stride);
+  vpx_fdct32x32_rd_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,
                               int stride) {
-  vpx_fdct32x32_1_c(input, out, stride);
+  vpx_fdct32x32_1_c(input, output, stride);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h
index 29e139c73b..a43c8ea7f7 100644
--- a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h
+++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_FWD_TXFM_H_
-#define VPX_DSP_FWD_TXFM_H_
+#ifndef VPX_VPX_DSP_FWD_TXFM_H_
+#define VPX_VPX_DSP_FWD_TXFM_H_
 
 #include "vpx_dsp/txfm_common.h"
 
@@ -22,4 +22,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
 }
 
 void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif  // VPX_DSP_FWD_TXFM_H_
+#endif  // VPX_VPX_DSP_FWD_TXFM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/intrapred.c b/media/libvpx/libvpx/vpx_dsp/intrapred.c
index eca17a983e..4b37deef0e 100644
--- a/media/libvpx/libvpx/vpx_dsp/intrapred.c
+++ b/media/libvpx/libvpx/vpx_dsp/intrapred.c
@@ -14,7 +14,7 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 
-#define DST(x, y) dst[(x) + (y)*stride]
+#define DST(x, y) dst[(x) + (y) * stride]
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
@@ -42,23 +42,6 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
       dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
 }
 
-#if CONFIG_MISC_FIXES
-static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                   const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  (void)above;
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
-                            left[(c >> 1) + r + 2])
-                     : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
-    }
-    dst += stride;
-  }
-}
-#endif  // CONFIG_MISC_FIXES
-
 static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -76,22 +59,6 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
-#if CONFIG_MISC_FIXES
-static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                  const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  (void)left;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
-                            above[(r >> 1) + c + 2])
-                     : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
-    }
-    dst += stride;
-  }
-}
-#endif  // CONFIG_MISC_FIXES
-
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   const uint8_t above_right = above[bs - 1];
@@ -111,21 +78,6 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
-#if CONFIG_MISC_FIXES
-static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
-                                  const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  (void)left;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = AVG3(above[r + c], above[r + c + 1],
-                    above[r + c + 1 + (r + c + 2 < bs * 2)]);
-    }
-    dst += stride;
-  }
-}
-#endif  // CONFIG_MISC_FIXES
-
 static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -367,7 +319,7 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
   DST(3, 3) = AVG3(E, F, G);  // differs from vp8
 }
 
-void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
   const int A = above[0];
   const int B = above[1];
@@ -533,75 +485,46 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
   }
 }
 
-#if CONFIG_MISC_FIXES
-static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                          int bs, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  int r, c;
-  (void)above;
-  (void)bd;
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1],
-                            left[(c >> 1) + r + 2])
-                     : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]);
-    }
-    dst += stride;
-  }
-}
-#endif  // CONFIG_MISC_FIXES
-
 static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
   int r, c;
+  int size;
   (void)left;
   (void)bd;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1],
-                            above[(r >> 1) + c + 2])
-                     : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]);
-    }
-    dst += stride;
+  for (c = 0; c < bs; ++c) {
+    dst[c] = AVG2(above[c], above[c + 1]);
+    dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
+  }
+  for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
+    memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst));
+    vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
+    memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1),
+           size * sizeof(*dst));
+    vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
   }
 }
 
-#define highbd_d63e_predictor highbd_d63_predictor
-
 static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
-  int r, c;
+  const uint16_t above_right = above[bs - 1];
+  const uint16_t *const dst_row0 = dst;
+  int x, size;
   (void)left;
   (void)bd;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = r + c + 2 < bs * 2
-                   ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
-                   : above[bs * 2 - 1];
-    }
-    dst += stride;
-  }
-}
 
-#if CONFIG_MISC_FIXES
-static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
-                                         int bs, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  int r, c;
-  (void)left;
-  (void)bd;
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      dst[c] = AVG3(above[r + c], above[r + c + 1],
-                    above[r + c + 1 + (r + c + 2 < bs * 2)]);
-    }
+  for (x = 0; x < bs - 1; ++x) {
+    dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+  }
+  dst[bs - 1] = above_right;
+  dst += stride;
+  for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+    memcpy(dst, dst_row0 + x, size * sizeof(*dst));
+    vpx_memset16(dst + size, above_right, x + 1);
     dst += stride;
   }
 }
-#endif  // CONFIG_MISC_FIXES
 
 static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
@@ -633,19 +556,30 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
 static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
-  int r, c;
+  int i;
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+  // silence a spurious -Warray-bounds warning, possibly related to:
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+  uint16_t border[69];
+#else
+  uint16_t border[32 + 32 - 1];  // outer border from bottom-left to top-right
+#endif
   (void)bd;
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
 
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bs; ++r)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+  // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
+  for (i = 0; i < bs - 2; ++i) {
+    border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
+  }
+  border[bs - 2] = AVG3(above[-1], left[0], left[1]);
+  border[bs - 1] = AVG3(left[0], above[-1], above[0]);
+  border[bs - 0] = AVG3(above[-1], above[0], above[1]);
+  // dst[0][2, size), i.e., remaining top border ascending
+  for (i = 0; i < bs - 2; ++i) {
+    border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
+  }
 
-  dst += stride;
-  for (r = 1; r < bs; ++r) {
-    for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1];
-    dst += stride;
+  for (i = 0; i < bs; ++i) {
+    memcpy(dst + i * stride, border + bs - 1 - i, bs * sizeof(dst[0]));
   }
 }
 
@@ -776,6 +710,144 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
     dst += stride;
   }
 }
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  (void)above;
+  (void)bd;
+  DST(0, 0) = AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) = AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  const int E = above[4];
+  const int F = above[5];
+  const int G = above[6];
+  (void)left;
+  (void)bd;
+  DST(0, 0) = AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+  DST(3, 2) = AVG2(E, F);  // differs from vp8
+
+  DST(0, 1) = AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+  DST(3, 3) = AVG3(E, F, G);  // differs from vp8
+}
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int bd) {
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  const int E = above[4];
+  const int F = above[5];
+  const int G = above[6];
+  const int H = above[7];
+  (void)left;
+  (void)bd;
+  DST(0, 0) = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+  DST(3, 3) = H;  // differs from vp8
+}
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int X = above[-1];
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  (void)bd;
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0) = AVG2(C, D);
+
+  DST(0, 3) = AVG3(K, J, I);
+  DST(0, 2) = AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) = AVG3(B, C, D);
+}
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  const int X = above[-1];
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  const int D = above[3];
+  (void)bd;
+  DST(0, 3) = AVG3(J, K, L);
+  DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+  DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+  DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+  DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+  DST(3, 0) = AVG3(D, C, B);
+}
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  const int X = above[-1];
+  const int A = above[0];
+  const int B = above[1];
+  const int C = above[2];
+  (void)bd;
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3) = AVG2(L, K);
+
+  DST(3, 0) = AVG3(A, B, C);
+  DST(2, 0) = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3) = AVG3(L, K, J);
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 // This serves as a wrapper function, so that all the prediction functions
@@ -811,7 +883,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32) \
-  intra_pred_highbd_sized(type, 4) \
   intra_pred_highbd_sized(type, 8) \
   intra_pred_highbd_sized(type, 16) \
   intra_pred_highbd_sized(type, 32)
@@ -832,11 +903,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
 intra_pred_no_4x4(d207)
 intra_pred_no_4x4(d63)
 intra_pred_no_4x4(d45)
-#if CONFIG_MISC_FIXES
-intra_pred_allsizes(d207e)
-intra_pred_allsizes(d63e)
-intra_pred_no_4x4(d45e)
-#endif
 intra_pred_no_4x4(d117)
 intra_pred_no_4x4(d135)
 intra_pred_no_4x4(d153)
diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.c b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c
index 0f9aff1892..97655b3a9e 100644
--- a/media/libvpx/libvpx/vpx_dsp/inv_txfm.c
+++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c
@@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
-  const tran_low_t *ip = in;
+  const tran_low_t *ip = input;
   tran_low_t *op = tmp;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -93,17 +93,54 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
   }
 }
 
+void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+  tran_low_t x0 = input[0];
+  tran_low_t x1 = input[1];
+  tran_low_t x2 = input[2];
+  tran_low_t x3 = input[3];
+
+  if (!(x0 | x1 | x2 | x3)) {
+    memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  // 32-bit result is enough for the following multiplications.
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = WRAPLOW(x0 - x2 + x3);
+
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
+}
+
 void idct4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step[4];
+  int16_t step[4];
   tran_high_t temp1, temp2;
 
   // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
+  temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
+  temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
   step[0] = WRAPLOW(dct_const_round_shift(temp1));
   step[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
+  temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
   step[2] = WRAPLOW(dct_const_round_shift(temp1));
   step[3] = WRAPLOW(dct_const_round_shift(temp2));
 
@@ -141,7 +178,8 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -155,134 +193,6 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void idct8_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  temp1 = (step1[0] + step1[2]) * cospi_16_64;
-  temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7]);
-  output[1] = WRAPLOW(step1[1] + step1[6]);
-  output[2] = WRAPLOW(step1[2] + step1[5]);
-  output[3] = WRAPLOW(step1[3] + step1[4]);
-  output[4] = WRAPLOW(step1[3] - step1[4]);
-  output[5] = WRAPLOW(step1[2] - step1[5]);
-  output[6] = WRAPLOW(step1[1] - step1[6]);
-  output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    memset(output, 0, 4 * sizeof(*output));
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = WRAPLOW(x0 - x2 + x3);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
-  output[2] = WRAPLOW(dct_const_round_shift(s2));
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
   tran_high_t x0 = input[7];
@@ -358,6 +268,85 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   output[7] = WRAPLOW(-x1);
 }
 
+void idct8_c(const tran_low_t *input, tran_low_t *output) {
+  int16_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = (int16_t)input[0];
+  step1[2] = (int16_t)input[4];
+  step1[1] = (int16_t)input[2];
+  step1[3] = (int16_t)input[6];
+  temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
+  temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
+  temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  // stage 3
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  for (i = 0; i < 8; ++i) {
+    idct8_c(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+    idct8_c(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_low_t out[8 * 8] = { 0 };
@@ -383,193 +372,17 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void idct16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0 / 2];
-  step1[1] = input[16 / 2];
-  step1[2] = input[8 / 2];
-  step1[3] = input[24 / 2];
-  step1[4] = input[4 / 2];
-  step1[5] = input[20 / 2];
-  step1[6] = input[12 / 2];
-  step1[7] = input[28 / 2];
-  step1[8] = input[2 / 2];
-  step1[9] = input[18 / 2];
-  step1[10] = input[10 / 2];
-  step1[11] = input[26 / 2];
-  step1[12] = input[6 / 2];
-  step1[13] = input[22 / 2];
-  step1[14] = input[14 / 2];
-  step1[15] = input[30 / 2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15]);
-  output[1] = WRAPLOW(step2[1] + step2[14]);
-  output[2] = WRAPLOW(step2[2] + step2[13]);
-  output[3] = WRAPLOW(step2[3] + step2[12]);
-  output[4] = WRAPLOW(step2[4] + step2[11]);
-  output[5] = WRAPLOW(step2[5] + step2[10]);
-  output[6] = WRAPLOW(step2[6] + step2[9]);
-  output[7] = WRAPLOW(step2[7] + step2[8]);
-  output[8] = WRAPLOW(step2[7] - step2[8]);
-  output[9] = WRAPLOW(step2[6] - step2[9]);
-  output[10] = WRAPLOW(step2[5] - step2[10]);
-  output[11] = WRAPLOW(step2[4] - step2[11]);
-  output[12] = WRAPLOW(step2[3] - step2[12]);
-  output[13] = WRAPLOW(step2[2] - step2[13]);
-  output[14] = WRAPLOW(step2[1] - step2[14]);
-  output[15] = WRAPLOW(step2[0] - step2[15]);
-}
-
-void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
+  tran_high_t a1;
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
+    dest += stride;
   }
 }
 
@@ -741,6 +554,222 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   output[15] = WRAPLOW(-x1);
 }
 
+void idct16_c(const tran_low_t *input, tran_low_t *output) {
+  int16_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+
+  // stage 1
+  step1[0] = (int16_t)input[0 / 2];
+  step1[1] = (int16_t)input[16 / 2];
+  step1[2] = (int16_t)input[8 / 2];
+  step1[3] = (int16_t)input[24 / 2];
+  step1[4] = (int16_t)input[4 / 2];
+  step1[5] = (int16_t)input[20 / 2];
+  step1[6] = (int16_t)input[12 / 2];
+  step1[7] = (int16_t)input[28 / 2];
+  step1[8] = (int16_t)input[2 / 2];
+  step1[9] = (int16_t)input[18 / 2];
+  step1[10] = (int16_t)input[10 / 2];
+  step1[11] = (int16_t)input[26 / 2];
+  step1[12] = (int16_t)input[6 / 2];
+  step1[13] = (int16_t)input[22 / 2];
+  step1[14] = (int16_t)input[14 / 2];
+  step1[15] = (int16_t)input[30 / 2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = (tran_low_t)WRAPLOW(step2[0] + step2[15]);
+  output[1] = (tran_low_t)WRAPLOW(step2[1] + step2[14]);
+  output[2] = (tran_low_t)WRAPLOW(step2[2] + step2[13]);
+  output[3] = (tran_low_t)WRAPLOW(step2[3] + step2[12]);
+  output[4] = (tran_low_t)WRAPLOW(step2[4] + step2[11]);
+  output[5] = (tran_low_t)WRAPLOW(step2[5] + step2[10]);
+  output[6] = (tran_low_t)WRAPLOW(step2[6] + step2[9]);
+  output[7] = (tran_low_t)WRAPLOW(step2[7] + step2[8]);
+  output[8] = (tran_low_t)WRAPLOW(step2[7] - step2[8]);
+  output[9] = (tran_low_t)WRAPLOW(step2[6] - step2[9]);
+  output[10] = (tran_low_t)WRAPLOW(step2[5] - step2[10]);
+  output[11] = (tran_low_t)WRAPLOW(step2[4] - step2[11]);
+  output[12] = (tran_low_t)WRAPLOW(step2[3] - step2[12]);
+  output[13] = (tran_low_t)WRAPLOW(step2[2] - step2[13]);
+  output[14] = (tran_low_t)WRAPLOW(step2[1] - step2[14]);
+  output[15] = (tran_low_t)WRAPLOW(step2[0] - step2[15]);
+}
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  int i, j;
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows
+  for (i = 0; i < 16; ++i) {
+    idct16_c(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+    idct16_c(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  int i, j;
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 8x8 area, we only need to calculate first 8 rows here.
+  for (i = 0; i < 8; ++i) {
+    idct16_c(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+    idct16_c(temp_in, temp_out);
+    for (j = 0; j < 16; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
   int i, j;
@@ -770,7 +799,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -781,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 }
 
 void idct32_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[32], step2[32];
+  int16_t step1[32], step2[32];
   tran_high_t temp1, temp2;
 
   // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
+  step1[0] = (int16_t)input[0];
+  step1[1] = (int16_t)input[16];
+  step1[2] = (int16_t)input[8];
+  step1[3] = (int16_t)input[24];
+  step1[4] = (int16_t)input[4];
+  step1[5] = (int16_t)input[20];
+  step1[6] = (int16_t)input[12];
+  step1[7] = (int16_t)input[28];
+  step1[8] = (int16_t)input[2];
+  step1[9] = (int16_t)input[18];
+  step1[10] = (int16_t)input[10];
+  step1[11] = (int16_t)input[26];
+  step1[12] = (int16_t)input[6];
+  step1[13] = (int16_t)input[22];
+  step1[14] = (int16_t)input[14];
+  step1[15] = (int16_t)input[30];
 
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
+  temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
+  temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
+  temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
+  temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
+  temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
+  temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
+  temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
+  temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 
@@ -1156,16 +1186,10 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
 
   // Rows
   for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    int16_t zero_coeff = 0;
+    for (j = 0; j < 32; ++j) zero_coeff |= input[j];
 
-    if (zero_coeff[0] | zero_coeff[1])
+    if (zero_coeff)
       idct32_c(input, outptr);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
@@ -1239,7 +1263,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -1264,7 +1289,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
   return 0;
 }
 
-void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
      0.5 shifts per pixel. */
@@ -1273,7 +1298,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   tran_high_t a1, b1, c1, d1, e1;
   const tran_low_t *ip = input;
   tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   for (i = 0; i < 4; i++) {
     a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1322,14 +1346,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
-  const tran_low_t *ip = in;
+  const tran_low_t *ip = input;
   tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   (void)bd;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1351,178 +1374,6 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  (void)bd;
-
-  if (detect_invalid_highbd_input(input, 4)) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-    assert(0 && "invalid highbd txfm input");
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-    memset(output, 0, sizeof(*output) * 4);
-    return;
-  }
-
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
-  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
-  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-
-  // stage 2
-  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
-  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
-  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
-  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
-}
-
-void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    vpx_highbd_idct4_c(input, outptr, bd);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    vpx_highbd_idct4_c(temp_in, temp_out, bd);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-    }
-  }
-}
-
-void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int stride, int bd) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out =
-      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
-    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
-    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
-    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
-    dest += stride;
-  }
-}
-
-void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-
-  if (detect_invalid_highbd_input(input, 8)) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-    assert(0 && "invalid highbd txfm input");
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-    memset(output, 0, sizeof(*output) * 8);
-    return;
-  }
-
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-
-  // stage 2 & stage 3 - even half
-  vpx_highbd_idct4_c(step1, step1, bd);
-
-  // stage 2 - odd half
-  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
-  // stage 3 - odd half
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    vpx_highbd_idct8_c(input, outptr, bd);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    vpx_highbd_idct8_c(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-    }
-  }
-}
-
-void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int stride, int bd) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out =
-      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
-  }
-}
-
 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
   tran_low_t x0 = input[0];
@@ -1544,13 +1395,13 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
     return;
   }
 
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
+  s0 = (tran_high_t)sinpi_1_9 * x0;
+  s1 = (tran_high_t)sinpi_2_9 * x0;
+  s2 = (tran_high_t)sinpi_3_9 * x1;
+  s3 = (tran_high_t)sinpi_4_9 * x2;
+  s4 = (tran_high_t)sinpi_1_9 * x2;
+  s5 = (tran_high_t)sinpi_2_9 * x3;
+  s6 = (tran_high_t)sinpi_4_9 * x3;
   s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
 
   s0 = s0 + s3 + s5;
@@ -1568,6 +1419,83 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
+void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void)bd;
+
+  if (detect_invalid_highbd_input(input, 4)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    assert(0 && "invalid highbd txfm input");
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    memset(output, 0, sizeof(*output) * 4);
+    return;
+  }
+
+  // stage 1
+  temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
+  temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
+  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 =
+      input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
+  temp2 =
+      input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
+  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
+}
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
+                                 int stride, int bd) {
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    vpx_highbd_idct4_c(input, outptr, bd);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
+    vpx_highbd_idct4_c(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
+                                int stride, int bd) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+
+  out =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
+    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
+    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
+    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
+    dest += stride;
+  }
+}
+
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
   tran_low_t x0 = input[7];
@@ -1594,14 +1522,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   }
 
   // stage 1
-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+  s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
+  s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
+  s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
+  s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
+  s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
+  s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
+  s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
+  s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;
 
   x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
   x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
@@ -1617,10 +1545,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s1 = x1;
   s2 = x2;
   s3 = x3;
-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+  s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
+  s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
+  s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
+  s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;
 
   x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
@@ -1632,10 +1560,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
+  s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
+  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
+  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
+  s7 = (tran_high_t)cospi_16_64 * (x6 - x7);
 
   x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
   x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
@@ -1652,13 +1580,95 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+
+  if (detect_invalid_highbd_input(input, 8)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    assert(0 && "invalid highbd txfm input");
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    memset(output, 0, sizeof(*output) * 8);
+    return;
+  }
+
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 =
+      input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
+  temp2 =
+      input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
+  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 =
+      input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
+  temp2 =
+      input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2 & stage 3 - even half
+  vpx_highbd_idct4_c(step1, step1, bd);
+
+  // stage 2 - odd half
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
+}
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
+                                 int stride, int bd) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  for (i = 0; i < 8; ++i) {
+    vpx_highbd_idct8_c(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+    vpx_highbd_idct8_c(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = highbd_clip_pixel_add(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+    }
+  }
+}
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
   int i, j;
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // First transform rows
   // Only first 4 row has non-zero coefs
@@ -1679,6 +1689,199 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
+                                int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+
+  out =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+  tran_low_t x0 = input[15];
+  tran_low_t x1 = input[0];
+  tran_low_t x2 = input[13];
+  tran_low_t x3 = input[2];
+  tran_low_t x4 = input[11];
+  tran_low_t x5 = input[4];
+  tran_low_t x6 = input[9];
+  tran_low_t x7 = input[6];
+  tran_low_t x8 = input[7];
+  tran_low_t x9 = input[8];
+  tran_low_t x10 = input[5];
+  tran_low_t x11 = input[10];
+  tran_low_t x12 = input[3];
+  tran_low_t x13 = input[12];
+  tran_low_t x14 = input[1];
+  tran_low_t x15 = input[14];
+  (void)bd;
+
+  if (detect_invalid_highbd_input(input, 16)) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+    assert(0 && "invalid highbd txfm input");
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+    memset(output, 0, sizeof(*output) * 16);
+    return;
+  }
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+        x13 | x14 | x15)) {
+    memset(output, 0, 16 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
+  s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
+  s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
+  s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
+  s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
+  s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
+  s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
+  s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
+  s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
+  s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
+  s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
+  s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
+  s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
+  s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
+  s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
+  s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;
+
+  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
+  s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
+  s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
+  s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
+  s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
+  s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
+  s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
+  s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;
+
+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
+  s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
+  s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
+  s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
+  s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
+  s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
+  s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;
+
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+
+  // stage 4
+  s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
+  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
+  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
+  s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
+  s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
+  s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
+  s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
+  s15 = (tran_high_t)cospi_16_64 * (x14 - x15);
+
+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x8, bd);
+  output[2] = HIGHBD_WRAPLOW(x12, bd);
+  output[3] = HIGHBD_WRAPLOW(-x4, bd);
+  output[4] = HIGHBD_WRAPLOW(x6, bd);
+  output[5] = HIGHBD_WRAPLOW(x14, bd);
+  output[6] = HIGHBD_WRAPLOW(x10, bd);
+  output[7] = HIGHBD_WRAPLOW(x2, bd);
+  output[8] = HIGHBD_WRAPLOW(x3, bd);
+  output[9] = HIGHBD_WRAPLOW(x11, bd);
+  output[10] = HIGHBD_WRAPLOW(x15, bd);
+  output[11] = HIGHBD_WRAPLOW(x7, bd);
+  output[12] = HIGHBD_WRAPLOW(x5, bd);
+  output[13] = HIGHBD_WRAPLOW(-x13, bd);
+  output[14] = HIGHBD_WRAPLOW(x9, bd);
+  output[15] = HIGHBD_WRAPLOW(-x1, bd);
+}
+
 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
@@ -1720,23 +1923,31 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[6] = step1[6];
   step2[7] = step1[7];
 
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  temp1 =
+      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
+  temp2 =
+      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
+          step1[14] * (tran_high_t)cospi_18_64;
+  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
+          step1[14] * (tran_high_t)cospi_14_64;
   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
+          step1[13] * (tran_high_t)cospi_10_64;
+  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
+          step1[13] * (tran_high_t)cospi_22_64;
   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
+          step1[12] * (tran_high_t)cospi_26_64;
+  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
+          step1[12] * (tran_high_t)cospi_6_64;
   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
@@ -1746,12 +1957,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[2] = step2[2];
   step1[3] = step2[3];
 
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  temp1 =
+      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
+  temp2 =
+      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  temp1 =
+      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
+  temp2 =
+      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
@@ -1765,12 +1980,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  temp1 =
+      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
+  temp2 =
+      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
@@ -1780,12 +1997,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   step2[8] = step1[8];
   step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
+          step1[14] * (tran_high_t)cospi_24_64;
+  temp2 =
+      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
+          step1[13] * (tran_high_t)cospi_8_64;
+  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
+          step1[13] * (tran_high_t)cospi_24_64;
   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
@@ -1797,8 +2018,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
@@ -1823,12 +2044,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
@@ -1853,13 +2074,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 }
 
-void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
   int i, j;
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
@@ -1879,190 +2099,40 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-  tran_low_t x0 = input[15];
-  tran_low_t x1 = input[0];
-  tran_low_t x2 = input[13];
-  tran_low_t x3 = input[2];
-  tran_low_t x4 = input[11];
-  tran_low_t x5 = input[4];
-  tran_low_t x6 = input[9];
-  tran_low_t x7 = input[6];
-  tran_low_t x8 = input[7];
-  tran_low_t x9 = input[8];
-  tran_low_t x10 = input[5];
-  tran_low_t x11 = input[10];
-  tran_low_t x12 = input[3];
-  tran_low_t x13 = input[12];
-  tran_low_t x14 = input[1];
-  tran_low_t x15 = input[14];
-  (void)bd;
-
-  if (detect_invalid_highbd_input(input, 16)) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-    assert(0 && "invalid highbd txfm input");
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-    memset(output, 0, sizeof(*output) * 16);
-    return;
-  }
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    memset(output, 0, 16 * sizeof(*output));
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
-  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
-  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
-  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
-  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
-  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
-  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
-  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
-  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
-  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
-  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
-  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
-  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
-  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
-  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
-  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
-  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
-  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
-  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
-  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
-  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
-  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
-  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
-  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
-  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
-  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
-  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
-  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
-  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
-  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
-  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
-  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
-  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
-  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
-  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
-  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
-  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
-  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
-  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
-  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
-  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
-  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
-  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
-
-  output[0] = HIGHBD_WRAPLOW(x0, bd);
-  output[1] = HIGHBD_WRAPLOW(-x8, bd);
-  output[2] = HIGHBD_WRAPLOW(x12, bd);
-  output[3] = HIGHBD_WRAPLOW(-x4, bd);
-  output[4] = HIGHBD_WRAPLOW(x6, bd);
-  output[5] = HIGHBD_WRAPLOW(x14, bd);
-  output[6] = HIGHBD_WRAPLOW(x10, bd);
-  output[7] = HIGHBD_WRAPLOW(x2, bd);
-  output[8] = HIGHBD_WRAPLOW(x3, bd);
-  output[9] = HIGHBD_WRAPLOW(x11, bd);
-  output[10] = HIGHBD_WRAPLOW(x15, bd);
-  output[11] = HIGHBD_WRAPLOW(x7, bd);
-  output[12] = HIGHBD_WRAPLOW(x5, bd);
-  output[13] = HIGHBD_WRAPLOW(-x13, bd);
-  output[14] = HIGHBD_WRAPLOW(x9, bd);
-  output[15] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {
+  int i, j;
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 8x8 area, we only need to calculate first 8 rows here.
+  for (i = 0; i < 8; ++i) {
+    vpx_highbd_idct16_c(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    uint16_t *destT = dest;
+    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+    vpx_highbd_idct16_c(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j) {
+      destT[i] = highbd_clip_pixel_add(destT[i],
+                                       ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      destT += stride;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
   int i, j;
   tran_low_t out[16 * 16] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -2083,15 +2153,15 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out =
-      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
 
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  out =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -2131,43 +2201,59 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[14] = input[14];
   step1[15] = input[30];
 
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  temp1 =
+      input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
+  temp2 =
+      input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  temp1 = input[17] * (tran_high_t)cospi_15_64 -
+          input[15] * (tran_high_t)cospi_17_64;
+  temp2 = input[17] * (tran_high_t)cospi_17_64 +
+          input[15] * (tran_high_t)cospi_15_64;
   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  temp1 =
+      input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
+  temp2 =
+      input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  temp1 =
+      input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
+  temp2 =
+      input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  temp1 =
+      input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
+  temp2 =
+      input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  temp1 = input[21] * (tran_high_t)cospi_11_64 -
+          input[11] * (tran_high_t)cospi_21_64;
+  temp2 = input[21] * (tran_high_t)cospi_21_64 +
+          input[11] * (tran_high_t)cospi_11_64;
   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  temp1 = input[13] * (tran_high_t)cospi_19_64 -
+          input[19] * (tran_high_t)cospi_13_64;
+  temp2 = input[13] * (tran_high_t)cospi_13_64 +
+          input[19] * (tran_high_t)cospi_19_64;
   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  temp1 =
+      input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
+  temp2 =
+      input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
@@ -2181,23 +2267,31 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step2[6] = step1[6];
   step2[7] = step1[7];
 
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  temp1 =
+      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
+  temp2 =
+      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
+          step1[14] * (tran_high_t)cospi_18_64;
+  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
+          step1[14] * (tran_high_t)cospi_14_64;
   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
+          step1[13] * (tran_high_t)cospi_10_64;
+  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
+          step1[13] * (tran_high_t)cospi_22_64;
   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
+          step1[12] * (tran_high_t)cospi_26_64;
+  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
+          step1[12] * (tran_high_t)cospi_6_64;
   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
@@ -2224,12 +2318,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[2] = step2[2];
   step1[3] = step2[3];
 
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  temp1 =
+      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
+  temp2 =
+      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  temp1 =
+      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
+  temp2 =
+      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
@@ -2244,22 +2342,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
   step1[16] = step2[16];
   step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
+          step2[30] * (tran_high_t)cospi_28_64;
+  temp2 = step2[17] * (tran_high_t)cospi_28_64 +
+          step2[30] * (tran_high_t)cospi_4_64;
   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
+          step2[29] * (tran_high_t)cospi_4_64;
+  temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
+          step2[29] * (tran_high_t)cospi_28_64;
   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
+          step2[26] * (tran_high_t)cospi_12_64;
+  temp2 = step2[21] * (tran_high_t)cospi_12_64 +
+          step2[26] * (tran_high_t)cospi_20_64;
   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
+          step2[25] * (tran_high_t)cospi_20_64;
+  temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
+          step2[25] * (tran_high_t)cospi_12_64;
   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
@@ -2268,12 +2374,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[28] = step2[28];
 
   // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  temp1 =
+      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
+  temp2 =
+      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
@@ -2283,12 +2391,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
   step2[8] = step1[8];
   step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
+          step1[14] * (tran_high_t)cospi_24_64;
+  temp2 =
+      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
+          step1[13] * (tran_high_t)cospi_8_64;
+  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
+          step1[13] * (tran_high_t)cospi_24_64;
   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
@@ -2318,8 +2430,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
@@ -2335,20 +2447,28 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
   step1[16] = step2[16];
   step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
+          step2[29] * (tran_high_t)cospi_24_64;
+  temp2 = step2[18] * (tran_high_t)cospi_24_64 +
+          step2[29] * (tran_high_t)cospi_8_64;
   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
+          step2[28] * (tran_high_t)cospi_24_64;
+  temp2 = step2[19] * (tran_high_t)cospi_24_64 +
+          step2[28] * (tran_high_t)cospi_8_64;
   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
+          step2[27] * (tran_high_t)cospi_8_64;
+  temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
+          step2[27] * (tran_high_t)cospi_24_64;
   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
+          step2[26] * (tran_high_t)cospi_8_64;
+  temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
+          step2[26] * (tran_high_t)cospi_24_64;
   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
@@ -2369,12 +2489,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
@@ -2420,20 +2540,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[17] = step2[17];
   step1[18] = step2[18];
   step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
@@ -2476,26 +2596,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
 }
 
-void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
   int i, j;
   tran_low_t out[32 * 32];
   tran_low_t *outptr = out;
   tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Rows
   for (i = 0; i < 32; ++i) {
-    tran_low_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    tran_low_t zero_coeff = 0;
+    for (j = 0; j < 32; ++j) zero_coeff |= input[j];
 
-    if (zero_coeff[0] | zero_coeff[1])
+    if (zero_coeff)
       highbd_idct32_c(input, outptr, bd);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
@@ -2514,13 +2627,40 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  int i, j;
+  tran_low_t out[32 * 32] = { 0 };
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  // Only upper-left 16x16 has non-zero coeff
+  for (i = 0; i < 16; ++i) {
+    highbd_idct32_c(input, outptr, bd);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    uint16_t *destT = dest;
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
+    highbd_idct32_c(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j) {
+      destT[i] = highbd_clip_pixel_add(destT[i],
+                                       ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      destT += stride;
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
   int i, j;
   tran_low_t out[32 * 32] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Rows
   // Only upper-left 8x8 has non-zero coeff
@@ -2541,15 +2681,15 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
   int i, j;
   int a1;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  tran_low_t out =
-      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
 
-  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+  out =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.h b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h
index 13137659fa..6eedbeac35 100644
--- a/media/libvpx/libvpx/vpx_dsp/inv_txfm.h
+++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_INV_TXFM_H_
-#define VPX_DSP_INV_TXFM_H_
+#ifndef VPX_VPX_DSP_INV_TXFM_H_
+#define VPX_VPX_DSP_INV_TXFM_H_
 
 #include <assert.h>
 
@@ -76,7 +76,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-
 #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
 #if CONFIG_VP9_HIGHBITDEPTH
 #define HIGHBD_WRAPLOW(x, bd) \
@@ -123,4 +122,4 @@ static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_INV_TXFM_H_
+#endif  // VPX_VPX_DSP_INV_TXFM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 0000000000..750c9de29f
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+                          tran_low_t *dst) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  ptrdiff_t src_stride2 = src_stride << 1;
+  ptrdiff_t src_stride3 = src_stride2 + src_stride;
+  ptrdiff_t src_stride4 = src_stride2 << 1;
+  ptrdiff_t src_stride6 = src_stride3 << 1;
+
+  int16_t *src_tmp = (int16_t *)src;
+  src0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+  src3 = __lsx_vldx(src_tmp, src_stride6);
+  src_tmp += src_stride4;
+  src4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+  src7 = __lsx_vldx(src_tmp, src_stride6);
+
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  store_tran_low(tmp0, dst, 0);
+  store_tran_low(tmp1, dst, 8);
+  store_tran_low(tmp2, dst, 16);
+  store_tran_low(tmp3, dst, 24);
+  store_tran_low(tmp4, dst, 32);
+  store_tran_low(tmp5, dst, 40);
+  store_tran_low(tmp6, dst, 48);
+  store_tran_low(tmp7, dst, 56);
+}
+
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+                            tran_low_t *dst) {
+  int i;
+  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    a0 = load_tran_low(dst);
+    a1 = load_tran_low(dst + 64);
+    a2 = load_tran_low(dst + 128);
+    a3 = load_tran_low(dst + 192);
+
+    LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+    DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+    LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+    store_tran_low(a0, dst, 0);
+    store_tran_low(a1, dst, 64);
+    store_tran_low(a2, dst, 128);
+    store_tran_low(a3, dst, 192);
+
+    dst += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
new file mode 100644
index 0000000000..482626080a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  // width > 8 || width == 8 || width == 4
+  if (width > 8) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        __m128i p, r, avg;
+
+        p = __lsx_vld(pred + j, 0);
+        r = __lsx_vld(ref + j, 0);
+        avg = __lsx_vavgr_bu(p, r);
+        __lsx_vst(avg, comp_pred + j, 0);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    int i = height * width;
+    do {
+      __m128i p, r, r_0, r_1;
+
+      p = __lsx_vld(pred, 0);
+      r_0 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r_1 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r = __lsx_vilvl_d(r_1, r_0);
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+
+      pred += 16;
+      comp_pred += 16;
+      i -= 16;
+    } while (i);
+  } else {  // width = 4
+    int i = height * width;
+    assert(width == 4);
+    do {
+      __m128i p, r, r_0, r_1, r_2, r_3;
+      p = __lsx_vld(pred, 0);
+
+      if (width == ref_stride) {
+        r = __lsx_vld(ref, 0);
+        ref += 16;
+      } else {
+        r_0 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_1 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_2 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_3 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+        r = __lsx_vilvl_d(r_2, r_0);
+      }
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+      comp_pred += 16;
+      pred += 16;
+      i -= 16;
+    } while (i);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 0000000000..b0db1e99c5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i load_tran_low(const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i v0_m = __lsx_vld(s, 0);
+  __m128i v1_m = __lsx_vld(s + 4, 0);
+  return __lsx_vsrlni_h_w(v0_m, v1_m, 0);
+#else
+  return __lsx_vld(s, 0);
+#endif
+}
+
+static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i v0_m, v1_m;
+  v1_m = __lsx_vexth_w_h(v);
+  v0_m = __lsx_vsllwil_w_h(v, 0);
+  __lsx_vst(v0_m, s + c, 0);
+  __lsx_vst(v1_m, s + c + 4, 0);
+#else
+  __lsx_vst(v, s + c, 0);
+#endif
+}
+
+#endif  // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
new file mode 100644
index 0000000000..9bb3877212
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
@@ -0,0 +1,1176 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+#define UNPCK_SH_SW(in, out0, out1)  \
+  do {                               \
+    out0 = __lsx_vsllwil_w_h(in, 0); \
+    out1 = __lsx_vexth_w_h(in);      \
+  } while (0)
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+                                              int32_t src_stride,
+                                              int16_t *temp_buff) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i step0, step1, step2, step3;
+  __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  __m128i step0_1, step1_1, step2_1, step3_1;
+
+  int32_t stride = src_stride << 1;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  const int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+  in3 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in0_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+  in3_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp = input + (src_stride * 24);
+  in4_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+  in7_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+  in7 = __lsx_vldx(input_tmp, stride3);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+            in2_1, in3_1);
+  DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+            in6_1, in7_1);
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+                    step3, in4, in5, in6, in7);
+  LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                    step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+                    in7_1);
+
+  __lsx_vst(step0, temp_buff, 0);
+  __lsx_vst(step1, temp_buff, 16);
+  __lsx_vst(step2, temp_buff, 32);
+  __lsx_vst(step3, temp_buff, 48);
+
+  __lsx_vst(in4, temp_buff, 448);
+  __lsx_vst(in5, temp_buff, 464);
+  __lsx_vst(in6, temp_buff, 480);
+  __lsx_vst(in7, temp_buff, 496);
+
+  __lsx_vst(step0_1, temp_buff, 64);
+  __lsx_vst(step1_1, temp_buff, 80);
+  __lsx_vst(step2_1, temp_buff, 96);
+  __lsx_vst(step3_1, temp_buff, 112);
+
+  __lsx_vst(in4_1, temp_buff, 384);
+  __lsx_vst(in5_1, temp_buff, 400);
+  __lsx_vst(in6_1, temp_buff, 416);
+  __lsx_vst(in7_1, temp_buff, 432);
+
+  /* 3rd and 4th set */
+  input_tmp = input + (src_stride * 8);
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+  in3 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in0_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+  in3_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+  in7_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+  in7 = __lsx_vldx(input_tmp, stride3);
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+            in2_1, in3_1);
+  DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+            in6_1, in7_1);
+
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+                    step3, in4, in5, in6, in7);
+  LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                    step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+                    in7_1);
+
+  __lsx_vst(step0, temp_buff, 128);
+  __lsx_vst(step1, temp_buff, 144);
+  __lsx_vst(step2, temp_buff, 160);
+  __lsx_vst(step3, temp_buff, 176);
+
+  __lsx_vst(in4, temp_buff, 320);
+  __lsx_vst(in5, temp_buff, 336);
+  __lsx_vst(in6, temp_buff, 352);
+  __lsx_vst(in7, temp_buff, 368);
+
+  __lsx_vst(step0_1, temp_buff, 192);
+  __lsx_vst(step1_1, temp_buff, 208);
+  __lsx_vst(step2_1, temp_buff, 224);
+  __lsx_vst(step3_1, temp_buff, 240);
+
+  __lsx_vst(in4_1, temp_buff, 256);
+  __lsx_vst(in5_1, temp_buff, 272);
+  __lsx_vst(in6_1, temp_buff, 288);
+  __lsx_vst(in7_1, temp_buff, 304);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i temp0, temp1;
+
+  /* fdct even */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+            in13, in14, in15);
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1,
+                    vec2, vec3, in12, in13, in14, in15);
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+            in10, in11);
+  LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6,
+                    vec7, in8, in9, in10, in11);
+
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 0);
+  __lsx_vst(temp1, temp, 1024);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 512);
+  __lsx_vst(temp1, temp, 1536);
+
+  DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7,
+            vec6, vec5, vec4);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 256);
+  __lsx_vst(temp1, temp, 1792);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 1280);
+  __lsx_vst(temp1, temp, 768);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 128);
+  __lsx_vst(temp1, temp, 1920);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 1152);
+  __lsx_vst(temp1, temp, 896);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 640);
+  __lsx_vst(temp1, temp, 1408);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 384);
+  __lsx_vst(temp1, temp, 1664);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  __m128i tmp0, tmp1;
+
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21,
+            in26, in27);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19,
+            in28, in29);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, input, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, input, 80);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, input, 160);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, input, 176);
+
+  in21 = __lsx_vadd_h(in18, in21);
+  in20 = __lsx_vadd_h(in19, in20);
+  in27 = __lsx_vadd_h(in28, in27);
+  in26 = __lsx_vadd_h(in29, in26);
+
+  DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22,
+            in23, in24, in25);
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17,
+            in30, in31);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, input, 32);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, input, 48);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, input, 192);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, input, 208);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 0);
+  __lsx_vst(vec4, temp_ptr, 1920);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 896);
+  __lsx_vst(vec4, temp_ptr, 1024);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec4, temp_ptr, 1408);
+  __lsx_vst(vec5, temp_ptr, 512);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec4, temp_ptr, 384);
+  __lsx_vst(vec5, temp_ptr, 1536);
+
+  DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23,
+            in20, in21);
+  DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26,
+            in27, in24, in25);
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 1664);
+  __lsx_vst(vec4, temp_ptr, 256);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 640);
+  __lsx_vst(vec4, temp_ptr, 1280);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 1152);
+  __lsx_vst(vec4, temp_ptr, 768);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 128);
+  __lsx_vst(vec4, temp_ptr, 1792);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
+  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+                                           int16_t *output) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i step0, step1, step2, step3, step4, step5, step6, step7;
+
+  DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff,
+            192, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384,
+            temp_buff, 448, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff,
+            240, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432,
+            temp_buff, 496, in12, in13, in14, in15);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, step0, step1, step2, step3,
+                     step4, step5, step6, step7, in8, in9, in10, in11, in12,
+                     in13, in14, in15);
+
+  __lsx_vst(step0, output, 0);
+  __lsx_vst(step1, output, 16);
+  __lsx_vst(step2, output, 32);
+  __lsx_vst(step3, output, 48);
+  __lsx_vst(step4, output, 64);
+  __lsx_vst(step5, output, 80);
+  __lsx_vst(step6, output, 96);
+  __lsx_vst(step7, output, 112);
+
+  __lsx_vst(in8, output, 384);
+  __lsx_vst(in9, output, 400);
+  __lsx_vst(in10, output, 416);
+  __lsx_vst(in11, output, 432);
+  __lsx_vst(in12, output, 448);
+  __lsx_vst(in13, output, 464);
+  __lsx_vst(in14, output, 480);
+  __lsx_vst(in15, output, 496);
+
+  /* 2nd set */
+  DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff,
+            208, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400,
+            temp_buff, 464, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff,
+            224, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416,
+            temp_buff, 480, in12, in13, in14, in15);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, step0, step1, step2, step3,
+                     step4, step5, step6, step7, in8, in9, in10, in11, in12,
+                     in13, in14, in15);
+
+  __lsx_vst(step0, output, 128);
+  __lsx_vst(step1, output, 144);
+  __lsx_vst(step2, output, 160);
+  __lsx_vst(step3, output, 176);
+  __lsx_vst(step4, output, 192);
+  __lsx_vst(step5, output, 208);
+  __lsx_vst(step6, output, 224);
+  __lsx_vst(step7, output, 240);
+
+  __lsx_vst(in8, output, 256);
+  __lsx_vst(in9, output, 272);
+  __lsx_vst(in10, output, 288);
+  __lsx_vst(in11, output, 304);
+  __lsx_vst(in12, output, 320);
+  __lsx_vst(in13, output, 336);
+  __lsx_vst(in14, output, 352);
+  __lsx_vst(in15, output, 368);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+                                    int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+  __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+  __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+            in13, in14, in15);
+
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+
+  __lsx_vst(vec0, interm_ptr, 0);
+  __lsx_vst(vec1, interm_ptr, 16);
+  __lsx_vst(vec2, interm_ptr, 32);
+  __lsx_vst(vec3, interm_ptr, 48);
+  __lsx_vst(vec4, interm_ptr, 64);
+  __lsx_vst(vec5, interm_ptr, 80);
+  __lsx_vst(vec6, interm_ptr, 96);
+  __lsx_vst(vec7, interm_ptr, 112);
+
+  __lsx_vst(in8, interm_ptr, 128);
+  __lsx_vst(in9, interm_ptr, 144);
+  __lsx_vst(in10, interm_ptr, 160);
+  __lsx_vst(in11, interm_ptr, 176);
+  __lsx_vst(in12, interm_ptr, 192);
+  __lsx_vst(in13, interm_ptr, 208);
+  __lsx_vst(in14, interm_ptr, 224);
+  __lsx_vst(in15, interm_ptr, 240);
+
+  /* Stage 3 */
+  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+  DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r,
+            vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+  LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r,
+                    vec5_r);
+  DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l,
+            vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);
+
+  tmp3_w = __lsx_vadd_w(vec0_r, vec3_r);
+  vec0_r = __lsx_vsub_w(vec0_r, vec3_r);
+  vec3_r = __lsx_vadd_w(vec1_r, vec2_r);
+  vec1_r = __lsx_vsub_w(vec1_r, vec2_r);
+
+  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+                    vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 16);
+
+  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+                    vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  __lsx_vst(vec5, out, 32);
+  __lsx_vst(vec4, out, 48);
+
+  DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32,
+            interm_ptr, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96,
+            interm_ptr, 112, vec4, vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 64);
+  __lsx_vst(in5, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 80);
+  __lsx_vst(in5, out, 96);
+
+  DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160,
+            interm_ptr, 176, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224,
+            interm_ptr, 240, in12, in13, in14, in15);
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 128);
+  __lsx_vst(in5, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 144);
+  __lsx_vst(in5, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  tmp0_w = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 160);
+  __lsx_vst(in5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 192);
+  __lsx_vst(in5, out, 176);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+            in7);
+  DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+            in14, in15);
+
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 0);
+  __lsx_vst(temp1, out, 16);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 32);
+  __lsx_vst(temp1, out, 48);
+
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 64);
+  __lsx_vst(temp1, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 80);
+  __lsx_vst(temp1, out, 96);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 128);
+  __lsx_vst(temp1, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 144);
+  __lsx_vst(temp1, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5)
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 160);
+  __lsx_vst(temp1, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 192);
+  __lsx_vst(temp1, out, 176);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+                                int16_t *out) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  __m128i tmp0, tmp1;
+
+  in20 = __lsx_vld(temp, 64);
+  in21 = __lsx_vld(temp, 80);
+  in26 = __lsx_vld(temp, 160);
+  in27 = __lsx_vld(temp, 176);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = __lsx_vld(temp, 32);
+  in19 = __lsx_vld(temp, 48);
+  in28 = __lsx_vld(temp, 192);
+  in29 = __lsx_vld(temp, 208);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, interm_ptr, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, interm_ptr, 176);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, interm_ptr, 112);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, interm_ptr, 128);
+
+  DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+            in20, in27, in26);
+
+  in22 = __lsx_vld(temp, 96);
+  in23 = __lsx_vld(temp, 112);
+  in24 = __lsx_vld(temp, 128);
+  in25 = __lsx_vld(temp, 144);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = __lsx_vld(temp, 0);
+  in17 = __lsx_vld(temp, 16);
+  in30 = __lsx_vld(temp, 224);
+  in31 = __lsx_vld(temp, 240);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, interm_ptr, 80);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, interm_ptr, 96);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, interm_ptr, 144);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 224);
+  __lsx_vst(vec4, out, 16);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 32);
+  __lsx_vst(vec5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 48);
+  __lsx_vst(vec5, out, 192);
+
+  in20 = __lsx_vld(interm_ptr, 64);
+  in21 = __lsx_vld(interm_ptr, 176);
+  in27 = __lsx_vld(interm_ptr, 112);
+  in26 = __lsx_vld(interm_ptr, 128);
+
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = __lsx_vld(interm_ptr, 80);
+  in25 = __lsx_vld(interm_ptr, 96);
+  in24 = __lsx_vld(interm_ptr, 144);
+  in23 = __lsx_vld(interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 64);
+  __lsx_vst(vec4, out, 176);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 80);
+  __lsx_vst(vec4, out, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 144);
+  __lsx_vst(vec4, out, 96);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 112);
+  __lsx_vst(vec5, out, 128);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+  /* 1st set */
+  in0 = __lsx_vld(temp, 0);
+  in4 = __lsx_vld(temp, 64);
+  in2 = __lsx_vld(temp, 128);
+  in6 = __lsx_vld(temp, 192);
+  in1 = __lsx_vld(temp, 256);
+  in7 = __lsx_vld(temp, 304);
+  in3 = __lsx_vld(temp, 384);
+  in5 = __lsx_vld(temp, 432);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+
+  /* 2nd set */
+  in0_1 = __lsx_vld(temp, 32);
+  in1_1 = __lsx_vld(temp, 464);
+  in2_1 = __lsx_vld(temp, 160);
+  in3_1 = __lsx_vld(temp, 336);
+  in4_1 = __lsx_vld(temp, 96);
+  in5_1 = __lsx_vld(temp, 352);
+  in6_1 = __lsx_vld(temp, 224);
+  in7_1 = __lsx_vld(temp, 480);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 64);
+  __lsx_vst(in2, output, 128);
+  __lsx_vst(in3, output, 192);
+  __lsx_vst(in4, output, 256);
+  __lsx_vst(in5, output, 320);
+  __lsx_vst(in6, output, 384);
+  __lsx_vst(in7, output, 448);
+
+  LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  /* 3rd set */
+  in0 = __lsx_vld(temp, 16);
+  in1 = __lsx_vld(temp, 272);
+  in2 = __lsx_vld(temp, 144);
+  in3 = __lsx_vld(temp, 400);
+  in4 = __lsx_vld(temp, 80);
+  in5 = __lsx_vld(temp, 416);
+  in6 = __lsx_vld(temp, 208);
+  in7 = __lsx_vld(temp, 288);
+
+  __lsx_vst(in0_1, output, 16);
+  __lsx_vst(in1_1, output, 80);
+  __lsx_vst(in2_1, output, 144);
+  __lsx_vst(in3_1, output, 208);
+  __lsx_vst(in4_1, output, 272);
+  __lsx_vst(in5_1, output, 336);
+  __lsx_vst(in6_1, output, 400);
+  __lsx_vst(in7_1, output, 464);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in1, output, 96);
+  __lsx_vst(in2, output, 160);
+  __lsx_vst(in3, output, 224);
+  __lsx_vst(in4, output, 288);
+  __lsx_vst(in5, output, 352);
+  __lsx_vst(in6, output, 416);
+  __lsx_vst(in7, output, 480);
+
+  /* 4th set */
+  in0_1 = __lsx_vld(temp, 48);
+  in1_1 = __lsx_vld(temp, 448);
+  in2_1 = __lsx_vld(temp, 176);
+  in3_1 = __lsx_vld(temp, 320);
+  in4_1 = __lsx_vld(temp, 112);
+  in5_1 = __lsx_vld(temp, 368);
+  in6_1 = __lsx_vld(temp, 240);
+  in7_1 = __lsx_vld(temp, 496);
+
+  LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  __lsx_vst(in0_1, output, 48);
+  __lsx_vst(in1_1, output, 112);
+  __lsx_vst(in2_1, output, 176);
+  __lsx_vst(in3_1, output, 240);
+  __lsx_vst(in4_1, output, 304);
+  __lsx_vst(in5_1, output, 368);
+  __lsx_vst(in6_1, output, 432);
+  __lsx_vst(in7_1, output, 496);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+  fdct8x32_1d_row_even(temp_buf, temp_buf);
+  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+  fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+                       tmp_buf_big + (8 * i));
+  }
+
+  /* row transform */
+  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+  /* row transform */
+  for (i = 1; i < 4; ++i) {
+    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+  }
+}
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+            in7);
+  DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+            in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+
+  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+
+  temp0 = __lsx_vadd_h(in0, in3);
+  in0 = __lsx_vsub_h(in0, in3);
+  in3 = __lsx_vadd_h(in1, in2);
+  in1 = __lsx_vsub_h(in1, in2);
+
+  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+  __lsx_vst(temp0, out, 0);
+  __lsx_vst(temp1, out, 16);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  __lsx_vst(temp0, out, 32);
+  __lsx_vst(temp1, out, 48);
+
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  __lsx_vst(temp0, out, 64);
+  __lsx_vst(temp1, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  __lsx_vst(temp0, out, 80);
+  __lsx_vst(temp1, out, 96);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  __lsx_vst(temp0, out, 128);
+  __lsx_vst(temp1, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  __lsx_vst(temp0, out, 144);
+  __lsx_vst(temp1, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  __lsx_vst(temp0, out, 160);
+  __lsx_vst(temp1, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  __lsx_vst(temp0, out, 192);
+  __lsx_vst(temp1, out, 176);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+                                   int16_t *out) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31;
+  __m128i vec4, vec5, tmp0, tmp1;
+
+  in20 = __lsx_vld(temp, 64);
+  in21 = __lsx_vld(temp, 80);
+  in26 = __lsx_vld(temp, 160);
+  in27 = __lsx_vld(temp, 176);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  FDCT_POSTPROC_2V_NEG_H(in20, in21);
+  FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+  in18 = __lsx_vld(temp, 32);
+  in19 = __lsx_vld(temp, 48);
+  in28 = __lsx_vld(temp, 192);
+  in29 = __lsx_vld(temp, 208);
+
+  FDCT_POSTPROC_2V_NEG_H(in18, in19);
+  FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, interm_ptr, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, interm_ptr, 176);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, interm_ptr, 128);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, interm_ptr, 112);
+
+  DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+            in20, in27, in26);
+
+  in22 = __lsx_vld(temp, 96);
+  in23 = __lsx_vld(temp, 112);
+  in24 = __lsx_vld(temp, 128);
+  in25 = __lsx_vld(temp, 144);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+  FDCT_POSTPROC_2V_NEG_H(in22, in23);
+  FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+  in16 = __lsx_vld(temp, 0);
+  in17 = __lsx_vld(temp, 16);
+  in30 = __lsx_vld(temp, 224);
+  in31 = __lsx_vld(temp, 240);
+
+  FDCT_POSTPROC_2V_NEG_H(in16, in17);
+  FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, interm_ptr, 80);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, interm_ptr, 96);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, interm_ptr, 144);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  __lsx_vst(vec5, out, 224);
+  __lsx_vst(vec4, out, 16);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  __lsx_vst(vec4, out, 32);
+  __lsx_vst(vec5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  __lsx_vst(vec4, out, 48);
+  __lsx_vst(vec5, out, 192);
+
+  in20 = __lsx_vld(interm_ptr, 64);
+  in21 = __lsx_vld(interm_ptr, 176);
+  in27 = __lsx_vld(interm_ptr, 112);
+  in26 = __lsx_vld(interm_ptr, 128);
+
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = __lsx_vld(interm_ptr, 80);
+  in25 = __lsx_vld(interm_ptr, 96);
+  in24 = __lsx_vld(interm_ptr, 144);
+  in23 = __lsx_vld(interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  in16 = __lsx_vadd_h(in28, in29);
+  in19 = __lsx_vadd_h(in31, in30);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  __lsx_vst(vec5, out, 64);
+  __lsx_vst(vec4, out, 176);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  __lsx_vst(vec5, out, 80);
+  __lsx_vst(vec4, out, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  __lsx_vst(vec5, out, 144);
+  __lsx_vst(vec4, out, 96);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  __lsx_vst(vec4, out, 112);
+  __lsx_vst(vec5, out, 128);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+                       &tmp_buf_big[0] + (8 * i));
+  }
+  /* row transform */
+  for (i = 0; i < 4; ++i) {
+    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+                       out + (8 * i * 32));
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 0000000000..508532b9d8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,350 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  do {                                                                         \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  } while (0)
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride) {
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+  __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+  __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
+  __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
+  __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int32_t src_stride8 = src_stride4 << 1;
+  int16_t *input_tmp = (int16_t *)input;
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
+  input_tmp += src_stride4;
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
+  input_tmp += src_stride4;
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
+            in12);
+  input_tmp += src_stride4;
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
+            in14);
+  input_tmp += src_stride2;
+  in15 = __lsx_vldx(input_tmp, src_stride2);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+            in15);
+  DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
+            tmp6, tmp7);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  __lsx_vst(tmp0, tmp_ptr, 0);
+  __lsx_vst(tmp1, tmp_ptr, 64);
+  __lsx_vst(tmp2, tmp_ptr, 128);
+  __lsx_vst(tmp3, tmp_ptr, 192);
+  __lsx_vst(tmp4, tmp_ptr, 256);
+  __lsx_vst(tmp5, tmp_ptr, 320);
+  __lsx_vst(tmp6, tmp_ptr, 384);
+  __lsx_vst(tmp7, tmp_ptr, 448);
+  DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
+            in14, in13, in12);
+  DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
+            in9, in8);
+
+  tmp_ptr += 16;
+
+  /* stp 1 */
+  DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
+  DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);
+
+  cnst4 = __lsx_vreplvei_h(coeff, 0);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);
+
+  cnst5 = __lsx_vreplvei_h(coeff, 1);
+  cnst5 = __lsx_vpackev_h(cnst5, cnst4);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);
+
+  /* stp2 */
+  LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+  LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+  DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
+  DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);
+
+  cnst0 = __lsx_vreplvei_h(coeff, 4);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);
+
+  LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+  vec1 = __lsx_vilvl_h(in15, in8);
+  vec0 = __lsx_vilvh_h(in15, in8);
+
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 0);
+
+  cnst0 = __lsx_vreplvei_h(coeff2, 0);
+  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 448);
+
+  vec1 = __lsx_vilvl_h(in14, in9);
+  vec0 = __lsx_vilvh_h(in14, in9);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+  __lsx_vst(in8, tmp_ptr, 256);
+
+  cnst1 = __lsx_vreplvei_h(coeff2, 2);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 192);
+
+  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);
+
+  cnst1 = __lsx_vreplvei_h(coeff, 3);
+  cnst1 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);
+
+  /* stp4 */
+  DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);
+
+  vec1 = __lsx_vilvl_h(in13, in10);
+  vec0 = __lsx_vilvh_h(in13, in10);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 128);
+
+  cnst0 = __lsx_vreplvei_h(coeff2, 1);
+  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 320);
+
+  DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
+  vec1 = __lsx_vilvl_h(in12, in11);
+  vec0 = __lsx_vilvh_h(in12, in11);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+  __lsx_vst(in8, tmp_ptr, 384);
+
+  cnst1 = __lsx_vreplvei_h(coeff2, 3);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 64);
+}
+
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  int16_t *input_tmp = input;
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
+            112, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
+            input_tmp, 240, in12, in13, in14, in15);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
+            in14, in15);
+
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+            in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
+                     tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+  __lsx_vst(in8, input, 0);
+  __lsx_vst(in9, input, 32);
+  __lsx_vst(in10, input, 64);
+  __lsx_vst(in11, input, 96);
+  __lsx_vst(in12, input, 128);
+  __lsx_vst(in13, input, 160);
+  __lsx_vst(in14, input, 192);
+  __lsx_vst(in15, input, 224);
+
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
+            in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+               in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+                     tmp1, in1, tmp2, in2, tmp3, in3);
+  __lsx_vst(tmp0, output, 0);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(tmp1, output, 64);
+  __lsx_vst(in1, output, 96);
+  __lsx_vst(tmp2, output, 128);
+  __lsx_vst(in2, output, 160);
+  __lsx_vst(tmp3, output, 192);
+  __lsx_vst(in3, output, 224);
+
+  LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+                     tmp5, in5, tmp6, in6, tmp7, in7);
+  __lsx_vst(tmp4, output, 16);
+  __lsx_vst(in4, output, 48);
+  __lsx_vst(tmp5, output, 80);
+  __lsx_vst(in5, output, 112);
+  __lsx_vst(tmp6, output, 144);
+  __lsx_vst(in6, output, 176);
+  __lsx_vst(tmp7, output, 208);
+  __lsx_vst(in7, output, 240);
+}
+
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+  in3 = __lsx_vldx(input, src_stride6);
+
+  /* fdct4 pre-process */
+  {
+    __m128i vec, mask;
+    __m128i zero = __lsx_vldi(0);
+
+    mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+    DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+              in3);
+    vec = __lsx_vseqi_h(in0, 0);
+    vec = __lsx_vxori_b(vec, 255);
+    vec = __lsx_vand_v(mask, vec);
+    in0 = __lsx_vadd_h(in0, vec);
+  }
+
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+            in2);
+  in3 = __lsx_vldx(input_tmp, src_stride6);
+  input_tmp += src_stride4;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+            in6);
+  in7 = __lsx_vldx(input_tmp, src_stride6);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 16);
+  __lsx_vst(in2, output, 32);
+  __lsx_vst(in3, output, 48);
+  __lsx_vst(in4, output, 64);
+  __lsx_vst(in5, output, 80);
+  __lsx_vst(in6, output, 96);
+  __lsx_vst(in7, output, 112);
+}
+
+void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+  /* column transform */
+  for (i = 0; i < 2; ++i) {
+    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+  }
+
+  /* row transform */
+  for (i = 0; i < 2; ++i) {
+    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+  }
+}
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
new file mode 100644
index 0000000000..4a9fce9a3d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -0,0 +1,381 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+
+#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                 \
+  do {                                                                        \
+    __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m;                               \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                   \
+    __m128i vec4_m, vec5_m, vec6_m, vec7_m;                                   \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df };             \
+                                                                              \
+    LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);    \
+    DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m);    \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m);    \
+    cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m);                              \
+    vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
+                                                                              \
+    vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m);                                 \
+    cnst2_m = __lsx_vreplvei_h(coeff_m, 2);                                   \
+    cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m);                              \
+    vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
+                                                                              \
+    DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m,     \
+              vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+              vec7_m, DCT_CONST_BITS, out0, out2, out1, out3);                \
+  } while (0)
+
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                  out3, out4, out5, out6, out7)                             \
+  do {                                                                      \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                       \
+    __m128i s7_m, x0_m, x1_m, x2_m, x3_m;                                   \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };           \
+                                                                            \
+    /* FDCT stage1 */                                                       \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m,   \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                  \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);      \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);           \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);           \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);        \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                          \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);        \
+    x2_m = __lsx_vneg_h(x2_m);                                              \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                          \
+                                                                            \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                    \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                          \
+                                                                            \
+    /* stage2 */                                                            \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                       \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                       \
+                                                                            \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                          \
+                                                                            \
+    /* stage3 */                                                            \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);      \
+                                                                            \
+    /* stage4 */                                                            \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);           \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);           \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);        \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                          \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);        \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                          \
+                                                                            \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                    \
+    x0_m = __lsx_vneg_h(x0_m);                                              \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                    \
+    x3_m = __lsx_vneg_h(x3_m);                                              \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                          \
+  } while (0)
+
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)             \
+  do {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+                                                                            \
+    DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m,    \
+              vec1_m, vec2_m, vec3_m);                                      \
+    DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m,    \
+              vec5_m, vec6_m, vec7_m);                                      \
+    DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m,  \
+              in3, in0, in1, in2, in3);                                     \
+    DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m,  \
+              in7, in4, in5, in6, in7);                                     \
+  } while (0)
+
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+  do {                                       \
+    __m128i tp0_m, tp1_m;                    \
+    __m128i one = __lsx_vreplgr2vr_h(1);     \
+                                             \
+    tp0_m = __lsx_vslei_h(vec0, 0);          \
+    tp1_m = __lsx_vslei_h(vec1, 0);          \
+    tp0_m = __lsx_vxori_b(tp0_m, 255);       \
+    tp1_m = __lsx_vxori_b(tp1_m, 255);       \
+    vec0 = __lsx_vadd_h(vec0, one);          \
+    vec1 = __lsx_vadd_h(vec1, one);          \
+    tp0_m = __lsx_vand_v(one, tp0_m);        \
+    tp1_m = __lsx_vand_v(one, tp1_m);        \
+    vec0 = __lsx_vadd_h(vec0, tp0_m);        \
+    vec1 = __lsx_vadd_h(vec1, tp1_m);        \
+    vec0 = __lsx_vsrai_h(vec0, 2);           \
+    vec1 = __lsx_vsrai_h(vec1, 2);           \
+  } while (0)
+
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+  do {                                     \
+    __m128i tp0_m, tp1_m;                  \
+    __m128i one_m = __lsx_vldi(0x401);     \
+                                           \
+    tp0_m = __lsx_vslti_h(vec0, 0);        \
+    tp1_m = __lsx_vslti_h(vec1, 0);        \
+    vec0 = __lsx_vadd_h(vec0, one_m);      \
+    vec1 = __lsx_vadd_h(vec1, one_m);      \
+    tp0_m = __lsx_vand_v(one_m, tp0_m);    \
+    tp1_m = __lsx_vand_v(one_m, tp1_m);    \
+    vec0 = __lsx_vadd_h(vec0, tp0_m);      \
+    vec1 = __lsx_vadd_h(vec1, tp1_m);      \
+    vec0 = __lsx_vsrai_h(vec0, 2);         \
+    vec1 = __lsx_vsrai_h(vec1, 2);         \
+  } while (0)
+
+#define FDCT32_POSTPROC_NEG_W(vec)         \
+  do {                                     \
+    __m128i temp_m;                        \
+    __m128i one_m = __lsx_vreplgr2vr_w(1); \
+                                           \
+    temp_m = __lsx_vslti_w(vec, 0);        \
+    vec = __lsx_vadd_w(vec, one_m);        \
+    temp_m = __lsx_vand_v(one_m, temp_m);  \
+    vec = __lsx_vadd_w(vec, temp_m);       \
+    vec = __lsx_vsrai_w(vec, 2);           \
+  } while (0)
+
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right,       \
+                          const0, const1, out0, out1, out2, out3)             \
+  do {                                                                        \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                   \
+    __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1;                         \
+    __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0);                       \
+                                                                              \
+    s0_m = __lsx_vreplgr2vr_w((int32_t)const1);                               \
+    k0_m = __lsx_vpackev_w(s0_m, k0_m);                                       \
+                                                                              \
+    DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1);             \
+    s1_m = __lsx_vilvl_w(_tmp0, reg0_left);                                   \
+    s0_m = __lsx_vilvh_w(_tmp0, reg0_left);                                   \
+    s3_m = __lsx_vilvl_w(reg0_left, reg1_left);                               \
+    s2_m = __lsx_vilvh_w(reg0_left, reg1_left);                               \
+    s5_m = __lsx_vilvl_w(_tmp1, reg0_right);                                  \
+    s4_m = __lsx_vilvh_w(_tmp1, reg0_right);                                  \
+    s7_m = __lsx_vilvl_w(reg0_right, reg1_right);                             \
+    s6_m = __lsx_vilvh_w(reg0_right, reg1_right);                             \
+    DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m);          \
+    DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m);          \
+    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+              DCT_CONST_BITS, out0, out1);                                    \
+    DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m);          \
+    DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m);          \
+    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+              DCT_CONST_BITS, out2, out3);                                    \
+  } while (0)
+
+#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2,   \
+                            in3)                                               \
+  do {                                                                         \
+    __m128i dst0_m, dst1_m, dst2_m, dst3_m;                                    \
+    __m128i tmp0_m, tmp1_m;                                                    \
+    __m128i res0_m, res1_m, res2_m, res3_m;                                    \
+                                                                               \
+    dst0_m = __lsx_vld(dst, 0);                                                \
+    DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m);        \
+    dst3_m = __lsx_vldx(dst, _stride3);                                        \
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
+              res0_m, res1_m, res2_m, res3_m);                                 \
+    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m,     \
+              in3, res0_m, res1_m, res2_m, res3_m);                            \
+    DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0,       \
+              tmp0_m, tmp1_m);                                                 \
+    __lsx_vstelm_d(tmp0_m, dst, 0, 0);                                         \
+    __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1);                               \
+    __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0);                              \
+    __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1);                              \
+  } while (0)
+
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                      out2, out3, out4, out5, out6, out7)                 \
+  do {                                                                    \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
+    __m128i x0_m, x1_m, x2_m, x3_m;                                       \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };         \
+                                                                          \
+    /* FDCT stage1 */                                                     \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);    \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);         \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);         \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);      \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                        \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);      \
+    x2_m = __lsx_vneg_h(x2_m);                                            \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                        \
+                                                                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                        \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                  \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                        \
+                                                                          \
+    /* stage2 */                                                          \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                     \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                     \
+                                                                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                        \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                        \
+                                                                          \
+    /* stage3 */                                                          \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);    \
+                                                                          \
+    /* stage4 */                                                          \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);         \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);         \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);      \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                        \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);      \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                        \
+                                                                          \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                  \
+    x0_m = __lsx_vneg_h(x0_m);                                            \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                        \
+                                                                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                  \
+    x3_m = __lsx_vneg_h(x3_m);                                            \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                        \
+  } while (0)
+
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,  \
+                     input7, out1, out3, out5, out7, out9, out11, out13,      \
+                     out15)                                                   \
+  do {                                                                        \
+    __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;             \
+    __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;             \
+    __m128i stp36_m, stp37_m, vec0_m, vec1_m;                                 \
+    __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                           \
+    __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m;                               \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };             \
+    __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };            \
+    __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 };                           \
+                                                                              \
+    /* stp 1 */                                                               \
+    DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
+    DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
+                                                                              \
+    cnst4_m = __lsx_vreplvei_h(coeff_m, 0);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m);                  \
+                                                                              \
+    cnst5_m = __lsx_vreplvei_h(coeff_m, 1);                                   \
+    cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m);                  \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m);                  \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m);                  \
+                                                                              \
+    /* stp2 */                                                                \
+    LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m,     \
+                      stp32_m, stp33_m);                                      \
+    LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m,     \
+                      stp35_m, stp34_m);                                      \
+                                                                              \
+    DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m,      \
+              vec4_m);                                                        \
+    DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m,      \
+              vec5_m);                                                        \
+                                                                              \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m);    \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m);                  \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff_m, 4);                                   \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m);                  \
+                                                                              \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m);    \
+    cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m);                  \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff_m, 3);                                   \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m);                  \
+                                                                              \
+    /* stp4 */                                                                \
+    LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m,     \
+                      vec4_m, vec5_m);                                        \
+    LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m,   \
+                      stp24_m, stp31_m);                                      \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(vec2_m, vec6_m);                                   \
+    vec0_m = __lsx_vilvh_h(vec2_m, vec6_m);                                   \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m);  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1);                     \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff2_m, 0);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15);                    \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(vec4_m, vec5_m);                                   \
+    vec0_m = __lsx_vilvh_h(vec4_m, vec5_m);                                   \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m);  \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9);                     \
+                                                                              \
+    cnst1_m = __lsx_vreplvei_h(coeff2_m, 2);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7);                     \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(stp23_m, stp21_m);                                 \
+    vec0_m = __lsx_vilvh_h(stp23_m, stp21_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m);  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5);                     \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff2_m, 1);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11);                    \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(stp24_m, stp31_m);                                 \
+    vec0_m = __lsx_vilvh_h(stp24_m, stp31_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m);  \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13);                    \
+                                                                              \
+    cnst1_m = __lsx_vreplvei_h(coeff2_m, 3);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3);                     \
+  } while (0)
+
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif  // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
new file mode 100644
index 0000000000..ec07f57d90
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
@@ -0,0 +1,834 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
+#define UNPCK_UB_SH(_in, _out0, _out1)   \
+  do {                                   \
+    _out0 = __lsx_vsllwil_hu_bu(_in, 0); \
+    _out1 = __lsx_vexth_hu_bu(_in);      \
+  } while (0)
+
+static void idct32x8_row_transpose_store(const int16_t *input,
+                                         int16_t *tmp_buf) {
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* 1st & 2nd 8x8 */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1,
+            n1);
+  DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2,
+            m3, n3);
+  DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5,
+            n5);
+  DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6,
+            m7, n7);
+
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+
+  __lsx_vst(m0, tmp_buf, 0);
+  __lsx_vst(n0, tmp_buf, 16);
+  __lsx_vst(m1, tmp_buf, 32);
+  __lsx_vst(n1, tmp_buf, 48);
+  __lsx_vst(m2, tmp_buf, 64);
+  __lsx_vst(n2, tmp_buf, 80);
+  __lsx_vst(m3, tmp_buf, 96);
+  __lsx_vst(n3, tmp_buf, 112);
+  __lsx_vst(m4, tmp_buf, 128);
+  __lsx_vst(n4, tmp_buf, 144);
+  __lsx_vst(m5, tmp_buf, 160);
+  __lsx_vst(n5, tmp_buf, 176);
+  __lsx_vst(m6, tmp_buf, 192);
+  __lsx_vst(n6, tmp_buf, 208);
+  __lsx_vst(m7, tmp_buf, 224);
+  __lsx_vst(n7, tmp_buf, 240);
+
+  /* 3rd & 4th 8x8 */
+  DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1,
+            n1);
+  DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2,
+            m3, n3);
+  DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4,
+            m5, n5);
+  DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6,
+            m7, n7);
+
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+
+  __lsx_vst(m0, tmp_buf, 256);
+  __lsx_vst(n0, tmp_buf, 272);
+  __lsx_vst(m1, tmp_buf, 288);
+  __lsx_vst(n1, tmp_buf, 304);
+  __lsx_vst(m2, tmp_buf, 320);
+  __lsx_vst(n2, tmp_buf, 336);
+  __lsx_vst(m3, tmp_buf, 352);
+  __lsx_vst(n3, tmp_buf, 368);
+  __lsx_vst(m4, tmp_buf, 384);
+  __lsx_vst(n4, tmp_buf, 400);
+  __lsx_vst(m5, tmp_buf, 416);
+  __lsx_vst(n5, tmp_buf, 432);
+  __lsx_vst(m6, tmp_buf, 448);
+  __lsx_vst(n6, tmp_buf, 464);
+  __lsx_vst(m7, tmp_buf, 480);
+  __lsx_vst(n7, tmp_buf, 496);
+}
+
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+                                            int16_t *tmp_eve_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+  __m128i tmp0;
+
+  /* Even stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480,
+            reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = __lsx_vadd_h(reg0, reg4);
+  reg0 = __lsx_vsub_h(reg0, reg4);
+  reg4 = __lsx_vadd_h(reg6, reg2);
+  reg6 = __lsx_vsub_h(reg6, reg2);
+  reg2 = __lsx_vadd_h(reg1, reg5);
+  reg1 = __lsx_vsub_h(reg1, reg5);
+  reg5 = __lsx_vadd_h(reg7, reg3);
+  reg7 = __lsx_vsub_h(reg7, reg3);
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = __lsx_vadd_h(reg3, reg4);
+  reg3 = __lsx_vsub_h(reg3, reg4);
+  reg4 = __lsx_vsub_h(reg5, vec1);
+  reg5 = __lsx_vadd_h(reg5, vec1);
+
+  tmp0 = __lsx_vneg_h(reg6);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = __lsx_vsub_h(reg0, reg6);
+  reg0 = __lsx_vadd_h(reg0, reg6);
+  vec1 = __lsx_vsub_h(reg7, reg1);
+  reg7 = __lsx_vadd_h(reg7, reg1);
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 240);
+  __lsx_vst(loc1, tmp_eve_buf, 0);
+  __lsx_vst(loc2, tmp_eve_buf, 224);
+  __lsx_vst(loc3, tmp_eve_buf, 16);
+
+  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 208);
+  __lsx_vst(loc1, tmp_eve_buf, 32);
+  __lsx_vst(loc2, tmp_eve_buf, 192);
+  __lsx_vst(loc3, tmp_eve_buf, 48);
+
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 176);
+  __lsx_vst(loc1, tmp_eve_buf, 64);
+  __lsx_vst(loc2, tmp_eve_buf, 160);
+  __lsx_vst(loc3, tmp_eve_buf, 80);
+
+  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 144);
+  __lsx_vst(loc1, tmp_eve_buf, 96);
+  __lsx_vst(loc2, tmp_eve_buf, 128);
+  __lsx_vst(loc3, tmp_eve_buf, 112);
+}
+
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+                                           int16_t *tmp_odd_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = __lsx_vadd_h(reg0, reg3);
+  reg0 = __lsx_vsub_h(reg0, reg3);
+  reg3 = __lsx_vadd_h(reg7, reg4);
+  reg7 = __lsx_vsub_h(reg7, reg4);
+  reg4 = __lsx_vadd_h(reg1, reg2);
+  reg1 = __lsx_vsub_h(reg1, reg2);
+  reg2 = __lsx_vadd_h(reg6, reg5);
+  reg6 = __lsx_vsub_h(reg6, reg5);
+  reg5 = vec0;
+
+  /* 4 Stores */
+  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 64);
+  __lsx_vst(vec1, tmp_odd_buf, 80);
+
+  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 0);
+  __lsx_vst(vec1, tmp_odd_buf, 16);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 96);
+  __lsx_vst(vec1, tmp_odd_buf, 112);
+
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  __lsx_vst(vec2, tmp_odd_buf, 32);
+  __lsx_vst(vec3, tmp_odd_buf, 48);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+            vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+  LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 192);
+  __lsx_vst(vec1, tmp_odd_buf, 240);
+
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 160);
+  __lsx_vst(vec1, tmp_odd_buf, 176);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1,
+            vec2, vec0, vec3);
+  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  __lsx_vst(reg0, tmp_odd_buf, 208);
+  __lsx_vst(reg1, tmp_odd_buf, 224);
+
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  __lsx_vst(reg0, tmp_odd_buf, 128);
+  __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 0);
+  __lsx_vst(loc1, tmp_odd_buf, 16);
+  __lsx_vst(loc2, tmp_odd_buf, 32);
+  __lsx_vst(loc3, tmp_odd_buf, 48);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 128);
+  __lsx_vst(loc1, tmp_odd_buf, 144);
+  __lsx_vst(loc2, tmp_odd_buf, 160);
+  __lsx_vst(loc3, tmp_odd_buf, 176);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 64);
+  __lsx_vst(loc1, tmp_odd_buf, 80);
+  __lsx_vst(loc2, tmp_odd_buf, 96);
+  __lsx_vst(loc3, tmp_odd_buf, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 192);
+  __lsx_vst(loc1, tmp_odd_buf, 208);
+  __lsx_vst(loc2, tmp_odd_buf, 224);
+  __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                           int16_t *tmp_eve_buf,
+                                           int16_t *tmp_odd_buf, int16_t *dst) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+  __m128i reg0, reg1, reg2, reg3;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+            m4, m2, m6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 496);
+  __lsx_vst(reg1, tmp_buf, 368);
+  __lsx_vst(reg2, tmp_buf, 432);
+  __lsx_vst(reg3, tmp_buf, 304);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+            m5, m3, m7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 464);
+  __lsx_vst(reg1, tmp_buf, 336);
+  __lsx_vst(reg2, tmp_buf, 400);
+  __lsx_vst(reg3, tmp_buf, 272);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+            n4, n2, n6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 480);
+  __lsx_vst(reg1, tmp_buf, 352);
+  __lsx_vst(reg2, tmp_buf, 416);
+  __lsx_vst(reg3, tmp_buf, 288);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+            n5, n3, n7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 448);
+  __lsx_vst(reg1, tmp_buf, 320);
+  __lsx_vst(reg2, tmp_buf, 384);
+  __lsx_vst(reg3, tmp_buf, 256);
+
+  /* Transpose : 16 vectors */
+  /* 1st & 2nd 8x8 */
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  __lsx_vst(m0, dst, 0);
+  __lsx_vst(n0, dst, 64);
+  __lsx_vst(m1, dst, 128);
+  __lsx_vst(n1, dst, 192);
+  __lsx_vst(m2, dst, 256);
+  __lsx_vst(n2, dst, 320);
+  __lsx_vst(m3, dst, 384);
+  __lsx_vst(n3, dst, 448);
+
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+  __lsx_vst(m4, dst, 16);
+  __lsx_vst(n4, dst, 80);
+  __lsx_vst(m5, dst, 144);
+  __lsx_vst(n5, dst, 208);
+  __lsx_vst(m6, dst, 272);
+  __lsx_vst(n6, dst, 336);
+  __lsx_vst(m7, dst, 400);
+  __lsx_vst(n7, dst, 464);
+
+  /* 3rd & 4th 8x8 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304,
+            m0, n0, m1, n1);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368,
+            m2, n2, m3, n3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432,
+            m4, n4, m5, n5);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496,
+            m6, n6, m7, n7);
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+  __lsx_vst(m0, dst, 32);
+  __lsx_vst(n0, dst, 96);
+  __lsx_vst(m1, dst, 160);
+  __lsx_vst(n1, dst, 224);
+  __lsx_vst(m2, dst, 288);
+  __lsx_vst(n2, dst, 352);
+  __lsx_vst(m3, dst, 416);
+  __lsx_vst(n3, dst, 480);
+  __lsx_vst(m4, dst, 48);
+  __lsx_vst(n4, dst, 112);
+  __lsx_vst(m5, dst, 176);
+  __lsx_vst(n5, dst, 240);
+  __lsx_vst(m6, dst, 304);
+  __lsx_vst(n6, dst, 368);
+  __lsx_vst(m7, dst, 432);
+  __lsx_vst(n7, dst, 496);
+}
+
+static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) {
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct32x8_row_transpose_store(input, &tmp_buf[0]);
+  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+                                 output);
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+  __m128i tmp0;
+
+  /* Even stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+            1792, reg4, reg5, reg6, reg7);
+  tmp_buf += 64;
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  /* Load 8 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+            1792, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = __lsx_vadd_h(reg0, reg4);
+  reg0 = __lsx_vsub_h(reg0, reg4);
+  reg4 = __lsx_vadd_h(reg6, reg2);
+  reg6 = __lsx_vsub_h(reg6, reg2);
+  reg2 = __lsx_vadd_h(reg1, reg5);
+  reg1 = __lsx_vsub_h(reg1, reg5);
+  reg5 = __lsx_vadd_h(reg7, reg3);
+  reg7 = __lsx_vsub_h(reg7, reg3);
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = __lsx_vadd_h(reg3, reg4);
+  reg3 = __lsx_vsub_h(reg3, reg4);
+  reg4 = __lsx_vsub_h(reg5, vec1);
+  reg5 = __lsx_vadd_h(reg5, vec1);
+
+  tmp0 = __lsx_vneg_h(reg6);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = __lsx_vsub_h(reg0, reg6);
+  reg0 = __lsx_vadd_h(reg0, reg6);
+  vec1 = __lsx_vsub_h(reg7, reg1);
+  reg7 = __lsx_vadd_h(reg7, reg1);
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 0);
+  __lsx_vst(loc3, tmp_eve_buf, 16);
+  __lsx_vst(loc2, tmp_eve_buf, 224);
+  __lsx_vst(loc0, tmp_eve_buf, 240);
+
+  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 32);
+  __lsx_vst(loc3, tmp_eve_buf, 48);
+  __lsx_vst(loc2, tmp_eve_buf, 192);
+  __lsx_vst(loc0, tmp_eve_buf, 208);
+
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 64);
+  __lsx_vst(loc3, tmp_eve_buf, 80);
+  __lsx_vst(loc2, tmp_eve_buf, 160);
+  __lsx_vst(loc0, tmp_eve_buf, 176);
+
+  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 96);
+  __lsx_vst(loc3, tmp_eve_buf, 112);
+  __lsx_vst(loc2, tmp_eve_buf, 128);
+  __lsx_vst(loc0, tmp_eve_buf, 144);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                              int16_t *tmp_odd_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf,
+            1984, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = __lsx_vadd_h(reg0, reg3);
+  reg0 = __lsx_vsub_h(reg0, reg3);
+  reg3 = __lsx_vadd_h(reg7, reg4);
+  reg7 = __lsx_vsub_h(reg7, reg4);
+  reg4 = __lsx_vadd_h(reg1, reg2);
+  reg1 = __lsx_vsub_h(reg1, reg2);
+  reg2 = __lsx_vadd_h(reg6, reg5);
+  reg6 = __lsx_vsub_h(reg6, reg5);
+  reg5 = vec0;
+
+  /* 4 Stores */
+  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 64);
+  __lsx_vst(vec1, tmp_odd_buf, 80);
+  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 0);
+  __lsx_vst(vec1, tmp_odd_buf, 16);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 96);
+  __lsx_vst(vec1, tmp_odd_buf, 112);
+  __lsx_vst(vec2, tmp_odd_buf, 32);
+  __lsx_vst(vec3, tmp_odd_buf, 48);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf,
+            1856, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+            vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+  LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+  __lsx_vst(vec0, tmp_odd_buf, 192);
+  __lsx_vst(vec1, tmp_odd_buf, 240);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 160);
+  __lsx_vst(vec1, tmp_odd_buf, 176);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0,
+            vec1, vec2, vec3);
+  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  __lsx_vst(reg0, tmp_odd_buf, 208);
+  __lsx_vst(reg1, tmp_odd_buf, 224);
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  __lsx_vst(reg0, tmp_odd_buf, 128);
+  __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 0);
+  __lsx_vst(loc1, tmp_odd_buf, 16);
+  __lsx_vst(loc2, tmp_odd_buf, 32);
+  __lsx_vst(loc3, tmp_odd_buf, 48);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 128);
+  __lsx_vst(loc1, tmp_odd_buf, 144);
+  __lsx_vst(loc2, tmp_odd_buf, 160);
+  __lsx_vst(loc3, tmp_odd_buf, 176);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 64);
+  __lsx_vst(loc1, tmp_odd_buf, 80);
+  __lsx_vst(loc2, tmp_odd_buf, 96);
+  __lsx_vst(loc3, tmp_odd_buf, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 192);
+  __lsx_vst(loc1, tmp_odd_buf, 208);
+  __lsx_vst(loc2, tmp_odd_buf, 224);
+  __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                             int16_t *tmp_odd_buf, uint8_t *dst,
+                                             int32_t dst_stride) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+  int32_t stride = dst_stride << 2;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride + stride2;
+
+  /* FINAL BUTTERFLY : Dependency on Even & Odd */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+            m4, m2, m6);
+  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+  VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6,
+            m2, m4, m0);
+  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2,
+                      m4, m6);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+            m5, m3, m7);
+  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3,
+                      m5, m7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7,
+            m3, m5, m1);
+  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3,
+                      m5, m7);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+            n4, n2, n6);
+  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+  VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4,
+                      n6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6,
+            n2, n4, n0);
+  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2,
+                      n4, n6);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+            n5, n3, n7);
+  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3,
+                      n5, n7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7,
+            n3, n5, n1);
+  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3,
+                      n5, n7);
+}
+
+static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+                                   dst_stride);
+}
+
+void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst,
+                                int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+
+  /* transform rows */
+  for (i = 0; i < 4; ++i) {
+    /* process 32 * 8 block */
+    idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8)));
+  }
+
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
+void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+  __m128i zero = __lsx_vldi(0);
+
+  for (i = 32; i--;) {
+    __lsx_vst(zero, out_ptr, 0);
+    __lsx_vst(zero, out_ptr, 16);
+    __lsx_vst(zero, out_ptr, 32);
+    __lsx_vst(zero, out_ptr, 48);
+    out_ptr += 32;
+  }
+
+  out_ptr = out_arr;
+
+  /* rows: only upper-left 8x8 has non-zero coeff */
+  idct32x8_1d_rows_lsx(input, out_ptr);
+
+  /* transform columns */
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
+void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  int32_t i;
+  int16_t out;
+  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 6);
+
+  vec = __lsx_vreplgr2vr_h(out);
+
+  for (i = 16; i--;) {
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    dst2 = __lsx_vldx(dst, dst_stride);
+    dst3 = __lsx_vldx(dst + 16, dst_stride);
+
+    UNPCK_UB_SH(dst0, res0, res4);
+    UNPCK_UB_SH(dst1, res1, res5);
+    UNPCK_UB_SH(dst2, res2, res6);
+    UNPCK_UB_SH(dst3, res3, res7);
+
+    DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+              res1, res2, res3);
+    DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4,
+              res5, res6, res7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0,
+              res7, res3, 0, tmp0, tmp1, tmp2, tmp3);
+    __lsx_vst(tmp0, dst, 0);
+    __lsx_vst(tmp1, dst, 16);
+    dst += dst_stride;
+    __lsx_vst(tmp2, dst, 0);
+    __lsx_vst(tmp3, dst, 16);
+    dst += dst_stride;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
new file mode 100644
index 0000000000..f990211791
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t dst_stride) {
+  uint64_t val0, val1;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i store, sum_h, sum_w, sum_d;
+  __m128i src = { 0 };
+
+  val0 = *(const uint64_t *)src_top;
+  val1 = *(const uint64_t *)src_left;
+  DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src);
+  sum_h = __lsx_vhaddw_hu_bu(src, src);
+  sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vpickev_w(sum_d, sum_d);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vsrari_w(sum_d, 4);
+  store = __lsx_vreplvei_b(sum_w, 0);
+
+  __lsx_vstelm_d(store, dst, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+  dst += dst_stride_x4;
+  __lsx_vstelm_d(store, dst, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+}
+
+static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top,
+                                              const uint8_t *src_left,
+                                              uint8_t *dst,
+                                              int32_t dst_stride) {
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i top, left, out;
+  __m128i sum_h, sum_top, sum_left;
+  __m128i sum_w;
+  __m128i sum_d;
+
+  DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left);
+  DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left);
+  sum_h = __lsx_vadd_h(sum_top, sum_left);
+  sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vpickev_w(sum_d, sum_d);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vsrari_w(sum_d, 5);
+  out = __lsx_vreplvei_b(sum_w, 0);
+
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+}
+
+void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_lsx(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_lsx(above, left, dst, y_stride);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
new file mode 100644
index 0000000000..0503df9966
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+#include "vpx_ports/mem.h"
+
+#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \
+                 _in2, _in3, _in4, _in5, _in6, _in7)                      \
+  do {                                                                    \
+    _in0 = __lsx_vld(_src, 0);                                            \
+    _in1 = __lsx_vldx(_src, _stride);                                     \
+    _in2 = __lsx_vldx(_src, _stride2);                                    \
+    _in3 = __lsx_vldx(_src, _stride3);                                    \
+    _src += _stride4;                                                     \
+    _in4 = __lsx_vld(_src, 0);                                            \
+    _in5 = __lsx_vldx(_src, _stride);                                     \
+    _in6 = __lsx_vldx(_src, _stride2);                                    \
+    _in7 = __lsx_vldx(_src, _stride3);                                    \
+  } while (0)
+
+#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \
+                 _stride, _stride2, _stride3, _stride4)                        \
+  do {                                                                         \
+    __lsx_vst(_dst0, _dst, 0);                                                 \
+    __lsx_vstx(_dst1, _dst, _stride);                                          \
+    __lsx_vstx(_dst2, _dst, _stride2);                                         \
+    __lsx_vstx(_dst3, _dst, _stride3);                                         \
+    _dst += _stride4;                                                          \
+    __lsx_vst(_dst4, _dst, 0);                                                 \
+    __lsx_vstx(_dst5, _dst, _stride);                                          \
+    __lsx_vstx(_dst6, _dst, _stride2);                                         \
+    __lsx_vstx(_dst7, _dst, _stride3);                                         \
+  } while (0)
+
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride,
+                                    uint8_t *filter48,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vstx(p1_out, dst, -stride2);
+    __lsx_vstx(p0_out, dst, -stride);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vstx(q1_out, dst, stride);
+
+    return 1;
+  }
+
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+            p0_l);
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+            q3_l);
+
+  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+  DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+              p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+  /* convert 16 bit output data into 8 bit */
+  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+            p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+            p1_filt8_l, p0_filt8_l, q0_filt8_l);
+  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+            q1_filt8_l, q2_filt8_l);
+
+  /* store pixel values */
+  DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat,
+            p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out,
+            p0_out, q0_out);
+  DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat,
+            q1_out, q2_out);
+
+  __lsx_vst(p2_out, filter48, 0);
+  __lsx_vst(p1_out, filter48, 16);
+  __lsx_vst(p0_out, filter48, 32);
+  __lsx_vst(q0_out, filter48, 48);
+  __lsx_vst(q1_out, filter48, 64);
+  __lsx_vst(q2_out, filter48, 80);
+  __lsx_vst(flat, filter48, 96);
+
+  return 0;
+}
+
+static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) {
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  uint8_t *dst_tmp0 = dst - stride4;
+  uint8_t *dst_tmp1 = dst + stride4;
+
+  __m128i flat, flat2, filter8;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i out_h, out_l;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+
+  flat = __lsx_vld(filter48, 96);
+
+  DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+            -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+
+  p3 = __lsx_vld(dst_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1);
+  p0 = __lsx_vldx(dst_tmp0, stride3);
+
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  q4 = __lsx_vld(dst_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+  q7 = __lsx_vldx(dst_tmp1, stride3);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+  if (__lsx_bz_v(flat2)) {
+    DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+              p2, p1, p0, q0);
+    DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+    __lsx_vstx(p2, dst, -stride3);
+    __lsx_vstx(p1, dst, -stride2);
+    __lsx_vstx(p0, dst, -stride);
+    __lsx_vst(q0, dst, 0);
+    __lsx_vstx(q1, dst, stride);
+    __lsx_vstx(q2, dst, stride2);
+  } else {
+    dst = dst_tmp0 - stride3;
+
+    p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+    p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+    p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+    p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+    p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+    p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+    p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+    p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+    q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+    tmp0_l = p7_l_in << 3;
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l;
+
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+    p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+    p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+    p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+    p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+    p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+    p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+    p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+    q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+    tmp0_h = p7_h_in << 3;
+    tmp0_h -= p7_h_in;
+    tmp0_h += p6_h_in;
+    tmp0_h += q0_h_in;
+    tmp1_h = p6_h_in + p5_h_in;
+    tmp1_h += p4_h_in;
+    tmp1_h += p3_h_in;
+    tmp1_h += p2_h_in;
+    tmp1_h += p1_h_in;
+    tmp1_h += p0_h_in;
+    tmp1_h += tmp0_h;
+
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+    __lsx_vst(p6, dst, 0);
+    dst += stride;
+
+    /* p5 */
+    q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+    tmp0_h = p5_h_in - p6_h_in;
+    tmp0_h += q1_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+    __lsx_vst(p5, dst, 0);
+    dst += stride;
+
+    /* p4 */
+    q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+    tmp0_h = p4_h_in - p5_h_in;
+    tmp0_h += q2_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+    __lsx_vst(p4, dst, 0);
+    dst += stride;
+
+    /* p3 */
+    q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+    tmp0_h = p3_h_in - p4_h_in;
+    tmp0_h += q3_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+    __lsx_vst(p3, dst, 0);
+    dst += stride;
+
+    /* p2 */
+    q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+    filter8 = __lsx_vld(filter48, 0);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+    tmp0_h = p2_h_in - p3_h_in;
+    tmp0_h += q4_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* p1 */
+    q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+    filter8 = __lsx_vld(filter48, 16);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+    tmp0_h = p1_h_in - p2_h_in;
+    tmp0_h += q5_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* p0 */
+    q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+    filter8 = __lsx_vld(filter48, 32);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+    tmp0_h = p0_h_in - p1_h_in;
+    tmp0_h += q6_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q0 */
+    q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+    filter8 = __lsx_vld(filter48, 48);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+    tmp0_h = q7_h_in - p0_h_in;
+    tmp0_h += q0_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q1 */
+    filter8 = __lsx_vld(filter48, 64);
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q0_h_in;
+    tmp0_h += q1_h_in;
+    tmp0_h -= p6_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q2 */
+    filter8 = __lsx_vld(filter48, 80);
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q1_h_in;
+    tmp0_h += q2_h_in;
+    tmp0_h -= p5_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q3 */
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q2_h_in;
+    tmp0_h += q3_h_in;
+    tmp0_h -= p4_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+    __lsx_vst(q3, dst, 0);
+    dst += stride;
+
+    /* q4 */
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q3_h_in;
+    tmp0_h += q4_h_in;
+    tmp0_h -= p3_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+    __lsx_vst(q4, dst, 0);
+    dst += stride;
+
+    /* q5 */
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q4_h_in;
+    tmp0_h += q5_h_in;
+    tmp0_h -= p2_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+    __lsx_vst(q5, dst, 0);
+    dst += stride;
+
+    /* q6 */
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q5_h_in;
+    tmp0_h += q6_h_in;
+    tmp0_h -= p1_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+    __lsx_vst(q6, dst, 0);
+  }
+}
+
+static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride,
+                                        const uint8_t *b_limit_ptr,
+                                        const uint8_t *limit_ptr,
+                                        const uint8_t *thresh_ptr) {
+  DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]);
+  uint8_t early_exit = 0;
+
+  early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr,
+                                    limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    hz_lpf_t16_16w(dst, stride, filter48);
+  }
+}
+
+static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr, int32_t count) {
+  if (count == 1) {
+    __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    __m128i p0_filter16, p1_filter16;
+    __m128i p2_filter8, p1_filter8, p0_filter8;
+    __m128i q0_filter8, q1_filter8, q2_filter8;
+    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
+    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp0, tmp1, tmp2;
+
+    int32_t stride2 = stride << 1;
+    int32_t stride3 = 2 + stride;
+    int32_t stride4 = stride << 2;
+    uint8_t *dst_tmp0 = dst - stride4;
+    uint8_t *dst_tmp1 = dst + stride4;
+
+    /* load vector elements */
+    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+              -stride, p3, p2, p1, p0);
+    q0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+    q3 = __lsx_vldx(dst, stride3);
+
+    thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+    b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+    limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+    /* filter_mask* */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+                 mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    flat = __lsx_vilvl_d(zero, flat);
+    if (__lsx_bz_v(flat)) {
+      __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+      __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+      __lsx_vstelm_d(q0_out, dst, 0, 0);
+      __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l,
+                p2_l, p1_l, p0_l);
+      DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l,
+                q1_l, q2_l, q3_l);
+      VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero,
+                p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8,
+                p0_filter8, q0_filter8);
+      DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                q2_filter8);
+
+      /* store pixel values */
+      p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
+      p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
+      p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
+      q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
+      q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
+      q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
+
+      /* load 16 vector elements */
+      DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+                -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+      q4 = __lsx_vld(dst_tmp1, 0);
+      DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+      q7 = __lsx_vldx(dst_tmp1, stride3);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__lsx_bz_v(flat2)) {
+        dst -= stride3;
+        __lsx_vstelm_d(p2_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q2_out, dst, 0, 0);
+      } else {
+        /* LSB(right) 8 pixel operation */
+        DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l,
+                  p6_l, p5_l, p4_l);
+        DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l,
+                  q5_l, q6_l, q7_l);
+
+        tmp0 = __lsx_vslli_h(p7_l, 3);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp0 = __lsx_vadd_h(tmp0, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q0_l);
+
+        dst = dst_tmp0 - stride3;
+
+        /* calculation of p6 and p5 */
+        tmp1 = __lsx_vadd_h(p6_l, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, p4_l);
+        tmp1 = __lsx_vadd_h(tmp1, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, p2_l);
+        tmp1 = __lsx_vadd_h(tmp1, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, p0_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp0 = __lsx_vsub_h(p5_l, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p4 and p3 */
+        tmp0 = __lsx_vsub_h(p4_l, p5_l);
+        tmp0 = __lsx_vadd_h(tmp0, q2_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p3_l, p4_l);
+        tmp2 = __lsx_vadd_h(tmp2, q3_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p2 and p1 */
+        tmp0 = __lsx_vsub_h(p2_l, p3_l);
+        tmp0 = __lsx_vadd_h(tmp0, q4_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p1_l, p2_l);
+        tmp2 = __lsx_vadd_h(tmp2, q5_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p0 and q0 */
+        tmp0 = __lsx_vsub_h(p0_l, p1_l);
+        tmp0 = __lsx_vadd_h(tmp0, q6_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(q7_l, p0_l);
+        tmp2 = __lsx_vadd_h(tmp2, q0_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q1 and q2 */
+        tmp0 = __lsx_vsub_h(q7_l, q0_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p6_l);
+        tmp2 = __lsx_vsub_h(q7_l, q1_l);
+        tmp2 = __lsx_vadd_h(tmp2, q2_l);
+        tmp2 = __lsx_vsub_h(tmp2, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q3 and q4 */
+        tmp0 = __lsx_vsub_h(q7_l, q2_l);
+        tmp0 = __lsx_vadd_h(tmp0, q3_l);
+        tmp0 = __lsx_vsub_h(tmp0, p4_l);
+        tmp2 = __lsx_vsub_h(q7_l, q3_l);
+        tmp2 = __lsx_vadd_h(tmp2, q4_l);
+        tmp2 = __lsx_vsub_h(tmp2, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q5 and q6 */
+        tmp0 = __lsx_vsub_h(q7_l, q4_l);
+        tmp0 = __lsx_vadd_h(tmp0, q5_l);
+        tmp0 = __lsx_vsub_h(tmp0, p2_l);
+        tmp2 = __lsx_vsub_h(q7_l, q5_l);
+        tmp2 = __lsx_vadd_h(tmp2, q6_l);
+        tmp2 = __lsx_vsub_h(tmp2, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+      }
+    }
+  } else {
+    mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr,
+                                thresh_ptr);
+  }
+}
+
+void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output,
+                            int32_t out_stride) {
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp2, tmp3;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  int32_t in_stride2 = in_stride << 1;
+  int32_t in_stride3 = in_stride2 + in_stride;
+  int32_t in_stride4 = in_stride2 << 1;
+  int32_t out_stride2 = out_stride << 1;
+  int32_t out_stride3 = out_stride2 + out_stride;
+  int32_t out_stride4 = out_stride2 << 1;
+
+  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1,
+           row2, row3, row4, row5, row6, row7);
+  input += in_stride4;
+  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9,
+           row10, row11, row12, row13, row14, row15);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
+                      p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 */
+  /* total 8 intermediate register and 32 instructions */
+  q7 = __lsx_vpackod_d(row8, row0);
+  q6 = __lsx_vpackod_d(row9, row1);
+  q5 = __lsx_vpackod_d(row10, row2);
+  q4 = __lsx_vpackod_d(row11, row3);
+  q3 = __lsx_vpackod_d(row12, row4);
+  q2 = __lsx_vpackod_d(row13, row5);
+  q1 = __lsx_vpackod_d(row14, row6);
+  q0 = __lsx_vpackod_d(row15, row7);
+
+  DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
+
+  DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
+  DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
+
+  DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
+  q0 = __lsx_vpackev_w(tmp3, tmp2);
+  q4 = __lsx_vpackod_w(tmp3, tmp2);
+
+  tmp2 = __lsx_vpackod_h(tmp1, tmp0);
+  tmp3 = __lsx_vpackod_h(q7, q5);
+  q2 = __lsx_vpackev_w(tmp3, tmp2);
+  q6 = __lsx_vpackod_w(tmp3, tmp2);
+
+  DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
+  q1 = __lsx_vpackev_w(tmp3, tmp2);
+  q5 = __lsx_vpackod_w(tmp3, tmp2);
+
+  tmp2 = __lsx_vpackod_h(tmp5, tmp4);
+  tmp3 = __lsx_vpackod_h(tmp7, tmp6);
+  q3 = __lsx_vpackev_w(tmp3, tmp2);
+  q7 = __lsx_vpackod_w(tmp3, tmp2);
+
+  LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2,
+           out_stride3, out_stride4);
+  output += out_stride4;
+  LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2,
+           out_stride3, out_stride4);
+}
+
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
+                                    uint8_t *dst_org, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0);
+  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  /* if flat is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat)) {
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    vec2 = __lsx_vilvl_h(vec1, vec0);
+    vec3 = __lsx_vilvh_h(vec1, vec0);
+    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    vec4 = __lsx_vilvl_h(vec1, vec0);
+    vec5 = __lsx_vilvh_h(vec1, vec0);
+
+    dst_org -= 2;
+    __lsx_vstelm_w(vec2, dst_org, 0, 0);
+    __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec3, dst_org, 0, 0);
+    __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec4, dst_org, 0, 0);
+    __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec5, dst_org, 0, 0);
+    __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
+
+    return 1;
+  }
+
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+            p0_l);
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+            q3_l);
+  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+  DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+              p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+  /* convert 16 bit output data into 8 bit */
+  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+            p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+            p1_filt8_l, p0_filt8_l, q0_filt8_l);
+  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+            q1_filt8_l, q2_filt8_l);
+
+  /* store pixel values */
+  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+  __lsx_vst(p2_out, filter48, 0);
+  __lsx_vst(p1_out, filter48, 16);
+  __lsx_vst(p0_out, filter48, 32);
+  __lsx_vst(q0_out, filter48, 48);
+  __lsx_vst(q1_out, filter48, 64);
+  __lsx_vst(q2_out, filter48, 80);
+  __lsx_vst(flat, filter48, 96);
+
+  return 0;
+}
+
+static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride,
+                              uint8_t *filter48) {
+  __m128i flat, flat2, filter8;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i out_l, out_h;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+  uint8_t *dst_tmp = dst - 128;
+
+  flat = __lsx_vld(filter48, 96);
+
+  DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7,
+            p6, p5, p4);
+  DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+  DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+  /* if flat2 is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat2)) {
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+              p2, p1, p0, q0);
+    DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
+    vec3 = __lsx_vilvl_h(vec1, vec0);
+    vec4 = __lsx_vilvh_h(vec1, vec0);
+    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
+    vec6 = __lsx_vilvl_h(vec1, vec0);
+    vec7 = __lsx_vilvh_h(vec1, vec0);
+    vec2 = __lsx_vilvl_b(q2, q1);
+    vec5 = __lsx_vilvh_b(q2, q1);
+
+    dst_org -= 3;
+    __lsx_vstelm_w(vec3, dst_org, 0, 0);
+    __lsx_vstelm_h(vec2, dst_org, 4, 0);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 1);
+    __lsx_vstelm_h(vec2, dst_org, 4, 1);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 2);
+    __lsx_vstelm_h(vec2, dst_org, 4, 2);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 3);
+    __lsx_vstelm_h(vec2, dst_org, 4, 3);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 0);
+    __lsx_vstelm_h(vec2, dst_org, 4, 4);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 1);
+    __lsx_vstelm_h(vec2, dst_org, 4, 5);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 2);
+    __lsx_vstelm_h(vec2, dst_org, 4, 6);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 3);
+    __lsx_vstelm_h(vec2, dst_org, 4, 7);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 0);
+    __lsx_vstelm_h(vec5, dst_org, 4, 0);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 1);
+    __lsx_vstelm_h(vec5, dst_org, 4, 1);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 2);
+    __lsx_vstelm_h(vec5, dst_org, 4, 2);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 3);
+    __lsx_vstelm_h(vec5, dst_org, 4, 3);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 0);
+    __lsx_vstelm_h(vec5, dst_org, 4, 4);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 1);
+    __lsx_vstelm_h(vec5, dst_org, 4, 5);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 2);
+    __lsx_vstelm_h(vec5, dst_org, 4, 6);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 3);
+    __lsx_vstelm_h(vec5, dst_org, 4, 7);
+
+    return 1;
+  }
+
+  dst -= 7 * 16;
+
+  p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+  p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+  p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+  p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+  p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+  p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+  p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+  p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+  q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+  tmp0_l = p7_l_in << 3;
+  tmp0_l -= p7_l_in;
+  tmp0_l += p6_l_in;
+  tmp0_l += q0_l_in;
+  tmp1_l = p6_l_in + p5_l_in;
+  tmp1_l += p4_l_in;
+  tmp1_l += p3_l_in;
+  tmp1_l += p2_l_in;
+  tmp1_l += p1_l_in;
+  tmp1_l += p0_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+  p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+  p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+  p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+  p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+  p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+  p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+  p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+  q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+  tmp0_h = p7_h_in << 3;
+  tmp0_h -= p7_h_in;
+  tmp0_h += p6_h_in;
+  tmp0_h += q0_h_in;
+  tmp1_h = p6_h_in + p5_h_in;
+  tmp1_h += p4_h_in;
+  tmp1_h += p3_h_in;
+  tmp1_h += p2_h_in;
+  tmp1_h += p1_h_in;
+  tmp1_h += p0_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+  __lsx_vst(p6, dst, 0);
+
+  /* p5 */
+  q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+  tmp0_l = p5_l_in - p6_l_in;
+  tmp0_l += q1_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+  tmp0_h = p5_h_in - p6_h_in;
+  tmp0_h += q1_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+  __lsx_vst(p5, dst, 16);
+
+  /* p4 */
+  q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+  tmp0_l = p4_l_in - p5_l_in;
+  tmp0_l += q2_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+  tmp0_h = p4_h_in - p5_h_in;
+  tmp0_h += q2_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+  __lsx_vst(p4, dst, 16 * 2);
+
+  /* p3 */
+  q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+  tmp0_l = p3_l_in - p4_l_in;
+  tmp0_l += q3_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+  tmp0_h = p3_h_in - p4_h_in;
+  tmp0_h += q3_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+  __lsx_vst(p3, dst, 16 * 3);
+
+  /* p2 */
+  q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+  filter8 = __lsx_vld(filter48, 0);
+  tmp0_l = p2_l_in - p3_l_in;
+  tmp0_l += q4_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+  tmp0_h = p2_h_in - p3_h_in;
+  tmp0_h += q4_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 4);
+
+  /* p1 */
+  q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+  filter8 = __lsx_vld(filter48, 16);
+  tmp0_l = p1_l_in - p2_l_in;
+  tmp0_l += q5_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+  tmp0_h = p1_h_in - p2_h_in;
+  tmp0_h += q5_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 5);
+
+  /* p0 */
+  q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+  filter8 = __lsx_vld(filter48, 32);
+  tmp0_l = p0_l_in - p1_l_in;
+  tmp0_l += q6_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+  tmp0_h = p0_h_in - p1_h_in;
+  tmp0_h += q6_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 6);
+
+  /* q0 */
+  q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+  filter8 = __lsx_vld(filter48, 48);
+  tmp0_l = q7_l_in - p0_l_in;
+  tmp0_l += q0_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+  tmp0_h = q7_h_in - p0_h_in;
+  tmp0_h += q0_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 7);
+
+  /* q1 */
+  filter8 = __lsx_vld(filter48, 64);
+  tmp0_l = q7_l_in - q0_l_in;
+  tmp0_l += q1_l_in;
+  tmp0_l -= p6_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q0_h_in;
+  tmp0_h += q1_h_in;
+  tmp0_h -= p6_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 8);
+
+  /* q2 */
+  filter8 = __lsx_vld(filter48, 80);
+  tmp0_l = q7_l_in - q1_l_in;
+  tmp0_l += q2_l_in;
+  tmp0_l -= p5_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q1_h_in;
+  tmp0_h += q2_h_in;
+  tmp0_h -= p5_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 9);
+
+  /* q3 */
+  tmp0_l = q7_l_in - q2_l_in;
+  tmp0_l += q3_l_in;
+  tmp0_l -= p4_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q2_h_in;
+  tmp0_h += q3_h_in;
+  tmp0_h -= p4_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+  __lsx_vst(q3, dst, 16 * 10);
+
+  /* q4 */
+  tmp0_l = q7_l_in - q3_l_in;
+  tmp0_l += q4_l_in;
+  tmp0_l -= p3_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q3_h_in;
+  tmp0_h += q4_h_in;
+  tmp0_h -= p3_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+  __lsx_vst(q4, dst, 16 * 11);
+
+  /* q5 */
+  tmp0_l = q7_l_in - q4_l_in;
+  tmp0_l += q5_l_in;
+  tmp0_l -= p2_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q4_h_in;
+  tmp0_h += q5_h_in;
+  tmp0_h -= p2_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+  __lsx_vst(q5, dst, 16 * 12);
+
+  /* q6 */
+  tmp0_l = q7_l_in - q5_l_in;
+  tmp0_l += q6_l_in;
+  tmp0_l -= p1_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q5_h_in;
+  tmp0_h += q6_h_in;
+  tmp0_h -= p1_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+  __lsx_vst(q6, dst, 16 * 13);
+
+  return 0;
+}
+
+void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch,
+                                  const uint8_t *b_limit_ptr,
+                                  const uint8_t *limit_ptr,
+                                  const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]);
+  uint8_t *filter48 = &transposed_input[16 * 16];
+
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+  early_exit =
+      vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+                           pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    early_exit =
+        vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
+
+    if (early_exit == 0) {
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
new file mode 100644
index 0000000000..9300b5c5ae
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, thresh, b_limit, limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+            p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+  q3 = __lsx_vldx(src, pitch3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  __lsx_vstelm_d(p1_out, src - pitch2, 0, 0);
+  __lsx_vstelm_d(p0_out, src - pitch, 0, 0);
+  __lsx_vstelm_d(q0_out, src, 0, 0);
+  __lsx_vstelm_d(q1_out, src + pitch, 0, 0);
+}
+
+void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0_ptr,
+                                   const uint8_t *limit0_ptr,
+                                   const uint8_t *thresh0_ptr,
+                                   const uint8_t *b_limit1_ptr,
+                                   const uint8_t *limit1_ptr,
+                                   const uint8_t *thresh1_ptr) {
+  __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+            p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+  q3 = __lsx_vldx(src, pitch3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+  __lsx_vstx(p1, src, -pitch2);
+  __lsx_vstx(p0, src, -pitch);
+  __lsx_vst(q0, src, 0);
+  __lsx_vstx(q1, src, pitch);
+}
+
+void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, limit, thresh, b_limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+  uint8_t *src_tmp = src - 4;
+
+  p3 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1);
+  p0 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  q0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2);
+  q3 = __lsx_vldx(src_tmp, pitch3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+                     q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
+  vec2 = __lsx_vilvl_h(vec1, vec0);
+  vec3 = __lsx_vilvh_h(vec1, vec0);
+
+  src -= 2;
+  __lsx_vstelm_w(vec2, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(vec3, src, 0, 0);
+  __lsx_vstelm_w(vec3, src + pitch, 0, 1);
+  __lsx_vstelm_w(vec3, src + pitch2, 0, 2);
+  __lsx_vstelm_w(vec3, src + pitch3, 0, 3);
+}
+
+void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0_ptr,
+                                 const uint8_t *limit0_ptr,
+                                 const uint8_t *thresh0_ptr,
+                                 const uint8_t *b_limit1_ptr,
+                                 const uint8_t *limit1_ptr,
+                                 const uint8_t *thresh1_ptr) {
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+  uint8_t *src_tmp = src - 4;
+
+  row0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2);
+  row3 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6);
+  row7 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row8 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10);
+  row11 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row12 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14);
+  row15 = __lsx_vldx(src_tmp, pitch3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src -= 2;
+  __lsx_vstelm_w(tmp2, src, 0, 0);
+  __lsx_vstelm_w(tmp2, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp2, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp2, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp3, src, 0, 0);
+  __lsx_vstelm_w(tmp3, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp3, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp3, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp4, src, 0, 0);
+  __lsx_vstelm_w(tmp4, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp4, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp4, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp5, src, 0, 0);
+  __lsx_vstelm_w(tmp5, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp5, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp5, src + pitch3, 0, 3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
new file mode 100644
index 0000000000..00219ba71d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
@@ -0,0 +1,458 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, thresh, b_limit, limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
+  __m128i p2_filter8, p1_filter8, p0_filter8;
+  __m128i q0_filter8, q1_filter8, q2_filter8;
+  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = __lsx_vilvl_d(flat, flat);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+    __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+    __lsx_vstelm_d(q0_out, dst, 0, 0);
+    __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+    DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8,
+              p1_filter8, q0_filter8);
+    q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8);
+
+    p2 = __lsx_vilvl_d(p1_out, p2);
+    p0_out = __lsx_vilvl_d(q0_out, p0_out);
+    q1_out = __lsx_vilvl_d(q2, q1_out);
+
+    DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat,
+              p2_out, p1_out);
+    p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat);
+    dst -= stride3;
+
+    __lsx_vstelm_d(p2_out, dst, 0, 0);
+    __lsx_vstelm_d(p2_out, dst + stride, 0, 1);
+    __lsx_vstelm_d(p1_out, dst + stride2, 0, 0);
+    __lsx_vstelm_d(p1_out, dst + stride3, 0, 1);
+
+    dst += stride4;
+    __lsx_vstelm_d(p0_out, dst, 0, 0);
+    dst += stride;
+    __lsx_vstelm_d(p0_out, dst, 0, 1);
+  }
+}
+
+void vpx_lpf_horizontal_8_dual_lsx(
+    uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh0, 0);
+  p2_out = __lsx_vldrepl_b(thresh1, 0);
+  thresh = __lsx_vilvl_d(p2_out, thresh);
+
+  b_limit = __lsx_vldrepl_b(b_limit0, 0);
+  p2_out = __lsx_vldrepl_b(b_limit1, 0);
+  b_limit = __lsx_vilvl_d(p2_out, b_limit);
+
+  limit = __lsx_vldrepl_b(limit0, 0);
+  p2_out = __lsx_vldrepl_b(limit1, 0);
+  limit = __lsx_vilvl_d(p2_out, limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vst(p1_out, dst - stride2, 0);
+    __lsx_vst(p0_out, dst - stride, 0);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vst(q1_out, dst + stride, 0);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+    /* convert 16 bit output data into 8 bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+
+    /* store pixel values */
+    p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    __lsx_vst(p2_out, dst - stride3, 0);
+    __lsx_vst(p1_out, dst - stride2, 0);
+    __lsx_vst(p0_out, dst - stride, 0);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vst(q1_out, dst + stride, 0);
+    __lsx_vst(q2_out, dst + stride2, 0);
+  }
+}
+
+void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p1_out, p0_out, q0_out, q1_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  uint8_t *dst_tmp = dst - 4;
+
+  /* load vector elements */
+  p3 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
+  p0 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  q0 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
+  q3 = __lsx_vldx(dst_tmp, stride3);
+
+  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+                     q3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = __lsx_vilvl_d(zero, flat);
+
+  /* if flat is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat)) {
+    /* Store 4 pixels p1-_q1 */
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    p2 = __lsx_vilvl_h(p1, p0);
+    p3 = __lsx_vilvh_h(p1, p0);
+
+    dst -= 2;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_w(p2, dst + stride, 0, 1);
+    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(p3, dst, 0, 0);
+    __lsx_vstelm_w(p3, dst + stride, 0, 1);
+    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+  } else {
+    DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l,
+              p1_l, p0_l);
+    DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
+              q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+    /* convert 16 bit output data into 8 bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+              p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+    /* store pixel values */
+    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    /* Store 6 pixels p2-_q2 */
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+    p1 = __lsx_vilvl_h(q3, p3);
+    p2 = __lsx_vilvh_h(q3, p3);
+    p3 = __lsx_vilvl_b(q2, q1);
+    dst -= 3;
+    __lsx_vstelm_w(p1, dst, 0, 0);
+    __lsx_vstelm_h(p3, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 1);
+    __lsx_vstelm_h(p3, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 2);
+    __lsx_vstelm_h(p3, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 3);
+    __lsx_vstelm_h(p3, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_h(p3, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 1);
+    __lsx_vstelm_h(p3, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 2);
+    __lsx_vstelm_h(p3, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 3);
+    __lsx_vstelm_h(p3, dst, 4, 7);
+  }
+}
+
+void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride,
+                                 const uint8_t *b_limit0, const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *dst_tmp = dst - 4;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p1_out, p0_out, q0_out, q1_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  p0 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
+  p3 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  row4 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
+  row7 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+
+  q3 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
+  q0 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  row12 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
+  row15 = __lsx_vldx(dst_tmp, stride3);
+
+  /* transpose 16x8 matrix into 8x16 */
+  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+                      q3);
+
+  thresh = __lsx_vldrepl_b(thresh0, 0);
+  p1_out = __lsx_vldrepl_b(thresh1, 0);
+  thresh = __lsx_vilvl_d(p1_out, thresh);
+
+  b_limit = __lsx_vldrepl_b(b_limit0, 0);
+  p1_out = __lsx_vldrepl_b(b_limit1, 0);
+  b_limit = __lsx_vilvl_d(p1_out, b_limit);
+
+  limit = __lsx_vldrepl_b(limit0, 0);
+  p1_out = __lsx_vldrepl_b(limit1, 0);
+  limit = __lsx_vilvl_d(p1_out, limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+  /* if flat is zero for all pixels, then no need to calculate other filter */
+  if (__lsx_bz_v(flat)) {
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    p2 = __lsx_vilvl_h(p1, p0);
+    p3 = __lsx_vilvh_h(p1, p0);
+    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    q2 = __lsx_vilvl_h(p1, p0);
+    q3 = __lsx_vilvh_h(p1, p0);
+    dst -= 2;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_w(p2, dst + stride, 0, 1);
+    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(p3, dst, 0, 0);
+    __lsx_vstelm_w(p3, dst + stride, 0, 1);
+    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(q2, dst, 0, 0);
+    __lsx_vstelm_w(q2, dst + stride, 0, 1);
+    __lsx_vstelm_w(q2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(q2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(q3, dst, 0, 0);
+    __lsx_vstelm_w(q3, dst + stride, 0, 1);
+    __lsx_vstelm_w(q3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(q3, dst + stride3, 0, 3);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+
+    /* filter8 */
+    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+    /* convert 16 bit output data into 8 bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+
+    /* store pixel values */
+    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+    p2_filt8_l = __lsx_vilvl_h(q3, p3);
+    p2_filt8_h = __lsx_vilvh_h(q3, p3);
+    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3);
+    p0_filt8_l = __lsx_vilvl_h(q3, p3);
+    p0_filt8_h = __lsx_vilvh_h(q3, p3);
+    q1_filt8_l = __lsx_vilvl_b(q2, q1);
+    q1_filt8_h = __lsx_vilvh_b(q2, q1);
+
+    dst -= 3;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
new file mode 100644
index 0000000000..1c43836503
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
+                     flat_out)                                               \
+  do {                                                                       \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
+                                                                             \
+    /* absolute subtraction of pixel values */                               \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
+                                                                             \
+    /* calculation of hev */                                                 \
+    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
+    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
+                                                                             \
+    /* calculation of mask */                                                \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
+    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
+    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
+    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
+    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
+                                                                             \
+    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
+    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
+  } while (0)
+
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)          \
+  do {                                                                         \
+    __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0;                    \
+    __m128i flat4_tmp = __lsx_vldi(1);                                         \
+                                                                               \
+    DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \
+              q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0);          \
+    p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0);                        \
+    flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out);                            \
+    p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0);                        \
+    flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out);                            \
+                                                                               \
+    flat_out = __lsx_vslt_bu(flat4_tmp, flat_out);                             \
+    flat_out = __lsx_vxori_b(flat_out, 0xff);                                  \
+    flat_out = flat_out & (mask);                                              \
+  } while (0)
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in,      \
+                  q6_in, q7_in, flat_in, flat2_out)                            \
+  do {                                                                         \
+    __m128i flat5_tmp = __lsx_vldi(1);                                         \
+    __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0;                    \
+    __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0;                    \
+    DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \
+              q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0);          \
+    DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \
+              q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0);          \
+                                                                               \
+    DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0,   \
+              p4_asub_p0, flat2_out);                                          \
+    flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out);                          \
+    p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0);                        \
+    flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out);                          \
+    p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0);                        \
+    flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out);                          \
+    flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out);                           \
+    flat2_out = __lsx_vxori_b(flat2_out, 0xff);                                \
+    flat2_out = flat2_out & flat_in;                                           \
+  } while (0)
+
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out,  \
+                           p0_out, q0_out, q1_out)                         \
+  do {                                                                     \
+    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;               \
+    const __m128i cnst4b = __lsx_vldi(4);                                  \
+    const __m128i cnst3b = __lsx_vldi(3);                                  \
+    DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \
+              0x80, p1_m, p0_m, q0_m, q1_m);                               \
+    filt = __lsx_vssub_b(p1_m, q1_m);                                      \
+    filt &= hev;                                                           \
+                                                                           \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt &= mask;                                                          \
+    DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2);          \
+    DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2);                        \
+                                                                           \
+    q0_m = __lsx_vssub_b(q0_m, t1);                                        \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                                        \
+    DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out);      \
+                                                                           \
+    filt = __lsx_vsrari_b(t1, 1);                                          \
+    hev = __lsx_vxori_b(hev, 0xff);                                        \
+    filt &= hev;                                                           \
+    q1_m = __lsx_vssub_b(q1_m, filt);                                      \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                                      \
+    DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out);      \
+  } while (0)
+
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+                    q1_filt8_out, q2_filt8_out)                             \
+  do {                                                                      \
+    __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                          \
+                                                                            \
+    tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in);                               \
+    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
+    tmp_filt8_0 = __lsx_vslli_h(p3_in, 1);                                  \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2);                   \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in);                         \
+    p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in);                         \
+    p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in);                         \
+    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1);                   \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
+    p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3);                          \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in);                               \
+    tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0);                         \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
+    tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0);                   \
+    q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
+    q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in);                         \
+    tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
+    q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c
new file mode 100644
index 0000000000..1299e75e9a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c
@@ -0,0 +1,244 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
+                                       __m128i round, __m128i quant,
+                                       __m128i shift, __m128i cmp_mask) {
+  __m128i rounded, qcoeff;
+
+  rounded = __lsx_vsadd_h(coeff_abs, round);
+  qcoeff = __lsx_vmuh_h(rounded, quant);
+  qcoeff = __lsx_vadd_h(rounded, qcoeff);
+  qcoeff = __lsx_vmuh_h(qcoeff, shift);
+  qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
+  qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
+
+  return qcoeff;
+}
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               int16_t *dqcoeff) {
+  __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
+  __lsx_vst(dqcoeff16, dqcoeff, 0);
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
+                                                     __m128i dequant,
+                                                     int16_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
+  __m128i zero = __lsx_vldi(0);
+  __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
+
+  const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
+  const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
+
+  low = __lsx_vmul_h(coeff, dequant);
+  high = __lsx_vmuh_h(coeff, dequant);
+  dqcoeff32_0 = __lsx_vilvl_h(high, low);
+  dqcoeff32_1 = __lsx_vilvh_h(high, low);
+
+  // "Divide" by 2.
+  dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
+  dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
+  dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
+  dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
+  res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
+  __lsx_vst(res, dqcoeff, 0);
+}
+
+static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
+                                   const int16_t *scan, int index,
+                                   __m128i zero) {
+  const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
+  const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
+  __m128i scan0 = __lsx_vld(scan + index, 0);
+  __m128i scan1 = __lsx_vld(scan + index + 8, 0);
+  __m128i eob0, eob1;
+
+  eob0 = __lsx_vandn_v(zero_coeff0, scan0);
+  eob1 = __lsx_vandn_v(zero_coeff1, scan1);
+  return __lsx_vmax_h(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  __m128i eob_shuffled;
+  int16_t res_m;
+
+  eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  res_m = __lsx_vpickve2gr_h(eob, 1);
+
+  return res_m;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_quantize_b_lsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const struct macroblock_plane *const mb_plane,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const struct ScanOrder *const scan_order) {
+  __m128i zero = __lsx_vldi(0);
+  int index = 16;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  zbin = __lsx_vld(mb_plane->zbin, 0);
+  round = __lsx_vld(mb_plane->round, 0);
+  quant = __lsx_vld(mb_plane->quant, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+    eob = __lsx_vmax_h(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const struct ScanOrder *const scan_order) {
+  __m128i zero = __lsx_vldi(0);
+  int index;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
+  __m128i eob = zero, eob0;
+
+  zbin = __lsx_vld(mb_plane->zbin, 0);
+  zbin = __lsx_vsrari_h(zbin, 1);
+  round = __lsx_vld(mb_plane->round, 0);
+  round = __lsx_vsrari_h(round, 1);
+
+  quant = __lsx_vld(mb_plane->quant, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
+  quant_shift = __lsx_vslli_h(quant_shift, 1);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  // remove DC from zbin
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  // remove DC in quant_shift, quant, quant_shift
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
+  // AC only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
+                                      dqcoeff_ptr + 8 + index);
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
+    eob = __lsx_vmax_h(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
new file mode 100644
index 0000000000..b6fbedb0d0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c
@@ -0,0 +1,717 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
+                                 __m128i ref1) {
+  __m128i diff0_m, diff1_m, sad_m0;
+  __m128i sad_m = __lsx_vldi(0);
+
+  diff0_m = __lsx_vabsd_bu(in0, ref0);
+  diff1_m = __lsx_vabsd_bu(in1, ref1);
+
+  sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
+  sad_m = __lsx_vadd_h(sad_m, sad_m0);
+  sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
+  sad_m = __lsx_vadd_h(sad_m, sad_m0);
+
+  return sad_m;
+}
+
+static INLINE uint32_t hadd_uw_u32(__m128i in) {
+  __m128i res0_m;
+  uint32_t sum_m;
+
+  res0_m = __lsx_vhaddw_du_wu(in, in);
+  res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
+  sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+  return sum_m;
+}
+
+static INLINE uint32_t hadd_uh_u32(__m128i in) {
+  __m128i res_m;
+  uint32_t sum_m;
+
+  res_m = __lsx_vhaddw_wu_hu(in, in);
+  sum_m = hadd_uw_u32(res_m);
+
+  return sum_m;
+}
+
+static INLINE int32_t hadd_sw_s32(__m128i in) {
+  __m128i res0_m;
+  int32_t sum_m;
+
+  res0_m = __lsx_vhaddw_d_w(in, in);
+  res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
+  sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+  return sum_m;
+}
+
+static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  uint32_t res;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
+    src += src_stride;
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 2);
+  uint32_t res;
+  __m128i src0, src1, ref0, ref1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  int32_t src_stride2 = src_stride << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+    src += src_stride2;
+    ref += ref_stride2;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+    src += src_stride2;
+    ref += ref_stride2;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 2);
+  uint32_t res;
+  __m128i src0, src1, ref0, ref1;
+  __m128i sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 1);
+  uint32_t sad = 0;
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+
+  sad = hadd_uh_u32(sad0);
+  sad += hadd_uh_u32(sad1);
+
+  return sad;
+}
+
+static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *const aref_ptr[],
+                               int32_t ref_stride, int32_t height,
+                               uint32_t *sad_array) {
+  int32_t ht_cnt = (height >> 2);
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  __m128i src0, src1, src2, src3, sad_tmp;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    src0 = __lsx_vld(src_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_ptr, src_stride3);
+    src_ptr += src_stride4;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
+              ref2);
+    ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
+    ref0_ptr += ref_stride4;
+    ref4 = __lsx_vld(ref1_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
+              ref6);
+    ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
+    ref1_ptr += ref_stride4;
+    ref8 = __lsx_vld(ref2_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
+              ref10);
+    ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
+    ref2_ptr += ref_stride4;
+    ref12 = __lsx_vld(ref3_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
+              ref14);
+    ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
+    ref3_ptr += ref_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  int32_t ht_cnt = (height >> 1);
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    ref0_ptr += ref_stride;
+    ref1 = __lsx_vld(ref1_ptr, 0);
+    ref1_ptr += ref_stride;
+    ref2 = __lsx_vld(ref2_ptr, 0);
+    ref2_ptr += ref_stride;
+    ref3 = __lsx_vld(ref3_ptr, 0);
+    ref3_ptr += ref_stride;
+
+    diff = __lsx_vabsd_bu(src, ref0);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref1);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref2);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref3);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    ref0_ptr += ref_stride;
+    ref1 = __lsx_vld(ref1_ptr, 0);
+    ref1_ptr += ref_stride;
+    ref2 = __lsx_vld(ref2_ptr, 0);
+    ref2_ptr += ref_stride;
+    ref3 = __lsx_vld(ref3_ptr, 0);
+    ref3_ptr += ref_stride;
+
+    diff = __lsx_vabsd_bu(src, ref0);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref1);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref2);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref3);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt = height;
+  __m128i src0, src1, ref0, ref1, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+
+    DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
+    ref0_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
+    ref1_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
+    ref2_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
+    ref3_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i sad, sad_tmp;
+
+  __m128i sad0_0 = __lsx_vldi(0);
+  __m128i sad0_1 = sad0_0;
+  __m128i sad1_0 = sad0_0;
+  __m128i sad1_1 = sad0_0;
+  __m128i sad2_0 = sad0_0;
+  __m128i sad2_1 = sad0_0;
+  __m128i sad3_0 = sad0_0;
+  __m128i sad3_1 = sad0_0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref0_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref1_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref2_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref3_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[0] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[1] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[2] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[3] = hadd_uw_u32(sad);
+}
+
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i comp0, comp1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  uint8_t *src_tmp, *ref_tmp;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    src_tmp = (uint8_t *)src + 16;
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src1 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp, src_stride3);
+    src += src_stride4;
+
+    ref_tmp = (uint8_t *)ref + 16;
+    ref0 = __lsx_vld(ref, 0);
+    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
+    ref6 = __lsx_vldx(ref, ref_stride3);
+    ref1 = __lsx_vld(ref_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
+              ref5);
+    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
+    ref += ref_stride4;
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
+              pred0, pred2, pred4, pred6);
+    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
+              112, pred1, pred3, pred5, pred7);
+    sec_pred += 128;
+
+    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
+  __m128i sad, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+
+  res = hadd_sw_s32(sad);
+  return res;
+}
+
+#define VPX_SAD_8xHT_LSX(height)                                             \
+  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                   const uint8_t *ref, int32_t ref_stride) { \
+    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_16xHT_LSX(height)                                             \
+  uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_16width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_32xHT_LSX(height)                                             \
+  uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_32width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_64xHT_LSX(height)                                             \
+  uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_64width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_8xHTx4D_LSX(height)                                       \
+  void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
+    sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_16xHTx4D_LSX(height)                                       \
+  void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                   const uint8_t *const refs[],            \
+                                   int32_t ref_stride, uint32_t *sads) {   \
+    sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_32xHTx4D_LSX(height)                                       \
+  void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                   const uint8_t *const refs[],            \
+                                   int32_t ref_stride, uint32_t *sads) {   \
+    sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_64xHTx4D_LSX(height)                                       \
+  void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                   const uint8_t *const refs[],            \
+                                   int32_t ref_stride, uint32_t *sads) {   \
+    sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_AVGSAD_32xHT_LSX(height)                                    \
+  uint32_t vpx_sad32x##height##_avg_lsx(                                \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
+      int32_t ref_stride, const uint8_t *second_pred) {                 \
+    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \
+                              second_pred);                             \
+  }
+
+#define VPX_AVGSAD_64xHT_LSX(height)                                    \
+  uint32_t vpx_sad64x##height##_avg_lsx(                                \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
+      int32_t ref_stride, const uint8_t *second_pred) {                 \
+    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \
+                              second_pred);                             \
+  }
+
+#define SAD64                                                             \
+  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
+      VPX_AVGSAD_64xHT_LSX(64)
+
+SAD64
+
+#define SAD32                                                             \
+  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
+      VPX_AVGSAD_32xHT_LSX(32)
+
+SAD32
+
+#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
+
+SAD16
+
+#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)
+
+SAD8
+
+#undef SAD64
+#undef SAD32
+#undef SAD16
+#undef SAD8
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
new file mode 100644
index 0000000000..700793531c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
@@ -0,0 +1,874 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_lsx[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
+static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred, int32_t *diff) {
+  int32_t res, ht_cnt = 32;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i pred0, pred1, pred2, pred3, vec, vec_tmp;
+  __m128i avg0, avg1, avg2, avg3;
+  __m128i var = __lsx_vldi(0);
+
+  avg0 = var;
+  avg1 = var;
+  avg2 = var;
+  avg3 = var;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  vec = __lsx_vhaddw_w_h(avg0, avg0);
+  vec_tmp = __lsx_vhaddw_w_h(avg1, avg1);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  vec_tmp = __lsx_vhaddw_w_h(avg2, avg2);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  vec_tmp = __lsx_vhaddw_w_h(avg3, avg3);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i vec0, vec1, vec2, vec3, filt0, out, vec;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1,
+              FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS,
+              src0, src1, src2, src3);
+    out = __lsx_vpackev_d(src1, src0);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = __lsx_vpackev_d(src3, src2);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, filt0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec, var = __lsx_vldi(0);
+  __m128i avg = var;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, dst0, var, avg);
+    CALC_MSE_AVG_B(src1, dst1, var, avg);
+    CALC_MSE_AVG_B(src2, dst2, var, avg);
+    CALC_MSE_AVG_B(src3, dst3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+  __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    src0 = src4;
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+  __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i var = __lsx_vldi(0);
+  __m128i avg = var;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    src0 = src4;
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3);
+    src += src_stride;
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec;
+  __m128i var = __lsx_vldi(0);
+  __m128i avg = var;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    CALC_MSE_AVG_B(src2, ref2, var, avg);
+    CALC_MSE_AVG_B(src3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height,
+                                           &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height,
+                                           &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t subpel_avg_ssediff_16w_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  __m128i pred0, pred1, pred2, pred3, filt0, vec;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i mask = { 0x403030202010100, 0x807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3,
+              pred3, tmp0, tmp1, tmp2, tmp3);
+
+    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+  __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1, vec, filt0;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    src += src_stride;
+    src2 = __lsx_vld(src, 0);
+    src += src_stride;
+    src3 = __lsx_vld(src, 0);
+    src += src_stride;
+    src4 = __lsx_vld(src, 0);
+    src += src_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    src0 = src4;
+    ref0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+              pred3, out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1;
+  __m128i mask = { 0x403030202010100, 0x807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    ref0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+              pred3, out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse +=
+        subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred,
+                                     filter, height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse +=
+        subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred,
+                                     filter, height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht)                              \
+  uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx(                           \
+      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
+      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
+      uint32_t *sse) {                                                        \
+    int32_t diff;                                                             \
+    uint32_t var;                                                             \
+    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
+                                                                              \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_sse_diff_##wd##width_hv_lsx(                         \
+            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+      } else {                                                                \
+        *sse = sub_pixel_sse_diff_##wd##width_v_lsx(                          \
+            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
+      }                                                                       \
+                                                                              \
+      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
+    } else {                                                                  \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_sse_diff_##wd##width_h_lsx(                          \
+            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
+                                                                              \
+        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
+      } else {                                                                \
+        var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \
+                                            sse);                             \
+      }                                                                       \
+    }                                                                         \
+                                                                              \
+    return var;                                                               \
+  }
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32)
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht)                           \
+  uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx(                           \
+      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
+      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
+      uint32_t *sse, const uint8_t *sec_pred) {                               \
+    int32_t diff;                                                             \
+    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
+                                                                              \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_avg_sse_diff_64width_hv_lsx(                         \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
+            v_filter, ht, &diff);                                             \
+      } else {                                                                \
+        *sse = sub_pixel_avg_sse_diff_64width_v_lsx(                          \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+            &diff);                                                           \
+      }                                                                       \
+    } else {                                                                  \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_avg_sse_diff_64width_h_lsx(                          \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+            &diff);                                                           \
+      } else {                                                                \
+        *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr,       \
+                                          ref_stride, sec_pred, &diff);       \
+      }                                                                       \
+    }                                                                         \
+                                                                              \
+    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
+  }
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64)
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
new file mode 100644
index 0000000000..943a5c5a9b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c
@@ -0,0 +1,371 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i diff0, diff1;
+  __m128i reg0, reg1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t diff_stride2 = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t diff_stride3 = diff_stride2 + diff_stride;
+
+  DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
+            src0, src2, pred0, pred2);
+  DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0);
+  reg0 = __lsx_vilvl_b(src0, pred0);
+  reg1 = __lsx_vilvh_b(src0, pred0);
+  DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
+  __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
+  __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
+}
+
+static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  src_ptr += src_stride4;
+  pred_ptr += pred_stride4;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
+            src6, src7);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
+            pred5, pred6, pred7);
+
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  __lsx_vst(src0, diff_ptr, 0);
+  __lsx_vstx(src1, diff_ptr, dst_stride);
+  __lsx_vstx(src2, diff_ptr, dst_stride2);
+  __lsx_vstx(src3, diff_ptr, dst_stride3);
+  diff_ptr += dst_stride2;
+  __lsx_vst(src4, diff_ptr, 0);
+  __lsx_vstx(src5, diff_ptr, dst_stride);
+  __lsx_vstx(src6, diff_ptr, dst_stride2);
+  __lsx_vstx(src7, diff_ptr, dst_stride3);
+}
+
+static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+  int16_t *diff_tmp = diff + 8;
+
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+}
+
+static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    const uint8_t *src_tmp = src + 16;
+    const uint8_t *pred_tmp = pred + 16;
+    DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
+              pred0, pred1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
+              pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
+    DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
+              pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7);
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    src += src_stride4;
+    pred += pred_stride4;
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(src4, diff, 0);
+    __lsx_vst(src5, diff, 16);
+    __lsx_vst(src6, diff, 32);
+    __lsx_vst(src7, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred4, diff, 0);
+    __lsx_vst(pred5, diff, 16);
+    __lsx_vst(pred6, diff, 32);
+    __lsx_vst(pred7, diff, 48);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+
+  for (loop_cnt = 32; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
+              pred2, pred3);
+    src += src_stride;
+    pred += pred_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
+              pred6, pred7);
+    src += src_stride;
+    pred += pred_stride;
+
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    __lsx_vst(src4, diff, 64);
+    __lsx_vst(src5, diff, 80);
+    __lsx_vst(src6, diff, 96);
+    __lsx_vst(src7, diff, 112);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    __lsx_vst(pred4, diff, 64);
+    __lsx_vst(pred5, diff, 80);
+    __lsx_vst(pred6, diff, 96);
+    __lsx_vst(pred7, diff, 112);
+    diff += diff_stride;
+  }
+}
+
+void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
+                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                            ptrdiff_t pred_stride) {
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
new file mode 100644
index 0000000000..bd514831bf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)         \
+  do {                                                                \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m;                       \
+    __m128i k0_m, k1_m, k2_m, k3_m;                                   \
+                                                                      \
+    k0_m = __lsx_vreplgr2vr_h(cnst0);                                 \
+    k1_m = __lsx_vreplgr2vr_h(cnst1);                                 \
+    k2_m = __lsx_vpackev_h(k1_m, k0_m);                               \
+                                                                      \
+    DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m);     \
+    DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m);     \
+                                                                      \
+    DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+    k3_m = __lsx_vmulwod_w_h(s5_m, k1_m);                             \
+    s1_m = __lsx_vsub_w(s1_m, k3_m);                                  \
+    k3_m = __lsx_vmulwod_w_h(s4_m, k1_m);                             \
+    s0_m = __lsx_vsub_w(s0_m, k3_m);                                  \
+                                                                      \
+    out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
+                                                                      \
+    DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m);    \
+    out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
+  } while (0)
+
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3)                \
+  do {                                                           \
+    __m128i tp0_m, tp1_m;                                        \
+                                                                 \
+    DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \
+    in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS);      \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
new file mode 100644
index 0000000000..8fad342c71
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c
@@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
+static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0,
+              src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+              src2, src3);
+    src_ptr += src_stride4;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0,
+              ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1,
+              ref2, ref3);
+    ref_ptr += ref_stride4;
+
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src, ref, vec;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  for (; ht_cnt--;) {
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i avg = __lsx_vldi(0);
+  __m128i src0, src1, ref0, ref1;
+  __m128i vec;
+  __m128i var = avg;
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t res, ht_cnt = 32;
+  __m128i avg0 = __lsx_vldi(0);
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i vec0, vec1;
+  __m128i avg1 = avg0;
+  __m128i avg2 = avg0;
+  __m128i avg3 = avg0;
+  __m128i var = avg0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  vec0 = __lsx_vhaddw_w_h(avg0, avg0);
+  vec1 = __lsx_vhaddw_w_h(avg1, avg1);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  vec1 = __lsx_vhaddw_w_h(avg2, avg2);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  vec1 = __lsx_vhaddw_w_h(avg3, avg3);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  HADD_SW_S32(vec0, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_VARIANCE_WDXHT_LSX(wd, ht)                                         \
+  uint32_t vpx_variance##wd##x##ht##_lsx(                                      \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
+      int32_t ref_stride, uint32_t *sse) {                                     \
+    int32_t diff;                                                              \
+                                                                               \
+    *sse =                                                                     \
+        sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \
+                                                                               \
+    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
+  }
+
+static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src, ref;
+  __m128i var = __lsx_vldi(0);
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+  }
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+VPX_VARIANCE_WDXHT_LSX(8, 8)
+VPX_VARIANCE_WDXHT_LSX(16, 16)
+VPX_VARIANCE_WDXHT_LSX(32, 32)
+
+uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t diff;
+
+  *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff);
+
+  return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride,
+                          const uint8_t *ref, int32_t ref_stride,
+                          uint32_t *sse) {
+  *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16);
+
+  return *sse;
+}
+
+void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+                         int32_t *sum) {
+  *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
new file mode 100644
index 0000000000..cf9e9890ff
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
+#define HADD_SW_S32(in0, in1)                  \
+  do {                                         \
+    __m128i res0_m;                            \
+                                               \
+    res0_m = __lsx_vhaddw_d_w(in0, in0);       \
+    res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
+    in1 = __lsx_vpickve2gr_w(res0_m, 0);       \
+  } while (0)
+
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \
+  do {                                                        \
+    __m128i tmp0_m, tmp1_m;                                   \
+                                                              \
+    tmp0_m = __lsx_vshuf_b(in1, in0, mask);                   \
+    tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);                  \
+    in2 = __lsx_vsrari_h(tmp1_m, shift);                      \
+  } while (0)
+
+#define CALC_MSE_B(src, ref, var)                                         \
+  do {                                                                    \
+    __m128i src_l0_m, src_l1_m;                                           \
+    __m128i res_l0_m, res_l1_m;                                           \
+                                                                          \
+    src_l0_m = __lsx_vilvl_b(src, ref);                                   \
+    src_l1_m = __lsx_vilvh_b(src, ref);                                   \
+    DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+              res_l0_m, res_l1_m);                                        \
+    var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m);                     \
+    var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m);                     \
+  } while (0)
+
+#define CALC_MSE_AVG_B(src, ref, var, sub)                                \
+  do {                                                                    \
+    __m128i src_l0_m, src_l1_m;                                           \
+    __m128i res_l0_m, res_l1_m;                                           \
+                                                                          \
+    src_l0_m = __lsx_vilvl_b(src, ref);                                   \
+    src_l1_m = __lsx_vilvh_b(src, ref);                                   \
+    DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+              res_l0_m, res_l1_m);                                        \
+    var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m);                     \
+    var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m);                     \
+    sub = __lsx_vadd_h(sub, res_l0_m);                                    \
+    sub = __lsx_vadd_h(sub, res_l1_m);                                    \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
new file mode 100644
index 0000000000..1c59228813
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
@@ -0,0 +1,972 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1;
+  __m128i dst0, dst1, dst2, dst3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+  tmp0 = __lsx_vxori_b(tmp0, 128);
+  dst0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst0 = __lsx_vilvl_d(tmp1, tmp0);
+
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst1 = __lsx_vilvl_d(tmp1, tmp0);
+
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
+            tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  int32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, tmp0,
+                               tmp1, tmp2, tmp3);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height >> 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    dst0 = __lsx_vld(dst_tmp, 0);
+    dst1 = __lsx_vldx(dst_tmp, dst_stride);
+    dst_tmp += dst_stride2;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
+    DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst1, dst, dst_stride);
+    dst += dst_stride2;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3, dst0, dst1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+    src3 = __lsx_vld(src, 56);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+    __lsx_vst(out0, dst, 32);
+    __lsx_vst(out1, dst, 48);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1);
+  vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS);
+  vec0 = __lsx_vavgr_bu(vec0, dst0);
+  __lsx_vstelm_w(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 3);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i vec4, vec5, vec6, vec7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+
+  src4 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+            src6);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+            res1, res2, res3);
+  DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 3);
+  dst += dst_stride;
+}
+
+static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec1);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec1, dst, 0, 1);
+}
+
+static void common_hz_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec2);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 1);
+  dst += dst_stride;
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec2);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 1);
+  dst += dst_stride;
+
+  if (height == 16) {
+    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2) - 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp1 = (uint8_t *)src + 8;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+  src6 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  src1 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+            src5);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+  src_tmp1 += src_stride4;
+
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+            res4, res5, res6, res7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+            FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0,
+            res2, res4, res6);
+  dst0 = __lsx_vld(dst, 0);
+  res0 = __lsx_vavgr_bu(res0, dst0);
+  __lsx_vst(res0, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res2 = __lsx_vavgr_bu(res2, dst0);
+  __lsx_vst(res2, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res4 = __lsx_vavgr_bu(res4, dst0);
+  __lsx_vst(res4, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res6 = __lsx_vavgr_bu(res6, dst0);
+  __lsx_vst(res6, dst, 0);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src_tmp1 += src_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+    dst0 = __lsx_vld(dst, 0);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst0);
+    __lsx_vst(res2, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst0);
+    __lsx_vst(res6, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0, dst1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst1);
+    __lsx_vst(res2, dst, 16);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst1);
+    __lsx_vst(res6, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+              src6);
+    src7 = __lsx_vld(src, 56);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out2, out4, out6);
+
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2,
+              dst3);
+    out0 = __lsx_vavgr_bu(out0, dst0);
+    __lsx_vst(out0, dst, 0);
+    out2 = __lsx_vavgr_bu(out2, dst1);
+    __lsx_vst(out2, dst, 16);
+    out4 = __lsx_vavgr_bu(out4, dst2);
+    __lsx_vst(out4, dst, 32);
+    out6 = __lsx_vavgr_bu(out6, dst3);
+    __lsx_vst(out6, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    switch (w) {
+      case 4:
+        common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+
+      case 32:
+        common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
new file mode 100644
index 0000000000..d1abf622ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src4 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src5 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
+    src2 = __lsx_vilvl_d(src3, src2);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
+    out0 = __lsx_vxori_b(out0, 128);
+    out0 = __lsx_vavgr_bu(out0, src2);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    tmp5 = src1;
+    tmp0 = tmp2;
+    tmp1 = tmp4;
+    tmp2 = src0;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  __m128i out0, out1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+            tmp0, tmp1, tmp2, tmp4);
+  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp3 = __lsx_vpackev_b(src7, src6);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vpackev_b(src8, src7);
+    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src1 = __lsx_vpackev_b(src9, src8);
+    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+    src2 = __lsx_vpackev_b(src10, src9);
+    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
+              FILTER_BITS, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    src5 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src7 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src8 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src9 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    src6 = src10;
+    tmp0 = tmp2;
+    tmp1 = tmp3;
+    tmp2 = src1;
+    tmp4 = tmp6;
+    tmp5 = src0;
+    tmp6 = src2;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                        filter_horiz, filter_vert, height);
+  src += 8;
+  dst += 8;
+
+  common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                        filter_horiz, filter_vert, height);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i dst0, dst1, dst2, dst3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+
+  dst0 = __lsx_vldrepl_w(dst, 0);
+  dst1 = __lsx_vldrepl_w(dst + dst_stride, 0);
+  dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0);
+  dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+  tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += src_stride4;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+            hz_out1, hz_out3);
+  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, res0, res1);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+  AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+
+  /* rearranging filter */
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp1);
+
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else {
+    common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint8_t *src_tmp1;
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride << 2;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src_tmp1 = (uint8_t *)(src + 8);
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src += src_stride4;
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst0);
+    __lsx_vst(tmp3, dst, 0);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst1);
+    __lsx_vstx(tmp3, dst, dst_stride);
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst2);
+    __lsx_vstx(tmp3, dst, dst_stride2);
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst3);
+    __lsx_vstx(tmp3, dst, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert, height);
+  src += 16;
+  dst += 16;
+
+  common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
new file mode 100644
index 0000000000..5c6413df44
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
@@ -0,0 +1,918 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+            src2);
+  src3 = __lsx_vldx(src_tmp0, src_stride3);
+  src_tmp0 += src_stride4;
+  src4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+            src6);
+  src_tmp0 += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src_tmp0, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+              src9);
+    src10 = __lsx_vldx(src_tmp0, src_stride3);
+    src_tmp0 += src_stride4;
+    src0 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src1 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
+    src0 = __lsx_vilvl_d(src1, src0);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    out0 = __lsx_vavgr_bu(out0, src0);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+            src2);
+  src3 = __lsx_vldx(src_tmp0, src_stride3);
+  src_tmp0 += src_stride4;
+  src4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+            src6);
+  src_tmp0 += src_stride3;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src_tmp0, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+              src9);
+    src10 = __lsx_vldx(src_tmp0, src_stride3);
+    src_tmp0 += src_stride4;
+    src0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+                               filter2, filter3);
+    out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+                               filter2, filter3);
+    out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+    reg0 = reg2;
+    reg1 = tmp0;
+    reg2 = tmp2;
+    reg3 = reg5;
+    reg4 = tmp1;
+    reg5 = tmp3;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter, int32_t height, int32_t width) {
+  uint8_t *src_tmp;
+  uint32_t cnt = width >> 4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; cnt--;) {
+    uint32_t loop_cnt = height >> 2;
+    uint8_t *dst_reg = dst;
+
+    src_tmp = src_tmp0;
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_tmp, src_stride3);
+    src_tmp += src_stride4;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src_tmp += src_stride3;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+    src6 = __lsx_vxori_b(src6, 128);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg6, reg7, reg8, reg9);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+    for (; loop_cnt--;) {
+      src7 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+                src9);
+      src10 = __lsx_vldx(src_tmp, src_stride3);
+      src_tmp += src_stride4;
+      DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                src7, src8, src9, src10);
+      DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src0, src1, src2, src3);
+      DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src4, src5, src7, src8);
+      tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      tmp2 = __lsx_vld(dst_reg, 0);
+      tmp3 = __lsx_vldx(dst_reg, dst_stride);
+      DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+      __lsx_vst(tmp0, dst_reg, 0);
+      __lsx_vstx(tmp1, dst_reg, dst_stride);
+      tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      tmp2 = __lsx_vldx(dst_reg, dst_stride2);
+      tmp3 = __lsx_vldx(dst_reg, dst_stride3);
+      DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+      __lsx_vstx(tmp0, dst_reg, dst_stride2);
+      __lsx_vstx(tmp1, dst_reg, dst_stride3);
+      dst_reg += dst_stride4;
+
+      reg0 = reg2;
+      reg1 = src0;
+      reg2 = src2;
+      reg3 = reg5;
+      reg4 = src1;
+      reg5 = src3;
+      reg6 = reg8;
+      reg7 = src4;
+      reg8 = src7;
+      reg9 = reg11;
+      reg10 = src5;
+      reg11 = src8;
+      src6 = src10;
+    }
+    src_tmp0 += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+  __m128i src10_r, src32_r, src21_r, src43_r;
+  __m128i tmp0, tmp1;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+            src4332);
+  DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+  out = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+  dst += dst_stride;
+}
+
+static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src2110, src4332, src6554, src8776, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src7 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src8 = __lsx_vld(src, 0);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+            src54_r, src65_r, src76_r, src87_r);
+  DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+            src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0,
+            src8776, filt0, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(tmp2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 3);
+}
+
+static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1);
+  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp2, dst, 0, 1);
+}
+
+static void common_vt_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst4 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst5 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3);
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+
+    src_tmp1 = src + 16;
+    src6 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7,
+              src8);
+    src9 = __lsx_vldx(src_tmp1, src_stride3);
+
+    dst_tmp1 = dst + 16;
+    dst4 = __lsx_vld(dst_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5,
+              dst6);
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vstx(tmp0, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i src6, src7, src8, src9, src10, src11, filt0;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src2 = __lsx_vldx(src, src_stride);
+    dst1 = __lsx_vldx(dst, dst_stride);
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4,
+              dst6);
+    src_tmp1 = (uint8_t *)src + 16;
+    src5 = __lsx_vldx(src_tmp1, src_stride);
+    src_tmp1 = src_tmp1 + 16;
+    src8 = __lsx_vldx(src_tmp1, src_stride);
+    src_tmp1 = src_tmp1 + 16;
+    src11 = __lsx_vldx(src_tmp1, src_stride);
+
+    dst_tmp1 = dst + 16;
+    dst3 = __lsx_vldx(dst_tmp1, dst_stride);
+    dst_tmp1 = dst + 32;
+    dst5 = __lsx_vldx(dst_tmp1, dst_stride);
+    dst_tmp1 = dst + 48;
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vst(tmp0, dst, 16);
+
+    dst_tmp1 = dst + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 32);
+
+    dst_tmp1 = dst_tmp1 + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    __lsx_vst(tmp0, dst, 48);
+
+    dst_tmp1 = dst_tmp1 + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+    dst += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+
+        break;
+      case 32:
+        common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
new file mode 100644
index 0000000000..2c6459a978
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
@@ -0,0 +1,814 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out, out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  out = __lsx_vssrarni_b_h(out1, out0, 7);
+  out = __lsx_vxori_b(out, 128);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+}
+
+static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1,
+                             out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+}
+
+static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 1;
+  int32_t stride = src_stride << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    const uint8_t *_src = src + src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
+    DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out1, dst, 0);
+    dst += dst_stride;
+    src += stride;
+  }
+}
+
+static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+
+    dst += dst_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+    src3 = __lsx_vld(src, 56);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 32);
+    __lsx_vst(out1, dst, 48);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
+            FILTER_BITS, res0, res1);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i res0, res1, res2, res3, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  uint8_t *src_tmp1 = src + src_stride4;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+            src6);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+  DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
+            src7, src6, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+            res1, res2, res3);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 1);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res2, dst, 0, 0);
+  __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i filt0, mask;
+  __m128i src0, src1, src2, src3;
+  __m128i vec0, vec1, vec2, vec3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec1);
+
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  __m128i filt0, mask;
+  __m128i src0, src1, src2, src3, out0, out1;
+  __m128i vec0, vec1, vec2, vec3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+  dst += dst_stride;
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+  dst += dst_stride;
+
+  if (height == 16) {
+    uint8_t *dst_tmp1 = dst + dst_stride4;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst_tmp1, 0, 0);
+    __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1);
+  }
+}
+
+static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2) - 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *src_tmp1 = src + 8;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+  src6 = __lsx_vldx(src, src_stride3);
+  src1 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+            src5);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask,
+            src7, src7, mask, vec4, vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            out0, out1, out2, out3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+            out4, out5, out6, out7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+            FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
+            out1, out2, out3);
+
+  __lsx_vst(out0, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out1, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out2, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out3, dst, 0);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    src_tmp1 += src_stride4;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out1, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out2, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out3, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6);
+    src7 = __lsx_vld(src, 24);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    dst += dst_stride;
+
+    __lsx_vst(out2, dst, 0);
+    __lsx_vst(out3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+              src6);
+    src7 = __lsx_vld(src, 56);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    __lsx_vst(out2, dst, 32);
+    __lsx_vst(out3, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    switch (w) {
+      case 4:
+        common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+
+      case 16:
+        common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+
+      case 32:
+        common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+
+      case 64:
+        common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
new file mode 100644
index 0000000000..9f5cd6cfe9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -0,0 +1,697 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
+static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    tmp5 = src1;
+    tmp0 = tmp2;
+    tmp1 = tmp4;
+    tmp2 = src0;
+  }
+}
+
+static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+            tmp0, tmp1, tmp2, tmp4);
+  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp3 = __lsx_vpackev_b(src7, src6);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vpackev_b(src8, src7);
+    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src1 = __lsx_vpackev_b(src9, src8);
+    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+    src2 = __lsx_vpackev_b(src10, src9);
+    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    src6 = src10;
+    tmp0 = tmp2;
+    tmp1 = tmp3;
+    tmp2 = src1;
+    tmp4 = tmp6;
+    tmp5 = src0;
+    tmp6 = src2;
+  }
+}
+
+static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  src += 8;
+  dst += 8;
+
+  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  src += 8;
+  dst += 8;
+}
+
+static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_vt, filt_hz, vec0, vec1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += src_stride4;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+
+  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+            hz_out1, hz_out3);
+  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
+            vec5, vec6, vec7);
+
+  __lsx_vstelm_w(vec4, dst, 0, 0);
+  __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
+  dst += dst_stride4;
+  __lsx_vstelm_w(vec6, dst, 0, 0);
+  __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
+                                          int32_t src_stride, uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int8_t *filter_horiz,
+                                          int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0;
+  __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+              FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+              FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else {
+    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    uint8_t *src_tmp0 = src + 8;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
+              src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert, height);
+  src += 16;
+  dst += 16;
+
+  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
+                       int32_t y_step_q4, int32_t w, int32_t h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
new file mode 100644
index 0000000000..6022e43c83
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -0,0 +1,825 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  uint8_t *_src = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  src = src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+                               filter2, filter3);
+    out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+                               filter2, filter3);
+    out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = tmp0;
+    reg2 = tmp2;
+    reg3 = reg5;
+    reg4 = tmp1;
+    reg5 = tmp3;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  // uint8_t *_src = (uint8_t *)src - src_stride3;
+  src -= src_stride3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6,
+            reg7, reg8, reg9);
+  DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              src4, src5, src7, src8);
+    tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                               filter2, filter3);
+    tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                               filter2, filter3);
+    tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                               filter2, filter3);
+    tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(tmp1, dst, 0);
+    dst += dst_stride;
+    tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                               filter2, filter3);
+    tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                               filter2, filter3);
+    tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                               filter2, filter3);
+    tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(tmp1, dst, 0);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = src0;
+    reg2 = src2;
+    reg3 = reg5;
+    reg4 = src1;
+    reg5 = src3;
+    reg6 = reg8;
+    reg7 = src4;
+    reg8 = src7;
+    reg9 = reg11;
+    reg10 = src5;
+    reg11 = src8;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter, int32_t height,
+                                      int32_t width) {
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t cnt = width >> 4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  src -= src_stride3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; cnt--;) {
+    uint32_t loop_cnt = height >> 2;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_tmp, src_stride3);
+    src_tmp += src_stride4;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src_tmp += src_stride3;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+    src6 = __lsx_vxori_b(src6, 128);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg6, reg7, reg8, reg9);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+    for (; loop_cnt--;) {
+      src7 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+                src9);
+      src10 = __lsx_vldx(src_tmp, src_stride3);
+      src_tmp += src_stride4;
+      DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                src7, src8, src9, src10);
+      DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src0, src1, src2, src3);
+      DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src4, src5, src7, src8);
+      tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      __lsx_vst(tmp0, dst_tmp, 0);
+      __lsx_vstx(tmp1, dst_tmp, dst_stride);
+      tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      __lsx_vstx(tmp0, dst_tmp, dst_stride2);
+      __lsx_vstx(tmp1, dst_tmp, dst_stride3);
+      dst_tmp += dst_stride4;
+
+      reg0 = reg2;
+      reg1 = src0;
+      reg2 = src2;
+      reg3 = reg5;
+      reg4 = src1;
+      reg5 = src3;
+      reg6 = reg8;
+      reg7 = src4;
+      reg8 = src7;
+      reg9 = reg11;
+      reg10 = src5;
+      reg11 = src8;
+      src6 = src10;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            32);
+}
+
+static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            64);
+}
+
+static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i filt0, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += (src_stride4 + src_stride);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i vec6, vec7, vec8, vec9, vec10, vec11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt0;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride4;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += (src_stride4 + src_stride);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4,
+            vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8,
+            vec9, vec10, vec11);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11,
+            filt0, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+
+  __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+}
+
+static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7)
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  uint8_t *src_tmp;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+  src_tmp = src + 16;
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src7, src3, src8);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    src += src_stride4;
+    src_tmp += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    uint8_t *src_tmp0 = src + src_stride;
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48,
+              src2, src5, src8, src11);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 0);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 32);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 32);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 48);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 48);
+    dst += dst_stride2;
+    dst_tmp1 += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
new file mode 100644
index 0000000000..1dad29eeed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
@@ -0,0 +1,321 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1;
+  __m128i dst0, dst1;
+
+  int32_t src_stride2 = src_stride << 1;
+
+  if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      dst0 = __lsx_vld(dst, 0);
+      dst1 = __lsx_vldx(dst, dst_stride);
+      DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
+
+      __lsx_vstelm_w(dst0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_w(dst1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  __m128i src0, src1, src2, src3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  for (; cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst3, dst, 0, 0);
+    dst += dst_stride;
+  }
+}
+
+static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+    src7 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+    dst4 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6);
+    dst7 = __lsx_vldx(dst, dst_stride3);
+    dst -= dst_stride4;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst1, dst, dst_stride);
+    __lsx_vstx(dst2, dst, dst_stride2);
+    __lsx_vstx(dst3, dst, dst_stride3);
+    dst += dst_stride4;
+    __lsx_vst(dst4, dst, 0);
+    __lsx_vstx(dst5, dst, dst_stride);
+    __lsx_vstx(dst6, dst, dst_stride2);
+    __lsx_vstx(dst7, dst, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    uint8_t *dst_tmp = dst;
+    uint8_t *dst_tmp1 = dst_tmp + 16;
+    uint8_t *src_tmp = src + 16;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6,
+              dst7);
+    dst_tmp += dst_stride4;
+    dst_tmp1 += dst_stride4;
+
+    src_tmp = src + 16;
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src10, src11, src12, src13);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14,
+              dst15);
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    dst_tmp = dst + 16;
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst2, dst, dst_stride);
+    __lsx_vstx(dst4, dst, dst_stride2);
+    __lsx_vstx(dst6, dst, dst_stride3);
+    __lsx_vst(dst1, dst_tmp, 0);
+    __lsx_vstx(dst3, dst_tmp, dst_stride);
+    __lsx_vstx(dst5, dst_tmp, dst_stride2);
+    __lsx_vstx(dst7, dst_tmp, dst_stride3);
+    dst += dst_stride4;
+
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vstx(dst10, dst, dst_stride);
+    __lsx_vstx(dst12, dst, dst_stride2);
+    __lsx_vstx(dst14, dst, dst_stride3);
+    __lsx_vst(dst9, dst_tmp1, 0);
+    __lsx_vstx(dst11, dst_tmp1, dst_stride);
+    __lsx_vstx(dst13, dst_tmp1, dst_stride2);
+    __lsx_vstx(dst15, dst_tmp1, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  uint8_t *dst_tmp = dst;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (; cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10,
+              src11);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14,
+              src15);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst0, dst1, dst2, dst3);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst4, dst5, dst6, dst7);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst8, dst9, dst10, dst11);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst12, dst13, dst14, dst15);
+    dst_tmp += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    __lsx_vst(dst2, dst, 32);
+    __lsx_vst(dst3, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst4, dst, 0);
+    __lsx_vst(dst5, dst, 16);
+    __lsx_vst(dst6, dst, 32);
+    __lsx_vst(dst7, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vst(dst9, dst, 16);
+    __lsx_vst(dst10, dst, 32);
+    __lsx_vst(dst11, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst12, dst, 0);
+    __lsx_vst(dst13, dst, 16);
+    __lsx_vst(dst14, dst, 32);
+    __lsx_vst(dst15, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4,
+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                          int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+  switch (w) {
+    case 4: {
+      avg_width4_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+
+    case 8: {
+      avg_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      avg_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      avg_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      avg_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      int32_t lp, cnt;
+      for (cnt = h; cnt--;) {
+        for (lp = 0; lp < w; ++lp) {
+          dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+        }
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
new file mode 100644
index 0000000000..53dc7097ed
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -0,0 +1,437 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    for (cnt = height >> 3; cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 4) == 0) {
+    for (cnt = (height / 4); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t cnt, loop_cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = (uint8_t *)src;
+    dst_tmp = dst;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+      src0 = __lsx_vld(src_tmp, 0);
+      DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp,
+                src_stride3, src_tmp, src_stride4, src1, src2, src3, src4);
+      src_tmp += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src_tmp += src_stride2;
+      src7 = __lsx_vldx(src_tmp, src_stride);
+      src_tmp += src_stride2;
+
+      __lsx_vst(src0, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src1, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src2, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src3, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src4, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src5, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src6, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src7, dst, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  }
+}
+
+static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  switch (w) {
+    case 4: {
+      uint32_t cnt;
+      __m128i tmp;
+      for (cnt = h; cnt--;) {
+        tmp = __lsx_vldrepl_w(src, 0);
+        __lsx_vstelm_w(tmp, dst, 0, 0);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+    case 8: {
+      copy_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      copy_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      uint32_t cnt;
+      for (cnt = h; cnt--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
new file mode 100644
index 0000000000..d886b00198
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+                                          __m128i _reg2, __m128i _reg3,
+                                          __m128i _filter0, __m128i _filter1,
+                                          __m128i _filter2, __m128i _filter3) {
+  __m128i _vec0, _vec1;
+
+  _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
+  _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
+  _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
+  _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
+  return __lsx_vsadd_h(_vec0, _vec1);
+}
+
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+                                      __m128i _mask0, __m128i _mask1,
+                                      __m128i _mask2, __m128i _mask3,
+                                      __m128i _filt_h0, __m128i _filt_h1,
+                                      __m128i _filt_h2, __m128i _filt_h3) {
+  __m128i _tmp0, _tmp1, _tmp2, _tmp3;
+  __m128i _out;
+
+  DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
+            _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
+  _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
+                             _filt_h2, _filt_h3);
+  _out = __lsx_vsrari_h(_out, FILTER_BITS);
+  return __lsx_vsat_h(_out, 7);
+}
+
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
+                                         __m128i coeff) {
+  __m128i tmp0_m, tmp1_m;
+
+  tmp0_m = __lsx_vshuf_b(in1, in0, mask);
+  tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
+  return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
+}
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+  do {                                                      \
+    _src0 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src1 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src2 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src3 = __lsx_vld(_src, 0);                             \
+  } while (0)
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
+                                   _mask2, _mask3, _filter0, _filter1,         \
+                                   _filter2, _filter3, _out0, _out1)           \
+  do {                                                                         \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
+    __m128i _reg0, _reg1, _reg2, _reg3;                                        \
+                                                                               \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,       \
+              _tmp0, _tmp1);                                                   \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,       \
+              _tmp2, _tmp3);                                                   \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,         \
+              _filter1, _reg0, _reg1);                                         \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,       \
+              _tmp4, _tmp5);                                                   \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,       \
+              _tmp6, _tmp7);                                                   \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,         \
+              _filter3, _reg2, _reg3);                                         \
+    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);        \
+  } while (0)
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(                                            \
+    _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0,      \
+    _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3)                  \
+  do {                                                                         \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
+    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;            \
+                                                                               \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0,       \
+              _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \
+              _tmp3);                                                          \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,         \
+              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);          \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2,       \
+              _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \
+              _tmp3);                                                          \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,         \
+              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);          \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1,       \
+              _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \
+              _tmp7);                                                          \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,         \
+              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
+              _reg1, _reg2, _reg3);                                            \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3,       \
+              _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \
+              _tmp7);                                                          \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,         \
+              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
+              _reg5, _reg6, _reg7);                                            \
+    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3,  \
+              _reg7, _out0, _out1, _out2, _out3);                              \
+  } while (0)
+
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride)                \
+  do {                                                               \
+    __m128i tmp0_m, tmp1_m;                                          \
+                                                                     \
+    DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 1);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 1);                              \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/loopfilter.c b/media/libvpx/libvpx/vpx_dsp/loopfilter.c
index 9866ea37d6..d6504aab1f 100644
--- a/media/libvpx/libvpx/vpx_dsp/loopfilter.c
+++ b/media/libvpx/libvpx/vpx_dsp/loopfilter.c
@@ -81,11 +81,11 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
   int8_t filter1, filter2;
 
-  const int8_t ps1 = (int8_t)*op1 ^ 0x80;
-  const int8_t ps0 = (int8_t)*op0 ^ 0x80;
-  const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
-  const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
-  const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+  const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
+  const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
+  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
+  const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
+  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
 
   // add outer taps if we have high edge variance
   int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
@@ -99,39 +99,40 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
   filter1 = signed_char_clamp(filter + 4) >> 3;
   filter2 = signed_char_clamp(filter + 3) >> 3;
 
-  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
+  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
+  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
 
   // outer tap adjustments
   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
 
-  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
+  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
 }
 
-void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
-                            const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh) {
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+    filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
@@ -158,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
                            uint8_t *oq2, uint8_t *oq3) {
@@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
 
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-            s + 1 * p, s + 2 * p, s + 3 * p);
+    filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
+            s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
@@ -229,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
   vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
-static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
-                            uint8_t flat2, uint8_t *op7, uint8_t *op6,
+static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
+                            int8_t flat2, uint8_t *op7, uint8_t *op6,
                             uint8_t *op5, uint8_t *op4, uint8_t *op3,
                             uint8_t *op2, uint8_t *op1, uint8_t *op0,
                             uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
@@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int count) {
   int i;
@@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 =
-        flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                   s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
+    const int8_t flat2 = flat_mask5(
+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);
 
-    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
-             s + 7 * p);
+    filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+             s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+             s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch,
+             s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,
+             s + 7 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1);
 }
 
-void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2);
 }
 
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
   int i;
@@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
     filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
              s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
              s + 7);
-    s += p;
+    s += pitch;
   }
 }
 
-void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8);
 }
 
-void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -416,7 +423,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
   const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
   const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
   const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
-  const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+  const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
 
   // Add outer taps if we have high edge variance.
   int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
@@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
 }
 
-void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
   int i;
@@ -448,27 +455,28 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch];
+    const uint16_t p2 = s[-3 * pitch];
+    const uint16_t p1 = s[-2 * pitch];
+    const uint16_t p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch];
+    const uint16_t q1 = s[1 * pitch];
+    const uint16_t q2 = s[2 * pitch];
+    const uint16_t q3 = s[3 * pitch];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+    highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s,
+                   s + 1 * pitch, bd);
     ++s;
   }
 }
 
 void vpx_highbd_lpf_horizontal_4_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
@@ -497,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c(
                               bd);
 }
 
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
                                   uint16_t *oq2, uint16_t *oq3, int bd) {
@@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                   p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                   q3 = s[3 * pitch];
 
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
-                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+    highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,
+                   s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+                   s + 2 * pitch, s + 3 * pitch, bd);
     ++s;
   }
 }
 
 void vpx_highbd_lpf_horizontal_8_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
@@ -573,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c(
                               bd);
 }
 
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
-                                   uint8_t flat2, uint16_t *op7, uint16_t *op6,
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
+                                   int8_t flat2, uint16_t *op7, uint16_t *op6,
                                    uint16_t *op5, uint16_t *op4, uint16_t *op3,
                                    uint16_t *op2, uint16_t *op1, uint16_t *op0,
                                    uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
@@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
                                             const uint8_t *blimit,
                                             const uint8_t *limit,
                                             const uint8_t *thresh, int count,
@@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8 * count; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch];
+    const uint16_t p2 = s[-3 * pitch];
+    const uint16_t p1 = s[-2 * pitch];
+    const uint16_t p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch];
+    const uint16_t q1 = s[1 * pitch];
+    const uint16_t q2 = s[2 * pitch];
+    const uint16_t q3 = s[3 * pitch];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat2 =
-        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
+    const int8_t flat2 = highbd_flat_mask5(
+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd);
 
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
-                    s + 6 * p, s + 7 * p, bd);
+    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+                    s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+                    s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+                    s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch,
+                    s + 6 * pitch, s + 7 * pitch, bd);
     ++s;
   }
 }
 
-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
 }
 
-void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd);
 }
 
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int count,
@@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
     highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                     s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                     s + 5, s + 6, s + 7, bd);
-    s += p;
+    s += pitch;
   }
 }
 
-void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd);
 }
 
-void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c
index 43d2c1146e..97541411e4 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c
@@ -9,7 +9,9 @@
  */
 
 #include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise,
                              int blackclamp, int whiteclamp, int width,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c
index 52a24ed379..3fd18dec56 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c
@@ -7,7 +7,9 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdlib.h>
 
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -54,3 +56,676 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
 
   return sum_out;
 }
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride,
+                          int16_t *dst) {
+  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
+}
+
+void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride,
+                            int16_t *dst) {
+  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+
+  LD_SH2(src, 8, src0, src8);
+  src += src_stride;
+  LD_SH2(src, 8, src1, src9);
+  src += src_stride;
+  LD_SH2(src, 8, src2, src10);
+  src += src_stride;
+  LD_SH2(src, 8, src3, src11);
+  src += src_stride;
+  LD_SH2(src, 8, src4, src12);
+  src += src_stride;
+  LD_SH2(src, 8, src5, src13);
+  src += src_stride;
+  LD_SH2(src, 8, src6, src14);
+  src += src_stride;
+  LD_SH2(src, 8, src7, src15);
+  src += src_stride;
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src11, src4, src5, src6, src7);
+  ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);
+
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+                     src9, src10, src11, src12, src13, src14, src15);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+                     res1, res2, res3, res4, res5, res6, res7);
+
+  LD_SH2(src, 8, src0, src8);
+  src += src_stride;
+  LD_SH2(src, 8, src1, src9);
+  src += src_stride;
+  LD_SH2(src, 8, src2, src10);
+  src += src_stride;
+  LD_SH2(src, 8, src3, src11);
+  src += src_stride;
+
+  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);
+
+  LD_SH2(src, 8, src4, src12);
+  src += src_stride;
+  LD_SH2(src, 8, src5, src13);
+  src += src_stride;
+  LD_SH2(src, 8, src6, src14);
+  src += src_stride;
+  LD_SH2(src, 8, src7, src15);
+  src += src_stride;
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+  BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+              tmp4, tmp5, tmp1, tmp6, tmp2);
+  TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);
+
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+                     src9, src10, src11, src12, src13, src14, src15);
+  BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+              tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+  BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+              src12, src13, src15, src14, src11, src10);
+  BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+              tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+  TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+                     res1, res2, res3, res4, res5, res6, res7);
+  ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+  dst += 16;
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+  dst += 16;
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+  dst += 16;
+
+  LD_SH4(dst, 64, src0, src1, src2, src3);
+  LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+  BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+              tmp6, tmp7, tmp5, tmp3, tmp1);
+  SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+  SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+  BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+              src5, src7, src6, src3, src2);
+
+  ST_SH4(src0, src1, src2, src3, dst, 64);
+  ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+}
+
+int vpx_satd_msa(const int16_t *data, int length) {
+  int i, satd;
+  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
+  v8i16 zero = { 0 };
+  v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
+  v4u32 tmp0_w = { 0 };
+
+  if (16 == length) {
+    LD_SH2(data, 8, src0, src1);
+    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
+    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+    satd = HADD_UW_U32(tmp0_w);
+  } else if (64 == length) {
+    LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+
+    tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+    tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+    tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+    tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+    tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+    tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+    tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+    tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+    tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
+    tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+    tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+    tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+    tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+    tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+    tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+    tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+    satd = HADD_UW_U32(tmp0_w);
+  } else if (256 == length) {
+    for (i = 0; i < 2; ++i) {
+      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+      data += 8 * 8;
+      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
+      data += 8 * 8;
+
+      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
+      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
+      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
+      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
+      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
+      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
+      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
+      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
+
+      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+    }
+
+    satd = HADD_UW_U32(tmp0_w);
+  } else if (1024 == length) {
+    for (i = 0; i < 8; ++i) {
+      LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+      data += 8 * 8;
+      LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
+      data += 8 * 8;
+
+      tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+      tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+      tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+      tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+      tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+      tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+      tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+      tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+      tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
+      tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
+      tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
+      tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
+      tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
+      tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
+      tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
+      tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
+
+      tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+      tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+      tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+      tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+      tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+      tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+      tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+      tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+    }
+
+    satd = HADD_UW_U32(tmp0_w);
+  } else {
+    satd = 0;
+
+    for (i = 0; i < length; ++i) {
+      satd += abs(data[i]);
+    }
+  }
+
+  return satd;
+}
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
+                         const int ref_stride, const int height) {
+  int i;
+  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  v8i16 hbuf_r = { 0 };
+  v8i16 hbuf_l = { 0 };
+  v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
+  v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;
+
+  if (16 == height) {
+    for (i = 2; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, 3);
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else if (32 == height) {
+    for (i = 2; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, 4);
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else if (64 == height) {
+    for (i = 4; i--;) {
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+      ref += 8 * ref_stride;
+      UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+      UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+      UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+      UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+      UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+      UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+      UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+      UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+      ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+      ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+           hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+    }
+
+    SRA_2V(hbuf_r, hbuf_l, 5);
+    ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+  } else {
+    const int norm_factor = height >> 1;
+    int cnt;
+
+    for (cnt = 0; cnt < 16; cnt++) {
+      hbuf[cnt] = 0;
+    }
+
+    for (i = 0; i < height; ++i) {
+      for (cnt = 0; cnt < 16; cnt++) {
+        hbuf[cnt] += ref[cnt];
+      }
+
+      ref += ref_stride;
+    }
+
+    for (cnt = 0; cnt < 16; cnt++) {
+      hbuf[cnt] /= norm_factor;
+    }
+  }
+}
+
+int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) {
+  int16_t sum;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 ref0_h;
+
+  if (16 == width) {
+    ref0 = LD_UB(ref);
+    ref0_h = __msa_hadd_u_h(ref0, ref0);
+    sum = HADD_UH_U32(ref0_h);
+  } else if (32 == width) {
+    LD_UB2(ref, 16, ref0, ref1);
+    ref0_h = __msa_hadd_u_h(ref0, ref0);
+    ref0_h += __msa_hadd_u_h(ref1, ref1);
+    sum = HADD_UH_U32(ref0_h);
+  } else if (64 == width) {
+    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+    ref0_h = __msa_hadd_u_h(ref0, ref0);
+    ref0_h += __msa_hadd_u_h(ref1, ref1);
+    ref0_h += __msa_hadd_u_h(ref2, ref2);
+    ref0_h += __msa_hadd_u_h(ref3, ref3);
+    sum = HADD_UH_U32(ref0_h);
+  } else {
+    int idx;
+
+    sum = 0;
+    for (idx = 0; idx < width; ++idx) {
+      sum += ref[idx];
+    }
+  }
+
+  return sum;
+}
+
+int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) {
+  int sse, mean, var;
+  v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
+  v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
+  v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
+  v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
+  v4i32 res_l7_m, mean_v;
+  v2i64 sse_v;
+
+  if (2 == bwl) {
+    LD_SH2(src, 8, src0, src1);
+    LD_SH2(ref, 8, ref0, ref1);
+
+    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+    mean_v = res_l0_m + res_l1_m;
+    mean_v += res_l2_m + res_l3_m;
+
+    sse_v += __msa_splati_d(sse_v, 1);
+    sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+    mean = HADD_SW_S32(mean_v);
+  } else if (3 == bwl) {
+    LD_SH4(src, 8, src0, src1, src2, src3);
+    LD_SH4(ref, 8, ref0, ref1, ref2, ref3);
+
+    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
+    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
+    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+    mean_v = res_l0_m + res_l1_m;
+    mean_v += res_l2_m + res_l3_m;
+    mean_v += res_l4_m + res_l5_m;
+    mean_v += res_l6_m + res_l7_m;
+
+    sse_v += __msa_splati_d(sse_v, 1);
+    sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+    mean = HADD_SW_S32(mean_v);
+  } else if (4 == bwl) {
+    LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+
+    ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+    ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+    ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
+    ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
+    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+    sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+    sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+    mean_v = res_l0_m + res_l1_m;
+    mean_v += res_l2_m + res_l3_m;
+    mean_v += res_l4_m + res_l5_m;
+    mean_v += res_l6_m + res_l7_m;
+
+    ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
+    ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
+    ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
+    ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
+    HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+    HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+    HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+    HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+    DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+    DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+    mean_v += res_l0_m + res_l1_m;
+    mean_v += res_l2_m + res_l3_m;
+    mean_v += res_l4_m + res_l5_m;
+    mean_v += res_l6_m + res_l7_m;
+
+    sse_v += __msa_splati_d(sse_v, 1);
+    sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+    mean = HADD_SW_S32(mean_v);
+  } else {
+    int i;
+    const int width = 4 << bwl;
+
+    sse = 0;
+    mean = 0;
+
+    for (i = 0; i < width; ++i) {
+      const int diff = ref[i] - src[i];
+
+      mean += diff;
+      sse += diff * diff;
+    }
+  }
+
+  var = sse - ((mean * mean) >> (bwl + 2));
+
+  return var;
+}
+
+void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
+                        int *min, int *max) {
+  v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
+  v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;
+
+  LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
+  LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
+  PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
+  PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);
+
+  diff0 = __msa_asub_u_b(s0, d0);
+  diff1 = __msa_asub_u_b(s1, d1);
+  diff2 = __msa_asub_u_b(s2, d2);
+  diff3 = __msa_asub_u_b(s3, d3);
+
+  min0 = __msa_min_u_b(diff0, diff1);
+  min1 = __msa_min_u_b(diff2, diff3);
+  min0 = __msa_min_u_b(min0, min1);
+
+  max0 = __msa_max_u_b(diff0, diff1);
+  max1 = __msa_max_u_b(diff2, diff3);
+  max0 = __msa_max_u_b(max0, max1);
+
+  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
+  min0 = __msa_min_u_b(min0, min1);
+  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
+  max0 = __msa_max_u_b(max0, max1);
+
+  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
+  min0 = __msa_min_u_b(min0, min1);
+  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
+  max0 = __msa_max_u_b(max0, max1);
+
+  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
+  min0 = __msa_min_u_b(min0, min1);
+  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
+  max0 = __msa_max_u_b(max0, max1);
+
+  min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
+  min0 = __msa_min_u_b(min0, min1);
+  max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
+  max0 = __msa_max_u_b(max0, max1);
+
+  *min = min0[0];
+  *max = max0[0];
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h
index 0a42f5cec2..87a5bbab56 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_COMMON_MIPS_DSPR2_H_
-#define VPX_COMMON_MIPS_DSPR2_H_
+#ifndef VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_
 
 #include <assert.h>
 #include "./vpx_config.h"
@@ -45,4 +45,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) {
 }  // extern "C"
 #endif
 
-#endif  // VPX_COMMON_MIPS_DSPR2_H_
+#endif  // VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
index ae88eddfd6..18e7d5375d 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
 
 void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
+                                  const InterpKernel *filter, int x0_q4,
+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int16_t *const filter_y = filter[y0_q4];
   uint32_t pos = 38;
 
   assert(y_step_q4 == 16);
@@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                     h);
       break;
     default:
-      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                               x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                               x_step_q4, y0_q4, y_step_q4, w, h);
       break;
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
index e944207b6e..7dcb662d7f 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
 
 void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
   uint32_t pos = 38;
 
   assert(x_step_q4 == 16);
@@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                      h);
       break;
     default:
-      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w, h);
       break;
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
index 5cc06b5f26..9e65a8f50f 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
 
 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
+                               const InterpKernel *filter, int x0_q4,
+                               int32_t x_step_q4, int y0_q4, int y_step_q4,
+                               int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
   uint32_t pos = 38;
 
   assert(x_step_q4 == 16);
@@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  (int32_t)dst_stride, filter_x, (int32_t)h);
       break;
     default:
-      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
       break;
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
index eb1975e447..a3e967b405 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
+                              const InterpKernel *filter, int x0_q4,
+                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+                              int w, int h) {
+  const int16_t *const filter_y = filter[y0_q4];
   uint32_t pos = 38;
 
   assert(y_step_q4 == 16);
@@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
       convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
       break;
     default:
-      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
       break;
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
index b4ed6ee850..cc458c8618 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/mips/convolve_common_dspr2.h"
 #include "vpx_dsp/vpx_convolve.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 #if HAVE_DSPR2
@@ -334,15 +335,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
+                                  const InterpKernel *filter, int x0_q4,
+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int16_t *const filter_y = filter[y0_q4];
   assert(y_step_q4 == 16);
   assert(((const int32_t *)filter_y)[1] != 0x800000);
 
-  if (((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
   } else {
     uint32_t pos = 38;
 
@@ -367,8 +369,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    h);
         break;
       default:
-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
@@ -376,8 +378,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
 
 void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
   DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
@@ -390,24 +392,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
 
   if (intermediate_height < h) intermediate_height = h;
 
-  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
-                      x_step_q4, filter_y, y_step_q4, w, intermediate_height);
+  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                      intermediate_height);
 
-  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                         x_step_q4, filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
+                         x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
   int x, y;
   uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   /* prefetch data to cache memory */
   prefetch_load(src);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
index 9a9bab25a5..7a9aa49d8a 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/mips/convolve_common_dspr2.h"
 #include "vpx_dsp/vpx_convolve.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 #include "vpx_ports/mem.h"
 
 #if HAVE_DSPR2
@@ -938,15 +939,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
 
 void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
   assert(x_step_q4 == 16);
   assert(((const int32_t *)filter_x)[1] != 0x800000);
 
-  if (((const int32_t *)filter_x)[0] == 0) {
-    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
   } else {
     uint32_t pos = 38;
 
@@ -987,9 +989,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                     h);
         break;
       default:
-        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                  h);
+        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c
index 8d35b6394e..1e7052f6c5 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c
@@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
 }
 
 void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
                          int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
   DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
   uint32_t pos = 38;
@@ -1320,7 +1322,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   if (filter_x[3] == 0x80) {
     copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
                           intermediate_height, w, intermediate_height);
-  } else if (((const int32_t *)filter_x)[0] == 0) {
+  } else if (vpx_get_filter_taps(filter_x) == 2) {
     vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
                         intermediate_height, filter_x, w, intermediate_height);
   } else {
@@ -1363,7 +1365,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   /* copy the src to dst */
   if (filter_y[3] == 0x80) {
     copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
-  } else if (((const int32_t *)filter_y)[0] == 0) {
+  } else if (vpx_get_filter_taps(filter_y) == 2) {
     vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
                         filter_y, h, w);
   } else {
@@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 
 void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h) {
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
   int x, y;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   /* prefetch data to cache memory */
   prefetch_load(src);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
index 196a0a2f0b..09d6f36e56 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
 
 void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
+  const int16_t *const filter_x = filter[x0_q4];
   assert(x_step_q4 == 16);
   assert(((const int32_t *)filter_x)[1] != 0x800000);
 
-  if (((const int32_t *)filter_x)[0] == 0) {
-    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
   } else {
     uint32_t pos = 38;
 
@@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                 (int32_t)dst_stride, filter_x, (int32_t)h);
         break;
       default:
-        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
index ad107d5c47..fd977b5336 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h) {
+  const int16_t *const filter_y = filter[y0_q4];
   assert(y_step_q4 == 16);
   assert(((const int32_t *)filter_y)[1] != 0x800000);
 
-  if (((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
   } else {
     uint32_t pos = 38;
 
@@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
         convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
         break;
       default:
-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
index 4eee3bd5e1..14b65bc650 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
-#define VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
+#ifndef VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
 
 #include <assert.h>
 
@@ -24,21 +24,21 @@ extern "C" {
 #if HAVE_DSPR2
 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h);
+                               const InterpKernel *filter, int x0_q4,
+                               int32_t x_step_q4, int y0_q4, int y_step_q4,
+                               int w, int h);
 
 void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h);
 
 void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h);
+                                  const InterpKernel *filter, int x0_q4,
+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h);
 
 void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter, int w,
@@ -46,13 +46,13 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 
 void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h);
+                              const InterpKernel *filter, int x0_q4,
+                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+                              int w, int h);
 
 #endif  // #if HAVE_DSPR2
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_
+#endif  // VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c
index e33ea740a9..4e93ff594d 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c
@@ -9,42 +9,43 @@
  */
 
 #include <stdlib.h>
-#include "./macros_msa.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 extern const int16_t vpx_rv[];
 
-#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0,  \
-                                out1, out2, out3, out4, out5, out6, out7,      \
-                                out8, out9, out10, out11, out12, out13, out14, \
-                                out15)                                         \
-  {                                                                            \
-    v8i16 temp0, temp1, temp2, temp3, temp4;                                   \
-    v8i16 temp5, temp6, temp7, temp8, temp9;                                   \
-                                                                               \
-    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,    \
-               temp3);                                                         \
-    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                   \
-    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                                   \
-    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,    \
-               temp3);                                                         \
-    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_UB(temp5, temp4, out8, out10);                                    \
-    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_UB(temp5, temp4, out12, out14);                                   \
-    out0 = (v16u8)temp6;                                                       \
-    out2 = (v16u8)temp7;                                                       \
-    out4 = (v16u8)temp8;                                                       \
-    out6 = (v16u8)temp9;                                                       \
-    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                      \
-    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);                   \
-    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);                   \
-    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);                   \
-    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                      \
-    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                      \
-    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                      \
-    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                      \
+#define VPX_TRANSPOSE8x16_UB_UB(                                            \
+    in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4,   \
+    out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \
+  {                                                                         \
+    v8i16 temp0, temp1, temp2, temp3, temp4;                                \
+    v8i16 temp5, temp6, temp7, temp8, temp9;                                \
+                                                                            \
+    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+               temp3);                                                      \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                   \
+    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                   \
+    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                                \
+    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+               temp3);                                                      \
+    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                   \
+    ILVRL_W2_UB(temp5, temp4, out8, out10);                                 \
+    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                   \
+    ILVRL_W2_UB(temp5, temp4, out12, out14);                                \
+    out0 = (v16u8)temp6;                                                    \
+    out2 = (v16u8)temp7;                                                    \
+    out4 = (v16u8)temp8;                                                    \
+    out6 = (v16u8)temp9;                                                    \
+    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                   \
+    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);                \
+    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);                \
+    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);                \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                   \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                   \
+    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                   \
+    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                   \
   }
 
 #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
@@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
   uint8_t *p_dst_st = dst_ptr;
   uint8_t *f_orig = f;
   uint16_t col;
+  uint64_t out0, out1, out2, out3;
   v16u8 above2, above1, below2, below1;
   v16u8 src, ref, ref_temp;
   v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
@@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
     f += 16;
   }
 
+  if (0 != (cols / 16)) {
+    ref = LD_UB(f);
+    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+    src = LD_UB(p_src);
+    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+    above2 = LD_UB(p_src + 3 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+    above1 = LD_UB(p_src + 4 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+    src = LD_UB(p_src + 5 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+    below1 = LD_UB(p_src + 6 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+    below2 = LD_UB(p_src + 7 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+    above2 = LD_UB(p_src + 8 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+    above1 = LD_UB(p_src + 9 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+    src = LD_UB(p_src + 10 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+    below1 = LD_UB(p_src + 11 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+    below2 = LD_UB(p_src + 12 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+    above2 = LD_UB(p_src + 13 * src_stride);
+    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+    above1 = LD_UB(p_src + 14 * src_stride);
+    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+    src = LD_UB(p_src + 15 * src_stride);
+    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+    below1 = LD_UB(p_src + 16 * src_stride);
+    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+    below2 = LD_UB(p_src + 17 * src_stride);
+    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+    out0 = __msa_copy_u_d((v2i64)inter0, 0);
+    out1 = __msa_copy_u_d((v2i64)inter1, 0);
+    out2 = __msa_copy_u_d((v2i64)inter2, 0);
+    out3 = __msa_copy_u_d((v2i64)inter3, 0);
+    SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+    out0 = __msa_copy_u_d((v2i64)inter4, 0);
+    out1 = __msa_copy_u_d((v2i64)inter5, 0);
+    out2 = __msa_copy_u_d((v2i64)inter6, 0);
+    out3 = __msa_copy_u_d((v2i64)inter7, 0);
+    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+
+    out0 = __msa_copy_u_d((v2i64)inter8, 0);
+    out1 = __msa_copy_u_d((v2i64)inter9, 0);
+    out2 = __msa_copy_u_d((v2i64)inter10, 0);
+    out3 = __msa_copy_u_d((v2i64)inter11, 0);
+    SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride);
+
+    out0 = __msa_copy_u_d((v2i64)inter12, 0);
+    out1 = __msa_copy_u_d((v2i64)inter13, 0);
+    out2 = __msa_copy_u_d((v2i64)inter14, 0);
+    out3 = __msa_copy_u_d((v2i64)inter15, 0);
+    SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride);
+  }
+
   f = f_orig;
   p_dst = dst_ptr - 2;
   LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
@@ -446,11 +509,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
   }
 }
 
-void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
-                                   int32_t rows, int32_t cols, int32_t flimit) {
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows,
+                                   int32_t cols, int32_t flimit) {
   int32_t row, col, cnt;
-  uint8_t *src_dup = src_ptr;
-  v16u8 src0, src, tmp_orig;
+  uint8_t *src_dup = src;
+  v16u8 src0, src1, tmp_orig;
   v16u8 tmp = { 0 };
   v16i8 zero = { 0 };
   v8u16 sum_h, src_r_h, src_l_h;
@@ -469,13 +532,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
     src_dup[cols + 16] = src_dup[cols - 1];
     tmp_orig = (v16u8)__msa_ldi_b(0);
     tmp_orig[15] = tmp[15];
-    src = LD_UB(src_dup - 8);
-    src[15] = 0;
-    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+    src1 = LD_UB(src_dup - 8);
+    src1[15] = 0;
+    ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
     src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
     src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
     sum_sq = HADD_SW_S32(src_r_w) + 16;
-    sum_h = __msa_hadd_u_h(src, src);
+    sum_h = __msa_hadd_u_h(src1, src1);
     sum = HADD_UH_U32(sum_h);
     {
       v16u8 src7, src8, src_r, src_l;
@@ -504,8 +567,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
           sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
         }
         sum = sum_l[7];
-        src = LD_UB(src_dup + 16 * col);
-        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+        src1 = LD_UB(src_dup + 16 * col);
+        ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
         src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
         src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
         tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
@@ -551,7 +614,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
         total3 = (total3 < flimit_vec);
         PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
         mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
-        tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
+        tmp = __msa_bmz_v(tmp, src1, (v16u8)mask);
 
         if (col == 0) {
           uint64_t src_d;
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
index e41a904808..36583e2d24 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
 
 static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
@@ -927,21 +928,21 @@ void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
 }
 
 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  int sum = LD_HADD(input, stride);
-  sum += LD_HADD(input + 8, stride);
-  sum += LD_HADD(input + 16, stride);
-  sum += LD_HADD(input + 24, stride);
-  sum += LD_HADD(input + 32 * 8, stride);
-  sum += LD_HADD(input + 32 * 8 + 8, stride);
-  sum += LD_HADD(input + 32 * 8 + 16, stride);
-  sum += LD_HADD(input + 32 * 8 + 24, stride);
-  sum += LD_HADD(input + 32 * 16, stride);
-  sum += LD_HADD(input + 32 * 16 + 8, stride);
-  sum += LD_HADD(input + 32 * 16 + 16, stride);
-  sum += LD_HADD(input + 32 * 16 + 24, stride);
-  sum += LD_HADD(input + 32 * 24, stride);
-  sum += LD_HADD(input + 32 * 24 + 8, stride);
-  sum += LD_HADD(input + 32 * 24 + 16, stride);
-  sum += LD_HADD(input + 32 * 24 + 24, stride);
+  int sum, i;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w = { 0 };
+
+  for (i = 0; i < 16; ++i) {
+    LD_SH4(input, 8, in0, in1, in2, in3);
+    input += stride;
+    LD_SH4(input, 8, in4, in5, in6, in7);
+    input += stride;
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+    ADD2(in0, in2, in4, in6, in0, in4);
+    vec_w += __msa_hadd_s_w(in0, in0);
+    vec_w += __msa_hadd_s_w(in4, in4);
+  }
+
+  sum = HADD_SW_S32(vec_w);
   out[0] = (int16_t)(sum >> 3);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
index fdead50503..5a6dfcef2f 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c
@@ -8,8 +8,23 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/fwd_txfm_msa.h"
 
+void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) {
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w;
+
+  LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+  ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+  ADD2(in0, in2, in4, in6, in0, in4);
+  vec_w = __msa_hadd_s_w(in0, in0);
+  vec_w += __msa_hadd_s_w(in4, in4);
+  out[0] = HADD_SW_S32(vec_w);
+  out[1] = 0;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
 void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                         int32_t src_stride) {
   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
@@ -215,11 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output,
   ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
 }
 
-void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[0] = LD_HADD(input, stride);
-  out[1] = 0;
-}
-
 void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
                        int32_t src_stride) {
   int32_t i;
@@ -237,9 +247,26 @@ void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
 }
 
 void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  int sum = LD_HADD(input, stride);
-  sum += LD_HADD(input + 8, stride);
-  sum += LD_HADD(input + 16 * 8, stride);
-  sum += LD_HADD(input + 16 * 8 + 8, stride);
+  int sum, i;
+  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+  v4i32 vec_w = { 0 };
+
+  for (i = 0; i < 4; ++i) {
+    LD_SH2(input, 8, in0, in1);
+    input += stride;
+    LD_SH2(input, 8, in2, in3);
+    input += stride;
+    LD_SH2(input, 8, in4, in5);
+    input += stride;
+    LD_SH2(input, 8, in6, in7);
+    input += stride;
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6);
+    ADD2(in0, in2, in4, in6, in0, in4);
+    vec_w += __msa_hadd_s_w(in0, in0);
+    vec_w += __msa_hadd_s_w(in4, in4);
+  }
+
+  sum = HADD_SW_S32(vec_w);
   out[0] = (int16_t)(sum >> 1);
 }
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
index db5e90e7b9..c0be56b819 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h
@@ -8,28 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_
-#define VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+#ifndef VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+#define VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_
 
 #include "vpx_dsp/mips/txfm_macros_msa.h"
 #include "vpx_dsp/txfm_common.h"
 
-#define LD_HADD(psrc, stride)                                                  \
-  ({                                                                           \
-    v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m;              \
-    v4i32 vec_w_m;                                                             \
-                                                                               \
-    LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m);                        \
-    ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m);                            \
-    LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m);         \
-    ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \
-         in0_m, in4_m);                                                        \
-    in0_m += in4_m;                                                            \
-                                                                               \
-    vec_w_m = __msa_hadd_s_w(in0_m, in0_m);                                    \
-    HADD_SW_S32(vec_w_m);                                                      \
-  })
-
 #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                  \
   {                                                                            \
     v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m;                                  \
@@ -377,4 +361,4 @@
 void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
                         int32_t src_stride);
 void fdct16x8_1d_row(int16_t *input, int16_t *output);
-#endif  // VPX_DSP_MIPS_FWD_TXFM_MSA_H_
+#endif  // VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c
index 2a211c5677..7ca61a28ec 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c
index 2ea6136f9b..053948183a 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 static void idct32x8_row_transpose_store(const int16_t *input,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c
index 0a85742f10..56ffec3cba 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c
index 7f77d20191..a383ff2066 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/inv_txfm_msa.h"
 
 void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
index 27881f0db6..cbea22f20f 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
-#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
 
 #include <assert.h>
 
@@ -25,7 +25,6 @@ extern "C" {
 #if HAVE_DSPR2
 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input)                         \
   ({                                                                           \
-                                                                               \
     int32_t tmp, out;                                                          \
     int dct_cost_rounding = DCT_CONST_ROUNDING;                                \
     int in = input;                                                            \
@@ -73,4 +72,4 @@ void iadst16_dspr2(const int16_t *input, int16_t *output);
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
+#endif  // VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h
index 1fe9b28e8a..3b66249ef2 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_
-#define VPX_DSP_MIPS_INV_TXFM_MSA_H_
+#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
+#define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 #include "vpx_dsp/mips/txfm_macros_msa.h"
@@ -408,4 +408,4 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
 void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                        int32_t dst_stride);
 void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
-#endif  // VPX_DSP_MIPS_INV_TXFM_MSA_H_
+#endif  // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c
index 3f985b847b..e214b538d4 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c
@@ -343,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) {
     return;
   }
 
+  // 32-bit result is enough for the following multiplications.
   s0 = sinpi_1_9 * x0;
   s1 = sinpi_2_9 * x0;
   s2 = sinpi_3_9 * x1;
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
index b73d56bd55..b1731f2345 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c
@@ -8,13 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_ports/mem.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/loopfilter_msa.h"
+#include "vpx_ports/mem.h"
 
-int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
-                                 const uint8_t *b_limit_ptr,
-                                 const uint8_t *limit_ptr,
-                                 const uint8_t *thresh_ptr) {
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
+                                    uint8_t *filter48,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
   v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
   }
 }
 
-void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -413,11 +415,11 @@ static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
 
   (void)count;
 
-  early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
-                                        limit_ptr, thresh_ptr);
+  early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+                                    limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    vpx_hz_lpf_t16_16w(src, pitch, filter48);
+    hz_lpf_t16_16w(src, pitch, filter48);
   }
 }
 
@@ -753,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
   ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
 }
 
-int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
-                                uint8_t *src_org, int32_t pitch_org,
-                                const uint8_t *b_limit_ptr,
-                                const uint8_t *limit_ptr,
-                                const uint8_t *thresh_ptr) {
+static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+                                   uint8_t *src_org, int32_t pitch_org,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
   v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -820,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
   }
 }
 
-int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
-                          uint8_t *filter48) {
+static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                             uint8_t *filter48) {
   v16i8 zero = { 0 };
   v16u8 filter8, flat, flat2;
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -1051,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
   transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
 
   early_exit =
-      vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
-                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+      vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch,
+                          b_limit_ptr, limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
-                                   &filter48[0]);
+    early_exit =
+        vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
 
     if (0 == early_exit) {
       transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
@@ -1064,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
   }
 }
 
-int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
-                                 uint8_t *src_org, int32_t pitch,
-                                 const uint8_t *b_limit_ptr,
-                                 const uint8_t *limit_ptr,
-                                 const uint8_t *thresh_ptr) {
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+                                    uint8_t *src_org, int32_t pitch,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
   v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -1141,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
   }
 }
 
-int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
-                           uint8_t *filter48) {
+static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+                              uint8_t *filter48) {
   v16u8 flat, flat2, filter8;
   v16i8 zero = { 0 };
   v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
@@ -1473,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
   transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
 
   early_exit =
-      vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
-                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+      vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+                           pitch, b_limit_ptr, limit_ptr, thresh_ptr);
 
   if (0 == early_exit) {
-    early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
-                                    &filter48[0]);
+    early_exit =
+        vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
 
     if (0 == early_exit) {
       transpose_16x16(transposed_input, 16, (src - 8), pitch);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
index 9500cd2fd8..0eff2b6ca9 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/loopfilter_msa.h"
 
 void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
index a22c62bb3a..703fcce8a7 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/loopfilter_msa.h"
 
 void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
index 5b0c73345b..ec339be868 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
-#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
 
 #include <stdlib.h>
 
@@ -731,4 +731,4 @@ static INLINE void wide_mbfilter_dspr2(
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+#endif  // VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
index 38ed0b2a63..9af0b42360 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
-#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
 
 #include <stdlib.h>
 
@@ -432,4 +432,4 @@ extern "C" {
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+#endif  // VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
index ee11142266..24c492bea0 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
-#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
 
 #include <stdlib.h>
 
@@ -352,4 +352,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
 }  // extern "C"
 #endif
 
-#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+#endif  // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h
index 49fd74c25a..1ea05e0b0b 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_LOOPFILTER_MSA_H_
-#define VPX_DSP_LOOPFILTER_MSA_H_
+#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_
+#define VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -174,4 +174,4 @@
     mask_out = limit_in < (v16u8)mask_out;                                   \
     mask_out = __msa_xori_b(mask_out, 0xff);                                 \
   }
-#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */
+#endif  // VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h
index 002e574aa8..da79b40a7a 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h
@@ -8,215 +8,159 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_MACROS_MSA_H_
-#define VPX_DSP_MIPS_MACROS_MSA_H_
+#ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_
+#define VPX_VPX_DSP_MIPS_MACROS_MSA_H_
 
 #include <msa.h>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
+#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
+#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
 
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
+#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
+#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
 
 #if (__mips_isa_rev >= 6)
-#define LH(psrc)                                          \
-  ({                                                      \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
-    uint16_t val_m;                                       \
-                                                          \
-    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
-                                                          \
-                         : [val_m] "=r"(val_m)            \
-                         : [psrc_m] "m"(*psrc_m));        \
-                                                          \
-    val_m;                                                \
+#define LH(psrc)                                   \
+  ({                                               \
+    uint16_t val_lh_m = *(const uint16_t *)(psrc); \
+    val_lh_m;                                      \
   })
 
-#define LW(psrc)                                          \
-  ({                                                      \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
-    uint32_t val_m;                                       \
-                                                          \
-    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
-                                                          \
-                         : [val_m] "=r"(val_m)            \
-                         : [psrc_m] "m"(*psrc_m));        \
-                                                          \
-    val_m;                                                \
+#define LW(psrc)                                   \
+  ({                                               \
+    uint32_t val_lw_m = *(const uint32_t *)(psrc); \
+    val_lw_m;                                      \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                          \
-  ({                                                      \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
-    uint64_t val_m = 0;                                   \
-                                                          \
-    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
-                                                          \
-                         : [val_m] "=r"(val_m)            \
-                         : [psrc_m] "m"(*psrc_m));        \
-                                                          \
-    val_m;                                                \
+#define LD(psrc)                                   \
+  ({                                               \
+    uint64_t val_ld_m = *(const uint64_t *)(psrc); \
+    val_ld_m;                                      \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m);                                    \
-    val1_m = LW(psrc_m + 4);                                \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);           \
+    uint32_t val0_ld_m, val1_ld_m;                                \
+    uint64_t val_ld_m = 0;                                        \
+                                                                  \
+    val0_ld_m = LW(psrc_ld_m);                                    \
+    val1_ld_m = LW(psrc_ld_m + 4);                                \
+                                                                  \
+    val_ld_m = (uint64_t)(val1_ld_m);                             \
+    val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
+    val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m);        \
+                                                                  \
+    val_ld_m;                                                     \
   })
 #endif  // (__mips == 64)
 
-#define SH(val, pdst)                                     \
-  {                                                       \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
-    const uint16_t val_m = (val);                         \
-                                                          \
-    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
-                                                          \
-                         : [pdst_m] "=m"(*pdst_m)         \
-                         : [val_m] "r"(val_m));           \
-  }
-
-#define SW(val, pdst)                                     \
-  {                                                       \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
-    const uint32_t val_m = (val);                         \
-                                                          \
-    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
-                                                          \
-                         : [pdst_m] "=m"(*pdst_m)         \
-                         : [val_m] "r"(val_m));           \
-  }
-
-#define SD(val, pdst)                                     \
-  {                                                       \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
-    const uint64_t val_m = (val);                         \
-                                                          \
-    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
-                                                          \
-                         : [pdst_m] "=m"(*pdst_m)         \
-                         : [val_m] "r"(val_m));           \
-  }
+#define SH(val, pdst) *(uint16_t *)(pdst) = (val);
+#define SW(val, pdst) *(uint32_t *)(pdst) = (val);
+#define SD(val, pdst) *(uint64_t *)(pdst) = (val);
 #else  // !(__mips_isa_rev >= 6)
-#define LH(psrc)                                           \
-  ({                                                       \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
-    uint16_t val_m;                                        \
-                                                           \
-    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
-                                                           \
-                         : [val_m] "=r"(val_m)             \
-                         : [psrc_m] "m"(*psrc_m));         \
-                                                           \
-    val_m;                                                 \
+#define LH(psrc)                                                 \
+  ({                                                             \
+    const uint8_t *psrc_lh_m = (const uint8_t *)(psrc);          \
+    uint16_t val_lh_m;                                           \
+                                                                 \
+    __asm__ __volatile__("ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t" \
+                                                                 \
+                         : [val_lh_m] "=r"(val_lh_m)             \
+                         : [psrc_lh_m] "m"(*psrc_lh_m));         \
+                                                                 \
+    val_lh_m;                                                    \
   })
 
-#define LW(psrc)                                           \
-  ({                                                       \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
-    uint32_t val_m;                                        \
-                                                           \
-    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
-                                                           \
-                         : [val_m] "=r"(val_m)             \
-                         : [psrc_m] "m"(*psrc_m));         \
-                                                           \
-    val_m;                                                 \
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+    uint32_t val_lw_m;                                  \
+                                                        \
+    __asm__ __volatile__(                               \
+        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"         \
+        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"         \
+        : [val_lw_m] "=&r"(val_lw_m)                    \
+        : [psrc_lw_m] "r"(psrc_lw_m));                  \
+                                                        \
+    val_lw_m;                                           \
   })
 
 #if (__mips == 64)
-#define LD(psrc)                                           \
-  ({                                                       \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
-    uint64_t val_m = 0;                                    \
-                                                           \
-    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
-                                                           \
-                         : [val_m] "=r"(val_m)             \
-                         : [psrc_m] "m"(*psrc_m));         \
-                                                           \
-    val_m;                                                 \
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+    uint64_t val_ld_m = 0;                              \
+                                                        \
+    __asm__ __volatile__(                               \
+        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"         \
+        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"         \
+        : [val_ld_m] "=&r"(val_ld_m)                    \
+        : [psrc_ld_m] "r"(psrc_ld_m));                  \
+                                                        \
+    val_ld_m;                                           \
   })
 #else  // !(__mips == 64)
-#define LD(psrc)                                                              \
-  ({                                                                          \
-    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);                         \
-    uint32_t val0_m, val1_m;                                                  \
-    uint64_t val_m_combined = 0;                                              \
-                                                                              \
-    val0_m = LW(psrc_m1);                                                     \
-    val1_m = LW(psrc_m1 + 4);                                                 \
-                                                                              \
-    val_m_combined = (uint64_t)(val1_m);                                      \
-    val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
-    val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m);           \
-                                                                              \
-    val_m_combined;                                                           \
+#define LD(psrc)                                                  \
+  ({                                                              \
+    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc);           \
+    uint32_t val0_ld_m, val1_ld_m;                                \
+    uint64_t val_ld_m = 0;                                        \
+                                                                  \
+    val0_ld_m = LW(psrc_ld_m);                                    \
+    val1_ld_m = LW(psrc_ld_m + 4);                                \
+                                                                  \
+    val_ld_m = (uint64_t)(val1_ld_m);                             \
+    val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \
+    val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m);        \
+                                                                  \
+    val_ld_m;                                                     \
   })
 #endif  // (__mips == 64)
 
-#define SH(val, pdst)                                      \
-  {                                                        \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
-    const uint16_t val_m = (val);                          \
-                                                           \
-    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
-                                                           \
-                         : [pdst_m] "=m"(*pdst_m)          \
-                         : [val_m] "r"(val_m));            \
+#define SH(val, pdst)                                            \
+  {                                                              \
+    uint8_t *pdst_sh_m = (uint8_t *)(pdst);                      \
+    const uint16_t val_sh_m = (val);                             \
+                                                                 \
+    __asm__ __volatile__("ush  %[val_sh_m],  %[pdst_sh_m]  \n\t" \
+                                                                 \
+                         : [pdst_sh_m] "=m"(*pdst_sh_m)          \
+                         : [val_sh_m] "r"(val_sh_m));            \
   }
 
-#define SW(val, pdst)                                      \
-  {                                                        \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
-    const uint32_t val_m = (val);                          \
-                                                           \
-    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
-                                                           \
-                         : [pdst_m] "=m"(*pdst_m)          \
-                         : [val_m] "r"(val_m));            \
+#define SW(val, pdst)                                            \
+  {                                                              \
+    uint8_t *pdst_sw_m = (uint8_t *)(pdst);                      \
+    const uint32_t val_sw_m = (val);                             \
+                                                                 \
+    __asm__ __volatile__("usw  %[val_sw_m],  %[pdst_sw_m]  \n\t" \
+                                                                 \
+                         : [pdst_sw_m] "=m"(*pdst_sw_m)          \
+                         : [val_sw_m] "r"(val_sw_m));            \
   }
 
-#define SD(val, pdst)                                        \
-  {                                                          \
-    uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
-    uint32_t val0_m, val1_m;                                 \
-                                                             \
-    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
-    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
-                                                             \
-    SW(val0_m, pdst_m1);                                     \
-    SW(val1_m, pdst_m1 + 4);                                 \
+#define SD(val, pdst)                                           \
+  {                                                             \
+    uint8_t *pdst_sd_m = (uint8_t *)(pdst);                     \
+    uint32_t val0_sd_m, val1_sd_m;                              \
+                                                                \
+    val0_sd_m = (uint32_t)((val) & 0x00000000FFFFFFFF);         \
+    val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+                                                                \
+    SW(val0_sd_m, pdst_sd_m);                                   \
+    SW(val1_sd_m, pdst_sd_m + 4);                               \
   }
 #endif  // (__mips_isa_rev >= 6)
 
@@ -283,97 +227,73 @@
     SD(in3, (pdst) + 3 * stride);             \
   }
 
-/* Description : Load vectors with 16 byte elements with stride
+/* Description : Load vector elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Load 16 byte elements in 'out0' from (psrc)
                  Load 16 byte elements in 'out1' from (psrc + stride)
 */
-#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+#define LD_V2(RTYPE, psrc, stride, out0, out1) \
   {                                            \
-    out0 = LD_B(RTYPE, (psrc));                \
-    out1 = LD_B(RTYPE, (psrc) + stride);       \
+    out0 = LD_V(RTYPE, (psrc));                \
+    out1 = LD_V(RTYPE, (psrc) + stride);       \
   }
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
+#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
+#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
 
-#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
+#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
   {                                                  \
-    LD_B2(RTYPE, (psrc), stride, out0, out1);        \
-    out2 = LD_B(RTYPE, (psrc) + 2 * stride);         \
+    LD_V2(RTYPE, (psrc), stride, out0, out1);        \
+    out2 = LD_V(RTYPE, (psrc) + 2 * stride);         \
   }
-#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
 
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
   {                                                        \
-    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
-    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+    LD_V2(RTYPE, (psrc), stride, out0, out1);              \
+    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
   }
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
+#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
 
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
   {                                                              \
-    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
-    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                     \
+    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
+    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                     \
   }
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
 
-#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
+#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
   {                                                                          \
-    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);              \
-    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);                   \
+    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);              \
+    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);                   \
   }
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
 
-#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
               out7)                                                          \
   {                                                                          \
-    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
-    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
+    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
+    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
   }
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
+#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
 
-/* Description : Load vectors with 8 halfword elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-   Details     : Load 8 halfword elements in 'out0' from (psrc)
-                 Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) \
-  {                                            \
-    out0 = LD_H(RTYPE, (psrc));                \
-    out1 = LD_H(RTYPE, (psrc) + (stride));     \
-  }
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
-  {                                                        \
-    LD_H2(RTYPE, (psrc), stride, out0, out1);              \
-    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
-  }
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
-              out7)                                                          \
-  {                                                                          \
-    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
-    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
-  }
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,  \
+#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,  \
                out7, out8, out9, out10, out11, out12, out13, out14, out15)     \
   {                                                                            \
-    LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6,     \
+    LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6,     \
           out7);                                                               \
-    LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
+    LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
           out13, out14, out15);                                                \
   }
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
 
 /* Description : Load 4x4 block of signed halfword elements from 1D source
                  data into 4 vectors (Each vector with 4 signed halfwords)
@@ -388,79 +308,35 @@
     out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
   }
 
-/* Description : Load 2 vectors of signed word elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-                 Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) \
-  {                                      \
-    out0 = LD_SW((psrc));                \
-    out1 = LD_SW((psrc) + stride);       \
-  }
-
-/* Description : Store vectors of 16 byte elements with stride
+/* Description : Store vectors with stride
    Arguments   : Inputs - in0, in1, pdst, stride
    Details     : Store 16 byte elements from 'in0' to (pdst)
                  Store 16 byte elements from 'in1' to (pdst + stride)
 */
-#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+#define ST_V2(RTYPE, in0, in1, pdst, stride) \
   {                                          \
-    ST_B(RTYPE, in0, (pdst));                \
-    ST_B(RTYPE, in1, (pdst) + stride);       \
+    ST_V(RTYPE, in0, (pdst));                \
+    ST_V(RTYPE, in1, (pdst) + stride);       \
   }
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
+#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
+#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
 
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
+#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
   {                                                      \
-    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
-    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+    ST_V2(RTYPE, in0, in1, (pdst), stride);              \
+    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
   }
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
+#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
 
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
   {                                                                        \
-    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                        \
-    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
-  }
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 8 halfword elements from 'in0' to (pdst)
-                 Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) \
-  {                                          \
-    ST_H(RTYPE, in0, (pdst));                \
-    ST_H(RTYPE, in1, (pdst) + stride);       \
-  }
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
-  {                                                      \
-    ST_H2(RTYPE, in0, in1, (pdst), stride);              \
-    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-  }
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
-  {                                                                        \
-    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                      \
-    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
-  }
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 4 word elements from 'in0' to (pdst)
-                 Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) \
-  {                                    \
-    ST_SW(in0, (pdst));                \
-    ST_SW(in1, (pdst) + stride);       \
+    ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride);                        \
+    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
   }
+#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
+#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
 
 /* Description : Store 2x4 byte block to destination memory from input vector
    Arguments   : Inputs - in, stidx, pdst, stride
@@ -605,17 +481,17 @@
                  with rounding is calculated and written to 'out0'
 */
 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)   \
-  {                                                       \
+  do {                                                    \
     out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
     out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
-  }
+  } while (0)
 #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
 
 #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                  out2, out3)                                                \
   {                                                                         \
-    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)                         \
-    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)                         \
+    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
+    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
   }
 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
 
@@ -681,6 +557,7 @@
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
 #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
 
 #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                 out3)                                                          \
@@ -897,16 +774,16 @@
    Details     : 4 signed word elements of 'in' vector are added together and
                  the resulting integer sum is returned
 */
-#define HADD_SW_S32(in)                            \
-  ({                                               \
-    v2i64 res0_m, res1_m;                          \
-    int32_t sum_m;                                 \
-                                                   \
-    res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
-    res1_m = __msa_splati_d(res0_m, 1);            \
-    res0_m = res0_m + res1_m;                      \
-    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
-    sum_m;                                         \
+#define HADD_SW_S32(in)                                               \
+  ({                                                                  \
+    v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m;                     \
+    int32_t hadd_sw_s32_sum_m;                                        \
+                                                                      \
+    hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);        \
+    hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1);       \
+    hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m;     \
+    hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \
+    hadd_sw_s32_sum_m;                                                \
   })
 
 /* Description : Horizontal addition of 4 unsigned word elements
@@ -916,16 +793,16 @@
    Details     : 4 unsigned word elements of 'in' vector are added together and
                  the resulting integer sum is returned
 */
-#define HADD_UW_U32(in)                               \
-  ({                                                  \
-    v2u64 res0_m, res1_m;                             \
-    uint32_t sum_m;                                   \
-                                                      \
-    res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);    \
-    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
-    res0_m += res1_m;                                 \
-    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
-    sum_m;                                            \
+#define HADD_UW_U32(in)                                                       \
+  ({                                                                          \
+    v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m;                             \
+    uint32_t hadd_uw_u32_sum_m;                                               \
+                                                                              \
+    hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in);                \
+    hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \
+    hadd_uw_u32_res0_m += hadd_uw_u32_res1_m;                                 \
+    hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0);         \
+    hadd_uw_u32_sum_m;                                                        \
   })
 
 /* Description : Horizontal addition of 8 unsigned halfword elements
@@ -935,14 +812,14 @@
    Details     : 8 unsigned halfword elements of 'in' vector are added
                  together and the resulting integer sum is returned
 */
-#define HADD_UH_U32(in)                           \
-  ({                                              \
-    v4u32 res_m;                                  \
-    uint32_t sum_m;                               \
-                                                  \
-    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
-    sum_m = HADD_UW_U32(res_m);                   \
-    sum_m;                                        \
+#define HADD_UH_U32(in)                                       \
+  ({                                                          \
+    v4u32 hadd_uh_u32_res_m;                                  \
+    uint32_t hadd_uh_u32_sum_m;                               \
+                                                              \
+    hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+    hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m);       \
+    hadd_uh_u32_sum_m;                                        \
   })
 
 /* Description : Horizontal addition of unsigned byte vector elements
@@ -1049,6 +926,7 @@
   }
 #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
 
 /* Description : Interleave even byte elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3
@@ -1307,6 +1185,7 @@
     out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
   }
 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
 
@@ -1559,6 +1438,12 @@
    Details     : Each element of vector 'in0' is right shifted by 'shift' and
                  the result is written in-place. 'shift' is a GP variable.
 */
+#define SRA_2V(in0, in1, shift) \
+  {                             \
+    in0 = in0 >> shift;         \
+    in1 = in1 >> shift;         \
+  }
+
 #define SRA_4V(in0, in1, in2, in3, shift) \
   {                                       \
     in0 = in0 >> shift;                   \
@@ -1578,15 +1463,15 @@
                  'shift' is a vector.
 */
 #define SRAR_W2(RTYPE, in0, in1, shift)                  \
-  {                                                      \
+  do {                                                   \
     in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
     in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
-  }
+  } while (0)
 
 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
   {                                               \
-    SRAR_W2(RTYPE, in0, in1, shift)               \
-    SRAR_W2(RTYPE, in2, in3, shift)               \
+    SRAR_W2(RTYPE, in0, in1, shift);              \
+    SRAR_W2(RTYPE, in2, in3, shift);              \
   }
 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
 
@@ -1714,6 +1599,25 @@
     out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
   }
 
+/* Description : Sign extend byte elements from input vector and return
+                 halfword results in pair of vectors
+   Arguments   : Input   - in           (byte vector)
+                 Outputs - out0, out1   (sign extended halfword vectors)
+                 Return Type - signed halfword
+   Details     : Sign bit of byte elements from input vector 'in' is
+                 extracted and interleaved right with same vector 'in0' to
+                 generate 8 signed halfword elements in 'out0'
+                 Then interleaved left with same vector 'in0' to
+                 generate 8 signed halfword elements in 'out1'
+*/
+#define UNPCK_SB_SH(in, out0, out1)       \
+  {                                       \
+    v16i8 tmp_m;                          \
+                                          \
+    tmp_m = __msa_clti_s_b((v16i8)in, 0); \
+    ILVRL_B2_SH(tmp_m, in, out0, out1);   \
+  }
+
 /* Description : Zero extend unsigned byte elements to halfword elements
    Arguments   : Input   - in          (unsigned byte vector)
                  Outputs - out0, out1  (unsigned  halfword vectors)
@@ -1872,8 +1776,6 @@
     out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                               \
     tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
-    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
-    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
     out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
     out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
@@ -2027,19 +1929,17 @@
 
 /* Description : Converts inputs to unsigned bytes, interleave, average & store
                  as 8x4 unsigned byte block
-   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
-                          pdst, stride
+   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
 */
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
-                                pdst, stride)                               \
-  {                                                                         \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-                                                                            \
-    tmp0_m = PCKEV_XORI128_UB(in0, in1);                                    \
-    tmp1_m = PCKEV_XORI128_UB(in2, in3);                                    \
-    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
-    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);            \
-    ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                 \
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
+  {                                                                           \
+    v16u8 tmp0_m, tmp1_m;                                                     \
+    uint8_t *pdst_m = (uint8_t *)(pdst);                                      \
+                                                                              \
+    tmp0_m = PCKEV_XORI128_UB(in0, in1);                                      \
+    tmp1_m = PCKEV_XORI128_UB(in2, in3);                                      \
+    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);                  \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                                 \
   }
 
 /* Description : Pack even byte elements and store byte vector in destination
@@ -2068,4 +1968,4 @@
                                                                 \
     tmp1_m;                                                     \
   })
-#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */
+#endif  // VPX_VPX_DSP_MIPS_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c
new file mode 100644
index 0000000000..7f5882bca3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c
@@ -0,0 +1,807 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define SAD_SRC_REF_ABS_SUB_64                                      \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x27(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x20(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x2f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x28(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x27(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x20(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x2f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x28(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x37(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x30(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x3f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x38(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x37(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x30(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x3f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x38(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_32                                      \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_16                                      \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"
+
+#define SAD_SRC_REF_ABS_SUB_8                                       \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
+
+#if _MIPS_SIM == _ABIO32
+#define SAD_SRC_REF_ABS_SUB_4                                       \
+  "ulw        %[tmp0],    0x00(%[src])                        \n\t" \
+  "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
+  "ulw        %[tmp0],    0x00(%[ref])                        \n\t" \
+  "mtc1       %[tmp0],    %[ftmp2]                            \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "mthc1      $0,         %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_REF_ABS_SUB_4                                       \
+  "gslwlc1    %[ftmp1],   0x03(%[src])                        \n\t" \
+  "gslwrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gslwlc1    %[ftmp2],   0x03(%[ref])                        \n\t" \
+  "gslwrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "mthc1      $0,         %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
+
+#define SAD_SRC_AVGREF_ABS_SUB_64                                   \
+  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x17(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x10(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x1f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x18(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x27(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x20(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x2f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x28(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x27(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x20(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x2f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x28(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x27(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x20(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x2f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x28(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x37(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x30(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x3f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x38(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x37(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x30(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x3f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x38(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x37(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x30(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x3f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x38(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_32                                   \
+  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x17(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x10(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x1f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x18(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x17(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x10(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x1f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x18(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x17(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x10(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x1f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x18(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_16                                   \
+  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp3],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp3],   0x00(%[ref])                        \n\t" \
+  "gsldlc1    %[ftmp4],   0x0f(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp4],   0x08(%[ref])                        \n\t" \
+  "pavgb      %[ftmp3],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pavgb      %[ftmp4],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" \
+  "pasubub    %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp2],   %[ftmp2]                            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "paddw      %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"
+
+#define SAD_SRC_AVGREF_ABS_SUB_8                                    \
+  "gsldlc1    %[ftmp1],   0x07(%[second_pred])                \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
+  "gsldlc1    %[ftmp2],   0x07(%[ref])                        \n\t" \
+  "gsldrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
+  "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
+
+#if _MIPS_SIM == _ABIO32
+#define SAD_SRC_AVGREF_ABS_SUB_4                                    \
+  "ulw        %[tmp0],    0x00(%[second_pred])                \n\t" \
+  "mtc1       %[tmp0],    %[ftmp1]                            \n\t" \
+  "ulw        %[tmp0],    0x00(%[ref])                        \n\t" \
+  "mtc1       %[tmp0],    %[ftmp2]                            \n\t" \
+  "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "mthc1      $0,         %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
+#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */
+#define SAD_SRC_AVGREF_ABS_SUB_4                                    \
+  "gslwlc1    %[ftmp1],   0x03(%[second_pred])                \n\t" \
+  "gslwrc1    %[ftmp1],   0x00(%[second_pred])                \n\t" \
+  "gslwlc1    %[ftmp2],   0x03(%[ref])                        \n\t" \
+  "gslwrc1    %[ftmp2],   0x00(%[ref])                        \n\t" \
+  "pavgb      %[ftmp2],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t" \
+  "pasubub    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "mthc1      $0,         %[ftmp1]                            \n\t" \
+  "biadd      %[ftmp1],   %[ftmp1]                            \n\t" \
+  "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
+#endif /* _MIPS_SIM == _ABIO32 */
+
+#define sadMxNx4D_mmi(m, n)                                                  \
+  void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride,         \
+                                 const uint8_t *const ref_array[],           \
+                                 int ref_stride, uint32_t *sad_array) {      \
+    int i;                                                                   \
+    for (i = 0; i < 4; ++i)                                                  \
+      sad_array[i] =                                                         \
+          vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \
+  }
+
+static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+  mips_reg l_counter = counter;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_REF_ABS_SUB_64
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_REF_ABS_SUB_64
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp5]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+      [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad64xN(H)                                                   \
+  unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride,   \
+                                   const uint8_t *ref, int ref_stride) { \
+    return vpx_sad64x(src, src_stride, ref, ref_stride, H);              \
+  }
+
+vpx_sad64xN(64);
+vpx_sad64xN(32);
+sadMxNx4D_mmi(64, 64);
+sadMxNx4D_mmi(64, 32);
+
+static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred,
+                                          int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+  mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_AVGREF_ABS_SUB_64
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_AVGREF_ABS_SUB_64
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x40)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp5]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+      [src]"+&r"(src), [ref]"+&r"(ref),
+      [second_pred]"+&r"(l_second_pred),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad_avg64xN(H)                                                   \
+  unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
+                                       const uint8_t *ref, int ref_stride,   \
+                                       const uint8_t *second_pred) {         \
+    return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \
+  }
+
+vpx_sad_avg64xN(64);
+vpx_sad_avg64xN(32);
+
+static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+  mips_reg l_counter = counter;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_REF_ABS_SUB_32
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_REF_ABS_SUB_32
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp5]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+      [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad32xN(H)                                                   \
+  unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride,   \
+                                   const uint8_t *ref, int ref_stride) { \
+    return vpx_sad32x(src, src_stride, ref, ref_stride, H);              \
+  }
+
+vpx_sad32xN(64);
+vpx_sad32xN(32);
+vpx_sad32xN(16);
+sadMxNx4D_mmi(32, 64);
+sadMxNx4D_mmi(32, 32);
+sadMxNx4D_mmi(32, 16);
+
+static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred,
+                                          int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+  mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_AVGREF_ABS_SUB_32
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_AVGREF_ABS_SUB_32
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x20)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp5]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+      [src]"+&r"(src), [ref]"+&r"(ref),
+      [second_pred]"+&r"(l_second_pred),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad_avg32xN(H)                                                   \
+  unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
+                                       const uint8_t *ref, int ref_stride,   \
+                                       const uint8_t *second_pred) {         \
+    return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \
+  }
+
+vpx_sad_avg32xN(64);
+vpx_sad_avg32xN(32);
+vpx_sad_avg32xN(16);
+
+static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+  mips_reg l_counter = counter;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_REF_ABS_SUB_16
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_REF_ABS_SUB_16
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp5]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+      [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad16xN(H)                                                   \
+  unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride,   \
+                                   const uint8_t *ref, int ref_stride) { \
+    return vpx_sad16x(src, src_stride, ref, ref_stride, H);              \
+  }
+
+vpx_sad16xN(32);
+vpx_sad16xN(16);
+vpx_sad16xN(8);
+sadMxNx4D_mmi(16, 32);
+sadMxNx4D_mmi(16, 16);
+sadMxNx4D_mmi(16, 8);
+
+static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride,
+                                          const uint8_t *ref, int ref_stride,
+                                          const uint8_t *second_pred,
+                                          int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5;
+  mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp5],   %[ftmp5],       %[ftmp5]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_AVGREF_ABS_SUB_16
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_AVGREF_ABS_SUB_16
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x10)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp5]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter),
+      [src]"+&r"(src), [ref]"+&r"(ref),
+      [second_pred]"+&r"(l_second_pred),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad_avg16xN(H)                                                   \
+  unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
+                                       const uint8_t *ref, int ref_stride,   \
+                                       const uint8_t *second_pred) {         \
+    return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \
+  }
+
+vpx_sad_avg16xN(32);
+vpx_sad_avg16xN(16);
+vpx_sad_avg16xN(8);
+
+static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3;
+  mips_reg l_counter = counter;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_REF_ABS_SUB_8
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_REF_ABS_SUB_8
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp3]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad8xN(H)                                                   \
+  unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride,   \
+                                  const uint8_t *ref, int ref_stride) { \
+    return vpx_sad8x(src, src_stride, ref, ref_stride, H);              \
+  }
+
+vpx_sad8xN(16);
+vpx_sad8xN(8);
+vpx_sad8xN(4);
+sadMxNx4D_mmi(8, 16);
+sadMxNx4D_mmi(8, 8);
+sadMxNx4D_mmi(8, 4);
+
+static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred,
+                                         int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3;
+  mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_AVGREF_ABS_SUB_8
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_AVGREF_ABS_SUB_8
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x08)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp3]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+      [second_pred]"+&r"(l_second_pred),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad_avg8xN(H)                                                   \
+  unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride,   \
+                                      const uint8_t *second_pred) {         \
+    return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \
+  }
+
+vpx_sad_avg8xN(16);
+vpx_sad_avg8xN(8);
+vpx_sad_avg8xN(4);
+
+static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride,
+                                     int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3;
+  mips_reg l_counter = counter;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_REF_ABS_SUB_4
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_REF_ABS_SUB_4
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp3]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad4xN(H)                                                   \
+  unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride,   \
+                                  const uint8_t *ref, int ref_stride) { \
+    return vpx_sad4x(src, src_stride, ref, ref_stride, H);              \
+  }
+
+vpx_sad4xN(8);
+vpx_sad4xN(4);
+sadMxNx4D_mmi(4, 8);
+sadMxNx4D_mmi(4, 4);
+
+static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride,
+                                         const uint8_t *ref, int ref_stride,
+                                         const uint8_t *second_pred,
+                                         int counter) {
+  unsigned int sad;
+  double ftmp1, ftmp2, ftmp3;
+  mips_reg l_counter = counter;
+  mips_reg l_second_pred = (mips_reg)second_pred;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"
+    "1:                                                         \n\t"
+    // Include two loop body, to reduce loop time.
+    SAD_SRC_AVGREF_ABS_SUB_4
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    SAD_SRC_AVGREF_ABS_SUB_4
+    MMI_ADDIU(%[second_pred], %[second_pred], 0x04)
+    MMI_ADDU(%[src],     %[src],         %[src_stride])
+    MMI_ADDU(%[ref],     %[ref],         %[ref_stride])
+    MMI_ADDIU(%[counter], %[counter], -0x02)
+    "bnez       %[counter], 1b                                  \n\t"
+    "mfc1       %[sad],     %[ftmp3]                            \n\t"
+    : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3),
+      [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref),
+      [second_pred]"+&r"(l_second_pred),
+      [sad]"=&r"(sad)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride)
+  );
+  /* clang-format on */
+
+  return sad;
+}
+
+#define vpx_sad_avg4xN(H)                                                   \
+  unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride,   \
+                                      const uint8_t *ref, int ref_stride,   \
+                                      const uint8_t *second_pred) {         \
+    return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \
+  }
+
+vpx_sad_avg4xN(8);
+vpx_sad_avg4xN(4);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c
index e295123acf..b0f8ff1fd9 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c
@@ -159,640 +159,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
   return sad;
 }
 
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref, ref0, ref1, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = height >> 1; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v4u32 sad;
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
-    ref0_4 = LD_UB(ref + 64);
-    ref += ref_stride;
-
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[0] = HADD_SW_S32((v4i32)sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[1] = HADD_SW_S32((v4i32)sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[2] = HADD_SW_S32((v4i32)sad);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref0, ref1, ref;
-  v16u8 diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1;
-  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
-    ref += ref_stride;
-
-    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  const uint8_t *src_dup, *ref_dup;
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v8u16 sad3_0 = { 0 };
-  v8u16 sad3_1 = { 0 };
-  v4u32 sad;
-
-  src_dup = src;
-  ref_dup = ref;
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
-    ref += ref_stride;
-
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[0] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[1] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[2] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad3_0, sad3_0);
-  sad += __msa_hadd_u_w(sad3_1, sad3_1);
-  sad_array[3] = HADD_SW_S32(sad);
-
-  sad0_0 = (v8u16)__msa_ldi_h(0);
-  sad0_1 = (v8u16)__msa_ldi_h(0);
-  sad1_0 = (v8u16)__msa_ldi_h(0);
-  sad1_1 = (v8u16)__msa_ldi_h(0);
-  sad2_0 = (v8u16)__msa_ldi_h(0);
-  sad2_1 = (v8u16)__msa_ldi_h(0);
-  sad3_0 = (v8u16)__msa_ldi_h(0);
-  sad3_1 = (v8u16)__msa_ldi_h(0);
-
-  for (ht_cnt = 64; ht_cnt--;) {
-    LD_UB4(src_dup, 16, src0, src1, src2, src3);
-    src_dup += src_stride;
-    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
-    ref_dup += ref_stride;
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
-    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = __msa_hadd_u_w(sad0_0, sad0_0);
-  sad += __msa_hadd_u_w(sad0_1, sad0_1);
-  sad_array[4] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad1_0, sad1_0);
-  sad += __msa_hadd_u_w(sad1_1, sad1_1);
-  sad_array[5] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad2_0, sad2_0);
-  sad += __msa_hadd_u_w(sad2_1, sad2_1);
-  sad_array[6] = HADD_SW_S32(sad);
-
-  sad = __msa_hadd_u_w(sad3_0, sad3_0);
-  sad += __msa_hadd_u_w(sad3_1, sad3_1);
-  sad_array[7] = HADD_SW_S32(sad);
-}
-
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
@@ -1297,108 +663,38 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
     return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
   }
 
-#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_32xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_64xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_32xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_64xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
 #define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
@@ -1444,91 +740,65 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
 
 // 64x64
 VPX_SAD_64xHEIGHT_MSA(64);
-VPX_SAD_64xHEIGHTx3_MSA(64);
-VPX_SAD_64xHEIGHTx8_MSA(64);
 VPX_SAD_64xHEIGHTx4D_MSA(64);
 VPX_AVGSAD_64xHEIGHT_MSA(64);
 
 // 64x32
 VPX_SAD_64xHEIGHT_MSA(32);
-VPX_SAD_64xHEIGHTx3_MSA(32);
-VPX_SAD_64xHEIGHTx8_MSA(32);
 VPX_SAD_64xHEIGHTx4D_MSA(32);
 VPX_AVGSAD_64xHEIGHT_MSA(32);
 
 // 32x64
 VPX_SAD_32xHEIGHT_MSA(64);
-VPX_SAD_32xHEIGHTx3_MSA(64);
-VPX_SAD_32xHEIGHTx8_MSA(64);
 VPX_SAD_32xHEIGHTx4D_MSA(64);
 VPX_AVGSAD_32xHEIGHT_MSA(64);
 
 // 32x32
 VPX_SAD_32xHEIGHT_MSA(32);
-VPX_SAD_32xHEIGHTx3_MSA(32);
-VPX_SAD_32xHEIGHTx8_MSA(32);
 VPX_SAD_32xHEIGHTx4D_MSA(32);
 VPX_AVGSAD_32xHEIGHT_MSA(32);
 
 // 32x16
 VPX_SAD_32xHEIGHT_MSA(16);
-VPX_SAD_32xHEIGHTx3_MSA(16);
-VPX_SAD_32xHEIGHTx8_MSA(16);
 VPX_SAD_32xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_32xHEIGHT_MSA(16);
 
 // 16x32
 VPX_SAD_16xHEIGHT_MSA(32);
-VPX_SAD_16xHEIGHTx3_MSA(32);
-VPX_SAD_16xHEIGHTx8_MSA(32);
 VPX_SAD_16xHEIGHTx4D_MSA(32);
 VPX_AVGSAD_16xHEIGHT_MSA(32);
 
 // 16x16
 VPX_SAD_16xHEIGHT_MSA(16);
-VPX_SAD_16xHEIGHTx3_MSA(16);
-VPX_SAD_16xHEIGHTx8_MSA(16);
 VPX_SAD_16xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_16xHEIGHT_MSA(16);
 
 // 16x8
 VPX_SAD_16xHEIGHT_MSA(8);
-VPX_SAD_16xHEIGHTx3_MSA(8);
-VPX_SAD_16xHEIGHTx8_MSA(8);
 VPX_SAD_16xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_16xHEIGHT_MSA(8);
 
 // 8x16
 VPX_SAD_8xHEIGHT_MSA(16);
-VPX_SAD_8xHEIGHTx3_MSA(16);
-VPX_SAD_8xHEIGHTx8_MSA(16);
 VPX_SAD_8xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_8xHEIGHT_MSA(16);
 
 // 8x8
 VPX_SAD_8xHEIGHT_MSA(8);
-VPX_SAD_8xHEIGHTx3_MSA(8);
-VPX_SAD_8xHEIGHTx8_MSA(8);
 VPX_SAD_8xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_8xHEIGHT_MSA(8);
 
 // 8x4
 VPX_SAD_8xHEIGHT_MSA(4);
-VPX_SAD_8xHEIGHTx3_MSA(4);
-VPX_SAD_8xHEIGHTx8_MSA(4);
 VPX_SAD_8xHEIGHTx4D_MSA(4);
 VPX_AVGSAD_8xHEIGHT_MSA(4);
 
 // 4x8
 VPX_SAD_4xHEIGHT_MSA(8);
-VPX_SAD_4xHEIGHTx3_MSA(8);
-VPX_SAD_4xHEIGHTx8_MSA(8);
 VPX_SAD_4xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_4xHEIGHT_MSA(8);
 
 // 4x4
 VPX_SAD_4xHEIGHT_MSA(4);
-VPX_SAD_4xHEIGHTx3_MSA(4);
-VPX_SAD_4xHEIGHTx8_MSA(4);
 VPX_SAD_4xHEIGHTx4D_MSA(4);
 VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
index 313e06f92d..572fcabfc0 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -27,13 +27,14 @@ static const uint8_t bilinear_filters_msa[8][2] = {
     HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
     DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                     \
-    sub += res_l0_m + res_l1_m;                                     \
+    (sub) += res_l0_m + res_l1_m;                                   \
   }
 
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
 
 #define VARIANCE_LARGE_WxH(sse, diff, shift) \
-  sse - (((int64_t)diff * diff) >> shift)
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
 
 static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
@@ -1619,16 +1620,16 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
 
 #define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
   uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(                           \
-      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
-      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
+      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
+      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
       uint32_t *sse) {                                                        \
     int32_t diff;                                                             \
     uint32_t var;                                                             \
-    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
-    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
+    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                               \
-    if (yoffset) {                                                            \
-      if (xoffset) {                                                          \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
         *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
             src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
       } else {                                                                \
@@ -1638,7 +1639,7 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
                                                                               \
       var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
     } else {                                                                  \
-      if (xoffset) {                                                          \
+      if (x_offset) {                                                         \
         *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
             src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                               \
@@ -1672,15 +1673,15 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
 
 #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
   uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
-      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
-      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
+      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
+      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
       uint32_t *sse, const uint8_t *sec_pred) {                               \
     int32_t diff;                                                             \
-    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
-    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
+    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                               \
-    if (yoffset) {                                                            \
-      if (xoffset) {                                                          \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
         *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
             src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
             v_filter, ht, &diff);                                             \
@@ -1690,7 +1691,7 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
             &diff);                                                           \
       }                                                                       \
     } else {                                                                  \
-      if (xoffset) {                                                          \
+      if (x_offset) {                                                         \
         *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
             src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
             &diff);                                                           \
@@ -1719,16 +1720,16 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
 
 uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                              int32_t src_stride,
-                                             int32_t xoffset, int32_t yoffset,
+                                             int32_t x_offset, int32_t y_offset,
                                              const uint8_t *ref_ptr,
                                              int32_t ref_stride, uint32_t *sse,
                                              const uint8_t *sec_pred) {
   int32_t diff;
-  const uint8_t *h_filter = bilinear_filters_msa[xoffset];
-  const uint8_t *v_filter = bilinear_filters_msa[yoffset];
+  const uint8_t *h_filter = bilinear_filters_msa[x_offset];
+  const uint8_t *v_filter = bilinear_filters_msa[y_offset];
 
-  if (yoffset) {
-    if (xoffset) {
+  if (y_offset) {
+    if (x_offset) {
       *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
           src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
           v_filter, 64, &diff);
@@ -1738,7 +1739,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                                   v_filter, 64, &diff);
     }
   } else {
-    if (xoffset) {
+    if (x_offset) {
       *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                   ref_stride, sec_pred,
                                                   h_filter, 64, &diff);
@@ -1753,15 +1754,15 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
 
 #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
   uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(                           \
-      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
-      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
+      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
+      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
       uint32_t *sse, const uint8_t *sec_pred) {                               \
     int32_t diff;                                                             \
-    const uint8_t *h_filter = bilinear_filters_msa[xoffset];                  \
-    const uint8_t *v_filter = bilinear_filters_msa[yoffset];                  \
+    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                               \
-    if (yoffset) {                                                            \
-      if (xoffset) {                                                          \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
         *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
             src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
             v_filter, ht, &diff);                                             \
@@ -1771,7 +1772,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
             &diff);                                                           \
       }                                                                       \
     } else {                                                                  \
-      if (xoffset) {                                                          \
+      if (x_offset) {                                                         \
         *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
             src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
             &diff);                                                           \
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c
new file mode 100644
index 0000000000..8bd7e6977c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c
@@ -0,0 +1,306 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff,
+                            ptrdiff_t diff_stride, const uint8_t *src,
+                            ptrdiff_t src_stride, const uint8_t *pred,
+                            ptrdiff_t pred_stride) {
+  double ftmp[13];
+  uint32_t tmp[1];
+
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        __asm__ volatile(
+            "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
+#if _MIPS_SIM == _ABIO32
+            "ulw        %[tmp0],    0x00(%[src])                        \n\t"
+            "mtc1       %[tmp0],    %[ftmp1]                            \n\t"
+            "ulw        %[tmp0],    0x00(%[pred])                       \n\t"
+            "mtc1       %[tmp0],    %[ftmp2]                            \n\t"
+#else
+            "gslwlc1    %[ftmp1],   0x03(%[src])                        \n\t"
+            "gslwrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gslwlc1    %[ftmp2],   0x03(%[pred])                       \n\t"
+            "gslwrc1    %[ftmp2],   0x00(%[pred])                       \n\t"
+#endif
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+            "ulw        %[tmp0],    0x00(%[src])                        \n\t"
+            "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+            "ulw        %[tmp0],    0x00(%[pred])                       \n\t"
+            "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+#else
+            "gslwlc1    %[ftmp3],   0x03(%[src])                        \n\t"
+            "gslwrc1    %[ftmp3],   0x00(%[src])                        \n\t"
+            "gslwlc1    %[ftmp4],   0x03(%[pred])                       \n\t"
+            "gslwrc1    %[ftmp4],   0x00(%[pred])                       \n\t"
+#endif
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+            "ulw        %[tmp0],    0x00(%[src])                        \n\t"
+            "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+            "ulw        %[tmp0],    0x00(%[pred])                       \n\t"
+            "mtc1       %[tmp0],    %[ftmp6]                            \n\t"
+#else
+            "gslwlc1    %[ftmp5],   0x03(%[src])                        \n\t"
+            "gslwrc1    %[ftmp5],   0x00(%[src])                        \n\t"
+            "gslwlc1    %[ftmp6],   0x03(%[pred])                       \n\t"
+            "gslwrc1    %[ftmp6],   0x00(%[pred])                       \n\t"
+#endif
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+
+#if _MIPS_SIM == _ABIO32
+            "ulw        %[tmp0],    0x00(%[src])                        \n\t"
+            "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+            "ulw        %[tmp0],    0x00(%[pred])                       \n\t"
+            "mtc1       %[tmp0],    %[ftmp8]                            \n\t"
+#else
+            "gslwlc1    %[ftmp7],   0x03(%[src])                        \n\t"
+            "gslwrc1    %[ftmp7],   0x00(%[src])                        \n\t"
+            "gslwlc1    %[ftmp8],   0x03(%[pred])                       \n\t"
+            "gslwrc1    %[ftmp8],   0x00(%[pred])                       \n\t"
+#endif
+            "punpcklbh  %[ftmp9],   %[ftmp1],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp10],  %[ftmp2],           %[ftmp0]        \n\t"
+            "psubh      %[ftmp11],  %[ftmp9],           %[ftmp10]       \n\t"
+            "gssdlc1    %[ftmp11],  0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp11],  0x00(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp3],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp10],  %[ftmp4],           %[ftmp0]        \n\t"
+            "psubh      %[ftmp11],  %[ftmp9],           %[ftmp10]       \n\t"
+            "gssdlc1    %[ftmp11],  0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp11],  0x00(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp5],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp10],  %[ftmp6],           %[ftmp0]        \n\t"
+            "psubh      %[ftmp11],  %[ftmp9],           %[ftmp10]       \n\t"
+            "gssdlc1    %[ftmp11],  0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp11],  0x00(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp10],  %[ftmp8],           %[ftmp0]        \n\t"
+            "psubh      %[ftmp11],  %[ftmp9],           %[ftmp10]       \n\t"
+            "gssdlc1    %[ftmp11],  0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp11],  0x00(%[diff])                       \n\t"
+            : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+              [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+              [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+              [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+              [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+              [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+#if _MIPS_SIM == _ABIO32
+              [tmp0] "=&r"(tmp[0]),
+#endif
+              [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff)
+            : [src_stride] "r"((mips_reg)src_stride),
+              [pred_stride] "r"((mips_reg)pred_stride),
+              [diff_stride] "r"((mips_reg)(diff_stride * 2))
+            : "memory");
+        break;
+      case 8:
+        __asm__ volatile(
+            "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
+            "li         %[tmp0],    0x02                                \n\t"
+            "1:                                                         \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp2],   0x07(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp2],   0x00(%[pred])                       \n\t"
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+            "gsldlc1    %[ftmp3],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp3],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp4],   0x07(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp4],   0x00(%[pred])                       \n\t"
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+            "gsldlc1    %[ftmp5],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp5],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp6],   0x07(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp6],   0x00(%[pred])                       \n\t"
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+            "gsldlc1    %[ftmp7],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp7],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp8],   0x07(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp8],   0x00(%[pred])                       \n\t"
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp1],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp1],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp2],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp2],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x00(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x0f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x08(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp3],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp3],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp4],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp4],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x00(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x0f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x08(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp5],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp5],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp6],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp6],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x00(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x0f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x08(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp8],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp8],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x00(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x0f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x08(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "addiu      %[tmp0],    %[tmp0],            -0x01           \n\t"
+            "bnez       %[tmp0],    1b                                  \n\t"
+            : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+              [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+              [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+              [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+              [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+              [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+              [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src),
+              [pred] "+&r"(pred), [diff] "+&r"(diff)
+            : [pred_stride] "r"((mips_reg)pred_stride),
+              [src_stride] "r"((mips_reg)src_stride),
+              [diff_stride] "r"((mips_reg)(diff_stride * 2))
+            : "memory");
+        break;
+      case 16:
+        __asm__ volatile(
+            "pxor       %[ftmp0],   %[ftmp0],           %[ftmp0]        \n\t"
+            "li         %[tmp0],    0x08                                \n\t"
+            "1:                                                         \n\t"
+            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp2],   0x07(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp2],   0x00(%[pred])                       \n\t"
+            "gsldlc1    %[ftmp3],   0x0f(%[src])                        \n\t"
+            "gsldrc1    %[ftmp3],   0x08(%[src])                        \n\t"
+            "gsldlc1    %[ftmp4],   0x0f(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp4],   0x08(%[pred])                       \n\t"
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+            "gsldlc1    %[ftmp5],   0x07(%[src])                        \n\t"
+            "gsldrc1    %[ftmp5],   0x00(%[src])                        \n\t"
+            "gsldlc1    %[ftmp6],   0x07(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp6],   0x00(%[pred])                       \n\t"
+            "gsldlc1    %[ftmp7],   0x0f(%[src])                        \n\t"
+            "gsldrc1    %[ftmp7],   0x08(%[src])                        \n\t"
+            "gsldlc1    %[ftmp8],   0x0f(%[pred])                       \n\t"
+            "gsldrc1    %[ftmp8],   0x08(%[pred])                       \n\t"
+            MMI_ADDU(%[src], %[src], %[src_stride])
+            MMI_ADDU(%[pred], %[pred], %[pred_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp1],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp1],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp2],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp2],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x00(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x0f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x08(%[diff])                       \n\t"
+            "punpcklbh  %[ftmp9],   %[ftmp3],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp3],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp4],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp4],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x17(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x10(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x1f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x18(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "punpcklbh  %[ftmp9],   %[ftmp5],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp5],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp6],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp6],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x07(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x00(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x0f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x08(%[diff])                       \n\t"
+            "punpcklbh  %[ftmp9],   %[ftmp7],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp10],  %[ftmp7],           %[ftmp0]        \n\t"
+            "punpcklbh  %[ftmp11],  %[ftmp8],           %[ftmp0]        \n\t"
+            "punpckhbh  %[ftmp12],  %[ftmp8],           %[ftmp0]        \n\t"
+            "psubsh     %[ftmp9],   %[ftmp9],           %[ftmp11]       \n\t"
+            "psubsh     %[ftmp10],  %[ftmp10],          %[ftmp12]       \n\t"
+            "gssdlc1    %[ftmp9],   0x17(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp9],   0x10(%[diff])                       \n\t"
+            "gssdlc1    %[ftmp10],  0x1f(%[diff])                       \n\t"
+            "gssdrc1    %[ftmp10],  0x18(%[diff])                       \n\t"
+            MMI_ADDU(%[diff], %[diff], %[diff_stride])
+            "addiu      %[tmp0],    %[tmp0],            -0x01           \n\t"
+            "bnez       %[tmp0],    1b                                  \n\t"
+            : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]),
+              [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]),
+              [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+              [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]),
+              [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]),
+              [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]),
+              [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src),
+              [pred] "+&r"(pred), [diff] "+&r"(diff)
+            : [pred_stride] "r"((mips_reg)pred_stride),
+              [src_stride] "r"((mips_reg)src_stride),
+              [diff_stride] "r"((mips_reg)(diff_stride * 2))
+            : "memory");
+        break;
+      case 32:
+        vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+                             pred, pred_stride);
+        break;
+      case 64:
+        vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+                             pred, pred_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride,
+                             pred, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred,
+                         pred_stride);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c
new file mode 100644
index 0000000000..d4563dc410
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "./macros_msa.h"
+
+uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
+                                    int size) {
+  int row, col;
+  uint64_t ss_res = 0;
+  v4i32 mul0, mul1;
+  v2i64 res0 = { 0 };
+
+  if (4 == size) {
+    uint64_t src0, src1, src2, src3;
+    v8i16 diff0 = { 0 };
+    v8i16 diff1 = { 0 };
+
+    LD4(src, src_stride, src0, src1, src2, src3);
+    INSERT_D2_SH(src0, src1, diff0);
+    INSERT_D2_SH(src2, src3, diff1);
+    DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1);
+    mul0 += mul1;
+    res0 = __msa_hadd_s_d(mul0, mul0);
+    res0 += __msa_splati_d(res0, 1);
+    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+  } else if (8 == size) {
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+    mul0 += mul1;
+    res0 = __msa_hadd_s_d(mul0, mul0);
+    res0 += __msa_splati_d(res0, 1);
+    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+  } else if (16 == size) {
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += 8 * src_stride;
+    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+    LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+    LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+    DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+    DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+    DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+    mul0 += mul1;
+    res0 += __msa_hadd_s_d(mul0, mul0);
+
+    res0 += __msa_splati_d(res0, 1);
+    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+  } else if (0 == (size % 16)) {
+    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+    for (row = 0; row < (size >> 4); row++) {
+      for (col = 0; col < size; col += 16) {
+        const int16_t *src_ptr = src + col;
+        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
+               src7);
+        DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
+               src6, src7);
+        src_ptr += 8 * src_stride;
+        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+        LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
+               src7);
+        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+        LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
+               src6, src7);
+        DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+        DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+        DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+        DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+        mul0 += mul1;
+        res0 += __msa_hadd_s_d(mul0, mul0);
+      }
+
+      src += 16 * src_stride;
+    }
+
+    res0 += __msa_splati_d(res0, 1);
+    ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+  } else {
+    int16_t val;
+
+    for (row = 0; row < size; row++) {
+      for (col = 0; col < size; col++) {
+        val = src[col];
+        ss_res += val * val;
+      }
+
+      src += src_stride;
+    }
+  }
+
+  return ss_res;
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h
index f077fa4814..f27504a207 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
-#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+#ifndef VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_
+#define VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 
@@ -98,4 +98,4 @@
     SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS);                  \
     PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3);                      \
   }
-#endif  // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_
+#endif  // VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c
new file mode 100644
index 0000000000..6428bc7006
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c
@@ -0,0 +1,1357 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/variance.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
+/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32,
+   vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. */
+#define VARIANCE_SSE_SUM_8_FOR_W64                                  \
+  /* sse */                                                         \
+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
+  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp6]            \n\t" \
+  "paddw      %[ftmp10],  %[ftmp10],      %[ftmp7]            \n\t" \
+                                                                    \
+  /* sum */                                                         \
+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \
+  "punpcklhw  %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "punpckhhw  %[ftmp2],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "punpcklhw  %[ftmp7],   %[ftmp5],       %[ftmp0]            \n\t" \
+  "punpckhhw  %[ftmp8],   %[ftmp5],       %[ftmp0]            \n\t" \
+  "psubw      %[ftmp3],   %[ftmp1],       %[ftmp7]            \n\t" \
+  "psubw      %[ftmp5],   %[ftmp2],       %[ftmp8]            \n\t" \
+  "punpcklhw  %[ftmp1],   %[ftmp4],       %[ftmp0]            \n\t" \
+  "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]            \n\t" \
+  "punpcklhw  %[ftmp7],   %[ftmp6],       %[ftmp0]            \n\t" \
+  "punpckhhw  %[ftmp8],   %[ftmp6],       %[ftmp0]            \n\t" \
+  "psubw      %[ftmp4],   %[ftmp1],       %[ftmp7]            \n\t" \
+  "psubw      %[ftmp6],   %[ftmp2],       %[ftmp8]            \n\t" \
+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp3]            \n\t" \
+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp4]            \n\t" \
+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp5]            \n\t" \
+  "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
+
+#define VARIANCE_SSE_SUM_4                                          \
+  /* sse */                                                         \
+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "pmaddhw    %[ftmp5],   %[ftmp4],       %[ftmp4]            \n\t" \
+  "paddw      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t" \
+                                                                    \
+  /* sum */                                                         \
+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t" \
+  "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t" \
+  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+
+#define VARIANCE_SSE_SUM_8                                          \
+  /* sse */                                                         \
+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
+  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t" \
+                                                                    \
+  /* sum */                                                         \
+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpcklbh  %[ftmp5],   %[ftmp2],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t" \
+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp3]            \n\t" \
+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t" \
+  "paddh      %[ftmp12],  %[ftmp12],      %[ftmp5]            \n\t" \
+  "paddh      %[ftmp12],  %[ftmp12],      %[ftmp6]            \n\t"
+
+#define VARIANCE_SSE_8                                              \
+  "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
+  "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t" \
+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
+  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"
+
+#define VARIANCE_SSE_16                                             \
+  VARIANCE_SSE_8                                                    \
+  "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t" \
+  "pasubub    %[ftmp3],   %[ftmp1],       %[ftmp2]            \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp5],   %[ftmp3],       %[ftmp0]            \n\t" \
+  "pmaddhw    %[ftmp6],   %[ftmp4],       %[ftmp4]            \n\t" \
+  "pmaddhw    %[ftmp7],   %[ftmp5],       %[ftmp5]            \n\t" \
+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp6]            \n\t" \
+  "paddw      %[ftmp8],   %[ftmp8],       %[ftmp7]            \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A                       \
+  /* calculate fdata3[0]~fdata3[3], store at ftmp2*/                \
+  "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x1]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B                       \
+  /* calculate fdata3[0]~fdata3[3], store at ftmp4*/                \
+  "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A                      \
+  /* calculate: temp2[0] ~ temp2[3] */                              \
+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t" \
+                                                                    \
+  /* store: temp2[0] ~ temp2[3] */                                  \
+  "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
+  "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" \
+  "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B                      \
+  /* calculate: temp2[0] ~ temp2[3] */                              \
+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
+                                                                    \
+  /* store: temp2[0] ~ temp2[3] */                                  \
+  "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
+  "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" \
+  "gssdrc1    %[ftmp4],   0x00(%[temp2_ptr])                  \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                       \
+  /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/      \
+  "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_x0]        \n\t" \
+  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_x0]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
+  "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x1]        \n\t" \
+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x1]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" \
+  "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t" \
+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \
+  "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                       \
+  /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/      \
+  "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp8],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp9],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x01(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \
+  "pmullh     %[ftmp8],   %[ftmp8],       %[filter_x0]        \n\t" \
+  "pmullh     %[ftmp9],   %[ftmp9],       %[filter_x0]        \n\t" \
+  "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \
+  "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x1]        \n\t" \
+  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x1]        \n\t" \
+  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t" \
+  "paddh      %[ftmp9],   %[ftmp9],       %[ftmp11]           \n\t" \
+  "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \
+  "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                      \
+  /* calculate: temp2[0] ~ temp2[3] */                              \
+  "pmullh     %[ftmp2],   %[ftmp2],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp8],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp2],   %[ftmp2],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp14]           \n\t" \
+                                                                    \
+  /* calculate: temp2[4] ~ temp2[7] */                              \
+  "pmullh     %[ftmp3],   %[ftmp3],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp3],   %[ftmp3],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp9],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp3],   %[ftmp3],       %[ftmp14]           \n\t" \
+                                                                    \
+  /* store: temp2[0] ~ temp2[7] */                                  \
+  "pand       %[ftmp2],   %[ftmp2],       %[mask]             \n\t" \
+  "pand       %[ftmp3],   %[ftmp3],       %[mask]             \n\t" \
+  "packushb   %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t" \
+  "gssdlc1    %[ftmp2],   0x07(%[temp2_ptr])                  \n\t" \
+  "gssdrc1    %[ftmp2],   0x00(%[temp2_ptr])                  \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                      \
+  /* calculate: temp2[0] ~ temp2[3] */                              \
+  "pmullh     %[ftmp8],   %[ftmp8],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp8],   %[ftmp8],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp2],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp8],   %[ftmp8],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp14]           \n\t" \
+                                                                    \
+  /* calculate: temp2[4] ~ temp2[7] */                              \
+  "pmullh     %[ftmp9],   %[ftmp9],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp9],   %[ftmp9],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp3],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp9],   %[ftmp9],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp9],   %[ftmp9],       %[ftmp14]           \n\t" \
+                                                                    \
+  /* store: temp2[0] ~ temp2[7] */                                  \
+  "pand       %[ftmp8],   %[ftmp8],       %[mask]             \n\t" \
+  "pand       %[ftmp9],   %[ftmp9],       %[mask]             \n\t" \
+  "packushb   %[ftmp8],   %[ftmp8],       %[ftmp9]            \n\t" \
+  "gssdlc1    %[ftmp8],   0x07(%[temp2_ptr])                  \n\t" \
+  "gssdrc1    %[ftmp8],   0x00(%[temp2_ptr])                  \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A                      \
+  /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/      \
+  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A                             \
+                                                                    \
+  /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/     \
+  "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x09(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t" \
+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_x0]        \n\t" \
+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_x0]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
+  "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp6],   %[ftmp6],       %[filter_x1]        \n\t" \
+  "pmullh     %[ftmp7],   %[ftmp7],       %[filter_x1]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp6]            \n\t" \
+  "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t" \
+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \
+  "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B                      \
+  /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/      \
+  VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B                             \
+                                                                    \
+  /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/   \
+  "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp10],  %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp11],  %[ftmp1],       %[ftmp0]            \n\t" \
+  "gsldlc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t" \
+  "gsldrc1    %[ftmp1],   0x09(%[src_ptr])                    \n\t" \
+  "punpcklbh  %[ftmp12],  %[ftmp1],       %[ftmp0]            \n\t" \
+  "punpckhbh  %[ftmp13],  %[ftmp1],       %[ftmp0]            \n\t" \
+  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_x0]        \n\t" \
+  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_x0]        \n\t" \
+  "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \
+  "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp12],  %[ftmp12],      %[filter_x1]        \n\t" \
+  "pmullh     %[ftmp13],  %[ftmp13],      %[filter_x1]        \n\t" \
+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp12]           \n\t" \
+  "paddh      %[ftmp11],  %[ftmp11],      %[ftmp13]           \n\t" \
+  "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \
+  "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A                     \
+  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A                            \
+                                                                    \
+  /* calculate: temp2[8] ~ temp2[11] */                             \
+  "pmullh     %[ftmp4],   %[ftmp4],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp10],      %[filter_y1]        \n\t" \
+  "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp14]           \n\t" \
+                                                                    \
+  /* calculate: temp2[12] ~ temp2[15] */                            \
+  "pmullh     %[ftmp5],   %[ftmp5],       %[filter_y0]        \n\t" \
+  "paddh      %[ftmp5],   %[ftmp5],       %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp11],       %[filter_y1]       \n\t" \
+  "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp14]           \n\t" \
+                                                                    \
+  /* store: temp2[8] ~ temp2[15] */                                 \
+  "pand       %[ftmp4],   %[ftmp4],       %[mask]             \n\t" \
+  "pand       %[ftmp5],   %[ftmp5],       %[mask]             \n\t" \
+  "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" \
+  "gssdlc1    %[ftmp4],   0x0f(%[temp2_ptr])                  \n\t" \
+  "gssdrc1    %[ftmp4],   0x08(%[temp2_ptr])                  \n\t"
+
+#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B                     \
+  VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B                            \
+                                                                    \
+  /* calculate: temp2[8] ~ temp2[11] */                             \
+  "pmullh     %[ftmp10],  %[ftmp10],      %[filter_y0]        \n\t" \
+  "paddh      %[ftmp10],  %[ftmp10],      %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp4],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp10],  %[ftmp10],      %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp10],  %[ftmp10],      %[ftmp14]           \n\t" \
+                                                                    \
+  /* calculate: temp2[12] ~ temp2[15] */                            \
+  "pmullh     %[ftmp11],  %[ftmp11],      %[filter_y0]        \n\t" \
+  "paddh      %[ftmp11],  %[ftmp11],      %[ff_ph_40]         \n\t" \
+  "pmullh     %[ftmp1],   %[ftmp5],       %[filter_y1]        \n\t" \
+  "paddh      %[ftmp11],  %[ftmp11],      %[ftmp1]            \n\t" \
+  "psrlh      %[ftmp11],  %[ftmp11],      %[ftmp14]           \n\t" \
+                                                                    \
+  /* store: temp2[8] ~ temp2[15] */                                 \
+  "pand       %[ftmp10],  %[ftmp10],      %[mask]             \n\t" \
+  "pand       %[ftmp11],  %[ftmp11],      %[mask]             \n\t" \
+  "packushb   %[ftmp10],  %[ftmp10],      %[ftmp11]           \n\t" \
+  "gssdlc1    %[ftmp10],  0x0f(%[temp2_ptr])                  \n\t" \
+  "gssdrc1    %[ftmp10],  0x08(%[temp2_ptr])                  \n\t"
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the first-pass of 2-D separable filter.
+//
+// Produces int16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
+static void var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
+    int pixel_step, unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      ref_ptr[j] = ROUND_POWER_OF_TWO(
+          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+          FILTER_BITS);
+
+      ++src_ptr;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    ref_ptr += output_width;
+  }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the second-pass of 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
+static void var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      ref_ptr[j] = ROUND_POWER_OF_TWO(
+          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+          FILTER_BITS);
+      ++src_ptr;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    ref_ptr += output_width;
+  }
+}
+
+static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       uint32_t *sse, int high) {
+  int sum;
+  double ftmp[12];
+  uint32_t tmp[3];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "1:                                                         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x17(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x17(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x10(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x1f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x18(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x1f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x18(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x27(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x20(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x27(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x20(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x2f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x28(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x2f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x28(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x37(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x30(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x37(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x30(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x3f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x38(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x3f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x38(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
+    "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
+    "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
+    "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
+    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
+    "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+      [tmp2]"=&r"(tmp[2]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr),
+      [sum]"=&r"(sum)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse - (((int64_t)sum * sum) / (64 * high));
+}
+
+#define VPX_VARIANCE64XN(n)                                                   \
+  uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    uint32_t *sse) {                          \
+    return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+VPX_VARIANCE64XN(64)
+VPX_VARIANCE64XN(32)
+
+uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               uint32_t *sse) {
+  int sum;
+  double ftmp[12];
+  uint32_t tmp[3];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    "li         %[tmp0],    0x40                                \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp9],   %[ftmp9],       %[ftmp9]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "1:                                                         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x17(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x17(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x10(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "gsldlc1    %[ftmp1],   0x1f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x18(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x1f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x18(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8_FOR_W64
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "mfc1       %[tmp1],    %[ftmp9]                            \n\t"
+    "mfhc1      %[tmp2],    %[ftmp9]                            \n\t"
+    "addu       %[sum],     %[tmp1],        %[tmp2]             \n\t"
+    "ssrld      %[ftmp1],   %[ftmp10],      %[ftmp11]           \n\t"
+    "paddw      %[ftmp1],   %[ftmp1],       %[ftmp10]           \n\t"
+    "swc1       %[ftmp1],   0x00(%[sse])                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+      [tmp2]"=&r"(tmp[2]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr),
+      [sum]"=&r"(sum)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [sse]"r"(sse)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse - (((int64_t)sum * sum) / 2048);
+}
+
+static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       uint32_t *sse, int high) {
+  int sum;
+  double ftmp[13];
+  uint32_t tmp[3];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "1:                                                         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+    "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+    "gsldlc1    %[ftmp1],   0x17(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x10(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x17(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x10(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+    "gsldlc1    %[ftmp1],   0x1f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x18(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x1f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x18(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
+
+    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
+
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse - (((int64_t)sum * sum) / (32 * high));
+}
+
+#define VPX_VARIANCE32XN(n)                                                   \
+  uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    uint32_t *sse) {                          \
+    return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+VPX_VARIANCE32XN(32)
+VPX_VARIANCE32XN(16)
+
+static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *ref_ptr, int ref_stride,
+                                       uint32_t *sse, int high) {
+  int sum;
+  double ftmp[13];
+  uint32_t tmp[3];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "1:                                                         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+    "gsldlc1    %[ftmp1],   0x0f(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x08(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x0f(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x08(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
+
+    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
+
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse - (((int64_t)sum * sum) / (16 * high));
+}
+
+#define VPX_VARIANCE16XN(n)                                                   \
+  uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    uint32_t *sse) {                          \
+    return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+VPX_VARIANCE16XN(32)
+VPX_VARIANCE16XN(16)
+VPX_VARIANCE16XN(8)
+
+static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      uint32_t *sse, int high) {
+  int sum;
+  double ftmp[13];
+  uint32_t tmp[3];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "pxor       %[ftmp10],  %[ftmp10],      %[ftmp10]           \n\t"
+    "pxor       %[ftmp12],  %[ftmp12],      %[ftmp12]           \n\t"
+    "1:                                                         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_8
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
+
+    "punpcklhw  %[ftmp3],   %[ftmp10],      %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp10],      %[ftmp0]            \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp12],      %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp12],      %[ftmp0]            \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
+
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [ftmp12]"=&f"(ftmp[12]),          [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse - (((int64_t)sum * sum) / (8 * high));
+}
+
+#define VPX_VARIANCE8XN(n)                                                   \
+  uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                                   const uint8_t *ref_ptr, int ref_stride,   \
+                                   uint32_t *sse) {                          \
+    return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+VPX_VARIANCE8XN(16)
+VPX_VARIANCE8XN(8)
+VPX_VARIANCE8XN(4)
+
+static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride,
+                                      uint32_t *sse, int high) {
+  int sum;
+  double ftmp[12];
+  uint32_t tmp[3];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp10]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp6],   %[ftmp6],       %[ftmp6]            \n\t"
+    "pxor       %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+    "1:                                                         \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src_ptr])                    \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src_ptr])                    \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[ref_ptr])                    \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[ref_ptr])                    \n\t"
+    VARIANCE_SSE_SUM_4
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "ssrld      %[ftmp9],   %[ftmp6],       %[ftmp10]           \n\t"
+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp6]            \n\t"
+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
+
+    "punpcklhw  %[ftmp3],   %[ftmp7],       %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp4],   %[ftmp7],       %[ftmp0]            \n\t"
+    "punpcklhw  %[ftmp5],   %[ftmp8],       %[ftmp0]            \n\t"
+    "punpckhhw  %[ftmp6],   %[ftmp8],       %[ftmp0]            \n\t"
+    "paddw      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
+    "psubw      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"
+    "ssrld      %[ftmp0],   %[ftmp3],       %[ftmp10]           \n\t"
+    "paddw      %[ftmp0],   %[ftmp0],       %[ftmp3]            \n\t"
+    "swc1       %[ftmp0],   0x00(%[sum])                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),
+      [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse - (((int64_t)sum * sum) / (4 * high));
+}
+
+#define VPX_VARIANCE4XN(n)                                                   \
+  uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                                   const uint8_t *ref_ptr, int ref_stride,   \
+                                   uint32_t *sse) {                          \
+    return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+VPX_VARIANCE4XN(8)
+VPX_VARIANCE4XN(4)
+
+static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  uint32_t *sse, uint64_t high) {
+  double ftmp[12];
+  uint32_t tmp[1];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+
+    "1:                                                         \n\t"
+    VARIANCE_SSE_16
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse;
+}
+
+#define vpx_mse16xN(n)                                                   \
+  uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                               const uint8_t *ref_ptr, int ref_stride,   \
+                               uint32_t *sse) {                          \
+    return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+vpx_mse16xN(16);
+vpx_mse16xN(8);
+
+static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *ref_ptr, int ref_stride,
+                                 uint32_t *sse, uint64_t high) {
+  double ftmp[12];
+  uint32_t tmp[1];
+
+  *sse = 0;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "li         %[tmp0],    0x20                                \n\t"
+    "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
+    MMI_L(%[tmp0], %[high], 0x00)
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    "pxor       %[ftmp8],   %[ftmp8],       %[ftmp8]            \n\t"
+
+    "1:                                                         \n\t"
+    VARIANCE_SSE_8
+
+    "addiu      %[tmp0],    %[tmp0],        -0x01               \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride])
+    "bnez       %[tmp0],    1b                                  \n\t"
+
+    "ssrld      %[ftmp9],   %[ftmp8],       %[ftmp11]           \n\t"
+    "paddw      %[ftmp9],   %[ftmp9],       %[ftmp8]            \n\t"
+    "swc1       %[ftmp9],   0x00(%[sse])                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+      [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+      [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+      [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+      [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+      [tmp0]"=&r"(tmp[0]),
+      [src_ptr]"+&r"(src_ptr),          [ref_ptr]"+&r"(ref_ptr)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [ref_stride]"r"((mips_reg)ref_stride),
+      [high]"r"(&high), [sse]"r"(sse)
+    : "memory"
+  );
+  /* clang-format on */
+
+  return *sse;
+}
+
+#define vpx_mse8xN(n)                                                   \
+  uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride,   \
+                              const uint8_t *ref_ptr, int ref_stride,   \
+                              uint32_t *sse) {                          \
+    return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \
+  }
+
+vpx_mse8xN(16);
+vpx_mse8xN(8);
+
+#define SUBPIX_VAR(W, H)                                                       \
+  uint32_t vpx_sub_pixel_variance##W##x##H##_mmi(                              \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
+    uint16_t fdata3[((H) + 1) * (W)];                                          \
+    uint8_t temp2[(H) * (W)];                                                  \
+                                                                               \
+    var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
+                                      W, bilinear_filters[x_offset]);          \
+    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,              \
+                                       bilinear_filters[y_offset]);            \
+                                                                               \
+    return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse);    \
+  }
+
+SUBPIX_VAR(64, 64)
+SUBPIX_VAR(64, 32)
+SUBPIX_VAR(32, 64)
+SUBPIX_VAR(32, 32)
+SUBPIX_VAR(32, 16)
+SUBPIX_VAR(16, 32)
+
+static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr,
+                                              int src_stride, int x_offset,
+                                              int y_offset, uint8_t *temp2,
+                                              int counter) {
+  uint8_t *temp2_ptr = temp2;
+  mips_reg l_counter = counter;
+  double ftmp[15];
+  double ff_ph_40, mask;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
+  mips_reg tmp[2];
+  uint64_t x0, x1, y0, y1, all;
+
+  const uint8_t *filter_x = bilinear_filters[x_offset];
+  const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x07)
+    MMI_MTC1(%[tmp0], %[ftmp14])
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
+    // fdata3: fdata3[0] ~ fdata3[15]
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+
+    // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15]
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+    // temp2: temp2[0] ~ temp2[15]
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+    // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15]
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+    // temp2+16*1: temp2[0] ~ temp2[15]
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+
+    "1:                                                         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A
+
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B
+    "addiu      %[counter], %[counter],     -0x01               \n\t"
+    "bnez       %[counter], 1b                                  \n\t"
+    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+      [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+      [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+      [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+      [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+      [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+#define SUBPIX_VAR16XN(H)                                                      \
+  uint32_t vpx_sub_pixel_variance16x##H##_mmi(                                 \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
+    uint8_t temp2[16 * (H)];                                                   \
+    var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \
+                               ((H) - 2) / 2);                                 \
+                                                                               \
+    return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse);      \
+  }
+
+SUBPIX_VAR16XN(16)
+SUBPIX_VAR16XN(8)
+
+static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr,
+                                             int src_stride, int x_offset,
+                                             int y_offset, uint8_t *temp2,
+                                             int counter) {
+  uint8_t *temp2_ptr = temp2;
+  mips_reg l_counter = counter;
+  double ftmp[15];
+  mips_reg tmp[2];
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
+  const uint8_t *filter_x = bilinear_filters[x_offset];
+  const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp14])
+    "punpcklbh  %[ftmp14],  %[ftmp14],      %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp14],    %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp14],    %[ftmp0]            \n\t"
+    "ssrld      %[ftmp14],  %[ftmp14],      %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp14],    %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x07)
+    MMI_MTC1(%[tmp0], %[ftmp14])
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
+
+    // fdata3: fdata3[0] ~ fdata3[7]
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+
+    // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7]
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+    // temp2: temp2[0] ~ temp2[7]
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+    // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7]
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+    // temp2+8*1: temp2[0] ~ temp2[7]
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+
+    "1:                                                         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A
+
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B
+    "addiu      %[counter], %[counter],     -0x01               \n\t"
+    "bnez       %[counter], 1b                                  \n\t"
+    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+      [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
+      [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
+      [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),
+      [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]),
+      [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr),
+      [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+#define SUBPIX_VAR8XN(H)                                                      \
+  uint32_t vpx_sub_pixel_variance8x##H##_mmi(                                 \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,     \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                \
+    uint8_t temp2[8 * (H)];                                                   \
+    var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \
+                              ((H) - 2) / 2);                                 \
+                                                                              \
+    return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse);       \
+  }
+
+SUBPIX_VAR8XN(16)
+SUBPIX_VAR8XN(8)
+SUBPIX_VAR8XN(4)
+
+static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr,
+                                             int src_stride, int x_offset,
+                                             int y_offset, uint8_t *temp2,
+                                             int counter) {
+  uint8_t *temp2_ptr = temp2;
+  mips_reg l_counter = counter;
+  double ftmp[7];
+  mips_reg tmp[2];
+  double ff_ph_40, mask;
+  uint64_t x0, x1, y0, y1, all;
+  double filter_x0, filter_x1, filter_y0, filter_y1;
+  const uint8_t *filter_x = bilinear_filters[x_offset];
+  const uint8_t *filter_y = bilinear_filters[y_offset];
+  x0 = (uint64_t)filter_x[0];
+  x1 = (uint64_t)filter_x[1];
+  y0 = (uint64_t)filter_y[0];
+  y1 = (uint64_t)filter_y[1];
+  all = x0 | x1 << 8 | y0 << 16 | y1 << 24;
+
+  /* clang-format off */
+  __asm__ volatile (
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_MTC1(%[all], %[ftmp6])
+    "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
+    "pshufh     %[filter_x0], %[ftmp6],     %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x10)
+    MMI_MTC1(%[tmp0], %[mask])
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_x1], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y0], %[ftmp6],     %[ftmp0]            \n\t"
+    "ssrld      %[ftmp6],   %[ftmp6],       %[mask]             \n\t"
+    "pshufh     %[filter_y1], %[ftmp6],     %[ftmp0]            \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
+    MMI_LI(%[tmp0], 0x07)
+    MMI_MTC1(%[tmp0], %[ftmp6])
+    MMI_LI(%[tmp0], 0x0040004000400040)
+    MMI_MTC1(%[tmp0], %[ff_ph_40])
+    MMI_LI(%[tmp0], 0x00ff00ff00ff00ff)
+    MMI_MTC1(%[tmp0], %[mask])
+    // fdata3: fdata3[0] ~ fdata3[3]
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+
+    // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3]
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+    // temp2: temp2[0] ~ temp2[7]
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+    // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3]
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+    // temp2+4*1: temp2[0] ~ temp2[7]
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+
+    "1:                                                         \n\t"
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A
+
+    MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride])
+    VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A
+    MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04)
+    VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B
+    "addiu      %[counter], %[counter],     -0x01               \n\t"
+    "bnez       %[counter], 1b                                  \n\t"
+    : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
+      [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
+      [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr),
+      [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter),
+      [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask),
+      [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1),
+      [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1)
+    : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+#define SUBPIX_VAR4XN(H)                                                      \
+  uint32_t vpx_sub_pixel_variance4x##H##_mmi(                                 \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,     \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                \
+    uint8_t temp2[4 * (H)];                                                   \
+    var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \
+                              ((H) - 2) / 2);                                 \
+                                                                              \
+    return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse);       \
+  }
+
+SUBPIX_VAR4XN(8)
+SUBPIX_VAR4XN(4)
+
+#define SUBPIX_AVG_VAR(W, H)                                                   \
+  uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi(                          \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    uint16_t fdata3[((H) + 1) * (W)];                                          \
+    uint8_t temp2[(H) * (W)];                                                  \
+    DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]);                            \
+                                                                               \
+    var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \
+                                      W, bilinear_filters[x_offset]);          \
+    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,              \
+                                       bilinear_filters[y_offset]);            \
+                                                                               \
+    vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W);                   \
+                                                                               \
+    return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse);    \
+  }
+
+SUBPIX_AVG_VAR(64, 64)
+SUBPIX_AVG_VAR(64, 32)
+SUBPIX_AVG_VAR(32, 64)
+SUBPIX_AVG_VAR(32, 32)
+SUBPIX_AVG_VAR(32, 16)
+SUBPIX_AVG_VAR(16, 32)
+SUBPIX_AVG_VAR(16, 16)
+SUBPIX_AVG_VAR(16, 8)
+SUBPIX_AVG_VAR(8, 16)
+SUBPIX_AVG_VAR(8, 8)
+SUBPIX_AVG_VAR(8, 4)
+SUBPIX_AVG_VAR(4, 8)
+SUBPIX_AVG_VAR(4, 4)
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c
index 49b2f99230..444b086a6e 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c
@@ -33,10 +33,11 @@
     sub += res_l0_m + res_l1_m;                                     \
   }
 
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
 
 #define VARIANCE_LARGE_WxH(sse, diff, shift) \
-  sse - (((int64_t)diff * diff) >> shift)
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
 
 static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
index ad2af28669..5b5a1cbc3a 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 dst0, dst1, dst2, dst3, res2, res3;
+  v16u8 dst0 = { 0 }, res;
   v16u8 mask0, mask1, mask2, mask3;
   v8i16 filt, res0, res1;
 
@@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
   XORI_B4_128_SB(src0, src1, src2, src3);
   HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                              filt0, filt1, filt2, filt3, res0, res1);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
   SRARI_H2_SH(res0, res1, FILTER_BITS);
   SAT_SH2_SH(res0, res1, 7);
-  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  XORI_B2_128_UB(res2, res3);
-  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+  res = PCKEV_XORI128_UB(res0, res1);
+  res = (v16u8)__msa_aver_u_b(res, dst0);
+  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
   v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst0 = { 0 }, dst1 = { 0 };
   v8i16 filt, vec0, vec1, vec2, vec3;
 
   mask0 = LD_UB(&mc_filt_mask_arr[16]);
@@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   XORI_B4_128_SB(src0, src1, src2, src3);
   src += (4 * src_stride);
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
   HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                              filt0, filt1, filt2, filt3, vec0, vec1);
   LD_SB4(src, src_stride, src0, src1, src2, src3);
@@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
               res3);
   ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
   XORI_B2_128_UB(res0, res2);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
-  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
+  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
   ST4x8_UB(res0, res2, dst, dst_stride);
 }
 
@@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t dst_stride, int8_t *filter,
                                              int32_t height) {
   int32_t loop_cnt;
+  int64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+  v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 };
   v8i16 filt, out0, out1, out2, out3;
 
   mask0 = LD_UB(&mc_filt_mask_arr[0]);
@@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src,
     HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                mask3, filt0, filt1, filt2, filt3, out0, out1,
                                out2, out3);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
     SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
                             dst_stride);
     dst += (4 * dst_stride);
   }
@@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
+  v16u8 filt0, dst0 = { 0 }, vec0, vec1, res;
   v8u16 vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
@@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src,
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
   LD_SB4(src, src_stride, src0, src1, src2, src3);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
   SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+  res = (v16u8)__msa_aver_u_b(res, dst0);
+  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 dst0 = { 0 }, dst1 = { 0 };
   v8u16 vec4, vec5, vec6, vec7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
@@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
   LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
@@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
   SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
   PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
               res3);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
-  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
+  AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
+  ST4x8_UB(res0, res2, dst, dst_stride);
 }
 
 static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src,
@@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  int64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
   v8u16 vec0, vec1, vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
@@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src,
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
+  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_D2_UB(tp0, tp1, dst0);
+  INSERT_D2_UB(tp2, tp3, dst1);
+  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
 }
 
 static void common_hz_2t_and_aver_dst_8x8mult_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter, int32_t height) {
+  int64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, dst0, dst1, dst2, dst3;
+  v16u8 filt0, dst0 = { 0 }, dst1 = { 0 };
   v8u16 vec0, vec1, vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
@@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_D2_UB(tp0, tp1, dst0);
+  INSERT_D2_UB(tp2, tp3, dst1);
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
+  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
   dst += (4 * dst_stride);
 
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                     dst_stride);
+  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_D2_UB(tp0, tp1, dst0);
+  INSERT_D2_UB(tp2, tp3, dst1);
+  PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
   dst += (4 * dst_stride);
 
   if (16 == height) {
@@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
     LD_SB4(src, src_stride, src0, src1, src2, src3);
-    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                       dst_stride);
+    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
     dst += (4 * dst_stride);
 
     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst,
-                       dst_stride);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride);
   }
 }
 
@@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,
 
 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                  int h) {
+  const int16_t *const filter_x = filter[x0_q4];
   int8_t cnt, filt_hor[8];
 
   assert(x_step_q4 == 16);
@@ -645,7 +658,7 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
     filt_hor[cnt] = filter_x[cnt];
   }
 
-  if (((const int32_t *)filter_x)[0] == 0) {
+  if (vpx_get_filter_taps(filter_x) == 2) {
     switch (w) {
       case 4:
         common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
@@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                           (int32_t)dst_stride, &filt_hor[3], h);
         break;
       default:
-        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   } else {
@@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                           (int32_t)dst_stride, filt_hor, h);
         break;
       default:
-        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
index 1cfa63201c..ba816192a1 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
   uint32_t loop_cnt;
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res;
   v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
   v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
@@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
     XORI_B4_128_SB(src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
     hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
@@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
     vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
     res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                                filt_vt2, filt_vt3);
-    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
 
     SRARI_H2_SH(res0, res1, FILTER_BITS);
     SAT_SH2_SH(res0, res1, 7);
-    PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
-    XORI_B2_128_UB(tmp0, tmp1);
-    AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
-    ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+    res = PCKEV_XORI128_UB(res0, res1);
+    res = (v16u8)__msa_aver_u_b(res, dst0);
+    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
     dst += (4 * dst_stride);
 
     hz_out5 = hz_out9;
@@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
   uint32_t loop_cnt;
+  uint64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
   v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
   v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
-  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
+  v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3;
   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
   v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
   v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
@@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
     XORI_B4_128_SB(src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
 
     hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
@@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
 
     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
+    CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst,
                             dst_stride);
     dst += (4 * dst_stride);
 
@@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
 static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter_horiz, int8_t *filter_vert) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, mask;
   v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 dst0, dst1, dst2, dst3, res0, res1;
+  v16u8 dst0 = { 0 }, out;
   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
@@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
   hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
   ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
 
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
   DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
   SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
-  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  out = __msa_aver_u_b(out, dst0);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter_horiz, int8_t *filter_vert) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
-  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+  v16u8 dst0 = { 0 }, dst1 = { 0 };
   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
   v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
   v8i16 filt;
@@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
              hz_out3, hz_out5, 8);
   hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
 
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
-             dst6);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
   ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
   ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
               tmp1, tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
-              res3);
-  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
+  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
+  ST4x8_UB(res0, res1, dst, dst_stride);
 }
 
 static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
@@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
 static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter_horiz, int8_t *filter_vert) {
+  uint64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3;
   v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
   v8i16 filt;
 
@@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
   src += (5 * src_stride);
 
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_D2_UB(tp0, tp1, dst0);
+  INSERT_D2_UB(tp2, tp3, dst1);
   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
   hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
   vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
@@ -357,16 +362,16 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
   tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                     dst_stride);
+  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
 }
 
 static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
   uint32_t loop_cnt;
+  uint64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
+  v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 };
   v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
   v8i16 filt;
 
@@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
     tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                       dst_stride);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
     dst += (4 * dst_stride);
   }
 }
@@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
 
 void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
   int8_t cnt, filt_hor[8], filt_ver[8];
 
   assert(x_step_q4 == 16);
@@ -531,8 +538,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
     filt_ver[cnt] = filter_y[cnt];
   }
 
-  if (((const int32_t *)filter_x)[0] == 0 &&
-      ((const int32_t *)filter_y)[0] == 0) {
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
     switch (w) {
       case 4:
         common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
@@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                                                &filt_hor[3], &filt_ver[3], h);
         break;
       default:
-        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
-  } else if (((const int32_t *)filter_x)[0] == 0 ||
-             ((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
   } else {
     switch (w) {
       case 4:
@@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                                                filt_ver, h);
         break;
       default:
-        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
index 146ce3b2f5..e6a790dfc6 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t dst_stride, int8_t *filter,
                                              int32_t height) {
   uint32_t loop_cnt;
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3, out;
+  v16u8 dst0 = { 0 }, out;
   v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
   v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
   v16i8 src10998, filt0, filt1, filt2, filt3;
@@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
     LD_SB4(src, src_stride, src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
     ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                src87_r, src98_r, src109_r);
     ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
@@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
     SRARI_H2_SH(out10, out32, FILTER_BITS);
     SAT_SH2_SH(out10, out32, 7);
     out = PCKEV_XORI128_UB(out10, out32);
-    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
-
-    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
     out = __msa_aver_u_b(out, dst0);
 
     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
@@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t dst_stride, int8_t *filter,
                                              int32_t height) {
   uint32_t loop_cnt;
+  uint64_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 dst0 = { 0 }, dst1 = { 0 };
   v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
   v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
   v8i16 filt, out0, out1, out2, out3;
@@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
     LD_SB4(src, src_stride, src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
     XORI_B4_128_SB(src7, src8, src9, src10);
     ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                src87_r, src98_r, src109_r);
@@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                filt1, filt2, filt3);
     SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
     SAT_SH4_SH(out0, out1, out2, out3, 7);
-    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
+    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst,
                             dst_stride);
     dst += (4 * dst_stride);
 
@@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  uint32_t tp0, tp1, tp2, tp3;
   v16i8 src0, src1, src2, src3, src4;
-  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+  v16u8 dst0 = { 0 }, out, filt0, src2110, src4332;
   v16i8 src10_r, src32_r, src21_r, src43_r;
   v8i16 filt;
   v8u16 tmp0, tmp1;
@@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
   src4 = LD_SB(src);
   src += src_stride;
 
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
-  dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
   ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
              src32_r, src43_r);
   ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
@@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
-  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  uint32_t tp0, tp1, tp2, tp3;
+  v16u8 dst0 = { 0 }, dst1 = { 0 };
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
   v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
   v16u8 src2110, src4332, src6554, src8776, filt0;
@@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
   src += (8 * src_stride);
   src8 = LD_SB(src);
 
-  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
-             dst3);
-  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
+  LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+  LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
   ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
              src32_r, src43_r);
   ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
@@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
   PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
   AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
-  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
+  ST4x8_UB(src2110, src4332, dst, dst_stride);
 }
 
 static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
@@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                               int32_t src_stride, uint8_t *dst,
                                               int32_t dst_stride,
                                               int8_t *filter) {
+  int64_t tp0, tp1, tp2, tp3;
   v16u8 src0, src1, src2, src3, src4;
-  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+  v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3;
   v8i16 filt;
 
@@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
   filt0 = (v16u8)__msa_splati_h(filt, 0);
 
   LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+  LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+  INSERT_D2_UB(tp0, tp1, dst0);
+  INSERT_D2_UB(tp2, tp3, dst1);
   ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
   ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
               tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
-                     dst_stride);
+  PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
 }
 
 static void common_vt_2t_and_aver_dst_8x8mult_msa(
     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
+  int64_t tp0, tp1, tp2, tp3;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
+  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3;
   v8i16 filt;
@@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(
   for (loop_cnt = (height >> 3); loop_cnt--;) {
     LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
     src += (8 * src_stride);
-    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
+    LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
+    INSERT_D2_UB(tp0, tp1, dst2);
+    INSERT_D2_UB(tp2, tp3, dst3);
 
     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
                vec3);
@@ -378,15 +387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                 tmp2, tmp3);
     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
-                       dst_stride);
+    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
     dst += (4 * dst_stride);
 
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                 tmp2, tmp3);
     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
-                       dst_stride);
+    PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src0 = src8;
@@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
 
 void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
+  const int16_t *const filter_y = filter[y0_q4];
   int8_t cnt, filt_ver[8];
 
   assert(y_step_q4 == 16);
@@ -617,7 +625,7 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
     filt_ver[cnt] = filter_y[cnt];
   }
 
-  if (((const int32_t *)filter_y)[0] == 0) {
+  if (vpx_get_filter_taps(filter_y) == 2) {
     switch (w) {
       case 4:
         common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
@@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                           (int32_t)dst_stride, &filt_ver[3], h);
         break;
       default:
-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   } else {
@@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                           (int32_t)dst_stride, filt_ver, h);
         break;
       default:
-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
index 9e8bf7b519..792c0f709c 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
+  const int16_t *const filter_x = filter[x0_q4];
   int8_t cnt, filt_hor[8];
 
   assert(x_step_q4 == 16);
@@ -633,7 +634,7 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
     filt_hor[cnt] = filter_x[cnt];
   }
 
-  if (((const int32_t *)filter_x)[0] == 0) {
+  if (vpx_get_filter_taps(filter_x) == 2) {
     switch (w) {
       case 4:
         common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
@@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                              &filt_hor[3], h);
         break;
       default:
-        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   } else {
@@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                              filt_hor, h);
         break;
       default:
-        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
new file mode 100644
index 0000000000..cb7bca5589
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c
@@ -0,0 +1,716 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx_ports/mem.h"
+
+#define GET_DATA_H_MMI                                     \
+  "pmaddhw    %[ftmp4],    %[ftmp4],   %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp5],    %[ftmp5],   %[filter2]    \n\t" \
+  "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
+  "punpckhwd  %[ftmp5],    %[ftmp4],   %[ftmp0]      \n\t" \
+  "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
+  "pmaddhw    %[ftmp6],    %[ftmp6],   %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp7],    %[ftmp7],   %[filter2]    \n\t" \
+  "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
+  "punpckhwd  %[ftmp7],    %[ftmp6],   %[ftmp0]      \n\t" \
+  "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
+  "punpcklwd  %[srcl],     %[ftmp4],   %[ftmp6]      \n\t" \
+  "pmaddhw    %[ftmp8],    %[ftmp8],   %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp9],    %[ftmp9],   %[filter2]    \n\t" \
+  "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
+  "punpckhwd  %[ftmp9],    %[ftmp8],   %[ftmp0]      \n\t" \
+  "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
+  "pmaddhw    %[ftmp10],   %[ftmp10],  %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp11],   %[ftmp11],  %[filter2]    \n\t" \
+  "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]      \n\t" \
+  "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "punpcklwd  %[srch],     %[ftmp8],   %[ftmp10]     \n\t"
+
+#define GET_DATA_V_MMI                                     \
+  "punpcklhw  %[srcl],     %[ftmp4],   %[ftmp5]      \n\t" \
+  "pmaddhw    %[srcl],     %[srcl],    %[filter10]   \n\t" \
+  "punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+  "punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+  "punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+  "punpckhhw  %[srch],     %[ftmp4],   %[ftmp5]      \n\t" \
+  "pmaddhw    %[srch],     %[srch],    %[filter10]   \n\t" \
+  "punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
+  "punpckhhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
+  "punpckhhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t"
+
+/* clang-format off */
+#define ROUND_POWER_OF_TWO_MMI                             \
+  /* Add para[0] */                                        \
+  "lw         %[tmp0],     0x00(%[para])             \n\t" \
+  MMI_MTC1(%[tmp0],     %[ftmp6])                          \
+  "punpcklwd  %[ftmp6],    %[ftmp6],    %[ftmp6]     \n\t" \
+  "paddw      %[srcl],     %[srcl],     %[ftmp6]     \n\t" \
+  "paddw      %[srch],     %[srch],     %[ftmp6]     \n\t" \
+  /* Arithmetic right shift para[1] bits */                \
+  "lw         %[tmp0],     0x04(%[para])             \n\t" \
+  MMI_MTC1(%[tmp0],     %[ftmp5])                          \
+  "psraw      %[srcl],     %[srcl],     %[ftmp5]     \n\t" \
+  "psraw      %[srch],     %[srch],     %[ftmp5]     \n\t"
+/* clang-format on */
+
+#define CLIP_PIXEL_MMI                                     \
+  /* Staturated operation */                               \
+  "packsswh   %[srcl],     %[srcl],     %[srch]      \n\t" \
+  "packushb   %[ftmp12],   %[srcl],     %[ftmp0]     \n\t"
+
+static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_x = filter[x0_q4];
+  double ftmp[14];
+  uint32_t tmp[2];
+  uint32_t para[5];
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= SUBPEL_TAPS / 2 - 1;
+  src_stride -= w;
+  dst_stride -= w;
+  (void)x_step_q4;
+
+  /* clang-format off */
+  __asm__ volatile(
+    "move       %[tmp1],    %[width]                   \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+    "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
+    "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
+    "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
+    "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per row */
+    "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
+    "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
+    "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
+    "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
+    "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
+    "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
+    "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
+    "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],    -0x04)
+    /* Get raw data */
+    GET_DATA_H_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],   1b                         \n\t"
+    "move       %[width],   %[tmp1]                    \n\t"
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],  1b                         \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
+      [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
+      [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
+      [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
+      [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
+      [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
+      [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
+      [src]"+&r"(src),          [width]"+&r"(w),
+      [dst]"+&r"(dst),          [height]"+&r"(h)
+    : [filter]"r"(filter_x),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int y0_q4,
+                              int y_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_y = filter[y0_q4];
+  double ftmp[16];
+  uint32_t tmp[1];
+  uint32_t para[2];
+  ptrdiff_t addr = src_stride;
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  src_stride -= w;
+  dst_stride -= w;
+  (void)y_step_q4;
+
+  __asm__ volatile(
+    "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
+    "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
+    "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
+    "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
+    "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per column */
+    "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
+    MMI_ADDU(%[tmp0],     %[src],     %[addr])
+    "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
+    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],   -0x04)
+    /* Get raw data */
+    GET_DATA_V_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],    1b                        \n\t"
+    MMI_SUBU(%[width],    %[addr],     %[src_stride])
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],   1b                        \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+      [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
+      [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
+      [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
+      [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
+      [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
+      [src]"+&r"(src),          [dst]"+&r"(dst),
+      [width]"+&r"(w),          [height]"+&r"(h),
+      [tmp0]"=&r"(tmp[0])
+    : [filter]"r"(filter_y),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride),
+      [addr]"r"((mips_reg)addr)
+    : "memory"
+  );
+}
+
+static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_x = filter[x0_q4];
+  double ftmp[14];
+  uint32_t tmp[2];
+  uint32_t para[2];
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= SUBPEL_TAPS / 2 - 1;
+  src_stride -= w;
+  dst_stride -= w;
+  (void)x_step_q4;
+
+  __asm__ volatile(
+    "move       %[tmp1],    %[width]                   \n\t"
+    "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+    "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
+    "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
+    "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
+    "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per row */
+    "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
+    "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
+    "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
+    "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
+    "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
+    "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
+    "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
+    "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],    -0x04)
+    /* Get raw data */
+    GET_DATA_H_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "punpcklbh  %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],   0x07(%[dst])               \n\t"
+    "gsldrc1    %[ftmp4],   0x00(%[dst])               \n\t"
+    "punpcklbh  %[ftmp4],   %[ftmp4],    %[ftmp0]      \n\t"
+    "paddh      %[ftmp12],  %[ftmp12],   %[ftmp4]      \n\t"
+    "li         %[tmp0],    0x10001                    \n\t"
+    MMI_MTC1(%[tmp0],     %[ftmp5])
+    "punpcklhw  %[ftmp5],   %[ftmp5],    %[ftmp5]      \n\t"
+    "paddh      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
+    "psrah      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
+    "packushb   %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
+    "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],   1b                         \n\t"
+    "move       %[width],   %[tmp1]                    \n\t"
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],  1b                         \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
+      [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
+      [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
+      [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
+      [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
+      [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
+      [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
+      [src]"+&r"(src),          [width]"+&r"(w),
+      [dst]"+&r"(dst),          [height]"+&r"(h)
+    : [filter]"r"(filter_x),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+}
+
+static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int y0_q4,
+                                  int y_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_y = filter[y0_q4];
+  double ftmp[16];
+  uint32_t tmp[1];
+  uint32_t para[2];
+  ptrdiff_t addr = src_stride;
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  src_stride -= w;
+  dst_stride -= w;
+  (void)y_step_q4;
+
+  __asm__ volatile(
+    "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
+    "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
+    "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
+    "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
+    "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per column */
+    "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
+    MMI_ADDU(%[tmp0],     %[src],     %[addr])
+    "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
+    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],   -0x04)
+    /* Get raw data */
+    GET_DATA_V_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "punpcklbh  %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],    0x07(%[dst])              \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[dst])              \n\t"
+    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+    "paddh      %[ftmp12],   %[ftmp12],  %[ftmp4]      \n\t"
+    "li         %[tmp0],     0x10001                   \n\t"
+    MMI_MTC1(%[tmp0],     %[ftmp5])
+    "punpcklhw  %[ftmp5],    %[ftmp5],   %[ftmp5]      \n\t"
+    "paddh      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
+    "psrah      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
+    "packushb   %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
+    "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],    1b                        \n\t"
+    MMI_SUBU(%[width],    %[addr],     %[src_stride])
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],   1b                        \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+      [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
+      [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
+      [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
+      [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
+      [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
+      [src]"+&r"(src),          [dst]"+&r"(dst),
+      [width]"+&r"(w),          [height]"+&r"(h),
+      [tmp0]"=&r"(tmp[0])
+    : [filter]"r"(filter_y),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride),
+      [addr]"r"((mips_reg)addr)
+    : "memory"
+  );
+}
+
+void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4, int x_step_q4,
+                          int y0_q4, int y_step_q4, int w, int h) {
+  int x, y;
+
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  if (w & 0x03) {
+    for (y = 0; y < h; ++y) {
+      for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    double ftmp[4];
+    uint32_t tmp[2];
+    src_stride -= w;
+    dst_stride -= w;
+
+    __asm__ volatile(
+      "move       %[tmp1],    %[width]                  \n\t"
+      "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
+      "li         %[tmp0],    0x10001                   \n\t"
+      MMI_MTC1(%[tmp0],    %[ftmp3])
+      "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]      \n\t"
+      "1:                                               \n\t"
+      "gsldlc1    %[ftmp1],   0x07(%[src])              \n\t"
+      "gsldrc1    %[ftmp1],   0x00(%[src])              \n\t"
+      "gsldlc1    %[ftmp2],   0x07(%[dst])              \n\t"
+      "gsldrc1    %[ftmp2],   0x00(%[dst])              \n\t"
+      "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
+      "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]      \n\t"
+      "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]      \n\t"
+      "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
+      "psrah      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
+      "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
+      "swc1       %[ftmp1],   0x00(%[dst])              \n\t"
+      MMI_ADDIU(%[width],  %[width],   -0x04)
+      MMI_ADDIU(%[dst],    %[dst],     0x04)
+      MMI_ADDIU(%[src],    %[src],     0x04)
+      "bnez       %[width],   1b                        \n\t"
+      "move       %[width],   %[tmp1]                   \n\t"
+      MMI_ADDU(%[dst],     %[dst],     %[dst_stride])
+      MMI_ADDU(%[src],     %[src],     %[src_stride])
+      MMI_ADDIU(%[height], %[height],  -0x01)
+      "bnez       %[height],  1b                        \n\t"
+      : [ftmp0]"=&f"(ftmp[0]),  [ftmp1]"=&f"(ftmp[1]),
+        [ftmp2]"=&f"(ftmp[2]),  [ftmp3]"=&f"(ftmp[3]),
+        [tmp0]"=&r"(tmp[0]),    [tmp1]"=&r"(tmp[1]),
+        [src]"+&r"(src),        [dst]"+&r"(dst),
+        [width]"+&r"(w),        [height]"+&r"(h)
+      : [src_stride]"r"((mips_reg)src_stride),
+        [dst_stride]"r"((mips_reg)dst_stride)
+      : "memory"
+    );
+  }
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters, int x0_q4,
+                           int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters, int y0_q4,
+                          int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *y_filters, int y0_q4,
+                              int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+          dst[y * dst_stride] +
+              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+          1);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *x_filters, int x0_q4,
+                               int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = ROUND_POWER_OF_TWO(
+          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
+                       int32_t y_step_q4, int32_t w, int32_t h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  uint8_t temp[64 * 135];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w & 0x03) {
+    convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
+                   64, filter, x0_q4, x_step_q4, w, intermediate_height);
+    convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                  filter, y0_q4, y_step_q4, w, h);
+  } else {
+    convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                       temp, 64, filter, x0_q4, x_step_q4, w,
+                       intermediate_height);
+    convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                      filter, y0_q4, y_step_q4, w, h);
+  }
+}
+
+void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                             int32_t w, int32_t h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  if (w & 0x03)
+    convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                   w, h);
+  else
+    convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
+                       x_step_q4, w, h);
+}
+
+void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  if (w & 0x03)
+    convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+                  h);
+  else
+    convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
+                      y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                 int w, int h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  if (w & 0x03)
+    convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
+                       x_step_q4, w, h);
+  else
+    convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                int w, int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  if (w & 0x03)
+    convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
+                      y_step_q4, w, h);
+  else
+    convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
+                          y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  // Fixed size intermediate buffer places limits on parameters.
+  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+
+  vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
+                    y_step_q4, w, h);
+  vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
index b16ec57886..c942167587 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
 }
 
 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int32_t x_step_q4, const int16_t *filter_y,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
                        int32_t y_step_q4, int32_t w, int32_t h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
   int8_t cnt, filt_hor[8], filt_ver[8];
 
   assert(x_step_q4 == 16);
@@ -556,8 +558,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
     filt_ver[cnt] = filter_y[cnt];
   }
 
-  if (((const int32_t *)filter_x)[0] == 0 &&
-      ((const int32_t *)filter_y)[0] == 0) {
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
     switch (w) {
       case 4:
         common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst,
@@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                                   &filt_ver[3], (int32_t)h);
         break;
       default:
-        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
-  } else if (((const int32_t *)filter_x)[0] == 0 ||
-             ((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                    filter_y, y_step_q4, w, h);
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h);
   } else {
     switch (w) {
       case 4:
@@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                                   (int32_t)h);
         break;
       default:
-        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
 }
+
+static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                uint8_t *dst, const int16_t *x_filter) {
+  uint64_t srcd0, srcd1, srcd2, srcd3;
+  uint32_t res;
+  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
+  v16i8 out0, out1;
+  v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 };
+  v16i8 shf2 = shf1 + 2;
+  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
+  v16i8 filt_shf1 = filt_shf0 + 2;
+  v16i8 filt_shf2 = filt_shf0 + 4;
+  v16i8 filt_shf3 = filt_shf0 + 6;
+  v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3;
+
+  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src0);
+  INSERT_D2_UB(srcd2, srcd3, src1);
+  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
+  XORI_B2_128_SB(out0, out1);
+  UNPCK_SB_SH(out0, src0_h, src1_h);
+  UNPCK_SB_SH(out1, src2_h, src3_h);
+
+  filt = LD_SH(x_filter);
+  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
+  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
+
+  src0_h *= filt0;
+  src0_h += src1_h * filt1;
+  src0_h += src2_h * filt2;
+  src0_h += src3_h * filt3;
+
+  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
+
+  src0_h = __msa_adds_s_h(src0_h, src1_h);
+  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+  src0_h = __msa_sat_s_h(src0_h, 7);
+  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+  res = __msa_copy_u_w((v4i32)dst0, 0);
+  SW(res, dst);
+}
+
+static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                uint8_t *dst, const int16_t *x_filter) {
+  uint64_t srcd0, srcd1, srcd2, srcd3;
+  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
+  v16i8 out0, out1, out2, out3;
+  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+  v16i8 shf2 = shf1 + 4;
+  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src0);
+  INSERT_D2_UB(srcd2, srcd3, src1);
+  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src2);
+  INSERT_D2_UB(srcd2, srcd3, src3);
+
+  filt = LD_SH(x_filter);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+  // transpose
+  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
+  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
+  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
+  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
+
+  XORI_B4_128_SB(out0, out1, out2, out3);
+  UNPCK_SB_SH(out0, src0_h, src1_h);
+  UNPCK_SB_SH(out1, src2_h, src3_h);
+  UNPCK_SB_SH(out2, src4_h, src5_h);
+  UNPCK_SB_SH(out3, src6_h, src7_h);
+
+  src0_h *= filt0;
+  src4_h *= filt4;
+  src0_h += src1_h * filt1;
+  src4_h += src5_h * filt5;
+  src0_h += src2_h * filt2;
+  src4_h += src6_h * filt6;
+  src0_h += src3_h * filt3;
+  src4_h += src7_h * filt7;
+
+  src0_h = __msa_adds_s_h(src0_h, src4_h);
+  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+  src0_h = __msa_sat_s_h(src0_h, 7);
+  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+  ST8x1_UB(dst0, dst);
+}
+
+static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch,
+                                 uint8_t *dst, const int16_t *x_filter) {
+  uint64_t srcd0, srcd1, srcd2, srcd3;
+  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+  v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 };
+  v16u8 tmp0, tmp1, tmp2, tmp3, dst0;
+  v16i8 out0, out1, out2, out3, out4, out5, out6, out7;
+  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+  v16i8 shf2 = shf1 + 4;
+  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+  v8i16 dst0_h, dst1_h, dst2_h, dst3_h;
+
+  LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src0);
+  INSERT_D2_UB(srcd2, srcd3, src1);
+  LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src2);
+  INSERT_D2_UB(srcd2, srcd3, src3);
+  LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src4);
+  INSERT_D2_UB(srcd2, srcd3, src5);
+  LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_UB(srcd0, srcd1, src6);
+  INSERT_D2_UB(srcd2, srcd3, src7);
+
+  filt = LD_SH(x_filter);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+  // transpose
+  VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1);
+  VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3);
+  ILVRL_W2_SB(tmp2, tmp0, out0, out1);
+  ILVRL_W2_SB(tmp3, tmp1, out2, out3);
+  XORI_B4_128_SB(out0, out1, out2, out3);
+
+  UNPCK_SB_SH(out0, src0_h, src1_h);
+  UNPCK_SB_SH(out1, src2_h, src3_h);
+  UNPCK_SB_SH(out2, src4_h, src5_h);
+  UNPCK_SB_SH(out3, src6_h, src7_h);
+
+  VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1);
+  VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3);
+  ILVRL_W2_SB(tmp2, tmp0, out4, out5);
+  ILVRL_W2_SB(tmp3, tmp1, out6, out7);
+  XORI_B4_128_SB(out4, out5, out6, out7);
+
+  dst0_h = src0_h * filt0;
+  dst1_h = src4_h * filt4;
+  dst0_h += src1_h * filt1;
+  dst1_h += src5_h * filt5;
+  dst0_h += src2_h * filt2;
+  dst1_h += src6_h * filt6;
+  dst0_h += src3_h * filt3;
+  dst1_h += src7_h * filt7;
+
+  UNPCK_SB_SH(out4, src0_h, src1_h);
+  UNPCK_SB_SH(out5, src2_h, src3_h);
+  UNPCK_SB_SH(out6, src4_h, src5_h);
+  UNPCK_SB_SH(out7, src6_h, src7_h);
+
+  dst2_h = src0_h * filt0;
+  dst3_h = src4_h * filt4;
+  dst2_h += src1_h * filt1;
+  dst3_h += src5_h * filt5;
+  dst2_h += src2_h * filt2;
+  dst3_h += src6_h * filt6;
+  dst2_h += src3_h * filt3;
+  dst3_h += src7_h * filt7;
+
+  ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h);
+  SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS);
+  SAT_SH2_SH(dst0_h, dst2_h, 7);
+  dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h);
+  ST_UB(dst0, dst);
+}
+
+static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst,
+                                ptrdiff_t dst_stride) {
+  v16u8 in0;
+  v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+  in0 = LD_UB(src);
+  out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0);
+  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst,
+                                ptrdiff_t dst_stride) {
+  v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
+  v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 };
+  v16i8 shf2 = shf1 + 4;
+
+  LD_UB4(src, 16, in0, in1, in2, in3);
+  VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1);
+  VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3);
+  ILVRL_W2_UB(tmp2, tmp0, out0, out1);
+  ILVRL_W2_UB(tmp3, tmp1, out2, out3);
+  ST8x4_UB(out0, out1, dst, dst_stride);
+  ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst,
+                                  ptrdiff_t dst_stride) {
+  v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12;
+  v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8;
+  v16u8 out9, out10, out11, out12, out13, out14, out15;
+
+  LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+  LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+
+  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                      in11, in12, in13, in14, in15, out0, out1, out2, out3,
+                      out4, out5, out6, out7);
+  ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride);
+  dst += 8 * dst_stride;
+
+  SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8);
+  SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8);
+  SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8);
+  SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8);
+
+  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                      in11, in12, in13, in14, in15, out8, out9, out10, out11,
+                      out12, out13, out14, out15);
+  ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters, int x0_q4,
+                                    int x_step_q4, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+  int y, z, i;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; y += 4) {
+    int x_q4 = x0_q4;
+    for (z = 0; z < 4; ++z) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+      if (x_q4 & SUBPEL_MASK) {
+        filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter);
+      } else {
+        for (i = 0; i < 4; ++i) {
+          temp[z * 4 + i] = src_x[i * src_stride + 3];
+        }
+      }
+
+      x_q4 += x_step_q4;
+    }
+
+    transpose4x4_to_dst(temp, dst, dst_stride);
+
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+  }
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *x_filters, int x0_q4,
+                                    int x_step_q4, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+  int y, z, i;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 8x8 areas. The intermediate height is not always
+  // a multiple of 8, so force it to be a multiple of 8 here.
+  y = h + (8 - (h & 0x7));
+
+  do {
+    int x_q4 = x0_q4;
+    for (z = 0; z < 8; ++z) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+      if (x_q4 & SUBPEL_MASK) {
+        filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter);
+      } else {
+        for (i = 0; i < 8; ++i) {
+          temp[z * 8 + i] = src_x[3 + i * src_stride];
+        }
+      }
+
+      x_q4 += x_step_q4;
+    }
+
+    transpose8x8_to_dst(temp, dst, dst_stride);
+
+    src += src_stride * 8;
+    dst += dst_stride * 8;
+  } while (y -= 8);
+}
+
+static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const InterpKernel *x_filters, int x0_q4,
+                                       int x_step_q4, int w, int h) {
+  DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]);
+  int x, y, z, i;
+
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  // This function processes 16x16 areas.  The intermediate height is not always
+  // a multiple of 16, so force it to be a multiple of 8 here.
+  y = h + (16 - (h & 0xF));
+
+  do {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; x += 16) {
+      for (z = 0; z < 16; ++z) {
+        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+
+        if (x_q4 & SUBPEL_MASK) {
+          filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter);
+        } else {
+          for (i = 0; i < 16; ++i) {
+            temp[z * 16 + i] = src_x[3 + i * src_stride];
+          }
+        }
+
+        x_q4 += x_step_q4;
+      }
+
+      transpose16x16_to_dst(temp, dst + x, dst_stride);
+    }
+
+    src += src_stride * 16;
+    dst += dst_stride * 16;
+  } while (y -= 16);
+}
+
+static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+                               uint8_t *dst, const int16_t *y_filter) {
+  uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7;
+  uint32_t res;
+  v16u8 src0 = { 0 }, src1 = { 0 }, dst0;
+  v16i8 out0, out1;
+  v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
+  v16i8 shf2 = shf1 + 8;
+  v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 };
+  v16i8 filt_shf1 = filt_shf0 + 2;
+  v16i8 filt_shf2 = filt_shf0 + 4;
+  v16i8 filt_shf3 = filt_shf0 + 6;
+  v8i16 filt, src0_h, src1_h, src2_h, src3_h;
+  v8i16 filt0, filt1, filt2, filt3;
+
+  LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3);
+  LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7);
+  INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0);
+  INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1);
+  VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1);
+  XORI_B2_128_SB(out0, out1);
+  UNPCK_SB_SH(out0, src0_h, src1_h);
+  UNPCK_SB_SH(out1, src2_h, src3_h);
+
+  filt = LD_SH(y_filter);
+  VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1);
+  VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3);
+
+  src0_h *= filt0;
+  src0_h += src1_h * filt1;
+  src0_h += src2_h * filt2;
+  src0_h += src3_h * filt3;
+
+  src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8);
+
+  src0_h = __msa_adds_s_h(src0_h, src1_h);
+  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+  src0_h = __msa_sat_s_h(src0_h, 7);
+  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+  res = __msa_copy_u_w((v4i32)dst0, 0);
+  SW(res, dst);
+}
+
+static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+                               uint8_t *dst, const int16_t *y_filter) {
+  uint64_t srcd0, srcd1, srcd2, srcd3;
+  v16u8 dst0;
+  v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+  LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_SB(srcd0, srcd1, src0);
+  INSERT_D2_SB(srcd2, srcd3, src1);
+  LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3);
+  INSERT_D2_SB(srcd0, srcd1, src2);
+  INSERT_D2_SB(srcd2, srcd3, src3);
+
+  filt = LD_SH(y_filter);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  UNPCK_SB_SH(src0, src0_h, src1_h);
+  UNPCK_SB_SH(src1, src2_h, src3_h);
+  UNPCK_SB_SH(src2, src4_h, src5_h);
+  UNPCK_SB_SH(src3, src6_h, src7_h);
+
+  src0_h *= filt0;
+  src4_h *= filt4;
+  src0_h += src1_h * filt1;
+  src4_h += src5_h * filt5;
+  src0_h += src2_h * filt2;
+  src4_h += src6_h * filt6;
+  src0_h += src3_h * filt3;
+  src4_h += src7_h * filt7;
+
+  src0_h = __msa_adds_s_h(src0_h, src4_h);
+  src0_h = __msa_srari_h(src0_h, FILTER_BITS);
+  src0_h = __msa_sat_s_h(src0_h, 7);
+  dst0 = PCKEV_XORI128_UB(src0_h, src0_h);
+  ST8x1_UB(dst0, dst);
+}
+
+static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch,
+                                    uint8_t *dst, const int16_t *y_filter,
+                                    int w) {
+  int x;
+  v16u8 dst0;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h;
+  v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h;
+  v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7;
+
+  filt = LD_SH(y_filter);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+  SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7);
+
+  for (x = 0; x < w; x += 16) {
+    LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7);
+    src_y += 16;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    XORI_B4_128_SB(src4, src5, src6, src7);
+    UNPCK_SB_SH(src0, src0_h, src1_h);
+    UNPCK_SB_SH(src1, src2_h, src3_h);
+    UNPCK_SB_SH(src2, src4_h, src5_h);
+    UNPCK_SB_SH(src3, src6_h, src7_h);
+    UNPCK_SB_SH(src4, src8_h, src9_h);
+    UNPCK_SB_SH(src5, src10_h, src11_h);
+    UNPCK_SB_SH(src6, src12_h, src13_h);
+    UNPCK_SB_SH(src7, src14_h, src15_h);
+
+    src0_h *= filt0;
+    src1_h *= filt0;
+    src8_h *= filt4;
+    src9_h *= filt4;
+    src0_h += src2_h * filt1;
+    src1_h += src3_h * filt1;
+    src8_h += src10_h * filt5;
+    src9_h += src11_h * filt5;
+    src0_h += src4_h * filt2;
+    src1_h += src5_h * filt2;
+    src8_h += src12_h * filt6;
+    src9_h += src13_h * filt6;
+    src0_h += src6_h * filt3;
+    src1_h += src7_h * filt3;
+    src8_h += src14_h * filt7;
+    src9_h += src15_h * filt7;
+
+    ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h);
+    SRARI_H2_SH(src0_h, src1_h, FILTER_BITS);
+    SAT_SH2_SH(src0_h, src1_h, 7);
+    dst0 = PCKEV_XORI128_UB(src0_h, src1_h);
+    ST_UB(dst0, dst);
+    dst += 16;
+  }
+}
+
+static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters, int y0_q4,
+                                   int y_step_q4, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (y = 0; y < h; ++y) {
+    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      uint32_t srcd = LW(src_y + 3 * src_stride);
+      SW(srcd, dst + y * dst_stride);
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
+
+static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *y_filters, int y0_q4,
+                                   int y_step_q4, int h) {
+  int y;
+  int y_q4 = y0_q4;
+
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (y = 0; y < h; ++y) {
+    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter);
+    } else {
+      uint64_t srcd = LD(src_y + 3 * src_stride);
+      SD(srcd, dst + y * dst_stride);
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
+
+static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *y_filters, int y0_q4,
+                                      int y_step_q4, int w, int h) {
+  int x, y;
+  int y_q4 = y0_q4;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (y = 0; y < h; ++y) {
+    const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+    if (y_q4 & SUBPEL_MASK) {
+      filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter,
+                              w);
+    } else {
+      for (x = 0; x < w; ++x) {
+        dst[x + y * dst_stride] = src_y[x + 3 * src_stride];
+      }
+    }
+
+    y_q4 += y_step_q4;
+  }
+}
+
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                       int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) {
+    vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+  } else {
+    if (w >= 16) {
+      scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                                 src_stride, temp, 64, filter, x0_q4, x_step_q4,
+                                 w, intermediate_height);
+    } else if (w == 8) {
+      scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
+                              intermediate_height);
+    } else {
+      scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                              src_stride, temp, 64, filter, x0_q4, x_step_q4,
+                              intermediate_height);
+    }
+
+    if (w >= 16) {
+      scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                                dst_stride, filter, y0_q4, y_step_q4, w, h);
+    } else if (w == 8) {
+      scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                             dst_stride, filter, y0_q4, y_step_q4, h);
+    } else {
+      scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+                             dst_stride, filter, y0_q4, y_step_q4, h);
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
index 410682271f..195228689e 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
+  const int16_t *const filter_y = filter[y0_q4];
   int8_t cnt, filt_ver[8];
 
   assert(y_step_q4 == 16);
@@ -640,7 +641,7 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
     filt_ver[cnt] = filter_y[cnt];
   }
 
-  if (((const int32_t *)filter_y)[0] == 0) {
+  if (vpx_get_filter_taps(filter_y) == 2) {
     switch (w) {
       case 4:
         common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
@@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                              &filt_ver[3], h);
         break;
       default:
-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   } else {
@@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                              filt_ver, h);
         break;
       default:
-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
         break;
     }
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
index 45399bad85..ce649935da 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
 static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst,
@@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int32_t filter_x_stride,
-                          const int16_t *filter_y, int32_t filter_y_stride,
+                          const InterpKernel *filter, int x0_q4,
+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                           int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   switch (w) {
     case 4: {
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
index c3d87a4ab8..c2ab33a2f4 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -9,6 +9,7 @@
  */
 
 #include <string.h>
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
@@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
 
 void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int32_t filter_x_stride,
-                           const int16_t *filter_y, int32_t filter_y_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                            int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   switch (w) {
     case 4: {
diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
index f75679521a..a0280c5434 100644
--- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
+++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
-#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
 
 #include "vpx_dsp/mips/macros_msa.h"
 #include "vpx_dsp/vpx_filter.h"
@@ -110,14 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3];
     ST_UB(tmp_m, (pdst));                                 \
   }
 
-#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \
-                           stride)                                           \
-  {                                                                          \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
-                                                                             \
-    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                         \
-    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
-    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);             \
-    ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                  \
+#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
+  {                                                                      \
+    v16u8 tmp0_m, tmp1_m;                                                \
+    uint8_t *pdst_m = (uint8_t *)(pdst);                                 \
+                                                                         \
+    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);                     \
+    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);             \
+    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                            \
   }
-#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
+#endif  // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/postproc.h b/media/libvpx/libvpx/vpx_dsp/postproc.h
index 43cb5c8e8d..37f993f814 100644
--- a/media/libvpx/libvpx/vpx_dsp/postproc.h
+++ b/media/libvpx/libvpx/vpx_dsp/postproc.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_POSTPROC_H_
-#define VPX_DSP_POSTPROC_H_
+#ifndef VPX_VPX_DSP_POSTPROC_H_
+#define VPX_VPX_DSP_POSTPROC_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -22,4 +22,4 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size);
 }
 #endif
 
-#endif  // VPX_DSP_POSTPROC_H_
+#endif  // VPX_VPX_DSP_POSTPROC_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h
new file mode 100644
index 0000000000..7ac873f9fc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+#define VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32x4_t u = vec_vsx_ld(c, s);
+  int32x4_t v = vec_vsx_ld(c, s + 4);
+  return vec_packs(u, v);
+#else
+  return vec_vsx_ld(c, s);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16x8_t one = vec_splat_s16(1);
+  const int32x4_t even = vec_mule(v, one);
+  const int32x4_t odd = vec_mulo(v, one);
+  const int32x4_t high = vec_mergeh(even, odd);
+  const int32x4_t low = vec_mergel(even, odd);
+  vec_vsx_st(high, c, s);
+  vec_vsx_st(low, c, s + 4);
+#else
+  vec_vsx_st(v, c, s);
+#endif
+}
+
+#endif  // VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c
new file mode 100644
index 0000000000..2129911696
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c
@@ -0,0 +1,374 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+extern const int16_t vpx_rv[];
+
+static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A,
+                                       0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B,
+                                       0x1C, 0x1D, 0x1E, 0x1F };
+
+static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                     0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
+                                     0x1C, 0x1D, 0x1E, 0x1F };
+
+static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v,
+                                      uint8x16_t filter) {
+  const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]);
+  const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]);
+  const uint8x16_t k3 = vec_avg(k1, k2);
+  const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1]));
+  const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3]));
+  const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter);
+  return vec_sel(v, vec_avg(k3, v), mask);
+}
+
+static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src,
+                            int stride) {
+  ctx[0] = vec_vsx_ld(col - 2 * stride, src);
+  ctx[1] = vec_vsx_ld(col - stride, src);
+  ctx[2] = vec_vsx_ld(col + stride, src);
+  ctx[3] = vec_vsx_ld(col + 2 * stride, src);
+}
+
+static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx,
+                            uint8x16_t v, uint8x16_t right_ctx) {
+  static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
+                                      0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+                                      0x1A, 0x1B, 0x1C, 0x1D };
+
+  static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
+                                      0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
+                                      0x1B, 0x1C, 0x1D, 0x1E };
+
+  static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+                                      0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C,
+                                      0x0D, 0x0E, 0x0F, 0x10 };
+
+  static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                      0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                      0x0E, 0x0F, 0x10, 0x11 };
+  ctx[0] = vec_perm(left_ctx, v, l2_perm);
+  ctx[1] = vec_perm(left_ctx, v, l1_perm);
+  ctx[2] = vec_perm(v, right_ctx, r1_perm);
+  ctx[3] = vec_perm(v, right_ctx, r2_perm);
+}
+void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr,
+                                              unsigned char *dst_ptr,
+                                              int src_pixels_per_line,
+                                              int dst_pixels_per_line, int cols,
+                                              unsigned char *f, int size) {
+  int row, col;
+  uint8x16_t ctx[4], out, v, left_ctx;
+
+  for (row = 0; row < size; row++) {
+    for (col = 0; col < cols - 8; col += 16) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      v = vec_vsx_ld(col, src_ptr);
+      vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+      vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+    }
+
+    if (col != cols) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      v = vec_vsx_ld(col, src_ptr);
+      vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+      out = apply_filter(ctx, v, filter);
+      vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
+    }
+
+    /* now post_proc_across */
+    left_ctx = vec_splats(dst_ptr[0]);
+    v = vec_vsx_ld(0, dst_ptr);
+    for (col = 0; col < cols - 8; col += 16) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      const uint8x16_t right_ctx = (col + 16 == cols)
+                                       ? vec_splats(dst_ptr[cols - 1])
+                                       : vec_vsx_ld(col, dst_ptr + 16);
+      horz_ctx(ctx, left_ctx, v, right_ctx);
+      vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+      left_ctx = v;
+      v = right_ctx;
+    }
+
+    if (col != cols) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]);
+      horz_ctx(ctx, left_ctx, v, right_ctx);
+      out = apply_filter(ctx, v, filter);
+      vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
+    }
+
+    src_ptr += src_pixels_per_line;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
+// C: s[c + 7]
+static INLINE int16x8_t next7l_s16(uint8x16_t c) {
+  static const uint8x16_t next7_perm = {
+    0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13,
+    0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17,
+  };
+  return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm);
+}
+
+// Slide across window and add.
+static INLINE int16x8_t slide_sum_s16(int16x8_t x) {
+  // x = A B C D E F G H
+  //
+  // 0 A B C D E F G
+  const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3))));
+  // 0 0 A B C D E F
+  const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))),
+                                 // 0 0 0 A B C D E
+                                 vec_slo(x, vec_splats((int8_t)(6 << 3))));
+  // 0 0 0 0 A B C D
+  const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))),
+                                 // 0 0 0 0 0 A B C
+                                 vec_slo(x, vec_splats((int8_t)(10 << 3))));
+  // 0 0 0 0 0 0 A B
+  const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))),
+                                 // 0 0 0 0 0 0 0 A
+                                 vec_slo(x, vec_splats((int8_t)(14 << 3))));
+  return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4));
+}
+
+// Slide across window and add.
+static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) {
+  //   0 A C E
+  // + 0 B D F
+  int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))),
+                              vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3))));
+  //   0 0 A C
+  // + 0 0 B D
+  int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))),
+                              vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3))));
+  //   0 0 0 A
+  // + 0 0 0 B
+  int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))),
+                              vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3))));
+  sumsq_1 = vec_add(sumsq_1, xsq_even);
+  sumsq_2 = vec_add(sumsq_2, sumsq_3);
+  return vec_add(sumsq_1, sumsq_2);
+}
+
+// C: (b + sum + val) >> 4
+static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) {
+  return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4));
+}
+
+// C: sumsq * 15 - sum * sum
+static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd,
+                                  int16x8_t sum, int32x4_t lim) {
+  static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05,
+                                         0x14, 0x15, 0x08, 0x09, 0x18, 0x19,
+                                         0x0C, 0x0D, 0x1C, 0x1D };
+  const int32x4_t sumsq_odd_scaled =
+      vec_mul(sumsq_odd, vec_splats((int32_t)15));
+  const int32x4_t sumsq_even_scaled =
+      vec_mul(sumsq_even, vec_splats((int32_t)15));
+  const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum));
+  const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum));
+
+  const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim);
+  const bool32x4_t mask_even = vec_cmplt(thres_even, lim);
+  return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge);
+}
+
+void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows,
+                                   int cols, int flimit) {
+  int row, col;
+  const int32x4_t lim = vec_splats(flimit);
+
+  // 8 columns are processed at a time.
+  assert(cols % 8 == 0);
+
+  for (row = 0; row < rows; row++) {
+    // The sum is signed and requires at most 13 bits.
+    // (8 bits + sign) * 15 (4 bits)
+    int16x8_t sum;
+    // The sum of squares requires at most 20 bits.
+    // (16 bits + sign) * 15 (4 bits)
+    int32x4_t sumsq_even, sumsq_odd;
+
+    // Fill left context with first col.
+    int16x8_t left_ctx = vec_splats((int16_t)src[0]);
+    int16_t s = src[0] * 9;
+    int32_t ssq = src[0] * src[0] * 9 + 16;
+
+    // Fill the next 6 columns of the sliding window with cols 2 to 7.
+    for (col = 1; col <= 6; ++col) {
+      s += src[col];
+      ssq += src[col] * src[col];
+    }
+    // Set this sum to every element in the window.
+    sum = vec_splats(s);
+    sumsq_even = vec_splats(ssq);
+    sumsq_odd = vec_splats(ssq);
+
+    for (col = 0; col < cols; col += 8) {
+      bool16x8_t mask;
+      int16x8_t filtered, masked;
+      uint8x16_t out;
+
+      const uint8x16_t val = vec_vsx_ld(0, src + col);
+      const int16x8_t val_high = unpack_to_s16_h(val);
+
+      // C: s[c + 7]
+      const int16x8_t right_ctx = (col + 8 == cols)
+                                      ? vec_splats((int16_t)src[col + 7])
+                                      : next7l_s16(val);
+
+      // C: x = s[c + 7] - s[c - 8];
+      const int16x8_t x = vec_sub(right_ctx, left_ctx);
+      const int32x4_t xsq_even =
+          vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx));
+      const int32x4_t xsq_odd =
+          vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx));
+
+      const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd);
+      // A C E G
+      // 0 B D F
+      // 0 A C E
+      // 0 0 B D
+      // 0 0 A C
+      // 0 0 0 B
+      // 0 0 0 A
+      sumsq_even = vec_add(sumsq_even, sumsq_tmp);
+      // B D F G
+      // A C E G
+      // 0 B D F
+      // 0 A C E
+      // 0 0 B D
+      // 0 0 A C
+      // 0 0 0 B
+      // 0 0 0 A
+      sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd));
+
+      sum = vec_add(sum, slide_sum_s16(x));
+
+      // C: (8 + sum + s[c]) >> 4
+      filtered = filter_s16(vec_splats((int16_t)8), sum, val_high);
+      // C: sumsq * 15 - sum * sum
+      mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
+      masked = vec_sel(val_high, filtered, mask);
+
+      out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge);
+      vec_vsx_st(out, 0, src + col);
+
+      // Update window sum and square sum
+      sum = vec_splat(sum, 7);
+      sumsq_even = vec_splat(sumsq_odd, 3);
+      sumsq_odd = vec_splat(sumsq_odd, 3);
+
+      // C: s[c - 8] (for next iteration)
+      left_ctx = val_high;
+    }
+    src += pitch;
+  }
+}
+
+void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols,
+                              int flimit) {
+  int col, row, i;
+  int16x8_t window[16];
+  const int32x4_t lim = vec_splats(flimit);
+
+  // 8 columns are processed at a time.
+  assert(cols % 8 == 0);
+  // If rows is less than 8 the bottom border extension fails.
+  assert(rows >= 8);
+
+  for (col = 0; col < cols; col += 8) {
+    // The sum is signed and requires at most 13 bits.
+    // (8 bits + sign) * 15 (4 bits)
+    int16x8_t r1, sum;
+    // The sum of squares requires at most 20 bits.
+    // (16 bits + sign) * 15 (4 bits)
+    int32x4_t sumsq_even, sumsq_odd;
+
+    r1 = unpack_to_s16_h(vec_vsx_ld(0, dst));
+    // Fill sliding window with first row.
+    for (i = 0; i <= 8; i++) {
+      window[i] = r1;
+    }
+    // First 9 rows of the sliding window are the same.
+    // sum = r1 * 9
+    sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16);
+
+    // sumsq = r1 * r1 * 9
+    sumsq_even = vec_mule(sum, r1);
+    sumsq_odd = vec_mulo(sum, r1);
+
+    // Fill the next 6 rows of the sliding window with rows 2 to 7.
+    for (i = 1; i <= 6; ++i) {
+      const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst));
+      window[i + 8] = next_row;
+      sum = vec_add(sum, next_row);
+      sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row));
+      sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row));
+    }
+
+    for (row = 0; row < rows; row++) {
+      int32x4_t d15_even, d15_odd, d0_even, d0_odd;
+      bool16x8_t mask;
+      int16x8_t filtered, masked;
+      uint8x16_t out;
+
+      const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127));
+
+      // Move the sliding window
+      if (row + 7 < rows) {
+        window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst));
+      } else {
+        window[15] = window[14];
+      }
+
+      // C: sum += s[7 * pitch] - s[-8 * pitch];
+      sum = vec_add(sum, vec_sub(window[15], window[0]));
+
+      // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 *
+      // pitch];
+      // Optimization Note: Caching a squared-window for odd and even is
+      // slower than just repeating the multiplies.
+      d15_odd = vec_mulo(window[15], window[15]);
+      d15_even = vec_mule(window[15], window[15]);
+      d0_odd = vec_mulo(window[0], window[0]);
+      d0_even = vec_mule(window[0], window[0]);
+      sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd));
+      sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even));
+
+      // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4
+      filtered = filter_s16(rv, sum, window[8]);
+
+      // C: sumsq * 15 - sum * sum
+      mask = mask_s16(sumsq_even, sumsq_odd, sum, lim);
+      masked = vec_sel(window[8], filtered, mask);
+
+      // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per
+      // iteration
+      out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch),
+                     load_merge);
+      vec_vsx_st(out, 0, dst + row * pitch);
+
+      // Optimization Note: Turns out that the following loop is faster than
+      // using pointers to manage the sliding window.
+      for (i = 1; i < 16; i++) {
+        window[i - 1] = window[i];
+      }
+    }
+    dst += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
new file mode 100644
index 0000000000..328b0e3130
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c
@@ -0,0 +1,553 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/txfm_common_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14.
+static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
+                                    int16x8_t *sub) {
+  // Since a + b can overflow 16 bits, the multiplication is distributed
+  // (a * c +/- b * c).
+  const int32x4_t ac_e = vec_mule(a, cospi16_v);
+  const int32x4_t ac_o = vec_mulo(a, cospi16_v);
+  const int32x4_t bc_e = vec_mule(b, cospi16_v);
+  const int32x4_t bc_o = vec_mulo(b, cospi16_v);
+
+  // Reuse the same multiplies for sum and difference.
+  const int32x4_t sum_e = vec_add(ac_e, bc_e);
+  const int32x4_t sum_o = vec_add(ac_o, bc_o);
+  const int32x4_t diff_e = vec_sub(ac_e, bc_e);
+  const int32x4_t diff_o = vec_sub(ac_o, bc_o);
+
+  // Add rounding offset
+  const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
+  const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
+  const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
+  const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
+
+  const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
+  const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
+  const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
+  const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
+
+  // There's no pack operation for even and odd, so we need to permute.
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
+}
+
+// Returns (a * c1 +/- b * c2 + (2 << 13)) >> 14
+static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b,
+                                    int16x8_t c2, int16x8_t *add,
+                                    int16x8_t *sub) {
+  const int32x4_t ac1_o = vec_mulo(a, c1);
+  const int32x4_t ac1_e = vec_mule(a, c1);
+  const int32x4_t ac2_o = vec_mulo(a, c2);
+  const int32x4_t ac2_e = vec_mule(a, c2);
+
+  const int32x4_t bc1_o = vec_mulo(b, c1);
+  const int32x4_t bc1_e = vec_mule(b, c1);
+  const int32x4_t bc2_o = vec_mulo(b, c2);
+  const int32x4_t bc2_e = vec_mule(b, c2);
+
+  const int32x4_t sum_o = vec_add(ac1_o, bc2_o);
+  const int32x4_t sum_e = vec_add(ac1_e, bc2_e);
+  const int32x4_t diff_o = vec_sub(ac2_o, bc1_o);
+  const int32x4_t diff_e = vec_sub(ac2_e, bc1_e);
+
+  // Add rounding offset
+  const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding);
+  const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding);
+  const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding);
+  const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding);
+
+  const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits);
+  const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits);
+  const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits);
+  const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
+
+  // There's no pack operation for even and odd, so we need to permute.
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+  *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
+}
+
+// While other architecture combine the load and the stage 1 operations, Power9
+// benchmarking show no benefit in such an approach.
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
+  // Tried out different combinations of load and shift instructions, this is
+  // the fastest one.
+  {
+    const int16x8_t l0 = vec_vsx_ld(0, a);
+    const int16x8_t l1 = vec_vsx_ld(0, a + stride);
+    const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride);
+    const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride);
+    const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride);
+    const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride);
+    const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride);
+    const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride);
+
+    const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride);
+    const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride);
+    const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride);
+    const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride);
+    const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride);
+    const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride);
+    const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride);
+    const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride);
+
+    b[0] = vec_sl(l0, vec_dct_scale_log2);
+    b[1] = vec_sl(l1, vec_dct_scale_log2);
+    b[2] = vec_sl(l2, vec_dct_scale_log2);
+    b[3] = vec_sl(l3, vec_dct_scale_log2);
+    b[4] = vec_sl(l4, vec_dct_scale_log2);
+    b[5] = vec_sl(l5, vec_dct_scale_log2);
+    b[6] = vec_sl(l6, vec_dct_scale_log2);
+    b[7] = vec_sl(l7, vec_dct_scale_log2);
+
+    b[8] = vec_sl(l8, vec_dct_scale_log2);
+    b[9] = vec_sl(l9, vec_dct_scale_log2);
+    b[10] = vec_sl(l10, vec_dct_scale_log2);
+    b[11] = vec_sl(l11, vec_dct_scale_log2);
+    b[12] = vec_sl(l12, vec_dct_scale_log2);
+    b[13] = vec_sl(l13, vec_dct_scale_log2);
+    b[14] = vec_sl(l14, vec_dct_scale_log2);
+    b[15] = vec_sl(l15, vec_dct_scale_log2);
+  }
+  {
+    const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride);
+    const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride);
+    const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride);
+    const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride);
+    const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride);
+    const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride);
+    const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride);
+    const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride);
+
+    const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride);
+    const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride);
+    const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride);
+    const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride);
+    const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride);
+    const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride);
+    const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride);
+    const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride);
+
+    b[16] = vec_sl(l16, vec_dct_scale_log2);
+    b[17] = vec_sl(l17, vec_dct_scale_log2);
+    b[18] = vec_sl(l18, vec_dct_scale_log2);
+    b[19] = vec_sl(l19, vec_dct_scale_log2);
+    b[20] = vec_sl(l20, vec_dct_scale_log2);
+    b[21] = vec_sl(l21, vec_dct_scale_log2);
+    b[22] = vec_sl(l22, vec_dct_scale_log2);
+    b[23] = vec_sl(l23, vec_dct_scale_log2);
+
+    b[24] = vec_sl(l24, vec_dct_scale_log2);
+    b[25] = vec_sl(l25, vec_dct_scale_log2);
+    b[26] = vec_sl(l26, vec_dct_scale_log2);
+    b[27] = vec_sl(l27, vec_dct_scale_log2);
+    b[28] = vec_sl(l28, vec_dct_scale_log2);
+    b[29] = vec_sl(l29, vec_dct_scale_log2);
+    b[30] = vec_sl(l30, vec_dct_scale_log2);
+    b[31] = vec_sl(l31, vec_dct_scale_log2);
+  }
+}
+
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+  vec_vsx_st(b[0], 0, a);
+  vec_vsx_st(b[8], 0, a + 8);
+  vec_vsx_st(b[16], 0, a + 16);
+  vec_vsx_st(b[24], 0, a + 24);
+
+  vec_vsx_st(b[1], 0, a + 32);
+  vec_vsx_st(b[9], 0, a + 40);
+  vec_vsx_st(b[17], 0, a + 48);
+  vec_vsx_st(b[25], 0, a + 56);
+
+  vec_vsx_st(b[2], 0, a + 64);
+  vec_vsx_st(b[10], 0, a + 72);
+  vec_vsx_st(b[18], 0, a + 80);
+  vec_vsx_st(b[26], 0, a + 88);
+
+  vec_vsx_st(b[3], 0, a + 96);
+  vec_vsx_st(b[11], 0, a + 104);
+  vec_vsx_st(b[19], 0, a + 112);
+  vec_vsx_st(b[27], 0, a + 120);
+
+  vec_vsx_st(b[4], 0, a + 128);
+  vec_vsx_st(b[12], 0, a + 136);
+  vec_vsx_st(b[20], 0, a + 144);
+  vec_vsx_st(b[28], 0, a + 152);
+
+  vec_vsx_st(b[5], 0, a + 160);
+  vec_vsx_st(b[13], 0, a + 168);
+  vec_vsx_st(b[21], 0, a + 176);
+  vec_vsx_st(b[29], 0, a + 184);
+
+  vec_vsx_st(b[6], 0, a + 192);
+  vec_vsx_st(b[14], 0, a + 200);
+  vec_vsx_st(b[22], 0, a + 208);
+  vec_vsx_st(b[30], 0, a + 216);
+
+  vec_vsx_st(b[7], 0, a + 224);
+  vec_vsx_st(b[15], 0, a + 232);
+  vec_vsx_st(b[23], 0, a + 240);
+  vec_vsx_st(b[31], 0, a + 248);
+}
+
+// Returns 1 if negative 0 if positive
+static INLINE int16x8_t vec_sign_s16(int16x8_t a) {
+  return vec_sr(a, vec_shift_sign_s16);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
+  const int16x8_t sign = vec_sign_s16(a);
+  return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+  const int16x8_t sign = vec_sign_s16(a);
+  return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2);
+}
+
+static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) {
+  int16x8_t temp0[32];  // Hold stages: 1, 4, 7
+  int16x8_t temp1[32];  // Hold stages: 2, 5
+  int16x8_t temp2[32];  // Hold stages: 3, 6
+  int i;
+
+  // Stage 1
+  // Unrolling this loops actually slows down Power9 benchmarks
+  for (i = 0; i < 16; i++) {
+    temp0[i] = vec_add(in[i], in[31 - i]);
+    // pass through to stage 3.
+    temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]);
+  }
+
+  // Stage 2
+  // Unrolling this loops actually slows down Power9 benchmarks
+  for (i = 0; i < 8; i++) {
+    temp1[i] = vec_add(temp0[i], temp0[15 - i]);
+    temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]);
+  }
+
+  // Apply butterflies (in place) on pass through to stage 3.
+  single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]);
+  single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]);
+  single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]);
+  single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]);
+
+  // dump the magnitude by 4, hence the intermediate values are within
+  // the range of 16 bits.
+  if (pass) {
+    temp1[0] = add_round_shift_s16(temp1[0]);
+    temp1[1] = add_round_shift_s16(temp1[1]);
+    temp1[2] = add_round_shift_s16(temp1[2]);
+    temp1[3] = add_round_shift_s16(temp1[3]);
+    temp1[4] = add_round_shift_s16(temp1[4]);
+    temp1[5] = add_round_shift_s16(temp1[5]);
+    temp1[6] = add_round_shift_s16(temp1[6]);
+    temp1[7] = add_round_shift_s16(temp1[7]);
+    temp1[8] = add_round_shift_s16(temp1[8]);
+    temp1[9] = add_round_shift_s16(temp1[9]);
+    temp1[10] = add_round_shift_s16(temp1[10]);
+    temp1[11] = add_round_shift_s16(temp1[11]);
+    temp1[12] = add_round_shift_s16(temp1[12]);
+    temp1[13] = add_round_shift_s16(temp1[13]);
+    temp1[14] = add_round_shift_s16(temp1[14]);
+    temp1[15] = add_round_shift_s16(temp1[15]);
+
+    temp1[16] = add_round_shift_s16(temp1[16]);
+    temp1[17] = add_round_shift_s16(temp1[17]);
+    temp1[18] = add_round_shift_s16(temp1[18]);
+    temp1[19] = add_round_shift_s16(temp1[19]);
+    temp1[20] = add_round_shift_s16(temp1[20]);
+    temp1[21] = add_round_shift_s16(temp1[21]);
+    temp1[22] = add_round_shift_s16(temp1[22]);
+    temp1[23] = add_round_shift_s16(temp1[23]);
+    temp1[24] = add_round_shift_s16(temp1[24]);
+    temp1[25] = add_round_shift_s16(temp1[25]);
+    temp1[26] = add_round_shift_s16(temp1[26]);
+    temp1[27] = add_round_shift_s16(temp1[27]);
+    temp1[28] = add_round_shift_s16(temp1[28]);
+    temp1[29] = add_round_shift_s16(temp1[29]);
+    temp1[30] = add_round_shift_s16(temp1[30]);
+    temp1[31] = add_round_shift_s16(temp1[31]);
+  }
+
+  // Stage 3
+  temp2[0] = vec_add(temp1[0], temp1[7]);
+  temp2[1] = vec_add(temp1[1], temp1[6]);
+  temp2[2] = vec_add(temp1[2], temp1[5]);
+  temp2[3] = vec_add(temp1[3], temp1[4]);
+  temp2[5] = vec_sub(temp1[2], temp1[5]);
+  temp2[6] = vec_sub(temp1[1], temp1[6]);
+  temp2[8] = temp1[8];
+  temp2[9] = temp1[9];
+
+  single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]);
+  single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]);
+  temp2[14] = temp1[14];
+  temp2[15] = temp1[15];
+
+  temp2[18] = vec_add(temp1[18], temp1[21]);
+  temp2[19] = vec_add(temp1[19], temp1[20]);
+
+  temp2[20] = vec_sub(temp1[19], temp1[20]);
+  temp2[21] = vec_sub(temp1[18], temp1[21]);
+
+  temp2[26] = vec_sub(temp1[29], temp1[26]);
+  temp2[27] = vec_sub(temp1[28], temp1[27]);
+
+  temp2[28] = vec_add(temp1[28], temp1[27]);
+  temp2[29] = vec_add(temp1[29], temp1[26]);
+
+  // Pass through Stage 4
+  temp0[7] = vec_sub(temp1[0], temp1[7]);
+  temp0[4] = vec_sub(temp1[3], temp1[4]);
+  temp0[16] = vec_add(temp1[16], temp1[23]);
+  temp0[17] = vec_add(temp1[17], temp1[22]);
+  temp0[22] = vec_sub(temp1[17], temp1[22]);
+  temp0[23] = vec_sub(temp1[16], temp1[23]);
+  temp0[24] = vec_sub(temp1[31], temp1[24]);
+  temp0[25] = vec_sub(temp1[30], temp1[25]);
+  temp0[30] = vec_add(temp1[30], temp1[25]);
+  temp0[31] = vec_add(temp1[31], temp1[24]);
+
+  // Stage 4
+  temp0[0] = vec_add(temp2[0], temp2[3]);
+  temp0[1] = vec_add(temp2[1], temp2[2]);
+  temp0[2] = vec_sub(temp2[1], temp2[2]);
+  temp0[3] = vec_sub(temp2[0], temp2[3]);
+  single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]);
+
+  temp0[9] = vec_add(temp2[9], temp2[10]);
+  temp0[10] = vec_sub(temp2[9], temp2[10]);
+  temp0[13] = vec_sub(temp2[14], temp2[13]);
+  temp0[14] = vec_add(temp2[14], temp2[13]);
+
+  double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29],
+                   &temp0[18]);
+  double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28],
+                   &temp0[19]);
+  double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27],
+                   &temp0[20]);
+  double_butterfly(temp2[26], cospi24_v, temp2[21], cospi8m_v, &temp0[26],
+                   &temp0[21]);
+
+  // Pass through Stage 5
+  temp1[8] = vec_add(temp2[8], temp2[11]);
+  temp1[11] = vec_sub(temp2[8], temp2[11]);
+  temp1[12] = vec_sub(temp2[15], temp2[12]);
+  temp1[15] = vec_add(temp2[15], temp2[12]);
+
+  // Stage 5
+  // 0 and 1 pass through to 0 and 16 at the end
+  single_butterfly(temp0[0], temp0[1], &out[0], &out[16]);
+
+  // 2 and 3 pass through to 8 and 24 at the end
+  double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]);
+
+  temp1[4] = vec_add(temp0[4], temp0[5]);
+  temp1[5] = vec_sub(temp0[4], temp0[5]);
+  temp1[6] = vec_sub(temp0[7], temp0[6]);
+  temp1[7] = vec_add(temp0[7], temp0[6]);
+
+  double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14],
+                   &temp1[9]);
+  double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13],
+                   &temp1[10]);
+
+  temp1[17] = vec_add(temp0[17], temp0[18]);
+  temp1[18] = vec_sub(temp0[17], temp0[18]);
+
+  temp1[21] = vec_sub(temp0[22], temp0[21]);
+  temp1[22] = vec_add(temp0[22], temp0[21]);
+
+  temp1[25] = vec_add(temp0[25], temp0[26]);
+  temp1[26] = vec_sub(temp0[25], temp0[26]);
+
+  temp1[29] = vec_sub(temp0[30], temp0[29]);
+  temp1[30] = vec_add(temp0[30], temp0[29]);
+
+  // Pass through Stage 6
+  temp2[16] = vec_add(temp0[16], temp0[19]);
+  temp2[19] = vec_sub(temp0[16], temp0[19]);
+  temp2[20] = vec_sub(temp0[23], temp0[20]);
+  temp2[23] = vec_add(temp0[23], temp0[20]);
+  temp2[24] = vec_add(temp0[24], temp0[27]);
+  temp2[27] = vec_sub(temp0[24], temp0[27]);
+  temp2[28] = vec_sub(temp0[31], temp0[28]);
+  temp2[31] = vec_add(temp0[31], temp0[28]);
+
+  // Stage 6
+  // 4 and 7 pass through to 4 and 28 at the end
+  double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]);
+  // 5 and 6 pass through to 20 and 12 at the end
+  double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20],
+                   &out[12]);
+  temp2[8] = vec_add(temp1[8], temp1[9]);
+  temp2[9] = vec_sub(temp1[8], temp1[9]);
+  temp2[10] = vec_sub(temp1[11], temp1[10]);
+  temp2[11] = vec_add(temp1[11], temp1[10]);
+  temp2[12] = vec_add(temp1[12], temp1[13]);
+  temp2[13] = vec_sub(temp1[12], temp1[13]);
+  temp2[14] = vec_sub(temp1[15], temp1[14]);
+  temp2[15] = vec_add(temp1[15], temp1[14]);
+
+  double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30],
+                   &temp2[17]);
+  double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29],
+                   &temp2[18]);
+  double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26],
+                   &temp2[21]);
+  double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25],
+                   &temp2[22]);
+
+  // Stage 7
+  double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]);
+  double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18],
+                   &out[14]);
+  double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10],
+                   &out[22]);
+  double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26],
+                   &out[6]);
+
+  temp0[16] = vec_add(temp2[16], temp2[17]);
+  temp0[17] = vec_sub(temp2[16], temp2[17]);
+  temp0[18] = vec_sub(temp2[19], temp2[18]);
+  temp0[19] = vec_add(temp2[19], temp2[18]);
+  temp0[20] = vec_add(temp2[20], temp2[21]);
+  temp0[21] = vec_sub(temp2[20], temp2[21]);
+  temp0[22] = vec_sub(temp2[23], temp2[22]);
+  temp0[23] = vec_add(temp2[23], temp2[22]);
+  temp0[24] = vec_add(temp2[24], temp2[25]);
+  temp0[25] = vec_sub(temp2[24], temp2[25]);
+  temp0[26] = vec_sub(temp2[27], temp2[26]);
+  temp0[27] = vec_add(temp2[27], temp2[26]);
+  temp0[28] = vec_add(temp2[28], temp2[29]);
+  temp0[29] = vec_sub(temp2[28], temp2[29]);
+  temp0[30] = vec_sub(temp2[31], temp2[30]);
+  temp0[31] = vec_add(temp2[31], temp2[30]);
+
+  // Final stage --- outputs indices are bit-reversed.
+  double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1],
+                   &out[31]);
+  double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17],
+                   &out[15]);
+  double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9],
+                   &out[23]);
+  double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25],
+                   &out[7]);
+  double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5],
+                   &out[27]);
+  double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21],
+                   &out[11]);
+  double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13],
+                   &out[19]);
+  double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29],
+                   &out[3]);
+
+  if (pass == 0) {
+    for (i = 0; i < 32; i++) {
+      out[i] = sub_round_shift(out[i]);
+    }
+  }
+}
+
+void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) {
+  int16x8_t temp0[32];
+  int16x8_t temp1[32];
+  int16x8_t temp2[32];
+  int16x8_t temp3[32];
+  int16x8_t temp4[32];
+  int16x8_t temp5[32];
+  int16x8_t temp6[32];
+
+  // Process in 8x32 columns.
+  load(input, stride, temp0);
+  fdct32_vsx(temp0, temp1, 0);
+
+  load(input + 8, stride, temp0);
+  fdct32_vsx(temp0, temp2, 0);
+
+  load(input + 16, stride, temp0);
+  fdct32_vsx(temp0, temp3, 0);
+
+  load(input + 24, stride, temp0);
+  fdct32_vsx(temp0, temp4, 0);
+
+  // Generate the top row by munging the first set of 8 from each one
+  // together.
+  transpose_8x8(&temp1[0], &temp0[0]);
+  transpose_8x8(&temp2[0], &temp0[8]);
+  transpose_8x8(&temp3[0], &temp0[16]);
+  transpose_8x8(&temp4[0], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out, temp6);
+
+  // Second row of 8x32.
+  transpose_8x8(&temp1[8], &temp0[0]);
+  transpose_8x8(&temp2[8], &temp0[8]);
+  transpose_8x8(&temp3[8], &temp0[16]);
+  transpose_8x8(&temp4[8], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out + 8 * 32, temp6);
+
+  // Third row of 8x32
+  transpose_8x8(&temp1[16], &temp0[0]);
+  transpose_8x8(&temp2[16], &temp0[8]);
+  transpose_8x8(&temp3[16], &temp0[16]);
+  transpose_8x8(&temp4[16], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out + 16 * 32, temp6);
+
+  // Final row of 8x32.
+  transpose_8x8(&temp1[24], &temp0[0]);
+  transpose_8x8(&temp2[24], &temp0[8]);
+  transpose_8x8(&temp3[24], &temp0[16]);
+  transpose_8x8(&temp4[24], &temp0[24]);
+
+  fdct32_vsx(temp0, temp5, 1);
+
+  transpose_8x8(&temp5[0], &temp6[0]);
+  transpose_8x8(&temp5[8], &temp6[8]);
+  transpose_8x8(&temp5[16], &temp6[16]);
+  transpose_8x8(&temp5[24], &temp6[24]);
+
+  store(out + 24 * 32, temp6);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c
new file mode 100644
index 0000000000..e279b30478
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+
+static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
+  const int16x8_t b0 = vec_add(v[0], v[1]);
+  const int16x8_t b1 = vec_sub(v[0], v[1]);
+  const int16x8_t b2 = vec_add(v[2], v[3]);
+  const int16x8_t b3 = vec_sub(v[2], v[3]);
+  const int16x8_t b4 = vec_add(v[4], v[5]);
+  const int16x8_t b5 = vec_sub(v[4], v[5]);
+  const int16x8_t b6 = vec_add(v[6], v[7]);
+  const int16x8_t b7 = vec_sub(v[6], v[7]);
+
+  const int16x8_t c0 = vec_add(b0, b2);
+  const int16x8_t c1 = vec_add(b1, b3);
+  const int16x8_t c2 = vec_sub(b0, b2);
+  const int16x8_t c3 = vec_sub(b1, b3);
+  const int16x8_t c4 = vec_add(b4, b6);
+  const int16x8_t c5 = vec_add(b5, b7);
+  const int16x8_t c6 = vec_sub(b4, b6);
+  const int16x8_t c7 = vec_sub(b5, b7);
+
+  v[0] = vec_add(c0, c4);
+  v[1] = vec_sub(c2, c6);
+  v[2] = vec_sub(c0, c4);
+  v[3] = vec_add(c2, c6);
+  v[4] = vec_add(c3, c7);
+  v[5] = vec_sub(c3, c7);
+  v[6] = vec_sub(c1, c5);
+  v[7] = vec_add(c1, c5);
+}
+
+void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
+                          tran_low_t *coeff) {
+  int16x8_t v[8];
+
+  v[0] = vec_vsx_ld(0, src_diff);
+  v[1] = vec_vsx_ld(0, src_diff + src_stride);
+  v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride));
+  v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride));
+  v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride));
+  v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride));
+  v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride));
+  v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride));
+
+  vpx_hadamard_s16_8x8_one_pass(v);
+
+  vpx_transpose_s16_8x8(v);
+
+  vpx_hadamard_s16_8x8_one_pass(v);
+
+  store_tran_low(v[0], 0, coeff);
+  store_tran_low(v[1], 0, coeff + 8);
+  store_tran_low(v[2], 0, coeff + 16);
+  store_tran_low(v[3], 0, coeff + 24);
+  store_tran_low(v[4], 0, coeff + 32);
+  store_tran_low(v[5], 0, coeff + 40);
+  store_tran_low(v[6], 0, coeff + 48);
+  store_tran_low(v[7], 0, coeff + 56);
+}
+
+void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride,
+                            tran_low_t *coeff) {
+  int i;
+  const uint16x8_t ones = vec_splat_u16(1);
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff);
+  /* Top right. */
+  vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = load_tran_low(0, coeff);
+    const int16x8_t a1 = load_tran_low(0, coeff + 64);
+    const int16x8_t a2 = load_tran_low(0, coeff + 128);
+    const int16x8_t a3 = load_tran_low(0, coeff + 192);
+
+    /* Prevent the result from escaping int16_t. */
+    const int16x8_t b0 = vec_sra(a0, ones);
+    const int16x8_t b1 = vec_sra(a1, ones);
+    const int16x8_t b2 = vec_sra(a2, ones);
+    const int16x8_t b3 = vec_sra(a3, ones);
+
+    const int16x8_t c0 = vec_add(b0, b1);
+    const int16x8_t c2 = vec_add(b2, b3);
+    const int16x8_t c1 = vec_sub(b0, b1);
+    const int16x8_t c3 = vec_sub(b2, b3);
+
+    const int16x8_t d0 = vec_add(c0, c2);
+    const int16x8_t d1 = vec_add(c1, c3);
+    const int16x8_t d2 = vec_sub(c0, c2);
+    const int16x8_t d3 = vec_sub(c1, c3);
+
+    store_tran_low(d0, 0, coeff);
+    store_tran_low(d1, 0, coeff + 64);
+    store_tran_low(d2, 0, coeff + 128);
+    store_tran_low(d3, 0, coeff + 192);
+
+    coeff += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c
new file mode 100644
index 0000000000..a4c8322ff2
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c
@@ -0,0 +1,767 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d = vec_vsx_ld(0, above);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 16; i++, dst += stride) {
+    vec_vsx_st(d, 0, dst);
+  }
+}
+
+void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vec_vsx_ld(0, above);
+  const uint8x16_t d1 = vec_vsx_ld(16, above);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 32; i++, dst += stride) {
+    vec_vsx_st(d0, 0, dst);
+    vec_vsx_st(d1, 16, dst);
+  }
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+
+void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d = vec_vsx_ld(0, left);
+  const uint8x16_t v0 = vec_splat(d, 0);
+  const uint8x16_t v1 = vec_splat(d, 1);
+  const uint8x16_t v2 = vec_splat(d, 2);
+  const uint8x16_t v3 = vec_splat(d, 3);
+
+  (void)above;
+
+  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+  dst += stride;
+  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+  dst += stride;
+  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+  dst += stride;
+  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
+}
+
+void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+                             const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d = vec_vsx_ld(0, left);
+  const uint8x16_t v0 = vec_splat(d, 0);
+  const uint8x16_t v1 = vec_splat(d, 1);
+  const uint8x16_t v2 = vec_splat(d, 2);
+  const uint8x16_t v3 = vec_splat(d, 3);
+
+  const uint8x16_t v4 = vec_splat(d, 4);
+  const uint8x16_t v5 = vec_splat(d, 5);
+  const uint8x16_t v6 = vec_splat(d, 6);
+  const uint8x16_t v7 = vec_splat(d, 7);
+
+  (void)above;
+
+  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
+  dst += stride;
+  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
+}
+#endif
+
+void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d = vec_vsx_ld(0, left);
+  const uint8x16_t v0 = vec_splat(d, 0);
+  const uint8x16_t v1 = vec_splat(d, 1);
+  const uint8x16_t v2 = vec_splat(d, 2);
+  const uint8x16_t v3 = vec_splat(d, 3);
+
+  const uint8x16_t v4 = vec_splat(d, 4);
+  const uint8x16_t v5 = vec_splat(d, 5);
+  const uint8x16_t v6 = vec_splat(d, 6);
+  const uint8x16_t v7 = vec_splat(d, 7);
+
+  const uint8x16_t v8 = vec_splat(d, 8);
+  const uint8x16_t v9 = vec_splat(d, 9);
+  const uint8x16_t v10 = vec_splat(d, 10);
+  const uint8x16_t v11 = vec_splat(d, 11);
+
+  const uint8x16_t v12 = vec_splat(d, 12);
+  const uint8x16_t v13 = vec_splat(d, 13);
+  const uint8x16_t v14 = vec_splat(d, 14);
+  const uint8x16_t v15 = vec_splat(d, 15);
+
+  (void)above;
+
+  vec_vsx_st(v0, 0, dst);
+  dst += stride;
+  vec_vsx_st(v1, 0, dst);
+  dst += stride;
+  vec_vsx_st(v2, 0, dst);
+  dst += stride;
+  vec_vsx_st(v3, 0, dst);
+  dst += stride;
+  vec_vsx_st(v4, 0, dst);
+  dst += stride;
+  vec_vsx_st(v5, 0, dst);
+  dst += stride;
+  vec_vsx_st(v6, 0, dst);
+  dst += stride;
+  vec_vsx_st(v7, 0, dst);
+  dst += stride;
+  vec_vsx_st(v8, 0, dst);
+  dst += stride;
+  vec_vsx_st(v9, 0, dst);
+  dst += stride;
+  vec_vsx_st(v10, 0, dst);
+  dst += stride;
+  vec_vsx_st(v11, 0, dst);
+  dst += stride;
+  vec_vsx_st(v12, 0, dst);
+  dst += stride;
+  vec_vsx_st(v13, 0, dst);
+  dst += stride;
+  vec_vsx_st(v14, 0, dst);
+  dst += stride;
+  vec_vsx_st(v15, 0, dst);
+}
+
+#define H_PREDICTOR_32(v) \
+  vec_vsx_st(v, 0, dst);  \
+  vec_vsx_st(v, 16, dst); \
+  dst += stride
+
+void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t d0 = vec_vsx_ld(0, left);
+  const uint8x16_t d1 = vec_vsx_ld(16, left);
+
+  const uint8x16_t v0_0 = vec_splat(d0, 0);
+  const uint8x16_t v1_0 = vec_splat(d0, 1);
+  const uint8x16_t v2_0 = vec_splat(d0, 2);
+  const uint8x16_t v3_0 = vec_splat(d0, 3);
+  const uint8x16_t v4_0 = vec_splat(d0, 4);
+  const uint8x16_t v5_0 = vec_splat(d0, 5);
+  const uint8x16_t v6_0 = vec_splat(d0, 6);
+  const uint8x16_t v7_0 = vec_splat(d0, 7);
+  const uint8x16_t v8_0 = vec_splat(d0, 8);
+  const uint8x16_t v9_0 = vec_splat(d0, 9);
+  const uint8x16_t v10_0 = vec_splat(d0, 10);
+  const uint8x16_t v11_0 = vec_splat(d0, 11);
+  const uint8x16_t v12_0 = vec_splat(d0, 12);
+  const uint8x16_t v13_0 = vec_splat(d0, 13);
+  const uint8x16_t v14_0 = vec_splat(d0, 14);
+  const uint8x16_t v15_0 = vec_splat(d0, 15);
+
+  const uint8x16_t v0_1 = vec_splat(d1, 0);
+  const uint8x16_t v1_1 = vec_splat(d1, 1);
+  const uint8x16_t v2_1 = vec_splat(d1, 2);
+  const uint8x16_t v3_1 = vec_splat(d1, 3);
+  const uint8x16_t v4_1 = vec_splat(d1, 4);
+  const uint8x16_t v5_1 = vec_splat(d1, 5);
+  const uint8x16_t v6_1 = vec_splat(d1, 6);
+  const uint8x16_t v7_1 = vec_splat(d1, 7);
+  const uint8x16_t v8_1 = vec_splat(d1, 8);
+  const uint8x16_t v9_1 = vec_splat(d1, 9);
+  const uint8x16_t v10_1 = vec_splat(d1, 10);
+  const uint8x16_t v11_1 = vec_splat(d1, 11);
+  const uint8x16_t v12_1 = vec_splat(d1, 12);
+  const uint8x16_t v13_1 = vec_splat(d1, 13);
+  const uint8x16_t v14_1 = vec_splat(d1, 14);
+  const uint8x16_t v15_1 = vec_splat(d1, 15);
+
+  (void)above;
+
+  H_PREDICTOR_32(v0_0);
+  H_PREDICTOR_32(v1_0);
+  H_PREDICTOR_32(v2_0);
+  H_PREDICTOR_32(v3_0);
+
+  H_PREDICTOR_32(v4_0);
+  H_PREDICTOR_32(v5_0);
+  H_PREDICTOR_32(v6_0);
+  H_PREDICTOR_32(v7_0);
+
+  H_PREDICTOR_32(v8_0);
+  H_PREDICTOR_32(v9_0);
+  H_PREDICTOR_32(v10_0);
+  H_PREDICTOR_32(v11_0);
+
+  H_PREDICTOR_32(v12_0);
+  H_PREDICTOR_32(v13_0);
+  H_PREDICTOR_32(v14_0);
+  H_PREDICTOR_32(v15_0);
+
+  H_PREDICTOR_32(v0_1);
+  H_PREDICTOR_32(v1_1);
+  H_PREDICTOR_32(v2_1);
+  H_PREDICTOR_32(v3_1);
+
+  H_PREDICTOR_32(v4_1);
+  H_PREDICTOR_32(v5_1);
+  H_PREDICTOR_32(v6_1);
+  H_PREDICTOR_32(v7_1);
+
+  H_PREDICTOR_32(v8_1);
+  H_PREDICTOR_32(v9_1);
+  H_PREDICTOR_32(v10_1);
+  H_PREDICTOR_32(v11_1);
+
+  H_PREDICTOR_32(v12_1);
+  H_PREDICTOR_32(v13_1);
+  H_PREDICTOR_32(v14_1);
+  H_PREDICTOR_32(v15_1);
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+  int16x8_t tmp, val;
+  uint8x16_t d;
+
+  d = vec_vsx_ld(0, dst);
+  tmp = unpack_to_s16_l(d);
+  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+  dst += stride;
+
+  d = vec_vsx_ld(0, dst);
+  tmp = unpack_to_s16_l(d);
+  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+  dst += stride;
+
+  d = vec_vsx_ld(0, dst);
+  tmp = unpack_to_s16_l(d);
+  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+  dst += stride;
+
+  d = vec_vsx_ld(0, dst);
+  tmp = unpack_to_s16_l(d);
+  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
+}
+
+void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
+  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
+  int16x8_t tmp, val;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+  dst += stride;
+
+  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
+  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
+  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
+}
+#endif
+
+static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
+                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
+  int16x8_t vh, vl, ls;
+
+  ls = vec_splat(l, 0);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 1);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 2);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 3);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 4);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 5);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 6);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  dst += stride;
+
+  ls = vec_splat(l, 7);
+  vh = vec_sub(vec_add(ls, ah), tl);
+  vl = vec_sub(vec_add(ls, al), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+}
+
+void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+  const uint8x16_t l = vec_vsx_ld(0, left);
+  const int16x8_t lh = unpack_to_s16_h(l);
+  const int16x8_t ll = unpack_to_s16_l(l);
+  const uint8x16_t a = vec_vsx_ld(0, above);
+  const int16x8_t ah = unpack_to_s16_h(a);
+  const int16x8_t al = unpack_to_s16_l(a);
+
+  tm_predictor_16x8(dst, stride, lh, ah, al, tl);
+
+  dst += stride * 8;
+
+  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
+}
+
+static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
+                                     const int16x8_t a0h, const int16x8_t a0l,
+                                     const int16x8_t a1h, const int16x8_t a1l,
+                                     const int16x8_t tl) {
+  int16x8_t vh, vl;
+
+  vh = vec_sub(vec_add(ls, a0h), tl);
+  vl = vec_sub(vec_add(ls, a0l), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
+  vh = vec_sub(vec_add(ls, a1h), tl);
+  vl = vec_sub(vec_add(ls, a1l), tl);
+  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
+}
+
+static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
+                              const int16x8_t l, const uint8x16_t a0,
+                              const uint8x16_t a1, const int16x8_t tl) {
+  const int16x8_t a0h = unpack_to_s16_h(a0);
+  const int16x8_t a0l = unpack_to_s16_l(a0);
+  const int16x8_t a1h = unpack_to_s16_h(a1);
+  const int16x8_t a1l = unpack_to_s16_l(a1);
+
+  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
+  dst += stride;
+
+  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
+}
+
+void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
+  const uint8x16_t l0 = vec_vsx_ld(0, left);
+  const uint8x16_t l1 = vec_vsx_ld(16, left);
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t a1 = vec_vsx_ld(16, above);
+
+  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
+  dst += stride * 8;
+
+  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
+  dst += stride * 8;
+
+  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
+  dst += stride * 8;
+
+  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
+}
+
+static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
+                                         const uint8x16_t val) {
+  int i;
+
+  for (i = 0; i < 8; i++, dst += stride) {
+    const uint8x16_t d = vec_vsx_ld(0, dst);
+    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
+  }
+}
+
+static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
+                                           const uint8x16_t val) {
+  int i;
+
+  for (i = 0; i < 16; i++, dst += stride) {
+    vec_vsx_st(val, 0, dst);
+  }
+}
+
+void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
+  (void)above;
+  (void)left;
+
+  dc_fill_predictor_16x16(dst, stride, v128);
+}
+
+static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
+                                           const uint8x16_t val) {
+  int i;
+
+  for (i = 0; i < 32; i++, dst += stride) {
+    vec_vsx_st(val, 0, dst);
+    vec_vsx_st(val, 16, dst);
+  }
+}
+
+void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
+  (void)above;
+  (void)left;
+
+  dc_fill_predictor_32x32(dst, stride, v128);
+}
+
+static uint8x16_t avg16(const uint8_t *values) {
+  const int32x4_t sum4s =
+      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
+  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
+  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
+
+  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+                   3);
+}
+
+void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  dc_fill_predictor_16x16(dst, stride, avg16(left));
+}
+
+void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  dc_fill_predictor_16x16(dst, stride, avg16(above));
+}
+
+static uint8x16_t avg32(const uint8_t *values) {
+  const uint8x16_t v0 = vec_vsx_ld(0, values);
+  const uint8x16_t v1 = vec_vsx_ld(16, values);
+  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
+  const int32x4_t sum4s =
+      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
+  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
+  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
+
+  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+                   3);
+}
+
+void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+
+  dc_fill_predictor_32x32(dst, stride, avg32(left));
+}
+
+void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;
+
+  dc_fill_predictor_32x32(dst, stride, avg32(above));
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t l0 = vec_vsx_ld(0, left);
+  const int32x4_t sum4s =
+      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
+  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
+  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
+  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
+
+  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+                   3);
+}
+#endif
+
+static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t l0 = vec_vsx_ld(0, left);
+  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
+  const int32x4_t sum4s =
+      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
+  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
+  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
+
+  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+                   3);
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
+}
+#endif
+
+void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
+}
+
+static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t a1 = vec_vsx_ld(16, above);
+  const uint8x16_t l0 = vec_vsx_ld(0, left);
+  const uint8x16_t l1 = vec_vsx_ld(16, left);
+  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
+  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
+  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
+  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
+  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));
+
+  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
+                   3);
+}
+
+void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
+}
+
+static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
+                       const uint8x16_t c) {
+  const uint8x16_t ac =
+      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));
+
+  return vec_avg(ac, b);
+}
+
+// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
+static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t af = vec_vsx_ld(0, above);
+  const uint8x16_t above_right = vec_splat(af, 7);
+  const uint8x16_t a = xxpermdi(af, above_right, 1);
+  const uint8x16_t b = vec_perm(a, above_right, sl1);
+  const uint8x16_t c = vec_perm(b, above_right, sl1);
+  uint8x16_t row = avg3(a, b, c);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 8; i++) {
+    const uint8x16_t d = vec_vsx_ld(0, dst);
+    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
+    dst += stride;
+    row = vec_perm(row, above_right, sl1);
+  }
+}
+#endif
+
+void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a = vec_vsx_ld(0, above);
+  const uint8x16_t above_right = vec_splat(a, 15);
+  const uint8x16_t b = vec_perm(a, above_right, sl1);
+  const uint8x16_t c = vec_perm(b, above_right, sl1);
+  uint8x16_t row = avg3(a, b, c);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 16; i++) {
+    vec_vsx_st(row, 0, dst);
+    dst += stride;
+    row = vec_perm(row, above_right, sl1);
+  }
+}
+
+void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t a1 = vec_vsx_ld(16, above);
+  const uint8x16_t above_right = vec_splat(a1, 15);
+  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
+  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
+  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
+  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
+  uint8x16_t row0 = avg3(a0, b0, c0);
+  uint8x16_t row1 = avg3(a1, b1, c1);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 32; i++) {
+    vec_vsx_st(row0, 0, dst);
+    vec_vsx_st(row1, 16, dst);
+    dst += stride;
+    row0 = vec_perm(row0, row1, sl1);
+    row1 = vec_perm(row1, above_right, sl1);
+  }
+}
+
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
+void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t af = vec_vsx_ld(0, above);
+  const uint8x16_t above_right = vec_splat(af, 9);
+  const uint8x16_t a = xxpermdi(af, above_right, 1);
+  const uint8x16_t b = vec_perm(a, above_right, sl1);
+  const uint8x16_t c = vec_perm(b, above_right, sl1);
+  uint8x16_t row0 = vec_avg(a, b);
+  uint8x16_t row1 = avg3(a, b, c);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 4; i++) {
+    const uint8x16_t d0 = vec_vsx_ld(0, dst);
+    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
+    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
+    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
+    dst += stride * 2;
+    row0 = vec_perm(row0, above_right, sl1);
+    row1 = vec_perm(row1, above_right, sl1);
+  }
+}
+#endif
+
+void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t a1 = vec_vsx_ld(16, above);
+  const uint8x16_t above_right = vec_splat(a1, 0);
+  const uint8x16_t b = vec_perm(a0, above_right, sl1);
+  const uint8x16_t c = vec_perm(b, above_right, sl1);
+  uint8x16_t row0 = vec_avg(a0, b);
+  uint8x16_t row1 = avg3(a0, b, c);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 8; i++) {
+    vec_vsx_st(row0, 0, dst);
+    vec_vsx_st(row1, 0, dst + stride);
+    dst += stride * 2;
+    row0 = vec_perm(row0, above_right, sl1);
+    row1 = vec_perm(row1, above_right, sl1);
+  }
+}
+
+void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t a0 = vec_vsx_ld(0, above);
+  const uint8x16_t a1 = vec_vsx_ld(16, above);
+  const uint8x16_t a2 = vec_vsx_ld(32, above);
+  const uint8x16_t above_right = vec_splat(a2, 0);
+  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
+  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
+  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
+  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
+  uint8x16_t row0_0 = vec_avg(a0, b0);
+  uint8x16_t row0_1 = vec_avg(a1, b1);
+  uint8x16_t row1_0 = avg3(a0, b0, c0);
+  uint8x16_t row1_1 = avg3(a1, b1, c1);
+  int i;
+  (void)left;
+
+  for (i = 0; i < 16; i++) {
+    vec_vsx_st(row0_0, 0, dst);
+    vec_vsx_st(row0_1, 16, dst);
+    vec_vsx_st(row1_0, 0, dst + stride);
+    vec_vsx_st(row1_1, 16, dst + stride);
+    dst += stride * 2;
+    row0_0 = vec_perm(row0_0, row0_1, sl1);
+    row0_1 = vec_perm(row0_1, above_right, sl1);
+    row1_0 = vec_perm(row1_0, row1_1, sl1);
+    row1_1 = vec_perm(row1_1, above_right, sl1);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
new file mode 100644
index 0000000000..8c4e603dda
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c
@@ -0,0 +1,1828 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/ppc/inv_txfm_vsx.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
+                                    16364, 16364, 16364, 16364 };
+static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364,
+                                     -16364, -16364, -16364, -16364 };
+static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
+                                    16305, 16305, 16305, 16305 };
+static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305,
+                                     -16305, -16305, -16305, -16305 };
+static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
+                                    16207, 16207, 16207, 16207 };
+static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
+                                    16069, 16069, 16069, 16069 };
+static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
+                                     -16069, -16069, -16069, -16069 };
+static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
+                                    15893, 15893, 15893, 15893 };
+static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893,
+                                     -15893, -15893, -15893, -15893 };
+static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
+                                    15679, 15679, 15679, 15679 };
+static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
+                                    15426, 15426, 15426, 15426 };
+static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
+                                    15137, 15137, 15137, 15137 };
+static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
+                                     -15137, -15137, -15137, -15137 };
+static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
+                                    14811, 14811, 14811, 14811 };
+static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811,
+                                     -14811, -14811, -14811, -14811 };
+static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
+                                     14449, 14449, 14449, 14449 };
+static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449,
+                                      -14449, -14449, -14449, -14449 };
+static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
+                                     14053, 14053, 14053, 14053 };
+static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
+                                     13623, 13623, 13623, 13623 };
+static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623,
+                                      -13623, -13623, -13623, -13623 };
+static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
+                                     13160, 13160, 13160, 13160 };
+static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160,
+                                      -13160, -13160, -13160, -13160 };
+static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
+                                     12665, 12665, 12665, 12665 };
+static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
+                                     12140, 12140, 12140, 12140 };
+static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
+                                     11585, 11585, 11585, 11585 };
+static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585,
+                                      -11585, -11585, -11585, -11585 };
+static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
+                                     11003, 11003, 11003, 11003 };
+static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003,
+                                      -11003, -11003, -11003, -11003 };
+static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
+                                     10394, 10394, 10394, 10394 };
+static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394,
+                                      -10394, -10394, -10394, -10394 };
+static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760,
+                                     9760, 9760, 9760, 9760 };
+static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102,
+                                     9102, 9102, 9102, 9102 };
+static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
+                                      -9102, -9102, -9102, -9102 };
+static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423,
+                                     8423, 8423, 8423, 8423 };
+static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423,
+                                      -8423, -8423, -8423, -8423 };
+static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723,
+                                     7723, 7723, 7723, 7723 };
+static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005,
+                                     7005, 7005, 7005, 7005 };
+static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270,
+                                     6270, 6270, 6270, 6270 };
+static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270,
+                                      -6270, -6270, -6270, -6270 };
+static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520,
+                                     5520, 5520, 5520, 5520 };
+static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520,
+                                      -5520, -5520, -5520, -5520 };
+static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756,
+                                     4756, 4756, 4756, 4756 };
+static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756,
+                                      -4756, -4756, -4756, -4756 };
+static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981,
+                                     3981, 3981, 3981, 3981 };
+static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196,
+                                     3196, 3196, 3196, 3196 };
+static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196,
+                                      -3196, -3196, -3196, -3196 };
+static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404,
+                                     2404, 2404, 2404, 2404 };
+static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404,
+                                      -2404, -2404, -2404, -2404 };
+static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606,
+                                     1606, 1606, 1606, 1606 };
+static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+
+static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283,
+                                       5283, 5283, 5283, 5283 };
+static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929,
+                                       9929, 9929, 9929, 9929 };
+static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377,
+                                       13377, 13377, 13377, 13377 };
+static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212,
+                                       15212, 15212, 15212, 15212 };
+
+static uint8x16_t tr8_mask0 = {
+  0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+
+static uint8x16_t tr8_mask1 = {
+  0x8,  0x9,  0xA,  0xB,  0xC,  0xD,  0xE,  0xF,
+  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
+};
+
+#define ROUND_SHIFT_INIT                                               \
+  const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
+  const uint32x4_t shift14 = vec_splat_u32(14);
+
+#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);
+
+#define PIXEL_ADD_INIT               \
+  int16x8_t add8 = vec_splat_s16(8); \
+  uint16x8_t shift4 = vec_splat_u16(4);
+
+#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);
+
+#define IDCT4(in0, in1, out0, out1)                                           \
+  t0 = vec_add(in0, in1);                                                     \
+  t1 = vec_sub(in0, in1);                                                     \
+  tmp16_0 = vec_mergeh(t0, t1);                                               \
+  temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14);     \
+  temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14);     \
+                                                                              \
+  tmp16_0 = vec_mergel(in0, in1);                                             \
+  temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
+  DCT_CONST_ROUND_SHIFT(temp3);                                               \
+  temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
+  DCT_CONST_ROUND_SHIFT(temp4);                                               \
+                                                                              \
+  step0 = vec_packs(temp1, temp2);                                            \
+  step1 = vec_packs(temp4, temp3);                                            \
+  out0 = vec_add(step0, step1);                                               \
+  out1 = vec_sub(step0, step1);                                               \
+  out1 = vec_perm(out1, out1, mask0);
+
+#define PACK_STORE(v0, v1)                                \
+  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \
+  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \
+  output_v = vec_packsu(tmp16_0, tmp16_1);                \
+                                                          \
+  vec_vsx_st(output_v, 0, tmp_dest);                      \
+  for (i = 0; i < 4; i++)                                 \
+    for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
+
+void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
+                            int stride) {
+  int i, j;
+  uint8x16_t dest0 = vec_vsx_ld(0, dest);
+  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+  uint8x16_t zerov = vec_splat_u8(0);
+  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
+  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
+  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
+  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+  int16x8_t tmp16_0, tmp16_1;
+  uint8x16_t output_v;
+  uint8_t tmp_dest[16];
+  PIXEL_ADD_INIT;
+
+  PIXEL_ADD4(out[0], in[0]);
+  PIXEL_ADD4(out[1], in[1]);
+
+  PACK_STORE(out[0], out[1]);
+}
+
+void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) {
+  int32x4_t temp1, temp2, temp3, temp4;
+  int16x8_t step0, step1, tmp16_0;
+  uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+                       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
+  int16x8_t t0 = vec_mergeh(in[0], in[1]);
+  int16x8_t t1 = vec_mergel(in[0], in[1]);
+  ROUND_SHIFT_INIT
+
+  in[0] = vec_mergeh(t0, t1);
+  in[1] = vec_mergel(t0, t1);
+
+  IDCT4(in[0], in[1], out[0], out[1]);
+}
+
+void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  int16x8_t in[2], out[2];
+
+  in[0] = load_tran_low(0, input);
+  in[1] = load_tran_low(8 * sizeof(*input), input);
+  // Rows
+  vpx_idct4_vsx(in, out);
+
+  // Columns
+  vpx_idct4_vsx(out, in);
+
+  vpx_round_store4x4_vsx(in, out, dest, stride);
+}
+
+#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                     out3, out4, out5, out6, out7)                             \
+  out0 = vec_mergeh(in0, in1);                                                 \
+  out1 = vec_mergel(in0, in1);                                                 \
+  out2 = vec_mergeh(in2, in3);                                                 \
+  out3 = vec_mergel(in2, in3);                                                 \
+  out4 = vec_mergeh(in4, in5);                                                 \
+  out5 = vec_mergel(in4, in5);                                                 \
+  out6 = vec_mergeh(in6, in7);                                                 \
+  out7 = vec_mergel(in6, in7);                                                 \
+  in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2);               \
+  in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2);               \
+  in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3);               \
+  in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3);               \
+  in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6);               \
+  in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6);               \
+  in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7);               \
+  in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7);               \
+  out0 = vec_perm(in0, in4, tr8_mask0);                                        \
+  out1 = vec_perm(in0, in4, tr8_mask1);                                        \
+  out2 = vec_perm(in1, in5, tr8_mask0);                                        \
+  out3 = vec_perm(in1, in5, tr8_mask1);                                        \
+  out4 = vec_perm(in2, in6, tr8_mask0);                                        \
+  out5 = vec_perm(in2, in6, tr8_mask1);                                        \
+  out6 = vec_perm(in3, in7, tr8_mask0);                                        \
+  out7 = vec_perm(in3, in7, tr8_mask1);
+
+/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z
+ *          temp2 = step[x] * cospi_z + step[y] * cospi_q */
+#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)             \
+  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
+  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
+  temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
+  temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                          \
+  DCT_CONST_ROUND_SHIFT(temp11);                                          \
+  outpt0 = vec_packs(temp10, temp11);                                     \
+  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
+  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                          \
+  DCT_CONST_ROUND_SHIFT(temp11);                                          \
+  outpt1 = vec_packs(temp10, temp11);
+
+#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
+  tmp16_2 = vec_sub(inpt0, inpt1);                   \
+  tmp16_3 = vec_add(inpt0, inpt1);                   \
+  tmp16_0 = vec_mergeh(tmp16_2, tmp16_3);            \
+  tmp16_1 = vec_mergel(tmp16_2, tmp16_3);            \
+  temp10 = vec_mule(tmp16_0, cospi);                 \
+  temp11 = vec_mule(tmp16_1, cospi);                 \
+  DCT_CONST_ROUND_SHIFT(temp10);                     \
+  DCT_CONST_ROUND_SHIFT(temp11);                     \
+  outpt0 = vec_packs(temp10, temp11);                \
+  temp10 = vec_mulo(tmp16_0, cospi);                 \
+  temp11 = vec_mulo(tmp16_1, cospi);                 \
+  DCT_CONST_ROUND_SHIFT(temp10);                     \
+  DCT_CONST_ROUND_SHIFT(temp11);                     \
+  outpt1 = vec_packs(temp10, temp11);
+
+#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7)    \
+  /* stage 1 */                                          \
+  step0 = in0;                                           \
+  step2 = in4;                                           \
+  step1 = in2;                                           \
+  step3 = in6;                                           \
+                                                         \
+  STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v);  \
+  STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
+                                                         \
+  /* stage 2 */                                          \
+  STEP8_1(step0, step2, in1, in0, cospi16_v);            \
+  STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v);  \
+  in4 = vec_add(step4, step5);                           \
+  in5 = vec_sub(step4, step5);                           \
+  in6 = vec_sub(step7, step6);                           \
+  in7 = vec_add(step6, step7);                           \
+                                                         \
+  /* stage 3 */                                          \
+  step0 = vec_add(in0, in3);                             \
+  step1 = vec_add(in1, in2);                             \
+  step2 = vec_sub(in1, in2);                             \
+  step3 = vec_sub(in0, in3);                             \
+  step4 = in4;                                           \
+  STEP8_1(in6, in5, step5, step6, cospi16_v);            \
+  step7 = in7;                                           \
+                                                         \
+  /* stage 4 */                                          \
+  in0 = vec_add(step0, step7);                           \
+  in1 = vec_add(step1, step6);                           \
+  in2 = vec_add(step2, step5);                           \
+  in3 = vec_add(step3, step4);                           \
+  in4 = vec_sub(step3, step4);                           \
+  in5 = vec_sub(step2, step5);                           \
+  in6 = vec_sub(step1, step6);                           \
+  in7 = vec_sub(step0, step7);
+
+#define PIXEL_ADD(in, out, add, shiftx) \
+  out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
+
+void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) {
+  int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
+  int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3;
+  int32x4_t temp10, temp11;
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
+               out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+
+  IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+}
+
+void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) {
+  uint8x16_t zerov = vec_splat_u8(0);
+  uint8x16_t dest0 = vec_vsx_ld(0, dest);
+  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+  uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
+  uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
+  uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
+  uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
+  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
+  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
+  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
+  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+  int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
+  int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
+  int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
+  int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
+  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
+  uint16x8_t shift5 = vec_splat_u16(5);
+  uint8x16_t output0, output1, output2, output3;
+
+  PIXEL_ADD(in[0], d_u0, add, shift5);
+  PIXEL_ADD(in[1], d_u1, add, shift5);
+  PIXEL_ADD(in[2], d_u2, add, shift5);
+  PIXEL_ADD(in[3], d_u3, add, shift5);
+  PIXEL_ADD(in[4], d_u4, add, shift5);
+  PIXEL_ADD(in[5], d_u5, add, shift5);
+  PIXEL_ADD(in[6], d_u6, add, shift5);
+  PIXEL_ADD(in[7], d_u7, add, shift5);
+  output0 = vec_packsu(d_u0, d_u1);
+  output1 = vec_packsu(d_u2, d_u3);
+  output2 = vec_packsu(d_u4, d_u5);
+  output3 = vec_packsu(d_u6, d_u7);
+
+  vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
+  vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
+  vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
+  vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
+  vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
+  vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
+  vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
+  vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
+}
+
+void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  int16x8_t src[8], tmp[8];
+
+  src[0] = load_tran_low(0, input);
+  src[1] = load_tran_low(8 * sizeof(*input), input);
+  src[2] = load_tran_low(16 * sizeof(*input), input);
+  src[3] = load_tran_low(24 * sizeof(*input), input);
+  src[4] = load_tran_low(32 * sizeof(*input), input);
+  src[5] = load_tran_low(40 * sizeof(*input), input);
+  src[6] = load_tran_low(48 * sizeof(*input), input);
+  src[7] = load_tran_low(56 * sizeof(*input), input);
+
+  vpx_idct8_vsx(src, tmp);
+  vpx_idct8_vsx(tmp, src);
+
+  vpx_round_store8x8_vsx(src, dest, stride);
+}
+
+#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
+  tmp16_0 = vec_mergeh(inpt0, inpt1);                 \
+  tmp16_1 = vec_mergel(inpt0, inpt1);                 \
+  temp10 = vec_mule(tmp16_0, cospi);                  \
+  temp11 = vec_mule(tmp16_1, cospi);                  \
+  temp20 = vec_mulo(tmp16_0, cospi);                  \
+  temp21 = vec_mulo(tmp16_1, cospi);                  \
+  temp30 = vec_sub(temp10, temp20);                   \
+  temp10 = vec_add(temp10, temp20);                   \
+  temp20 = vec_sub(temp11, temp21);                   \
+  temp21 = vec_add(temp11, temp21);                   \
+  DCT_CONST_ROUND_SHIFT(temp30);                      \
+  DCT_CONST_ROUND_SHIFT(temp20);                      \
+  outpt0 = vec_packs(temp30, temp20);                 \
+  DCT_CONST_ROUND_SHIFT(temp10);                      \
+  DCT_CONST_ROUND_SHIFT(temp21);                      \
+  outpt1 = vec_packs(temp10, temp21);
+
+#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB,     \
+               inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6,   \
+               out7, out8, out9, outA, outB, outC, outD, outE, outF)           \
+  /* stage 1 */                                                                \
+  /* out0 = in0; */                                                            \
+  out1 = in8;                                                                  \
+  out2 = in4;                                                                  \
+  out3 = inC;                                                                  \
+  out4 = in2;                                                                  \
+  out5 = inA;                                                                  \
+  out6 = in6;                                                                  \
+  out7 = inE;                                                                  \
+  out8 = in1;                                                                  \
+  out9 = in9;                                                                  \
+  outA = in5;                                                                  \
+  outB = inD;                                                                  \
+  outC = in3;                                                                  \
+  outD = inB;                                                                  \
+  outE = in7;                                                                  \
+  outF = inF;                                                                  \
+                                                                               \
+  /* stage 2 */                                                                \
+  /* in0 = out0; */                                                            \
+  in1 = out1;                                                                  \
+  in2 = out2;                                                                  \
+  in3 = out3;                                                                  \
+  in4 = out4;                                                                  \
+  in5 = out5;                                                                  \
+  in6 = out6;                                                                  \
+  in7 = out7;                                                                  \
+                                                                               \
+  STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v);                          \
+  STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v);                         \
+  STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v);                         \
+  STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v);                          \
+                                                                               \
+  /* stage 3 */                                                                \
+  out0 = in0;                                                                  \
+  out1 = in1;                                                                  \
+  out2 = in2;                                                                  \
+  out3 = in3;                                                                  \
+                                                                               \
+  STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v);                          \
+  STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v);                         \
+                                                                               \
+  out8 = vec_add(in8, in9);                                                    \
+  out9 = vec_sub(in8, in9);                                                    \
+  outA = vec_sub(inB, inA);                                                    \
+  outB = vec_add(inA, inB);                                                    \
+  outC = vec_add(inC, inD);                                                    \
+  outD = vec_sub(inC, inD);                                                    \
+  outE = vec_sub(inF, inE);                                                    \
+  outF = vec_add(inE, inF);                                                    \
+                                                                               \
+  /* stage 4 */                                                                \
+  STEP16_1(out0, out1, in1, in0, cospi16_v);                                   \
+  STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v);                          \
+  in4 = vec_add(out4, out5);                                                   \
+  in5 = vec_sub(out4, out5);                                                   \
+  in6 = vec_sub(out7, out6);                                                   \
+  in7 = vec_add(out6, out7);                                                   \
+                                                                               \
+  in8 = out8;                                                                  \
+  inF = outF;                                                                  \
+  tmp16_0 = vec_mergeh(out9, outE);                                            \
+  tmp16_1 = vec_mergel(out9, outE);                                            \
+  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
+  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                               \
+  DCT_CONST_ROUND_SHIFT(temp11);                                               \
+  in9 = vec_packs(temp10, temp11);                                             \
+  temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
+  temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                               \
+  DCT_CONST_ROUND_SHIFT(temp11);                                               \
+  inE = vec_packs(temp10, temp11);                                             \
+                                                                               \
+  tmp16_0 = vec_mergeh(outA, outD);                                            \
+  tmp16_1 = vec_mergel(outA, outD);                                            \
+  temp10 =                                                                     \
+      vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v));     \
+  temp11 =                                                                     \
+      vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v));     \
+  DCT_CONST_ROUND_SHIFT(temp10);                                               \
+  DCT_CONST_ROUND_SHIFT(temp11);                                               \
+  inA = vec_packs(temp10, temp11);                                             \
+  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
+  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                               \
+  DCT_CONST_ROUND_SHIFT(temp11);                                               \
+  inD = vec_packs(temp10, temp11);                                             \
+                                                                               \
+  inB = outB;                                                                  \
+  inC = outC;                                                                  \
+                                                                               \
+  /* stage 5 */                                                                \
+  out0 = vec_add(in0, in3);                                                    \
+  out1 = vec_add(in1, in2);                                                    \
+  out2 = vec_sub(in1, in2);                                                    \
+  out3 = vec_sub(in0, in3);                                                    \
+  out4 = in4;                                                                  \
+  STEP16_1(in6, in5, out5, out6, cospi16_v);                                   \
+  out7 = in7;                                                                  \
+                                                                               \
+  out8 = vec_add(in8, inB);                                                    \
+  out9 = vec_add(in9, inA);                                                    \
+  outA = vec_sub(in9, inA);                                                    \
+  outB = vec_sub(in8, inB);                                                    \
+  outC = vec_sub(inF, inC);                                                    \
+  outD = vec_sub(inE, inD);                                                    \
+  outE = vec_add(inD, inE);                                                    \
+  outF = vec_add(inC, inF);                                                    \
+                                                                               \
+  /* stage 6 */                                                                \
+  in0 = vec_add(out0, out7);                                                   \
+  in1 = vec_add(out1, out6);                                                   \
+  in2 = vec_add(out2, out5);                                                   \
+  in3 = vec_add(out3, out4);                                                   \
+  in4 = vec_sub(out3, out4);                                                   \
+  in5 = vec_sub(out2, out5);                                                   \
+  in6 = vec_sub(out1, out6);                                                   \
+  in7 = vec_sub(out0, out7);                                                   \
+  in8 = out8;                                                                  \
+  in9 = out9;                                                                  \
+  STEP16_1(outD, outA, inA, inD, cospi16_v);                                   \
+  STEP16_1(outC, outB, inB, inC, cospi16_v);                                   \
+  inE = outE;                                                                  \
+  inF = outF;                                                                  \
+                                                                               \
+  /* stage 7 */                                                                \
+  out0 = vec_add(in0, inF);                                                    \
+  out1 = vec_add(in1, inE);                                                    \
+  out2 = vec_add(in2, inD);                                                    \
+  out3 = vec_add(in3, inC);                                                    \
+  out4 = vec_add(in4, inB);                                                    \
+  out5 = vec_add(in5, inA);                                                    \
+  out6 = vec_add(in6, in9);                                                    \
+  out7 = vec_add(in7, in8);                                                    \
+  out8 = vec_sub(in7, in8);                                                    \
+  out9 = vec_sub(in6, in9);                                                    \
+  outA = vec_sub(in5, inA);                                                    \
+  outB = vec_sub(in4, inB);                                                    \
+  outC = vec_sub(in3, inC);                                                    \
+  outD = vec_sub(in2, inD);                                                    \
+  outE = vec_sub(in1, inE);                                                    \
+  outF = vec_sub(in0, inF);
+
+#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
+  d_uh = (int16x8_t)vec_mergeh(dst, zerov);      \
+  d_ul = (int16x8_t)vec_mergel(dst, zerov);      \
+  PIXEL_ADD(in0, d_uh, add, shift6);             \
+  PIXEL_ADD(in1, d_ul, add, shift6);             \
+  vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);
+
+static void half_idct16x8_vsx(int16x8_t *src) {
+  int16x8_t tmp0[8], tmp1[8];
+  int32x4_t temp10, temp11, temp20, temp21, temp30;
+  int16x8_t tmp16_0, tmp16_1;
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12],
+               src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+               tmp0[6], tmp0[7]);
+  TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13],
+               src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+               tmp1[6], tmp1[7]);
+  IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
+         tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
+         src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14],
+         src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]);
+}
+
+void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) {
+  int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8];
+  int32x4_t temp10, temp11, temp20, temp21, temp30;
+  int16x8_t tmp16_0, tmp16_1;
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+               src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+               tmp0[6], tmp0[7]);
+  TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+               src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+               tmp1[6], tmp1[7]);
+  TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
+               src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5],
+               tmp2[6], tmp2[7]);
+  TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
+               src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5],
+               tmp3[6], tmp3[7]);
+
+  IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
+         tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
+         src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+         src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10],
+         src1[12], src1[14]);
+
+  IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7],
+         tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7],
+         src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+         src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11],
+         src1[13], src1[15]);
+}
+
+void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest,
+                              int stride) {
+  uint8x16_t destv[16];
+  int16x8_t d_uh, d_ul;
+  uint8x16_t zerov = vec_splat_u8(0);
+  uint16x8_t shift6 = vec_splat_u16(6);
+  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
+
+  // load dest
+  LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv);
+
+  PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0);
+  PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride);
+  PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride);
+  PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride);
+  PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride);
+  PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride);
+  PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride);
+  PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride);
+
+  PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride);
+  PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride);
+  PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride);
+  PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride);
+  PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride);
+  PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride);
+  PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride);
+  PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride);
+}
+void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  int16x8_t src0[16], src1[16];
+  int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8];
+  int32x4_t temp10, temp11, temp20, temp21, temp30;
+  int16x8_t tmp16_0, tmp16_1;
+  ROUND_SHIFT_INIT;
+
+  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0);
+  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
+               8 * sizeof(*input), src1);
+
+  // transform rows
+  // transform the upper half of 16x16 matrix
+  half_idct16x8_vsx(src0);
+  TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+               src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+               tmp0[6], tmp0[7]);
+  TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+               src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+               tmp1[6], tmp1[7]);
+
+  // transform the lower half of 16x16 matrix
+  half_idct16x8_vsx(src1);
+  TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
+               src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5],
+               tmp2[6], tmp2[7]);
+  TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
+               src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5],
+               tmp3[6], tmp3[7]);
+
+  // transform columns
+  // left half first
+  IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7],
+         tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7],
+         src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+         src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10],
+         src1[12], src1[14]);
+  // right half
+  IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7],
+         tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7],
+         src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+         src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11],
+         src1[13], src1[15]);
+
+  vpx_round_store16x16_vsx(src0, src1, dest, stride);
+}
+
+#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
+                  in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
+                  in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
+                  in71, in72, in73, offset)                                   \
+  /* load the first row from the 8x32 block*/                                 \
+  in00 = load(offset, input);                                                 \
+  in01 = load(offset + 16, input);                                            \
+  in02 = load(offset + 2 * 16, input);                                        \
+  in03 = load(offset + 3 * 16, input);                                        \
+                                                                              \
+  in10 = load(offset + 4 * 16, input);                                        \
+  in11 = load(offset + 5 * 16, input);                                        \
+  in12 = load(offset + 6 * 16, input);                                        \
+  in13 = load(offset + 7 * 16, input);                                        \
+                                                                              \
+  in20 = load(offset + 8 * 16, input);                                        \
+  in21 = load(offset + 9 * 16, input);                                        \
+  in22 = load(offset + 10 * 16, input);                                       \
+  in23 = load(offset + 11 * 16, input);                                       \
+                                                                              \
+  in30 = load(offset + 12 * 16, input);                                       \
+  in31 = load(offset + 13 * 16, input);                                       \
+  in32 = load(offset + 14 * 16, input);                                       \
+  in33 = load(offset + 15 * 16, input);                                       \
+                                                                              \
+  in40 = load(offset + 16 * 16, input);                                       \
+  in41 = load(offset + 17 * 16, input);                                       \
+  in42 = load(offset + 18 * 16, input);                                       \
+  in43 = load(offset + 19 * 16, input);                                       \
+                                                                              \
+  in50 = load(offset + 20 * 16, input);                                       \
+  in51 = load(offset + 21 * 16, input);                                       \
+  in52 = load(offset + 22 * 16, input);                                       \
+  in53 = load(offset + 23 * 16, input);                                       \
+                                                                              \
+  in60 = load(offset + 24 * 16, input);                                       \
+  in61 = load(offset + 25 * 16, input);                                       \
+  in62 = load(offset + 26 * 16, input);                                       \
+  in63 = load(offset + 27 * 16, input);                                       \
+                                                                              \
+  /* load the last row from the 8x32 block*/                                  \
+  in70 = load(offset + 28 * 16, input);                                       \
+  in71 = load(offset + 29 * 16, input);                                       \
+  in72 = load(offset + 30 * 16, input);                                       \
+  in73 = load(offset + 31 * 16, input);
+
+/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z
+ *          temp2 = step[x] * cospi_z + step[y] * cospi_q */
+#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)              \
+  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
+  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
+  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \
+  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                          \
+  DCT_CONST_ROUND_SHIFT(temp11);                                          \
+  outpt0 = vec_packs(temp10, temp11);                                     \
+  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
+  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                          \
+  DCT_CONST_ROUND_SHIFT(temp11);                                          \
+  outpt1 = vec_packs(temp10, temp11);
+
+/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z
+ *          temp2 = -step[x] * cospi_z + step[y] * cospi_q */
+#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m)    \
+  tmp16_0 = vec_mergeh(inpt0, inpt1);                                      \
+  tmp16_1 = vec_mergel(inpt0, inpt1);                                      \
+  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \
+  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \
+  DCT_CONST_ROUND_SHIFT(temp10);                                           \
+  DCT_CONST_ROUND_SHIFT(temp11);                                           \
+  outpt0 = vec_packs(temp10, temp11);                                      \
+  temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1));  \
+  temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1));  \
+  DCT_CONST_ROUND_SHIFT(temp10);                                           \
+  DCT_CONST_ROUND_SHIFT(temp11);                                           \
+  outpt1 = vec_packs(temp10, temp11);
+
+#define IDCT32(in0, in1, in2, in3, out)                                \
+                                                                       \
+  /* stage 1 */                                                        \
+  /* out[0][0] = in[0][0]; */                                          \
+  out[0][1] = in2[0];                                                  \
+  out[0][2] = in1[0];                                                  \
+  out[0][3] = in3[0];                                                  \
+  out[0][4] = in0[4];                                                  \
+  out[0][5] = in2[4];                                                  \
+  out[0][6] = in1[4];                                                  \
+  out[0][7] = in3[4];                                                  \
+  out[1][0] = in0[2];                                                  \
+  out[1][1] = in2[2];                                                  \
+  out[1][2] = in1[2];                                                  \
+  out[1][3] = in3[2];                                                  \
+  out[1][4] = in0[6];                                                  \
+  out[1][5] = in2[6];                                                  \
+  out[1][6] = in1[6];                                                  \
+  out[1][7] = in3[6];                                                  \
+                                                                       \
+  STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v);  \
+  STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \
+  STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v);  \
+  STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v);  \
+  STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v);  \
+  STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \
+  STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \
+  STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v);  \
+                                                                       \
+  /* stage 2 */                                                        \
+  /* in0[0] = out[0][0]; */                                            \
+  in0[1] = out[0][1];                                                  \
+  in0[2] = out[0][2];                                                  \
+  in0[3] = out[0][3];                                                  \
+  in0[4] = out[0][4];                                                  \
+  in0[5] = out[0][5];                                                  \
+  in0[6] = out[0][6];                                                  \
+  in0[7] = out[0][7];                                                  \
+                                                                       \
+  STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v);  \
+  STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \
+  STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \
+  STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v);  \
+                                                                       \
+  in2[0] = vec_add(out[2][0], out[2][1]);                              \
+  in2[1] = vec_sub(out[2][0], out[2][1]);                              \
+  in2[2] = vec_sub(out[2][3], out[2][2]);                              \
+  in2[3] = vec_add(out[2][3], out[2][2]);                              \
+  in2[4] = vec_add(out[2][4], out[2][5]);                              \
+  in2[5] = vec_sub(out[2][4], out[2][5]);                              \
+  in2[6] = vec_sub(out[2][7], out[2][6]);                              \
+  in2[7] = vec_add(out[2][7], out[2][6]);                              \
+  in3[0] = vec_add(out[3][0], out[3][1]);                              \
+  in3[1] = vec_sub(out[3][0], out[3][1]);                              \
+  in3[2] = vec_sub(out[3][3], out[3][2]);                              \
+  in3[3] = vec_add(out[3][3], out[3][2]);                              \
+  in3[4] = vec_add(out[3][4], out[3][5]);                              \
+  in3[5] = vec_sub(out[3][4], out[3][5]);                              \
+  in3[6] = vec_sub(out[3][7], out[3][6]);                              \
+  in3[7] = vec_add(out[3][6], out[3][7]);                              \
+                                                                       \
+  /* stage 3 */                                                        \
+  out[0][0] = in0[0];                                                  \
+  out[0][1] = in0[1];                                                  \
+  out[0][2] = in0[2];                                                  \
+  out[0][3] = in0[3];                                                  \
+                                                                       \
+  STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v);  \
+  STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \
+                                                                       \
+  out[1][0] = vec_add(in1[0], in1[1]);                                 \
+  out[1][1] = vec_sub(in1[0], in1[1]);                                 \
+  out[1][2] = vec_sub(in1[3], in1[2]);                                 \
+  out[1][3] = vec_add(in1[2], in1[3]);                                 \
+  out[1][4] = vec_add(in1[4], in1[5]);                                 \
+  out[1][5] = vec_sub(in1[4], in1[5]);                                 \
+  out[1][6] = vec_sub(in1[7], in1[6]);                                 \
+  out[1][7] = vec_add(in1[6], in1[7]);                                 \
+                                                                       \
+  out[2][0] = in2[0];                                                  \
+  out[3][7] = in3[7];                                                  \
+  STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v);   \
+  STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v,  \
+           cospi4m_v);                                                 \
+  out[2][3] = in2[3];                                                  \
+  out[2][4] = in2[4];                                                  \
+  STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v);  \
+  STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \
+           cospi20m_v);                                                \
+  out[2][7] = in2[7];                                                  \
+  out[3][0] = in3[0];                                                  \
+  out[3][3] = in3[3];                                                  \
+  out[3][4] = in3[4];                                                  \
+                                                                       \
+  /* stage 4 */                                                        \
+  STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v);           \
+  STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v);  \
+  in0[4] = vec_add(out[0][4], out[0][5]);                              \
+  in0[5] = vec_sub(out[0][4], out[0][5]);                              \
+  in0[6] = vec_sub(out[0][7], out[0][6]);                              \
+  in0[7] = vec_add(out[0][7], out[0][6]);                              \
+                                                                       \
+  in1[0] = out[1][0];                                                  \
+  in1[7] = out[1][7];                                                  \
+  STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v);   \
+  STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v,  \
+           cospi8m_v);                                                 \
+  in1[3] = out[1][3];                                                  \
+  in1[4] = out[1][4];                                                  \
+                                                                       \
+  in2[0] = vec_add(out[2][0], out[2][3]);                              \
+  in2[1] = vec_add(out[2][1], out[2][2]);                              \
+  in2[2] = vec_sub(out[2][1], out[2][2]);                              \
+  in2[3] = vec_sub(out[2][0], out[2][3]);                              \
+  in2[4] = vec_sub(out[2][7], out[2][4]);                              \
+  in2[5] = vec_sub(out[2][6], out[2][5]);                              \
+  in2[6] = vec_add(out[2][5], out[2][6]);                              \
+  in2[7] = vec_add(out[2][4], out[2][7]);                              \
+                                                                       \
+  in3[0] = vec_add(out[3][0], out[3][3]);                              \
+  in3[1] = vec_add(out[3][1], out[3][2]);                              \
+  in3[2] = vec_sub(out[3][1], out[3][2]);                              \
+  in3[3] = vec_sub(out[3][0], out[3][3]);                              \
+  in3[4] = vec_sub(out[3][7], out[3][4]);                              \
+  in3[5] = vec_sub(out[3][6], out[3][5]);                              \
+  in3[6] = vec_add(out[3][5], out[3][6]);                              \
+  in3[7] = vec_add(out[3][4], out[3][7]);                              \
+                                                                       \
+  /* stage 5 */                                                        \
+  out[0][0] = vec_add(in0[0], in0[3]);                                 \
+  out[0][1] = vec_add(in0[1], in0[2]);                                 \
+  out[0][2] = vec_sub(in0[1], in0[2]);                                 \
+  out[0][3] = vec_sub(in0[0], in0[3]);                                 \
+  out[0][4] = in0[4];                                                  \
+  STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v);           \
+  out[0][7] = in0[7];                                                  \
+                                                                       \
+  out[1][0] = vec_add(in1[0], in1[3]);                                 \
+  out[1][1] = vec_add(in1[1], in1[2]);                                 \
+  out[1][2] = vec_sub(in1[1], in1[2]);                                 \
+  out[1][3] = vec_sub(in1[0], in1[3]);                                 \
+  out[1][4] = vec_sub(in1[7], in1[4]);                                 \
+  out[1][5] = vec_sub(in1[6], in1[5]);                                 \
+  out[1][6] = vec_add(in1[5], in1[6]);                                 \
+  out[1][7] = vec_add(in1[4], in1[7]);                                 \
+                                                                       \
+  out[2][0] = in2[0];                                                  \
+  out[2][1] = in2[1];                                                  \
+  STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v);   \
+  STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v);   \
+  STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v,  \
+           cospi8m_v);                                                 \
+  STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v,  \
+           cospi8m_v);                                                 \
+  out[2][6] = in2[6];                                                  \
+  out[2][7] = in2[7];                                                  \
+  out[3][0] = in3[0];                                                  \
+  out[3][1] = in3[1];                                                  \
+  out[3][6] = in3[6];                                                  \
+  out[3][7] = in3[7];                                                  \
+                                                                       \
+  /* stage 6 */                                                        \
+  in0[0] = vec_add(out[0][0], out[0][7]);                              \
+  in0[1] = vec_add(out[0][1], out[0][6]);                              \
+  in0[2] = vec_add(out[0][2], out[0][5]);                              \
+  in0[3] = vec_add(out[0][3], out[0][4]);                              \
+  in0[4] = vec_sub(out[0][3], out[0][4]);                              \
+  in0[5] = vec_sub(out[0][2], out[0][5]);                              \
+  in0[6] = vec_sub(out[0][1], out[0][6]);                              \
+  in0[7] = vec_sub(out[0][0], out[0][7]);                              \
+  in1[0] = out[1][0];                                                  \
+  in1[1] = out[1][1];                                                  \
+  STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v);           \
+  STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v);           \
+  in1[6] = out[1][6];                                                  \
+  in1[7] = out[1][7];                                                  \
+                                                                       \
+  in2[0] = vec_add(out[2][0], out[2][7]);                              \
+  in2[1] = vec_add(out[2][1], out[2][6]);                              \
+  in2[2] = vec_add(out[2][2], out[2][5]);                              \
+  in2[3] = vec_add(out[2][3], out[2][4]);                              \
+  in2[4] = vec_sub(out[2][3], out[2][4]);                              \
+  in2[5] = vec_sub(out[2][2], out[2][5]);                              \
+  in2[6] = vec_sub(out[2][1], out[2][6]);                              \
+  in2[7] = vec_sub(out[2][0], out[2][7]);                              \
+                                                                       \
+  in3[0] = vec_sub(out[3][7], out[3][0]);                              \
+  in3[1] = vec_sub(out[3][6], out[3][1]);                              \
+  in3[2] = vec_sub(out[3][5], out[3][2]);                              \
+  in3[3] = vec_sub(out[3][4], out[3][3]);                              \
+  in3[4] = vec_add(out[3][4], out[3][3]);                              \
+  in3[5] = vec_add(out[3][5], out[3][2]);                              \
+  in3[6] = vec_add(out[3][6], out[3][1]);                              \
+  in3[7] = vec_add(out[3][7], out[3][0]);                              \
+                                                                       \
+  /* stage 7 */                                                        \
+  out[0][0] = vec_add(in0[0], in1[7]);                                 \
+  out[0][1] = vec_add(in0[1], in1[6]);                                 \
+  out[0][2] = vec_add(in0[2], in1[5]);                                 \
+  out[0][3] = vec_add(in0[3], in1[4]);                                 \
+  out[0][4] = vec_add(in0[4], in1[3]);                                 \
+  out[0][5] = vec_add(in0[5], in1[2]);                                 \
+  out[0][6] = vec_add(in0[6], in1[1]);                                 \
+  out[0][7] = vec_add(in0[7], in1[0]);                                 \
+  out[1][0] = vec_sub(in0[7], in1[0]);                                 \
+  out[1][1] = vec_sub(in0[6], in1[1]);                                 \
+  out[1][2] = vec_sub(in0[5], in1[2]);                                 \
+  out[1][3] = vec_sub(in0[4], in1[3]);                                 \
+  out[1][4] = vec_sub(in0[3], in1[4]);                                 \
+  out[1][5] = vec_sub(in0[2], in1[5]);                                 \
+  out[1][6] = vec_sub(in0[1], in1[6]);                                 \
+  out[1][7] = vec_sub(in0[0], in1[7]);                                 \
+                                                                       \
+  out[2][0] = in2[0];                                                  \
+  out[2][1] = in2[1];                                                  \
+  out[2][2] = in2[2];                                                  \
+  out[2][3] = in2[3];                                                  \
+  STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v);           \
+  STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v);           \
+  STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v);           \
+  STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v);           \
+  out[3][4] = in3[4];                                                  \
+  out[3][5] = in3[5];                                                  \
+  out[3][6] = in3[6];                                                  \
+  out[3][7] = in3[7];                                                  \
+                                                                       \
+  /* final */                                                          \
+  in0[0] = vec_add(out[0][0], out[3][7]);                              \
+  in0[1] = vec_add(out[0][1], out[3][6]);                              \
+  in0[2] = vec_add(out[0][2], out[3][5]);                              \
+  in0[3] = vec_add(out[0][3], out[3][4]);                              \
+  in0[4] = vec_add(out[0][4], out[3][3]);                              \
+  in0[5] = vec_add(out[0][5], out[3][2]);                              \
+  in0[6] = vec_add(out[0][6], out[3][1]);                              \
+  in0[7] = vec_add(out[0][7], out[3][0]);                              \
+  in1[0] = vec_add(out[1][0], out[2][7]);                              \
+  in1[1] = vec_add(out[1][1], out[2][6]);                              \
+  in1[2] = vec_add(out[1][2], out[2][5]);                              \
+  in1[3] = vec_add(out[1][3], out[2][4]);                              \
+  in1[4] = vec_add(out[1][4], out[2][3]);                              \
+  in1[5] = vec_add(out[1][5], out[2][2]);                              \
+  in1[6] = vec_add(out[1][6], out[2][1]);                              \
+  in1[7] = vec_add(out[1][7], out[2][0]);                              \
+  in2[0] = vec_sub(out[1][7], out[2][0]);                              \
+  in2[1] = vec_sub(out[1][6], out[2][1]);                              \
+  in2[2] = vec_sub(out[1][5], out[2][2]);                              \
+  in2[3] = vec_sub(out[1][4], out[2][3]);                              \
+  in2[4] = vec_sub(out[1][3], out[2][4]);                              \
+  in2[5] = vec_sub(out[1][2], out[2][5]);                              \
+  in2[6] = vec_sub(out[1][1], out[2][6]);                              \
+  in2[7] = vec_sub(out[1][0], out[2][7]);                              \
+  in3[0] = vec_sub(out[0][7], out[3][0]);                              \
+  in3[1] = vec_sub(out[0][6], out[3][1]);                              \
+  in3[2] = vec_sub(out[0][5], out[3][2]);                              \
+  in3[3] = vec_sub(out[0][4], out[3][3]);                              \
+  in3[4] = vec_sub(out[0][3], out[3][4]);                              \
+  in3[5] = vec_sub(out[0][2], out[3][5]);                              \
+  in3[6] = vec_sub(out[0][1], out[3][6]);                              \
+  in3[7] = vec_sub(out[0][0], out[3][7]);
+
+// NOT A FULL TRANSPOSE! Transposes just each 8x8 block in each row,
+// does not transpose rows
+#define TRANSPOSE_8x32(in, out)                                                \
+  /* transpose 4 of 8x8 blocks */                                              \
+  TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5],     \
+               in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \
+               out[0][4], out[0][5], out[0][6], out[0][7]);                    \
+  TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5],     \
+               in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \
+               out[1][4], out[1][5], out[1][6], out[1][7]);                    \
+  TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5],     \
+               in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \
+               out[2][4], out[2][5], out[2][6], out[2][7]);                    \
+  TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5],     \
+               in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \
+               out[3][4], out[3][5], out[3][6], out[3][7]);
+
+#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step)          \
+  dst = vec_vsx_ld((step) * stride, dest);                   \
+  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                  \
+  d_ul = (int16x8_t)vec_mergel(dst, zerov);                  \
+  PIXEL_ADD(in0, d_uh, add, shift6);                         \
+  PIXEL_ADD(in1, d_ul, add, shift6);                         \
+  vec_vsx_st(vec_packsu(d_uh, d_ul), (step) * stride, dest); \
+  dst = vec_vsx_ld((step) * stride + 16, dest);              \
+  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                  \
+  d_ul = (int16x8_t)vec_mergel(dst, zerov);                  \
+  PIXEL_ADD(in2, d_uh, add, shift6);                         \
+  PIXEL_ADD(in3, d_ul, add, shift6);                         \
+  vec_vsx_st(vec_packsu(d_uh, d_ul), (step) * stride + 16, dest);
+
+#define ADD_STORE_BLOCK(in, offset)                                        \
+  PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], (offset) + 0); \
+  PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], (offset) + 1); \
+  PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], (offset) + 2); \
+  PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], (offset) + 3); \
+  PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], (offset) + 4); \
+  PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], (offset) + 5); \
+  PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], (offset) + 6); \
+  PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], (offset) + 7);
+
+void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
+  int16x8_t tmp16_0, tmp16_1;
+  int32x4_t temp10, temp11, temp20, temp21, temp30;
+  uint8x16_t dst;
+  int16x8_t d_uh, d_ul;
+  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
+  uint16x8_t shift6 = vec_splat_u16(6);
+  uint8x16_t zerov = vec_splat_u8(0);
+
+  ROUND_SHIFT_INIT;
+
+  LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
+            src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
+            src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
+            src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
+            src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
+            src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
+            src0[1][7], src0[2][7], src0[3][7], 0);
+  // Rows
+  // transpose the first row of 8x8 blocks
+  TRANSPOSE_8x32(src0, tmp);
+  // transform the 32x8 column
+  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
+  TRANSPOSE_8x32(tmp, src0);
+
+  LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
+            src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
+            src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
+            src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
+            src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
+            src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
+            src1[1][7], src1[2][7], src1[3][7], 512);
+  TRANSPOSE_8x32(src1, tmp);
+  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
+  TRANSPOSE_8x32(tmp, src1);
+
+  LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
+            src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
+            src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
+            src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
+            src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
+            src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
+            src2[1][7], src2[2][7], src2[3][7], 1024);
+  TRANSPOSE_8x32(src2, tmp);
+  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
+  TRANSPOSE_8x32(tmp, src2);
+
+  LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
+            src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
+            src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
+            src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
+            src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
+            src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
+            src3[1][7], src3[2][7], src3[3][7], 1536);
+  TRANSPOSE_8x32(src3, tmp);
+  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
+  TRANSPOSE_8x32(tmp, src3);
+
+  // Columns
+  IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
+  IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
+  IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
+  IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);
+
+  ADD_STORE_BLOCK(src0, 0);
+  ADD_STORE_BLOCK(src1, 8);
+  ADD_STORE_BLOCK(src2, 16);
+  ADD_STORE_BLOCK(src3, 24);
+}
+
+#define TRANSFORM_COLS           \
+  v32_a = vec_add(v32_a, v32_c); \
+  v32_d = vec_sub(v32_d, v32_b); \
+  v32_e = vec_sub(v32_a, v32_d); \
+  v32_e = vec_sra(v32_e, one);   \
+  v32_b = vec_sub(v32_e, v32_b); \
+  v32_c = vec_sub(v32_e, v32_c); \
+  v32_a = vec_sub(v32_a, v32_b); \
+  v32_d = vec_add(v32_d, v32_c); \
+  v_a = vec_packs(v32_a, v32_b); \
+  v_c = vec_packs(v32_c, v32_d);
+
+#define TRANSPOSE_WHT             \
+  tmp_a = vec_mergeh(v_a, v_c);   \
+  tmp_c = vec_mergel(v_a, v_c);   \
+  v_a = vec_mergeh(tmp_a, tmp_c); \
+  v_c = vec_mergel(tmp_a, tmp_c);
+
+void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  int16x8_t v_a = load_tran_low(0, input);
+  int16x8_t v_c = load_tran_low(8 * sizeof(*input), input);
+  int16x8_t tmp_a, tmp_c;
+  uint16x8_t two = vec_splat_u16(2);
+  uint32x4_t one = vec_splat_u32(1);
+  int16x8_t tmp16_0, tmp16_1;
+  int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e;
+  uint8x16_t dest0 = vec_vsx_ld(0, dest);
+  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+  int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0);
+  int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1);
+  int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2);
+  int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3);
+  uint8x16_t output_v;
+  uint8_t tmp_dest[16];
+  int i, j;
+
+  v_a = vec_sra(v_a, two);
+  v_c = vec_sra(v_c, two);
+
+  TRANSPOSE_WHT;
+
+  v32_a = vec_unpackh(v_a);
+  v32_c = vec_unpackl(v_a);
+
+  v32_d = vec_unpackh(v_c);
+  v32_b = vec_unpackl(v_c);
+
+  TRANSFORM_COLS;
+
+  TRANSPOSE_WHT;
+
+  v32_a = vec_unpackh(v_a);
+  v32_c = vec_unpackl(v_a);
+  v32_d = vec_unpackh(v_c);
+  v32_b = vec_unpackl(v_c);
+
+  TRANSFORM_COLS;
+
+  PACK_STORE(v_a, v_c);
+}
+
+void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) {
+  int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v;
+  int32x4_t v_v[5], u_v[4];
+  int32x4_t zerov = vec_splat_s32(0);
+  int16x8_t tmp0, tmp1;
+  int16x8_t zero16v = vec_splat_s16(0);
+  uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1));
+  ROUND_SHIFT_INIT;
+
+  sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v);
+  sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v);
+  sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v);
+  sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v);
+  sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v),
+                             vec_sub(zero16v, sinpi_3_9_v));
+
+  tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]);
+  tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]);
+  in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1);
+  in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1);
+
+  v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov);
+  v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov);
+  v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov);
+  v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov);
+  v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov);
+
+  in[0] = vec_sub(in[0], in[1]);
+  in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16);
+  in[0] = vec_add(in[0], in[1]);
+  in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16);
+
+  u_v[0] = vec_add(v_v[0], v_v[1]);
+  u_v[1] = vec_sub(v_v[2], v_v[3]);
+  u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov);
+  u_v[3] = vec_sub(v_v[1], v_v[3]);
+  u_v[3] = vec_add(u_v[3], v_v[4]);
+
+  DCT_CONST_ROUND_SHIFT(u_v[0]);
+  DCT_CONST_ROUND_SHIFT(u_v[1]);
+  DCT_CONST_ROUND_SHIFT(u_v[2]);
+  DCT_CONST_ROUND_SHIFT(u_v[3]);
+
+  out[0] = vec_packs(u_v[0], u_v[1]);
+  out[1] = vec_packs(u_v[2], u_v[3]);
+}
+
+#define MSUM_ROUND_SHIFT(a, b, cospi) \
+  b = vec_msums(a, cospi, zerov);     \
+  DCT_CONST_ROUND_SHIFT(b);
+
+#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \
+  MSUM_ROUND_SHIFT(in0, tmp0, cospi);                   \
+  MSUM_ROUND_SHIFT(in1, tmp1, cospi);                   \
+  out = vec_packs(tmp0, tmp1);
+
+void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) {
+  int32x4_t tmp0[16], tmp1[16];
+
+  int32x4_t zerov = vec_splat_s32(0);
+  int16x8_t zero16v = vec_splat_s16(0);
+  int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v);
+  int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v);
+  int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v);
+  int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v);
+  int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v);
+  int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v);
+  int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v);
+  int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v);
+  int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v);
+  int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v);
+  int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v);
+  int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v);
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
+               out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  in[0] = vec_mergeh(out[7], out[0]);
+  in[1] = vec_mergel(out[7], out[0]);
+  in[2] = vec_mergeh(out[5], out[2]);
+  in[3] = vec_mergel(out[5], out[2]);
+  in[4] = vec_mergeh(out[3], out[4]);
+  in[5] = vec_mergel(out[3], out[4]);
+  in[6] = vec_mergeh(out[1], out[6]);
+  in[7] = vec_mergel(out[1], out[6]);
+
+  tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov);
+  tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov);
+  tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov);
+  tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov);
+  tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov);
+  tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov);
+  tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov);
+  tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov);
+  tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov);
+  tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov);
+  tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov);
+  tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov);
+  tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov);
+  tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov);
+  tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov);
+  tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov);
+
+  tmp0[0] = vec_add(tmp1[0], tmp1[8]);
+  tmp0[1] = vec_add(tmp1[1], tmp1[9]);
+  tmp0[2] = vec_add(tmp1[2], tmp1[10]);
+  tmp0[3] = vec_add(tmp1[3], tmp1[11]);
+  tmp0[4] = vec_add(tmp1[4], tmp1[12]);
+  tmp0[5] = vec_add(tmp1[5], tmp1[13]);
+  tmp0[6] = vec_add(tmp1[6], tmp1[14]);
+  tmp0[7] = vec_add(tmp1[7], tmp1[15]);
+  tmp0[8] = vec_sub(tmp1[0], tmp1[8]);
+  tmp0[9] = vec_sub(tmp1[1], tmp1[9]);
+  tmp0[10] = vec_sub(tmp1[2], tmp1[10]);
+  tmp0[11] = vec_sub(tmp1[3], tmp1[11]);
+  tmp0[12] = vec_sub(tmp1[4], tmp1[12]);
+  tmp0[13] = vec_sub(tmp1[5], tmp1[13]);
+  tmp0[14] = vec_sub(tmp1[6], tmp1[14]);
+  tmp0[15] = vec_sub(tmp1[7], tmp1[15]);
+
+  // shift and rounding
+  DCT_CONST_ROUND_SHIFT(tmp0[0]);
+  DCT_CONST_ROUND_SHIFT(tmp0[1]);
+  DCT_CONST_ROUND_SHIFT(tmp0[2]);
+  DCT_CONST_ROUND_SHIFT(tmp0[3]);
+  DCT_CONST_ROUND_SHIFT(tmp0[4]);
+  DCT_CONST_ROUND_SHIFT(tmp0[5]);
+  DCT_CONST_ROUND_SHIFT(tmp0[6]);
+  DCT_CONST_ROUND_SHIFT(tmp0[7]);
+  DCT_CONST_ROUND_SHIFT(tmp0[8]);
+  DCT_CONST_ROUND_SHIFT(tmp0[9]);
+  DCT_CONST_ROUND_SHIFT(tmp0[10]);
+  DCT_CONST_ROUND_SHIFT(tmp0[11]);
+  DCT_CONST_ROUND_SHIFT(tmp0[12]);
+  DCT_CONST_ROUND_SHIFT(tmp0[13]);
+  DCT_CONST_ROUND_SHIFT(tmp0[14]);
+  DCT_CONST_ROUND_SHIFT(tmp0[15]);
+
+  // back to 16-bit
+  out[0] = vec_packs(tmp0[0], tmp0[1]);
+  out[1] = vec_packs(tmp0[2], tmp0[3]);
+  out[2] = vec_packs(tmp0[4], tmp0[5]);
+  out[3] = vec_packs(tmp0[6], tmp0[7]);
+  out[4] = vec_packs(tmp0[8], tmp0[9]);
+  out[5] = vec_packs(tmp0[10], tmp0[11]);
+  out[6] = vec_packs(tmp0[12], tmp0[13]);
+  out[7] = vec_packs(tmp0[14], tmp0[15]);
+
+  // stage 2
+  in[0] = vec_add(out[0], out[2]);
+  in[1] = vec_add(out[1], out[3]);
+  in[2] = vec_sub(out[0], out[2]);
+  in[3] = vec_sub(out[1], out[3]);
+  in[4] = vec_mergeh(out[4], out[5]);
+  in[5] = vec_mergel(out[4], out[5]);
+  in[6] = vec_mergeh(out[6], out[7]);
+  in[7] = vec_mergel(out[6], out[7]);
+
+  tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov);
+  tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov);
+  tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov);
+  tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov);
+  tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov);
+  tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov);
+  tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov);
+  tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov);
+
+  tmp0[0] = vec_add(tmp1[0], tmp1[4]);
+  tmp0[1] = vec_add(tmp1[1], tmp1[5]);
+  tmp0[2] = vec_add(tmp1[2], tmp1[6]);
+  tmp0[3] = vec_add(tmp1[3], tmp1[7]);
+  tmp0[4] = vec_sub(tmp1[0], tmp1[4]);
+  tmp0[5] = vec_sub(tmp1[1], tmp1[5]);
+  tmp0[6] = vec_sub(tmp1[2], tmp1[6]);
+  tmp0[7] = vec_sub(tmp1[3], tmp1[7]);
+
+  DCT_CONST_ROUND_SHIFT(tmp0[0]);
+  DCT_CONST_ROUND_SHIFT(tmp0[1]);
+  DCT_CONST_ROUND_SHIFT(tmp0[2]);
+  DCT_CONST_ROUND_SHIFT(tmp0[3]);
+  DCT_CONST_ROUND_SHIFT(tmp0[4]);
+  DCT_CONST_ROUND_SHIFT(tmp0[5]);
+  DCT_CONST_ROUND_SHIFT(tmp0[6]);
+  DCT_CONST_ROUND_SHIFT(tmp0[7]);
+
+  in[4] = vec_packs(tmp0[0], tmp0[1]);
+  in[5] = vec_packs(tmp0[2], tmp0[3]);
+  in[6] = vec_packs(tmp0[4], tmp0[5]);
+  in[7] = vec_packs(tmp0[6], tmp0[7]);
+
+  // stage 3
+  out[0] = vec_mergeh(in[2], in[3]);
+  out[1] = vec_mergel(in[2], in[3]);
+  out[2] = vec_mergeh(in[6], in[7]);
+  out[3] = vec_mergel(in[6], in[7]);
+
+  IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v);
+  IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v);
+  IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v);
+  IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v);
+
+  out[0] = in[0];
+  out[2] = in[6];
+  out[4] = in[3];
+  out[6] = in[5];
+
+  out[1] = vec_sub(zero16v, in[4]);
+  out[3] = vec_sub(zero16v, in[2]);
+  out[5] = vec_sub(zero16v, in[7]);
+  out[7] = vec_sub(zero16v, in[1]);
+}
+
+static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) {
+  int32x4_t tmp0[32], tmp1[32];
+  int16x8_t tmp16_0[8];
+  int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v);
+  int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v);
+  int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v);
+  int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v);
+  int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v);
+  int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v);
+  int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v);
+  int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v);
+  int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v);
+  int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v);
+  int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v);
+  int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v);
+  int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v);
+  int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v);
+  int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v);
+  int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v);
+  int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v);
+  int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v);
+  int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v);
+  int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v);
+  int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v);
+  int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v);
+  int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v);
+  int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v);
+  int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v);
+  int32x4_t zerov = vec_splat_s32(0);
+  ROUND_SHIFT_INIT;
+
+  tmp16_0[0] = vec_mergeh(in[15], in[0]);
+  tmp16_0[1] = vec_mergel(in[15], in[0]);
+  tmp16_0[2] = vec_mergeh(in[13], in[2]);
+  tmp16_0[3] = vec_mergel(in[13], in[2]);
+  tmp16_0[4] = vec_mergeh(in[11], in[4]);
+  tmp16_0[5] = vec_mergel(in[11], in[4]);
+  tmp16_0[6] = vec_mergeh(in[9], in[6]);
+  tmp16_0[7] = vec_mergel(in[9], in[6]);
+  tmp16_0[8] = vec_mergeh(in[7], in[8]);
+  tmp16_0[9] = vec_mergel(in[7], in[8]);
+  tmp16_0[10] = vec_mergeh(in[5], in[10]);
+  tmp16_0[11] = vec_mergel(in[5], in[10]);
+  tmp16_0[12] = vec_mergeh(in[3], in[12]);
+  tmp16_0[13] = vec_mergel(in[3], in[12]);
+  tmp16_0[14] = vec_mergeh(in[1], in[14]);
+  tmp16_0[15] = vec_mergel(in[1], in[14]);
+
+  tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov);
+  tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov);
+  tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov);
+  tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov);
+  tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov);
+  tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov);
+  tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov);
+  tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov);
+  tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov);
+  tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov);
+  tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov);
+  tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov);
+  tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov);
+  tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov);
+  tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov);
+  tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov);
+  tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov);
+  tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov);
+  tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov);
+  tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov);
+  tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov);
+  tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov);
+  tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov);
+  tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov);
+  tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov);
+  tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov);
+  tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov);
+  tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov);
+  tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov);
+  tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov);
+  tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov);
+  tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov);
+
+  tmp1[0] = vec_add(tmp0[0], tmp0[16]);
+  tmp1[1] = vec_add(tmp0[1], tmp0[17]);
+  tmp1[2] = vec_add(tmp0[2], tmp0[18]);
+  tmp1[3] = vec_add(tmp0[3], tmp0[19]);
+  tmp1[4] = vec_add(tmp0[4], tmp0[20]);
+  tmp1[5] = vec_add(tmp0[5], tmp0[21]);
+  tmp1[6] = vec_add(tmp0[6], tmp0[22]);
+  tmp1[7] = vec_add(tmp0[7], tmp0[23]);
+  tmp1[8] = vec_add(tmp0[8], tmp0[24]);
+  tmp1[9] = vec_add(tmp0[9], tmp0[25]);
+  tmp1[10] = vec_add(tmp0[10], tmp0[26]);
+  tmp1[11] = vec_add(tmp0[11], tmp0[27]);
+  tmp1[12] = vec_add(tmp0[12], tmp0[28]);
+  tmp1[13] = vec_add(tmp0[13], tmp0[29]);
+  tmp1[14] = vec_add(tmp0[14], tmp0[30]);
+  tmp1[15] = vec_add(tmp0[15], tmp0[31]);
+  tmp1[16] = vec_sub(tmp0[0], tmp0[16]);
+  tmp1[17] = vec_sub(tmp0[1], tmp0[17]);
+  tmp1[18] = vec_sub(tmp0[2], tmp0[18]);
+  tmp1[19] = vec_sub(tmp0[3], tmp0[19]);
+  tmp1[20] = vec_sub(tmp0[4], tmp0[20]);
+  tmp1[21] = vec_sub(tmp0[5], tmp0[21]);
+  tmp1[22] = vec_sub(tmp0[6], tmp0[22]);
+  tmp1[23] = vec_sub(tmp0[7], tmp0[23]);
+  tmp1[24] = vec_sub(tmp0[8], tmp0[24]);
+  tmp1[25] = vec_sub(tmp0[9], tmp0[25]);
+  tmp1[26] = vec_sub(tmp0[10], tmp0[26]);
+  tmp1[27] = vec_sub(tmp0[11], tmp0[27]);
+  tmp1[28] = vec_sub(tmp0[12], tmp0[28]);
+  tmp1[29] = vec_sub(tmp0[13], tmp0[29]);
+  tmp1[30] = vec_sub(tmp0[14], tmp0[30]);
+  tmp1[31] = vec_sub(tmp0[15], tmp0[31]);
+
+  DCT_CONST_ROUND_SHIFT(tmp1[0]);
+  DCT_CONST_ROUND_SHIFT(tmp1[1]);
+  DCT_CONST_ROUND_SHIFT(tmp1[2]);
+  DCT_CONST_ROUND_SHIFT(tmp1[3]);
+  DCT_CONST_ROUND_SHIFT(tmp1[4]);
+  DCT_CONST_ROUND_SHIFT(tmp1[5]);
+  DCT_CONST_ROUND_SHIFT(tmp1[6]);
+  DCT_CONST_ROUND_SHIFT(tmp1[7]);
+  DCT_CONST_ROUND_SHIFT(tmp1[8]);
+  DCT_CONST_ROUND_SHIFT(tmp1[9]);
+  DCT_CONST_ROUND_SHIFT(tmp1[10]);
+  DCT_CONST_ROUND_SHIFT(tmp1[11]);
+  DCT_CONST_ROUND_SHIFT(tmp1[12]);
+  DCT_CONST_ROUND_SHIFT(tmp1[13]);
+  DCT_CONST_ROUND_SHIFT(tmp1[14]);
+  DCT_CONST_ROUND_SHIFT(tmp1[15]);
+  DCT_CONST_ROUND_SHIFT(tmp1[16]);
+  DCT_CONST_ROUND_SHIFT(tmp1[17]);
+  DCT_CONST_ROUND_SHIFT(tmp1[18]);
+  DCT_CONST_ROUND_SHIFT(tmp1[19]);
+  DCT_CONST_ROUND_SHIFT(tmp1[20]);
+  DCT_CONST_ROUND_SHIFT(tmp1[21]);
+  DCT_CONST_ROUND_SHIFT(tmp1[22]);
+  DCT_CONST_ROUND_SHIFT(tmp1[23]);
+  DCT_CONST_ROUND_SHIFT(tmp1[24]);
+  DCT_CONST_ROUND_SHIFT(tmp1[25]);
+  DCT_CONST_ROUND_SHIFT(tmp1[26]);
+  DCT_CONST_ROUND_SHIFT(tmp1[27]);
+  DCT_CONST_ROUND_SHIFT(tmp1[28]);
+  DCT_CONST_ROUND_SHIFT(tmp1[29]);
+  DCT_CONST_ROUND_SHIFT(tmp1[30]);
+  DCT_CONST_ROUND_SHIFT(tmp1[31]);
+
+  in[0] = vec_packs(tmp1[0], tmp1[1]);
+  in[1] = vec_packs(tmp1[2], tmp1[3]);
+  in[2] = vec_packs(tmp1[4], tmp1[5]);
+  in[3] = vec_packs(tmp1[6], tmp1[7]);
+  in[4] = vec_packs(tmp1[8], tmp1[9]);
+  in[5] = vec_packs(tmp1[10], tmp1[11]);
+  in[6] = vec_packs(tmp1[12], tmp1[13]);
+  in[7] = vec_packs(tmp1[14], tmp1[15]);
+  in[8] = vec_packs(tmp1[16], tmp1[17]);
+  in[9] = vec_packs(tmp1[18], tmp1[19]);
+  in[10] = vec_packs(tmp1[20], tmp1[21]);
+  in[11] = vec_packs(tmp1[22], tmp1[23]);
+  in[12] = vec_packs(tmp1[24], tmp1[25]);
+  in[13] = vec_packs(tmp1[26], tmp1[27]);
+  in[14] = vec_packs(tmp1[28], tmp1[29]);
+  in[15] = vec_packs(tmp1[30], tmp1[31]);
+
+  // stage 2
+  tmp16_0[0] = vec_mergeh(in[8], in[9]);
+  tmp16_0[1] = vec_mergel(in[8], in[9]);
+  tmp16_0[2] = vec_mergeh(in[10], in[11]);
+  tmp16_0[3] = vec_mergel(in[10], in[11]);
+  tmp16_0[4] = vec_mergeh(in[12], in[13]);
+  tmp16_0[5] = vec_mergel(in[12], in[13]);
+  tmp16_0[6] = vec_mergeh(in[14], in[15]);
+  tmp16_0[7] = vec_mergel(in[14], in[15]);
+
+  tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov);
+  tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov);
+  tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov);
+  tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov);
+  tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov);
+  tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov);
+  tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov);
+  tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov);
+  tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov);
+  tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov);
+  tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov);
+  tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov);
+  tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov);
+  tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov);
+  tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov);
+  tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov);
+
+  tmp1[0] = vec_add(tmp0[0], tmp0[8]);
+  tmp1[1] = vec_add(tmp0[1], tmp0[9]);
+  tmp1[2] = vec_add(tmp0[2], tmp0[10]);
+  tmp1[3] = vec_add(tmp0[3], tmp0[11]);
+  tmp1[4] = vec_add(tmp0[4], tmp0[12]);
+  tmp1[5] = vec_add(tmp0[5], tmp0[13]);
+  tmp1[6] = vec_add(tmp0[6], tmp0[14]);
+  tmp1[7] = vec_add(tmp0[7], tmp0[15]);
+  tmp1[8] = vec_sub(tmp0[0], tmp0[8]);
+  tmp1[9] = vec_sub(tmp0[1], tmp0[9]);
+  tmp1[10] = vec_sub(tmp0[2], tmp0[10]);
+  tmp1[11] = vec_sub(tmp0[3], tmp0[11]);
+  tmp1[12] = vec_sub(tmp0[4], tmp0[12]);
+  tmp1[13] = vec_sub(tmp0[5], tmp0[13]);
+  tmp1[14] = vec_sub(tmp0[6], tmp0[14]);
+  tmp1[15] = vec_sub(tmp0[7], tmp0[15]);
+
+  DCT_CONST_ROUND_SHIFT(tmp1[0]);
+  DCT_CONST_ROUND_SHIFT(tmp1[1]);
+  DCT_CONST_ROUND_SHIFT(tmp1[2]);
+  DCT_CONST_ROUND_SHIFT(tmp1[3]);
+  DCT_CONST_ROUND_SHIFT(tmp1[4]);
+  DCT_CONST_ROUND_SHIFT(tmp1[5]);
+  DCT_CONST_ROUND_SHIFT(tmp1[6]);
+  DCT_CONST_ROUND_SHIFT(tmp1[7]);
+  DCT_CONST_ROUND_SHIFT(tmp1[8]);
+  DCT_CONST_ROUND_SHIFT(tmp1[9]);
+  DCT_CONST_ROUND_SHIFT(tmp1[10]);
+  DCT_CONST_ROUND_SHIFT(tmp1[11]);
+  DCT_CONST_ROUND_SHIFT(tmp1[12]);
+  DCT_CONST_ROUND_SHIFT(tmp1[13]);
+  DCT_CONST_ROUND_SHIFT(tmp1[14]);
+  DCT_CONST_ROUND_SHIFT(tmp1[15]);
+
+  tmp16_0[0] = vec_add(in[0], in[4]);
+  tmp16_0[1] = vec_add(in[1], in[5]);
+  tmp16_0[2] = vec_add(in[2], in[6]);
+  tmp16_0[3] = vec_add(in[3], in[7]);
+  tmp16_0[4] = vec_sub(in[0], in[4]);
+  tmp16_0[5] = vec_sub(in[1], in[5]);
+  tmp16_0[6] = vec_sub(in[2], in[6]);
+  tmp16_0[7] = vec_sub(in[3], in[7]);
+  tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]);
+  tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]);
+  tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]);
+  tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]);
+  tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]);
+  tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]);
+  tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]);
+  tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]);
+
+  // stage 3
+  in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]);
+  in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]);
+  in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]);
+  in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]);
+  in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]);
+  in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]);
+  in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]);
+  in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]);
+
+  tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov);
+  tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov);
+  tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov);
+  tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov);
+  tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov);
+  tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov);
+  tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov);
+  tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov);
+  tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov);
+  tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov);
+  tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov);
+  tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov);
+  tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov);
+  tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov);
+  tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov);
+  tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov);
+
+  tmp1[0] = vec_add(tmp0[0], tmp0[4]);
+  tmp1[1] = vec_add(tmp0[1], tmp0[5]);
+  tmp1[2] = vec_add(tmp0[2], tmp0[6]);
+  tmp1[3] = vec_add(tmp0[3], tmp0[7]);
+  tmp1[4] = vec_sub(tmp0[0], tmp0[4]);
+  tmp1[5] = vec_sub(tmp0[1], tmp0[5]);
+  tmp1[6] = vec_sub(tmp0[2], tmp0[6]);
+  tmp1[7] = vec_sub(tmp0[3], tmp0[7]);
+  tmp1[8] = vec_add(tmp0[8], tmp0[12]);
+  tmp1[9] = vec_add(tmp0[9], tmp0[13]);
+  tmp1[10] = vec_add(tmp0[10], tmp0[14]);
+  tmp1[11] = vec_add(tmp0[11], tmp0[15]);
+  tmp1[12] = vec_sub(tmp0[8], tmp0[12]);
+  tmp1[13] = vec_sub(tmp0[9], tmp0[13]);
+  tmp1[14] = vec_sub(tmp0[10], tmp0[14]);
+  tmp1[15] = vec_sub(tmp0[11], tmp0[15]);
+
+  DCT_CONST_ROUND_SHIFT(tmp1[0]);
+  DCT_CONST_ROUND_SHIFT(tmp1[1]);
+  DCT_CONST_ROUND_SHIFT(tmp1[2]);
+  DCT_CONST_ROUND_SHIFT(tmp1[3]);
+  DCT_CONST_ROUND_SHIFT(tmp1[4]);
+  DCT_CONST_ROUND_SHIFT(tmp1[5]);
+  DCT_CONST_ROUND_SHIFT(tmp1[6]);
+  DCT_CONST_ROUND_SHIFT(tmp1[7]);
+  DCT_CONST_ROUND_SHIFT(tmp1[8]);
+  DCT_CONST_ROUND_SHIFT(tmp1[9]);
+  DCT_CONST_ROUND_SHIFT(tmp1[10]);
+  DCT_CONST_ROUND_SHIFT(tmp1[11]);
+  DCT_CONST_ROUND_SHIFT(tmp1[12]);
+  DCT_CONST_ROUND_SHIFT(tmp1[13]);
+  DCT_CONST_ROUND_SHIFT(tmp1[14]);
+  DCT_CONST_ROUND_SHIFT(tmp1[15]);
+
+  in[0] = vec_add(tmp16_0[0], tmp16_0[2]);
+  in[1] = vec_add(tmp16_0[1], tmp16_0[3]);
+  in[2] = vec_sub(tmp16_0[0], tmp16_0[2]);
+  in[3] = vec_sub(tmp16_0[1], tmp16_0[3]);
+  in[4] = vec_packs(tmp1[0], tmp1[1]);
+  in[5] = vec_packs(tmp1[2], tmp1[3]);
+  in[6] = vec_packs(tmp1[4], tmp1[5]);
+  in[7] = vec_packs(tmp1[6], tmp1[7]);
+  in[8] = vec_add(tmp16_0[8], tmp16_0[10]);
+  in[9] = vec_add(tmp16_0[9], tmp16_0[11]);
+  in[10] = vec_sub(tmp16_0[8], tmp16_0[10]);
+  in[11] = vec_sub(tmp16_0[9], tmp16_0[11]);
+  in[12] = vec_packs(tmp1[8], tmp1[9]);
+  in[13] = vec_packs(tmp1[10], tmp1[11]);
+  in[14] = vec_packs(tmp1[12], tmp1[13]);
+  in[15] = vec_packs(tmp1[14], tmp1[15]);
+
+  // stage 4
+  out[0] = vec_mergeh(in[2], in[3]);
+  out[1] = vec_mergel(in[2], in[3]);
+  out[2] = vec_mergeh(in[6], in[7]);
+  out[3] = vec_mergel(in[6], in[7]);
+  out[4] = vec_mergeh(in[10], in[11]);
+  out[5] = vec_mergel(in[10], in[11]);
+  out[6] = vec_mergeh(in[14], in[15]);
+  out[7] = vec_mergel(in[14], in[15]);
+}
+
+void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) {
+  int16x8_t tmp0[16], tmp1[16], tmp2[8];
+  int32x4_t tmp3, tmp4;
+  int16x8_t zero16v = vec_splat_s16(0);
+  int32x4_t zerov = vec_splat_s32(0);
+  int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v);
+  int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v);
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12],
+               src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5],
+               tmp0[6], tmp0[7]);
+  TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12],
+               src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5],
+               tmp1[6], tmp1[7]);
+  TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13],
+               src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12],
+               tmp0[13], tmp0[14], tmp0[15]);
+  TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13],
+               src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12],
+               tmp1[13], tmp1[14], tmp1[15]);
+
+  iadst16x8_vsx(tmp0, tmp2);
+  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v);
+  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16);
+  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v);
+  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16);
+  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v);
+  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16);
+  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v);
+  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16);
+
+  src0[0] = tmp0[0];
+  src0[2] = vec_sub(zero16v, tmp0[8]);
+  src0[4] = tmp0[12];
+  src0[6] = vec_sub(zero16v, tmp0[4]);
+  src1[8] = tmp0[5];
+  src1[10] = vec_sub(zero16v, tmp0[13]);
+  src1[12] = tmp0[9];
+  src1[14] = vec_sub(zero16v, tmp0[1]);
+
+  iadst16x8_vsx(tmp1, tmp2);
+  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v);
+  IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16);
+  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v);
+  IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16);
+  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v);
+  IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16);
+  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v);
+  IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16);
+
+  src0[1] = tmp1[0];
+  src0[3] = vec_sub(zero16v, tmp1[8]);
+  src0[5] = tmp1[12];
+  src0[7] = vec_sub(zero16v, tmp1[4]);
+  src1[9] = tmp1[5];
+  src1[11] = vec_sub(zero16v, tmp1[13]);
+  src1[13] = tmp1[9];
+  src1[15] = vec_sub(zero16v, tmp1[1]);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h
new file mode 100644
index 0000000000..7031742c1c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_
+#define VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
+                            int stride);
+void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out);
+void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out);
+
+void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride);
+void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out);
+void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out);
+
+#define LOAD_INPUT16(load, source, offset, step, in) \
+  in[0] = load(offset, source);                      \
+  in[1] = load((step) + (offset), source);           \
+  in[2] = load(2 * (step) + (offset), source);       \
+  in[3] = load(3 * (step) + (offset), source);       \
+  in[4] = load(4 * (step) + (offset), source);       \
+  in[5] = load(5 * (step) + (offset), source);       \
+  in[6] = load(6 * (step) + (offset), source);       \
+  in[7] = load(7 * (step) + (offset), source);       \
+  in[8] = load(8 * (step) + (offset), source);       \
+  in[9] = load(9 * (step) + (offset), source);       \
+  in[10] = load(10 * (step) + (offset), source);     \
+  in[11] = load(11 * (step) + (offset), source);     \
+  in[12] = load(12 * (step) + (offset), source);     \
+  in[13] = load(13 * (step) + (offset), source);     \
+  in[14] = load(14 * (step) + (offset), source);     \
+  in[15] = load(15 * (step) + (offset), source);
+
+void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest,
+                              int stride);
+void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1);
+void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1);
+
+#endif  // VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c
new file mode 100644
index 0000000000..ab71f6e235
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c
@@ -0,0 +1,301 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+  return vec_xor(vec_add(a, mask), mask);
+}
+
+// Sets the value of a 32-bit integers to 1 when the corresponding value in a is
+// negative.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+  return vec_sr(a, vec_shift_sign_s32);
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and return the high 16 bits of the intermediate integers.
+// (a * b) >> 16
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right
+  // shift.
+  return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
+
+// Quantization function used for 4x4, 8x8 and 16x16 blocks.
+static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
+                                       int16x8_t round, int16x8_t quant,
+                                       int16x8_t quant_shift, bool16x8_t mask) {
+  const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+  int16x8_t qcoeff = vec_mulhi(rounded, quant);
+  qcoeff = vec_add(qcoeff, rounded);
+  qcoeff = vec_mulhi(qcoeff, quant_shift);
+  qcoeff = vec_sign(qcoeff, coeff);
+  return vec_and(qcoeff, mask);
+}
+
+// Quantization function used for 32x32 blocks.
+static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs,
+                                          int16x8_t round, int16x8_t quant,
+                                          int16x8_t quant_shift,
+                                          bool16x8_t mask) {
+  const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+  int16x8_t qcoeff = vec_mulhi(rounded, quant);
+  qcoeff = vec_add(qcoeff, rounded);
+  // 32x32 blocks require an extra multiplication by 2, this compensates for the
+  // extra right shift added in vec_mulhi, as such vec_madds can be used
+  // directly instead of vec_mulhi (((a * b) >> 15) >> 1) << 1 == (a * b >> 15)
+  qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16);
+  qcoeff = vec_sign(qcoeff, coeff);
+  return vec_and(qcoeff, mask);
+}
+
+// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32
+// blocks are twice as big as for other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+                                            int16x8_t dequant) {
+  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+  // Add 1 if negative to round towards zero because the C uses division.
+  dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe));
+  dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
+  dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
+  dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
+  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
+}
+
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff,
+                                          const int16_t *iscan_ptr, int index) {
+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+  return vec_andc(scan, zero_coeff);
+}
+
+// Compare packed 16-bit integers across a, and return the maximum value in
+// every element. Returns a vector containing the biggest value across vector a.
+static INLINE int16x8_t vec_max_across(int16x8_t a) {
+  a = vec_max(a, vec_perm(a, a, vec_perm64));
+  a = vec_max(a, vec_perm(a, a, vec_perm32));
+  return vec_max(a, vec_perm(a, a, vec_perm16));
+}
+
+void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
+                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan_ptr,
+                        const int16_t *iscan_ptr) {
+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+  bool16x8_t zero_mask0, zero_mask1;
+
+  // First set of 8 coeff starts with DC + 7 AC
+  int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
+  int16x8_t round = vec_vsx_ld(0, round_ptr);
+  int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+  int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+  int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
+
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+  int16x8_t coeff0_abs = vec_abs(coeff0);
+  int16x8_t coeff1_abs = vec_abs(coeff1);
+
+  zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+  zbin = vec_splat(zbin, 1);
+  zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+
+  (void)scan_ptr;
+
+  qcoeff0 =
+      quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+  round = vec_splat(round, 1);
+  quant = vec_splat(quant, 1);
+  quant_shift = vec_splat(quant_shift, 1);
+  qcoeff1 =
+      quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+  dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+  dequant = vec_splat(dequant, 1);
+  dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+  eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, iscan_ptr, 16));
+
+  if (n_coeffs > 16) {
+    int index = 16;
+    int off0 = 32;
+    int off1 = 48;
+    int off2 = 64;
+    do {
+      int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;
+      bool16x8_t zero_mask2;
+      coeff0 = vec_vsx_ld(off0, coeff_ptr);
+      coeff1 = vec_vsx_ld(off1, coeff_ptr);
+      coeff2 = vec_vsx_ld(off2, coeff_ptr);
+      coeff0_abs = vec_abs(coeff0);
+      coeff1_abs = vec_abs(coeff1);
+      coeff2_abs = vec_abs(coeff2);
+      zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+      zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+      zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+      qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,
+                               zero_mask0);
+      qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,
+                               zero_mask1);
+      qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,
+                               zero_mask2);
+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+
+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+      eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+      eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+                     nonzero_scanindex(qcoeff2, iscan_ptr, off2));
+      eob = vec_max(eob, eob2);
+
+      index += 24;
+      off0 += 48;
+      off1 += 48;
+      off2 += 48;
+    } while (index < n_coeffs);
+  }
+
+  eob = vec_max_across(eob);
+  *eob_ptr = eob[0];
+}
+
+void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan_ptr,
+                              const int16_t *iscan_ptr) {
+  // In stage 1, we quantize 16 coeffs (DC + 15 AC)
+  // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
+  // (32 * 32 - 16) / 24 = 42
+  int num_itr = 42;
+  // Offsets are in bytes, 16 coeffs = 32 bytes
+  int off0 = 32;
+  int off1 = 48;
+  int off2 = 64;
+
+  int16x8_t qcoeff0, qcoeff1, eob;
+  bool16x8_t zero_mask0, zero_mask1;
+
+  int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
+  int16x8_t round = vec_vsx_ld(0, round_ptr);
+  int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+  int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+  int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
+
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+  int16x8_t coeff0_abs = vec_abs(coeff0);
+  int16x8_t coeff1_abs = vec_abs(coeff1);
+
+  (void)scan_ptr;
+  (void)n_coeffs;
+
+  // 32x32 quantization requires that zbin and round be divided by 2
+  zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16);
+  round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16);
+
+  zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+  zbin = vec_splat(zbin, 1);  // remove DC from zbin
+  zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+
+  qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift,
+                              zero_mask0);
+  round = vec_splat(round, 1);              // remove DC from round
+  quant = vec_splat(quant, 1);              // remove DC from quant
+  quant_shift = vec_splat(quant_shift, 1);  // remove DC from quant_shift
+  qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift,
+                              zero_mask1);
+
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+  vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr);
+  dequant = vec_splat(dequant, 1);  // remove DC from dequant
+  vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr);
+
+  eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, iscan_ptr, 16));
+
+  do {
+    int16x8_t coeff2, coeff2_abs, qcoeff2, eob2;
+    bool16x8_t zero_mask2;
+
+    coeff0 = vec_vsx_ld(off0, coeff_ptr);
+    coeff1 = vec_vsx_ld(off1, coeff_ptr);
+    coeff2 = vec_vsx_ld(off2, coeff_ptr);
+
+    coeff0_abs = vec_abs(coeff0);
+    coeff1_abs = vec_abs(coeff1);
+    coeff2_abs = vec_abs(coeff2);
+
+    zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+    zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+    zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+
+    qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift,
+                                zero_mask0);
+    qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift,
+                                zero_mask1);
+    qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift,
+                                zero_mask2);
+
+    vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+    vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+    vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+    vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr);
+    vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr);
+    vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr);
+
+    eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+    eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+                   nonzero_scanindex(qcoeff2, iscan_ptr, off2));
+    eob = vec_max(eob, eob2);
+
+    // 24 int16_t is 48 bytes
+    off0 += 48;
+    off1 += 48;
+    off2 += 48;
+    num_itr--;
+  } while (num_itr != 0);
+
+  eob = vec_max_across(eob);
+  *eob_ptr = eob[0];
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c
new file mode 100644
index 0000000000..a08ae12413
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c
@@ -0,0 +1,261 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#define PROCESS16(offset)      \
+  v_a = vec_vsx_ld(offset, a); \
+  v_b = vec_vsx_ld(offset, b); \
+  v_abs = vec_absd(v_a, v_b);  \
+  v_sad = vec_sum4s(v_abs, v_sad);
+
+#define SAD8(height)                                                     \
+  unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride,   \
+                                       const uint8_t *b, int b_stride) { \
+    int y = 0;                                                           \
+    uint8x16_t v_a, v_b, v_abs;                                          \
+    uint32x4_t v_sad = vec_zeros_u32;                                    \
+                                                                         \
+    do {                                                                 \
+      PROCESS16(0)                                                       \
+                                                                         \
+      a += a_stride;                                                     \
+      b += b_stride;                                                     \
+      y++;                                                               \
+    } while (y < height);                                                \
+                                                                         \
+    return v_sad[1] + v_sad[0];                                          \
+  }
+
+#define SAD16(height)                                                     \
+  unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride,   \
+                                        const uint8_t *b, int b_stride) { \
+    int y = 0;                                                            \
+    uint8x16_t v_a, v_b, v_abs;                                           \
+    uint32x4_t v_sad = vec_zeros_u32;                                     \
+                                                                          \
+    do {                                                                  \
+      PROCESS16(0);                                                       \
+                                                                          \
+      a += a_stride;                                                      \
+      b += b_stride;                                                      \
+      y++;                                                                \
+    } while (y < height);                                                 \
+                                                                          \
+    return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0];                     \
+  }
+
+#define SAD32(height)                                                     \
+  unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride,   \
+                                        const uint8_t *b, int b_stride) { \
+    int y = 0;                                                            \
+    uint8x16_t v_a, v_b, v_abs;                                           \
+    uint32x4_t v_sad = vec_zeros_u32;                                     \
+                                                                          \
+    do {                                                                  \
+      PROCESS16(0);                                                       \
+      PROCESS16(16);                                                      \
+                                                                          \
+      a += a_stride;                                                      \
+      b += b_stride;                                                      \
+      y++;                                                                \
+    } while (y < height);                                                 \
+                                                                          \
+    return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0];                     \
+  }
+
+#define SAD64(height)                                                     \
+  unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride,   \
+                                        const uint8_t *b, int b_stride) { \
+    int y = 0;                                                            \
+    uint8x16_t v_a, v_b, v_abs;                                           \
+    uint32x4_t v_sad = vec_zeros_u32;                                     \
+                                                                          \
+    do {                                                                  \
+      PROCESS16(0);                                                       \
+      PROCESS16(16);                                                      \
+      PROCESS16(32);                                                      \
+      PROCESS16(48);                                                      \
+                                                                          \
+      a += a_stride;                                                      \
+      b += b_stride;                                                      \
+      y++;                                                                \
+    } while (y < height);                                                 \
+                                                                          \
+    return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0];                     \
+  }
+
+SAD8(4);
+SAD8(8);
+SAD8(16);
+SAD16(8);
+SAD16(16);
+SAD16(32);
+SAD32(16);
+SAD32(32);
+SAD32(64);
+SAD64(32);
+SAD64(64);
+
+#define SAD16AVG(height)                                                      \
+  unsigned int vpx_sad16x##height##_avg_vsx(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]);                   \
+    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref,            \
+                          ref_stride);                                        \
+                                                                              \
+    return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16);          \
+  }
+
+#define SAD32AVG(height)                                                      \
+  unsigned int vpx_sad32x##height##_avg_vsx(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]);                   \
+    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref,            \
+                          ref_stride);                                        \
+                                                                              \
+    return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32);          \
+  }
+
+#define SAD64AVG(height)                                                      \
+  unsigned int vpx_sad64x##height##_avg_vsx(                                  \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred) {                                           \
+    DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]);                   \
+    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref,            \
+                          ref_stride);                                        \
+    return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64);          \
+  }
+
+SAD16AVG(8);
+SAD16AVG(16);
+SAD16AVG(32);
+SAD32AVG(16);
+SAD32AVG(32);
+SAD32AVG(64);
+SAD64AVG(32);
+SAD64AVG(64);
+
+#define PROCESS16_4D(offset, ref, v_h, v_l) \
+  v_b = vec_vsx_ld(offset, ref);            \
+  v_bh = unpack_to_s16_h(v_b);              \
+  v_bl = unpack_to_s16_l(v_b);              \
+  v_subh = vec_sub(v_h, v_bh);              \
+  v_subl = vec_sub(v_l, v_bl);              \
+  v_absh = vec_abs(v_subh);                 \
+  v_absl = vec_abs(v_subl);                 \
+  v_sad = vec_sum4s(v_absh, v_sad);         \
+  v_sad = vec_sum4s(v_absl, v_sad);
+
+#define UNPACK_SRC(offset, srcv_h, srcv_l) \
+  v_a = vec_vsx_ld(offset, src);           \
+  srcv_h = unpack_to_s16_h(v_a);           \
+  srcv_l = unpack_to_s16_l(v_a);
+
+#define SAD16_4D(height)                                                  \
+  void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride,    \
+                                   const uint8_t *const ref_array[],      \
+                                   int ref_stride, uint32_t *sad_array) { \
+    int i;                                                                \
+    int y;                                                                \
+    unsigned int sad[4];                                                  \
+    uint8x16_t v_a, v_b;                                                  \
+    int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl;     \
+                                                                          \
+    for (i = 0; i < 4; i++) sad_array[i] = 0;                             \
+                                                                          \
+    for (y = 0; y < height; y++) {                                        \
+      UNPACK_SRC(y *src_stride, v_ah, v_al);                              \
+      for (i = 0; i < 4; i++) {                                           \
+        int32x4_t v_sad = vec_splat_s32(0);                               \
+        PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al);            \
+                                                                          \
+        vec_vsx_st((uint32x4_t)v_sad, 0, sad);                            \
+        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]);              \
+      }                                                                   \
+    }                                                                     \
+  }
+
+#define SAD32_4D(height)                                                  \
+  void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride,    \
+                                   const uint8_t *const ref_array[],      \
+                                   int ref_stride, uint32_t *sad_array) { \
+    int i;                                                                \
+    int y;                                                                \
+    unsigned int sad[4];                                                  \
+    uint8x16_t v_a, v_b;                                                  \
+    int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl;                     \
+    int16x8_t v_absh, v_absl, v_subh, v_subl;                             \
+                                                                          \
+    for (i = 0; i < 4; i++) sad_array[i] = 0;                             \
+                                                                          \
+    for (y = 0; y < height; y++) {                                        \
+      UNPACK_SRC(y *src_stride, v_ah1, v_al1);                            \
+      UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2);                       \
+      for (i = 0; i < 4; i++) {                                           \
+        int32x4_t v_sad = vec_splat_s32(0);                               \
+        PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1);          \
+        PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2);     \
+                                                                          \
+        vec_vsx_st((uint32x4_t)v_sad, 0, sad);                            \
+        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]);              \
+      }                                                                   \
+    }                                                                     \
+  }
+
+#define SAD64_4D(height)                                                  \
+  void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride,    \
+                                   const uint8_t *const ref_array[],      \
+                                   int ref_stride, uint32_t *sad_array) { \
+    int i;                                                                \
+    int y;                                                                \
+    unsigned int sad[4];                                                  \
+    uint8x16_t v_a, v_b;                                                  \
+    int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl;                     \
+    int16x8_t v_ah3, v_al3, v_ah4, v_al4;                                 \
+    int16x8_t v_absh, v_absl, v_subh, v_subl;                             \
+                                                                          \
+    for (i = 0; i < 4; i++) sad_array[i] = 0;                             \
+                                                                          \
+    for (y = 0; y < height; y++) {                                        \
+      UNPACK_SRC(y *src_stride, v_ah1, v_al1);                            \
+      UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2);                       \
+      UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3);                       \
+      UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4);                       \
+      for (i = 0; i < 4; i++) {                                           \
+        int32x4_t v_sad = vec_splat_s32(0);                               \
+        PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1);          \
+        PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2);     \
+        PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3);     \
+        PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4);     \
+                                                                          \
+        vec_vsx_st((uint32x4_t)v_sad, 0, sad);                            \
+        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]);              \
+      }                                                                   \
+    }                                                                     \
+  }
+
+SAD16_4D(8);
+SAD16_4D(16);
+SAD16_4D(32);
+SAD32_4D(16);
+SAD32_4D(32);
+SAD32_4D(64);
+SAD64_4D(32);
+SAD64_4D(64);
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c
new file mode 100644
index 0000000000..76ad302da6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static VPX_FORCE_INLINE void subtract_block4x4(
+    int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
+    ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
+  int16_t *diff1 = diff + 2 * diff_stride;
+  const uint8_t *src1 = src + 2 * src_stride;
+  const uint8_t *pred1 = pred + 2 * pred_stride;
+
+  const int16x8_t d0 = vec_vsx_ld(0, diff);
+  const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
+  const int16x8_t d2 = vec_vsx_ld(0, diff1);
+  const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
+
+  const uint8x16_t s0 = read4x2(src, (int)src_stride);
+  const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
+  const uint8x16_t s1 = read4x2(src1, (int)src_stride);
+  const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
+
+  const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+  const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+
+  vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
+  vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
+  vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
+  vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
+}
+
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
+                            ptrdiff_t diff_stride, const uint8_t *src,
+                            ptrdiff_t src_stride, const uint8_t *pred,
+                            ptrdiff_t pred_stride) {
+  int r = rows, c;
+
+  switch (cols) {
+    case 64:
+    case 32:
+      do {
+        for (c = 0; c < cols; c += 32) {
+          const uint8x16_t s0 = vec_vsx_ld(0, src + c);
+          const uint8x16_t s1 = vec_vsx_ld(16, src + c);
+          const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
+          const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
+          const int16x8_t d0l =
+              vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+          const int16x8_t d0h =
+              vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+          const int16x8_t d1l =
+              vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
+          const int16x8_t d1h =
+              vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+          vec_vsx_st(d0h, 0, diff + c);
+          vec_vsx_st(d0l, 16, diff + c);
+          vec_vsx_st(d1h, 0, diff + c + 16);
+          vec_vsx_st(d1l, 16, diff + c + 16);
+        }
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 16:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        vec_vsx_st(d0l, 16, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 8:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 4:
+      subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
+      if (r > 4) {
+        diff += 4 * diff_stride;
+        pred += 4 * pred_stride;
+        src += 4 * src_stride;
+
+        subtract_block4x4(diff, diff_stride,
+
+                          src, src_stride,
+
+                          pred, pred_stride);
+      }
+      break;
+    default: assert(0);  // unreachable
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h
new file mode 100644
index 0000000000..4883b734ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_
+#define VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) {
+  // d = vec_mergeh(a,b):
+  // The even elements of the result are obtained left-to-right,
+  // from the high elements of a.
+  // The odd elements of the result are obtained left-to-right,
+  // from the high elements of b.
+  //
+  // d = vec_mergel(a,b):
+  // The even elements of the result are obtained left-to-right,
+  // from the low elements of a.
+  // The odd elements of the result are obtained left-to-right,
+  // from the low elements of b.
+
+  // Example, starting with:
+  // v[0]: 00 01 02 03 04 05 06 07
+  // v[1]: 10 11 12 13 14 15 16 17
+  // v[2]: 20 21 22 23 24 25 26 27
+  // v[3]: 30 31 32 33 34 35 36 37
+  // v[4]: 40 41 42 43 44 45 46 47
+  // v[5]: 50 51 52 53 54 55 56 57
+  // v[6]: 60 61 62 63 64 65 66 67
+  // v[7]: 70 71 72 73 74 75 76 77
+
+  int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+  b0 = vec_mergeh(v[0], v[4]);
+  b1 = vec_mergel(v[0], v[4]);
+  b2 = vec_mergeh(v[1], v[5]);
+  b3 = vec_mergel(v[1], v[5]);
+  b4 = vec_mergeh(v[2], v[6]);
+  b5 = vec_mergel(v[2], v[6]);
+  b6 = vec_mergeh(v[3], v[7]);
+  b7 = vec_mergel(v[3], v[7]);
+
+  // After first merge operation
+  // b0: 00 40 01 41 02 42 03 43
+  // b1: 04 44 05 45 06 46 07 47
+  // b2: 10 50 11 51 12 52 13 53
+  // b3: 14 54 15 55 16 56 17 57
+  // b4: 20 60 21 61 22 62 23 63
+  // b5: 24 64 25 65 26 66 27 67
+  // b6: 30 70 31 71 32 62 33 73
+  // b7: 34 74 35 75 36 76 37 77
+
+  c0 = vec_mergeh(b0, b4);
+  c1 = vec_mergel(b0, b4);
+  c2 = vec_mergeh(b1, b5);
+  c3 = vec_mergel(b1, b5);
+  c4 = vec_mergeh(b2, b6);
+  c5 = vec_mergel(b2, b6);
+  c6 = vec_mergeh(b3, b7);
+  c7 = vec_mergel(b3, b7);
+
+  // After second merge operation
+  // c0: 00 20 40 60 01 21 41 61
+  // c1: 02 22 42 62 03 23 43 63
+  // c2: 04 24 44 64 05 25 45 65
+  // c3: 06 26 46 66 07 27 47 67
+  // c4: 10 30 50 70 11 31 51 71
+  // c5: 12 32 52 72 13 33 53 73
+  // c6: 14 34 54 74 15 35 55 75
+  // c7: 16 36 56 76 17 37 57 77
+
+  v[0] = vec_mergeh(c0, c4);
+  v[1] = vec_mergel(c0, c4);
+  v[2] = vec_mergeh(c1, c5);
+  v[3] = vec_mergel(c1, c5);
+  v[4] = vec_mergeh(c2, c6);
+  v[5] = vec_mergel(c2, c6);
+  v[6] = vec_mergeh(c3, c7);
+  v[7] = vec_mergel(c3, c7);
+
+  // After last merge operation
+  // v[0]: 00 10 20 30 40 50 60 70
+  // v[1]: 01 11 21 31 41 51 61 71
+  // v[2]: 02 12 22 32 42 52 62 72
+  // v[3]: 03 13 23 33 43 53 63 73
+  // v[4]: 04 14 24 34 44 54 64 74
+  // v[5]: 05 15 25 35 45 55 65 75
+  // v[6]: 06 16 26 36 46 56 66 76
+  // v[7]: 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
+  // Stage 1
+  const int16x8_t s1_0 = vec_mergeh(a[0], a[4]);
+  const int16x8_t s1_1 = vec_mergel(a[0], a[4]);
+  const int16x8_t s1_2 = vec_mergeh(a[1], a[5]);
+  const int16x8_t s1_3 = vec_mergel(a[1], a[5]);
+  const int16x8_t s1_4 = vec_mergeh(a[2], a[6]);
+  const int16x8_t s1_5 = vec_mergel(a[2], a[6]);
+  const int16x8_t s1_6 = vec_mergeh(a[3], a[7]);
+  const int16x8_t s1_7 = vec_mergel(a[3], a[7]);
+
+  // Stage 2
+  const int16x8_t s2_0 = vec_mergeh(s1_0, s1_4);
+  const int16x8_t s2_1 = vec_mergel(s1_0, s1_4);
+  const int16x8_t s2_2 = vec_mergeh(s1_1, s1_5);
+  const int16x8_t s2_3 = vec_mergel(s1_1, s1_5);
+  const int16x8_t s2_4 = vec_mergeh(s1_2, s1_6);
+  const int16x8_t s2_5 = vec_mergel(s1_2, s1_6);
+  const int16x8_t s2_6 = vec_mergeh(s1_3, s1_7);
+  const int16x8_t s2_7 = vec_mergel(s1_3, s1_7);
+
+  // Stage 2
+  b[0] = vec_mergeh(s2_0, s2_4);
+  b[1] = vec_mergel(s2_0, s2_4);
+  b[2] = vec_mergeh(s2_1, s2_5);
+  b[3] = vec_mergel(s2_1, s2_5);
+  b[4] = vec_mergeh(s2_2, s2_6);
+  b[5] = vec_mergel(s2_2, s2_6);
+  b[6] = vec_mergeh(s2_3, s2_7);
+  b[7] = vec_mergel(s2_3, s2_7);
+}
+
+#endif  // VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h
new file mode 100644
index 0000000000..2907a1fe40
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
+#define VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
+
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static const int32x4_t vec_dct_const_rounding = { 8192, 8192, 8192, 8192 };
+
+static const uint32x4_t vec_dct_const_bits = { 14, 14, 14, 14 };
+
+static const uint16x8_t vec_dct_scale_log2 = { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
+                                    16364, 16364, 16364, 16364 };
+static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
+                                    16305, 16305, 16305, 16305 };
+static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
+                                    16207, 16207, 16207, 16207 };
+static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
+                                    16069, 16069, 16069, 16069 };
+static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
+                                     -16069, -16069, -16069, -16069 };
+static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
+                                    15893, 15893, 15893, 15893 };
+static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
+                                    15679, 15679, 15679, 15679 };
+static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
+                                    15426, 15426, 15426, 15426 };
+static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
+                                    15137, 15137, 15137, 15137 };
+static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
+                                     -15137, -15137, -15137, -15137 };
+static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
+                                    14811, 14811, 14811, 14811 };
+static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
+                                     14449, 14449, 14449, 14449 };
+static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
+                                     14053, 14053, 14053, 14053 };
+static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
+                                     13623, 13623, 13623, 13623 };
+static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
+                                     13160, 13160, 13160, 13160 };
+static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
+                                     12665, 12665, 12665, 12665 };
+static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
+                                     12140, 12140, 12140, 12140 };
+static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
+                                     11585, 11585, 11585, 11585 };
+static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
+                                     11003, 11003, 11003, 11003 };
+static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
+                                     10394, 10394, 10394, 10394 };
+static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760,
+                                     9760, 9760, 9760, 9760 };
+static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102,
+                                     9102, 9102, 9102, 9102 };
+static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
+                                      -9102, -9102, -9102, -9102 };
+static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423,
+                                     8423, 8423, 8423, 8423 };
+static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723,
+                                     7723, 7723, 7723, 7723 };
+static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005,
+                                     7005, 7005, 7005, 7005 };
+static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270,
+                                     6270, 6270, 6270, 6270 };
+static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520,
+                                     5520, 5520, 5520, 5520 };
+static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756,
+                                     4756, 4756, 4756, 4756 };
+static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981,
+                                     3981, 3981, 3981, 3981 };
+static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196,
+                                     3196, 3196, 3196, 3196 };
+static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404,
+                                     2404, 2404, 2404, 2404 };
+static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606,
+                                     1606, 1606, 1606, 1606 };
+static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+
+#endif  // VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h
new file mode 100644
index 0000000000..9806b81cd0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h
@@ -0,0 +1,108 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_PPC_TYPES_VSX_H_
+#define VPX_VPX_DSP_PPC_TYPES_VSX_H_
+
+#include <altivec.h>
+
+typedef vector signed char int8x16_t;
+typedef vector unsigned char uint8x16_t;
+typedef vector signed short int16x8_t;
+typedef vector unsigned short uint16x8_t;
+typedef vector signed int int32x4_t;
+typedef vector unsigned int uint32x4_t;
+typedef vector bool char bool8x16_t;
+typedef vector bool short bool16x8_t;
+typedef vector bool int bool32x4_t;
+
+#if defined(__clang__) && __clang_major__ < 6
+static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                           0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                           0x14, 0x15, 0x16, 0x17 };
+static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                           0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
+                                           0x1C, 0x1D, 0x1E, 0x1F };
+static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                           0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
+                                           0x14, 0x15, 0x16, 0x17 };
+static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                           0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B,
+                                           0x1C, 0x1D, 0x1E, 0x1F };
+#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm)
+#elif defined(__GNUC__) && \
+    (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3))
+#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c)
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define unpack_to_u16_h(v) \
+  (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_u16_l(v) \
+  (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_h(v) \
+  (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v)
+#define unpack_to_s16_l(v) \
+  (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v)
+#ifndef xxpermdi
+#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c)
+#endif
+#else
+#define unpack_to_u16_h(v) \
+  (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_u16_l(v) \
+  (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_h(v) \
+  (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0))
+#define unpack_to_s16_l(v) \
+  (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0))
+#ifndef xxpermdi
+#define xxpermdi(a, b, c) vec_xxpermdi(b, a, (((c) >> 1) | ((c) & 1) << 1) ^ 3)
+#endif
+#endif
+
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
+  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+  return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
+#ifndef __POWER9_VECTOR__
+#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
+#endif
+
+static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                         0, 0, 0, 0, 0, 0, 0, 0 };
+static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const int16x8_t vec_twos_s16 = { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 };
+static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 };
+static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 };
+static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 };
+static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 };
+static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                       0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03,
+                                       0x04, 0x05, 0x06, 0x07 };
+static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+                                       0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                                       0x00, 0x01, 0x02, 0x03 };
+static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                       0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D,
+                                       0x0E, 0x0F, 0x00, 0x01 };
+
+static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11,
+                                                   0x04, 0x05, 0x14, 0x15,
+                                                   0x08, 0x09, 0x18, 0x19,
+                                                   0x0C, 0x0D, 0x1C, 0x1D };
+
+#endif  // VPX_VPX_DSP_PPC_TYPES_VSX_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
new file mode 100644
index 0000000000..6c6bc9a301
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride) {
+  int distortion;
+
+  const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+  const int16x8_t a1 =
+      unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
+  const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+  const int16x8_t b1 =
+      unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
+  const int16x8_t d0 = vec_sub(a0, b0);
+  const int16x8_t d1 = vec_sub(a1, b1);
+  const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
+  const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);
+
+  vec_ste(d, 0, &distortion);
+
+  return distortion;
+}
+
+// TODO(lu_zero): Unroll
+uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
+  unsigned int i, sum = 0;
+  int32x4_t s = vec_splat_s32(0);
+
+  for (i = 0; i < 256; i += 8) {
+    const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
+    s = vec_msum(v, v, s);
+  }
+
+  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+  vec_ste((uint32x4_t)s, 0, &sum);
+
+  return sum;
+}
+
+void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+  /* comp_pred and pred must be 16 byte aligned. */
+  assert(((intptr_t)comp_pred & 0xf) == 0);
+  assert(((intptr_t)pred & 0xf) == 0);
+  if (width >= 16) {
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
+        vec_vsx_st(v, j, comp_pred);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    // Process 2 lines at time
+    for (i = 0; i < height / 2; ++i) {
+      const uint8x16_t r0 = vec_vsx_ld(0, ref);
+      const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
+      const uint8x16_t r = xxpermdi(r0, r1, 0);
+      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+      vec_vsx_st(v, 0, comp_pred);
+      comp_pred += 16;  // width * 2;
+      pred += 16;       // width * 2;
+      ref += ref_stride * 2;
+    }
+  } else {
+    assert(width == 4);
+    // process 4 lines at time
+    for (i = 0; i < height / 4; ++i) {
+      const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
+      const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
+      const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
+      const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
+      const uint8x16_t r =
+          (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
+      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
+      vec_vsx_st(v, 0, comp_pred);
+      comp_pred += 16;  // width * 4;
+      pred += 16;       // width * 4;
+      ref += ref_stride * 4;
+    }
+  }
+}
+
+static INLINE void variance_inner_32(const uint8_t *src_ptr,
+                                     const uint8_t *ref_ptr,
+                                     int32x4_t *sum_squared, int32x4_t *sum) {
+  int32x4_t s = *sum;
+  int32x4_t ss = *sum_squared;
+
+  const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
+  const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
+  const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
+  const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);
+
+  const int16x8_t a0 = unpack_to_s16_h(va0);
+  const int16x8_t b0 = unpack_to_s16_h(vb0);
+  const int16x8_t a1 = unpack_to_s16_l(va0);
+  const int16x8_t b1 = unpack_to_s16_l(vb0);
+  const int16x8_t a2 = unpack_to_s16_h(va1);
+  const int16x8_t b2 = unpack_to_s16_h(vb1);
+  const int16x8_t a3 = unpack_to_s16_l(va1);
+  const int16x8_t b3 = unpack_to_s16_l(vb1);
+  const int16x8_t d0 = vec_sub(a0, b0);
+  const int16x8_t d1 = vec_sub(a1, b1);
+  const int16x8_t d2 = vec_sub(a2, b2);
+  const int16x8_t d3 = vec_sub(a3, b3);
+
+  s = vec_sum4s(d0, s);
+  ss = vec_msum(d0, d0, ss);
+  s = vec_sum4s(d1, s);
+  ss = vec_msum(d1, d1, ss);
+  s = vec_sum4s(d2, s);
+  ss = vec_msum(d2, d2, ss);
+  s = vec_sum4s(d3, s);
+  ss = vec_msum(d3, d3, ss);
+  *sum = s;
+  *sum_squared = ss;
+}
+
+static INLINE void variance(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *ref_ptr, int ref_stride, int w,
+                            int h, uint32_t *sse, int *sum) {
+  int i;
+
+  int32x4_t s = vec_splat_s32(0);
+  int32x4_t ss = vec_splat_s32(0);
+
+  switch (w) {
+    case 4:
+      for (i = 0; i < h / 2; ++i) {
+        const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
+        const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
+        const int16x8_t d = vec_sub(a0, b0);
+        s = vec_sum4s(d, s);
+        ss = vec_msum(d, d, ss);
+        src_ptr += src_stride * 2;
+        ref_ptr += ref_stride * 2;
+      }
+      break;
+    case 8:
+      for (i = 0; i < h; ++i) {
+        const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr));
+        const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr));
+        const int16x8_t d = vec_sub(a0, b0);
+
+        s = vec_sum4s(d, s);
+        ss = vec_msum(d, d, ss);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+    case 16:
+      for (i = 0; i < h; ++i) {
+        const uint8x16_t va = vec_vsx_ld(0, src_ptr);
+        const uint8x16_t vb = vec_vsx_ld(0, ref_ptr);
+        const int16x8_t a0 = unpack_to_s16_h(va);
+        const int16x8_t b0 = unpack_to_s16_h(vb);
+        const int16x8_t a1 = unpack_to_s16_l(va);
+        const int16x8_t b1 = unpack_to_s16_l(vb);
+        const int16x8_t d0 = vec_sub(a0, b0);
+        const int16x8_t d1 = vec_sub(a1, b1);
+
+        s = vec_sum4s(d0, s);
+        ss = vec_msum(d0, d0, ss);
+        s = vec_sum4s(d1, s);
+        ss = vec_msum(d1, d1, ss);
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+    case 32:
+      for (i = 0; i < h; ++i) {
+        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+    case 64:
+      for (i = 0; i < h; ++i) {
+        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
+        variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s);
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+      }
+      break;
+  }
+
+  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+  vec_ste(s, 0, sum);
+
+  ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);
+
+  vec_ste((uint32x4_t)ss, 0, sse);
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H)                                                    \
+  void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \
+                                 const uint8_t *ref_ptr, int ref_stride, \
+                                 uint32_t *sse, int *sum) {              \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum);  \
+  }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
+ * variable.
+ */
+#define MSE(W, H)                                                         \
+  uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
+                                  const uint8_t *ref_ptr, int ref_stride, \
+                                  uint32_t *sse) {                        \
+    int sum;                                                              \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);  \
+    return *sse;                                                          \
+  }
+
+#define VAR(W, H)                                                              \
+  uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
+                                       const uint8_t *ref_ptr, int ref_stride, \
+                                       uint32_t *sse) {                        \
+    int sum;                                                                   \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);       \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H)));              \
+  }
+
+#define VARIANCES(W, H) VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
new file mode 100644
index 0000000000..2dc66055cc
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c
@@ -0,0 +1,408 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/vpx_filter.h"
+
+// TODO(lu_zero): unroll
+static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      int32_t h) {
+  int i;
+
+  for (i = h; i--;) {
+    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      int32_t h) {
+  int i;
+
+  for (i = h; i--;) {
+    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      int32_t h) {
+  int i;
+
+  for (i = h; i--;) {
+    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
+    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
+    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
+    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  switch (w) {
+    case 16: {
+      copy_w16(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_w32(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_w64(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      int i;
+      for (i = h; i--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
+
+static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     int32_t h) {
+  int i;
+
+  for (i = h; i--;) {
+    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+    vec_vsx_st(v, 0, dst);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     int32_t h) {
+  int i;
+
+  for (i = h; i--;) {
+    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
+    vec_vsx_st(v0, 0, dst);
+    vec_vsx_st(v1, 16, dst);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     int32_t h) {
+  int i;
+
+  for (i = h; i--;) {
+    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
+    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
+    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
+    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
+    vec_vsx_st(v0, 0, dst);
+    vec_vsx_st(v1, 16, dst);
+    vec_vsx_st(v2, 32, dst);
+    vec_vsx_st(v3, 48, dst);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4, int x_step_q4,
+                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
+  switch (w) {
+    case 16: {
+      avg_w16(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      avg_w32(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      avg_w64(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                         x_step_q4, y0_q4, y_step_q4, w, h);
+      break;
+    }
+  }
+}
+
+static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s,
+                                           const int16x8_t f) {
+  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
+  const int32x4_t bias =
+      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
+  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
+  const uint8x16_t v = vec_splat(
+      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
+  vec_ste(v, 0, dst);
+}
+
+static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst,
+                                             const uint8_t *const src_x,
+                                             const int16_t *const x_filter) {
+  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
+  const int16x8_t f = vec_vsx_ld(0, x_filter);
+
+  convolve_line(dst, s, f);
+}
+
+// TODO(lu_zero): Implement 8x8 and bigger block special cases
+static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src,
+                                            ptrdiff_t src_stride, uint8_t *dst,
+                                            ptrdiff_t dst_stride,
+                                            const InterpKernel *x_filters,
+                                            int x0_q4, int x_step_q4, int w,
+                                            int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
+                      x_filters[x_q4 & SUBPEL_MASK]);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void convolve_avg_horiz(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
+    int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      uint8_t v;
+      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
+                      x_filters[x_q4 & SUBPEL_MASK]);
+      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
+                                        uint8x16_t c, uint8x16_t d,
+                                        uint8x16_t e, uint8x16_t f,
+                                        uint8x16_t g, uint8x16_t h) {
+  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
+  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
+  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
+  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);
+
+  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
+  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);
+
+  return (uint8x16_t)vec_mergeh(abcd, efgh);
+}
+
+static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst,
+                                             const uint8_t *const src_y,
+                                             ptrdiff_t src_stride,
+                                             const int16_t *const y_filter) {
+  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
+  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
+  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
+  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
+  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
+  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
+  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
+  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
+  const int16x8_t f = vec_vsx_ld(0, y_filter);
+  uint8_t buf[16];
+  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);
+
+  vec_vsx_st(s, 0, buf);
+
+  convolve_line(dst, unpack_to_s16_h(s), f);
+}
+
+static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src,
+                                           ptrdiff_t src_stride, uint8_t *dst,
+                                           ptrdiff_t dst_stride,
+                                           const InterpKernel *y_filters,
+                                           int y0_q4, int y_step_q4, int w,
+                                           int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      convolve_line_v(dst + y * dst_stride,
+                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
+                      y_filters[y_q4 & SUBPEL_MASK]);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static VPX_FORCE_INLINE void convolve_avg_vert(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
+    int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      uint8_t v;
+      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
+                      y_filters[y_q4 & SUBPEL_MASK]);
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *const filter,
+                                      int x0_q4, int x_step_q4, int y0_q4,
+                                      int y_step_q4, int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+                 h);
+}
+
+void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     w, h);
+}
+
+void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+
+  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+                h);
+}
+
+void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+
+  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+                    w, h);
+}
+
+void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                       int w, int h) {
+  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+           y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  // Fixed size intermediate buffer places limits on parameters.
+  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+
+  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
+                    y_step_q4, w, h);
+  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/prob.h b/media/libvpx/libvpx/vpx_dsp/prob.h
index 5656ddbab4..a1c7de1674 100644
--- a/media/libvpx/libvpx/vpx_dsp/prob.h
+++ b/media/libvpx/libvpx/vpx_dsp/prob.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_PROB_H_
-#define VPX_DSP_PROB_H_
+#ifndef VPX_VPX_DSP_PROB_H_
+#define VPX_VPX_DSP_PROB_H_
 
 #include <assert.h>
 
@@ -30,9 +30,9 @@ typedef uint8_t vpx_prob;
 
 typedef int8_t vpx_tree_index;
 
-#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2)
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
 
-#define vpx_complement(x) (255 - x)
+#define vpx_complement(x) (255 - (x))
 
 #define MODE_MV_COUNT_SAT 20
 
@@ -48,7 +48,7 @@ typedef const vpx_tree_index vpx_tree[];
 static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) {
   assert(den != 0);
   {
-    const int p = (int)(((int64_t)num * 256 + (den >> 1)) / den);
+    const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
     // (p > 255) ? 255 : (p < 1) ? 1 : p;
     const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
     return (vpx_prob)clipped_prob;
@@ -103,4 +103,4 @@ DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]);
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_PROB_H_
+#endif  // VPX_VPX_DSP_PROB_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.c b/media/libvpx/libvpx/vpx_dsp/psnr.c
index 47afd4388a..af1e638604 100644
--- a/media/libvpx/libvpx/vpx_dsp/psnr.c
+++ b/media/libvpx/libvpx/vpx_dsp/psnr.c
@@ -1,12 +1,12 @@
 /*
-*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-*
-*  Use of this source code is governed by a BSD-style license
-*  that can be found in the LICENSE file in the root of the source
-*  tree. An additional intellectual property rights grant can be found
-*  in the file PATENTS.  All contributing project authors may
-*  be found in the AUTHORS file in the root of the source tree.
-*/
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 
 #include <math.h>
 #include <assert.h>
@@ -24,59 +24,46 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
 }
 
 /* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
-* and highbd_8_variance(). It should not.
-*/
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, unsigned int *sse,
-                             int *sum) {
+ * and highbd_8_variance(). It should not.
+ */
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+                           int b_stride, int w, int h) {
   int i, j;
-
-  *sum = 0;
-  *sse = 0;
+  int64_t sse = 0;
 
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j++) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      sse += diff * diff;
     }
 
     a += a_stride;
     b += b_stride;
   }
+
+  return sse;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
-                                      const uint8_t *b8, int b_stride, int w,
-                                      int h, uint64_t *sse, int64_t *sum) {
+static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride,
+                                  const uint8_t *b8, int b_stride, int w,
+                                  int h) {
   int i, j;
+  int64_t sse = 0;
 
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
 
   for (i = 0; i < h; i++) {
     for (j = 0; j < w; j++) {
       const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
+      sse += diff * diff;
     }
     a += a_stride;
     b += b_stride;
   }
-}
 
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
-                                      const uint8_t *b8, int b_stride, int w,
-                                      int h, unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
-                            &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
+  return sse;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -85,29 +72,24 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   const int dw = width % 16;
   const int dh = height % 16;
   int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
   int x, y;
 
   if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
-                     height, &sse, &sum);
-    total_sse += sse;
+    total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                             dw, height);
   }
 
   if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride, width - dw, dh,
-                     &sse, &sum);
-    total_sse += sse;
+    total_sse +=
+        encoder_sse(&a[(height - dh) * a_stride], a_stride,
+                    &b[(height - dh) * b_stride], b_stride, width - dw, dh);
   }
 
   for (y = 0; y < height / 16; ++y) {
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
+      total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16);
 
       pa += 16;
       pb += 16;
@@ -146,25 +128,20 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
   int x, y;
   const int dw = width % 16;
   const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
   if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
-                              b_stride, dw, height, &sse, &sum);
-    total_sse += sse;
+    total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw],
+                                    b_stride, dw, height);
   }
   if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
+    total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride,
+                                    &b[(height - dh) * b_stride], b_stride,
+                                    width - dw, dh);
   }
   for (y = 0; y < height / 16; ++y) {
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
+      total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16);
       pa += 16;
       pb += 16;
     }
@@ -200,7 +177,8 @@ int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
-                          uint32_t bit_depth, uint32_t in_bit_depth) {
+                          uint32_t bit_depth, uint32_t in_bit_depth,
+                          int spatial_layer_id) {
   const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
   const int heights[3] = { a->y_crop_height, a->uv_crop_height,
                            a->uv_crop_height };
@@ -242,12 +220,13 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
   psnr->samples[0] = total_samples;
   psnr->psnr[0] =
       vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+  psnr->spatial_layer_id = spatial_layer_id;
 }
 
 #endif  // !CONFIG_VP9_HIGHBITDEPTH
 
 void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                   PSNR_STATS *psnr) {
+                   PSNR_STATS *psnr, int spatial_layer_id) {
   static const double peak = 255.0;
   const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
   const int heights[3] = { a->y_crop_height, a->uv_crop_height,
@@ -278,4 +257,5 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
   psnr->samples[0] = total_samples;
   psnr->psnr[0] =
       vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+  psnr->spatial_layer_id = spatial_layer_id;
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.h b/media/libvpx/libvpx/vpx_dsp/psnr.h
index f321131d0b..88fde5bb7e 100644
--- a/media/libvpx/libvpx/vpx_dsp/psnr.h
+++ b/media/libvpx/libvpx/vpx_dsp/psnr.h
@@ -1,17 +1,18 @@
 /*
-*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
-*
-*  Use of this source code is governed by a BSD-style license
-*  that can be found in the LICENSE file in the root of the source
-*  tree. An additional intellectual property rights grant can be found
-*  in the file PATENTS.  All contributing project authors may
-*  be found in the AUTHORS file in the root of the source tree.
-*/
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 
-#ifndef VPX_DSP_PSNR_H_
-#define VPX_DSP_PSNR_H_
+#ifndef VPX_VPX_DSP_PSNR_H_
+#define VPX_VPX_DSP_PSNR_H_
 
 #include "vpx_scale/yv12config.h"
+#include "vpx/vpx_encoder.h"
 
 #define MAX_PSNR 100.0
 
@@ -19,22 +20,18 @@
 extern "C" {
 #endif
 
-typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
-} PSNR_STATS;
+typedef struct vpx_psnr_pkt PSNR_STATS;
 
 // TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
 
 /*!\brief Converts SSE to PSNR
-*
-* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
-*
-* \param[in]    samples       Number of samples
-* \param[in]    peak          Max sample value
-* \param[in]    sse           Sum of squared errors
-*/
+ *
+ * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
 double vpx_sse_to_psnr(double samples, double peak, double sse);
 int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -42,10 +39,11 @@ int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
                              const YV12_BUFFER_CONFIG *b);
 void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
-                          unsigned int bit_depth, unsigned int in_bit_depth);
+                          unsigned int bit_depth, unsigned int in_bit_depth,
+                          int spatial_layer_id);
 #endif
 void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                   PSNR_STATS *psnr);
+                   PSNR_STATS *psnr, int spatial_layer_id);
 
 double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
                    const YV12_BUFFER_CONFIG *dest, double *phvs_y,
@@ -54,4 +52,4 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // VPX_DSP_PSNR_H_
+#endif  // VPX_VPX_DSP_PSNR_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/psnrhvs.c b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c
index b3910152c4..d7ec1a429a 100644
--- a/media/libvpx/libvpx/vpx_dsp/psnrhvs.c
+++ b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c
@@ -126,8 +126,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
   const uint8_t *_dst8 = dst;
   const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
   const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
-  int16_t dct_s[8 * 8], dct_d[8 * 8];
-  tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+  DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+  DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
   double mask[8][8];
   int pixels;
   int x;
@@ -142,7 +144,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
    been normalized and then squared." Their CSF matrix (from PSNR-HVS)
    was also constructed from the JPEG matrices. I can not find any obvious
    scheme of normalizing to produce their table, but if I multiply their
-   CSF by 0.38857 and square the result I get their masking table.
+   CSF by 0.3885746225901003 and square the result I get their masking table.
    I have no idea where this constant comes from, but deviating from it
    too greatly hurts MOS agreement.
 
@@ -150,11 +152,15 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
    Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
    of DCT basis functions", CD-ROM Proceedings of the Third
    International Workshop on Video Processing and Quality Metrics for Consumer
-   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.
+
+   Suggested in aomedia issue #2363:
+   0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509)
+   of the old JPEG based matrix from the paper. Since you are not using that,
+   divide by actual maximum coefficient. */
   for (x = 0; x < 8; x++)
     for (y = 0; y < 8; y++)
-      mask[x][y] =
-          (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
+      mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]);
   for (y = 0; y < _h - 7; y += _step) {
     for (x = 0; x < _w - 7; x += _step) {
       int i;
diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.c b/media/libvpx/libvpx/vpx_dsp/quantize.c
index 3c7f9832f7..fac9136f8c 100644
--- a/media/libvpx/libvpx/vpx_dsp/quantize.c
+++ b/media/libvpx/libvpx/vpx_dsp/quantize.c
@@ -8,14 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
-void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                     const int16_t dequant, uint16_t *eob_ptr) {
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
   const int coeff_sign = (coeff >> 31);
@@ -25,45 +30,44 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
-    if (tmp) eob = 0;
-  }
+  tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+  tmp = (tmp * quant) >> 16;
+  qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+  dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+  if (tmp) eob = 0;
+
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr) {
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant, uint16_t *eob_ptr) {
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  {
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + round_ptr[0];
     const int abs_qcoeff = (int)((tmp * quant) >> 16);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
     if (abs_qcoeff) eob = 0;
   }
+
   *eob_ptr = eob + 1;
 }
 #endif
 
-void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                           const int16_t dequant, uint16_t *eob_ptr) {
   const int n_coeffs = 1024;
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
@@ -74,89 +78,87 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
-    if (tmp) eob = 0;
-  }
+  tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN,
+              INT16_MAX);
+  tmp = (tmp * quant) >> 15;
+  qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+  dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
+  if (tmp) eob = 0;
+
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
+                                  const int16_t dequant, uint16_t *eob_ptr) {
   const int n_coeffs = 1024;
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  {
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
     const int abs_qcoeff = (int)((tmp * quant) >> 15);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
     if (abs_qcoeff) eob = 0;
   }
+
   *eob_ptr = eob + 1;
 }
 #endif
 
 void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block, const int16_t *zbin_ptr,
-                      const int16_t *round_ptr, const int16_t *quant_ptr,
-                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
+                      const struct macroblock_plane *const mb_plane,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                      const struct ScanOrder *const scan_order) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+  const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  (void)iscan;
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
+  // Pre-scan pass
+  for (i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
 
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
+    if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+      non_zero_count--;
+    else
+      break;
+  }
 
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= zbins[rc != 0]) {
-        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-               quant_shift_ptr[rc != 0]) >>
-              16;  // quantization
-        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+    if (abs_coeff >= zbins[rc != 0]) {
+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+             quant_shift_ptr[rc != 0]) >>
+            16;  // quantization
+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]);
 
-        if (tmp) eob = i;
-      }
+      if (tmp) eob = i;
     }
   }
   *eob_ptr = eob + 1;
@@ -164,156 +166,158 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
+                             const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
+                             const struct ScanOrder *const scan_order) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
-  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+  const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  (void)iscan;
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = (int)n_coeffs - 1; i >= 0; i--) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
+  // Pre-scan pass
+  for (i = (int)n_coeffs - 1; i >= 0; i--) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
 
-      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
-        non_zero_count--;
-      else
-        break;
-    }
+    if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+      non_zero_count--;
+    else
+      break;
+  }
 
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < non_zero_count; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
+  for (i = 0; i < non_zero_count; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= zbins[rc != 0]) {
-        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
-        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
-        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-        if (abs_qcoeff) eob = i;
-      }
+    if (abs_coeff >= zbins[rc != 0]) {
+      const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
+      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+      const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      if (abs_qcoeff) eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
 #endif
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
+                            const struct macroblock_plane *const mb_plane,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+                            const struct ScanOrder *const scan_order) {
+  const int n_coeffs = 32 * 32;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+                         ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   int idx = 0;
-  int idx_arr[1024];
+  int idx_arr[32 * 32 /* n_coeffs */];
   int i, eob = -1;
-  (void)iscan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
 
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
+    // If the coefficient is out of the base ZBIN range, keep it for
+    // quantization.
+    if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i;
+  }
 
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      int tmp;
-      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
-             quant_shift_ptr[rc != 0]) >>
-            15;
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = scan[idx_arr[i]];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    int tmp;
+    int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+    abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+    tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
+           quant_shift_ptr[rc != 0]) >>
+          15;
 
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+    // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than
+    // truncating with a cast, saturate the value. This is easier to implement
+    // on x86 and preserves the sign of the value.
+    dqcoeff_ptr[rc] =
+        clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif  // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH
 
-      if (tmp) eob = idx_arr[i];
-    }
+    if (tmp) eob = idx_arr[i];
   }
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+  const intptr_t n_coeffs = 32 * 32;
+  const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1),
+                         ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
+  const int16_t *scan = scan_order->scan;
 
   int idx = 0;
   int idx_arr[1024];
   int i, eob = -1;
-  (void)iscan;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs; i++) {
+    const int rc = scan[i];
+    const int coeff = coeff_ptr[rc];
 
-      // If the coefficient is out of the base ZBIN range, keep it for
-      // quantization.
-      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
-        idx_arr[idx++] = i;
-    }
+    // If the coefficient is out of the base ZBIN range, keep it for
+    // quantization.
+    if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i;
+  }
 
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = scan[idx_arr[i]];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff) eob = idx_arr[i];
-    }
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = scan[idx_arr[i]];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+    const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+    if (abs_qcoeff) eob = idx_arr[i];
   }
   *eob_ptr = eob + 1;
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.h b/media/libvpx/libvpx/vpx_dsp/quantize.h
index e132845463..8e138445e2 100644
--- a/media/libvpx/libvpx/vpx_dsp/quantize.h
+++ b/media/libvpx/libvpx/vpx_dsp/quantize.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_QUANTIZE_H_
-#define VPX_DSP_QUANTIZE_H_
+#ifndef VPX_VPX_DSP_QUANTIZE_H_
+#define VPX_VPX_DSP_QUANTIZE_H_
 
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
@@ -18,31 +18,29 @@
 extern "C" {
 #endif
 
-void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+                     const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
+                     const int16_t dequant, uint16_t *eob_ptr);
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                           const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+                           const int16_t dequant, uint16_t *eob_ptr);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
-                            uint16_t *eob_ptr);
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant, uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
+                                  const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
+                                  const int16_t dequant, uint16_t *eob_ptr);
 #endif
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_QUANTIZE_H_
+#endif  // VPX_VPX_DSP_QUANTIZE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/sad.c b/media/libvpx/libvpx/vpx_dsp/sad.c
index c80ef729bf..2a4c81d588 100644
--- a/media/libvpx/libvpx/vpx_dsp/sad.c
+++ b/media/libvpx/libvpx/vpx_dsp/sad.c
@@ -17,61 +17,63 @@
 #include "vpx_ports/mem.h"
 
 /* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
+static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               int width, int height) {
   int y, x;
   unsigned int sad = 0;
 
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
-#define sadMxN(m, n)                                                        \
-  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \
-                                    const uint8_t *ref, int ref_stride) {   \
-    return sad(src, src_stride, ref, ref_stride, m, n);                     \
-  }                                                                         \
-  unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                        const uint8_t *ref, int ref_stride, \
-                                        const uint8_t *second_pred) {       \
-    uint8_t comp_pred[m * n];                                               \
-    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \
-    return sad(src, src_stride, comp_pred, m, m, n);                        \
+#define sadMxN(m, n)                                                          \
+  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);               \
+  }                                                                           \
+  unsigned int vpx_sad##m##x##n##_avg_c(                                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]);                           \
+    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \
+    return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
+  }                                                                           \
+  unsigned int vpx_sad_skip_##m##x##n##_c(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m),     \
+                   (n / 2));                                                  \
   }
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                   \
-  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \
-                                  const uint8_t *ref_array, int ref_stride, \
-                                  uint32_t *sad_array) {                    \
-    int i;                                                                  \
-    for (i = 0; i < k; ++i)                                                 \
-      sad_array[i] =                                                        \
-          vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
-  }
-
-// This appears to be equivalent to the above when k == 4 and refs is const
-#define sadMxNx4D(m, n)                                                    \
-  void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
-                               const uint8_t *const ref_array[],           \
-                               int ref_stride, uint32_t *sad_array) {      \
-    int i;                                                                 \
-    for (i = 0; i < 4; ++i)                                                \
-      sad_array[i] =                                                       \
-          vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+// Compare |src_ptr| to 4 distinct references in |ref_array[4]|
+#define sadMxNx4D(m, n)                                                        \
+  void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,         \
+                               const uint8_t *const ref_array[4],              \
+                               int ref_stride, uint32_t sad_array[4]) {        \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i)                                                    \
+      sad_array[i] =                                                           \
+          vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
+  }                                                                            \
+  void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,   \
+                                     const uint8_t *const ref_array[4],        \
+                                     int ref_stride, uint32_t sad_array[4]) {  \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i],            \
+                             2 * ref_stride, (m), (n / 2));                    \
+    }                                                                          \
   }
 
 /* clang-format off */
 // 64x64
 sadMxN(64, 64)
-sadMxNxK(64, 64, 3)
-sadMxNxK(64, 64, 8)
 sadMxNx4D(64, 64)
 
 // 64x32
@@ -84,8 +86,6 @@ sadMxNx4D(32, 64)
 
 // 32x32
 sadMxN(32, 32)
-sadMxNxK(32, 32, 3)
-sadMxNxK(32, 32, 8)
 sadMxNx4D(32, 32)
 
 // 32x16
@@ -98,118 +98,110 @@ sadMxNx4D(16, 32)
 
 // 16x16
 sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
 sadMxNx4D(16, 16)
 
 // 16x8
 sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
 sadMxNx4D(16, 8)
 
 // 8x16
 sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
 sadMxNx4D(8, 16)
 
 // 8x8
 sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
 sadMxNx4D(8, 8)
 
 // 8x4
 sadMxN(8, 4)
-sadMxNxK(8, 4, 8)
 sadMxNx4D(8, 4)
 
 // 4x8
 sadMxN(4, 8)
-sadMxNxK(4, 8, 8)
 sadMxNx4D(4, 8)
 
 // 4x4
 sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
 sadMxNx4D(4, 4)
 /* clang-format on */
 
 #if CONFIG_VP9_HIGHBITDEPTH
         static INLINE
-    unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
+    unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride,
+                            const uint8_t *ref8_ptr, int ref_stride, int width,
+                            int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
-                                       const uint16_t *b, int b_stride,
+static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
+                                       const uint16_t *ref_ptr, int ref_stride,
                                        int width, int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
 #define highbd_sadMxN(m, n)                                                    \
-  unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                           const uint8_t *ref,                 \
-                                           int ref_stride) {                   \
-    return highbd_sad(src, src_stride, ref, ref_stride, m, n);                 \
+  unsigned int vpx_highbd_sad##m##x##n##_c(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride) {                                                        \
+    return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);         \
   }                                                                            \
   unsigned int vpx_highbd_sad##m##x##n##_avg_c(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred) {                                            \
-    uint16_t comp_pred[m * n];                                                 \
-    vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]);                           \
+    vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
+                               n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride);   \
+    return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n);               \
+  }                                                                            \
+  unsigned int vpx_highbd_sad_skip_##m##x##n##_c(                              \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                  \
+      int ref_stride) {                                                        \
+    return 2 *                                                                 \
+           highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
   }
 
-#define highbd_sadMxNxK(m, n, k)                                             \
-  void vpx_highbd_sad##m##x##n##x##k##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref_array,          \
-      int ref_stride, uint32_t *sad_array) {                                 \
-    int i;                                                                   \
-    for (i = 0; i < k; ++i) {                                                \
-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 &ref_array[i], ref_stride); \
-    }                                                                        \
-  }
-
-#define highbd_sadMxNx4D(m, n)                                               \
-  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
-                                      const uint8_t *const ref_array[],      \
-                                      int ref_stride, uint32_t *sad_array) { \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 ref_array[i], ref_stride);  \
-    }                                                                        \
+#define highbd_sadMxNx4D(m, n)                                                 \
+  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,  \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,          \
+                                                 ref_array[i], ref_stride);    \
+    }                                                                          \
+  }                                                                            \
+  void vpx_highbd_sad_skip_##m##x##n##x4d_c(                                   \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4],   \
+      int ref_stride, uint32_t sad_array[4]) {                                 \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c(                        \
+          src, src_stride, ref_array[i], ref_stride);                          \
+    }                                                                          \
   }
 
 /* clang-format off */
 // 64x64
 highbd_sadMxN(64, 64)
-highbd_sadMxNxK(64, 64, 3)
-highbd_sadMxNxK(64, 64, 8)
 highbd_sadMxNx4D(64, 64)
 
 // 64x32
@@ -222,8 +214,6 @@ highbd_sadMxNx4D(32, 64)
 
 // 32x32
 highbd_sadMxN(32, 32)
-highbd_sadMxNxK(32, 32, 3)
-highbd_sadMxNxK(32, 32, 8)
 highbd_sadMxNx4D(32, 32)
 
 // 32x16
@@ -236,42 +226,30 @@ highbd_sadMxNx4D(16, 32)
 
 // 16x16
 highbd_sadMxN(16, 16)
-highbd_sadMxNxK(16, 16, 3)
-highbd_sadMxNxK(16, 16, 8)
 highbd_sadMxNx4D(16, 16)
 
 // 16x8
 highbd_sadMxN(16, 8)
-highbd_sadMxNxK(16, 8, 3)
-highbd_sadMxNxK(16, 8, 8)
 highbd_sadMxNx4D(16, 8)
 
 // 8x16
 highbd_sadMxN(8, 16)
-highbd_sadMxNxK(8, 16, 3)
-highbd_sadMxNxK(8, 16, 8)
 highbd_sadMxNx4D(8, 16)
 
 // 8x8
 highbd_sadMxN(8, 8)
-highbd_sadMxNxK(8, 8, 3)
-highbd_sadMxNxK(8, 8, 8)
 highbd_sadMxNx4D(8, 8)
 
 // 8x4
 highbd_sadMxN(8, 4)
-highbd_sadMxNxK(8, 4, 8)
 highbd_sadMxNx4D(8, 4)
 
 // 4x8
 highbd_sadMxN(4, 8)
-highbd_sadMxNxK(4, 8, 8)
 highbd_sadMxNx4D(4, 8)
 
 // 4x4
 highbd_sadMxN(4, 4)
-highbd_sadMxNxK(4, 4, 3)
-highbd_sadMxNxK(4, 4, 8)
 highbd_sadMxNx4D(4, 4)
 /* clang-format on */
 
diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.c b/media/libvpx/libvpx/vpx_dsp/skin_detection.c
new file mode 100644
index 0000000000..bbbb6c3a17
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/skin_detection.h"
+
+#define MODEL_MODE 1
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[5][2] = { { 7463, 9614 },
+                                     { 6400, 10240 },
+                                     { 7040, 10240 },
+                                     { 8320, 9280 },
+                                     { 6800, 9614 } };
+static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 };  // q16
+static const int skin_threshold[6] = { 1570636, 1400000, 800000,
+                                       800000,  800000,  800000 };  // q18
+// Thresholds on luminance.
+static const int y_low = 40;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int vpx_evaluate_skin_color_difference(const int cb, const int cr,
+                                              const int idx) {
+  const int cb_q6 = cb << 6;
+  const int cr_q6 = cr << 6;
+  const int cb_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+  const int cbcr_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+  const int cr_diff_q12 =
+      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
+  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+  const int skin_diff =
+      skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
+      skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
+  return skin_diff;
+}
+
+// Checks if the input yCbCr values corresponds to skin color.
+int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) {
+  if (y < y_low || y > y_high) {
+    return 0;
+  } else if (MODEL_MODE == 0) {
+    return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+  } else {
+    int i = 0;
+    // Exit on grey.
+    if (cb == 128 && cr == 128) return 0;
+    // Exit on very strong cb.
+    if (cb > 150 && cr < 110) return 0;
+    for (; i < 5; ++i) {
+      int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i);
+      if (skin_color_diff < skin_threshold[i + 1]) {
+        if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
+          return 0;
+        } else if (motion == 0 &&
+                   skin_color_diff > (skin_threshold[i + 1] >> 1)) {
+          return 0;
+        } else {
+          return 1;
+        }
+      }
+      // Exit if difference is much large than the threshold.
+      if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+        return 0;
+      }
+    }
+    return 0;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.h b/media/libvpx/libvpx/vpx_dsp/skin_detection.h
new file mode 100644
index 0000000000..91640c33d5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_SKIN_DETECTION_H_
+#define VPX_VPX_DSP_SKIN_DETECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int vpx_skin_pixel(const int y, const int cb, const int cr, int motion);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_DSP_SKIN_DETECTION_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/sse.c b/media/libvpx/libvpx/vpx_dsp/sse.c
new file mode 100644
index 0000000000..c9d751859d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/sse.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Sum the square of the difference between every corresponding element of the
+ * buffers.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
+                  int b_stride, int width, int height) {
+  int y, x;
+  int64_t sse = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int32_t diff = abs(a[x] - b[x]);
+      sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                         int b_stride, int width, int height) {
+  int y, x;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
+      sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sse;
+}
+#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.c b/media/libvpx/libvpx/vpx_dsp/ssim.c
index 7a29bd29f9..4a31f3d223 100644
--- a/media/libvpx/libvpx/vpx_dsp/ssim.c
+++ b/media/libvpx/libvpx/vpx_dsp/ssim.c
@@ -15,21 +15,6 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
 
-void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
-                            uint32_t *sum_s, uint32_t *sum_r,
-                            uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                            uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 16; i++, s += sp, r += rp) {
-    for (j = 0; j < 16; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
 void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
                           uint32_t *sum_sq_r, uint32_t *sum_sxr) {
@@ -73,7 +58,7 @@ static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2
 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
                          uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                          uint32_t bd) {
-  int64_t ssim_n, ssim_d;
+  double ssim_n, ssim_d;
   int64_t c1, c2;
   if (bd == 8) {
     // scale the constants by number of pixels
@@ -90,14 +75,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
     assert(0);
   }
 
-  ssim_n = (2 * sum_s * sum_r + c1) *
-           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+  ssim_n = (2.0 * sum_s * sum_r + c1) *
+           (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
 
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+  ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+           ((double)count * sum_sq_s - (double)sum_s * sum_s +
+            (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
 
-  return ssim_n * 1.0 / ssim_d;
+  return ssim_n / ssim_d;
 }
 
 static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
@@ -284,7 +269,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
   for (i = 0; i < height;
        i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
     for (j = 0; j < width; j += 4, ++c) {
-      Ssimv sv = { 0 };
+      Ssimv sv = { 0, 0, 0, 0, 0, 0 };
       double ssim;
       double ssim2;
       double dssim;
diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.h b/media/libvpx/libvpx/vpx_dsp/ssim.h
index 4f2bb1d556..c382237fc6 100644
--- a/media/libvpx/libvpx/vpx_dsp/ssim.h
+++ b/media/libvpx/libvpx/vpx_dsp/ssim.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_SSIM_H_
-#define VPX_DSP_SSIM_H_
+#ifndef VPX_VPX_DSP_SSIM_H_
+#define VPX_VPX_DSP_SSIM_H_
 
 #define MAX_SSIM_DB 100.0;
 
@@ -84,4 +84,4 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_SSIM_H_
+#endif  // VPX_VPX_DSP_SSIM_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/subtract.c b/media/libvpx/libvpx/vpx_dsp/subtract.c
index 95e7071b27..45c819e67a 100644
--- a/media/libvpx/libvpx/vpx_dsp/subtract.c
+++ b/media/libvpx/libvpx/vpx_dsp/subtract.c
@@ -16,37 +16,37 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
-void vpx_subtract_block_c(int rows, int cols, int16_t *diff,
-                          ptrdiff_t diff_stride, const uint8_t *src,
-                          ptrdiff_t src_stride, const uint8_t *pred,
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                           ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+    for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 
-    diff += diff_stride;
-    pred += pred_stride;
-    src += src_stride;
+    diff_ptr += diff_stride;
+    pred_ptr += pred_stride;
+    src_ptr += src_stride;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
-                                 ptrdiff_t diff_stride, const uint8_t *src8,
-                                 ptrdiff_t src_stride, const uint8_t *pred8,
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+                                 ptrdiff_t diff_stride, const uint8_t *src8_ptr,
+                                 ptrdiff_t src_stride, const uint8_t *pred8_ptr,
                                  ptrdiff_t pred_stride, int bd) {
   int r, c;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
   (void)bd;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++) {
-      diff[c] = src[c] - pred[c];
+      diff_ptr[c] = src[c] - pred[c];
     }
 
-    diff += diff_stride;
+    diff_ptr += diff_stride;
     pred += pred_stride;
     src += src_stride;
   }
diff --git a/media/libvpx/libvpx/vpx_dsp/sum_squares.c b/media/libvpx/libvpx/vpx_dsp/sum_squares.c
index 7c535ac2db..b80cd588e4 100644
--- a/media/libvpx/libvpx/vpx_dsp/sum_squares.c
+++ b/media/libvpx/libvpx/vpx_dsp/sum_squares.c
@@ -10,8 +10,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 
-uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
-                                  int size) {
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) {
   int r, c;
   uint64_t ss = 0;
 
@@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
       const int16_t v = src[c];
       ss += v * v;
     }
-    src += src_stride;
+    src += stride;
   }
 
   return ss;
diff --git a/media/libvpx/libvpx/vpx_dsp/txfm_common.h b/media/libvpx/libvpx/vpx_dsp/txfm_common.h
index fd27f928ed..25f4fdb327 100644
--- a/media/libvpx/libvpx/vpx_dsp/txfm_common.h
+++ b/media/libvpx/libvpx/vpx_dsp/txfm_common.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_TXFM_COMMON_H_
-#define VPX_DSP_TXFM_COMMON_H_
+#ifndef VPX_VPX_DSP_TXFM_COMMON_H_
+#define VPX_VPX_DSP_TXFM_COMMON_H_
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
@@ -25,42 +25,42 @@
 //    printf("static const int cospi_%d_64 = %.0f;\n", i,
 //           round(16384 * cos(i*M_PI/64)));
 // Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const tran_high_t cospi_1_64 = 16364;
-static const tran_high_t cospi_2_64 = 16305;
-static const tran_high_t cospi_3_64 = 16207;
-static const tran_high_t cospi_4_64 = 16069;
-static const tran_high_t cospi_5_64 = 15893;
-static const tran_high_t cospi_6_64 = 15679;
-static const tran_high_t cospi_7_64 = 15426;
-static const tran_high_t cospi_8_64 = 15137;
-static const tran_high_t cospi_9_64 = 14811;
-static const tran_high_t cospi_10_64 = 14449;
-static const tran_high_t cospi_11_64 = 14053;
-static const tran_high_t cospi_12_64 = 13623;
-static const tran_high_t cospi_13_64 = 13160;
-static const tran_high_t cospi_14_64 = 12665;
-static const tran_high_t cospi_15_64 = 12140;
-static const tran_high_t cospi_16_64 = 11585;
-static const tran_high_t cospi_17_64 = 11003;
-static const tran_high_t cospi_18_64 = 10394;
-static const tran_high_t cospi_19_64 = 9760;
-static const tran_high_t cospi_20_64 = 9102;
-static const tran_high_t cospi_21_64 = 8423;
-static const tran_high_t cospi_22_64 = 7723;
-static const tran_high_t cospi_23_64 = 7005;
-static const tran_high_t cospi_24_64 = 6270;
-static const tran_high_t cospi_25_64 = 5520;
-static const tran_high_t cospi_26_64 = 4756;
-static const tran_high_t cospi_27_64 = 3981;
-static const tran_high_t cospi_28_64 = 3196;
-static const tran_high_t cospi_29_64 = 2404;
-static const tran_high_t cospi_30_64 = 1606;
-static const tran_high_t cospi_31_64 = 804;
+static const tran_coef_t cospi_1_64 = 16364;
+static const tran_coef_t cospi_2_64 = 16305;
+static const tran_coef_t cospi_3_64 = 16207;
+static const tran_coef_t cospi_4_64 = 16069;
+static const tran_coef_t cospi_5_64 = 15893;
+static const tran_coef_t cospi_6_64 = 15679;
+static const tran_coef_t cospi_7_64 = 15426;
+static const tran_coef_t cospi_8_64 = 15137;
+static const tran_coef_t cospi_9_64 = 14811;
+static const tran_coef_t cospi_10_64 = 14449;
+static const tran_coef_t cospi_11_64 = 14053;
+static const tran_coef_t cospi_12_64 = 13623;
+static const tran_coef_t cospi_13_64 = 13160;
+static const tran_coef_t cospi_14_64 = 12665;
+static const tran_coef_t cospi_15_64 = 12140;
+static const tran_coef_t cospi_16_64 = 11585;
+static const tran_coef_t cospi_17_64 = 11003;
+static const tran_coef_t cospi_18_64 = 10394;
+static const tran_coef_t cospi_19_64 = 9760;
+static const tran_coef_t cospi_20_64 = 9102;
+static const tran_coef_t cospi_21_64 = 8423;
+static const tran_coef_t cospi_22_64 = 7723;
+static const tran_coef_t cospi_23_64 = 7005;
+static const tran_coef_t cospi_24_64 = 6270;
+static const tran_coef_t cospi_25_64 = 5520;
+static const tran_coef_t cospi_26_64 = 4756;
+static const tran_coef_t cospi_27_64 = 3981;
+static const tran_coef_t cospi_28_64 = 3196;
+static const tran_coef_t cospi_29_64 = 2404;
+static const tran_coef_t cospi_30_64 = 1606;
+static const tran_coef_t cospi_31_64 = 804;
 
 //  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const tran_high_t sinpi_1_9 = 5283;
-static const tran_high_t sinpi_2_9 = 9929;
-static const tran_high_t sinpi_3_9 = 13377;
-static const tran_high_t sinpi_4_9 = 15212;
+static const tran_coef_t sinpi_1_9 = 5283;
+static const tran_coef_t sinpi_2_9 = 9929;
+static const tran_coef_t sinpi_3_9 = 13377;
+static const tran_coef_t sinpi_4_9 = 15212;
 
-#endif  // VPX_DSP_TXFM_COMMON_H_
+#endif  // VPX_VPX_DSP_TXFM_COMMON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/variance.c b/media/libvpx/libvpx/vpx_dsp/variance.c
index d1fa0560d5..1c476542fa 100644
--- a/media/libvpx/libvpx/vpx_dsp/variance.c
+++ b/media/libvpx/libvpx/vpx_dsp/variance.c
@@ -21,36 +21,37 @@ static const uint8_t bilinear_filters[8][2] = {
   { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
 };
 
-uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride) {
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *ref_ptr, int ref_stride) {
   int distortion = 0;
   int r, c;
 
   for (r = 0; r < 4; ++r) {
     for (c = 0; c < 4; ++c) {
-      int diff = a[c] - b[c];
+      int diff = src_ptr[c] - ref_ptr[c];
       distortion += diff * diff;
     }
 
-    a += a_stride;
-    b += b_stride;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
   }
 
   return distortion;
 }
 
-uint32_t vpx_get_mb_ss_c(const int16_t *a) {
+uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
 
   for (i = 0; i < 256; ++i) {
-    sum += a[i] * a[i];
+    sum += src_ptr[i] * src_ptr[i];
   }
 
   return sum;
 }
 
-static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
+static void variance(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t *ref_ptr, int ref_stride, int w, int h,
+                     uint32_t *sse, int *sum) {
   int i, j;
 
   *sum = 0;
@@ -58,13 +59,13 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
 
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; ++j) {
-      const int diff = a[j] - b[j];
+      const int diff = src_ptr[j] - ref_ptr[j];
       *sum += diff;
       *sse += diff * diff;
     }
 
-    a += a_stride;
-    b += b_stride;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
   }
 }
 
@@ -76,24 +77,23 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
 // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
 // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
 // It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const uint8_t *filter) {
+static void var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line,
+    int pixel_step, unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
-      b[j] = ROUND_POWER_OF_TWO(
-          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+      ref_ptr[j] = ROUND_POWER_OF_TWO(
+          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+          FILTER_BITS);
 
-      ++a;
+      ++src_ptr;
     }
 
-    a += src_pixels_per_line - output_width;
-    b += output_width;
+    src_ptr += src_pixels_per_line - output_width;
+    ref_ptr += output_width;
   }
 }
 
@@ -106,91 +106,90 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
 // filter is applied horizontally (pixel_step = 1) or vertically
 // (pixel_step = stride). It defines the offset required to move from one input
 // to the next. Output is 8-bit.
-static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const uint8_t *filter) {
+static void var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
-      b[j] = ROUND_POWER_OF_TWO(
-          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
-      ++a;
+      ref_ptr[j] = ROUND_POWER_OF_TWO(
+          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+          FILTER_BITS);
+      ++src_ptr;
     }
 
-    a += src_pixels_per_line - output_width;
-    b += output_width;
+    src_ptr += src_pixels_per_line - output_width;
+    ref_ptr += output_width;
   }
 }
 
-#define VAR(W, H)                                                    \
-  uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                     const uint8_t *b, int b_stride, \
-                                     uint32_t *sse) {                \
-    int sum;                                                         \
-    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
+#define VAR(W, H)                                                            \
+  uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \
+                                     const uint8_t *ref_ptr, int ref_stride, \
+                                     uint32_t *sse) {                        \
+    int sum;                                                                 \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);     \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                \
   }
 
-#define SUBPIX_VAR(W, H)                                                \
-  uint32_t vpx_sub_pixel_variance##W##x##H##_c(                         \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
-      const uint8_t *b, int b_stride, uint32_t *sse) {                  \
-    uint16_t fdata3[(H + 1) * W];                                       \
-    uint8_t temp2[H * W];                                               \
-                                                                        \
-    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                      bilinear_filters[xoffset]);       \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
-                                       bilinear_filters[yoffset]);      \
-                                                                        \
-    return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse);       \
+#define SUBPIX_VAR(W, H)                                                     \
+  uint32_t vpx_sub_pixel_variance##W##x##H##_c(                              \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,    \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {               \
+    uint16_t fdata3[(H + 1) * W];                                            \
+    uint8_t temp2[H * W];                                                    \
+                                                                             \
+    var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
+                                      W, bilinear_filters[x_offset]);        \
+    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,            \
+                                       bilinear_filters[y_offset]);          \
+                                                                             \
+    return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse);    \
   }
 
-#define SUBPIX_AVG_VAR(W, H)                                            \
-  uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(                     \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,         \
-      const uint8_t *b, int b_stride, uint32_t *sse,                    \
-      const uint8_t *second_pred) {                                     \
-    uint16_t fdata3[(H + 1) * W];                                       \
-    uint8_t temp2[H * W];                                               \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                         \
-                                                                        \
-    var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                      bilinear_filters[xoffset]);       \
-    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
-                                       bilinear_filters[yoffset]);      \
-                                                                        \
-    vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W);              \
-                                                                        \
-    return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse);       \
+#define SUBPIX_AVG_VAR(W, H)                                                 \
+  uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(                          \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,    \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                 \
+      const uint8_t *second_pred) {                                          \
+    uint16_t fdata3[(H + 1) * W];                                            \
+    uint8_t temp2[H * W];                                                    \
+    DECLARE_ALIGNED(32, uint8_t, temp3[H * W]);                              \
+                                                                             \
+    var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \
+                                      W, bilinear_filters[x_offset]);        \
+    var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,            \
+                                       bilinear_filters[y_offset]);          \
+                                                                             \
+    vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W);                 \
+                                                                             \
+    return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse);    \
   }
 
 /* Identical to the variance call except it takes an additional parameter, sum,
  * and returns that value using pass-by-reference instead of returning
  * sse - sum^2 / w*h
  */
-#define GET_VAR(W, H)                                                         \
-  void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
-                               const uint8_t *b, int b_stride, uint32_t *sse, \
-                               int *sum) {                                    \
-    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
+#define GET_VAR(W, H)                                                   \
+  void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride,  \
+                               const uint8_t *ref_ptr, int ref_stride,  \
+                               uint32_t *sse, int *sum) {               \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \
   }
 
 /* Identical to the variance call except it does not calculate the
- * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed in
  * variable.
  */
-#define MSE(W, H)                                               \
-  uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                const uint8_t *b, int b_stride, \
-                                uint32_t *sse) {                \
-    int sum;                                                    \
-    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
-    return *sse;                                                \
+#define MSE(W, H)                                                        \
+  uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride,  \
+                                const uint8_t *ref_ptr, int ref_stride,  \
+                                uint32_t *sse) {                         \
+    int sum;                                                             \
+    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \
+    return *sse;                                                         \
   }
 
 /* All three forms of the variance are available in the same sizes. */
@@ -237,128 +236,140 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_variance64(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int w, int h,
-                              uint64_t *sse, int64_t *sum) {
+static void highbd_variance64(const uint8_t *src8_ptr, int src_stride,
+                              const uint8_t *ref8_ptr, int ref_stride, int w,
+                              int h, uint64_t *sse, int64_t *sum) {
   int i, j;
 
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
   *sum = 0;
   *sse = 0;
 
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; ++j) {
-      const int diff = a[j] - b[j];
+      const int diff = src_ptr[j] - ref_ptr[j];
       *sum += diff;
       *sse += diff * diff;
     }
-    a += a_stride;
-    b += b_stride;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
   }
 }
 
-static void highbd_8_variance(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int w, int h,
-                              uint32_t *sse, int *sum) {
+static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride,
+                              const uint8_t *ref8_ptr, int ref_stride, int w,
+                              int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
   int64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+                    &sum_long);
   *sse = (uint32_t)sse_long;
   *sum = (int)sum_long;
 }
 
-static void highbd_10_variance(const uint8_t *a8, int a_stride,
-                               const uint8_t *b8, int b_stride, int w, int h,
-                               uint32_t *sse, int *sum) {
+static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride,
+                               const uint8_t *ref8_ptr, int ref_stride, int w,
+                               int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
   int64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+                    &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
 }
 
-static void highbd_12_variance(const uint8_t *a8, int a_stride,
-                               const uint8_t *b8, int b_stride, int w, int h,
-                               uint32_t *sse, int *sum) {
+static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
+                               const uint8_t *ref8_ptr, int ref_stride, int w,
+                               int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
   int64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+                    &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
 }
 
-#define HIGHBD_VAR(W, H)                                                       \
-  uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
-                                              const uint8_t *b, int b_stride,  \
-                                              uint32_t *sse) {                 \
-    int sum;                                                                   \
-    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
-    return *sse - (((int64_t)sum * sum) / (W * H));                            \
-  }                                                                            \
-                                                                               \
-  uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                               const uint8_t *b, int b_stride, \
-                                               uint32_t *sse) {                \
-    int sum;                                                                   \
-    int64_t var;                                                               \
-    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                               const uint8_t *b, int b_stride, \
-                                               uint32_t *sse) {                \
-    int sum;                                                                   \
-    int64_t var;                                                               \
-    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
+#define HIGHBD_VAR(W, H)                                                    \
+  uint32_t vpx_highbd_8_variance##W##x##H##_c(                              \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse,  \
+                      &sum);                                                \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
+  }                                                                         \
+                                                                            \
+  uint32_t vpx_highbd_10_variance##W##x##H##_c(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    int64_t var;                                                            \
+    highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+                       &sum);                                               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
+  }                                                                         \
+                                                                            \
+  uint32_t vpx_highbd_12_variance##W##x##H##_c(                             \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    int64_t var;                                                            \
+    highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+                       &sum);                                               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
+    return (var >= 0) ? (uint32_t)var : 0;                                  \
   }
 
-#define HIGHBD_GET_VAR(S)                                                    \
-  void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride,  \
-                                        const uint8_t *ref, int ref_stride,  \
-                                        uint32_t *sse, int *sum) {           \
-    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);     \
-  }                                                                          \
-                                                                             \
-  void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                         const uint8_t *ref, int ref_stride, \
-                                         uint32_t *sse, int *sum) {          \
-    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
-  }                                                                          \
-                                                                             \
-  void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                         const uint8_t *ref, int ref_stride, \
-                                         uint32_t *sse, int *sum) {          \
-    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
+#define HIGHBD_GET_VAR(S)                                                   \
+  void vpx_highbd_8_get##S##x##S##var_c(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse, int *sum) {                            \
+    highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse,  \
+                      sum);                                                 \
+  }                                                                         \
+                                                                            \
+  void vpx_highbd_10_get##S##x##S##var_c(                                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse, int *sum) {                            \
+    highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+                       sum);                                                \
+  }                                                                         \
+                                                                            \
+  void vpx_highbd_12_get##S##x##S##var_c(                                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse, int *sum) {                            \
+    highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+                       sum);                                                \
   }
 
-#define HIGHBD_MSE(W, H)                                                      \
-  uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
-                                         const uint8_t *ref, int ref_stride,  \
-                                         uint32_t *sse) {                     \
-    int sum;                                                                  \
-    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
-    return *sse;                                                              \
-  }                                                                           \
-                                                                              \
-  uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref, int ref_stride, \
-                                          uint32_t *sse) {                    \
-    int sum;                                                                  \
-    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
-    return *sse;                                                              \
-  }                                                                           \
-                                                                              \
-  uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref, int ref_stride, \
-                                          uint32_t *sse) {                    \
-    int sum;                                                                  \
-    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
-    return *sse;                                                              \
+#define HIGHBD_MSE(W, H)                                                    \
+  uint32_t vpx_highbd_8_mse##W##x##H##_c(                                   \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse,  \
+                      &sum);                                                \
+    return *sse;                                                            \
+  }                                                                         \
+                                                                            \
+  uint32_t vpx_highbd_10_mse##W##x##H##_c(                                  \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+                       &sum);                                               \
+    return *sse;                                                            \
+  }                                                                         \
+                                                                            \
+  uint32_t vpx_highbd_12_mse##W##x##H##_c(                                  \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,       \
+      int ref_stride, uint32_t *sse) {                                      \
+    int sum;                                                                \
+    highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+                       &sum);                                               \
+    return *sse;                                                            \
   }
 
 static void highbd_var_filter_block2d_bil_first_pass(
@@ -403,111 +414,111 @@ static void highbd_var_filter_block2d_bil_second_pass(
   }
 }
 
-#define HIGHBD_SUBPIX_VAR(W, H)                                              \
-  uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-                                                                             \
-    highbd_var_filter_block2d_bil_first_pass(                                \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]);    \
-    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,     \
-                                              bilinear_filters[yoffset]);    \
-                                                                             \
-    return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
-                                              dst, dst_stride, sse);         \
-  }                                                                          \
-                                                                             \
-  uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-                                                                             \
-    highbd_var_filter_block2d_bil_first_pass(                                \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]);    \
-    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,     \
-                                              bilinear_filters[yoffset]);    \
-                                                                             \
-    return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
-                                               dst, dst_stride, sse);        \
-  }                                                                          \
-                                                                             \
-  uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-                                                                             \
-    highbd_var_filter_block2d_bil_first_pass(                                \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]);    \
-    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,     \
-                                              bilinear_filters[yoffset]);    \
-                                                                             \
-    return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
-                                               dst, dst_stride, sse);        \
+#define HIGHBD_SUBPIX_VAR(W, H)                                                \
+  uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c(                       \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint16_t temp2[H * W];                                                     \
+                                                                               \
+    highbd_var_filter_block2d_bil_first_pass(                                  \
+        src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
+                                              bilinear_filters[y_offset]);     \
+                                                                               \
+    return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,    \
+                                              ref_ptr, ref_stride, sse);       \
+  }                                                                            \
+                                                                               \
+  uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c(                      \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint16_t temp2[H * W];                                                     \
+                                                                               \
+    highbd_var_filter_block2d_bil_first_pass(                                  \
+        src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
+                                              bilinear_filters[y_offset]);     \
+                                                                               \
+    return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,   \
+                                               ref_ptr, ref_stride, sse);      \
+  }                                                                            \
+                                                                               \
+  uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c(                      \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {                 \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint16_t temp2[H * W];                                                     \
+                                                                               \
+    highbd_var_filter_block2d_bil_first_pass(                                  \
+        src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
+                                              bilinear_filters[y_offset]);     \
+                                                                               \
+    return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,   \
+                                               ref_ptr, ref_stride, sse);      \
   }
 
-#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                          \
-  uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                 \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    highbd_var_filter_block2d_bil_first_pass(                                \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]);    \
-    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,     \
-                                              bilinear_filters[yoffset]);    \
-                                                                             \
-    vpx_highbd_comp_avg_pred(temp3, second_pred, W, H,                       \
-                             CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                             \
-    return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,  \
-                                              dst, dst_stride, sse);         \
-  }                                                                          \
-                                                                             \
-  uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    highbd_var_filter_block2d_bil_first_pass(                                \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]);    \
-    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,     \
-                                              bilinear_filters[yoffset]);    \
-                                                                             \
-    vpx_highbd_comp_avg_pred(temp3, second_pred, W, H,                       \
-                             CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                             \
-    return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
-                                               dst, dst_stride, sse);        \
-  }                                                                          \
-                                                                             \
-  uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred) {                                          \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                             \
-                                                                             \
-    highbd_var_filter_block2d_bil_first_pass(                                \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]);    \
-    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,     \
-                                              bilinear_filters[yoffset]);    \
-                                                                             \
-    vpx_highbd_comp_avg_pred(temp3, second_pred, W, H,                       \
-                             CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                             \
-    return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
-                                               dst, dst_stride, sse);        \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                            \
+  uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                   \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint16_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
+                                                                               \
+    highbd_var_filter_block2d_bil_first_pass(                                  \
+        src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
+                                              bilinear_filters[y_offset]);     \
+                                                                               \
+    vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H,  \
+                               temp2, W);                                      \
+                                                                               \
+    return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
+                                              ref_ptr, ref_stride, sse);       \
+  }                                                                            \
+                                                                               \
+  uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                  \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint16_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
+                                                                               \
+    highbd_var_filter_block2d_bil_first_pass(                                  \
+        src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
+                                              bilinear_filters[y_offset]);     \
+                                                                               \
+    vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H,  \
+                               temp2, W);                                      \
+                                                                               \
+    return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
+                                               ref_ptr, ref_stride, sse);      \
+  }                                                                            \
+                                                                               \
+  uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                  \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,      \
+      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                   \
+      const uint8_t *second_pred) {                                            \
+    uint16_t fdata3[(H + 1) * W];                                              \
+    uint16_t temp2[H * W];                                                     \
+    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
+                                                                               \
+    highbd_var_filter_block2d_bil_first_pass(                                  \
+        src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+    highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,       \
+                                              bilinear_filters[y_offset]);     \
+                                                                               \
+    vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H,  \
+                               temp2, W);                                      \
+                                                                               \
+    return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
+                                               ref_ptr, ref_stride, sse);      \
   }
 
 /* All three forms of the variance are available in the same sizes. */
@@ -538,12 +549,10 @@ HIGHBD_MSE(16, 8)
 HIGHBD_MSE(8, 16)
 HIGHBD_MSE(8, 8)
 
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                              int width, int height, const uint8_t *ref8,
-                              int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
+                                int width, int height, const uint16_t *ref,
+                                int ref_stride) {
   int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       const int tmp = pred[j] + ref[j];
diff --git a/media/libvpx/libvpx/vpx_dsp/variance.h b/media/libvpx/libvpx/vpx_dsp/variance.h
index 4c482551e0..ccdb2f90ba 100644
--- a/media/libvpx/libvpx/vpx_dsp/variance.h
+++ b/media/libvpx/libvpx/vpx_dsp/variance.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_VARIANCE_H_
-#define VPX_DSP_VARIANCE_H_
+#ifndef VPX_VPX_DSP_VARIANCE_H_
+#define VPX_VPX_DSP_VARIANCE_H_
 
 #include "./vpx_config.h"
 
@@ -22,46 +22,45 @@ extern "C" {
 #define FILTER_BITS 7
 #define FILTER_WEIGHT 128
 
-typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride);
+typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride);
 
-typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
-                                         const uint8_t *b_ptr, int b_stride,
+typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
                                          const uint8_t *second_pred);
 
-typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
-                                  int b_stride, int n);
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride,
+                                  uint8_t *ref_ptr, int ref_stride, int n);
 
-typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sad_array);
 
-typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride,
                                      const uint8_t *const b_array[],
-                                     int b_stride, unsigned int *sad_array);
+                                     int ref_stride, unsigned int *sad_array);
 
-typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          unsigned int *sse);
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, unsigned int *sse);
 
-typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
-                                                int xoffset, int yoffset,
-                                                const uint8_t *b, int b_stride,
-                                                unsigned int *sse);
+typedef unsigned int (*vpx_subpixvariance_fn_t)(
+    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 typedef unsigned int (*vpx_subp_avg_variance_fn_t)(
-    const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset,
-    const uint8_t *b_ptr, int b_stride, unsigned int *sse,
+    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
     const uint8_t *second_pred);
+
 #if CONFIG_VP8
 typedef struct variance_vtable {
   vpx_sad_fn_t sdf;
   vpx_variance_fn_t vf;
   vpx_subpixvariance_fn_t svf;
-  vpx_sad_multi_fn_t sdx3f;
-  vpx_sad_multi_fn_t sdx8f;
   vpx_sad_multi_d_fn_t sdx4df;
-#if ARCH_X86 || ARCH_X86_64
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
   vp8_copy32xn_fn_t copymem;
 #endif
 } vp8_variance_fn_ptr_t;
@@ -70,13 +69,15 @@ typedef struct variance_vtable {
 #if CONFIG_VP9
 typedef struct vp9_variance_vtable {
   vpx_sad_fn_t sdf;
+  // Same as normal sad, but downsample the rows by a factor of 2.
+  vpx_sad_fn_t sdsf;
   vpx_sad_avg_fn_t sdaf;
   vpx_variance_fn_t vf;
   vpx_subpixvariance_fn_t svf;
   vpx_subp_avg_variance_fn_t svaf;
-  vpx_sad_multi_fn_t sdx3f;
-  vpx_sad_multi_fn_t sdx8f;
   vpx_sad_multi_d_fn_t sdx4df;
+  // Same as sadx4, but downsample the rows by a factor of 2.
+  vpx_sad_multi_d_fn_t sdsx4df;
 } vp9_variance_fn_ptr_t;
 #endif  // CONFIG_VP9
 
@@ -84,4 +85,4 @@ typedef struct vp9_variance_vtable {
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_VARIANCE_H_
+#endif  // VPX_VPX_DSP_VARIANCE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c
index cab6368e60..e55a963f9d 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c
@@ -113,11 +113,52 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-                     int x0_q4, int x_step_q4,
-                     const InterpKernel *const y_filters, int y0_q4,
-                     int y_step_q4, int w, int h) {
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+                 h);
+}
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     w, h);
+}
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4, int x_step_q4,
+                          int y0_q4, int y_step_q4, int w, int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+                h);
+}
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+                    w, h);
+}
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
   // 2d filtering proceeds in 2 steps:
   //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -130,118 +171,49 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   // --Must round-up because block may be located at sub-pixel position.
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
   uint8_t temp[64 * 135];
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
   assert(w <= 64);
   assert(h <= 64);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
 
   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
-                y_filters, y0_q4, y_step_q4, w, h);
-}
-
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                 w, h);
-}
-
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                     x_step_q4, w, h);
-}
-
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4, int w,
-                          int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
-                w, h);
-}
-
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                    y_step_q4, w, h);
-}
-
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-           filters_y, y0_q4, y_step_q4, w, h);
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
 }
 
 void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                          int w, int h) {
   // Fixed size intermediate buffer places limits on parameters.
   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
   assert(w <= 64);
   assert(h <= 64);
 
-  vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+  vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                   y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
 }
 
 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int filter_x_stride, const int16_t *filter_y,
-                         int filter_y_stride, int w, int h) {
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
   int r;
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   for (r = h; r > 0; --r) {
     memcpy(dst, src, w);
@@ -251,15 +223,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 }
 
 void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int filter_x_stride, const int16_t *filter_y,
-                        int filter_y_stride, int w, int h) {
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
   int x, y;
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
 
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
@@ -269,63 +242,60 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 }
 
 void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
-  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                       filter_y, y_step_q4, w, h);
+  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                       x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                  y0_q4, y_step_q4, w, h);
 }
 
 void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
-  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                          int w, int h) {
-  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                      filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                      x_step_q4, y0_q4, y_step_q4, w, h);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride,
+                                  uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h, int bd) {
   int x, y;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= SUBPEL_TAPS / 2 - 1;
 
   for (y = 0; y < h; ++y) {
@@ -343,13 +313,11 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                      uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h, int bd) {
   int x, y;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= SUBPEL_TAPS / 2 - 1;
 
   for (y = 0; y < h; ++y) {
@@ -369,13 +337,11 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                 uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride,
+                                 uint16_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h, int bd) {
   int x, y;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
 
   for (x = 0; x < w; ++x) {
@@ -395,13 +361,11 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                     uint8_t *dst8, ptrdiff_t dst_stride,
+static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h, int bd) {
   int x, y;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
 
   for (x = 0; x < w; ++x) {
@@ -423,11 +387,11 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const InterpKernel *const x_filters, int x0_q4,
-                            int x_step_q4, const InterpKernel *const y_filters,
-                            int y0_q4, int y_step_q4, int w, int h, int bd) {
+static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
+                            uint16_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h, int bd) {
   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
   // 2d filtering proceeds in 2 steps:
   //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -450,116 +414,97 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
   assert(x_step_q4 <= 32);
 
   highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                        CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4,
-                        x_step_q4, w, intermediate_height, bd);
-  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
-                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h,
-                       bd);
+                        temp, 64, filter, x0_q4, x_step_q4, w,
+                        intermediate_height, bd);
+  highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                       filter, y0_q4, y_step_q4, w, h, bd);
 }
 
-void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+                                  uint16_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h, int bd) {
+  (void)y0_q4;
   (void)y_step_q4;
 
-  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                         x_step_q4, w, h, bd);
 }
 
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
+                                      uint16_t *dst, ptrdiff_t dst_stride,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
                                       int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+  (void)y0_q4;
   (void)y_step_q4;
 
-  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                             x_step_q4, w, h, bd);
 }
 
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+                                 uint16_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                  int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
   (void)x_step_q4;
 
-  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                        y_step_q4, w, h, bd);
 }
 
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
   (void)x_step_q4;
 
-  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                            y_step_q4, w, h, bd);
 }
 
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
+                            uint16_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                  filters_y, y0_q4, y_step_q4, w, h, bd);
+  highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                  y0_q4, y_step_q4, w, h, bd);
 }
 
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h, int bd) {
   // Fixed size intermediate buffer places limits on parameters.
   DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
   assert(w <= 64);
   assert(h <= 64);
 
-  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
-                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL,
-                            0, NULL, 0, w, h, bd);
+  vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4,
+                         y0_q4, y_step_q4, w, h, bd);
+  vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h,
+                            bd);
 }
 
-void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
-                                uint8_t *dst8, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int filter_x_stride,
-                                const int16_t *filter_y, int filter_y_stride,
-                                int w, int h, int bd) {
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+                                uint16_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h, int bd) {
   int r;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
   (void)bd;
 
   for (r = h; r > 0; --r) {
@@ -569,19 +514,18 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
   }
 }
 
-void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
-                               uint8_t *dst8, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int filter_x_stride,
-                               const int16_t *filter_y, int filter_y_stride,
-                               int w, int h, int bd) {
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
+                               uint16_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h, int bd) {
   int x, y;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
 
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
   (void)bd;
 
   for (y = 0; y < h; ++y) {
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h
index ee9744b3ae..d5793e17ad 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPX_DSP_VPX_CONVOLVE_H_
-#define VPX_DSP_VPX_CONVOLVE_H_
+#ifndef VPX_VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_VPX_DSP_VPX_CONVOLVE_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -19,15 +19,15 @@ extern "C" {
 
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
+typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride,
+                                     uint16_t *dst, ptrdiff_t dst_stride,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h, int bd);
 #endif
 
@@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_VPX_CONVOLVE_H_
+#endif  // VPX_VPX_DSP_VPX_CONVOLVE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
index 18c225658e..357ad08508 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -13,6 +13,13 @@ DSP_SRCS-yes += vpx_dsp_common.h
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+DSP_SRCS-$(HAVE_AVX2)   += x86/bitdepth_conversion_avx2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2.h
+# This file is included in libs.mk. Including it here would cause it to be
+# compiled into an object. Even as an empty file, this would create an
+# executable section on the stack.
+#DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2$(ASM)
+
 # bit reader
 DSP_SRCS-yes += prob.h
 DSP_SRCS-yes += prob.c
@@ -24,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c
 DSP_SRCS-yes += bitwriter_buffer.h
 DSP_SRCS-yes += psnr.c
 DSP_SRCS-yes += psnr.h
+DSP_SRCS-yes += sse.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c
 endif
 
 ifeq ($(CONFIG_DECODERS),yes)
@@ -40,14 +52,14 @@ endif
 # intra predictions
 DSP_SRCS-yes += intrapred.c
 
-DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
@@ -60,11 +72,14 @@ DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
 DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c
 DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c
+DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c
 endif # CONFIG_POSTPROC
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
@@ -72,14 +87,19 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
 
+DSP_SRCS-yes += vpx_filter.h
+ifeq ($(CONFIG_VP9),yes)
 # interpolation filters
 DSP_SRCS-yes += vpx_convolve.c
 DSP_SRCS-yes += vpx_convolve.h
-DSP_SRCS-yes += vpx_filter.h
 
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += x86/convolve.h
+
+DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h
+DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h
+DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_4t_intrin_sse2.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_bilinear_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
@@ -88,19 +108,30 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_8t_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_AVX2)  += x86/highbd_convolve_avx2.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve_copy_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve_avg_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve8_neon.c
-DSP_SRCS-$(HAVE_NEON)  += arm/highbd_vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_SVE)   += arm/highbd_vpx_convolve8_sve.c
+DSP_SRCS-$(HAVE_SVE2)  += arm/highbd_vpx_convolve8_sve2.c
 endif
 
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
+DSP_SRCS-$(HAVE_NEON)  += arm/vpx_scaled_convolve8_neon.c
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM)
 DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h
 DSP_SRCS-yes += arm/vpx_convolve_neon.c
 else
 ifeq ($(HAVE_NEON),yes)
@@ -108,6 +139,8 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
 DSP_SRCS-yes += arm/vpx_convolve8_neon.c
 DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
 DSP_SRCS-yes += arm/vpx_convolve_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c
+DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 
@@ -121,6 +154,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c
 
 # common (dspr2)
 DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
@@ -135,11 +169,24 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
 
+DSP_SRCS-$(HAVE_VSX)  += ppc/vpx_convolve_vsx.c
+
+# common (lsx)
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h
+
 # loop filters
 DSP_SRCS-yes += loopfilter.c
 
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_intrin_sse2.c
-DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)  += x86/loopfilter_avx2.c
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
@@ -161,14 +208,21 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_16_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_8_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_4_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_loopfilter_neon.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
+endif # CONFIG_VP9
 
 DSP_SRCS-yes            += txfm_common.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/txfm_macros_lsx.h
 # forward transform
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += fwd_txfm.c
@@ -177,15 +231,27 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
-ifeq ($(ARCH_X86_64),yes)
+ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 endif
-DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct4x4_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct8x8_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct16x16_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct32x32_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/fdct_partial_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_txfm_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_txfm_lsx.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_dct32x32_lsx.c
+endif  # !CONFIG_VP9_HIGHBITDEPTH
+
+DSP_SRCS-$(HAVE_VSX)    += ppc/fdct32x32_vsx.c
 endif  # CONFIG_VP9_ENCODER
 
 # inverse transform
@@ -194,13 +260,15 @@ DSP_SRCS-yes            += inv_txfm.h
 DSP_SRCS-yes            += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/inv_txfm_avx2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
-endif  # ARCH_X86_64
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.h
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.c
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
 
+DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c
+
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
@@ -214,29 +282,42 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+
+DSP_SRCS-$(HAVE_LSX)   += loongarch/idct32x32_lsx.c
 else  # CONFIG_VP9_HIGHBITDEPTH
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_34_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_135_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_1024_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct_neon.h
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct4x4_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct8x8_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct16x16_add_sse2.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
 DSP_SRCS-yes += arm/idct_neon$(ASM)
 DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
 else
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
+endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
 DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
-endif  # HAVE_NEON_ASM
-DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
-DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
@@ -249,56 +330,89 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h
 
-DSP_SRCS-$(HAVE_SSE2)   += x86/fdct.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.h
+DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3.h
+DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/quantize_avx2.c
+DSP_SRCS-$(HAVE_NEON)   += arm/quantize_neon.c
+DSP_SRCS-$(HAVE_VSX)    += ppc/quantize_vsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/quantize_lsx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
-endif
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
-DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/highbd_quantize_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_quantize_neon.c
 endif
 
 # avg
 DSP_SRCS-yes           += avg.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2)  += x86/avg_intrin_avx2.c
 DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
-DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
 DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
-ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_hadamard_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_avg_neon.c
+endif
+DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX)   += loongarch/avg_lsx.c
+ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
+DSP_SRCS-$(HAVE_VSX)   += ppc/hadamard_vsx.c
 
 endif  # CONFIG_VP9_ENCODER
 
+# skin detection
+DSP_SRCS-yes            += skin_detection.h
+DSP_SRCS-yes            += skin_detection.c
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
 DSP_SRCS-yes            += subtract.c
 DSP_SRCS-yes            += sum_squares.c
+DSP_SRCS-$(HAVE_NEON)   += arm/sum_squares_neon.c
+DSP_SRCS-$(HAVE_SVE)    += arm/sum_squares_sve.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
+DSP_SRCS-$(HAVE_MSA)    += mips/sum_squares_msa.c
 
 DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c
 DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
 
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
-DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
-DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_LSX)    += loongarch/sad_lsx.c
+
+DSP_SRCS-$(HAVE_MMI)    += mips/sad_mmi.c
+DSP_SRCS-$(HAVE_MMI)    += mips/subtract_mmi.c
+
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/subtract_avx2.c
+DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
+DSP_SRCS-$(HAVE_AVX512) += x86/sad_avx512.c
 
-DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
-DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
 
+DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
+DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
+
+DSP_SRCS-$(HAVE_LSX)    += loongarch/subtract_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
 endif  # CONFIG_ENCODERS
@@ -307,33 +421,64 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)
 DSP_SRCS-yes            += variance.c
 DSP_SRCS-yes            += variance.h
 
+DSP_SRCS-$(HAVE_NEON)   += arm/avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD)   += arm/variance_neon_dotprod.c
 
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
-DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/sub_pixel_variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/avg_pred_lsx.c
+
+DSP_SRCS-$(HAVE_MMI)    += mips/variance_mmi.c
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/avg_pred_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/avg_pred_avx2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
-DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
+DSP_SRCS-$(HAVE_VSX)    += ppc/variance_vsx.c
 
-ifeq ($(ARCH_X86_64),yes)
+ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
-endif  # ARCH_X86_64
+endif  # VPX_ARCH_X86_64
 
-DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_avg_pred_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_sse_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON_DOTPROD)   += arm/highbd_variance_neon_dotprod.c
+DSP_SRCS-$(HAVE_SVE)    += arm/highbd_variance_sve.c
+DSP_SRCS-$(HAVE_NEON)   += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
 # Neon utilities
+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h
 DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
+DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h
+
+# PPC VSX utilities
+DSP_SRCS-$(HAVE_VSX)  += ppc/types_vsx.h
+DSP_SRCS-$(HAVE_VSX)  += ppc/txfm_common_vsx.h
+DSP_SRCS-$(HAVE_VSX)  += ppc/transpose_vsx.h
+DSP_SRCS-$(HAVE_VSX)  += ppc/bitdepth_conversion_vsx.h
+
+# X86 utilities
+DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
+
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX)  += loongarch/bitdepth_conversion_lsx.h
 
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
 
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h
index 49d36e5458..9d2668277a 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h
@@ -8,8 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_VPX_DSP_COMMON_H_
-#define VPX_DSP_VPX_DSP_COMMON_H_
+#ifndef VPX_VPX_DSP_VPX_DSP_COMMON_H_
+#define VPX_VPX_DSP_VPX_DSP_COMMON_H_
+
+#include <limits.h>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
@@ -25,8 +27,8 @@ extern "C" {
 #define VPX_SWAP(type, a, b) \
   do {                       \
     type c = (b);            \
-    b = a;                   \
-    a = c;                   \
+    (b) = a;                 \
+    (a) = c;                 \
   } while (0)
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -43,9 +45,22 @@ typedef int32_t tran_high_t;
 typedef int16_t tran_low_t;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+typedef int16_t tran_coef_t;
+
+// Visual Studio 2022 (cl.exe) < 17.7 targeting AArch64 with optimizations
+// enabled produces invalid code for clip_pixel() when the return type is
+// uint8_t. See:
+// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361
+#if defined(_MSC_VER) && _MSC_VER < 1937 && defined(_M_ARM64) && \
+    !defined(__clang__)
+static INLINE int clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+#else
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255 : (val < 0) ? 0 : val;
 }
+#endif
 
 static INLINE int clamp(int value, int low, int high) {
   return value < low ? low : (value > high ? high : value);
@@ -55,7 +70,10 @@ static INLINE double fclamp(double value, double low, double high) {
   return value < low ? low : (value > high ? high : value);
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
 static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
   switch (bd) {
     case 8:
@@ -64,10 +82,15 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
     case 12: return (uint16_t)clamp(val, 0, 4095);
   }
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Returns the saturating cast of a double value to int.
+static INLINE int saturate_cast_double_to_int(double d) {
+  if (d > INT_MAX) return INT_MAX;
+  return (int)d;
+}
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_VPX_DSP_COMMON_H_
+#endif  // VPX_VPX_DSP_VPX_DSP_COMMON_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
index 030c456d39..2b8c656afb 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -12,4 +12,4 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vpx_dsp_rtcd() { once(setup_rtcd_internal); }
+void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ed7dd4da55..db508e9a9d 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1,3 +1,13 @@
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
 sub vpx_dsp_forward_decls() {
 print <<EOF
 /*
@@ -6,6 +16,11 @@ print <<EOF
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_VP9_ENCODER
+ struct macroblock_plane;
+ struct ScanOrder;
+#endif
 
 EOF
 }
@@ -19,430 +34,415 @@ if ($opts{arch} eq "x86_64") {
   $ssse3_x86_64 = 'ssse3';
   $avx_x86_64 = 'avx';
   $avx2_x86_64 = 'avx2';
+  $avx512_x86_64 = 'avx512';
 }
 
 #
 # Intra prediction
 #
 
-add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4 sse2/;
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_4x4 neon sse2/;
 
-add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_4x4 neon sse2/;
 
-add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_4x4 ssse3/;
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_4x4 neon ssse3/;
 
-add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d63f_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
 specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/;
 
-add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_4x4 neon/;
 
-add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_4x4 neon/;
 
-add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_4x4 ssse3/;
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_4x4 neon ssse3/;
 
-add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
 
-add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
 specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/;
 
-add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
 
-add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_8x8 ssse3/;
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_8x8 neon ssse3/;
 
-add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
 specialize qw/vpx_d45_predictor_8x8 neon sse2/;
 
-add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_d63_predictor_8x8 neon ssse3/;
 
-add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_8x8 ssse3/;
-
-add_proto qw/void vpx_d63e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
 specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/;
 
-add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_8x8 neon/;
 
-add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_8x8 neon/;
 
-add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_8x8 ssse3/;
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_8x8 neon ssse3/;
 
-add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
 specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/;
 
-add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/;
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+# TODO(crbug.com/webm/1522): Re-enable vsx implementation.
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 lsx/;
 
-add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_16x16 ssse3/;
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_16x16 neon ssse3/;
 
-add_proto qw/void vpx_d207e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_16x16 neon ssse3/;
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d45e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_16x16 ssse3/;
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_16x16 neon/;
 
-add_proto qw/void vpx_d63e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/;
-
-add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_16x16 neon/;
 
-add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_16x16 ssse3/;
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_16x16 neon ssse3/;
 
-add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_16x16 neon msa sse2/;
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_16x16 neon msa sse2/;
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2/;
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx lsx/;
 
-add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2/;
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2/;
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2/;
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_32x32 ssse3/;
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d207_predictor_32x32 neon ssse3/;
 
-add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_32x32 neon ssse3/;
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d63_predictor_32x32 ssse3/;
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d117_predictor_32x32 neon/;
 
-add_proto qw/void vpx_d63e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa sse2/;
-
-add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-
-add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_32x32 neon/;
 
-add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d153_predictor_32x32 ssse3/;
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_d153_predictor_32x32 neon ssse3/;
 
-add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_32x32 neon msa sse2/;
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa sse2/;
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_32x32 msa neon sse2/;
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2/;
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2/;
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2/;
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
 
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_h_predictor_4x4 neon/;
+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
-
-  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/;
+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_h_predictor_8x8 neon/;
+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
-
-  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/;
+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_h_predictor_16x16 neon/;
+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
-
-  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/;
+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_h_predictor_32x32 neon/;
+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
-
-  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-
-  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/;
+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/;
+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;
+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 
+if (vpx_config("CONFIG_VP9") eq "yes") {
 #
 # Sub Pixel Filters
 #
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa sse2/;
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx lsx/;
 
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa sse2/;
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
-add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/;
 
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/vpx_scaled_2d ssse3 neon msa/;
 
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+} #CONFIG_VP9
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Sub Pixel Filters
   #
-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_copy sse2 neon/;
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
 
-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve_avg sse2 neon/;
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
 
-  add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64";
+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve8 avx2 neon sve2/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve8_horiz neon/, "$sse2_x86_64";
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve8_horiz avx2 neon sve/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve8_vert neon/, "$sse2_x86_64";
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve8_vert avx2 neon sve2/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve8_avg neon/, "$sse2_x86_64";
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve8_avg avx2 neon sve2/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve8_avg_horiz neon/, "$sse2_x86_64";
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon sve/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-  specialize qw/vpx_highbd_convolve8_avg_vert neon/, "$sse2_x86_64";
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
+  specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon sve2/, "$sse2_x86_64";
 }  # CONFIG_VP9_HIGHBITDEPTH
 
+if (vpx_config("CONFIG_VP9") eq "yes") {
 #
 # Loopfilter
 #
@@ -450,37 +450,38 @@ add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *b
 specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa lsx/;
+} #CONFIG_VP9
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
@@ -530,277 +531,190 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 sse2/;
+  specialize qw/vpx_fdct4x4 neon sse2/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4_1 sse2/;
+  specialize qw/vpx_fdct4x4_1 sse2 neon/;
+  specialize qw/vpx_highbd_fdct4x4_1 neon/;
+  $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8 sse2/;
+  specialize qw/vpx_fdct8x8 neon sse2/;
 
   add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8_1 sse2/;
+  specialize qw/vpx_fdct8x8_1 neon sse2 msa/;
 
   add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct16x16 sse2/;
+  specialize qw/vpx_fdct16x16 neon sse2/;
 
   add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct16x16_1 sse2/;
+  specialize qw/vpx_fdct16x16_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32 sse2/;
+  specialize qw/vpx_fdct32x32 neon sse2/;
 
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_rd sse2/;
+  specialize qw/vpx_fdct32x32_rd neon sse2/;
 
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_1 sse2/;
+  specialize qw/vpx_fdct32x32_1 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct4x4 sse2/;
+  specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct8x8 sse2/;
+  specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct8x8_1 neon/;
+  $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
 
   add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct16x16 sse2/;
+  specialize qw/vpx_highbd_fdct16x16 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct16x16_1 neon/;
 
   add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct32x32 sse2/;
+  specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+  specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vpx_highbd_fdct32x32_1 neon/;
 } else {
   add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 sse2 msa/;
+  specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4_1 sse2/;
+  specialize qw/vpx_fdct4x4_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+  specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
 
   add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct16x16 sse2 msa/;
+  specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/;
 
   add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct16x16_1 sse2 msa/;
+  specialize qw/vpx_fdct16x16_1 sse2 neon msa/;
 
   add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32 sse2 avx2 msa/;
+  specialize qw/vpx_fdct32x32 neon sse2 avx2 msa lsx/;
 
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
+  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx lsx/;
 
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_1 sse2 msa/;
+  specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
 
 #
 # Inverse transform
 if (vpx_config("CONFIG_VP9") eq "yes") {
+
+add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+
+if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+  # Note that there are more specializations appended when
+  # CONFIG_VP9_HIGHBITDEPTH is off.
+  specialize qw/vpx_idct4x4_16_add neon sse2 vsx/;
+  specialize qw/vpx_idct4x4_1_add neon sse2/;
+  specialize qw/vpx_idct8x8_64_add neon sse2 vsx/;
+  specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
+  specialize qw/vpx_idct8x8_1_add neon sse2/;
+  specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/;
+  specialize qw/vpx_idct16x16_38_add neon sse2/;
+  specialize qw/vpx_idct16x16_10_add neon sse2/;
+  specialize qw/vpx_idct16x16_1_add neon sse2/;
+  specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/;
+  specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/;
+  specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
+  specialize qw/vpx_idct32x32_1_add neon sse2/;
+  specialize qw/vpx_iwht4x4_16_add sse2 vsx/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
+    # Note that these specializations are appended to the above ones.
+    specialize qw/vpx_idct4x4_16_add dspr2 msa/;
+    specialize qw/vpx_idct4x4_1_add dspr2 msa/;
+    specialize qw/vpx_idct8x8_64_add dspr2 msa/;
+    specialize qw/vpx_idct8x8_12_add dspr2 msa/;
+    specialize qw/vpx_idct8x8_1_add dspr2 msa/;
+    specialize qw/vpx_idct16x16_256_add dspr2 msa/;
+    specialize qw/vpx_idct16x16_38_add dspr2 msa/;
+    $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2;
+    $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
+    specialize qw/vpx_idct16x16_10_add dspr2 msa/;
+    specialize qw/vpx_idct16x16_1_add dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add dspr2 msa lsx/;
+    specialize qw/vpx_idct32x32_135_add dspr2 msa/;
+    $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
+    $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
+    $vpx_idct32x32_135_add_lsx=vpx_idct32x32_1024_add_lsx;
+    specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/;
+    specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/;
+    specialize qw/vpx_iwht4x4_16_add msa/;
+    specialize qw/vpx_iwht4x4_1_add msa/;
+  } # !CONFIG_VP9_HIGHBITDEPTH
+}  # !CONFIG_EMULATE_HARDWARE
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
 
-  add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-  specialize qw/vpx_iwht4x4_16_add sse2/;
-
-  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct4x4_1_add neon/;
-
-  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct8x8_1_add neon/;
-
-  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct32x32_1_add sse2/;
-
-  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-    add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-
-    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-  } else {
-    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct4x4_16_add neon sse2/;
-
-    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct4x4_1_add neon sse2/;
-
-    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct8x8_1_add neon sse2/;
-
-    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct16x16_256_add neon sse2/;
-
-    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct16x16_10_add neon sse2/;
-
-    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct16x16_1_add neon sse2/;
-
-    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64";
-    # Need to add 135 eob idct32x32 implementations.
-    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
-
-    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_1_add neon sse2/;
-
-    add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-    specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
-
-    add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-    specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
-
-    add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-    specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
-
-    add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-    specialize qw/vpx_highbd_idct16x16_256_add sse2/;
-
-    add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-    specialize qw/vpx_highbd_idct16x16_10_add sse2/;
-  }  # CONFIG_EMULATE_HARDWARE
-} else {
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-
-    add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-  } else {
-    add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
-    $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
-    $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
-
-    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-
-    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
-
-    add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_iwht4x4_1_add msa/;
-
-    add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-    specialize qw/vpx_iwht4x4_16_add msa sse2/;
-  }  # CONFIG_EMULATE_HARDWARE
+  add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  specialize qw/vpx_highbd_idct4x4_1_add neon sse2/;
+
+  add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  specialize qw/vpx_highbd_idct8x8_1_add neon sse2/;
+
+  add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  specialize qw/vpx_highbd_idct16x16_1_add neon sse2/;
+
+  add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
+
+  add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+  add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
+
+  if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+    specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct32x32_1024_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct32x32_135_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct32x32_34_add neon sse2 sse4_1/;
+  }  # !CONFIG_EMULATE_HARDWARE
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9
 
@@ -808,19 +722,23 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 # Quantization
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx/;
 
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx/;
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b sse2/;
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+    specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;
 
-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-  }  # CONFIG_VP9_HIGHBITDEPTH
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order";
+    specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
+  } else {
+    specialize qw/vpx_quantize_b lsx/;
+
+    specialize qw/vpx_quantize_b_32x32 lsx/;
+  } # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
@@ -828,49 +746,91 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa sse2/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
+
+add_proto qw/int64_t/, "vpx_sse", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height";
+specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/;
 
 #
 # Single block SAD
 #
 add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa sse2/;
+specialize qw/vpx_sad64x64 neon neon_dotprod avx512 avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa sse2/;
+specialize qw/vpx_sad64x32 neon neon_dotprod avx512 avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x64 avx2 msa sse2/;
+specialize qw/vpx_sad32x64 neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 avx2 neon msa sse2/;
+specialize qw/vpx_sad32x32 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x16 avx2 msa sse2/;
+specialize qw/vpx_sad32x16 neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x32 msa sse2/;
+specialize qw/vpx_sad16x32 neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 neon msa sse2/;
+specialize qw/vpx_sad16x16 neon neon_dotprod msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 neon msa sse2/;
+specialize qw/vpx_sad16x8 neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 neon msa sse2/;
+specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2/;
+specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 msa sse2/;
+specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa sse2/;
+specialize qw/vpx_sad4x8 neon msa sse2 mmi/;
 
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 neon msa sse2/;
+specialize qw/vpx_sad4x4 neon msa sse2 mmi/;
+
+add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x64 neon neon_dotprod avx512 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_64x32 neon neon_dotprod avx512 avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x64 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x32 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_32x16 neon neon_dotprod avx2 sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x32 neon neon_dotprod sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x16 neon neon_dotprod sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_16x8 neon neon_dotprod sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x16 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_8x4 neon/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x8 neon sse2/;
+
+add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad_skip_4x4 neon/;
 
 #
 # Avg
@@ -883,163 +843,177 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   specialize qw/vpx_avg_4x4 sse2 neon msa/;
 
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vpx_minmax_8x8 sse2 neon/;
+  specialize qw/vpx_minmax_8x8 sse2 neon msa/;
 
-  add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
 
-  add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_16x16 sse2 neon/;
+    add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
 
-  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
-  specialize qw/vpx_satd sse2 neon/;
+    add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
 
-  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
-  specialize qw/vpx_int_pro_row sse2 neon/;
+    add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/;
 
+    add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/;
+
+    add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/;
+
+    add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
+    specialize qw/vpx_satd avx2 sse2 neon/;
+
+    add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length";
+    specialize qw/vpx_highbd_satd avx2 neon/;
+  } else {
+    add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+    specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
+
+    add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
+
+    add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+    specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
+
+    add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+    specialize qw/vpx_satd avx2 sse2 neon msa/;
+  }
+
+  add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
+  specialize qw/vpx_int_pro_row neon sse2 msa/;
   add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
-  specialize qw/vpx_int_pro_col sse2 neon/;
+  specialize qw/vpx_int_pro_col neon sse2 msa/;
 
   add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
-  specialize qw/vpx_vector_var neon sse2/;
+  specialize qw/vpx_vector_var neon sse2 msa/;
 }  # CONFIG_VP9_ENCODER
 
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg avx2 msa sse2/;
+specialize qw/vpx_sad64x64_avg neon neon_dotprod avx512 avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x32_avg avx2 msa sse2/;
+specialize qw/vpx_sad64x32_avg neon neon_dotprod avx512 avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x64_avg avx2 msa sse2/;
+specialize qw/vpx_sad32x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg avx2 msa sse2/;
+specialize qw/vpx_sad32x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x16_avg avx2 msa sse2/;
+specialize qw/vpx_sad32x16_avg neon neon_dotprod avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg msa sse2/;
+specialize qw/vpx_sad16x32_avg neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg msa sse2/;
+specialize qw/vpx_sad16x16_avg neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg msa sse2/;
+specialize qw/vpx_sad16x8_avg neon neon_dotprod msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg msa sse2/;
+specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/;
 
 add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg msa sse2/;
+specialize qw/vpx_sad8x8_avg neon msa sse2 mmi/;
 
 add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg msa sse2/;
+specialize qw/vpx_sad8x4_avg neon msa sse2 mmi/;
 
 add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa sse2/;
+specialize qw/vpx_sad4x8_avg neon msa sse2 mmi/;
 
 add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa sse2/;
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-# Blocks of 3
-add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x3 msa/;
-
-add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x3 msa/;
-
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
-
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
-
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x3 sse3 msa/;
-
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa/;
-
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa/;
-
-# Blocks of 8
-add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x8 msa/;
-
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 msa/;
-
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x8 msa/;
-
-add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x8 msa/;
-
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
 
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx2 neon msa sse2/;
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x64x4d avx512 avx2 neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d msa sse2/;
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x32x4d neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d msa sse2/;
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x64x4d neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa sse2/;
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x32x4d avx2 neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d msa sse2/;
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x16x4d neon neon_dotprod msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d msa sse2/;
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x32x4d neon neon_dotprod msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa sse2/;
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x16x4d neon neon_dotprod msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d msa sse2/;
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x8x4d neon neon_dotprod msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d msa sse2/;
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d msa sse2/;
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/;
 
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d msa sse2/;
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa sse2/;
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa sse2/;
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
+
+add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x64x4d neon neon_dotprod avx512 avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_64x32x4d neon neon_dotprod avx512 avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x64x4d neon neon_dotprod avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x32x4d neon neon_dotprod avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_32x16x4d neon neon_dotprod avx2 sse2/;
+
+add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x32x4d neon neon_dotprod sse2/;
+
+add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x16x4d neon neon_dotprod sse2/;
+
+add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_16x8x4d neon neon_dotprod sse2/;
+
+add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x16x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x8x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_8x4x4d neon/;
+
+add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x8x4d neon sse2/;
+
+add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad_skip_4x4x4d neon/;
 
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 sse2/;
+specialize qw/vpx_sum_squares_2d_i16 neon sve sse2 msa/;
 
 #
 # Structured Similarity (SSIM)
@@ -1047,179 +1021,230 @@ specialize qw/vpx_sum_squares_2d_i16 sse2/;
 if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
     add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
     specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
-
-    add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
 }
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Block subtraction
   #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block neon avx2/;
+
+  add_proto qw/int64_t/, "vpx_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+  specialize qw/vpx_highbd_sse sse4_1 avx2 neon/;
 
   #
   # Single block SAD
   #
   add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64 sse2/;
+  specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32 sse2/;
+  specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x64 sse2/;
+  specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x32 sse2/;
+  specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x16 sse2/;
+  specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x32 sse2/;
+  specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x16 sse2/;
+  specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x8 sse2/;
+  specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x16 sse2/;
+  specialize qw/vpx_highbd_sad8x16 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x8 sse2/;
+  specialize qw/vpx_highbd_sad8x8 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x4 sse2/;
+  specialize qw/vpx_highbd_sad8x4 sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x8 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad4x4 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_8x8 neon sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_8x4 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_4x8 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vpx_highbd_sad_skip_4x4 neon/;
 
   #
   # Avg
   #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
+  specialize qw/vpx_highbd_avg_8x8 sse2 neon/;
+
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
+  specialize qw/vpx_highbd_avg_4x4 sse2 neon/;
+
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
+  specialize qw/vpx_highbd_minmax_8x8 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x64_avg sse2/;
+  specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x32_avg sse2/;
+  specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x64_avg sse2/;
+  specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x32_avg sse2/;
+  specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x16_avg sse2/;
+  specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x32_avg sse2/;
+  specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x16_avg sse2/;
+  specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x8_avg sse2/;
+  specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x16_avg sse2/;
+  specialize qw/vpx_highbd_sad8x16_avg sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x8_avg sse2/;
+  specialize qw/vpx_highbd_sad8x8_avg sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x4_avg sse2/;
+  specialize qw/vpx_highbd_sad8x4_avg sse2 neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_sad4x8_avg neon/;
 
   add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-
-  #
-  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-  #
-  # Blocks of 3
-  add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  # Blocks of 8
-  add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-
-  add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vpx_highbd_sad4x4_avg neon/;
 
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x4d sse2/;
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x32x4d sse2/;
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x64x4d sse2/;
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x4d sse2/;
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x16x4d sse2/;
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x32x4d sse2/;
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x4d sse2/;
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x4d sse2/;
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;
 
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x4d sse2/;
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
 
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x4d sse2/;
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad8x8x4d sse2 neon/;
 
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x4d sse2/;
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad8x4x4d sse2 neon/;
 
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x4d sse2/;
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad4x8x4d sse2 neon/;
 
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x4d sse2/;
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
+
+  add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_8x4x4d neon/;
+
+  add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/;
+
+  add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+  specialize qw/vpx_highbd_sad_skip_4x4x4d neon/;
 
   #
   # Structured Similarity (SSIM)
@@ -1235,511 +1260,549 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "
 #
 # Variance
 #
-add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x64 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
-add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x32 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x64 sse2 neon msa/;
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x64 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x32 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
-add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x16 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 msa/;
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x32 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
-add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 sse2 neon msa/;
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 sse2 neon msa/;
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x16 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 sse2 neon msa/;
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x8 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
-add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 msa/;
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x4 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x8 sse2 msa/;
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x8 sse2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 sse2 msa/;
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x4 sse2 neon neon_dotprod msa mmi vsx/;
 
 #
 # Specialty Variance
 #
-add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get16x16var sse2 avx2 neon neon_dotprod msa vsx lsx/;
 
-add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var sse2 neon msa/;
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/;
 
-add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 neon msa/;
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/;
 
-add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa/;
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa/;
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x16 sse2 neon neon_dotprod msa mmi vsx/;
 
-add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa/;
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x8 sse2 neon neon_dotprod msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss sse2 msa/;
+  specialize qw/vpx_get_mb_ss sse2 msa vsx/;
 
-add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-  specialize qw/vpx_get4x4sse_cs neon msa/;
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride";
+  specialize qw/vpx_get4x4sse_cs neon neon_dotprod msa vsx/;
 
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+  specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/;
 
 #
 # Subpixel Variance
 #
-add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x32 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x64 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3 lsx/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x16 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3 lsx/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3 lsx/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3 lsx/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/;
 
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x64 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x64 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance64x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x64 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x64 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance32x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance16x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x8 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_variance8x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x8 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x4 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance4x8 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance4x4 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x64 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x64 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance64x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x64 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x64 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance32x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance16x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x8 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_variance8x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x8 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x4 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance4x8 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance4x4 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x64 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x64 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance64x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x64 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x64 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance32x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x32 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x32 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance16x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x8 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x16 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_variance8x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x8 sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x4 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance4x8 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance4x4 neon sve/;
 
-  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_highbd_8_get16x16var sse2 neon sve/;
 
-  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_highbd_8_get8x8var sse2 neon sve/;
 
-  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_highbd_10_get16x16var sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse16x16 sse2/;
+  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_highbd_10_get8x8var sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_8_mse8x8 sse2/;
+  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_highbd_12_get16x16var sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse16x16 sse2/;
+  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_highbd_12_get8x8var sse2 neon sve/;
 
-  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_10_mse8x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x16 sse2 neon neon_dotprod/;
 
-  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse16x16 sse2/;
+  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x8 neon neon_dotprod/;
+  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x16 neon neon_dotprod/;
+  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x8 sse2 neon neon_dotprod/;
 
-  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_highbd_12_mse8x8 sse2/;
+  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x16 sse2 neon sve/;
 
-  add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x8 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x16 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x8 sse2 neon sve/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x16 sse2 neon sve/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x8 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x16 neon sve/;
+  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x8 sse2 neon sve/;
+
+  add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
+  specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
 
   #
   # Subpixel Variance
   #
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/;
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
 
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
 
 }  # CONFIG_VP9_HIGHBITDEPTH
 
@@ -1751,13 +1814,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
     specialize qw/vpx_plane_add_noise sse2 msa/;
 
     add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
-    specialize qw/vpx_mbpost_proc_down sse2 msa/;
+    specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
 
-    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
-    specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
+    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit";
+    specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
 
     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
-    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
+    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/;
 
 }
 
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
index 26d690501b..eb8ff06cd7 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_filter.h
@@ -8,9 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_VPX_FILTER_H_
-#define VPX_DSP_VPX_FILTER_H_
+#ifndef VPX_VPX_DSP_VPX_FILTER_H_
+#define VPX_VPX_DSP_VPX_FILTER_H_
 
+#include <assert.h>
 #include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
@@ -26,19 +27,21 @@ extern "C" {
 
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
-static INLINE const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static INLINE int get_filter_offset(const int16_t *f,
-                                    const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
+static INLINE int vpx_get_filter_taps(const int16_t *const filter) {
+  if (filter[0] | filter[7]) {
+    return 8;
+  }
+  if (filter[1] | filter[6]) {
+    return 6;
+  }
+  if (filter[2] | filter[5]) {
+    return 4;
+  }
+  return 2;
 }
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_VPX_FILTER_H_
+#endif  // VPX_VPX_DSP_VPX_FILTER_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
index f758da22dc..f51718cf99 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm
@@ -11,10 +11,12 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise,
 ;                              int blackclamp, int whiteclamp,
 ;                              int width, int height, int pitch)
-global sym(vpx_plane_add_noise_sse2) PRIVATE
+globalsym(vpx_plane_add_noise_sse2)
 sym(vpx_plane_add_noise_sse2):
     push        rbp
     mov         rbp, rsp
@@ -26,13 +28,13 @@ sym(vpx_plane_add_noise_sse2):
     mov         rdx, 0x01010101
     mov         rax, arg(2)
     mul         rdx
-    movd        xmm3, rax
+    movq        xmm3, rax
     pshufd      xmm3, xmm3, 0  ; xmm3 is 16 copies of char in blackclamp
 
     mov         rdx, 0x01010101
     mov         rax, arg(3)
     mul         rdx
-    movd        xmm4, rax
+    movq        xmm4, rax
     pshufd      xmm4, xmm4, 0  ; xmm4 is 16 copies of char in whiteclamp
 
     movdqu      xmm5, xmm3     ; both clamp = black clamp + white clamp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 0000000000..61e4e73c5b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,519 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi32(a0, a1);
+  __m256i b1 = _mm256_sub_epi32(a0, a1);
+  __m256i b2 = _mm256_add_epi32(a2, a3);
+  __m256i b3 = _mm256_sub_epi32(a2, a3);
+  __m256i b4 = _mm256_add_epi32(a4, a5);
+  __m256i b5 = _mm256_sub_epi32(a4, a5);
+  __m256i b6 = _mm256_add_epi32(a6, a7);
+  __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+  a0 = _mm256_add_epi32(b0, b2);
+  a1 = _mm256_add_epi32(b1, b3);
+  a2 = _mm256_sub_epi32(b0, b2);
+  a3 = _mm256_sub_epi32(b1, b3);
+  a4 = _mm256_add_epi32(b4, b6);
+  a5 = _mm256_add_epi32(b5, b7);
+  a6 = _mm256_sub_epi32(b4, b6);
+  a7 = _mm256_sub_epi32(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi32(a0, a4);
+    b7 = _mm256_add_epi32(a1, a5);
+    b3 = _mm256_add_epi32(a2, a6);
+    b4 = _mm256_add_epi32(a3, a7);
+    b2 = _mm256_sub_epi32(a0, a4);
+    b6 = _mm256_sub_epi32(a1, a5);
+    b1 = _mm256_sub_epi32(a2, a6);
+    b5 = _mm256_sub_epi32(a3, a7);
+
+    a0 = _mm256_unpacklo_epi32(b0, b1);
+    a1 = _mm256_unpacklo_epi32(b2, b3);
+    a2 = _mm256_unpackhi_epi32(b0, b1);
+    a3 = _mm256_unpackhi_epi32(b2, b3);
+    a4 = _mm256_unpacklo_epi32(b4, b5);
+    a5 = _mm256_unpacklo_epi32(b6, b7);
+    a6 = _mm256_unpackhi_epi32(b4, b5);
+    a7 = _mm256_unpackhi_epi32(b6, b7);
+
+    b0 = _mm256_unpacklo_epi64(a0, a1);
+    b1 = _mm256_unpacklo_epi64(a4, a5);
+    b2 = _mm256_unpackhi_epi64(a0, a1);
+    b3 = _mm256_unpackhi_epi64(a4, a5);
+    b4 = _mm256_unpacklo_epi64(a2, a3);
+    b5 = _mm256_unpacklo_epi64(a6, a7);
+    b6 = _mm256_unpackhi_epi64(a2, a3);
+    b7 = _mm256_unpackhi_epi64(a6, a7);
+
+    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+  } else {
+    in[0] = _mm256_add_epi32(a0, a4);
+    in[7] = _mm256_add_epi32(a1, a5);
+    in[3] = _mm256_add_epi32(a2, a6);
+    in[4] = _mm256_add_epi32(a3, a7);
+    in[2] = _mm256_sub_epi32(a0, a4);
+    in[6] = _mm256_sub_epi32(a1, a5);
+    in[1] = _mm256_sub_epi32(a2, a6);
+    in[5] = _mm256_sub_epi32(a3, a7);
+  }
+}
+
+void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                  tran_low_t *coeff) {
+  __m128i src16[8];
+  __m256i src32[8];
+
+  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
+
+  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+  src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+  highbd_hadamard_col8_avx2(src32, 0);
+  highbd_hadamard_col8_avx2(src32, 1);
+
+  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+  coeff += 8;
+  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 1);
+    b1 = _mm256_srai_epi32(b1, 1);
+    b2 = _mm256_srai_epi32(b2, 1);
+    b3 = _mm256_srai_epi32(b3, 1);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+
+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+                                    ptrdiff_t src_stride, tran_low_t *coeff) {
+  int idx;
+  tran_low_t *t_coeff = coeff;
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+  }
+
+  for (idx = 0; idx < 256; idx += 8) {
+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi32(b0, 2);
+    b1 = _mm256_srai_epi32(b1, 2);
+    b2 = _mm256_srai_epi32(b2, 2);
+    b3 = _mm256_srai_epi32(b3, 2);
+
+    coeff0 = _mm256_add_epi32(b0, b2);
+    coeff1 = _mm256_add_epi32(b1, b3);
+    coeff2 = _mm256_sub_epi32(b0, b2);
+    coeff3 = _mm256_sub_epi32(b1, b3);
+
+    _mm256_storeu_si256((__m256i *)coeff, coeff0);
+    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+                                                   __m256i *out_lo,
+                                                   __m256i *out_hi) {
+  const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+  *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi16(a0, a1);
+  __m256i b1 = _mm256_sub_epi16(a0, a1);
+  __m256i b2 = _mm256_add_epi16(a2, a3);
+  __m256i b3 = _mm256_sub_epi16(a2, a3);
+  __m256i b4 = _mm256_add_epi16(a4, a5);
+  __m256i b5 = _mm256_sub_epi16(a4, a5);
+  __m256i b6 = _mm256_add_epi16(a6, a7);
+  __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+  a0 = _mm256_add_epi16(b0, b2);
+  a1 = _mm256_add_epi16(b1, b3);
+  a2 = _mm256_sub_epi16(b0, b2);
+  a3 = _mm256_sub_epi16(b1, b3);
+  a4 = _mm256_add_epi16(b4, b6);
+  a5 = _mm256_add_epi16(b5, b7);
+  a6 = _mm256_sub_epi16(b4, b6);
+  a7 = _mm256_sub_epi16(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi16(a0, a4);
+    b7 = _mm256_add_epi16(a1, a5);
+    b3 = _mm256_add_epi16(a2, a6);
+    b4 = _mm256_add_epi16(a3, a7);
+    b2 = _mm256_sub_epi16(a0, a4);
+    b6 = _mm256_sub_epi16(a1, a5);
+    b1 = _mm256_sub_epi16(a2, a6);
+    b5 = _mm256_sub_epi16(a3, a7);
+
+    a0 = _mm256_unpacklo_epi16(b0, b1);
+    a1 = _mm256_unpacklo_epi16(b2, b3);
+    a2 = _mm256_unpackhi_epi16(b0, b1);
+    a3 = _mm256_unpackhi_epi16(b2, b3);
+    a4 = _mm256_unpacklo_epi16(b4, b5);
+    a5 = _mm256_unpacklo_epi16(b6, b7);
+    a6 = _mm256_unpackhi_epi16(b4, b5);
+    a7 = _mm256_unpackhi_epi16(b6, b7);
+
+    b0 = _mm256_unpacklo_epi32(a0, a1);
+    b1 = _mm256_unpacklo_epi32(a4, a5);
+    b2 = _mm256_unpackhi_epi32(a0, a1);
+    b3 = _mm256_unpackhi_epi32(a4, a5);
+    b4 = _mm256_unpacklo_epi32(a2, a3);
+    b5 = _mm256_unpacklo_epi32(a6, a7);
+    b6 = _mm256_unpackhi_epi32(a2, a3);
+    b7 = _mm256_unpackhi_epi32(a6, a7);
+
+    in[0] = _mm256_unpacklo_epi64(b0, b1);
+    in[1] = _mm256_unpackhi_epi64(b0, b1);
+    in[2] = _mm256_unpacklo_epi64(b2, b3);
+    in[3] = _mm256_unpackhi_epi64(b2, b3);
+    in[4] = _mm256_unpacklo_epi64(b4, b5);
+    in[5] = _mm256_unpackhi_epi64(b4, b5);
+    in[6] = _mm256_unpacklo_epi64(b6, b7);
+    in[7] = _mm256_unpackhi_epi64(b6, b7);
+  } else {
+    in[0] = _mm256_add_epi16(a0, a4);
+    in[7] = _mm256_add_epi16(a1, a5);
+    in[3] = _mm256_add_epi16(a2, a6);
+    in[4] = _mm256_add_epi16(a3, a7);
+    in[2] = _mm256_sub_epi16(a0, a4);
+    in[6] = _mm256_sub_epi16(a1, a5);
+    in[1] = _mm256_sub_epi16(a2, a6);
+    in[5] = _mm256_sub_epi16(a3, a7);
+  }
+}
+
+static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  __m256i src[8];
+  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
+  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
+
+  hadamard_col8x2_avx2(src, 0);
+  hadamard_col8x2_avx2(src, 1);
+
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
+  coeff += 16;
+  _mm256_storeu_si256((__m256i *)coeff,
+                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
+}
+
+static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
+                                       ptrdiff_t src_stride, tran_low_t *coeff,
+                                       int is_final) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+  int16_t *t_coeff = temp_coeff;
+#else
+  int16_t *t_coeff = coeff;
+#endif
+  int16_t *coeff16 = (int16_t *)coeff;
+  int idx;
+  for (idx = 0; idx < 2; ++idx) {
+    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
+  }
+
+  for (idx = 0; idx < 64; idx += 16) {
+    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm256_srai_epi16(b0, 1);
+    b1 = _mm256_srai_epi16(b1, 1);
+    b2 = _mm256_srai_epi16(b2, 1);
+    b3 = _mm256_srai_epi16(b3, 1);
+    if (is_final) {
+      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+      coeff += 16;
+    } else {
+      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+      coeff16 += 16;
+    }
+    t_coeff += 16;
+  }
+}
+
+void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For high bitdepths, it is unnecessary to store_tran_low
+  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+  // next stage.  Output to an intermediate buffer first, then store_tran_low()
+  // in the final stage.
+  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+  int16_t *t_coeff = temp_coeff;
+#else
+  int16_t *t_coeff = coeff;
+#endif
+  int idx;
+  __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+      b3_lo;
+  __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+      b3_hi;
+  __m256i b0, b1, b2, b3;
+  const __m256i zero = _mm256_setzero_si256();
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    hadamard_16x16_avx2(src_ptr, src_stride,
+                        (tran_low_t *)(t_coeff + idx * 256), 0);
+  }
+
+  for (idx = 0; idx < 256; idx += 16) {
+    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+    // Sign extend 16 bit to 32 bit.
+    sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+    sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+    b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+    b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+    b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+    b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+    b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+    b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+    b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+    b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+    b0_lo = _mm256_srai_epi32(b0_lo, 2);
+    b1_lo = _mm256_srai_epi32(b1_lo, 2);
+    b2_lo = _mm256_srai_epi32(b2_lo, 2);
+    b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+    b0_hi = _mm256_srai_epi32(b0_hi, 2);
+    b1_hi = _mm256_srai_epi32(b1_hi, 2);
+    b2_hi = _mm256_srai_epi32(b2_hi, 2);
+    b3_hi = _mm256_srai_epi32(b3_hi, 2);
+
+    b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+    b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+    b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+    b3 = _mm256_packs_epi32(b3_lo, b3_hi);
+
+    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+    coeff += 16;
+    t_coeff += 16;
+  }
+}
+
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i accum = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < length; i += 16) {
+    const __m256i src_line = load_tran_low(coeff);
+    const __m256i abs = _mm256_abs_epi16(src_line);
+    const __m256i sum = _mm256_madd_epi16(abs, one);
+    accum = _mm256_add_epi32(accum, sum);
+    coeff += 16;
+  }
+
+  {  // 32 bit horizontal add
+    const __m256i a = _mm256_srli_si256(accum, 8);
+    const __m256i b = _mm256_add_epi32(accum, a);
+    const __m256i c = _mm256_srli_epi64(b, 32);
+    const __m256i d = _mm256_add_epi32(b, c);
+    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+                                            _mm256_extractf128_si256(d, 1));
+    return _mm_cvtsi128_si32(accum_128);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
+  __m256i accum = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < length; i += 8, coeff += 8) {
+    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i abs = _mm256_abs_epi32(src_line);
+    accum = _mm256_add_epi32(accum, abs);
+  }
+
+  {  // 32 bit horizontal add
+    const __m256i a = _mm256_srli_si256(accum, 8);
+    const __m256i b = _mm256_add_epi32(accum, a);
+    const __m256i c = _mm256_srli_epi64(b, 32);
+    const __m256i d = _mm256_add_epi32(b, c);
+    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+                                            _mm256_extractf128_si256(d, 1));
+    return _mm_cvtsi128_si32(accum_128);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index b0a104bad0..4447dfab7c 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -11,8 +11,18 @@
 #include <emmintrin.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_ports/mem.h"
 
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+                                                   __m128i *out_lo,
+                                                   __m128i *out_hi) {
+  const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+  *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+  *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                          int *min, int *max) {
   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
@@ -136,6 +146,56 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
   return (avg + 8) >> 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
+  __m128i s0, s1;
+  unsigned int avg;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  const __m128i zero = _mm_setzero_si128();
+  s0 = _mm_loadu_si128((const __m128i *)(s));
+  s1 = _mm_loadu_si128((const __m128i *)(s + p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpackhi_epi16(s0, zero);
+  s0 = _mm_unpacklo_epi16(s0, zero);
+  s0 = _mm_add_epi32(s0, s1);
+  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
+  avg = (unsigned int)_mm_cvtsi128_si32(s0);
+
+  return (avg + 32) >> 6;
+}
+
+unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) {
+  __m128i s0, s1;
+  unsigned int avg;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  s0 = _mm_loadl_epi64((const __m128i *)(s));
+  s1 = _mm_loadl_epi64((const __m128i *)(s + p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p));
+  s0 = _mm_adds_epu16(s0, s1);
+  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2));
+  avg = _mm_extract_epi16(s0, 0);
+
+  return (avg + 8) >> 4;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 static void hadamard_col8_sse2(__m128i *in, int iter) {
   __m128i a0 = in[0];
   __m128i a1 = in[1];
@@ -212,8 +272,9 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
   }
 }
 
-void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
+static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
+                                     ptrdiff_t src_stride, tran_low_t *coeff,
+                                     int is_final) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -222,42 +283,79 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
   src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
   src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
   src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
 
   hadamard_col8_sse2(src, 0);
   hadamard_col8_sse2(src, 1);
 
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
+  if (is_final) {
+    store_tran_low(src[0], coeff);
+    coeff += 8;
+    store_tran_low(src[1], coeff);
+    coeff += 8;
+    store_tran_low(src[2], coeff);
+    coeff += 8;
+    store_tran_low(src[3], coeff);
+    coeff += 8;
+    store_tran_low(src[4], coeff);
+    coeff += 8;
+    store_tran_low(src[5], coeff);
+    coeff += 8;
+    store_tran_low(src[6], coeff);
+    coeff += 8;
+    store_tran_low(src[7], coeff);
+  } else {
+    int16_t *coeff16 = (int16_t *)coeff;
+    _mm_store_si128((__m128i *)coeff16, src[0]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[1]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[2]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[3]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[4]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[5]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[6]);
+    coeff16 += 8;
+    _mm_store_si128((__m128i *)coeff16, src[7]);
+  }
 }
 
-void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
+}
+
+static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
+                                       ptrdiff_t src_stride, tran_low_t *coeff,
+                                       int is_final) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For high bitdepths, it is unnecessary to store_tran_low
+  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+  // next stage.  Output to an intermediate buffer first, then store_tran_low()
+  // in the final stage.
+  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+  int16_t *t_coeff = temp_coeff;
+#else
+  int16_t *t_coeff = coeff;
+#endif
+  int16_t *coeff16 = (int16_t *)coeff;
   int idx;
   for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr =
+    const int16_t *src_ptr =
         src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+                      0);
   }
 
   for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
 
     __m128i b0 = _mm_add_epi16(coeff0, coeff1);
     __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
@@ -271,25 +369,119 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
 
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
     coeff2 = _mm_sub_epi16(b0, b2);
     coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
 
-    coeff += 8;
+    if (is_final) {
+      store_tran_low(coeff0, coeff);
+      store_tran_low(coeff1, coeff + 64);
+      store_tran_low(coeff2, coeff + 128);
+      store_tran_low(coeff3, coeff + 192);
+      coeff += 8;
+    } else {
+      _mm_store_si128((__m128i *)coeff16, coeff0);
+      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
+      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
+      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
+      coeff16 += 8;
+    }
+
+    t_coeff += 8;
   }
 }
 
-int vpx_satd_sse2(const int16_t *coeff, int length) {
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
+}
+
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                             tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For high bitdepths, it is unnecessary to store_tran_low
+  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+  // next stage.  Output to an intermediate buffer first, then store_tran_low()
+  // in the final stage.
+  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+  int16_t *t_coeff = temp_coeff;
+#else
+  int16_t *t_coeff = coeff;
+#endif
+  int idx;
+  __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+      b3_lo;
+  __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+      b3_hi;
+  __m128i b0, b1, b2, b3;
+  const __m128i zero = _mm_setzero_si128();
+  for (idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    hadamard_16x16_sse2(src_ptr, src_stride,
+                        (tran_low_t *)(t_coeff + idx * 256), 0);
+  }
+
+  for (idx = 0; idx < 256; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
+
+    // Sign extend 16 bit to 32 bit.
+    sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+    sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+    b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+    b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+    b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+    b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+    b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+    b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+    b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+    b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+    b0_lo = _mm_srai_epi32(b0_lo, 2);
+    b1_lo = _mm_srai_epi32(b1_lo, 2);
+    b2_lo = _mm_srai_epi32(b2_lo, 2);
+    b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+    b0_hi = _mm_srai_epi32(b0_hi, 2);
+    b1_hi = _mm_srai_epi32(b1_hi, 2);
+    b2_hi = _mm_srai_epi32(b2_hi, 2);
+    b3_hi = _mm_srai_epi32(b3_hi, 2);
+
+    b0 = _mm_packs_epi32(b0_lo, b0_hi);
+    b1 = _mm_packs_epi32(b1_lo, b1_hi);
+    b2 = _mm_packs_epi32(b2_lo, b2_hi);
+    b3 = _mm_packs_epi32(b3_lo, b3_hi);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    store_tran_low(coeff0, coeff);
+    store_tran_low(coeff1, coeff + 256);
+
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+    store_tran_low(coeff2, coeff + 512);
+    store_tran_low(coeff3, coeff + 768);
+
+    coeff += 8;
+    t_coeff += 8;
+  }
+}
+
+int vpx_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
   __m128i accum = zero;
 
   for (i = 0; i < length; i += 8) {
-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i src_line = load_tran_low(coeff);
     const __m128i inv = _mm_sub_epi16(zero, src_line);
     const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
     const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
@@ -309,7 +501,7 @@ int vpx_satd_sse2(const int16_t *coeff, int length) {
   return _mm_cvtsi128_si32(accum);
 }
 
-void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
@@ -358,16 +550,16 @@ void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
   _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
-int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) {
   __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
   __m128i s0 = _mm_sad_epu8(src_line, zero);
   __m128i s1;
   int i;
 
   for (i = 16; i < width; i += 16) {
     ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
+    src_line = _mm_loadu_si128((const __m128i *)ref);
     s1 = _mm_sad_epu8(src_line, zero);
     s0 = _mm_adds_epu16(s0, s1);
   }
@@ -378,7 +570,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
   return _mm_extract_epi16(s0, 0);
 }
 
-int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) {
+int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) {
   int idx;
   int width = 4 << bwl;
   int16_t mean;
@@ -416,7 +608,7 @@ int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) {
   v1 = _mm_srli_epi64(sse, 32);
   sse = _mm_add_epi32(sse, v1);
 
-  mean = _mm_extract_epi16(sum, 0);
+  mean = (int16_t)_mm_extract_epi16(sum, 0);
 
   return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c
new file mode 100644
index 0000000000..f4357998c9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  int row = 0;
+  // comp_pred and pred must be 32 byte aligned.
+  assert(((intptr_t)comp_pred % 32) == 0);
+  assert(((intptr_t)pred % 32) == 0);
+
+  if (width == 8) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i p = _mm256_load_si256((const __m256i *)pred);
+      const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+      const __m128i r_1 =
+          _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+
+      const __m128i r1 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride)));
+      const __m128i r2 = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride)));
+
+      const __m256i ref_0123 =
+          _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1);
+      const __m256i avg = _mm256_avg_epu8(p, ref_0123);
+
+      _mm256_store_si256((__m256i *)comp_pred, avg);
+
+      row += 4;
+      pred += 32;
+      comp_pred += 32;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 16) {
+    assert(height % 4 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i tmp0 =
+          _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref));
+      const __m256i ref_0 = _mm256_inserti128_si256(
+          tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+      const __m256i tmp1 = _mm256_castsi128_si256(
+          _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+      const __m256i ref_1 = _mm256_inserti128_si256(
+          tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 4;
+      pred += 64;
+      comp_pred += 64;
+      ref += 4 * ref_stride;
+    } while (row < height);
+  } else if (width == 32) {
+    assert(height % 2 == 0);
+    do {
+      const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred);
+      const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32));
+      const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref);
+      const __m256i ref_1 =
+          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+      const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+      const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+      _mm256_store_si256((__m256i *)comp_pred, average_0);
+      _mm256_store_si256((__m256i *)(comp_pred + 32), average_1);
+
+      row += 2;
+      pred += 64;
+      comp_pred += 64;
+      ref += 2 * ref_stride;
+    } while (row < height);
+  } else if (width % 64 == 0) {
+    do {
+      int x;
+      for (x = 0; x < width; x += 64) {
+        const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x));
+        const __m256i pred_1 =
+            _mm256_load_si256((const __m256i *)(pred + x + 32));
+        const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+        const __m256i ref_1 =
+            _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+        const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+        const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+        _mm256_store_si256((__m256i *)(comp_pred + x), average_0);
+        _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1);
+      }
+      row++;
+      pred += width;
+      comp_pred += width;
+      ref += ref_stride;
+    } while (row < height);
+  } else {
+    vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
new file mode 100644
index 0000000000..c6e70f744e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride) {
+  /* comp_pred and pred must be 16 byte aligned. */
+  assert(((intptr_t)comp_pred & 0xf) == 0);
+  assert(((intptr_t)pred & 0xf) == 0);
+  if (width > 8) {
+    int x, y;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; x += 16) {
+        const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
+        const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
+        const __m128i avg = _mm_avg_epu8(p, r);
+        _mm_store_si128((__m128i *)(comp_pred + x), avg);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else {  // width must be 4 or 8.
+    int i;
+    // Process 16 elements at a time. comp_pred and pred have width == stride
+    // and therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+    // all divisible by 16 so just ref needs to be massaged when loading.
+    for (i = 0; i < width * height; i += 16) {
+      const __m128i p = _mm_load_si128((const __m128i *)pred);
+      __m128i r;
+      __m128i avg;
+      if (width == ref_stride) {
+        r = _mm_loadu_si128((const __m128i *)ref);
+        ref += 16;
+      } else if (width == 4) {
+        r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+                          loadu_int32(ref + 2 * ref_stride),
+                          loadu_int32(ref + ref_stride), loadu_int32(ref));
+
+        ref += 4 * ref_stride;
+      } else {
+        const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
+        assert(width == 8);
+        r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
+                                          (const __m64 *)(ref + ref_stride)));
+
+        ref += 2 * ref_stride;
+      }
+      avg = _mm_avg_epu8(p, r);
+      _mm_store_si128((__m128i *)comp_pred, avg);
+
+      pred += 16;
+      comp_pred += 16;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
index 26412e8e43..9122b5a401 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,42 +8,50 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-%define private_prefix vpx
-
 %include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the hadamard transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
 ; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
+%macro TRANSPOSE8X8 10
+  ; stage 1
+  punpcklwd  m%9, m%1, m%2
+  punpcklwd  m%10, m%3, m%4
+  punpckhwd  m%1, m%2
+  punpckhwd  m%3, m%4
 
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
+  punpcklwd  m%2, m%5, m%6
+  punpcklwd  m%4, m%7, m%8
+  punpckhwd  m%5, m%6
+  punpckhwd  m%7, m%8
 
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
+  ; stage 2
+  punpckldq  m%6, m%9, m%10
+  punpckldq  m%8, m%1, m%3
+  punpckhdq  m%9, m%10
+  punpckhdq  m%1, m%3
 
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
+  punpckldq  m%10, m%2, m%4
+  punpckldq  m%3, m%5, m%7
+  punpckhdq  m%2, m%4
+  punpckhdq  m%5, m%7
 
-  SWAP  %2, %5
-  SWAP  %4, %7
+  ; stage 3
+  punpckhqdq  m%4, m%9, m%2  ; out3
+  punpcklqdq  m%9, m%2       ; out2
+  punpcklqdq  m%7, m%1, m%5  ; out6
+  punpckhqdq  m%1, m%5       ; out7
+
+  punpckhqdq  m%2, m%6, m%10 ; out1
+  punpcklqdq  m%6, m%10      ; out0
+  punpcklqdq  m%5, m%8, m%3  ; out4
+  punpckhqdq  m%8, m%3       ; out5
+
+  SWAP %6, %1
+  SWAP %3, %9
+  SWAP %8, %6
 %endmacro
 
 %macro HMD8_1D 0
@@ -87,8 +95,9 @@ SECTION .text
   SWAP               7, 9
 %endmacro
 
+
 INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+cglobal hadamard_8x8, 3, 5, 11, input, stride, output
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
 
@@ -105,17 +114,17 @@ cglobal hadamard_8x8, 3, 5, 10, input, stride, output
   mova               m7, [inputq + r3]
 
   HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
   HMD8_1D
 
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
+  STORE_TRAN_LOW 0, outputq,  0, 8, 9
+  STORE_TRAN_LOW 1, outputq,  8, 8, 9
+  STORE_TRAN_LOW 2, outputq, 16, 8, 9
+  STORE_TRAN_LOW 3, outputq, 24, 8, 9
+  STORE_TRAN_LOW 4, outputq, 32, 8, 9
+  STORE_TRAN_LOW 5, outputq, 40, 8, 9
+  STORE_TRAN_LOW 6, outputq, 48, 8, 9
+  STORE_TRAN_LOW 7, outputq, 56, 8, 9
 
   RET
 %endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..c02b47a3eb
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 16 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+  return _mm256_packs_epi32(a_low, a_high);
+#else
+  return _mm256_loadu_si256((const __m256i *)a);
+#endif
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+  const __m256i a_lo = _mm256_mullo_epi16(a, one);
+  const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+  const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+  _mm256_storeu_si256((__m256i *)b, a_1);
+  _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+#else
+  _mm256_storeu_si256((__m256i *)b, a);
+#endif
+}
+#endif  // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
new file mode 100644
index 0000000000..aacf71f7ac
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -0,0 +1,90 @@
+;
+;  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so can not be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Increment register by sizeof() tran_low_t * 8.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+  add %1, 32
+%else
+  add %1, 16
+%endif
+%endmacro
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea %1, [%1 + %2 * 4]
+%else
+  lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     m%1, [%2 + (%3) * 4]
+  packssdw m%1, [%2 + (%3) * 4 + 16]
+%else
+  mova     m%1, [%2 + (%3) * 2]
+%endif
+%endmacro
+
+; Store m%1 to %2 + %3.
+; %3 is the offset in elements, not bytes.
+; If 5 arguments are provided then m%1 is corrupted.
+; If 6 arguments are provided then m%1 is preserved.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; Uses m%4-m%6 as scratch registers for high bit depth.
+%macro STORE_TRAN_LOW 5-6
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor                      m%4, m%4
+  mova                      m%5, m%1
+  %if %0 == 6
+  mova                      m%6, m%1
+  %endif
+  pcmpgtw                   m%4, m%1
+  punpcklwd                 m%5, m%4
+  %if %0 == 5
+  punpckhwd                 m%1, m%4
+  %else
+  punpckhwd                 m%6, m%4
+  %endif
+  mova     [%2 + (%3) * 4 +  0], m%5
+  %if %0 == 5
+  mova     [%2 + (%3) * 4 + 16], m%1
+  %else
+  mova     [%2 + (%3) * 4 + 16], m%6
+  %endif
+%else
+  mova          [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store zeros (in m%1) to %2 + %3.
+; %3 is the offset in elements, not bytes.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     [%2 + (%3) * 4 +  0], m%1
+  mova     [%2 + (%3) * 4 + 16], m%1
+%else
+  mova          [%2 + (%3) * 2], m%1
+%endif
+%endmacro
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fdct.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
similarity index 76%
rename from media/libvpx/libvpx/vpx_dsp/x86/fdct.h
rename to media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
index 54a6d81fcb..74dde656b1 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fdct.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPX_DSP_X86_FDCT_H_
-#define VPX_DSP_X86_FDCT_H_
+#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
 
 #include <xmmintrin.h>
 
@@ -16,13 +16,12 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-// Load 8 16 bit values. If the source is 32 bits then cast down.
-// This does not saturate values. It only truncates.
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
 static INLINE __m128i load_tran_low(const tran_low_t *a) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
-                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
-                        (int16_t)a[6], (int16_t)a[7]);
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
 #else
   return _mm_load_si128((const __m128i *)a);
 #endif
@@ -54,4 +53,4 @@ static INLINE void store_zero_tran_low(tran_low_t *a) {
   _mm_store_si128((__m128i *)(a), zero);
 #endif
 }
-#endif  // VPX_DSP_X86_FDCT_H_
+#endif  // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
index d7468ad7ca..c339600556 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h
@@ -7,89 +7,130 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPX_DSP_X86_CONVOLVE_H_
-#define VPX_DSP_X86_CONVOLVE_H_
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_H_
 
 #include <assert.h>
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
+#include "vpx_ports/compiler_attributes.h"
 
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
+// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
+// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function
+// assumes the filter is always 8 tap.
 typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height, const int16_t *filter);
 
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
+// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we
+// have 4-tap vert avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
   void vpx_convolve8_##name##_##opt(                                         \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    (void)filter_x;                                                          \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    const int16_t *filter_row = filter[offset];                              \
+    (void)x0_q4;                                                             \
     (void)x_step_q4;                                                         \
-    (void)filter_y;                                                          \
+    (void)y0_q4;                                                             \
     (void)y_step_q4;                                                         \
-    assert(filter[3] != 128);                                                \
+    assert(filter_row[3] != 128);                                            \
     assert(step_q4 == 16);                                                   \
-    if (filter[0] | filter[1] | filter[2]) {                                 \
+    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
+      const int num_taps = 8;                                                \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
-    } else {                                                                 \
+      (void)num_taps;                                                        \
+    } else if (filter_row[2] | filter_row[5]) {                              \
+      const int num_taps = is_avg ? 8 : 4;                                   \
       while (w >= 16) {                                                      \
-        vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
-                                                 dst_stride, h, filter);     \
+        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
-        vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
+        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
-        vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
+        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
+      (void)num_taps;                                                        \
+    } else {                                                                 \
+      const int num_taps = 2;                                                \
+      while (w >= 16) {                                                      \
+        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter_row); \
+        src += 16;                                                           \
+        dst += 16;                                                           \
+        w -= 16;                                                             \
+      }                                                                      \
+      if (w == 8) {                                                          \
+        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter_row);  \
+      } else if (w == 4) {                                                   \
+        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
+                                                dst_stride, h, filter_row);  \
+      }                                                                      \
+      (void)num_taps;                                                        \
     }                                                                        \
   }
 
-#define FUN_CONV_2D(avg, opt)                                                 \
-  void vpx_convolve8_##avg##opt(                                              \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
-    assert(filter_x[3] != 128);                                               \
-    assert(filter_y[3] != 128);                                               \
-    assert(w <= 64);                                                          \
-    assert(h <= 64);                                                          \
-    assert(x_step_q4 == 16);                                                  \
-    assert(y_step_q4 == 16);                                                  \
-    if (filter_x[0] | filter_x[1] | filter_x[2]) {                            \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                          \
-      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, w,  \
-                                h + 7);                                       \
-      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,   \
-                                      filter_x, x_step_q4, filter_y,          \
-                                      y_step_q4, w, h);                       \
-    } else {                                                                  \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]);                          \
-      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x,        \
-                                x_step_q4, filter_y, y_step_q4, w, h + 1);    \
-      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x,  \
-                                      x_step_q4, filter_y, y_step_q4, w, h);  \
-    }                                                                         \
+#define FUN_CONV_2D(avg, opt, is_avg)                                          \
+  void vpx_convolve8_##avg##opt(                                               \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \
+    const int16_t *filter_x = filter[x0_q4];                                   \
+    const int16_t *filter_y = filter[y0_q4];                                   \
+    (void)filter_y;                                                            \
+    assert(filter_x[3] != 128);                                                \
+    assert(filter_y[3] != 128);                                                \
+    assert(w <= 64);                                                           \
+    assert(h <= 64);                                                           \
+    assert(x_step_q4 == 16);                                                   \
+    assert(y_step_q4 == 16);                                                   \
+    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) {               \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
+      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \
+                                filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+                                h + 7);                                        \
+      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \
+                                      filter, x0_q4, x_step_q4, y0_q4,         \
+                                      y_step_q4, w, h);                        \
+    } else if (filter_x[2] | filter_x[5]) {                                    \
+      const int num_taps = is_avg ? 8 : 4;                                     \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
+      vpx_convolve8_horiz_##opt(                                               \
+          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,       \
+          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1);    \
+      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64,    \
+                                      dst, dst_stride, filter, x0_q4,          \
+                                      x_step_q4, y0_q4, y_step_q4, w, h);      \
+    } else {                                                                   \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED);         \
+      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \
+                                x_step_q4, y0_q4, y_step_q4, w, h + 1);        \
+      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter,     \
+                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,   \
+                                      h);                                      \
+    }                                                                          \
   }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -101,98 +142,138 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        unsigned int output_height,
                                        const int16_t *filter, int bd);
 
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void vpx_highbd_convolve8_##name##_##opt(                               \
-      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,           \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,       \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {     \
-    if (step_q4 == 16 && filter[3] != 128) {                              \
-      uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
-      uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                          \
-      if (filter[0] | filter[1] | filter[2]) {                            \
-        while (w >= 16) {                                                 \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 16;                                                      \
-          dst += 16;                                                      \
-          w -= 16;                                                        \
-        }                                                                 \
-        while (w >= 8) {                                                  \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 8;                                                       \
-          dst += 8;                                                       \
-          w -= 8;                                                         \
-        }                                                                 \
-        while (w >= 4) {                                                  \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 4;                                                       \
-          dst += 4;                                                       \
-          w -= 4;                                                         \
-        }                                                                 \
-      } else {                                                            \
-        while (w >= 16) {                                                 \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 16;                                                      \
-          dst += 16;                                                      \
-          w -= 16;                                                        \
-        }                                                                 \
-        while (w >= 8) {                                                  \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 8;                                                       \
-          dst += 8;                                                       \
-          w -= 8;                                                         \
-        }                                                                 \
-        while (w >= 4) {                                                  \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 4;                                                       \
-          dst += 4;                                                       \
-          w -= 4;                                                         \
-        }                                                                 \
-      }                                                                   \
-    }                                                                     \
-    if (w) {                                                              \
-      vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
-                                      filter_x, x_step_q4, filter_y,      \
-                                      y_step_q4, w, h, bd);               \
-    }                                                                     \
-  }
-
-#define HIGH_FUN_CONV_2D(avg, opt)                                            \
-  void vpx_highbd_convolve8_##avg##opt(                                       \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
-    assert(w <= 64);                                                          \
-    assert(h <= 64);                                                          \
-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \
-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {  \
-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                       \
-        vpx_highbd_convolve8_horiz_##opt(                                     \
-            src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \
-            filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd);          \
-        vpx_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x,  \
-            x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt,     \
+                         is_avg)                                              \
+  void vpx_highbd_convolve8_##name##_##opt(                                   \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
+    const int16_t *filter_row = filter_kernel[offset];                        \
+    if (step_q4 == 16 && filter_row[3] != 128) {                              \
+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
+        const int num_taps = 8;                                               \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+        (void)num_taps;                                                       \
+      } else if (filter_row[2] | filter_row[5]) {                             \
+        const int num_taps = is_avg ? 8 : 4;                                  \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+        (void)num_taps;                                                       \
       } else {                                                                \
-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                       \
-        vpx_highbd_convolve8_horiz_##opt(                                     \
-            src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x,        \
-            x_step_q4, filter_y, y_step_q4, w, h + 1, bd);                    \
-        vpx_highbd_convolve8_##avg##vert_##opt(                               \
-            CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x,        \
-            x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
+        const int num_taps = 2;                                               \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+        (void)num_taps;                                                       \
       }                                                                       \
-    } else {                                                                  \
-      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \
-                                    filter_x, x_step_q4, filter_y, y_step_q4, \
-                                    w, h, bd);                                \
+    }                                                                         \
+    if (w) {                                                                  \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
+                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
+                                      y_step_q4, w, h, bd);                   \
     }                                                                         \
   }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#endif  // VPX_DSP_X86_CONVOLVE_H_
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg)                                     \
+  void vpx_highbd_convolve8_##avg##opt(                                        \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,                \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \
+    const int16_t *filter_x = filter[x0_q4];                                   \
+    assert(w <= 64);                                                           \
+    assert(h <= 64);                                                           \
+    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
+      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) ||           \
+          filter_x[3] == 128) {                                                \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
+        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \
+                                         fdata2, 64, filter, x0_q4, x_step_q4, \
+                                         y0_q4, y_step_q4, w, h + 7, bd);      \
+        vpx_highbd_convolve8_##avg##vert_##opt(                                \
+            fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \
+            y0_q4, y_step_q4, w, h, bd);                                       \
+      } else if (filter_x[2] | filter_x[5]) {                                  \
+        const int num_taps = is_avg ? 8 : 4;                                   \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
+        vpx_highbd_convolve8_horiz_##opt(                                      \
+            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,     \
+            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1,   \
+            bd);                                                               \
+        vpx_highbd_convolve8_##avg##vert_##opt(                                \
+            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter,     \
+            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);                     \
+      } else {                                                                 \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED);      \
+        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \
+                                         x0_q4, x_step_q4, y0_q4, y_step_q4,   \
+                                         w, h + 1, bd);                        \
+        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,    \
+                                               filter, x0_q4, x_step_q4,       \
+                                               y0_q4, y_step_q4, w, h, bd);    \
+      }                                                                        \
+    } else {                                                                   \
+      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \
+                                    bd);                                       \
+    }                                                                          \
+  }
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VPX_VPX_DSP_X86_CONVOLVE_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000000..ebee964b18
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h>  // AVX2
+
+#include "./vpx_config.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
+    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
+    (defined(__APPLE__) && defined(__apple_build_version__) && \
+     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
+      (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+  _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else  // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif  // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+  _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else  // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif  // gcc <= 4.6
+#else   // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif  // __clang__
+
+static INLINE void shuffle_filter_avx2(const int16_t *const filter,
+                                       __m256i *const f) {
+  const __m256i f_values =
+      MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter));
+  // pack and duplicate the filter values
+  f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u));
+  f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u));
+  f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u));
+  f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
+                                        const __m256i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m256i k_64 = _mm256_set1_epi16(1 << 6);
+  const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]);
+  const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);
+  const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);
+  const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);
+  __m256i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm256_add_epi16(x0, x2);
+  sum2 = _mm256_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm256_add_epi16(sum1, k_64);
+  sum1 = _mm256_adds_epi16(sum1, sum2);
+  // round and shift by 7 bit each 16 bit
+  sum1 = _mm256_srai_epi16(sum1, 7);
+  return sum1;
+}
+
+static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
+                                       const __m256i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]),
+                                       _mm256_castsi256_si128(f[0]));
+  const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]),
+                                       _mm256_castsi256_si128(f[1]));
+  const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]),
+                                       _mm256_castsi256_si128(f[2]));
+  const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),
+                                       _mm256_castsi256_si128(f[3]));
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
+}
+
+static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) {
+  const __m256i tmp =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
+  return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1);
+}
+
+static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) {
+  const __m256i tmp =
+      _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo));
+  return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1);
+}
+
+static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1,
+                                      __m128i *const dst_ptr_2,
+                                      const __m256i *const src) {
+  _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src));
+  _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
+                                       __m128i *const dst_ptr_2,
+                                       const __m256i *const src) {
+  _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src));
+  _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
+                                       __m128i *const dst_ptr_2,
+                                       const __m256i *const src) {
+  *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+  *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+}
+
+static INLINE __m256i mm256_round_epi32(const __m256i *const src,
+                                        const __m256i *const half_depth,
+                                        const int depth) {
+  const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth);
+  return _mm256_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m256i mm256_round_epi16(const __m256i *const src,
+                                        const __m256i *const half_depth,
+                                        const int depth) {
+  const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth);
+  return _mm256_srai_epi16(nearest_src, depth);
+}
+
+static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0,
+                                           const __m256i *const src_1,
+                                           const __m256i *const ker_0,
+                                           const __m256i *const ker_1) {
+  const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0);
+  const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1);
+  return _mm256_add_epi32(tmp_0, tmp_1);
+}
+
+#undef MM256_BROADCASTSI128_SI256
+
+#endif  // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
new file mode 100644
index 0000000000..8443546394
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
+// values at index 2 and 3 to return 3 2 3 2 3 2 3 2 as 16-bit words
+static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
+  __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
+  return _mm_unpackhi_epi64(tmp, tmp);
+}
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns
+// values at index 2 and 3 to return 5 4 5 4 5 4 5 4 as 16-bit words.
+static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
+  __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
+  return _mm_unpacklo_epi64(tmp, tmp);
+}
+
+// Interprets src as 8-bit words, zero extends to form 16-bit words, then
+// multiplies with ker and add the adjacent results to form 32-bit words.
+// Finally adds the result from 1 and 2 together.
+static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1,
+                                            const __m128i *const src_2,
+                                            const __m128i *const ker_1,
+                                            const __m128i *const ker_2) {
+  const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
+  const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
+  const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
+  const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
+  return _mm_add_epi32(madd_1, madd_2);
+}
+
+// Interprets src as 16-bit words, then multiplies with ker and add the
+// adjacent results to form 32-bit words. Finally adds the result from 1 and 2
+// together.
+static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1,
+                                             const __m128i *const src_2,
+                                             const __m128i *const ker_1,
+                                             const __m128i *const ker_2) {
+  const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1);
+  const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2);
+  return _mm_add_epi32(madd_1, madd_2);
+}
+
+static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0,
+                                               const __m128i *const src_1,
+                                               const __m128i *const ker) {
+  const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
+  const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
+  return _mm_packs_epi32(madd_1, madd_2);
+}
+
+// Interleaves src_1 and src_2
+static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1,
+                                        const __m128i *const src_2) {
+  const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
+  const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
+  return _mm_packs_epi32(tmp_1, tmp_2);
+}
+
+static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src,
+                                          const __m128i *const half_depth,
+                                          const int depth) {
+  const __m128i nearest_src = _mm_add_epi32(*src, *half_depth);
+  return _mm_srai_epi32(nearest_src, depth);
+}
+
+static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src,
+                                          const __m128i *const half_depth,
+                                          const int depth) {
+  const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
+  return _mm_srai_epi16(nearest_src, depth);
+}
+
+#endif  // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..8a4b165133
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <assert.h>
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vpx_config.h"
+
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+                                        __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
+  assert(filter[3] >= 0 && filter[3] < 256);
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+                                        const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // outranges for all filters
+  sum1 = _mm_add_epi16(x0, x2);
+  sum2 = _mm_add_epi16(x1, x3);
+  // add the rounding offset early to avoid another saturated add
+  sum1 = _mm_add_epi16(sum1, k_64);
+  sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift by 7 bit each 16 bit
+  sum1 = _mm_srai_epi16(sum1, 7);
+  return sum1;
+}
+
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+                                                    const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate the subtracted 64 in f[1]. x4 is always non negative.
+  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+  // add and saturate the results together
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x4);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+                                                   const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate the subtracted 64 in f[2]. x5 is always non negative.
+  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+  __m128i temp;
+
+  // add and saturate the results together
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x3);
+  temp = _mm_adds_epi16(temp, x4);
+  temp = _mm_adds_epi16(temp, x5);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+#endif  // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
index ebca50930a..b3af677d2e 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -78,11 +78,13 @@
 %endmacro
 
 %macro UPDATE_FLIMIT 0
-        movdqa      xmm2,       XMMWORD PTR [rbx]
-        movdqa      [rsp],      xmm2
+        movdqu      xmm2,       XMMWORD PTR [rbx]
+        movdqu      [rsp],      xmm2
         add         rbx,        16
 %endmacro
 
+SECTION .text
+
 ;void vpx_post_proc_down_and_across_mb_row_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -93,7 +95,7 @@
 ;    int *flimits,
 ;    int size
 ;)
-global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
+globalsym(vpx_post_proc_down_and_across_mb_row_sse2)
 sym(vpx_post_proc_down_and_across_mb_row_sse2):
     push        rbp
     mov         rbp, rsp
@@ -230,241 +232,10 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
     ret
 %undef flimit
 
-;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
-;                               int pitch, int rows, int cols,int flimit)
-extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_sse2) PRIVATE
-sym(vpx_mbpost_proc_down_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 128+16
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-    mov         [rsp+128+8], eax
-    mov         [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(vpx_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword arg(2), 8
-
-    ;for(c=0; c<cols; c+=8)
-.loop_col:
-            mov         rsi,        arg(0) ; s
-            pxor        xmm0,       xmm0        ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-
-            ; this copies the last row down into the border 8 rows
-            mov         rdi,        rsi
-            mov         rdx,        arg(2)
-            sub         rdx,        9
-            imul        rdx,        rax
-            lea         rdi,        [rdi+rdx]
-            movq        xmm1,       QWORD ptr[rdi]              ; first row
-            mov         rcx,        8
-.init_borderd:                                                  ; initialize borders
-            lea         rdi,        [rdi + rax]
-            movq        [rdi],      xmm1
-
-            dec         rcx
-            jne         .init_borderd
-
-            neg         rax                                     ; rax = -pitch
-
-            ; this copies the first row up into the border 8 rows
-            mov         rdi,        rsi
-            movq        xmm1,       QWORD ptr[rdi]              ; first row
-            mov         rcx,        8
-.init_border:                                                   ; initialize borders
-            lea         rdi,        [rdi + rax]
-            movq        [rdi],      xmm1
-
-            dec         rcx
-            jne         .init_border
-
-
-
-            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
-            neg         rax
-
-            pxor        xmm5,       xmm5
-            pxor        xmm6,       xmm6        ;
-
-            pxor        xmm7,       xmm7        ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movq        xmm1,       QWORD PTR [rdi];
-            punpcklbw   xmm1,       xmm0        ;
-
-            paddw       xmm5,       xmm1        ;
-            pmullw      xmm1,       xmm1        ;
-
-            movdqa      xmm2,       xmm1        ;
-            punpcklwd   xmm1,       xmm0        ;
-
-            punpckhwd   xmm2,       xmm0        ;
-            paddd       xmm6,       xmm1        ;
-
-            paddd       xmm7,       xmm2        ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
-            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   xmm1,       xmm0
-            punpcklbw   xmm2,       xmm0
-
-            paddw       xmm5,       xmm2
-            psubw       xmm5,       xmm1
-
-            pmullw      xmm2,       xmm2
-            movdqa      xmm4,       xmm2
-
-            punpcklwd   xmm2,       xmm0
-            punpckhwd   xmm4,       xmm0
-
-            paddd       xmm6,       xmm2
-            paddd       xmm7,       xmm4
-
-            pmullw      xmm1,       xmm1
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm0
-            psubd       xmm6,       xmm1
-
-            punpckhwd   xmm2,       xmm0
-            psubd       xmm7,       xmm2
-
-
-            movdqa      xmm3,       xmm6
-            pslld       xmm3,       4
-
-            psubd       xmm3,       xmm6
-            movdqa      xmm1,       xmm5
-
-            movdqa      xmm4,       xmm5
-            pmullw      xmm1,       xmm1
-
-            pmulhw      xmm4,       xmm4
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm4
-            punpckhwd   xmm2,       xmm4
-
-            movdqa      xmm4,       xmm7
-            pslld       xmm4,       4
-
-            psubd       xmm4,       xmm7
-
-            psubd       xmm3,       xmm1
-            psubd       xmm4,       xmm2
-
-            psubd       xmm3,       flimit4
-            psubd       xmm4,       flimit4
-
-            psrad       xmm3,       31
-            psrad       xmm4,       31
-
-            packssdw    xmm3,       xmm4
-            packsswb    xmm3,       xmm0
-
-            movq        xmm1,       QWORD PTR [rsi+rax*8]
-
-            movq        xmm2,       xmm1
-            punpcklbw   xmm1,       xmm0
-
-            paddw       xmm1,       xmm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(vpx_rv))]
-            movdqu      xmm4,       [rax + rcx*2] ;vpx_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movdqu      xmm4,       [r8 + rcx*2] ;vpx_rv[rcx*2]
-%else
-            movdqu      xmm4,       [sym(vpx_rv) + rcx*2]
-%endif
-
-            paddw       xmm1,       xmm4
-            ;paddw     xmm1,       eight8s
-            psraw       xmm1,       4
-
-            packuswb    xmm1,       xmm0
-            pand        xmm1,       xmm3
-
-            pandn       xmm3,       xmm2
-            por         xmm1,       xmm3
-
-            and         rcx,        15
-            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
-
-            cmp         edx,        8
-            jl          .skip_assignment
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-            and         rcx,        15
-            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
-            movq        [rsi],      mm0
-
-.skip_assignment:
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-        add         dword arg(0), 8 ; s += 8
-        sub         dword arg(3), 8 ; cols -= 8
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 128+16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
-
 
 ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
 ;                                    int pitch, int rows, int cols,int flimit)
-global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
+globalsym(vpx_mbpost_proc_across_ip_sse2)
 sym(vpx_mbpost_proc_across_ip_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 39d3a3f59c..f3a8020292 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
   const __m256i k__cospi_p16_m16 =
       pair256_set_epi16(+cospi_16_64, -cospi_16_64);
   const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
@@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
   const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
   const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
   const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  const __m256i kZero = _mm256_set1_epi16(0);
+  const __m256i kZero = _mm256_setzero_si256();
   const __m256i kOne = _mm256_set1_epi16(1);
   // Do the two transform/transpose passes
   int pass;
@@ -1374,59 +1374,37 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
         __m256i lstep1[64], lstep2[64], lstep3[64];
         __m256i u[32], v[32], sign[16];
         const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+        const __m256i k__pOne_mOne = pair256_set_epi16(1, -1);
         // start using 32-bit operations
         // stage 3
         {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+          // expanding to 32-bit length while adding and subtracting
+          lstep2[0] = _mm256_unpacklo_epi16(step2[0], step2[7]);
+          lstep2[1] = _mm256_unpackhi_epi16(step2[0], step2[7]);
+          lstep2[2] = _mm256_unpacklo_epi16(step2[1], step2[6]);
+          lstep2[3] = _mm256_unpackhi_epi16(step2[1], step2[6]);
+          lstep2[4] = _mm256_unpacklo_epi16(step2[2], step2[5]);
+          lstep2[5] = _mm256_unpackhi_epi16(step2[2], step2[5]);
+          lstep2[6] = _mm256_unpacklo_epi16(step2[3], step2[4]);
+          lstep2[7] = _mm256_unpackhi_epi16(step2[3], step2[4]);
 
-          lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]);
+          lstep3[0] = _mm256_madd_epi16(lstep2[0], kOne);
+          lstep3[1] = _mm256_madd_epi16(lstep2[1], kOne);
+          lstep3[2] = _mm256_madd_epi16(lstep2[2], kOne);
+          lstep3[3] = _mm256_madd_epi16(lstep2[3], kOne);
+          lstep3[4] = _mm256_madd_epi16(lstep2[4], kOne);
+          lstep3[5] = _mm256_madd_epi16(lstep2[5], kOne);
+          lstep3[6] = _mm256_madd_epi16(lstep2[6], kOne);
+          lstep3[7] = _mm256_madd_epi16(lstep2[7], kOne);
+
+          lstep3[8] = _mm256_madd_epi16(lstep2[6], k__pOne_mOne);
+          lstep3[9] = _mm256_madd_epi16(lstep2[7], k__pOne_mOne);
+          lstep3[10] = _mm256_madd_epi16(lstep2[4], k__pOne_mOne);
+          lstep3[11] = _mm256_madd_epi16(lstep2[5], k__pOne_mOne);
+          lstep3[12] = _mm256_madd_epi16(lstep2[2], k__pOne_mOne);
+          lstep3[13] = _mm256_madd_epi16(lstep2[3], k__pOne_mOne);
+          lstep3[14] = _mm256_madd_epi16(lstep2[0], k__pOne_mOne);
+          lstep3[15] = _mm256_madd_epi16(lstep2[1], k__pOne_mOne);
         }
         {
           const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
@@ -1468,126 +1446,76 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
           lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
         }
         {
-          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+          lstep1[32] = _mm256_unpacklo_epi16(step1[16], step2[23]);
+          lstep1[33] = _mm256_unpackhi_epi16(step1[16], step2[23]);
+          lstep1[34] = _mm256_unpacklo_epi16(step1[17], step2[22]);
+          lstep1[35] = _mm256_unpackhi_epi16(step1[17], step2[22]);
+          lstep1[36] = _mm256_unpacklo_epi16(step1[18], step2[21]);
+          lstep1[37] = _mm256_unpackhi_epi16(step1[18], step2[21]);
+          lstep1[38] = _mm256_unpacklo_epi16(step1[19], step2[20]);
+          lstep1[39] = _mm256_unpackhi_epi16(step1[19], step2[20]);
 
-          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+          lstep1[56] = _mm256_unpacklo_epi16(step1[28], step2[27]);
+          lstep1[57] = _mm256_unpackhi_epi16(step1[28], step2[27]);
+          lstep1[58] = _mm256_unpacklo_epi16(step1[29], step2[26]);
+          lstep1[59] = _mm256_unpackhi_epi16(step1[29], step2[26]);
+          lstep1[60] = _mm256_unpacklo_epi16(step1[30], step2[25]);
+          lstep1[61] = _mm256_unpackhi_epi16(step1[30], step2[25]);
+          lstep1[62] = _mm256_unpacklo_epi16(step1[31], step2[24]);
+          lstep1[63] = _mm256_unpackhi_epi16(step1[31], step2[24]);
 
-          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+          lstep3[32] = _mm256_madd_epi16(lstep1[32], kOne);
+          lstep3[33] = _mm256_madd_epi16(lstep1[33], kOne);
+          lstep3[34] = _mm256_madd_epi16(lstep1[34], kOne);
+          lstep3[35] = _mm256_madd_epi16(lstep1[35], kOne);
+          lstep3[36] = _mm256_madd_epi16(lstep1[36], kOne);
+          lstep3[37] = _mm256_madd_epi16(lstep1[37], kOne);
+          lstep3[38] = _mm256_madd_epi16(lstep1[38], kOne);
+          lstep3[39] = _mm256_madd_epi16(lstep1[39], kOne);
 
-          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+          lstep3[40] = _mm256_madd_epi16(lstep1[38], k__pOne_mOne);
+          lstep3[41] = _mm256_madd_epi16(lstep1[39], k__pOne_mOne);
+          lstep3[42] = _mm256_madd_epi16(lstep1[36], k__pOne_mOne);
+          lstep3[43] = _mm256_madd_epi16(lstep1[37], k__pOne_mOne);
+          lstep3[44] = _mm256_madd_epi16(lstep1[34], k__pOne_mOne);
+          lstep3[45] = _mm256_madd_epi16(lstep1[35], k__pOne_mOne);
+          lstep3[46] = _mm256_madd_epi16(lstep1[32], k__pOne_mOne);
+          lstep3[47] = _mm256_madd_epi16(lstep1[33], k__pOne_mOne);
+
+          lstep3[48] = _mm256_madd_epi16(lstep1[62], k__pOne_mOne);
+          lstep3[49] = _mm256_madd_epi16(lstep1[63], k__pOne_mOne);
+          lstep3[50] = _mm256_madd_epi16(lstep1[60], k__pOne_mOne);
+          lstep3[51] = _mm256_madd_epi16(lstep1[61], k__pOne_mOne);
+          lstep3[52] = _mm256_madd_epi16(lstep1[58], k__pOne_mOne);
+          lstep3[53] = _mm256_madd_epi16(lstep1[59], k__pOne_mOne);
+          lstep3[54] = _mm256_madd_epi16(lstep1[56], k__pOne_mOne);
+          lstep3[55] = _mm256_madd_epi16(lstep1[57], k__pOne_mOne);
+
+          lstep3[56] = _mm256_madd_epi16(lstep1[56], kOne);
+          lstep3[57] = _mm256_madd_epi16(lstep1[57], kOne);
+          lstep3[58] = _mm256_madd_epi16(lstep1[58], kOne);
+          lstep3[59] = _mm256_madd_epi16(lstep1[59], kOne);
+          lstep3[60] = _mm256_madd_epi16(lstep1[60], kOne);
+          lstep3[61] = _mm256_madd_epi16(lstep1[61], kOne);
+          lstep3[62] = _mm256_madd_epi16(lstep1[62], kOne);
+          lstep3[63] = _mm256_madd_epi16(lstep1[63], kOne);
         }
 
         // stage 4
         {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+          // expanding to 32-bit length prior to addition operations
+          sign[0] = _mm256_cmpgt_epi16(kZero, step2[8]);
+          sign[1] = _mm256_cmpgt_epi16(kZero, step2[9]);
+          sign[2] = _mm256_cmpgt_epi16(kZero, step2[14]);
+          sign[3] = _mm256_cmpgt_epi16(kZero, step2[15]);
+          lstep2[16] = _mm256_unpacklo_epi16(step2[8], sign[0]);
+          lstep2[17] = _mm256_unpackhi_epi16(step2[8], sign[0]);
+          lstep2[18] = _mm256_unpacklo_epi16(step2[9], sign[1]);
+          lstep2[19] = _mm256_unpackhi_epi16(step2[9], sign[1]);
+          lstep2[28] = _mm256_unpacklo_epi16(step2[14], sign[2]);
+          lstep2[29] = _mm256_unpackhi_epi16(step2[14], sign[2]);
+          lstep2[30] = _mm256_unpacklo_epi16(step2[15], sign[3]);
+          lstep2[31] = _mm256_unpackhi_epi16(step2[15], sign[3]);
 
           lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]);
           lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]);
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index 3744333909..bf350b6da0 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -21,7 +21,7 @@
 #define ADD_EPI16 _mm_adds_epi16
 #define SUB_EPI16 _mm_subs_epi16
 #if FDCT32x32_HIGH_PRECISION
-void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+static void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
   int i, j;
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
@@ -35,7 +35,8 @@ void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
 #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
 #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
 #else
-void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+static void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate,
+                                    tran_low_t *out) {
   int i, j;
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
@@ -63,7 +64,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
@@ -99,8 +100,9 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
   const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
   const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
   const __m128i kOne = _mm_set1_epi16(1);
+
   // Do the two transform/transpose passes
   int pass;
 #if DCT_HIGH_BIT_DEPTH
@@ -1508,59 +1510,37 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
         __m128i lstep1[64], lstep2[64], lstep3[64];
         __m128i u[32], v[32], sign[16];
         const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
+        const __m128i k__pOne_mOne = pair_set_epi16(1, -1);
         // start using 32-bit operations
         // stage 3
         {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
+          // expanding to 32-bit length while adding and subtracting
+          lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]);
+          lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]);
+          lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]);
+          lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]);
+          lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]);
+          lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]);
+          lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]);
+          lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]);
 
-          lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
+          lstep3[0] = _mm_madd_epi16(lstep2[0], kOne);
+          lstep3[1] = _mm_madd_epi16(lstep2[1], kOne);
+          lstep3[2] = _mm_madd_epi16(lstep2[2], kOne);
+          lstep3[3] = _mm_madd_epi16(lstep2[3], kOne);
+          lstep3[4] = _mm_madd_epi16(lstep2[4], kOne);
+          lstep3[5] = _mm_madd_epi16(lstep2[5], kOne);
+          lstep3[6] = _mm_madd_epi16(lstep2[6], kOne);
+          lstep3[7] = _mm_madd_epi16(lstep2[7], kOne);
+
+          lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne);
+          lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne);
+          lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne);
+          lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne);
+          lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne);
+          lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne);
+          lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne);
+          lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne);
         }
         {
           const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
@@ -1594,126 +1574,76 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
           lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
         }
         {
-          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
+          lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]);
+          lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]);
+          lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]);
+          lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]);
+          lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]);
+          lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]);
+          lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]);
+          lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]);
 
-          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
+          lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]);
+          lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]);
+          lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]);
+          lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]);
+          lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]);
+          lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]);
+          lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]);
+          lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]);
 
-          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+          lstep3[32] = _mm_madd_epi16(lstep1[32], kOne);
+          lstep3[33] = _mm_madd_epi16(lstep1[33], kOne);
+          lstep3[34] = _mm_madd_epi16(lstep1[34], kOne);
+          lstep3[35] = _mm_madd_epi16(lstep1[35], kOne);
+          lstep3[36] = _mm_madd_epi16(lstep1[36], kOne);
+          lstep3[37] = _mm_madd_epi16(lstep1[37], kOne);
+          lstep3[38] = _mm_madd_epi16(lstep1[38], kOne);
+          lstep3[39] = _mm_madd_epi16(lstep1[39], kOne);
 
-          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+          lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne);
+          lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne);
+          lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne);
+          lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne);
+          lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne);
+          lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne);
+          lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne);
+          lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne);
+
+          lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne);
+          lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne);
+          lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne);
+          lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne);
+          lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne);
+          lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne);
+          lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne);
+          lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne);
+
+          lstep3[56] = _mm_madd_epi16(lstep1[56], kOne);
+          lstep3[57] = _mm_madd_epi16(lstep1[57], kOne);
+          lstep3[58] = _mm_madd_epi16(lstep1[58], kOne);
+          lstep3[59] = _mm_madd_epi16(lstep1[59], kOne);
+          lstep3[60] = _mm_madd_epi16(lstep1[60], kOne);
+          lstep3[61] = _mm_madd_epi16(lstep1[61], kOne);
+          lstep3[62] = _mm_madd_epi16(lstep1[62], kOne);
+          lstep3[63] = _mm_madd_epi16(lstep1[63], kOne);
         }
 
         // stage 4
         {
-          // expanding to 32-bit length priori to addition operations
-          lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
+          // expanding to 32-bit length prior to addition operations
+          sign[0] = _mm_cmpgt_epi16(kZero, step2[8]);
+          sign[1] = _mm_cmpgt_epi16(kZero, step2[9]);
+          sign[2] = _mm_cmpgt_epi16(kZero, step2[14]);
+          sign[3] = _mm_cmpgt_epi16(kZero, step2[15]);
+          lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]);
+          lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]);
+          lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]);
+          lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]);
+          lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]);
+          lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]);
+          lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]);
+          lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]);
 
           lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
           lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
index 21f11f0c3e..1eb6f41166 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c
@@ -8,8 +8,384 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <immintrin.h>  // AVX2
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
+#include "vpx_dsp/txfm_common.h"
+#define ADD256_EPI16 _mm256_add_epi16
+#define SUB256_EPI16 _mm256_sub_epi16
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size, int pass) {
+  int i;
+  const __m256i kOne = _mm256_set1_epi16(1);
+  if (pass == 0) {
+    for (i = 0; i < out_size; i++) {
+      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
+      // x = x << 2
+      out[i] = _mm256_slli_epi16(out[i], 2);
+    }
+  } else {
+    for (i = 0; i < out_size; i++) {
+      out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16));
+      // x = (x + 1) >> 2
+      out[i] = _mm256_add_epi16(out[i], kOne);
+      out[i] = _mm256_srai_epi16(out[i], 2);
+    }
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+                                       __m256i *const out) {
+  int i;
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 2)   ==>  (0, 2)
+  //   (1, 3)   ==>  (1, 3)
+  //   (4, 6)   ==>  (4, 6)
+  //   (5, 7)   ==>  (5, 7)
+  for (i = 0; i < 2; i++) {
+    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 4)   ==>  (0, 1)
+  //   (1, 5)   ==>  (4, 5)
+  //   (2, 6)   ==>  (2, 3)
+  //   (3, 7)   ==>  (6, 7)
+  for (i = 0; i < 2; i++) {
+    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+                                              __m256i *const out) {
+  __m256i t[16];
+
+#define LOADL(idx)                                                            \
+  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+  t[idx] = _mm256_inserti128_si256(                                           \
+      t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx)                                                           \
+  t[8 + idx] =                                                               \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+  t[8 + idx] = _mm256_inserti128_si256(                                      \
+      t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+  // load left 8x16
+  LOADL(0)
+  LOADL(1)
+  LOADL(2)
+  LOADL(3)
+  LOADL(4)
+  LOADL(5)
+  LOADL(6)
+  LOADL(7)
+
+  // load right 8x16
+  LOADR(0)
+  LOADR(1)
+  LOADR(2)
+  LOADR(3)
+  LOADR(4)
+  LOADR(5)
+  LOADR(6)
+  LOADR(7)
+
+  // get the top 16x8 result
+  transpose2_8x8_avx2(t, out);
+  // get the bottom 16x8 result
+  transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+// Store 8 16-bit values. Sign extend the values.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+                                                        tran_low_t *out,
+                                                        const int stride,
+                                                        const int out_size) {
+  int i;
+  for (i = 0; i < out_size; ++i) {
+    _mm256_storeu_si256((__m256i *)(out), in[i]);
+    out += stride;
+  }
+}
+
+#define PAIR256_SET_EPI16(a, b)                                            \
+  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE __m256i mult256_round_shift(const __m256i *pin0,
+                                          const __m256i *pin1,
+                                          const __m256i *pmultiplier,
+                                          const __m256i *prounding,
+                                          const int shift) {
+  const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier);
+  const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier);
+  const __m256i v0 = _mm256_add_epi32(u0, *prounding);
+  const __m256i v1 = _mm256_add_epi32(u1, *prounding);
+  const __m256i w0 = _mm256_srai_epi32(v0, shift);
+  const __m256i w1 = _mm256_srai_epi32(v1, shift);
+  return _mm256_packs_epi32(w0, w1);
+}
+
+static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) {
+  int i;
+  __m256i step2[4];
+  __m256i in[8];
+  __m256i step1[8];
+  __m256i step3[8];
+
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
+  const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64);
+  const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64);
+  const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64);
+  const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64);
+  const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64);
+  const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64);
+  const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64);
+  const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64);
+  const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64);
+  const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64);
+  const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64);
+  const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64);
+  const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64);
+  const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64);
+  const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64);
+  const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64);
+  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+
+  // Calculate input for the first 8 results.
+  for (i = 0; i < 8; i++) {
+    in[i] = ADD256_EPI16(input[i], input[15 - i]);
+  }
+
+  // Calculate input for the next 8 results.
+  for (i = 0; i < 8; i++) {
+    step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]);
+  }
+
+  // Work on the first eight values; fdct8(input, even_results);
+  {
+    // Add/subtract
+    const __m256i q0 = ADD256_EPI16(in[0], in[7]);
+    const __m256i q1 = ADD256_EPI16(in[1], in[6]);
+    const __m256i q2 = ADD256_EPI16(in[2], in[5]);
+    const __m256i q3 = ADD256_EPI16(in[3], in[4]);
+    const __m256i q4 = SUB256_EPI16(in[3], in[4]);
+    const __m256i q5 = SUB256_EPI16(in[2], in[5]);
+    const __m256i q6 = SUB256_EPI16(in[1], in[6]);
+    const __m256i q7 = SUB256_EPI16(in[0], in[7]);
+
+    // Work on first four results
+    {
+      // Add/subtract
+      const __m256i r0 = ADD256_EPI16(q0, q3);
+      const __m256i r1 = ADD256_EPI16(q1, q2);
+      const __m256i r2 = SUB256_EPI16(q1, q2);
+      const __m256i r3 = SUB256_EPI16(q0, q3);
+
+      // Interleave to do the multiply by constants which gets us
+      // into 32 bits.
+      {
+        const __m256i t0 = _mm256_unpacklo_epi16(r0, r1);
+        const __m256i t1 = _mm256_unpackhi_epi16(r0, r1);
+        const __m256i t2 = _mm256_unpacklo_epi16(r2, r3);
+        const __m256i t3 = _mm256_unpackhi_epi16(r2, r3);
+
+        output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                        &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        output[12] =
+            mult256_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      }
+    }
+
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us
+      // into 32 bits.
+      const __m256i d0 = _mm256_unpacklo_epi16(q6, q5);
+      const __m256i d1 = _mm256_unpackhi_epi16(q6, q5);
+      const __m256i r0 = mult256_round_shift(
+          &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      const __m256i r1 = mult256_round_shift(
+          &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+
+      {
+        // Add/subtract
+        const __m256i x0 = ADD256_EPI16(q4, r0);
+        const __m256i x1 = SUB256_EPI16(q4, r0);
+        const __m256i x2 = SUB256_EPI16(q7, r1);
+        const __m256i x3 = ADD256_EPI16(q7, r1);
+
+        // Interleave to do the multiply by constants which gets us
+        // into 32 bits.
+        {
+          const __m256i t0 = _mm256_unpacklo_epi16(x0, x3);
+          const __m256i t1 = _mm256_unpackhi_epi16(x0, x3);
+          const __m256i t2 = _mm256_unpacklo_epi16(x1, x2);
+          const __m256i t3 = _mm256_unpackhi_epi16(x1, x2);
+          output[2] =
+              mult256_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          output[14] =
+              mult256_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          output[10] =
+              mult256_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          output[6] =
+              mult256_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                  &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+        }
+      }
+    }
+  }
+  // Work on the next eight values; step1 -> odd_results
+  {  // step 2
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]);
+      step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+    // step 3
+    {
+      step3[0] = ADD256_EPI16(step1[0], step2[1]);
+      step3[1] = ADD256_EPI16(step1[1], step2[0]);
+      step3[2] = SUB256_EPI16(step1[1], step2[0]);
+      step3[3] = SUB256_EPI16(step1[0], step2[1]);
+      step3[4] = SUB256_EPI16(step1[7], step2[3]);
+      step3[5] = SUB256_EPI16(step1[6], step2[2]);
+      step3[6] = ADD256_EPI16(step1[6], step2[2]);
+      step3[7] = ADD256_EPI16(step1[7], step2[3]);
+    }
+    // step 4
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]);
+      step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+    // step 5
+    {
+      step1[0] = ADD256_EPI16(step3[0], step2[0]);
+      step1[1] = SUB256_EPI16(step3[0], step2[0]);
+      step1[2] = ADD256_EPI16(step3[3], step2[1]);
+      step1[3] = SUB256_EPI16(step3[3], step2[1]);
+      step1[4] = SUB256_EPI16(step3[4], step2[3]);
+      step1[5] = ADD256_EPI16(step3[4], step2[3]);
+      step1[6] = SUB256_EPI16(step3[7], step2[2]);
+      step1[7] = ADD256_EPI16(step3[7], step2[2]);
+    }
+    // step 6
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]);
+      output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+    {
+      const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]);
+      const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]);
+      const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]);
+      const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]);
+      output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+      output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                      &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+    }
+  }
+}
+
+void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) {
+  int pass;
+  DECLARE_ALIGNED(32, int16_t, intermediate[256]);
+  int16_t *out0 = intermediate;
+  tran_low_t *out1 = output;
+  const int width = 16;
+  const int height = 16;
+  __m256i buf0[16], buf1[16];
+
+  // Two transform and transpose passes
+  // Process 16 columns (transposed rows in second pass) at a time.
+  for (pass = 0; pass < 2; ++pass) {
+    // Load and pre-condition input.
+    load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass);
+
+    // Calculate dct for 16x16 values
+    fdct16x16_1D_avx2(buf1, buf0);
+
+    // Transpose the results.
+    transpose_16bit_16x16_avx2(buf0, buf1);
+
+    if (pass == 0) {
+      store_buffer_16bit_to_32bit_w16_avx2(buf1, (tran_low_t *)out0, width,
+                                           height);
+    } else {
+      store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height);
+    }
+    // Setup in/out for next pass.
+    input = intermediate;
+  }
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
 #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"
@@ -21,3 +397,4 @@
 #include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
 #undef FDCT32x32_2D_AVX2
 #undef FDCT32x32_HIGH_PRECISION
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
index 743e55e635..d546f02a14 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -93,9 +93,9 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
 #if DCT_HIGH_BIT_DEPTH
   // Check inputs small enough to use optimised code
   cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+                       _mm_cmplt_epi16(in0, _mm_set1_epi16((int16_t)0xfc00)));
   cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
+                       _mm_cmplt_epi16(in1, _mm_set1_epi16((int16_t)0xfc00)));
   test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
   if (test) {
     vpx_highbd_fdct4x4_c(input, output, stride);
@@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
@@ -778,6 +778,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
             return;
           }
 #endif  // DCT_HIGH_BIT_DEPTH
+
           // Interleave to do the multiply by constants which gets us
           // into 32 bits.
           {
@@ -834,6 +835,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
               return;
             }
 #endif  // DCT_HIGH_BIT_DEPTH
+
             // Interleave to do the multiply by constants which gets us
             // into 32 bits.
             {
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
index 5201e764c8..5aa2779706 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_
-#define VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -36,7 +36,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
 static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
                                           const __m128i *preg1) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
+  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
   __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                               _mm_cmpeq_epi16(*preg0, min_overflow));
   __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
@@ -50,7 +50,7 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
                                           const __m128i *preg2,
                                           const __m128i *preg3) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
+  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
   __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
                               _mm_cmpeq_epi16(*preg0, min_overflow));
   __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
@@ -368,4 +368,4 @@ static INLINE void transpose_and_output8x8(
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_X86_FWD_TXFM_SSE2_H_
+#endif  // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 78a1dbb24f..2c338fb5dd 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -10,10 +10,6 @@
 
 %include "third_party/x86inc/x86inc.asm"
 
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
 SECTION_RODATA
 
 pw_11585x2: times 8 dw 23170
@@ -31,108 +27,12 @@ TRANSFORM_COEFFS  9102,  13623
 
 SECTION .text
 
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB            0,  7,  9
-  SUM_SUB            1,  6,  9
-  SUM_SUB            2,  5,  9
-  SUM_SUB            3,  4,  9
-
-  SUM_SUB            0,  3,  9
-  SUM_SUB            1,  2,  9
-  SUM_SUB            6,  5,  9
-%if %1 == 0
-  SUM_SUB            0,  1,  9
-%endif
-
-  BUTTERFLY_4X       2,  3,  6270,  15137,  m8,  9,  10
-
-  pmulhrsw           m6, m12
-  pmulhrsw           m5, m12
-%if %1 == 0
-  pmulhrsw           m0, m12
-  pmulhrsw           m1, m12
-%else
-  BUTTERFLY_4X       1,  0,  11585, 11585,  m8,  9,  10
-  SWAP               0,  1
-%endif
-
-  SUM_SUB            4,  5,  9
-  SUM_SUB            7,  6,  9
-  BUTTERFLY_4X       4,  7,  3196,  16069,  m8,  9,  10
-  BUTTERFLY_4X       5,  6,  13623,  9102,  m8,  9,  10
-  SWAP               1,  4
-  SWAP               3,  6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw              m%3, m%1, 15
-  psraw              m%4, m%2, 15
-  psubw              m%1, m%3
-  psubw              m%2, m%4
-  psraw              m%1, 1
-  psraw              m%2, 1
-%endmacro
-
+%if VPX_ARCH_X86_64
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride
 
-  mova               m8, [pd_8192]
-  mova              m12, [pw_11585x2]
-  pxor              m11, m11
+  mova               m8, [GLOBAL(pd_8192)]
+  mova              m12, [GLOBAL(pw_11585x2)]
 
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
@@ -159,25 +59,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
   psllw              m7, 2
 
   ; column transform
-  FDCT8_1D  0
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+  ; stage 1
+  paddw m10, m0, m7
+  psubw m0, m7
 
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+  paddw m9, m1, m6
+  psubw m1, m6
 
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
+  paddw m7, m2, m5
+  psubw m2, m5
 
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
+  paddw m6, m3, m4
+  psubw m3, m4
+
+  ; stage 2
+  paddw m5, m9, m7
+  psubw m9, m7
+
+  paddw m4, m10, m6
+  psubw m10, m6
+
+  paddw m7, m1, m2
+  psubw m1, m2
+
+  ; stage 3
+  paddw m6, m4, m5
+  psubw m4, m5
+
+  pmulhrsw m1, m12
+  pmulhrsw m7, m12
+
+  ; sin(pi / 8), cos(pi / 8)
+  punpcklwd m2, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+  paddd m5, m8
+  paddd m2, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m2, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m5, m9
+  packssdw m2, m10
+
+  pmulhrsw m6, m12
+  pmulhrsw m4, m12
+
+  paddw m9, m3, m1
+  psubw m3, m1
+
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  ; stage 4
+  ; sin(pi / 16), cos(pi / 16)
+  punpcklwd m1, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m1, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m7, 14
+  psrad m1, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m7, m9
+  packssdw m1, m10
+
+  ; sin(3 * pi / 16), cos(3 * pi / 16)
+  punpcklwd m11, m0, m3
+  punpckhwd m0, m3
+  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+  paddd m9, m8
+  paddd m11, m8
+  paddd m3, m8
+  paddd m0, m8
+  psrad m9, 14
+  psrad m11, 14
+  psrad m3, 14
+  psrad m0, 14
+  packssdw m9, m3
+  packssdw m11, m0
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m6, m7
+  punpcklwd m3, m5, m11
+  punpckhwd m6, m7
+  punpckhwd m5, m11
+  punpcklwd m7, m4, m9
+  punpcklwd m10, m2, m1
+  punpckhwd m4, m9
+  punpckhwd m2, m1
+
+  ; stage 2
+  punpckldq m9, m0, m3
+  punpckldq m1, m6, m5
+  punpckhdq m0, m3
+  punpckhdq m6, m5
+  punpckldq m3, m7, m10
+  punpckldq m5, m4, m2
+  punpckhdq m7, m10
+  punpckhdq m4, m2
+
+  ; stage 3
+  punpcklqdq m10, m9, m3
+  punpckhqdq m9, m3
+  punpcklqdq m2, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m3, m1, m5
+  punpckhqdq m1, m5
+  punpcklqdq m7, m6, m4
+  punpckhqdq m6, m4
+
+  ; row transform
+  ; stage 1
+  paddw m5, m10, m6
+  psubw m10, m6
+
+  paddw m4, m9, m7
+  psubw m9, m7
+
+  paddw m6, m2, m1
+  psubw m2, m1
+
+  paddw m7, m0, m3
+  psubw m0, m3
+
+  ;stage 2
+  paddw m1, m5, m7
+  psubw m5, m7
+
+  paddw m3, m4, m6
+  psubw m4, m6
+
+  paddw m7, m9, m2
+  psubw m9, m2
+
+  ; stage 3
+  punpcklwd m6, m1, m3
+  punpckhwd m1, m3
+  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+  paddd m2, m8
+  paddd m6, m8
+  paddd m3, m8
+  paddd m1, m8
+  psrad m2, 14
+  psrad m6, 14
+  psrad m3, 14
+  psrad m1, 14
+  packssdw m2, m3
+  packssdw m6, m1
+
+  pmulhrsw m7, m12
+  pmulhrsw m9, m12
+
+  punpcklwd m3, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+  paddd m1, m8
+  paddd m3, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m1, 14
+  psrad m3, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m1, m4
+  packssdw m3, m5
+
+  paddw m4, m0, m9
+  psubw m0, m9
+
+  paddw m5, m10, m7
+  psubw m10, m7
+
+  ; stage 4
+  punpcklwd m9, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+  paddd m7, m8
+  paddd m9, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m7, 14
+  psrad m9, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m7, m4
+  packssdw m9, m5
+
+  punpcklwd m4, m10, m0
+  punpckhwd m10, m0
+  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+  paddd m5, m8
+  paddd m4, m8
+  paddd m0, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m4, 14
+  psrad m0, 14
+  psrad m10, 14
+  packssdw m5, m0
+  packssdw m4, m10
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m2, m7
+  punpcklwd m10, m1, m4
+  punpckhwd m2, m7
+  punpckhwd m1, m4
+  punpcklwd m7, m6, m5
+  punpcklwd m4, m3, m9
+  punpckhwd m6, m5
+  punpckhwd m3, m9
+
+  ; stage 2
+  punpckldq m5, m0, m10
+  punpckldq m9, m2, m1
+  punpckhdq m0, m10
+  punpckhdq m2, m1
+  punpckldq m10, m7, m4
+  punpckldq m1, m6, m3
+  punpckhdq m7, m4
+  punpckhdq m6, m3
+
+  ; stage 3
+  punpcklqdq m4, m5, m10
+  punpckhqdq m5, m10
+  punpcklqdq m3, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m10, m9, m1
+  punpckhqdq m9, m1
+  punpcklqdq m7, m2, m6
+  punpckhqdq m2, m6
+
+  psraw m1, m4, 15
+  psraw m6, m5, 15
+  psraw m8, m3, 15
+  psraw m11, m0, 15
+
+  psubw m4, m1
+  psubw m5, m6
+  psubw m3, m8
+  psubw m0, m11
+
+  psraw m4, 1
+  psraw m5, 1
+  psraw m3, 1
+  psraw m0, 1
+
+  psraw m1, m10, 15
+  psraw m6, m9, 15
+  psraw m8, m7, 15
+  psraw m11, m2, 15
+
+  psubw m10, m1
+  psubw m9, m6
+  psubw m7, m8
+  psubw m2, m11
+
+  psraw m10, 1
+  psraw m9, 1
+  psraw m7, 1
+  psraw m2, 1
+
+  mova              [outputq +   0], m4
+  mova              [outputq +  16], m5
+  mova              [outputq +  32], m3
+  mova              [outputq +  48], m0
+  mova              [outputq +  64], m10
+  mova              [outputq +  80], m9
+  mova              [outputq +  96], m7
+  mova              [outputq + 112], m2
 
   RET
 %endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 0000000000..01a52ec8bf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1495 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
+void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                   uint16_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
+                                   int w, int h, int bd) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+  (void)bd;
+
+  assert(w % 4 == 0);
+  if (w > 32) {  // w = 64
+    do {
+      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, p0);
+      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 16) {  // w = 32
+    do {
+      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      src += src_stride;
+      _mm256_storeu_si256((__m256i *)dst, p0);
+      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 8) {  // w = 16
+    __m256i p0, p1;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      src += src_stride;
+      p1 = _mm256_loadu_si256((const __m256i *)src);
+      src += src_stride;
+
+      _mm256_storeu_si256((__m256i *)dst, p0);
+      dst += dst_stride;
+      _mm256_storeu_si256((__m256i *)dst, p1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else if (w > 4) {  // w = 8
+    __m128i p0, p1;
+    do {
+      p0 = _mm_loadu_si128((const __m128i *)src);
+      src += src_stride;
+      p1 = _mm_loadu_si128((const __m128i *)src);
+      src += src_stride;
+
+      _mm_storeu_si128((__m128i *)dst, p0);
+      dst += dst_stride;
+      _mm_storeu_si128((__m128i *)dst, p1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {  // w = 4
+    __m128i p0, p1;
+    do {
+      p0 = _mm_loadl_epi64((const __m128i *)src);
+      src += src_stride;
+      p1 = _mm_loadl_epi64((const __m128i *)src);
+      src += src_stride;
+
+      _mm_storel_epi64((__m128i *)dst, p0);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, p1);
+      dst += dst_stride;
+      h -= 2;
+    } while (h > 0);
+  }
+}
+
+void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
+                                  uint16_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h, int bd) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+  (void)bd;
+
+  assert(w % 4 == 0);
+  if (w > 32) {  // w = 64
+    __m256i p0, p1, p2, p3, u0, u1, u2, u3;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+      p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+      src += src_stride;
+      u0 = _mm256_loadu_si256((const __m256i *)dst);
+      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+      u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
+      u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
+      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
+      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 16) {  // w = 32
+    __m256i p0, p1, u0, u1;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+      src += src_stride;
+      u0 = _mm256_loadu_si256((const __m256i *)dst);
+      u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
+      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
+      dst += dst_stride;
+      h--;
+    } while (h > 0);
+  } else if (w > 8) {  // w = 16
+    __m256i p0, p1, u0, u1;
+    do {
+      p0 = _mm256_loadu_si256((const __m256i *)src);
+      p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
+      src += src_stride << 1;
+      u0 = _mm256_loadu_si256((const __m256i *)dst);
+      u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
+
+      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
+      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
+                          _mm256_avg_epu16(p1, u1));
+      dst += dst_stride << 1;
+      h -= 2;
+    } while (h > 0);
+  } else if (w > 4) {  // w = 8
+    __m128i p0, p1, u0, u1;
+    do {
+      p0 = _mm_loadu_si128((const __m128i *)src);
+      p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+      src += src_stride << 1;
+      u0 = _mm_loadu_si128((const __m128i *)dst);
+      u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
+
+      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
+      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
+      dst += dst_stride << 1;
+      h -= 2;
+    } while (h > 0);
+  } else {  // w = 4
+    __m128i p0, p1, u0, u1;
+    do {
+      p0 = _mm_loadl_epi64((const __m128i *)src);
+      p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
+      src += src_stride << 1;
+      u0 = _mm_loadl_epi64((const __m128i *)dst);
+      u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
+
+      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
+      dst += dst_stride << 1;
+      h -= 2;
+    } while (h > 0);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
+                                              8, 9, 10, 11, 10, 11, 12, 13,
+                                              4, 5, 6,  7,  6,  7,  8,  9,
+                                              8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
+                                              10, 11, 12, 13, 12, 13, 14, 15,
+                                              6,  7,  8,  9,  8,  9,  10, 11,
+                                              10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
+  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
+  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
+  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
+}
+
+// Note:
+//  Shared by 8x2 and 16x1 block
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+                                  __m256i *x /*x[8]*/) {
+  __m256i pp[8];
+  pack_pixels(s0, pp);
+  pack_pixels(s1, &pp[4]);
+  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+  x[4] = x[2];
+  x[5] = x[3];
+  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+  __m256i pp[8];
+  __m256i s0;
+  s0 = _mm256_loadu_si256((const __m256i *)src);
+  pack_pixels(&s0, pp);
+  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+                                   __m256i *x) {
+  __m256i s0, s1;
+  s0 = _mm256_loadu_si256((const __m256i *)src);
+  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+  pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+  __m256i s0, s1;
+  s0 = _mm256_loadu_si256((const __m256i *)src);
+  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+  pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+//  Shared by horizontal and vertical filtering
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+  const __m256i p0 = _mm256_set1_epi32(0x03020100);
+  const __m256i p1 = _mm256_set1_epi32(0x07060504);
+  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+  f[0] = _mm256_shuffle_epi8(hh, p0);
+  f[1] = _mm256_shuffle_epi8(hh, p1);
+  f[2] = _mm256_shuffle_epi8(hh, p2);
+  f[3] = _mm256_shuffle_epi8(hh, p3);
+}
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+                                     const __m256i *fil /*fil[4]*/,
+                                     __m256i *y) {
+  __m256i a, a0, a1;
+
+  a0 = _mm256_madd_epi16(fil[0], sig[0]);
+  a1 = _mm256_madd_epi16(fil[3], sig[3]);
+  a = _mm256_add_epi32(a0, a1);
+
+  a0 = _mm256_madd_epi16(fil[1], sig[1]);
+  a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
+  {
+    const __m256i min = _mm256_min_epi32(a0, a1);
+    a = _mm256_add_epi32(a, min);
+  }
+  {
+    const __m256i max = _mm256_max_epi32(a0, a1);
+    a = _mm256_add_epi32(a, max);
+  }
+  {
+    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+    a = _mm256_add_epi32(a, rounding);
+    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+  }
+}
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+                                    uint16_t *dst) {
+  const __m128i a0 = _mm256_castsi256_si128(*y);
+  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+  __m128i res = _mm_packus_epi32(a0, a1);
+  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+  _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+                                    const __m256i *mask, uint16_t *dst,
+                                    ptrdiff_t pitch) {
+  __m256i a = _mm256_packus_epi32(*y0, *y1);
+  a = _mm256_min_epi16(a, *mask);
+  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+                                     const __m256i *mask, uint16_t *dst) {
+  __m256i a = _mm256_packus_epi32(*y0, *y1);
+  a = _mm256_min_epi16(a, *mask);
+  _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[8], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  src_ptr -= 3;
+  do {
+    pack_8x2_pixels(src_ptr, src_pitch, signal);
+    filter_8x1_pixels(signal, ff, &res0);
+    filter_8x1_pixels(&signal[4], ff, &res1);
+    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    height -= 2;
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+  } while (height > 1);
+
+  if (height > 0) {
+    pack_8x1_pixels(src_ptr, signal);
+    filter_8x1_pixels(signal, ff, &res0);
+    store_8x1_pixels(&res0, &max, dst_ptr);
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[8], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  src_ptr -= 3;
+  do {
+    pack_16x1_pixels(src_ptr, signal);
+    filter_8x1_pixels(signal, ff, &res0);
+    filter_8x1_pixels(&signal[4], ff, &res1);
+    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+    height -= 1;
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+  } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+  const __m256i p = _mm256_set1_epi32(0x09080706);
+  f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
+// the difference is s0/s1 specifies first and second rows or,
+// first 16 samples and 8-sample shifted 16 samples
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+                                     __m256i *sig) {
+  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+  r0 = _mm256_shuffle_epi8(r0, sf2);
+  r1 = _mm256_shuffle_epi8(r1, sf2);
+  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+                                      const ptrdiff_t pitch, __m256i *sig) {
+  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+  pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+                                       __m256i *sig /*sig[2]*/) {
+  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+  pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+                                      __m256i *sig /*sig[2]*/) {
+  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+  r0 = _mm256_permutevar8x32_epi32(r0, idx);
+  r0 = _mm256_shuffle_epi8(r0, sf2);
+  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+                                       __m256i *y0, __m256i *y1) {
+  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+  x0 = _mm256_add_epi32(x0, rounding);
+  x1 = _mm256_add_epi32(x1, rounding);
+  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+                                        __m256i *y0) {
+  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+  x0 = _mm256_add_epi32(x0, rounding);
+  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[2], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff;
+  pack_2t_filter(filter, &ff);
+
+  src_ptr -= 3;
+  do {
+    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+    filter_16_2t_pixels(signal, &ff, &res0, &res1);
+    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    height -= 2;
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+  } while (height > 1);
+
+  if (height > 0) {
+    pack_8x1_2t_pixels(src_ptr, signal);
+    filter_8x1_2t_pixels(signal, &ff, &res0);
+    store_8x1_pixels(&res0, &max, dst_ptr);
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[2], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff;
+  pack_2t_filter(filter, &ff);
+
+  src_ptr -= 3;
+  do {
+    pack_16x1_2t_pixels(src_ptr, signal);
+    filter_16_2t_pixels(signal, &ff, &res0, &res1);
+    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+    height -= 1;
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+  } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+  __m256i s1 =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+  __m256i s2 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+  __m256i s3 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+  __m256i s4 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+  __m256i s5 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+  __m256i s6 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+  sig[0] = _mm256_unpacklo_epi16(s0, s1);
+  sig[4] = _mm256_unpackhi_epi16(s0, s1);
+  sig[1] = _mm256_unpacklo_epi16(s2, s3);
+  sig[5] = _mm256_unpackhi_epi16(s2, s3);
+  sig[2] = _mm256_unpacklo_epi16(s4, s5);
+  sig[6] = _mm256_unpackhi_epi16(s4, s5);
+  sig[8] = s6;
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+                                   __m256i *sig) {
+  // base + 7th row
+  __m256i s0 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+  // base + 8th row
+  __m256i s1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+  sig[3] = _mm256_unpacklo_epi16(s2, s3);
+  sig[7] = _mm256_unpackhi_epi16(s2, s3);
+  sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+                                     __m256i *y0, __m256i *y1) {
+  filter_8x1_pixels(sig, f, y0);
+  filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+  int i;
+  for (i = 0; i < 3; ++i) {
+    sig[i] = sig[i + 1];
+    sig[i + 4] = sig[i + 5];
+  }
+}
+
+static void vpx_highbd_filter_block1d8_v8_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[9], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  pack_8x9_init(src_ptr, src_pitch, signal);
+
+  do {
+    pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+    filter_8x9_pixels(signal, ff, &res0, &res1);
+    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    update_pixels(signal);
+
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+    height -= 2;
+  } while (height > 0);
+}
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+  __m256i u0, u1, u2, u3;
+  // load 0-6 rows
+  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
+  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
+
+  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
+  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high
+
+  sig[0] = _mm256_unpacklo_epi16(u0, u2);
+  sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+  sig[8] = _mm256_unpacklo_epi16(u1, u3);
+  sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+  sig[1] = _mm256_unpacklo_epi16(u0, u2);
+  sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+  sig[9] = _mm256_unpacklo_epi16(u1, u3);
+  sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+  sig[2] = _mm256_unpacklo_epi16(u0, u2);
+  sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+  sig[10] = _mm256_unpacklo_epi16(u1, u3);
+  sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+  sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+                             __m256i *sig) {
+  // base + 7th row
+  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+  // base + 8th row
+  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+  __m256i u0, u1, u2, u3;
+  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+  sig[3] = _mm256_unpacklo_epi16(u0, u2);
+  sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+  sig[11] = _mm256_unpacklo_epi16(u1, u3);
+  sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+  sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+                                      __m256i *y0, __m256i *y1) {
+  __m256i res[4];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+  }
+
+  {
+    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+  }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+                                     const __m256i *mask, uint16_t *dst,
+                                     ptrdiff_t pitch) {
+  __m256i p = _mm256_min_epi16(*y0, *mask);
+  _mm256_storeu_si256((__m256i *)dst, p);
+  p = _mm256_min_epi16(*y1, *mask);
+  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+  update_pixels(&sig[0]);
+  update_pixels(&sig[8]);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[17], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  pack_16x9_init(src_ptr, src_pitch, signal);
+
+  do {
+    pack_16x9_pixels(src_ptr, src_pitch, signal);
+    filter_16x9_pixels(signal, ff, &res0, &res1);
+    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    update_16x9_pixels(signal);
+
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+    height -= 2;
+  } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+  sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+                                       __m256i *sig) {
+  // load the next row
+  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+  sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+                                         __m256i *y0, __m256i *y1) {
+  filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[3], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+  __m256i ff;
+
+  pack_2t_filter(filter, &ff);
+  pack_16x2_init(src_ptr, signal);
+
+  do {
+    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+    height -= 1;
+  } while (height > 0);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+  const __m128i p = _mm_set1_epi32(0x09080706);
+  f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+  sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+                                          __m128i *sig) {
+  // load the next row
+  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+  sig[0] = _mm_unpacklo_epi16(sig[2], u);
+  sig[1] = _mm_unpackhi_epi16(sig[2], u);
+  sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+                                      __m128i *y0, __m128i *y1) {
+  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+  __m128i x0 = _mm_madd_epi16(sig[0], *f);
+  __m128i x1 = _mm_madd_epi16(sig[1], *f);
+  x0 = _mm_add_epi32(x0, rounding);
+  x1 = _mm_add_epi32(x1, rounding);
+  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+                                           const __m128i *mask, uint16_t *dst) {
+  __m128i res = _mm_packus_epi32(*y0, *y1);
+  res = _mm_min_epi16(res, *mask);
+  _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m128i signal[3], res0, res1;
+  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+  __m128i ff;
+
+  pack_8x1_2t_filter(filter, &ff);
+  pack_8x2_init(src_ptr, signal);
+
+  do {
+    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+    filter_8_2t_pixels(signal, &ff, &res0, &res1);
+    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+    height -= 1;
+  } while (height > 0);
+}
+
+// Calculation with averaging the input pixels
+
+static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
+                                        uint16_t *dst) {
+  const __m128i a0 = _mm256_castsi256_si128(*y0);
+  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
+  __m128i res = _mm_packus_epi32(a0, a1);
+  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+  res = _mm_avg_epu16(res, pix);
+  _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+                                        const __m256i *mask, uint16_t *dst,
+                                        ptrdiff_t pitch) {
+  __m256i a = _mm256_packus_epi32(*y0, *y1);
+  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
+  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
+  const __m256i pix =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
+  a = _mm256_min_epi16(a, *mask);
+  a = _mm256_avg_epu16(a, pix);
+  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
+                                         const __m256i *mask, uint16_t *dst) {
+  __m256i a = _mm256_packus_epi32(*y0, *y1);
+  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
+  a = _mm256_min_epi16(a, *mask);
+  a = _mm256_avg_epu16(a, pix);
+  _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
+                                         const __m256i *mask, uint16_t *dst,
+                                         ptrdiff_t pitch) {
+  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
+  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
+  __m256i p = _mm256_min_epi16(*y0, *mask);
+  p = _mm256_avg_epu16(p, pix0);
+  _mm256_storeu_si256((__m256i *)dst, p);
+
+  p = _mm256_min_epi16(*y1, *mask);
+  p = _mm256_avg_epu16(p, pix1);
+  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
+                                               const __m128i *y1,
+                                               const __m128i *mask,
+                                               uint16_t *dst) {
+  __m128i res = _mm_packus_epi32(*y0, *y1);
+  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
+  res = _mm_min_epi16(res, *mask);
+  res = _mm_avg_epu16(res, pix);
+  _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void vpx_highbd_filter_block1d8_h8_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[8], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  src_ptr -= 3;
+  do {
+    pack_8x2_pixels(src_ptr, src_pitch, signal);
+    filter_8x1_pixels(signal, ff, &res0);
+    filter_8x1_pixels(&signal[4], ff, &res1);
+    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    height -= 2;
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+  } while (height > 1);
+
+  if (height > 0) {
+    pack_8x1_pixels(src_ptr, signal);
+    filter_8x1_pixels(signal, ff, &res0);
+    store_8x1_avg_pixels(&res0, &max, dst_ptr);
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h8_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[8], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  src_ptr -= 3;
+  do {
+    pack_16x1_pixels(src_ptr, signal);
+    filter_8x1_pixels(signal, ff, &res0);
+    filter_8x1_pixels(&signal[4], ff, &res1);
+    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+    height -= 1;
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+  } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d4_h4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We extract the middle four elements of the kernel into two registers in
+  // the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add on the two
+  // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we
+  // can do this two rows at a time.
+
+  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m256i res_reg;
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the output
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the result
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the output
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the result
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+  }
+}
+
+static void vpx_highbd_filter_block1d8_h4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will extract the middle four elements of the kernel into two registers
+  // in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum of the first half.
+  // Calling add gives us first half of the output. Repat again to get the whole
+  // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows
+  // at a time.
+
+  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m256i res_reg, res_first, res_last;
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Result for first half
+    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                     &kernel_reg_23, &kernel_reg_45);
+
+    // Do again to get the second half of dst
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Result for second half
+    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                    &kernel_reg_23, &kernel_reg_45);
+
+    // Round each result
+    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_first, res_last);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d8_v8_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[9], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  pack_8x9_init(src_ptr, src_pitch, signal);
+
+  do {
+    pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+    filter_8x9_pixels(signal, ff, &res0, &res1);
+    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    update_pixels(signal);
+
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+    height -= 2;
+  } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v8_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[17], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff[4];
+  pack_filters(filter, ff);
+
+  pack_16x9_init(src_ptr, src_pitch, signal);
+
+  do {
+    pack_16x9_pixels(src_ptr, src_pitch, signal);
+    filter_16x9_pixels(signal, ff, &res0, &res1);
+    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    update_16x9_pixels(signal);
+
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+    height -= 2;
+  } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_h2_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[2], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff;
+  pack_2t_filter(filter, &ff);
+
+  src_ptr -= 3;
+  do {
+    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+    filter_16_2t_pixels(signal, &ff, &res0, &res1);
+    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+    height -= 2;
+    src_ptr += src_pitch << 1;
+    dst_ptr += dst_pitch << 1;
+  } while (height > 1);
+
+  if (height > 0) {
+    pack_8x1_2t_pixels(src_ptr, signal);
+    filter_8x1_2t_pixels(signal, &ff, &res0);
+    store_8x1_avg_pixels(&res0, &max, dst_ptr);
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h2_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[2], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+  __m256i ff;
+  pack_2t_filter(filter, &ff);
+
+  src_ptr -= 3;
+  do {
+    pack_16x1_2t_pixels(src_ptr, signal);
+    filter_16_2t_pixels(signal, &ff, &res0, &res1);
+    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+    height -= 1;
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+  } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d16_v2_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m256i signal[3], res0, res1;
+  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+  __m256i ff;
+
+  pack_2t_filter(filter, &ff);
+  pack_16x2_init(src_ptr, signal);
+
+  do {
+    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
+
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+    height -= 1;
+  } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d8_v2_avg_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+  __m128i signal[3], res0, res1;
+  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+  __m128i ff;
+
+  pack_8x1_2t_filter(filter, &ff);
+  pack_8x2_init(src_ptr, signal);
+
+  do {
+    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+    filter_8_2t_pixels(signal, &ff, &res0, &res1);
+    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+    src_ptr += src_pitch;
+    dst_ptr += dst_pitch;
+    height -= 1;
+  } while (height > 0);
+}
+
+static void vpx_highbd_filter_block1d4_v4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels and rearrange them into the form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001, src_reg_1223;
+
+  // Result after multiply and add
+  __m256i res_reg;
+
+  __m128i kernel_reg_128;                            // Kernel
+  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
+
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+
+    // Output
+    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the words
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    // Save the result
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_highbd_filter_block1d8_v4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels and rearrange them into the form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+  __m128i kernel_reg_128;                            // Kernel
+  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
+
+  // Result after multiply and add
+  __m256i res_reg, res_reg_lo, res_reg_hi;
+
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
+
+    // Output from first half
+    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
+                                      &kernel_reg_23, &kernel_reg_45);
+
+    // Output from second half
+    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
+                                      &kernel_reg_23, &kernel_reg_45);
+
+    // Round the words
+    res_reg_lo =
+        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_hi =
+        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    // Save the result
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001_lo = src_reg_1223_lo;
+    src_reg_m1001_hi = src_reg_1223_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_highbd_filter_block1d16_v4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+
+#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
+#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
+#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
+#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
+  vpx_highbd_filter_block1d16_v8_avg_avx2
+#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
+  vpx_highbd_filter_block1d16_h8_avg_avx2
+#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
+  vpx_highbd_filter_block1d8_v8_avg_avx2
+#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
+  vpx_highbd_filter_block1d8_h8_avg_avx2
+#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
+  vpx_highbd_filter_block1d4_v8_avg_avx2
+#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
+  vpx_highbd_filter_block1d4_h8_avg_avx2
+
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
+
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+
+#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
+  vpx_highbd_filter_block1d4_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
+  vpx_highbd_filter_block1d4_h2_avg_sse2
+#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
+  vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
+  vpx_highbd_filter_block1d4_v2_avg_sse2
+
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
+
+#undef HIGHBD_FUNC
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
new file mode 100644
index 0000000000..f4f7235d13
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -0,0 +1,355 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+                                             __m128i *const out) {
+  // stage 5
+  out[0] = _mm_add_epi32(in[0], in[3]);
+  out[1] = _mm_add_epi32(in[1], in[2]);
+  out[2] = _mm_sub_epi32(in[1], in[2]);
+  out[3] = _mm_sub_epi32(in[0], in[3]);
+  highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
+  out[8] = _mm_add_epi32(in[8], in[11]);
+  out[9] = _mm_add_epi32(in[9], in[10]);
+  out[10] = _mm_sub_epi32(in[9], in[10]);
+  out[11] = _mm_sub_epi32(in[8], in[11]);
+  out[12] = _mm_sub_epi32(in[15], in[12]);
+  out[13] = _mm_sub_epi32(in[14], in[13]);
+  out[14] = _mm_add_epi32(in[14], in[13]);
+  out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+                                             __m128i *const out) {
+  out[0] = _mm_add_epi32(in[0], in[7]);
+  out[1] = _mm_add_epi32(in[1], in[6]);
+  out[2] = _mm_add_epi32(in[2], in[5]);
+  out[3] = _mm_add_epi32(in[3], in[4]);
+  out[4] = _mm_sub_epi32(in[3], in[4]);
+  out[5] = _mm_sub_epi32(in[2], in[5]);
+  out[6] = _mm_sub_epi32(in[1], in[6]);
+  out[7] = _mm_sub_epi32(in[0], in[7]);
+  out[8] = in[8];
+  out[9] = in[9];
+  highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
+  highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
+  out[14] = in[14];
+  out[15] = in[15];
+}
+
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+
+  // stage 2
+  highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+                        &step2[15]);
+  highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+                        &step2[14]);
+  highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+                        &step2[13]);
+  highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+                        &step2[12]);
+
+  // stage 3
+  highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+                        &step1[7]);
+  highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+                        &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4
+  highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
+  highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+                        &step2[3]);
+  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+                        &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+                        &step2[13], &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp1[2], sign[2];
+
+  // stage 2
+  highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+                                &step2[15]);
+  highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9],
+                                    &step2[14]);
+  highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10],
+                                &step2[13]);
+  highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+                                    &step2[12]);
+
+  // stage 3
+  highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+                                &step1[7]);
+  highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5],
+                                    &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4
+  abs_extend_64bit_sse2(io[0], temp1, sign);
+  step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+  step2[1] = step2[0];
+  highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2],
+                                &step2[3]);
+  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+                        &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+                        &step2[13], &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp[2], sign[2];
+
+  // stage 2
+  highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8],
+                                &step2[15]);
+  highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11],
+                                    &step2[12]);
+
+  // stage 3
+  highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4],
+                                &step1[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  abs_extend_64bit_sse2(io[0], temp, sign);
+  step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64);
+  step2[1] = step2[0];
+  step2[2] = _mm_setzero_si128();
+  step2[3] = _mm_setzero_si128();
+  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+                        &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+                        &step2[13], &step2[10]);
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                       int stride, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {
+    __m128i l[16], r[16];
+
+    in = l;
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+      idct16_8col(in, in);
+      in = r;
+      input += 128;
+    }
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      idct16_8col(out, out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+      highbd_idct16_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      highbd_idct16_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    __m128i in[16], temp[16];
+
+    highbd_load_pack_transpose_32bit_8x8(input, 16, in);
+    for (i = 8; i < 16; i++) {
+      in[i] = _mm_setzero_si128();
+    }
+    idct16_8col(in, temp);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(temp + i, in);
+      idct16_8col(in, out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(input, 16, in);
+      highbd_idct16x16_38_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      highbd_idct16x16_38_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    __m128i in[16], l[16];
+
+    in[0] = load_pack_8_32bit(input + 0 * 16);
+    in[1] = load_pack_8_32bit(input + 1 * 16);
+    in[2] = load_pack_8_32bit(input + 2 * 16);
+    in[3] = load_pack_8_32bit(input + 3 * 16);
+
+    idct16x16_10_pass1(in, l);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      idct16x16_10_pass2(l + i, in);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, in[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_4x4(input, 16, in);
+      highbd_idct16x16_10_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(&all[0][i], out);
+      highbd_idct16x16_10_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
new file mode 100644
index 0000000000..7898ee12c8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -0,0 +1,349 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+                                             __m128i *const out) {
+  // stage 5
+  out[0] = _mm_add_epi32(in[0], in[3]);
+  out[1] = _mm_add_epi32(in[1], in[2]);
+  out[2] = _mm_sub_epi32(in[1], in[2]);
+  out[3] = _mm_sub_epi32(in[0], in[3]);
+  highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
+  out[8] = _mm_add_epi32(in[8], in[11]);
+  out[9] = _mm_add_epi32(in[9], in[10]);
+  out[10] = _mm_sub_epi32(in[9], in[10]);
+  out[11] = _mm_sub_epi32(in[8], in[11]);
+  out[12] = _mm_sub_epi32(in[15], in[12]);
+  out[13] = _mm_sub_epi32(in[14], in[13]);
+  out[14] = _mm_add_epi32(in[14], in[13]);
+  out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+                                             __m128i *const out) {
+  out[0] = _mm_add_epi32(in[0], in[7]);
+  out[1] = _mm_add_epi32(in[1], in[6]);
+  out[2] = _mm_add_epi32(in[2], in[5]);
+  out[3] = _mm_add_epi32(in[3], in[4]);
+  out[4] = _mm_sub_epi32(in[3], in[4]);
+  out[5] = _mm_sub_epi32(in[2], in[5]);
+  out[6] = _mm_sub_epi32(in[1], in[6]);
+  out[7] = _mm_sub_epi32(in[0], in[7]);
+  out[8] = in[8];
+  out[9] = in[9];
+  highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
+  highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
+  out[14] = in[14];
+  out[15] = in[15];
+}
+
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+
+  // stage 2
+  highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
+                          &step2[15]);
+  highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
+                          &step2[14]);
+  highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
+                          &step2[13]);
+  highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
+                          &step2[12]);
+
+  // stage 3
+  highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
+                          &step1[7]);
+  highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
+                          &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+  step1[11] = _mm_add_epi32(step2[11], step2[10]);
+  step1[12] = _mm_add_epi32(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4
+  highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
+  highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
+                          &step2[3]);
+  highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+                          &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+                          &step2[13], &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp1[2];
+
+  // stage 2
+  highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+                                  &step2[15]);
+  highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
+                                  &step2[14]);
+  highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
+                                  &step2[13]);
+  highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+                                  &step2[12]);
+
+  // stage 3
+  highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+                                  &step1[7]);
+  highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
+                                  &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+  step1[11] = _mm_add_epi32(step2[11], step2[10]);
+  step1[12] = _mm_add_epi32(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4
+  extend_64bit(io[0], temp1);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+  step2[1] = step2[0];
+  highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
+                                  &step2[3]);
+  highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+                          &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+                          &step2[13], &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp[2];
+
+  // stage 2
+  highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
+                                  &step2[15]);
+  highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
+                                  &step2[12]);
+
+  // stage 3
+  highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
+                                  &step1[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  extend_64bit(io[0], temp);
+  step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  step2[1] = step2[0];
+  step2[2] = _mm_setzero_si128();
+  step2[3] = _mm_setzero_si128();
+  highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+                          &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
+                          &step2[13], &step2[10]);
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
+                                         uint16_t *dest, int stride, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {
+    __m128i l[16], r[16];
+
+    in = l;
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+      idct16_8col(in, in);
+      in = r;
+      input += 128;
+    }
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      idct16_8col(out, out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+      vpx_highbd_idct16_4col_sse4_1(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      vpx_highbd_idct16_4col_sse4_1(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    __m128i in[16], temp[16];
+
+    highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+    for (i = 8; i < 16; i++) {
+      in[i] = _mm_setzero_si128();
+    }
+    idct16_8col(in, temp);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(temp + i, in);
+      idct16_8col(in, out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(input, 16, in);
+      highbd_idct16x16_38_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      highbd_idct16x16_38_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    __m128i in[16], l[16];
+
+    in[0] = load_pack_8_32bit(input + 0 * 16);
+    in[1] = load_pack_8_32bit(input + 1 * 16);
+    in[2] = load_pack_8_32bit(input + 2 * 16);
+    in[3] = load_pack_8_32bit(input + 3 * 16);
+
+    idct16x16_10_pass1(in, l);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      idct16x16_10_pass2(l + i, in);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, in[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_4x4(input, 16, in);
+      highbd_idct16x16_10_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(&all[0][i], out);
+      highbd_idct16x16_10_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
new file mode 100644
index 0000000000..c710e89954
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c
@@ -0,0 +1,782 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+    __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+  __m128i step2[32];
+
+  // stage 4
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+                        &step2[14]);
+  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
+                        &step2[13], &step2[10]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[8] = _mm_add_epi32(step2[8], step2[11]);
+  step1[9] = _mm_add_epi32(step2[9], step2[10]);
+  step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+  step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+  step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+  step1[14] = _mm_add_epi32(step2[14], step2[13]);
+  step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+  // stage 6
+  out[8] = step1[8];
+  out[9] = step1[9];
+  highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64,
+                        &out[10], &out[13]);
+  highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64,
+                        &out[11], &out[12]);
+  out[14] = step1[14];
+  out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+    __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step2[32];
+
+  // stage 4
+  step2[16] = _mm_add_epi32(step1[16], step1[19]);
+  step2[17] = _mm_add_epi32(step1[17], step1[18]);
+  step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+  step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+  step2[20] = _mm_sub_epi32(step1[20], step1[23]);  // step2[20] = -step2[20]
+  step2[21] = _mm_sub_epi32(step1[21], step1[22]);  // step2[21] = -step2[21]
+  step2[22] = _mm_add_epi32(step1[21], step1[22]);
+  step2[23] = _mm_add_epi32(step1[20], step1[23]);
+
+  step2[24] = _mm_add_epi32(step1[27], step1[24]);
+  step2[25] = _mm_add_epi32(step1[26], step1[25]);
+  step2[26] = _mm_sub_epi32(step1[26], step1[25]);  // step2[26] = -step2[26]
+  step2[27] = _mm_sub_epi32(step1[27], step1[24]);  // step2[27] = -step2[27]
+  step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+  step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+  step2[30] = _mm_add_epi32(step1[29], step1[30]);
+  step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+  // stage 5
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64,
+                        &step1[18], &step1[29]);
+  highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64,
+                        &step1[19], &step1[28]);
+  highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64,
+                        &step1[27], &step1[20]);
+  highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64,
+                        &step1[26], &step1[21]);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[16] = _mm_add_epi32(step1[16], step1[23]);
+  step2[17] = _mm_add_epi32(step1[17], step1[22]);
+  step2[18] = _mm_add_epi32(step1[18], step1[21]);
+  step2[19] = _mm_add_epi32(step1[19], step1[20]);
+  step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+  step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+  step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+  step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+  step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+  step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+  step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+  step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+  step2[28] = _mm_add_epi32(step1[27], step1[28]);
+  step2[29] = _mm_add_epi32(step1[26], step1[29]);
+  step2[30] = _mm_add_epi32(step1[25], step1[30]);
+  step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+  // stage 7
+  out[16] = step2[16];
+  out[17] = step2[17];
+  out[18] = step2[18];
+  out[19] = step2[19];
+  highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64,
+                        &out[20], &out[27]);
+  highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64,
+                        &out[21], &out[26]);
+  highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64,
+                        &out[22], &out[25]);
+  highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64,
+                        &out[23], &out[24]);
+  out[28] = step2[28];
+  out[29] = step2[29];
+  out[30] = step2[30];
+  out[31] = step2[31];
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+                        &step1[7]);
+  highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+                        &step1[6]);
+
+  // stage 4
+  highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+                        &step2[0]);
+  highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+                        &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+                        &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi32(step1[0], step1[7]);
+  out[1] = _mm_add_epi32(step1[1], step1[6]);
+  out[2] = _mm_add_epi32(step1[2], step1[5]);
+  out[3] = _mm_add_epi32(step1[3], step1[4]);
+  out[4] = _mm_sub_epi32(step1[3], step1[4]);
+  out[5] = _mm_sub_epi32(step1[2], step1[5]);
+  out[6] = _mm_sub_epi32(step1[1], step1[6]);
+  out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 2
+  highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+                        &step2[15]);
+  highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+                        &step2[14]);
+  highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+                        &step2[13]);
+  highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+                        &step2[12]);
+
+  // stage 3
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+
+  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  highbd_idct32_1024_4x32_quarter_1(in, temp);
+  highbd_idct32_1024_4x32_quarter_2(in, temp);
+  // stage 7
+  highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+                        &step1[31]);
+  highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+                        &step1[30]);
+  highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+                        &step1[29]);
+  highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+                        &step1[28]);
+
+  highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+                        &step1[27]);
+  highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+                        &step1[26]);
+
+  highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+                        &step1[25]);
+  highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+                        &step1[24]);
+
+  // stage 2
+  step2[16] = _mm_add_epi32(step1[16], step1[17]);
+  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+  step2[18] = _mm_sub_epi32(step1[18], step1[19]);  // step2[18] = -step2[18]
+  step2[19] = _mm_add_epi32(step1[18], step1[19]);
+  step2[20] = _mm_add_epi32(step1[20], step1[21]);
+  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+  step2[22] = _mm_sub_epi32(step1[22], step1[23]);  // step2[22] = -step2[22]
+  step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+  step2[24] = _mm_add_epi32(step1[25], step1[24]);
+  step2[25] = _mm_sub_epi32(step1[25], step1[24]);  // step2[25] = -step2[25]
+  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+  step2[27] = _mm_add_epi32(step1[27], step1[26]);
+  step2[28] = _mm_add_epi32(step1[29], step1[28]);
+  step2[29] = _mm_sub_epi32(step1[29], step1[28]);  // step2[29] = -step2[29]
+  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+  step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+                        &step1[17], &step1[30]);
+  highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+                        &step1[29], &step1[18]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+                        &step1[21], &step1[26]);
+  highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+                        &step1[25], &step1[22]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+  __m128i temp[32];
+
+  highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+  highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+  // final stage
+  highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i, j;
+
+  if (bd == 8) {
+    __m128i col[4][32], io[32];
+
+    // rows
+    for (i = 0; i < 4; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+      highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+      highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+      idct32_1024_8x32(io, col[i]);
+      input += 32 << 3;
+    }
+
+    // columns
+    for (i = 0; i < 32; i += 8) {
+      // Transpose 32x8 block to 8x32 block
+      transpose_16bit_8x8(col[0] + i, io);
+      transpose_16bit_8x8(col[1] + i, io + 8);
+      transpose_16bit_8x8(col[2] + i, io + 16);
+      transpose_16bit_8x8(col[3] + i, io + 24);
+      idct32_1024_8x32(io, io);
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_8(dest + j * stride, io[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[8][32], out[32], *in;
+
+    for (i = 0; i < 8; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+      highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+      highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+      highbd_idct32_1024_4x32(in);
+      input += 4 * 32;
+    }
+
+    for (i = 0; i < 32; i += 4) {
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      transpose_32bit_4x4(all[4] + i, out + 16);
+      transpose_32bit_4x4(all[5] + i, out + 20);
+      transpose_32bit_4x4(all[6] + i, out + 24);
+      transpose_32bit_4x4(all[7] + i, out + 28);
+      highbd_idct32_1024_4x32(out);
+
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+                                &step1[7]);
+  highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5],
+                                    &step1[6]);
+
+  // stage 4
+  highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+                                &step2[0]);
+  highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2],
+                                &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+                        &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi32(step1[0], step1[7]);
+  out[1] = _mm_add_epi32(step1[1], step1[6]);
+  out[2] = _mm_add_epi32(step1[2], step1[5]);
+  out[3] = _mm_add_epi32(step1[3], step1[4]);
+  out[4] = _mm_sub_epi32(step1[3], step1[4]);
+  out[5] = _mm_sub_epi32(step1[2], step1[5]);
+  out[6] = _mm_sub_epi32(step1[1], step1[6]);
+  out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 2
+  highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+                                &step2[15]);
+  highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9],
+                                    &step2[14]);
+  highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10],
+                                &step2[13]);
+  highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+                                    &step2[12]);
+
+  // stage 3
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+
+  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  highbd_idct32_135_4x32_quarter_1(in, temp);
+  highbd_idct32_135_4x32_quarter_2(in, temp);
+  // stage 7
+  highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+                                &step1[31]);
+  highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64,
+                                    &step1[17], &step1[30]);
+  highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18],
+                                &step1[29]);
+  highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+                                    &step1[28]);
+
+  highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+                                &step1[27]);
+  highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64,
+                                    &step1[21], &step1[26]);
+
+  highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22],
+                                &step1[25]);
+  highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+                                    &step1[24]);
+
+  // stage 2
+  step2[16] = _mm_add_epi32(step1[16], step1[17]);
+  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+  step2[18] = _mm_sub_epi32(step1[18], step1[19]);  // step2[18] = -step2[18]
+  step2[19] = _mm_add_epi32(step1[18], step1[19]);
+  step2[20] = _mm_add_epi32(step1[20], step1[21]);
+  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+  step2[22] = _mm_sub_epi32(step1[22], step1[23]);  // step2[22] = -step2[22]
+  step2[23] = _mm_add_epi32(step1[22], step1[23]);
+
+  step2[24] = _mm_add_epi32(step1[25], step1[24]);
+  step2[25] = _mm_sub_epi32(step1[25], step1[24]);  // step2[25] = -step2[25]
+  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+  step2[27] = _mm_add_epi32(step1[27], step1[26]);
+  step2[28] = _mm_add_epi32(step1[29], step1[28]);
+  step2[29] = _mm_sub_epi32(step1[29], step1[28]);  // step2[29] = -step2[29]
+  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+  step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+                        &step1[17], &step1[30]);
+  highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+                        &step1[29], &step1[18]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+                        &step1[21], &step1[26]);
+  highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+                        &step1[25], &step1[22]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+  __m128i temp[32];
+
+  highbd_idct32_135_4x32_quarter_1_2(io, temp);
+  highbd_idct32_135_4x32_quarter_3_4(io, temp);
+  // final stage
+  highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                       int stride, int bd) {
+  int i, j;
+
+  if (bd == 8) {
+    __m128i col[2][32], in[32], out[32];
+
+    for (i = 16; i < 32; i++) {
+      in[i] = _mm_setzero_si128();
+    }
+
+    // rows
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+      idct32_1024_8x32(in, col[i]);
+      input += 32 << 3;
+    }
+
+    // columns
+    for (i = 0; i < 32; i += 8) {
+      transpose_16bit_8x8(col[0] + i, in);
+      transpose_16bit_8x8(col[1] + i, in + 8);
+      idct32_1024_8x32(in, out);
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[8][32], out[32], *in;
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+      highbd_idct32_135_4x32(in);
+      input += 4 * 32;
+    }
+
+    for (i = 0; i < 32; i += 4) {
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      highbd_idct32_135_4x32(out);
+
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4],
+                                &step1[7]);
+
+  // stage 4
+  highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1],
+                                &step2[0]);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+                        &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi32(step1[0], step1[7]);
+  out[1] = _mm_add_epi32(step1[1], step1[6]);
+  out[2] = _mm_add_epi32(step1[2], step1[5]);
+  out[3] = _mm_add_epi32(step1[3], step1[4]);
+  out[4] = _mm_sub_epi32(step1[3], step1[4]);
+  out[5] = _mm_sub_epi32(step1[2], step1[5]);
+  out[6] = _mm_sub_epi32(step1[1], step1[6]);
+  out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+                                                   __m128i *out /*out[16]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 2
+  highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
+                                &step2[15]);
+  highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
+                                    &step2[12]);
+
+  // stage 3
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+
+  step1[10] =
+      _mm_sub_epi32(_mm_setzero_si128(), step1[10]);  // step1[10] = -step1[10]
+  step1[13] =
+      _mm_sub_epi32(_mm_setzero_si128(), step1[13]);  // step1[13] = -step1[13]
+  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  highbd_idct32_34_4x32_quarter_1(in, temp);
+  highbd_idct32_34_4x32_quarter_2(in, temp);
+  // stage 7
+  highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16],
+                                &step1[31]);
+  highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19],
+                                    &step1[28]);
+
+  highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20],
+                                &step1[27]);
+  highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23],
+                                    &step1[24]);
+
+  // stage 2
+  step2[16] = step1[16];
+  step2[17] = step1[16];
+  step2[18] = step1[19];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[21] = step1[20];
+  step2[22] = step1[23];
+  step2[23] = step1[23];
+
+  step2[24] = step1[24];
+  step2[25] = step1[24];
+  step2[26] = step1[27];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[29] = step1[28];
+  step2[30] = step1[31];
+  step2[31] = step1[31];
+
+  // stage 3
+  step2[18] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[18]);  // step2[18] = -step2[18]
+  step2[22] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[22]);  // step2[22] = -step2[22]
+  step2[25] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[25]);  // step2[25] = -step2[25]
+  step2[29] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[29]);  // step2[29] = -step2[29]
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64,
+                        &step1[17], &step1[30]);
+  highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64,
+                        &step1[29], &step1[18]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64,
+                        &step1[21], &step1[26]);
+  highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64,
+                        &step1[25], &step1[22]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+  __m128i temp[32];
+
+  highbd_idct32_34_4x32_quarter_1_2(io, temp);
+  highbd_idct32_34_4x32_quarter_3_4(io, temp);
+  // final stage
+  highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  int i, j;
+
+  if (bd == 8) {
+    __m128i col[32], in[32], out[32];
+
+    // rows
+    highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+    idct32_34_8x32_sse2(in, col);
+
+    // columns
+    for (i = 0; i < 32; i += 8) {
+      transpose_16bit_8x8(col + i, in);
+      idct32_34_8x32_sse2(in, out);
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[8][32], out[32], *in;
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+      highbd_idct32_34_4x32(in);
+      input += 4 * 32;
+    }
+
+    for (i = 0; i < 32; i += 4) {
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      highbd_idct32_34_4x32(out);
+
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                     int stride, int bd) {
+  highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
new file mode 100644
index 0000000000..2d0a53ac0a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c
@@ -0,0 +1,765 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
+    __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+  __m128i step2[32];
+
+  // stage 4
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
+                          &step2[9], &step2[14]);
+  highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64,
+                          &step2[10], &step2[13]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[8] = _mm_add_epi32(step2[8], step2[11]);
+  step1[9] = _mm_add_epi32(step2[9], step2[10]);
+  step1[10] = _mm_sub_epi32(step2[9], step2[10]);
+  step1[11] = _mm_sub_epi32(step2[8], step2[11]);
+  step1[12] = _mm_sub_epi32(step2[15], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[14], step2[13]);
+  step1[14] = _mm_add_epi32(step2[14], step2[13]);
+  step1[15] = _mm_add_epi32(step2[15], step2[12]);
+
+  // stage 6
+  out[8] = step1[8];
+  out[9] = step1[9];
+  highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64,
+                          &out[10], &out[13]);
+  highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64,
+                          &out[11], &out[12]);
+  out[14] = step1[14];
+  out[15] = step1[15];
+}
+
+static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
+    __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step2[32];
+
+  // stage 4
+  step2[16] = _mm_add_epi32(step1[16], step1[19]);
+  step2[17] = _mm_add_epi32(step1[17], step1[18]);
+  step2[18] = _mm_sub_epi32(step1[17], step1[18]);
+  step2[19] = _mm_sub_epi32(step1[16], step1[19]);
+  step2[20] = _mm_sub_epi32(step1[23], step1[20]);
+  step2[21] = _mm_sub_epi32(step1[22], step1[21]);
+  step2[22] = _mm_add_epi32(step1[22], step1[21]);
+  step2[23] = _mm_add_epi32(step1[23], step1[20]);
+
+  step2[24] = _mm_add_epi32(step1[24], step1[27]);
+  step2[25] = _mm_add_epi32(step1[25], step1[26]);
+  step2[26] = _mm_sub_epi32(step1[25], step1[26]);
+  step2[27] = _mm_sub_epi32(step1[24], step1[27]);
+  step2[28] = _mm_sub_epi32(step1[31], step1[28]);
+  step2[29] = _mm_sub_epi32(step1[30], step1[29]);
+  step2[30] = _mm_add_epi32(step1[29], step1[30]);
+  step2[31] = _mm_add_epi32(step1[28], step1[31]);
+
+  // stage 5
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64,
+                          &step1[18], &step1[29]);
+  highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64,
+                          &step1[19], &step1[28]);
+  highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64,
+                          &step1[20], &step1[27]);
+  highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64,
+                          &step1[21], &step1[26]);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[16] = _mm_add_epi32(step1[16], step1[23]);
+  step2[17] = _mm_add_epi32(step1[17], step1[22]);
+  step2[18] = _mm_add_epi32(step1[18], step1[21]);
+  step2[19] = _mm_add_epi32(step1[19], step1[20]);
+  step2[20] = _mm_sub_epi32(step1[19], step1[20]);
+  step2[21] = _mm_sub_epi32(step1[18], step1[21]);
+  step2[22] = _mm_sub_epi32(step1[17], step1[22]);
+  step2[23] = _mm_sub_epi32(step1[16], step1[23]);
+
+  step2[24] = _mm_sub_epi32(step1[31], step1[24]);
+  step2[25] = _mm_sub_epi32(step1[30], step1[25]);
+  step2[26] = _mm_sub_epi32(step1[29], step1[26]);
+  step2[27] = _mm_sub_epi32(step1[28], step1[27]);
+  step2[28] = _mm_add_epi32(step1[27], step1[28]);
+  step2[29] = _mm_add_epi32(step1[26], step1[29]);
+  step2[30] = _mm_add_epi32(step1[25], step1[30]);
+  step2[31] = _mm_add_epi32(step1[24], step1[31]);
+
+  // stage 7
+  out[16] = step2[16];
+  out[17] = step2[17];
+  out[18] = step2[18];
+  out[19] = step2[19];
+  highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64,
+                          &out[20], &out[27]);
+  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64,
+                          &out[21], &out[26]);
+  highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64,
+                          &out[22], &out[25]);
+  highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64,
+                          &out[23], &out[24]);
+  out[28] = step2[28];
+  out[29] = step2[29];
+  out[30] = step2[30];
+  out[31] = step2[31];
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
+                          &step1[7]);
+  highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
+                          &step1[6]);
+
+  // stage 4
+  highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
+                          &step2[0]);
+  highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
+                          &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+                          &step1[5], &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi32(step1[0], step1[7]);
+  out[1] = _mm_add_epi32(step1[1], step1[6]);
+  out[2] = _mm_add_epi32(step1[2], step1[5]);
+  out[3] = _mm_add_epi32(step1[3], step1[4]);
+  out[4] = _mm_sub_epi32(step1[3], step1[4]);
+  out[5] = _mm_sub_epi32(step1[2], step1[5]);
+  out[6] = _mm_sub_epi32(step1[1], step1[6]);
+  out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_2(
+    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 2
+  highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
+                          &step2[15]);
+  highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
+                          &step2[14]);
+  highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
+                          &step2[13]);
+  highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
+                          &step2[12]);
+
+  // stage 3
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+  step1[11] = _mm_add_epi32(step2[11], step2[10]);
+  step1[12] = _mm_add_epi32(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  highbd_idct32_1024_4x32_quarter_1(in, temp);
+  highbd_idct32_1024_4x32_quarter_2(in, temp);
+  // stage 7
+  highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
+                          &step1[31]);
+  highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
+                          &step1[30]);
+  highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
+                          &step1[29]);
+  highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
+                          &step1[28]);
+
+  highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
+                          &step1[27]);
+  highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
+                          &step1[26]);
+
+  highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
+                          &step1[25]);
+  highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
+                          &step1[24]);
+
+  // stage 2
+  step2[16] = _mm_add_epi32(step1[16], step1[17]);
+  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+  step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+  step2[19] = _mm_add_epi32(step1[19], step1[18]);
+  step2[20] = _mm_add_epi32(step1[20], step1[21]);
+  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+  step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+  step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+  step2[24] = _mm_add_epi32(step1[24], step1[25]);
+  step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+  step2[27] = _mm_add_epi32(step1[27], step1[26]);
+  step2[28] = _mm_add_epi32(step1[28], step1[29]);
+  step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+  step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+                          &step1[17], &step1[30]);
+  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+                          &step1[18], &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+                          &step1[21], &step1[26]);
+  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+                          &step1[22], &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
+  __m128i temp[32];
+
+  highbd_idct32_1024_4x32_quarter_1_2(io, temp);
+  highbd_idct32_1024_4x32_quarter_3_4(io, temp);
+  // final stage
+  highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input,
+                                          uint16_t *dest, int stride, int bd) {
+  int i, j;
+
+  if (bd == 8) {
+    __m128i col[4][32], io[32];
+
+    // rows
+    for (i = 0; i < 4; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
+      highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
+      highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
+      idct32_1024_8x32(io, col[i]);
+      input += 32 << 3;
+    }
+
+    // columns
+    for (i = 0; i < 32; i += 8) {
+      // Transpose 32x8 block to 8x32 block
+      transpose_16bit_8x8(col[0] + i, io);
+      transpose_16bit_8x8(col[1] + i, io + 8);
+      transpose_16bit_8x8(col[2] + i, io + 16);
+      transpose_16bit_8x8(col[3] + i, io + 24);
+      idct32_1024_8x32(io, io);
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_8(dest + j * stride, io[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[8][32], out[32], *in;
+
+    for (i = 0; i < 8; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+      highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
+      highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
+      highbd_idct32_1024_4x32(in);
+      input += 4 * 32;
+    }
+
+    for (i = 0; i < 32; i += 4) {
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      transpose_32bit_4x4(all[4] + i, out + 16);
+      transpose_32bit_4x4(all[5] + i, out + 20);
+      transpose_32bit_4x4(all[6] + i, out + 24);
+      transpose_32bit_4x4(all[7] + i, out + 28);
+      highbd_idct32_1024_4x32(out);
+
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+                                  &step1[7]);
+  highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+                                  &step1[6]);
+
+  // stage 4
+  highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+                                  &step2[0]);
+  highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2],
+                                  &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+                          &step1[5], &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi32(step1[0], step1[7]);
+  out[1] = _mm_add_epi32(step1[1], step1[6]);
+  out[2] = _mm_add_epi32(step1[2], step1[5]);
+  out[3] = _mm_add_epi32(step1[3], step1[4]);
+  out[4] = _mm_sub_epi32(step1[3], step1[4]);
+  out[5] = _mm_sub_epi32(step1[2], step1[5]);
+  out[6] = _mm_sub_epi32(step1[1], step1[6]);
+  out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_2(
+    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 2
+  highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+                                  &step2[15]);
+  highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+                                  &step2[14]);
+  highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10],
+                                  &step2[13]);
+  highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+                                  &step2[12]);
+
+  // stage 3
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
+  step1[11] = _mm_add_epi32(step2[11], step2[10]);
+  step1[12] = _mm_add_epi32(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi32(step2[12], step2[13]);
+
+  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_135_4x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  highbd_idct32_135_4x32_quarter_1(in, temp);
+  highbd_idct32_135_4x32_quarter_2(in, temp);
+  // stage 7
+  highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_135_4x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+                                  &step1[31]);
+  highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+                                  &step1[30]);
+  highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18],
+                                  &step1[29]);
+  highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+                                  &step1[28]);
+
+  highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+                                  &step1[27]);
+  highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+                                  &step1[26]);
+
+  highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22],
+                                  &step1[25]);
+  highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+                                  &step1[24]);
+
+  // stage 2
+  step2[16] = _mm_add_epi32(step1[16], step1[17]);
+  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
+  step2[18] = _mm_sub_epi32(step1[19], step1[18]);
+  step2[19] = _mm_add_epi32(step1[19], step1[18]);
+  step2[20] = _mm_add_epi32(step1[20], step1[21]);
+  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
+  step2[22] = _mm_sub_epi32(step1[23], step1[22]);
+  step2[23] = _mm_add_epi32(step1[23], step1[22]);
+
+  step2[24] = _mm_add_epi32(step1[24], step1[25]);
+  step2[25] = _mm_sub_epi32(step1[24], step1[25]);
+  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
+  step2[27] = _mm_add_epi32(step1[27], step1[26]);
+  step2[28] = _mm_add_epi32(step1[28], step1[29]);
+  step2[29] = _mm_sub_epi32(step1[28], step1[29]);
+  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
+  step2[31] = _mm_add_epi32(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+                          &step1[17], &step1[30]);
+  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+                          &step1[18], &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+                          &step1[21], &step1[26]);
+  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+                          &step1[22], &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
+  __m128i temp[32];
+
+  highbd_idct32_135_4x32_quarter_1_2(io, temp);
+  highbd_idct32_135_4x32_quarter_3_4(io, temp);
+  // final stage
+  highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input,
+                                         uint16_t *dest, int stride, int bd) {
+  int i, j;
+
+  if (bd == 8) {
+    __m128i col[2][32], in[32], out[32];
+
+    // rows
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
+      idct32_135_8x32_ssse3(in, col[i]);
+      input += 32 << 3;
+    }
+
+    // columns
+    for (i = 0; i < 32; i += 8) {
+      transpose_16bit_8x8(col[0] + i, in);
+      transpose_16bit_8x8(col[1] + i, in + 8);
+      idct32_135_8x32_ssse3(in, out);
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[8][32], out[32], *in;
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+      highbd_idct32_135_4x32(in);
+      input += 4 * 32;
+    }
+
+    for (i = 0; i < 32; i += 4) {
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      highbd_idct32_135_4x32(out);
+
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
+                                  &step1[7]);
+
+  // stage 4
+  highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
+                                  &step2[0]);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[1];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
+                          &step1[5], &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi32(step1[0], step1[7]);
+  out[1] = _mm_add_epi32(step1[1], step1[6]);
+  out[2] = _mm_add_epi32(step1[2], step1[5]);
+  out[3] = _mm_add_epi32(step1[3], step1[4]);
+  out[4] = _mm_sub_epi32(step1[3], step1[4]);
+  out[5] = _mm_sub_epi32(step1[2], step1[5]);
+  out[6] = _mm_sub_epi32(step1[1], step1[6]);
+  out[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with index, 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
+                                                   __m128i *out /*out[16]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 2
+  highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
+                                  &step2[15]);
+  highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+                                  &step2[12]);
+
+  // stage 3
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+
+  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void highbd_idct32_34_4x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  highbd_idct32_34_4x32_quarter_1(in, temp);
+  highbd_idct32_34_4x32_quarter_2(in, temp);
+  // stage 7
+  highbd_add_sub_butterfly(temp, out, 16);
+}
+
+// For each 4x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void highbd_idct32_34_4x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
+                                  &step1[31]);
+  highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+                                  &step1[28]);
+
+  highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
+                                  &step1[27]);
+  highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+                                  &step1[24]);
+
+  // stage 2
+  step2[16] = step1[16];
+  step2[17] = step1[16];
+  step2[18] = step1[19];
+  step2[19] = step1[19];
+  step2[20] = step1[20];
+  step2[21] = step1[20];
+  step2[22] = step1[23];
+  step2[23] = step1[23];
+
+  step2[24] = step1[24];
+  step2[25] = step1[24];
+  step2[26] = step1[27];
+  step2[27] = step1[27];
+  step2[28] = step1[28];
+  step2[29] = step1[28];
+  step2[30] = step1[31];
+  step2[31] = step1[31];
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
+                          &step1[17], &step1[30]);
+  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
+                          &step1[18], &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
+                          &step1[21], &step1[26]);
+  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
+                          &step1[22], &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
+  __m128i temp[32];
+
+  highbd_idct32_34_4x32_quarter_1_2(io, temp);
+  highbd_idct32_34_4x32_quarter_3_4(io, temp);
+  // final stage
+  highbd_add_sub_butterfly(temp, io, 32);
+}
+
+void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i, j;
+
+  if (bd == 8) {
+    __m128i col[32], in[32], out[32];
+
+    // rows
+    highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
+    idct32_34_8x32_ssse3(in, col);
+
+    // columns
+    for (i = 0; i < 32; i += 8) {
+      transpose_16bit_8x8(col + i, in);
+      idct32_34_8x32_ssse3(in, out);
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[8][32], out[32], *in;
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
+      highbd_idct32_34_4x32(in);
+      input += 4 * 32;
+    }
+
+    for (i = 0; i < 32; i += 4) {
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      highbd_idct32_34_4x32(out);
+
+      for (j = 0; j < 32; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
new file mode 100644
index 0000000000..b9c8884f99
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -0,0 +1,160 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
+                                                   const __m128i in1) {
+  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 1
+  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 2, 3
+  const __m128i t2 = _mm_unpacklo_epi64(t0, t1);    // 0, 1, 2, 3
+  return dct_const_round_shift_sse2(t2);
+}
+
+static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
+  const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
+  const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
+  const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
+  __m128i temp1[4], temp2[4], step[4];
+
+  transpose_32bit_4x4(io, io);
+
+  // Note: There is no 32-bit signed multiply SIMD instruction in SSE2.
+  //       _mm_mul_epu32() is used which can only guarantee the lower 32-bit
+  //       (signed) result is meaningful, which is enough in this function.
+
+  // stage 1
+  temp1[0] = _mm_add_epi32(io[0], io[2]);             // input[0] + input[2]
+  temp2[0] = _mm_sub_epi32(io[0], io[2]);             // input[0] - input[2]
+  temp1[1] = _mm_srli_si128(temp1[0], 4);             // 1, 3
+  temp2[1] = _mm_srli_si128(temp2[0], 4);             // 1, 3
+  temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16);  // ([0] + [2])*cospi_16_64
+  temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16);  // ([0] + [2])*cospi_16_64
+  temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16);  // ([0] - [2])*cospi_16_64
+  temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16);  // ([0] - [2])*cospi_16_64
+  step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+  step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+  temp1[3] = _mm_srli_si128(io[1], 4);
+  temp2[3] = _mm_srli_si128(io[3], 4);
+  temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24);     // input[1] * cospi_24_64
+  temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24);  // input[1] * cospi_24_64
+  temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08);     // input[1] * cospi_8_64
+  temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08);  // input[1] * cospi_8_64
+  temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08);     // input[3] * cospi_8_64
+  temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08);  // input[3] * cospi_8_64
+  temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24);     // input[3] * cospi_24_64
+  temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24);  // input[3] * cospi_24_64
+  temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]);  // [1]*cospi_24 - [3]*cospi_8
+  temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]);  // [1]*cospi_24 - [3]*cospi_8
+  temp2[0] = _mm_add_epi64(temp2[0], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
+  temp2[1] = _mm_add_epi64(temp2[1], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
+  step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+  step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
+static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
+  __m128i step[4];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]);
+  highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+                        &step[3]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
+void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  int16_t max = 0, min = 0;
+  __m128i io[4], io_short[2];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 8));
+  io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+  io_short[0] = _mm_packs_epi32(io[0], io[1]);
+  io_short[1] = _mm_packs_epi32(io[2], io[3]);
+
+  if (bd != 8) {
+    __m128i max_input, min_input;
+
+    max_input = _mm_max_epi16(io_short[0], io_short[1]);
+    min_input = _mm_min_epi16(io_short[0], io_short[1]);
+    max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8));
+    min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8));
+    max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4));
+    min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4));
+    max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2));
+    min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2));
+    max = (int16_t)_mm_extract_epi16(max_input, 0);
+    min = (int16_t)_mm_extract_epi16(min_input, 0);
+  }
+
+  if (bd == 8 || (max < 4096 && min >= -4096)) {
+    idct4_sse2(io_short);
+    idct4_sse2(io_short);
+    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+    io[0] = _mm_srai_epi16(io_short[0], 4);
+    io[1] = _mm_srai_epi16(io_short[1], 4);
+  } else {
+    if (max < 32767 && min > -32768) {
+      highbd_idct4_small_sse2(io);
+      highbd_idct4_small_sse2(io);
+    } else {
+      highbd_idct4_large_sse2(io);
+      highbd_idct4_large_sse2(io);
+    }
+    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+  }
+
+  recon_and_store_4x4(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {
+  int a1, i;
+  tran_low_t out;
+  __m128i dc, d;
+
+  out = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  out =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+  dc = _mm_set1_epi16(a1);
+
+  for (i = 0; i < 4; ++i) {
+    d = _mm_loadl_epi64((const __m128i *)dest);
+    d = add_clamp(d, dc, bd);
+    _mm_storel_epi64((__m128i *)dest, d);
+    dest += stride;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
new file mode 100644
index 0000000000..fe74d272ad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  __m128i io[4];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 8));
+  io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+  if (bd == 8) {
+    __m128i io_short[2];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[1]);
+    io_short[1] = _mm_packs_epi32(io[2], io[3]);
+    idct4_sse2(io_short);
+    idct4_sse2(io_short);
+    io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+    io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+    io[0] = _mm_srai_epi16(io_short[0], 4);
+    io[1] = _mm_srai_epi16(io_short[1], 4);
+  } else {
+    highbd_idct4_sse4_1(io);
+    highbd_idct4_sse4_1(io);
+    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
+  }
+
+  recon_and_store_4x4(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
new file mode 100644
index 0000000000..bb7a510e15
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static void highbd_idct8x8_half1d(__m128i *const io) {
+  __m128i step1[8], step2[8];
+
+  transpose_32bit_4x4x2(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[2] = io[4];
+  step1[1] = io[2];
+  step1[3] = io[6];
+  highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+                        &step1[7]);
+  highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+                        &step1[6]);
+
+  // stage 2
+  highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]);
+  highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2],
+                        &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+  step1[7] = step2[7];
+
+  // stage 4
+  highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+  __m128i temp1[4], sign[2], step1[8], step2[8];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[1] = io[2];
+  abs_extend_64bit_sse2(io[1], temp1, sign);
+  step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64);
+  step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64);
+  abs_extend_64bit_sse2(io[3], temp1, sign);
+  step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64);
+  step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64);
+
+  // stage 2
+  abs_extend_64bit_sse2(step1[0], temp1, sign);
+  step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+  abs_extend_64bit_sse2(step1[1], temp1, sign);
+  step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64);
+  step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]);
+  step1[7] = step2[7];
+
+  // stage 4
+  highbd_idct8_stage4(step1, io);
+}
+
+void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[4]);
+    io_short[1] = _mm_packs_epi32(io[1], io[5]);
+    io_short[2] = _mm_packs_epi32(io[2], io[6]);
+    io_short[3] = _mm_packs_epi32(io[3], io[7]);
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    io_short[4] = _mm_packs_epi32(io[8], io[12]);
+    io_short[5] = _mm_packs_epi32(io[9], io[13]);
+    io_short[6] = _mm_packs_epi32(io[10], io[14]);
+    io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+    vpx_idct8_sse2(io_short);
+    vpx_idct8_sse2(io_short);
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    highbd_idct8x8_half1d(io);
+
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    highbd_idct8x8_half1d(&io[8]);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    io[4] = io[8];
+    io[5] = io[9];
+    io[6] = io[10];
+    io[7] = io[11];
+    highbd_idct8x8_half1d(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_half1d(&io[8]);
+
+    highbd_idct8x8_final_round(io);
+  }
+
+  recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                    int stride, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], zero);
+    io_short[1] = _mm_packs_epi32(io[1], zero);
+    io_short[2] = _mm_packs_epi32(io[2], zero);
+    io_short[3] = _mm_packs_epi32(io[3], zero);
+
+    idct8x8_12_add_kernel_sse2(io_short);
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    highbd_idct8x8_12_half1d(io);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    highbd_idct8x8_12_half1d(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_12_half1d(&io[8]);
+
+    highbd_idct8x8_final_round(io);
+  }
+
+  recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+                                   int stride, int bd) {
+  highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
new file mode 100644
index 0000000000..8b2e3d2415
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) {
+  __m128i step1[8], step2[8];
+
+  transpose_32bit_4x4x2(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[2] = io[4];
+  step1[1] = io[2];
+  step1[3] = io[6];
+  highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4],
+                          &step1[7]);
+  highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5],
+                          &step1[6]);
+
+  // stage 2
+  highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]);
+  highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64,
+                          &step2[2], &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+  step1[7] = step2[7];
+
+  // stage 4
+  highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+  __m128i temp1[2], step1[8], step2[8];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[1] = io[2];
+  extend_64bit(io[1], temp1);
+  step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64);
+  step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64);
+  extend_64bit(io[3], temp1);
+  step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64);
+  step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64);
+
+  // stage 2
+  extend_64bit(step1[0], temp1);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+  extend_64bit(step1[1], temp1);
+  step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64);
+  step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]);
+  step1[7] = step2[7];
+
+  // stage 4
+  highbd_idct8_stage4(step1, io);
+}
+
+void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[4]);
+    io_short[1] = _mm_packs_epi32(io[1], io[5]);
+    io_short[2] = _mm_packs_epi32(io[2], io[6]);
+    io_short[3] = _mm_packs_epi32(io[3], io[7]);
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    io_short[4] = _mm_packs_epi32(io[8], io[12]);
+    io_short[5] = _mm_packs_epi32(io[9], io[13]);
+    io_short[6] = _mm_packs_epi32(io[10], io[14]);
+    io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+    vpx_idct8_sse2(io_short);
+    vpx_idct8_sse2(io_short);
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    vpx_highbd_idct8x8_half1d_sse4_1(io);
+
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    io[4] = io[8];
+    io[5] = io[9];
+    io[6] = io[10];
+    io[7] = io[11];
+    vpx_highbd_idct8x8_half1d_sse4_1(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+
+    highbd_idct8x8_final_round(io);
+  }
+
+  recon_and_store_8x8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], zero);
+    io_short[1] = _mm_packs_epi32(io[1], zero);
+    io_short[2] = _mm_packs_epi32(io[2], zero);
+    io_short[3] = _mm_packs_epi32(io[3], zero);
+
+    idct8x8_12_add_kernel_ssse3(io_short);
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    highbd_idct8x8_12_half1d(io);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    highbd_idct8x8_12_half1d(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_12_half1d(&io[8]);
+
+    highbd_idct8x8_final_round(io);
+  }
+
+  recon_and_store_8x8(io, dest, stride, bd);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
new file mode 100644
index 0000000000..43634aea3a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -0,0 +1,534 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+  (void)above;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+  (void)above;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)*dst, val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)(*dst), val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  *dst += stride;
+}
+
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 2; i++, left += 8) {
+    const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+    const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+    const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+    const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+    const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+    const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+    const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+    const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+    const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+    h_store_16_unpacklo(&dst, stride, &row0);
+    h_store_16_unpacklo(&dst, stride, &row1);
+    h_store_16_unpacklo(&dst, stride, &row2);
+    h_store_16_unpacklo(&dst, stride, &row3);
+    h_store_16_unpackhi(&dst, stride, &row4);
+    h_store_16_unpackhi(&dst, stride, &row5);
+    h_store_16_unpackhi(&dst, stride, &row6);
+    h_store_16_unpackhi(&dst, stride, &row7);
+  }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)(*dst), val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  _mm_store_si128((__m128i *)(*dst + 16), val);
+  _mm_store_si128((__m128i *)(*dst + 24), val);
+  *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)(*dst), val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  _mm_store_si128((__m128i *)(*dst + 16), val);
+  _mm_store_si128((__m128i *)(*dst + 24), val);
+  *dst += stride;
+}
+
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, left += 8) {
+    const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+    const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+    const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+    const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+    const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+    const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+    const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+    const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+    const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+    h_store_32_unpacklo(&dst, stride, &row0);
+    h_store_32_unpacklo(&dst, stride, &row1);
+    h_store_32_unpacklo(&dst, stride, &row2);
+    h_store_32_unpacklo(&dst, stride, &row3);
+    h_store_32_unpackhi(&dst, stride, &row4);
+    h_store_32_unpackhi(&dst, stride, &row5);
+    h_store_32_unpackhi(&dst, stride, &row6);
+    h_store_32_unpackhi(&dst, stride, &row7);
+  }
+}
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                const __m128i *dc) {
+  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+  int i;
+  for (i = 0; i < 4; ++i, dst += stride) {
+    _mm_storel_epi64((__m128i *)dst, dc_dup);
+  }
+}
+
+void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i two = _mm_cvtsi32_si128(2);
+  const __m128i sum = dc_sum_4(left);
+  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+  (void)above;
+  (void)bd;
+  dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i two = _mm_cvtsi32_si128(2);
+  const __m128i sum = dc_sum_4(above);
+  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+  (void)left;
+  (void)bd;
+  dc_store_4x4(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_4x4(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+                                const __m128i *dc) {
+  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+  int i;
+  for (i = 0; i < 8; ++i, dst += stride) {
+    _mm_store_si128((__m128i *)dst, dc_dup);
+  }
+}
+
+void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i four = _mm_cvtsi32_si128(4);
+  const __m128i sum = dc_sum_8(left);
+  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+  (void)above;
+  (void)bd;
+  dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i four = _mm_cvtsi32_si128(4);
+  const __m128i sum = dc_sum_8(above);
+  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+  (void)left;
+  (void)bd;
+  dc_store_8x8(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_8x8(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+  const __m128i sum_lo = dc_sum_8(ref);
+  const __m128i sum_hi = dc_sum_8(ref + 8);
+  return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+                                  const __m128i *dc) {
+  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+  int i;
+  for (i = 0; i < 16; ++i, dst += stride) {
+    _mm_store_si128((__m128i *)dst, dc_dup);
+    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+  }
+}
+
+void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i eight = _mm_cvtsi32_si128(8);
+  const __m128i sum = dc_sum_16(left);
+  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+  (void)above;
+  (void)bd;
+  dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i eight = _mm_cvtsi32_si128(8);
+  const __m128i sum = dc_sum_16(above);
+  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+  (void)left;
+  (void)bd;
+  dc_store_16x16(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_16x16(dst, stride, &dc_dup);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sum_a = dc_sum_16(ref);
+  const __m128i sum_b = dc_sum_16(ref + 16);
+  // 12 bit bd will outrange, so expand to 32 bit before adding final total
+  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+                       _mm_unpacklo_epi16(sum_b, zero));
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+                                  const __m128i *dc) {
+  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+  int i;
+  for (i = 0; i < 32; ++i, dst += stride) {
+    _mm_store_si128((__m128i *)dst, dc_dup);
+    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+  }
+}
+
+void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const __m128i sixteen = _mm_cvtsi32_si128(16);
+  const __m128i sum = dc_sum_32(left);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+  (void)above;
+  (void)bd;
+  dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i sixteen = _mm_cvtsi32_si128(16);
+  const __m128i sum = dc_sum_32(above);
+  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+  (void)left;
+  (void)bd;
+  dc_store_32x32(dst, stride, &dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+  (void)above;
+  (void)left;
+  dc_store_32x32(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+                                 const __m128i *z) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a = _mm_avg_epu16(*x, *z);
+  const __m128i b =
+      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+  return _mm_avg_epu16(b, *y);
+}
+
+void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+  const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
+  const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);
+  const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);
+  const __m128i row0 = _mm_srli_si128(avg2, 6);
+  const __m128i row1 = _mm_srli_si128(avg3, 4);
+  const __m128i row2 = _mm_srli_si128(avg2, 4);
+  const __m128i row3 = _mm_srli_si128(avg3, 2);
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+
+  dst -= stride;
+  dst[0] = _mm_extract_epi16(avg3, 1);
+  dst[stride] = _mm_extract_epi16(avg3, 0);
+}
+
+void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
+  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
+  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
+  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
+  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
+  const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
+  const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
+  const __m128i row0 = _mm_srli_si128(avg3, 6);
+  const __m128i row1 = _mm_srli_si128(avg3, 4);
+  const __m128i row2 = _mm_srli_si128(avg3, 2);
+  const __m128i row3 = avg3;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int I = left[0];
+  const int J = left[1];
+  const int K = left[2];
+  const int L = left[3];
+  const __m128i XXXXXABC = _mm_castps_si128(
+      _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1)));
+  const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
+  const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
+  const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
+  const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
+  const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
+  const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
+  const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);
+  const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);
+  const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
+  const __m128i row2 = _mm_srli_si128(row3, 4);
+  const __m128i row1 = _mm_srli_si128(row3, 8);
+  const __m128i row0 = _mm_srli_si128(avg3, 4);
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst[0] = _mm_extract_epi16(avg2, 3);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left);
+  const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff);
+  const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000);
+  const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2);
+  const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4);
+  const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00);
+  const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0);
+  const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3);
+  const __m128i row1 = _mm_srli_si128(row0, 4);
+  const __m128i row2 = _mm_srli_si128(row0, 8);
+  const __m128i row3 = LLLL0000;
+  (void)above;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0);
+  const __m128i row0 = avg2;
+  const __m128i row1 = avg3;
+  const __m128i row2 = _mm_srli_si128(avg2, 2);
+  const __m128i row3 = _mm_srli_si128(avg3, 2);
+  (void)left;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
new file mode 100644
index 0000000000..d673fac493
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c
@@ -0,0 +1,930 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+/*
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+*/
+static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
+                                 const __m128i *z) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a = _mm_avg_epu16(*x, *z);
+  const __m128i b =
+      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
+  return _mm_avg_epu16(b, *y);
+}
+
+void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
+  (void)left;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, avg3);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
+  dst[3] = above[7];  // aka H
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+                               __m128i *row, const __m128i *ar) {
+  *row = _mm_alignr_epi8(*ar, *row, 2);
+  _mm_store_si128((__m128i *)*dst, *row);
+  *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+  (void)left;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, avg3);
+  dst += stride;
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                __m128i *row_0, __m128i *row_1,
+                                const __m128i *ar) {
+  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
+  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
+  _mm_store_si128((__m128i *)*dst, *row_0);
+  _mm_store_si128((__m128i *)(*dst + 8), *row_1);
+  *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  (void)left;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, avg3_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+  dst += stride;
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
+}
+
+void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  int i;
+  (void)left;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, avg3_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+  dst += stride;
+  for (i = 1; i < 32; ++i) {
+    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+    _mm_store_si128((__m128i *)dst, avg3_0);
+    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+    dst += stride;
+  }
+}
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                rotate_right_epu16[16]) = { 2,  3,  4,  5,  6,  7,  8, 9,
+                                            10, 11, 12, 13, 14, 15, 0, 1 };
+
+static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
+  *a = _mm_shuffle_epi8(*a, *rotrw);
+  return *a;
+}
+
+void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+  const __m128i IXABCDEF =
+      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
+  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
+  const __m128i XIJKLMNO =
+      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
+  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
+  __m128i rowa = avg2;
+  __m128i rowb = avg3;
+  int i;
+  (void)bd;
+  for (i = 0; i < 8; i += 2) {
+    _mm_store_si128((__m128i *)dst, rowa);
+    dst += stride;
+    _mm_store_si128((__m128i *)dst, rowb);
+    dst += stride;
+    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
+  }
+}
+
+void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+  const __m128i L1_ = _mm_srli_si128(L1, 2);
+  __m128i rowa_0 = avg2_0;
+  __m128i rowa_1 = avg2_1;
+  __m128i rowb_0 = avg3_0;
+  __m128i rowb_1 = avg3_1;
+  __m128i avg3_left[2];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+  for (i = 0; i < 2; ++i) {
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; j += 2) {
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      dst += stride;
+      _mm_store_si128((__m128i *)dst, rowb_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+      dst += stride;
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+    }
+  }
+}
+
+void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
+  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
+  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
+  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
+  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
+  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
+  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
+  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
+  const __m128i L3_ = _mm_srli_si128(L3, 2);
+  __m128i rowa_0 = avg2_0;
+  __m128i rowa_1 = avg2_1;
+  __m128i rowa_2 = avg2_2;
+  __m128i rowa_3 = avg2_3;
+  __m128i rowb_0 = avg3_0;
+  __m128i rowb_1 = avg3_1;
+  __m128i rowb_2 = avg3_2;
+  __m128i rowb_3 = avg3_3;
+  __m128i avg3_left[4];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
+  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
+  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
+  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
+  for (i = 0; i < 4; ++i) {
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; j += 2) {
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+      dst += stride;
+      _mm_store_si128((__m128i *)dst, rowb_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
+      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
+      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
+      dst += stride;
+      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
+      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
+      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
+      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
+    }
+  }
+}
+
+void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
+  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+  const __m128i XIJKLMNO =
+      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+  const __m128i AXIJKLMN =
+      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
+  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
+  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+  __m128i rowa = avg3;
+  int i;
+  (void)bd;
+  for (i = 0; i < 8; ++i) {
+    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
+    _mm_store_si128((__m128i *)dst, rowa);
+    dst += stride;
+  }
+}
+
+void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i B0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+  const __m128i C1 = _mm_srli_si128(B1, 2);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+  __m128i rowa_0 = avg3_0;
+  __m128i rowa_1 = avg3_1;
+  __m128i avg3_left[2];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+  for (i = 0; i < 2; ++i) {
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; ++j) {
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      dst += stride;
+    }
+  }
+}
+
+void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+  const __m128i B0 = _mm_load_si128((const __m128i *)above);
+  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
+  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
+  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
+  const __m128i C3 = _mm_srli_si128(B3, 2);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
+  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
+  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
+  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
+  __m128i rowa_0 = avg3_0;
+  __m128i rowa_1 = avg3_1;
+  __m128i rowa_2 = avg3_2;
+  __m128i rowa_3 = avg3_3;
+  __m128i avg3_left[4];
+  int i, j;
+  (void)bd;
+  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
+  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
+  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
+  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
+  for (i = 0; i < 4; ++i) {
+    __m128i avg_left = avg3_left[i];
+    for (j = 0; j < 8; ++j) {
+      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
+      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
+      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
+      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
+      _mm_store_si128((__m128i *)dst, rowa_0);
+      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
+      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
+      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
+      dst += stride;
+    }
+  }
+}
+
+void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
+  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
+  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
+  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
+  const __m128i XIJKLMNO =
+      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
+  const __m128i AXIJKLMN =
+      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
+  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
+  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
+  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
+  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
+  const __m128i row0 =
+      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
+  const __m128i row1 =
+      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
+  const __m128i row2 =
+      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
+  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
+  const __m128i row4 =
+      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
+  const __m128i row5 =
+      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
+  const __m128i row6 =
+      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
+  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, row0);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row1);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row2);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row3);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row4);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row5);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row6);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, row7);
+}
+
+void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_srli_si128(A1, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_srli_si128(A1, 4);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+  __m128i row_0 = avg3_0;
+  __m128i row_1 = avg3_1;
+  __m128i avg2_avg3_left[2][2];
+  int i, j;
+  (void)bd;
+
+  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+
+  for (j = 0; j < 2; ++j) {
+    for (i = 0; i < 2; ++i) {
+      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      dst += stride;
+    }
+  }
+}
+
+void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
+  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
+  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
+  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+  const __m128i B3 = _mm_srli_si128(A3, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+  const __m128i C3 = _mm_srli_si128(A3, 4);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i L0 = _mm_load_si128((const __m128i *)left);
+  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
+  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
+  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
+  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
+  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
+  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
+  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
+  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
+  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
+  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
+  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
+  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
+  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
+  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
+  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
+  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
+  __m128i row_0 = avg3_0;
+  __m128i row_1 = avg3_1;
+  __m128i row_2 = avg3_2;
+  __m128i row_3 = avg3_3;
+  __m128i avg2_avg3_left[4][2];
+  int i, j;
+  (void)bd;
+
+  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
+  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
+  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
+  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
+  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
+  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
+  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
+
+  for (j = 0; j < 4; ++j) {
+    for (i = 0; i < 2; ++i) {
+      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
+      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
+      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
+      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
+      _mm_store_si128((__m128i *)dst, row_0);
+      _mm_store_si128((__m128i *)(dst + 8), row_1);
+      _mm_store_si128((__m128i *)(dst + 16), row_2);
+      _mm_store_si128((__m128i *)(dst + 24), row_3);
+      dst += stride;
+    }
+  }
+}
+
+static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+                                  const __m128i *a, const __m128i *b) {
+  _mm_store_si128((__m128i *)*dst, *a);
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+  *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
+  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+  const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
+  const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
+  (void)above;
+  (void)bd;
+  d207_store_4x8(&dst, stride, &out_a, &out_b);
+  d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
+}
+
+static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
+                                   const __m128i *a, const __m128i *b,
+                                   const __m128i *c) {
+  _mm_store_si128((__m128i *)*dst, *a);
+  _mm_store_si128((__m128i *)(*dst + 8), *b);
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+  *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)left);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
+  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+  (void)above;
+  (void)bd;
+  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
+  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
+  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
+  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
+}
+
+static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
+                                   const __m128i *a, const __m128i *b,
+                                   const __m128i *c, const __m128i *d,
+                                   const __m128i *e) {
+  _mm_store_si128((__m128i *)*dst, *a);
+  _mm_store_si128((__m128i *)(*dst + 8), *b);
+  _mm_store_si128((__m128i *)(*dst + 16), *c);
+  _mm_store_si128((__m128i *)(*dst + 24), *d);
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
+  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
+  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
+  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
+  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
+  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
+  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
+  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
+  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
+  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
+  *dst += stride;
+}
+
+void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)left);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
+  const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
+  const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
+  const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
+  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+  const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+  const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
+  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
+  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
+  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
+  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
+  const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
+  const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
+  const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
+  const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
+  (void)above;
+  (void)bd;
+  d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
+  d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
+  d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
+  d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
+  d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
+  d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
+  d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
+  d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
+}
+
+static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
+                                 __m128i *a, __m128i *b, const __m128i *ar) {
+  _mm_store_si128((__m128i *)*dst, *a);
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, *b);
+  *dst += stride;
+  *a = _mm_alignr_epi8(*ar, *a, 2);
+  *b = _mm_alignr_epi8(*ar, *b, 2);
+  _mm_store_si128((__m128i *)*dst, *a);
+  *dst += stride;
+  _mm_store_si128((__m128i *)*dst, *b);
+  *dst += stride;
+  *a = _mm_alignr_epi8(*ar, *a, 2);
+  *b = _mm_alignr_epi8(*ar, *b, 2);
+}
+
+void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
+  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
+  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
+  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
+  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
+  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
+  __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
+  (void)left;
+  (void)bd;
+  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
+}
+
+void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
+  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
+  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  int i;
+  (void)left;
+  (void)bd;
+  for (i = 0; i < 14; i += 2) {
+    _mm_store_si128((__m128i *)dst, avg2_0);
+    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+    dst += stride;
+    _mm_store_si128((__m128i *)dst, avg3_0);
+    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+    dst += stride;
+    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+    avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
+    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+    avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
+  }
+  _mm_store_si128((__m128i *)dst, avg2_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, avg3_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+}
+
+void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const __m128i A0 = _mm_load_si128((const __m128i *)above);
+  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
+  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
+  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
+  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
+  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
+  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
+  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
+  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
+  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
+  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
+  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
+  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
+  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
+  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
+  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
+  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
+  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
+  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
+  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
+  __m128i avg2_2 = _mm_avg_epu16(A2, B2);
+  __m128i avg2_3 = _mm_avg_epu16(A3, B3);
+  int i;
+  (void)left;
+  (void)bd;
+  for (i = 0; i < 30; i += 2) {
+    _mm_store_si128((__m128i *)dst, avg2_0);
+    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+    _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+    _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+    dst += stride;
+    _mm_store_si128((__m128i *)dst, avg3_0);
+    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+    dst += stride;
+    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
+    avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
+    avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
+    avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
+    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
+    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
+    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
+    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
+  }
+  _mm_store_si128((__m128i *)dst, avg2_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
+  _mm_store_si128((__m128i *)(dst + 16), avg2_2);
+  _mm_store_si128((__m128i *)(dst + 24), avg2_3);
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, avg3_0);
+  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
+  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
+  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
index c61b62104f..caf506ac07 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m3, m3
-  movd                  m4, bpsd
+  movd                  m4, bdd
   psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
   pcmpeqw               m2, m2
@@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
   movd                  m1, [aboveq-2]
   mova                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   pxor                  m3, m3
   pxor                  m4, m4
   pinsrw                m3, oned, 0
-  pinsrw                m4, bpsd, 0
+  pinsrw                m4, bdd, 0
   pshuflw               m3, m3, 0x0
   DEFINE_ARGS dst, stride, line, left
   punpcklqdq            m3, m3
@@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   pshuflw               m2, m2, 0x0
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m3, m3
-  movd                  m4, bpsd
+  movd                  m4, bdd
   punpcklqdq            m2, m2
   psllw                 m3, m4
   pcmpeqw               m5, m5
@@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
@@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
   pshuflw               m0, m0, 0x0
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m5, m5
-  movd                  m6, bpsd
+  movd                  m6, bdd
   psllw                 m5, m6
   pcmpeqw               m7, m7
   pxor                  m6, m6         ; min possible value
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
new file mode 100644
index 0000000000..1d07391b02
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -0,0 +1,404 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+// Note: There is no 64-bit bit-level shifting SIMD instruction. All
+// coefficients are left shifted by 2, so that dct_const_round_shift() can be
+// done by right shifting 2 bytes.
+
+static INLINE void extend_64bit(const __m128i in,
+                                __m128i *const out /*out[2]*/) {
+  out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
+  out[1] = _mm_unpackhi_epi32(in, in);  // 2, 2, 3, 3
+}
+
+static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
+                                           const __m128i rounding) {
+  __m128i temp[2];
+  temp[0] = _mm_add_epi32(in0, rounding);
+  temp[1] = _mm_add_epi32(in1, rounding);
+  temp[0] = _mm_srai_epi32(temp[0], 4);
+  temp[1] = _mm_srai_epi32(temp[1], 4);
+  return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
+                                           const __m128i rounding) {
+  __m128i temp[2];
+  temp[0] = _mm_add_epi32(in0, rounding);
+  temp[1] = _mm_add_epi32(in1, rounding);
+  temp[0] = _mm_srai_epi32(temp[0], 5);
+  temp[1] = _mm_srai_epi32(temp[1], 5);
+  return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
+  const __m128i t =
+      _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
+  return _mm_srli_si128(t, 2);
+}
+
+static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
+  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 2
+  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 1, 3
+  return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
+}
+
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+                                         __m128i *const out /*out[2]*/,
+                                         __m128i *const sign /*sign[2]*/) {
+  sign[0] = _mm_srai_epi32(in, 31);
+  out[0] = _mm_xor_si128(in, sign[0]);
+  out[0] = _mm_sub_epi32(out[0], sign[0]);
+  sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]);  // 64-bit sign of 2, 3
+  sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]);  // 64-bit sign of 0, 1
+  out[1] = _mm_unpackhi_epi32(out[0], out[0]);     // 2, 3
+  out[0] = _mm_unpacklo_epi32(out[0], out[0]);     // 0, 1
+}
+
+// Note: cospi must be non negative.
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+                                               const __m128i sign,
+                                               const __m128i cospi) {
+  __m128i out = _mm_mul_epu32(in, cospi);
+  out = _mm_xor_si128(out, sign);
+  return _mm_sub_epi64(out, sign);
+}
+
+// Note: c must be non negative.
+static INLINE __m128i multiplication_round_shift_sse2(
+    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+    const int c) {
+  const __m128i pair_c = pair_set_epi32(c << 2, 0);
+  __m128i t0, t1;
+
+  assert(c >= 0);
+  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+  t0 = dct_const_round_shift_64bit(t0);
+  t1 = dct_const_round_shift_64bit(t1);
+
+  return pack_4(t0, t1);
+}
+
+// Note: c must be non negative.
+static INLINE __m128i multiplication_neg_round_shift_sse2(
+    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+    const int c) {
+  const __m128i pair_c = pair_set_epi32(c << 2, 0);
+  __m128i t0, t1;
+
+  assert(c >= 0);
+  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+  t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
+  t1 = _mm_sub_epi64(_mm_setzero_si128(), t1);
+  t0 = dct_const_round_shift_64bit(t0);
+  t1 = dct_const_round_shift_64bit(t1);
+
+  return pack_4(t0, t1);
+}
+
+// Note: c0 and c1 must be non negative.
+static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
+                                         const int c0, const int c1,
+                                         __m128i *const out0,
+                                         __m128i *const out1) {
+  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
+  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+  __m128i temp1[4], temp2[4], sign1[2], sign2[2];
+
+  assert(c0 >= 0);
+  assert(c1 >= 0);
+  abs_extend_64bit_sse2(in0, temp1, sign1);
+  abs_extend_64bit_sse2(in1, temp2, sign2);
+  temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
+  temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
+  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
+  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
+  temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
+  temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
+  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
+  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  *out0 = pack_4(temp1[0], temp1[1]);
+  *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+// Note: c0 and c1 must be non negative.
+static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
+                                                 const int c1,
+                                                 __m128i *const out0,
+                                                 __m128i *const out1) {
+  __m128i temp[2], sign[2];
+
+  assert(c0 >= 0);
+  assert(c1 >= 0);
+  abs_extend_64bit_sse2(in, temp, sign);
+  *out0 = multiplication_round_shift_sse2(temp, sign, c0);
+  *out1 = multiplication_round_shift_sse2(temp, sign, c1);
+}
+
+// Note: c0 and c1 must be non negative.
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
+                                                     const int c0, const int c1,
+                                                     __m128i *const out0,
+                                                     __m128i *const out1) {
+  __m128i temp[2], sign[2];
+
+  assert(c0 >= 0);
+  assert(c1 >= 0);
+  abs_extend_64bit_sse2(in, temp, sign);
+  *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
+  *out1 = multiplication_round_shift_sse2(temp, sign, c0);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+                                                 const __m128i in1,
+                                                 __m128i *const out0,
+                                                 __m128i *const out1) {
+  __m128i temp1[2], temp2, sign[2];
+
+  temp2 = _mm_add_epi32(in0, in1);
+  abs_extend_64bit_sse2(temp2, temp1, sign);
+  *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+  temp2 = _mm_sub_epi32(in0, in1);
+  abs_extend_64bit_sse2(temp2, temp1, sign);
+  *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
+}
+
+// Only do addition and subtraction butterfly, size = 16, 32
+static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
+                                            int size) {
+  int i = 0;
+  const int num = size >> 1;
+  const int bound = size - 1;
+  while (i < num) {
+    out[i] = _mm_add_epi32(in[i], in[bound - i]);
+    out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
+    i++;
+  }
+}
+
+static INLINE void highbd_idct8_stage4(const __m128i *const in,
+                                       __m128i *const out) {
+  out[0] = _mm_add_epi32(in[0], in[7]);
+  out[1] = _mm_add_epi32(in[1], in[6]);
+  out[2] = _mm_add_epi32(in[2], in[5]);
+  out[3] = _mm_add_epi32(in[3], in[4]);
+  out[4] = _mm_sub_epi32(in[3], in[4]);
+  out[5] = _mm_sub_epi32(in[2], in[5]);
+  out[6] = _mm_sub_epi32(in[1], in[6]);
+  out[7] = _mm_sub_epi32(in[0], in[7]);
+}
+
+static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
+  io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+  io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+  io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+  io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+  io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+  io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+  io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+  io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+}
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+                                             __m128i *const out) {
+  out[0] = _mm_add_epi32(in[0], in[15]);
+  out[1] = _mm_add_epi32(in[1], in[14]);
+  out[2] = _mm_add_epi32(in[2], in[13]);
+  out[3] = _mm_add_epi32(in[3], in[12]);
+  out[4] = _mm_add_epi32(in[4], in[11]);
+  out[5] = _mm_add_epi32(in[5], in[10]);
+  out[6] = _mm_add_epi32(in[6], in[9]);
+  out[7] = _mm_add_epi32(in[7], in[8]);
+  out[8] = _mm_sub_epi32(in[7], in[8]);
+  out[9] = _mm_sub_epi32(in[6], in[9]);
+  out[10] = _mm_sub_epi32(in[5], in[10]);
+  out[11] = _mm_sub_epi32(in[4], in[11]);
+  out[12] = _mm_sub_epi32(in[3], in[12]);
+  out[13] = _mm_sub_epi32(in[2], in[13]);
+  out[14] = _mm_sub_epi32(in[1], in[14]);
+  out[15] = _mm_sub_epi32(in[0], in[15]);
+}
+
+static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
+                                const int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  // Faster than _mm_set1_epi16((1 << bd) - 1).
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i d;
+
+  d = _mm_adds_epi16(in0, in1);
+  d = _mm_max_epi16(d, zero);
+  d = _mm_min_epi16(d, max);
+
+  return d;
+}
+
+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
+                                            uint16_t *dest, int stride, int bd,
+                                            const int size) {
+  int a1, i, j;
+  tran_low_t out;
+  __m128i dc, d;
+
+  out = HIGHBD_WRAPLOW(
+      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
+  out =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
+  a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
+  dc = _mm_set1_epi16(a1);
+
+  for (i = 0; i < size; ++i) {
+    for (j = 0; j < size; j += 8) {
+      d = _mm_load_si128((const __m128i *)(&dest[j]));
+      d = add_clamp(d, dc, bd);
+      _mm_store_si128((__m128i *)(&dest[j]), d);
+    }
+    dest += stride;
+  }
+}
+
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+                                     const int bd) {
+  __m128i d;
+
+  d = _mm_loadl_epi64((const __m128i *)dest);
+  d = add_clamp(d, in, bd);
+  _mm_storel_epi64((__m128i *)dest, d);
+}
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+                                       const int stride, const int bd) {
+  __m128i d;
+
+  d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+  d = _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
+  d = add_clamp(d, in, bd);
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
+  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
+}
+
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+                                       const int stride, const int bd) {
+  recon_and_store_4x2(in[0], dest, stride, bd);
+  dest += 2 * stride;
+  recon_and_store_4x2(in[1], dest, stride, bd);
+}
+
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+                                     const int stride, const int bd) {
+  __m128i d;
+
+  d = _mm_load_si128((const __m128i *)(*dest));
+  d = add_clamp(d, in, bd);
+  _mm_store_si128((__m128i *)(*dest), d);
+  *dest += stride;
+}
+
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+                                       const int stride, const int bd) {
+  recon_and_store_8(in[0], &dest, stride, bd);
+  recon_and_store_8(in[1], &dest, stride, bd);
+  recon_and_store_8(in[2], &dest, stride, bd);
+  recon_and_store_8(in[3], &dest, stride, bd);
+  recon_and_store_8(in[4], &dest, stride, bd);
+  recon_and_store_8(in[5], &dest, stride, bd);
+  recon_and_store_8(in[6], &dest, stride, bd);
+  recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+  const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+  const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+  return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
+                                                        const int stride,
+                                                        __m128i *const in) {
+  in[0] = load_pack_8_32bit(input + 0 * stride);
+  in[1] = load_pack_8_32bit(input + 1 * stride);
+  in[2] = load_pack_8_32bit(input + 2 * stride);
+  in[3] = load_pack_8_32bit(input + 3 * stride);
+  in[4] = load_pack_8_32bit(input + 4 * stride);
+  in[5] = load_pack_8_32bit(input + 5 * stride);
+  in[6] = load_pack_8_32bit(input + 6 * stride);
+  in[7] = load_pack_8_32bit(input + 7 * stride);
+  transpose_16bit_8x8(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
+                                                   const int stride,
+                                                   __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+  in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
+  in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+  in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
+  in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+  in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
+  in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+  in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
+  transpose_32bit_8x4(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
+                                                   const int stride,
+                                                   __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  transpose_32bit_4x4(in, in);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+                                         const int bd) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  __m128i out;
+
+  out = _mm_adds_epi16(in, final_rounding);
+  out = _mm_srai_epi16(out, 6);
+  recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+                                         const int bd) {
+  const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+  __m128i out;
+
+  out = _mm_add_epi32(in, final_rounding);
+  out = _mm_srai_epi32(out, 6);
+  out = _mm_packs_epi32(out, out);
+  recon_and_store_4(out, dest, bd);
+}
+
+#endif  // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
new file mode 100644
index 0000000000..f446bb13f3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+
+static INLINE __m128i multiplication_round_shift_sse4_1(
+    const __m128i *const in /*in[2]*/, const int c) {
+  const __m128i pair_c = pair_set_epi32(c * 4, 0);
+  __m128i t0, t1;
+
+  t0 = _mm_mul_epi32(in[0], pair_c);
+  t1 = _mm_mul_epi32(in[1], pair_c);
+  t0 = dct_const_round_shift_64bit(t0);
+  t1 = dct_const_round_shift_64bit(t1);
+
+  return pack_4(t0, t1);
+}
+
+static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1,
+                                           const int c0, const int c1,
+                                           __m128i *const out0,
+                                           __m128i *const out1) {
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+  __m128i temp1[4], temp2[4];
+
+  extend_64bit(in0, temp1);
+  extend_64bit(in1, temp2);
+  temp1[2] = _mm_mul_epi32(temp1[0], pair_c1);
+  temp1[3] = _mm_mul_epi32(temp1[1], pair_c1);
+  temp1[0] = _mm_mul_epi32(temp1[0], pair_c0);
+  temp1[1] = _mm_mul_epi32(temp1[1], pair_c0);
+  temp2[2] = _mm_mul_epi32(temp2[0], pair_c0);
+  temp2[3] = _mm_mul_epi32(temp2[1], pair_c0);
+  temp2[0] = _mm_mul_epi32(temp2[0], pair_c1);
+  temp2[1] = _mm_mul_epi32(temp2[1], pair_c1);
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  *out0 = pack_4(temp1[0], temp1[1]);
+  *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0,
+                                                   const __m128i in1,
+                                                   __m128i *const out0,
+                                                   __m128i *const out1) {
+  __m128i temp1[2], temp2;
+
+  temp2 = _mm_add_epi32(in0, in1);
+  extend_64bit(temp2, temp1);
+  *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+  temp2 = _mm_sub_epi32(in0, in1);
+  extend_64bit(temp2, temp1);
+  *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
+}
+
+static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
+                                                   const int c0, const int c1,
+                                                   __m128i *const out0,
+                                                   __m128i *const out1) {
+  __m128i temp[2];
+
+  extend_64bit(in, temp);
+  *out0 = multiplication_round_shift_sse4_1(temp, c0);
+  *out1 = multiplication_round_shift_sse4_1(temp, c1);
+}
+
+static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
+  __m128i temp[2], step[4];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  extend_64bit(temp[0], temp);
+  step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  extend_64bit(temp[0], temp);
+  step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+                          &step[3]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
+
+#endif  // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
index 8670b28958..9f45623dee 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -12,14 +12,13 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
 
 static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
   __m128i ubounded;
   __m128i lbounded;
   __m128i retval;
 
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   __m128i t80, max, min;
 
@@ -48,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
 
 // TODO(debargha, peter): Break up large functions into smaller ones
 // in this file.
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
-                                       const uint8_t *_blimit,
-                                       const uint8_t *_limit,
-                                       const uint8_t *_thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
   __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
   __m128i ps1, qs1, ps0, qs0;
@@ -71,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   __m128i eight, four;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
   }
 
-  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
-  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
-  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
-  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
-  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
-  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
-  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
-  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
-  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
-  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
 
   //  highbd_filter_mask
   abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
@@ -112,14 +111,14 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
 
   //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   work = _mm_max_epi16(
       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
       _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
@@ -133,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
 
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
 
   // lp filter
@@ -208,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   // (because, in both vars, each block of 16 either all 1s or all 0s)
   flat = _mm_and_si128(flat, mask);
 
-  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
-  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
-  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
-  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
-  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
-  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+  p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));
 
   // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
   // but referred to as p0-p4 & q0-q4 in fn)
@@ -390,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q6 = _mm_and_si128(flat2, flat2_q6);
   //  get values for when (flat2 && flat && mask)
   q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
-  _mm_store_si128((__m128i *)(s - 7 * p), p6);
-  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+  _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
+  _mm_store_si128((__m128i *)(s + 6 * pitch), q6);
 
   p5 = _mm_andnot_si128(flat2, p5);
   //  p5 remains unchanged if !(flat2 && flat && mask)
@@ -405,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   //  get values for when (flat2 && flat && mask)
   q5 = _mm_or_si128(q5, flat2_q5);
   //  full list of q5 values
-  _mm_store_si128((__m128i *)(s - 6 * p), p5);
-  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+  _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
+  _mm_store_si128((__m128i *)(s + 5 * pitch), q5);
 
   p4 = _mm_andnot_si128(flat2, p4);
   //  p4 remains unchanged if !(flat2 && flat && mask)
@@ -418,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q4 = _mm_and_si128(flat2, flat2_q4);
   //  get values for when (flat2 && flat && mask)
   q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
-  _mm_store_si128((__m128i *)(s - 5 * p), p4);
-  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+  _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
+  _mm_store_si128((__m128i *)(s + 4 * pitch), q4);
 
   p3 = _mm_andnot_si128(flat2, p3);
   //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -431,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q3 = _mm_and_si128(flat2, flat2_q3);
   //  get values for when (flat2 && flat && mask)
   q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
-  _mm_store_si128((__m128i *)(s - 4 * p), p3);
-  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+  _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
+  _mm_store_si128((__m128i *)(s + 3 * pitch), q3);
 
   p2 = _mm_andnot_si128(flat2, p2);
   //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -445,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q2 = _mm_and_si128(flat2, flat2_q2);
   //  get values for when (flat2 && flat && mask)
   q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
-  _mm_store_si128((__m128i *)(s - 3 * p), p2);
-  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
 
   p1 = _mm_andnot_si128(flat2, p1);
   //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -458,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q1 = _mm_and_si128(flat2, flat2_q1);
   //  get values for when (flat2 && flat && mask)
   q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
-  _mm_store_si128((__m128i *)(s - 2 * p), p1);
-  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
 
   p0 = _mm_andnot_si128(flat2, p0);
   //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -471,39 +470,39 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q0 = _mm_and_si128(flat2, flat2_q0);
   //  get values for when (flat2 && flat && mask)
   q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
-  _mm_store_si128((__m128i *)(s - 1 * p), p0);
-  _mm_store_si128((__m128i *)(s - 0 * p), q0);
+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
 }
 
-void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
-  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
-  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
+                                            const uint8_t *blimit,
+                                            const uint8_t *limit,
+                                            const uint8_t *thresh, int bd) {
+  vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
 }
 
-void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i blimit, limit, thresh;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
-  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
-  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
-  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
-  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
-  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
-  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_cmpeq_epi16(one, one);
   __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -520,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   __m128i filter1, filter2;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
     t80 = _mm_set1_epi16(0x80);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
     t80 = _mm_set1_epi16(0x200);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
     t80 = _mm_set1_epi16(0x800);
   }
 
@@ -554,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
   abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   mask = _mm_max_epi16(abs_p1p0, mask);
   // mask |= (abs(p1 - p0) > limit) * -1;
   mask = _mm_max_epi16(abs_q1q0, mask);
@@ -577,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);
 
   // flat_mask4
@@ -675,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   q1 = _mm_and_si128(flat, q1);
   q1 = _mm_or_si128(work_a, q1);
 
-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
   q2 = _mm_load_si128((__m128i *)flat_oq2);
   work_a = _mm_andnot_si128(flat, work_a);
   q2 = _mm_and_si128(flat, q2);
@@ -695,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   p1 = _mm_and_si128(flat, p1);
   p1 = _mm_or_si128(work_a, p1);
 
-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
   p2 = _mm_load_si128((__m128i *)flat_op2);
   work_a = _mm_andnot_si128(flat, work_a);
   p2 = _mm_and_si128(flat, p2);
   p2 = _mm_or_si128(work_a, p2);
 
-  _mm_store_si128((__m128i *)(s - 3 * p), p2);
-  _mm_store_si128((__m128i *)(s - 2 * p), p1);
-  _mm_store_si128((__m128i *)(s - 1 * p), p0);
-  _mm_store_si128((__m128i *)(s + 0 * p), q0);
-  _mm_store_si128((__m128i *)(s + 1 * p), q1);
-  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
 }
 
 void vpx_highbd_lpf_horizontal_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
-void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  const __m128i zero = _mm_set1_epi16(0);
-  __m128i blimit, limit, thresh;
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
   const __m128i abs_p1p0 =
       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
   const __m128i abs_q1q0 =
@@ -761,57 +760,57 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i filter1, filter2;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
     t80 = _mm_set1_epi16(0x80);
-    tff80 = _mm_set1_epi16(0xff80);
-    tffe0 = _mm_set1_epi16(0xffe0);
+    tff80 = _mm_set1_epi16((int16_t)0xff80);
+    tffe0 = _mm_set1_epi16((int16_t)0xffe0);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
-    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
-    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+    tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4);
+    tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
   }
 
-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
 
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   mask = _mm_max_epi16(flat, mask);
   // mask |= (abs(p1 - p0) > limit) * -1;
   // mask |= (abs(q1 - q0) > limit) * -1;
@@ -823,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);
 
   // filter4
@@ -873,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);
 
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
 }
 
 void vpx_highbd_lpf_horizontal_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
@@ -999,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
   highbd_transpose(src1, in_p, dest1, out_p, 1);
 }
 
-void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
@@ -1010,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
@@ -1019,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[0] = s - 4;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose(src, 8, dst, pitch, 1);
 }
 
 void vpx_highbd_lpf_vertical_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
@@ -1031,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(
   uint16_t *dst[2];
 
   // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
@@ -1039,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
@@ -1056,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
@@ -1065,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[0] = s - 4;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose(src, 8, dst, pitch, 1);
 }
 
 void vpx_highbd_lpf_vertical_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
@@ -1077,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(
   uint16_t *dst[2];
 
   // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
@@ -1086,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(
   src[1] = t_dst + 8;
 
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
@@ -1105,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[1] = t_dst + 8 * 8;
 
   // Transpose 16x8
-  highbd_transpose(src, p, dst, 8, 2);
+  highbd_transpose(src, pitch, dst, 8, 2);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
@@ -1116,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[1] = s;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 2);
+  highbd_transpose(src, 8, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
 
   //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+  highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
 
   //  Loop filtering
   vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                          thresh, bd);
 
   //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
+                       pitch);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..35ca554049
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,254 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i sign = _mm_srai_epi16(*p, 15);
+  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+  int i;
+  for (i = 0; i < 5; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+}
+
+static VPX_FORCE_INLINE void init_qp(
+    const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr,
+    __m256i *qp, int log_scale) {
+  const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin);
+  const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  const __m128i quant_shift =
+      _mm_loadu_si128((const __m128i *)mb_plane->quant_shift);
+  init_one_qp(&zbin, &qp[0]);
+  init_one_qp(&round, &qp[1]);
+  init_one_qp(&quant, &qp[2]);
+  init_one_qp(&dequant, &qp[3]);
+  init_one_qp(&quant_shift, &qp[4]);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+    qp[0] = _mm256_add_epi32(qp[0], rnd);
+    qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+    qp[1] = _mm256_add_epi32(qp[1], rnd);
+    qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+  }
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+  // calculating the zbin mask.
+  qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// *x is vector multiplied by *y which is 16 int32_t parallel multiplication
+// and right shift 16.  The output, 16 int32_t is save in *p.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+                                                      const __m256i *y) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+                                                 __m256i eobmax,
+                                                 __m256i nz_mask) {
+  const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+  const __m256i packed_nz_mask_perm =
+      _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+  const __m256i iscan =
+      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+  const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+  return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+  __m256i eob_s;
+  eob_s = _mm256_shuffle_epi32(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+  eob = _mm256_max_epi16(eob, eob_s);
+  eob_s = _mm256_shufflelo_epi16(eob, 1);
+  eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+  return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+  return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+                                      const tran_low_t *coeff_ptr,
+                                      const int16_t *iscan_ptr,
+                                      tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                                      __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+  if (_mm256_movemask_epi8(zbin_mask) == 0) {
+    const __m256i zero = _mm256_setzero_si256();
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+    return;
+  }
+  {
+    const __m256i tmp_rnd =
+        _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+    const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+    const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+    const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+    const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+    const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+    const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+    const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+    _mm256_storeu_si256((__m256i *)qcoeff, q);
+    _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+    *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+  }
+}
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  const int step = 8;
+  __m256i eob = _mm256_setzero_si256();
+  __m256i qp[5];
+  const int16_t *iscan = scan_order->iscan;
+
+  init_qp(mb_plane, dequant_ptr, qp, 0);
+
+  quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
+
+  update_qp(qp);
+
+  while (n_coeffs > 0) {
+    quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+  }
+
+  *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+                                                               const __m256i *y,
+                                                               int log_scale) {
+  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+  prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+  prod_lo = _mm256_and_si256(prod_lo, mask);
+  prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+  prod_hi = _mm256_slli_epi64(prod_hi, 32);
+  return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+    const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+  const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+  const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+  if (_mm256_movemask_epi8(zbin_mask) == 0) {
+    const __m256i zero = _mm256_setzero_si256();
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+    return;
+  }
+
+  {
+    const __m256i tmp_rnd =
+        _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+    // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+    const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+    const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+    // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+    const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+    const __m256i abs_dq =
+        _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+    const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+    const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+    const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+    _mm256_storeu_si256((__m256i *)qcoeff, q);
+    _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+    *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+  }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
+  const unsigned int step = 8;
+  intptr_t n_coeffs = 32 * 32;
+  const int16_t *iscan = scan_order->iscan;
+  __m256i eob = _mm256_setzero_si256();
+  __m256i qp[5];
+
+  init_qp(mb_plane, dequant_ptr, qp, 1);
+
+  quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  coeff_ptr += step;
+  qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan += step;
+  n_coeffs -= step;
+
+  update_qp(qp);
+
+  while (n_coeffs > 0) {
+    quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    n_coeffs -= step;
+  }
+
+  *eob_ptr = get_max_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 2362476c1f..adae60756d 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -8,24 +8,29 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <emmintrin.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
 
-#if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                int skip_block, const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
+                                const struct macroblock_plane *mb_plane,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+                                const struct ScanOrder *const scan_order) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
   __m128i zbins[2];
   __m128i nzbins[2];
+  const int16_t *iscan = scan_order->iscan;
+  const int16_t *zbin_ptr = mb_plane->zbin;
+  const int16_t *round_ptr = mb_plane->round;
+  const int16_t *quant_ptr = mb_plane->quant;
+  const int16_t *quant_shift_ptr = mb_plane->quant_shift;
 
   zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                            (int)zbin_ptr[0]);
@@ -36,75 +41,72 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
   nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
 
-  (void)scan;
-
   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = ((int)count / 4) - 1; i >= 0; i--) {
-      __m128i coeffs, cmp1, cmp2;
-      int test;
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-      cmp1 = _mm_and_si128(cmp1, cmp2);
-      test = _mm_movemask_epi8(cmp1);
-      if (test == 0xffff)
-        non_zero_regs--;
-      else
-        break;
-    }
+  // Pre-scan pass
+  for (i = ((int)count / 4) - 1; i >= 0; i--) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (test == 0xffff)
+      non_zero_regs--;
+    else
+      break;
+  }
 
-    // Quantization pass:
-    for (i = 0; i < non_zero_regs; i++) {
-      __m128i coeffs, coeffs_sign, tmp1, tmp2;
-      int test;
-      int abs_coeff[4];
-      int coeff_sign[4];
+  // Quantization pass:
+  for (i = 0; i < non_zero_regs; i++) {
+    __m128i coeffs, coeffs_sign, tmp1, tmp2;
+    int test;
+    int abs_coeff[4];
+    int coeff_sign[4];
 
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      coeffs_sign = _mm_srai_epi32(coeffs, 31);
-      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
-      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
-      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
-      tmp1 = _mm_or_si128(tmp1, tmp2);
-      test = _mm_movemask_epi8(tmp1);
-      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
-      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    coeffs_sign = _mm_srai_epi32(coeffs, 31);
+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+    tmp1 = _mm_or_si128(tmp1, tmp2);
+    test = _mm_movemask_epi8(tmp1);
+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
 
-      for (j = 0; j < 4; j++) {
-        if (test & (1 << (4 * j))) {
-          int k = 4 * i + j;
-          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
-          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
-          const uint32_t abs_qcoeff =
-              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
-          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
-        }
+    for (j = 0; j < 4; j++) {
+      if (test & (1 << (4 * j))) {
+        int k = 4 * i + j;
+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+        qcoeff_ptr[k] =
+            (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
       }
     }
   }
-  *eob_ptr = eob_i + 1;
+  *eob_ptr = eob_i;
 }
 
 void vpx_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
   __m128i zbins[2];
   __m128i nzbins[2];
   int idx = 0;
   int idx_arr[1024];
-  int i, eob = -1;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
-  (void)scan;
+  int i, eob = 0;
+  const intptr_t n_coeffs = 32 * 32;
+  const int16_t *iscan = scan_order->iscan;
+  const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1);
+  const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1);
+
   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
   zbins[1] = _mm_set1_epi32(zbin1_tmp);
 
@@ -116,39 +118,36 @@ void vpx_highbd_quantize_b_32x32_sse2(
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs / 4; i++) {
-      __m128i coeffs, cmp1, cmp2;
-      int test;
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-      cmp1 = _mm_and_si128(cmp1, cmp2);
-      test = _mm_movemask_epi8(cmp1);
-      if (!(test & 0xf)) idx_arr[idx++] = i * 4;
-      if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
-      if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
-      if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
-    }
-
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = idx_arr[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
-    }
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs / 4; i++) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
   }
-  *eob_ptr = eob + 1;
+
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = idx_arr[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 =
+        abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1);
+    const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1;
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15);
+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+  }
+  *eob_ptr = eob;
 }
-#endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 0000000000..e483fdce73
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,462 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+                                          uint32_t sad_array[4]) {
+  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+                                    _mm256_extractf128_si256(t2, 1));
+  _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    int x;
+
+    for (x = 0; x < 4; ++x) {
+      __m256i r[4];
+      r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+      r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+      r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+      r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+      // absolute differences between every ref[] to src
+      r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+      r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+      r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+      r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+      // sum every abs diff
+      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+      sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+    }
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+    int ref_stride, uint32_t sad_array[4], int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < (n / 2); ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2);
+
+    /* sums_16 will outrange after 2 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 1;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD64XNX4D(n)                                                   \
+  void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
+                           n);                                                 \
+  }
+
+#define HIGHBD_SADSKIP64XNx4D(n)                                             \
+  void vpx_highbd_sad_skip_64x##n##x4d_avx2(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
+                           sad_array, n / 2);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; i++) {
+    __m256i r[8];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+    r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+    r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+    r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+    // absolute differences between every ref[] to src
+    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+    r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+    r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+    r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+    r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+    // sum every abs diff
+    sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+    sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+    sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+    sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+    int ref_stride, uint32_t sad_array[4], int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < (n / 8); ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+    /* sums_16 will outrange after 8 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 3;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD32XNX4D(n)                                                   \
+  void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
+                           n);                                                 \
+  }
+
+#define HIGHBD_SADSKIP32XNx4D(n)                                             \
+  void vpx_highbd_sad_skip_32x##n##x4d_avx2(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
+                           sad_array, n / 2);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+                                               const uint16_t *src,
+                                               int src_stride,
+                                               uint16_t *refs[4],
+                                               int ref_stride, int height) {
+  int i;
+  for (i = 0; i < height; i++) {
+    __m256i r[4];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // absolute differences between every ref[] to src
+    r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+    r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+    r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+    r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+    // sum every abs diff
+    sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+    sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+    sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+    sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+    src += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4],
+    int ref_stride, uint32_t sad_array[4], int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+  __m256i sums_32[4];
+  const int height = VPXMIN(16, n);
+  const int num_iters = n / height;
+  int i;
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_32[0] = _mm256_setzero_si256();
+  sums_32[1] = _mm256_setzero_si256();
+  sums_32[2] = _mm256_setzero_si256();
+  sums_32[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < num_iters; ++i) {
+    sums_16[0] = _mm256_setzero_si256();
+    sums_16[1] = _mm256_setzero_si256();
+    sums_16[2] = _mm256_setzero_si256();
+    sums_16[3] = _mm256_setzero_si256();
+
+    highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32[0] = _mm256_add_epi32(
+        sums_32[0],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+    sums_32[1] = _mm256_add_epi32(
+        sums_32[1],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+    sums_32[2] = _mm256_add_epi32(
+        sums_32[2],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+    sums_32[3] = _mm256_add_epi32(
+        sums_32[3],
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+    src += src_stride << 4;
+  }
+  calc_final_4(sums_32, sad_array);
+}
+
+#define HIGHBD_SAD16XNX4D(n)                                                   \
+  void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array,  \
+                           n);                                                 \
+  }
+
+#define HIGHBD_SADSKIP16XNx4D(n)                                             \
+  void vpx_highbd_sad_skip_16x##n##x4d_avx2(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,   \
+                           sad_array, n / 2);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *const ref_array[4],
+                                 int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_16[0] = _mm256_setzero_si256();
+  sums_16[1] = _mm256_setzero_si256();
+  sums_16[2] = _mm256_setzero_si256();
+  sums_16[3] = _mm256_setzero_si256();
+
+  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+  {
+    __m256i sums_32[4];
+    sums_32[0] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+    sums_32[1] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+    sums_32[2] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+    sums_32[3] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+    calc_final_4(sums_32, sad_array);
+  }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *const ref_array[4],
+                                int ref_stride, uint32_t sad_array[4]) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *refs[4];
+  __m256i sums_16[4];
+
+  refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+  refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+  refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+  refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+  sums_16[0] = _mm256_setzero_si256();
+  sums_16[1] = _mm256_setzero_si256();
+  sums_16[2] = _mm256_setzero_si256();
+  sums_16[3] = _mm256_setzero_si256();
+
+  highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+  {
+    __m256i sums_32[4];
+    sums_32[0] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+    sums_32[1] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+    sums_32[2] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+    sums_32[3] = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+    calc_final_4(sums_32, sad_array);
+  }
+}
+
+// clang-format off
+HIGHBD_SAD64XNX4D(64)
+HIGHBD_SADSKIP64XNx4D(64)
+
+HIGHBD_SAD64XNX4D(32)
+HIGHBD_SADSKIP64XNx4D(32)
+
+HIGHBD_SAD32XNX4D(64)
+HIGHBD_SADSKIP32XNx4D(64)
+
+HIGHBD_SAD32XNX4D(32)
+HIGHBD_SADSKIP32XNx4D(32)
+
+HIGHBD_SAD32XNX4D(16)
+HIGHBD_SADSKIP32XNx4D(16)
+
+HIGHBD_SAD16XNX4D(32)
+HIGHBD_SADSKIP16XNx4D(32)
+
+HIGHBD_SADSKIP16XNx4D(16)
+
+HIGHBD_SADSKIP16XNx4D(8)
+    // clang-format on
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
index 6c2a61e019..a07892d811 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -213,7 +213,12 @@ SECTION .text
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
+; Macro Arguments:
+;   1: Width
+;   2: Height
+;   3: If 0, then normal sad, if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0  ; normal sad
 %if UNIX64
 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
@@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
 cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
 %endif
+%else  ; %3 == 2, downsample
+%if UNIX64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif  ;
+%endif  ; sad/avg/skip
 
 ; set m1
   push                srcq
@@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   pshufd                m1, m1, 0x0
   pop                 srcq
 
+%if %3 == 2  ; skip rows
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
+%endif  ; skip rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
   mov                ref2q, [ref1q+gprsize*1]
@@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   shl                ref1q, 1
 
   HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 2  ;  Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
 %endrep
+%undef rep
   HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
   ; N.B. HIGH_PROCESS outputs dwords (32 bits)
   ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
@@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   paddd                 m4, m0
   paddd                 m6, m1
   punpcklqdq            m4, m6
+%if %3 == 2  ; skip rows
+  pslld                 m4, 1
+%endif
   movifnidn             r4, r4mp
   movu                [r4], m4
   RET
@@ -285,3 +312,15 @@ HIGH_SADNXN4D  8,  8
 HIGH_SADNXN4D  8,  4
 HIGH_SADNXN4D  4,  8
 HIGH_SADNXN4D  4,  4
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16,  8, 2
+HIGH_SADNXN4D  8, 16, 2
+HIGH_SADNXN4D  8,  8, 2
+HIGH_SADNXN4D  4,  8, 2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..78f8eb8bfa
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,522 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+  const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+  const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+                                    _mm256_extractf128_si256(t1, 1));
+  return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+    // sum every abs diff
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr,
+                                                         int src_stride,
+                                                         const uint8_t *ref_ptr,
+                                                         int ref_stride,
+                                                         int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_32 = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < (n / 2); ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2);
+
+    /* sums_16 will outrange after 2 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+  }
+  return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD64XN(n)                                                      \
+  unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref,                 \
+                                           int ref_stride) {                   \
+    return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n);           \
+  }
+
+#define HIGHBD_SADSKIP64xN(n)                                                \
+  unsigned int vpx_highbd_sad_skip_64x##n##_avx2(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+                                   n / 2);                                   \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr,
+                                                         int src_stride,
+                                                         const uint8_t *ref_ptr,
+                                                         int ref_stride,
+                                                         int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_32 = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < (n / 8); ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+    /* sums_16 will outrange after 8 rows, so add current sums_16 to
+     * sums_32*/
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 3;
+    ref += ref_stride << 3;
+  }
+  return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD32XN(n)                                                      \
+  unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref,                 \
+                                           int ref_stride) {                   \
+    return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n);           \
+  }
+
+#define HIGHBD_SADSKIP32xN(n)                                                \
+  unsigned int vpx_highbd_sad_skip_32x##n##_avx2(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+                                   n / 2);                                   \
+  }
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+                                            const uint16_t *src, int src_stride,
+                                            uint16_t *ref, int ref_stride,
+                                            int height) {
+  int i;
+  for (i = 0; i < height; i += 2) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+  }
+}
+
+static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr,
+                                                         int src_stride,
+                                                         const uint8_t *ref_ptr,
+                                                         int ref_stride,
+                                                         int n) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_32 = _mm256_setzero_si256();
+  const int height = VPXMIN(16, n);
+  const int num_iters = n / height;
+  int i;
+
+  for (i = 0; i < num_iters; ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 4;
+    ref += ref_stride << 4;
+  }
+  return calc_final(sums_32);
+}
+
+#define HIGHBD_SAD16XN(n)                                                      \
+  unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref,                 \
+                                           int ref_stride) {                   \
+    return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n);           \
+  }
+
+#define HIGHBD_SADSKIP16xN(n)                                                \
+  unsigned int vpx_highbd_sad_skip_16x##n##_avx2(                            \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride) {                                                      \
+    return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \
+                                   n / 2);                                   \
+  }
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *ref_ptr, int ref_stride) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
+
+// clang-format off
+HIGHBD_SAD64XN(64)
+HIGHBD_SADSKIP64xN(64)
+HIGHBD_SAD64XN(32)
+HIGHBD_SADSKIP64xN(32)
+HIGHBD_SAD32XN(64)
+HIGHBD_SADSKIP32xN(64)
+HIGHBD_SAD32XN(32)
+HIGHBD_SADSKIP32xN(32)
+HIGHBD_SAD32XN(16)
+HIGHBD_SADSKIP32xN(16)
+HIGHBD_SAD16XN(32)
+HIGHBD_SADSKIP16xN(32)
+HIGHBD_SADSKIP16xN(16)
+HIGHBD_SADSKIP16xN(8)
+//clang-format on
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+                                                const uint16_t *src,
+                                                int src_stride, uint16_t *ref,
+                                                int ref_stride, uint16_t *sec,
+                                                int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+    const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+    const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+    const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+    const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+    const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+    // absolute differences between every ref/pred avg to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+    const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+    const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+    // sum every abs diff
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+    *sums_16 =
+        _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+    src += src_stride;
+    ref += ref_stride;
+    sec += 64;
+  }
+}
+
+#define HIGHBD_SAD64XN_AVG(n)                                                 \
+  unsigned int vpx_highbd_sad64x##n##_avg_avx2(                               \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
+    __m256i sums_32 = _mm256_setzero_si256();                                 \
+    int i;                                                                    \
+                                                                              \
+    for (i = 0; i < (n / 2); ++i) {                                           \
+      __m256i sums_16 = _mm256_setzero_si256();                               \
+                                                                              \
+      highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+                                                                              \
+      /* sums_16 will outrange after 2 rows, so add current sums_16 to        \
+       * sums_32*/                                                            \
+      sums_32 = _mm256_add_epi32(                                             \
+          sums_32,                                                            \
+          _mm256_add_epi32(                                                   \
+              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
+              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
+                                                                              \
+      src += src_stride << 1;                                                 \
+      ref += ref_stride << 1;                                                 \
+      sec += 64 << 1;                                                         \
+    }                                                                         \
+    return calc_final(sums_32);                                               \
+  }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+                                                const uint16_t *src,
+                                                int src_stride, uint16_t *ref,
+                                                int ref_stride, uint16_t *sec,
+                                                int height) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between every ref/pred avg to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride;
+    ref += ref_stride;
+    sec += 32;
+  }
+}
+
+#define HIGHBD_SAD32XN_AVG(n)                                                 \
+  unsigned int vpx_highbd_sad32x##n##_avg_avx2(                               \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                       \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                             \
+    uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);                         \
+    __m256i sums_32 = _mm256_setzero_si256();                                 \
+    int i;                                                                    \
+                                                                              \
+    for (i = 0; i < (n / 8); ++i) {                                           \
+      __m256i sums_16 = _mm256_setzero_si256();                               \
+                                                                              \
+      highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+                                                                              \
+      /* sums_16 will outrange after 8 rows, so add current sums_16 to        \
+       * sums_32*/                                                            \
+      sums_32 = _mm256_add_epi32(                                             \
+          sums_32,                                                            \
+          _mm256_add_epi32(                                                   \
+              _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),         \
+              _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));  \
+                                                                              \
+      src += src_stride << 3;                                                 \
+      ref += ref_stride << 3;                                                 \
+      sec += 32 << 3;                                                         \
+    }                                                                         \
+    return calc_final(sums_32);                                               \
+  }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+                                                const uint16_t *src,
+                                                int src_stride, uint16_t *ref,
+                                                int ref_stride, uint16_t *sec,
+                                                int height) {
+  int i;
+  for (i = 0; i < height; i += 2) {
+    // load src and all ref[]
+    const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+    const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+    const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+    const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+    const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+    // absolute differences between every ref[] to src
+    const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+    const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+    // sum every abs diff
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+    *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+    src += src_stride << 1;
+    ref += ref_stride << 1;
+    sec += 32;
+  }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride,
+                                          const uint8_t *second_pred) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i sums_32 = _mm256_setzero_si256();
+  int i;
+
+  for (i = 0; i < 2; ++i) {
+    __m256i sums_16 = _mm256_setzero_si256();
+
+    highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+    // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32
+    sums_32 = _mm256_add_epi32(
+        sums_32,
+        _mm256_add_epi32(
+            _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+            _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+    src += src_stride << 4;
+    ref += ref_stride << 4;
+    sec += 16 << 4;
+  }
+  return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride,
+                                          const uint8_t *second_pred) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+                                         const uint8_t *ref_ptr, int ref_stride,
+                                         const uint8_t *second_pred) {
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+  uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+  __m256i sums_16 = _mm256_setzero_si256();
+
+  highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+  {
+    const __m256i sums_32 = _mm256_add_epi32(
+        _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+        _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+    return calc_final(sums_32);
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
index bc4b28db24..62ad2237ff 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -12,6 +12,11 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro HIGH_SAD_FN 4
 %if %4 == 0
 %if %3 == 5
@@ -20,22 +25,33 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
 cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+%elif %4 == 1 ; avg
 %if %3 == 5
 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
 %define n_rowsd r7d
 %else ; x86-32
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%else  ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2  ; double the stride if we are skipping rows
+  lea          src_strided, [src_strided*2]
+  lea          ref_strided, [ref_strided*2]
+%endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD64XN 1-2 0
   HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
 
 
 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD32XN 1-2 0
   HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
 
 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD16XN 1-2 0
   HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -292,13 +334,19 @@ HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
 
 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
   HIGH_SAD_FN 8, %1, 7, %2
+%if %2 == 2  ; skip rows, so divide number of rows by 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
   pxor                  m6, m6
 
@@ -350,6 +398,9 @@ HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
   punpckldq             m0, m6
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2  ; we skipped rows, so we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -361,3 +412,5 @@ HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
 HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 30ee81b688..5a3a2818de 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -32,12 +32,12 @@ SECTION .text
 
 ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
 ;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               const uint8_t *ref, ptrdiff_t ref_stride,
 ;                               int height, unsigned int *sse);
 ;
 ; This function returns the SE and stores SSE in the given pointer.
 
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
   psubw                %3, %4
   psubw                %1, %2
   mova                 %4, %3       ; make copies to manipulate to calc sum
@@ -72,13 +72,13 @@ SECTION .text
   paddd                m6, m4
   mov                  r1, ssem         ; r1 = unsigned int *sse
   movd               [r1], m7           ; store sse
-  movd                rax, m6           ; store sum as return value
+  movd                eax, m6           ; store sum as return value
 %endif
   RET
 %endmacro
 
 %macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
   add                srcq, src_stridemp
   add                srcq, src_stridemp
 %else
@@ -91,81 +91,65 @@ SECTION .text
 %define filter_idx_shift 5
 
 
-%ifdef PIC    ; 64bit PIC
+%if VPX_ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                       x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
-    %define sec_str sec_strideq
+                                      ref, ref_stride, \
+                                      second_pred, second_stride, height, sse
+    %define second_str second_strideq
   %else
-    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                  x_offset, y_offset, \
+                                  ref, ref_stride, height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                        x_offset, y_offset, \
+                                        ref, ref_stride, \
+                                        second_pred, second_stride, height, sse
       %define block_height dword heightm
-      %define sec_str sec_stridemp
-
-      ; Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
+      %define second_str second_stridemp
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                x_offset, y_offset, dst, dst_stride, height, \
-                                sse, g_bilin_filter, g_pw_8
+                                    x_offset, y_offset, \
+                                    ref, ref_stride, height, sse
       %define block_height heightd
-
-      ; Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
+
+    ; reuse argument stack space
+    %define g_bilin_filterm x_offsetm
+    %define g_pw_8m y_offsetm
+
+    ; Store bilin_filter and pw_8 location in stack
+    %if GET_GOT_DEFINED == 1
+      GET_GOT eax
+      add esp, 4                ; restore esp
+    %endif
+
+    lea ecx, [GLOBAL(bilin_filter_m)]
+    mov g_bilin_filterm, ecx
+
+    lea ecx, [GLOBAL(pw_8)]
+    mov g_pw_8m, ecx
+
+    LOAD_IF_USED 0, 1         ; load eax, ecx back
   %else
     %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                        x_offset, y_offset, \
+                                        ref, ref_stride, \
+                                        second_pred, second_stride, height, sse
       %define block_height dword heightm
-      %define sec_str sec_stridemp
-      %endif
+      %define second_str second_stridemp
     %else
       cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                              x_offset, y_offset, dst, dst_stride, height, sse
+                                    x_offset, y_offset, \
+                                    ref, ref_stride, height, sse
       %define block_height heightd
     %endif
 
@@ -181,7 +165,7 @@ SECTION .text
   sar                   block_height, 1
 %endif
 %if %2 == 1 ; avg
-  shl             sec_str, 1
+  shl             second_str, 1
 %endif
 
   ; FIXME(rbultje) replace by jumptable?
@@ -196,35 +180,35 @@ SECTION .text
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m2, [srcq + 16]
-  mova                 m1, [dstq]
-  mova                 m3, [dstq + 16]
+  mova                 m1, [refq]
+  mova                 m3, [refq + 16]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m2, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m2, [second_predq+16]
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
+  lea                refq, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
   movu                 m2, [srcq + src_strideq*2]
-  mova                 m1, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
+  mova                 m1, [refq]
+  mova                 m3, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m2, [second_predq]
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
+  lea                refq, [refq + ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -242,40 +226,40 @@ SECTION .text
   movu                 m1, [srcq+16]
   movu                 m4, [srcq+src_strideq*2]
   movu                 m5, [srcq+src_strideq*2+16]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+16]
+  mova                 m2, [refq]
+  mova                 m3, [refq+16]
   pavgw                m0, m4
   pavgw                m1, m5
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
+  lea                refq, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+src_strideq*2]
   movu                 m5, [srcq+src_strideq*4]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
+  mova                 m2, [refq]
+  mova                 m3, [refq+ref_strideq*2]
   pavgw                m0, m1
   pavgw                m1, m5
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m1, [second_predq]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
+  lea                refq, [refq + ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -284,19 +268,19 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if VPX_ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; x_offset == 0, reuse x_offset reg
 %define tempq x_offsetq
   add y_offsetq, g_bilin_filterm
@@ -308,7 +292,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -318,8 +302,8 @@ SECTION .text
   movu                 m1, [srcq + 16]
   movu                 m4, [srcq+src_strideq*2]
   movu                 m5, [srcq+src_strideq*2+16]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+16]
+  mova                 m2, [refq]
+  mova                 m3, [refq+16]
   ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
   ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
   ; instructions is the same (5), but it is 1 mul instead of 2, so might be
@@ -336,23 +320,23 @@ SECTION .text
   psrlw                m1, 4
   psrlw                m0, 4
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
+  lea                refq, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+src_strideq*2]
   movu                 m5, [srcq+src_strideq*4]
   mova                 m4, m1
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
+  mova                 m2, [refq]
+  mova                 m3, [refq+ref_strideq*2]
   pmullw               m1, filter_y_a
   pmullw               m5, filter_y_b
   paddw                m1, filter_rnd
@@ -364,16 +348,16 @@ SECTION .text
   psrlw                m1, 4
   psrlw                m0, 4
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m1, [second_predq]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
+  lea                refq, [refq + ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -397,41 +381,41 @@ SECTION .text
   movu                 m1, [srcq + 16]
   movu                 m4, [srcq + 2]
   movu                 m5, [srcq + 18]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + 16]
+  mova                 m2, [refq]
+  mova                 m3, [refq + 16]
   pavgw                m0, m4
   pavgw                m1, m5
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
+  lea                refq, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
   movu                 m1, [srcq + src_strideq*2]
   movu                 m4, [srcq + 2]
   movu                 m5, [srcq + src_strideq*2 + 2]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
+  mova                 m2, [refq]
+  mova                 m3, [refq + ref_strideq*2]
   pavgw                m0, m4
   pavgw                m1, m5
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m1, [second_predq]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
 
   lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
+  lea                refq, [refq + ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -460,20 +444,20 @@ SECTION .text
   pavgw                m3, m5
   pavgw                m0, m2
   pavgw                m1, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + 16]
+  mova                 m4, [refq]
+  mova                 m5, [refq + 16]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m4, m1, m5, m6, m7
   mova                 m0, m2
   mova                 m1, m3
 
   lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
+  lea                refq, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
@@ -489,20 +473,20 @@ SECTION .text
   pavgw                m3, m5
   pavgw                m0, m2
   pavgw                m2, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + dst_strideq*2]
+  mova                 m4, [refq]
+  mova                 m5, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m2, [second_predq]
 %endif
   SUM_SSE              m0, m4, m2, m5, m6, m7
   mova                 m0, m3
 
   lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
+  lea                refq, [refq + ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -511,19 +495,19 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if VPX_ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+y_offsetq]
   mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
 %else  ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; x_offset == 0.5. We can reuse x_offset reg
 %define tempq x_offsetq
   add y_offsetq, g_bilin_filterm
@@ -535,7 +519,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -565,21 +549,21 @@ SECTION .text
   paddw                m0, filter_rnd
   psrlw                m1, 4
   paddw                m0, m2
-  mova                 m2, [dstq]
+  mova                 m2, [refq]
   psrlw                m0, 4
-  mova                 m3, [dstq+16]
+  mova                 m3, [refq+16]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
   mova                 m0, m4
   mova                 m1, m5
 
   lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
+  lea                refq, [refq + ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
@@ -604,21 +588,21 @@ SECTION .text
   paddw                m0, filter_rnd
   psrlw                m4, 4
   paddw                m0, m2
-  mova                 m2, [dstq]
+  mova                 m2, [refq]
   psrlw                m0, 4
-  mova                 m3, [dstq+dst_strideq*2]
+  mova                 m3, [refq+ref_strideq*2]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m4, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m4, [second_predq]
 %endif
   SUM_SSE              m0, m2, m4, m3, m6, m7
   mova                 m0, m5
 
   lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
+  lea                refq, [refq + ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -633,19 +617,19 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if VPX_ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
 %else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; y_offset == 0. We can reuse y_offset reg.
 %define tempq y_offsetq
   add x_offsetq, g_bilin_filterm
@@ -657,7 +641,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -667,8 +651,8 @@ SECTION .text
   movu                 m1, [srcq+16]
   movu                 m2, [srcq+2]
   movu                 m3, [srcq+18]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+16]
+  mova                 m4, [refq]
+  mova                 m5, [refq+16]
   pmullw               m1, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m1, filter_rnd
@@ -680,23 +664,23 @@ SECTION .text
   psrlw                m1, 4
   psrlw                m0, 4
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m4, m1, m5, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
   movu                 m1, [srcq+src_strideq*2]
   movu                 m2, [srcq+2]
   movu                 m3, [srcq+src_strideq*2+2]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
+  mova                 m4, [refq]
+  mova                 m5, [refq+ref_strideq*2]
   pmullw               m1, filter_x_a
   pmullw               m3, filter_x_b
   paddw                m1, filter_rnd
@@ -708,16 +692,16 @@ SECTION .text
   psrlw                m1, 4
   psrlw                m0, 4
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m1, [second_predq]
 %endif
   SUM_SSE              m0, m4, m1, m5, m6, m7
 
   lea                srcq, [srcq+src_strideq*4]
-  lea                dstq, [dstq+dst_strideq*4]
+  lea                refq, [refq+ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -732,19 +716,19 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if VPX_ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
 %else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; y_offset == 0.5. We can reuse y_offset reg.
 %define tempq y_offsetq
   add x_offsetq, g_bilin_filterm
@@ -756,7 +740,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -789,24 +773,24 @@ SECTION .text
   paddw                m3, filter_rnd
   paddw                m2, m4
   paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+16]
+  mova                 m4, [refq]
+  mova                 m5, [refq+16]
   psrlw                m2, 4
   psrlw                m3, 4
   pavgw                m0, m2
   pavgw                m1, m3
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m4, m1, m5, m6, m7
   mova                 m0, m2
   mova                 m1, m3
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
@@ -830,24 +814,24 @@ SECTION .text
   paddw                m3, filter_rnd
   paddw                m2, m4
   paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
+  mova                 m4, [refq]
+  mova                 m5, [refq+ref_strideq*2]
   psrlw                m2, 4
   psrlw                m3, 4
   pavgw                m0, m2
   pavgw                m2, m3
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m2, [second_predq]
 %endif
   SUM_SSE              m0, m4, m2, m5, m6, m7
   mova                 m0, m3
 
   lea                srcq, [srcq+src_strideq*4]
-  lea                dstq, [dstq+dst_strideq*4]
+  lea                refq, [refq+ref_strideq*4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
@@ -859,24 +843,24 @@ SECTION .text
 
 .x_nonhalf_y_nonhalf:
 ; loading filter - this is same as in 8-bit depth
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if VPX_ARCH_X86_64 && mmsize == 16
   mova                 m8, [bilin_filter+x_offsetq]
   mova                 m9, [bilin_filter+x_offsetq+16]
   mova                m10, [bilin_filter+y_offsetq]
   mova                m11, [bilin_filter+y_offsetq+16]
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
 %else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; In this case, there is NO unused register. Used src_stride register. Later,
 ; src_stride has to be loaded from stack when it is needed.
 %define tempq src_strideq
@@ -897,7 +881,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 ; end of load filter
@@ -945,23 +929,23 @@ SECTION .text
   pmullw               m3, filter_y_b
   paddw                m0, m2
   paddw                m1, filter_rnd
-  mova                 m2, [dstq]
+  mova                 m2, [refq]
   paddw                m1, m3
   psrlw                m0, 4
   psrlw                m1, 4
-  mova                 m3, [dstq+16]
+  mova                 m3, [refq+16]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
+  pavgw                m0, [second_predq]
+  pavgw                m1, [second_predq+16]
 %endif
   SUM_SSE              m0, m2, m1, m3, m6, m7
   mova                 m0, m4
   mova                 m1, m5
 
   INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq + dst_strideq * 2]
+  lea                refq, [refq + ref_strideq * 2]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %else ; %1 < 16
   movu                 m0, [srcq]
@@ -999,23 +983,23 @@ SECTION .text
   pmullw               m3, filter_y_b
   paddw                m0, m2
   paddw                m4, filter_rnd
-  mova                 m2, [dstq]
+  mova                 m2, [refq]
   paddw                m4, m3
   psrlw                m0, 4
   psrlw                m4, 4
-  mova                 m3, [dstq+dst_strideq*2]
+  mova                 m3, [refq+ref_strideq*2]
 %if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m4, [secq]
+  pavgw                m0, [second_predq]
+  add                second_predq, second_str
+  pavgw                m4, [second_predq]
 %endif
   SUM_SSE              m0, m2, m4, m3, m6, m7
   mova                 m0, m5
 
   INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq + dst_strideq * 4]
+  lea                refq, [refq + ref_strideq * 4]
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
 %endif
   dec                   block_height
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
index 923418a992..5bee51fa0c 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -11,16 +11,18 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+SECTION .text
+
 ;unsigned int vpx_highbd_calc16x16var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
-;    int             source_stride,
+;    int             src_stride,
 ;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
+;    int             ref_stride,
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
+globalsym(vpx_highbd_calc16x16var_sse2)
 sym(vpx_highbd_calc16x16var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -34,8 +36,8 @@ sym(vpx_highbd_calc16x16var_sse2):
         mov         rsi,            arg(0) ;[src_ptr]
         mov         rdi,            arg(2) ;[ref_ptr]
 
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]
         add         rax,            rax ; source stride in bytes
         add         rdx,            rdx ; recon stride in bytes
 
@@ -167,13 +169,13 @@ sym(vpx_highbd_calc16x16var_sse2):
 ;unsigned int vpx_highbd_calc8x8var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
-;    int             source_stride,
+;    int             src_stride,
 ;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
+;    int             ref_stride,
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
+globalsym(vpx_highbd_calc8x8var_sse2)
 sym(vpx_highbd_calc8x8var_sse2):
     push        rbp
     mov         rbp, rsp
@@ -187,8 +189,8 @@ sym(vpx_highbd_calc8x8var_sse2):
         mov         rsi,            arg(0) ;[src_ptr]
         mov         rdi,            arg(2) ;[ref_ptr]
 
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]
         add         rax,            rax ; source stride in bytes
         add         rdx,            rdx ; recon stride in bytes
 
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
index 414ae5de1a..381e0ad193 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,8 +7,10 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "./vpx_config.h"
+#include <emmintrin.h>  // SSE2
 
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
@@ -89,9 +91,9 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
 }
 
 #define HIGH_GET_VAR(S)                                                       \
-  void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
-                                         const uint8_t *ref8, int ref_stride, \
-                                         uint32_t *sse, int *sum) {           \
+  void vpx_highbd_8_get##S##x##S##var_sse2(                                   \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
+      int ref_stride, uint32_t *sse, int *sum) {                              \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
     uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
     vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
@@ -120,8 +122,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
     *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
   }
 
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
 
 #undef HIGH_GET_VAR
 
@@ -135,7 +137,7 @@ HIGH_GET_VAR(8);
     highbd_8_variance_sse2(                                                \
         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
         vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    return *sse - (((int64_t)sum * sum) >> shift);                         \
+    return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift));             \
   }                                                                        \
                                                                            \
   uint32_t vpx_highbd_10_variance##w##x##h##_sse2(                         \
@@ -148,7 +150,7 @@ HIGH_GET_VAR(8);
     highbd_10_variance_sse2(                                               \
         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
         vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }                                                                        \
                                                                            \
@@ -162,20 +164,20 @@ HIGH_GET_VAR(8);
     highbd_12_variance_sse2(                                               \
         src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
         vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift));             \
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
 
 #undef VAR_FN
 
@@ -251,13 +253,13 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
 #define DECL(w, opt)                                                         \
   int vpx_highbd_sub_pixel_variance##w##xh_##opt(                            \
       const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
+      const uint16_t *ref, ptrdiff_t ref_stride, int height,                 \
       unsigned int *sse, void *unused0, void *unused);
 #define DECLS(opt) \
-  DECL(8, opt);    \
+  DECL(8, opt)     \
   DECL(16, opt)
 
-DECLS(sse2);
+DECLS(sse2)
 
 #undef DECLS
 #undef DECL
@@ -265,61 +267,62 @@ DECLS(sse2);
 #define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
   uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
+      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
     uint32_t sse;                                                              \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
     int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
+        src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL,   \
         NULL);                                                                 \
     if (w > wf) {                                                              \
       unsigned int sse2;                                                       \
       int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
+          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h,   \
           &sse2, NULL, NULL);                                                  \
       se += se2;                                                               \
       sse += sse2;                                                             \
       if (w > wf * 2) {                                                        \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
       }                                                                        \
     }                                                                          \
     *sse_ptr = sse;                                                            \
-    return sse - ((cast se * se) >> (wlog2 + hlog2));                          \
+    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
   }                                                                            \
                                                                                \
   uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
+      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
+    int64_t var;                                                               \
     uint32_t sse;                                                              \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
     int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
+        src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL,   \
         NULL);                                                                 \
     if (w > wf) {                                                              \
       uint32_t sse2;                                                           \
       int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
+          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h,   \
           &sse2, NULL, NULL);                                                  \
       se += se2;                                                               \
       sse += sse2;                                                             \
       if (w > wf * 2) {                                                        \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
@@ -328,44 +331,46 @@ DECLS(sse2);
     se = ROUND_POWER_OF_TWO(se, 2);                                            \
     sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
     *sse_ptr = sse;                                                            \
-    return sse - ((cast se * se) >> (wlog2 + hlog2));                          \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
+    return (var >= 0) ? (uint32_t)var : 0;                                     \
   }                                                                            \
                                                                                \
   uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
+      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) {                \
     int start_row;                                                             \
     uint32_t sse;                                                              \
     int se = 0;                                                                \
+    int64_t var;                                                               \
     uint64_t long_sse = 0;                                                     \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
     for (start_row = 0; start_row < h; start_row += 16) {                      \
       uint32_t sse2;                                                           \
       int height = h - start_row < 16 ? h - start_row : 16;                    \
       int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
           src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
+          ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL,     \
           NULL);                                                               \
       se += se2;                                                               \
       long_sse += sse2;                                                        \
       if (w > wf) {                                                            \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
             src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
+            y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \
             &sse2, NULL, NULL);                                                \
         se += se2;                                                             \
         long_sse += sse2;                                                      \
         if (w > wf * 2) {                                                      \
           se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
               src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
+              y_offset, ref + 32 + (start_row * ref_stride), ref_stride,       \
               height, &sse2, NULL, NULL);                                      \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
           se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
               src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
+              y_offset, ref + 48 + (start_row * ref_stride), ref_stride,       \
               height, &sse2, NULL, NULL);                                      \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
@@ -375,23 +380,24 @@ DECLS(sse2);
     se = ROUND_POWER_OF_TWO(se, 4);                                            \
     sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
     *sse_ptr = sse;                                                            \
-    return sse - ((cast se * se) >> (wlog2 + hlog2));                          \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
+    return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));
+#define FNS(opt)                       \
+  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
+  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt, (int64_t))
 
-FNS(sse2);
+FNS(sse2)
 
 #undef FNS
 #undef FN
@@ -400,79 +406,80 @@ FNS(sse2);
 #define DECL(w, opt)                                                         \
   int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
       const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
+      const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second,     \
+      ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \
       void *unused);
 #define DECLS(opt1) \
   DECL(16, opt1)    \
   DECL(8, opt1)
 
-DECLS(sse2);
+DECLS(sse2)
 #undef DECL
 #undef DECLS
 
 #define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
   uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
+      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
       const uint8_t *sec8) {                                                   \
     uint32_t sse;                                                              \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
     int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+        src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
         NULL, NULL);                                                           \
     if (w > wf) {                                                              \
       uint32_t sse2;                                                           \
       int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
+          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride,      \
           sec + 16, w, h, &sse2, NULL, NULL);                                  \
       se += se2;                                                               \
       sse += sse2;                                                             \
       if (w > wf * 2) {                                                        \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
+            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride,    \
             sec + 32, w, h, &sse2, NULL, NULL);                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
+            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride,    \
             sec + 48, w, h, &sse2, NULL, NULL);                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
       }                                                                        \
     }                                                                          \
     *sse_ptr = sse;                                                            \
-    return sse - ((cast se * se) >> (wlog2 + hlog2));                          \
+    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
   }                                                                            \
                                                                                \
   uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
+      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
       const uint8_t *sec8) {                                                   \
+    int64_t var;                                                               \
     uint32_t sse;                                                              \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
     int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+        src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \
         NULL, NULL);                                                           \
     if (w > wf) {                                                              \
       uint32_t sse2;                                                           \
       int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
+          src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride,      \
           sec + 16, w, h, &sse2, NULL, NULL);                                  \
       se += se2;                                                               \
       sse += sse2;                                                             \
       if (w > wf * 2) {                                                        \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
+            src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride,    \
             sec + 32, w, h, &sse2, NULL, NULL);                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
+            src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride,    \
             sec + 48, w, h, &sse2, NULL, NULL);                                \
         se += se2;                                                             \
         sse += sse2;                                                           \
@@ -481,46 +488,48 @@ DECLS(sse2);
     se = ROUND_POWER_OF_TWO(se, 2);                                            \
     sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
     *sse_ptr = sse;                                                            \
-    return sse - ((cast se * se) >> (wlog2 + hlog2));                          \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
+    return (var >= 0) ? (uint32_t)var : 0;                                     \
   }                                                                            \
                                                                                \
   uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
+      const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr,                  \
       const uint8_t *sec8) {                                                   \
     int start_row;                                                             \
+    int64_t var;                                                               \
     uint32_t sse;                                                              \
     int se = 0;                                                                \
     uint64_t long_sse = 0;                                                     \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
     uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
     for (start_row = 0; start_row < h; start_row += 16) {                      \
       uint32_t sse2;                                                           \
       int height = h - start_row < 16 ? h - start_row : 16;                    \
       int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
           src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
+          ref + (start_row * ref_stride), ref_stride, sec + (start_row * w),   \
           w, height, &sse2, NULL, NULL);                                       \
       se += se2;                                                               \
       long_sse += sse2;                                                        \
       if (w > wf) {                                                            \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
             src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
+            y_offset, ref + 16 + (start_row * ref_stride), ref_stride,         \
             sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
         se += se2;                                                             \
         long_sse += sse2;                                                      \
         if (w > wf * 2) {                                                      \
           se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
               src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
+              y_offset, ref + 32 + (start_row * ref_stride), ref_stride,       \
               sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
           se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
               src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
+              y_offset, ref + 48 + (start_row * ref_stride), ref_stride,       \
               sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
           se += se2;                                                           \
           long_sse += sse2;                                                    \
@@ -530,23 +539,70 @@ DECLS(sse2);
     se = ROUND_POWER_OF_TWO(se, 4);                                            \
     sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
     *sse_ptr = sse;                                                            \
-    return sse - ((cast se * se) >> (wlog2 + hlog2));                          \
+    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
+    return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt1)                        \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (int64_t));  \
-  FN(8, 16, 8, 4, 3, opt1, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt1, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt1, (int64_t));
+#define FNS(opt1)                       \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (int64_t))  \
+  FN(8, 16, 8, 4, 3, opt1, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt1, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt1, (int64_t))
 
-FNS(sse2);
+FNS(sse2)
 
 #undef FNS
 #undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+                                   int width, int height, const uint16_t *ref,
+                                   int ref_stride) {
+  int i, j;
+  if (width > 8) {
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+      comp_pred += 8 << 1;
+      pred += 8 << 1;
+      ref += ref_stride << 1;
+    }
+  } else {
+    assert(width == 4);
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+      comp_pred += 4 << 1;
+      pred += 4 << 1;
+      ref += ref_stride << 1;
+    }
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
index c18095c287..61af6236ed 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm
@@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
   psrlq                m3, 8
   movd   [dstq+strideq  ], m3
   psrlq                m0, 56
-  movd              tempq, m0
+  movd              tempd, m0
   mov    [dstq+strideq+3], tempb
 
   RESTORE_GOT
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
new file mode 100644
index 0000000000..752435d240
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c
@@ -0,0 +1,626 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+
+#define PAIR256_SET_EPI16(a, b)                                            \
+  _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+                   (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in,
+                                  int stride) {
+  int i;
+  // Load 16x16 values
+  for (i = 0; i < 16; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride));
+    const __m128i in1 =
+        _mm_loadu_si128((const __m128i *)((input + i * stride) + 4));
+    const __m128i in2 =
+        _mm_loadu_si128((const __m128i *)((input + i * stride) + 8));
+    const __m128i in3 =
+        _mm_loadu_si128((const __m128i *)((input + i * stride) + 12));
+    const __m128i ls = _mm_packs_epi32(in0, in1);
+    const __m128i rs = _mm_packs_epi32(in2, in3);
+    in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1);
+#else
+    in[i] = _mm256_load_si256((const __m256i *)(input + i * stride));
+#endif
+  }
+}
+
+static INLINE __m256i dct_round_shift_avx2(__m256i in) {
+  const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING));
+  return _mm256_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) {
+  const __m256i t = _mm256_madd_epi16(*in, *cospi);
+  return dct_round_shift_avx2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1,
+                                             __m256i *x) {
+  const __m256i t0 = idct_madd_round_shift_avx2(in0, x);
+  const __m256i t1 = idct_madd_round_shift_avx2(in1, x);
+  return _mm256_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1,
+                               __m256i *out0, __m256i *out1) {
+  __m256i cst0 = PAIR256_SET_EPI16(c0, -c1);
+  __m256i cst1 = PAIR256_SET_EPI16(c1, c0);
+  __m256i lo = _mm256_unpacklo_epi16(in0, in1);
+  __m256i hi = _mm256_unpackhi_epi16(in0, in1);
+  *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0);
+  *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1);
+}
+
+static INLINE void idct16_16col(__m256i *in, __m256i *out) {
+  __m256i step1[16], step2[16];
+
+  // stage 2
+  butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+  butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+  butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+  step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm256_add_epi16(step2[10], step2[11]);
+  step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm256_add_epi16(step2[14], step2[15]);
+
+  // stage 4
+  butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+              &step2[14]);
+  butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+              &step2[10]);
+  step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+  step1[4] = _mm256_add_epi16(step1[4], step1[5]);
+  step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+  step1[7] = _mm256_add_epi16(step1[6], step1[7]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+  butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+              &step1[6]);
+  step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = _mm256_add_epi16(step1[0], step1[7]);
+  step2[1] = _mm256_add_epi16(step1[1], step1[6]);
+  step2[2] = _mm256_add_epi16(step1[2], step1[5]);
+  step2[3] = _mm256_add_epi16(step1[3], step1[4]);
+  step2[4] = _mm256_sub_epi16(step1[3], step1[4]);
+  step2[5] = _mm256_sub_epi16(step1[2], step1[5]);
+  step2[6] = _mm256_sub_epi16(step1[1], step1[6]);
+  step2[7] = _mm256_sub_epi16(step1[0], step1[7]);
+  butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+              &step2[13]);
+  butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+              &step2[12]);
+
+  // stage 7
+  out[0] = _mm256_add_epi16(step2[0], step1[15]);
+  out[1] = _mm256_add_epi16(step2[1], step1[14]);
+  out[2] = _mm256_add_epi16(step2[2], step2[13]);
+  out[3] = _mm256_add_epi16(step2[3], step2[12]);
+  out[4] = _mm256_add_epi16(step2[4], step2[11]);
+  out[5] = _mm256_add_epi16(step2[5], step2[10]);
+  out[6] = _mm256_add_epi16(step2[6], step1[9]);
+  out[7] = _mm256_add_epi16(step2[7], step1[8]);
+  out[8] = _mm256_sub_epi16(step2[7], step1[8]);
+  out[9] = _mm256_sub_epi16(step2[6], step1[9]);
+  out[10] = _mm256_sub_epi16(step2[5], step2[10]);
+  out[11] = _mm256_sub_epi16(step2[4], step2[11]);
+  out[12] = _mm256_sub_epi16(step2[3], step2[12]);
+  out[13] = _mm256_sub_epi16(step2[2], step2[13]);
+  out[14] = _mm256_sub_epi16(step2[1], step1[14]);
+  out[15] = _mm256_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest)));
+  d0 = _mm256_permute4x64_epi64(d0, 0xd8);
+  d0 = _mm256_unpacklo_epi8(d0, zero);
+  d0 = _mm256_add_epi16(in_x, d0);
+  d0 = _mm256_packus_epi16(
+      d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1)));
+
+  _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0));
+}
+
+static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) {
+  const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+  __m256i out;
+  out = _mm256_adds_epi16(in, final_rounding);
+  out = _mm256_srai_epi16(out, 6);
+  recon_and_store16(dest, out);
+}
+
+static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) {
+  const __m256i final_rounding = _mm256_set1_epi16(1 << 5);
+  int j = 0;
+  while (j < 32) {
+    in[j] = _mm256_adds_epi16(in[j], final_rounding);
+    in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding);
+
+    in[j] = _mm256_srai_epi16(in[j], 6);
+    in[j + 1] = _mm256_srai_epi16(in[j + 1], 6);
+
+    recon_and_store16(dst, in[j]);
+    dst += stride;
+    recon_and_store16(dst, in[j + 1]);
+    dst += stride;
+    j += 2;
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) {
+  int i;
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 2)   ==>  (0, 2)
+  //   (1, 3)   ==>  (1, 3)
+  //   (4, 6)   ==>  (4, 6)
+  //   (5, 7)   ==>  (5, 7)
+  for (i = 0; i < 2; i++) {
+    u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+    u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+    u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+    u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+  }
+
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 4)   ==>  (0, 1)
+  //   (1, 5)   ==>  (4, 5)
+  //   (2, 6)   ==>  (2, 3)
+  //   (3, 7)   ==>  (6, 7)
+  for (i = 0; i < 2; i++) {
+    out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+    out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+    out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+    out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+  }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) {
+  __m256i t[16];
+
+#define LOADL(idx)                                                            \
+  t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+  t[idx] = _mm256_inserti128_si256(                                           \
+      t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1);
+
+#define LOADR(idx)                                                           \
+  t[8 + (idx)] =                                                             \
+      _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+  t[8 + (idx)] = _mm256_inserti128_si256(                                    \
+      t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1);
+
+  // load left 8x16
+  LOADL(0)
+  LOADL(1)
+  LOADL(2)
+  LOADL(3)
+  LOADL(4)
+  LOADL(5)
+  LOADL(6)
+  LOADL(7)
+
+  // load right 8x16
+  LOADR(0)
+  LOADR(1)
+  LOADR(2)
+  LOADR(3)
+  LOADR(4)
+  LOADR(5)
+  LOADR(6)
+  LOADR(7)
+
+  // get the top 16x8 result
+  transpose2_8x8_avx2(t, out);
+  // get the bottom 16x8 result
+  transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  int i;
+  __m256i in[16];
+
+  // Load 16x16 values
+  idct_load16x16(input, in, 16);
+
+  transpose_16bit_16x16_avx2(in, in);
+  idct16_16col(in, in);
+
+  transpose_16bit_16x16_avx2(in, in);
+  idct16_16col(in, in);
+
+  for (i = 0; i < 16; ++i) {
+    write_buffer_16x1(dest + i * stride, in[i]);
+  }
+}
+
+// Only do addition and subtraction butterfly, size = 16, 32
+static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) {
+  int i = 0;
+  const int num = size >> 1;
+  const int bound = size - 1;
+  while (i < num) {
+    out[i] = _mm256_add_epi16(in[i], in[bound - i]);
+    out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]);
+    i++;
+  }
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) {
+  __m256i step1[8], step2[8];
+
+  // stage 3
+  butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+  // stage 4
+  butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  step2[4] = _mm256_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm256_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm256_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm256_add_epi16(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm256_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm256_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm256_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm256_sub_epi16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5],
+              &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm256_add_epi16(step1[0], step1[7]);
+  out[1] = _mm256_add_epi16(step1[1], step1[6]);
+  out[2] = _mm256_add_epi16(step1[2], step1[5]);
+  out[3] = _mm256_add_epi16(step1[3], step1[4]);
+  out[4] = _mm256_sub_epi16(step1[3], step1[4]);
+  out[5] = _mm256_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm256_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm256_sub_epi16(step1[0], step1[7]);
+}
+
+static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1,
+                                                       __m256i *out) {
+  __m256i step2[32];
+
+  // stage 4
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+              &step2[14]);
+  butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+              &step2[13]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[8] = _mm256_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm256_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm256_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm256_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm256_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm256_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm256_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm256_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  out[8] = step1[8];
+  out[9] = step1[9];
+  butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10],
+              &out[13]);
+  butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11],
+              &out[12]);
+  out[14] = step1[14];
+  out[15] = step1[15];
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) {
+  __m256i step1[16], step2[16];
+
+  // stage 2
+  butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+  butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+  butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  step1[8] = _mm256_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm256_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm256_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm256_add_epi16(step2[11], step2[10]);
+  step1[12] = _mm256_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm256_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm256_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm256_add_epi16(step2[15], step2[14]);
+
+  idct32_16x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1,
+                                                         __m256i *out) {
+  __m256i step2[32];
+
+  // stage 4
+  step2[16] = _mm256_add_epi16(step1[16], step1[19]);
+  step2[17] = _mm256_add_epi16(step1[17], step1[18]);
+  step2[18] = _mm256_sub_epi16(step1[17], step1[18]);
+  step2[19] = _mm256_sub_epi16(step1[16], step1[19]);
+  step2[20] = _mm256_sub_epi16(step1[23], step1[20]);
+  step2[21] = _mm256_sub_epi16(step1[22], step1[21]);
+  step2[22] = _mm256_add_epi16(step1[22], step1[21]);
+  step2[23] = _mm256_add_epi16(step1[23], step1[20]);
+
+  step2[24] = _mm256_add_epi16(step1[24], step1[27]);
+  step2[25] = _mm256_add_epi16(step1[25], step1[26]);
+  step2[26] = _mm256_sub_epi16(step1[25], step1[26]);
+  step2[27] = _mm256_sub_epi16(step1[24], step1[27]);
+  step2[28] = _mm256_sub_epi16(step1[31], step1[28]);
+  step2[29] = _mm256_sub_epi16(step1[30], step1[29]);
+  step2[30] = _mm256_add_epi16(step1[29], step1[30]);
+  step2[31] = _mm256_add_epi16(step1[28], step1[31]);
+
+  // stage 5
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+              &step1[29]);
+  butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+              &step1[28]);
+  butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+              &step1[27]);
+  butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+              &step1[26]);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  out[16] = _mm256_add_epi16(step1[16], step1[23]);
+  out[17] = _mm256_add_epi16(step1[17], step1[22]);
+  out[18] = _mm256_add_epi16(step1[18], step1[21]);
+  out[19] = _mm256_add_epi16(step1[19], step1[20]);
+  step2[20] = _mm256_sub_epi16(step1[19], step1[20]);
+  step2[21] = _mm256_sub_epi16(step1[18], step1[21]);
+  step2[22] = _mm256_sub_epi16(step1[17], step1[22]);
+  step2[23] = _mm256_sub_epi16(step1[16], step1[23]);
+
+  step2[24] = _mm256_sub_epi16(step1[31], step1[24]);
+  step2[25] = _mm256_sub_epi16(step1[30], step1[25]);
+  step2[26] = _mm256_sub_epi16(step1[29], step1[26]);
+  step2[27] = _mm256_sub_epi16(step1[28], step1[27]);
+  out[28] = _mm256_add_epi16(step1[27], step1[28]);
+  out[29] = _mm256_add_epi16(step1[26], step1[29]);
+  out[30] = _mm256_add_epi16(step1[25], step1[30]);
+  out[31] = _mm256_add_epi16(step1[24], step1[31]);
+
+  // stage 7
+  butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20],
+              &out[27]);
+  butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21],
+              &out[26]);
+  butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22],
+              &out[25]);
+  butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23],
+              &out[24]);
+}
+
+static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) {
+  __m256i temp[16];
+
+  // For each 16x32 block __m256i in[32],
+  // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+  // output pixels: 0-7 in __m256i out[32]
+  idct32_1024_16x32_quarter_1(in, temp);
+
+  // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+  // output pixels: 8-15 in __m256i out[32]
+  idct32_1024_16x32_quarter_2(in, temp);
+
+  // stage 7
+  add_sub_butterfly_avx2(temp, out, 16);
+}
+
+// For each 16x32 block __m256i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m256i out[32]
+static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) {
+  __m256i step1[32], step2[32];
+
+  // stage 1
+  butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+  butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+  butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+  butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+  butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+  butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+  butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+  butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+  // stage 2
+  step2[16] = _mm256_add_epi16(step1[16], step1[17]);
+  step2[17] = _mm256_sub_epi16(step1[16], step1[17]);
+  step2[18] = _mm256_sub_epi16(step1[19], step1[18]);
+  step2[19] = _mm256_add_epi16(step1[19], step1[18]);
+  step2[20] = _mm256_add_epi16(step1[20], step1[21]);
+  step2[21] = _mm256_sub_epi16(step1[20], step1[21]);
+  step2[22] = _mm256_sub_epi16(step1[23], step1[22]);
+  step2[23] = _mm256_add_epi16(step1[23], step1[22]);
+
+  step2[24] = _mm256_add_epi16(step1[24], step1[25]);
+  step2[25] = _mm256_sub_epi16(step1[24], step1[25]);
+  step2[26] = _mm256_sub_epi16(step1[27], step1[26]);
+  step2[27] = _mm256_add_epi16(step1[27], step1[26]);
+  step2[28] = _mm256_add_epi16(step1[28], step1[29]);
+  step2[29] = _mm256_sub_epi16(step1[28], step1[29]);
+  step2[30] = _mm256_sub_epi16(step1[31], step1[30]);
+  step2[31] = _mm256_add_epi16(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+              &step1[30]);
+  butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+              &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+              &step1[26]);
+  butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+              &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  idct32_16x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) {
+  __m256i temp[32];
+
+  // For each 16x32 block __m256i in[32],
+  // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+  // output pixels: 0-7 in __m256i out[32]
+  // AND
+  // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+  // output pixels: 8-15 in __m256i out[32]
+  idct32_1024_16x32_quarter_1_2(in, temp);
+
+  // For each 16x32 block __m256i in[32],
+  // Input with odd index,
+  // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+  // output pixels: 16-23, 24-31 in __m256i out[32]
+  idct32_1024_16x32_quarter_3_4(in, temp);
+
+  // final stage
+  add_sub_butterfly_avx2(temp, out, 32);
+}
+
+void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {
+  __m256i l[32], r[32], out[32], *in;
+  int i;
+
+  in = l;
+
+  for (i = 0; i < 2; i++) {
+    idct_load16x16(input, in, 32);
+    transpose_16bit_16x16_avx2(in, in);
+
+    idct_load16x16(input + 16, in + 16, 32);
+    transpose_16bit_16x16_avx2(in + 16, in + 16);
+    idct32_1024_16x32(in, in);
+
+    in = r;
+    input += 32 << 4;
+  }
+
+  for (i = 0; i < 32; i += 16) {
+    transpose_16bit_16x16_avx2(l + i, out);
+    transpose_16bit_16x16_avx2(r + i, out + 16);
+    idct32_1024_16x32(out, out);
+
+    store_buffer_16x32(out, dest, stride);
+    dest += 16;
+  }
+}
+
+// Case when only upper-left 16x16 has non-zero coeff
+void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  __m256i in[32], io[32], out[32];
+  int i;
+
+  for (i = 16; i < 32; i++) {
+    in[i] = _mm256_setzero_si256();
+  }
+
+  // rows
+  idct_load16x16(input, in, 32);
+  transpose_16bit_16x16_avx2(in, in);
+  idct32_1024_16x32(in, io);
+
+  // columns
+  for (i = 0; i < 32; i += 16) {
+    transpose_16bit_16x16_avx2(io + i, in);
+    idct32_1024_16x32(in, out);
+
+    store_buffer_16x32(out, dest, stride);
+    dest += 16;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index 487a474a67..f42b3df849 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -8,169 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-#define RECON_AND_STORE4X4(dest, in_x)                    \
-  {                                                       \
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                     \
-    d0 = _mm_add_epi16(in_x, d0);                         \
-    d0 = _mm_packus_epi16(d0, d0);                        \
-    *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
-  }
-
-void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = load_input_data(input);
-  input2 = load_input_data(input + 8);
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch column2, column 3, and then, we got:
-  // input2: column1, column 0;  input3: column2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
+static INLINE void transpose_16bit_4(__m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
 
@@ -178,35 +23,75 @@ static INLINE void transpose_4x4(__m128i *res) {
   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
 }
 
-void idct4_sse2(__m128i *in) {
+void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  const __m128i eight = _mm_set1_epi16(8);
+  __m128i in[2];
+
+  // Rows
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8);
+  idct4_sse2(in);
+
+  // Columns
+  idct4_sse2(in);
+
+  // Final round and shift
+  in[0] = _mm_add_epi16(in[0], eight);
+  in[1] = _mm_add_epi16(in[1], eight);
+  in[0] = _mm_srai_epi16(in[0], 4);
+  in[1] = _mm_srai_epi16(in[1], 4);
+
+  recon_and_store4x4_sse2(in, dest, stride);
+}
+
+void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  int a;
+  __m128i dc_value, d[2];
+
+  a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 4);
+
+  dc_value = _mm_set1_epi16(a);
+
+  // Reconstruction and Store
+  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+  d[0] = _mm_unpacklo_epi32(d[0],
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+  d[1] = _mm_unpacklo_epi32(
+      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+  d[0] = _mm_unpacklo_epi8(d[0], zero);
+  d[1] = _mm_unpacklo_epi8(d[1], zero);
+  d[0] = _mm_add_epi16(d[0], dc_value);
+  d[1] = _mm_add_epi16(d[1], dc_value);
+  d[0] = _mm_packus_epi16(d[0], d[1]);
+
+  *(int *)dest = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
+void idct4_sse2(__m128i *const in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
+  __m128i u[2];
 
-  transpose_4x4(in);
+  transpose_16bit_4(in);
   // stage 1
   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
+  u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
+  u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
 
   // stage 2
   in[0] = _mm_add_epi16(u[0], u[1]);
@@ -214,358 +99,137 @@ void idct4_sse2(__m128i *in) {
   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
-void iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
+void iadst4_sse2(__m128i *const in) {
+  const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9);
+  const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9);
+  const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9);
+  const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
+  const __m128i k__sinpi_12_n3 =
+      pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9);
+  __m128i u[4], v[5];
 
-  transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
+  // 00 01 20 21  02 03 22 23
+  // 10 11 30 31  12 13 32 33
+  const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]);
 
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
+  // 00 01 10 11  20 21 30 31
+  // 02 03 12 13  22 23 32 33
+  in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
 
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
+  v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3);    // s_1 * x0 + s_3 * x1
+  v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2);    // s_4 * x2 + s_2 * x3
+  v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3);    // s_2 * x0 + s_3 * x1
+  v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4);    // s_1 * x2 + s_4 * x3
+  v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3);  // (s_1 + s_2) * x0 - s_3 * x1
+  in[0] = _mm_sub_epi16(in[0], in[1]);           // x0 - x2
+  in[1] = _mm_srli_epi32(in[1], 16);
+  in[0] = _mm_add_epi16(in[0], in[1]);
+  in[0] = _mm_slli_epi32(in[0], 16);  // x0 - x2 + x3
 
   u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
+  u[1] = _mm_sub_epi32(v[2], v[3]);
+  u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3);
+  u[3] = _mm_sub_epi32(v[1], v[3]);
+  u[3] = _mm_add_epi32(u[3], v[4]);
 
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[0] = dct_const_round_shift_sse2(u[0]);
+  u[1] = dct_const_round_shift_sse2(u[1]);
+  u[2] = dct_const_round_shift_sse2(u[2]);
+  u[3] = dct_const_round_shift_sse2(u[3]);
 
   in[0] = _mm_packs_epi32(u[0], u[1]);
   in[1] = _mm_packs_epi32(u[2], u[3]);
 }
 
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
-  }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
-  {                                                                      \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0);                \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3);                \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2);                \
-                                                                         \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);              \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);              \
-                                                                         \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                             \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                             \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                             \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                             \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                                      \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
-  }
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
-                               res0, res1, res2, res3)                         \
-  {                                                                            \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
-    tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
-    tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
-    tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
-    tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
-                                                                               \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
-    res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
-    res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
-  }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {                                                                  \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
-                                                                     \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
-                                                                     \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
-                                                                     \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
-              out4, out5, out6, out7)                                         \
-  {                                                                           \
-    /* Stage1 */                                                              \
-    {                                                                         \
-      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
-      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
-      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
-      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
-                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
-    }                                                                         \
-                                                                              \
-    /* Stage2 */                                                              \
-    {                                                                         \
-      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
-      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
-      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
-      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
-                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
-                                                                              \
-      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
-      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
-      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
-      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
-    }                                                                         \
-                                                                              \
-    /* Stage3 */                                                              \
-    {                                                                         \
-      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
-      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
-                                                                              \
-      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
-      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
-      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
-      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
-                                                                              \
-      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
-      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
-      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
-      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
-                                                                              \
-      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
-      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
-      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
-      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
-                                                                              \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
-                                                                              \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
-    }                                                                         \
-                                                                              \
-    /* Stage4  */                                                             \
-    out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \
-    out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
-    out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
-    out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \
-    out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \
-    out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
-    out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
-    out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \
-  }
+static INLINE void load_buffer_8x8(const tran_low_t *const input,
+                                   __m128i *const in) {
+  in[0] = load_input_data8(input + 0 * 8);
+  in[1] = load_input_data8(input + 1 * 8);
+  in[2] = load_input_data8(input + 2 * 8);
+  in[3] = load_input_data8(input + 3 * 8);
+  in[4] = load_input_data8(input + 4 * 8);
+  in[5] = load_input_data8(input + 5 * 8);
+  in[6] = load_input_data8(input + 6 * 8);
+  in[7] = load_input_data8(input + 7 * 8);
+}
 
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in[8];
   int i;
 
   // Load input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-  in4 = load_input_data(input + 8 * 4);
-  in5 = load_input_data(input + 8 * 5);
-  in6 = load_input_data(input + 8 * 6);
-  in7 = load_input_data(input + 8 * 7);
+  load_buffer_8x8(input, in);
 
   // 2-D
   for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
-          in6, in7);
+    vpx_idct8_sse2(in);
   }
 
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
+  write_buffer_8x8(in, dest, stride);
+}
 
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  __m128i io[8];
 
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
+  io[0] = load_input_data4(input + 0 * 8);
+  io[1] = load_input_data4(input + 1 * 8);
+  io[2] = load_input_data4(input + 2 * 8);
+  io[3] = load_input_data4(input + 3 * 8);
+
+  idct8x8_12_add_kernel_sse2(io);
+  write_buffer_8x8(io, dest, stride);
+}
+
+static INLINE void recon_and_store_8_dual(uint8_t *const dest,
+                                          const __m128i in_x,
+                                          const int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d0, d1;
+
+  d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
+  d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
+  d0 = _mm_unpacklo_epi8(d0, zero);
+  d1 = _mm_unpacklo_epi8(d1, zero);
+  d0 = _mm_add_epi16(in_x, d0);
+  d1 = _mm_add_epi16(in_x, d1);
+  d0 = _mm_packus_epi16(d0, d1);
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
+  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
 }
 
 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
   __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
+  tran_high_t a1;
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  dc_value = _mm_set1_epi16((int16_t)a1);
 
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest + 0 * stride, dc_value);
-  RECON_AND_STORE(dest + 1 * stride, dc_value);
-  RECON_AND_STORE(dest + 2 * stride, dc_value);
-  RECON_AND_STORE(dest + 3 * stride, dc_value);
-  RECON_AND_STORE(dest + 4 * stride, dc_value);
-  RECON_AND_STORE(dest + 5 * stride, dc_value);
-  RECON_AND_STORE(dest + 6 * stride, dc_value);
-  RECON_AND_STORE(dest + 7 * stride, dc_value);
+  recon_and_store_8_dual(dest, dc_value, stride);
+  dest += 2 * stride;
+  recon_and_store_8_dual(dest, dc_value, stride);
+  dest += 2 * stride;
+  recon_and_store_8_dual(dest, dc_value, stride);
+  dest += 2 * stride;
+  recon_and_store_8_dual(dest, dc_value, stride);
 }
 
-void idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
+void vpx_idct8_sse2(__m128i *const in) {
   // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
-                in1, in2, in3, in4, in5, in6, in7);
+  transpose_16bit_8x8(in, in);
 
   // 4-stage 1D idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
-        in[4], in[5], in[6], in[7]);
+  idct8(in, in);
 }
 
-void iadst8_sse2(__m128i *in) {
+void iadst8_sse2(__m128i *const in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -578,736 +242,279 @@ void iadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i s[8], u[16], v[8], w[16];
 
   // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
+  transpose_16bit_8x8(in, in);
 
   // column transformation
   // stage 1
   // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
+  s[0] = _mm_unpacklo_epi16(in[7], in[0]);
+  s[1] = _mm_unpackhi_epi16(in[7], in[0]);
+  s[2] = _mm_unpacklo_epi16(in[5], in[2]);
+  s[3] = _mm_unpackhi_epi16(in[5], in[2]);
+  s[4] = _mm_unpacklo_epi16(in[3], in[4]);
+  s[5] = _mm_unpackhi_epi16(in[3], in[4]);
+  s[6] = _mm_unpacklo_epi16(in[1], in[6]);
+  s[7] = _mm_unpackhi_epi16(in[1], in[6]);
 
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+  u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30);
+  u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30);
+  u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02);
+  u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02);
+  u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22);
+  u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22);
+  u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10);
+  u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10);
+  u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14);
+  u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14);
+  u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18);
+  u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18);
+  u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06);
+  u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06);
+  u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26);
+  u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26);
 
   // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
+  w[0] = _mm_add_epi32(u[0], u[8]);
+  w[1] = _mm_add_epi32(u[1], u[9]);
+  w[2] = _mm_add_epi32(u[2], u[10]);
+  w[3] = _mm_add_epi32(u[3], u[11]);
+  w[4] = _mm_add_epi32(u[4], u[12]);
+  w[5] = _mm_add_epi32(u[5], u[13]);
+  w[6] = _mm_add_epi32(u[6], u[14]);
+  w[7] = _mm_add_epi32(u[7], u[15]);
+  w[8] = _mm_sub_epi32(u[0], u[8]);
+  w[9] = _mm_sub_epi32(u[1], u[9]);
+  w[10] = _mm_sub_epi32(u[2], u[10]);
+  w[11] = _mm_sub_epi32(u[3], u[11]);
+  w[12] = _mm_sub_epi32(u[4], u[12]);
+  w[13] = _mm_sub_epi32(u[5], u[13]);
+  w[14] = _mm_sub_epi32(u[6], u[14]);
+  w[15] = _mm_sub_epi32(u[7], u[15]);
 
   // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+  u[0] = dct_const_round_shift_sse2(w[0]);
+  u[1] = dct_const_round_shift_sse2(w[1]);
+  u[2] = dct_const_round_shift_sse2(w[2]);
+  u[3] = dct_const_round_shift_sse2(w[3]);
+  u[4] = dct_const_round_shift_sse2(w[4]);
+  u[5] = dct_const_round_shift_sse2(w[5]);
+  u[6] = dct_const_round_shift_sse2(w[6]);
+  u[7] = dct_const_round_shift_sse2(w[7]);
+  u[8] = dct_const_round_shift_sse2(w[8]);
+  u[9] = dct_const_round_shift_sse2(w[9]);
+  u[10] = dct_const_round_shift_sse2(w[10]);
+  u[11] = dct_const_round_shift_sse2(w[11]);
+  u[12] = dct_const_round_shift_sse2(w[12]);
+  u[13] = dct_const_round_shift_sse2(w[13]);
+  u[14] = dct_const_round_shift_sse2(w[14]);
+  u[15] = dct_const_round_shift_sse2(w[15]);
 
   // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+  in[2] = _mm_packs_epi32(u[4], u[5]);
+  in[3] = _mm_packs_epi32(u[6], u[7]);
+  in[4] = _mm_packs_epi32(u[8], u[9]);
+  in[5] = _mm_packs_epi32(u[10], u[11]);
+  in[6] = _mm_packs_epi32(u[12], u[13]);
+  in[7] = _mm_packs_epi32(u[14], u[15]);
 
   // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+  s[0] = _mm_add_epi16(in[0], in[2]);
+  s[1] = _mm_add_epi16(in[1], in[3]);
+  s[2] = _mm_sub_epi16(in[0], in[2]);
+  s[3] = _mm_sub_epi16(in[1], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[4], in[5]);
+  u[1] = _mm_unpackhi_epi16(in[4], in[5]);
+  u[2] = _mm_unpacklo_epi16(in[6], in[7]);
+  u[3] = _mm_unpackhi_epi16(in[6], in[7]);
 
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
 
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
+  w[0] = _mm_add_epi32(v[0], v[4]);
+  w[1] = _mm_add_epi32(v[1], v[5]);
+  w[2] = _mm_add_epi32(v[2], v[6]);
+  w[3] = _mm_add_epi32(v[3], v[7]);
+  w[4] = _mm_sub_epi32(v[0], v[4]);
+  w[5] = _mm_sub_epi32(v[1], v[5]);
+  w[6] = _mm_sub_epi32(v[2], v[6]);
+  w[7] = _mm_sub_epi32(v[3], v[7]);
 
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u[0] = dct_const_round_shift_sse2(w[0]);
+  u[1] = dct_const_round_shift_sse2(w[1]);
+  u[2] = dct_const_round_shift_sse2(w[2]);
+  u[3] = dct_const_round_shift_sse2(w[3]);
+  u[4] = dct_const_round_shift_sse2(w[4]);
+  u[5] = dct_const_round_shift_sse2(w[5]);
+  u[6] = dct_const_round_shift_sse2(w[6]);
+  u[7] = dct_const_round_shift_sse2(w[7]);
 
   // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
+  s[4] = _mm_packs_epi32(u[0], u[1]);
+  s[5] = _mm_packs_epi32(u[2], u[3]);
+  s[6] = _mm_packs_epi32(u[4], u[5]);
+  s[7] = _mm_packs_epi32(u[6], u[7]);
 
   // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
 
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+  s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+  s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+  s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+  s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16);
 
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[4]);
+  in[2] = s[6];
+  in[3] = _mm_sub_epi16(kZero, s[2]);
+  in[4] = s[3];
+  in[5] = _mm_sub_epi16(kZero, s[7]);
+  in[6] = s[5];
+  in[7] = _mm_sub_epi16(kZero, s[1]);
 }
 
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = load_input_data(input);
-  in1 = load_input_data(input + 8 * 1);
-  in2 = load_input_data(input + 8 * 2);
-  in3 = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_add_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_add_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
-        in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
+static INLINE void idct16_load8x8(const tran_low_t *const input,
+                                  __m128i *const in) {
+  in[0] = load_input_data8(input + 0 * 16);
+  in[1] = load_input_data8(input + 1 * 16);
+  in[2] = load_input_data8(input + 2 * 16);
+  in[3] = load_input_data8(input + 3 * 16);
+  in[4] = load_input_data8(input + 4 * 16);
+  in[5] = load_input_data8(input + 5 * 16);
+  in[6] = load_input_data8(input + 6 * 16);
+  in[7] = load_input_data8(input + 7 * 16);
 }
 
-#define IDCT16                                                                 \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
-                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
-                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
-                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
-                                                                               \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-                                                                               \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
-#define IDCT16_10                                                              \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
-                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
-                           stp1_12_0)                                          \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
-                                                                               \
-    stp1_9 = stp1_8_0;                                                         \
-    stp1_10 = stp1_11;                                                         \
-                                                                               \
-    stp1_13 = stp1_12_0;                                                       \
-    stp1_14 = stp1_15;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
-    stp2_5 = stp2_4;                                                           \
-    stp2_6 = stp2_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_2 = stp1_1;                                                           \
-    stp1_3 = stp1_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i l[16], r[16], out[16], *in;
   int i;
 
-  curr1 = l;
+  in = l;
   for (i = 0; i < 2; i++) {
-    // 1-D idct
-
-    // Load input data.
-    in[0] = load_input_data(input);
-    in[8] = load_input_data(input + 8 * 1);
-    in[1] = load_input_data(input + 8 * 2);
-    in[9] = load_input_data(input + 8 * 3);
-    in[2] = load_input_data(input + 8 * 4);
-    in[10] = load_input_data(input + 8 * 5);
-    in[3] = load_input_data(input + 8 * 6);
-    in[11] = load_input_data(input + 8 * 7);
-    in[4] = load_input_data(input + 8 * 8);
-    in[12] = load_input_data(input + 8 * 9);
-    in[5] = load_input_data(input + 8 * 10);
-    in[13] = load_input_data(input + 8 * 11);
-    in[6] = load_input_data(input + 8 * 12);
-    in[14] = load_input_data(input + 8 * 13);
-    in[7] = load_input_data(input + 8 * 14);
-    in[15] = load_input_data(input + 8 * 15);
-
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
+    idct16_load8x8(input, in);
+    transpose_16bit_8x8(in, in);
+    idct16_load8x8(input + 8, in + 8);
+    transpose_16bit_8x8(in + 8, in + 8);
+    idct16_8col(in, in);
+    in = r;
     input += 128;
   }
-  for (i = 0; i < 2; i++) {
+
+  for (i = 0; i < 16; i += 8) {
     int j;
-    // 1-D idct
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // 2-D
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+    transpose_16bit_8x8(l + i, out);
+    transpose_16bit_8x8(r + i, out + 8);
+    idct16_8col(out, out);
 
     for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
+      write_buffer_8x1(dest + j * stride, out[j]);
     }
 
     dest += 8;
   }
 }
 
+void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  __m128i in[16], temp[16], out[16];
+  int i;
+
+  idct16_load8x8(input, in);
+  transpose_16bit_8x8(in, in);
+
+  for (i = 8; i < 16; i++) {
+    in[i] = _mm_setzero_si128();
+  }
+  idct16_8col(in, temp);
+
+  for (i = 0; i < 16; i += 8) {
+    int j;
+    transpose_16bit_8x8(temp + i, in);
+    idct16_8col(in, out);
+
+    for (j = 0; j < 16; ++j) {
+      write_buffer_8x1(dest + j * stride, out[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride) {
+  __m128i in[16], l[16];
+  int i;
+
+  // First 1-D inverse DCT
+  // Load input data.
+  in[0] = load_input_data4(input + 0 * 16);
+  in[1] = load_input_data4(input + 1 * 16);
+  in[2] = load_input_data4(input + 2 * 16);
+  in[3] = load_input_data4(input + 3 * 16);
+
+  idct16x16_10_pass1(in, l);
+
+  // Second 1-D inverse transform, performed per 8x16 block
+  for (i = 0; i < 16; i += 8) {
+    int j;
+    idct16x16_10_pass2(l + i, in);
+
+    for (j = 0; j < 16; ++j) {
+      write_buffer_8x1(dest + j * stride, in[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d0, d1;
+
+  d0 = _mm_load_si128((__m128i *)(dest));
+  d1 = _mm_unpackhi_epi8(d0, zero);
+  d0 = _mm_unpacklo_epi8(d0, zero);
+  d0 = _mm_add_epi16(in_x, d0);
+  d1 = _mm_add_epi16(in_x, d1);
+  d0 = _mm_packus_epi16(d0, d1);
+  _mm_store_si128((__m128i *)(dest), d0);
+}
+
 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
   __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
+  int i;
+  tran_high_t a1;
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  dc_value = _mm_set1_epi16((int16_t)a1);
 
   for (i = 0; i < 16; ++i) {
-    RECON_AND_STORE(dest + 0, dc_value);
-    RECON_AND_STORE(dest + 8, dc_value);
+    recon_and_store_16(dest, dc_value);
     dest += stride;
   }
 }
 
-static void iadst16_8col(__m128i *in) {
+void vpx_iadst16_8col_sse2(__m128i *const in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1335,12 +542,11 @@ static void iadst16_8col(__m128i *in) {
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i kZero = _mm_setzero_si128();
 
   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
@@ -1425,71 +631,38 @@ static void iadst16_8col(__m128i *in) {
   u[30] = _mm_sub_epi32(v[14], v[30]);
   u[31] = _mm_sub_epi32(v[15], v[31]);
 
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+  u[0] = dct_const_round_shift_sse2(u[0]);
+  u[1] = dct_const_round_shift_sse2(u[1]);
+  u[2] = dct_const_round_shift_sse2(u[2]);
+  u[3] = dct_const_round_shift_sse2(u[3]);
+  u[4] = dct_const_round_shift_sse2(u[4]);
+  u[5] = dct_const_round_shift_sse2(u[5]);
+  u[6] = dct_const_round_shift_sse2(u[6]);
+  u[7] = dct_const_round_shift_sse2(u[7]);
+  u[8] = dct_const_round_shift_sse2(u[8]);
+  u[9] = dct_const_round_shift_sse2(u[9]);
+  u[10] = dct_const_round_shift_sse2(u[10]);
+  u[11] = dct_const_round_shift_sse2(u[11]);
+  u[12] = dct_const_round_shift_sse2(u[12]);
+  u[13] = dct_const_round_shift_sse2(u[13]);
+  u[14] = dct_const_round_shift_sse2(u[14]);
+  u[15] = dct_const_round_shift_sse2(u[15]);
+  u[16] = dct_const_round_shift_sse2(u[16]);
+  u[17] = dct_const_round_shift_sse2(u[17]);
+  u[18] = dct_const_round_shift_sse2(u[18]);
+  u[19] = dct_const_round_shift_sse2(u[19]);
+  u[20] = dct_const_round_shift_sse2(u[20]);
+  u[21] = dct_const_round_shift_sse2(u[21]);
+  u[22] = dct_const_round_shift_sse2(u[22]);
+  u[23] = dct_const_round_shift_sse2(u[23]);
+  u[24] = dct_const_round_shift_sse2(u[24]);
+  u[25] = dct_const_round_shift_sse2(u[25]);
+  u[26] = dct_const_round_shift_sse2(u[26]);
+  u[27] = dct_const_round_shift_sse2(u[27]);
+  u[28] = dct_const_round_shift_sse2(u[28]);
+  u[29] = dct_const_round_shift_sse2(u[29]);
+  u[30] = dct_const_round_shift_sse2(u[30]);
+  u[31] = dct_const_round_shift_sse2(u[31]);
 
   s[0] = _mm_packs_epi32(u[0], u[1]);
   s[1] = _mm_packs_epi32(u[2], u[3]);
@@ -1552,39 +725,22 @@ static void iadst16_8col(__m128i *in) {
   u[14] = _mm_sub_epi32(v[6], v[14]);
   u[15] = _mm_sub_epi32(v[7], v[15]);
 
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[0] = dct_const_round_shift_sse2(u[0]);
+  u[1] = dct_const_round_shift_sse2(u[1]);
+  u[2] = dct_const_round_shift_sse2(u[2]);
+  u[3] = dct_const_round_shift_sse2(u[3]);
+  u[4] = dct_const_round_shift_sse2(u[4]);
+  u[5] = dct_const_round_shift_sse2(u[5]);
+  u[6] = dct_const_round_shift_sse2(u[6]);
+  u[7] = dct_const_round_shift_sse2(u[7]);
+  u[8] = dct_const_round_shift_sse2(u[8]);
+  u[9] = dct_const_round_shift_sse2(u[9]);
+  u[10] = dct_const_round_shift_sse2(u[10]);
+  u[11] = dct_const_round_shift_sse2(u[11]);
+  u[12] = dct_const_round_shift_sse2(u[12]);
+  u[13] = dct_const_round_shift_sse2(u[13]);
+  u[14] = dct_const_round_shift_sse2(u[14]);
+  u[15] = dct_const_round_shift_sse2(u[15]);
 
   x[0] = _mm_add_epi16(s[0], s[4]);
   x[1] = _mm_add_epi16(s[1], s[5]);
@@ -1647,39 +803,22 @@ static void iadst16_8col(__m128i *in) {
   u[14] = _mm_sub_epi32(v[10], v[14]);
   u[15] = _mm_sub_epi32(v[11], v[15]);
 
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  v[0] = dct_const_round_shift_sse2(u[0]);
+  v[1] = dct_const_round_shift_sse2(u[1]);
+  v[2] = dct_const_round_shift_sse2(u[2]);
+  v[3] = dct_const_round_shift_sse2(u[3]);
+  v[4] = dct_const_round_shift_sse2(u[4]);
+  v[5] = dct_const_round_shift_sse2(u[5]);
+  v[6] = dct_const_round_shift_sse2(u[6]);
+  v[7] = dct_const_round_shift_sse2(u[7]);
+  v[8] = dct_const_round_shift_sse2(u[8]);
+  v[9] = dct_const_round_shift_sse2(u[9]);
+  v[10] = dct_const_round_shift_sse2(u[10]);
+  v[11] = dct_const_round_shift_sse2(u[11]);
+  v[12] = dct_const_round_shift_sse2(u[12]);
+  v[13] = dct_const_round_shift_sse2(u[13]);
+  v[14] = dct_const_round_shift_sse2(u[14]);
+  v[15] = dct_const_round_shift_sse2(u[15]);
 
   s[0] = _mm_add_epi16(x[0], x[2]);
   s[1] = _mm_add_epi16(x[1], x[3]);
@@ -1708,1718 +847,371 @@ static void iadst16_8col(__m128i *in) {
   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
 
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+  in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
+  in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+  in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+  in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+  in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
+  in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
+  in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
+  in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
 
   in[0] = s[0];
   in[1] = _mm_sub_epi16(kZero, s[8]);
   in[2] = s[12];
   in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
   in[12] = s[5];
   in[13] = _mm_sub_epi16(kZero, s[13]);
   in[14] = s[9];
   in[15] = _mm_sub_epi16(kZero, s[1]);
 }
 
-static void idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
+void idct16_sse2(__m128i *const in0, __m128i *const in1) {
+  transpose_16bit_16x16(in0, in1);
+  idct16_8col(in0, in0);
+  idct16_8col(in1, in1);
+}
 
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
+void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
+  transpose_16bit_16x16(in0, in1);
+  vpx_iadst16_8col_sse2(in0);
+  vpx_iadst16_8col_sse2(in1);
+}
 
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
 
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+                                            __m128i *const out /*out[8]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i step1[8], step2[8];
 
   // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
+  butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
 
   // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
+  step2[0] = butterfly_cospi16(in[0]);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
 
   // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
+  step1[0] = step2[0];
+  step1[1] = step2[0];
+  step1[2] = step2[0];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+  step1[7] = step2[7];
 
   // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
+  out[0] = _mm_add_epi16(step1[0], step1[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step1[4]);
+  out[4] = _mm_sub_epi16(step1[3], step1[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step1[7]);
 }
 
-void idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  idct16_8col(in0);
-  idct16_8col(in1);
-}
-
-void iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
-}
-
-void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+// For each 8x32 block __m128i in[32],
+// Input with index, 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+                                            __m128i *const out /*out[16]*/) {
   const __m128i zero = _mm_setzero_si128();
+  __m128i step1[16], step2[16];
 
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  // stage 2
+  butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
 
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  // stage 3
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
 
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
-      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
-      stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
+  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
 }
 
-#define LOAD_DQCOEFF(reg, input)  \
-  {                               \
-    reg = load_input_data(input); \
-    input += 8;                   \
-  }
+static INLINE void idct32_34_8x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  idct32_34_8x32_quarter_1(in, temp);
+  idct32_34_8x32_quarter_2(in, temp);
+  // stage 7
+  add_sub_butterfly(temp, out, 16);
+}
 
-#define IDCT32_34                                                              \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
-                                                                               \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
-                                                                               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
-                             stp1_31);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
-                             stp1_28);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
-                             stp1_27);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
-                             stp1_24);                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
-                             stp2_15);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
-                             stp2_12);                                         \
-                                                                               \
-    stp2_16 = stp1_16;                                                         \
-    stp2_19 = stp1_19;                                                         \
-                                                                               \
-    stp2_20 = stp1_20;                                                         \
-    stp2_23 = stp1_23;                                                         \
-                                                                               \
-    stp2_24 = stp1_24;                                                         \
-    stp2_27 = stp1_27;                                                         \
-                                                                               \
-    stp2_28 = stp1_28;                                                         \
-    stp2_31 = stp1_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
-                             stp1_7);                                          \
-                                                                               \
-    stp1_8 = stp2_8;                                                           \
-    stp1_11 = stp2_11;                                                         \
-    stp1_12 = stp2_12;                                                         \
-    stp1_15 = stp2_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
-                             stp2_1);                                          \
-                                                                               \
-    stp2_4 = stp1_4;                                                           \
-    stp2_5 = stp1_4;                                                           \
-    stp2_6 = stp1_7;                                                           \
-    stp2_7 = stp1_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = stp2_0;                                                           \
-    stp1_1 = stp2_1;                                                           \
-    stp1_2 = stp2_1;                                                           \
-    stp1_3 = stp2_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
+// For each 8x32 block __m128i in[32],
+// Input with odd index, 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i step1[32];
 
-#define IDCT32                                                                 \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);                 \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);                 \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);               \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);               \
-                                                                               \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);                 \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);                 \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);                 \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);                 \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);                 \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);                 \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);               \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);               \
-                                                                               \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);               \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);                 \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
-                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
-                           stp1_30)                                            \
-    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
-                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
-                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
-                           stp1_21, stp1_26)                                   \
-    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
-                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
-                           stp1_23, stp1_24)                                   \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);                 \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);                 \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);               \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);               \
-                                                                               \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);               \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);                 \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
-                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
-                           stp2_14)                                            \
-    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
-                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
-    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
-                                                                               \
-    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
-    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
-                                                                               \
-    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
-    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);                 \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);                 \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);               \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);               \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
-                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
-                           stp1_6)                                             \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);                 \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);                 \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);                 \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
+  // stage 1
+  butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+  butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+  butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+  butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+  // stage 3
+  butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+            &step1[30]);
+  butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+            &step1[29]);
+  butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+            &step1[26]);
+  butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+            &step1[25]);
+
+  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/,
+                         __m128i *const out /*out[32]*/) {
+  __m128i temp[32];
+
+  idct32_34_8x32_quarter_1_2(in, temp);
+  idct32_34_8x32_quarter_3_4(in, temp);
+  // final stage
+  add_sub_butterfly(temp, out, 32);
+}
 
 // Only upper-left 8x8 has non-zero coeff
 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i io[32], col[32];
   int i;
 
   // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
+  load_transpose_16bit_8x8(input, 32, io);
+  idct32_34_8x32_sse2(io, col);
 
-  array_transpose_8x8(in, in);
-  IDCT32_34
-
-  // 1_D: Store 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < 32; i += 8) {
     int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    IDCT32_34
-
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+    transpose_16bit_8x8(col + i, io);
+    idct32_34_8x32_sse2(io, io);
 
     for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
+      write_buffer_8x1(dest + j * stride, io[j]);
     }
 
     dest += 8;
   }
 }
 
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_1(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+  // stage 4
+  butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi16(step1[0], step1[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step1[4]);
+  out[4] = _mm_sub_epi16(step1[3], step1[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) {
+  __m128i step1[16], step2[16];
+
+  // stage 2
+  butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+  butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+  butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  step1[8] = _mm_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm_add_epi16(step2[11], step2[10]);
+  step1[12] = _mm_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_1024_8x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  idct32_1024_8x32_quarter_1(in, temp);
+  idct32_1024_8x32_quarter_2(in, temp);
+  // stage 7
+  add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_1024_8x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
+  butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
+  butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
+  butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
+
+  butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
+  butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);
+
+  butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
+  butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);
+
+  // stage 2
+  step2[16] = _mm_add_epi16(step1[16], step1[17]);
+  step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+  step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+  step2[19] = _mm_add_epi16(step1[19], step1[18]);
+  step2[20] = _mm_add_epi16(step1[20], step1[21]);
+  step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+  step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+  step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+  step2[24] = _mm_add_epi16(step1[24], step1[25]);
+  step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+  step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+  step2[27] = _mm_add_epi16(step1[27], step1[26]);
+  step2[28] = _mm_add_epi16(step1[28], step1[29]);
+  step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+  step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+  step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+            &step1[30]);
+  butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+            &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+            &step1[26]);
+  butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+            &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_1024_8x32(const __m128i *const in /*in[32]*/,
+                      __m128i *const out /*out[32]*/) {
+  __m128i temp[32];
+
+  idct32_1024_8x32_quarter_1_2(in, temp);
+  idct32_1024_8x32_quarter_3_4(in, temp);
+  // final stage
+  add_sub_butterfly(temp, out, 32);
+}
+
 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                  int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  // idct constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
+  __m128i col[4][32], io[32];
+  int i;
 
+  // rows
   for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D idct
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // checking if all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    IDCT32
-
-    // 1_D: Store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+    load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+    load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+    load_transpose_16bit_8x8(&input[16], 32, &io[16]);
+    load_transpose_16bit_8x8(&input[24], 32, &io[24]);
+    idct32_1024_8x32(io, col[i]);
+    input += 32 << 3;
   }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D idct
-    j = i << 3;
 
+  // columns
+  for (i = 0; i < 32; i += 8) {
     // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
+    transpose_16bit_8x8(col[0] + i, io);
+    transpose_16bit_8x8(col[1] + i, io + 8);
+    transpose_16bit_8x8(col[2] + i, io + 16);
+    transpose_16bit_8x8(col[3] + i, io + 24);
 
-    IDCT32
+    idct32_1024_8x32(io, io);
+    store_buffer_8x32(io, dest, stride);
+    dest += 8;
+  }
+}
 
-    // 2_D: Calculate the results and store them to destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  __m128i col[2][32], in[32], out[32];
+  int i;
 
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
+  for (i = 16; i < 32; i++) {
+    in[i] = _mm_setzero_si128();
+  }
 
+  // rows
+  for (i = 0; i < 2; i++) {
+    load_transpose_16bit_8x8(&input[0], 32, &in[0]);
+    load_transpose_16bit_8x8(&input[8], 32, &in[8]);
+    idct32_1024_8x32(in, col[i]);
+    input += 32 << 3;
+  }
+
+  // columns
+  for (i = 0; i < 32; i += 8) {
+    transpose_16bit_8x8(col[0] + i, in);
+    transpose_16bit_8x8(col[1] + i, in + 8);
+    idct32_1024_8x32(in, out);
+    store_buffer_8x32(out, dest, stride);
     dest += 8;
   }
 }
@@ -3427,611 +1219,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
   __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, j;
+  int j;
+  tran_high_t a1;
+  tran_low_t out =
+      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
 
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  dc_value = _mm_set1_epi16((int16_t)a1);
 
   for (j = 0; j < 32; ++j) {
-    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
-    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
+    recon_and_store_16(dest + j * stride + 0, dc_value);
+    recon_and_store_16(dest + j * stride + 16, dc_value);
   }
 }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
-  __m128i ubounded, retval;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  ubounded = _mm_cmpgt_epi16(value, max);
-  retval = _mm_andnot_si128(ubounded, value);
-  ubounded = _mm_and_si128(ubounded, max);
-  retval = _mm_or_si128(retval, ubounded);
-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
-  return retval;
-}
-
-void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  __m128i inptr[4];
-  __m128i sign_bits[2];
-  __m128i temp_mm, min_input, max_input;
-  int test;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int optimised_cols = 0;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(12043);
-  const __m128i min = _mm_set1_epi16(-12043);
-  // Load input into __m128i
-  inptr[0] = _mm_loadu_si128((const __m128i *)input);
-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
-  // Pack to 16 bits
-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp_mm = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp_mm);
-
-  if (!test) {
-    // Do the row transform
-    idct4_sse2(inptr);
-
-    // Check the min & max values
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp_mm = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp_mm);
-
-    if (test) {
-      transpose_4x4(inptr);
-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct4_c(input, outptr, bd);
-      input += 4;
-      outptr += 4;
-    }
-  }
-
-  if (optimised_cols) {
-    idct4_sse2(inptr);
-
-    // Final round and shift
-    inptr[0] = _mm_add_epi16(inptr[0], eight);
-    inptr[1] = _mm_add_epi16(inptr[1], eight);
-
-    inptr[0] = _mm_srai_epi16(inptr[0], 4);
-    inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
-    // Reconstruction and Store
-    {
-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-      d0 = _mm_unpacklo_epi64(
-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
-      d2 = _mm_unpacklo_epi64(
-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
-      // store input0
-      _mm_storel_epi64((__m128i *)dest, d0);
-      // store input1
-      d0 = _mm_srli_si128(d0, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride), d0);
-      // store input2
-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-      // store input3
-      d2 = _mm_srli_si128(d2, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[4], temp_out[4];
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-      vpx_highbd_idct4_c(temp_in, temp_out, bd);
-      for (j = 0; j < 4; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 8; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_8x8(inptr, inptr);
-      for (i = 0; i < 8; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 8; ++i) {
-      vpx_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      vpx_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // only first 4 row has non-zero coefs
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use fact only first 4 rows contain non-zero coeffs
-      array_transpose_4X8(inptr, inptr);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      vpx_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                       int stride, int bd) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 32; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 32; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_16x16(inptr, inptr + 16);
-      for (i = 0; i < 16; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 16; ++i) {
-      vpx_highbd_idct16_c(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], rounding);
-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
-        inptr[i] = _mm_srai_epi16(inptr[i], 6);
-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-      vpx_highbd_idct16_c(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                      int stride, int bd) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // Since all non-zero dct coefficients are in upper-left 4x4 area,
-  // we only need to consider first 4 rows here.
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform (N.B. This transposes inptr)
-    idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 16; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use fact only first 4 rows contain non-zero coeffs
-      array_transpose_8x8(inptr, inptr);
-      array_transpose_8x8(inptr + 8, inptr + 16);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct16_c(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], rounding);
-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
-        inptr[i] = _mm_srai_epi16(inptr[i], 6);
-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-      vpx_highbd_idct16_c(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                     int stride, int bd) {
-  __m128i dc_value, d;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  int a, i, j;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  tran_low_t out;
-
-  out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(out, 6);
-
-  d = _mm_set1_epi32(a);
-  dc_value = _mm_packs_epi32(d, d);
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 4; ++j) {
-      d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
-      d = _mm_adds_epi16(d, dc_value);
-      d = _mm_max_epi16(d, zero);
-      d = _mm_min_epi16(d, max);
-      _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
-    }
-    dest += stride;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index d762a04abc..b4bbd186d2 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -8,189 +8,703 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_
-#define VPX_DSP_X86_INV_TXFM_SSE2_H_
+#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
+#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in,
+                                                  __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 30 31 32 33  00 01 02 03
+  // in[1]: 20 21 22 23  10 11 12 13
+  // in[2]: 40 41 42 43  70 71 72 73
+  // in[3]: 50 51 52 53  60 61 62 63
+  // to:
+  // tr0_0: 00 10 01 11  02 12 03 13
+  // tr0_1: 20 30 21 31  22 32 23 33
+  // tr0_2: 40 50 41 51  42 52 43 53
+  // tr0_3: 60 70 61 71  62 72 63 73
+  const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]);
+  const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]);
 
+  // Unpack 32 bit elements resulting in:
+  // tr1_0: 00 10 20 30  01 11 21 31
+  // tr1_1: 02 12 22 32  03 13 23 33
+  // tr1_2: 40 50 60 70  41 51 61 71
+  // tr1_3: 42 52 62 72  43 53 63 73
   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
 
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
 }
 
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+  const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+  return _mm_srai_epi32(t, DCT_CONST_BITS);
 }
 
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+                                                 const __m128i cospi) {
+  const __m128i t = _mm_madd_epi16(in, cospi);
+  return dct_const_round_shift_sse2(t);
 }
 
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+                                             const __m128i in1,
+                                             const __m128i x) {
+  const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+  const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+  return _mm_packs_epi32(t0, t1);
+}
+
+// Multiply elements by constants and add them together.
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0,
+                             const int c1, __m128i *const out0,
+                             __m128i *const out1) {
+  const __m128i cst0 = pair_set_epi16(c0, -c1);
+  const __m128i cst1 = pair_set_epi16(c1, c0);
+  const __m128i lo = _mm_unpacklo_epi16(in0, in1);
+  const __m128i hi = _mm_unpackhi_epi16(in0, in1);
+  *out0 = idct_calc_wraplow_sse2(lo, hi, cst0);
+  *out1 = idct_calc_wraplow_sse2(lo, hi, cst1);
+}
+
+static INLINE __m128i butterfly_cospi16(const __m128i in) {
+  const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128());
+  const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128());
+  return idct_calc_wraplow_sse2(lo, hi, cst);
+}
+
+// Functions to allow 8 bit optimisations to be used when profile 0 is used with
 // highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
-                        data[6], data[7]);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i in = _mm_load_si128((const __m128i *)data);
+  return _mm_packs_epi32(in, zero);
+#else
+  return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i in0 = _mm_load_si128((const __m128i *)data);
+  const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
+  return _mm_packs_epi32(in0, in1);
 #else
   return _mm_load_si128((const __m128i *)data);
 #endif
 }
 
-static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
-
-  in[8] = load_input_data(input + 8 * 16);
-  in[9] = load_input_data(input + 9 * 16);
-  in[10] = load_input_data(input + 10 * 16);
-  in[11] = load_input_data(input + 11 * 16);
-  in[12] = load_input_data(input + 12 * 16);
-  in[13] = load_input_data(input + 13 * 16);
-  in[14] = load_input_data(input + 14 * 16);
-  in[15] = load_input_data(input + 15 * 16);
+static INLINE void load_transpose_16bit_8x8(const tran_low_t *input,
+                                            const int stride,
+                                            __m128i *const in) {
+  in[0] = load_input_data8(input + 0 * stride);
+  in[1] = load_input_data8(input + 1 * stride);
+  in[2] = load_input_data8(input + 2 * stride);
+  in[3] = load_input_data8(input + 3 * stride);
+  in[4] = load_input_data8(input + 4 * stride);
+  in[5] = load_input_data8(input + 5 * stride);
+  in[6] = load_input_data8(input + 6 * stride);
+  in[7] = load_input_data8(input + 7 * stride);
+  transpose_16bit_8x8(in, in);
 }
 
-#define RECON_AND_STORE(dest, in_x)                  \
-  {                                                  \
-    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                \
-    d0 = _mm_add_epi16(in_x, d0);                    \
-    d0 = _mm_packus_epi16(d0, d0);                   \
-    _mm_storel_epi64((__m128i *)(dest), d0);         \
+static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));
+  d0 = _mm_unpacklo_epi8(d0, zero);
+  d0 = _mm_add_epi16(in_x, d0);
+  d0 = _mm_packus_epi16(d0, d0);
+  _mm_storel_epi64((__m128i *)(dest), d0);
+}
+
+static INLINE void round_shift_8x8(const __m128i *const in,
+                                   __m128i *const out) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+  out[0] = _mm_add_epi16(in[0], final_rounding);
+  out[1] = _mm_add_epi16(in[1], final_rounding);
+  out[2] = _mm_add_epi16(in[2], final_rounding);
+  out[3] = _mm_add_epi16(in[3], final_rounding);
+  out[4] = _mm_add_epi16(in[4], final_rounding);
+  out[5] = _mm_add_epi16(in[5], final_rounding);
+  out[6] = _mm_add_epi16(in[6], final_rounding);
+  out[7] = _mm_add_epi16(in[7], final_rounding);
+
+  out[0] = _mm_srai_epi16(out[0], 5);
+  out[1] = _mm_srai_epi16(out[1], 5);
+  out[2] = _mm_srai_epi16(out[2], 5);
+  out[3] = _mm_srai_epi16(out[3], 5);
+  out[4] = _mm_srai_epi16(out[4], 5);
+  out[5] = _mm_srai_epi16(out[5], 5);
+  out[6] = _mm_srai_epi16(out[6], 5);
+  out[7] = _mm_srai_epi16(out[7], 5);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *const in,
+                                    uint8_t *const dest, const int stride) {
+  __m128i t[8];
+
+  round_shift_8x8(in, t);
+
+  recon_and_store(dest + 0 * stride, t[0]);
+  recon_and_store(dest + 1 * stride, t[1]);
+  recon_and_store(dest + 2 * stride, t[2]);
+  recon_and_store(dest + 3 * stride, t[3]);
+  recon_and_store(dest + 4 * stride, t[4]);
+  recon_and_store(dest + 5 * stride, t[5]);
+  recon_and_store(dest + 6 * stride, t[6]);
+  recon_and_store(dest + 7 * stride, t[7]);
+}
+
+static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
+                                           uint8_t *const dest,
+                                           const int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i d[2];
+
+  // Reconstruction and Store
+  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+  d[0] = _mm_unpacklo_epi32(d[0],
+                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+  d[1] = _mm_unpacklo_epi32(
+      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
+  d[0] = _mm_unpacklo_epi8(d[0], zero);
+  d[1] = _mm_unpacklo_epi8(d[1], zero);
+  d[0] = _mm_add_epi16(d[0], in[0]);
+  d[1] = _mm_add_epi16(d[1], in[1]);
+  d[0] = _mm_packus_epi16(d[0], d[1]);
+
+  *(int *)dest = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
+  d[0] = _mm_srli_si128(d[0], 4);
+  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
+}
+
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  int j = 0;
+  while (j < 32) {
+    in[j] = _mm_adds_epi16(in[j], final_rounding);
+    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+    in[j] = _mm_srai_epi16(in[j], 6);
+    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+    recon_and_store(dst, in[j]);
+    dst += stride;
+    recon_and_store(dst, in[j + 1]);
+    dst += stride;
+    j += 2;
+  }
+}
+
+static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  __m128i out;
+  out = _mm_adds_epi16(in, final_rounding);
+  out = _mm_srai_epi16(out, 6);
+  recon_and_store(dest, out);
+}
+
+// Only do addition and subtraction butterfly, size = 16, 32
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+                                     int size) {
+  int i = 0;
+  const int num = size >> 1;
+  const int bound = size - 1;
+  while (i < num) {
+    out[i] = _mm_add_epi16(in[i], in[bound - i]);
+    out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+    i++;
+  }
+}
+
+static INLINE void idct8(const __m128i *const in /*in[8]*/,
+                         __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 1
+  butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+
+  // stage 2
+  butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+  // stage 4
+  out[0] = _mm_add_epi16(step1[0], step2[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step2[4]);
+  out[4] = _mm_sub_epi16(step1[3], step2[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  __m128i step1[8], step2[8], tmp[4];
+
+  transpose_16bit_4x4(io, io);
+  // io[0]: 00 10 20 30  01 11 21 31
+  // io[1]: 02 12 22 32  03 13 23 33
+
+  // stage 1
+  {
+    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+    const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero);
+    const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero);
+    step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1);    // step1 4&7
+    step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3);  // step1 5&6
   }
 
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-  in[8] = _mm_adds_epi16(in[8], final_rounding);
-  in[9] = _mm_adds_epi16(in[9], final_rounding);
-  in[10] = _mm_adds_epi16(in[10], final_rounding);
-  in[11] = _mm_adds_epi16(in[11], final_rounding);
-  in[12] = _mm_adds_epi16(in[12], final_rounding);
-  in[13] = _mm_adds_epi16(in[13], final_rounding);
-  in[14] = _mm_adds_epi16(in[14], final_rounding);
-  in[15] = _mm_adds_epi16(in[15], final_rounding);
+  // stage 2
+  {
+    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero);
+    const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero);
+    const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0);
+    step2[0] = _mm_packs_epi32(t, t);                            // step2 0&1
+    step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2);  // step2 3&2
+    step2[4] = _mm_add_epi16(step1[4], step1[5]);                // step2 4&7
+    step2[5] = _mm_sub_epi16(step1[4], step1[5]);                // step2 5&6
+    step2[6] = _mm_unpackhi_epi64(step2[5], zero);               // step2 6
+  }
 
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-  in[8] = _mm_srai_epi16(in[8], 6);
-  in[9] = _mm_srai_epi16(in[9], 6);
-  in[10] = _mm_srai_epi16(in[10], 6);
-  in[11] = _mm_srai_epi16(in[11], 6);
-  in[12] = _mm_srai_epi16(in[12], 6);
-  in[13] = _mm_srai_epi16(in[13], 6);
-  in[14] = _mm_srai_epi16(in[14], 6);
-  in[15] = _mm_srai_epi16(in[15], 6);
+  // stage 3
+  {
+    const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+    tmp[0] = _mm_add_epi16(step2[0], step2[2]);                     // step1 0&1
+    tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                     // step1 3&2
+    step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                  // step1 2&1
+    step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                  // step1 3&0
+    step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65);  // step1 5&6
+  }
 
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-  RECON_AND_STORE(dest + 8 * stride, in[8]);
-  RECON_AND_STORE(dest + 9 * stride, in[9]);
-  RECON_AND_STORE(dest + 10 * stride, in[10]);
-  RECON_AND_STORE(dest + 11 * stride, in[11]);
-  RECON_AND_STORE(dest + 12 * stride, in[12]);
-  RECON_AND_STORE(dest + 13 * stride, in[13]);
-  RECON_AND_STORE(dest + 14 * stride, in[14]);
-  RECON_AND_STORE(dest + 15 * stride, in[15]);
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
+
+  idct8x8_12_transpose_16bit_4x8(tmp, io);
+  io[4] = io[5] = io[6] = io[7] = zero;
+
+  idct8(io, io);
 }
 
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
+static INLINE void idct16_8col(const __m128i *const in /*in[16]*/,
+                               __m128i *const out /*out[16]*/) {
+  __m128i step1[16], step2[16];
 
-#endif  // VPX_DSP_X86_INV_TXFM_SSE2_H_
+  // stage 2
+  butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
+  butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
+  butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm_add_epi16(step2[10], step2[11]);
+  step1[12] = _mm_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm_add_epi16(step2[14], step2[15]);
+
+  // stage 4
+  butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
+  butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+            &step2[14]);
+  butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13],
+            &step2[10]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step1[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step1[7] = _mm_add_epi16(step1[6], step1[7]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = _mm_add_epi16(step1[0], step1[7]);
+  step2[1] = _mm_add_epi16(step1[1], step1[6]);
+  step2[2] = _mm_add_epi16(step1[2], step1[5]);
+  step2[3] = _mm_add_epi16(step1[3], step1[4]);
+  step2[4] = _mm_sub_epi16(step1[3], step1[4]);
+  step2[5] = _mm_sub_epi16(step1[2], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[1], step1[6]);
+  step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+  butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+            &step2[13]);
+  butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+            &step2[12]);
+
+  // stage 7
+  out[0] = _mm_add_epi16(step2[0], step1[15]);
+  out[1] = _mm_add_epi16(step2[1], step1[14]);
+  out[2] = _mm_add_epi16(step2[2], step2[13]);
+  out[3] = _mm_add_epi16(step2[3], step2[12]);
+  out[4] = _mm_add_epi16(step2[4], step2[11]);
+  out[5] = _mm_add_epi16(step2[5], step2[10]);
+  out[6] = _mm_add_epi16(step2[6], step1[9]);
+  out[7] = _mm_add_epi16(step2[7], step1[8]);
+  out[8] = _mm_sub_epi16(step2[7], step1[8]);
+  out[9] = _mm_sub_epi16(step2[6], step1[9]);
+  out[10] = _mm_sub_epi16(step2[5], step2[10]);
+  out[11] = _mm_sub_epi16(step2[4], step2[11]);
+  out[12] = _mm_sub_epi16(step2[3], step2[12]);
+  out[13] = _mm_sub_epi16(step2[2], step2[13]);
+  out[14] = _mm_sub_epi16(step2[1], step1[14]);
+  out[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/,
+                                      __m128i *const output /*output[16]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  __m128i step1[16], step2[16];
+
+  transpose_16bit_4x4(input, output);
+
+  // stage 2
+  {
+    const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+    const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+    const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+    const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+    const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero);
+    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]);
+    step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30,
+                                      lo_1_15);  // step2 8&15
+    step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06,
+                                       lo_13_3);  // step2 11&12
+  }
+
+  // stage 3
+  {
+    const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero);
+    step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28,
+                                      lo_2_14);  // step1 4&7
+    step1[13] = _mm_unpackhi_epi64(step2[11], zero);
+    step1[14] = _mm_unpackhi_epi64(step2[8], zero);
+  }
+
+  // stage 4
+  {
+    const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+    const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+    const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+    const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero);
+    const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]);
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]);
+    const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16);
+    step1[0] = _mm_packs_epi32(t, t);  // step2 0&1
+    step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08,
+                                      lo_9_14);  // step2 9&14
+    step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24,
+                                       lo_10_13);  // step2 10&13
+    step2[6] = _mm_unpackhi_epi64(step1[4], zero);
+  }
+
+  // stage 5
+  {
+    const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]);
+    step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16,
+                                      lo_5_6);  // step1 6&5
+    step1[8] = _mm_add_epi16(step2[8], step2[11]);
+    step1[9] = _mm_add_epi16(step2[9], step2[10]);
+    step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+    step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+    step1[12] = _mm_unpackhi_epi64(step1[11], zero);
+    step1[13] = _mm_unpackhi_epi64(step1[10], zero);
+    step1[14] = _mm_unpackhi_epi64(step1[9], zero);
+    step1[15] = _mm_unpackhi_epi64(step1[8], zero);
+  }
+
+  // stage 6
+  {
+    const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]);
+    const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]);
+    step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+                                       lo_10_13);  // step2 10&13
+    step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16,
+                                       lo_11_12);  // step2 11&12
+    step2[13] = _mm_unpackhi_epi64(step2[10], zero);
+    step2[12] = _mm_unpackhi_epi64(step2[11], zero);
+    step2[3] = _mm_add_epi16(step1[0], step1[4]);
+    step2[1] = _mm_add_epi16(step1[0], step1[6]);
+    step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+    step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+    step2[0] = _mm_unpackhi_epi64(step2[3], zero);
+    step2[2] = _mm_unpackhi_epi64(step2[1], zero);
+    step2[5] = _mm_unpackhi_epi64(step2[6], zero);
+    step2[7] = _mm_unpackhi_epi64(step2[4], zero);
+  }
+
+  // stage 7. Left 8x16 only.
+  output[0] = _mm_add_epi16(step2[0], step1[15]);
+  output[1] = _mm_add_epi16(step2[1], step1[14]);
+  output[2] = _mm_add_epi16(step2[2], step2[13]);
+  output[3] = _mm_add_epi16(step2[3], step2[12]);
+  output[4] = _mm_add_epi16(step2[4], step2[11]);
+  output[5] = _mm_add_epi16(step2[5], step2[10]);
+  output[6] = _mm_add_epi16(step2[6], step1[9]);
+  output[7] = _mm_add_epi16(step2[7], step1[8]);
+  output[8] = _mm_sub_epi16(step2[7], step1[8]);
+  output[9] = _mm_sub_epi16(step2[6], step1[9]);
+  output[10] = _mm_sub_epi16(step2[5], step2[10]);
+  output[11] = _mm_sub_epi16(step2[4], step2[11]);
+  output[12] = _mm_sub_epi16(step2[3], step2[12]);
+  output[13] = _mm_sub_epi16(step2[2], step2[13]);
+  output[14] = _mm_sub_epi16(step2[1], step1[14]);
+  output[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/,
+                                      __m128i *const io /*io[16]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i step1[16], step2[16];
+
+  transpose_16bit_4x8(l, io);
+
+  // stage 2
+  butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
+  butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+  // stage 4
+  step1[0] = butterfly_cospi16(io[0]);
+  butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9],
+            &step2[14]);
+  butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13],
+            &step2[10]);
+
+  // stage 5
+  butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  step2[0] = _mm_add_epi16(step1[0], step1[7]);
+  step2[1] = _mm_add_epi16(step1[0], step1[6]);
+  step2[2] = _mm_add_epi16(step1[0], step1[5]);
+  step2[3] = _mm_add_epi16(step1[0], step1[4]);
+  step2[4] = _mm_sub_epi16(step1[0], step1[4]);
+  step2[5] = _mm_sub_epi16(step1[0], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[0], step1[6]);
+  step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+  butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10],
+            &step2[13]);
+  butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11],
+            &step2[12]);
+
+  // stage 7
+  io[0] = _mm_add_epi16(step2[0], step1[15]);
+  io[1] = _mm_add_epi16(step2[1], step1[14]);
+  io[2] = _mm_add_epi16(step2[2], step2[13]);
+  io[3] = _mm_add_epi16(step2[3], step2[12]);
+  io[4] = _mm_add_epi16(step2[4], step2[11]);
+  io[5] = _mm_add_epi16(step2[5], step2[10]);
+  io[6] = _mm_add_epi16(step2[6], step1[9]);
+  io[7] = _mm_add_epi16(step2[7], step1[8]);
+  io[8] = _mm_sub_epi16(step2[7], step1[8]);
+  io[9] = _mm_sub_epi16(step2[6], step1[9]);
+  io[10] = _mm_sub_epi16(step2[5], step2[10]);
+  io[11] = _mm_sub_epi16(step2[4], step2[11]);
+  io[12] = _mm_sub_epi16(step2[3], step2[12]);
+  io[13] = _mm_sub_epi16(step2[2], step2[13]);
+  io[14] = _mm_sub_epi16(step2[1], step1[14]);
+  io[15] = _mm_sub_epi16(step2[0], step1[15]);
+}
+
+static INLINE void idct32_8x32_quarter_2_stage_4_to_6(
+    __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
+  __m128i step2[32];
+
+  // stage 4
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
+            &step2[14]);
+  butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10],
+            &step2[13]);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[8] = _mm_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm_add_epi16(step2[15], step2[12]);
+
+  // stage 6
+  out[8] = step1[8];
+  out[9] = step1[9];
+  butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]);
+  butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]);
+  out[14] = step1[14];
+  out[15] = step1[15];
+}
+
+static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7(
+    __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step2[32];
+
+  // stage 4
+  step2[16] = _mm_add_epi16(step1[16], step1[19]);
+  step2[17] = _mm_add_epi16(step1[17], step1[18]);
+  step2[18] = _mm_sub_epi16(step1[17], step1[18]);
+  step2[19] = _mm_sub_epi16(step1[16], step1[19]);
+  step2[20] = _mm_sub_epi16(step1[23], step1[20]);
+  step2[21] = _mm_sub_epi16(step1[22], step1[21]);
+  step2[22] = _mm_add_epi16(step1[22], step1[21]);
+  step2[23] = _mm_add_epi16(step1[23], step1[20]);
+
+  step2[24] = _mm_add_epi16(step1[24], step1[27]);
+  step2[25] = _mm_add_epi16(step1[25], step1[26]);
+  step2[26] = _mm_sub_epi16(step1[25], step1[26]);
+  step2[27] = _mm_sub_epi16(step1[24], step1[27]);
+  step2[28] = _mm_sub_epi16(step1[31], step1[28]);
+  step2[29] = _mm_sub_epi16(step1[30], step1[29]);
+  step2[30] = _mm_add_epi16(step1[29], step1[30]);
+  step2[31] = _mm_add_epi16(step1[28], step1[31]);
+
+  // stage 5
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18],
+            &step1[29]);
+  butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19],
+            &step1[28]);
+  butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20],
+            &step1[27]);
+  butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21],
+            &step1[26]);
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  out[16] = _mm_add_epi16(step1[16], step1[23]);
+  out[17] = _mm_add_epi16(step1[17], step1[22]);
+  out[18] = _mm_add_epi16(step1[18], step1[21]);
+  out[19] = _mm_add_epi16(step1[19], step1[20]);
+  step2[20] = _mm_sub_epi16(step1[19], step1[20]);
+  step2[21] = _mm_sub_epi16(step1[18], step1[21]);
+  step2[22] = _mm_sub_epi16(step1[17], step1[22]);
+  step2[23] = _mm_sub_epi16(step1[16], step1[23]);
+
+  step2[24] = _mm_sub_epi16(step1[31], step1[24]);
+  step2[25] = _mm_sub_epi16(step1[30], step1[25]);
+  step2[26] = _mm_sub_epi16(step1[29], step1[26]);
+  step2[27] = _mm_sub_epi16(step1[28], step1[27]);
+  out[28] = _mm_add_epi16(step1[27], step1[28]);
+  out[29] = _mm_add_epi16(step1[26], step1[29]);
+  out[30] = _mm_add_epi16(step1[25], step1[30]);
+  out[31] = _mm_add_epi16(step1[24], step1[31]);
+
+  // stage 7
+  butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]);
+  butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]);
+  butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]);
+  butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]);
+}
+
+void idct4_sse2(__m128i *const in);
+void vpx_idct8_sse2(__m128i *const in);
+void idct16_sse2(__m128i *const in0, __m128i *const in1);
+void iadst4_sse2(__m128i *const in);
+void iadst8_sse2(__m128i *const in);
+void vpx_iadst16_8col_sse2(__m128i *const in);
+void iadst16_sse2(__m128i *const in0, __m128i *const in1);
+void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);
+void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif  // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 0000000000..6e99469b63
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,364 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0,
+                                           const int c1, __m128i *const out0,
+                                           __m128i *const out1) {
+  const __m128i cst0 = _mm_set1_epi16(2 * c0);
+  const __m128i cst1 = _mm_set1_epi16(2 * c1);
+  *out0 = _mm_mulhrs_epi16(in, cst0);
+  *out1 = _mm_mulhrs_epi16(in, cst1);
+}
+
+static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) {
+  const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64);
+  return _mm_mulhrs_epi16(in, coef_pair);
+}
+
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  __m128i io[8];
+
+  io[0] = load_input_data4(input + 0 * 8);
+  io[1] = load_input_data4(input + 1 * 8);
+  io[2] = load_input_data4(input + 2 * 8);
+  io[3] = load_input_data4(input + 3 * 8);
+
+  idct8x8_12_add_kernel_ssse3(io);
+  write_buffer_8x8(io, dest, stride);
+}
+
+// Group the coefficient calculation into smaller functions to prevent stack
+// spillover in 32x32 idct optimizations:
+// quarter_1: 0-7
+// quarter_2: 8-15
+// quarter_3_4: 16-23, 24-31
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+                                            __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+
+  // stage 4
+  step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+  step2[4] = step1[4];
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[7] = step1[7];
+
+  // stage 5
+  step1[0] = step2[0];
+  step1[1] = step2[0];
+  step1[2] = step2[0];
+  step1[3] = step2[0];
+  step1[4] = step2[4];
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi16(step1[0], step1[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step1[4]);
+  out[4] = _mm_sub_epi16(step1[3], step1[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 2, 6
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+                                            __m128i *const out /*out[16]*/) {
+  __m128i step1[16], step2[16];
+
+  // stage 2
+  partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+                          &step2[15]);
+  partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+                          &step2[12]);
+
+  // stage 3
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+  step1[10] = step2[11];
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] = step2[12];
+
+  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_34_8x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  idct32_34_8x32_quarter_1(in, temp);
+  idct32_34_8x32_quarter_2(in, temp);
+  // stage 7
+  add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with odd index, 1, 3, 5, 7
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_34_8x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32];
+
+  // stage 1
+  partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+                          &step1[31]);
+  partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+                          &step1[28]);
+  partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+                          &step1[27]);
+  partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+                          &step1[24]);
+
+  // stage 3
+  butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
+            &step1[30]);
+  butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
+            &step1[29]);
+  butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
+            &step1[26]);
+  butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
+            &step1[25]);
+
+  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/,
+                          __m128i *const out /*out[32]*/) {
+  __m128i temp[32];
+
+  idct32_34_8x32_quarter_1_2(in, temp);
+  idct32_34_8x32_quarter_3_4(in, temp);
+  // final stage
+  add_sub_butterfly(temp, out, 32);
+}
+
+// Only upper-left 8x8 has non-zero coeff
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
+                                int stride) {
+  __m128i io[32], col[32];
+  int i;
+
+  // Load input data. Only need to load the top left 8x8 block.
+  load_transpose_16bit_8x8(input, 32, io);
+  idct32_34_8x32_ssse3(io, col);
+
+  for (i = 0; i < 32; i += 8) {
+    int j;
+    transpose_16bit_8x8(col + i, io);
+    idct32_34_8x32_ssse3(io, io);
+
+    for (j = 0; j < 32; ++j) {
+      write_buffer_8x1(dest + j * stride, io[j]);
+    }
+
+    dest += 8;
+  }
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 0, 4, 8, 12
+// output pixels: 0-7 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/,
+                                             __m128i *const out /*out[8]*/) {
+  __m128i step1[8], step2[8];
+
+  // stage 3
+  partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
+  partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5],
+                          &step1[6]);
+
+  // stage 4
+  step2[0] = partial_butterfly_cospi16_ssse3(in[0]);
+  partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 5
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+  step1[7] = step2[7];
+
+  // stage 6
+  out[0] = _mm_add_epi16(step1[0], step1[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step1[4]);
+  out[4] = _mm_sub_epi16(step1[3], step1[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step1[7]);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with index, 2, 6, 10, 14
+// output pixels: 8-15 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/,
+                                             __m128i *const out /*out[16]*/) {
+  __m128i step1[16], step2[16];
+
+  // stage 2
+  partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8],
+                          &step2[15]);
+  partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9],
+                          &step2[14]);
+  partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10],
+                          &step2[13]);
+  partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11],
+                          &step2[12]);
+
+  // stage 3
+  step1[8] = _mm_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm_add_epi16(step2[11], step2[10]);
+  step1[12] = _mm_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm_add_epi16(step2[15], step2[14]);
+
+  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
+}
+
+static INLINE void idct32_135_8x32_quarter_1_2(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i temp[16];
+  idct32_135_8x32_quarter_1(in, temp);
+  idct32_135_8x32_quarter_2(in, temp);
+  // stage 7
+  add_sub_butterfly(temp, out, 16);
+}
+
+// For each 8x32 block __m128i in[32],
+// Input with odd index,
+// 1, 3, 5, 7, 9, 11, 13, 15
+// output pixels: 16-23, 24-31 in __m128i out[32]
+static INLINE void idct32_135_8x32_quarter_3_4(
+    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
+  __m128i step1[32], step2[32];
+
+  // stage 1
+  partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16],
+                          &step1[31]);
+  partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17],
+                          &step1[30]);
+  partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18],
+                          &step1[29]);
+  partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19],
+                          &step1[28]);
+
+  partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20],
+                          &step1[27]);
+  partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21],
+                          &step1[26]);
+
+  partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22],
+                          &step1[25]);
+  partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23],
+                          &step1[24]);
+
+  // stage 2
+  step2[16] = _mm_add_epi16(step1[16], step1[17]);
+  step2[17] = _mm_sub_epi16(step1[16], step1[17]);
+  step2[18] = _mm_sub_epi16(step1[19], step1[18]);
+  step2[19] = _mm_add_epi16(step1[19], step1[18]);
+  step2[20] = _mm_add_epi16(step1[20], step1[21]);
+  step2[21] = _mm_sub_epi16(step1[20], step1[21]);
+  step2[22] = _mm_sub_epi16(step1[23], step1[22]);
+  step2[23] = _mm_add_epi16(step1[23], step1[22]);
+
+  step2[24] = _mm_add_epi16(step1[24], step1[25]);
+  step2[25] = _mm_sub_epi16(step1[24], step1[25]);
+  step2[26] = _mm_sub_epi16(step1[27], step1[26]);
+  step2[27] = _mm_add_epi16(step1[27], step1[26]);
+  step2[28] = _mm_add_epi16(step1[28], step1[29]);
+  step2[29] = _mm_sub_epi16(step1[28], step1[29]);
+  step2[30] = _mm_sub_epi16(step1[31], step1[30]);
+  step2[31] = _mm_add_epi16(step1[31], step1[30]);
+
+  // stage 3
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
+            &step1[30]);
+  butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
+            &step1[29]);
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
+            &step1[26]);
+  butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
+            &step1[25]);
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
+}
+
+void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/,
+                           __m128i *const out /*out[32]*/) {
+  __m128i temp[32];
+  idct32_135_8x32_quarter_1_2(in, temp);
+  idct32_135_8x32_quarter_3_4(in, temp);
+  // final stage
+  add_sub_butterfly(temp, out, 32);
+}
+
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+                                 int stride) {
+  __m128i col[2][32], io[32];
+  int i;
+
+  // rows
+  for (i = 0; i < 2; i++) {
+    load_transpose_16bit_8x8(&input[0], 32, &io[0]);
+    load_transpose_16bit_8x8(&input[8], 32, &io[8]);
+    idct32_135_8x32_ssse3(io, col[i]);
+    input += 32 << 3;
+  }
+
+  // columns
+  for (i = 0; i < 32; i += 8) {
+    transpose_16bit_8x8(col[0] + i, io);
+    transpose_16bit_8x8(col[1] + i, io + 8);
+    idct32_135_8x32_ssse3(io, io);
+    store_buffer_8x32(io, dest, stride);
+    dest += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
new file mode 100644
index 0000000000..e9f0f69033
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -0,0 +1,110 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
+#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
+  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
+  const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));
+  const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64));
+  const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64));
+  const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64));
+  const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64));
+  const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64));
+  __m128i step1[8], step2[8], tmp[4];
+
+  // pass 1
+
+  transpose_16bit_4x4(io, io);
+  // io[0]: 00 10 20 30  01 11 21 31
+  // io[1]: 02 12 22 32  03 13 23 33
+
+  // stage 1
+  tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
+  tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
+  tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
+  tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
+  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
+  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
+  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
+  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
+
+  // stage 3
+  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
+  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
+  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
+  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
+  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
+
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
+
+  // pass 2
+
+  idct8x8_12_transpose_16bit_4x8(tmp, io);
+
+  // stage 1
+  step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
+  step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
+  step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
+  step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d);  // step2[1] = step2[0]
+  step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
+  step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
+
+  // stage 4
+  io[0] = _mm_add_epi16(step1[0], step2[7]);
+  io[1] = _mm_add_epi16(step1[1], step1[6]);
+  io[2] = _mm_add_epi16(step1[2], step1[5]);
+  io[3] = _mm_add_epi16(step1[3], step2[4]);
+  io[4] = _mm_sub_epi16(step1[3], step2[4]);
+  io[5] = _mm_sub_epi16(step1[2], step1[5]);
+  io[6] = _mm_sub_epi16(step1[1], step1[6]);
+  io[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out);
+
+#endif  // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
deleted file mode 100644
index dee64e3ad3..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,1793 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the inverse transformation. Part
-; of the functions are originally derived from the ffmpeg project.
-; Note that the current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-
-pw_m2404x2:  times 8 dw  -2404*2
-pw_m4756x2:  times 8 dw  -4756*2
-pw_m5520x2:  times 8 dw  -5520*2
-pw_m8423x2:  times 8 dw  -8423*2
-pw_m9102x2:  times 8 dw  -9102*2
-pw_m10394x2: times 8 dw -10394*2
-pw_m11003x2: times 8 dw -11003*2
-
-pw_16364x2: times 8 dw 16364*2
-pw_16305x2: times 8 dw 16305*2
-pw_16207x2: times 8 dw 16207*2
-pw_16069x2: times 8 dw 16069*2
-pw_15893x2: times 8 dw 15893*2
-pw_15679x2: times 8 dw 15679*2
-pw_15426x2: times 8 dw 15426*2
-pw_15137x2: times 8 dw 15137*2
-pw_14811x2: times 8 dw 14811*2
-pw_14449x2: times 8 dw 14449*2
-pw_14053x2: times 8 dw 14053*2
-pw_13623x2: times 8 dw 13623*2
-pw_13160x2: times 8 dw 13160*2
-pw_12665x2: times 8 dw 12665*2
-pw_12140x2: times 8 dw 12140*2
-pw__9760x2: times 8 dw  9760*2
-pw__7723x2: times 8 dw  7723*2
-pw__7005x2: times 8 dw  7005*2
-pw__6270x2: times 8 dw  6270*2
-pw__3981x2: times 8 dw  3981*2
-pw__3196x2: times 8 dw  3196*2
-pw__1606x2: times 8 dw  1606*2
-pw___804x2: times 8 dw   804*2
-
-pd_8192:    times 4 dd 8192
-pw_32:      times 8 dw 32
-pw_16:      times 8 dw 16
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
-pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
-%endmacro
-
-TRANSFORM_COEFFS    6270, 15137
-TRANSFORM_COEFFS    3196, 16069
-TRANSFORM_COEFFS   13623,  9102
-
-; constants for 32x32_34
-TRANSFORM_COEFFS      804, 16364
-TRANSFORM_COEFFS    15426,  5520
-TRANSFORM_COEFFS     3981, 15893
-TRANSFORM_COEFFS    16207,  2404
-TRANSFORM_COEFFS     1606, 16305
-TRANSFORM_COEFFS    15679,  4756
-TRANSFORM_COEFFS    11585, 11585
-
-; constants for 32x32_1024
-TRANSFORM_COEFFS    12140, 11003
-TRANSFORM_COEFFS     7005, 14811
-TRANSFORM_COEFFS    14053,  8423
-TRANSFORM_COEFFS     9760, 13160
-TRANSFORM_COEFFS    12665, 10394
-TRANSFORM_COEFFS     7723, 14449
-
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2:  dw -%1, -%1, -%1, -%1,  %2,  %2,  %2,  %2
-%endmacro
-
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-PAIR_PP_COEFFS     30274, 12540
-PAIR_PP_COEFFS      6392, 32138
-PAIR_MP_COEFFS     18204, 27246
-
-PAIR_PP_COEFFS     12540, 12540
-PAIR_PP_COEFFS     30274, 30274
-PAIR_PP_COEFFS      6392,  6392
-PAIR_PP_COEFFS     32138, 32138
-PAIR_MM_COEFFS     18204, 18204
-PAIR_PP_COEFFS     27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd            m%1, m%3, %5
-  pmaddwd            m%2, m%3, %6
-  paddd              m%1,  %4
-  paddd              m%2,  %4
-  psrad              m%1,  14
-  psrad              m%2,  14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_%3_%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_%3_%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd          m%6, m%2, m%1
-  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_m%3_m%4]
-  punpcklwd          m%2, m%1
-  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_m%3_m%4]
-  packssdw           m%1, m%7
-  packssdw           m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro IDCT8_1D 0
-  SUM_SUB          0,    4,    9
-  BUTTERFLY_4X     2,    6,    6270, 15137,  m8,  9,  10
-  pmulhrsw        m0,  m12
-  pmulhrsw        m4,  m12
-  BUTTERFLY_4X     1,    7,    3196, 16069,  m8,  9,  10
-  BUTTERFLY_4X     5,    3,   13623,  9102,  m8,  9,  10
-
-  SUM_SUB          1,    5,    9
-  SUM_SUB          7,    3,    9
-  SUM_SUB          0,    6,    9
-  SUM_SUB          4,    2,    9
-  SUM_SUB          3,    5,    9
-  pmulhrsw        m3,  m12
-  pmulhrsw        m5,  m12
-
-  SUM_SUB          0,    7,    9
-  SUM_SUB          4,    3,    9
-  SUM_SUB          2,    5,    9
-  SUM_SUB          6,    1,    9
-
-  SWAP             3,    6
-  SWAP             1,    4
-%endmacro
-
-; This macro handles 8 pixels per line
-%macro ADD_STORE_8P_2X 5;  src1, src2, tmp1, tmp2, zero
-  paddw           m%1, m11
-  paddw           m%2, m11
-  psraw           m%1, 5
-  psraw           m%2, 5
-
-  movh            m%3, [outputq]
-  movh            m%4, [outputq + strideq]
-  punpcklbw       m%3, m%5
-  punpcklbw       m%4, m%5
-  paddw           m%3, m%1
-  paddw           m%4, m%2
-  packuswb        m%3, m%5
-  packuswb        m%4, m%5
-  movh               [outputq], m%3
-  movh     [outputq + strideq], m%4
-%endmacro
-
-INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
-  mova     m8, [pd_8192]
-  mova    m11, [pw_16]
-  mova    m12, [pw_11585x2]
-
-  lea      r3, [2 * strideq]
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova     m0, [inputq +   0]
-  packssdw m0, [inputq +  16]
-  mova     m1, [inputq +  32]
-  packssdw m1, [inputq +  48]
-  mova     m2, [inputq +  64]
-  packssdw m2, [inputq +  80]
-  mova     m3, [inputq +  96]
-  packssdw m3, [inputq + 112]
-  mova     m4, [inputq + 128]
-  packssdw m4, [inputq + 144]
-  mova     m5, [inputq + 160]
-  packssdw m5, [inputq + 176]
-  mova     m6, [inputq + 192]
-  packssdw m6, [inputq + 208]
-  mova     m7, [inputq + 224]
-  packssdw m7, [inputq + 240]
-%else
-  mova     m0, [inputq +   0]
-  mova     m1, [inputq +  16]
-  mova     m2, [inputq +  32]
-  mova     m3, [inputq +  48]
-  mova     m4, [inputq +  64]
-  mova     m5, [inputq +  80]
-  mova     m6, [inputq +  96]
-  mova     m7, [inputq + 112]
-%endif
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-  IDCT8_1D
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-  IDCT8_1D
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X  0, 1, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  2, 3, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  4, 5, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  6, 7, 9, 10, 12
-
-  RET
-
-; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
-  mova       m8, [pd_8192]
-  mova      m11, [pw_16]
-  mova      m12, [pw_11585x2]
-
-  lea        r3, [2 * strideq]
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       m0, [inputq +   0]
-  packssdw   m0, [inputq +  16]
-  mova       m1, [inputq +  32]
-  packssdw   m1, [inputq +  48]
-  mova       m2, [inputq +  64]
-  packssdw   m2, [inputq +  80]
-  mova       m3, [inputq +  96]
-  packssdw   m3, [inputq + 112]
-%else
-  mova       m0, [inputq +  0]
-  mova       m1, [inputq + 16]
-  mova       m2, [inputq + 32]
-  mova       m3, [inputq + 48]
-%endif
-
-  punpcklwd  m0, m1
-  punpcklwd  m2, m3
-  punpckhdq  m9, m0, m2
-  punpckldq  m0, m2
-  SWAP       2, 9
-
-  ; m0 -> [0], [0]
-  ; m1 -> [1], [1]
-  ; m2 -> [2], [2]
-  ; m3 -> [3], [3]
-  punpckhqdq m10, m0, m0
-  punpcklqdq m0,  m0
-  punpckhqdq m9,  m2, m2
-  punpcklqdq m2,  m2
-  SWAP       1, 10
-  SWAP       3,  9
-
-  pmulhrsw   m0, m12
-  pmulhrsw   m2, [dpw_30274_12540]
-  pmulhrsw   m1, [dpw_6392_32138]
-  pmulhrsw   m3, [dpw_m18204_27246]
-
-  SUM_SUB    0, 2, 9
-  SUM_SUB    1, 3, 9
-
-  punpcklqdq m9, m3, m3
-  punpckhqdq m5, m3, m9
-
-  SUM_SUB    3, 5, 9
-  punpckhqdq m5, m3
-  pmulhrsw   m5, m12
-
-  punpckhqdq m9, m1, m5
-  punpcklqdq m1, m5
-  SWAP       5, 9
-
-  SUM_SUB    0, 5, 9
-  SUM_SUB    2, 1, 9
-
-  punpckhqdq m3, m0, m0
-  punpckhqdq m4, m1, m1
-  punpckhqdq m6, m5, m5
-  punpckhqdq m7, m2, m2
-
-  punpcklwd  m0, m3
-  punpcklwd  m7, m2
-  punpcklwd  m1, m4
-  punpcklwd  m6, m5
-
-  punpckhdq  m4, m0, m7
-  punpckldq  m0, m7
-  punpckhdq  m10, m1, m6
-  punpckldq  m5, m1, m6
-
-  punpckhqdq m1, m0, m5
-  punpcklqdq m0, m5
-  punpckhqdq m3, m4, m10
-  punpcklqdq m2, m4, m10
-
-
-  pmulhrsw   m0, m12
-  pmulhrsw   m6, m2, [dpw_30274_30274]
-  pmulhrsw   m4, m2, [dpw_12540_12540]
-
-  pmulhrsw   m7, m1, [dpw_32138_32138]
-  pmulhrsw   m1, [dpw_6392_6392]
-  pmulhrsw   m5, m3, [dpw_m18204_m18204]
-  pmulhrsw   m3, [dpw_27246_27246]
-
-  mova       m2, m0
-  SUM_SUB    0, 6, 9
-  SUM_SUB    2, 4, 9
-  SUM_SUB    1, 5, 9
-  SUM_SUB    7, 3, 9
-
-  SUM_SUB    3, 5, 9
-  pmulhrsw   m3, m12
-  pmulhrsw   m5, m12
-
-  SUM_SUB    0, 7, 9
-  SUM_SUB    2, 3, 9
-  SUM_SUB    4, 5, 9
-  SUM_SUB    6, 1, 9
-
-  SWAP       3, 6
-  SWAP       1, 2
-  SWAP       2, 4
-
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X  0, 1, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  2, 3, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  4, 5, 9, 10, 12
-  lea              outputq, [outputq + r3]
-  ADD_STORE_8P_2X  6, 7, 9, 10, 12
-
-  RET
-
-%define  idx0 16 * 0
-%define  idx1 16 * 1
-%define  idx2 16 * 2
-%define  idx3 16 * 3
-%define  idx4 16 * 4
-%define  idx5 16 * 5
-%define  idx6 16 * 6
-%define  idx7 16 * 7
-%define  idx8 16 * 0
-%define  idx9 16 * 1
-%define idx10 16 * 2
-%define idx11 16 * 3
-%define idx12 16 * 4
-%define idx13 16 * 5
-%define idx14 16 * 6
-%define idx15 16 * 7
-%define idx16 16 * 0
-%define idx17 16 * 1
-%define idx18 16 * 2
-%define idx19 16 * 3
-%define idx20 16 * 4
-%define idx21 16 * 5
-%define idx22 16 * 6
-%define idx23 16 * 7
-%define idx24 16 * 0
-%define idx25 16 * 1
-%define idx26 16 * 2
-%define idx27 16 * 3
-%define idx28 16 * 4
-%define idx29 16 * 5
-%define idx30 16 * 6
-%define idx31 16 * 7
-
-; FROM idct32x32_add_neon.asm
-;
-; Instead of doing the transforms stage by stage, it is done by loading
-; some input values and doing as many stages as possible to minimize the
-; storing/loading of intermediate results. To fit within registers, the
-; final coefficients are cut into four blocks:
-; BLOCK A: 16-19,28-31
-; BLOCK B: 20-23,24-27
-; BLOCK C: 8-11,12-15
-; BLOCK D: 0-3,4-7
-; Blocks A and C are straight calculation through the various stages. In
-; block B, further calculations are performed using the results from
-; block A. In block D, further calculations are performed using the results
-; from block C and then the final calculations are done using results from
-; block A and B which have been combined at the end of block B.
-;
-
-%macro IDCT32X32_34 4
-  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, m1
-  pmulhrsw             m1, [pw___804x2] ; stp1_16
-  mova      [r4 +      0], m0
-  pmulhrsw            m11, [pw_16364x2] ; stp2_31
-  mova      [r4 + 16 * 2], m2
-  mova                m12, m7
-  pmulhrsw             m7, [pw_15426x2] ; stp1_28
-  mova      [r4 + 16 * 4], m4
-  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
-  mova      [r4 + 16 * 6], m6
-
-  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m2, m1   ; stp1_16
-  mova                 m0, m11  ; stp1_31
-  mova                 m4, m7   ; stp1_28
-  mova                m15, m12  ; stp1_19
-
-  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
-  BUTTERFLY_4Xmm        4,    15,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
-
-  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
-  SUM_SUB               0, 15, 9 ; stp2_17, stp2_18
-  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
-  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
-
-  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          4,    15,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
-  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
-
-  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m6, m5
-  pmulhrsw             m5, [pw__3981x2] ; stp1_20
-  mova [stp + %4 + idx28], m12
-  mova [stp + %4 + idx29], m15
-  pmulhrsw             m6, [pw_15893x2] ; stp2_27
-  mova [stp + %4 + idx30], m2
-  mova                 m2, m3
-  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
-  mova [stp + %4 + idx31], m11
-  pmulhrsw             m2, [pw_16207x2] ; stp2_24
-
-  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m13, m5 ; stp1_20
-  mova                m14, m6 ; stp1_27
-  mova                m15, m3 ; stp1_23
-  mova                m11, m2 ; stp1_24
-
-  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
-  BUTTERFLY_4Xmm       11,    15,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
-
-  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
-  SUM_SUB              15, 14, 9 ; stp2_22, stp2_21
-  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
-  SUM_SUB              11, 13, 9 ; stp2_25, stp2_26
-
-  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
-  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
-
-  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1,  3, 9 ; stp2_16, stp2_23
-  SUM_SUB               0, 15, 9 ; stp2_17, stp2_22
-  SUM_SUB               4, 14, 9 ; stp2_18, stp2_21
-  SUM_SUB               7,  5, 9 ; stp2_19, stp2_20
-  mova [stp + %3 + idx16], m1
-  mova [stp + %3 + idx17], m0
-  mova [stp + %3 + idx18], m4
-  mova [stp + %3 + idx19], m7
-
-  mova                 m4, [stp + %4 + idx28]
-  mova                 m7, [stp + %4 + idx29]
-  mova                m10, [stp + %4 + idx30]
-  mova                m12, [stp + %4 + idx31]
-  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
-  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
-  SUM_SUB              10, 11, 9 ; stp2_30, stp2_25
-  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
-  mova [stp + %4 + idx28], m4
-  mova [stp + %4 + idx29], m7
-  mova [stp + %4 + idx30], m10
-  mova [stp + %4 + idx31], m12
-
-  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               6,  5, 9
-  pmulhrsw             m6, m10  ; stp1_27
-  pmulhrsw             m5, m10  ; stp1_20
-  SUM_SUB              13, 14,  9
-  pmulhrsw            m13, m10  ; stp1_26
-  pmulhrsw            m14, m10  ; stp1_21
-  SUM_SUB              11, 15,  9
-  pmulhrsw            m11, m10  ; stp1_25
-  pmulhrsw            m15, m10  ; stp1_22
-  SUM_SUB               2,  3,  9
-  pmulhrsw             m2, m10  ; stp1_24
-  pmulhrsw             m3, m10  ; stp1_23
-%else
-  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
-  SWAP 6, 5
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
-  SWAP 13, 14
-  BUTTERFLY_4X         11,    15,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
-  SWAP 11, 15
-  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
-  SWAP 2, 3
-%endif
-
-  mova [stp + %4 + idx24], m2
-  mova [stp + %4 + idx25], m11
-  mova [stp + %4 + idx26], m13
-  mova [stp + %4 + idx27], m6
-
-  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  2]
-  mova                 m6, [rsp + transposed_in + 16 *  6]
-
-  mova                 m1, m0
-  pmulhrsw             m0, [pw__1606x2] ; stp1_8
-  mova [stp + %3 + idx20], m5
-  mova [stp + %3 + idx21], m14
-  pmulhrsw             m1, [pw_16305x2] ; stp2_15
-  mova [stp + %3 + idx22], m15
-  mova                 m7, m6
-  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
-  mova [stp + %3 + idx23], m3
-  pmulhrsw             m6, [pw_15679x2] ; stp1_12
-
-  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m3, m0 ; stp1_8
-  mova                 m2, m1 ; stp1_15
-
-  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
-  mova                 m4, m7 ; stp1_11
-  mova                 m5, m6 ; stp1_12
-  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
-
-  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
-  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
-  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
-  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
-
-  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               5,  4, 9
-  pmulhrsw             m5, m10  ; stp1_13
-  pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,  7, 9
-  pmulhrsw             m6, m10  ; stp1_12
-  pmulhrsw             m7, m10  ; stp1_11
-%else
-  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13
-  SWAP 5, 4
-  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12
-  SWAP 6, 7
-%endif
-
-  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova [stp + %2 +  idx8], m0
-  mova [stp + %2 +  idx9], m2
-  mova [stp + %2 + idx10], m4
-  mova [stp + %2 + idx11], m7
-
-  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, [rsp + transposed_in + 16 *  4]
-  mova                m12, m11
-  pmulhrsw            m11, [pw__3196x2] ; stp1_4
-  pmulhrsw            m12, [pw_16069x2] ; stp1_7
-
-  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  0]
-  mova                m10, [pw_11585x2]
-  pmulhrsw             m0, m10  ; stp1_1
-
-  mova                m14, m11 ; stp1_4
-  mova                m13, m12 ; stp1_7
-
-  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  SUM_SUB              13,   14,  9
-  pmulhrsw            m13, m10  ; stp1_6
-  pmulhrsw            m14, m10  ; stp1_5
-%else
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
-  SWAP 13, 14
-%endif
-  mova                 m7, m0 ; stp1_0 = stp1_1
-  mova                 m4, m0 ; stp1_1
-  mova                 m2, m7 ; stp1_0
-
-  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
-  SUM_SUB               7, 13, 9 ;  stp1_1, stp1_6
-  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
-  SUM_SUB               4, 11, 9 ;  stp1_3, stp1_4
-
-  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  1, 9 ;  stp1_0, stp1_15
-  SUM_SUB               7,  3, 9 ;  stp1_1, stp1_14
-  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
-  SUM_SUB               4,  6, 9 ;  stp1_3, stp1_12
-
-  ; 0-3, 28-31 final stage
-  mova                m15, [stp + %4 + idx30]
-  mova                m10, [stp + %4 + idx31]
-  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
-  SUM_SUB               7, 15, 9 ;  stp1_1, stp1_30
-  mova [stp + %1 +  idx0], m0
-  mova [stp + %1 +  idx1], m7
-  mova [stp + %4 + idx30], m15
-  mova [stp + %4 + idx31], m10
-  mova                 m7, [stp + %4 + idx28]
-  mova                 m0, [stp + %4 + idx29]
-  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
-  SUM_SUB               4,  7, 9 ;  stp1_3, stp1_28
-  mova [stp + %1 +  idx2], m2
-  mova [stp + %1 +  idx3], m4
-  mova [stp + %4 + idx28], m7
-  mova [stp + %4 + idx29], m0
-
-  ; 12-15, 16-19 final stage
-  mova                 m0, [stp + %3 + idx16]
-  mova                 m7, [stp + %3 + idx17]
-  mova                 m2, [stp + %3 + idx18]
-  mova                 m4, [stp + %3 + idx19]
-  SUM_SUB               1,  0, 9 ;  stp1_15, stp1_16
-  SUM_SUB               3,  7, 9 ;  stp1_14, stp1_17
-  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
-  SUM_SUB               6,  4, 9 ;  stp1_12, stp1_19
-  mova [stp + %2 + idx12], m6
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m3
-  mova [stp + %2 + idx15], m1
-  mova [stp + %3 + idx16], m0
-  mova [stp + %3 + idx17], m7
-  mova [stp + %3 + idx18], m2
-  mova [stp + %3 + idx19], m4
-
-  mova                 m4, [stp + %2 +  idx8]
-  mova                 m5, [stp + %2 +  idx9]
-  mova                 m6, [stp + %2 + idx10]
-  mova                 m7, [stp + %2 + idx11]
-  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
-  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
-  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
-  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
-
-  ; 4-7, 24-27 final stage
-  mova                 m0, [stp + %4 + idx27]
-  mova                 m1, [stp + %4 + idx26]
-  mova                 m2, [stp + %4 + idx25]
-  mova                 m3, [stp + %4 + idx24]
-  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
-  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
-  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
-  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
-  mova [stp + %4 + idx27], m0
-  mova [stp + %4 + idx26], m1
-  mova [stp + %4 + idx25], m2
-  mova [stp + %4 + idx24], m3
-  mova [stp + %1 +  idx4], m11
-  mova [stp + %1 +  idx5], m14
-  mova [stp + %1 +  idx6], m13
-  mova [stp + %1 +  idx7], m12
-
-  ; 8-11, 20-23 final stage
-  mova                 m0, [stp + %3 + idx20]
-  mova                 m1, [stp + %3 + idx21]
-  mova                 m2, [stp + %3 + idx22]
-  mova                 m3, [stp + %3 + idx23]
-  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
-  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
-  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
-  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
-  mova [stp + %2 +  idx8], m4
-  mova [stp + %2 +  idx9], m5
-  mova [stp + %2 + idx10], m6
-  mova [stp + %2 + idx11], m7
-  mova [stp + %3 + idx20], m0
-  mova [stp + %3 + idx21], m1
-  mova [stp + %3 + idx22], m2
-  mova [stp + %3 + idx23], m3
-%endmacro
-
-%macro RECON_AND_STORE 1
-  mova            m11, [pw_32]
-  lea             stp, [rsp + %1]
-  mov              r6, 32
-  pxor             m8, m8
-%%recon_and_store:
-  mova             m0, [stp + 16 * 32 * 0]
-  mova             m1, [stp + 16 * 32 * 1]
-  mova             m2, [stp + 16 * 32 * 2]
-  mova             m3, [stp + 16 * 32 * 3]
-  add             stp, 16
-
-  paddw            m0, m11
-  paddw            m1, m11
-  paddw            m2, m11
-  paddw            m3, m11
-  psraw            m0, 6
-  psraw            m1, 6
-  psraw            m2, 6
-  psraw            m3, 6
-  movh             m4, [outputq +  0]
-  movh             m5, [outputq +  8]
-  movh             m6, [outputq + 16]
-  movh             m7, [outputq + 24]
-  punpcklbw        m4, m8
-  punpcklbw        m5, m8
-  punpcklbw        m6, m8
-  punpcklbw        m7, m8
-  paddw            m0, m4
-  paddw            m1, m5
-  paddw            m2, m6
-  paddw            m3, m7
-  packuswb         m0, m1
-  packuswb         m2, m3
-  mova [outputq +  0], m0
-  mova [outputq + 16], m2
-  lea         outputq, [outputq + strideq]
-  dec              r6
-  jnz %%recon_and_store
-%endmacro
-
-%define i32x32_size     16*32*5
-%define pass_two_start  16*32*0
-%define transposed_in   16*32*4
-%define pass_one_start  16*32*0
-%define stp r8
-
-INIT_XMM ssse3
-cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
-  mova            m8, [pd_8192]
-  lea            stp, [rsp + pass_one_start]
-
-idct32x32_34:
-  mov             r3, inputq
-  lea             r4, [rsp + transposed_in]
-
-idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  IDCT32X32_34  16*0, 16*32, 16*64, 16*96
-  lea            stp, [stp + 16 * 8]
-  mov             r6, 4
-  lea            stp, [rsp + pass_one_start]
-  lea             r9, [rsp + pass_one_start]
-
-idct32x32_34_2:
-  lea             r4, [rsp + transposed_in]
-  mov             r3, r9
-
-idct32x32_34_transpose_2:
-  mova            m0, [r3 +      0]
-  mova            m1, [r3 + 16 * 1]
-  mova            m2, [r3 + 16 * 2]
-  mova            m3, [r3 + 16 * 3]
-  mova            m4, [r3 + 16 * 4]
-  mova            m5, [r3 + 16 * 5]
-  mova            m6, [r3 + 16 * 6]
-  mova            m7, [r3 + 16 * 7]
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  IDCT32X32_34  16*0, 16*8, 16*16, 16*24
-
-  lea            stp, [stp + 16 * 32]
-  add             r9, 16 * 32
-  dec             r6
-  jnz idct32x32_34_2
-
-  RECON_AND_STORE pass_two_start
-
-  RET
-
-%macro IDCT32X32_135 4
-  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m1, [rsp + transposed_in + 16 *  1]
-  mova                m11, m1
-  pmulhrsw             m1, [pw___804x2] ; stp1_16
-  pmulhrsw            m11, [pw_16364x2] ; stp2_31
-
-  mova                 m7, [rsp + transposed_in + 16 *  7]
-  mova                m12, m7
-  pmulhrsw             m7, [pw_15426x2] ; stp1_28
-  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
-
-  mova                 m3, [rsp + transposed_in + 16 *  9]
-  mova                 m4, m3
-  pmulhrsw             m3, [pw__7005x2] ; stp1_18
-  pmulhrsw             m4, [pw_14811x2] ; stp2_29
-
-  mova                 m0, [rsp + transposed_in + 16 * 15]
-  mova                 m2, m0
-  pmulhrsw             m0, [pw_12140x2]  ; stp1_30
-  pmulhrsw             m2, [pw_m11003x2] ; stp2_17
-
-  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
-  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
-  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
-  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
-
-  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
-  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
-
-  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
-  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
-  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
-  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
-
-  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
-  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
-
-  mova [stp + %3 + idx16], m1
-  mova [stp + %3 + idx17], m0
-  mova [stp + %3 + idx18], m4
-  mova [stp + %3 + idx19], m7
-  mova [stp + %4 + idx28], m12
-  mova [stp + %4 + idx29], m3
-  mova [stp + %4 + idx30], m2
-  mova [stp + %4 + idx31], m11
-
-  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m2, [rsp + transposed_in + 16 *  3]
-  mova                 m3, m2
-  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
-  pmulhrsw             m2, [pw_16207x2] ; stp2_24
-
-  mova                 m5, [rsp + transposed_in + 16 *  5]
-  mova                 m6, m5
-  pmulhrsw             m5, [pw__3981x2] ; stp1_20
-  pmulhrsw             m6, [pw_15893x2] ; stp2_27
-
-  mova                m14, [rsp + transposed_in + 16 * 11]
-  mova                m13, m14
-  pmulhrsw            m13, [pw_m8423x2] ; stp1_21
-  pmulhrsw            m14, [pw_14053x2] ; stp2_26
-
-  mova                 m0, [rsp + transposed_in + 16 * 13]
-  mova                 m1, m0
-  pmulhrsw             m0, [pw__9760x2] ; stp1_22
-  pmulhrsw             m1, [pw_13160x2] ; stp2_25
-
-  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
-  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
-  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
-  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
-
-  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
-  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
-
-  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
-  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
-  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
-  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
-
-  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
-  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
-
-  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %3 + idx16]
-  mova                 m7, [stp + %3 + idx17]
-  mova                m11, [stp + %3 + idx18]
-  mova                m12, [stp + %3 + idx19]
-  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
-  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
-  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
-  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
-  mova [stp + %3 + idx16], m4
-  mova [stp + %3 + idx17], m7
-  mova [stp + %3 + idx18], m11
-  mova [stp + %3 + idx19], m12
-
-  mova                 m4, [stp + %4 + idx28]
-  mova                 m7, [stp + %4 + idx29]
-  mova                m11, [stp + %4 + idx30]
-  mova                m12, [stp + %4 + idx31]
-  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
-  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
-  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
-  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
-  mova [stp + %4 + idx28], m4
-  mova [stp + %4 + idx29], m7
-  mova [stp + %4 + idx30], m11
-  mova [stp + %4 + idx31], m12
-
-  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               6,  5,  9
-  pmulhrsw             m6, m10  ; stp1_27
-  pmulhrsw             m5, m10  ; stp1_20
-  SUM_SUB              13, 14,  9
-  pmulhrsw            m13, m10  ; stp1_26
-  pmulhrsw            m14, m10  ; stp1_21
-  SUM_SUB               1,  0,  9
-  pmulhrsw             m1, m10  ; stp1_25
-  pmulhrsw             m0, m10  ; stp1_22
-  SUM_SUB               2,  3,  9
-  pmulhrsw             m2, m10  ; stp1_25
-  pmulhrsw             m3, m10  ; stp1_22
-%else
-  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
-  SWAP  6, 5
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
-  SWAP 13, 14
-  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
-  SWAP  1, 0
-  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
-  SWAP  2, 3
-%endif
-  mova [stp + %3 + idx20], m5
-  mova [stp + %3 + idx21], m14
-  mova [stp + %3 + idx22], m0
-  mova [stp + %3 + idx23], m3
-  mova [stp + %4 + idx24], m2
-  mova [stp + %4 + idx25], m1
-  mova [stp + %4 + idx26], m13
-  mova [stp + %4 + idx27], m6
-
-  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  2]
-  mova                 m1, m0
-  pmulhrsw             m0, [pw__1606x2] ; stp1_8
-  pmulhrsw             m1, [pw_16305x2] ; stp2_15
-
-  mova                 m6, [rsp + transposed_in + 16 *  6]
-  mova                 m7, m6
-  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
-  pmulhrsw             m6, [pw_15679x2] ; stp1_12
-
-  mova                 m4, [rsp + transposed_in + 16 * 10]
-  mova                 m5, m4
-  pmulhrsw             m4, [pw__7723x2] ; stp1_10
-  pmulhrsw             m5, [pw_14449x2] ; stp2_13
-
-  mova                 m2, [rsp + transposed_in + 16 * 14]
-  mova                 m3, m2
-  pmulhrsw             m3, [pw_m10394x2] ; stp1_9
-  pmulhrsw             m2, [pw_12665x2] ; stp2_14
-
-  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
-  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
-  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
-  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
-
-  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
-  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
-
-  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
-  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
-  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
-  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
-
-  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               5,    4,  9
-  pmulhrsw             m5, m10  ; stp1_13
-  pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,    7,  9
-  pmulhrsw             m6, m10  ; stp1_12
-  pmulhrsw             m7, m10  ; stp1_11
-%else
-  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
-  SWAP  5, 4
-  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
-  SWAP  6, 7
-%endif
-  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova [stp + %2 +  idx8], m0
-  mova [stp + %2 +  idx9], m2
-  mova [stp + %2 + idx10], m4
-  mova [stp + %2 + idx11], m7
-  mova [stp + %2 + idx12], m6
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m3
-  mova [stp + %2 + idx15], m1
-
-  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, [rsp + transposed_in + 16 *  4]
-  mova                m12, m11
-  pmulhrsw            m11, [pw__3196x2] ; stp1_4
-  pmulhrsw            m12, [pw_16069x2] ; stp1_7
-
-  mova                m13, [rsp + transposed_in + 16 * 12]
-  mova                m14, m13
-  pmulhrsw            m13, [pw_13623x2] ; stp1_6
-  pmulhrsw            m14, [pw_m9102x2] ; stp1_5
-
-  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  0]
-  mova                 m2, [rsp + transposed_in + 16 *  8]
-  pmulhrsw             m0, [pw_11585x2]  ; stp1_1
-  mova                 m3, m2
-  pmulhrsw             m2, [pw__6270x2]  ; stp1_2
-  pmulhrsw             m3, [pw_15137x2]  ; stp1_3
-
-  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
-  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
-
-  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB              13,   14,  9
-  pmulhrsw            m13, m10  ; stp1_6
-  pmulhrsw            m14, m10  ; stp1_5
-%else
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
-  SWAP 13, 14
-%endif
-  mova                 m1, m0    ; stp1_0 = stp1_1
-  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
-  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
-
-  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
-  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
-  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
-  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
-
-  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %2 + idx12]
-  mova                 m5, [stp + %2 + idx13]
-  mova                 m6, [stp + %2 + idx14]
-  mova                 m7, [stp + %2 + idx15]
-  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
-  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
-  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
-  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
-
-  ; 0-3, 28-31 final stage
-  mova                m10, [stp + %4 + idx31]
-  mova                m15, [stp + %4 + idx30]
-  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
-  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
-  mova [stp + %1 +  idx0], m0
-  mova [stp + %1 +  idx1], m1
-  mova [stp + %4 + idx31], m10
-  mova [stp + %4 + idx30], m15
-  mova                 m0, [stp + %4 + idx29]
-  mova                 m1, [stp + %4 + idx28]
-  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
-  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
-  mova [stp + %1 +  idx2], m2
-  mova [stp + %1 +  idx3], m3
-  mova [stp + %4 + idx29], m0
-  mova [stp + %4 + idx28], m1
-
-  ; 12-15, 16-19 final stage
-  mova                 m0, [stp + %3 + idx16]
-  mova                 m1, [stp + %3 + idx17]
-  mova                 m2, [stp + %3 + idx18]
-  mova                 m3, [stp + %3 + idx19]
-  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
-  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
-  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
-  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
-  mova [stp + %2 + idx12], m4
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m6
-  mova [stp + %2 + idx15], m7
-  mova [stp + %3 + idx16], m0
-  mova [stp + %3 + idx17], m1
-  mova [stp + %3 + idx18], m2
-  mova [stp + %3 + idx19], m3
-
-  mova                 m4, [stp + %2 +  idx8]
-  mova                 m5, [stp + %2 +  idx9]
-  mova                 m6, [stp + %2 + idx10]
-  mova                 m7, [stp + %2 + idx11]
-  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
-  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
-  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
-  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
-
-  ; 4-7, 24-27 final stage
-  mova                 m3, [stp + %4 + idx24]
-  mova                 m2, [stp + %4 + idx25]
-  mova                 m1, [stp + %4 + idx26]
-  mova                 m0, [stp + %4 + idx27]
-  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
-  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
-  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
-  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
-  mova [stp + %4 + idx24], m3
-  mova [stp + %4 + idx25], m2
-  mova [stp + %4 + idx26], m1
-  mova [stp + %4 + idx27], m0
-  mova [stp + %1 +  idx4], m11
-  mova [stp + %1 +  idx5], m14
-  mova [stp + %1 +  idx6], m13
-  mova [stp + %1 +  idx7], m12
-
-  ; 8-11, 20-23 final stage
-  mova                 m0, [stp + %3 + idx20]
-  mova                 m1, [stp + %3 + idx21]
-  mova                 m2, [stp + %3 + idx22]
-  mova                 m3, [stp + %3 + idx23]
-  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
-  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
-  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
-  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
-  mova [stp + %2 +  idx8], m4
-  mova [stp + %2 +  idx9], m5
-  mova [stp + %2 + idx10], m6
-  mova [stp + %2 + idx11], m7
-  mova [stp + %3 + idx20], m0
-  mova [stp + %3 + idx21], m1
-  mova [stp + %3 + idx22], m2
-  mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
-  mova            m8, [pd_8192]
-  mov             r6, 2
-  lea            stp, [rsp + pass_one_start]
-
-idct32x32_135:
-  mov             r3, inputq
-  lea             r4, [rsp + transposed_in]
-  mov             r7, 2
-
-idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  mova [r4 +      0], m0
-  mova [r4 + 16 * 1], m1
-  mova [r4 + 16 * 2], m2
-  mova [r4 + 16 * 3], m3
-  mova [r4 + 16 * 4], m4
-  mova [r4 + 16 * 5], m5
-  mova [r4 + 16 * 6], m6
-  mova [r4 + 16 * 7], m7
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  add             r3, 32
-%else
-  add             r3, 16
-%endif
-  add             r4, 16 * 8
-  dec             r7
-  jne idct32x32_135_transpose
-
-  IDCT32X32_135 16*0, 16*32, 16*64, 16*96
-  lea            stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea         inputq, [inputq + 32 * 32]
-%else
-  lea         inputq, [inputq + 16 * 32]
-%endif
-  dec             r6
-  jnz idct32x32_135
-
-  mov             r6, 4
-  lea            stp, [rsp + pass_one_start]
-  lea             r9, [rsp + pass_one_start]
-
-idct32x32_135_2:
-  lea             r4, [rsp + transposed_in]
-  mov             r3, r9
-  mov             r7, 2
-
-idct32x32_135_transpose_2:
-  mova            m0, [r3 +      0]
-  mova            m1, [r3 + 16 * 1]
-  mova            m2, [r3 + 16 * 2]
-  mova            m3, [r3 + 16 * 3]
-  mova            m4, [r3 + 16 * 4]
-  mova            m5, [r3 + 16 * 5]
-  mova            m6, [r3 + 16 * 6]
-  mova            m7, [r3 + 16 * 7]
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  mova [r4 +      0], m0
-  mova [r4 + 16 * 1], m1
-  mova [r4 + 16 * 2], m2
-  mova [r4 + 16 * 3], m3
-  mova [r4 + 16 * 4], m4
-  mova [r4 + 16 * 5], m5
-  mova [r4 + 16 * 6], m6
-  mova [r4 + 16 * 7], m7
-
-  add             r3, 16 * 8
-  add             r4, 16 * 8
-  dec             r7
-  jne idct32x32_135_transpose_2
-
-  IDCT32X32_135 16*0, 16*8, 16*16, 16*24
-
-  lea            stp, [stp + 16 * 32]
-  add             r9, 16 * 32
-  dec             r6
-  jnz idct32x32_135_2
-
-  RECON_AND_STORE pass_two_start
-
-  RET
-
-%macro IDCT32X32_1024 4
-  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m1, [rsp + transposed_in + 16 *  1]
-  mova                m11, [rsp + transposed_in + 16 * 31]
-  BUTTERFLY_4X          1,    11,    804, 16364,  m8,  9,  10 ; stp1_16, stp1_31
-
-  mova                 m0, [rsp + transposed_in + 16 * 15]
-  mova                 m2, [rsp + transposed_in + 16 * 17]
-  BUTTERFLY_4X          2,     0,  12140, 11003,  m8,  9,  10 ; stp1_17, stp1_30
-
-  mova                 m7, [rsp + transposed_in + 16 *  7]
-  mova                m12, [rsp + transposed_in + 16 * 25]
-  BUTTERFLY_4X         12,     7,  15426,  5520,  m8,  9,  10 ; stp1_19, stp1_28
-
-  mova                 m3, [rsp + transposed_in + 16 *  9]
-  mova                 m4, [rsp + transposed_in + 16 * 23]
-  BUTTERFLY_4X          3,     4,   7005, 14811,  m8,  9,  10 ; stp1_18, stp1_29
-
-  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
-  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
-  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
-  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
-
-  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
-  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
-
-  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
-  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
-  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
-  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
-
-  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
-  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
-
-  mova [stp + %3 + idx16], m1
-  mova [stp + %3 + idx17], m0
-  mova [stp + %3 + idx18], m4
-  mova [stp + %3 + idx19], m7
-  mova [stp + %4 + idx28], m12
-  mova [stp + %4 + idx29], m3
-  mova [stp + %4 + idx30], m2
-  mova [stp + %4 + idx31], m11
-
-  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m5, [rsp + transposed_in + 16 *  5]
-  mova                 m6, [rsp + transposed_in + 16 * 27]
-  BUTTERFLY_4X          5,     6,   3981, 15893,  m8,  9,  10 ; stp1_20, stp1_27
-
-  mova                m13, [rsp + transposed_in + 16 * 21]
-  mova                m14, [rsp + transposed_in + 16 * 11]
-  BUTTERFLY_4X         13,    14,  14053,  8423,  m8,  9,  10 ; stp1_21, stp1_26
-
-  mova                 m0, [rsp + transposed_in + 16 * 13]
-  mova                 m1, [rsp + transposed_in + 16 * 19]
-  BUTTERFLY_4X          0,     1,   9760, 13160,  m8,  9,  10 ; stp1_22, stp1_25
-
-  mova                 m2, [rsp + transposed_in + 16 *  3]
-  mova                 m3, [rsp + transposed_in + 16 * 29]
-  BUTTERFLY_4X          3,     2,  16207,  2404,  m8,  9,  10 ; stp1_23, stp1_24
-
-  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
-  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
-  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
-  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
-
-  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
-  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
-
-  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
-  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
-  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
-  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
-
-  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
-  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
-
-  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %3 + idx16]
-  mova                 m7, [stp + %3 + idx17]
-  mova                m11, [stp + %3 + idx18]
-  mova                m12, [stp + %3 + idx19]
-  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
-  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
-  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
-  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
-  mova [stp + %3 + idx16], m4
-  mova [stp + %3 + idx17], m7
-  mova [stp + %3 + idx18], m11
-  mova [stp + %3 + idx19], m12
-
-  mova                 m4, [stp + %4 + idx28]
-  mova                 m7, [stp + %4 + idx29]
-  mova                m11, [stp + %4 + idx30]
-  mova                m12, [stp + %4 + idx31]
-  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
-  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
-  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
-  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
-  mova [stp + %4 + idx28], m4
-  mova [stp + %4 + idx29], m7
-  mova [stp + %4 + idx30], m11
-  mova [stp + %4 + idx31], m12
-
-  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               6,  5,  9
-  pmulhrsw             m6, m10  ; stp1_27
-  pmulhrsw             m5, m10  ; stp1_20
-  SUM_SUB              13, 14,  9
-  pmulhrsw            m13, m10  ; stp1_26
-  pmulhrsw            m14, m10  ; stp1_21
-  SUM_SUB               1,  0,  9
-  pmulhrsw             m1, m10  ; stp1_25
-  pmulhrsw             m0, m10  ; stp1_22
-  SUM_SUB               2,  3,  9
-  pmulhrsw             m2, m10  ; stp1_25
-  pmulhrsw             m3, m10  ; stp1_22
-%else
-  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
-  SWAP  6, 5
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
-  SWAP 13, 14
-  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
-  SWAP  1, 0
-  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
-  SWAP  2, 3
-%endif
-  mova [stp + %3 + idx20], m5
-  mova [stp + %3 + idx21], m14
-  mova [stp + %3 + idx22], m0
-  mova [stp + %3 + idx23], m3
-  mova [stp + %4 + idx24], m2
-  mova [stp + %4 + idx25], m1
-  mova [stp + %4 + idx26], m13
-  mova [stp + %4 + idx27], m6
-
-  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  2]
-  mova                 m1, [rsp + transposed_in + 16 * 30]
-  BUTTERFLY_4X          0,     1,   1606, 16305,  m8,  9,  10 ; stp1_8, stp1_15
-
-  mova                 m2, [rsp + transposed_in + 16 * 14]
-  mova                 m3, [rsp + transposed_in + 16 * 18]
-  BUTTERFLY_4X          3,     2,  12665, 10394,  m8,  9,  10 ; stp1_9, stp1_14
-
-  mova                 m4, [rsp + transposed_in + 16 * 10]
-  mova                 m5, [rsp + transposed_in + 16 * 22]
-  BUTTERFLY_4X          4,     5,   7723, 14449,  m8,  9,  10 ; stp1_10, stp1_13
-
-  mova                 m6, [rsp + transposed_in + 16 *  6]
-  mova                 m7, [rsp + transposed_in + 16 * 26]
-  BUTTERFLY_4X          7,     6,  15679,  4756,  m8,  9,  10 ; stp1_11, stp1_12
-
-  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
-  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
-  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
-  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
-
-  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
-  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
-
-  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
-  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
-  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
-  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
-
-  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               5,    4,  9
-  pmulhrsw             m5, m10  ; stp1_13
-  pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,    7,  9
-  pmulhrsw             m6, m10  ; stp1_12
-  pmulhrsw             m7, m10  ; stp1_11
-%else
-  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
-  SWAP  5, 4
-  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
-  SWAP  6, 7
-%endif
-  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova [stp + %2 +  idx8], m0
-  mova [stp + %2 +  idx9], m2
-  mova [stp + %2 + idx10], m4
-  mova [stp + %2 + idx11], m7
-  mova [stp + %2 + idx12], m6
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m3
-  mova [stp + %2 + idx15], m1
-
-  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  ;
-  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                m11, [rsp + transposed_in + 16 *  4]
-  mova                m12, [rsp + transposed_in + 16 * 28]
-  BUTTERFLY_4X         11,    12,   3196, 16069,  m8,  9,  10 ; stp1_4, stp1_7
-
-  mova                m13, [rsp + transposed_in + 16 * 12]
-  mova                m14, [rsp + transposed_in + 16 * 20]
-  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_5, stp1_6
-
-  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m0, [rsp + transposed_in + 16 *  0]
-  mova                 m1, [rsp + transposed_in + 16 * 16]
-
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  mova                m10, [pw_11585x2]
-  SUM_SUB               0,    1,  9
-  pmulhrsw             m0, m10  ; stp1_1
-  pmulhrsw             m1, m10  ; stp1_0
-%else
-  BUTTERFLY_4X          0,     1,  11585, 11585,  m8,  9,  10 ; stp1_1, stp1_0
-  SWAP  0, 1
-%endif
-  mova                 m2, [rsp + transposed_in + 16 *  8]
-  mova                 m3, [rsp + transposed_in + 16 * 24]
-  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_2, stp1_3
-
-  mova                m10, [pw_11585x2]
-  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
-  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
-
-  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
-  SUM_SUB              13,   14,  9
-  pmulhrsw            m13, m10  ; stp1_6
-  pmulhrsw            m14, m10  ; stp1_5
-%else
-  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
-  SWAP 13, 14
-%endif
-  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
-  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
-
-  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
-  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
-  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
-  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
-
-  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  mova                 m4, [stp + %2 + idx12]
-  mova                 m5, [stp + %2 + idx13]
-  mova                 m6, [stp + %2 + idx14]
-  mova                 m7, [stp + %2 + idx15]
-  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
-  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
-  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
-  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
-
-  ; 0-3, 28-31 final stage
-  mova                m10, [stp + %4 + idx31]
-  mova                m15, [stp + %4 + idx30]
-  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
-  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
-  mova [stp + %1 +  idx0], m0
-  mova [stp + %1 +  idx1], m1
-  mova [stp + %4 + idx31], m10
-  mova [stp + %4 + idx30], m15
-  mova                 m0, [stp + %4 + idx29]
-  mova                 m1, [stp + %4 + idx28]
-  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
-  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
-  mova [stp + %1 +  idx2], m2
-  mova [stp + %1 +  idx3], m3
-  mova [stp + %4 + idx29], m0
-  mova [stp + %4 + idx28], m1
-
-  ; 12-15, 16-19 final stage
-  mova                 m0, [stp + %3 + idx16]
-  mova                 m1, [stp + %3 + idx17]
-  mova                 m2, [stp + %3 + idx18]
-  mova                 m3, [stp + %3 + idx19]
-  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
-  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
-  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
-  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
-  mova [stp + %2 + idx12], m4
-  mova [stp + %2 + idx13], m5
-  mova [stp + %2 + idx14], m6
-  mova [stp + %2 + idx15], m7
-  mova [stp + %3 + idx16], m0
-  mova [stp + %3 + idx17], m1
-  mova [stp + %3 + idx18], m2
-  mova [stp + %3 + idx19], m3
-
-  mova                 m4, [stp + %2 +  idx8]
-  mova                 m5, [stp + %2 +  idx9]
-  mova                 m6, [stp + %2 + idx10]
-  mova                 m7, [stp + %2 + idx11]
-  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
-  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
-  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
-  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
-
-  ; 4-7, 24-27 final stage
-  mova                 m3, [stp + %4 + idx24]
-  mova                 m2, [stp + %4 + idx25]
-  mova                 m1, [stp + %4 + idx26]
-  mova                 m0, [stp + %4 + idx27]
-  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
-  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
-  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
-  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
-  mova [stp + %4 + idx24], m3
-  mova [stp + %4 + idx25], m2
-  mova [stp + %4 + idx26], m1
-  mova [stp + %4 + idx27], m0
-  mova [stp + %1 +  idx4], m11
-  mova [stp + %1 +  idx5], m14
-  mova [stp + %1 +  idx6], m13
-  mova [stp + %1 +  idx7], m12
-
-  ; 8-11, 20-23 final stage
-  mova                 m0, [stp + %3 + idx20]
-  mova                 m1, [stp + %3 + idx21]
-  mova                 m2, [stp + %3 + idx22]
-  mova                 m3, [stp + %3 + idx23]
-  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
-  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
-  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
-  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
-  mova [stp + %2 +  idx8], m4
-  mova [stp + %2 +  idx9], m5
-  mova [stp + %2 + idx10], m6
-  mova [stp + %2 + idx11], m7
-  mova [stp + %3 + idx20], m0
-  mova [stp + %3 + idx21], m1
-  mova [stp + %3 + idx22], m2
-  mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
-  mova            m8, [pd_8192]
-  mov             r6, 4
-  lea            stp, [rsp + pass_one_start]
-
-idct32x32_1024:
-  mov             r3, inputq
-  lea             r4, [rsp + transposed_in]
-  mov             r7, 4
-
-idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  mova [r4 +      0], m0
-  mova [r4 + 16 * 1], m1
-  mova [r4 + 16 * 2], m2
-  mova [r4 + 16 * 3], m3
-  mova [r4 + 16 * 4], m4
-  mova [r4 + 16 * 5], m5
-  mova [r4 + 16 * 6], m6
-  mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
-  add             r3, 32
-%else
-  add             r3, 16
-%endif
-  add             r4, 16 * 8
-  dec             r7
-  jne idct32x32_1024_transpose
-
-  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
-
-  lea            stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea         inputq, [inputq + 32 * 32]
-%else
-  lea         inputq, [inputq + 16 * 32]
-%endif
-  dec             r6
-  jnz idct32x32_1024
-
-  mov             r6, 4
-  lea            stp, [rsp + pass_one_start]
-  lea             r9, [rsp + pass_one_start]
-
-idct32x32_1024_2:
-  lea             r4, [rsp + transposed_in]
-  mov             r3, r9
-  mov             r7, 4
-
-idct32x32_1024_transpose_2:
-  mova            m0, [r3 +      0]
-  mova            m1, [r3 + 16 * 1]
-  mova            m2, [r3 + 16 * 2]
-  mova            m3, [r3 + 16 * 3]
-  mova            m4, [r3 + 16 * 4]
-  mova            m5, [r3 + 16 * 5]
-  mova            m6, [r3 + 16 * 6]
-  mova            m7, [r3 + 16 * 7]
-
-  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  mova [r4 +      0], m0
-  mova [r4 + 16 * 1], m1
-  mova [r4 + 16 * 2], m2
-  mova [r4 + 16 * 3], m3
-  mova [r4 + 16 * 4], m4
-  mova [r4 + 16 * 5], m5
-  mova [r4 + 16 * 6], m6
-  mova [r4 + 16 * 7], m7
-
-  add             r3, 16 * 8
-  add             r4, 16 * 8
-  dec             r7
-  jne idct32x32_1024_transpose_2
-
-  IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
-
-  lea            stp, [stp + 16 * 32]
-  add             r9, 16 * 32
-  dec             r6
-  jnz idct32x32_1024_2
-
-  RECON_AND_STORE pass_two_start
-
-  RET
-%endif
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
index fbbcd76bd7..bcf1a6ef98 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -82,15 +83,8 @@ SECTION .text
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0,        [inputq +  0]
-  packssdw        m0,        [inputq + 16]
-  mova            m1,        [inputq + 32]
-  packssdw        m1,        [inputq + 48]
-%else
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-%endif
+  LOAD_TRAN_LOW    0, inputq, 0
+  LOAD_TRAN_LOW    1, inputq, 8
   psraw           m0,        2
   psraw           m1,        2
 
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
index 6652a62dcf..a58fb65539 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -13,38 +13,38 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
 
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
   q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
   q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
   q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
   q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
   q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
@@ -52,19 +52,19 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
     abs_p1p0 =
         _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8(0xfe);
+    fe = _mm_set1_epi8((int8_t)0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     abs_p0q0 =
         _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
     abs_p1q1 =
         _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -84,7 +84,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
     const __m128i t1 = _mm_set1_epi16(0x1);
     __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
     __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
@@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
       q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
 
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
       q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
 
       flat2 = _mm_max_epu8(
           _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
 
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
       q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
 
       work = _mm_max_epu8(
           _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
@@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
   }
 }
 
@@ -367,12 +367,12 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
@@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
   __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
       p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
 
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
 
-  p256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
-  p256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
-  p256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
-  p256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
-  p256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
-  q256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
-  q256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
-  q256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
-  q256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
-  q256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+  p256_4 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));
+  p256_3 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));
+  p256_2 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));
+  p256_1 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));
+  p256_0 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));
+  q256_0 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));
+  q256_1 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));
+  q256_2 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));
+  q256_3 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));
+  q256_4 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));
 
   p4 = _mm256_castsi256_si128(p256_4);
   p3 = _mm256_castsi256_si128(p256_3);
@@ -423,7 +423,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
     const __m128i abs_q1q0 =
         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     __m128i abs_p0q0 =
         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
@@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
     __m128i work;
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(flat, mask);
@@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -458,8 +458,8 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
     const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
@@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
       flat = _mm_and_si128(flat, mask);
 
       p256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));
       q256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));
       p5 = _mm256_castsi256_si128(p256_5);
       q5 = _mm256_castsi256_si128(q256_5);
       flat2 = _mm_max_epu8(
@@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
 
       flat2 = _mm_max_epu8(work, flat2);
       p256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));
       q256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));
       p6 = _mm256_castsi256_si128(p256_6);
       q6 = _mm256_castsi256_si128(q256_6);
       work = _mm_max_epu8(
@@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
       flat2 = _mm_max_epu8(work, flat2);
 
       p256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));
       q256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));
       p7 = _mm256_castsi256_si128(p256_7);
       q7 = _mm256_castsi256_si128(q256_7);
       work = _mm_max_epu8(
@@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
     p6 = _mm_andnot_si128(flat2, p6);
     flat2_p6 = _mm_and_si128(flat2, flat2_p6);
     p6 = _mm_or_si128(flat2_p6, p6);
-    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+    _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
 
     p5 = _mm_andnot_si128(flat2, p5);
     flat2_p5 = _mm_and_si128(flat2, flat2_p5);
     p5 = _mm_or_si128(flat2_p5, p5);
-    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+    _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
 
     p4 = _mm_andnot_si128(flat2, p4);
     flat2_p4 = _mm_and_si128(flat2, flat2_p4);
     p4 = _mm_or_si128(flat2_p4, p4);
-    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+    _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
 
     p3 = _mm_andnot_si128(flat2, p3);
     flat2_p3 = _mm_and_si128(flat2, flat2_p3);
     p3 = _mm_or_si128(flat2_p3, p3);
-    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+    _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
 
     p2 = _mm_andnot_si128(flat2, p2);
     flat2_p2 = _mm_and_si128(flat2, flat2_p2);
     p2 = _mm_or_si128(flat2_p2, p2);
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
 
     p1 = _mm_andnot_si128(flat2, p1);
     flat2_p1 = _mm_and_si128(flat2, flat2_p1);
     p1 = _mm_or_si128(flat2_p1, p1);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
 
     p0 = _mm_andnot_si128(flat2, p0);
     flat2_p0 = _mm_and_si128(flat2, flat2_p0);
     p0 = _mm_or_si128(flat2_p0, p0);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
 
     q0 = _mm_andnot_si128(flat2, q0);
     flat2_q0 = _mm_and_si128(flat2, flat2_q0);
     q0 = _mm_or_si128(flat2_q0, q0);
-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);
 
     q1 = _mm_andnot_si128(flat2, q1);
     flat2_q1 = _mm_and_si128(flat2, flat2_q1);
     q1 = _mm_or_si128(flat2_q1, q1);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
 
     q2 = _mm_andnot_si128(flat2, q2);
     flat2_q2 = _mm_and_si128(flat2, flat2_q2);
     q2 = _mm_or_si128(flat2_q2, q2);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
 
     q3 = _mm_andnot_si128(flat2, q3);
     flat2_q3 = _mm_and_si128(flat2, flat2_q3);
     q3 = _mm_or_si128(flat2_q3, q3);
-    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+    _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
 
     q4 = _mm_andnot_si128(flat2, q4);
     flat2_q4 = _mm_and_si128(flat2, flat2_q4);
     q4 = _mm_or_si128(flat2_q4, q4);
-    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+    _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
 
     q5 = _mm_andnot_si128(flat2, q5);
     flat2_q5 = _mm_and_si128(flat2, flat2_q5);
     q5 = _mm_or_si128(flat2_q5, q5);
-    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+    _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
 
     q6 = _mm_andnot_si128(flat2, q6);
     flat2_q6 = _mm_and_si128(flat2, flat2_q6);
     q6 = _mm_or_si128(flat2_q6, q6);
-    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+    _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_sse2.c
similarity index 80%
rename from media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c
rename to media/libvpx/libvpx/vpx_dsp/x86/loopfilter_sse2.c
index 28e6fd65f9..6ea34cdd16 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -13,6 +13,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/emmintrin_compat.h"
+#include "vpx_dsp/x86/mem_sse2.h"
 
 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
@@ -30,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
     hev =                                                                     \
         _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
+    hev = _mm_cmpgt_epi16(hev, thresh_v);                                     \
     hev = _mm_packs_epi16(hev, hev);                                          \
                                                                               \
     /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
@@ -51,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     flat = _mm_max_epu8(work, flat);                                          \
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
     mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
+    mask = _mm_subs_epu8(mask, limit_v);                                      \
     mask = _mm_cmpeq_epi8(mask, zero);                                        \
     mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
   } while (0)
@@ -60,7 +61,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   do {                                                                      \
     const __m128i t3t4 =                                                    \
         _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
-    const __m128i t80 = _mm_set1_epi8(0x80);                                \
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);                        \
     __m128i filter, filter2filter1, work;                                   \
                                                                             \
     ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
@@ -103,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
   } while (0)
 
-void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
-                               const uint8_t *_blimit, const uint8_t *_limit,
-                               const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i limit_v =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+                         _mm_loadl_epi64((const __m128i *)limit));
+  const __m128i thresh_v =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
@@ -132,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
   FILTER_HEV_MASK;
   FILTER4;
 
-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+  _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0);               // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0);               // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0));  // *oq1
 }
 
-void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
-                             const uint8_t *_blimit, const uint8_t *_limit,
-                             const uint8_t *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i limit_v =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+                         _mm_loadl_epi64((const __m128i *)limit));
+  const __m128i thresh_v =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i x0, x1, x2, x3;
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
 
   // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
 
   // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
 
   // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
 
   // Transpose 8x8
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
@@ -212,69 +211,69 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
 
-  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
 
-  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
 }
 
-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat, flat2;
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
   q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
   q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
   q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
   q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
   q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
     abs_p1p0 = abs_diff(q1p1, q0p0);
     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8(0xfe);
+    fe = _mm_set1_epi8((int8_t)0xfe);
     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     abs_p0q0 = abs_diff(q0p0, p0q0);
     abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -284,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -292,7 +291,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
     const __m128i t1 = _mm_set1_epi16(0x1);
     __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
     __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
@@ -342,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
       q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
 
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
       q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
       flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
 
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
       q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
       work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
@@ -520,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
   }
 }
 
@@ -591,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
 }
 
-void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
-  const __m128i zero = _mm_set1_epi16(0);
+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat, flat2;
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
@@ -609,27 +608,27 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
 
   __m128i max_abs_p1p0q1q0;
 
-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
 
   {
     const __m128i abs_p1p0 = abs_diff(p1, p0);
     const __m128i abs_q1q0 = abs_diff(q1, q0);
-    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
     __m128i abs_p0q0 = abs_diff(p0, q0);
     __m128i abs_p1q1 = abs_diff(p1, q1);
@@ -638,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
@@ -648,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
     mask = _mm_max_epu8(work, mask);
     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -678,8 +677,8 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
     const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
@@ -694,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
     oq0 = _mm_xor_si128(q0, t80);
     oq1 = _mm_xor_si128(q1, t80);
 
-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
 
@@ -851,111 +850,111 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
 
       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+      _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
 
       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+      _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
 
       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+      _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
 
       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+      _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
 
       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+      _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
 
       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
 
       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+      _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+      _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+      _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+      _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+      _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+      _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   }
 }
 
-void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
+                               const unsigned char *blimit,
+                               const unsigned char *limit,
+                               const unsigned char *thresh) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
 
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
     // filter_mask and hev_mask
     const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
     abs_p1p0 = abs_diff(q1p1, q0p0);
@@ -964,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     abs_p0q0 = abs_diff(q0p0, p0q0);
     abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -979,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
 
     // flat_mask4
@@ -997,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     unsigned char *src = s;
     {
       __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+                             zero);
 
       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
@@ -1047,16 +1054,16 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1102,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
@@ -1120,62 +1127,60 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
+    _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
+    _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
+    _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
   }
 }
 
-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
+void vpx_lpf_horizontal_8_dual_sse2(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i zero = _mm_setzero_si128();
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+                         _mm_load_si128((const __m128i *)blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+                         _mm_load_si128((const __m128i *)limit1));
   const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+                         _mm_load_si128((const __m128i *)thresh1));
 
   __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
   {
     const __m128i abs_p1p0 =
         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
     const __m128i abs_q1q0 =
         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
     const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     __m128i abs_p0q0 =
         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
@@ -1227,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
 
     do {
       __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+                             zero);
 
       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
@@ -1279,20 +1292,20 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
     const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1344,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
     q2 = _mm_load_si128((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
@@ -1362,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
     p2 = _mm_load_si128((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
   }
 }
 
-void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
+                                    const unsigned char *blimit0,
+                                    const unsigned char *limit0,
+                                    const unsigned char *thresh0,
+                                    const unsigned char *blimit1,
+                                    const unsigned char *limit1,
+                                    const unsigned char *thresh1) {
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+                         _mm_load_si128((const __m128i *)blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+                         _mm_load_si128((const __m128i *)limit1));
   const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-  const __m128i zero = _mm_set1_epi16(0);
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+                         _mm_load_si128((const __m128i *)thresh1));
+  const __m128i zero = _mm_setzero_si128();
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i mask, hev, flat;
 
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 
   // filter_mask and hev_mask
   {
@@ -1412,7 +1425,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
     const __m128i abs_q1q0 =
         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
-    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     __m128i abs_p0q0 =
         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
@@ -1448,20 +1461,20 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
   {
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
-    const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
     const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1506,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
 
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
   }
 }
 
@@ -1626,16 +1639,12 @@ static INLINE void transpose(unsigned char *src[], int in_p,
     x5 = _mm_unpacklo_epi16(x2, x3);
     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
     x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 0 * out_p),
-                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
-    _mm_storeh_pd((double *)(out + 1 * out_p),
-                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
+    mm_storelu(out + 0 * out_p, x6);  // 00 10 20 30 40 50 60 70
+    mm_storehu(out + 1 * out_p, x6);  // 01 11 21 31 41 51 61 71
     // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
     x7 = _mm_unpackhi_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 2 * out_p),
-                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
-    _mm_storeh_pd((double *)(out + 3 * out_p),
-                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
+    mm_storelu(out + 2 * out_p, x7);  // 02 12 22 32 42 52 62 72
+    mm_storehu(out + 3 * out_p, x7);  // 03 13 23 33 43 53 63 73
 
     // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
     x4 = _mm_unpackhi_epi16(x0, x1);
@@ -1643,21 +1652,17 @@ static INLINE void transpose(unsigned char *src[], int in_p,
     x5 = _mm_unpackhi_epi16(x2, x3);
     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
     x6 = _mm_unpacklo_epi32(x4, x5);
-    _mm_storel_pd((double *)(out + 4 * out_p),
-                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
-    _mm_storeh_pd((double *)(out + 5 * out_p),
-                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
+    mm_storelu(out + 4 * out_p, x6);  // 04 14 24 34 44 54 64 74
+    mm_storehu(out + 5 * out_p, x6);  // 05 15 25 35 45 55 65 75
     // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
     x7 = _mm_unpackhi_epi32(x4, x5);
 
-    _mm_storel_pd((double *)(out + 6 * out_p),
-                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
-    _mm_storeh_pd((double *)(out + 7 * out_p),
-                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
+    mm_storelu(out + 6 * out_p, x7);  // 06 16 26 36 46 56 66 76
+    mm_storehu(out + 7 * out_p, x7);  // 07 17 27 37 47 57 67 77
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
@@ -1666,21 +1671,21 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   unsigned char *dst[2];
 
   // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
-  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_4_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                            blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  transpose(src, 16, dst, p, 2);
+  transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
@@ -1692,19 +1697,19 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  transpose(src, p, dst, 8, 1);
+  transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
-  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
+  vpx_lpf_horizontal_8(t_dst + 4 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
   dst[0] = s - 4;
 
   // Transpose back
-  transpose(src, 8, dst, p, 1);
+  transpose(src, 8, dst, pitch, 1);
 }
 
-void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
@@ -1713,22 +1718,22 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   unsigned char *dst[2];
 
   // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
-  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
-                                 blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_8_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
+                            blimit1, limit1, thresh1);
   src[0] = t_dst;
   src[1] = t_dst + 8;
 
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  transpose(src, 16, dst, p, 2);
+  transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
@@ -1742,10 +1747,10 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   dst[1] = t_dst + 8 * 8;
 
   // Transpose 16x8
-  transpose(src, p, dst, 8, 2);
+  transpose(src, pitch, dst, 8, 2);
 
   // Loop filtering
-  vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
+  vpx_lpf_horizontal_16(t_dst + 8 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
   src[1] = t_dst + 8 * 8;
@@ -1753,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   dst[1] = s;
 
   // Transpose back
-  transpose(src, 8, dst, p, 2);
+  transpose(src, 8, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
 
   // Transpose 16x16
-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+  transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
 
   // Loop filtering
-  vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
+  vpx_lpf_horizontal_16_dual(t_dst + 8 * 16, 16, blimit, limit, thresh);
 
   // Transpose back
-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
new file mode 100644
index 0000000000..031f361a41
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -0,0 +1,154 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_
+#define VPX_VPX_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+#include <string.h>
+
+#include "./vpx_config.h"
+
+static INLINE void storeu_int32(void *dst, int32_t v) {
+  memcpy(dst, &v, sizeof(v));
+}
+
+static INLINE int32_t loadu_int32(const void *src) {
+  int32_t v;
+  memcpy(&v, src, sizeof(v));
+  return v;
+}
+
+static INLINE __m128i load_unaligned_u32(const void *a) {
+  int val;
+  memcpy(&val, a, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
+  const int val = _mm_cvtsi128_si32(v);
+  memcpy(a, &val, sizeof(val));
+}
+
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
+
+static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
+  return _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+                                 __m128i *const d) {
+  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+                                  const ptrdiff_t stride, __m128i *const d) {
+  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+                                   const ptrdiff_t stride, __m128i *const d) {
+  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+                                       const ptrdiff_t stride) {
+  __m128i ss[4];
+
+  ss[0] = s;
+  ss[1] = _mm_srli_si128(s, 4);
+  ss[2] = _mm_srli_si128(s, 8);
+  ss[3] = _mm_srli_si128(s, 12);
+  store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+                                            uint8_t *const d,
+                                            const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+                                  const ptrdiff_t stride) {
+  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+                                    const ptrdiff_t stride) {
+  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
+#endif  // VPX_VPX_DSP_X86_MEM_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
new file mode 100644
index 0000000000..119fa7cd1a
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+extern const int16_t vpx_rv[];
+
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
+                               int cols, int flimit) {
+  int col;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i f = _mm_set1_epi32(flimit);
+  DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  for (col = 0; col < cols; col += 8) {
+    int row, i;
+    __m128i s = _mm_loadl_epi64((__m128i *)dst);
+    __m128i sum, sumsq_0, sumsq_1;
+    __m128i tmp_0, tmp_1;
+    __m128i below_context = _mm_setzero_si128();
+
+    s = _mm_unpacklo_epi8(s, zero);
+
+    for (i = 0; i < 8; ++i) {
+      _mm_store_si128((__m128i *)above_context + i, s);
+    }
+
+    // sum *= 9
+    sum = _mm_slli_epi16(s, 3);
+    sum = _mm_add_epi16(s, sum);
+
+    // sum^2 * 9 == (sum * 9) * sum
+    tmp_0 = _mm_mullo_epi16(sum, s);
+    tmp_1 = _mm_mulhi_epi16(sum, s);
+
+    sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
+    sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
+
+    // Prime sum/sumsq
+    for (i = 1; i <= 6; ++i) {
+      __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
+      a = _mm_unpacklo_epi8(a, zero);
+      sum = _mm_add_epi16(sum, a);
+      a = _mm_mullo_epi16(a, a);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
+    }
+
+    for (row = 0; row < rows + 8; row++) {
+      const __m128i above =
+          _mm_load_si128((__m128i *)above_context + (row & 7));
+      __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
+      __m128i above_sq, below_sq;
+      __m128i mask_0, mask_1;
+      __m128i multmp_0, multmp_1;
+      __m128i rv;
+      __m128i out;
+
+      this_row = _mm_unpacklo_epi8(this_row, zero);
+
+      if (row + 7 < rows) {
+        // Instead of copying the end context we just stop loading when we get
+        // to the last one.
+        below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
+        below_context = _mm_unpacklo_epi8(below_context, zero);
+      }
+
+      sum = _mm_sub_epi16(sum, above);
+      sum = _mm_add_epi16(sum, below_context);
+
+      // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
+      // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
+      // because x86 does not have unpack with sign extension.
+      above_sq = _mm_mullo_epi16(above, above);
+      sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
+      sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
+
+      below_sq = _mm_mullo_epi16(below_context, below_context);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
+
+      // sumsq * 16 - sumsq == sumsq * 15
+      mask_0 = _mm_slli_epi32(sumsq_0, 4);
+      mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
+      mask_1 = _mm_slli_epi32(sumsq_1, 4);
+      mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
+
+      multmp_0 = _mm_mullo_epi16(sum, sum);
+      multmp_1 = _mm_mulhi_epi16(sum, sum);
+
+      mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
+      mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
+
+      // mask - f gives a negative value when mask < f
+      mask_0 = _mm_sub_epi32(mask_0, f);
+      mask_1 = _mm_sub_epi32(mask_1, f);
+
+      // Shift the sign bit down to create a mask
+      mask_0 = _mm_srai_epi32(mask_0, 31);
+      mask_1 = _mm_srai_epi32(mask_1, 31);
+
+      mask_0 = _mm_packs_epi32(mask_0, mask_1);
+
+      rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
+
+      mask_1 = _mm_add_epi16(rv, sum);
+      mask_1 = _mm_add_epi16(mask_1, this_row);
+      mask_1 = _mm_srai_epi16(mask_1, 4);
+
+      mask_1 = _mm_and_si128(mask_0, mask_1);
+      mask_0 = _mm_andnot_si128(mask_0, this_row);
+      out = _mm_or_si128(mask_1, mask_0);
+
+      _mm_storel_epi64((__m128i *)(dst + row * pitch),
+                       _mm_packus_epi16(out, zero));
+
+      _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
+    }
+
+    dst += 8;
+  }
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
new file mode 100644
index 0000000000..5ff5abc110
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -0,0 +1,254 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        const struct macroblock_plane *const mb_plane,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m256i big_zero = _mm256_setzero_si256();
+  int index;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  *eob_ptr = 0;
+
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_test_all_zeros(all_zero, all_zero)) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if (n_coeffs == 16) return;
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < n_coeffs; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_test_all_zeros(all_zero, all_zero)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      continue;
+    }
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr,
+                              const struct macroblock_plane *const mb_plane,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m256i big_zero = _mm256_setzero_si256();
+  int index;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_test_all_zeros(all_zero, all_zero)) {
+    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs.
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs.
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_test_all_zeros(all_zero, all_zero)) {
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      continue;
+    }
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
+                                      dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
+                                      dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..d4872f6bca
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,290 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+static VPX_FORCE_INLINE void load_b_values_avx2(
+    const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round,
+    __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant,
+    __m256i *shift, int log_scale) {
+  *zbin =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin));
+  *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+    *zbin = _mm256_add_epi16(*zbin, rnd);
+    *zbin = _mm256_srai_epi16(*zbin, log_scale);
+  }
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+  // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
+  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+  *round =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  if (log_scale > 0) {
+    const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+    *round = _mm256_add_epi16(*round, rnd);
+    *round = _mm256_srai_epi16(*round, log_scale);
+  }
+
+  *quant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+  *shift = _mm256_castsi128_si256(
+      _mm_load_si128((const __m128i *)mb_plane->quant_shift));
+  *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // typedef int32_t tran_low_t;
+  const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
+  return _mm256_packs_epi32(coeff1, coeff2);
+#else
+  // typedef int16_t tran_low_t;
+  return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                                     tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // typedef int32_t tran_low_t;
+  __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+  __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+  __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+  _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+  _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+  // typedef int16_t tran_low_t;
+  _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+              tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+              __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+  const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+  if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+    _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+    _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return _mm256_setzero_si256();
+  }
+  {
+    // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+    const __m256i v_tmp_rnd =
+        _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+    const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+    const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+    const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+    const __m256i v_nz_mask =
+        _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+    const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+    const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+    const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+    const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+    const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+    const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+    store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+    store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+    return v_nz_mask;
+  }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+                                                 __m256i v_eobmax,
+                                                 __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i v_iscan = _mm256_permute4x64_epi64(
+      _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+  const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+  const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+  return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+  const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+  __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+  __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const struct macroblock_plane *const mb_plane,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const struct ScanOrder *const scan_order) {
+  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+  __m256i v_eobmax = _mm256_setzero_si256();
+  intptr_t count;
+  const int16_t *iscan = scan_order->iscan;
+
+  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
+                     &v_dequant, &v_quant_shift, 0);
+  // Do DC and first 15 AC.
+  v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+                            &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+  v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+  v_round = _mm256_unpackhi_epi64(v_round, v_round);
+  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+  for (count = n_coeffs - 16; count > 0; count -= 16) {
+    coeff_ptr += 16;
+    qcoeff_ptr += 16;
+    dqcoeff_ptr += 16;
+    iscan += 16;
+    v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+                              &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+    v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+  }
+
+  *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+    __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+    __m256i *v_quant_shift, __m256i *v_eobmax) {
+  const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+  const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+  const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+  if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+    _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+    _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+    return *v_eobmax;
+  }
+  {
+    // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+    const __m256i v_tmp_rnd =
+        _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+    //  tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+    //                 quant_shift_ptr[rc != 0]) >> 15);
+    const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+    const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+    const __m256i v_tmp32_hi =
+        _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+    const __m256i v_tmp32_lo =
+        _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+    const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+    const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+    const __m256i v_sign_lo =
+        _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+    const __m256i v_sign_hi =
+        _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
+    const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+    const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+    const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+        _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+    const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+        _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
+    const __m256i v_nz_mask =
+        _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+    store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+    store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+                            dqcoeff_ptr);
+#endif
+
+    return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+  }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
+                               const struct macroblock_plane *const mb_plane,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const struct ScanOrder *const scan_order) {
+  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+  __m256i v_eobmax = _mm256_setzero_si256();
+  intptr_t count;
+  const int16_t *iscan = scan_order->iscan;
+
+  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
+                     &v_dequant, &v_quant_shift, 1);
+
+  // Do DC and first 15 AC.
+  v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+                                 &v_quant, &v_dequant, &v_round, &v_zbin,
+                                 &v_quant_shift, &v_eobmax);
+
+  v_round = _mm256_unpackhi_epi64(v_round, v_round);
+  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+  for (count = (32 * 32) - 16; count > 0; count -= 16) {
+    coeff_ptr += 16;
+    qcoeff_ptr += 16;
+    dqcoeff_ptr += 16;
+    iscan += 16;
+    v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+                                   &v_quant, &v_dequant, &v_round, &v_zbin,
+                                   &v_quant_shift, &v_eobmax);
+  }
+
+  *eob_ptr = accumulate_eob256(v_eobmax);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index 01c41291be..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,544 +0,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  vzeroupper
-
-  ; If we can skip this block, then just zero the output
-  cmp                         skipmp, 0
-  jne .blank
-
-%ifnidn %1, b_32x32
-
-  ; Special case for ncoeff == 16, as it is frequent and we can save on
-  ; not setting up a loop.
-  cmp                       ncoeffmp, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Special case of ncoeff == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
-  movifnidn                   coeffq, coeffmp
-  movifnidn                    zbinq, zbinmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-
-  ; Get DC and first 15 AC coeffs - in this special case, that is all.
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
-  mova                            m9, [coeffq]
-  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
-  mova                           m10, [coeffq+32]
-  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-%else
-  mova                            m9, [coeffq]             ; m9 = c[i]
-  mova                           m10, [coeffq+16]          ; m10 = c[i]
-%endif
-
-  mov                             r0, eobmp                ; Output pointer
-  mov                             r1, qcoeffmp             ; Output pointer
-  mov                             r2, dqcoeffmp            ; Output pointer
-
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  pcmpeqw                         m4, m4                   ; All word lanes -1
-  paddw                           m0, m4                   ; m0 = zbin - 1
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, we just write zeros
-  ; to the outputs and we are done.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .single_nonzero
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova                       [r1   ], ymm5
-  mova                       [r1+32], ymm5
-  mova                       [r2   ], ymm5
-  mova                       [r2+32], ymm5
-%else
-  mova                          [r1], ymm5
-  mova                          [r2], ymm5
-%endif
-  mov                           [r0], word 0
-
-  vzeroupper
-  RET
-
-.single_nonzero:
-
-  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
-  movifnidn                       r4, roundmp
-  movifnidn                       r5, quantmp
-  mov                             r3, dequantmp
-  mov                             r6, shiftmp
-  mova                            m1, [r4]              ; m1 = round
-  mova                            m2, [r5]              ; m2 = quant
-  mova                            m3, [r3]              ; m3 = dequant
-  mova                            m4, [r6]              ; m4 = shift
-
-  mov                             r3, iscanmp
-
-  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                  [qcoeffq   ], m11
-  mova                  [qcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                  [qcoeffq+32], m11
-  mova                  [qcoeffq+48], m6
-%else
-  mova                  [qcoeffq   ], m8
-  mova                  [qcoeffq+16], m13
-%endif
-
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                 [dqcoeffq   ], m11
-  mova                 [dqcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                 [dqcoeffq+32], m11
-  mova                 [dqcoeffq+48], m6
-%else
-  mova                 [dqcoeffq   ], m8
-  mova                 [dqcoeffq+16], m13
-%endif
-
-  mova                            m6, [iscanq]            ; m6 = scan[i]
-  mova                           m11, [iscanq+16]         ; m11 = scan[i]
-
-  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
-  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
-  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
-  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
-  pandn                           m8,  m8,  m6            ; m8 = max(eob)
-  pandn                          m13, m13, m11            ; m13 = max(eob)
-  pmaxsw                          m8,  m8, m13
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                         [eobq], ax
-
-  vzeroupper
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of ncoeff != 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  ; Actual quantization loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-  mova                            m3, [r2]                 ; m3 = dequant
-  pcmpeqw                         m4, m4                   ; All lanes -1
-%ifidn %1, b_32x32
-  psubw                           m0, m4
-  psubw                           m1, m4
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  paddw                           m0, m4                   ; m0 = m0 + 1
-
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .first_nonzero
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova        [qcoeffq+ncoeffq*4   ], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4   ], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova           [qcoeffq+ncoeffq*2], ymm5
-  mova          [dqcoeffq+ncoeffq*2], ymm5
-%endif
-
-  add                        ncoeffq, mmsize
-
-  punpckhqdq                      m1, m1
-  punpckhqdq                      m2, m2
-  punpckhqdq                      m3, m3
-  punpckhqdq                      m4, m4
-  pxor                            m8, m8
-
-  jmp .ac_only_loop
-
-.first_nonzero:
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
-
-  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                    ; m8 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-
-.ac_only_loop:
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip this itertion.
-  ; And just write zeros as the result would be.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .rest_nonzero
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
-%endif
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-
-.rest_nonzero:
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
-
-  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                    ; m14 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-
-  ; Skip-block, i.e. just write all zeroes
-.blank:
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-
-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-
-  neg                        ncoeffq
-  pxor                            m7, m7
-
-.blank_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
-  mova       [dqcoeffq+ncoeffq*4+32], ymm7
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*4+32], ymm7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
-%endif
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-
-  mov                         [eobq], word 0
-
-  vzeroupper
-  RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
-
-END
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
index 0580a7bd7b..64838eaa7d 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -8,216 +8,106 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <emmintrin.h>
 #include <xmmintrin.h>
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vp9/common/vp9_scan.h"
 
 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
-  __m128i zero;
-  (void)scan_ptr;
+                         const struct macroblock_plane *const mb_plane,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  int index = 16;
+  const int16_t *iscan = scan_order->iscan;
 
-  coeff_ptr += n_coeffs;
-  iscan_ptr += n_coeffs;
-  qcoeff_ptr += n_coeffs;
-  dqcoeff_ptr += n_coeffs;
-  n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
-  if (!skip_block) {
-    __m128i eob;
-    __m128i zbin;
-    __m128i round, quant, dequant, shift;
-    {
-      __m128i coeff0, coeff1;
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
 
-      // Setup global values
-      {
-        __m128i pw_1;
-        zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        pw_1 = _mm_set1_epi16(1);
-        zbin = _mm_sub_epi16(zbin, pw_1);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-      }
+  // Setup global values.
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
 
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-        // Do DC and first 15 AC
-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
 
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        shift = _mm_unpackhi_epi64(shift, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  calculate_qcoeff(&qcoeff0, round, quant, shift);
 
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+  shift = _mm_unpackhi_epi64(shift, shift);
 
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+  calculate_qcoeff(&qcoeff1, round, quant, shift);
 
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+  // Reinsert signs
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
+  // Mask out zbin threshold coeffs
+  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
-    }
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
 
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
 
-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
 
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
 
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
 
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
 
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
-    }
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
 
-    // Accumulate EOB
-    {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
-    }
-  } else {
-    do {
-      store_tran_low(zero, dqcoeff_ptr + n_coeffs);
-      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
-      store_tran_low(zero, qcoeff_ptr + n_coeffs);
-      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+
+    index += 16;
   }
+
+  *eob_ptr = accumulate_eob(eob);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
new file mode 100644
index 0000000000..82c755a0cf
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h
@@ -0,0 +1,126 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
+#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_block.h"
+
+static INLINE void load_b_values(const struct macroblock_plane *const mb_plane,
+                                 __m128i *zbin, __m128i *round, __m128i *quant,
+                                 const int16_t *dequant_ptr, __m128i *dequant,
+                                 __m128i *shift) {
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+  *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+}
+
+static INLINE void load_b_values32x32(
+    const struct macroblock_plane *const mb_plane, __m128i *zbin,
+    __m128i *round, __m128i *quant, const int16_t *dequant_ptr,
+    __m128i *dequant, __m128i *shift) {
+  const __m128i one = _mm_set1_epi16(1);
+  // The 32x32 halves zbin and round.
+  *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin);
+  // Shift with rounding.
+  *zbin = _mm_add_epi16(*zbin, one);
+  *zbin = _mm_srli_epi16(*zbin, 1);
+  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+  // it is a strict "greater" comparison.
+  *zbin = _mm_sub_epi16(*zbin, one);
+
+  *round = _mm_load_si128((const __m128i *)mb_plane->round);
+  *round = _mm_add_epi16(*round, one);
+  *round = _mm_srli_epi16(*round, 1);
+
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift);
+  // I suspect this is not technically OK because quant_shift can be up
+  // to 1 << 16 and shifting up again will outrange that, but the test is not
+  // comprehensive enough to catch that and "it's been that way forever"
+  *shift = _mm_slli_epi16(*shift, 1);
+}
+
+static INLINE void load_fp_values(const struct macroblock_plane *mb_plane,
+                                  __m128i *round, __m128i *quant,
+                                  const int16_t *dequant_ptr,
+                                  __m128i *dequant) {
+  *round = _mm_load_si128((const __m128i *)mb_plane->round_fp);
+  *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
+// With ssse3 and later abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi16(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+                                    const __m128i quant, const __m128i shift) {
+  __m128i tmp, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               tran_low_t *dqcoeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+  const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+#else
+  const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// Scan 16 values for eob reference in scan.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+                                   const int16_t *scan, const int index,
+                                   const __m128i zero) {
+  const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+  const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+  __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
+  __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
+  __m128i eob0, eob1;
+  eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+  eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+  return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  __m128i eob_shuffled;
+  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+  eob = _mm_max_epi16(eob, eob_shuffled);
+  return _mm_extract_epi16(eob, 1);
+}
+
+#endif  // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..2c6d851a16
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,228 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_block.h"
+
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const struct macroblock_plane *const mb_plane,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  int index = 16;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  calculate_qcoeff(&qcoeff0, round, quant, shift);
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+  shift = _mm_unpackhi_epi64(shift, shift);
+  calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+  // Reinsert signs
+  qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+  qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+  // Mask out zbin threshold coeffs
+  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+  store_tran_low(qcoeff0, qcoeff_ptr);
+  store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+  // AC only loop.
+  while (index < n_coeffs) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
+                                const struct macroblock_plane *const mb_plane,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const struct ScanOrder *const scan_order) {
+  const __m128i zero = _mm_setzero_si128();
+  int index;
+  const int16_t *iscan = scan_order->iscan;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i eob = zero, eob0;
+
+  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
+                     &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_tran_low(coeff_ptr);
+  coeff1 = load_tran_low(coeff_ptr + 8);
+
+  qcoeff0 = _mm_abs_epi16(coeff0);
+  qcoeff1 = _mm_abs_epi16(coeff1);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+#endif  // CONFIG_HIGHBITDEPTH
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs.
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Mask out zbin threshold coeffs.
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr);
+    store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+  }
+
+  // AC only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = load_tran_low(coeff_ptr + index);
+    coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+    qcoeff0 = _mm_abs_epi16(coeff0);
+    qcoeff1 = _mm_abs_epi16(coeff1);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+    if (_mm_movemask_epi8(all_zero) == 0) {
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      continue;
+    }
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + index);
+    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
+                                      dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
+                                      dqcoeff_ptr + 8 + index);
+
+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
new file mode 100644
index 0000000000..e8d2a05771
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
+#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff,
+                                                     const __m128i dequant,
+                                                     const __m128i zero,
+                                                     tran_low_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+  const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  // "Divide" by 2.
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1);
+
+  dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+#else
+  _mm_store_si128((__m128i *)(dqcoeff),
+                  _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+
+#endif  // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
deleted file mode 100644
index ca21539173..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,346 +0,0 @@
-;
-;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-; TODO(yunqingwang)fix quantize_b code for skip=1 case.
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [r2q]                ; m3 = dequant
-  psubw                           m0, [pw_1]
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-%else
-  lea                         coeffq, [  coeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-%endif
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [  coeffq+ncoeffq*4+16]
-  mova                           m10, [  coeffq+ncoeffq*4+32]
-  packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                           m11, m8
-  mova                            m6, m8
-  pcmpgtw                         m5, m8
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m8
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                            m11, m8
-  mova                            m6, m8
-  pcmpgtw                         m5, m8
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m8
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [  coeffq+ncoeffq*4+16]
-  mova                           m10, [  coeffq+ncoeffq*4+32]
-  packssdw                       m10, [  coeffq+ncoeffq*4+48]
-%else
-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
-%endif
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pxor                           m11, m11
-  mova                           m11, m14
-  mova                            m6, m14
-  pcmpgtw                         m5, m14
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m14
-  mova        [qcoeffq+ncoeffq*2+16], m13
-%endif
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                           m11, m14
-  mova                            m6, m14
-  pcmpgtw                         m5, m14
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m14
-  mova       [dqcoeffq+ncoeffq*2+16], m13
-%endif
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova        [qcoeffq+ncoeffq*4+ 0], m5
-  mova        [qcoeffq+ncoeffq*4+16], m5
-  mova        [qcoeffq+ncoeffq*4+32], m5
-  mova        [qcoeffq+ncoeffq*4+48], m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m5
-  mova       [dqcoeffq+ncoeffq*4+16], m5
-  mova       [dqcoeffq+ncoeffq*4+32], m5
-  mova       [dqcoeffq+ncoeffq*4+48], m5
-%else
-  mova        [qcoeffq+ncoeffq*2+ 0], m5
-  mova        [qcoeffq+ncoeffq*2+16], m5
-  mova       [dqcoeffq+ncoeffq*2+ 0], m5
-  mova       [dqcoeffq+ncoeffq*2+16], m5
-%endif
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       [dqcoeffq+ncoeffq*4+ 0], m7
-  mova       [dqcoeffq+ncoeffq*4+16], m7
-  mova       [dqcoeffq+ncoeffq*4+32], m7
-  mova       [dqcoeffq+ncoeffq*4+48], m7
-  mova        [qcoeffq+ncoeffq*4+ 0], m7
-  mova        [qcoeffq+ncoeffq*4+16], m7
-  mova        [qcoeffq+ncoeffq*4+32], m7
-  mova        [qcoeffq+ncoeffq*4+48], m7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-%endif
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 7
-QUANTIZE_FN b_32x32, 7
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
index 962b8fb11a..cf7111983b 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -11,154 +11,174 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 32; i++) {
-    // load src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-  {
-    __m128i sum;
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
+// Note with sums[4] some versions of Visual Studio may fail due to parameter
+// alignment, though the functions should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
+// aligned
+static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+                                uint32_t sad_array[4]) {
+  const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+  const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+  const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+  const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+                                    _mm256_extractf128_si256(t2, 1));
+  _mm_storeu_si128((__m128i *)sad_array, sum);
 }
 
-void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
-  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
-  __m256i ref3_reg, ref3next_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
+static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *const ref_array[4],
+                                   int ref_stride, int h,
+                                   uint32_t sad_array[4]) {
   int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  const uint8_t *refs[4];
+  __m256i sums[4];
 
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 64; i++) {
-    // load 64 bytes from src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
-    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
-    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
-    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
+  refs[0] = ref_array[0];
+  refs[1] = ref_array[1];
+  refs[2] = ref_array[2];
+  refs[3] = ref_array[3];
+  sums[0] = _mm256_setzero_si256();
+  sums[1] = _mm256_setzero_si256();
+  sums[2] = _mm256_setzero_si256();
+  sums[3] = _mm256_setzero_si256();
 
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
+  for (i = 0; i < h; i++) {
+    __m256i r[4];
+
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
+    r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // sum of the absolute differences between every ref[] to src
+    r[0] = _mm256_sad_epu8(r[0], s);
+    r[1] = _mm256_sad_epu8(r[1], s);
+    r[2] = _mm256_sad_epu8(r[2], s);
+    r[3] = _mm256_sad_epu8(r[3], s);
+
+    // sum every ref[]
+    sums[0] = _mm256_add_epi32(sums[0], r[0]);
+    sums[1] = _mm256_add_epi32(sums[1], r[1]);
+    sums[2] = _mm256_add_epi32(sums[2], r[2]);
+    sums[3] = _mm256_add_epi32(sums[3], r[3]);
+
+    src_ptr += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
   }
-  {
-    __m128i sum;
 
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
+  calc_final_4(sums, sad_array);
 }
+
+static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *const ref_array[4],
+                                   int ref_stride, int h,
+                                   uint32_t sad_array[4]) {
+  __m256i sums[4];
+  int i;
+  const uint8_t *refs[4];
+
+  refs[0] = ref_array[0];
+  refs[1] = ref_array[1];
+  refs[2] = ref_array[2];
+  refs[3] = ref_array[3];
+  sums[0] = _mm256_setzero_si256();
+  sums[1] = _mm256_setzero_si256();
+  sums[2] = _mm256_setzero_si256();
+  sums[3] = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i++) {
+    __m256i r_lo[4], r_hi[4];
+    // load 64 bytes from src and all ref[]
+    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
+    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
+    r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+    r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
+    r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+    r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32));
+    r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+    r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32));
+    r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+    r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
+
+    // sum of the absolute differences between every ref[] to src
+    r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
+    r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
+    r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
+    r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo);
+    r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi);
+    r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi);
+    r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
+    r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
+
+    // sum every ref[]
+    sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
+    sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
+    sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
+    sums[3] = _mm256_add_epi32(sums[3], r_lo[3]);
+    sums[0] = _mm256_add_epi32(sums[0], r_hi[0]);
+    sums[1] = _mm256_add_epi32(sums[1], r_hi[1]);
+    sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
+    sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
+
+    src_ptr += src_stride;
+    refs[0] += ref_stride;
+    refs[1] += ref_stride;
+    refs[2] += ref_stride;
+    refs[3] += ref_stride;
+  }
+
+  calc_final_4(sums, sad_array);
+}
+
+#define SAD64_H(h)                                                         \
+  void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
+                               const uint8_t *const ref_array[4],          \
+                               int ref_stride, uint32_t sad_array[4]) {    \
+    sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+  }
+
+#define SAD32_H(h)                                                         \
+  void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride,         \
+                               const uint8_t *const ref_array[4],          \
+                               int ref_stride, uint32_t sad_array[4]) {    \
+    sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \
+  }
+
+SAD64_H(64)
+SAD32_H(32)
+
+#define SADS64_H(h)                                                           \
+  void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref_array[4],       \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
+                    ((h) >> 1), sad_array);                                   \
+    sad_array[0] <<= 1;                                                       \
+    sad_array[1] <<= 1;                                                       \
+    sad_array[2] <<= 1;                                                       \
+    sad_array[3] <<= 1;                                                       \
+  }
+
+#define SADS32_H(h)                                                           \
+  void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref_array[4],       \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride,           \
+                    ((h) >> 1), sad_array);                                   \
+    sad_array[0] <<= 1;                                                       \
+    sad_array[1] <<= 1;                                                       \
+    sad_array[2] <<= 1;                                                       \
+    sad_array[3] <<= 1;                                                       \
+  }
+
+SADS64_H(64)
+SADS64_H(32)
+
+SADS32_H(64)
+SADS32_H(32)
+SADS32_H(16)
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
new file mode 100644
index 0000000000..cc36cae611
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>  // AVX512
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE void sad64xhx4d_avx512(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *const ref_array[4],
+                                     int ref_stride, int h,
+                                     uint32_t sad_array[4]) {
+  __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+  __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  __m512i sum_mlow, sum_mhigh;
+  int i;
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref_array[0];
+  ref1 = ref_array[1];
+  ref2 = ref_array[2];
+  ref3 = ref_array[3];
+  sum_ref0 = _mm512_set1_epi16(0);
+  sum_ref1 = _mm512_set1_epi16(0);
+  sum_ref2 = _mm512_set1_epi16(0);
+  sum_ref3 = _mm512_set1_epi16(0);
+  for (i = 0; i < h; i++) {
+    // load src and all ref[]
+    src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);
+    ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
+    ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
+    ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
+    ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
+    // sum of the absolute differences between every ref[] to src
+    ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
+    ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
+    // sum every ref[]
+    sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
+    sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
+
+    src_ptr += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+  {
+    __m256i sum256;
+    __m128i sum128;
+    // in sum_ref[] the result is saved in the first 4 bytes
+    // the other 4 bytes are zeroed.
+    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+    sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
+    sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4);
+
+    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+    sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
+    sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
+
+    // merge every 64 bit from each sum_ref[]
+    sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit
+    sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow),
+                              _mm512_extracti32x8_epi32(sum_mlow, 1));
+    sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
+                           _mm256_extractf128_si256(sum256, 1));
+
+    _mm_storeu_si128((__m128i *)(sad_array), sum128);
+  }
+}
+
+void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
+                            const uint8_t *const ref_array[4], int ref_stride,
+                            uint32_t sad_array[4]) {
+  sad64xhx4d_avx512(src, src_stride, ref_array, ref_stride, 64, sad_array);
+}
+
+#define SADS64_H(h)                                                          \
+  void vpx_sad_skip_64x##h##x4d_avx512(                                      \
+      const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+      int ref_stride, uint32_t sad_array[4]) {                               \
+    sad64xhx4d_avx512(src, 2 * src_stride, ref_array, 2 * ref_stride,        \
+                      ((h) >> 1), sad_array);                                \
+    sad_array[0] <<= 1;                                                      \
+    sad_array[1] <<= 1;                                                      \
+    sad_array[2] <<= 1;                                                      \
+    sad_array[3] <<= 1;                                                      \
+  }
+
+SADS64_H(64)
+SADS64_H(32)
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
index 3f6e55ce9a..ed4ea3ef9b 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -179,13 +179,27 @@ SECTION .text
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
+%macro SADNXN4D 2-3 0
+%if %3 == 1  ; skip rows
+%if UNIX64
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+                              res, ref2, ref3, ref4
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+                              ref2, ref3, ref4
+%endif
+%else  ; normal sad
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                               res, ref2, ref3, ref4
 %else
 cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                               ref2, ref3, ref4
+%endif
+%endif
+%if %3 == 1
+  lea          src_strided, [2*src_strided]
+  lea          ref_strided, [2*ref_strided]
 %endif
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
@@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   mov                ref1q, [ref1q+gprsize*0]
 
   PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
+%if %3 == 1  ; downsample number of rows by 2
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
 %endrep
+%undef num_rep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
 
 %if %1 > 4
@@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   punpckhqdq            m5, m7
   movifnidn             r4, r4mp
   paddd                 m4, m5
+%if %3 == 1
+  pslld                 m4, 1
+%endif
   movu                [r4], m4
   RET
 %else
   movifnidn             r4, r4mp
   pshufd            m6, m6, 0x08
   pshufd            m7, m7, 0x08
+%if %3 == 1
+  pslld                 m6, 1
+  pslld                 m7, 1
+%endif
   movq              [r4+0], m6
   movq              [r4+8], m7
   RET
@@ -237,3 +264,15 @@ SADNXN4D  8,  8
 SADNXN4D  8,  4
 SADNXN4D  4,  8
 SADNXN4D  4,  4
+
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16,  8, 1
+SADNXN4D  8, 16, 1
+SADNXN4D  8,  8, 1
+SADNXN4D  4,  8, 1
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
index d944134305..e00494d766 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -11,88 +11,120 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  for (i = 0; i < h; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i, res;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  const int ref2_stride = ref_stride << 1;
+  const int src2_stride = src_stride << 1;
+  const int max = h >> 1;
+  for (i = 0; i < max; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref2_stride;
+    src_ptr += src2_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  res = _mm_cvtsi128_si32(sum_sad128);
+  return res;
+}
+
 #define FSAD64_H(h)                                                           \
   unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS64_H(h)                                                          \
+  unsigned int vpx_sad_skip_64x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD32_H(h)                                                           \
   unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
                                     const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS32_H(h)                                                          \
+  unsigned int vpx_sad_skip_32x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
   }
 
 #define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
+  FSAD64_H(64)  \
+  FSAD64_H(32)  \
+  FSADS64_H(64) \
+  FSADS64_H(32)
 
 #define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
+  FSAD32_H(64)  \
+  FSAD32_H(32)  \
+  FSAD32_H(16)  \
+  FSADS32_H(64) \
+  FSADS32_H(32) \
+  FSADS32_H(16)
 
-FSAD64;
-FSAD32;
+FSAD64
+FSAD32
 
 #undef FSAD64
 #undef FSAD32
 #undef FSAD64_H
 #undef FSAD32_H
+#undef FSADS64_H
+#undef FSADS32_H
 
 #define FSADAVG64_H(h)                                                        \
   unsigned int vpx_sad64x##h##_avg_avx2(                                      \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
       int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
+    int i;                                                                    \
     __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
     __m256i sum_sad = _mm256_setzero_si256();                                 \
     __m256i sum_sad_h;                                                        \
@@ -118,15 +150,14 @@ FSAD32;
     sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
     sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
     sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
   }
 
 #define FSADAVG32_H(h)                                                        \
   unsigned int vpx_sad32x##h##_avg_avx2(                                      \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
       int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
+    int i;                                                                    \
     __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
     __m256i sum_sad = _mm256_setzero_si256();                                 \
     __m256i sum_sad_h;                                                        \
@@ -156,21 +187,20 @@ FSAD32;
     sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
     sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
     sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    return res;                                                               \
+    return (unsigned int)_mm_cvtsi128_si32(sum_sad128);                       \
   }
 
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
+#define FSADAVG64 \
+  FSADAVG64_H(64) \
+  FSADAVG64_H(32)
 
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
+#define FSADAVG32 \
+  FSADAVG32_H(64) \
+  FSADAVG32_H(32) \
+  FSADAVG32_H(16)
 
-FSADAVG64;
-FSADAVG32;
+FSADAVG64
+FSADAVG32
 
 #undef FSADAVG64
 #undef FSADAVG32
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c
new file mode 100644
index 0000000000..38bd3be52b
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2025 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+static INLINE unsigned int sad64xh_avx512(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int h) {
+  int i, res;
+  __m512i sad_reg, ref_reg;
+  __m512i sum_sad = _mm512_setzero_si512();
+  for (i = 0; i < h; i++) {
+    ref_reg = _mm512_loadu_si512((const __m512i *)ref_ptr);
+    sad_reg =
+        _mm512_sad_epu8(ref_reg, _mm512_loadu_si512((__m512 const *)src_ptr));
+    sum_sad = _mm512_add_epi32(sum_sad, sad_reg);
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+  res = _mm512_reduce_add_epi32(sum_sad);
+  return res;
+}
+
+#define FSAD64_H(h)                                                           \
+  unsigned int vpx_sad64x##h##_avx512(const uint8_t *src_ptr, int src_stride, \
+                                      const uint8_t *ref_ptr,                 \
+                                      int ref_stride) {                       \
+    return sad64xh_avx512(src_ptr, src_stride, ref_ptr, ref_stride, h);       \
+  }
+
+#define FSADS64_H(h)                                                  \
+  unsigned int vpx_sad_skip_64x##h##_avx512(                          \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+      int ref_stride) {                                               \
+    return 2 * sad64xh_avx512(src_ptr, src_stride * 2, ref_ptr,       \
+                              ref_stride * 2, h / 2);                 \
+  }
+
+#define FSAD64  \
+  FSAD64_H(64)  \
+  FSAD64_H(32)  \
+  FSADS64_H(64) \
+  FSADS64_H(32)
+
+FSAD64
+
+#undef FSAD64
+#undef FSAD64_H
+#undef FSADS64_H
+
+#define FSADAVG64_H(h)                                                         \
+  unsigned int vpx_sad64x##h##_avg_avx512(                                     \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, const uint8_t *second_pred) {                            \
+    int i;                                                                     \
+    __m512i sad_reg, ref_reg;                                                  \
+    __m512i sum_sad = _mm512_setzero_si512();                                  \
+    for (i = 0; i < h; i++) {                                                  \
+      ref_reg = _mm512_loadu_si512((const __m512i *)ref_ptr);                  \
+      ref_reg = _mm512_avg_epu8(                                               \
+          ref_reg, _mm512_loadu_si512((const __m512i *)second_pred));          \
+      sad_reg = _mm512_sad_epu8(ref_reg,                                       \
+                                _mm512_loadu_si512((const __m512i *)src_ptr)); \
+      sum_sad = _mm512_add_epi32(sum_sad, sad_reg);                            \
+      ref_ptr += ref_stride;                                                   \
+      src_ptr += src_stride;                                                   \
+      second_pred += 64;                                                       \
+    }                                                                          \
+    return (unsigned int)_mm512_reduce_add_epi32(sum_sad);                     \
+  }
+
+#define FSADAVG64 \
+  FSADAVG64_H(64) \
+  FSADAVG64_H(32)
+
+FSADAVG64
+
+#undef FSADAVG64
+#undef FSADAVG64_H
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
index 1ec906c236..627e463bf8 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -12,30 +12,48 @@
 
 SECTION .text
 
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
 %macro SAD_FN 4
-%if %4 == 0
+%if %4 == 0 ; normal sad
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
-%else ; avg
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+                            src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
 %if %3 == 5
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
+cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
 %define n_rowsd r7d
 %else ; x86-32
 %define n_rowsd dword r0m
 %endif ; x86-32/64
 %endif ; %3 == 5/7
-%endif ; avg/sad
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea           src_strided, [src_strided*2]
+lea           ref_strided, [ref_strided*2]
+%endif ; %4 skip
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
 %if %3 == 7
@@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD64XN 1-2 0
   SAD_FN 64, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/2
+%else
   mov              n_rowsd, %1
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
 SAD64XN 64, 1 ; sad64x64_avg_sse2
 SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN  64, 2  ; sad64x64_skip_sse2
+SAD64XN  32, 2  ; sad64x32_skip_sse2
 
 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD32XN 1-2 0
   SAD_FN 32, %1, 5, %2
+%if %2 == 2
+  mov              n_rowsd, %1/4
+%else
   mov              n_rowsd, %1/2
+%endif
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2
 SAD32XN 64, 1 ; sad32x64_avg_sse2
 SAD32XN 32, 1 ; sad32x32_avg_sse2
 SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
 
 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1-2 0
   SAD_FN 16, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -177,12 +221,19 @@ SAD16XN  8 ; sad16x8_sse2
 SAD16XN 32, 1 ; sad16x32_avg_sse2
 SAD16XN 16, 1 ; sad16x16_avg_sse2
 SAD16XN  8, 1 ; sad16x8_avg_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN  8, 2 ; sad16x8_skip_sse2
 
 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD8XN 1-2 0
   SAD_FN 8, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -210,6 +261,9 @@ SAD16XN  8, 1 ; sad16x8_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -221,12 +275,18 @@ SAD8XN  4 ; sad8x4_sse2
 SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN  8, 2 ; sad8x8_skip_sse2
 
 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
 ;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
+%if %2 == 2
+  mov              n_rowsd, %1/8
+%else
   mov              n_rowsd, %1/4
+%endif
   pxor                  m0, m0
 
 .loop:
@@ -257,6 +317,9 @@ SAD8XN  4, 1 ; sad8x4_avg_sse2
 
   movhlps               m1, m0
   paddd                 m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+  pslld                 m0, 1
+%endif
   movd                 eax, m0
   RET
 %endmacro
@@ -266,3 +329,4 @@ SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
 SAD4XN  4, 1 ; sad4x4_avg_sse
+SAD4XN  8, 2 ; sad4x8_skip_sse
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index 18279bdb9d..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,374 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-;void int vpx_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad16x16x3_sse3) PRIVATE
-sym(vpx_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad16x8x3_sse3) PRIVATE
-sym(vpx_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad8x16x3_sse3) PRIVATE
-sym(vpx_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad8x8x3_sse3) PRIVATE
-sym(vpx_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad4x4x3_sse3) PRIVATE
-sym(vpx_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index bc67447971..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,359 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
-    mov             rdi,        arg(4)           ;Results
-    pxor            xmm0, xmm0
-    movdqa          xmm2, xmm1
-    punpcklwd       xmm1, xmm0
-    punpckhwd       xmm2, xmm0
-
-    movdqa          [rdi],    xmm1
-    movdqa          [rdi + 16],    xmm2
-%endmacro
-
-;void vpx_sad16x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-global sym(vpx_sad16x16x8_sse4_1) PRIVATE
-sym(vpx_sad16x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vpx_sad16x8x8_sse4_1) PRIVATE
-sym(vpx_sad16x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vpx_sad8x8x8_sse4_1) PRIVATE
-sym(vpx_sad8x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vpx_sad8x16x8_sse4_1) PRIVATE
-sym(vpx_sad8x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-global sym(vpx_sad4x4x8_sse4_1) PRIVATE
-sym(vpx_sad4x4x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_4X2X8 1
-    PROCESS_4X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index 49f204fa04..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,370 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-;void int vpx_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad16x16x3_ssse3) PRIVATE
-sym(vpx_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
-        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
-        call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void int vpx_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-global sym(vpx_sad16x8x3_ssse3) PRIVATE
-sym(vpx_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
-        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
-        call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c
new file mode 100644
index 0000000000..dfe45b6115
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c
@@ -0,0 +1,368 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b);
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+  int64_t sum;
+  __m256i zero = _mm256_setzero_si256();
+  const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+  const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+  const __m256i sum0_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+  const __m256i sum1_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+  int64_t sum;
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+  return sum;
+}
+#endif
+
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m256i *sum) {
+  const __m128i v_a0 = load_unaligned_u32(a);
+  const __m128i v_a1 = load_unaligned_u32(a + a_stride);
+  const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2);
+  const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3);
+  const __m128i v_b0 = load_unaligned_u32(b);
+  const __m128i v_b1 = load_unaligned_u32(b + b_stride);
+  const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2);
+  const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3);
+  const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+                                             _mm_unpacklo_epi32(v_a2, v_a3));
+  const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+                                             _mm_unpacklo_epi32(v_b2, v_b3));
+  const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+  const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m256i *sum) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+  const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+  const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int width, int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  __m256i sum = _mm256_setzero_si256();
+  __m256i zero = _mm256_setzero_si256();
+  switch (width) {
+    case 4:
+      do {
+        sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 2;
+        b += b_stride << 2;
+        y += 4;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 8:
+      do {
+        sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 16:
+      do {
+        const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
+        const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride));
+        const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
+        const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride));
+        const __m256i v_a =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+        const __m256i v_b =
+            _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+        const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+        const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+        const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+        const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+        const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+        const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+        const __m256i temp =
+            _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+                             _mm256_madd_epi16(v_bsub, v_bsub));
+        sum = _mm256_add_epi32(sum, temp);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 32:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 64:
+      do {
+        sse_w32_avx2(&sum, a, b);
+        sse_w32_avx2(&sum, a + 32, b + 32);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    default:
+      if ((width & 0x07) == 0) {
+        do {
+          int i = 0;
+          do {
+            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+            i += 8;
+          } while (i < width);
+          a += a_stride << 1;
+          b += b_stride << 1;
+          y += 2;
+        } while (y < height);
+      } else {
+        do {
+          int i = 0;
+          do {
+            const uint8_t *a2;
+            const uint8_t *b2;
+            sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+            a2 = a + i + (a_stride << 1);
+            b2 = b + i + (b_stride << 1);
+            sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
+            i += 8;
+          } while (i + 4 < width);
+          sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
+          a += a_stride << 2;
+          b += b_stride << 2;
+          y += 4;
+        } while (y < height);
+      }
+      sse = summary_all_avx2(&sum);
+      break;
+  }
+
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+                                       const uint16_t *b) {
+  const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
+                                        int a_stride, const uint16_t *b,
+                                        int b_stride) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+  const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2));
+  const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3));
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+  const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2));
+  const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3));
+  const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1);
+  const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3);
+  const __m256i v_a_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
+  const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1);
+  const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3);
+  const __m256i v_b_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
+                                        int a_stride, const uint16_t *b,
+                                        int b_stride) {
+  const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride));
+  const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a);
+  const __m256i v_a_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1);
+  const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride));
+  const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b);
+  const __m256i v_b_w =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+                            int b_stride, int width, int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m256i sum = _mm256_setzero_si256();
+  switch (width) {
+    case 4:
+      do {
+        highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
+        a += a_stride << 2;
+        b += b_stride << 2;
+        y += 4;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 8:
+      do {
+        highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 16:
+      do {
+        highbd_sse_w16_avx2(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_avx2(&sum);
+      break;
+    case 32:
+      do {
+        int l = 0;
+        __m256i sum32 = _mm256_setzero_si256();
+        do {
+          highbd_sse_w16_avx2(&sum32, a, b);
+          highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 64 && l < (height - y));
+        summary_32_avx2(&sum32, &sum);
+        y += 64;
+      } while (y < height);
+      sse = summary_4x64_avx2(sum);
+      break;
+    case 64:
+      do {
+        int l = 0;
+        __m256i sum32 = _mm256_setzero_si256();
+        do {
+          highbd_sse_w16_avx2(&sum32, a, b);
+          highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+          highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+          highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 32 && l < (height - y));
+        summary_32_avx2(&sum32, &sum);
+        y += 32;
+      } while (y < height);
+      sse = summary_4x64_avx2(sum);
+      break;
+    default:
+      if (width & 0x7) {
+        do {
+          int i = 0;
+          __m256i sum32 = _mm256_setzero_si256();
+          do {
+            const uint16_t *a2;
+            const uint16_t *b2;
+            highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+            a2 = a + i + (a_stride << 1);
+            b2 = b + i + (b_stride << 1);
+            highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
+            i += 8;
+          } while (i + 4 < width);
+          highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+          summary_32_avx2(&sum32, &sum);
+          a += a_stride << 2;
+          b += b_stride << 2;
+          y += 4;
+        } while (y < height);
+      } else {
+        do {
+          int l = 0;
+          __m256i sum32 = _mm256_setzero_si256();
+          do {
+            int i = 0;
+            do {
+              highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+              i += 8;
+            } while (i < width);
+            a += a_stride << 1;
+            b += b_stride << 1;
+            l += 2;
+          } while (l < 8 && l < (height - y));
+          summary_32_avx2(&sum32, &sum);
+          y += 8;
+        } while (y < height);
+      }
+      sse = summary_4x64_avx2(sum);
+      break;
+  }
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c
new file mode 100644
index 0000000000..4a7585c57e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c
@@ -0,0 +1,312 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+  int64_t sum;
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  _mm_storel_epi64((__m128i *)&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+  *sum64 = _mm_add_epi64(sum0, *sum64);
+  *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+                                  const uint8_t *b) {
+  const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a);
+  const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b);
+  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m128i *sum) {
+  const __m128i v_a0 = load_unaligned_u32(a);
+  const __m128i v_a1 = load_unaligned_u32(a + a_stride);
+  const __m128i v_b0 = load_unaligned_u32(b);
+  const __m128i v_b1 = load_unaligned_u32(b + b_stride);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+  const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+                               __m128i *sum) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  int y = 0;
+  int64_t sse = 0;
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        sse8_sse4_1(a, b, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 32:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 64:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    default:
+      if (width & 0x07) {
+        do {
+          int i = 0;
+          do {
+            sse8_sse4_1(a + i, b + i, &sum);
+            sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+            i += 8;
+          } while (i + 4 < width);
+          sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+          a += (a_stride << 1);
+          b += (b_stride << 1);
+          y += 2;
+        } while (y < height);
+      } else {
+        do {
+          int i = 0;
+          do {
+            sse8_sse4_1(a + i, b + i, &sum);
+            i += 8;
+          } while (i < width);
+          a += a_stride;
+          b += b_stride;
+          y += 1;
+        } while (y < height);
+      }
+      sse = summary_all_sse4(&sum);
+      break;
+  }
+
+  return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
+                                          int a_stride, const uint16_t *b,
+                                          int b_stride) {
+  const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a);
+  const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+  const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b);
+  const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+  const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+  const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+                                        const uint16_t *b) {
+  const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a);
+  const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride, int width,
+                              int height) {
+  int32_t y = 0;
+  int64_t sse = 0;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        highbd_sse_w8_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        int l = 0;
+        __m128i sum32 = _mm_setzero_si128();
+        do {
+          highbd_sse_w8_sse4_1(&sum32, a, b);
+          highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 64 && l < (height - y));
+        summary_32_sse4(&sum32, &sum);
+        y += 64;
+      } while (y < height);
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+    case 32:
+      do {
+        int l = 0;
+        __m128i sum32 = _mm_setzero_si128();
+        do {
+          highbd_sse_w8_sse4_1(&sum32, a, b);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 32 && l < (height - y));
+        summary_32_sse4(&sum32, &sum);
+        y += 32;
+      } while (y < height);
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+    case 64:
+      do {
+        int l = 0;
+        __m128i sum32 = _mm_setzero_si128();
+        do {
+          highbd_sse_w8_sse4_1(&sum32, a, b);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+          highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+          a += a_stride;
+          b += b_stride;
+          l += 1;
+        } while (l < 16 && l < (height - y));
+        summary_32_sse4(&sum32, &sum);
+        y += 16;
+      } while (y < height);
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+    default:
+      if (width & 0x7) {
+        do {
+          __m128i sum32 = _mm_setzero_si128();
+          int i = 0;
+          do {
+            highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+            highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
+            i += 8;
+          } while (i + 4 < width);
+          highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
+          a += (a_stride << 1);
+          b += (b_stride << 1);
+          y += 2;
+          summary_32_sse4(&sum32, &sum);
+        } while (y < height);
+      } else {
+        do {
+          int l = 0;
+          __m128i sum32 = _mm_setzero_si128();
+          do {
+            int i = 0;
+            do {
+              highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+              i += 8;
+            } while (i < width);
+            a += a_stride;
+            b += b_stride;
+            l += 1;
+          } while (l < 8 && l < (height - y));
+          summary_32_sse4(&sum32, &sum);
+          y += 8;
+        } while (y < height);
+      }
+      _mm_storel_epi64((__m128i *)&sse,
+                       _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+      break;
+  }
+  return sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
index 6d58321e03..1ad3b88c8d 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -44,7 +44,10 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
-;void ssim_parms_sse2(
+
+SECTION .text
+
+;void vpx_ssim_parms_8x8_sse2(
 ;    unsigned char *s,
 ;    int sp,
 ;    unsigned char *r,
@@ -61,97 +64,7 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
-sym(vpx_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
+globalsym(vpx_ssim_parms_8x8_sse2)
 sym(vpx_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
index cee4468c1f..d1d8d3460e 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -41,12 +41,12 @@ SECTION .text
 
 ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
 ;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               const uint8_t *ref, ptrdiff_t ref_stride,
 ;                               int height, unsigned int *sse);
 ;
 ; This function returns the SE and stores SSE in the given pointer.
 
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
   psubw                %3, %4
   psubw                %1, %2
   paddw                %5, %3
@@ -95,7 +95,7 @@ SECTION .text
 %endmacro
 
 %macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
   add                srcq, src_stridemp
 %else
   add                srcq, src_strideq
@@ -114,84 +114,65 @@ SECTION .text
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
 ; difference on Win64
 
-%ifdef PIC    ; 64bit PIC
+%if VPX_ARCH_X86_64
   %if %2 == 1 ; avg
     cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                      x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
-    %define sec_str sec_strideq
+                                        x_offset, y_offset, ref, ref_stride, \
+                                        second_pred, second_stride, height, sse
+    %define second_str second_strideq
   %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
-                                  y_offset, dst, dst_stride, height, sse
+    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+                                    x_offset, y_offset, ref, ref_stride, \
+                                    height, sse
   %endif
   %define block_height heightd
   %define bilin_filter sseq
 %else
-  %if ARCH_X86=1 && CONFIG_PIC=1
+  %if CONFIG_PIC=1
     %if %2 == 1 ; avg
       cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, \
-                                  sec, sec_stride, \
-                                  height, sse, g_bilin_filter, g_pw_8
+                                          x_offset, y_offset, ref, ref_stride, \
+                                          second_pred, second_stride, height, sse
       %define block_height dword heightm
-      %define sec_str sec_stridemp
-
-      ;Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
+      %define second_str second_stridemp
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                                y_offset, dst, dst_stride, height, sse, \
-                                g_bilin_filter, g_pw_8
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, ref, ref_stride, \
+                                      height, sse
       %define block_height heightd
-
-      ;Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
     %endif
+
+    ; reuse argument stack space
+    %define g_bilin_filterm x_offsetm
+    %define g_pw_8m y_offsetm
+
+    ;Store bilin_filter and pw_8 location in stack
+    %if GET_GOT_DEFINED == 1
+      GET_GOT eax
+      add esp, 4                ; restore esp
+    %endif
+
+    lea ecx, [GLOBAL(bilin_filter_m)]
+    mov g_bilin_filterm, ecx
+
+    lea ecx, [GLOBAL(pw_8)]
+    mov g_pw_8m, ecx
+
+    LOAD_IF_USED 0, 1         ; load eax, ecx back
   %else
     %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
-                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
-                                             x_offset, y_offset, \
-                                             dst, dst_stride, \
-                                             sec, sec_stride, \
-                                             height, sse
-      %if ARCH_X86_64
-      %define block_height heightd
-      %define sec_str sec_strideq
-      %else
+      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+                                          x_offset, y_offset, \
+                                          ref, ref_stride, second_pred, second_stride, \
+                                          height, sse
       %define block_height dword heightm
-      %define sec_str sec_stridemp
-      %endif
+      %define second_str second_stridemp
     %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
-                              y_offset, dst, dst_stride, height, sse
+      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+                                      x_offset, y_offset, ref, ref_stride, \
+                                      height, sse
       %define block_height heightd
     %endif
-
     %define bilin_filter bilin_filter_m
   %endif
 %endif
@@ -211,7 +192,7 @@ SECTION .text
 %if %1 < 16
   sar                   block_height, 1
 %if %2 == 1 ; avg
-  shl             sec_str, 1
+  shl             second_str, 1
 %endif
 %endif
 
@@ -226,9 +207,9 @@ SECTION .text
 .x_zero_y_zero_loop:
 %if %1 == 16
   movu                 m0, [srcq]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
 %if %2 == 1 ; avg
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
 %endif
@@ -242,7 +223,7 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
 %if %2 == 1 ; avg
@@ -256,14 +237,14 @@ SECTION .text
   movx                 m2, [srcq+src_strideq]
 %endif
 
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [refq]
+  movx                 m3, [refq+ref_strideq]
 
 %if %2 == 1 ; avg
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
 %else
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
 %endif
   punpcklbw            m3, m5
@@ -284,10 +265,10 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
@@ -302,11 +283,11 @@ SECTION .text
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
 %if %2 == 1 ; avg
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
 %endif
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
@@ -314,7 +295,7 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m2, [srcq+src_strideq]
@@ -325,22 +306,22 @@ SECTION .text
   movx                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
 %endif
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
 %if %1 > 4
   movlhps              m0, m2
 %else ; 4xh
   punpckldq            m0, m2
 %endif
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
-  movh                 m4, [secq]
+  movh                 m4, [second_predq]
   pavgb                m0, m4
   punpcklbw            m3, m5
   punpcklbw            m0, m5
@@ -348,9 +329,9 @@ SECTION .text
 %endif
 %else ; !avg
   movx                 m4, [srcq+src_strideq*2]
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -360,10 +341,10 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
@@ -371,21 +352,21 @@ SECTION .text
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if VPX_ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
 %else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; x_offset == 0, reuse x_offset reg
 %define tempq x_offsetq
   add y_offsetq, g_bilin_filterm
@@ -397,7 +378,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -405,7 +386,7 @@ SECTION .text
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
@@ -437,7 +418,7 @@ SECTION .text
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
@@ -446,14 +427,14 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m2, [srcq+src_strideq]
   movx                 m4, [srcq+src_strideq*2]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
 %if cpuflag(ssse3)
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -473,7 +454,7 @@ SECTION .text
   pmullw               m4, filter_y_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
@@ -485,11 +466,11 @@ SECTION .text
 %endif
   packuswb             m0, m2
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
@@ -499,10 +480,10 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_zero_y_other_loop
@@ -523,11 +504,11 @@ SECTION .text
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   pavgb                m0, m4
   punpckhbw            m3, m1, m5
 %if %2 == 1 ; avg
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
 %endif
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
@@ -535,7 +516,7 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m4, [srcq+1]
@@ -549,17 +530,17 @@ SECTION .text
   movx                 m2, [srcq+src_strideq+1]
   punpckldq            m4, m2
 %endif
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [refq]
+  movx                 m3, [refq+ref_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else ; 4xh
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
   punpcklbw            m1, m5
   punpcklbw            m0, m5
@@ -567,10 +548,10 @@ SECTION .text
 %endif
 %else ; !avg
   movx                 m2, [srcq+src_strideq]
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   pavgb                m0, m4
   movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -580,10 +561,10 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
@@ -602,13 +583,13 @@ SECTION .text
 .x_half_y_half_loop:
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   pavgb                m4, m3
   punpckhbw            m3, m1, m5
   pavgb                m0, m4
 %if %2 == 1 ; avg
   punpcklbw            m1, m5
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
@@ -620,7 +601,7 @@ SECTION .text
   mova                 m0, m4
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m3, [srcq+1]
@@ -647,13 +628,13 @@ SECTION .text
   punpckldq            m0, m2
   pshuflw              m4, m2, 0xe
 %endif
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
 %else
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
 %endif
   punpcklbw            m3, m5
@@ -672,8 +653,8 @@ SECTION .text
   pavgb                m4, m1
   pavgb                m0, m2
   pavgb                m2, m4
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [refq]
+  movx                 m3, [refq+ref_strideq]
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
@@ -683,10 +664,10 @@ SECTION .text
   mova                 m0, m4
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
@@ -694,21 +675,21 @@ SECTION .text
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if VPX_ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_y_a m8
 %define filter_y_b m9
 %define filter_rnd m10
 %else  ;x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; x_offset == 0.5. We can reuse x_offset reg
 %define tempq x_offsetq
   add y_offsetq, g_bilin_filterm
@@ -720,7 +701,7 @@ SECTION .text
   add           y_offsetq, bilin_filter
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -732,7 +713,7 @@ SECTION .text
 .x_half_y_other_loop:
   movu                 m4, [srcq]
   movu                 m2, [srcq+1]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   pavgb                m4, m2
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
@@ -762,7 +743,7 @@ SECTION .text
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
@@ -771,7 +752,7 @@ SECTION .text
   mova                 m0, m4
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m3, [srcq+1]
@@ -787,9 +768,9 @@ SECTION .text
   movx                 m3, [srcq+src_strideq+1]
   pavgb                m2, m1
   pavgb                m4, m3
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
 %if cpuflag(ssse3)
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -809,7 +790,7 @@ SECTION .text
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m2, m1
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
 %endif
   psraw                m0, 4
   psraw                m2, 4
@@ -820,11 +801,11 @@ SECTION .text
 %endif
   packuswb             m0, m2
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
@@ -835,10 +816,10 @@ SECTION .text
   mova                 m0, m4
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_half_y_other_loop
@@ -852,21 +833,21 @@ SECTION .text
   jnz .x_nonhalf_y_nonzero
 
   ; x_offset == bilin interpolation && y_offset == 0
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if VPX_ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
 %else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ;y_offset == 0. We can reuse y_offset reg.
 %define tempq y_offsetq
   add x_offsetq, g_bilin_filterm
@@ -878,7 +859,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -886,7 +867,7 @@ SECTION .text
 %if %1 == 16
   movu                 m0, [srcq]
   movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
 %if cpuflag(ssse3)
   punpckhbw            m2, m0, m4
   punpcklbw            m0, m4
@@ -913,7 +894,7 @@ SECTION .text
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
@@ -922,16 +903,16 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m1, [srcq+1]
   movx                 m2, [srcq+src_strideq]
   movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_x_a
   pmaddubsw            m2, filter_x_a
@@ -951,7 +932,7 @@ SECTION .text
   pmullw               m4, filter_x_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
@@ -963,11 +944,11 @@ SECTION .text
 %endif
   packuswb             m0, m2
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
@@ -977,10 +958,10 @@ SECTION .text
   SUM_SSE              m0, m1, m2, m3, m6, m7
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_other_y_zero_loop
@@ -994,21 +975,21 @@ SECTION .text
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if VPX_ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
 %endif
-  mova                m10, [pw_8]
+  mova                m10, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_rnd m10
 %else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; y_offset == 0.5. We can reuse y_offset reg.
 %define tempq y_offsetq
   add x_offsetq, g_bilin_filterm
@@ -1020,7 +1001,7 @@ SECTION .text
   add           x_offsetq, bilin_filter
 %define filter_x_a [x_offsetq]
 %define filter_x_b [x_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -1056,7 +1037,7 @@ SECTION .text
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
 %if cpuflag(ssse3)
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   punpckhbw            m2, m4, m3
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
@@ -1082,7 +1063,7 @@ SECTION .text
   paddw                m2, filter_rnd
   paddw                m4, m3
   paddw                m2, m1
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   psraw                m4, 4
   psraw                m2, 4
   punpckhbw            m3, m1, m5
@@ -1096,7 +1077,7 @@ SECTION .text
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
@@ -1104,7 +1085,7 @@ SECTION .text
   mova                 m0, m4
 
   add                srcq, src_strideq
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m1, [srcq+1]
@@ -1132,8 +1113,8 @@ SECTION .text
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [refq]
+  movx                 m3, [refq+ref_strideq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
 %else
@@ -1148,9 +1129,9 @@ SECTION .text
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   paddw                m4, m3
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
 %endif
   psraw                m2, 4
   psraw                m4, 4
@@ -1163,11 +1144,11 @@ SECTION .text
 %endif
   packuswb             m0, m2
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
@@ -1179,10 +1160,10 @@ SECTION .text
   mova                 m0, m4
 
   lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_other_y_half_loop
@@ -1192,12 +1173,12 @@ SECTION .text
   STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
-%ifdef PIC
-  lea        bilin_filter, [bilin_filter_m]
+%if VPX_ARCH_X86_64
+  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
+%if VPX_ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1206,14 +1187,14 @@ SECTION .text
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                m11, [bilin_filter+y_offsetq+16]
 %endif
-  mova                m12, [pw_8]
+  mova                m12, [GLOBAL(pw_8)]
 %define filter_x_a m8
 %define filter_x_b m9
 %define filter_y_a m10
 %define filter_y_b m11
 %define filter_rnd m12
 %else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
+%if VPX_ARCH_X86=1 && CONFIG_PIC=1
 ; In this case, there is NO unused register. Used src_stride register. Later,
 ; src_stride has to be loaded from stack when it is needed.
 %define tempq src_strideq
@@ -1234,7 +1215,7 @@ SECTION .text
 %define filter_x_b [x_offsetq+16]
 %define filter_y_a [y_offsetq]
 %define filter_y_b [y_offsetq+16]
-%define filter_rnd [pw_8]
+%define filter_rnd [GLOBAL(pw_8)]
 %endif
 %endif
 
@@ -1273,7 +1254,7 @@ SECTION .text
 %if cpuflag(ssse3)
   movu                 m4, [srcq]
   movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   punpckhbw            m2, m4, m3
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
@@ -1319,7 +1300,7 @@ SECTION .text
   pmullw               m0, filter_y_a
   pmullw               m3, filter_y_b
   paddw                m2, m1
-  mova                 m1, [dstq]
+  mova                 m1, [refq]
   paddw                m0, filter_rnd
   psraw                m2, 4
   paddw                m0, m3
@@ -1330,7 +1311,7 @@ SECTION .text
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
   packuswb             m0, m2
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %endif
@@ -1338,7 +1319,7 @@ SECTION .text
   mova                 m0, m4
 
   INC_SRC_BY_SRC_STRIDE
-  add                dstq, dst_strideq
+  add                refq, ref_strideq
 %else ; %1 < 16
   movx                 m0, [srcq]
   movx                 m1, [srcq+1]
@@ -1374,8 +1355,8 @@ SECTION .text
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movx                 m3, [dstq+dst_strideq]
-  movx                 m1, [dstq]
+  movx                 m3, [refq+ref_strideq]
+  movx                 m1, [refq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
@@ -1414,9 +1395,9 @@ SECTION .text
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m0, m3
-  movx                 m3, [dstq+dst_strideq]
+  movx                 m3, [refq+ref_strideq]
   paddw                m2, m1
-  movx                 m1, [dstq]
+  movx                 m1, [refq]
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m3, m5
@@ -1429,11 +1410,11 @@ SECTION .text
 %endif
   packuswb             m0, m2
 %if %1 > 4
-  pavgb                m0, [secq]
+  pavgb                m0, [second_predq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
 %else
-  movh                 m2, [secq]
+  movh                 m2, [second_predq]
   pavgb                m0, m2
   punpcklbw            m0, m5
   movhlps              m2, m0
@@ -1443,10 +1424,10 @@ SECTION .text
   mova                 m0, m4
 
   INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq+dst_strideq*2]
+  lea                refq, [refq+ref_strideq*2]
 %endif
 %if %2 == 1 ; avg
-  add                secq, sec_str
+  add                second_predq, second_str
 %endif
   dec                   block_height
   jg .x_other_y_other_loop
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..4849581ed4
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+                                             const uint8_t *src_ptr,
+                                             const uint8_t *pred_ptr) {
+  const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+  const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+  const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+  const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+  _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+    const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+    const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+    const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+    _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  int j;
+  for (j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {
+    case 16:
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 32:
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 64:
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    default:
+      vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr, pred_stride);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                                    ptrdiff_t diff_stride,
+                                    const uint8_t *src8_ptr,
+                                    ptrdiff_t src_stride,
+                                    const uint8_t *pred8_ptr,
+                                    ptrdiff_t pred_stride, int bd) {
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+  (void)bd;
+  if (cols == 64) {
+    int j = rows;
+    do {
+      const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+      const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+      const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+      const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+      const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+      const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+      const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+      const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+      const __m256i d0 = _mm256_sub_epi16(s0, p0);
+      const __m256i d1 = _mm256_sub_epi16(s1, p1);
+      const __m256i d2 = _mm256_sub_epi16(s2, p2);
+      const __m256i d3 = _mm256_sub_epi16(s3, p3);
+      _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+      src_ptr += src_stride;
+      pred_ptr += pred_stride;
+      diff_ptr += diff_stride;
+    } while (--j != 0);
+  } else if (cols == 32) {
+    int j = rows;
+    do {
+      const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+      const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+      const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+      const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+      const __m256i d0 = _mm256_sub_epi16(s0, p0);
+      const __m256i d1 = _mm256_sub_epi16(s1, p1);
+      _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+      src_ptr += src_stride;
+      pred_ptr += pred_stride;
+      diff_ptr += diff_stride;
+    } while (--j != 0);
+  } else if (cols == 16) {
+    int j = rows;
+    do {
+      const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+      const __m256i s1 =
+          _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+      const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+      const __m256i p1 =
+          _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+      const __m256i d0 = _mm256_sub_epi16(s0, p0);
+      const __m256i d1 = _mm256_sub_epi16(s1, p1);
+      _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+      _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+      src_ptr += src_stride << 1;
+      pred_ptr += pred_stride << 1;
+      diff_ptr += diff_stride << 1;
+      j -= 2;
+    } while (j != 0);
+  } else if (cols == 8) {
+    int j = rows;
+    do {
+      const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+      const __m128i s1 =
+          _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+      const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+      const __m128i p1 =
+          _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+      const __m128i d0 = _mm_sub_epi16(s0, p0);
+      const __m128i d1 = _mm_sub_epi16(s1, p1);
+      _mm_storeu_si128((__m128i *)diff_ptr, d0);
+      _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+      src_ptr += src_stride << 1;
+      pred_ptr += pred_stride << 1;
+      diff_ptr += diff_stride << 1;
+      j -= 2;
+    } while (j != 0);
+  } else {
+    int j = rows;
+    assert(cols == 4);
+    do {
+      const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+      const __m128i s1 =
+          _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+      const __m128i p1 =
+          _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+      const __m128i d0 = _mm_sub_epi16(s0, p0);
+      const __m128i d1 = _mm_sub_epi16(s1, p1);
+      _mm_storel_epi64((__m128i *)diff_ptr, d0);
+      _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+      src_ptr += src_stride << 1;
+      pred_ptr += pred_stride << 1;
+      diff_ptr += diff_stride << 1;
+      j -= 2;
+    } while (j != 0);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
index 4273efb854..e3055ab292 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm
@@ -124,4 +124,5 @@ INIT_MMX
   lea                predq, [predq+pred_str*2]
   sub                rowsd, 2
   jg .loop_4
+  emms
   RET
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
index bc5362e10f..df6514b2c4 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c
@@ -10,119 +10,96 @@
 
 #include <assert.h>
 #include <emmintrin.h>
-#include <stdio.h>
 
 #include "./vpx_dsp_rtcd.h"
-
-static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
-                                                int stride) {
-  const __m128i v_val_0_w =
-      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
-  const __m128i v_val_1_w =
-      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
-  const __m128i v_val_2_w =
-      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
-  const __m128i v_val_3_w =
-      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
-
-  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
-  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
-  const __m128i v_sum_d =
-      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-
-  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-}
-
-// TODO(jingning): Evaluate the performance impact here.
-#ifdef __GNUC__
-// This prevents GCC/Clang from inlining this function into
-// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
-// maintenance instructions in the common case of 4x4.
-__attribute__((noinline))
-#endif
-static uint64_t
-vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
-  int r, c;
-  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
-  __m128i v_acc_q = _mm_setzero_si128();
-
-  for (r = 0; r < size; r += 8) {
-    __m128i v_acc_d = _mm_setzero_si128();
-
-    for (c = 0; c < size; c += 8) {
-      const int16_t *b = src + c;
-      const __m128i v_val_0_w =
-          _mm_load_si128((const __m128i *)(b + 0 * stride));
-      const __m128i v_val_1_w =
-          _mm_load_si128((const __m128i *)(b + 1 * stride));
-      const __m128i v_val_2_w =
-          _mm_load_si128((const __m128i *)(b + 2 * stride));
-      const __m128i v_val_3_w =
-          _mm_load_si128((const __m128i *)(b + 3 * stride));
-      const __m128i v_val_4_w =
-          _mm_load_si128((const __m128i *)(b + 4 * stride));
-      const __m128i v_val_5_w =
-          _mm_load_si128((const __m128i *)(b + 5 * stride));
-      const __m128i v_val_6_w =
-          _mm_load_si128((const __m128i *)(b + 6 * stride));
-      const __m128i v_val_7_w =
-          _mm_load_si128((const __m128i *)(b + 7 * stride));
-
-      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
-      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
-      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
-      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
-
-      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
-      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
-
-      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
-
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
-    }
-
-    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
-    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
-
-    src += 8 * stride;
-  }
-
-  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
-    return tmp;
-  }
-#endif
-}
+#include "vpx_dsp/x86/mem_sse2.h"
 
 uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
-  // 4 elements per row only requires half an XMM register, so this
-  // must be a special case, but also note that over 75% of all calls
-  // are with size == 4, so it is also the common case.
+  // Over 75% of all calls are with size == 4.
   if (size == 4) {
-    return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
+    __m128i s[2], sq[2], ss;
+
+    s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+    s[0] = loadh_epi64(s[0], src + 1 * stride);
+    s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+    s[1] = loadh_epi64(s[1], src + 3 * stride);
+    sq[0] = _mm_madd_epi16(s[0], s[0]);
+    sq[1] = _mm_madd_epi16(s[1], s[1]);
+    sq[0] = _mm_add_epi32(sq[0], sq[1]);
+    ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8));
+    ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32));
+
+    return (uint64_t)_mm_cvtsi128_si32(ss);
   } else {
     // Generic case
-    return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
+    int r = size;
+    const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
+    __m128i v_acc_q = _mm_setzero_si128();
+
+    assert(size % 8 == 0);
+
+    do {
+      int c = 0;
+      __m128i v_acc_d = _mm_setzero_si128();
+
+      do {
+        const int16_t *const b = src + c;
+        const __m128i v_val_0_w =
+            _mm_load_si128((const __m128i *)(b + 0 * stride));
+        const __m128i v_val_1_w =
+            _mm_load_si128((const __m128i *)(b + 1 * stride));
+        const __m128i v_val_2_w =
+            _mm_load_si128((const __m128i *)(b + 2 * stride));
+        const __m128i v_val_3_w =
+            _mm_load_si128((const __m128i *)(b + 3 * stride));
+        const __m128i v_val_4_w =
+            _mm_load_si128((const __m128i *)(b + 4 * stride));
+        const __m128i v_val_5_w =
+            _mm_load_si128((const __m128i *)(b + 5 * stride));
+        const __m128i v_val_6_w =
+            _mm_load_si128((const __m128i *)(b + 6 * stride));
+        const __m128i v_val_7_w =
+            _mm_load_si128((const __m128i *)(b + 7 * stride));
+
+        const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+        const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+        const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+        const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+        const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+        const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+        const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+        const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+        const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+        const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+        const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+        const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+        const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+        const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+        v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+        v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+        c += 8;
+      } while (c < size);
+
+      v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+      v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+      src += 8 * stride;
+      r -= 8;
+    } while (r);
+
+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if VPX_ARCH_X86_64
+    return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+    {
+      uint64_t tmp;
+      _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+      return tmp;
+    }
+#endif
   }
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
new file mode 100644
index 0000000000..b4f1190d74
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h
@@ -0,0 +1,367 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
+#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+                                      __m128i *const out) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03 04 05 06 07
+  // in[1]: 10 11 12 13 14 15 16 17
+  // in[2]: 20 21 22 23 24 25 26 27
+  // in[3]: 30 31 32 33 34 35 36 37
+  // in[4]: 40 41 42 43 44 45 46 47
+  // in[5]: 50 51 52 53 54 55 56 57
+  // in[6]: 60 61 62 63 64 65 66 67
+  // in[7]: 70 71 72 73 74 75 76 77
+  // to:
+  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+  // Unpack 16 bit elements resulting in:
+  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+  // Unpack 32 bit elements resulting in:
+  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30 40 50 60 70
+  // out[1]: 01 11 21 31 41 51 61 71
+  // out[2]: 02 12 22 32 42 52 62 72
+  // out[3]: 03 13 23 33 43 53 63 73
+  // out[4]: 04 14 24 34 44 54 64 74
+  // out[5]: 05 15 25 35 45 55 65 75
+  // out[6]: 06 16 26 36 46 56 66 76
+  // out[7]: 07 17 27 37 47 57 67 77
+  out[0] = _mm_unpacklo_epi64(c0, c0);
+  out[1] = _mm_unpackhi_epi64(c0, c0);
+  out[2] = _mm_unpacklo_epi64(c1, c1);
+  out[3] = _mm_unpackhi_epi64(c1, c1);
+  out[4] = _mm_unpacklo_epi64(c2, c2);
+  out[5] = _mm_unpackhi_epi64(c2, c2);
+  out[6] = _mm_unpacklo_epi64(c3, c3);
+  out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // out[0]: 00 10 20 30  01 11 21 31
+  // out[1]: 02 12 22 32  03 13 23 33
+  out[0] = _mm_unpacklo_epi32(a0, a1);
+  out[1] = _mm_unpackhi_epi32(a0, a1);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  XX XX XX XX
+  // in[1]: 10 11 12 13  XX XX XX XX
+  // in[2]: 20 21 22 23  XX XX XX XX
+  // in[3]: 30 31 32 33  XX XX XX XX
+  // in[4]: 40 41 42 43  XX XX XX XX
+  // in[5]: 50 51 52 53  XX XX XX XX
+  // in[6]: 60 61 62 63  XX XX XX XX
+  // in[7]: 70 71 72 73  XX XX XX XX
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 02 12 22 32  03 13 23 33
+  // b3: 42 52 62 72  43 53 63 73
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b2, b3);
+  out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+  // in[4]: 40 41 42 43  44 45 46 47
+  // in[5]: 50 51 52 53  54 55 56 57
+  // in[6]: 60 61 62 63  64 65 66 67
+  // in[7]: 70 71 72 73  74 75 76 77
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  // a6:    44 54 45 55  46 56 47 57
+  // a7:    64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 04 14 24 34  05 15 25 35
+  // b3: 44 54 64 74  45 55 65 75
+  // b4: 02 12 22 32  03 13 23 33
+  // b5: 42 52 62 72  43 53 63 73
+  // b6: 06 16 26 36  07 17 27 37
+  // b7: 46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 02 12 22 32  42 52 62 72
+  // out[3]: 03 13 23 33  43 53 63 73
+  // out[4]: 04 14 24 34  44 54 64 74
+  // out[5]: 05 15 25 35  45 55 65 75
+  // out[6]: 06 16 26 36  46 56 66 76
+  // out[7]: 07 17 27 37  47 57 67 77
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b4, b5);
+  out[3] = _mm_unpackhi_epi64(b4, b5);
+  out[4] = _mm_unpacklo_epi64(b2, b3);
+  out[5] = _mm_unpackhi_epi64(b2, b3);
+  out[6] = _mm_unpacklo_epi64(b6, b7);
+  out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+                                         __m128i *const right) {
+  __m128i tbuf[8];
+  transpose_16bit_8x8(left, left);
+  transpose_16bit_8x8(right, tbuf);
+  transpose_16bit_8x8(left + 8, right);
+  transpose_16bit_8x8(right + 8, right + 8);
+
+  left[8] = tbuf[0];
+  left[9] = tbuf[1];
+  left[10] = tbuf[2];
+  left[11] = tbuf[3];
+  left[12] = tbuf[4];
+  left[13] = tbuf[5];
+  left[14] = tbuf[6];
+  left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+                                         __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // in[4]: 04 05 06 07
+  // in[5]: 14 15 16 17
+  // in[6]: 24 25 26 27
+  // in[7]: 34 35 36 37
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+  // a4:    04 14 05 15
+  // a5:    24 34 25 35
+  // a6:    06 16 07 17
+  // a7:    26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 04 05 06 07
+  // in[2]: 10 11 12 13
+  // in[3]: 14 15 16 17
+  // in[4]: 20 21 22 23
+  // in[5]: 24 25 26 27
+  // in[6]: 30 31 32 33
+  // in[7]: 34 35 36 37
+  // to:
+  // a0: 00 10 01 11
+  // a1: 20 30 21 31
+  // a2: 02 12 03 13
+  // a3: 22 32 23 33
+  // a4: 04 14 05 15
+  // a5: 24 34 25 35
+  // a6: 06 16 07 17
+  // a7: 26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif  // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
index f8edb1b787..de5ce43b00 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
+++ b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_
-#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
 
 #include <emmintrin.h>
 #include "vpx/vpx_integer.h"
@@ -18,6 +18,9 @@
   _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
 
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
 #define dual_set_epi16(a, b)                                            \
   _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
                 (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
@@ -26,4 +29,4 @@
   _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
                  (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
 
-#endif  // VPX_DSP_X86_TXFM_COMMON_SSE2_H_
+#endif  // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
index 8428e0520d..8305b9f20f 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -7,137 +7,852 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include <immintrin.h>  // AVX2
+
 #include "./vpx_dsp_rtcd.h"
 
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse, int *sum);
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
+  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
+  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
+  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
+  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
+  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
+  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
+  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
+  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
+  6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
+  6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
+  4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
+  4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
+  2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
+  2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
+};
 
-void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum);
+DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = {
+  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,
+  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1
+};
+/* clang-format on */
 
-static void variance_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, int w, int h,
-                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
-                          int block_size) {
-  int i, j;
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+                                        __m256i *const sse,
+                                        __m256i *const sum) {
+  const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2);
 
-  *sse = 0;
-  *sum = 0;
+  // unpack into pairs of source and reference values
+  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
 
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
+  // subtract adjacent elements using src*1 + ref*-1
+  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+  // add to the running totals
+  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
+}
+
+static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse,
+                                                      __m128i vsum,
+                                                      unsigned int *const sse,
+                                                      int *const sum) {
+  // extract the low lane and add it to the high lane
+  const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse),
+                                            _mm256_extractf128_si256(vsse, 1));
+
+  // unpack sse and sum registers and add
+  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+  // perform the final summation and extract the results
+  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+  *((int *)sse) = _mm_cvtsi128_si32(res);
+  *((int *)sum) = _mm_extract_epi32(res, 1);
+}
+
+static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse,
+                                                      __m256i vsum,
+                                                      unsigned int *const sse,
+                                                      int *const sum) {
+  // extract the low lane and add it to the high lane
+  const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
+                                            _mm256_extractf128_si256(vsum, 1));
+  const __m128i sum_reg_64 =
+      _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8));
+  const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64);
+
+  variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+  const __m256i sum_hi =
+      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+  return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance8_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  __m128i src0, src1, ref0, ref1;
+  __m256i ss, rr, diff;
+
+  // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
+  src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+
+  // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
+  src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+
+  // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
+  src0 = _mm_unpacklo_epi64(src0, src1);
+
+  // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
+  ss = _mm256_cvtepu8_epi16(src0);
+
+  // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
+  ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
+
+  // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10
+  ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
+
+  // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
+  ref0 = _mm_unpacklo_epi64(ref0, ref1);
+
+  // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
+  rr = _mm256_cvtepu8_epi16(ref0);
+
+  diff = _mm256_sub_epi16(ss, rr);
+  *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
+  *sum = _mm256_add_epi16(*sum, diff);
+}
+
+static INLINE void variance16_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+  variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+                                          const uint8_t *const ref,
+                                          __m256i *const sse,
+                                          __m256i *const sum) {
+  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+  variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m256i *const vsse,
+                                  __m256i *const vsum) {
+  int i;
+  *vsum = _mm256_setzero_si256();
+  *vsse = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i += 2) {
+    variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
   }
 }
 
-unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                vpx_get16x16var_avx2, 16);
-  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  int i;
+  *vsum = _mm256_setzero_si256();
+  *vsse = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i += 2) {
+    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
 }
 
-unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse;
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  int i;
+  *vsum = _mm256_setzero_si256();
+  *vsse = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i++) {
+    variance32_kernel_avx2(src, ref, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
 }
 
-unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+                                   const uint8_t *ref, const int ref_stride,
+                                   const int h, __m256i *const vsse,
+                                   __m256i *const vsum) {
+  int i;
+  *vsum = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i++) {
+    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+    src += src_stride;
+    ref += ref_stride;
+  }
+}
+
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *ref_ptr, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  __m256i vsse, vsum;
+  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum);
+}
+
+#define FILTER_SRC(filter)                               \
+  /* filter the source */                                \
+  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+                                                         \
+  /* add 8 to source */                                  \
+  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
+  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
+                                                         \
+  /* divide source by 16 */                              \
+  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define CALC_SUM_SSE_INSIDE_LOOP                          \
+  /* expand each byte to 2 bytes */                       \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
+  /* source - dest */                                     \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
+  /* caculate sum */                                      \
+  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo);      \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi);      \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse */                                     \
+  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo);      \
+  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
+
+// final calculation to sum and sse
+#define CALC_SUM_AND_SSE                                                   \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
+                                                                           \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
+                                                                           \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
+  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      second_pred += second_stride;
+    } else {
+      exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// (x == 0, y == 4) or (x == 4, y == 0).  sstep determines the direction.
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride,
+                                   const uint8_t *dst, int dst_stride,
+                                   const uint8_t *second_pred,
+                                   int second_stride, int do_sec, int height,
+                                   __m256i *sum_reg, __m256i *sse_reg,
+                                   int sstep) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+    const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+    const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      second_pred += second_stride;
+    } else {
+      exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg) {
+  spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, sum_reg, sse_reg, src_stride);
+}
+
+static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg) {
+  spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, sum_reg, sse_reg, 1);
+}
+
+static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+  const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+  __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  src += src_stride;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src));
+    const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+    const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+    const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg);
+    prev_src_avg = src_avg;
+
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      second_pred += second_stride;
+    } else {
+      exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg);
+    }
+    // save current source average
+    CALC_SUM_SSE_INSIDE_LOOP
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+// (x == 0, y == bil) or (x == 4, y == bil).  sstep determines the direction.
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride,
+                                    const uint8_t *dst, int dst_stride,
+                                    const uint8_t *second_pred,
+                                    int second_stride, int do_sec, int height,
+                                    __m256i *sum_reg, __m256i *sse_reg,
+                                    int offset, int sstep) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i pw8 = _mm256_set1_epi16(8);
+  const __m256i filter = _mm256_load_si256(
+      (__m256i const *)(bilinear_filters_avx2 + (offset << 5)));
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+    const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep));
+    exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+    exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+    FILTER_SRC(filter)
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+      second_pred += second_stride;
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg, int y_offset) {
+  spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                   do_sec, height, sum_reg, sse_reg, y_offset, src_stride);
+}
+
+static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg, int x_offset) {
+  spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                   do_sec, height, sum_reg, sse_reg, x_offset, 1);
+}
+
+static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg, int y_offset) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i pw8 = _mm256_set1_epi16(8);
+  const __m256i filter = _mm256_load_si256(
+      (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+  const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+  const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+  __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b);
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  src += src_stride;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+    const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+    const __m256i src_avg = _mm256_avg_epu8(src_0, src_1);
+    exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg);
+    exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg);
+    prev_src_avg = src_avg;
+
+    FILTER_SRC(filter)
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      second_pred += second_stride;
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg, int x_offset) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i pw8 = _mm256_set1_epi16(8);
+  const __m256i filter = _mm256_load_si256(
+      (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+  const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+  const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i src_reg, src_pack;
+  int i;
+  exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+  exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+  FILTER_SRC(filter)
+  // convert each 16 bit to 8 bit to each low and high lane source
+  src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+  src += src_stride;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+    const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+    exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+    exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+    FILTER_SRC(filter)
+
+    src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+    // average between previous pack to the current
+    src_pack = _mm256_avg_epu8(src_pack, src_reg);
+
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg);
+      second_pred += second_stride;
+    } else {
+      exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    src_pack = src_reg;
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *second_pred, int second_stride,
+                               int do_sec, int height, __m256i *sum_reg,
+                               __m256i *sse_reg, int x_offset, int y_offset) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  const __m256i pw8 = _mm256_set1_epi16(8);
+  const __m256i xfilter = _mm256_load_si256(
+      (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5)));
+  const __m256i yfilter = _mm256_load_si256(
+      (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5)));
+  const __m256i src_a = _mm256_loadu_si256((__m256i const *)src);
+  const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1));
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i prev_src_pack, src_pack;
+  int i;
+  exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b);
+  exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b);
+  FILTER_SRC(xfilter)
+  // convert each 16 bit to 8 bit to each low and high lane source
+  prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+  src += src_stride;
+
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src);
+    const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1));
+    exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1);
+    exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1);
+
+    FILTER_SRC(xfilter)
+    src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+
+    // merge previous pack to current pack source
+    exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack);
+    exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack);
+
+    FILTER_SRC(yfilter)
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred);
+      const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      second_pred += second_stride;
+    }
+
+    prev_src_pack = src_pack;
+
+    CALC_SUM_SSE_INSIDE_LOOP
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
+                                  int x_offset, int y_offset,
+                                  const uint8_t *dst, int dst_stride,
+                                  const uint8_t *second_pred, int second_stride,
+                                  int do_sec, int height, unsigned int *sse) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  __m256i sum_reg = _mm256_setzero_si256();
+  __m256i sse_reg = _mm256_setzero_si256();
+  __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  int sum;
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg);
+      // x_offset = 0 and y_offset = 4
+    } else if (y_offset == 4) {
+      spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg);
+      // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg, y_offset);
+    }
+    // x_offset = 4  and y_offset = 0
+  } else if (x_offset == 4) {
+    if (y_offset == 0) {
+      spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg);
+      // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
+      spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg);
+      // x_offset = 4  and y_offset = bilin interpolation
+    } else {
+      spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg, y_offset);
+    }
+    // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg, x_offset);
+      // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
+      spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg, x_offset);
+      // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride,
+                  do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset);
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;
+}
+
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                       int x_offset, int y_offset,
+                                       const uint8_t *dst, int dst_stride,
+                                       int height, unsigned int *sse) {
+  return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+                         NULL, 0, 0, height, sse);
+}
+
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                           int x_offset, int y_offset,
+                                           const uint8_t *dst, int dst_stride,
+                                           const uint8_t *second_pred,
+                                           int second_stride, int height,
+                                           unsigned int *sse) {
+  return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
+                         second_pred, second_stride, 1, height, sse);
+}
+
+typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride,
+                             unsigned int *sse, int *sum);
+
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  __m256i vsse, vsum;
+  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
   int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                vpx_get32x32var_avx2, 32);
+  __m256i vsse, vsum;
+  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  __m256i vsse, vsum;
+  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
   return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
 }
 
-unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
   int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                vpx_get32x32var_avx2, 32);
+  __m256i vsse, vsum;
+  variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  __m256i vsse, vsum;
+  __m128i vsum_128;
+  variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+  vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum),
+                           _mm256_extractf128_si256(vsum, 1));
+  vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+                           _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+  variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
   return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
 }
 
-unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
   int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                vpx_get32x32var_avx2, 32);
-  return *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
-}
-
-unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                vpx_get32x32var_avx2, 32);
+  __m256i vsse, vsum;
+  __m128i vsum_128;
+  variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum);
+  vsum = sum_to_32bit_avx2(vsum);
+  vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+                           _mm256_extractf128_si256(vsum, 1));
+  variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
   return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
 }
 
-unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse);
+unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    unsigned int *sse) {
+  __m256i vsse = _mm256_setzero_si256();
+  __m256i vsum = _mm256_setzero_si256();
+  __m128i vsum_128;
+  int sum;
+  variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+  vsum = sum_to_32bit_avx2(vsum);
+  vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+                           _mm256_extractf128_si256(vsum, 1));
+  variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+}
 
-unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sseptr);
+unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    unsigned int *sse) {
+  __m256i vsse = _mm256_setzero_si256();
+  __m256i vsum = _mm256_setzero_si256();
+  __m128i vsum_128;
+  int sum;
+  int i = 0;
 
-unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
+  for (i = 0; i < 2; i++) {
+    __m256i vsum16;
+    variance64_avx2(src_ptr + 32 * i * src_stride, src_stride,
+                    ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
+                    &vsum16);
+    vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));
+  }
+  vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum),
+                           _mm256_extractf128_si256(vsum, 1));
+  variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum);
+  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride,
+                              unsigned int *sse) {
+  int sum;
+  __m256i vsse, vsum;
+  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse;
+}
+
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  __m256i vsse, vsum;
+  variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse;
+}
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(
+    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
   unsigned int sse1;
-  const int se1 = vpx_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
+  const int se1 = sub_pixel_variance32xh_avx2(
+      src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1);
   unsigned int sse2;
   const int se2 =
-      vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
-                                      dst + 32, dst_stride, 64, &sse2);
+      sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset,
+                                  ref_ptr + 32, ref_stride, 64, &sse2);
   const int se = se1 + se2;
   *sse = sse1 + sse2;
   return *sse - (uint32_t)(((int64_t)se * se) >> 12);
 }
 
-unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
-                                              int src_stride, int x_offset,
-                                              int y_offset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  const int se = vpx_sub_pixel_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
+unsigned int vpx_sub_pixel_variance32x32_avx2(
+    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {
+  const int se = sub_pixel_variance32xh_avx2(
+      src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse);
   return *sse - (uint32_t)(((int64_t)se * se) >> 10);
 }
 
 unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
+    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+    const uint8_t *second_pred) {
   unsigned int sse1;
-  const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
+  const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset,
+                                                  y_offset, ref_ptr, ref_stride,
+                                                  second_pred, 64, 64, &sse1);
   unsigned int sse2;
-  const int se2 = vpx_sub_pixel_avg_variance32xh_avx2(
-      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
-      64, 64, &sse2);
+  const int se2 = sub_pixel_avg_variance32xh_avx2(
+      src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride,
+      second_pred + 32, 64, 64, &sse2);
   const int se = se1 + se2;
 
   *sse = sse1 + sse2;
@@ -146,10 +861,12 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
 }
 
 unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
+    const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,
+    const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+    const uint8_t *second_pred) {
   // Process 32 elements in parallel.
-  const int se = vpx_sub_pixel_avg_variance32xh_avx2(
-      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
+  const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset,
+                                                 y_offset, ref_ptr, ref_stride,
+                                                 second_pred, 32, 32, sse);
   return *sse - (uint32_t)(((int64_t)se * se) >> 10);
 }
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c
deleted file mode 100644
index 51e6b19ad1..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c
+++ /dev/null
@@ -1,708 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-
-/* clang-format off */
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
-  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
-  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
-  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
-  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
-  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
-  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
-  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
-  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
-  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-  6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
-  6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
-  4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
-  4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
-  2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
-  2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
-};
-/* clang-format on */
-
-void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i, src_2strides, ref_2strides;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing two strides in a 256 bit register reducing the number
-  // of loop stride by half (comparing to the sse2 code)
-  src_2strides = source_stride << 1;
-  ref_2strides = recon_stride << 1;
-  for (i = 0; i < 8; i++) {
-    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
-    src = _mm256_inserti128_si256(
-        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
-
-    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
-    ref = _mm256_inserti128_si256(
-        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += src_2strides;
-    ref_ptr += ref_2strides;
-  }
-
-  {
-    __m128i sum_res, madd_res;
-    __m128i expand_sum_low, expand_sum_high, expand_sum;
-    __m128i expand_madd_low, expand_madd_high, expand_madd;
-    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // extract the low lane and add it to the high lane
-    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
-                            _mm256_extractf128_si256(sum_ref_src, 1));
-
-    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
-                             _mm256_extractf128_si256(madd_ref_src, 1));
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low =
-        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-    expand_sum_high =
-        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low =
-        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-    expand_madd_high =
-        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
-
-    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low =
-        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-    ex_expand_sum_high =
-        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
-
-    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_res = _mm_srli_si128(expand_madd, 8);
-    sum_res = _mm_srli_si128(ex_expand_sum, 8);
-
-    madd_res = _mm_add_epi32(madd_res, expand_madd);
-    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
-
-    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
-
-    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
-  }
-}
-
-void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          unsigned int *SSE, int *Sum) {
-  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
-  __m256i ref_expand_high, madd_low, madd_high;
-  unsigned int i;
-  __m256i zero_reg = _mm256_set1_epi16(0);
-  __m256i sum_ref_src = _mm256_set1_epi16(0);
-  __m256i madd_ref_src = _mm256_set1_epi16(0);
-
-  // processing 32 elements in parallel
-  for (i = 0; i < 16; i++) {
-    src = _mm256_loadu_si256((__m256i const *)(src_ptr));
-
-    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
-
-    // expanding to 16 bit each lane
-    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
-    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
-
-    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
-    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
-
-    // src-ref
-    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
-    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
-
-    // madd low (src - ref)
-    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
-
-    // add high to low
-    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
-
-    // madd high (src - ref)
-    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
-
-    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
-
-    // add high to low
-    madd_ref_src =
-        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));
-
-    src_ptr += source_stride;
-    ref_ptr += recon_stride;
-  }
-
-  {
-    __m256i expand_sum_low, expand_sum_high, expand_sum;
-    __m256i expand_madd_low, expand_madd_high, expand_madd;
-    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
-
-    // padding each 2 bytes with another 2 zeroed bytes
-    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
-    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
-
-    // shifting the sign 16 bits right
-    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
-    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
-
-    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
-
-    // expand each 32 bits of the madd result to 64 bits
-    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
-    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
-
-    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
-
-    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
-    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
-
-    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
-
-    // shift 8 bytes eight
-    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
-    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
-
-    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
-    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
-
-    // extract the low lane and the high lane and add the results
-    *((int *)SSE) =
-        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
-
-    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
-                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
-  }
-}
-
-#define FILTER_SRC(filter)                               \
-  /* filter the source */                                \
-  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
-  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
-                                                         \
-  /* add 8 to source */                                  \
-  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
-  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
-                                                         \
-  /* divide source by 16 */                              \
-  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
-  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg)               \
-  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
-  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST                                    \
-  /* load source and destination */                     \
-  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
-  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
-  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
-  /* average between current and next stride source */                     \
-  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
-  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
-  MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP                          \
-  /* expand each byte to 2 bytes */                       \
-  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
-  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
-  /* source - dest */                                     \
-  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
-  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
-  /* caculate sum */                                      \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
-  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
-  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
-  /* calculate sse */                                     \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation to sum and sse
-#define CALC_SUM_AND_SSE                                                   \
-  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
-  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
-  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
-  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
-                                                                           \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
-                                                                           \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
-  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
-                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
-  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse) {
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        // save current source average
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-    // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src_pack = src_reg;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  return sum;
-}
-
-unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sse) {
-  __m256i sec_reg;
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        sec += sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-    // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        sec += sec_stride;
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  return sum;
-}
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
index 1161da4914..d6eb12da1a 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -8,312 +8,426 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <emmintrin.h>  // SSE2
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-
 #include "vpx_ports/mem.h"
+#include "vpx_dsp/x86/mem_sse2.h"
 
-typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+  return (unsigned int)_mm_cvtsi128_si32(val);
+}
 
-unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
   __m128i vsum = _mm_setzero_si128();
   int i;
 
   for (i = 0; i < 32; ++i) {
-    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr);
     vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
-    src += 8;
+    src_ptr += 8;
   }
 
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  return _mm_cvtsi128_si32(vsum);
+  return add32x4_sse2(vsum);
 }
 
-#define READ64(p, stride, i)                                  \
-  _mm_unpacklo_epi8(                                          \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
+  const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
+  return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
+}
 
-static void get4x4var_sse2(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
-  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
-  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
-  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+static INLINE void variance_kernel_sse2(const __m128i src_ptr,
+                                        const __m128i ref_ptr,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr);
+  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+  *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  // sum
-  __m128i vsum = _mm_add_epi16(diff0, diff1);
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsum =
-      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  *sse = _mm_cvtsi128_si32(vsum);
 }
 
-void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
-                        int ref_stride, unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
 
-  for (i = 0; i < 8; i += 2) {
-    const __m128i src0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
-    const __m128i src1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
-    const __m128i ref1 = _mm_unpacklo_epi8(
-        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-  }
-
-  // sum
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
   vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
   *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
+  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
 }
 
-void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride, unsigned int *sse,
-                          int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
+
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_unpacklo_epi16(vsum, vsum);
+  vsum = _mm_srai_epi32(vsum, 16);
+  *sum = (int)add32x4_sse2(vsum);
+}
+
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+  return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE int sum_final_sse2(const __m128i sum) {
+  const __m128i t = sum_to_32bit_sse2(sum);
+  return (int)add32x4_sse2(t);
+}
+
+static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
+                                  const uint8_t *ref_ptr, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
   int i;
 
-  for (i = 0; i < 16; ++i) {
-    const __m128i s = _mm_loadu_si128((const __m128i *)src);
-    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+  assert(h <= 256);  // May overflow for larger height.
+  *sse = _mm_setzero_si128();
+  *sum = _mm_setzero_si128();
 
-    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+  for (i = 0; i < h; i += 2) {
+    const __m128i s = load4x2_sse2(src_ptr, src_stride);
+    const __m128i r = load4x2_sse2(ref_ptr, ref_stride);
 
-    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-
-    src += src_stride;
-    ref += ref_stride;
-  }
-
-  // sum
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum =
-      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
-}
-
-static void variance_sse2(const unsigned char *src, int src_stride,
-                          const unsigned char *ref, int ref_stride, int w,
-                          int h, unsigned int *sse, int *sum,
-                          getNxMvar_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
+    variance_kernel_sse2(s, r, sse, sum);
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
   }
 }
 
-unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
+static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride,
+                                  const uint8_t *ref_ptr, const int ref_stride,
+                                  const int h, __m128i *const sse,
+                                  __m128i *const sum) {
+  const __m128i zero = _mm_setzero_si128();
+  int i;
+
+  assert(h <= 128);  // May overflow for larger height.
+  *sse = _mm_setzero_si128();
+  *sum = _mm_setzero_si128();
+
+  for (i = 0; i < h; i++) {
+    const __m128i s =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero);
+    const __m128i r =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero);
+
+    variance_kernel_sse2(s, r, sse, sum);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
+
+static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr,
+                                          const uint8_t *const ref_ptr,
+                                          __m128i *const sse,
+                                          __m128i *const sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr);
+  const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr);
+  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+  variance_kernel_sse2(src0, ref0, sse, sum);
+  variance_kernel_sse2(src1, ref1, sse, sum);
+}
+
+static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride,
+                                   const uint8_t *ref_ptr, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  int i;
+
+  assert(h <= 64);  // May overflow for larger height.
+  *sse = _mm_setzero_si128();
+  *sum = _mm_setzero_si128();
+
+  for (i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
+
+static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride,
+                                   const uint8_t *ref_ptr, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  int i;
+
+  assert(h <= 32);  // May overflow for larger height.
+  // Don't initialize sse here since it's an accumulation.
+  *sum = _mm_setzero_si128();
+
+  for (i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
+    variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
+
+static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride,
+                                   const uint8_t *ref_ptr, const int ref_stride,
+                                   const int h, __m128i *const sse,
+                                   __m128i *const sum) {
+  int i;
+
+  assert(h <= 16);  // May overflow for larger height.
+  // Don't initialize sse here since it's an accumulation.
+  *sum = _mm_setzero_si128();
+
+  for (i = 0; i < h; ++i) {
+    variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum);
+    variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum);
+    variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum);
+    variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+}
+
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *ref_ptr, int ref_stride,
+                        unsigned int *sse, int *sum) {
+  __m128i vsse, vsum;
+  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, sum);
+}
+
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *ref_ptr, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  __m128i vsse, vsum;
+  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_256_pel_sse2(vsse, vsum, sse, sum);
+}
+
+unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
   return *sse - ((sum * sum) >> 4);
 }
 
-unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
-                get4x4var_sse2, 4);
+  variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
   return *sse - ((sum * sum) >> 5);
 }
 
-unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
-                get4x4var_sse2, 4);
+  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
   return *sse - ((sum * sum) >> 5);
 }
 
-unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
+unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
                                   unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
   return *sse - ((sum * sum) >> 6);
 }
 
-unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
+unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
-                vpx_get8x8var_sse2, 8);
+  variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
   return *sse - ((sum * sum) >> 7);
 }
 
-unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
+unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
-                vpx_get8x8var_sse2, 8);
+  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_128_pel_sse2(vsse, vsum, sse, &sum);
   return *sse - ((sum * sum) >> 7);
 }
 
-unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
-                                    const unsigned char *ref, int ref_stride,
+unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_256_pel_sse2(vsse, vsum, sse, &sum);
+  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
 }
 
-unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
+  __m128i vsse, vsum;
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
-                vpx_get16x16var_sse2, 16);
+  variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+  variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
+  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    unsigned int *sse) {
+  __m128i vsse = _mm_setzero_si128();
+  __m128i vsum;
+  int sum;
+  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_512_pel_sse2(vsse, vsum, sse, &sum);
+  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    unsigned int *sse) {
+  __m128i vsse = _mm_setzero_si128();
+  __m128i vsum;
+  int sum;
+  variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum);
+  *sse = add32x4_sse2(vsse);
+  sum = sum_final_sse2(vsum);
   return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
 }
 
-unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
+  __m128i vsse = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
-                vpx_get16x16var_sse2, 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+  int i = 0;
+
+  for (i = 0; i < 2; i++) {
+    __m128i vsum16;
+    variance32_sse2(src_ptr + 32 * i * src_stride, src_stride,
+                    ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse,
+                    &vsum16);
+    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+  }
+  *sse = add32x4_sse2(vsse);
+  sum = (int)add32x4_sse2(vsum);
+  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
 }
 
-unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
+  __m128i vsse = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
-                vpx_get16x16var_sse2, 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
+  int i = 0;
+
+  for (i = 0; i < 2; i++) {
+    __m128i vsum16;
+    variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
+                    ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
+                    &vsum16);
+    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+  }
+  *sse = add32x4_sse2(vsse);
+  sum = (int)add32x4_sse2(vsum);
+  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
 }
 
-unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
+unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse) {
+  __m128i vsse = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
   int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
-                vpx_get16x16var_sse2, 16);
+  int i = 0;
+
+  for (i = 0; i < 4; i++) {
+    __m128i vsum16;
+    variance64_sse2(src_ptr + 16 * i * src_stride, src_stride,
+                    ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse,
+                    &vsum16);
+    vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
+  }
+  *sse = add32x4_sse2(vsse);
+  sum = (int)add32x4_sse2(vsum);
   return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
 }
 
-unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
-                vpx_get16x16var_sse2, 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
-                vpx_get16x16var_sse2, 16);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse) {
-  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
   return *sse;
 }
 
-unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride,
                               unsigned int *sse) {
-  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
   return *sse;
 }
 
-unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride,
+                              const uint8_t *ref_ptr, int ref_stride,
                               unsigned int *sse) {
-  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
   return *sse;
 }
 
-unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
                                unsigned int *sse) {
-  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse);
   return *sse;
 }
 
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt)                                                           \
-  int vpx_sub_pixel_variance##w##xh_##opt(                                     \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
-      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
-      void *unused0, void *unused)
+#define DECL(w, opt)                                                          \
+  int vpx_sub_pixel_variance##w##xh_##opt(                                    \
+      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,             \
+      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \
+      unsigned int *sse, void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
   DECL(4, opt1);          \
   DECL(8, opt1);          \
@@ -324,66 +438,67 @@ DECLS(ssse3, ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {             \
-    unsigned int sse;                                                          \
-    int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset,   \
-                                                  y_offset, dst, dst_stride,   \
-                                                  h, &sse, NULL, NULL);        \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                          \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                            \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
+  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                   \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) {        \
+    unsigned int sse_tmp;                                                 \
+    int se = vpx_sub_pixel_variance##wf##xh_##opt(                        \
+        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h,  \
+        &sse_tmp, NULL, NULL);                                            \
+    if (w > wf) {                                                         \
+      unsigned int sse2;                                                  \
+      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                     \
+          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
+          ref_stride, h, &sse2, NULL, NULL);                              \
+      se += se2;                                                          \
+      sse_tmp += sse2;                                                    \
+      if (w > wf * 2) {                                                   \
+        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
+            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
+            ref_stride, h, &sse2, NULL, NULL);                            \
+        se += se2;                                                        \
+        sse_tmp += sse2;                                                  \
+        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                       \
+            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
+            ref_stride, h, &sse2, NULL, NULL);                            \
+        se += se2;                                                        \
+        sse_tmp += sse2;                                                  \
+      }                                                                   \
+    }                                                                     \
+    *sse = sse_tmp;                                                       \
+    return sse_tmp -                                                      \
+           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
   }
 
-#define FNS(opt1, opt2)                              \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
+#define FNS(opt1, opt2)                             \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
+  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t))     \
   FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
 
-FNS(sse2, sse2);
-FNS(ssse3, ssse3);
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
 
 #undef FNS
 #undef FN
 
 // The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt)                                                        \
-  int vpx_sub_pixel_avg_variance##w##xh_##opt(                              \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
-      void *unused)
+#define DECL(w, opt)                                                   \
+  int vpx_sub_pixel_avg_variance##w##xh_##opt(                         \
+      const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset,      \
+      int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride,      \
+      const uint8_t *second_pred, ptrdiff_t second_stride, int height, \
+      unsigned int *sse, void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
   DECL(4, opt1);          \
   DECL(8, opt1);          \
@@ -394,56 +509,57 @@ DECLS(ssse3, ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                       \
-  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(                    \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,          \
-      const uint8_t *dst, int dst_stride, unsigned int *sseptr,                \
-      const uint8_t *sec) {                                                    \
-    unsigned int sse;                                                          \
-    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                         \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                      \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                        \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sseptr = sse;                                                             \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));   \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                  \
+  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(               \
+      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+      const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,          \
+      const uint8_t *second_pred) {                                       \
+    unsigned int sse_tmp;                                                 \
+    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                    \
+        src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride,     \
+        second_pred, w, h, &sse_tmp, NULL, NULL);                         \
+    if (w > wf) {                                                         \
+      unsigned int sse2;                                                  \
+      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                 \
+          src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16,     \
+          ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL);         \
+      se += se2;                                                          \
+      sse_tmp += sse2;                                                    \
+      if (w > wf * 2) {                                                   \
+        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                   \
+            src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32,   \
+            ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL);       \
+        se += se2;                                                        \
+        sse_tmp += sse2;                                                  \
+        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                   \
+            src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48,   \
+            ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL);       \
+        se += se2;                                                        \
+        sse_tmp += sse2;                                                  \
+      }                                                                   \
+    }                                                                     \
+    *sse = sse_tmp;                                                       \
+    return sse_tmp -                                                      \
+           (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
   }
 
-#define FNS(opt1, opt2)                              \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
+#define FNS(opt1, opt2)                             \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
+  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t))  \
+  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t))   \
+  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t))    \
+  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t))    \
+  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t))    \
   FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
 
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
 
 #undef FNS
 #undef FN
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
deleted file mode 100644
index 727d9d1156..0000000000
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/x86/convolve.h"
-
-#if HAVE_SSE2
-filter8_1dfunction vpx_filter_block1d16_v8_sse2;
-filter8_1dfunction vpx_filter_block1d16_h8_sse2;
-filter8_1dfunction vpx_filter_block1d8_v8_sse2;
-filter8_1dfunction vpx_filter_block1d8_h8_sse2;
-filter8_1dfunction vpx_filter_block1d4_v8_sse2;
-filter8_1dfunction vpx_filter_block1d4_h8_sse2;
-filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
-
-filter8_1dfunction vpx_filter_block1d16_v2_sse2;
-filter8_1dfunction vpx_filter_block1d16_h2_sse2;
-filter8_1dfunction vpx_filter_block1d8_v2_sse2;
-filter8_1dfunction vpx_filter_block1d8_h2_sse2;
-filter8_1dfunction vpx_filter_block1d4_v2_sse2;
-filter8_1dfunction vpx_filter_block1d4_h2_sse2;
-filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
-
-// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
-//                                  int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-
-// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                         uint8_t *dst, ptrdiff_t dst_stride,
-//                         const int16_t *filter_x, int x_step_q4,
-//                         const int16_t *filter_y, int y_step_q4,
-//                         int w, int h);
-// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                             uint8_t *dst, ptrdiff_t dst_stride,
-//                             const int16_t *filter_x, int x_step_q4,
-//                             const int16_t *filter_y, int y_step_q4,
-//                             int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
-
-#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
-
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
-
-// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
-//                                      ptrdiff_t src_stride,
-//                                      uint8_t *dst,
-//                                      ptrdiff_t dst_stride,
-//                                      const int16_t *filter_x,
-//                                      int x_step_q4,
-//                                      const int16_t *filter_y,
-//                                      int y_step_q4,
-//                                      int w, int h, int bd);
-// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
-//                                     ptrdiff_t src_stride,
-//                                     uint8_t *dst,
-//                                     ptrdiff_t dst_stride,
-//                                     const int16_t *filter_x,
-//                                     int x_step_q4,
-//                                     const int16_t *filter_y,
-//                                     int y_step_q4,
-//                                     int w, int h, int bd);
-// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
-//                                          ptrdiff_t src_stride,
-//                                          uint8_t *dst,
-//                                          ptrdiff_t dst_stride,
-//                                          const int16_t *filter_x,
-//                                          int x_step_q4,
-//                                          const int16_t *filter_y,
-//                                          int y_step_q4,
-//                                          int w, int h, int bd);
-// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
-//                                         ptrdiff_t src_stride,
-//                                         uint8_t *dst,
-//                                         ptrdiff_t dst_stride,
-//                                         const int16_t *filter_x,
-//                                         int x_step_q4,
-//                                         const int16_t *filter_y,
-//                                         int y_step_q4,
-//                                         int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-                 sse2);
-
-// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h, int bd);
-// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
-#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-#endif  // HAVE_SSE2
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index e2311c1167..3f444e2e6a 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -20,21 +20,19 @@ SECTION .text
 %endif
 %ifidn %2, highbd
 %define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                               dst, dst_stride, \
-                                              fx, fxs, fy, fys, w, h, bd
+                                              f, fxo, fxs, fyo, fys, w, h, bd
 %else
 %define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                            dst, dst_stride, \
-                                           fx, fxs, fy, fys, w, h
+                                           f, fxo, fxs, fyo, fys, w, h
 %endif
   mov r4d, dword wm
 %ifidn %2, highbd
   shl r4d, 1
-  shl srcq, 1
   shl src_strideq, 1
-  shl dstq, 1
   shl dst_strideq, 1
 %else
   cmp r4d, 4
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
index bfc816f235..fc301fb39e 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
@@ -45,7 +45,7 @@
 
     ;Compute max and min values of a pixel
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)      ;bps
+    movsxd      rcx, DWORD PTR arg(6)      ;bd
     movq        xmm0, rdx
     movq        xmm1, rcx
     pshufd      xmm0, xmm0, 0b
@@ -121,7 +121,7 @@
 
     ;Compute max and min values of a pixel
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm0, rdx
     movq        xmm1, rcx
     pshufd      xmm0, xmm0, 0b
@@ -197,7 +197,9 @@
     movdqu      [rdi + %2], xmm0
 %endm
 
-;void vpx_filter_block1d4_v8_sse2
+SECTION .text
+
+;void vpx_highbd_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -206,7 +208,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v8_sse2)
 sym(vpx_highbd_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -267,7 +269,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2):
     pop         rbp
     ret
 
-;void vpx_filter_block1d8_v8_sse2
+;void vpx_highbd_filter_block1d8_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -276,7 +278,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_v8_sse2)
 sym(vpx_highbd_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -326,7 +328,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2):
     pop         rbp
     ret
 
-;void vpx_filter_block1d16_v8_sse2
+;void vpx_highbd_filter_block1d16_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned int   src_pitch,
@@ -335,7 +337,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v8_sse2)
 sym(vpx_highbd_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -389,7 +391,7 @@ sym(vpx_highbd_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2)
 sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -450,7 +452,7 @@ sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2)
 sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -499,7 +501,7 @@ sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2)
 sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -552,7 +554,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
     pop         rbp
     ret
 
-;void vpx_filter_block1d4_h8_sse2
+;void vpx_highbd_filter_block1d4_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -561,7 +563,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h8_sse2)
 sym(vpx_highbd_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -627,7 +629,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2):
     pop         rbp
     ret
 
-;void vpx_filter_block1d8_h8_sse2
+;void vpx_highbd_filter_block1d8_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -636,7 +638,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_h8_sse2)
 sym(vpx_highbd_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -693,7 +695,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2):
     pop         rbp
     ret
 
-;void vpx_filter_block1d16_h8_sse2
+;void vpx_highbd_filter_block1d16_h8_sse2
 ;(
 ;    unsigned char  *src_ptr,
 ;    unsigned int    src_pixels_per_line,
@@ -702,7 +704,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h8_sse2)
 sym(vpx_highbd_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -770,7 +772,7 @@ sym(vpx_highbd_filter_block1d16_h8_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2)
 sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -836,7 +838,7 @@ sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2)
 sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -893,7 +895,7 @@ sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2)
 sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
index 72f2ff71da..bd51c75bcb 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm
@@ -26,7 +26,7 @@
     pshufd      xmm3, xmm3, 0
 
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm5, rdx
     movq        xmm2, rcx
     pshufd      xmm5, xmm5, 0b
@@ -64,7 +64,7 @@
     dec         rcx
 %endm
 
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
 %macro HIGH_GET_PARAM 0
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
@@ -82,7 +82,7 @@
     pshufd      xmm4, xmm4, 0
 
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm8, rdx
     movq        xmm5, rcx
     pshufd      xmm8, xmm8, 0b
@@ -171,7 +171,9 @@
 %endm
 %endif
 
-global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+SECTION .text
+
+globalsym(vpx_highbd_filter_block1d4_v2_sse2)
 sym(vpx_highbd_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -195,8 +197,8 @@ sym(vpx_highbd_filter_block1d4_v2_sse2):
     pop         rbp
     ret
 
-%if ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_v2_sse2)
 sym(vpx_highbd_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -222,7 +224,7 @@ sym(vpx_highbd_filter_block1d8_v2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v2_sse2)
 sym(vpx_highbd_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -251,7 +253,7 @@ sym(vpx_highbd_filter_block1d16_v2_sse2):
     ret
 %endif
 
-global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2)
 sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -275,8 +277,8 @@ sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
     pop         rbp
     ret
 
-%if ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2)
 sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -302,7 +304,7 @@ sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2)
 sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -331,7 +333,7 @@ sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
     ret
 %endif
 
-global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h2_sse2)
 sym(vpx_highbd_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -356,8 +358,8 @@ sym(vpx_highbd_filter_block1d4_h2_sse2):
     pop         rbp
     ret
 
-%if ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_h2_sse2)
 sym(vpx_highbd_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -383,7 +385,7 @@ sym(vpx_highbd_filter_block1d8_h2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h2_sse2)
 sym(vpx_highbd_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -412,7 +414,7 @@ sym(vpx_highbd_filter_block1d16_h2_sse2):
     ret
 %endif
 
-global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2)
 sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -437,8 +439,8 @@ sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
     pop         rbp
     ret
 
-%if ARCH_X86_64
-global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+%if VPX_ARCH_X86_64
+globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2)
 sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -464,7 +466,7 @@ sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2)
 sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
new file mode 100644
index 0000000000..21a35ae3c3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -0,0 +1,1161 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_ports/mem.h"
+
+#define CONV8_ROUNDING_BITS (7)
+#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1))
+
+static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i dst_first, dst_second;
+  __m128i even, odd;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will load multiple shifted versions of the row and shuffle them into
+    // 16-bit words of the form
+    // ... s[2] s[1] s[0] s[-1]
+    // ... s[4] s[3] s[2] s[1]
+    // Then we call multiply and add to get partial results
+    // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
+    // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
+    // The two results are then added together for the first half of even
+    // output.
+    // Repeat multiple times to get the whole outoput
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Output 6 4 2 0
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
+
+    // Output 7 5 3 1
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    dst_first = mm_zip_epi32_sse2(&even, &odd);
+
+    // Do again to get the second half of dst
+    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Output 14 12 10 8
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
+
+    // Output 15 13 11 9
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the second half of the dst
+    dst_second = mm_zip_epi32_sse2(&even, &odd);
+
+    // Round each result
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+    // Finally combine to get the final dst
+    dst_first = _mm_packus_epi16(dst_first, dst_second);
+    _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/* The macro used to generate functions shifts the src_ptr up by 3 rows already
+ * */
+
+static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+  // Half of half of the interleaved rows
+  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1,
+      src_reg_m10_hi_2;
+  __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2;
+  __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2;
+  __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+  // words,
+  // shuffle the data into the form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
+  src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128());
+  src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128());
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
+  src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128());
+  src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128());
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+    // Partial output from first half
+    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
+        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
+
+    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+                                             &kernel_reg_23);
+
+    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
+    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+                                             &kernel_reg_45);
+
+    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
+    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+                                             &kernel_reg_45);
+
+    // Add to get first half of the results
+    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+    // Now repeat everything again for the second half
+    // Partial output for second half
+    res_reg_m10_hi = mm_madd_packs_epi16_sse2(
+        &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);
+
+    res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2,
+                                             &kernel_reg_23);
+
+    src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
+    src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
+    res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2,
+                                             &kernel_reg_45);
+
+    src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
+    src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
+    res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2,
+                                             &kernel_reg_45);
+
+    // Second half of the results
+    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo_1 = src_reg_12_lo_1;
+    src_reg_m10_lo_2 = src_reg_12_lo_2;
+    src_reg_m10_hi_1 = src_reg_12_hi_1;
+    src_reg_m10_hi_2 = src_reg_12_hi_2;
+    src_reg_01_lo_1 = src_reg_23_lo_1;
+    src_reg_01_lo_2 = src_reg_23_lo_2;
+    src_reg_01_hi_1 = src_reg_23_hi_1;
+    src_reg_01_hi_2 = src_reg_23_hi_2;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i dst_first;
+  __m128i even, odd;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will load multiple shifted versions of the row and shuffle them into
+    // 16-bit words of the form
+    // ... s[2] s[1] s[0] s[-1]
+    // ... s[4] s[3] s[2] s[1]
+    // Then we call multiply and add to get partial results
+    // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
+    // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
+    // The two results are then added together to get the even output
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Output 6 4 2 0
+    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                 &kernel_reg_45);
+
+    // Output 7 5 3 1
+    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    dst_first = mm_zip_epi32_sse2(&even, &odd);
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Saturate and convert to 8-bit words
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo;
+  __m128i src_reg_12_lo, src_reg_23_lo;
+  // Half of half of the interleaved rows
+  __m128i src_reg_m10_lo_1, src_reg_m10_lo_2;
+  __m128i src_reg_01_lo_1, src_reg_01_lo_2;
+  __m128i src_reg_12_lo_1, src_reg_12_lo_2;
+  __m128i src_reg_23_lo_1, src_reg_23_lo_2;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+  // words,
+  // shuffle the data into the form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+  src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128());
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+  src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128());
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10_lo = mm_madd_packs_epi16_sse2(
+        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
+
+    res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2,
+                                             &kernel_reg_23);
+
+    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+    src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
+    res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2,
+                                             &kernel_reg_45);
+
+    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+    src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
+    res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2,
+                                             &kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+    // Round the words
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+
+    // Convert to 8-bit words
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128());
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128());
+
+    // Save only half of the register (8 words)
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo_1 = src_reg_12_lo_1;
+    src_reg_m10_lo_2 = src_reg_12_lo_2;
+    src_reg_01_lo_1 = src_reg_23_lo_1;
+    src_reg_01_lo_2 = src_reg_23_lo_2;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i dst_first;
+  __m128i tmp_0, tmp_1;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will load multiple shifted versions of the row and shuffle them into
+    // 16-bit words of the form
+    // ... s[1] s[0] s[0] s[-1]
+    // ... s[3] s[2] s[2] s[1]
+    // Then we call multiply and add to get partial results
+    // s[1]k[3]+s[0]k[2] s[0]k[3]s[-1]k[2]
+    // s[3]k[5]+s[2]k[4] s[2]k[5]s[1]k[4]
+    // The two results are then added together to get the output
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
+
+    // Convert to 16-bit words
+    src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
+    src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
+    src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
+    src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
+
+    // Shuffle into the right format
+    tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1);
+    tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3);
+
+    // Partial output
+    tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23);
+    tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45);
+
+    // Output
+    dst_first = _mm_add_epi32(tmp_0, tmp_1);
+    dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128());
+
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Saturate and convert to 8-bit words
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo;
+  __m128i src_reg_12_lo, src_reg_23_lo;
+  // Half of half of the interleaved rows
+  __m128i src_reg_m10_lo_1;
+  __m128i src_reg_01_lo_1;
+  __m128i src_reg_12_lo_1;
+  __m128i src_reg_23_lo_1;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
+  // words,
+  // shuffle the data into the form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+  src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128());
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+  src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128());
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23);
+
+    res_reg_01_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23);
+
+    src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
+    res_reg_12_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45);
+
+    src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
+    res_reg_23_lo =
+        mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+    // Round the words
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+
+    // Convert to 8-bit words
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero);
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
+
+    // Save only half of the register (8 words)
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+    *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo_1 = src_reg_12_lo_1;
+    src_reg_01_lo_1 = src_reg_23_lo_1;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+static void vpx_highbd_filter_block1d4_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load multiple shifted versions of the row and shuffle them into
+  // 16-bit words of the form
+  // ... s[2] s[1] s[0] s[-1]
+  // ... s[4] s[3] s[2] s[1]
+  // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
+  // The two results are then added together to get the even output
+
+  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
+    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
+    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);
+
+    // Output 2 0
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 3 1
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    res_reg = _mm_unpacklo_epi32(even, odd);
+    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = _mm_packs_epi32(res_reg, reg_zero);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_highbd_filter_block1d4_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+  __m128i res_reg_m1012, res_reg_0123;
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23);
+    res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23);
+    res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45);
+    res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12);
+    res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23);
+
+    // Round the words
+    res_reg_m1012 =
+        mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123 =
+        mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS);
+
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero);
+
+    // Saturate according to bit depth
+    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save only half of the register (8 words)
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10 = src_reg_12;
+    src_reg_01 = src_reg_23;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_highbd_filter_block1d8_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load multiple shifted versions of the row and shuffle them into
+  // 16-bit words of the form
+  // ... s[2] s[1] s[0] s[-1]
+  // ... s[4] s[3] s[2] s[1]
+  // Then we call multiply and add to get partial results
+  // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
+  // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
+  // The two results are then added together for the first half of even
+  // output.
+  // Repeat multiple times to get the whole outoput
+
+  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
+      src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+  __m128i tmp_0, tmp_1;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will put first half in the first half of the reg, and second half in
+    // second half
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // Output 6 4 2 0
+    tmp_0 = _mm_srli_si128(src_reg, 4);
+    tmp_1 = _mm_srli_si128(src_reg_next, 2);
+    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 7 5 3 1
+    tmp_0 = _mm_srli_si128(src_reg, 2);
+    tmp_1 = src_reg_next;
+    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    tmp_0 = _mm_srli_si128(src_reg, 6);
+    tmp_1 = _mm_srli_si128(src_reg_next, 4);
+    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the first half of the dst
+    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
+    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm_zip_epi32_sse2(&even, &odd);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_highbd_filter_block1d8_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+  __m128i res_reg_m1012_hi, res_reg_0123_hi;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);
+
+    // Partial output for first half
+    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
+    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
+    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
+    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);
+
+    // Round the words
+    res_reg_m1012_lo =
+        mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_lo =
+        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Partial output for first half
+    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
+    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
+    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
+    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_hi =
+        mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_hi =
+        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine the two halfs
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);
+
+    // Saturate according to bit depth
+    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save only half of the register (8 words)
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo = src_reg_12_lo;
+    src_reg_m10_hi = src_reg_12_hi;
+    src_reg_01_lo = src_reg_23_lo;
+    src_reg_01_hi = src_reg_23_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d16_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+
+// From vpx_subpixel_8t_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
+#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
+#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
+#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
+#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
+#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
+
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const InterpKernel *filter, int x0_q4,
+//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                               int w, int h);
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const InterpKernel *filter, int x0_q4,
+//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                              int w, int h);
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const InterpKernel *filter, int x0_q4,
+//                                   int32_t x_step_q4, int y0_q4,
+//                                   int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                  uint8_t *dst, ptrdiff_t dst_stride,
+//                                  const InterpKernel *filter, int x0_q4,
+//                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                                  int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+            sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
+
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                         uint8_t *dst, ptrdiff_t dst_stride,
+//                         const InterpKernel *filter, int x0_q4,
+//                         int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                         int w, int h);
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                             uint8_t *dst, ptrdiff_t dst_stride,
+//                             const InterpKernel *filter, int x0_q4,
+//                             int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                             int w, int h);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
+
+#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
+// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
+  vpx_highbd_filter_block1d16_v8_avg_sse2
+#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
+  vpx_highbd_filter_block1d16_h8_avg_sse2
+#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
+  vpx_highbd_filter_block1d8_v8_avg_sse2
+#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
+  vpx_highbd_filter_block1d8_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
+  vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
+  vpx_highbd_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm.
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
+//                                      ptrdiff_t src_stride,
+//                                      uint8_t *dst,
+//                                      ptrdiff_t dst_stride,
+//                                      const int16_t *filter_x,
+//                                      int x_step_q4,
+//                                      const int16_t *filter_y,
+//                                      int y_step_q4,
+//                                      int w, int h, int bd);
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
+//                                     ptrdiff_t src_stride,
+//                                     uint8_t *dst,
+//                                     ptrdiff_t dst_stride,
+//                                     const int16_t *filter_x,
+//                                     int x_step_q4,
+//                                     const int16_t *filter_y,
+//                                     int y_step_q4,
+//                                     int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+//                                          ptrdiff_t src_stride,
+//                                          uint8_t *dst,
+//                                          ptrdiff_t dst_stride,
+//                                          const int16_t *filter_x,
+//                                          int x_step_q4,
+//                                          const int16_t *filter_y,
+//                                          int y_step_q4,
+//                                          int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+//                                         ptrdiff_t src_stride,
+//                                         uint8_t *dst,
+//                                         ptrdiff_t dst_stride,
+//                                         const int16_t *filter_x,
+//                                         int x_step_q4,
+//                                         const int16_t *filter_y,
+//                                         int y_step_q4,
+//                                         int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
+
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const InterpKernel *filter, int x0_q4,
+//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                                int w, int h, int bd);
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const InterpKernel *filter, int x0_q4,
+//                                    int32_t x_step_q4, int y0_q4,
+//                                    int y_step_q4, int w, int h, int bd);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
+#endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 7c1ecc0148..526c283823 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -9,21 +9,25 @@
  */
 
 #include <immintrin.h>
+#include <stdio.h>
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_avx2.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
 #include "vpx_ports/mem.h"
 
-// filters for 16_h8 and 16_v8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
+// filters for 16_h8
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
 
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4,  5, 5, 6, 6, 7, 7,
+                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+                                           5, 6, 6, 7, 7,  8, 8, 9, 9, 10 };
 
 DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
   4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
@@ -35,540 +39,1336 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
-#if defined(__clang__)
-#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
-    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
-    (defined(__APPLE__) && defined(__apple_build_version__) && \
-     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
-      (__clang_major__ == 5 && __clang_minor__ == 0)))
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#else  // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // clang <= 3.3
-#elif defined(__GNUC__)
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-#else  // gcc > 4.7
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // gcc <= 4.6
-#else   // !(gcc || clang)
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // __clang__
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
+  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
+  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
+  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
 
-static void vpx_filter_block1d16_h8_avx2(
+#define CALC_CONVOLVE8_HORZ_ROW                                               \
+  srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);          \
+  s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]);                               \
+  s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]);                               \
+  s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]);                               \
+  s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]);                               \
+  s1[0] = convolve8_16_avx2(s1, f1);                                          \
+  s1[0] = _mm256_packus_epi16(s1[0], s1[0]);                                  \
+  src_ptr += src_stride;                                                      \
+  _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \
+  output_ptr += output_pitch;                                                 \
+  _mm_storel_epi64((__m128i *)&output_ptr[0],                                 \
+                   _mm256_extractf128_si256(s1[0], 1));                       \
+  output_ptr += output_pitch;
+
+static INLINE void vpx_filter_block1d16_h8_x_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+    const int avg) {
+  __m128i outReg1, outReg2;
+  __m256i outReg32b1, outReg32b2;
   unsigned int i;
   ptrdiff_t src_stride, dst_stride;
+  __m256i f[4], filt[4], s[4];
 
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
-  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
-  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+  shuffle_filter_avx2(filter, f);
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
 
   // multiple the size of the source and destination stride by two
   src_stride = src_pixels_per_line << 1;
   dst_stride = output_pitch << 1;
   for (i = output_height; i > 1; i -= 2) {
+    __m256i srcReg;
+
     // load the 2 strides of source
-    srcReg32b1 =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
-    srcReg32b1 = _mm256_inserti128_si256(
-        srcReg32b1,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
-        1);
+    srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3);
 
     // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(
-        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+    outReg32b1 = convolve8_16_avx2(s, f);
 
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg32b2 =
-        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
-    srcReg32b2 = _mm256_inserti128_si256(
-        srcReg32b2,
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
-        1);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(
-        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
+    srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5);
 
     // filter the source buffer
-    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
+    outReg32b2 = convolve8_16_avx2(s, f);
 
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
-
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
+    // contain the first and second convolve result respectively
+    outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2);
 
     src_ptr += src_stride;
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt32b1_1));
-
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
-                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+    if (avg) {
+      const __m256i outReg = mm256_loadu2_si128(
+          (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch));
+      outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg);
+    }
+    mm256_store2_si128((__m128i *)output_ptr,
+                       (__m128i *)(output_ptr + output_pitch), &outReg32b1);
     output_ptr += dst_stride;
   }
 
   // if the number of strides is odd.
   // process only 16 bytes
   if (i > 0) {
-    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+    const __m256i srcReg =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1);
 
     // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+    s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+    s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+    s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+    s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
 
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+    // The low and high 128-bits of each lane contain the first and second
+    // convolve result respectively
+    outReg32b1 = convolve8_16_avx2(s, f);
+    outReg1 = _mm256_castsi256_si128(outReg32b1);
+    outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
 
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+    // shrink to 8 bit each 16 bits
+    outReg1 = _mm_packus_epi16(outReg1, outReg2);
 
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-    // reading the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-    // filter the source buffer
-    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2_1 =
-        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
-
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+    // average if necessary
+    if (avg) {
+      outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+    }
 
     // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+    _mm_store_si128((__m128i *)output_ptr, outReg1);
   }
 }
 
-static void vpx_filter_block1d16_v8_avx2(
+static void vpx_filter_block1d16_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+    ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+  vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+                                 output_height, filter, 0);
+}
+
+static void vpx_filter_block1d16_h8_avg_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+    ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+  vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+                                 output_height, filter, 1);
+}
+
+static void vpx_filter_block1d8_h8_avx2(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg64;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i filt[4], f1[4], s1[4], srcReg;
+  __m128i f[4], s[4];
+  int y = output_height;
+
+  // Multiply the size of the source stride by two
+  const ptrdiff_t src_stride = src_pitch << 1;
+
+  shuffle_filter_avx2(filter, f1);
+  filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
+  filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
+  filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
+
+  // Process next 4 rows
+  while (y > 3) {
+    CALC_CONVOLVE8_HORZ_ROW
+    CALC_CONVOLVE8_HORZ_ROW
+    y -= 4;
+  }
+
+  // If remaining, then process 2 rows at a time
+  while (y > 1) {
+    CALC_CONVOLVE8_HORZ_ROW
+    y -= 2;
+  }
+
+  // For the remaining height.
+  if (y > 0) {
+    const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    f[0] = _mm256_castsi256_si128(f1[0]);
+    f[1] = _mm256_castsi256_si128(f1[1]);
+    f[2] = _mm256_castsi256_si128(f1[2]);
+    f[3] = _mm256_castsi256_si128(f1[3]);
+
+    // filter the source buffer
+    s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0]));
+    s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1]));
+    s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2]));
+    s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3]));
+    s[0] = convolve8_8_ssse3(s, f);
+
+    // Saturate 16bit value to 8bit.
+    s[0] = _mm_packus_epi16(s[0], s[0]);
+
+    // Save only 8 bytes
+    _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
+  }
+}
+
+static INLINE void vpx_filter_block1d16_v8_x_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
+    const int avg) {
+  __m256i srcRegHead1;
   unsigned int i;
   ptrdiff_t src_stride, dst_stride;
+  __m256i f[4], s1[4], s2[4];
 
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+  shuffle_filter_avx2(filter, f);
 
   // multiple the size of the source and destination stride by two
   src_stride = src_pitch << 1;
   dst_stride = out_pitch << 1;
 
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
-  srcReg32b2 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
-  srcReg32b3 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
-  srcReg32b4 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
-  srcReg32b5 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-  srcReg32b6 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+  {
+    __m128i s[6];
+    __m256i s32b[6];
 
-  // have each consecutive loads on the same 256 register
-  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
-                                       _mm256_castsi256_si128(srcReg32b2), 1);
-  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
-                                       _mm256_castsi256_si128(srcReg32b3), 1);
-  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
-                                       _mm256_castsi256_si128(srcReg32b4), 1);
-  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
-                                       _mm256_castsi256_si128(srcReg32b5), 1);
-  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
-                                       _mm256_castsi256_si128(srcReg32b6), 1);
-  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
-                                       _mm256_castsi256_si128(srcReg32b7), 1);
+    // load 16 bytes 7 times in stride of src_pitch
+    s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch));
+    s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch));
+    s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch));
+    s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch));
+    s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch));
+    s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch));
+    srcRegHead1 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch)));
 
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+    // have each consecutive loads on the same 256 register
+    s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+    s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+    s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+    s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+    s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+    s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]),
+                                      _mm256_castsi256_si128(srcRegHead1), 1);
 
-  // save
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+    // merge every two consecutive registers except the last one
+    // the first lanes contain values for filtering odd rows (1,3,5...) and
+    // the second lanes contain values for filtering even rows (2,4,6...)
+    s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]);
+    s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]);
+    s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]);
+    s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]);
+    s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]);
+    s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
+  }
 
-  // save
-  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-
-  // save
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
-  // save
-  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 1));
 
   for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
+    __m256i srcRegHead2, srcRegHead3;
+
+    // load the next 2 loads of 16 bytes and have every two
     // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
+    srcRegHead2 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch)));
+    srcRegHead1 = _mm256_inserti128_si256(
+        srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1);
+    srcRegHead3 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch)));
+    srcRegHead2 = _mm256_inserti128_si256(
+        srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1);
 
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+    // merge the two new consecutive registers
+    // the first lane contain values for filtering odd rows (1,3,5...) and
+    // the second lane contain values for filtering even rows (2,4,6...)
+    s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2);
+    s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2);
 
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+    s1[0] = convolve8_16_avx2(s1, f);
+    s2[0] = convolve8_16_avx2(s2, f);
 
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
-
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
-    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+    // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
+    // contain the first and second convolve result respectively
+    s1[0] = _mm256_packus_epi16(s1[0], s2[0]);
 
     src_ptr += src_stride;
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+    // average if necessary
+    if (avg) {
+      const __m256i outReg = mm256_loadu2_si128(
+          (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch));
+      s1[0] = _mm256_avg_epu8(s1[0], outReg);
+    }
 
-    // save the next 16 bits
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
-                    _mm256_extractf128_si256(srcReg32b1, 1));
+    mm256_store2_si128((__m128i *)output_ptr,
+                       (__m128i *)(output_ptr + out_pitch), s1);
 
     output_ptr += dst_stride;
 
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b1 = srcReg32b3;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b3 = srcReg32b5;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b5 = srcReg32b7;
-    srcReg32b7 = srcReg32b9;
+    // shift down by two rows
+    s1[0] = s1[1];
+    s2[0] = s2[1];
+    s1[1] = s1[2];
+    s2[1] = s2[2];
+    s1[2] = s1[3];
+    s2[2] = s2[3];
+    srcRegHead1 = srcRegHead3;
   }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+}
 
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-    srcRegFilt7 =
-        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *filter) {
+  vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                 height, filter, 0);
+}
 
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt7 =
-        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+static void vpx_filter_block1d16_v8_avg_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
+  vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                 height, filter, 1);
+}
 
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+  // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+  // time.
 
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
-                                    _mm256_castsi256_si128(secondFilters));
+  __m128i kernel_reg;  // Kernel
+  __m256i kernel_reg_256, kernel_reg_23,
+      kernel_reg_45;                             // Segments of the kernel used
+  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
 
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
-                                    _mm256_castsi256_si128(thirdFilters));
+  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m256i dst_first, dst_second;
+  __m256i tmp_0, tmp_1;
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+                       2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+                       4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
 
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
 
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+  kernel_reg_23 =
+      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+  kernel_reg_45 =
+      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
 
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
+    // Partial result for first half
+    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
 
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+    // Do again to get the second half of dst
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+    // Partial result for second half
+    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_second = _mm256_adds_epi16(tmp_0, tmp_1);
+
+    // Round each result
+    dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+    dst_second = mm256_round_epi16(&dst_second, &reg_32, 6);
+
+    // Finally combine to get the final dst
+    dst_first = _mm256_packus_epi16(dst_first, dst_second);
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &dst_first);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
+    // Reorder into 2 1 1 2
+    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
+
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm256_adds_epi16(tmp_0, tmp_1);
+
+    dst_first = mm256_round_epi16(&dst_first, &reg_32, 6);
+
+    dst_first = _mm256_packus_epi16(dst_first, dst_first);
+    dst_first = _mm256_permute4x64_epi64(dst_first, 0x8);
+
+    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first));
+  }
+}
+
+static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+  __m128i kernel_reg;  // Kernel
+  __m256i kernel_reg_256, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi;
+  __m256i res_reg, res_reg_lo, res_reg_hi;
+
+  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg);
+  kernel_reg_23 =
+      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u));
+  kernel_reg_45 =
+      _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u));
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+  src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+    src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23);
+
+    // Output from first half
+    res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23);
+    res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45);
+    res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo);
+
+    // Output from second half
+    res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23);
+    res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45);
+    res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi);
+
+    // Round the words
+    res_reg_lo = mm256_round_epi16(&res_reg_lo, &reg_32, 6);
+    res_reg_hi = mm256_round_epi16(&res_reg_hi, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi);
+
+    // Save the result
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001_lo = src_reg_1223_lo;
+    src_reg_m1001_hi = src_reg_1223_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+  // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+  // time.
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;                             // Segments of the kernel used
+  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
+
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+                       2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3,
+                       4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    __m256i dst_reg;
+    __m256i tmp_0, tmp_1;
+    const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the output
+    tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_reg = _mm256_adds_epi16(tmp_0, tmp_1);
+
+    // Round the result
+    dst_reg = mm256_round_epi16(&dst_reg, &reg_32, 6);
+
+    // Finally combine to get the final dst
+    dst_reg = _mm256_packus_epi16(dst_reg, dst_reg);
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &dst_reg);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    __m128i dst_reg;
+    const __m128i reg_32_128 = _mm_set1_epi16(32);  // Used for rounding
+    __m128i tmp_0, tmp_1;
+
+    __m128i src_reg_shift_0 =
+        _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0));
+    __m128i src_reg_shift_2 =
+        _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2));
+
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0,
+                              _mm256_castsi256_si128(kernel_reg_23));
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2,
+                              _mm256_castsi256_si128(kernel_reg_45));
+    dst_reg = _mm_adds_epi16(tmp_0, tmp_1);
+
+    dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6);
+
+    dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)dst_ptr, dst_reg);
+  }
+}
+
+static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001, src_reg_1223;
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m256i res_reg_m1001, res_reg_1223;
+  __m256i res_reg;
+
+  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u));
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+    // Output
+    res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23);
+    res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45);
+    res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223);
+
+    // Round the words
+    res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+    // Save the result
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into a single register in the form
+  // k[5:2] k[5:2] k[5:2] k[5:2]
+  // Then we shuffle the source into
+  // s[5:2] s[4:1] s[3:0] s[2:-1]
+  // Calling multiply and add gives us half of the sum next to each other.
+  // Calling horizontal add then gives us the output.
+  // Since avx2 has 256-bit register, we can do 2 rows at a time.
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg;
+  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
+  int h;
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+
+  __m256i shuf_idx =
+      _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2,
+                       3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+  for (h = height; h > 1; h -= 2) {
+    // Load the source
+    const __m256i src_reg = mm256_loadu2_epi64(
+        (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride));
+    const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx);
+
+    // Get the result
+    __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg);
+    dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256());
+
+    // Round result
+    dst = mm256_round_epi16(&dst, &reg_32, 6);
+
+    // Pack to 8-bits
+    dst = _mm256_packus_epi16(dst, _mm256_setzero_si256());
+
+    // Save
+    mm256_storeu2_epi32((__m128i *const)dst_ptr,
+                        (__m128i *const)(dst_ptr + dst_stride), &dst);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  if (h > 0) {
+    // Load the source
+    const __m128i reg_32_128 = _mm_set1_epi16(32);  // Used for rounding
+    __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr);
+    __m128i src_reg_shuf =
+        _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx));
+
+    // Get the result
+    __m128i dst =
+        _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg));
+    dst = _mm_hadds_epi16(dst, _mm_setzero_si128());
+
+    // Round result
+    dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6);
+
+    // Pack to 8-bits
+    dst = _mm_packus_epi16(dst, _mm_setzero_si128());
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+  }
+}
+
+static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
+                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get partial output.
+  // Calling horizontal add then gives us the completely output
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023;
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg;
+
+  // Result after multiply and add
+  __m256i res_reg;
+
+  const __m256i reg_32 = _mm256_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
+  kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u));
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23);
+
+    // Combine all the rows
+    src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+
+    // Output
+    res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg);
+    res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256());
+
+    // Round the words
+    res_reg = mm256_round_epi16(&res_reg, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi16(res_reg, res_reg);
+
+    // Save the result
+    mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d8_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i f[4], ss[4];
+  __m256i r[8];
+  __m128i s[9];
+
+  unsigned int y = output_height;
+  // Multiply the size of the source stride by two
+  const ptrdiff_t src_stride = src_pitch << 1;
+
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 1));
+
+  shuffle_filter_avx2(filter, f);
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+  // merge the result together
+  // r[0]:    0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0
+  // r07 r06 r05 r04 r03 r02 r01 r00
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1);
+
+  // r[1]:    0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0
+  // r17 r16 r15 r14 r13 r12 r11 r10
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1);
+
+  // r[2]:    0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0
+  // r27 r26 r25 r24 r23 r22 r21 r20
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1);
+
+  // r[3]:    0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0
+  // r37 r36 r35 r34 r33 r32 r31 r30
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1);
+
+  // r[4]:    0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0
+  // r47 r46 r45 r44 r43 r42 r41 r40
+  r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1);
+
+  // r[5]:    0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0
+  // r57 r56 r55 r54 r53 r52 r51 r50
+  r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1);
+
+  // Merge together
+  // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11
+  // r01|r10 r00|
+  ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+  // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31
+  // r21|r30 r20|
+  ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+  // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51
+  // r41|r50 r40|
+  ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+  // Process 2 rows at a time
+  do {
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+    // r[6]:    0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0
+    // 0 r67 r66 r65 r64 r63 r62 r61 r60
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1);
+    // r[7]:    0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0
+    // 0 r77 r76 r75 r74 r73 r72 r71 r70
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1);
+
+    // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72
+    // r62 | r71 r61|r70 r60|
+    ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+    ss[0] = convolve8_16_avx2(ss, f);
+    ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+    src_ptr += src_stride;
+
+    /* shift down two rows */
+    s[6] = s[8];
+    _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0]));
+    output_ptr += out_pitch;
+    _mm_storel_epi64((__m128i *)&output_ptr[0],
+                     _mm256_extractf128_si256(ss[0], 1));
+    output_ptr += out_pitch;
+    ss[0] = ss[1];
+    ss[1] = ss[2];
+    ss[2] = ss[3];
+    y -= 2;
+  } while (y > 1);
+}
+
+static void vpx_filter_block1d4_h8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m256i addFilterReg64_256bit;
+  unsigned int y = output_height;
+
+  assert(output_height > 1);
+
+  addFilterReg64_256bit = _mm256_set1_epi16(32);
+
+  // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit)
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+  // converting the 16 bit (short) to 8 bit (byte) and have the same data
+  // in both lanes of 128 bit register.
+  // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each)
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  {
+    ptrdiff_t src_stride;
+    __m256i filt1Reg, filt2Reg, firstFilters, secondFilters;
+    // have the same data in both lanes of a 256 bit register
+    // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0
+    // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each)
+    const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg);
+
+    // duplicate only the first 32 bits
+    // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1
+    // f0|f3 f2 f1 f0|f3 f2 f1 f0
+    firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+    // duplicate only the second 32 bits
+    // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5
+    // f4|f7 f6 f5 f4|f7 f6 f5 f4
+    secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+    // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3
+    // s2 s4 s3 s2 s1 s3 s2 s1 s0
+    filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+
+    // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7
+    // s6 s8 s7 s6 s5 s7 s6 s5 s4
+    filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+    // multiple the size of the source and destination stride by two
+    src_stride = src_pitch << 1;
+
+    do {
+      __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1;
+      // load the 2 strides of source
+      // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
+      // r06 r05 r04 r03 r02 r01 r00
+      srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);
+
+      // filter the source buffer
+      // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
+      // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00
+      srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+      // multiply 4 adjacent elements with the filter and add the result
+      // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||...
+      // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00
+      srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+      // filter the source buffer
+      // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010
+      // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+      srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+      // multiply 4 adjacent elements with the filter and add the result
+      // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010
+      // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04
+      srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+      srcRegFilt32b1_1 =
+          _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit);
+      srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+      srcRegFilt32b1_1 =
+          _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+      // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit)
+      srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
+
+      // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit)
+      srcRegFilt32b1_1 =
+          _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+      src_ptr += src_stride;
+      // save first row 4 values
+      *((int *)&output_ptr[0]) =
+          _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1));
+      output_ptr += output_pitch;
+
+      // save second row 4 values
+      *((int *)&output_ptr[0]) =
+          _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+      output_ptr += output_pitch;
+
+      y = y - 2;
+    } while (y > 1);
+
+    // For remaining height
+    if (y > 0) {
+      __m128i srcReg1, srcRegFilt1_1, addFilterReg64;
+      __m128i srcRegFilt2;
+
+      addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+
+      srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+      // filter the source buffer
+      srcRegFilt1_1 =
+          _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+      // multiply 4 adjacent elements with the filter and add the result
+      srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
+                                        _mm256_castsi256_si128(firstFilters));
+
+      // filter the source buffer
+      srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+      // multiply 4 adjacent elements with the filter and add the result
+      srcRegFilt2 =
+          _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+      srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+      srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+      // shift by 6 bit each 16 bit
+      srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
+      srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
+
+      // shrink to 8 bit each 16 bits, the first lane contain the first
+      // convolve result and the second lane contain the second convolve result
+      srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+      // save 4 bytes
+      *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+    }
+  }
+}
+
+static void vpx_filter_block1d4_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i f[4], ss[4];
+  __m256i r[9], rr[2];
+  __m128i s[11];
+
+  unsigned int y = output_height;
+  // Multiply the size of the source stride by four
+  const ptrdiff_t src_stride = src_pitch << 2;
+  const ptrdiff_t out_stride = out_pitch << 2;
+
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 0x01));
+
+  shuffle_filter_avx2(filter, f);
+
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1);
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1);
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1);
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1);
+  r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1);
+
+  // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12
+  // r11 r10 r03 r02 r01 r00
+  rr[0] = _mm256_unpacklo_epi32(r[0], r[1]);
+
+  // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+  // r21 r20 r13 r12 r11 r10
+  rr[1] = _mm256_unpacklo_epi32(r[1], r[2]);
+
+  // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
+  // r00|
+  ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+  // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12
+  // r11 r10 r03 r02 r01 r00
+  rr[0] = _mm256_unpacklo_epi32(r[2], r[3]);
+
+  // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+  // r21 r20 r13 r12 r11 r10
+  rr[1] = _mm256_unpacklo_epi32(r[3], r[4]);
+
+  // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
+  // r20|
+  ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+  // Process 4 rows at a time
+  while (y >= 4) {
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+    s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
+    s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
+
+    r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1);
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1);
+    rr[0] = _mm256_unpacklo_epi32(r[4], r[5]);
+    rr[1] = _mm256_unpacklo_epi32(r[5], r[6]);
+    ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1);
+    r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1);
+    rr[0] = _mm256_unpacklo_epi32(r[6], r[7]);
+    rr[1] = _mm256_unpacklo_epi32(r[7], r[8]);
+    ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+    ss[0] = convolve8_16_avx2(ss, f);
+
+    // r3 r2 r3 r2 r1 r0 r1 r0
+    ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+    src_ptr += src_stride;
+
+    mm256_storeu2_epi32((__m128i *const)output_ptr,
+                        (__m128i *const)(output_ptr + (2 * out_pitch)), ss);
+
+    ss[0] = _mm256_srli_si256(ss[0], 4);
+
+    mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)),
+                        (__m128i *const)(output_ptr + (3 * out_pitch)), ss);
+
+    output_ptr += out_stride;
+
+    ss[0] = ss[2];
+    ss[1] = ss[3];
+
+    s[6] = s[10];
+    s[5] = s[9];
+
+    r[4] = r[8];
+    y -= 4;
+  }
+
+  // Process 2 rows
+  if (y == 2) {
+    __m128i ss1[4], f1[4], r1[4];
+
+    s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+    f1[0] = _mm256_castsi256_si128(f[0]);
+    f1[1] = _mm256_castsi256_si128(f[1]);
+    f1[2] = _mm256_castsi256_si128(f[2]);
+    f1[3] = _mm256_castsi256_si128(f[3]);
+
+    r1[0] = _mm_unpacklo_epi32(s[4], s[5]);
+    r1[1] = _mm_unpacklo_epi32(s[5], s[6]);
+
+    // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+    r1[2] = _mm_unpacklo_epi32(s[6], s[7]);
+
+    // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+    r1[3] = _mm_unpacklo_epi32(s[7], s[8]);
+
+    // r23 r13....r20 r10|r13 r03....r10 r00
+    ss1[0] = _mm256_castsi256_si128(ss[0]);
+
+    // r43 r33....r40 r30|r33 r23....r30 r20
+    ss1[1] = _mm256_castsi256_si128(ss[1]);
+
+    // r63 r53....r60 r50|r53 r43....r50 r40
+    ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]);
+
+    // r83 r73....r80 r70|r73 r63....r70 r60
+    ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]);
+
+    ss1[0] = convolve8_8_ssse3(ss1, f1);
+
+    // r1 r0 r1 r0
+    ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]);
+
+    // Save first row 4 values
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+    output_ptr += out_pitch;
+
+    ss1[0] = _mm_srli_si128(ss1[0], 4);
+    // Save second row 4 values
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
   }
 }
 
 #if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
-#else  // ARCH_X86
+#else   // VPX_ARCH_X86
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
-#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
-#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
-#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
-#endif  // ARCH_X86_64
+#endif  // VPX_ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3
 filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
-#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
 #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
+#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
+#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
+#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
+#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
+#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
+
+#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2
+#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2
+#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2
+#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2
+#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2
+#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2
 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
+//                                const InterpKernel *filter, int x0_q4,
+//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                                int w, int h);
 // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
+//                               const InterpKernel *filter, int x0_q4,
+//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const InterpKernel *filter, int x0_q4,
+//                                    int32_t x_step_q4, int y0_q4,
+//                                    int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const InterpKernel *filter, int x0_q4,
+//                                   int32_t x_step_q4, int y0_q4,
+//                                   int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
+//                          const InterpKernel *filter, int x0_q4,
+//                          int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
-FUN_CONV_2D(, avx2);
+// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const InterpKernel *filter, int x0_q4,
+//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 09c75d455c..4ea2752d38 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -8,52 +8,51 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <tmmintrin.h>
+#include <tmmintrin.h>  // SSSE3
 
+#include <string.h>
+
+#include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
 
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+    const __m128i *const s, const int16_t *const filter) {
+  __m128i f[4];
+  shuffle_filter_ssse3(filter, f);
+  return convolve8_8_ssse3(s, f);
+}
 
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+// Used by the avx2 implementation.
+#if VPX_ARCH_X86_64
+// Use the intrinsics below
 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#else  // VPX_ARCH_X86
+// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+#endif
 
+#if VPX_ARCH_X86_64
 void vpx_filter_block1d4_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
   __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
+  __m128i srcRegFilt1, srcRegFilt2;
+  __m128i addFilterReg64, filtersReg, srcReg;
   unsigned int i;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -75,8 +74,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
   secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
 
   // loading the local filters
-  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
-  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+  shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6);
+  shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10);
 
   for (i = 0; i < output_height; i++) {
     srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
@@ -89,25 +88,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
 
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+    // sum the results together, saturating only on the final step
+    // the specific order of the additions prevents outranges
+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);
 
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+    // extract the higher half of the register
+    srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
 
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+    // add the rounding offset early to avoid another saturated add
+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
 
     // shift by 7 bit each 16 bits
     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
 
     // shrink to 8 bit each 16 bits
     srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-    src_ptr += src_pixels_per_line;
+    src_ptr += src_pitch;
 
     // save only 4 bytes
     *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
@@ -117,77 +114,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
 }
 
 void vpx_filter_block1d8_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, minReg;
   unsigned int i;
+  __m128i f[4], filt[4], s[4];
 
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+  shuffle_filter_ssse3(filter, f);
+  filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+  filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12);
+  filt[3] =
+      _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14);
 
   for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
-    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-    // add and saturate all the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+    s[0] = _mm_shuffle_epi8(srcReg, filt[0]);
+    s[1] = _mm_shuffle_epi8(srcReg, filt[1]);
+    s[2] = _mm_shuffle_epi8(srcReg, filt[2]);
+    s[3] = _mm_shuffle_epi8(srcReg, filt[3]);
+    s[0] = convolve8_8_ssse3(s, f);
 
     // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    s[0] = _mm_packus_epi16(s[0], s[0]);
 
-    src_ptr += src_pixels_per_line;
+    src_ptr += src_pitch;
 
     // save only 8 bytes
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+    _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]);
 
     output_ptr += output_pitch;
   }
@@ -196,94 +151,537 @@ void vpx_filter_block1d8_h8_intrin_ssse3(
 void vpx_filter_block1d8_v8_intrin_ssse3(
     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
   unsigned int i;
+  __m128i f[4], s[8], ss[4];
 
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+  shuffle_filter_ssse3(filter, f);
 
   // load the first 7 rows of 8 bytes
-  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
 
   for (i = 0; i < output_height; i++) {
     // load the last 8 bytes
-    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
 
     // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+    ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+    ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
 
     // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-    // add and saturate the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+    ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+    ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
 
+    ss[0] = convolve8_8_ssse3(ss, f);
     // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    ss[0] = _mm_packus_epi16(ss[0], ss[0]);
 
     src_ptr += src_pitch;
 
     // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
+    s[0] = s[1];
+    s[1] = s[2];
+    s[2] = s[3];
+    s[3] = s[4];
+    s[4] = s[5];
+    s[5] = s[6];
+    s[6] = s[7];
 
     // save only 8 bytes convolve result
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+    _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]);
 
     output_ptr += out_pitch;
   }
 }
+#endif  // VPX_ARCH_X86_64
 
+static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t *dst_ptr,
+                                          ptrdiff_t dst_stride, uint32_t height,
+                                          const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m128i dst_first, dst_second;
+  __m128i tmp_0, tmp_1;
+  __m128i idx_shift_0 =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m128i idx_shift_2 =
+      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Partial result for first half
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Do again to get the second half of dst
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Partial result for first half
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_second = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Round each result
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6);
+
+    // Finally combine to get the final dst
+    dst_first = _mm_packus_epi16(dst_first, dst_second);
+    _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t *dst_ptr,
+                                          ptrdiff_t dst_stride, uint32_t height,
+                                          const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // so that we can call multiply and add with the kernel to get 16-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+    // Partial output from first half
+    res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23);
+    res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23);
+
+    res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45);
+    res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45);
+
+    // Add to get first half of the results
+    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+    // Partial output for second half
+    res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23);
+    res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23);
+
+    res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45);
+    res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45);
+
+    // Second half of the results
+    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo = src_reg_12_lo;
+    src_reg_m10_hi = src_reg_12_hi;
+    src_reg_01_lo = src_reg_23_lo;
+    src_reg_01_hi = src_reg_23_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m128i dst_first;
+  __m128i tmp_0, tmp_1;
+  __m128i idx_shift_0 =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m128i idx_shift_2 =
+      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the result
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Round round result
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Pack to 8-bits
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get 16-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+  __m128i res_reg_m1012, res_reg_0123;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23);
+    res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23);
+
+    res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45);
+    res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45);
+
+    // Add to get entire output
+    res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12);
+    res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23);
+
+    // Round the words
+    res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
+    res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, &reg_32, 6);
+
+    // Pack from 16-bit to 8-bit
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128());
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10 = src_reg_12;
+    src_reg_01 = src_reg_23;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into a single register in the form
+  // k[5:2] k[5:2] k[5:2] k[5:2]
+  // Then we shuffle the source into
+  // s[5:2] s[4:1] s[3:0] s[2:-1]
+  // Calling multiply and add gives us half of the sum next to each other.
+  // Calling horizontal add then gives us the output.
+
+  __m128i kernel_reg;                         // Kernel
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shuf;
+  __m128i dst_first;
+  __m128i shuf_idx =
+      _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx);
+
+    // Get the result
+    dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg);
+    dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128());
+
+    // Round result
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Pack to 8-bits
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[2,0] s[1,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call horizontal add to get the output.
+  // Finally, we can add multiple rows together to get the desired output.
+  // This is done two rows at a time
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source.
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+  __m128i src_reg_m1001, src_reg_1223;
+  __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi;
+
+  __m128i kernel_reg;  // Kernel
+
+  // Result after multiply and add
+  __m128i reg_0, reg_1;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1);
+
+  // Put three rows next to each other
+  src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+    src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+    src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3);
+
+    // Put three rows next to each other
+    src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23);
+
+    // Put all four rows next to each other
+    src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+    src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223);
+
+    // Get the results
+    reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg);
+    reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg);
+    reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128());
+    reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
+
+    // Round the words
+    reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6);
+    reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6);
+
+    // Pack from 16-bit to 8-bit and put them in the right order
+    reg_0 = _mm_packus_epi16(reg_0, reg_0);
+    reg_1 = _mm_packus_epi16(reg_1, reg_1);
+
+    // Save the result
+    *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+    *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
@@ -291,6 +689,15 @@ filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
 
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3
+#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3
+#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
 filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
@@ -306,149 +713,71 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 
 // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
+//                                const InterpKernel *filter, int x0_q4,
+//                                int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                                int w, int h);
 // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
+//                               const InterpKernel *filter, int x0_q4,
+//                               int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
 // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
+//                                    const InterpKernel *filter, int x0_q4,
+//                                    int32_t x_step_q4, int y0_q4,
+//                                    int y_step_q4, int w, int h);
 // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
+//                                   const InterpKernel *filter, int x0_q4,
+//                                   int32_t x_step_q4, int y0_q4,
+//                                   int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
 
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
-    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
-    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
-    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
-                                                                          \
-    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
-    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
-    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
-    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
-    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
-    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
-    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
-    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
-    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
-  }
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const x_filter) {
+  __m128i s[8], ss[4], temp;
 
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *x_filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
-  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
-  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
-  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
-  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
-  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
-  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
-  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
-  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
+  load_8bit_8x8(src, src_stride, s);
+  // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+  // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+  // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+  // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+  transpose_16bit_4x8(s, ss);
+  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
   // shrink to 8 bit each 16 bits
   temp = _mm_packus_epi16(temp, temp);
   // save only 8 bytes convolve result
   _mm_storel_epi64((__m128i *)dst, temp);
 }
 
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A, B, C, D, E, F, G, H;
+static void transpose8x8_to_dst(const uint8_t *const src,
+                                const ptrdiff_t src_stride, uint8_t *const dst,
+                                const ptrdiff_t dst_stride) {
+  __m128i s[8];
 
-  A = _mm_loadl_epi64((const __m128i *)src);
-  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
-  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
-  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
-  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
-  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
-  _mm_storel_epi64((__m128i *)dst, A);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
+  load_8bit_8x8(src, src_stride, s);
+  transpose_8bit_8x8(s, s);
+  store_8bit_8x8(s, dst, dst_stride);
 }
 
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+                                    const ptrdiff_t src_stride, uint8_t *dst,
+                                    const ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    const int x0_q4, const int x_step_q4,
+                                    const int w, const int h) {
   DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
   int x, y, z;
   src -= SUBPEL_TAPS / 2 - 1;
 
-  // This function processes 8x8 areas.  The intermediate height is not always
+  // This function processes 8x8 areas. The intermediate height is not always
   // a multiple of 8, so force it to be a multiple of 8 here.
   y = h + (8 - (h & 0x7));
 
@@ -479,93 +808,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
   } while (y -= 8);
 }
 
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  // TRANSPOSE...
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  //
-  // TO
-  //
-  // 00 10 20 30
-  // 01 11 21 31
-  // 02 12 22 32
-  // 03 13 23 33
-  // 04 14 24 34
-  // 05 15 25 35
-  // 06 16 26 36
-  // 07 17 27 37
-  //
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const filter) {
+  __m128i s[4], ss[2];
+  __m128i temp;
+
+  load_8bit_8x4(src, src_stride, s);
+  transpose_16bit_4x4(s, ss);
+  // 00 01 10 11 20 21 30 31
+  s[0] = ss[0];
   // 02 03 12 13 22 23 32 33
-  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
+  s[1] = _mm_srli_si128(ss[0], 8);
+  // 04 05 14 15 24 25 34 35
+  s[2] = ss[1];
   // 06 07 16 17 26 27 36 37
-  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
+  s[3] = _mm_srli_si128(ss[1], 8);
+
+  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
   // shrink to 8 bit each 16 bits
   temp = _mm_packus_epi16(temp, temp);
   // save only 4 bytes
   *(int *)dst = _mm_cvtsi128_si32(temp);
 }
 
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
-  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
-  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
-  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
-  // 00 10 01 11 02 12 03 13
-  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
-  // 20 30 21 31 22 32 23 33
-  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
-  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  B = _mm_srli_si128(A, 4);
-  C = _mm_srli_si128(A, 8);
-  D = _mm_srli_si128(A, 12);
+static void transpose4x4_to_dst(const uint8_t *const src,
+                                const ptrdiff_t src_stride, uint8_t *const dst,
+                                const ptrdiff_t dst_stride) {
+  __m128i s[4];
 
-  *(int *)(dst) = _mm_cvtsi128_si32(A);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
+  load_8bit_4x4(src, src_stride, s);
+  s[0] = transpose_8bit_4x4(s);
+  s[1] = _mm_srli_si128(s[0], 4);
+  s[2] = _mm_srli_si128(s[0], 8);
+  s[3] = _mm_srli_si128(s[0], 12);
+  store_8bit_4x4(s, dst, dst_stride);
 }
 
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+                                    const ptrdiff_t src_stride, uint8_t *dst,
+                                    const ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    const int x0_q4, const int x_step_q4,
+                                    const int w, const int h) {
   DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
   int x, y, z;
   src -= SUBPEL_TAPS / 2 - 1;
@@ -597,50 +883,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
-  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
-  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
+static __m128i filter_vert_kernel(const __m128i *const s,
+                                  const int16_t *const filter) {
+  __m128i ss[4];
+  __m128i temp;
+
+  // 00 10 01 11 02 12 03 13
+  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  // 20 30 21 31 22 32 23 33
+  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+  // 40 50 41 51 42 52 43 53
+  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+  // 60 70 61 71 62 72 63 73
+  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
   // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
+  return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+                                 const ptrdiff_t src_stride, uint8_t *const dst,
+                                 const int16_t *const filter) {
+  __m128i s[8];
+  __m128i temp;
+
+  load_8bit_4x8(src, src_stride, s);
+  temp = filter_vert_kernel(s, filter);
   // save only 4 bytes
   *(int *)dst = _mm_cvtsi128_si32(temp);
 }
 
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
+static void scaledconvolve_vert_w4(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
   int y;
   int y_q4 = y0_q4;
 
@@ -659,50 +936,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+                                 const ptrdiff_t src_stride, uint8_t *const dst,
+                                 const int16_t *const filter) {
+  __m128i s[8], temp;
+
+  load_8bit_8x8(src, src_stride, s);
+  temp = filter_vert_kernel(s, filter);
   // save only 8 bytes convolve result
   _mm_storel_epi64((__m128i *)dst, temp);
 }
 
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
+static void scaledconvolve_vert_w8(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
   int y;
   int y_q4 = y0_q4;
 
@@ -719,81 +967,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter, int w) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+static void filter_vert_w16_ssse3(const uint8_t *src,
+                                  const ptrdiff_t src_stride,
+                                  uint8_t *const dst,
+                                  const int16_t *const filter, const int w) {
   int i;
+  __m128i f[4];
+  shuffle_filter_ssse3(filter, f);
 
   for (i = 0; i < w; i += 16) {
-    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
-    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-    const __m128i C =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-    const __m128i D =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-    const __m128i E =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-    const __m128i F =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    const __m128i G =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    const __m128i H =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-    // merge the result together
-    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
-    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
-    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
-    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
-    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
-    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
-    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
-    // add and saturate the results together
-    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
-    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
-    // merge the result together
-    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
-    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
-    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
-    // merge the result together
-    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
-    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
-    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
-    // add and saturate the results together
-    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
-    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
+    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
 
-    // add and saturate the results together
-    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
-    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
-    // round and shift by 7 bit each 16 bit
-    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
-    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
+    loadu_8bit_16x8(src, src_stride, s);
+
+    // merge the result together
+    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+    temp_lo = convolve8_8_ssse3(s_lo, f);
+    temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first convolve
+    // result and the second lane contain the second convolve result
     temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
-    src_ptr += 16;
+    src += 16;
     // save 16 bytes convolve result
     _mm_store_si128((__m128i *)&dst[i], temp_hi);
   }
 }
 
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *y_filters, int y0_q4,
-                                    int y_step_q4, int w, int h) {
+static void scaledconvolve_vert_w16(
+    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+    const int y0_q4, const int y_step_q4, const int w, const int h) {
   int y;
   int y_q4 = y0_q4;
 
@@ -811,11 +1022,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
   }
 }
 
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
+void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
   // 2d filtering proceeds in 2 steps:
   //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -829,60 +1039,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
   DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
   assert(w <= 64);
   assert(h <= 64);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
 
   if (w >= 8) {
     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
-                            w, intermediate_height);
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
   } else {
     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
-                            w, intermediate_height);
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
   }
 
   if (w >= 16) {
     scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
   } else if (w == 8) {
     scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
   } else {
     scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
   }
 }
 
-void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
+//                          const InterpKernel *filter, int x0_q4,
+//                          int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
+//                              const InterpKernel *filter, int x0_q4,
+//                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
index 08f3d6a6cf..c8455e13a2 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm
@@ -176,6 +176,8 @@
     movq        [rdi + %2], xmm0
 %endm
 
+SECTION .text
+
 ;void vpx_filter_block1d4_v8_sse2
 ;(
 ;    unsigned char *src_ptr,
@@ -185,7 +187,7 @@
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v8_sse2)
 sym(vpx_filter_block1d4_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -252,7 +254,7 @@ sym(vpx_filter_block1d4_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v8_sse2)
 sym(vpx_filter_block1d8_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -311,7 +313,7 @@ sym(vpx_filter_block1d8_v8_sse2):
 ;    unsigned int   output_height,
 ;    short *filter
 ;)
-global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v8_sse2)
 sym(vpx_filter_block1d16_v8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -365,7 +367,7 @@ sym(vpx_filter_block1d16_v8_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v8_avg_sse2)
 sym(vpx_filter_block1d4_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -423,7 +425,7 @@ sym(vpx_filter_block1d4_v8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v8_avg_sse2)
 sym(vpx_filter_block1d8_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -472,7 +474,7 @@ sym(vpx_filter_block1d8_v8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v8_avg_sse2)
 sym(vpx_filter_block1d16_v8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -534,7 +536,7 @@ sym(vpx_filter_block1d16_v8_avg_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h8_sse2)
 sym(vpx_filter_block1d4_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -608,7 +610,7 @@ sym(vpx_filter_block1d4_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h8_sse2)
 sym(vpx_filter_block1d8_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -683,7 +685,7 @@ sym(vpx_filter_block1d8_h8_sse2):
 ;    unsigned int    output_height,
 ;    short *filter
 ;)
-global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h8_sse2)
 sym(vpx_filter_block1d16_h8_sse2):
     push        rbp
     mov         rbp, rsp
@@ -769,7 +771,7 @@ sym(vpx_filter_block1d16_h8_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h8_avg_sse2)
 sym(vpx_filter_block1d4_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -834,7 +836,7 @@ sym(vpx_filter_block1d4_h8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h8_avg_sse2)
 sym(vpx_filter_block1d8_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -900,7 +902,7 @@ sym(vpx_filter_block1d8_h8_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h8_avg_sse2)
 sym(vpx_filter_block1d16_h8_avg_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index c1a6f23abe..fe617f1207 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -26,7 +26,7 @@ SECTION .text
 %define LOCAL_VARS_SIZE 16*6
 
 %macro SETUP_LOCAL_VARS 0
-    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
+    ; TODO(slavarnway): using xmm registers for these on VPX_ARCH_X86_64 +
     ; pmaddubsw has a higher latency on some platforms, this might be eased by
     ; interleaving the instructions.
     %define    k0k1  [rsp + 16*0]
@@ -48,7 +48,7 @@ SECTION .text
     mova       k2k3, m1
     mova       k4k5, m2
     mova       k6k7, m3
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
     %define     krd  m12
     %define    tmp0  [rsp + 16*4]
     %define    tmp1  [rsp + 16*5]
@@ -68,7 +68,7 @@ SECTION .text
 %endm
 
 ;-------------------------------------------------------------------------------
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
   %define LOCAL_VARS_SIZE_H4 0
 %else
   %define LOCAL_VARS_SIZE_H4 16*4
@@ -79,7 +79,7 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                             src, sstride, dst, dstride, height, filter
     mova                m4, [filterq]
     packsswb            m4, m4
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
     %define       k0k1k4k5  m8
     %define       k2k3k6k7  m9
     %define            krd  m10
@@ -327,19 +327,19 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
 %endm
 
 INIT_XMM ssse3
-SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER16 h8_avg
-SUBPIX_HFILTER8  h8
-SUBPIX_HFILTER8  h8_avg
-SUBPIX_HFILTER4  h8
-SUBPIX_HFILTER4  h8_avg
+SUBPIX_HFILTER16 h8      ; vpx_filter_block1d16_h8_ssse3
+SUBPIX_HFILTER16 h8_avg  ; vpx_filter_block1d16_h8_avg_ssse3
+SUBPIX_HFILTER8  h8      ; vpx_filter_block1d8_h8_ssse3
+SUBPIX_HFILTER8  h8_avg  ; vpx_filter_block1d8_h8_avg_ssse3
+SUBPIX_HFILTER4  h8      ; vpx_filter_block1d4_h8_ssse3
+SUBPIX_HFILTER4  h8_avg  ; vpx_filter_block1d4_h8_avg_ssse3
 
 ;-------------------------------------------------------------------------------
 
 ; TODO(Linfeng): Detect cpu type and choose the code with better performance.
 %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
 
-%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+%if VPX_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
     %define NUM_GENERAL_REG_USED 9
 %else
     %define NUM_GENERAL_REG_USED 6
@@ -359,9 +359,9 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
 
     dec                 heightd
 
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
 
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
     %define               src1q  r7
     %define           sstride6q  r8
     %define          dst_stride  dstrideq
@@ -467,7 +467,7 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
     movx                 [dstq], m0
 
 %else
-    ; ARCH_X86_64
+    ; VPX_ARCH_X86_64
 
     movx                     m0, [srcq                ]     ;A
     movx                     m1, [srcq + sstrideq     ]     ;B
@@ -567,7 +567,7 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
 %endif
     movx                 [dstq], m0
 
-%endif ; ARCH_X86_64
+%endif ; VPX_ARCH_X86_64
 
 .done:
     REP_RET
@@ -581,9 +581,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
     mova                     m4, [filterq]
     SETUP_LOCAL_VARS
 
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
 
-%if ARCH_X86_64
+%if VPX_ARCH_X86_64
     %define               src1q  r7
     %define           sstride6q  r8
     %define          dst_stride  dstrideq
@@ -654,7 +654,7 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
     REP_RET
 
 %else
-    ; ARCH_X86_64
+    ; VPX_ARCH_X86_64
     dec                 heightd
 
     movu                     m1, [srcq                ]     ;A
@@ -790,14 +790,14 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
 .done:
     REP_RET
 
-%endif ; ARCH_X86_64
+%endif ; VPX_ARCH_X86_64
 
 %endm
 
 INIT_XMM ssse3
-SUBPIX_VFILTER16     v8
-SUBPIX_VFILTER16 v8_avg
-SUBPIX_VFILTER       v8, 8
-SUBPIX_VFILTER   v8_avg, 8
-SUBPIX_VFILTER       v8, 4
-SUBPIX_VFILTER   v8_avg, 4
+SUBPIX_VFILTER16     v8     ; vpx_filter_block1d16_v8_ssse3
+SUBPIX_VFILTER16 v8_avg     ; vpx_filter_block1d16_v8_avg_ssse3
+SUBPIX_VFILTER       v8, 8  ; vpx_filter_block1d8_v8_ssse3
+SUBPIX_VFILTER   v8_avg, 8  ; vpx_filter_block1d8_v8_avg_ssse3
+SUBPIX_VFILTER       v8, 4  ; vpx_filter_block1d4_v8_ssse3
+SUBPIX_VFILTER   v8_avg, 4  ; vpx_filter_block1d4_v8_avg_ssse3
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
index a378dd0402..65790b1c21 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm
@@ -131,7 +131,9 @@
     dec         rcx
 %endm
 
-global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+SECTION .text
+
+globalsym(vpx_filter_block1d4_v2_sse2)
 sym(vpx_filter_block1d4_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -155,7 +157,7 @@ sym(vpx_filter_block1d4_v2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v2_sse2)
 sym(vpx_filter_block1d8_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -181,7 +183,7 @@ sym(vpx_filter_block1d8_v2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v2_sse2)
 sym(vpx_filter_block1d16_v2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -209,7 +211,7 @@ sym(vpx_filter_block1d16_v2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_v2_avg_sse2)
 sym(vpx_filter_block1d4_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -233,7 +235,7 @@ sym(vpx_filter_block1d4_v2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_v2_avg_sse2)
 sym(vpx_filter_block1d8_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -259,7 +261,7 @@ sym(vpx_filter_block1d8_v2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_v2_avg_sse2)
 sym(vpx_filter_block1d16_v2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -287,7 +289,7 @@ sym(vpx_filter_block1d16_v2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h2_sse2)
 sym(vpx_filter_block1d4_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -312,7 +314,7 @@ sym(vpx_filter_block1d4_h2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h2_sse2)
 sym(vpx_filter_block1d8_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -339,7 +341,7 @@ sym(vpx_filter_block1d8_h2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h2_sse2)
 sym(vpx_filter_block1d16_h2_sse2):
     push        rbp
     mov         rbp, rsp
@@ -367,7 +369,7 @@ sym(vpx_filter_block1d16_h2_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d4_h2_avg_sse2)
 sym(vpx_filter_block1d4_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -392,7 +394,7 @@ sym(vpx_filter_block1d4_h2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d8_h2_avg_sse2)
 sym(vpx_filter_block1d8_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
@@ -419,7 +421,7 @@ sym(vpx_filter_block1d8_h2_avg_sse2):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+globalsym(vpx_filter_block1d16_h2_avg_sse2)
 sym(vpx_filter_block1d16_h2_avg_sse2):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
index 538b2129db..32e3cd3d9f 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm
@@ -105,7 +105,9 @@
     dec         rcx
 %endm
 
-global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+SECTION .text
+
+globalsym(vpx_filter_block1d4_v2_ssse3)
 sym(vpx_filter_block1d4_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -129,7 +131,7 @@ sym(vpx_filter_block1d4_v2_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_v2_ssse3)
 sym(vpx_filter_block1d8_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -155,7 +157,7 @@ sym(vpx_filter_block1d8_v2_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_v2_ssse3)
 sym(vpx_filter_block1d16_v2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -182,7 +184,7 @@ sym(vpx_filter_block1d16_v2_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_v2_avg_ssse3)
 sym(vpx_filter_block1d4_v2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -206,7 +208,7 @@ sym(vpx_filter_block1d4_v2_avg_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_v2_avg_ssse3)
 sym(vpx_filter_block1d8_v2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -232,7 +234,7 @@ sym(vpx_filter_block1d8_v2_avg_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_v2_avg_ssse3)
 sym(vpx_filter_block1d16_v2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -259,7 +261,7 @@ sym(vpx_filter_block1d16_v2_avg_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_h2_ssse3)
 sym(vpx_filter_block1d4_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -284,7 +286,7 @@ sym(vpx_filter_block1d4_h2_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_h2_ssse3)
 sym(vpx_filter_block1d8_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -311,7 +313,7 @@ sym(vpx_filter_block1d8_h2_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_h2_ssse3)
 sym(vpx_filter_block1d16_h2_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -338,7 +340,7 @@ sym(vpx_filter_block1d16_h2_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d4_h2_avg_ssse3)
 sym(vpx_filter_block1d4_h2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -363,7 +365,7 @@ sym(vpx_filter_block1d4_h2_avg_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d8_h2_avg_ssse3)
 sym(vpx_filter_block1d8_h2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
@@ -390,7 +392,7 @@ sym(vpx_filter_block1d8_h2_avg_ssse3):
     pop         rbp
     ret
 
-global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+globalsym(vpx_filter_block1d16_h2_avg_ssse3)
 sym(vpx_filter_block1d16_h2_avg_ssse3):
     push        rbp
     mov         rbp, rsp
diff --git a/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h
index 2c259d322e..fb75d0c808 100644
--- a/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h
+++ b/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
-#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
+#ifndef VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
+#define VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
 #include "./vpx_config.h"
 
 #define ADDRESS_STORAGE_SIZE sizeof(size_t)
@@ -26,6 +26,6 @@
 
 /*returns an addr aligned to the byte boundary specified by align*/
 #define align_addr(addr, align) \
-  (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1))
+  (void *)(((size_t)(addr) + ((align) - 1)) & ~(size_t)((align) - 1))
 
-#endif  // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
+#endif  // VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.c b/media/libvpx/libvpx/vpx_mem/vpx_mem.c
index a9be086806..18abf1158b 100644
--- a/media/libvpx/libvpx/vpx_mem/vpx_mem.c
+++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.c
@@ -16,12 +16,14 @@
 #include "include/vpx_mem_intrnl.h"
 #include "vpx/vpx_integer.h"
 
+#if !defined(VPX_MAX_ALLOCABLE_MEMORY)
 #if SIZE_MAX > (1ULL << 40)
 #define VPX_MAX_ALLOCABLE_MEMORY (1ULL << 40)
 #else
 // For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
 #define VPX_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
 #endif
+#endif
 
 // Returns 0 in case of overflow of nmemb * size.
 static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
@@ -82,12 +84,3 @@ void vpx_free(void *memblk) {
     free(addr);
   }
 }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void *vpx_memset16(void *dest, int val, size_t length) {
-  size_t i;
-  uint16_t *dest16 = (uint16_t *)dest;
-  for (i = 0; i < length; i++) *dest16++ = val;
-  return dest;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.h b/media/libvpx/libvpx/vpx_mem/vpx_mem.h
index 733aff4885..7689a05e6e 100644
--- a/media/libvpx/libvpx/vpx_mem/vpx_mem.h
+++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_MEM_VPX_MEM_H_
-#define VPX_MEM_VPX_MEM_H_
+#ifndef VPX_VPX_MEM_VPX_MEM_H_
+#define VPX_VPX_MEM_VPX_MEM_H_
 
 #include "vpx_config.h"
 #if defined(__uClinux__)
@@ -19,6 +19,8 @@
 #include <stdlib.h>
 #include <stddef.h>
 
+#include "vpx/vpx_integer.h"
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -29,7 +31,12 @@ void *vpx_calloc(size_t num, size_t size);
 void vpx_free(void *memblk);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void *vpx_memset16(void *dest, int val, size_t length);
+static INLINE void *vpx_memset16(void *dest, int val, size_t length) {
+  size_t i;
+  uint16_t *dest16 = (uint16_t *)dest;
+  for (i = 0; i < length; i++) *dest16++ = val;
+  return dest;
+}
 #endif
 
 #include <string.h>
@@ -42,4 +49,4 @@ void *vpx_memset16(void *dest, int val, size_t length);
 }
 #endif
 
-#endif  // VPX_MEM_VPX_MEM_H_
+#endif  // VPX_VPX_MEM_VPX_MEM_H_
diff --git a/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c
new file mode 100644
index 0000000000..20f688e179
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+// Feature detection code for Armv7-A / AArch32.
+
+#include "./vpx_config.h"
+#include "arm_cpudetect.h"
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  // This function should actually be a no-op. There is no way to adjust any of
+  // these because the RTCD tables do not exist: the functions are called
+  // statically.
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(_MSC_VER)  // end !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON || HAVE_NEON_ASM
+  // MSVC has no inline __asm support for Arm, but it does let you __emit
+  // instructions via their assembled hex code.
+  // All of these instructions should be essentially nops.
+  __try {
+    // VORR q0,q0,q0
+    __emit(0xF2200150);
+    flags |= HAS_NEON;
+  } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+    // Ignore exception.
+  }
+#endif  // HAVE_NEON || HAVE_NEON_ASM
+  return flags;
+}
+
+#elif defined(VPX_USE_ANDROID_CPU_FEATURES)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON || HAVE_NEON_ASM
+  uint64_t features = android_getCpuFeatures();
+  if (features & ANDROID_CPU_ARM_FEATURE_NEON) {
+    flags |= HAS_NEON;
+  }
+#endif  // HAVE_NEON || HAVE_NEON_ASM
+  return flags;
+}
+
+#elif defined(__linux__)  // end defined(VPX_USE_ANDROID_CPU_FEATURES)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define VPX_AARCH32_HWCAP_NEON (1 << 12)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+  unsigned long hwcap = getauxval(AT_HWCAP);
+#if HAVE_NEON || HAVE_NEON_ASM
+  if (hwcap & VPX_AARCH32_HWCAP_NEON) {
+    flags |= HAS_NEON;
+  }
+#endif  // HAVE_NEON || HAVE_NEON_ASM
+  return flags;
+}
+#else   // end __linux__
+#error \
+    "Runtime CPU detection selected, but no CPU detection method available" \
+"for your platform. Rerun configure with --disable-runtime-cpu-detect."
+#endif
+
+int arm_cpu_caps(void) {
+  int flags = 0;
+  if (arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  return arm_get_cpu_caps() & arm_cpu_env_mask();
+}
diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
new file mode 100644
index 0000000000..df8e1e244d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c
@@ -0,0 +1,241 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_ports/arm.h"
+#include "vpx_ports/arm_cpudetect.h"
+
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+static int arm_get_cpu_caps(void) {
+  // This function should actually be a no-op. There is no way to adjust any of
+  // these because the RTCD tables do not exist: the functions are called
+  // statically.
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__APPLE__)  // end !CONFIG_RUNTIME_CPU_DETECT
+
+// sysctlbyname() parameter documentation for instruction set characteristics:
+// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics
+static INLINE int64_t have_feature(const char *feature) {
+  int64_t feature_present = 0;
+  size_t size = sizeof(feature_present);
+  if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) {
+    return 0;
+  }
+  return feature_present;
+}
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif  // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+  if (have_feature("hw.optional.arm.FEAT_DotProd")) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (have_feature("hw.optional.arm.FEAT_I8MM")) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+  return flags;
+}
+
+#elif defined(_WIN32)  // end __APPLE__
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+// IsProcessorFeaturePresent() parameter documentation:
+// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK
+// 20348, supported by Windows 11 and Windows Server 2022.
+#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+  if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+// Support for PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE was added in Windows SDK
+// 26100.
+#if defined(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE)
+  // There's no PF_* flag that indicates whether plain I8MM is available
+  // or not. But if SVE_I8MM is available, that also implies that
+  // regular I8MM is available.
+  if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // defined(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE)
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+// Support for PF_ARM_SVE_INSTRUCTIONS_AVAILABLE was added in Windows SDK 26100.
+#if defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
+  if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_SVE;
+  }
+#endif  // defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
+#endif  // HAVE_SVE
+#if HAVE_SVE2
+// Support for PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE was added in Windows SDK
+// 26100.
+#if defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
+  if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
+    flags |= HAS_SVE2;
+  }
+#endif  // defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
+#endif  // HAVE_SVE2
+  return flags;
+}
+
+#elif defined(VPX_USE_ANDROID_CPU_FEATURES)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  return flags;
+}
+
+#elif defined(__linux__)  // end defined(VPX_USE_ANDROID_CPU_FEATURES)
+
+#include <sys/auxv.h>
+
+// Define hwcap values ourselves: building with an old auxv header where these
+// hwcap values are not defined should not prevent features from being enabled.
+#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20)
+#define VPX_AARCH64_HWCAP_SVE (1 << 22)
+#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1)
+#define VPX_AARCH64_HWCAP2_I8MM (1 << 13)
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON_DOTPROD || HAVE_SVE
+  unsigned long hwcap = getauxval(AT_HWCAP);
+#endif  // HAVE_NEON_DOTPROD || HAVE_SVE
+#if HAVE_NEON_I8MM || HAVE_SVE2
+  unsigned long hwcap2 = getauxval(AT_HWCAP2);
+#endif  // HAVE_NEON_I8MM || HAVE_SVE2
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+#if HAVE_NEON_DOTPROD
+  if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (hwcap & VPX_AARCH64_HWCAP_SVE) {
+    flags |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
+#if HAVE_SVE2
+  if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) {
+    flags |= HAS_SVE2;
+  }
+#endif  // HAVE_SVE2
+  return flags;
+}
+
+#elif defined(__Fuchsia__)  // end __linux__
+
+#include <zircon/features.h>
+#include <zircon/syscalls.h>
+
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
+#ifndef ZX_ARM64_FEATURE_ISA_I8MM
+#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
+#endif
+// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
+#ifndef ZX_ARM64_FEATURE_ISA_SVE
+#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
+#endif
+
+static int arm_get_cpu_caps(void) {
+  int flags = 0;
+#if HAVE_NEON
+  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
+#endif  // HAVE_NEON
+  uint32_t features;
+  zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
+  if (status != ZX_OK) {
+    return flags;
+  }
+#if HAVE_NEON_DOTPROD
+  if (features & ZX_ARM64_FEATURE_ISA_DP) {
+    flags |= HAS_NEON_DOTPROD;
+  }
+#endif  // HAVE_NEON_DOTPROD
+#if HAVE_NEON_I8MM
+  if (features & ZX_ARM64_FEATURE_ISA_I8MM) {
+    flags |= HAS_NEON_I8MM;
+  }
+#endif  // HAVE_NEON_I8MM
+#if HAVE_SVE
+  if (features & ZX_ARM64_FEATURE_ISA_SVE) {
+    flags |= HAS_SVE;
+  }
+#endif  // HAVE_SVE
+  return flags;
+}
+
+#else  // end __Fuchsia__
+#error \
+    "Runtime CPU detection selected, but no CPU detection method available" \
+"for your platform. Rerun configure with --disable-runtime-cpu-detect."
+#endif
+
+int arm_cpu_caps(void) {
+  int flags = 0;
+  if (!arm_cpu_env_flags(&flags)) {
+    flags = arm_get_cpu_caps() & arm_cpu_env_mask();
+  }
+
+  // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
+  if (!(flags & HAS_NEON_DOTPROD)) {
+    flags &= ~HAS_NEON_I8MM;
+  }
+
+  // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available.
+  if (!(flags & HAS_NEON_DOTPROD)) {
+    flags &= ~HAS_SVE;
+  }
+  if (!(flags & HAS_NEON_I8MM)) {
+    flags &= ~HAS_SVE;
+  }
+
+  // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available.
+  if (!(flags & HAS_SVE)) {
+    flags &= ~HAS_SVE2;
+  }
+
+  return flags;
+}
diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h
index 7be6104a4f..814c3cc408 100644
--- a/media/libvpx/libvpx/vpx_ports/arm.h
+++ b/media/libvpx/libvpx/vpx_ports/arm.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_ARM_H_
-#define VPX_PORTS_ARM_H_
+#ifndef VPX_VPX_PORTS_ARM_H_
+#define VPX_VPX_PORTS_ARM_H_
 #include <stdlib.h>
 #include "vpx_config.h"
 
@@ -17,12 +17,16 @@
 extern "C" {
 #endif
 
-/*ARMv5TE "Enhanced DSP" instructions.*/
-#define HAS_EDSP 0x01
-/*ARMv6 "Parallel" or "Media" instructions.*/
-#define HAS_MEDIA 0x02
-/*ARMv7 optional NEON instructions.*/
-#define HAS_NEON 0x04
+// Armv7-A optional Neon instructions, mandatory from Armv8.0-A.
+#define HAS_NEON (1 << 0)
+// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 1)
+// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
+#define HAS_NEON_I8MM (1 << 2)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 3)
+// Armv9.0-A SVE2 instructions.
+#define HAS_SVE2 (1 << 4)
 
 int arm_cpu_caps(void);
 
@@ -36,4 +40,4 @@ int arm_cpu_caps(void);
 }  // extern "C"
 #endif
 
-#endif  // VPX_PORTS_ARM_H_
+#endif  // VPX_VPX_PORTS_ARM_H_
diff --git a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.c b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.c
deleted file mode 100644
index 4f9d480ade..0000000000
--- a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/arm.h"
-
-#ifdef WINAPI_FAMILY
-#include <winapifamily.h>
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define getenv(x) NULL
-#endif
-#endif
-
-static int arm_cpu_env_flags(int *flags) {
-  char *env;
-  env = getenv("VPX_SIMD_CAPS");
-  if (env && *env) {
-    *flags = (int)strtol(env, NULL, 0);
-    return 0;
-  }
-  *flags = 0;
-  return -1;
-}
-
-static int arm_cpu_env_mask(void) {
-  char *env;
-  env = getenv("VPX_SIMD_CAPS_MASK");
-  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
-}
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-int arm_cpu_caps(void) {
-  /* This function should actually be a no-op. There is no way to adjust any of
-   * these because the RTCD tables do not exist: the functions are called
-   * statically */
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-#if HAVE_NEON || HAVE_NEON_ASM
-  flags |= HAS_NEON;
-#endif /* HAVE_NEON  || HAVE_NEON_ASM */
-  return flags & mask;
-}
-
-#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
-/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif
-#ifndef WIN32_EXTRA_LEAN
-#define WIN32_EXTRA_LEAN
-#endif
-#include <windows.h>
-
-int arm_cpu_caps(void) {
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-/* MSVC has no inline __asm support for ARM, but it does let you __emit
- *  instructions via their assembled hex code.
- * All of these instructions should be essentially nops.
- */
-#if HAVE_NEON || HAVE_NEON_ASM
-  if (mask & HAS_NEON) {
-    __try {
-      /*VORR q0,q0,q0*/
-      __emit(0xF2200150);
-      flags |= HAS_NEON;
-    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
-      /*Ignore exception.*/
-    }
-  }
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
-  return flags & mask;
-}
-
-#elif defined(__ANDROID__) /* end _MSC_VER */
-#include <cpu-features.h>
-
-int arm_cpu_caps(void) {
-  int flags;
-  int mask;
-  uint64_t features;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-  features = android_getCpuFeatures();
-
-#if HAVE_NEON || HAVE_NEON_ASM
-  if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
-  return flags & mask;
-}
-
-#elif defined(__linux__) /* end __ANDROID__ */
-
-#include <stdio.h>
-
-int arm_cpu_caps(void) {
-  FILE *fin;
-  int flags;
-  int mask;
-  if (!arm_cpu_env_flags(&flags)) {
-    return flags;
-  }
-  mask = arm_cpu_env_mask();
-  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
-   *  on Android.
-   * This also means that detection will fail in Scratchbox.
-   */
-  fin = fopen("/proc/cpuinfo", "r");
-  if (fin != NULL) {
-    /* 512 should be enough for anybody (it's even enough for all the flags
-     * that x86 has accumulated... so far).
-     */
-    char buf[512];
-    while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_NEON || HAVE_NEON_ASM
-      if (memcmp(buf, "Features", 8) == 0) {
-        char *p;
-        p = strstr(buf, " neon");
-        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
-          flags |= HAS_NEON;
-        }
-      }
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
-    }
-    fclose(fin);
-  }
-  return flags & mask;
-}
-#else  /* end __linux__ */
-#error \
-    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
-"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
-#endif
diff --git a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h
new file mode 100644
index 0000000000..9b64a1fa2d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vpx_ports/arm.h"
+
+#if defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#undef WIN32_EXTRA_LEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
+#define VPX_USE_ANDROID_CPU_FEATURES 1
+// Use getauxval() when targeting (64-bit) Android with API level >= 18.
+// getauxval() is supported since Android API level 18 (Android 4.3.)
+// First Android version with 64-bit support was Android 5.x (API level 21).
+#include <cpu-features.h>
+#endif
+
+static INLINE int arm_cpu_env_flags(int *flags) {
+  const char *env = getenv("VPX_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 1;
+  }
+  return 0;
+}
+
+static INLINE int arm_cpu_env_mask(void) {
+  const char *env = getenv("VPX_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
diff --git a/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h b/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h
new file mode 100644
index 0000000000..400a51cc32
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h
@@ -0,0 +1,81 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_ASMDEFS_MMI_H_
+#define VPX_VPX_PORTS_ASMDEFS_MMI_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_MMI
+
+#if HAVE_MIPS64
+#define mips_reg int64_t
+#define MMI_ADDU(reg1, reg2, reg3) \
+  "daddu       " #reg1 ",       " #reg2 ",       " #reg3 "         \n\t"
+
+#define MMI_ADDIU(reg1, reg2, immediate) \
+  "daddiu      " #reg1 ",       " #reg2 ",       " #immediate "    \n\t"
+
+#define MMI_ADDI(reg1, reg2, immediate) \
+  "daddi       " #reg1 ",       " #reg2 ",       " #immediate "    \n\t"
+
+#define MMI_SUBU(reg1, reg2, reg3) \
+  "dsubu       " #reg1 ",       " #reg2 ",       " #reg3 "         \n\t"
+
+#define MMI_L(reg, addr, bias) \
+  "ld          " #reg ",        " #bias "(" #addr ")               \n\t"
+
+#define MMI_SRL(reg1, reg2, shift) \
+  "ssrld       " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
+
+#define MMI_SLL(reg1, reg2, shift) \
+  "dsll        " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
+
+#define MMI_MTC1(reg, fp) \
+  "dmtc1       " #reg ",        " #fp "                            \n\t"
+
+#define MMI_LI(reg, immediate) \
+  "dli         " #reg ",        " #immediate "                     \n\t"
+
+#else
+#define mips_reg int32_t
+#define MMI_ADDU(reg1, reg2, reg3) \
+  "addu        " #reg1 ",       " #reg2 ",       " #reg3 "         \n\t"
+
+#define MMI_ADDIU(reg1, reg2, immediate) \
+  "addiu       " #reg1 ",       " #reg2 ",       " #immediate "    \n\t"
+
+#define MMI_ADDI(reg1, reg2, immediate) \
+  "addi        " #reg1 ",       " #reg2 ",       " #immediate "    \n\t"
+
+#define MMI_SUBU(reg1, reg2, reg3) \
+  "subu        " #reg1 ",       " #reg2 ",       " #reg3 "         \n\t"
+
+#define MMI_L(reg, addr, bias) \
+  "lw          " #reg ",        " #bias "(" #addr ")               \n\t"
+
+#define MMI_SRL(reg1, reg2, shift) \
+  "ssrlw       " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
+
+#define MMI_SLL(reg1, reg2, shift) \
+  "sll         " #reg1 ",       " #reg2 ",       " #shift "        \n\t"
+
+#define MMI_MTC1(reg, fp) \
+  "mtc1        " #reg ",        " #fp "                            \n\t"
+
+#define MMI_LI(reg, immediate) \
+  "li          " #reg ",        " #immediate "                     \n\t"
+
+#endif /* HAVE_MIPS64 */
+
+#endif /* HAVE_MMI */
+
+#endif  // VPX_VPX_PORTS_ASMDEFS_MMI_H_
diff --git a/media/libvpx/libvpx/vpx_ports/bitops.h b/media/libvpx/libvpx/vpx_ports/bitops.h
index 0ed7189ff6..e92c972f50 100644
--- a/media/libvpx/libvpx/vpx_ports/bitops.h
+++ b/media/libvpx/libvpx/vpx_ports/bitops.h
@@ -8,13 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_BITOPS_H_
-#define VPX_PORTS_BITOPS_H_
+#ifndef VPX_VPX_PORTS_BITOPS_H_
+#define VPX_VPX_PORTS_BITOPS_H_
 
 #include <assert.h>
 
-#include "vpx_ports/msvc.h"
-
 #ifdef _MSC_VER
 #if defined(_M_X64) || defined(_M_IX86)
 #include <intrin.h>
@@ -26,20 +24,32 @@
 extern "C" {
 #endif
 
-// These versions of get_msb() are only valid when n != 0 because all
-// of the optimized versions are undefined when n == 0:
+// These versions of get_lsb() and get_msb() are only valid when n != 0
+// because all of the optimized versions are undefined when n == 0:
 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_lsb(unsigned int n) {
+  assert(n != 0);
+  return __builtin_ctz(n);
+}
+
 static INLINE int get_msb(unsigned int n) {
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanForward)
 #pragma intrinsic(_BitScanReverse)
 
+static INLINE int get_lsb(unsigned int n) {
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
   assert(n != 0);
@@ -48,6 +58,13 @@ static INLINE int get_msb(unsigned int n) {
 }
 #undef USE_MSC_INTRINSICS
 #else
+static INLINE int get_lsb(unsigned int n) {
+  int i;
+  assert(n != 0);
+  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
+  return i;
+}
+
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
   int log = 0;
@@ -72,4 +89,4 @@ static INLINE int get_msb(unsigned int n) {
 }  // extern "C"
 #endif
 
-#endif  // VPX_PORTS_BITOPS_H_
+#endif  // VPX_VPX_PORTS_BITOPS_H_
diff --git a/media/libvpx/libvpx/vpx_ports/compiler_attributes.h b/media/libvpx/libvpx/vpx_ports/compiler_attributes.h
new file mode 100644
index 0000000000..4b468749b8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/compiler_attributes.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_
+#define VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif  // !defined(__has_feature)
+
+#if !defined(__has_attribute)
+#define __has_attribute(x) 0
+#endif  // !defined(__has_attribute)
+
+//------------------------------------------------------------------------------
+// Sanitizer attributes.
+
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define VPX_WITH_ASAN 1
+#else
+#define VPX_WITH_ASAN 0
+#endif  // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+
+#if defined(__clang__) && __has_attribute(no_sanitize)
+// Both of these have defined behavior and are used in certain operations or
+// optimizations thereof. There are cases where an overflow may be unintended,
+// however, so use of these attributes should be done with care.
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#if __clang_major__ >= 12
+#define VPX_NO_UNSIGNED_SHIFT_CHECK \
+  __attribute__((no_sanitize("unsigned-shift-base")))
+#endif  // __clang__ >= 12
+#endif  // __clang__
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK
+#define VPX_NO_UNSIGNED_SHIFT_CHECK
+#endif
+
+//------------------------------------------------------------------------------
+// Variable attributes.
+
+#if __has_attribute(uninitialized)
+// Attribute "uninitialized" disables -ftrivial-auto-var-init=pattern for
+// the specified variable.
+//
+// -ftrivial-auto-var-init is security risk mitigation feature, so attribute
+// should not be used "just in case", but only to fix real performance
+// bottlenecks when other approaches do not work. In general the compiler is
+// quite effective at eliminating unneeded initializations introduced by the
+// flag, e.g. when they are followed by actual initialization by a program.
+// However if compiler optimization fails and code refactoring is hard, the
+// attribute can be used as a workaround.
+#define VPX_UNINITIALIZED __attribute__((uninitialized))
+#else
+#define VPX_UNINITIALIZED
+#endif  // __has_attribute(uninitialized)
+
+#endif  // VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_
diff --git a/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h b/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h
index 903534e0c0..d6cc68ee4d 100644
--- a/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h
+++ b/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_
-#define VPX_PORTS_EMMINTRIN_COMPAT_H_
+#ifndef VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_
+#define VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_
 
 #if defined(__GNUC__) && __GNUC__ < 4
 /* From emmintrin.h (gcc 4.5.3) */
@@ -52,4 +52,4 @@ extern __inline __m128d
 }
 #endif
 
-#endif  // VPX_PORTS_EMMINTRIN_COMPAT_H_
+#endif  // VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_
diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.asm b/media/libvpx/libvpx/vpx_ports/emms_mmx.asm
new file mode 100644
index 0000000000..b31b25ebde
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.asm
@@ -0,0 +1,18 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+globalsym(vpx_clear_system_state)
+sym(vpx_clear_system_state):
+    emms
+    ret
diff --git a/media/libvpx/libvpx/vpx_ports/config.h b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
similarity index 66%
rename from media/libvpx/libvpx/vpx_ports/config.h
rename to media/libvpx/libvpx/vpx_ports/emms_mmx.c
index 3c1ab99f4a..79b98a75f1 100644
--- a/media/libvpx/libvpx/vpx_ports/config.h
+++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -8,9 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_CONFIG_H_
-#define VPX_PORTS_CONFIG_H_
+#include <mmintrin.h>
 
-#include "vpx_config.h"
+#include "vpx_ports/system_state.h"
 
-#endif  // VPX_PORTS_CONFIG_H_
+void vpx_clear_system_state(void) { _mm_empty(); }
diff --git a/media/libvpx/libvpx/vpx_ports/emms.asm b/media/libvpx/libvpx/vpx_ports/float_control_word.asm
similarity index 81%
rename from media/libvpx/libvpx/vpx_ports/emms.asm
rename to media/libvpx/libvpx/vpx_ports/float_control_word.asm
index db8da28737..bb75b7a31f 100644
--- a/media/libvpx/libvpx/vpx_ports/emms.asm
+++ b/media/libvpx/libvpx/vpx_ports/float_control_word.asm
@@ -12,14 +12,9 @@
 %include "vpx_ports/x86_abi_support.asm"
 
 section .text
-global sym(vpx_reset_mmx_state) PRIVATE
-sym(vpx_reset_mmx_state):
-    emms
-    ret
-
 
 %if LIBVPX_YASM_WIN64
-global sym(vpx_winx64_fldcw) PRIVATE
+globalsym(vpx_winx64_fldcw)
 sym(vpx_winx64_fldcw):
     sub   rsp, 8
     mov   [rsp], rcx ; win x64 specific
@@ -28,7 +23,7 @@ sym(vpx_winx64_fldcw):
     ret
 
 
-global sym(vpx_winx64_fstcw) PRIVATE
+globalsym(vpx_winx64_fstcw)
 sym(vpx_winx64_fstcw):
     sub   rsp, 8
     fstcw [rsp]
diff --git a/media/libvpx/libvpx/vpx_ports/loongarch.h b/media/libvpx/libvpx/vpx_ports/loongarch.h
new file mode 100644
index 0000000000..d93ff9f5f0
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/loongarch.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Jin Bo  <jinbo@loongson.cn>
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_LOONGARCH_H_
+#define VPX_VPX_PORTS_LOONGARCH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_LSX 0x01
+#define HAS_LASX 0x02
+
+int loongarch_cpu_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_LOONGARCH_H_
diff --git a/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c b/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c
new file mode 100644
index 0000000000..7b4322d35e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Jin Bo  <jinbo@loongson.cn>
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_ports/loongarch.h"
+
+#define LOONGARCH_CFG2 0x02
+#define LOONGARCH_CFG2_LSX (1 << 6)
+#define LOONGARCH_CFG2_LASX (1 << 7)
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#if defined(__loongarch__) && defined(__linux__)
+int loongarch_cpu_caps(void) {
+  int reg = 0;
+  int flag = 0;
+
+  __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2));
+  if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX;
+
+  if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX;
+
+  return flag;
+}
+#else /* end __loongarch__ && __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
+#else /* end CONFIG_RUNTIME_CPU_DETECT */
+int loongarch_cpu_caps(void) { return 0; }
+#endif
diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h
index 2d49b7a06d..5165b676ac 100644
--- a/media/libvpx/libvpx/vpx_ports/mem.h
+++ b/media/libvpx/libvpx/vpx_ports/mem.h
@@ -8,13 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_MEM_H_
-#define VPX_PORTS_MEM_H_
+#ifndef VPX_VPX_PORTS_MEM_H_
+#define VPX_VPX_PORTS_MEM_H_
 
 #include "vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
+#if defined(__GNUC__) || defined(__SUNPRO_C)
 #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
 #elif defined(_MSC_VER)
 #define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
@@ -23,40 +23,28 @@
 #define DECLARE_ALIGNED(n, typ, val) typ val
 #endif
 
-/* Indicates that the usage of the specified variable has been audited to assure
- * that it's safe to use uninitialized. Silences 'may be used uninitialized'
- * warnings on gcc.
- */
-#if defined(__GNUC__) && __GNUC__
-#define UNINITIALIZED_IS_SAFE(x) x = x
+#if defined(__has_builtin)
+#define VPX_HAS_BUILTIN(x) __has_builtin(x)
 #else
-#define UNINITIALIZED_IS_SAFE(x) x
+#define VPX_HAS_BUILTIN(x) 0
 #endif
 
-#if HAVE_NEON && defined(_MSC_VER)
+#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__)
 #define __builtin_prefetch(x)
 #endif
 
 /* Shift down with rounding */
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
-#define ROUND64_POWER_OF_TWO(value, n) (((value) + (1ULL << ((n)-1))) >> (n))
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+#define ROUND64_POWER_OF_TWO(value, n) (((value) + (1ULL << ((n) - 1))) >> (n))
 
 #define ALIGN_POWER_OF_TWO(value, n) \
   (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
+#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x)))
 #if CONFIG_VP9_HIGHBITDEPTH
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
+#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x)))
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if !defined(__has_feature)
-#define __has_feature(x) 0
-#endif  // !defined(__has_feature)
-
-#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
-#define VPX_WITH_ASAN 1
-#else
-#define VPX_WITH_ASAN 0
-#endif  // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
-
-#endif  // VPX_PORTS_MEM_H_
+#endif  // VPX_VPX_PORTS_MEM_H_
diff --git a/media/libvpx/libvpx/vpx_ports/mem_ops.h b/media/libvpx/libvpx/vpx_ports/mem_ops.h
index 343f27577c..b17015e7ec 100644
--- a/media/libvpx/libvpx/vpx_ports/mem_ops.h
+++ b/media/libvpx/libvpx/vpx_ports/mem_ops.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_MEM_OPS_H_
-#define VPX_PORTS_MEM_OPS_H_
+#ifndef VPX_VPX_PORTS_MEM_OPS_H_
+#define VPX_VPX_PORTS_MEM_OPS_H_
 
 /* \file
  * \brief Provides portable memory access primitives
@@ -224,5 +224,4 @@ static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
   mem[3] = (MAU_T)((val >> 24) & 0xff);
 }
 /* clang-format on */
-
-#endif  // VPX_PORTS_MEM_OPS_H_
+#endif  // VPX_VPX_PORTS_MEM_OPS_H_
diff --git a/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h b/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h
index ccac391ba0..8649b87623 100644
--- a/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h
+++ b/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_
-#define VPX_PORTS_MEM_OPS_ALIGNED_H_
+#ifndef VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_
+#define VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_
 
 #include "vpx/vpx_integer.h"
 
@@ -168,4 +168,4 @@ mem_put_le_aligned_generic(32)
 #undef swap_endian_32_se
 /* clang-format on */
 
-#endif  // VPX_PORTS_MEM_OPS_ALIGNED_H_
+#endif  // VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/media/libvpx/libvpx/vpx_ports/mips.h b/media/libvpx/libvpx/vpx_ports/mips.h
new file mode 100644
index 0000000000..439de754fd
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/mips.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_MIPS_H_
+#define VPX_VPX_PORTS_MIPS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_MMI 0x01
+#define HAS_MSA 0x02
+
+int mips_cpu_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_MIPS_H_
diff --git a/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c b/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c
new file mode 100644
index 0000000000..e0eca2d48d
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "vpx_ports/mips.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#if defined(__mips__) && defined(__linux__)
+int mips_cpu_caps(void) {
+  char cpuinfo_line[512];
+  int flag = 0x0;
+  FILE *f = fopen("/proc/cpuinfo", "r");
+  if (!f) {
+    // Assume nothing if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return 0;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+      // Workaround early kernel without mmi in ASEs line.
+      if (strstr(cpuinfo_line, "Loongson-3")) {
+        flag |= HAS_MMI;
+      } else if (strstr(cpuinfo_line, "Loongson-2K")) {
+        flag |= HAS_MMI | HAS_MSA;
+      }
+    }
+    if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      if (strstr(cpuinfo_line, "loongson-mmi") &&
+          strstr(cpuinfo_line, "loongson-ext")) {
+        flag |= HAS_MMI;
+      }
+      if (strstr(cpuinfo_line, "msa")) {
+        flag |= HAS_MSA;
+      }
+      // ASEs is the last line, so we can break here.
+      break;
+    }
+  }
+  fclose(f);
+  return flag;
+}
+#else /* end __mips__ && __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
+#else /* end CONFIG_RUNTIME_CPU_DETECT */
+int mips_cpu_caps(void) { return 0; }
+#endif
diff --git a/media/libvpx/libvpx/vpx_ports/msvc.h b/media/libvpx/libvpx/vpx_ports/msvc.h
deleted file mode 100644
index 3ff71474b3..0000000000
--- a/media/libvpx/libvpx/vpx_ports/msvc.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_PORTS_MSVC_H_
-#define VPX_PORTS_MSVC_H_
-#ifdef _MSC_VER
-
-#include "./vpx_config.h"
-
-#if _MSC_VER < 1900  // VS2015 provides snprintf
-#define snprintf _snprintf
-#endif  // _MSC_VER < 1900
-
-#if _MSC_VER < 1800  // VS2013 provides round
-#include <math.h>
-static INLINE double round(double x) {
-  if (x < 0)
-    return ceil(x - 0.5);
-  else
-    return floor(x + 0.5);
-}
-#endif  // _MSC_VER < 1800
-
-#endif  // _MSC_VER
-#endif  // VPX_PORTS_MSVC_H_
diff --git a/media/libvpx/libvpx/vpx_ports/ppc.h b/media/libvpx/libvpx/vpx_ports/ppc.h
new file mode 100644
index 0000000000..a11f4e8732
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/ppc.h
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_PPC_H_
+#define VPX_VPX_PORTS_PPC_H_
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_VSX 0x01
+
+int ppc_simd_caps(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_PPC_H_
diff --git a/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c b/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c
new file mode 100644
index 0000000000..374a0271c9
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c
@@ -0,0 +1,80 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <asm/cputable.h>
+#include <linux/auxvec.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/ppc.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+static int cpu_env_flags(int *flags) {
+  char *env;
+  env = getenv("VPX_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 0;
+  }
+  *flags = 0;
+  return -1;
+}
+
+static int cpu_env_mask(void) {
+  char *env;
+  env = getenv("VPX_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+int ppc_simd_caps(void) {
+  int flags;
+  int mask;
+  int fd;
+  ssize_t count;
+  unsigned int i;
+  uint64_t buf[64];
+
+  // If VPX_SIMD_CAPS is set then allow only those capabilities.
+  if (!cpu_env_flags(&flags)) {
+    return flags;
+  }
+
+  mask = cpu_env_mask();
+
+  fd = open("/proc/self/auxv", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  while ((count = read(fd, buf, sizeof(buf))) > 0) {
+    for (i = 0; i < (count / sizeof(*buf)); i += 2) {
+      if (buf[i] == AT_HWCAP) {
+#if HAVE_VSX
+        if (buf[i + 1] & PPC_FEATURE_HAS_VSX) {
+          flags |= HAS_VSX;
+        }
+#endif  // HAVE_VSX
+        goto out_close;
+      } else if (buf[i] == AT_NULL) {
+        goto out_close;
+      }
+    }
+  }
+out_close:
+  close(fd);
+  return flags & mask;
+}
+#else
+// If there is no RTCD the function pointers are not used and can not be
+// changed.
+int ppc_simd_caps(void) { return 0; }
+#endif  // CONFIG_RUNTIME_CPU_DETECT
diff --git a/media/libvpx/libvpx/vpx_ports/static_assert.h b/media/libvpx/libvpx/vpx_ports/static_assert.h
new file mode 100644
index 0000000000..f632d9f1e8
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_ports/static_assert.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_STATIC_ASSERT_H_
+#define VPX_VPX_PORTS_STATIC_ASSERT_H_
+
+#if defined(_MSC_VER)
+#define VPX_STATIC_ASSERT(boolexp)              \
+  do {                                          \
+    char vpx_static_assert[(boolexp) ? 1 : -1]; \
+    (void)vpx_static_assert;                    \
+  } while (0)
+#else  // !_MSC_VER
+#define VPX_STATIC_ASSERT(boolexp)                         \
+  do {                                                     \
+    struct {                                               \
+      unsigned int vpx_static_assert : (boolexp) ? 1 : -1; \
+    } vpx_static_assert;                                   \
+    (void)vpx_static_assert;                               \
+  } while (0)
+#endif  // _MSC_VER
+
+#endif  // VPX_VPX_PORTS_STATIC_ASSERT_H_
diff --git a/media/libvpx/libvpx/vpx_ports/system_state.h b/media/libvpx/libvpx/vpx_ports/system_state.h
index 086c64681f..32ebd0ed8c 100644
--- a/media/libvpx/libvpx/vpx_ports/system_state.h
+++ b/media/libvpx/libvpx/vpx_ports/system_state.h
@@ -8,15 +8,23 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_SYSTEM_STATE_H_
-#define VPX_PORTS_SYSTEM_STATE_H_
+#ifndef VPX_VPX_PORTS_SYSTEM_STATE_H_
+#define VPX_VPX_PORTS_SYSTEM_STATE_H_
 
 #include "./vpx_config.h"
 
-#if ARCH_X86 || ARCH_X86_64
-void vpx_reset_mmx_state(void);
-#define vpx_clear_system_state() vpx_reset_mmx_state()
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && HAVE_MMX
+extern void vpx_clear_system_state(void);
 #else
 #define vpx_clear_system_state()
-#endif  // ARCH_X86 || ARCH_X86_64
-#endif  // VPX_PORTS_SYSTEM_STATE_H_
+#endif  // (VPX_ARCH_X86 || VPX_ARCH_X86_64) && HAVE_MMX
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_SYSTEM_STATE_H_
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h
index 7d9fc3b406..d33eff4397 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_once.h
+++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_VPX_ONCE_H_
-#define VPX_PORTS_VPX_ONCE_H_
+#ifndef VPX_VPX_PORTS_VPX_ONCE_H_
+#define VPX_VPX_PORTS_VPX_ONCE_H_
 
 #include "vpx_config.h"
 
@@ -91,29 +91,6 @@ static void once(void (*func)(void)) {
   return;
 }
 
-#elif CONFIG_MULTITHREAD && defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>
-static void once(void (*func)(void)) {
-  static int done;
-
-  /* If the initialization is complete, return early. */
-  if (done) return;
-
-  /* Causes all other threads in the process to block themselves
-   * and give up their time slice.
-   */
-  DosEnterCritSec();
-
-  if (!done) {
-    func();
-    done = 1;
-  }
-
-  /* Restores normal thread dispatching for the current process. */
-  DosExitCritSec();
-}
-
 #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
 #include <pthread.h>
 static void once(void (*func)(void)) {
@@ -128,7 +105,7 @@ static void once(void (*func)(void)) {
  */
 
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   if (!done) {
     func();
@@ -137,4 +114,4 @@ static void once(void (*func)(void)) {
 }
 #endif
 
-#endif  // VPX_PORTS_VPX_ONCE_H_
+#endif  // VPX_VPX_PORTS_VPX_ONCE_H_
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk
index 36b14936df..b0ad210f60 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk
+++ b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk
@@ -12,16 +12,46 @@
 PORTS_SRCS-yes += vpx_ports.mk
 
 PORTS_SRCS-yes += bitops.h
+PORTS_SRCS-yes += compiler_attributes.h
 PORTS_SRCS-yes += mem.h
-PORTS_SRCS-yes += msvc.h
+PORTS_SRCS-yes += static_assert.h
 PORTS_SRCS-yes += system_state.h
 PORTS_SRCS-yes += vpx_timer.h
 
-ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
-PORTS_SRCS-yes += emms.asm
+ifeq ($(VPX_ARCH_X86),yes)
+PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c
+endif
+ifeq ($(VPX_ARCH_X86_64),yes)
+# Visual Studio x64 does not support the _mm_empty() intrinsic.
+PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm
+endif
+
+ifeq ($(VPX_ARCH_X86_64),yes)
+PORTS_SRCS-$(CONFIG_MSVS) += float_control_word.asm
+endif
+
+ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
 PORTS_SRCS-yes += x86.h
 PORTS_SRCS-yes += x86_abi_support.asm
 endif
 
-PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c
-PORTS_SRCS-$(ARCH_ARM) += arm.h
+ifeq ($(VPX_ARCH_AARCH64),yes)
+PORTS_SRCS-yes += aarch64_cpudetect.c
+else
+PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c
+endif
+PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h
+PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h
+
+PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h
+
+PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h
+
+PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h
+
+ifeq ($(VPX_ARCH_MIPS), yes)
+PORTS_SRCS-yes += asmdefs_mmi.h
+endif
diff --git a/media/libvpx/libvpx/vpx_ports/vpx_timer.h b/media/libvpx/libvpx/vpx_ports/vpx_timer.h
index c1f1b60275..55e7891e19 100644
--- a/media/libvpx/libvpx/vpx_ports/vpx_timer.h
+++ b/media/libvpx/libvpx/vpx_ports/vpx_timer.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_VPX_TIMER_H_
-#define VPX_PORTS_VPX_TIMER_H_
+#ifndef VPX_VPX_PORTS_VPX_TIMER_H_
+#define VPX_VPX_PORTS_VPX_TIMER_H_
 
 #include "./vpx_config.h"
 
@@ -31,17 +31,17 @@
 /*
  * POSIX specific includes
  */
-#include <sys/time.h>
+#include <time.h>
 
 /* timersub is not provided by msys at this time. */
-#ifndef timersub
-#define timersub(a, b, result)                       \
+#ifndef timersub_ns
+#define timersub_ns(a, b, result)                    \
   do {                                               \
     (result)->tv_sec = (a)->tv_sec - (b)->tv_sec;    \
-    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
-    if ((result)->tv_usec < 0) {                     \
+    (result)->tv_nsec = (a)->tv_nsec - (b)->tv_nsec; \
+    if ((result)->tv_nsec < 0) {                     \
       --(result)->tv_sec;                            \
-      (result)->tv_usec += 1000000;                  \
+      (result)->tv_nsec += 1000000000;               \
     }                                                \
   } while (0)
 #endif
@@ -51,23 +51,27 @@ struct vpx_usec_timer {
 #if defined(_WIN32)
   LARGE_INTEGER begin, end;
 #else
-  struct timeval begin, end;
+  struct timespec begin, end;
 #endif
 };
 
 static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) {
 #if defined(_WIN32)
   QueryPerformanceCounter(&t->begin);
+#elif defined(CLOCK_MONOTONIC_RAW)
+  clock_gettime(CLOCK_MONOTONIC_RAW, &t->begin);
 #else
-  gettimeofday(&t->begin, NULL);
+  clock_gettime(CLOCK_MONOTONIC, &t->begin);
 #endif
 }
 
 static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) {
 #if defined(_WIN32)
   QueryPerformanceCounter(&t->end);
+#elif defined(CLOCK_MONOTONIC_RAW)
+  clock_gettime(CLOCK_MONOTONIC_RAW, &t->end);
 #else
-  gettimeofday(&t->end, NULL);
+  clock_gettime(CLOCK_MONOTONIC, &t->end);
 #endif
 }
 
@@ -80,18 +84,18 @@ static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) {
   QueryPerformanceFrequency(&freq);
   return diff.QuadPart * 1000000 / freq.QuadPart;
 #else
-  struct timeval diff;
+  struct timespec diff;
 
-  timersub(&t->end, &t->begin, &diff);
-  return diff.tv_sec * 1000000 + diff.tv_usec;
+  timersub_ns(&t->end, &t->begin, &diff);
+  return (int64_t)diff.tv_sec * 1000000 + diff.tv_nsec / 1000;
 #endif
 }
 
 #else /* CONFIG_OS_SUPPORT = 0*/
 
 /* Empty timer functions if CONFIG_OS_SUPPORT = 0 */
-#ifndef timersub
-#define timersub(a, b, result)
+#ifndef timersub_ns
+#define timersub_ns(a, b, result)
 #endif
 
 struct vpx_usec_timer {
@@ -106,4 +110,4 @@ static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; }
 
 #endif /* CONFIG_OS_SUPPORT */
 
-#endif  // VPX_PORTS_VPX_TIMER_H_
+#endif  // VPX_VPX_PORTS_VPX_TIMER_H_
diff --git a/media/libvpx/libvpx/vpx_ports/x86.h b/media/libvpx/libvpx/vpx_ports/x86.h
index 5aabb9e3af..4e450f8b5f 100644
--- a/media/libvpx/libvpx/vpx_ports/x86.h
+++ b/media/libvpx/libvpx/vpx_ports/x86.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_X86_H_
-#define VPX_PORTS_X86_H_
+#ifndef VPX_VPX_PORTS_X86_H_
+#define VPX_VPX_PORTS_X86_H_
 #include <stdlib.h>
 
 #if defined(_MSC_VER)
@@ -42,12 +42,12 @@ typedef enum {
   VPX_CPU_LAST
 } vpx_cpu_t;
 
-#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
-#if ARCH_X86_64
+#if defined(__GNUC__) || defined(__ANDROID__)
+#if VPX_ARCH_X86_64
 #define cpuid(func, func2, ax, bx, cx, dx)                      \
   __asm__ __volatile__("cpuid           \n\t"                   \
                        : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
-                       : "a"(func), "c"(func2));
+                       : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   __asm__ __volatile__(                        \
@@ -55,11 +55,11 @@ typedef enum {
       "cpuid              \n\t"                \
       "xchg %%edi, %%ebx  \n\t"                \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #elif defined(__SUNPRO_C) || \
     defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   asm volatile(                                \
       "xchg %rsi, %rbx \n\t"                   \
@@ -67,7 +67,7 @@ typedef enum {
       "movl %ebx, %edi \n\t"                   \
       "xchg %rsi, %rbx \n\t"                   \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   asm volatile(                                \
@@ -76,10 +76,10 @@ typedef enum {
       "movl %ebx, %edi  \n\t"                  \
       "popl %ebx        \n\t"                  \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #else /* end __SUNPRO__ */
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
 #if defined(_MSC_VER) && _MSC_VER > 1500
 #define cpuid(func, func2, a, b, c, d) \
   do {                                 \
@@ -151,78 +151,104 @@ static INLINE uint64_t xgetbv(void) {
 #endif
 #endif
 
-#define HAS_MMX 0x01
-#define HAS_SSE 0x02
-#define HAS_SSE2 0x04
-#define HAS_SSE3 0x08
-#define HAS_SSSE3 0x10
-#define HAS_SSE4_1 0x20
-#define HAS_AVX 0x40
-#define HAS_AVX2 0x80
+#define HAS_MMX 0x001
+#define HAS_SSE 0x002
+#define HAS_SSE2 0x004
+#define HAS_SSE3 0x008
+#define HAS_SSSE3 0x010
+#define HAS_SSE4_1 0x020
+#define HAS_AVX 0x040
+#define HAS_AVX2 0x080
+#define HAS_AVX512 0x100
 #ifndef BIT
-#define BIT(n) (1 << n)
+#define BIT(n) (1u << (n))
 #endif
 
+#define MMX_BITS BIT(23)
+#define SSE_BITS BIT(25)
+#define SSE2_BITS BIT(26)
+#define SSE3_BITS BIT(0)
+#define SSSE3_BITS BIT(9)
+#define SSE4_1_BITS BIT(19)
+// Bits 27 (OSXSAVE) & 28 (256-bit AVX)
+#define AVX_BITS (BIT(27) | BIT(28))
+#define AVX2_BITS BIT(5)
+// Bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & 30 (AVX-512BW)
+// & 31 (AVX-512VL)
+#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))
+
+#define FEATURE_SET(reg, feature) \
+  (((reg) & (feature##_BITS)) == (feature##_BITS))
+
 static INLINE int x86_simd_caps(void) {
   unsigned int flags = 0;
-  unsigned int mask = ~0;
+  unsigned int mask = ~0u;
   unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
   char *env;
   (void)reg_ebx;
 
   /* See if the CPU capabilities are being overridden by the environment */
   env = getenv("VPX_SIMD_CAPS");
-
   if (env && *env) return (int)strtol(env, NULL, 0);
 
   env = getenv("VPX_SIMD_CAPS_MASK");
-
   if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
 
   /* Ensure that the CPUID instruction supports extended features */
   cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
-
   if (max_cpuid_val < 1) return 0;
 
   /* Get the standard feature flags */
   cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
 
-  if (reg_edx & BIT(23)) flags |= HAS_MMX;
+  flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0;
+  flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0;
+  flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0;
+  flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0;
+  flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0;
+  flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0;
 
-  if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
-
-  if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
-
-  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
-
-  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
-
-  if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
-
-  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
-  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+  if (FEATURE_SET(reg_ecx, AVX)) {
+    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
     if ((xgetbv() & 0x6) == 0x6) {
       flags |= HAS_AVX;
-
       if (max_cpuid_val >= 7) {
         /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
         cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
-
-        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+        flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0;
+        if (FEATURE_SET(reg_ebx, AVX512)) {
+          // Check for OS-support of ZMM and YMM state. Necessary for AVX-512.
+          if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512;
+        }
       }
     }
   }
-
+  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
   return flags & mask;
 }
 
-// Note:
-//  32-bit CPU cycle counter is light-weighted for most function performance
-//  measurement. For large function (CPU time > a couple of seconds), 64-bit
-//  counter should be used.
-// 32-bit CPU cycle counter
+// Fine-Grain Measurement Functions
+//
+// If you are timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+//   ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but prevent the CPU's out-of-order execution from
+// affecting the measurement (by having earlier/later instructions be evaluated
+// in the time interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
+// Gabriele Paoloni for more information.
+//
+// If you are timing a large function (CPU time > a couple of seconds), use
+// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
+// out-of-order leakage that can occur is minimal compared to total runtime.
 static INLINE unsigned int x86_readtsc(void) {
-#if defined(__GNUC__) && __GNUC__
+#if defined(__GNUC__)
   unsigned int tsc;
   __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :);
   return tsc;
@@ -231,7 +257,7 @@ static INLINE unsigned int x86_readtsc(void) {
   asm volatile("rdtsc\n\t" : "=a"(tsc) :);
   return tsc;
 #else
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
   return (unsigned int)__rdtsc();
 #else
   __asm rdtsc;
@@ -240,7 +266,7 @@ static INLINE unsigned int x86_readtsc(void) {
 }
 // 64-bit CPU cycle counter
 static INLINE uint64_t x86_readtsc64(void) {
-#if defined(__GNUC__) && __GNUC__
+#if defined(__GNUC__)
   uint32_t hi, lo;
   __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
   return ((uint64_t)hi << 32) | lo;
@@ -249,7 +275,7 @@ static INLINE uint64_t x86_readtsc64(void) {
   asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
   return ((uint64_t)hi << 32) | lo;
 #else
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
   return (uint64_t)__rdtsc();
 #else
   __asm rdtsc;
@@ -257,19 +283,66 @@ static INLINE uint64_t x86_readtsc64(void) {
 #endif
 }
 
-#if defined(__GNUC__) && __GNUC__
+// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
+static INLINE unsigned int x86_readtscp(void) {
+#if defined(__GNUC__)
+  unsigned int tscp;
+  __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :);
+  return tscp;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  unsigned int tscp;
+  asm volatile("rdtscp\n\t" : "=a"(tscp) :);
+  return tscp;
+#elif defined(_MSC_VER)
+  unsigned int ui;
+  return (unsigned int)__rdtscp(&ui);
+#else
+#if VPX_ARCH_X86_64
+  return (unsigned int)__rdtscp();
+#else
+  __asm rdtscp;
+#endif
+#endif
+}
+
+static INLINE unsigned int x86_tsc_start(void) {
+  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
+  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
+  return x86_readtsc();
+}
+
+static INLINE unsigned int x86_tsc_end(void) {
+  uint32_t v = x86_readtscp();
+  unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
+  cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
+  return v;
+}
+
+#if defined(__GNUC__)
 #define x86_pause_hint() __asm__ __volatile__("pause \n\t")
 #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
 #define x86_pause_hint() asm volatile("pause \n\t")
 #else
-#if ARCH_X86_64
+#if VPX_ARCH_X86_64
 #define x86_pause_hint() _mm_pause();
 #else
 #define x86_pause_hint() __asm pause
 #endif
 #endif
 
-#if defined(__GNUC__) && __GNUC__
+#if defined(__GNUC__)
 static void x87_set_control_word(unsigned short mode) {
   __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
 }
@@ -287,7 +360,7 @@ static unsigned short x87_get_control_word(void) {
   asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
   return mode;
 }
-#elif ARCH_X86_64
+#elif VPX_ARCH_X86_64
 /* No fldcw intrinsics on Windows x64, punt to external asm */
 extern void vpx_winx64_fldcw(unsigned short mode);
 extern unsigned short vpx_winx64_fstcw(void);
@@ -306,14 +379,23 @@ static unsigned short x87_get_control_word(void) {
 
 static INLINE unsigned int x87_set_double_precision(void) {
   unsigned int mode = x87_get_control_word();
-  x87_set_control_word((mode & ~0x300) | 0x200);
+  // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
+  // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
+  // 8.1.5.2 Precision Control Field
+  // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
+  // determine the number of bits used in floating point calculations. To match
+  // later SSE instructions restrict x87 operations to Double Precision (0x200).
+  // Precision                     PC Field
+  // Single Precision (24-Bits)    00B
+  // Reserved                      01B
+  // Double Precision (53-Bits)    10B
+  // Extended Precision (64-Bits)  11B
+  x87_set_control_word((mode & ~0x300u) | 0x200u);
   return mode;
 }
 
-extern void vpx_reset_mmx_state(void);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VPX_PORTS_X86_H_
+#endif  // VPX_VPX_PORTS_X86_H_
diff --git a/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm b/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm
index 708fa101c5..6b2d6b9684 100644
--- a/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm
+++ b/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm
@@ -89,49 +89,70 @@
 %define LIBVPX_YASM_WIN64 0
 %endif
 
+; Declare groups of platforms
+%ifidn   __OUTPUT_FORMAT__,elf32
+  %define LIBVPX_ELF 1
+%elifidn   __OUTPUT_FORMAT__,elfx32
+  %define LIBVPX_ELF 1
+%elifidn   __OUTPUT_FORMAT__,elf64
+  %define LIBVPX_ELF 1
+%else
+  %define LIBVPX_ELF 0
+%endif
+
+%ifidn __OUTPUT_FORMAT__,macho32
+  %define LIBVPX_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+  %define LIBVPX_MACHO 1
+%else
+  %define LIBVPX_MACHO 0
+%endif
+
 ; sym()
 ; Return the proper symbol name for the target ABI.
 ;
 ; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
 ; with C linkage be prefixed with an underscore.
 ;
-%ifidn   __OUTPUT_FORMAT__,elf32
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elf64
-%define sym(x) x
-%elifidn __OUTPUT_FORMAT__,elfx32
-%define sym(x) x
-%elif LIBVPX_YASM_WIN64
-%define sym(x) x
+%if LIBVPX_ELF || LIBVPX_YASM_WIN64
+  %define sym(x) x
 %else
-%define sym(x) _ %+ x
+  ; Mach-O / COFF
+  %define sym(x) _ %+ x
 %endif
 
-;  PRIVATE
-;  Macro for the attribute to hide a global symbol for the target ABI.
-;  This is only active if CHROMIUM is defined.
+; globalsym()
+; Return a global declaration with the proper decoration for the target ABI.
 ;
-;  Chromium doesn't like exported global symbols due to symbol clashing with
-;  plugins among other things.
+; When CHROMIUM is defined, include attributes to hide the symbol from the
+; global namespace.
 ;
-;  Requires Chromium's patched copy of yasm:
-;    http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
-;    http://www.tortall.net/projects/yasm/ticket/236
+; Chromium doesn't like exported global symbols due to symbol clashing with
+; plugins among other things.
+;
+; Requires Chromium's patched copy of yasm:
+;   http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+;   http://www.tortall.net/projects/yasm/ticket/236
+; or nasm > 2.14.
 ;
 %ifdef CHROMIUM
-  %ifidn   __OUTPUT_FORMAT__,elf32
-    %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,elf64
-    %define PRIVATE :hidden
-  %elifidn __OUTPUT_FORMAT__,elfx32
-    %define PRIVATE :hidden
-  %elif LIBVPX_YASM_WIN64
-    %define PRIVATE
+  %ifdef __NASM_VER__
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+      ; nasm < 2.14 does not support :private_extern directive
+      %fatal Must use nasm 2.14 or newer
+    %endif
+  %endif
+
+  %if LIBVPX_ELF
+    %define globalsym(x) global sym(x) %+ :function hidden
+  %elif LIBVPX_MACHO
+    %define globalsym(x) global sym(x) %+ :private_extern
   %else
-    %define PRIVATE :private_extern
+    ; COFF / PE32+
+    %define globalsym(x) global sym(x)
   %endif
 %else
-  %define PRIVATE
+  %define globalsym(x) global sym(x)
 %endif
 
 ; arg()
diff --git a/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c b/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c
index b554a56e83..d8db4b3547 100644
--- a/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c
+++ b/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c
@@ -12,8 +12,8 @@
 #include "vpx_scale/vpx_scale.h"
 #include "vpx_mem/vpx_mem.h"
 /****************************************************************************
-*  Imports
-****************************************************************************/
+ *  Imports
+ ****************************************************************************/
 
 /****************************************************************************
  *
diff --git a/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c b/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c
index 20e1ff90fd..36d6d3628a 100644
--- a/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c
+++ b/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c
@@ -17,8 +17,10 @@
  ***************************************************************************/
 
 /****************************************************************************
-*  Header Files
-****************************************************************************/
+ *  Header Files
+ ****************************************************************************/
+#include <assert.h>
+
 #include "./vpx_scale_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
@@ -172,6 +174,7 @@ static void scale1d_c(const unsigned char *source, int source_step,
   /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale
    * );*/
 
+  assert(dest_scale != 0);
   for (i = 0; i < dest_length * dest_step; i += dest_step) {
     dest[i] = (char)((left_modifier * left_pixel +
                       right_modifier * right_pixel + round_value) /
diff --git a/media/libvpx/libvpx/vpx_scale/generic/yv12config.c b/media/libvpx/libvpx/vpx_scale/generic/yv12config.c
index a674eac84b..07deeb2016 100644
--- a/media/libvpx/libvpx/vpx_scale/generic/yv12config.c
+++ b/media/libvpx/libvpx/vpx_scale/generic/yv12config.c
@@ -9,20 +9,25 @@
  */
 
 #include <assert.h>
+#include <limits.h>
+#include <stdint.h>
 
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
+#if defined(VPX_MAX_ALLOCABLE_MEMORY)
+#include "vp9/common/vp9_onyxc_int.h"
+#endif  // VPX_MAX_ALLOCABLE_MEMORY
 /****************************************************************************
-*  Exports
-****************************************************************************/
+ *  Exports
+ ****************************************************************************/
 
 /****************************************************************************
  *
  ****************************************************************************/
 #define yv12_align_addr(addr, align) \
-  (void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align))
+  (void *)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
 
 int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
   if (ybf) {
@@ -53,17 +58,29 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,
     int uv_width = aligned_width >> 1;
     int uv_height = aligned_height >> 1;
     /** There is currently a bunch of code which assumes
-      *  uv_stride == y_stride/2, so enforce this here. */
+     *  uv_stride == y_stride/2, so enforce this here. */
     int uv_stride = y_stride >> 1;
     int uvplane_size = (uv_height + border) * uv_stride;
-    const int frame_size = yplane_size + 2 * uvplane_size;
+    const size_t frame_size = yplane_size + 2 * uvplane_size;
 
     if (!ybf->buffer_alloc) {
       ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+      if (!ybf->buffer_alloc) {
+        ybf->buffer_alloc_sz = 0;
+        return -1;
+      }
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed for fixing the issue of using uninitialized
+      // value in msan test. It will cause a perf loss, so only do this for
+      // msan test.
+      memset(ybf->buffer_alloc, 0, frame_size);
+#endif
+#endif
       ybf->buffer_alloc_sz = frame_size;
     }
 
-    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1;
+    if (ybf->buffer_alloc_sz < frame_size) return -1;
 
     /* Only support allocating buffers that have a border that's a multiple
      * of 32. The border restriction is required to get 16-byte alignment of
@@ -141,6 +158,17 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
                              int border, int byte_alignment,
                              vpx_codec_frame_buffer_t *fb,
                              vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
+#if CONFIG_SIZE_LIMIT
+  if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
+  /* Only support allocating buffers that have a border that's a multiple
+   * of 32. The border restriction is required to get 16-byte alignment of
+   * the start of the chroma rows without introducing an arbitrary gap
+   * between planes, which would break the semantics of things like
+   * vpx_img_set_rect(). */
+  if (border & 0x1f) return -3;
+
   if (ybf) {
     const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
     const int aligned_width = (width + 7) & ~7;
@@ -165,6 +193,21 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
 
     uint8_t *buf = NULL;
 
+#if defined(VPX_MAX_ALLOCABLE_MEMORY)
+    // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
+    // pool. Bound the total amount of allocated memory as if these REF_FRAMES
+    // frame buffers were allocated in a single allocation.
+    if (frame_size > VPX_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1;
+#endif  // VPX_MAX_ALLOCABLE_MEMORY
+
+#if UINT64_MAX > SIZE_MAX
+    // frame_size is stored in buffer_alloc_sz, which is a size_t. If it won't
+    // fit, fail early.
+    if (frame_size > SIZE_MAX) {
+      return -1;
+    }
+#endif
+
     if (cb != NULL) {
       const int align_addr_extra_size = 31;
       const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -185,20 +228,19 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
       // This memset is needed for fixing the issue of using uninitialized
       // value in msan test. It will cause a perf loss, so only do this for
       // msan test.
-      memset(ybf->buffer_alloc, 0, (int)frame_size);
+      memset(ybf->buffer_alloc, 0, (size_t)frame_size);
 #endif
 #endif
-    } else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
+    } else if (frame_size > ybf->buffer_alloc_sz) {
       // Allocation to hold larger frame, or first allocation.
       vpx_free(ybf->buffer_alloc);
       ybf->buffer_alloc = NULL;
-
-      if (frame_size != (size_t)frame_size) return -1;
+      ybf->buffer_alloc_sz = 0;
 
       ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size);
       if (!ybf->buffer_alloc) return -1;
 
-      ybf->buffer_alloc_sz = (int)frame_size;
+      ybf->buffer_alloc_sz = (size_t)frame_size;
 
       // This memset is needed for fixing valgrind error from C loop filter
       // due to access uninitialized memory in frame border. It could be
@@ -206,13 +248,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
       memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
     }
 
-    /* Only support allocating buffers that have a border that's a multiple
-     * of 32. The border restriction is required to get 16-byte alignment of
-     * the start of the chroma rows without introducing an arbitrary gap
-     * between planes, which would break the semantics of things like
-     * vpx_img_set_rect(). */
-    if (border & 0x1f) return -3;
-
     ybf->y_crop_width = width;
     ybf->y_crop_height = height;
     ybf->y_width = aligned_width;
@@ -226,7 +261,7 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
     ybf->uv_stride = uv_stride;
 
     ybf->border = border;
-    ybf->frame_size = (int)frame_size;
+    ybf->frame_size = (size_t)frame_size;
     ybf->subsampling_x = ss_x;
     ybf->subsampling_y = ss_y;
 
diff --git a/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c b/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c
index a6aaff95a0..e231806505 100644
--- a/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c
+++ b/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c
@@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
   assert(ybf->y_height - ybf->y_crop_height >= 0);
   assert(ybf->y_width - ybf->y_crop_width >= 0);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
-                      ybf->y_crop_height, ybf->border, ybf->border,
-                      ybf->border + ybf->y_height - ybf->y_crop_height,
-                      ybf->border + ybf->y_width - ybf->y_crop_width);
-
-    extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width,
-                      ybf->uv_crop_height, uv_border, uv_border,
-                      uv_border + ybf->uv_height - ybf->uv_crop_height,
-                      uv_border + ybf->uv_width - ybf->uv_crop_width);
-
-    extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width,
-                      ybf->uv_crop_height, uv_border, uv_border,
-                      uv_border + ybf->uv_height - ybf->uv_crop_height,
-                      uv_border + ybf->uv_width - ybf->uv_crop_width);
-    return;
-  }
-#endif
   extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
                ybf->y_crop_height, ybf->border, ybf->border,
                ybf->border + ybf->y_height - ybf->y_crop_height,
@@ -208,12 +189,55 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
 // Copies the source image into the destination image and updates the
 // destination's UMV borders.
 // Note: The frames are assumed to be identical in size.
+
 void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
                            YV12_BUFFER_CONFIG *dst_ybc) {
   int row;
   const uint8_t *src = src_ybc->y_buffer;
   uint8_t *dst = dst_ybc->y_buffer;
 
+#if 0
+  /* These assertions are valid in the codec, but the libvpx-tester uses
+   * this code slightly differently.
+   */
+  assert(src_ybc->y_width == dst_ybc->y_width);
+  assert(src_ybc->y_height == dst_ybc->y_height);
+#endif
+
+  for (row = 0; row < src_ybc->y_height; ++row) {
+    memcpy(dst, src, src_ybc->y_width);
+    src += src_ybc->y_stride;
+    dst += dst_ybc->y_stride;
+  }
+
+  src = src_ybc->u_buffer;
+  dst = dst_ybc->u_buffer;
+
+  for (row = 0; row < src_ybc->uv_height; ++row) {
+    memcpy(dst, src, src_ybc->uv_width);
+    src += src_ybc->uv_stride;
+    dst += dst_ybc->uv_stride;
+  }
+
+  src = src_ybc->v_buffer;
+  dst = dst_ybc->v_buffer;
+
+  for (row = 0; row < src_ybc->uv_height; ++row) {
+    memcpy(dst, src, src_ybc->uv_width);
+    src += src_ybc->uv_stride;
+    dst += dst_ybc->uv_stride;
+  }
+
+  vp8_yv12_extend_frame_borders_c(dst_ybc);
+}
+
+#if CONFIG_VP9
+void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
+                           YV12_BUFFER_CONFIG *dst_ybc) {
+  int row;
+  const uint8_t *src = src_ybc->y_buffer;
+  uint8_t *dst = dst_ybc->y_buffer;
+
 #if 0
   /* These assertions are valid in the codec, but the libvpx-tester uses
    * this code slightly differently.
@@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
       dst += dst_ybc->uv_stride;
     }
 
-    vp8_yv12_extend_frame_borders_c(dst_ybc);
+    vpx_extend_frame_borders_c(dst_ybc);
     return;
   } else {
     assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
@@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
     dst += dst_ybc->uv_stride;
   }
 
-  vp8_yv12_extend_frame_borders_c(dst_ybc);
+  vpx_extend_frame_borders_c(dst_ybc);
 }
+#endif  // CONFIG_VP9
 
 void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
                        YV12_BUFFER_CONFIG *dst_ybc) {
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale.h b/media/libvpx/libvpx/vpx_scale/vpx_scale.h
index 478a483461..fd5ba7ccdc 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale.h
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_SCALE_VPX_SCALE_H_
-#define VPX_SCALE_VPX_SCALE_H_
+#ifndef VPX_VPX_SCALE_VPX_SCALE_H_
+#define VPX_VPX_SCALE_VPX_SCALE_H_
 
 #include "vpx_scale/yv12config.h"
 
@@ -19,4 +19,4 @@ extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                             unsigned int vscale, unsigned int vratio,
                             unsigned int interlaced);
 
-#endif  // VPX_SCALE_VPX_SCALE_H_
+#endif  // VPX_VPX_SCALE_VPX_SCALE_H_
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
index dc4d9593a8..706b0770c8 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -12,4 +12,4 @@
 #include "./vpx_scale_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vpx_scale_rtcd() { once(setup_rtcd_internal); }
+void vpx_scale_rtcd(void) { once(setup_rtcd_internal); }
diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl
index 44b115c7eb..1281071a7d 100644
--- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl
+++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl
@@ -1,3 +1,13 @@
+##
+##  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+
 sub vpx_scale_forward_decls() {
 print <<EOF
 struct yv12_buffer_config;
@@ -23,6 +33,8 @@ add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_yb
 add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
 
 if (vpx_config("CONFIG_VP9") eq "yes") {
+    add_proto qw/void vpx_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
     add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
     specialize qw/vpx_extend_frame_borders dspr2/;
 
diff --git a/media/libvpx/libvpx/vpx_scale/yv12config.h b/media/libvpx/libvpx/vpx_scale/yv12config.h
index b9b3362144..2cf18217f6 100644
--- a/media/libvpx/libvpx/vpx_scale/yv12config.h
+++ b/media/libvpx/libvpx/vpx_scale/yv12config.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_SCALE_YV12CONFIG_H_
-#define VPX_SCALE_YV12CONFIG_H_
+#ifndef VPX_VPX_SCALE_YV12CONFIG_H_
+#define VPX_VPX_SCALE_YV12CONFIG_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -49,9 +49,9 @@ typedef struct yv12_buffer_config {
   uint8_t *alpha_buffer;
 
   uint8_t *buffer_alloc;
-  int buffer_alloc_sz;
+  size_t buffer_alloc_sz;
   int border;
-  int frame_size;
+  size_t frame_size;
   int subsampling_x;
   int subsampling_y;
   unsigned int bit_depth;
@@ -100,4 +100,4 @@ int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 }
 #endif
 
-#endif  // VPX_SCALE_YV12CONFIG_H_
+#endif  // VPX_VPX_SCALE_YV12CONFIG_H_
diff --git a/media/libvpx/libvpx/vpx_util/endian_inl.h b/media/libvpx/libvpx/vpx_util/endian_inl.h
index dc38774095..1b6ef56c69 100644
--- a/media/libvpx/libvpx/vpx_util/endian_inl.h
+++ b/media/libvpx/libvpx/vpx_util/endian_inl.h
@@ -9,8 +9,8 @@
 //
 // Endian related functions.
 
-#ifndef VPX_UTIL_ENDIAN_INL_H_
-#define VPX_UTIL_ENDIAN_INL_H_
+#ifndef VPX_VPX_UTIL_ENDIAN_INL_H_
+#define VPX_VPX_UTIL_ENDIAN_INL_H_
 
 #include <stdlib.h>
 #include "./vpx_config.h"
@@ -115,4 +115,4 @@ static INLINE uint64_t BSwap64(uint64_t x) {
 #endif  // HAVE_BUILTIN_BSWAP64
 }
 
-#endif  // VPX_UTIL_ENDIAN_INL_H_
+#endif  // VPX_VPX_UTIL_ENDIAN_INL_H_
diff --git a/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h
new file mode 100644
index 0000000000..b8b9e6db02
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h
@@ -0,0 +1,2090 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei Gu   <guxiwei-hf@loongson.cn>
+ *                Lu Wang    <wanglu@loongson.cn>
+ *
+ * This file is a header file for loongarch builtin extension.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 2
+#define LSOM_VERSION_MICRO 1
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+  {                                               \
+    _OUT0 = _INS(_IN0);                           \
+    _OUT1 = _INS(_IN1);                           \
+  }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+  {                                                           \
+    _OUT0 = _INS(_IN0, _IN1);                                 \
+    _OUT1 = _INS(_IN2, _IN3);                                 \
+  }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+  {                                                                       \
+    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
+    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
+  }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+  {                                                                         \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
+  }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+                  _OUT1, _OUT2, _OUT3)                                         \
+  {                                                                            \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
+  }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
+  {                                                                           \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
+  }
+
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input. Then
+ *               the results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+                                         __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ *        in_c : 1,1,1,1, 1,1,1,1
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ *         out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+                                           __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - __m128i
+ * Details     : Signed half-word elements from in_h are multiplied by
+ *               signed half-word elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               Then the results are added to signed word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_bu(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ *         out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_w_h(in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - double
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_d_w(in_h, in_l)
+ *        in_h : 1,2,3,4
+ *        in_l : 8,7,6,5
+ *         out : 22,38
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_d_w(in_h, in_l);
+  out = __lsx_vmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ *               (_in))
+ * Arguments   : Inputs  - _in  (input vector)
+ *                       - min  (min threshold)
+ *                       - max  (max threshold)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lsx_vclip_h(_in)
+ *         _in : -8,2,280,249, -8,255,280,249
+ *         min : 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+  __m128i out;
+
+  out = __lsx_vmax_h(min, _in);
+  out = __lsx_vmin_h(max, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_h(_in)
+ *         _in : -8,255,280,249, -8,255,280,249
+ *         out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+  __m128i out;
+
+  out = __lsx_vmaxi_h(_in, 0);
+  out = __lsx_vsat_hu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed byte elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_w(_in)
+ *         _in : -8,255,280,249
+ *         out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+  __m128i out;
+
+  out = __lsx_vmaxi_w(_in, 0);
+  out = __lsx_vsat_wu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments   : Inputs  - _in0, _in1
+ *               Outputs - _in0, _in1 (in-place)
+ * Details     : Swapping of two input variables using xor
+ * Example     : LSX_SWAP(_in0, _in1)
+ *        _in0 : 1,2,3,4
+ *        _in1 : 5,6,7,8
+ *   _in0(out) : 5,6,7,8
+ *   _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1)         \
+  {                                  \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+    _in1 = __lsx_vxor_v(_in0, _in1); \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _t0, _t1, _t2, _t3;                                                \
+                                                                               \
+    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
+    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
+    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
+    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
+    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
+    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
+    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
+    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Details     : The rows of the matrix become columns, and the columns
+ *               become rows.
+ * Example     : LSX_TRANSPOSE8x8_B
+ *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ *      _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *      _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *      _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *      _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ *      _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ *      _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ *      _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ *      _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i zero = { 0 };                                                   \
+    __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };             \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
+                                                                            \
+    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
+    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
+    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
+    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
+    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
+    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
+    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
+    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
+    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
+    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
+    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
+    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
+    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                              \
+    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                              \
+    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                              \
+    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                              \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
+ *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
+ *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
+ *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
+ *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
+ *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
+ *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
+ *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
+                                                                            \
+    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
+    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
+    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
+    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
+                                                                            \
+    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
+    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
+    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
+    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
+    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
+    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
+    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
+    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3      (input 8x4 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
+ *               Return Type - as per RTYPE
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LSX_TRANSPOSE8x4_B
+ *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+                           _out0, _out1, _out2, _out3)                     \
+  {                                                                        \
+    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
+                                                                           \
+    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
+    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
+                                                                           \
+    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
+    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
+                                                                           \
+    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
+    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
+    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
+    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8
+ *                         in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              000,001,002,003,004,005,006,007
+ *              008,009,010,011,012,013,014,015
+ *              016,017,018,019,020,021,022,023
+ *              024,025,026,027,028,029,030,031
+ *              032,033,034,035,036,037,038,039
+ *              040,041,042,043,044,045,046,047        000,008,...,112,120
+ *              048,049,050,051,052,053,054,055        001,009,...,113,121
+ *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
+ *              064,068,066,067,068,069,070,071 =====> 003,011,...,115,123
+ *              072,073,074,075,076,077,078,079        004,012,...,116,124
+ *              080,081,082,083,084,085,086,087        005,013,...,117,125
+ *              088,089,090,091,092,093,094,095        006,014,...,118,126
+ *              096,097,098,099,100,101,102,103        007,015,...,119,127
+ *              104,105,106,107,108,109,110,111
+ *              112,113,114,115,116,117,118,119
+ *              120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                            _out6, _out7)                                    \
+  {                                                                          \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
+    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
+    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
+              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
+    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
+    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     : Butterfly operation
+ * Example     :
+ *               out0 = in0 + in3;
+ *               out1 = in1 + in2;
+ *               out2 = in1 - in2;
+ *               out3 = in0 - in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in7;
+ *              _out1 = _in1 + _in6;
+ *              _out2 = _in2 + _in5;
+ *              _out3 = _in3 + _in4;
+ *              _out4 = _in3 - _in4;
+ *              _out5 = _in2 - _in5;
+ *              _out6 = _in1 - _in6;
+ *              _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 16 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in15;
+ *              _out1 = _in1 + _in14;
+ *              _out2 = _in2 + _in13;
+ *              _out3 = _in3 + _in12;
+ *              _out4 = _in4 + _in11;
+ *              _out5 = _in5 + _in10;
+ *              _out6 = _in6 + _in9;
+ *              _out7 = _in7 + _in8;
+ *              _out8 = _in7 - _in8;
+ *              _out9 = _in6 - _in9;
+ *              _out10 = _in5 - _in10;
+ *              _out11 = _in4 - _in11;
+ *              _out12 = _in3 - _in12;
+ *              _out13 = _in2 - _in13;
+ *              _out14 = _in1 - _in14;
+ *              _out15 = _in0 - _in15;
+ * =============================================================================
+ */
+
+#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_b(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_b(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_b(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_b(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_b(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_b(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_b(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_b(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_b(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_b(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_b(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_b(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_b(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_b(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_b(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_h(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_h(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_h(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_h(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_h(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_h(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_h(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_h(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_h(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_h(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_h(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_h(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_h(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_h(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_h(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_w(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_w(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_w(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_w(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_w(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_w(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_w(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_w(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_w(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_w(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_w(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_w(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_w(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_w(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_w(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_d(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_d(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_d(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_d(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_d(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_d(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_d(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_d(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_d(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_d(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_d(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_d(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_d(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_d(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_d(_in0, _in15);                                        \
+  }
+
+#endif  // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplication results of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_b(in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the out vector.
+ * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed double
+ * Details     : Signed word elements from in_h are multiplied with
+ *               signed word elements from in_l producing a result
+ *               twice the size of input i.e. signed double-word.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_d_w(in_h, in_l);
+  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. unsigned word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               signed byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Then these multiplied results of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - per RTYPE
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               unsigned halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added to the in_c vector
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l producing a result
+ *               twice the size of input i.e. signed halfword.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added together and subtracted from double width elements
+ *               in_c vector.
+ * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  out = __lasx_xvsub_h(in_c, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               Signed halfword elements from in_l producing a result
+ *               twice the size of input i.e. signed word.
+ *               Multiplication result of adjacent odd-even elements
+ *               are added together and subtracted from double width elements
+ *               in_c vector.
+ * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ *        in_c : 0,0,0,0, 0,0,0,0
+ *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *         out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvsub_w(in_c, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l producing a result
+ *               four times the size of input i.e. signed doubleword.
+ *               Then these multiplication results of four adjacent elements
+ *               are added together and stored to the out vector.
+ * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
+ *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ *         out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvhaddw_d_w(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The out vector and the out vector are added after the
+ *               lower half of the two-fold zero extension (unsigned byte
+ *               to unsigned halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_hu_bu(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector after double zero extension (unsigned byte to
+ *               signed halfword)，added to the in_h vector.
+ * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvsllwil_hu_bu(in_l, 0);
+  out = __lasx_xvadd_h(in_h, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector after double sign extension (signed halfword to
+ *               signed word), added to the in_h vector.
+ * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ *        in_h : 0, 1,0,0, -1,0,0,1,
+ *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
+ *         out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvadd_w(in_h, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the lower half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed halfword
+ *               to signed word), and the result is added to the vector in_c,
+ *               then stored to the out vector.
+ * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 5,6,7,8
+ *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
+ *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the higher half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), and the result is added to
+ *               the vector in_c, then stored to the out vector.
+ * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvmul_w(tmp0, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are added to the high half
+ *               after being doubled, then saturated.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector adds the in_l vector after the lower half of
+ *               the two-fold zero extension (unsigned byte to unsigned
+ *               halfword) and then saturated. The results are stored to the out
+ *               vector.
+ * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ *               0,0,0,1
+ *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+  __m256i tmp1, out;
+  __m256i zero = { 0 };
+
+  tmp1 = __lasx_xvilvl_b(zero, in_l);
+  out = __lasx_xvsadd_hu(in_h, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments   : Inputs  - in    (input vector)
+ *                       - min   (min threshold)
+ *                       - max   (max threshold)
+ *               Outputs - in    (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lasx_xvclip_h(in, min, max)
+ *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+  __m256i out;
+
+  out = __lasx_xvmax_h(min, in);
+  out = __lasx_xvmin_h(max, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs  - in   (input vector)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+  __m256i out;
+
+  out = __lasx_xvmaxi_h(in, 0);
+  out = __lasx_xvsat_hu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs - in   (input vector)
+ *               Output - out  (output vector with clipped elements)
+ *               Return Type - signed word
+ * Example     : out = __lasx_xvclip255_w(in)
+ *          in : -8,255,280,249, -8,255,280,249
+ *         out :  0,255,255,249,  0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+  __m256i out;
+
+  out = __lasx_xvmaxi_w(in, 0);
+  out = __lasx_xvsat_wu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_l_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ *         idx : 0x02
+ *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+  __m256i out;
+
+  out = __lasx_xvpermi_q(in, in, 0x02);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_h_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ *         idx : 0x09
+ *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+  __m256i out;
+
+  out = __lasx_xvpermi_q(in, in, 0x13);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
+    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
+    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
+    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
+    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
+    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
+    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
+    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
+    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
+    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
+    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
+    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
+    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
+    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
+    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
+    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
+    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
+    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
+    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
+    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
+    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
+    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
+    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
+    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE16x8_H
+ *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+                                                                              \
+    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ *               Return Type - signed halfword
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _s0_m, _s1_m;                                                \
+                                                                         \
+    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                 \
+    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                 \
+    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
+    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
+    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
+    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *                         (input 8x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x8 byte block)
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
+    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
+    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
+    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
+    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
+    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
+    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
+    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
+    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
+    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
+    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
+    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments   : Inputs  - _in0, _in1, ~
+ *               Outputs - _out0, _out1, ~
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE8x8_H
+ *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
+    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
+    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
+    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
+    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
+    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
+    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
+    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
+    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
+    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
+    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
+    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_h(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_h(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_h(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_h(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_w(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_w(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_w(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_w(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_d(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_d(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_d(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_d(_in0, _in3);                                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_b(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_b(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_b(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_b(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_b(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_b(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_b(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_b(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_h(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_h(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_h(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_h(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_h(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_h(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_h(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_h(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_w(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_w(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_w(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_w(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_w(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_w(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_w(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_w(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_d(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_d(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_d(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_d(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_d(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_d(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_d(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_d(_in0, _in7);                                     \
+  }
+
+#endif  // LASX
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs  - RTYPE, _element_num, _in0, _enter
+ *               Outputs -
+ * Details     : Print out '_element_num' elements in 'RTYPE' vector '_in0', if
+ *               '_enter' is TRUE, prefix "\nVP:" will be added first.
+ * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ *               VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
+  {                                                                \
+    RTYPE _tmp0 = (RTYPE)in0;                                      \
+    int _i = 0;                                                    \
+    if (enter) printf("\nVP:");                                    \
+    for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+  }
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
diff --git a/media/libvpx/libvpx/vpx_util/vpx_atomics.h b/media/libvpx/libvpx/vpx_util/vpx_atomics.h
new file mode 100644
index 0000000000..13c1fc11f6
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_atomics.h
@@ -0,0 +1,110 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_ATOMICS_H_
+#define VPX_VPX_UTIL_VPX_ATOMICS_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+// Look for built-in atomic support. We cannot use <stdatomic.h> or <atomic>
+// since neither is guaranteed to exist on both C and C++ platforms, and we need
+// to back the atomic type with the same type (g++ needs to be able to use
+// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the
+// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed it's not
+// guaranteed that atomic_int is the same type as std::atomic_int.
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13.
+#if !defined(__has_builtin)
+#define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif                      // !defined(__has_builtin)
+
+#if (__has_builtin(__atomic_load_n)) || \
+    (defined(__GNUC__) &&               \
+     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those.
+#define VPX_USE_ATOMIC_BUILTINS
+#else
+// Use platform-specific asm barriers.
+#if defined(_MSC_VER)
+// TODO(pbos): This assumes that newer versions of MSVC are building with the
+// default /volatile:ms (or older, where this is always true. Consider adding
+// support for using <atomic> instead of stdatomic.h when building C++11 under
+// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?),
+// there're no explicit Interlocked* functions for only storing or loading
+// (presumably because volatile has historically implied that on MSVC).
+//
+// For earlier versions of MSVC or the default /volatile:ms volatile int are
+// acquire/release and require no barrier.
+#define vpx_atomic_memory_barrier() \
+  do {                              \
+  } while (0)
+#else
+#if VPX_ARCH_X86 || VPX_ARCH_X86_64
+// Use a compiler barrier on x86, no runtime penalty.
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
+#elif VPX_ARCH_ARM
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
+#elif VPX_ARCH_MIPS
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory")
+#else
+#error Unsupported architecture!
+#endif  // VPX_ARCH_X86 || VPX_ARCH_X86_64
+#endif  // defined(_MSC_VER)
+#endif  // atomic builtin availability check
+
+// These are wrapped in a struct so that they are not easily accessed directly
+// on any platform (to discourage programmer errors by setting values directly).
+// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT
+// (NOT memset) and accessed through vpx_atomic_ functions.
+typedef struct vpx_atomic_int {
+  volatile int value;
+} vpx_atomic_int;
+
+#define VPX_ATOMIC_INIT(num) { num }
+
+// Initialization of an atomic int, not thread safe.
+static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) {
+  atomic->value = value;
+}
+
+static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) {
+#if defined(VPX_USE_ATOMIC_BUILTINS)
+  __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE);
+#else
+  vpx_atomic_memory_barrier();
+  atomic->value = value;
+#endif  // defined(VPX_USE_ATOMIC_BUILTINS)
+}
+
+static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) {
+#if defined(VPX_USE_ATOMIC_BUILTINS)
+  return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE);
+#else
+  int v = atomic->value;
+  vpx_atomic_memory_barrier();
+  return v;
+#endif  // defined(VPX_USE_ATOMIC_BUILTINS)
+}
+
+#undef VPX_USE_ATOMIC_BUILTINS
+#undef vpx_atomic_memory_barrier
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // VPX_VPX_UTIL_VPX_ATOMICS_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.c b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c
new file mode 100644
index 0000000000..3ce4065ba5
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c
@@ -0,0 +1,282 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "vpx_util/vpx_debug_util.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+static int frame_idx_w = 0;
+static int frame_idx_r = 0;
+
+void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; }
+
+int bitstream_queue_get_frame_write(void) { return frame_idx_w; }
+
+void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; }
+
+int bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 2000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int prob_queue[QUEUE_MAX_SIZE];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+int bitstream_queue_get_write(void) { return queue_w; }
+
+int bitstream_queue_get_read(void) { return queue_r; }
+
+void bitstream_queue_pop(int *result, int *prob) {
+  if (!skip_r) {
+    if (queue_w == queue_r) {
+      printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+    *result = result_queue[queue_r];
+    *prob = prob_queue[queue_r];
+    queue_r = (queue_r + 1) % QUEUE_MAX_SIZE;
+  }
+}
+
+void bitstream_queue_push(int result, const int prob) {
+  if (!skip_w) {
+    result_queue[queue_w] = result;
+    prob_queue[queue_w] = prob;
+    queue_w = (queue_w + 1) % QUEUE_MAX_SIZE;
+    if (queue_w == queue_r) {
+      printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+  }
+}
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+static int frame_buf_idx_r = 0;
+static int frame_buf_idx_w = 0;
+#define MAX_FRAME_BUF_NUM 20
+#define MAX_FRAME_STRIDE 1920
+#define MAX_FRAME_HEIGHT 1080
+static uint16_t
+    frame_pre[MAX_FRAME_BUF_NUM][3]
+             [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT];  // prediction only
+static uint16_t
+    frame_tx[MAX_FRAME_BUF_NUM][3]
+            [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT];  // prediction + txfm
+static int frame_stride = MAX_FRAME_STRIDE;
+static int frame_height = MAX_FRAME_HEIGHT;
+static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT;
+void mismatch_move_frame_idx_w(void) {
+  frame_buf_idx_w = (frame_buf_idx_w + 1) % MAX_FRAME_BUF_NUM;
+  if (frame_buf_idx_w == frame_buf_idx_r) {
+    printf("frame_buf overflow\n");
+    assert(0);
+  }
+}
+
+void mismatch_reset_frame(int num_planes) {
+  int plane;
+  for (plane = 0; plane < num_planes; ++plane) {
+    memset(frame_pre[frame_buf_idx_w][plane], 0,
+           sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size);
+    memset(frame_tx[frame_buf_idx_w][plane], 0,
+           sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size);
+  }
+}
+
+void mismatch_move_frame_idx_r(void) {
+  if (frame_buf_idx_w == frame_buf_idx_r) {
+    printf("frame_buf underflow\n");
+    assert(0);
+  }
+  frame_buf_idx_r = (frame_buf_idx_r + 1) % MAX_FRAME_BUF_NUM;
+}
+
+void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane,
+                               int pixel_c, int pixel_r, int blk_w, int blk_h,
+                               int highbd) {
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  int r, c;
+
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  for (r = 0; r < blk_h; ++r) {
+    for (c = 0; c < blk_w; ++c) {
+      frame_pre[frame_buf_idx_w][plane]
+               [(r + pixel_r) * frame_stride + c + pixel_c] =
+                   src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+    }
+  }
+#if 0
+  {
+    int ref_frame_idx = 3;
+    int ref_plane = 1;
+    int ref_pixel_c = 162;
+    int ref_pixel_r = 16;
+    if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+        ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+        ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+      printf(
+          "\nrecord_block_pre frame_idx %d plane %d pixel_c %d pixel_r %d blk_w"
+          " %d blk_h %d\n",
+          frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h);
+    }
+  }
+#endif
+}
+void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane,
+                              int pixel_c, int pixel_r, int blk_w, int blk_h,
+                              int highbd) {
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  int r, c;
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  for (r = 0; r < blk_h; ++r) {
+    for (c = 0; c < blk_w; ++c) {
+      frame_tx[frame_buf_idx_w][plane]
+              [(r + pixel_r) * frame_stride + c + pixel_c] =
+                  src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+    }
+  }
+#if 0
+  {
+    int ref_frame_idx = 3;
+    int ref_plane = 1;
+    int ref_pixel_c = 162;
+    int ref_pixel_r = 16;
+    if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+        ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+        ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+      printf(
+          "\nrecord_block_tx frame_idx %d plane %d pixel_c %d pixel_r %d blk_w "
+          "%d blk_h %d\n",
+          frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h);
+    }
+  }
+#endif
+}
+void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane,
+                              int pixel_c, int pixel_r, int blk_w, int blk_h,
+                              int highbd) {
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  int mismatch = 0;
+  int r, c;
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  for (r = 0; r < blk_h; ++r) {
+    for (c = 0; c < blk_w; ++c) {
+      if (frame_pre[frame_buf_idx_r][plane]
+                   [(r + pixel_r) * frame_stride + c + pixel_c] !=
+          (uint16_t)(src16 ? src16[r * src_stride + c]
+                           : src[r * src_stride + c])) {
+        mismatch = 1;
+      }
+    }
+  }
+  if (mismatch) {
+    int rr, cc;
+    printf(
+        "\ncheck_block_pre failed frame_idx %d plane %d "
+        "pixel_c %d pixel_r "
+        "%d blk_w %d blk_h %d\n",
+        frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h);
+    printf("enc\n");
+    for (rr = 0; rr < blk_h; ++rr) {
+      for (cc = 0; cc < blk_w; ++cc) {
+        printf("%d ", frame_pre[frame_buf_idx_r][plane]
+                               [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+      }
+      printf("\n");
+    }
+
+    printf("dec\n");
+    for (rr = 0; rr < blk_h; ++rr) {
+      for (cc = 0; cc < blk_w; ++cc) {
+        printf("%d ",
+               src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+      }
+      printf("\n");
+    }
+    assert(0);
+  }
+}
+void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane,
+                             int pixel_c, int pixel_r, int blk_w, int blk_h,
+                             int highbd) {
+  const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+  int mismatch = 0;
+  int r, c;
+  if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+    printf("frame_buf undersized\n");
+    assert(0);
+  }
+
+  for (r = 0; r < blk_h; ++r) {
+    for (c = 0; c < blk_w; ++c) {
+      if (frame_tx[frame_buf_idx_r][plane]
+                  [(r + pixel_r) * frame_stride + c + pixel_c] !=
+          (uint16_t)(src16 ? src16[r * src_stride + c]
+                           : src[r * src_stride + c])) {
+        mismatch = 1;
+      }
+    }
+  }
+  if (mismatch) {
+    int rr, cc;
+    printf(
+        "\ncheck_block_tx failed frame_idx %d plane %d pixel_c "
+        "%d pixel_r "
+        "%d blk_w %d blk_h %d\n",
+        frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h);
+    printf("enc\n");
+    for (rr = 0; rr < blk_h; ++rr) {
+      for (cc = 0; cc < blk_w; ++cc) {
+        printf("%d ", frame_tx[frame_buf_idx_r][plane]
+                              [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+      }
+      printf("\n");
+    }
+
+    printf("dec\n");
+    for (rr = 0; rr < blk_h; ++rr) {
+      for (cc = 0; cc < blk_w; ++cc) {
+        printf("%d ",
+               src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+      }
+      printf("\n");
+    }
+    assert(0);
+  }
+}
+#endif  // CONFIG_MISMATCH_DEBUG
diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.h b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h
new file mode 100644
index 0000000000..df1a1aab2c
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h
@@ -0,0 +1,70 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_
+#define VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_
+
+#include "./vpx_config.h"
+
+#include "vpx_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+void bitstream_queue_set_frame_write(int frame_idx);
+int bitstream_queue_get_frame_write(void);
+void bitstream_queue_set_frame_read(int frame_idx);
+int bitstream_queue_get_frame_read(void);
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+/* This is a debug tool used to detect bitstream error. On encoder side, it
+ * pushes each bit and probability into a queue before the bit is written into
+ * the Arithmetic coder. On decoder side, whenever a bit is read out from the
+ * Arithmetic coder, it pops out the reference bit and probability from the
+ * queue as well. If the two results do not match, this debug tool will report
+ * an error.  This tool can be used to pin down the bitstream error precisely.
+ * By combining gdb's backtrace method, we can detect which module causes the
+ * bitstream error. */
+int bitstream_queue_get_write(void);
+int bitstream_queue_get_read(void);
+void bitstream_queue_record_write(void);
+void bitstream_queue_reset_write(void);
+void bitstream_queue_pop(int *result, int *prob);
+void bitstream_queue_push(int result, const int prob);
+void bitstream_queue_set_skip_write(int skip);
+void bitstream_queue_set_skip_read(int skip);
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+void mismatch_move_frame_idx_w(void);
+void mismatch_move_frame_idx_r(void);
+void mismatch_reset_frame(int num_planes);
+void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane,
+                               int pixel_c, int pixel_r, int blk_w, int blk_h,
+                               int highbd);
+void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane,
+                              int pixel_c, int pixel_r, int blk_w, int blk_h,
+                              int highbd);
+void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane,
+                              int pixel_c, int pixel_r, int blk_w, int blk_h,
+                              int highbd);
+void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane,
+                             int pixel_c, int pixel_r, int blk_w, int blk_h,
+                             int highbd);
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
new file mode 100644
index 0000000000..4ed32d4632
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h
@@ -0,0 +1,150 @@
+// Copyright 2024 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// pthread.h wrapper
+
+#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_
+#define VPX_VPX_UTIL_VPX_PTHREAD_H_
+
+#include "./vpx_config.h"
+
+#if CONFIG_MULTITHREAD
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+// Prevent leaking max/min macros.
+#undef NOMINMAX
+#define NOMINMAX
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <process.h>  // NOLINT
+#include <stddef.h>   // NOLINT
+#include <windows.h>  // NOLINT
+typedef HANDLE pthread_t;
+typedef SRWLOCK pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#if defined(__GNUC__) && \
+    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
+#else
+#define THREADFN unsigned int __stdcall
+#endif
+#define THREAD_EXIT_SUCCESS 0
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+                                 unsigned int(__stdcall *start)(void *),
+                                 void *arg) {
+  (void)attr;
+#ifdef USE_CREATE_THREAD
+  *thread = CreateThread(NULL,          /* lpThreadAttributes */
+                         0,             /* dwStackSize */
+                         start, arg, 0, /* dwStackSize */
+                         NULL);         /* lpThreadId */
+#else
+  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
+                                      0,             /* unsigned stack_size */
+                                      start, arg, 0, /* unsigned initflag */
+                                      NULL);         /* unsigned *thrdaddr */
+#endif
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  InitializeSRWLock(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  AcquireSRWLockExclusive(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  ReleaseSRWLockExclusive(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  (void)mutex;
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  (void)condition;
+  return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void *cond_attr) {
+  (void)cond_attr;
+  InitializeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  WakeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  WakeAllConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  const int ok = SleepConditionVariableSRW(condition, mutex, INFINITE, 0);
+  return !ok;
+}
+#else                 // _WIN32
+#include <pthread.h>  // NOLINT
+#define THREADFN void *
+#define THREAD_EXIT_SUCCESS NULL
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // CONFIG_MULTITHREAD
+
+#endif  // VPX_VPX_UTIL_VPX_PTHREAD_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c
index 04c5fb6f26..0d0e2f5766 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.c
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c
@@ -12,10 +12,18 @@
 // Original source:
 //  https://chromium.googlesource.com/webm/libwebp
 
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include <assert.h>
 #include <string.h>  // for memset()
+#include "./vpx_config.h"
 #include "./vpx_thread.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/vpx_pthread.h"
 
 #if CONFIG_MULTITHREAD
 
@@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker);  // Forward declaration.
 
 static THREADFN thread_loop(void *ptr) {
   VPxWorker *const worker = (VPxWorker *)ptr;
-  int done = 0;
-  while (!done) {
-    pthread_mutex_lock(&worker->impl_->mutex_);
-    while (worker->status_ == OK) {  // wait in idling mode
+#ifdef __APPLE__
+  if (worker->thread_name != NULL) {
+    // Apple's version of pthread_setname_np takes one argument and operates on
+    // the current thread only. The maximum size of the thread_name buffer was
+    // noted in the Chromium source code and was confirmed by experiments. If
+    // thread_name is too long, pthread_setname_np returns -1 with errno
+    // ENAMETOOLONG (63).
+    char thread_name[64];
+    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+    thread_name[sizeof(thread_name) - 1] = '\0';
+    pthread_setname_np(thread_name);
+  }
+#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__)
+  if (worker->thread_name != NULL) {
+    // Linux and Android require names (with nul) fit in 16 chars, otherwise
+    // pthread_setname_np() returns ERANGE (34).
+    char thread_name[16];
+    strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1);
+    thread_name[sizeof(thread_name) - 1] = '\0';
+    pthread_setname_np(pthread_self(), thread_name);
+  }
+#endif
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  for (;;) {
+    while (worker->status_ == VPX_WORKER_STATUS_OK) {  // wait in idling mode
       pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
-    if (worker->status_ == WORK) {
+    if (worker->status_ == VPX_WORKER_STATUS_WORKING) {
+      // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread
+      // doesn't change worker->status_ and will wait until the worker changes
+      // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the
+      // worker can safely call execute() without holding worker->impl_->mutex_.
+      // When the worker reacquires worker->impl_->mutex_, worker->status_ must
+      // still be VPX_WORKER_STATUS_WORKING.
+      pthread_mutex_unlock(&worker->impl_->mutex_);
       execute(worker);
-      worker->status_ = OK;
-    } else if (worker->status_ == NOT_OK) {  // finish the worker
-      done = 1;
+      pthread_mutex_lock(&worker->impl_->mutex_);
+      assert(worker->status_ == VPX_WORKER_STATUS_WORKING);
+      worker->status_ = VPX_WORKER_STATUS_OK;
+      // signal to the main thread that we're done (for sync())
+      pthread_cond_signal(&worker->impl_->condition_);
+    } else {
+      assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);  // finish the worker
+      break;
     }
-    // signal to the main thread that we're done (for sync())
-    pthread_cond_signal(&worker->impl_->condition_);
-    pthread_mutex_unlock(&worker->impl_->mutex_);
   }
-  return THREAD_RETURN(NULL);  // Thread is finished
+  pthread_mutex_unlock(&worker->impl_->mutex_);
+  return THREAD_EXIT_SUCCESS;  // Thread is finished
 }
 
 // main thread state control
@@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
   if (worker->impl_ == NULL) return;
 
   pthread_mutex_lock(&worker->impl_->mutex_);
-  if (worker->status_ >= OK) {
+  if (worker->status_ >= VPX_WORKER_STATUS_OK) {
     // wait for the worker to finish
-    while (worker->status_ != OK) {
+    while (worker->status_ != VPX_WORKER_STATUS_OK) {
       pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
     }
     // assign new status and release the working thread if needed
-    if (new_status != OK) {
+    if (new_status != VPX_WORKER_STATUS_OK) {
       worker->status_ = new_status;
       pthread_cond_signal(&worker->impl_->condition_);
     }
@@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) {
 
 static void init(VPxWorker *const worker) {
   memset(worker, 0, sizeof(*worker));
-  worker->status_ = NOT_OK;
+  worker->status_ = VPX_WORKER_STATUS_NOT_OK;
 }
 
 static int sync(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  change_state(worker, OK);
+  change_state(worker, VPX_WORKER_STATUS_OK);
 #endif
-  assert(worker->status_ <= OK);
+  assert(worker->status_ <= VPX_WORKER_STATUS_OK);
   return !worker->had_error;
 }
 
 static int reset(VPxWorker *const worker) {
   int ok = 1;
   worker->had_error = 0;
-  if (worker->status_ < OK) {
+  if (worker->status_ < VPX_WORKER_STATUS_OK) {
 #if CONFIG_MULTITHREAD
     worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_));
     if (worker->impl_ == NULL) {
@@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) {
     }
     pthread_mutex_lock(&worker->impl_->mutex_);
     ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
-    if (ok) worker->status_ = OK;
+    if (ok) worker->status_ = VPX_WORKER_STATUS_OK;
     pthread_mutex_unlock(&worker->impl_->mutex_);
     if (!ok) {
       pthread_mutex_destroy(&worker->impl_->mutex_);
@@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) {
       return 0;
     }
 #else
-    worker->status_ = OK;
+    worker->status_ = VPX_WORKER_STATUS_OK;
 #endif
-  } else if (worker->status_ > OK) {
+  } else if (worker->status_ > VPX_WORKER_STATUS_OK) {
     ok = sync(worker);
   }
-  assert(!ok || (worker->status_ == OK));
+  assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK));
   return ok;
 }
 
@@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) {
 
 static void launch(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
-  change_state(worker, WORK);
+  change_state(worker, VPX_WORKER_STATUS_WORKING);
 #else
   execute(worker);
 #endif
@@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) {
 static void end(VPxWorker *const worker) {
 #if CONFIG_MULTITHREAD
   if (worker->impl_ != NULL) {
-    change_state(worker, NOT_OK);
+    change_state(worker, VPX_WORKER_STATUS_NOT_OK);
     pthread_join(worker->impl_->thread_, NULL);
     pthread_mutex_destroy(&worker->impl_->mutex_);
     pthread_cond_destroy(&worker->impl_->condition_);
@@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) {
     worker->impl_ = NULL;
   }
 #else
-  worker->status_ = NOT_OK;
+  worker->status_ = VPX_WORKER_STATUS_NOT_OK;
   assert(worker->impl_ == NULL);
 #endif
-  assert(worker->status_ == NOT_OK);
+  assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK);
 }
 
 //------------------------------------------------------------------------------
diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h
index 53a5f4966a..22051eae41 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_thread.h
+++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h
@@ -12,350 +12,23 @@
 // Original source:
 //  https://chromium.googlesource.com/webm/libwebp
 
-#ifndef VPX_THREAD_H_
-#define VPX_THREAD_H_
-
-#include "./vpx_config.h"
+#ifndef VPX_VPX_UTIL_VPX_THREAD_H_
+#define VPX_VPX_UTIL_VPX_THREAD_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-// Set maximum decode threads to be 8 due to the limit of frame buffers
-// and not enough semaphores in the emulation layer on windows.
-#define MAX_DECODE_THREADS 8
-
-#if CONFIG_MULTITHREAD
-
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-#include <errno.h>    // NOLINT
-#include <process.h>  // NOLINT
-#include <windows.h>  // NOLINT
-typedef HANDLE pthread_t;
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-#define USE_WINDOWS_CONDITION_VARIABLE
-typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct {
-  HANDLE waiting_sem_;
-  HANDLE received_sem_;
-  HANDLE signal_event_;
-} pthread_cond_t;
-#endif  // _WIN32_WINNT >= 0x600
-
-#ifndef WINAPI_FAMILY_PARTITION
-#define WINAPI_PARTITION_DESKTOP 1
-#define WINAPI_FAMILY_PARTITION(x) x
-#endif
-
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define USE_CREATE_THREAD
-#endif
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-// _beginthreadex requires __stdcall
-#if defined(__GNUC__) && \
-    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
-#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall
-#else
-#define THREADFN unsigned int __stdcall
-#endif
-#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
-
-#if _WIN32_WINNT >= 0x0501  // Windows XP or greater
-#define WaitForSingleObject(obj, timeout) \
-  WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
-#endif
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
-                                 unsigned int(__stdcall *start)(void *),
-                                 void *arg) {
-  (void)attr;
-#ifdef USE_CREATE_THREAD
-  *thread = CreateThread(NULL,          /* lpThreadAttributes */
-                         0,             /* dwStackSize */
-                         start, arg, 0, /* dwStackSize */
-                         NULL);         /* lpThreadId */
-#else
-  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
-                                      0,             /* unsigned stack_size */
-                                      start, arg, 0, /* unsigned initflag */
-                                      NULL);         /* unsigned *thrdaddr */
-#endif
-  if (*thread == NULL) return 1;
-  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
-  return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
-  (void)value_ptr;
-  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
-          CloseHandle(thread) == 0);
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
-                                     void *mutexattr) {
-  (void)mutexattr;
-#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
-  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
-#else
-  InitializeCriticalSection(mutex);
-#endif
-  return 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
-  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
-  EnterCriticalSection(mutex);
-  return 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
-  LeaveCriticalSection(mutex);
-  return 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
-  DeleteCriticalSection(mutex);
-  return 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  (void)condition;
-#else
-  ok &= (CloseHandle(condition->waiting_sem_) != 0);
-  ok &= (CloseHandle(condition->received_sem_) != 0);
-  ok &= (CloseHandle(condition->signal_event_) != 0);
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
-                                    void *cond_attr) {
-  (void)cond_attr;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  InitializeConditionVariable(condition);
-#else
-  condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
-  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
-  if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
-      condition->signal_event_ == NULL) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-#endif
-  return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  WakeConditionVariable(condition);
-#else
-  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
-    // a thread is waiting in pthread_cond_wait: allow it to be notified
-    ok = SetEvent(condition->signal_event_);
-    // wait until the event is consumed so the signaler cannot consume
-    // the event via its own pthread_cond_wait.
-    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
-           WAIT_OBJECT_0);
-  }
-#endif
-  return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
-                                    pthread_mutex_t *const mutex) {
-  int ok;
-#ifdef USE_WINDOWS_CONDITION_VARIABLE
-  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
-#else
-  // note that there is a consumer available so the signal isn't dropped in
-  // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
-  // now unlock the mutex so pthread_cond_signal may be issued
-  pthread_mutex_unlock(mutex);
-  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
-        WAIT_OBJECT_0);
-  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
-  pthread_mutex_lock(mutex);
-#endif
-  return !ok;
-}
-#elif defined(__OS2__)
-#define INCL_DOS
-#include <os2.h>  // NOLINT
-
-#include <errno.h>        // NOLINT
-#include <stdlib.h>       // NOLINT
-#include <sys/builtin.h>  // NOLINT
-
-#define pthread_t TID
-#define pthread_mutex_t HMTX
-
-typedef struct {
-  HEV event_sem_;
-  HEV ack_sem_;
-  volatile unsigned wait_count_;
-} pthread_cond_t;
-
-//------------------------------------------------------------------------------
-// simplistic pthread emulation layer
-
-#define THREADFN void *
-#define THREAD_RETURN(val) (val)
-
-typedef struct {
-  void *(*start_)(void *);
-  void *arg_;
-} thread_arg;
-
-static void thread_start(void *arg) {
-  thread_arg targ = *(thread_arg *)arg;
-  free(arg);
-
-  targ.start_(targ.arg_);
-}
-
-static INLINE int pthread_create(pthread_t *const thread, const void *attr,
-                                 void *(*start)(void *), void *arg) {
-  int tid;
-  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
-  if (targ == NULL) return 1;
-
-  (void)attr;
-
-  targ->start_ = start;
-  targ->arg_ = arg;
-  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
-  if (tid == -1) {
-    free(targ);
-    return 1;
-  }
-
-  *thread = tid;
-  return 0;
-}
-
-static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
-  (void)value_ptr;
-  return DosWaitThread(&thread, DCWW_WAIT) != 0;
-}
-
-// Mutex
-static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
-                                     void *mutexattr) {
-  (void)mutexattr;
-  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
-}
-
-static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
-  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
-}
-
-static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
-  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
-}
-
-static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
-  return DosReleaseMutexSem(*mutex) != 0;
-}
-
-static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
-  return DosCloseMutexSem(*mutex) != 0;
-}
-
-// Condition
-static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
-  int ok = 1;
-  ok &= DosCloseEventSem(condition->event_sem_) == 0;
-  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
-  return !ok;
-}
-
-static INLINE int pthread_cond_init(pthread_cond_t *const condition,
-                                    void *cond_attr) {
-  int ok = 1;
-  (void)cond_attr;
-
-  ok &=
-      DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
-  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
-  if (!ok) {
-    pthread_cond_destroy(condition);
-    return 1;
-  }
-  condition->wait_count_ = 0;
-  return 0;
-}
-
-static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
-  int ok = 1;
-
-  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
-    ok &= DosPostEventSem(condition->event_sem_) == 0;
-    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
-  }
-
-  return !ok;
-}
-
-static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
-  int ok = 1;
-
-  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
-    ok &= pthread_cond_signal(condition) == 0;
-
-  return !ok;
-}
-
-static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
-                                    pthread_mutex_t *const mutex) {
-  int ok = 1;
-
-  __atomic_increment(&condition->wait_count_);
-
-  ok &= pthread_mutex_unlock(mutex) == 0;
-
-  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
-
-  __atomic_decrement(&condition->wait_count_);
-
-  ok &= DosPostEventSem(condition->ack_sem_) == 0;
-
-  pthread_mutex_lock(mutex);
-
-  return !ok;
-}
-#else                 // _WIN32
-#include <pthread.h>  // NOLINT
-#define THREADFN void *
-#define THREAD_RETURN(val) val
-#endif
-
-#endif  // CONFIG_MULTITHREAD
-
 // State of the worker thread object
 typedef enum {
-  NOT_OK = 0,  // object is unusable
-  OK,          // ready to work
-  WORK         // busy finishing the current task
+  VPX_WORKER_STATUS_NOT_OK = 0,  // object is unusable
+  VPX_WORKER_STATUS_OK,          // ready to work
+  VPX_WORKER_STATUS_WORKING      // busy finishing the current task
 } VPxWorkerStatus;
 
 // Function to be called by the worker thread. Takes two opaque pointers as
-// arguments (data1 and data2), and should return false in case of error.
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
 typedef int (*VPxWorkerHook)(void *, void *);
 
 // Platform-dependent implementation details for the worker.
@@ -365,10 +38,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl;
 typedef struct {
   VPxWorkerImpl *impl_;
   VPxWorkerStatus status_;
+  // Thread name for the debugger. If not NULL, must point to a string that
+  // outlives the worker thread. For portability, use a name <= 15 characters
+  // long (not including the terminating NUL character).
+  const char *thread_name;
   VPxWorkerHook hook;  // hook to call
   void *data1;         // first argument passed to 'hook'
   void *data2;         // second argument passed to 'hook'
-  int had_error;       // return value of the last call to 'hook'
+  int had_error;       // true if a call to 'hook' returned false
 } VPxWorker;
 
 // The interface for all thread-worker related functions. All these functions
@@ -412,4 +89,4 @@ const VPxWorkerInterface *vpx_get_worker_interface(void);
 }  // extern "C"
 #endif
 
-#endif  // VPX_THREAD_H_
+#endif  // VPX_VPX_UTIL_VPX_THREAD_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_timestamp.h b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h
new file mode 100644
index 0000000000..5296458fad
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_TIMESTAMP_H_
+#define VPX_VPX_UTIL_VPX_TIMESTAMP_H_
+
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Rational Number with an int64 numerator
+typedef struct vpx_rational64 {
+  int64_t num;       // fraction numerator
+  int den;           // fraction denominator
+} vpx_rational64_t;  // alias for struct vpx_rational64_t
+
+static INLINE int gcd(int64_t a, int b) {
+  int r;  // remainder
+  assert(a >= 0);
+  assert(b > 0);
+  while (b != 0) {
+    r = (int)(a % b);
+    a = b;
+    b = r;
+  }
+
+  return (int)a;
+}
+
+static INLINE void reduce_ratio(vpx_rational64_t *ratio) {
+  const int denom = gcd(ratio->num, ratio->den);
+  ratio->num /= denom;
+  ratio->den /= denom;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // VPX_VPX_UTIL_VPX_TIMESTAMP_H_
diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk
index c0ef8d3362..948e6d6f89 100644
--- a/media/libvpx/libvpx/vpx_util/vpx_util.mk
+++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk
@@ -8,7 +8,14 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
+UTIL_SRCS-yes += vpx_atomics.h
 UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_pthread.h
 UTIL_SRCS-yes += vpx_thread.c
 UTIL_SRCS-yes += vpx_thread.h
 UTIL_SRCS-yes += endian_inl.h
+UTIL_SRCS-yes += vpx_write_yuv_frame.h
+UTIL_SRCS-yes += vpx_write_yuv_frame.c
+UTIL_SRCS-yes += vpx_timestamp.h
+UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.h
+UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.c
diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c
new file mode 100644
index 0000000000..4ef57a2fee
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \
+    defined(OUTPUT_YUV_SKINMAP) || defined(OUTPUT_YUV_SVC_SRC)
+
+  unsigned char *src = s->y_buffer;
+  int h = s->y_crop_height;
+
+  do {
+    fwrite(src, s->y_width, 1, yuv_file);
+    src += s->y_stride;
+  } while (--h);
+
+  src = s->u_buffer;
+  h = s->uv_crop_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+  src = s->v_buffer;
+  h = s->uv_crop_height;
+
+  do {
+    fwrite(src, s->uv_width, 1, yuv_file);
+    src += s->uv_stride;
+  } while (--h);
+
+#else
+  (void)yuv_file;
+  (void)s;
+#endif
+}
diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h
new file mode 100644
index 0000000000..ce1102458e
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
+#define VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
+
+#include <stdio.h>
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_
diff --git a/media/libvpx/libvpx/vpx_version.h b/media/libvpx/libvpx/vpx_version.h
new file mode 100644
index 0000000000..ba9b63a4a3
--- /dev/null
+++ b/media/libvpx/libvpx/vpx_version.h
@@ -0,0 +1,11 @@
+// This file is generated. Do not edit.
+#ifndef VPX_VERSION_H_
+#define VPX_VERSION_H_
+#define VERSION_MAJOR  1
+#define VERSION_MINOR  16
+#define VERSION_PATCH  0
+#define VERSION_EXTRA  ""
+#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
+#define VERSION_STRING_NOSP "v1.16.0"
+#define VERSION_STRING      " v1.16.0"
+#endif  // VPX_VERSION_H_
diff --git a/media/libvpx/libvpx/vpxdec.c b/media/libvpx/libvpx/vpxdec.c
index 2cdb69d5a3..bfe6c1d6ba 100644
--- a/media/libvpx/libvpx/vpxdec.c
+++ b/media/libvpx/libvpx/vpxdec.c
@@ -47,6 +47,8 @@ struct VpxDecInputContext {
   struct WebmInputContext *webm_ctx;
 };
 
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
 static const arg_def_t looparg =
     ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
 static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
@@ -75,7 +77,7 @@ static const arg_def_t outputfile =
 static const arg_def_t threadsarg =
     ARG_DEF("t", "threads", 1, "Max threads to use");
 static const arg_def_t frameparallelarg =
-    ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode");
+    ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode (ignored)");
 static const arg_def_t verbosearg =
     ARG_DEF("v", "verbose", 0, "Show version string");
 static const arg_def_t error_concealment =
@@ -94,17 +96,43 @@ static const arg_def_t outbitdeptharg =
 #endif
 static const arg_def_t svcdecodingarg = ARG_DEF(
     NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer");
+static const arg_def_t framestatsarg =
+    ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
+static const arg_def_t rowmtarg =
+    ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9");
+static const arg_def_t lpfoptarg =
+    ARG_DEF(NULL, "lpf-opt", 1,
+            "Do loopfilter without waiting for all threads to sync.");
 
-static const arg_def_t *all_args[] = {
-  &codecarg,       &use_yv12,    &use_i420,   &flipuvarg,         &rawvideo,
-  &noblitarg,      &progressarg, &limitarg,   &skiparg,           &postprocarg,
-  &summaryarg,     &outputfile,  &threadsarg, &frameparallelarg,  &verbosearg,
-  &scalearg,       &fb_arg,      &md5arg,     &error_concealment, &continuearg,
+static const arg_def_t *all_args[] = { &help,
+                                       &codecarg,
+                                       &use_yv12,
+                                       &use_i420,
+                                       &flipuvarg,
+                                       &rawvideo,
+                                       &noblitarg,
+                                       &progressarg,
+                                       &limitarg,
+                                       &skiparg,
+                                       &postprocarg,
+                                       &summaryarg,
+                                       &outputfile,
+                                       &threadsarg,
+                                       &frameparallelarg,
+                                       &verbosearg,
+                                       &scalearg,
+                                       &fb_arg,
+                                       &md5arg,
+                                       &error_concealment,
+                                       &continuearg,
 #if CONFIG_VP9_HIGHBITDEPTH
-  &outbitdeptharg,
+                                       &outbitdeptharg,
 #endif
-  &svcdecodingarg, NULL
-};
+                                       &svcdecodingarg,
+                                       &framestatsarg,
+                                       &rowmtarg,
+                                       &lpfoptarg,
+                                       NULL };
 
 #if CONFIG_VP8_DECODER
 static const arg_def_t addnoise_level =
@@ -147,41 +175,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
                    dst->d_h, mode);
 }
 #endif
-
-void usage_exit(void) {
+static void show_help(FILE *fout, int shorthelp) {
   int i;
 
-  fprintf(stderr,
-          "Usage: %s <options> filename\n\n"
-          "Options:\n",
-          exec_name);
-  arg_show_usage(stderr, all_args);
+  fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "Options:\n");
+  arg_show_usage(fout, all_args);
 #if CONFIG_VP8_DECODER
-  fprintf(stderr, "\nVP8 Postprocessing Options:\n");
-  arg_show_usage(stderr, vp8_pp_args);
+  fprintf(fout, "\nVP8 Postprocessing Options:\n");
+  arg_show_usage(fout, vp8_pp_args);
 #endif
-  fprintf(stderr,
+  fprintf(fout,
           "\nOutput File Patterns:\n\n"
           "  The -o argument specifies the name of the file(s) to "
           "write to. If the\n  argument does not include any escape "
           "characters, the output will be\n  written to a single file. "
           "Otherwise, the filename will be calculated by\n  expanding "
           "the following escape characters:\n");
-  fprintf(stderr,
+  fprintf(fout,
           "\n\t%%w   - Frame width"
           "\n\t%%h   - Frame height"
           "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
           "\n\n  Pattern arguments are only supported in conjunction "
           "with the --yv12 and\n  --i420 options. If the -o option is "
           "not specified, the output will be\n  directed to stdout.\n");
-  fprintf(stderr, "\nIncluded decoders:\n\n");
+  fprintf(fout, "\nIncluded decoders:\n\n");
 
   for (i = 0; i < get_vpx_decoder_count(); ++i) {
     const VpxInterface *const decoder = get_vpx_decoder_by_index(i);
-    fprintf(stderr, "    %-6s - %s\n", decoder->name,
+    fprintf(fout, "    %-6s - %s\n", decoder->name,
             vpx_codec_iface_name(decoder->codec_interface()));
   }
+}
 
+void usage_exit(void) {
+  show_help(stderr, 1);
   exit(EXIT_FAILURE);
 }
 
@@ -225,13 +259,14 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
       return 1;
     }
     *bytes_read = frame_size;
+    return 0;
   }
 
-  return 0;
+  return 1;
 }
 
-static int read_frame(struct VpxDecInputContext *input, uint8_t **buf,
-                      size_t *bytes_in_buffer, size_t *buffer_size) {
+static int dec_read_frame(struct VpxDecInputContext *input, uint8_t **buf,
+                          size_t *bytes_in_buffer, size_t *buffer_size) {
   switch (input->vpx_input_ctx->file_type) {
 #if CONFIG_WEBM_IO
     case FILE_TYPE_WEBM:
@@ -411,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len,
         case '7': snprintf(q, q_len - 1, "%07d", frame_in); break;
         case '8': snprintf(q, q_len - 1, "%08d", frame_in); break;
         case '9': snprintf(q, q_len - 1, "%09d", frame_in); break;
-        default: die("Unrecognized pattern %%%c\n", p[1]); break;
+        default: die("Unrecognized pattern %%%c\n", p[1]);
       }
 
       pat_len = strlen(q);
@@ -488,11 +523,13 @@ static int main_loop(int argc, const char **argv_) {
   size_t bytes_in_buffer = 0, buffer_size = 0;
   FILE *infile;
   int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
-  int do_md5 = 0, progress = 0, frame_parallel = 0;
+  int do_md5 = 0, progress = 0;
   int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
   int arg_skip = 0;
   int ec_enabled = 0;
   int keep_going = 0;
+  int enable_row_mt = 0;
+  int enable_lpf_opt = 0;
   const VpxInterface *interface = NULL;
   const VpxInterface *fourcc_interface = NULL;
   uint64_t dx_time = 0;
@@ -527,6 +564,8 @@ static int main_loop(int argc, const char **argv_) {
   char outfile_name[PATH_MAX] = { 0 };
   FILE *outfile = NULL;
 
+  FILE *framestats_file = NULL;
+
   MD5Context md5_ctx;
   unsigned char md5_digest[16];
 
@@ -542,12 +581,18 @@ static int main_loop(int argc, const char **argv_) {
   /* Parse command line */
   exec_name = argv_[0];
   argv = argv_dup(argc - 1, argv_ + 1);
-
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &codecarg, argi)) {
+    if (arg_match(&arg, &help, argi)) {
+      show_help(stdout, 0);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &codecarg, argi)) {
       interface = get_vpx_decoder_by_name(arg.val);
       if (!interface)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -584,8 +629,9 @@ static int main_loop(int argc, const char **argv_) {
     else if (arg_match(&arg, &threadsarg, argi))
       cfg.threads = arg_parse_uint(&arg);
 #if CONFIG_VP9_DECODER
-    else if (arg_match(&arg, &frameparallelarg, argi))
-      frame_parallel = 1;
+    else if (arg_match(&arg, &frameparallelarg, argi)) {
+      /* ignored for compatibility */
+    }
 #endif
     else if (arg_match(&arg, &verbosearg, argi))
       quiet = 0;
@@ -603,6 +649,16 @@ static int main_loop(int argc, const char **argv_) {
     else if (arg_match(&arg, &svcdecodingarg, argi)) {
       svc_decoding = 1;
       svc_spatial_layer = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &framestatsarg, argi)) {
+      framestats_file = fopen(arg.val, "w");
+      if (!framestats_file) {
+        die("Error: Could not open --framestats file (%s) for writing.\n",
+            arg.val);
+      }
+    } else if (arg_match(&arg, &rowmtarg, argi)) {
+      enable_row_mt = arg_parse_uint(&arg);
+    } else if (arg_match(&arg, &lpfoptarg, argi)) {
+      enable_lpf_opt = arg_parse_uint(&arg);
     }
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
@@ -637,6 +693,7 @@ static int main_loop(int argc, const char **argv_) {
 
   if (!fn) {
     free(argv);
+    fprintf(stderr, "No input file specified!\n");
     usage_exit();
   }
   /* Open file */
@@ -668,6 +725,7 @@ static int main_loop(int argc, const char **argv_) {
 #if !CONFIG_WEBM_IO
     fprintf(stderr, "vpxdec was built without WebM container support.\n");
 #endif
+    free(argv);
     return EXIT_FAILURE;
   }
 
@@ -712,8 +770,7 @@ static int main_loop(int argc, const char **argv_) {
   if (!interface) interface = get_vpx_decoder_by_index(0);
 
   dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) |
-              (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) |
-              (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0);
+              (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0);
   if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg,
                          dec_flags)) {
     fprintf(stderr, "Failed to initialize decoder: %s\n",
@@ -728,6 +785,18 @@ static int main_loop(int argc, const char **argv_) {
       goto fail;
     }
   }
+  if (interface->fourcc == VP9_FOURCC &&
+      vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) {
+    fprintf(stderr, "Failed to set decoder in row multi-thread mode: %s\n",
+            vpx_codec_error(&decoder));
+    goto fail;
+  }
+  if (interface->fourcc == VP9_FOURCC &&
+      vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) {
+    fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n",
+            vpx_codec_error(&decoder));
+    goto fail;
+  }
   if (!quiet) fprintf(stderr, "%s\n", decoder.name);
 
 #if CONFIG_VP8_DECODER
@@ -741,7 +810,7 @@ static int main_loop(int argc, const char **argv_) {
 
   if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
   while (arg_skip) {
-    if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
+    if (dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
     arg_skip--;
   }
 
@@ -749,6 +818,10 @@ static int main_loop(int argc, const char **argv_) {
     ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
     ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
         num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+    if (!ext_fb_list.ext_fb) {
+      fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n");
+      goto fail;
+    }
     if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer,
                                              release_vp9_frame_buffer,
                                              &ext_fb_list)) {
@@ -761,6 +834,8 @@ static int main_loop(int argc, const char **argv_) {
   frame_avail = 1;
   got_data = 0;
 
+  if (framestats_file) fprintf(framestats_file, "bytes,qp\n");
+
   /* Decode file */
   while (frame_avail || got_data) {
     vpx_codec_iter_t iter = NULL;
@@ -770,7 +845,7 @@ static int main_loop(int argc, const char **argv_) {
 
     frame_avail = 0;
     if (!stop_after || frame_in < stop_after) {
-      if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
+      if (!dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
         frame_avail = 1;
         frame_in++;
 
@@ -786,6 +861,16 @@ static int main_loop(int argc, const char **argv_) {
           if (!keep_going) goto fail;
         }
 
+        if (framestats_file) {
+          int qp;
+          if (vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &qp)) {
+            warn("Failed VPXD_GET_LAST_QUANTIZER: %s",
+                 vpx_codec_error(&decoder));
+            if (!keep_going) goto fail;
+          }
+          fprintf(framestats_file, "%d,%d\n", (int)bytes_in_buffer, qp);
+        }
+
         vpx_usec_timer_mark(&timer);
         dx_time += vpx_usec_timer_elapsed(&timer);
       } else {
@@ -815,7 +900,7 @@ static int main_loop(int argc, const char **argv_) {
     vpx_usec_timer_mark(&timer);
     dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
 
-    if (!frame_parallel && !corrupted &&
+    if (!corrupted &&
         vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
       warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
       if (!keep_going) goto fail;
@@ -852,6 +937,11 @@ static int main_loop(int argc, const char **argv_) {
           }
           scaled_img =
               vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+          if (!scaled_img) {
+            fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n",
+                    render_width, render_height);
+            goto fail;
+          }
           scaled_img->bit_depth = img->bit_depth;
         }
 
@@ -888,6 +978,10 @@ static int main_loop(int argc, const char **argv_) {
         if (!img_shifted) {
           img_shifted =
               vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+          if (!img_shifted) {
+            fprintf(stderr, "Failed to allocate image\n");
+            goto fail;
+          }
           img_shifted->bit_depth = output_bit_depth;
         }
         if (output_bit_depth > img->bit_depth) {
@@ -902,7 +996,7 @@ static int main_loop(int argc, const char **argv_) {
 
       if (single_file) {
         if (use_y4m) {
-          char buf[Y4M_BUFFER_SIZE] = { 0 };
+          char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
           size_t len = 0;
           if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) {
             fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n");
@@ -911,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) {
           if (frame_out == 1) {
             // Y4M file header
             len = y4m_write_file_header(
-                buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height,
-                &vpx_input_ctx.framerate, img->fmt, img->bit_depth);
+                y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width,
+                vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt,
+                img->bit_depth);
             if (do_md5) {
-              MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
+              MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
             } else {
-              fputs(buf, outfile);
+              fputs(y4m_buf, outfile);
             }
           }
 
           // Y4M frame header
-          len = y4m_write_frame_header(buf, sizeof(buf));
+          len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
           if (do_md5) {
-            MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len);
+            MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
           } else {
-            fputs(buf, outfile);
+            fputs(y4m_buf, outfile);
           }
         } else {
           if (frame_out == 1) {
@@ -952,7 +1047,7 @@ static int main_loop(int argc, const char **argv_) {
         if (do_md5) {
           update_image_md5(img, planes, &md5_ctx);
         } else {
-          write_image_file(img, planes, outfile);
+          if (!corrupted) write_image_file(img, planes, outfile);
         }
       } else {
         generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
@@ -1018,6 +1113,8 @@ fail2:
   free(ext_fb_list.ext_fb);
 
   fclose(infile);
+  if (framestats_file) fclose(framestats_file);
+
   free(argv);
 
   return ret;
@@ -1030,6 +1127,10 @@ int main(int argc, const char **argv_) {
   int error = 0;
 
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
diff --git a/media/libvpx/libvpx/vpxenc.c b/media/libvpx/libvpx/vpxenc.c
index a0f760574c..bf5f1ad790 100644
--- a/media/libvpx/libvpx/vpxenc.c
+++ b/media/libvpx/libvpx/vpxenc.c
@@ -50,12 +50,6 @@
 #endif
 #include "./y4minput.h"
 
-/* Swallow warnings about unused results of fread/fwrite */
-static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
-  return fread(ptr, size, nmemb, stream);
-}
-#define fread wrap_fread
-
 static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
                           FILE *stream) {
   return fwrite(ptr, size, nmemb, stream);
@@ -64,8 +58,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
 
 static const char *exec_name;
 
-static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
-                                   const char *s, va_list ap) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv(
+    vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) {
   if (ctx->err) {
     const char *detail = vpx_codec_error_detail(ctx);
 
@@ -78,7 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
   }
 }
 
-static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) {
+static VPX_TOOLS_FORMAT_PRINTF(2,
+                               3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx,
+                                                         const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -86,8 +82,8 @@ static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) {
   va_end(ap);
 }
 
-static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
-                                  const char *s, ...) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error(
+    vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -95,38 +91,14 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
   va_end(ap);
 }
 
-static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
-  FILE *f = input_ctx->file;
-  y4m_input *y4m = &input_ctx->y4m;
-  int shortread = 0;
-
-  if (input_ctx->file_type == FILE_TYPE_Y4M) {
-    if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
-  } else {
-    shortread = read_yuv_frame(input_ctx, img);
-  }
-
-  return !shortread;
-}
-
-static int file_is_y4m(const char detect[4]) {
-  if (memcmp(detect, "YUV4", 4) == 0) {
-    return 1;
-  }
-  return 0;
-}
-
-static int fourcc_is_ivf(const char detect[4]) {
-  if (memcmp(detect, "DKIF", 4) == 0) {
-    return 1;
-  }
-  return 0;
-}
-
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
 static const arg_def_t debugmode =
     ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)");
 static const arg_def_t outputfile =
     ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t use_nv12 =
+    ARG_DEF(NULL, "nv12", 0, "Input file is NV12 ");
 static const arg_def_t use_yv12 =
     ARG_DEF(NULL, "yv12", 0, "Input file is YV12 ");
 static const arg_def_t use_i420 =
@@ -144,10 +116,6 @@ static const arg_def_t pass_arg =
     ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
 static const arg_def_t fpf_name =
     ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
-#if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name =
-    ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name");
-#endif
 static const arg_def_t limit =
     ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
 static const arg_def_t skip =
@@ -199,7 +167,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF(
     NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
 #endif
 
-static const arg_def_t *main_args[] = { &debugmode,
+static const arg_def_t *main_args[] = { &help,
+                                        &debugmode,
                                         &outputfile,
                                         &codecarg,
                                         &passes,
@@ -251,7 +220,8 @@ static const arg_def_t error_resilient =
 static const arg_def_t lag_in_frames =
     ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
 
-static const arg_def_t *global_args[] = { &use_yv12,
+static const arg_def_t *global_args[] = { &use_nv12,
+                                          &use_yv12,
                                           &use_i420,
                                           &use_i422,
                                           &use_i444,
@@ -315,14 +285,75 @@ static const arg_def_t *rc_args[] = {
   &buf_sz,           &buf_initial_sz,     &buf_optimal_sz, NULL
 };
 
+#if CONFIG_VP9_ENCODER
+static const arg_def_t use_vizier_rc_params =
+    ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params");
+static const arg_def_t active_wq_factor =
+    ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor");
+static const arg_def_t err_per_mb_factor =
+    ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor");
+static const arg_def_t sr_default_decay_limit = ARG_DEF(
+    NULL, "sr-default-decay-limit", 1, "Second reference default decay limit");
+static const arg_def_t sr_diff_factor =
+    ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor");
+static const arg_def_t kf_err_per_mb_factor = ARG_DEF(
+    NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor");
+static const arg_def_t kf_frame_min_boost_factor =
+    ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost");
+static const arg_def_t kf_frame_max_boost_first_factor =
+    ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1,
+            "Max keyframe boost adjustment factor for first frame");
+static const arg_def_t kf_frame_max_boost_subs_factor =
+    ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1,
+            "Max boost adjustment factor for subsequent KFs");
+static const arg_def_t kf_max_total_boost_factor = ARG_DEF(
+    NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor");
+static const arg_def_t gf_max_total_boost_factor =
+    ARG_DEF(NULL, "gf-max-total-boost-factor", 1,
+            "Golden frame max total boost factor");
+static const arg_def_t gf_frame_max_boost_factor =
+    ARG_DEF(NULL, "gf-frame-max-boost-factor", 1,
+            "Golden frame max per frame boost factor");
+static const arg_def_t zm_factor =
+    ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor");
+static const arg_def_t rd_mult_inter_qp_fac =
+    ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1,
+            "RD multiplier adjustment for inter frames");
+static const arg_def_t rd_mult_arf_qp_fac =
+    ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1,
+            "RD multiplier adjustment for alt-ref frames");
+static const arg_def_t rd_mult_key_qp_fac = ARG_DEF(
+    NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames");
+static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params,
+                                             &active_wq_factor,
+                                             &err_per_mb_factor,
+                                             &sr_default_decay_limit,
+                                             &sr_diff_factor,
+                                             &kf_err_per_mb_factor,
+                                             &kf_frame_min_boost_factor,
+                                             &kf_frame_max_boost_first_factor,
+                                             &kf_frame_max_boost_subs_factor,
+                                             &kf_max_total_boost_factor,
+                                             &gf_max_total_boost_factor,
+                                             &gf_frame_max_boost_factor,
+                                             &zm_factor,
+                                             &rd_mult_inter_qp_fac,
+                                             &rd_mult_arf_qp_fac,
+                                             &rd_mult_key_qp_fac,
+                                             NULL };
+#endif
+
 static const arg_def_t bias_pct =
     ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
 static const arg_def_t minsection_pct =
     ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
 static const arg_def_t maxsection_pct =
     ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
+static const arg_def_t corpus_complexity =
+    ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint");
 static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct,
-                                              &maxsection_pct, NULL };
+                                              &maxsection_pct,
+                                              &corpus_complexity, NULL };
 
 static const arg_def_t kf_min_dist =
     ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
@@ -336,19 +367,19 @@ static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled,
 static const arg_def_t noise_sens =
     ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
 static const arg_def_t sharpness =
-    ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
+    ARG_DEF(NULL, "sharpness", 1,
+            "Increase sharpness at the expense of lower PSNR. (0..7)");
 static const arg_def_t static_thresh =
     ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold");
-static const arg_def_t auto_altref =
-    ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
 static const arg_def_t arnr_maxframes =
     ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
 static const arg_def_t arnr_strength =
     ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
-static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type");
-static const struct arg_enum_list tuning_enum[] = {
-  { "psnr", VP8_TUNE_PSNR }, { "ssim", VP8_TUNE_SSIM }, { NULL, 0 }
-};
+static const arg_def_t arnr_type =
+    ARG_DEF(NULL, "arnr-type", 1, "AltRef filter type (1..3)");
+static const struct arg_enum_list tuning_enum[] = { { "psnr", VP8_TUNE_PSNR },
+                                                    { "ssim", VP8_TUNE_SSIM },
+                                                    { NULL, 0 } };
 static const arg_def_t tune_ssim =
     ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum);
 static const arg_def_t cq_level =
@@ -361,12 +392,14 @@ static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
 #if CONFIG_VP8_ENCODER
 static const arg_def_t cpu_used_vp8 =
     ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-16..16)");
+static const arg_def_t auto_altref_vp8 = ARG_DEF(
+    NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames. (0..1)");
 static const arg_def_t token_parts =
     ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2");
 static const arg_def_t screen_content_mode =
     ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode");
 static const arg_def_t *vp8_args[] = { &cpu_used_vp8,
-                                       &auto_altref,
+                                       &auto_altref_vp8,
                                        &noise_sens,
                                        &sharpness,
                                        &static_thresh,
@@ -399,12 +432,22 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
 
 #if CONFIG_VP9_ENCODER
 static const arg_def_t cpu_used_vp9 =
-    ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-8..8)");
+    ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-9..9)");
+static const arg_def_t auto_altref_vp9 = ARG_DEF(
+    NULL, "auto-alt-ref", 1,
+    "Enable automatic alt reference frames, 2+ enables multi-layer. (0..6)");
 static const arg_def_t tile_cols =
     ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
 static const arg_def_t tile_rows =
     ARG_DEF(NULL, "tile-rows", 1,
             "Number of tile rows to use, log2 (set to 0 while threads > 1)");
+
+static const arg_def_t enable_tpl_model =
+    ARG_DEF(NULL, "enable-tpl", 1, "Enable temporal dependency model");
+static const arg_def_t enable_keyframe_filtering =
+    ARG_DEF(NULL, "enable-keyframe-filtering", 1,
+            "Enable key frame temporal filtering (0: off (default), 1: on)");
+
 static const arg_def_t lossless =
     ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
 static const arg_def_t frame_parallel_decoding = ARG_DEF(
@@ -441,8 +484,8 @@ static const struct arg_enum_list color_space_enum[] = {
 };
 
 static const arg_def_t input_color_space =
-    ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:",
-                 color_space_enum);
+    ARG_DEF_ENUM(NULL, "color-space", 1,
+                 "The color space of input content:", color_space_enum);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -460,6 +503,7 @@ static const arg_def_t inbitdeptharg =
 static const struct arg_enum_list tune_content_enum[] = {
   { "default", VP9E_CONTENT_DEFAULT },
   { "screen", VP9E_CONTENT_SCREEN },
+  { "film", VP9E_CONTENT_FILM },
   { NULL, 0 }
 };
 
@@ -468,17 +512,39 @@ static const arg_def_t tune_content = ARG_DEF_ENUM(
 
 static const arg_def_t target_level = ARG_DEF(
     NULL, "target-level", 1,
-    "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
-    " 11: level 1.1; ... 62: level 6.2)");
+    "Target level\n"
+    "                                        255: off (default)\n"
+    "                                          0: only keep level stats\n"
+    "                                          1: adaptively set alt-ref "
+    "distance and column tile limit based on picture size, and keep"
+    " level stats\n"
+    "                                         10: level 1.0  11: level 1.1  "
+    "...  62: level 6.2");
+
+static const arg_def_t row_mt =
+    ARG_DEF(NULL, "row-mt", 1,
+            "Enable row based non-deterministic multi-threading in VP9");
+
+static const arg_def_t disable_loopfilter =
+    ARG_DEF(NULL, "disable-loopfilter", 1,
+            "Control Loopfilter in VP9:\n"
+            "                                          "
+            "0: Loopfilter on for all frames (default)\n"
+            "                                          "
+            "1: Loopfilter off for non reference frames\n"
+            "                                          "
+            "2: Loopfilter off for all frames");
 #endif
 
 #if CONFIG_VP9_ENCODER
 static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
-                                       &auto_altref,
+                                       &auto_altref_vp9,
                                        &sharpness,
                                        &static_thresh,
                                        &tile_cols,
                                        &tile_rows,
+                                       &enable_tpl_model,
+                                       &enable_keyframe_filtering,
                                        &arnr_maxframes,
                                        &arnr_strength,
                                        &arnr_type,
@@ -498,6 +564,11 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
                                        &min_gf_interval,
                                        &max_gf_interval,
                                        &target_level,
+                                       &row_mt,
+                                       &disable_loopfilter,
+// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The
+// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They
+// must be listed at the end of vp9_args.
 #if CONFIG_VP9_HIGHBITDEPTH
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -509,6 +580,8 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP8E_SET_STATIC_THRESHOLD,
                                         VP9E_SET_TILE_COLUMNS,
                                         VP9E_SET_TILE_ROWS,
+                                        VP9E_SET_TPL,
+                                        VP9E_SET_KEY_FRAME_FILTERING,
                                         VP8E_SET_ARNR_MAXFRAMES,
                                         VP8E_SET_ARNR_STRENGTH,
                                         VP8E_SET_ARNR_TYPE,
@@ -528,278 +601,66 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP9E_SET_MIN_GF_INTERVAL,
                                         VP9E_SET_MAX_GF_INTERVAL,
                                         VP9E_SET_TARGET_LEVEL,
+                                        VP9E_SET_ROW_MT,
+                                        VP9E_SET_DISABLE_LOOPFILTER,
                                         0 };
 #endif
 
 static const arg_def_t *no_args[] = { NULL };
 
-void usage_exit(void) {
+static void show_help(FILE *fout, int shorthelp) {
   int i;
   const int num_encoder = get_vpx_encoder_count();
 
-  fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
+  fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
           exec_name);
 
-  fprintf(stderr, "\nOptions:\n");
-  arg_show_usage(stderr, main_args);
-  fprintf(stderr, "\nEncoder Global Options:\n");
-  arg_show_usage(stderr, global_args);
-  fprintf(stderr, "\nRate Control Options:\n");
-  arg_show_usage(stderr, rc_args);
-  fprintf(stderr, "\nTwopass Rate Control Options:\n");
-  arg_show_usage(stderr, rc_twopass_args);
-  fprintf(stderr, "\nKeyframe Placement Options:\n");
-  arg_show_usage(stderr, kf_args);
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "\nOptions:\n");
+  arg_show_usage(fout, main_args);
+  fprintf(fout, "\nEncoder Global Options:\n");
+  arg_show_usage(fout, global_args);
+  fprintf(fout, "\nRate Control Options:\n");
+  arg_show_usage(fout, rc_args);
+  fprintf(fout, "\nTwopass Rate Control Options:\n");
+  arg_show_usage(fout, rc_twopass_args);
+  fprintf(fout, "\nKeyframe Placement Options:\n");
+  arg_show_usage(fout, kf_args);
 #if CONFIG_VP8_ENCODER
-  fprintf(stderr, "\nVP8 Specific Options:\n");
-  arg_show_usage(stderr, vp8_args);
+  fprintf(fout, "\nVP8 Specific Options:\n");
+  arg_show_usage(fout, vp8_args);
 #endif
 #if CONFIG_VP9_ENCODER
-  fprintf(stderr, "\nVP9 Specific Options:\n");
-  arg_show_usage(stderr, vp9_args);
+  fprintf(fout, "\nVP9 Specific Options:\n");
+  arg_show_usage(fout, vp9_args);
+  fprintf(fout, "\nVizier Rate Control Options:\n");
+  arg_show_usage(fout, vizier_rc_args);
 #endif
-  fprintf(stderr,
+  fprintf(fout,
           "\nStream timebase (--timebase):\n"
           "  The desired precision of timestamps in the output, expressed\n"
           "  in fractional seconds. Default is 1/1000.\n");
-  fprintf(stderr, "\nIncluded encoders:\n\n");
+  fprintf(fout, "\nIncluded encoders:\n\n");
 
   for (i = 0; i < num_encoder; ++i) {
     const VpxInterface *const encoder = get_vpx_encoder_by_index(i);
     const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
-    fprintf(stderr, "    %-6s - %s %s\n", encoder->name,
+    fprintf(fout, "    %-6s - %s %s\n", encoder->name,
             vpx_codec_iface_name(encoder->codec_interface()), defstr);
   }
-  fprintf(stderr, "\n        ");
-  fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n");
+  fprintf(fout, "\n        ");
+  fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
 
+void usage_exit(void) {
+  show_help(stderr, 1);
   exit(EXIT_FAILURE);
 }
 
-#define mmin(a, b) ((a) < (b) ? (a) : (b))
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void find_mismatch_high(const vpx_image_t *const img1,
-                               const vpx_image_t *const img2, int yloc[4],
-                               int uloc[4], int vloc[4]) {
-  uint16_t *plane1, *plane2;
-  uint32_t stride1, stride2;
-  const uint32_t bsize = 64;
-  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
-  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
-  const uint32_t c_w =
-      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
-  const uint32_t c_h =
-      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
-  int match = 1;
-  uint32_t i, j;
-  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
-  plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y];
-  plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y];
-  stride1 = img1->stride[VPX_PLANE_Y] / 2;
-  stride2 = img2->stride[VPX_PLANE_Y] / 2;
-  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
-    for (j = 0; match && j < img1->d_w; j += bsize) {
-      int k, l;
-      const int si = mmin(i + bsize, img1->d_h) - i;
-      const int sj = mmin(j + bsize, img1->d_w) - j;
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(plane1 + (i + k) * stride1 + j + l) !=
-              *(plane2 + (i + k) * stride2 + j + l)) {
-            yloc[0] = i + k;
-            yloc[1] = j + l;
-            yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
-            yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
-  plane1 = (uint16_t *)img1->planes[VPX_PLANE_U];
-  plane2 = (uint16_t *)img2->planes[VPX_PLANE_U];
-  stride1 = img1->stride[VPX_PLANE_U] / 2;
-  stride2 = img2->stride[VPX_PLANE_U] / 2;
-  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
-    for (j = 0; match && j < c_w; j += bsizex) {
-      int k, l;
-      const int si = mmin(i + bsizey, c_h - i);
-      const int sj = mmin(j + bsizex, c_w - j);
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(plane1 + (i + k) * stride1 + j + l) !=
-              *(plane2 + (i + k) * stride2 + j + l)) {
-            uloc[0] = i + k;
-            uloc[1] = j + l;
-            uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
-            uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
-  plane1 = (uint16_t *)img1->planes[VPX_PLANE_V];
-  plane2 = (uint16_t *)img2->planes[VPX_PLANE_V];
-  stride1 = img1->stride[VPX_PLANE_V] / 2;
-  stride2 = img2->stride[VPX_PLANE_V] / 2;
-  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
-    for (j = 0; match && j < c_w; j += bsizex) {
-      int k, l;
-      const int si = mmin(i + bsizey, c_h - i);
-      const int sj = mmin(j + bsizex, c_w - j);
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(plane1 + (i + k) * stride1 + j + l) !=
-              *(plane2 + (i + k) * stride2 + j + l)) {
-            vloc[0] = i + k;
-            vloc[1] = j + l;
-            vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
-            vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-}
-#endif
-
-static void find_mismatch(const vpx_image_t *const img1,
-                          const vpx_image_t *const img2, int yloc[4],
-                          int uloc[4], int vloc[4]) {
-  const uint32_t bsize = 64;
-  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
-  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
-  const uint32_t c_w =
-      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
-  const uint32_t c_h =
-      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
-  int match = 1;
-  uint32_t i, j;
-  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
-  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
-    for (j = 0; match && j < img1->d_w; j += bsize) {
-      int k, l;
-      const int si = mmin(i + bsize, img1->d_h) - i;
-      const int sj = mmin(j + bsize, img1->d_w) - j;
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(img1->planes[VPX_PLANE_Y] +
-                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
-              *(img2->planes[VPX_PLANE_Y] +
-                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
-            yloc[0] = i + k;
-            yloc[1] = j + l;
-            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
-                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
-            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
-                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
-  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
-    for (j = 0; match && j < c_w; j += bsizex) {
-      int k, l;
-      const int si = mmin(i + bsizey, c_h - i);
-      const int sj = mmin(j + bsizex, c_w - j);
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(img1->planes[VPX_PLANE_U] +
-                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
-              *(img2->planes[VPX_PLANE_U] +
-                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
-            uloc[0] = i + k;
-            uloc[1] = j + l;
-            uloc[2] = *(img1->planes[VPX_PLANE_U] +
-                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
-            uloc[3] = *(img2->planes[VPX_PLANE_U] +
-                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
-  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
-    for (j = 0; match && j < c_w; j += bsizex) {
-      int k, l;
-      const int si = mmin(i + bsizey, c_h - i);
-      const int sj = mmin(j + bsizex, c_w - j);
-      for (k = 0; match && k < si; ++k) {
-        for (l = 0; match && l < sj; ++l) {
-          if (*(img1->planes[VPX_PLANE_V] +
-                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
-              *(img2->planes[VPX_PLANE_V] +
-                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
-            vloc[0] = i + k;
-            vloc[1] = j + l;
-            vloc[2] = *(img1->planes[VPX_PLANE_V] +
-                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
-            vloc[3] = *(img2->planes[VPX_PLANE_V] +
-                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
-            match = 0;
-            break;
-          }
-        }
-      }
-    }
-  }
-}
-
-static int compare_img(const vpx_image_t *const img1,
-                       const vpx_image_t *const img2) {
-  uint32_t l_w = img1->d_w;
-  uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
-  const uint32_t c_h =
-      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
-  uint32_t i;
-  int match = 1;
-
-  match &= (img1->fmt == img2->fmt);
-  match &= (img1->d_w == img2->d_w);
-  match &= (img1->d_h == img2->d_h);
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-    l_w *= 2;
-    c_w *= 2;
-  }
-#endif
-
-  for (i = 0; i < img1->d_h; ++i)
-    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                     l_w) == 0);
-
-  for (i = 0; i < c_h; ++i)
-    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                     c_w) == 0);
-
-  for (i = 0; i < c_h; ++i)
-    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                     c_w) == 0);
-
-  return match;
-}
-
 #define NELEMENTS(x) (sizeof(x) / sizeof(x[0]))
 #if CONFIG_VP9_ENCODER
 #define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map)
@@ -819,9 +680,6 @@ struct stream_config {
   struct vpx_codec_enc_cfg cfg;
   const char *out_fn;
   const char *stats_fn;
-#if CONFIG_FP_MB_STATS
-  const char *fpmb_stats_fn;
-#endif
   stereo_format_t stereo_fmt;
   int arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int arg_ctrl_cnt;
@@ -849,9 +707,6 @@ struct stream_state {
   uint64_t cx_time;
   size_t nbytes;
   stats_io_t stats;
-#if CONFIG_FP_MB_STATS
-  stats_io_t fpmb_stats;
-#endif
   struct vpx_image *img;
   vpx_codec_ctx_t decoder;
   int mismatch_seen;
@@ -864,7 +719,7 @@ static void validate_positive_rational(const char *msg,
     rat->den *= -1;
   }
 
-  if (rat->num < 0) die("Error: %s must be positive\n", msg);
+  if (rat->num <= 0) die("Error: %s must be positive\n", msg);
 
   if (!rat->den) die("Error: %s has zero denominator\n", msg);
 }
@@ -887,7 +742,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
-    if (arg_match(&arg, &codecarg, argi)) {
+    if (arg_match(&arg, &help, argi)) {
+      show_help(stdout, 0);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &codecarg, argi)) {
       global->codec = get_vpx_encoder_by_name(arg.val);
       if (!global->codec)
         die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
@@ -913,6 +771,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
       global->deadline = VPX_DL_REALTIME;
     else if (arg_match(&arg, &use_yv12, argi))
       global->color_type = YV12;
+    else if (arg_match(&arg, &use_nv12, argi))
+      global->color_type = NV12;
     else if (arg_match(&arg, &use_i420, argi))
       global->color_type = I420;
     else if (arg_match(&arg, &use_i422, argi))
@@ -982,57 +842,6 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
   }
 }
 
-static void open_input_file(struct VpxInputContext *input) {
-  /* Parse certain options from the input file, if possible */
-  input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb")
-                                             : set_binary_mode(stdin);
-
-  if (!input->file) fatal("Failed to open input file");
-
-  if (!fseeko(input->file, 0, SEEK_END)) {
-    /* Input file is seekable. Figure out how long it is, so we can get
-     * progress info.
-     */
-    input->length = ftello(input->file);
-    rewind(input->file);
-  }
-
-  /* Default to 1:1 pixel aspect ratio. */
-  input->pixel_aspect_ratio.numerator = 1;
-  input->pixel_aspect_ratio.denominator = 1;
-
-  /* For RAW input sources, these bytes will applied on the first frame
-   *  in read_frame().
-   */
-  input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
-  input->detect.position = 0;
-
-  if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) {
-    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
-                       input->only_i420) >= 0) {
-      input->file_type = FILE_TYPE_Y4M;
-      input->width = input->y4m.pic_w;
-      input->height = input->y4m.pic_h;
-      input->pixel_aspect_ratio.numerator = input->y4m.par_n;
-      input->pixel_aspect_ratio.denominator = input->y4m.par_d;
-      input->framerate.numerator = input->y4m.fps_n;
-      input->framerate.denominator = input->y4m.fps_d;
-      input->fmt = input->y4m.vpx_fmt;
-      input->bit_depth = input->y4m.bit_depth;
-    } else
-      fatal("Unsupported Y4M stream.");
-  } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
-    fatal("IVF is not supported as input.");
-  } else {
-    input->file_type = FILE_TYPE_RAW;
-  }
-}
-
-static void close_input_file(struct VpxInputContext *input) {
-  fclose(input->file);
-  if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
-}
-
 static struct stream_state *new_stream(struct VpxEncoderConfig *global,
                                        struct stream_state *prev) {
   struct stream_state *stream;
@@ -1043,7 +852,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global,
   }
 
   if (prev) {
-    memcpy(stream, prev, sizeof(*stream));
+    *stream = *prev;
     stream->index++;
     prev->next = stream;
   } else {
@@ -1079,7 +888,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global,
 
     /* Default lag_in_frames is 0 in realtime mode CBR mode*/
     if (global->deadline == VPX_DL_REALTIME &&
-        stream->config.cfg.rc_end_usage == 1)
+        stream->config.cfg.rc_end_usage == VPX_CBR)
       stream->config.cfg.g_lag_in_frames = 0;
   }
 
@@ -1094,8 +903,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
                                struct stream_state *stream, char **argv) {
   char **argi, **argj;
   struct arg arg;
-  static const arg_def_t **ctrl_args = no_args;
-  static const int *ctrl_args_map = NULL;
+  const arg_def_t **ctrl_args = no_args;
+  const int *ctrl_args_map = NULL;
   struct stream_config *config = &stream->config;
   int eos_mark_found = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1134,10 +943,6 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
       config->out_fn = arg.val;
     } else if (arg_match(&arg, &fpf_name, argi)) {
       config->stats_fn = arg.val;
-#if CONFIG_FP_MB_STATS
-    } else if (arg_match(&arg, &fpmbf_name, argi)) {
-      config->fpmb_stats_fn = arg.val;
-#endif
     } else if (arg_match(&arg, &use_webm, argi)) {
 #if CONFIG_WEBM_IO
       config->write_webm = 1;
@@ -1221,6 +1026,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
     } else if (arg_match(&arg, &maxsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
 
+      if (global->passes < 2)
+        warn("option %s ignored in one-pass mode.\n", arg.name);
+    } else if (arg_match(&arg, &corpus_complexity, argi)) {
+      config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg);
+
       if (global->passes < 2)
         warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &kf_min_dist, argi)) {
@@ -1229,6 +1039,40 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9_ENCODER
+    } else if (arg_match(&arg, &use_vizier_rc_params, argi)) {
+      config->cfg.use_vizier_rc_params = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &active_wq_factor, argi)) {
+      config->cfg.active_wq_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &err_per_mb_factor, argi)) {
+      config->cfg.err_per_mb_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &sr_default_decay_limit, argi)) {
+      config->cfg.sr_default_decay_limit = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &sr_diff_factor, argi)) {
+      config->cfg.sr_diff_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) {
+      config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) {
+      config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) {
+      config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) {
+      config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) {
+      config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) {
+      config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) {
+      config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &zm_factor, argi)) {
+      config->cfg.zm_factor = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) {
+      config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) {
+      config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg);
+    } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) {
+      config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg);
+#endif
 #if CONFIG_VP9_HIGHBITDEPTH
     } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
       if (strcmp(global->codec->name, "vp9") == 0) {
@@ -1243,8 +1087,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
           match = 1;
 
           /* Point either to the next free element or the first
-          * instance of this control.
-          */
+           * instance of this control.
+           */
           for (j = 0; j < config->arg_ctrl_cnt; j++)
             if (ctrl_args_map != NULL &&
                 config->arg_ctrls[j][0] == ctrl_args_map[i])
@@ -1321,17 +1165,6 @@ static void validate_stream_config(const struct stream_state *stream,
         fatal("Stream %d: duplicate stats file (from stream %d)",
               streami->index, stream->index);
     }
-
-#if CONFIG_FP_MB_STATS
-    /* Check for two streams sharing a mb stats file. */
-    if (streami != stream) {
-      const char *a = stream->config.fpmb_stats_fn;
-      const char *b = streami->config.fpmb_stats_fn;
-      if (a && b && !strcmp(a, b))
-        fatal("Stream %d: duplicate mb stats file (from stream %d)",
-              streami->index, stream->index);
-    }
-#endif
   }
 }
 
@@ -1419,9 +1252,14 @@ static void show_stream_config(struct stream_state *stream,
   SHOW(rc_2pass_vbr_bias_pct);
   SHOW(rc_2pass_vbr_minsection_pct);
   SHOW(rc_2pass_vbr_maxsection_pct);
+  SHOW(rc_2pass_vbr_corpus_complexity);
   SHOW(kf_mode);
   SHOW(kf_min_dist);
   SHOW(kf_max_dist);
+  // Temporary use for debug
+  SHOW(use_vizier_rc_params);
+  SHOW(active_wq_factor.num);
+  SHOW(active_wq_factor.den);
 }
 
 static void open_output_file(struct stream_state *stream,
@@ -1485,26 +1323,11 @@ static void setup_pass(struct stream_state *stream,
       fatal("Failed to open statistics store");
   }
 
-#if CONFIG_FP_MB_STATS
-  if (stream->config.fpmb_stats_fn) {
-    if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn,
-                         pass))
-      fatal("Failed to open mb statistics store");
-  } else {
-    if (!stats_open_mem(&stream->fpmb_stats, pass))
-      fatal("Failed to open mb statistics store");
-  }
-#endif
-
   stream->config.cfg.g_pass = global->passes == 2
                                   ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS
                                   : VPX_RC_ONE_PASS;
   if (pass) {
     stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
-#if CONFIG_FP_MB_STATS
-    stream->config.cfg.rc_firstpass_mb_stats_in =
-        stats_get(&stream->fpmb_stats);
-#endif
   }
 
   stream->cx_time = 0;
@@ -1578,14 +1401,14 @@ static void encode_frame(struct stream_state *stream,
             vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16);
       }
       I420Scale_16(
-          (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2,
-          (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2,
-          (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2,
-          img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y],
+          (uint16_t *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2,
+          (uint16_t *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2,
+          (uint16_t *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2,
+          img->d_w, img->d_h, (uint16_t *)stream->img->planes[VPX_PLANE_Y],
           stream->img->stride[VPX_PLANE_Y] / 2,
-          (uint16 *)stream->img->planes[VPX_PLANE_U],
+          (uint16_t *)stream->img->planes[VPX_PLANE_U],
           stream->img->stride[VPX_PLANE_U] / 2,
-          (uint16 *)stream->img->planes[VPX_PLANE_V],
+          (uint16_t *)stream->img->planes[VPX_PLANE_V],
           stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w,
           stream->img->d_h, kFilterBox);
       img = stream->img;
@@ -1657,7 +1480,7 @@ static void get_cx_data(struct stream_state *stream,
   *got_data = 0;
   while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) {
     static size_t fsize = 0;
-    static int64_t ivf_header_pos = 0;
+    static FileOffset ivf_header_pos = 0;
 
     switch (pkt->kind) {
       case VPX_CODEC_CX_FRAME_PKT:
@@ -1683,7 +1506,7 @@ static void get_cx_data(struct stream_state *stream,
             fsize += pkt->data.frame.sz;
 
             if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
-              const int64_t currpos = ftello(stream->file);
+              const FileOffset currpos = ftello(stream->file);
               fseeko(stream->file, ivf_header_pos, SEEK_SET);
               ivf_write_frame_size(stream->file, fsize);
               fseeko(stream->file, currpos, SEEK_SET);
@@ -1716,13 +1539,6 @@ static void get_cx_data(struct stream_state *stream,
                     pkt->data.twopass_stats.sz);
         stream->nbytes += pkt->data.raw.sz;
         break;
-#if CONFIG_FP_MB_STATS
-      case VPX_CODEC_FPMB_STATS_PKT:
-        stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf,
-                    pkt->data.firstpass_mb_stats.sz);
-        stream->nbytes += pkt->data.raw.sz;
-        break;
-#endif
       case VPX_CODEC_PSNR_PKT:
 
         if (global->show_psnr) {
@@ -1775,13 +1591,14 @@ static void test_decode(struct stream_state *stream,
   /* Get the internal reference frame */
   if (strcmp(codec->name, "vp8") == 0) {
     struct vpx_ref_frame ref_enc, ref_dec;
-    int width, height;
+    unsigned int aligned_width = (stream->config.cfg.g_w + 15u) & ~15u;
+    unsigned int aligned_height = (stream->config.cfg.g_h + 15u) & ~15u;
 
-    width = (stream->config.cfg.g_w + 15) & ~15;
-    height = (stream->config.cfg.g_h + 15) & ~15;
-    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);
+    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height,
+                  1);
     enc_img = ref_enc.img;
-    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);
+    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height,
+                  1);
     dec_img = ref_dec.img;
 
     ref_enc.frame_type = VP8_LAST_FRAME;
@@ -1881,10 +1698,9 @@ int main(int argc, const char **argv_) {
   int res = 0;
 
   memset(&input, 0, sizeof(input));
+  memset(&raw, 0, sizeof(raw));
   exec_name = argv_[0];
 
-  if (argc < 3) usage_exit();
-
   /* Setup default input stream settings */
   input.framerate.numerator = 30;
   input.framerate.denominator = 1;
@@ -1896,14 +1712,21 @@ int main(int argc, const char **argv_) {
    * codec.
    */
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   parse_global_config(&global, argv);
 
+  if (argc < 3) usage_exit();
+
   switch (global.color_type) {
     case I420: input.fmt = VPX_IMG_FMT_I420; break;
     case I422: input.fmt = VPX_IMG_FMT_I422; break;
     case I444: input.fmt = VPX_IMG_FMT_I444; break;
     case I440: input.fmt = VPX_IMG_FMT_I440; break;
     case YV12: input.fmt = VPX_IMG_FMT_YV12; break;
+    case NV12: input.fmt = VPX_IMG_FMT_NV12; break;
   }
 
   {
@@ -1931,7 +1754,10 @@ int main(int argc, const char **argv_) {
   /* Handle non-option arguments */
   input.filename = argv[0];
 
-  if (!input.filename) usage_exit();
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
   if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0;
@@ -2022,14 +1848,10 @@ int main(int argc, const char **argv_) {
       FOREACH_STREAM(show_stream_config(stream, &global, &input));
 
     if (pass == (global.pass ? global.pass - 1 : 0)) {
-      if (input.file_type == FILE_TYPE_Y4M)
-        /*The Y4M reader does its own allocation.
-          Just initialize this here to avoid problems if we never read any
-           frames.*/
-        memset(&raw, 0, sizeof(raw));
-      else
+      // The Y4M reader does its own allocation.
+      if (input.file_type != FILE_TYPE_Y4M) {
         vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32);
-
+      }
       FOREACH_STREAM(stream->rate_hist = init_rate_histogram(
                          &stream->config.cfg, &global.framerate));
     }
@@ -2153,10 +1975,9 @@ int main(int argc, const char **argv_) {
           } else {
             const int64_t input_pos = ftello(input.file);
             const int64_t input_pos_lagged = input_pos - lagged_count;
-            const int64_t limit = input.length;
 
             rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0;
-            remaining = limit - input_pos + lagged_count;
+            remaining = input.length - input_pos + lagged_count;
           }
 
           average_rate =
@@ -2176,9 +1997,9 @@ int main(int argc, const char **argv_) {
 
     if (!global.quiet) {
       FOREACH_STREAM(fprintf(
-          stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64
-                  "b/f %7" PRId64 "b/s"
-                  " %7" PRId64 " %s (%.2f fps)\033[K\n",
+          stderr,
+          "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64
+          "b/s %7" PRId64 " %s (%.2f fps)\033[K\n",
           pass + 1, global.passes, frames_in, stream->frames_out,
           (int64_t)stream->nbytes,
           seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0,
@@ -2215,10 +2036,6 @@ int main(int argc, const char **argv_) {
 
     FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1));
 
-#if CONFIG_FP_MB_STATS
-    FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1));
-#endif
-
     if (global.pass) break;
   }
 
diff --git a/media/libvpx/libvpx/vpxenc.h b/media/libvpx/libvpx/vpxenc.h
index d867e9d954..be54840f7d 100644
--- a/media/libvpx/libvpx/vpxenc.h
+++ b/media/libvpx/libvpx/vpxenc.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef VPXENC_H_
-#define VPXENC_H_
+#ifndef VPX_VPXENC_H_
+#define VPX_VPXENC_H_
 
 #include "vpx/vpx_encoder.h"
 
@@ -28,6 +28,7 @@ typedef enum {
   I444,  // 4:4:4 8+ bit-depth
   I440,  // 4:4:0 8+ bit-depth
   YV12,  // 4:2:0 with uv flipped, only 8-bit depth
+  NV12,  // 4:2:0 with uv interleaved
 } ColorInputType;
 
 struct VpxInterface;
@@ -61,4 +62,4 @@ struct VpxEncoderConfig {
 }  // extern "C"
 #endif
 
-#endif  // VPXENC_H_
+#endif  // VPX_VPXENC_H_
diff --git a/media/libvpx/libvpx/vpxstats.c b/media/libvpx/libvpx/vpxstats.c
index 142e367bb4..c0dd14e450 100644
--- a/media/libvpx/libvpx/vpxstats.c
+++ b/media/libvpx/libvpx/vpxstats.c
@@ -41,7 +41,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
     stats->buf.buf = malloc(stats->buf_alloc_sz);
 
     if (!stats->buf.buf)
-      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+      fatal("Failed to allocate first-pass stats buffer (%u bytes)",
             (unsigned int)stats->buf_alloc_sz);
 
     nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
diff --git a/media/libvpx/libvpx/vpxstats.h b/media/libvpx/libvpx/vpxstats.h
index 5c9ea34f71..3625ee3291 100644
--- a/media/libvpx/libvpx/vpxstats.h
+++ b/media/libvpx/libvpx/vpxstats.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPXSTATS_H_
-#define VPXSTATS_H_
+#ifndef VPX_VPXSTATS_H_
+#define VPX_VPXSTATS_H_
 
 #include <stdio.h>
 
@@ -40,4 +40,4 @@ vpx_fixed_buf_t stats_get(stats_io_t *stats);
 }  // extern "C"
 #endif
 
-#endif  // VPXSTATS_H_
+#endif  // VPX_VPXSTATS_H_
diff --git a/media/libvpx/libvpx/warnings.c b/media/libvpx/libvpx/warnings.c
index a80da527f7..3e6e702536 100644
--- a/media/libvpx/libvpx/warnings.c
+++ b/media/libvpx/libvpx/warnings.c
@@ -98,7 +98,7 @@ void check_encoder_config(int disable_prompt,
   /* Count and print warnings. */
   for (warning = warning_list.warning_node; warning != NULL;
        warning = warning->next_warning, ++num_warnings) {
-    warn(warning->warning_string);
+    warn("%s", warning->warning_string);
   }
 
   free_warning_list(&warning_list);
diff --git a/media/libvpx/libvpx/warnings.h b/media/libvpx/libvpx/warnings.h
index 6b8ae6796f..15558c6437 100644
--- a/media/libvpx/libvpx/warnings.h
+++ b/media/libvpx/libvpx/warnings.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef WARNINGS_H_
-#define WARNINGS_H_
+#ifndef VPX_WARNINGS_H_
+#define VPX_WARNINGS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,4 +30,4 @@ void check_encoder_config(int disable_prompt,
 }  // extern "C"
 #endif
 
-#endif  // WARNINGS_H_
+#endif  // VPX_WARNINGS_H_
diff --git a/media/libvpx/libvpx/webmdec.cc b/media/libvpx/libvpx/webmdec.cc
index ed4bd700dd..0576bb4978 100644
--- a/media/libvpx/libvpx/webmdec.cc
+++ b/media/libvpx/libvpx/webmdec.cc
@@ -19,25 +19,25 @@
 namespace {
 
 void reset(struct WebmInputContext *const webm_ctx) {
-  if (webm_ctx->reader != NULL) {
+  if (webm_ctx->reader != nullptr) {
     mkvparser::MkvReader *const reader =
         reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
     delete reader;
   }
-  if (webm_ctx->segment != NULL) {
+  if (webm_ctx->segment != nullptr) {
     mkvparser::Segment *const segment =
         reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
     delete segment;
   }
-  if (webm_ctx->buffer != NULL) {
+  if (webm_ctx->buffer != nullptr) {
     delete[] webm_ctx->buffer;
   }
-  webm_ctx->reader = NULL;
-  webm_ctx->segment = NULL;
-  webm_ctx->buffer = NULL;
-  webm_ctx->cluster = NULL;
-  webm_ctx->block_entry = NULL;
-  webm_ctx->block = NULL;
+  webm_ctx->reader = nullptr;
+  webm_ctx->segment = nullptr;
+  webm_ctx->buffer = nullptr;
+  webm_ctx->cluster = nullptr;
+  webm_ctx->block_entry = nullptr;
+  webm_ctx->block = nullptr;
   webm_ctx->block_frame_index = 0;
   webm_ctx->video_track_index = 0;
   webm_ctx->timestamp_ns = 0;
@@ -84,7 +84,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
   }
 
   const mkvparser::Tracks *const tracks = segment->GetTracks();
-  const mkvparser::VideoTrack *video_track = NULL;
+  const mkvparser::VideoTrack *video_track = nullptr;
   for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
     const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
     if (track->GetType() == mkvparser::Track::kVideo) {
@@ -94,7 +94,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
     }
   }
 
-  if (video_track == NULL || video_track->GetCodecId() == NULL) {
+  if (video_track == nullptr || video_track->GetCodecId() == nullptr) {
     rewind_and_reset(webm_ctx, vpx_ctx);
     return 0;
   }
@@ -137,12 +137,12 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
   do {
     long status = 0;
     bool get_new_block = false;
-    if (block_entry == NULL && !block_entry_eos) {
+    if (block_entry == nullptr && !block_entry_eos) {
       status = cluster->GetFirst(block_entry);
       get_new_block = true;
     } else if (block_entry_eos || block_entry->EOS()) {
       cluster = segment->GetNext(cluster);
-      if (cluster == NULL || cluster->EOS()) {
+      if (cluster == nullptr || cluster->EOS()) {
         *buffer_size = 0;
         webm_ctx->reached_eos = 1;
         return 1;
@@ -150,25 +150,26 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
       status = cluster->GetFirst(block_entry);
       block_entry_eos = false;
       get_new_block = true;
-    } else if (block == NULL ||
+    } else if (block == nullptr ||
                webm_ctx->block_frame_index == block->GetFrameCount() ||
                block->GetTrackNumber() != webm_ctx->video_track_index) {
       status = cluster->GetNext(block_entry, block_entry);
-      if (block_entry == NULL || block_entry->EOS()) {
+      if (block_entry == nullptr || block_entry->EOS()) {
         block_entry_eos = true;
         continue;
       }
       get_new_block = true;
     }
-    if (status || block_entry == NULL) {
+    if (status || block_entry == nullptr) {
       return -1;
     }
     if (get_new_block) {
       block = block_entry->GetBlock();
+      if (block == nullptr) return -1;
       webm_ctx->block_frame_index = 0;
     }
-  } while (block->GetTrackNumber() != webm_ctx->video_track_index ||
-           block_entry_eos);
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
 
   webm_ctx->cluster = cluster;
   webm_ctx->block_entry = block_entry;
@@ -178,9 +179,9 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
       block->GetFrame(webm_ctx->block_frame_index);
   ++webm_ctx->block_frame_index;
   if (frame.len > static_cast<long>(*buffer_size)) {
-    delete[] * buffer;
+    delete[] *buffer;
     *buffer = new uint8_t[frame.len];
-    if (*buffer == NULL) {
+    if (*buffer == nullptr) {
       return -1;
     }
     webm_ctx->buffer = *buffer;
@@ -197,7 +198,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
 int webm_guess_framerate(struct WebmInputContext *webm_ctx,
                          struct VpxInputContext *vpx_ctx) {
   uint32_t i = 0;
-  uint8_t *buffer = NULL;
+  uint8_t *buffer = nullptr;
   size_t buffer_size = 0;
   while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
     if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) {
@@ -209,10 +210,12 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
   vpx_ctx->framerate.denominator =
       static_cast<int>(webm_ctx->timestamp_ns / 1000);
   delete[] buffer;
+  // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame().
+  webm_ctx->buffer = nullptr;
 
   get_first_cluster(webm_ctx);
-  webm_ctx->block = NULL;
-  webm_ctx->block_entry = NULL;
+  webm_ctx->block = nullptr;
+  webm_ctx->block_entry = nullptr;
   webm_ctx->block_frame_index = 0;
   webm_ctx->timestamp_ns = 0;
   webm_ctx->reached_eos = 0;
diff --git a/media/libvpx/libvpx/webmdec.h b/media/libvpx/libvpx/webmdec.h
index 7dcb170caf..6ae7ee16d0 100644
--- a/media/libvpx/libvpx/webmdec.h
+++ b/media/libvpx/libvpx/webmdec.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef WEBMDEC_H_
-#define WEBMDEC_H_
+#ifndef VPX_WEBMDEC_H_
+#define VPX_WEBMDEC_H_
 
 #include "./tools_common.h"
 
@@ -27,7 +27,7 @@ struct WebmInputContext {
   const void *block;
   int block_frame_index;
   int video_track_index;
-  uint64_t timestamp_ns;
+  int64_t timestamp_ns;
   int is_key_frame;
   int reached_eos;
 };
@@ -66,4 +66,4 @@ void webm_free(struct WebmInputContext *webm_ctx);
 }  // extern "C"
 #endif
 
-#endif  // WEBMDEC_H_
+#endif  // VPX_WEBMDEC_H_
diff --git a/media/libvpx/libvpx/webmenc.cc b/media/libvpx/libvpx/webmenc.cc
index 66606674b0..c718ab5a9f 100644
--- a/media/libvpx/libvpx/webmenc.cc
+++ b/media/libvpx/libvpx/webmenc.cc
@@ -90,6 +90,6 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
   segment->Finalize();
   delete segment;
   delete writer;
-  webm_ctx->writer = NULL;
-  webm_ctx->segment = NULL;
+  webm_ctx->writer = nullptr;
+  webm_ctx->segment = nullptr;
 }
diff --git a/media/libvpx/libvpx/webmenc.h b/media/libvpx/libvpx/webmenc.h
index b4a9e357bb..4176e82081 100644
--- a/media/libvpx/libvpx/webmenc.h
+++ b/media/libvpx/libvpx/webmenc.h
@@ -7,8 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef WEBMENC_H_
-#define WEBMENC_H_
+#ifndef VPX_WEBMENC_H_
+#define VPX_WEBMENC_H_
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -52,4 +52,4 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx);
 }  // extern "C"
 #endif
 
-#endif  // WEBMENC_H_
+#endif  // VPX_WEBMENC_H_
diff --git a/media/libvpx/libvpx/y4menc.c b/media/libvpx/libvpx/y4menc.c
index e26fcaf6ea..3940249cb9 100644
--- a/media/libvpx/libvpx/y4menc.c
+++ b/media/libvpx/libvpx/y4menc.c
@@ -9,6 +9,7 @@
  */
 
 #include <assert.h>
+#include <stdio.h>
 #include "./y4menc.h"
 
 int y4m_write_file_header(char *buf, size_t len, int width, int height,
@@ -17,41 +18,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height,
   const char *color;
   switch (bit_depth) {
     case 8:
-      color = fmt == VPX_IMG_FMT_444A
-                  ? "C444alpha\n"
-                  : fmt == VPX_IMG_FMT_I444 ? "C444\n" : fmt == VPX_IMG_FMT_I422
-                                                             ? "C422\n"
-                                                             : "C420jpeg\n";
+      color = fmt == VPX_IMG_FMT_I444   ? "C444\n"
+              : fmt == VPX_IMG_FMT_I422 ? "C422\n"
+                                        : "C420jpeg\n";
       break;
     case 9:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p9 XYSCSS=444P9\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
-                                              : "C420p9 XYSCSS=420P9\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p9 XYSCSS=444P9\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
+                                          : "C420p9 XYSCSS=420P9\n";
       break;
     case 10:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p10 XYSCSS=444P10\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
-                                              : "C420p10 XYSCSS=420P10\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p10 XYSCSS=444P10\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
+                                          : "C420p10 XYSCSS=420P10\n";
       break;
     case 12:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p12 XYSCSS=444P12\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
-                                              : "C420p12 XYSCSS=420P12\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p12 XYSCSS=444P12\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
+                                          : "C420p12 XYSCSS=420P12\n";
       break;
     case 14:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p14 XYSCSS=444P14\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
-                                              : "C420p14 XYSCSS=420P14\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p14 XYSCSS=444P14\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
+                                          : "C420p14 XYSCSS=420P14\n";
       break;
     case 16:
-      color = fmt == VPX_IMG_FMT_I44416
-                  ? "C444p16 XYSCSS=444P16\n"
-                  : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
-                                              : "C420p16 XYSCSS=420P16\n";
+      color = fmt == VPX_IMG_FMT_I44416   ? "C444p16 XYSCSS=444P16\n"
+              : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
+                                          : "C420p16 XYSCSS=420P16\n";
       break;
     default: color = NULL; assert(0);
   }
diff --git a/media/libvpx/libvpx/y4menc.h b/media/libvpx/libvpx/y4menc.h
index 69d590413e..9a367e34c6 100644
--- a/media/libvpx/libvpx/y4menc.h
+++ b/media/libvpx/libvpx/y4menc.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef Y4MENC_H_
-#define Y4MENC_H_
+#ifndef VPX_Y4MENC_H_
+#define VPX_Y4MENC_H_
 
 #include "./tools_common.h"
 
@@ -30,4 +30,4 @@ int y4m_write_frame_header(char *buf, size_t len);
 }  // extern "C"
 #endif
 
-#endif  // Y4MENC_H_
+#endif  // VPX_Y4MENC_H_
diff --git a/media/libvpx/libvpx/y4minput.c b/media/libvpx/libvpx/y4minput.c
index acf7d69fe9..40f152d057 100644
--- a/media/libvpx/libvpx/y4minput.c
+++ b/media/libvpx/libvpx/y4minput.c
@@ -10,7 +10,9 @@
  *  Based on code from the OggTheora software codec source code,
  *  Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
  */
+#include <assert.h>
 #include <errno.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -20,12 +22,13 @@
 // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
 // Returns true on success.
 static int file_read(void *buf, size_t size, FILE *file) {
-  const int kMaxRetries = 5;
-  int retry_count = 0;
-  int file_error;
+  const int kMaxTries = 5;
+  int try_count = 0;
+  int file_error = 0;
   size_t len = 0;
-  do {
+  while (!feof(file) && len < size && try_count < kMaxTries) {
     const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+    ++try_count;
     len += n;
     file_error = ferror(file);
     if (file_error) {
@@ -38,28 +41,21 @@ static int file_read(void *buf, size_t size, FILE *file) {
         return 0;
       }
     }
-  } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+  }
 
   if (!feof(file) && len != size) {
     fprintf(stderr,
             "Error reading file: %u of %u bytes read,"
-            " error: %d, retries: %d, %d: %s\n",
-            (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+            " error: %d, tries: %d, %d: %s\n",
+            (uint32_t)len, (uint32_t)size, file_error, try_count, errno,
             strerror(errno));
   }
   return len == size;
 }
 
 static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
-  int got_w;
-  int got_h;
-  int got_fps;
-  int got_interlace;
-  int got_par;
-  int got_chroma;
   char *p;
   char *q;
-  got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
   for (p = _tags;; p = q) {
     /*Skip any leading spaces.*/
     while (*p == ' ') p++;
@@ -72,52 +68,119 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
     switch (p[0]) {
       case 'W': {
         if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
-        got_w = 1;
         break;
       }
       case 'H': {
         if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
-        got_h = 1;
         break;
       }
       case 'F': {
         if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
           return -1;
         }
-        got_fps = 1;
         break;
       }
       case 'I': {
         _y4m->interlace = p[1];
-        got_interlace = 1;
         break;
       }
       case 'A': {
         if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
           return -1;
         }
-        got_par = 1;
         break;
       }
       case 'C': {
         if (q - p > 16) return -1;
         memcpy(_y4m->chroma_type, p + 1, q - p - 1);
         _y4m->chroma_type[q - p - 1] = '\0';
-        got_chroma = 1;
         break;
       }
         /*Ignore unknown tags.*/
     }
   }
-  if (!got_w || !got_h || !got_fps) return -1;
-  if (!got_interlace) _y4m->interlace = '?';
-  if (!got_par) _y4m->par_n = _y4m->par_d = 0;
-  /*Chroma-type is not specified in older files, e.g., those generated by
-     mplayer.*/
-  if (!got_chroma) strcpy(_y4m->chroma_type, "420");
   return 0;
 }
 
+// Copy a single tag into the buffer, along with a null character.
+// Returns 0 if any file IO errors occur.
+static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) {
+  size_t i;
+  assert(buf_len >= 1);
+  // Skip leading space characters.
+  do {
+    if (!file_read(buf, 1, file)) {
+      return 0;
+    }
+  } while (buf[0] == ' ');
+
+  // If we hit the newline, treat this as the "empty" tag.
+  if (buf[0] == '\n') {
+    buf[0] = '\0';
+    *end_tag = '\n';
+    return 1;
+  }
+
+  // Copy over characters until a space is hit, or the buffer is exhausted.
+  for (i = 1; i < buf_len; ++i) {
+    if (!file_read(buf + i, 1, file)) {
+      return 0;
+    }
+    if (buf[i] == ' ' || buf[i] == '\n') {
+      break;
+    }
+  }
+  if (i == buf_len) {
+    fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n",
+            (unsigned long)i);
+    return 0;
+  }
+  *end_tag = buf[i];
+  buf[i] = '\0';
+  return 1;
+}
+
+/* Returns 1 if tags were parsed successfully, 0 otherwise. */
+static int parse_tags(y4m_input *y4m_ctx, FILE *file) {
+  char tag[256];
+  char end; /* Character denoting the end of the tag, ' ' or '\n'. */
+  /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory
+     fields are marked with -1 and will be checked after the tags are parsed. */
+  y4m_ctx->pic_w = -1;
+  y4m_ctx->pic_h = -1;
+  y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */
+  y4m_ctx->par_n = 0;
+  y4m_ctx->par_d = 0;
+  y4m_ctx->interlace = '?';
+  snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420");
+
+  /* Find one tag at a time. */
+  do {
+    if (!copy_tag(tag, sizeof(tag), &end, file)) {
+      return 0;
+    }
+    /* y4m_parse_tags returns 0 on success. */
+    if (y4m_parse_tags(y4m_ctx, tag)) {
+      return 0;
+    }
+  } while (end != '\n');
+
+  /* Check the mandatory fields. */
+  if (y4m_ctx->pic_w == -1) {
+    fprintf(stderr, "Width field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->pic_h == -1) {
+    fprintf(stderr, "Height field missing\n");
+    return 0;
+  }
+  if (y4m_ctx->fps_n == -1) {
+    fprintf(stderr, "FPS field missing\n");
+    return 0;
+  }
+  return 1;
+}
+
 /*All anti-aliasing filters in the following conversion functions are based on
    one of two window functions:
   The 6-tap Lanczos window (for down-sampling and shifts):
@@ -130,8 +193,8 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
   The number of taps is intentionally kept small to reduce computational
    overhead and limit ringing.
 
-  The taps from these filters are scaled so that their sum is 1, and the result
-   is scaled by 128 and rounded to integers to create a filter whose
+  The taps from these filters are scaled so that their sum is 1, and the
+  result is scaled by 128 and rounded to integers to create a filter whose
    intermediate values fit inside 16 bits.
   Coefficients are rounded in such a way as to ensure their sum is still 128,
    which is usually equivalent to normal rounding.
@@ -139,7 +202,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
   Conversions which require both horizontal and vertical filtering could
    have these steps pipelined, for less memory consumption and better cache
    performance, but we do them separately for simplicity.*/
-
 #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a))
 #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a))
 #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
@@ -195,26 +257,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
        window.*/
     for (x = 0; x < OC_MINI(_c_w, 2); x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
-              35 * _src[OC_MINI(x + 1, _c_w - 1)] -
-              9 * _src[OC_MINI(x + 2, _c_w - 1)] +
-              _src[OC_MINI(x + 3, _c_w - 1)] + 64) >>
-                 7,
+          0,
+          (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
+           35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+           9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] +
+           64) >>
+              7,
           255);
     }
     for (; x < _c_w - 3; x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
-              35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
-                 7,
+          0,
+          (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+           35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
+              7,
           255);
     }
     for (; x < _c_w; x++) {
       _dst[x] = (unsigned char)OC_CLAMPI(
-          0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
-              35 * _src[OC_MINI(x + 1, _c_w - 1)] -
-              9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
-                 7,
+          0,
+          (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+           35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+           9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
+              7,
           255);
     }
     _dst += _c_w;
@@ -222,26 +287,6 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
   }
 }
 
-/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/
-static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
-                                         unsigned char *_aux) {
-  int c_w;
-  int c_h;
-  int c_sz;
-  int pli;
-  /*Skip past the luma data.*/
-  _dst += _y4m->pic_w * _y4m->pic_h;
-  /*Compute the size of each chroma plane.*/
-  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
-  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
-  c_sz = c_w * c_h;
-  for (pli = 1; pli < 3; pli++) {
-    y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h);
-    _dst += c_sz;
-    _aux += c_sz;
-  }
-}
-
 /*This format is only used for interlaced content, but is included for
    completeness.
 
@@ -314,28 +359,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
         for (x = 0; x < c_w; x++) {
           for (y = 0; y < OC_MINI(c_h, 3); y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
-                    35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
-                    17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
-                    4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
-                       7,
+                0,
+                (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
+                 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+                 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h - 2; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
-                    35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
-                    17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
-                       7,
+                0,
+                (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+                 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
-                    35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
-                    17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
-                    4 * tmp[(c_h - 1) * c_w] + 64) >>
-                       7,
+                0,
+                (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+                 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+                 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+                 4 * tmp[(c_h - 1) * c_w] + 64) >>
+                    7,
                 255);
           }
           _dst++;
@@ -361,10 +409,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
           }
           for (; y < c_h - 3; y++) {
             _dst[y * c_w] = (unsigned char)OC_CLAMPI(
-                0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
-                    114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
-                    9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
-                       7,
+                0,
+                (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+                 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
+                 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
+                    7,
                 255);
           }
           for (; y < c_h; y++) {
@@ -404,18 +453,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
   for (x = 0; x < _c_w; x++) {
     for (y = 0; y < OC_MINI(_c_h, 2); y += 2) {
       _dst[(y >> 1) * _c_w] =
-          OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
-                        17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
-                        3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
-                           7,
+          OC_CLAMPI(0,
+                    (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
+                     17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
+                     3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
+                        7,
                     255);
     }
     for (; y < _c_h - 3; y += 2) {
       _dst[(y >> 1) * _c_w] =
-          OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
-                        17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
-                        78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
-                           7,
+          OC_CLAMPI(0,
+                    (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
+                     17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
+                     78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
+                        7,
                     255);
     }
     for (; y < _c_h; y += 2) {
@@ -642,33 +693,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
          4-tap Mitchell window.*/
       for (x = 0; x < OC_MINI(c_w, 1); x++) {
         tmp[x << 1] = (unsigned char)OC_CLAMPI(
-            0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
-                _aux[OC_MINI(2, c_w - 1)] + 64) >>
-                   7,
+            0,
+            (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
+             _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
             255);
         tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
-            0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
-                5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
-                   7,
+            0,
+            (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
+             5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
+                7,
             255);
       }
       for (; x < c_w - 2; x++) {
         tmp[x << 1] =
-            (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] +
-                                         18 * _aux[x + 1] - _aux[x + 2] + 64) >>
-                                            7,
+            (unsigned char)OC_CLAMPI(0,
+                                     (_aux[x - 1] + 110 * _aux[x] +
+                                      18 * _aux[x + 1] - _aux[x + 2] + 64) >>
+                                         7,
                                      255);
         tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
-            0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
-                5 * _aux[x + 2] + 64) >>
-                   7,
+            0,
+            (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
+             5 * _aux[x + 2] + 64) >>
+                7,
             255);
       }
       for (; x < c_w; x++) {
         tmp[x << 1] = (unsigned char)OC_CLAMPI(
-            0, (_aux[x - 1] + 110 * _aux[x] +
-                18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >>
-                   7,
+            0,
+            (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] -
+             _aux[c_w - 1] + 64) >>
+                7,
             255);
         if ((x << 1 | 1) < dst_c_w) {
           tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
@@ -718,27 +774,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
     /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
     for (y = 0; y < c_h; y++) {
       for (x = 0; x < OC_MINI(c_w, 2); x += 2) {
-        tmp[x >> 1] =
-            OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
-                          17 * _aux[OC_MINI(2, c_w - 1)] +
-                          3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
-                             7,
-                      255);
+        tmp[x >> 1] = OC_CLAMPI(0,
+                                (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
+                                 17 * _aux[OC_MINI(2, c_w - 1)] +
+                                 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
+                                    7,
+                                255);
       }
       for (; x < c_w - 3; x += 2) {
-        tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) -
-                                    17 * (_aux[x - 1] + _aux[x + 2]) +
-                                    78 * (_aux[x] + _aux[x + 1]) + 64) >>
-                                       7,
+        tmp[x >> 1] = OC_CLAMPI(0,
+                                (3 * (_aux[x - 2] + _aux[x + 3]) -
+                                 17 * (_aux[x - 1] + _aux[x + 2]) +
+                                 78 * (_aux[x] + _aux[x + 1]) + 64) >>
+                                    7,
                                 255);
       }
       for (; x < c_w; x += 2) {
-        tmp[x >> 1] = OC_CLAMPI(
-            0, (3 * (_aux[x - 2] + _aux[c_w - 1]) -
-                17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
-                78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
-                   7,
-            255);
+        tmp[x >> 1] =
+            OC_CLAMPI(0,
+                      (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+                       17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+                       78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
+                          7,
+                      255);
       }
       tmp += dst_c_w;
       _aux += c_w;
@@ -769,299 +827,277 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
   (void)_aux;
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   int only_420) {
-  char buffer[80] = { 0 };
-  int ret;
-  int i;
-  /*Read until newline, or 80 cols, whichever happens first.*/
-  for (i = 0; i < 79; i++) {
-    if (_nskip > 0) {
-      buffer[i] = *_skip++;
-      _nskip--;
-    } else {
-      if (!file_read(buffer + i, 1, _fin)) return -1;
-    }
-    if (buffer[i] == '\n') break;
+static const char TAG[] = "YUV4MPEG2";
+
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, int only_420) {
+  // File must start with |TAG|.
+  char tag_buffer[9];  // 9 == strlen(TAG)
+  // Read as much as possible from |skip_buffer|, which were characters
+  // that were previously read from the file to do input-type detection.
+  assert(num_skip >= 0 && num_skip <= 8);
+  if (num_skip > 0) {
+    memcpy(tag_buffer, skip_buffer, num_skip);
   }
-  /*We skipped too much header data.*/
-  if (_nskip > 0) return -1;
-  if (i == 79) {
-    fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n");
+  // Start reading from the file now that the |skip_buffer| is depleted.
+  if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) {
     return -1;
   }
-  buffer[i] = '\0';
-  if (memcmp(buffer, "YUV4MPEG", 8)) {
-    fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n");
+  if (memcmp(TAG, tag_buffer, 9) != 0) {
+    fprintf(stderr, "Error parsing header: must start with %s\n", TAG);
     return -1;
   }
-  if (buffer[8] != '2') {
-    fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n");
+  // Next character must be a space.
+  if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') {
+    fprintf(stderr, "Error parsing header: space must follow %s\n", TAG);
+    return -1;
   }
-  ret = y4m_parse_tags(_y4m, buffer + 5);
-  if (ret < 0) {
-    fprintf(stderr, "Error parsing YUV4MPEG2 header.\n");
-    return ret;
+  if (!parse_tags(y4m_ctx, file)) {
+    fprintf(stderr, "Error parsing %s header.\n", TAG);
   }
-  if (_y4m->interlace == '?') {
+  if (y4m_ctx->interlace == '?') {
     fprintf(stderr,
             "Warning: Input video interlacing format unknown; "
             "assuming progressive scan.\n");
-  } else if (_y4m->interlace != 'p') {
+  } else if (y4m_ctx->interlace != 'p') {
     fprintf(stderr,
             "Input video is interlaced; "
             "Only progressive scan handled.\n");
     return -1;
   }
-  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
-  _y4m->bps = 12;
-  _y4m->bit_depth = 8;
-  if (strcmp(_y4m->chroma_type, "420") == 0 ||
-      strcmp(_y4m->chroma_type, "420jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        _y4m->pic_w * _y4m->pic_h +
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+  y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420;
+  y4m_ctx->bps = 12;
+  y4m_ctx->bit_depth = 8;
+  y4m_ctx->aux_buf = NULL;
+  y4m_ctx->dst_buf = NULL;
+  if (strcmp(y4m_ctx->chroma_type, "420") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 ||
+      strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        y4m_ctx->pic_w * y4m_ctx->pic_h +
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-  } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 10;
-    _y4m->bps = 15;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->bps = 15;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 2;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz =
-        2 * (_y4m->pic_w * _y4m->pic_h +
-             2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+  } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 2;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2));
     /* Natively supported: no conversion required. */
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
-    _y4m->bit_depth = 12;
-    _y4m->bps = 18;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42016;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->bps = 18;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xmpeg2_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
-        _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v =
+        y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
-    _y4m->convert = y4m_convert_42xpaldv_42xjpeg;
-  } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+    y4m_ctx->aux_buf_sz =
+        3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2);
+    y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
-        2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422jpeg_420jpeg;
-  } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz =
+        2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_422jpeg_420jpeg;
+  } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_422_420jpeg;
+      y4m_ctx->aux_buf_read_sz =
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_422_420jpeg;
     } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
-      _y4m->bps = 16;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz =
-          _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422;
+      y4m_ctx->bps = 16;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz =
+          y4m_ctx->pic_w * y4m_ctx->pic_h +
+          2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
-    _y4m->bps = 20;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216;
+    y4m_ctx->bps = 20;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
-    _y4m->src_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I42216;
-    _y4m->bps = 24;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
-                                 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) {
+    y4m_ctx->src_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216;
+    y4m_ctx->bps = 24;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz =
+        2 * (y4m_ctx->pic_w * y4m_ctx->pic_h +
+             2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h);
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "411") == 0) {
-    _y4m->src_c_dec_h = 4;
-    _y4m->dst_c_dec_h = 2;
-    _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) {
+    y4m_ctx->src_c_dec_h = 4;
+    y4m_ctx->dst_c_dec_h = 2;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*Chroma filter required: read into the aux buf first.
       We need to make two filter passes, so we need some extra space in the
        aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h;
-    _y4m->aux_buf_sz =
-        _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_411_420jpeg;
-  } else if (strcmp(_y4m->chroma_type, "444") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
+    y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz =
+        y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+    y4m_ctx->convert = y4m_convert_411_420jpeg;
+    fprintf(stderr, "Unsupported conversion from yuv 411\n");
+    return -1;
+  } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
     if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->dst_c_dec_h = 2;
+      y4m_ctx->dst_c_dec_v = 2;
+      y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Chroma filter required: read into the aux buf first.
         We need to make two filter passes, so we need some extra space in the
          aux buffer.*/
-      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->aux_buf_sz =
-          _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
+      y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz +
+                            ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h;
+      y4m_ctx->convert = y4m_convert_444_420jpeg;
     } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
-      _y4m->bps = 24;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444;
+      y4m_ctx->bps = 24;
+      y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+      y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+      y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
       /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
+      y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+      y4m_ctx->convert = y4m_convert_null;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
-    _y4m->bps = 30;
-    _y4m->bit_depth = 10;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416;
+    y4m_ctx->bps = 30;
+    y4m_ctx->bit_depth = 10;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    _y4m->vpx_fmt = VPX_IMG_FMT_I44416;
-    _y4m->bps = 36;
-    _y4m->bit_depth = 12;
-    _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-    _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-    _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_null;
+  } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) {
+    y4m_ctx->src_c_dec_h = 1;
+    y4m_ctx->src_c_dec_v = 1;
+    y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416;
+    y4m_ctx->bps = 36;
+    y4m_ctx->bit_depth = 12;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h;
+    y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v;
+    y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_null;
     if (only_420) {
       fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
       return -1;
     }
-  } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
-    _y4m->src_c_dec_h = 1;
-    _y4m->src_c_dec_v = 1;
-    if (only_420) {
-      _y4m->dst_c_dec_h = 2;
-      _y4m->dst_c_dec_v = 2;
-      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-      /*Chroma filter required: read into the aux buf first.
-        We need to make two filter passes, so we need some extra space in the
-         aux buffer.
-        The extra plane also gets read into the aux buf.
-        It will be discarded.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-      _y4m->convert = y4m_convert_444_420jpeg;
-    } else {
-      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
-      _y4m->bps = 32;
-      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
-      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
-      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
-      /*Natively supported: no conversion required.*/
-      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-      _y4m->convert = y4m_convert_null;
-    }
-  } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
-    _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
-    _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+  } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) {
+    y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0;
+    y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2;
+    y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h;
     /*No extra space required, but we need to clear the chroma planes.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
-    _y4m->convert = y4m_convert_mono_420jpeg;
+    y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0;
+    y4m_ctx->convert = y4m_convert_mono_420jpeg;
   } else {
-    fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type);
+    fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type);
     return -1;
   }
   /*The size of the final frame buffers is always computed from the
      destination chroma decimation type.*/
-  _y4m->dst_buf_sz =
-      _y4m->pic_w * _y4m->pic_h +
-      2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
-          ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
-  if (_y4m->bit_depth == 8)
-    _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+  y4m_ctx->dst_buf_sz =
+      y4m_ctx->pic_w * y4m_ctx->pic_h +
+      2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) *
+          ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v);
+  if (y4m_ctx->bit_depth == 8)
+    y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz);
   else
-    _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
+    y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz);
+  if (!y4m_ctx->dst_buf) return -1;
 
-  if (_y4m->aux_buf_sz > 0)
-    _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+  if (y4m_ctx->aux_buf_sz > 0) {
+    y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz);
+    if (!y4m_ctx->aux_buf) {
+      free(y4m_ctx->dst_buf);
+      return -1;
+    }
+  }
   return 0;
 }
 
@@ -1113,6 +1149,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
   _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
+  _img->bit_depth = _y4m->bit_depth;
   _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
   _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
   _img->bps = _y4m->bps;
diff --git a/media/libvpx/libvpx/y4minput.h b/media/libvpx/libvpx/y4minput.h
index 9e69ceb835..573750d749 100644
--- a/media/libvpx/libvpx/y4minput.h
+++ b/media/libvpx/libvpx/y4minput.h
@@ -11,8 +11,8 @@
  *  Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
  */
 
-#ifndef Y4MINPUT_H_
-#define Y4MINPUT_H_
+#ifndef VPX_Y4MINPUT_H_
+#define VPX_Y4MINPUT_H_
 
 #include <stdio.h>
 #include "vpx/vpx_image.h"
@@ -56,8 +56,16 @@ struct y4m_input {
   unsigned int bit_depth;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
-                   int only_420);
+/**
+ * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after
+ * reading it. The |skip_buffer| indicates bytes that were previously read
+ * from |file|, to do input-type detection; this buffer will be read before
+ * the |file| is read. It is of size |num_skip|, which *must* be 8 or less.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer,
+                   int num_skip, int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);
 
@@ -65,4 +73,4 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);
 }  // extern "C"
 #endif
 
-#endif  // Y4MINPUT_H_
+#endif  // VPX_Y4MINPUT_H_
diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build
index 529d93741a..11d53f74c3 100644
--- a/media/libvpx/moz.build
+++ b/media/libvpx/moz.build
@@ -66,8 +66,37 @@ elif CONFIG['CPU_ARCH'] == 'arm':
         ASFLAGS += [
             '-no-integrated-as',
         ]
+elif CONFIG['CPU_ARCH'] == 'aarch64':
+    EXPORTS.vpx += files['AARCH64_EXPORTS']
+    SOURCES += files['AARCH64_SOURCES']
+    if CONFIG['OS_TARGET'] == 'Darwin':
+        ASFLAGS += [ '-I%s/media/libvpx/config/mac/arm64/' % TOPSRCDIR ]
+        CFLAGS += [ '-I%s/media/libvpx/config/mac/arm64/' % TOPSRCDIR ]
+    else: # Linux, BSDs, etc.
+        ASFLAGS += [ '-I%s/media/libvpx/config/linux/arm64/' % TOPSRCDIR ]
+        CFLAGS += [ '-I%s/media/libvpx/config/linux/arm64/' % TOPSRCDIR ]
+elif CONFIG['CPU_ARCH'] == 'mips32':
+    EXPORTS.vpx += files['MIPS32_EXPORTS']
+    SOURCES += files['MIPS32_SOURCES']
+    ASFLAGS += [ '-I%s/media/libvpx/config/linux/mips32/' % TOPSRCDIR ]
+    CFLAGS += [ '-I%s/media/libvpx/config/linux/mips32/' % TOPSRCDIR ]
+elif CONFIG['CPU_ARCH'] == 'mips64':
+    EXPORTS.vpx += files['MIPS64_EXPORTS']
+    SOURCES += files['MIPS64_SOURCES']
+    ASFLAGS += [ '-I%s/media/libvpx/config/linux/mips64/' % TOPSRCDIR ]
+    CFLAGS += [ '-I%s/media/libvpx/config/linux/mips64/' % TOPSRCDIR ]
+elif CONFIG['CPU_ARCH'].startswith('ppc'):
+    EXPORTS.vpx += files['PPC64LE_EXPORTS']
+    SOURCES += files['PPC64LE_SOURCES']
+    ASFLAGS += [ '-I%s/media/libvpx/config/linux/ppc64le/' % TOPSRCDIR ]
+    CFLAGS += [ '-I%s/media/libvpx/config/linux/ppc64le/' % TOPSRCDIR ]
+elif CONFIG['CPU_ARCH'].startswith('loongarch64'):
+    EXPORTS.vpx += files['LOONGARCH64_EXPORTS']
+    SOURCES += files['LOONGARCH64_SOURCES']
+    ASFLAGS += [ '-I%s/media/libvpx/config/linux/loongarch64/' % TOPSRCDIR ]
+    CFLAGS += [ '-I%s/media/libvpx/config/linux/loongarch64/' % TOPSRCDIR ]
 else:
-    # Generic C-only configuration
+    # Generic C-only configuration used by unsupported targets.
     EXPORTS.vpx += files['GENERIC_EXPORTS']
     SOURCES += files['GENERIC_SOURCES']
     ASFLAGS += [ '-I%s/media/libvpx/config/generic/' % TOPSRCDIR ]
@@ -106,6 +135,30 @@ if CONFIG['CLANG_CL'] or not CONFIG['_MSC_VER']:
                 SOURCES[f].flags += ['-mavx']
             if 'avx2.c' in f:
                 SOURCES[f].flags += ['-mavx2']
+            if 'avx512.c' in f:
+                SOURCES[f].flags += [
+                    '-mavx512f',
+                    '-mavx512cd',
+                    '-mavx512bw',
+                    '-mavx512dq',
+                    '-mavx512vl',
+                ]
+            if 'neon_dotprod.c' in f:
+                SOURCES[f].flags += ['-march=armv8.2-a+dotprod']
+            if 'neon_i8mm.c' in f:
+                SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm']
+            if '_sve.c' in f:
+                SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm+sve']
+            if '_sve2.c' in f:
+                SOURCES[f].flags += ['-march=armv9-a+i8mm+sve2']
+            if '_vsx.c' in f:
+                SOURCES[f].flags += ['-maltivec', '-mvsx']
+            if '_msa.c' in f:
+                SOURCES[f].flags += ['-mmsa']
+            if '_lsx.c' in f:
+                SOURCES[f].flags += ['-mlsx']
+            if '_lasx.c' in f:
+                SOURCES[f].flags += ['-mlasx']
 
 # Suppress warnings in third-party code.
 if CONFIG['GNU_CC'] or CONFIG['CLANG_CL']:
diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild
index 2a7f7f6e23..71b3f91e85 100644
--- a/media/libvpx/sources.mozbuild
+++ b/media/libvpx/sources.mozbuild
@@ -8,13 +8,17 @@ files = {
     'libvpx/vpx/vpx_codec.h',
     'libvpx/vpx/vpx_decoder.h',
     'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
     'libvpx/vpx/vpx_frame_buffer.h',
     'libvpx/vpx/vpx_image.h',
     'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
     'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
     'libvpx/vpx_mem/vpx_mem.h',
     'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
     'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/static_assert.h',
     'libvpx/vpx_ports/system_state.h',
     'libvpx/vpx_ports/vpx_timer.h',
     'libvpx/vpx_ports/x86.h',
@@ -24,7 +28,6 @@ files = {
   'X64_SOURCES': [
     'libvpx/vp8/common/alloccommon.c',
     'libvpx/vp8/common/blockd.c',
-    'libvpx/vp8/common/copy_c.c',
     'libvpx/vp8/common/dequantize.c',
     'libvpx/vp8/common/entropy.c',
     'libvpx/vp8/common/entropymode.c',
@@ -49,10 +52,9 @@ files = {
     'libvpx/vp8/common/swapyv12buffer.c',
     'libvpx/vp8/common/treecoder.c',
     'libvpx/vp8/common/vp8_loopfilter.c',
-    'libvpx/vp8/common/x86/copy_sse2.asm',
-    'libvpx/vp8/common/x86/copy_sse3.asm',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/common/x86/bilinear_filter_sse2.c',
     'libvpx/vp8/common/x86/dequantize_mmx.asm',
-    'libvpx/vp8/common/x86/filter_x86.c',
     'libvpx/vp8/common/x86/idct_blk_mmx.c',
     'libvpx/vp8/common/x86/idct_blk_sse2.c',
     'libvpx/vp8/common/x86/idctllm_mmx.asm',
@@ -76,6 +78,7 @@ files = {
     'libvpx/vp8/decoder/threading.c',
     'libvpx/vp8/encoder/bitstream.c',
     'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
     'libvpx/vp8/encoder/dct.c',
     'libvpx/vp8/encoder/denoising.c',
     'libvpx/vp8/encoder/encodeframe.c',
@@ -98,17 +101,17 @@ files = {
     'libvpx/vp8/encoder/tokenize.c',
     'libvpx/vp8/encoder/treewriter.c',
     'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/encoder/x86/block_error_sse2.asm',
+    'libvpx/vp8/encoder/x86/copy_sse2.asm',
+    'libvpx/vp8/encoder/x86/copy_sse3.asm',
     'libvpx/vp8/encoder/x86/dct_sse2.asm',
     'libvpx/vp8/encoder/x86/denoising_sse2.c',
-    'libvpx/vp8/encoder/x86/encodeopt.asm',
     'libvpx/vp8/encoder/x86/fwalsh_sse2.asm',
-    'libvpx/vp8/encoder/x86/quantize_mmx.asm',
     'libvpx/vp8/encoder/x86/quantize_sse4.c',
-    'libvpx/vp8/encoder/x86/quantize_ssse3.c',
     'libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm',
-    'libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c',
     'libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c',
     'libvpx/vp8/encoder/x86/vp8_quantize_sse2.c',
+    'libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c',
     'libvpx/vp8/vp8_cx_iface.c',
     'libvpx/vp8/vp8_dx_iface.c',
     'libvpx/vp9/common/vp9_alloccommon.c',
@@ -141,7 +144,7 @@ files = {
     'libvpx/vp9/decoder/vp9_decoder.c',
     'libvpx/vp9/decoder/vp9_detokenize.c',
     'libvpx/vp9/decoder/vp9_dsubexp.c',
-    'libvpx/vp9/decoder/vp9_dthread.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
     'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
     'libvpx/vp9/encoder/vp9_aq_360.c',
     'libvpx/vp9/encoder/vp9_aq_complexity.c',
@@ -156,11 +159,14 @@ files = {
     'libvpx/vp9/encoder/vp9_encodemv.c',
     'libvpx/vp9/encoder/vp9_encoder.c',
     'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
     'libvpx/vp9/encoder/vp9_extend.c',
     'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
     'libvpx/vp9/encoder/vp9_lookahead.c',
     'libvpx/vp9/encoder/vp9_mbgraph.c',
     'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
     'libvpx/vp9/encoder/vp9_noise_estimate.c',
     'libvpx/vp9/encoder/vp9_picklpf.c',
     'libvpx/vp9/encoder/vp9_pickmode.c',
@@ -176,19 +182,22 @@ files = {
     'libvpx/vp9/encoder/vp9_svc_layercontext.c',
     'libvpx/vp9/encoder/vp9_temporal_filter.c',
     'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
     'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/encoder/x86/temporal_filter_avx2.c',
+    'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
+    'libvpx/vp9/encoder/x86/temporal_filter_ssse3.c',
     'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
     'libvpx/vp9/encoder/x86/vp9_dct_sse2.asm',
-    'libvpx/vp9/encoder/x86/vp9_dct_ssse3.c',
-    'libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c',
-    'libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c',
+    'libvpx/vp9/encoder/x86/vp9_error_avx2.c',
     'libvpx/vp9/encoder/x86/vp9_error_sse2.asm',
     'libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c',
+    'libvpx/vp9/encoder/x86/vp9_quantize_avx2.c',
     'libvpx/vp9/encoder/x86/vp9_quantize_sse2.c',
-    'libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm',
-    'libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
+    'libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c',
     'libvpx/vp9/vp9_cx_iface.c',
     'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
     'libvpx/vpx/src/vpx_codec.c',
     'libvpx/vpx/src/vpx_decoder.c',
     'libvpx/vpx/src/vpx_encoder.c',
@@ -208,13 +217,18 @@ files = {
     'libvpx/vpx_dsp/psnr.c',
     'libvpx/vpx_dsp/quantize.c',
     'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
     'libvpx/vpx_dsp/subtract.c',
     'libvpx/vpx_dsp/sum_squares.c',
     'libvpx/vpx_dsp/variance.c',
     'libvpx/vpx_dsp/vpx_convolve.c',
     'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
     'libvpx/vpx_dsp/x86/add_noise_sse2.asm',
+    'libvpx/vpx_dsp/x86/avg_intrin_avx2.c',
     'libvpx/vpx_dsp/x86/avg_intrin_sse2.c',
+    'libvpx/vpx_dsp/x86/avg_pred_avx2.c',
+    'libvpx/vpx_dsp/x86/avg_pred_sse2.c',
     'libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm',
     'libvpx/vpx_dsp/x86/deblock_sse2.asm',
     'libvpx/vpx_dsp/x86/fwd_txfm_avx2.c',
@@ -222,30 +236,34 @@ files = {
     'libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm',
     'libvpx/vpx_dsp/x86/intrapred_sse2.asm',
     'libvpx/vpx_dsp/x86/intrapred_ssse3.asm',
+    'libvpx/vpx_dsp/x86/inv_txfm_avx2.c',
     'libvpx/vpx_dsp/x86/inv_txfm_sse2.c',
-    'libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm',
+    'libvpx/vpx_dsp/x86/inv_txfm_ssse3.c',
     'libvpx/vpx_dsp/x86/inv_wht_sse2.asm',
     'libvpx/vpx_dsp/x86/loopfilter_avx2.c',
-    'libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c',
-    'libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm',
+    'libvpx/vpx_dsp/x86/loopfilter_sse2.c',
+    'libvpx/vpx_dsp/x86/post_proc_sse2.c',
+    'libvpx/vpx_dsp/x86/quantize_avx.c',
+    'libvpx/vpx_dsp/x86/quantize_avx2.c',
     'libvpx/vpx_dsp/x86/quantize_sse2.c',
-    'libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm',
+    'libvpx/vpx_dsp/x86/quantize_ssse3.c',
     'libvpx/vpx_dsp/x86/sad4d_avx2.c',
+    'libvpx/vpx_dsp/x86/sad4d_avx512.c',
     'libvpx/vpx_dsp/x86/sad4d_sse2.asm',
     'libvpx/vpx_dsp/x86/sad_avx2.c',
+    'libvpx/vpx_dsp/x86/sad_avx512.c',
     'libvpx/vpx_dsp/x86/sad_sse2.asm',
-    'libvpx/vpx_dsp/x86/sad_sse3.asm',
-    'libvpx/vpx_dsp/x86/sad_sse4.asm',
-    'libvpx/vpx_dsp/x86/sad_ssse3.asm',
+    'libvpx/vpx_dsp/x86/sse_avx2.c',
+    'libvpx/vpx_dsp/x86/sse_sse4.c',
     'libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm',
     'libvpx/vpx_dsp/x86/subpel_variance_sse2.asm',
+    'libvpx/vpx_dsp/x86/subtract_avx2.c',
     'libvpx/vpx_dsp/x86/subtract_sse2.asm',
     'libvpx/vpx_dsp/x86/sum_squares_sse2.c',
     'libvpx/vpx_dsp/x86/variance_avx2.c',
-    'libvpx/vpx_dsp/x86/variance_impl_avx2.c',
     'libvpx/vpx_dsp/x86/variance_sse2.c',
-    'libvpx/vpx_dsp/x86/vpx_asm_stubs.c',
     'libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm',
+    'libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c',
     'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c',
     'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c',
     'libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm',
@@ -253,7 +271,7 @@ files = {
     'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm',
     'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm',
     'libvpx/vpx_mem/vpx_mem.c',
-    'libvpx/vpx_ports/emms.asm',
+    'libvpx/vpx_ports/emms_mmx.asm',
     'libvpx/vpx_ports/x86_abi_support.asm',
     'libvpx/vpx_scale/generic/gen_scalers.c',
     'libvpx/vpx_scale/generic/vpx_scale.c',
@@ -261,6 +279,7 @@ files = {
     'libvpx/vpx_scale/generic/yv12extend.c',
     'libvpx/vpx_scale/vpx_scale_rtcd.c',
     'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
 ],
   'IA32_EXPORTS': [
     'libvpx/vpx/vp8.h',
@@ -269,13 +288,17 @@ files = {
     'libvpx/vpx/vpx_codec.h',
     'libvpx/vpx/vpx_decoder.h',
     'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
     'libvpx/vpx/vpx_frame_buffer.h',
     'libvpx/vpx/vpx_image.h',
     'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
     'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
     'libvpx/vpx_mem/vpx_mem.h',
     'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
     'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/static_assert.h',
     'libvpx/vpx_ports/system_state.h',
     'libvpx/vpx_ports/vpx_timer.h',
     'libvpx/vpx_ports/x86.h',
@@ -285,7 +308,6 @@ files = {
   'IA32_SOURCES': [
     'libvpx/vp8/common/alloccommon.c',
     'libvpx/vp8/common/blockd.c',
-    'libvpx/vp8/common/copy_c.c',
     'libvpx/vp8/common/dequantize.c',
     'libvpx/vp8/common/entropy.c',
     'libvpx/vp8/common/entropymode.c',
@@ -310,10 +332,9 @@ files = {
     'libvpx/vp8/common/swapyv12buffer.c',
     'libvpx/vp8/common/treecoder.c',
     'libvpx/vp8/common/vp8_loopfilter.c',
-    'libvpx/vp8/common/x86/copy_sse2.asm',
-    'libvpx/vp8/common/x86/copy_sse3.asm',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/common/x86/bilinear_filter_sse2.c',
     'libvpx/vp8/common/x86/dequantize_mmx.asm',
-    'libvpx/vp8/common/x86/filter_x86.c',
     'libvpx/vp8/common/x86/idct_blk_mmx.c',
     'libvpx/vp8/common/x86/idct_blk_sse2.c',
     'libvpx/vp8/common/x86/idctllm_mmx.asm',
@@ -336,6 +357,7 @@ files = {
     'libvpx/vp8/decoder/threading.c',
     'libvpx/vp8/encoder/bitstream.c',
     'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
     'libvpx/vp8/encoder/dct.c',
     'libvpx/vp8/encoder/denoising.c',
     'libvpx/vp8/encoder/encodeframe.c',
@@ -358,17 +380,17 @@ files = {
     'libvpx/vp8/encoder/tokenize.c',
     'libvpx/vp8/encoder/treewriter.c',
     'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/encoder/x86/block_error_sse2.asm',
+    'libvpx/vp8/encoder/x86/copy_sse2.asm',
+    'libvpx/vp8/encoder/x86/copy_sse3.asm',
     'libvpx/vp8/encoder/x86/dct_sse2.asm',
     'libvpx/vp8/encoder/x86/denoising_sse2.c',
-    'libvpx/vp8/encoder/x86/encodeopt.asm',
     'libvpx/vp8/encoder/x86/fwalsh_sse2.asm',
-    'libvpx/vp8/encoder/x86/quantize_mmx.asm',
     'libvpx/vp8/encoder/x86/quantize_sse4.c',
-    'libvpx/vp8/encoder/x86/quantize_ssse3.c',
     'libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm',
-    'libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c',
     'libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c',
     'libvpx/vp8/encoder/x86/vp8_quantize_sse2.c',
+    'libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c',
     'libvpx/vp8/vp8_cx_iface.c',
     'libvpx/vp8/vp8_dx_iface.c',
     'libvpx/vp9/common/vp9_alloccommon.c',
@@ -401,7 +423,7 @@ files = {
     'libvpx/vp9/decoder/vp9_decoder.c',
     'libvpx/vp9/decoder/vp9_detokenize.c',
     'libvpx/vp9/decoder/vp9_dsubexp.c',
-    'libvpx/vp9/decoder/vp9_dthread.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
     'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
     'libvpx/vp9/encoder/vp9_aq_360.c',
     'libvpx/vp9/encoder/vp9_aq_complexity.c',
@@ -416,11 +438,14 @@ files = {
     'libvpx/vp9/encoder/vp9_encodemv.c',
     'libvpx/vp9/encoder/vp9_encoder.c',
     'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
     'libvpx/vp9/encoder/vp9_extend.c',
     'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
     'libvpx/vp9/encoder/vp9_lookahead.c',
     'libvpx/vp9/encoder/vp9_mbgraph.c',
     'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
     'libvpx/vp9/encoder/vp9_noise_estimate.c',
     'libvpx/vp9/encoder/vp9_picklpf.c',
     'libvpx/vp9/encoder/vp9_pickmode.c',
@@ -436,18 +461,22 @@ files = {
     'libvpx/vp9/encoder/vp9_svc_layercontext.c',
     'libvpx/vp9/encoder/vp9_temporal_filter.c',
     'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
     'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/encoder/x86/temporal_filter_avx2.c',
+    'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
+    'libvpx/vp9/encoder/x86/temporal_filter_ssse3.c',
     'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
     'libvpx/vp9/encoder/x86/vp9_dct_sse2.asm',
-    'libvpx/vp9/encoder/x86/vp9_dct_ssse3.c',
-    'libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c',
-    'libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c',
+    'libvpx/vp9/encoder/x86/vp9_error_avx2.c',
     'libvpx/vp9/encoder/x86/vp9_error_sse2.asm',
     'libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c',
+    'libvpx/vp9/encoder/x86/vp9_quantize_avx2.c',
     'libvpx/vp9/encoder/x86/vp9_quantize_sse2.c',
-    'libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
+    'libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c',
     'libvpx/vp9/vp9_cx_iface.c',
     'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
     'libvpx/vpx/src/vpx_codec.c',
     'libvpx/vpx/src/vpx_decoder.c',
     'libvpx/vpx/src/vpx_encoder.c',
@@ -467,38 +496,50 @@ files = {
     'libvpx/vpx_dsp/psnr.c',
     'libvpx/vpx_dsp/quantize.c',
     'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
     'libvpx/vpx_dsp/subtract.c',
     'libvpx/vpx_dsp/sum_squares.c',
     'libvpx/vpx_dsp/variance.c',
     'libvpx/vpx_dsp/vpx_convolve.c',
     'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
     'libvpx/vpx_dsp/x86/add_noise_sse2.asm',
+    'libvpx/vpx_dsp/x86/avg_intrin_avx2.c',
     'libvpx/vpx_dsp/x86/avg_intrin_sse2.c',
+    'libvpx/vpx_dsp/x86/avg_pred_avx2.c',
+    'libvpx/vpx_dsp/x86/avg_pred_sse2.c',
     'libvpx/vpx_dsp/x86/deblock_sse2.asm',
     'libvpx/vpx_dsp/x86/fwd_txfm_avx2.c',
     'libvpx/vpx_dsp/x86/fwd_txfm_sse2.c',
     'libvpx/vpx_dsp/x86/intrapred_sse2.asm',
     'libvpx/vpx_dsp/x86/intrapred_ssse3.asm',
+    'libvpx/vpx_dsp/x86/inv_txfm_avx2.c',
     'libvpx/vpx_dsp/x86/inv_txfm_sse2.c',
+    'libvpx/vpx_dsp/x86/inv_txfm_ssse3.c',
     'libvpx/vpx_dsp/x86/inv_wht_sse2.asm',
     'libvpx/vpx_dsp/x86/loopfilter_avx2.c',
-    'libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c',
+    'libvpx/vpx_dsp/x86/loopfilter_sse2.c',
+    'libvpx/vpx_dsp/x86/post_proc_sse2.c',
+    'libvpx/vpx_dsp/x86/quantize_avx.c',
+    'libvpx/vpx_dsp/x86/quantize_avx2.c',
     'libvpx/vpx_dsp/x86/quantize_sse2.c',
+    'libvpx/vpx_dsp/x86/quantize_ssse3.c',
     'libvpx/vpx_dsp/x86/sad4d_avx2.c',
+    'libvpx/vpx_dsp/x86/sad4d_avx512.c',
     'libvpx/vpx_dsp/x86/sad4d_sse2.asm',
     'libvpx/vpx_dsp/x86/sad_avx2.c',
+    'libvpx/vpx_dsp/x86/sad_avx512.c',
     'libvpx/vpx_dsp/x86/sad_sse2.asm',
-    'libvpx/vpx_dsp/x86/sad_sse3.asm',
-    'libvpx/vpx_dsp/x86/sad_sse4.asm',
-    'libvpx/vpx_dsp/x86/sad_ssse3.asm',
+    'libvpx/vpx_dsp/x86/sse_avx2.c',
+    'libvpx/vpx_dsp/x86/sse_sse4.c',
     'libvpx/vpx_dsp/x86/subpel_variance_sse2.asm',
+    'libvpx/vpx_dsp/x86/subtract_avx2.c',
     'libvpx/vpx_dsp/x86/subtract_sse2.asm',
     'libvpx/vpx_dsp/x86/sum_squares_sse2.c',
     'libvpx/vpx_dsp/x86/variance_avx2.c',
-    'libvpx/vpx_dsp/x86/variance_impl_avx2.c',
     'libvpx/vpx_dsp/x86/variance_sse2.c',
-    'libvpx/vpx_dsp/x86/vpx_asm_stubs.c',
     'libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm',
+    'libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c',
     'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c',
     'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c',
     'libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm',
@@ -506,7 +547,7 @@ files = {
     'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm',
     'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm',
     'libvpx/vpx_mem/vpx_mem.c',
-    'libvpx/vpx_ports/emms.asm',
+    'libvpx/vpx_ports/emms_mmx.c',
     'libvpx/vpx_ports/x86_abi_support.asm',
     'libvpx/vpx_scale/generic/gen_scalers.c',
     'libvpx/vpx_scale/generic/vpx_scale.c',
@@ -514,6 +555,7 @@ files = {
     'libvpx/vpx_scale/generic/yv12extend.c',
     'libvpx/vpx_scale/vpx_scale_rtcd.c',
     'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
 ],
   'ARM_EXPORTS': [
     'libvpx/vpx/vp8.h',
@@ -522,14 +564,19 @@ files = {
     'libvpx/vpx/vpx_codec.h',
     'libvpx/vpx/vpx_decoder.h',
     'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
     'libvpx/vpx/vpx_frame_buffer.h',
     'libvpx/vpx/vpx_image.h',
     'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
     'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
     'libvpx/vpx_mem/vpx_mem.h',
     'libvpx/vpx_ports/arm.h',
+    'libvpx/vpx_ports/arm_cpudetect.h',
     'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
     'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/static_assert.h',
     'libvpx/vpx_ports/system_state.h',
     'libvpx/vpx_ports/vpx_timer.h',
     'libvpx/vpx_scale/vpx_scale.h',
@@ -544,8 +591,6 @@ files = {
     'libvpx/vp8/common/arm/neon/dequant_idct_neon.c',
     'libvpx/vp8/common/arm/neon/dequantizeb_neon.c',
     'libvpx/vp8/common/arm/neon/idct_blk_neon.c',
-    'libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
-    'libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
     'libvpx/vp8/common/arm/neon/iwalsh_neon.c',
     'libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
     'libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
@@ -554,7 +599,6 @@ files = {
     'libvpx/vp8/common/arm/neon/sixtappredict_neon.c',
     'libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c',
     'libvpx/vp8/common/blockd.c',
-    'libvpx/vp8/common/copy_c.c',
     'libvpx/vp8/common/dequantize.c',
     'libvpx/vp8/common/entropy.c',
     'libvpx/vp8/common/entropymode.c',
@@ -577,6 +621,7 @@ files = {
     'libvpx/vp8/common/swapyv12buffer.c',
     'libvpx/vp8/common/treecoder.c',
     'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
     'libvpx/vp8/decoder/dboolhuff.c',
     'libvpx/vp8/decoder/decodeframe.c',
     'libvpx/vp8/decoder/decodemv.c',
@@ -589,6 +634,7 @@ files = {
     'libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
     'libvpx/vp8/encoder/bitstream.c',
     'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
     'libvpx/vp8/encoder/dct.c',
     'libvpx/vp8/encoder/denoising.c',
     'libvpx/vp8/encoder/encodeframe.c',
@@ -611,6 +657,7 @@ files = {
     'libvpx/vp8/encoder/vp8_quantize.c',
     'libvpx/vp8/vp8_cx_iface.c',
     'libvpx/vp8/vp8_dx_iface.c',
+    'libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c',
     'libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c',
     'libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c',
     'libvpx/vp9/common/vp9_alloccommon.c',
@@ -639,15 +686,13 @@ files = {
     'libvpx/vp9/decoder/vp9_decoder.c',
     'libvpx/vp9/decoder/vp9_detokenize.c',
     'libvpx/vp9/decoder/vp9_dsubexp.c',
-    'libvpx/vp9/decoder/vp9_dthread.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
     'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c',
     'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c',
-    'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
-    'libvpx/vp9/encoder/vp9_aq_360.c',
-    'libvpx/vp9/encoder/vp9_aq_complexity.c',
     'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
-    'libvpx/vp9/encoder/vp9_aq_variance.c',
     'libvpx/vp9/encoder/vp9_bitstream.c',
     'libvpx/vp9/encoder/vp9_context_tree.c',
     'libvpx/vp9/encoder/vp9_cost.c',
@@ -657,11 +702,12 @@ files = {
     'libvpx/vp9/encoder/vp9_encodemv.c',
     'libvpx/vp9/encoder/vp9_encoder.c',
     'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
     'libvpx/vp9/encoder/vp9_extend.c',
-    'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
     'libvpx/vp9/encoder/vp9_lookahead.c',
-    'libvpx/vp9/encoder/vp9_mbgraph.c',
     'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
     'libvpx/vp9/encoder/vp9_noise_estimate.c',
     'libvpx/vp9/encoder/vp9_picklpf.c',
     'libvpx/vp9/encoder/vp9_pickmode.c',
@@ -675,46 +721,62 @@ files = {
     'libvpx/vp9/encoder/vp9_speed_features.c',
     'libvpx/vp9/encoder/vp9_subexp.c',
     'libvpx/vp9/encoder/vp9_svc_layercontext.c',
-    'libvpx/vp9/encoder/vp9_temporal_filter.c',
     'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
     'libvpx/vp9/encoder/vp9_treewriter.c',
     'libvpx/vp9/vp9_cx_iface.c',
     'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
     'libvpx/vpx/src/vpx_codec.c',
     'libvpx/vpx/src/vpx_decoder.c',
     'libvpx/vpx/src/vpx_encoder.c',
     'libvpx/vpx/src/vpx_image.c',
     'libvpx/vpx_dsp/arm/avg_neon.c',
-    'libvpx/vpx_dsp/arm/fwd_txfm_neon.c',
+    'libvpx/vpx_dsp/arm/avg_pred_neon.c',
+    'libvpx/vpx_dsp/arm/fdct16x16_neon.c',
+    'libvpx/vpx_dsp/arm/fdct32x32_neon.c',
+    'libvpx/vpx_dsp/arm/fdct4x4_neon.c',
+    'libvpx/vpx_dsp/arm/fdct8x8_neon.c',
+    'libvpx/vpx_dsp/arm/fdct_partial_neon.c',
     'libvpx/vpx_dsp/arm/hadamard_neon.c',
-    'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm',
-    'libvpx/vpx_dsp/arm/idct16x16_add_neon.asm',
-    'libvpx/vpx_dsp/arm/idct16x16_neon.c',
+    'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct16x16_add_neon.c',
     'libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c',
     'libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c',
     'libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c',
     'libvpx/vpx_dsp/arm/idct32x32_add_neon.c',
     'libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm',
     'libvpx/vpx_dsp/arm/idct4x4_add_neon.asm',
-    'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm',
-    'libvpx/vpx_dsp/arm/idct8x8_add_neon.asm',
+    'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct8x8_add_neon.c',
     'libvpx/vpx_dsp/arm/idct_neon.asm',
     'libvpx/vpx_dsp/arm/intrapred_neon.c',
     'libvpx/vpx_dsp/arm/intrapred_neon_asm.asm',
     'libvpx/vpx_dsp/arm/loopfilter_16_neon.asm',
     'libvpx/vpx_dsp/arm/loopfilter_4_neon.asm',
     'libvpx/vpx_dsp/arm/loopfilter_8_neon.asm',
+    'libvpx/vpx_dsp/arm/quantize_neon.c',
     'libvpx/vpx_dsp/arm/sad4d_neon.c',
     'libvpx/vpx_dsp/arm/sad_neon.c',
     'libvpx/vpx_dsp/arm/save_reg_neon.asm',
+    'libvpx/vpx_dsp/arm/sse_neon.c',
     'libvpx/vpx_dsp/arm/subpel_variance_neon.c',
     'libvpx/vpx_dsp/arm/subtract_neon.c',
+    'libvpx/vpx_dsp/arm/sum_squares_neon.c',
     'libvpx/vpx_dsp/arm/variance_neon.c',
-    'libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm',
-    'libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm',
     'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm',
     'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm',
     'libvpx/vpx_dsp/arm/vpx_convolve_neon.c',
+    'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c',
     'libvpx/vpx_dsp/avg.c',
     'libvpx/vpx_dsp/bitreader.c',
     'libvpx/vpx_dsp/bitreader_buffer.c',
@@ -728,43 +790,65 @@ files = {
     'libvpx/vpx_dsp/psnr.c',
     'libvpx/vpx_dsp/quantize.c',
     'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
     'libvpx/vpx_dsp/subtract.c',
     'libvpx/vpx_dsp/sum_squares.c',
     'libvpx/vpx_dsp/variance.c',
     'libvpx/vpx_dsp/vpx_convolve.c',
     'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
     'libvpx/vpx_mem/vpx_mem.c',
-    'libvpx/vpx_ports/arm_cpudetect.c',
+    'libvpx/vpx_ports/aarch32_cpudetect.c',
     'libvpx/vpx_scale/generic/gen_scalers.c',
     'libvpx/vpx_scale/generic/vpx_scale.c',
     'libvpx/vpx_scale/generic/yv12config.c',
     'libvpx/vpx_scale/generic/yv12extend.c',
     'libvpx/vpx_scale/vpx_scale_rtcd.c',
     'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
 ],
-  'GENERIC_EXPORTS': [
+  'AARCH64_EXPORTS': [
     'libvpx/vpx/vp8.h',
     'libvpx/vpx/vp8cx.h',
     'libvpx/vpx/vp8dx.h',
     'libvpx/vpx/vpx_codec.h',
     'libvpx/vpx/vpx_decoder.h',
     'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
     'libvpx/vpx/vpx_frame_buffer.h',
     'libvpx/vpx/vpx_image.h',
     'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
     'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
     'libvpx/vpx_mem/vpx_mem.h',
+    'libvpx/vpx_ports/arm.h',
+    'libvpx/vpx_ports/arm_cpudetect.h',
     'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
     'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/static_assert.h',
     'libvpx/vpx_ports/system_state.h',
     'libvpx/vpx_ports/vpx_timer.h',
     'libvpx/vpx_scale/vpx_scale.h',
     'libvpx/vpx_scale/yv12config.h',
 ],
-  'GENERIC_SOURCES': [
+  'AARCH64_SOURCES': [
     'libvpx/vp8/common/alloccommon.c',
+    'libvpx/vp8/common/arm/loopfilter_arm.c',
+    'libvpx/vp8/common/arm/neon/bilinearpredict_neon.c',
+    'libvpx/vp8/common/arm/neon/copymem_neon.c',
+    'libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c',
+    'libvpx/vp8/common/arm/neon/dequant_idct_neon.c',
+    'libvpx/vp8/common/arm/neon/dequantizeb_neon.c',
+    'libvpx/vp8/common/arm/neon/idct_blk_neon.c',
+    'libvpx/vp8/common/arm/neon/iwalsh_neon.c',
+    'libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
+    'libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
+    'libvpx/vp8/common/arm/neon/mbloopfilter_neon.c',
+    'libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c',
+    'libvpx/vp8/common/arm/neon/sixtappredict_neon.c',
+    'libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c',
     'libvpx/vp8/common/blockd.c',
-    'libvpx/vp8/common/copy_c.c',
     'libvpx/vp8/common/dequantize.c',
     'libvpx/vp8/common/entropy.c',
     'libvpx/vp8/common/entropymode.c',
@@ -787,6 +871,238 @@ files = {
     'libvpx/vp8/common/swapyv12buffer.c',
     'libvpx/vp8/common/treecoder.c',
     'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/decoder/dboolhuff.c',
+    'libvpx/vp8/decoder/decodeframe.c',
+    'libvpx/vp8/decoder/decodemv.c',
+    'libvpx/vp8/decoder/detokenize.c',
+    'libvpx/vp8/decoder/onyxd_if.c',
+    'libvpx/vp8/decoder/threading.c',
+    'libvpx/vp8/encoder/arm/neon/denoising_neon.c',
+    'libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c',
+    'libvpx/vp8/encoder/arm/neon/shortfdct_neon.c',
+    'libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
+    'libvpx/vp8/encoder/bitstream.c',
+    'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
+    'libvpx/vp8/encoder/dct.c',
+    'libvpx/vp8/encoder/denoising.c',
+    'libvpx/vp8/encoder/encodeframe.c',
+    'libvpx/vp8/encoder/encodeintra.c',
+    'libvpx/vp8/encoder/encodemb.c',
+    'libvpx/vp8/encoder/encodemv.c',
+    'libvpx/vp8/encoder/ethreading.c',
+    'libvpx/vp8/encoder/lookahead.c',
+    'libvpx/vp8/encoder/mcomp.c',
+    'libvpx/vp8/encoder/modecosts.c',
+    'libvpx/vp8/encoder/mr_dissim.c',
+    'libvpx/vp8/encoder/onyx_if.c',
+    'libvpx/vp8/encoder/pickinter.c',
+    'libvpx/vp8/encoder/picklpf.c',
+    'libvpx/vp8/encoder/ratectrl.c',
+    'libvpx/vp8/encoder/rdopt.c',
+    'libvpx/vp8/encoder/segmentation.c',
+    'libvpx/vp8/encoder/tokenize.c',
+    'libvpx/vp8/encoder/treewriter.c',
+    'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/vp8_cx_iface.c',
+    'libvpx/vp8/vp8_dx_iface.c',
+    'libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c',
+    'libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c',
+    'libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c',
+    'libvpx/vp9/common/vp9_alloccommon.c',
+    'libvpx/vp9/common/vp9_blockd.c',
+    'libvpx/vp9/common/vp9_common_data.c',
+    'libvpx/vp9/common/vp9_entropy.c',
+    'libvpx/vp9/common/vp9_entropymode.c',
+    'libvpx/vp9/common/vp9_entropymv.c',
+    'libvpx/vp9/common/vp9_filter.c',
+    'libvpx/vp9/common/vp9_frame_buffers.c',
+    'libvpx/vp9/common/vp9_idct.c',
+    'libvpx/vp9/common/vp9_loopfilter.c',
+    'libvpx/vp9/common/vp9_mvref_common.c',
+    'libvpx/vp9/common/vp9_pred_common.c',
+    'libvpx/vp9/common/vp9_quant_common.c',
+    'libvpx/vp9/common/vp9_reconinter.c',
+    'libvpx/vp9/common/vp9_reconintra.c',
+    'libvpx/vp9/common/vp9_rtcd.c',
+    'libvpx/vp9/common/vp9_scale.c',
+    'libvpx/vp9/common/vp9_scan.c',
+    'libvpx/vp9/common/vp9_seg_common.c',
+    'libvpx/vp9/common/vp9_thread_common.c',
+    'libvpx/vp9/common/vp9_tile_common.c',
+    'libvpx/vp9/decoder/vp9_decodeframe.c',
+    'libvpx/vp9/decoder/vp9_decodemv.c',
+    'libvpx/vp9/decoder/vp9_decoder.c',
+    'libvpx/vp9/decoder/vp9_detokenize.c',
+    'libvpx/vp9/decoder/vp9_dsubexp.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_error_sve.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c',
+    'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c',
+    'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
+    'libvpx/vp9/encoder/vp9_bitstream.c',
+    'libvpx/vp9/encoder/vp9_context_tree.c',
+    'libvpx/vp9/encoder/vp9_cost.c',
+    'libvpx/vp9/encoder/vp9_dct.c',
+    'libvpx/vp9/encoder/vp9_encodeframe.c',
+    'libvpx/vp9/encoder/vp9_encodemb.c',
+    'libvpx/vp9/encoder/vp9_encodemv.c',
+    'libvpx/vp9/encoder/vp9_encoder.c',
+    'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_extend.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
+    'libvpx/vp9/encoder/vp9_lookahead.c',
+    'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
+    'libvpx/vp9/encoder/vp9_noise_estimate.c',
+    'libvpx/vp9/encoder/vp9_picklpf.c',
+    'libvpx/vp9/encoder/vp9_pickmode.c',
+    'libvpx/vp9/encoder/vp9_quantize.c',
+    'libvpx/vp9/encoder/vp9_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_rd.c',
+    'libvpx/vp9/encoder/vp9_rdopt.c',
+    'libvpx/vp9/encoder/vp9_resize.c',
+    'libvpx/vp9/encoder/vp9_segmentation.c',
+    'libvpx/vp9/encoder/vp9_skin_detection.c',
+    'libvpx/vp9/encoder/vp9_speed_features.c',
+    'libvpx/vp9/encoder/vp9_subexp.c',
+    'libvpx/vp9/encoder/vp9_svc_layercontext.c',
+    'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
+    'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/vp9_cx_iface.c',
+    'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
+    'libvpx/vpx/src/vpx_codec.c',
+    'libvpx/vpx/src/vpx_decoder.c',
+    'libvpx/vpx/src/vpx_encoder.c',
+    'libvpx/vpx/src/vpx_image.c',
+    'libvpx/vpx_dsp/arm/avg_neon.c',
+    'libvpx/vpx_dsp/arm/avg_pred_neon.c',
+    'libvpx/vpx_dsp/arm/fdct16x16_neon.c',
+    'libvpx/vpx_dsp/arm/fdct32x32_neon.c',
+    'libvpx/vpx_dsp/arm/fdct4x4_neon.c',
+    'libvpx/vpx_dsp/arm/fdct8x8_neon.c',
+    'libvpx/vpx_dsp/arm/fdct_partial_neon.c',
+    'libvpx/vpx_dsp/arm/hadamard_neon.c',
+    'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct16x16_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct32x32_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct4x4_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c',
+    'libvpx/vpx_dsp/arm/idct8x8_add_neon.c',
+    'libvpx/vpx_dsp/arm/intrapred_neon.c',
+    'libvpx/vpx_dsp/arm/loopfilter_neon.c',
+    'libvpx/vpx_dsp/arm/quantize_neon.c',
+    'libvpx/vpx_dsp/arm/sad4d_neon.c',
+    'libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c',
+    'libvpx/vpx_dsp/arm/sad_neon.c',
+    'libvpx/vpx_dsp/arm/sad_neon_dotprod.c',
+    'libvpx/vpx_dsp/arm/sse_neon.c',
+    'libvpx/vpx_dsp/arm/sse_neon_dotprod.c',
+    'libvpx/vpx_dsp/arm/subpel_variance_neon.c',
+    'libvpx/vpx_dsp/arm/subtract_neon.c',
+    'libvpx/vpx_dsp/arm/sum_squares_neon.c',
+    'libvpx/vpx_dsp/arm/sum_squares_sve.c',
+    'libvpx/vpx_dsp/arm/variance_neon.c',
+    'libvpx/vpx_dsp/arm/variance_neon_dotprod.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c',
+    'libvpx/vpx_dsp/arm/vpx_convolve_neon.c',
+    'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c',
+    'libvpx/vpx_dsp/avg.c',
+    'libvpx/vpx_dsp/bitreader.c',
+    'libvpx/vpx_dsp/bitreader_buffer.c',
+    'libvpx/vpx_dsp/bitwriter.c',
+    'libvpx/vpx_dsp/bitwriter_buffer.c',
+    'libvpx/vpx_dsp/fwd_txfm.c',
+    'libvpx/vpx_dsp/intrapred.c',
+    'libvpx/vpx_dsp/inv_txfm.c',
+    'libvpx/vpx_dsp/loopfilter.c',
+    'libvpx/vpx_dsp/prob.c',
+    'libvpx/vpx_dsp/psnr.c',
+    'libvpx/vpx_dsp/quantize.c',
+    'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
+    'libvpx/vpx_dsp/subtract.c',
+    'libvpx/vpx_dsp/sum_squares.c',
+    'libvpx/vpx_dsp/variance.c',
+    'libvpx/vpx_dsp/vpx_convolve.c',
+    'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
+    'libvpx/vpx_mem/vpx_mem.c',
+    'libvpx/vpx_ports/aarch64_cpudetect.c',
+    'libvpx/vpx_scale/generic/gen_scalers.c',
+    'libvpx/vpx_scale/generic/vpx_scale.c',
+    'libvpx/vpx_scale/generic/yv12config.c',
+    'libvpx/vpx_scale/generic/yv12extend.c',
+    'libvpx/vpx_scale/vpx_scale_rtcd.c',
+    'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
+],
+  'MIPS32_EXPORTS': [
+    'libvpx/vpx/vp8.h',
+    'libvpx/vpx/vp8cx.h',
+    'libvpx/vpx/vp8dx.h',
+    'libvpx/vpx/vpx_codec.h',
+    'libvpx/vpx/vpx_decoder.h',
+    'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
+    'libvpx/vpx/vpx_frame_buffer.h',
+    'libvpx/vpx/vpx_image.h',
+    'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
+    'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
+    'libvpx/vpx_mem/vpx_mem.h',
+    'libvpx/vpx_ports/asmdefs_mmi.h',
+    'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
+    'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/mips.h',
+    'libvpx/vpx_ports/static_assert.h',
+    'libvpx/vpx_ports/system_state.h',
+    'libvpx/vpx_ports/vpx_timer.h',
+    'libvpx/vpx_scale/vpx_scale.h',
+    'libvpx/vpx_scale/yv12config.h',
+],
+  'MIPS32_SOURCES': [
+    'libvpx/vp8/common/alloccommon.c',
+    'libvpx/vp8/common/blockd.c',
+    'libvpx/vp8/common/dequantize.c',
+    'libvpx/vp8/common/entropy.c',
+    'libvpx/vp8/common/entropymode.c',
+    'libvpx/vp8/common/entropymv.c',
+    'libvpx/vp8/common/extend.c',
+    'libvpx/vp8/common/filter.c',
+    'libvpx/vp8/common/findnearmv.c',
+    'libvpx/vp8/common/generic/systemdependent.c',
+    'libvpx/vp8/common/idct_blk.c',
+    'libvpx/vp8/common/idctllm.c',
+    'libvpx/vp8/common/loopfilter_filters.c',
+    'libvpx/vp8/common/mbpitch.c',
+    'libvpx/vp8/common/modecont.c',
+    'libvpx/vp8/common/quant_common.c',
+    'libvpx/vp8/common/reconinter.c',
+    'libvpx/vp8/common/reconintra.c',
+    'libvpx/vp8/common/reconintra4x4.c',
+    'libvpx/vp8/common/rtcd.c',
+    'libvpx/vp8/common/setupintrarecon.c',
+    'libvpx/vp8/common/swapyv12buffer.c',
+    'libvpx/vp8/common/treecoder.c',
+    'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
     'libvpx/vp8/decoder/dboolhuff.c',
     'libvpx/vp8/decoder/decodeframe.c',
     'libvpx/vp8/decoder/decodemv.c',
@@ -795,6 +1111,7 @@ files = {
     'libvpx/vp8/decoder/threading.c',
     'libvpx/vp8/encoder/bitstream.c',
     'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
     'libvpx/vp8/encoder/dct.c',
     'libvpx/vp8/encoder/denoising.c',
     'libvpx/vp8/encoder/encodeframe.c',
@@ -845,7 +1162,7 @@ files = {
     'libvpx/vp9/decoder/vp9_decoder.c',
     'libvpx/vp9/decoder/vp9_detokenize.c',
     'libvpx/vp9/decoder/vp9_dsubexp.c',
-    'libvpx/vp9/decoder/vp9_dthread.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
     'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
     'libvpx/vp9/encoder/vp9_aq_360.c',
     'libvpx/vp9/encoder/vp9_aq_complexity.c',
@@ -860,11 +1177,14 @@ files = {
     'libvpx/vp9/encoder/vp9_encodemv.c',
     'libvpx/vp9/encoder/vp9_encoder.c',
     'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
     'libvpx/vp9/encoder/vp9_extend.c',
     'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
     'libvpx/vp9/encoder/vp9_lookahead.c',
     'libvpx/vp9/encoder/vp9_mbgraph.c',
     'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
     'libvpx/vp9/encoder/vp9_noise_estimate.c',
     'libvpx/vp9/encoder/vp9_picklpf.c',
     'libvpx/vp9/encoder/vp9_pickmode.c',
@@ -880,9 +1200,11 @@ files = {
     'libvpx/vp9/encoder/vp9_svc_layercontext.c',
     'libvpx/vp9/encoder/vp9_temporal_filter.c',
     'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
     'libvpx/vp9/encoder/vp9_treewriter.c',
     'libvpx/vp9/vp9_cx_iface.c',
     'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
     'libvpx/vpx/src/vpx_codec.c',
     'libvpx/vpx/src/vpx_decoder.c',
     'libvpx/vpx/src/vpx_encoder.c',
@@ -900,6 +1222,844 @@ files = {
     'libvpx/vpx_dsp/psnr.c',
     'libvpx/vpx_dsp/quantize.c',
     'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
+    'libvpx/vpx_dsp/subtract.c',
+    'libvpx/vpx_dsp/sum_squares.c',
+    'libvpx/vpx_dsp/variance.c',
+    'libvpx/vpx_dsp/vpx_convolve.c',
+    'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
+    'libvpx/vpx_mem/vpx_mem.c',
+    'libvpx/vpx_ports/mips_cpudetect.c',
+    'libvpx/vpx_scale/generic/gen_scalers.c',
+    'libvpx/vpx_scale/generic/vpx_scale.c',
+    'libvpx/vpx_scale/generic/yv12config.c',
+    'libvpx/vpx_scale/generic/yv12extend.c',
+    'libvpx/vpx_scale/vpx_scale_rtcd.c',
+    'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
+],
+  'MIPS64_EXPORTS': [
+    'libvpx/vpx/vp8.h',
+    'libvpx/vpx/vp8cx.h',
+    'libvpx/vpx/vp8dx.h',
+    'libvpx/vpx/vpx_codec.h',
+    'libvpx/vpx/vpx_decoder.h',
+    'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
+    'libvpx/vpx/vpx_frame_buffer.h',
+    'libvpx/vpx/vpx_image.h',
+    'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
+    'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
+    'libvpx/vpx_mem/vpx_mem.h',
+    'libvpx/vpx_ports/asmdefs_mmi.h',
+    'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
+    'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/mips.h',
+    'libvpx/vpx_ports/static_assert.h',
+    'libvpx/vpx_ports/system_state.h',
+    'libvpx/vpx_ports/vpx_timer.h',
+    'libvpx/vpx_scale/vpx_scale.h',
+    'libvpx/vpx_scale/yv12config.h',
+],
+  'MIPS64_SOURCES': [
+    'libvpx/vp8/common/alloccommon.c',
+    'libvpx/vp8/common/blockd.c',
+    'libvpx/vp8/common/dequantize.c',
+    'libvpx/vp8/common/entropy.c',
+    'libvpx/vp8/common/entropymode.c',
+    'libvpx/vp8/common/entropymv.c',
+    'libvpx/vp8/common/extend.c',
+    'libvpx/vp8/common/filter.c',
+    'libvpx/vp8/common/findnearmv.c',
+    'libvpx/vp8/common/generic/systemdependent.c',
+    'libvpx/vp8/common/idct_blk.c',
+    'libvpx/vp8/common/idctllm.c',
+    'libvpx/vp8/common/loopfilter_filters.c',
+    'libvpx/vp8/common/mbpitch.c',
+    'libvpx/vp8/common/mips/mmi/copymem_mmi.c',
+    'libvpx/vp8/common/mips/mmi/dequantize_mmi.c',
+    'libvpx/vp8/common/mips/mmi/idct_blk_mmi.c',
+    'libvpx/vp8/common/mips/mmi/idctllm_mmi.c',
+    'libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c',
+    'libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c',
+    'libvpx/vp8/common/mips/msa/bilinear_filter_msa.c',
+    'libvpx/vp8/common/mips/msa/copymem_msa.c',
+    'libvpx/vp8/common/mips/msa/idct_msa.c',
+    'libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c',
+    'libvpx/vp8/common/mips/msa/sixtap_filter_msa.c',
+    'libvpx/vp8/common/modecont.c',
+    'libvpx/vp8/common/quant_common.c',
+    'libvpx/vp8/common/reconinter.c',
+    'libvpx/vp8/common/reconintra.c',
+    'libvpx/vp8/common/reconintra4x4.c',
+    'libvpx/vp8/common/rtcd.c',
+    'libvpx/vp8/common/setupintrarecon.c',
+    'libvpx/vp8/common/swapyv12buffer.c',
+    'libvpx/vp8/common/treecoder.c',
+    'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/decoder/dboolhuff.c',
+    'libvpx/vp8/decoder/decodeframe.c',
+    'libvpx/vp8/decoder/decodemv.c',
+    'libvpx/vp8/decoder/detokenize.c',
+    'libvpx/vp8/decoder/onyxd_if.c',
+    'libvpx/vp8/decoder/threading.c',
+    'libvpx/vp8/encoder/bitstream.c',
+    'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
+    'libvpx/vp8/encoder/dct.c',
+    'libvpx/vp8/encoder/denoising.c',
+    'libvpx/vp8/encoder/encodeframe.c',
+    'libvpx/vp8/encoder/encodeintra.c',
+    'libvpx/vp8/encoder/encodemb.c',
+    'libvpx/vp8/encoder/encodemv.c',
+    'libvpx/vp8/encoder/ethreading.c',
+    'libvpx/vp8/encoder/firstpass.c',
+    'libvpx/vp8/encoder/lookahead.c',
+    'libvpx/vp8/encoder/mcomp.c',
+    'libvpx/vp8/encoder/mips/mmi/dct_mmi.c',
+    'libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c',
+    'libvpx/vp8/encoder/mips/msa/dct_msa.c',
+    'libvpx/vp8/encoder/mips/msa/denoising_msa.c',
+    'libvpx/vp8/encoder/mips/msa/encodeopt_msa.c',
+    'libvpx/vp8/encoder/mips/msa/quantize_msa.c',
+    'libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c',
+    'libvpx/vp8/encoder/modecosts.c',
+    'libvpx/vp8/encoder/mr_dissim.c',
+    'libvpx/vp8/encoder/onyx_if.c',
+    'libvpx/vp8/encoder/pickinter.c',
+    'libvpx/vp8/encoder/picklpf.c',
+    'libvpx/vp8/encoder/ratectrl.c',
+    'libvpx/vp8/encoder/rdopt.c',
+    'libvpx/vp8/encoder/segmentation.c',
+    'libvpx/vp8/encoder/temporal_filter.c',
+    'libvpx/vp8/encoder/tokenize.c',
+    'libvpx/vp8/encoder/treewriter.c',
+    'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/vp8_cx_iface.c',
+    'libvpx/vp8/vp8_dx_iface.c',
+    'libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c',
+    'libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c',
+    'libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c',
+    'libvpx/vp9/common/vp9_alloccommon.c',
+    'libvpx/vp9/common/vp9_blockd.c',
+    'libvpx/vp9/common/vp9_common_data.c',
+    'libvpx/vp9/common/vp9_entropy.c',
+    'libvpx/vp9/common/vp9_entropymode.c',
+    'libvpx/vp9/common/vp9_entropymv.c',
+    'libvpx/vp9/common/vp9_filter.c',
+    'libvpx/vp9/common/vp9_frame_buffers.c',
+    'libvpx/vp9/common/vp9_idct.c',
+    'libvpx/vp9/common/vp9_loopfilter.c',
+    'libvpx/vp9/common/vp9_mvref_common.c',
+    'libvpx/vp9/common/vp9_pred_common.c',
+    'libvpx/vp9/common/vp9_quant_common.c',
+    'libvpx/vp9/common/vp9_reconinter.c',
+    'libvpx/vp9/common/vp9_reconintra.c',
+    'libvpx/vp9/common/vp9_rtcd.c',
+    'libvpx/vp9/common/vp9_scale.c',
+    'libvpx/vp9/common/vp9_scan.c',
+    'libvpx/vp9/common/vp9_seg_common.c',
+    'libvpx/vp9/common/vp9_thread_common.c',
+    'libvpx/vp9/common/vp9_tile_common.c',
+    'libvpx/vp9/decoder/vp9_decodeframe.c',
+    'libvpx/vp9/decoder/vp9_decodemv.c',
+    'libvpx/vp9/decoder/vp9_decoder.c',
+    'libvpx/vp9/decoder/vp9_detokenize.c',
+    'libvpx/vp9/decoder/vp9_dsubexp.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
+    'libvpx/vp9/encoder/mips/msa/vp9_error_msa.c',
+    'libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c',
+    'libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c',
+    'libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c',
+    'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
+    'libvpx/vp9/encoder/vp9_aq_360.c',
+    'libvpx/vp9/encoder/vp9_aq_complexity.c',
+    'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
+    'libvpx/vp9/encoder/vp9_aq_variance.c',
+    'libvpx/vp9/encoder/vp9_bitstream.c',
+    'libvpx/vp9/encoder/vp9_context_tree.c',
+    'libvpx/vp9/encoder/vp9_cost.c',
+    'libvpx/vp9/encoder/vp9_dct.c',
+    'libvpx/vp9/encoder/vp9_encodeframe.c',
+    'libvpx/vp9/encoder/vp9_encodemb.c',
+    'libvpx/vp9/encoder/vp9_encodemv.c',
+    'libvpx/vp9/encoder/vp9_encoder.c',
+    'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_extend.c',
+    'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
+    'libvpx/vp9/encoder/vp9_lookahead.c',
+    'libvpx/vp9/encoder/vp9_mbgraph.c',
+    'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
+    'libvpx/vp9/encoder/vp9_noise_estimate.c',
+    'libvpx/vp9/encoder/vp9_picklpf.c',
+    'libvpx/vp9/encoder/vp9_pickmode.c',
+    'libvpx/vp9/encoder/vp9_quantize.c',
+    'libvpx/vp9/encoder/vp9_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_rd.c',
+    'libvpx/vp9/encoder/vp9_rdopt.c',
+    'libvpx/vp9/encoder/vp9_resize.c',
+    'libvpx/vp9/encoder/vp9_segmentation.c',
+    'libvpx/vp9/encoder/vp9_skin_detection.c',
+    'libvpx/vp9/encoder/vp9_speed_features.c',
+    'libvpx/vp9/encoder/vp9_subexp.c',
+    'libvpx/vp9/encoder/vp9_svc_layercontext.c',
+    'libvpx/vp9/encoder/vp9_temporal_filter.c',
+    'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
+    'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/vp9_cx_iface.c',
+    'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
+    'libvpx/vpx/src/vpx_codec.c',
+    'libvpx/vpx/src/vpx_decoder.c',
+    'libvpx/vpx/src/vpx_encoder.c',
+    'libvpx/vpx/src/vpx_image.c',
+    'libvpx/vpx_dsp/avg.c',
+    'libvpx/vpx_dsp/bitreader.c',
+    'libvpx/vpx_dsp/bitreader_buffer.c',
+    'libvpx/vpx_dsp/bitwriter.c',
+    'libvpx/vpx_dsp/bitwriter_buffer.c',
+    'libvpx/vpx_dsp/fwd_txfm.c',
+    'libvpx/vpx_dsp/intrapred.c',
+    'libvpx/vpx_dsp/inv_txfm.c',
+    'libvpx/vpx_dsp/loopfilter.c',
+    'libvpx/vpx_dsp/mips/avg_msa.c',
+    'libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c',
+    'libvpx/vpx_dsp/mips/fwd_txfm_msa.c',
+    'libvpx/vpx_dsp/mips/idct16x16_msa.c',
+    'libvpx/vpx_dsp/mips/idct32x32_msa.c',
+    'libvpx/vpx_dsp/mips/idct4x4_msa.c',
+    'libvpx/vpx_dsp/mips/idct8x8_msa.c',
+    'libvpx/vpx_dsp/mips/intrapred_msa.c',
+    'libvpx/vpx_dsp/mips/loopfilter_16_msa.c',
+    'libvpx/vpx_dsp/mips/loopfilter_4_msa.c',
+    'libvpx/vpx_dsp/mips/loopfilter_8_msa.c',
+    'libvpx/vpx_dsp/mips/sad_mmi.c',
+    'libvpx/vpx_dsp/mips/sad_msa.c',
+    'libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c',
+    'libvpx/vpx_dsp/mips/subtract_mmi.c',
+    'libvpx/vpx_dsp/mips/subtract_msa.c',
+    'libvpx/vpx_dsp/mips/sum_squares_msa.c',
+    'libvpx/vpx_dsp/mips/variance_mmi.c',
+    'libvpx/vpx_dsp/mips/variance_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c',
+    'libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c',
+    'libvpx/vpx_dsp/prob.c',
+    'libvpx/vpx_dsp/psnr.c',
+    'libvpx/vpx_dsp/quantize.c',
+    'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
+    'libvpx/vpx_dsp/subtract.c',
+    'libvpx/vpx_dsp/sum_squares.c',
+    'libvpx/vpx_dsp/variance.c',
+    'libvpx/vpx_dsp/vpx_convolve.c',
+    'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
+    'libvpx/vpx_mem/vpx_mem.c',
+    'libvpx/vpx_ports/mips_cpudetect.c',
+    'libvpx/vpx_scale/generic/gen_scalers.c',
+    'libvpx/vpx_scale/generic/vpx_scale.c',
+    'libvpx/vpx_scale/generic/yv12config.c',
+    'libvpx/vpx_scale/generic/yv12extend.c',
+    'libvpx/vpx_scale/vpx_scale_rtcd.c',
+    'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
+],
+  'PPC64LE_EXPORTS': [
+    'libvpx/vpx/vp8.h',
+    'libvpx/vpx/vp8cx.h',
+    'libvpx/vpx/vp8dx.h',
+    'libvpx/vpx/vpx_codec.h',
+    'libvpx/vpx/vpx_decoder.h',
+    'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
+    'libvpx/vpx/vpx_frame_buffer.h',
+    'libvpx/vpx/vpx_image.h',
+    'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
+    'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
+    'libvpx/vpx_mem/vpx_mem.h',
+    'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
+    'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/ppc.h',
+    'libvpx/vpx_ports/static_assert.h',
+    'libvpx/vpx_ports/system_state.h',
+    'libvpx/vpx_ports/vpx_timer.h',
+    'libvpx/vpx_scale/vpx_scale.h',
+    'libvpx/vpx_scale/yv12config.h',
+],
+  'PPC64LE_SOURCES': [
+    'libvpx/vp8/common/alloccommon.c',
+    'libvpx/vp8/common/blockd.c',
+    'libvpx/vp8/common/dequantize.c',
+    'libvpx/vp8/common/entropy.c',
+    'libvpx/vp8/common/entropymode.c',
+    'libvpx/vp8/common/entropymv.c',
+    'libvpx/vp8/common/extend.c',
+    'libvpx/vp8/common/filter.c',
+    'libvpx/vp8/common/findnearmv.c',
+    'libvpx/vp8/common/generic/systemdependent.c',
+    'libvpx/vp8/common/idct_blk.c',
+    'libvpx/vp8/common/idctllm.c',
+    'libvpx/vp8/common/loopfilter_filters.c',
+    'libvpx/vp8/common/mbpitch.c',
+    'libvpx/vp8/common/modecont.c',
+    'libvpx/vp8/common/quant_common.c',
+    'libvpx/vp8/common/reconinter.c',
+    'libvpx/vp8/common/reconintra.c',
+    'libvpx/vp8/common/reconintra4x4.c',
+    'libvpx/vp8/common/rtcd.c',
+    'libvpx/vp8/common/setupintrarecon.c',
+    'libvpx/vp8/common/swapyv12buffer.c',
+    'libvpx/vp8/common/treecoder.c',
+    'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/decoder/dboolhuff.c',
+    'libvpx/vp8/decoder/decodeframe.c',
+    'libvpx/vp8/decoder/decodemv.c',
+    'libvpx/vp8/decoder/detokenize.c',
+    'libvpx/vp8/decoder/onyxd_if.c',
+    'libvpx/vp8/decoder/threading.c',
+    'libvpx/vp8/encoder/bitstream.c',
+    'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
+    'libvpx/vp8/encoder/dct.c',
+    'libvpx/vp8/encoder/denoising.c',
+    'libvpx/vp8/encoder/encodeframe.c',
+    'libvpx/vp8/encoder/encodeintra.c',
+    'libvpx/vp8/encoder/encodemb.c',
+    'libvpx/vp8/encoder/encodemv.c',
+    'libvpx/vp8/encoder/ethreading.c',
+    'libvpx/vp8/encoder/firstpass.c',
+    'libvpx/vp8/encoder/lookahead.c',
+    'libvpx/vp8/encoder/mcomp.c',
+    'libvpx/vp8/encoder/modecosts.c',
+    'libvpx/vp8/encoder/mr_dissim.c',
+    'libvpx/vp8/encoder/onyx_if.c',
+    'libvpx/vp8/encoder/pickinter.c',
+    'libvpx/vp8/encoder/picklpf.c',
+    'libvpx/vp8/encoder/ratectrl.c',
+    'libvpx/vp8/encoder/rdopt.c',
+    'libvpx/vp8/encoder/segmentation.c',
+    'libvpx/vp8/encoder/temporal_filter.c',
+    'libvpx/vp8/encoder/tokenize.c',
+    'libvpx/vp8/encoder/treewriter.c',
+    'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/vp8_cx_iface.c',
+    'libvpx/vp8/vp8_dx_iface.c',
+    'libvpx/vp9/common/ppc/vp9_idct_vsx.c',
+    'libvpx/vp9/common/vp9_alloccommon.c',
+    'libvpx/vp9/common/vp9_blockd.c',
+    'libvpx/vp9/common/vp9_common_data.c',
+    'libvpx/vp9/common/vp9_entropy.c',
+    'libvpx/vp9/common/vp9_entropymode.c',
+    'libvpx/vp9/common/vp9_entropymv.c',
+    'libvpx/vp9/common/vp9_filter.c',
+    'libvpx/vp9/common/vp9_frame_buffers.c',
+    'libvpx/vp9/common/vp9_idct.c',
+    'libvpx/vp9/common/vp9_loopfilter.c',
+    'libvpx/vp9/common/vp9_mvref_common.c',
+    'libvpx/vp9/common/vp9_pred_common.c',
+    'libvpx/vp9/common/vp9_quant_common.c',
+    'libvpx/vp9/common/vp9_reconinter.c',
+    'libvpx/vp9/common/vp9_reconintra.c',
+    'libvpx/vp9/common/vp9_rtcd.c',
+    'libvpx/vp9/common/vp9_scale.c',
+    'libvpx/vp9/common/vp9_scan.c',
+    'libvpx/vp9/common/vp9_seg_common.c',
+    'libvpx/vp9/common/vp9_thread_common.c',
+    'libvpx/vp9/common/vp9_tile_common.c',
+    'libvpx/vp9/decoder/vp9_decodeframe.c',
+    'libvpx/vp9/decoder/vp9_decodemv.c',
+    'libvpx/vp9/decoder/vp9_decoder.c',
+    'libvpx/vp9/decoder/vp9_detokenize.c',
+    'libvpx/vp9/decoder/vp9_dsubexp.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
+    'libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c',
+    'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
+    'libvpx/vp9/encoder/vp9_aq_360.c',
+    'libvpx/vp9/encoder/vp9_aq_complexity.c',
+    'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
+    'libvpx/vp9/encoder/vp9_aq_variance.c',
+    'libvpx/vp9/encoder/vp9_bitstream.c',
+    'libvpx/vp9/encoder/vp9_context_tree.c',
+    'libvpx/vp9/encoder/vp9_cost.c',
+    'libvpx/vp9/encoder/vp9_dct.c',
+    'libvpx/vp9/encoder/vp9_encodeframe.c',
+    'libvpx/vp9/encoder/vp9_encodemb.c',
+    'libvpx/vp9/encoder/vp9_encodemv.c',
+    'libvpx/vp9/encoder/vp9_encoder.c',
+    'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_extend.c',
+    'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
+    'libvpx/vp9/encoder/vp9_lookahead.c',
+    'libvpx/vp9/encoder/vp9_mbgraph.c',
+    'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
+    'libvpx/vp9/encoder/vp9_noise_estimate.c',
+    'libvpx/vp9/encoder/vp9_picklpf.c',
+    'libvpx/vp9/encoder/vp9_pickmode.c',
+    'libvpx/vp9/encoder/vp9_quantize.c',
+    'libvpx/vp9/encoder/vp9_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_rd.c',
+    'libvpx/vp9/encoder/vp9_rdopt.c',
+    'libvpx/vp9/encoder/vp9_resize.c',
+    'libvpx/vp9/encoder/vp9_segmentation.c',
+    'libvpx/vp9/encoder/vp9_skin_detection.c',
+    'libvpx/vp9/encoder/vp9_speed_features.c',
+    'libvpx/vp9/encoder/vp9_subexp.c',
+    'libvpx/vp9/encoder/vp9_svc_layercontext.c',
+    'libvpx/vp9/encoder/vp9_temporal_filter.c',
+    'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
+    'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/vp9_cx_iface.c',
+    'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
+    'libvpx/vpx/src/vpx_codec.c',
+    'libvpx/vpx/src/vpx_decoder.c',
+    'libvpx/vpx/src/vpx_encoder.c',
+    'libvpx/vpx/src/vpx_image.c',
+    'libvpx/vpx_dsp/avg.c',
+    'libvpx/vpx_dsp/bitreader.c',
+    'libvpx/vpx_dsp/bitreader_buffer.c',
+    'libvpx/vpx_dsp/bitwriter.c',
+    'libvpx/vpx_dsp/bitwriter_buffer.c',
+    'libvpx/vpx_dsp/fwd_txfm.c',
+    'libvpx/vpx_dsp/intrapred.c',
+    'libvpx/vpx_dsp/inv_txfm.c',
+    'libvpx/vpx_dsp/loopfilter.c',
+    'libvpx/vpx_dsp/ppc/fdct32x32_vsx.c',
+    'libvpx/vpx_dsp/ppc/hadamard_vsx.c',
+    'libvpx/vpx_dsp/ppc/intrapred_vsx.c',
+    'libvpx/vpx_dsp/ppc/inv_txfm_vsx.c',
+    'libvpx/vpx_dsp/ppc/quantize_vsx.c',
+    'libvpx/vpx_dsp/ppc/sad_vsx.c',
+    'libvpx/vpx_dsp/ppc/subtract_vsx.c',
+    'libvpx/vpx_dsp/ppc/variance_vsx.c',
+    'libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c',
+    'libvpx/vpx_dsp/prob.c',
+    'libvpx/vpx_dsp/psnr.c',
+    'libvpx/vpx_dsp/quantize.c',
+    'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
+    'libvpx/vpx_dsp/subtract.c',
+    'libvpx/vpx_dsp/sum_squares.c',
+    'libvpx/vpx_dsp/variance.c',
+    'libvpx/vpx_dsp/vpx_convolve.c',
+    'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
+    'libvpx/vpx_mem/vpx_mem.c',
+    'libvpx/vpx_ports/ppc_cpudetect.c',
+    'libvpx/vpx_scale/generic/gen_scalers.c',
+    'libvpx/vpx_scale/generic/vpx_scale.c',
+    'libvpx/vpx_scale/generic/yv12config.c',
+    'libvpx/vpx_scale/generic/yv12extend.c',
+    'libvpx/vpx_scale/vpx_scale_rtcd.c',
+    'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
+],
+  'LOONGARCH64_EXPORTS': [
+    'libvpx/vpx/vp8.h',
+    'libvpx/vpx/vp8cx.h',
+    'libvpx/vpx/vp8dx.h',
+    'libvpx/vpx/vpx_codec.h',
+    'libvpx/vpx/vpx_decoder.h',
+    'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
+    'libvpx/vpx/vpx_frame_buffer.h',
+    'libvpx/vpx/vpx_image.h',
+    'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
+    'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
+    'libvpx/vpx_mem/vpx_mem.h',
+    'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
+    'libvpx/vpx_ports/loongarch.h',
+    'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/static_assert.h',
+    'libvpx/vpx_ports/system_state.h',
+    'libvpx/vpx_ports/vpx_timer.h',
+    'libvpx/vpx_scale/vpx_scale.h',
+    'libvpx/vpx_scale/yv12config.h',
+],
+  'LOONGARCH64_SOURCES': [
+    'libvpx/vp8/common/alloccommon.c',
+    'libvpx/vp8/common/blockd.c',
+    'libvpx/vp8/common/dequantize.c',
+    'libvpx/vp8/common/entropy.c',
+    'libvpx/vp8/common/entropymode.c',
+    'libvpx/vp8/common/entropymv.c',
+    'libvpx/vp8/common/extend.c',
+    'libvpx/vp8/common/filter.c',
+    'libvpx/vp8/common/findnearmv.c',
+    'libvpx/vp8/common/generic/systemdependent.c',
+    'libvpx/vp8/common/idct_blk.c',
+    'libvpx/vp8/common/idctllm.c',
+    'libvpx/vp8/common/loongarch/idct_lsx.c',
+    'libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c',
+    'libvpx/vp8/common/loongarch/sixtap_filter_lsx.c',
+    'libvpx/vp8/common/loopfilter_filters.c',
+    'libvpx/vp8/common/mbpitch.c',
+    'libvpx/vp8/common/modecont.c',
+    'libvpx/vp8/common/quant_common.c',
+    'libvpx/vp8/common/reconinter.c',
+    'libvpx/vp8/common/reconintra.c',
+    'libvpx/vp8/common/reconintra4x4.c',
+    'libvpx/vp8/common/rtcd.c',
+    'libvpx/vp8/common/setupintrarecon.c',
+    'libvpx/vp8/common/swapyv12buffer.c',
+    'libvpx/vp8/common/treecoder.c',
+    'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/decoder/dboolhuff.c',
+    'libvpx/vp8/decoder/decodeframe.c',
+    'libvpx/vp8/decoder/decodemv.c',
+    'libvpx/vp8/decoder/detokenize.c',
+    'libvpx/vp8/decoder/onyxd_if.c',
+    'libvpx/vp8/decoder/threading.c',
+    'libvpx/vp8/encoder/bitstream.c',
+    'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
+    'libvpx/vp8/encoder/dct.c',
+    'libvpx/vp8/encoder/denoising.c',
+    'libvpx/vp8/encoder/encodeframe.c',
+    'libvpx/vp8/encoder/encodeintra.c',
+    'libvpx/vp8/encoder/encodemb.c',
+    'libvpx/vp8/encoder/encodemv.c',
+    'libvpx/vp8/encoder/ethreading.c',
+    'libvpx/vp8/encoder/firstpass.c',
+    'libvpx/vp8/encoder/lookahead.c',
+    'libvpx/vp8/encoder/loongarch/dct_lsx.c',
+    'libvpx/vp8/encoder/loongarch/encodeopt_lsx.c',
+    'libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c',
+    'libvpx/vp8/encoder/mcomp.c',
+    'libvpx/vp8/encoder/modecosts.c',
+    'libvpx/vp8/encoder/mr_dissim.c',
+    'libvpx/vp8/encoder/onyx_if.c',
+    'libvpx/vp8/encoder/pickinter.c',
+    'libvpx/vp8/encoder/picklpf.c',
+    'libvpx/vp8/encoder/ratectrl.c',
+    'libvpx/vp8/encoder/rdopt.c',
+    'libvpx/vp8/encoder/segmentation.c',
+    'libvpx/vp8/encoder/temporal_filter.c',
+    'libvpx/vp8/encoder/tokenize.c',
+    'libvpx/vp8/encoder/treewriter.c',
+    'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/vp8_cx_iface.c',
+    'libvpx/vp8/vp8_dx_iface.c',
+    'libvpx/vp9/common/vp9_alloccommon.c',
+    'libvpx/vp9/common/vp9_blockd.c',
+    'libvpx/vp9/common/vp9_common_data.c',
+    'libvpx/vp9/common/vp9_entropy.c',
+    'libvpx/vp9/common/vp9_entropymode.c',
+    'libvpx/vp9/common/vp9_entropymv.c',
+    'libvpx/vp9/common/vp9_filter.c',
+    'libvpx/vp9/common/vp9_frame_buffers.c',
+    'libvpx/vp9/common/vp9_idct.c',
+    'libvpx/vp9/common/vp9_loopfilter.c',
+    'libvpx/vp9/common/vp9_mvref_common.c',
+    'libvpx/vp9/common/vp9_pred_common.c',
+    'libvpx/vp9/common/vp9_quant_common.c',
+    'libvpx/vp9/common/vp9_reconinter.c',
+    'libvpx/vp9/common/vp9_reconintra.c',
+    'libvpx/vp9/common/vp9_rtcd.c',
+    'libvpx/vp9/common/vp9_scale.c',
+    'libvpx/vp9/common/vp9_scan.c',
+    'libvpx/vp9/common/vp9_seg_common.c',
+    'libvpx/vp9/common/vp9_thread_common.c',
+    'libvpx/vp9/common/vp9_tile_common.c',
+    'libvpx/vp9/decoder/vp9_decodeframe.c',
+    'libvpx/vp9/decoder/vp9_decodemv.c',
+    'libvpx/vp9/decoder/vp9_decoder.c',
+    'libvpx/vp9/decoder/vp9_detokenize.c',
+    'libvpx/vp9/decoder/vp9_dsubexp.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
+    'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
+    'libvpx/vp9/encoder/vp9_aq_360.c',
+    'libvpx/vp9/encoder/vp9_aq_complexity.c',
+    'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
+    'libvpx/vp9/encoder/vp9_aq_variance.c',
+    'libvpx/vp9/encoder/vp9_bitstream.c',
+    'libvpx/vp9/encoder/vp9_context_tree.c',
+    'libvpx/vp9/encoder/vp9_cost.c',
+    'libvpx/vp9/encoder/vp9_dct.c',
+    'libvpx/vp9/encoder/vp9_encodeframe.c',
+    'libvpx/vp9/encoder/vp9_encodemb.c',
+    'libvpx/vp9/encoder/vp9_encodemv.c',
+    'libvpx/vp9/encoder/vp9_encoder.c',
+    'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_extend.c',
+    'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
+    'libvpx/vp9/encoder/vp9_lookahead.c',
+    'libvpx/vp9/encoder/vp9_mbgraph.c',
+    'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
+    'libvpx/vp9/encoder/vp9_noise_estimate.c',
+    'libvpx/vp9/encoder/vp9_picklpf.c',
+    'libvpx/vp9/encoder/vp9_pickmode.c',
+    'libvpx/vp9/encoder/vp9_quantize.c',
+    'libvpx/vp9/encoder/vp9_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_rd.c',
+    'libvpx/vp9/encoder/vp9_rdopt.c',
+    'libvpx/vp9/encoder/vp9_resize.c',
+    'libvpx/vp9/encoder/vp9_segmentation.c',
+    'libvpx/vp9/encoder/vp9_skin_detection.c',
+    'libvpx/vp9/encoder/vp9_speed_features.c',
+    'libvpx/vp9/encoder/vp9_subexp.c',
+    'libvpx/vp9/encoder/vp9_svc_layercontext.c',
+    'libvpx/vp9/encoder/vp9_temporal_filter.c',
+    'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
+    'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/vp9_cx_iface.c',
+    'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
+    'libvpx/vpx/src/vpx_codec.c',
+    'libvpx/vpx/src/vpx_decoder.c',
+    'libvpx/vpx/src/vpx_encoder.c',
+    'libvpx/vpx/src/vpx_image.c',
+    'libvpx/vpx_dsp/avg.c',
+    'libvpx/vpx_dsp/bitreader.c',
+    'libvpx/vpx_dsp/bitreader_buffer.c',
+    'libvpx/vpx_dsp/bitwriter.c',
+    'libvpx/vpx_dsp/bitwriter_buffer.c',
+    'libvpx/vpx_dsp/fwd_txfm.c',
+    'libvpx/vpx_dsp/intrapred.c',
+    'libvpx/vpx_dsp/inv_txfm.c',
+    'libvpx/vpx_dsp/loongarch/avg_lsx.c',
+    'libvpx/vpx_dsp/loongarch/avg_pred_lsx.c',
+    'libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c',
+    'libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c',
+    'libvpx/vpx_dsp/loongarch/idct32x32_lsx.c',
+    'libvpx/vpx_dsp/loongarch/intrapred_lsx.c',
+    'libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c',
+    'libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c',
+    'libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c',
+    'libvpx/vpx_dsp/loongarch/quantize_lsx.c',
+    'libvpx/vpx_dsp/loongarch/sad_lsx.c',
+    'libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c',
+    'libvpx/vpx_dsp/loongarch/subtract_lsx.c',
+    'libvpx/vpx_dsp/loongarch/variance_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c',
+    'libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c',
+    'libvpx/vpx_dsp/loopfilter.c',
+    'libvpx/vpx_dsp/prob.c',
+    'libvpx/vpx_dsp/psnr.c',
+    'libvpx/vpx_dsp/quantize.c',
+    'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
+    'libvpx/vpx_dsp/subtract.c',
+    'libvpx/vpx_dsp/sum_squares.c',
+    'libvpx/vpx_dsp/variance.c',
+    'libvpx/vpx_dsp/vpx_convolve.c',
+    'libvpx/vpx_dsp/vpx_dsp_rtcd.c',
+    'libvpx/vpx_mem/vpx_mem.c',
+    'libvpx/vpx_ports/loongarch_cpudetect.c',
+    'libvpx/vpx_scale/generic/gen_scalers.c',
+    'libvpx/vpx_scale/generic/vpx_scale.c',
+    'libvpx/vpx_scale/generic/yv12config.c',
+    'libvpx/vpx_scale/generic/yv12extend.c',
+    'libvpx/vpx_scale/vpx_scale_rtcd.c',
+    'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
+],
+  'GENERIC_EXPORTS': [
+    'libvpx/vpx/vp8.h',
+    'libvpx/vpx/vp8cx.h',
+    'libvpx/vpx/vp8dx.h',
+    'libvpx/vpx/vpx_codec.h',
+    'libvpx/vpx/vpx_decoder.h',
+    'libvpx/vpx/vpx_encoder.h',
+    'libvpx/vpx/vpx_ext_ratectrl.h',
+    'libvpx/vpx/vpx_frame_buffer.h',
+    'libvpx/vpx/vpx_image.h',
+    'libvpx/vpx/vpx_integer.h',
+    'libvpx/vpx/vpx_tpl.h',
+    'libvpx/vpx_mem/include/vpx_mem_intrnl.h',
+    'libvpx/vpx_mem/vpx_mem.h',
+    'libvpx/vpx_ports/bitops.h',
+    'libvpx/vpx_ports/compiler_attributes.h',
+    'libvpx/vpx_ports/mem.h',
+    'libvpx/vpx_ports/static_assert.h',
+    'libvpx/vpx_ports/system_state.h',
+    'libvpx/vpx_ports/vpx_timer.h',
+    'libvpx/vpx_scale/vpx_scale.h',
+    'libvpx/vpx_scale/yv12config.h',
+],
+  'GENERIC_SOURCES': [
+    'libvpx/vp8/common/alloccommon.c',
+    'libvpx/vp8/common/blockd.c',
+    'libvpx/vp8/common/dequantize.c',
+    'libvpx/vp8/common/entropy.c',
+    'libvpx/vp8/common/entropymode.c',
+    'libvpx/vp8/common/entropymv.c',
+    'libvpx/vp8/common/extend.c',
+    'libvpx/vp8/common/filter.c',
+    'libvpx/vp8/common/findnearmv.c',
+    'libvpx/vp8/common/generic/systemdependent.c',
+    'libvpx/vp8/common/idct_blk.c',
+    'libvpx/vp8/common/idctllm.c',
+    'libvpx/vp8/common/loopfilter_filters.c',
+    'libvpx/vp8/common/mbpitch.c',
+    'libvpx/vp8/common/modecont.c',
+    'libvpx/vp8/common/quant_common.c',
+    'libvpx/vp8/common/reconinter.c',
+    'libvpx/vp8/common/reconintra.c',
+    'libvpx/vp8/common/reconintra4x4.c',
+    'libvpx/vp8/common/rtcd.c',
+    'libvpx/vp8/common/setupintrarecon.c',
+    'libvpx/vp8/common/swapyv12buffer.c',
+    'libvpx/vp8/common/treecoder.c',
+    'libvpx/vp8/common/vp8_loopfilter.c',
+    'libvpx/vp8/common/vp8_skin_detection.c',
+    'libvpx/vp8/decoder/dboolhuff.c',
+    'libvpx/vp8/decoder/decodeframe.c',
+    'libvpx/vp8/decoder/decodemv.c',
+    'libvpx/vp8/decoder/detokenize.c',
+    'libvpx/vp8/decoder/onyxd_if.c',
+    'libvpx/vp8/decoder/threading.c',
+    'libvpx/vp8/encoder/bitstream.c',
+    'libvpx/vp8/encoder/boolhuff.c',
+    'libvpx/vp8/encoder/copy_c.c',
+    'libvpx/vp8/encoder/dct.c',
+    'libvpx/vp8/encoder/denoising.c',
+    'libvpx/vp8/encoder/encodeframe.c',
+    'libvpx/vp8/encoder/encodeintra.c',
+    'libvpx/vp8/encoder/encodemb.c',
+    'libvpx/vp8/encoder/encodemv.c',
+    'libvpx/vp8/encoder/ethreading.c',
+    'libvpx/vp8/encoder/firstpass.c',
+    'libvpx/vp8/encoder/lookahead.c',
+    'libvpx/vp8/encoder/mcomp.c',
+    'libvpx/vp8/encoder/modecosts.c',
+    'libvpx/vp8/encoder/mr_dissim.c',
+    'libvpx/vp8/encoder/onyx_if.c',
+    'libvpx/vp8/encoder/pickinter.c',
+    'libvpx/vp8/encoder/picklpf.c',
+    'libvpx/vp8/encoder/ratectrl.c',
+    'libvpx/vp8/encoder/rdopt.c',
+    'libvpx/vp8/encoder/segmentation.c',
+    'libvpx/vp8/encoder/temporal_filter.c',
+    'libvpx/vp8/encoder/tokenize.c',
+    'libvpx/vp8/encoder/treewriter.c',
+    'libvpx/vp8/encoder/vp8_quantize.c',
+    'libvpx/vp8/vp8_cx_iface.c',
+    'libvpx/vp8/vp8_dx_iface.c',
+    'libvpx/vp9/common/vp9_alloccommon.c',
+    'libvpx/vp9/common/vp9_blockd.c',
+    'libvpx/vp9/common/vp9_common_data.c',
+    'libvpx/vp9/common/vp9_entropy.c',
+    'libvpx/vp9/common/vp9_entropymode.c',
+    'libvpx/vp9/common/vp9_entropymv.c',
+    'libvpx/vp9/common/vp9_filter.c',
+    'libvpx/vp9/common/vp9_frame_buffers.c',
+    'libvpx/vp9/common/vp9_idct.c',
+    'libvpx/vp9/common/vp9_loopfilter.c',
+    'libvpx/vp9/common/vp9_mvref_common.c',
+    'libvpx/vp9/common/vp9_pred_common.c',
+    'libvpx/vp9/common/vp9_quant_common.c',
+    'libvpx/vp9/common/vp9_reconinter.c',
+    'libvpx/vp9/common/vp9_reconintra.c',
+    'libvpx/vp9/common/vp9_rtcd.c',
+    'libvpx/vp9/common/vp9_scale.c',
+    'libvpx/vp9/common/vp9_scan.c',
+    'libvpx/vp9/common/vp9_seg_common.c',
+    'libvpx/vp9/common/vp9_thread_common.c',
+    'libvpx/vp9/common/vp9_tile_common.c',
+    'libvpx/vp9/decoder/vp9_decodeframe.c',
+    'libvpx/vp9/decoder/vp9_decodemv.c',
+    'libvpx/vp9/decoder/vp9_decoder.c',
+    'libvpx/vp9/decoder/vp9_detokenize.c',
+    'libvpx/vp9/decoder/vp9_dsubexp.c',
+    'libvpx/vp9/decoder/vp9_job_queue.c',
+    'libvpx/vp9/encoder/vp9_alt_ref_aq.c',
+    'libvpx/vp9/encoder/vp9_aq_360.c',
+    'libvpx/vp9/encoder/vp9_aq_complexity.c',
+    'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c',
+    'libvpx/vp9/encoder/vp9_aq_variance.c',
+    'libvpx/vp9/encoder/vp9_bitstream.c',
+    'libvpx/vp9/encoder/vp9_context_tree.c',
+    'libvpx/vp9/encoder/vp9_cost.c',
+    'libvpx/vp9/encoder/vp9_dct.c',
+    'libvpx/vp9/encoder/vp9_encodeframe.c',
+    'libvpx/vp9/encoder/vp9_encodemb.c',
+    'libvpx/vp9/encoder/vp9_encodemv.c',
+    'libvpx/vp9/encoder/vp9_encoder.c',
+    'libvpx/vp9/encoder/vp9_ethread.c',
+    'libvpx/vp9/encoder/vp9_ext_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_extend.c',
+    'libvpx/vp9/encoder/vp9_firstpass.c',
+    'libvpx/vp9/encoder/vp9_frame_scale.c',
+    'libvpx/vp9/encoder/vp9_lookahead.c',
+    'libvpx/vp9/encoder/vp9_mbgraph.c',
+    'libvpx/vp9/encoder/vp9_mcomp.c',
+    'libvpx/vp9/encoder/vp9_multi_thread.c',
+    'libvpx/vp9/encoder/vp9_noise_estimate.c',
+    'libvpx/vp9/encoder/vp9_picklpf.c',
+    'libvpx/vp9/encoder/vp9_pickmode.c',
+    'libvpx/vp9/encoder/vp9_quantize.c',
+    'libvpx/vp9/encoder/vp9_ratectrl.c',
+    'libvpx/vp9/encoder/vp9_rd.c',
+    'libvpx/vp9/encoder/vp9_rdopt.c',
+    'libvpx/vp9/encoder/vp9_resize.c',
+    'libvpx/vp9/encoder/vp9_segmentation.c',
+    'libvpx/vp9/encoder/vp9_skin_detection.c',
+    'libvpx/vp9/encoder/vp9_speed_features.c',
+    'libvpx/vp9/encoder/vp9_subexp.c',
+    'libvpx/vp9/encoder/vp9_svc_layercontext.c',
+    'libvpx/vp9/encoder/vp9_temporal_filter.c',
+    'libvpx/vp9/encoder/vp9_tokenize.c',
+    'libvpx/vp9/encoder/vp9_tpl_model.c',
+    'libvpx/vp9/encoder/vp9_treewriter.c',
+    'libvpx/vp9/vp9_cx_iface.c',
+    'libvpx/vp9/vp9_dx_iface.c',
+    'libvpx/vp9/vp9_iface_common.c',
+    'libvpx/vpx/src/vpx_codec.c',
+    'libvpx/vpx/src/vpx_decoder.c',
+    'libvpx/vpx/src/vpx_encoder.c',
+    'libvpx/vpx/src/vpx_image.c',
+    'libvpx/vpx_dsp/avg.c',
+    'libvpx/vpx_dsp/bitreader.c',
+    'libvpx/vpx_dsp/bitreader_buffer.c',
+    'libvpx/vpx_dsp/bitwriter.c',
+    'libvpx/vpx_dsp/bitwriter_buffer.c',
+    'libvpx/vpx_dsp/fwd_txfm.c',
+    'libvpx/vpx_dsp/intrapred.c',
+    'libvpx/vpx_dsp/inv_txfm.c',
+    'libvpx/vpx_dsp/loopfilter.c',
+    'libvpx/vpx_dsp/prob.c',
+    'libvpx/vpx_dsp/psnr.c',
+    'libvpx/vpx_dsp/quantize.c',
+    'libvpx/vpx_dsp/sad.c',
+    'libvpx/vpx_dsp/skin_detection.c',
+    'libvpx/vpx_dsp/sse.c',
     'libvpx/vpx_dsp/subtract.c',
     'libvpx/vpx_dsp/sum_squares.c',
     'libvpx/vpx_dsp/variance.c',
@@ -912,5 +2072,6 @@ files = {
     'libvpx/vpx_scale/generic/yv12extend.c',
     'libvpx/vpx_scale/vpx_scale_rtcd.c',
     'libvpx/vpx_util/vpx_thread.c',
+    'libvpx/vpx_util/vpx_write_yuv_frame.c',
 ],
 }